Exemple #1
0
def create_converters(r, in_size):
    """
    Build the text-to-symbol converter matching a representation name.

    :param r: Representation name. "pos", "tag" and "fw" select the
        corresponding converter; any other value selects the word-vector
        converter.
    :param in_size: Size used to locate the pickled PCA model
        ("models/pca_<r>_<in_size>.p"). -1 means no PCA model is loaded.
    :return: The converter instance created for ``r``.
    """
    if in_size != -1:
        # Pickle streams are binary data: open the file in 'rb' mode and
        # close it deterministically instead of leaking the handle.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open("models/pca_" + r + "_" + str(in_size) + ".p",
                  'rb') as model_file:
            pca_model = pickle.load(model_file)
        # end with
    else:
        pca_model = None
    # end if

    # >> 1. Choose a text to symbol converter.
    if r == "pos":
        converter = RCNLPPosConverter(resize=-1,
                                      pca_model=pca_model,
                                      fill_in=True)
    elif r == "tag":
        converter = RCNLPTagConverter(resize=-1,
                                      pca_model=pca_model,
                                      fill_in=True)
    elif r == "fw":
        converter = RCNLPFuncWordConverter(resize=-1,
                                           pca_model=pca_model,
                                           fill_in=True)
    else:
        # Fallback: word vectors (built without the fill_in option).
        converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model)
    # end if

    return converter
Exemple #2
0
        # For each size
        for in_size in reps[r]:
            print("For representation %s size %d" % (r, in_size))
            if in_size != -1:
                # Pickle streams are binary data: open in 'rb' mode and close
                # the handle as soon as the model is loaded.
                # NOTE(review): pickle.load must only be used on trusted files.
                with open("models/pca_" + r + "_" + str(in_size) + ".p", 'rb') as model_file:
                    pca_model = pickle.load(model_file)
                # end with
            else:
                # A size of -1 means "no PCA reduction"
                pca_model = None
            # end if

            # >> 1. Choose a text to symbol converter.
            if r == "pos":
                converter = RCNLPPosConverter(resize=-1, pca_model=pca_model)
            elif r == "tag":
                converter = RCNLPTagConverter(resize=-1, pca_model=pca_model)
            elif r == "fw":
                converter = RCNLPFuncWordConverter(resize=-1, pca_model=pca_model)
            elif r == "letter":
                converter = LetterConverter(resize=-1, pca_model=pca_model)
            else:
                # Any other representation name falls back to word vectors
                converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model)
            # end if

            # >> 3. Array for results
            average_success_rate = np.array([])

            # For each samples
            for s in range(0, args.samples):
                print("#")
                # >> 5. Prepare training and test set.
                # Training indexes: a window of at most args.training_size
                # consecutive values taken from 0..99, starting at s.
                training_set_indexes = np.arange(0, 100, 1)[s:s+args.training_size]
                # Test indexes: the first args.test_size values of 0..99 that
                # are not part of the training window.
                test_set_indexes = np.delete(np.arange(0, 100, 1), training_set_indexes, axis=0)[:args.test_size]
Exemple #3
0
    # PCA model
    pca_model = None
    if args.pca_model != "":
        # Pickle streams are binary data: open in 'rb' mode and close the
        # handle as soon as the model is loaded.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open(args.pca_model, 'rb') as model_file:
            pca_model = pickle.load(model_file)
        # end with
    # end if

    # >> 1. Choose a text to symbol converter.
    if args.converter == "pos":
        converter = RCNLPPosConverter(resize=args.in_components,
                                      pca_model=pca_model)
    elif args.converter == "tag":
        converter = RCNLPTagConverter(resize=args.in_components,
                                      pca_model=pca_model)
    elif args.converter == "fw":
        converter = RCNLPFuncWordConverter(resize=args.in_components,
                                           pca_model=pca_model)
    else:
        # Any other converter name falls back to word vectors
        converter = RCNLPWordVectorConverter(resize=args.in_components,
                                             pca_model=pca_model)
    # end if

    # >> 3. Array for results
    doc_success_rate_avg = np.array([])
    sen_success_rate_avg = np.array([])
    doc_success_rate_std = np.array([])
    sen_success_rate_std = np.array([])

    # Training set sizes: 1, 1 + step, ... strictly below 96
    training_set_sizes = np.arange(1, 96, args.step)

    # For each training size
Exemple #4
0
    logging = RCNLPLogging(exp_name=ex_name,
                           exp_inst=ex_instance,
                           exp_value=RCNLPLogging.generate_experience_name(
                               locals()))
    logging.save_globals()
    logging.save_variables(locals())

    # Read the input text a single time and close the file right away; the
    # same content is fed to every converter below instead of re-opening
    # (and leaking) the file once per representation.
    with io.open(args.text, 'r') as text_file:
        text = text_file.read()
    # end with

    # Reduce POS
    pca_reduction(RCNLPPosConverter()(text),
                  title="POS",
                  ncomponents=args.poscomponents)

    # Reduce Tags
    pca_reduction(RCNLPTagConverter()(text),
                  title="Tags",
                  ncomponents=args.tagcomponents)

    # Reduce Word vectors
    pca_reduction(RCNLPWordVectorConverter()(text),
                  title="Word vectors",
                  ncomponents=args.wvcomponents)

    # Reduce FW
    pca_reduction(RCNLPFuncWordConverter()(text),
                  title="Function words",
                  ncomponents=args.fwcomponents)

    # Open logging dir
    logging.open_dir()

# end if
Exemple #5
0
def clustering_states(args,
                      texts1,
                      texts2,
                      ex_name,
                      ex_instance,
                      size,
                      input_scaling,
                      leak_rate,
                      spectral_radius,
                      input_sparsity,
                      w_sparsity,
                      logging,
                      save_graph=False,
                      pca_model=None,
                      flow=None):
    """
    Cluster the reservoir states of two authors' texts into two groups.

    The texts of both authors are converted, fed to a reservoir and the
    resulting states are split into two clusters with k-means; the cluster
    assignments are then scored with ``get_v_measure_score``.

    :param args: Parsed options; reads ``converter``, ``in_components``,
        ``show_states``, ``homogene``, ``out_components`` and ``pca_images``
        (and is forwarded to ``generate_documents_states``).
    :param texts1: Texts of the first author.
    :param texts2: Texts of the second author.
    :param ex_name: Experiment name, forwarded to the plotting helper.
    :param ex_instance: Experiment instance, forwarded to the plotting helper.
    :param size: Reservoir size (used only when ``flow`` is None).
    :param input_scaling: Reservoir input scaling (only when ``flow`` is None).
    :param leak_rate: Reservoir leak rate (only when ``flow`` is None).
    :param spectral_radius: Reservoir spectral radius (only when ``flow`` is None).
    :param input_sparsity: Reservoir input sparsity (only when ``flow`` is None).
    :param w_sparsity: Reservoir weight sparsity (only when ``flow`` is None).
    :param logging: Logger used to store results and graphs.
    :param save_graph: When true, plots of the states are generated.
    :param pca_model: Optional PCA model handed to the converter.
    :param flow: Existing reservoir flow; a new one is created when None.
    :return: The value returned by ``get_v_measure_score``.
    """

    def plot_states(title, states):
        # Single place for the plotting options shared by every state graph.
        generate_plot(logging, ex_name, ex_instance, title, "Time", "Neurons",
                      states, transpose=True, cmap='Greys')
    # end def

    # >> 1. Pick the converter class by name; unknown names fall back to
    # word vectors.
    known_converters = {
        "pos": RCNLPPosConverter,
        "tag": RCNLPTagConverter,
        "fw": RCNLPFuncWordConverter,
    }
    converter_class = known_converters.get(args.converter,
                                           RCNLPWordVectorConverter)
    converter = converter_class(resize=args.in_components,
                                pca_model=pca_model)

    # >> 2. Build an echo state network unless one was supplied
    if flow is None:
        flow = create_reservoir(converter.get_n_inputs(), size, input_scaling,
                                leak_rate, spectral_radius, input_sparsity,
                                w_sparsity)
    # end if

    # >> 3. Temporal representations, first author
    a1_states, a1_index, a1_n_samples = generate_documents_states(
        converter, flow, texts1, args)
    if save_graph:
        plot_states("Temporal representations for Author 1",
                    a1_states[:args.show_states])
    # end if

    # Temporal representations, second author
    a2_states, a2_index, a2_n_samples = generate_documents_states(
        converter, flow, texts2, args)
    if save_graph:
        plot_states("Temporal representations for Author 2",
                    a2_states[:args.show_states])
    # end if

    # >> 4. All states of both authors, before any truncation
    complete_states = np.vstack((a1_states, a2_states))
    if save_graph:
        plot_states("Complete joined Reservoir states", complete_states)
    # end if

    # Global statistics of the activations
    logging.save_results("Average neural activations",
                         np.average(complete_states))
    logging.save_results("Std dev of neural activations",
                         np.std(complete_states))

    # Optionally keep the same number of states for both authors
    if args.homogene:
        common_length = min(a1_states.shape[0], a2_states.shape[0])
        a1_states = a1_states[:common_length]
        a2_states = a2_states[:common_length]
    # end if

    # >> 5. States used to compute the centroids
    join_states = np.vstack((a1_states, a2_states))
    if save_graph:
        plot_states("Joined Reservoir states", join_states)
    # end if

    # >> 6. Clustering, optionally in a PCA-reduced space
    if args.out_components != -1:
        pca = PCA(n_components=args.out_components)
        pca.fit(join_states)

        # Images of consecutive pairs of principal components
        if args.pca_images and save_graph:
            a1_states_pca = pca.transform(a1_states)
            a2_states_pca = pca.transform(a2_states)
            for component in np.arange(0, 8):
                save_pca_image(logging, a1_states_pca, a2_states_pca,
                               component, component + 1)
            # end for
        # end if

        project = pca.transform
    else:
        # No reduction requested: cluster the raw states
        def project(states):
            return states
        # end def
    # end if

    # Centroids come from the (possibly truncated) joined states ...
    centroids, _ = kmeans(project(join_states), 2)

    # ... while every state of both authors is assigned to a cluster
    idx, _ = vq(project(complete_states), centroids)
    a1_idx = idx[:a1_n_samples]
    a2_idx = idx[a1_n_samples:]

    # Score the cluster assignments against the document indexes
    return get_v_measure_score(a1_idx, a2_idx, a1_index, a2_index)
Exemple #6
0
if __name__ == "__main__":
    # Script entry point: convert a text file to its function-word
    # representation, display it and build the ESN input data set.

    # Argument parser
    parser = argparse.ArgumentParser(
        description=
        "RCNLP - Authorship attribution with Part-Of-Speech to Echo State Network"
    )

    # Argument
    parser.add_argument("--file", type=str, help="Input text file")
    parser.add_argument("--lang",
                        type=str,
                        help="Language (ar, en, es, pt)",
                        default='en')
    parser.add_argument("--sample-size",
                        type=int,
                        help="Word vector sample size",
                        default=300)
    args = parser.parse_args()

    # Convert the text to Temporal Vector Representation
    converter = RCNLPFuncWordConverter()

    # Read the input inside a context manager so the file handle is closed
    # as soon as its content has been consumed.
    with io.open(args.file, 'r') as text_file:
        text = text_file.read()
    # end with
    doc_array = converter(text)

    # Display the Temporal Vector Representation
    RCNLPFuncWordConverter.display_representations(doc_array)

    # Transform the TVR to ESN learning input
    data_set = RCNLPFuncWordConverter.generate_data_set_inputs(doc_array, 2, 0)

# end if
Exemple #7
0
    # Command-line options (the parser itself is created above this excerpt).
    parser.add_argument("--texts", type=str, help="Text directory.")
    parser.add_argument("--startup", type=int, help="Number of start-up states to remove.", default=20)
    parser.add_argument("--components", type=int, help="Number of principal component to reduce inputs.", required=True)
    parser.add_argument("--converter", type=str, help="The text converter to use (fw, pos, tag, wv).")
    parser.add_argument("--lang", type=str, help="Language model", default='en')
    parser.add_argument("--samples", type=int, help="Number of authors to take", default=20)
    parser.add_argument("--output", type=str, help="Output model file", default='pca_output.p')
    args = parser.parse_args()

    # >> 1. Convert the text to symbolic or continuous representations
    # "letter" is accepted here although the --converter help text does not
    # list it; any unrecognised value (including a missing option) falls back
    # to the word-vector converter.
    if args.converter == "pos":
        converter = RCNLPPosConverter()
    elif args.converter == "tag":
        converter = RCNLPTagConverter()
    elif args.converter == "fw":
        converter = RCNLPFuncWordConverter()
    elif args.converter == "letter":
        converter = LetterConverter()
    else:
        converter = RCNLPWordVectorConverter()
    # end if

    # Get texts
    for i in np.arange(0, args.samples):
        # Choose authors and text
        # Each iteration draws independently, so the same author can be picked
        # more than once.
        # NOTE(review): randint's upper bound is exclusive, so ids range over
        # 1..49 — confirm that author 50 is meant to be left out.
        authors_id = np.random.randint(1, 50)
        # Directory of the drawn author: <args.texts>/<authors_id>
        texts = os.path.join(args.texts, str(authors_id))

        # Generate states for first author
        print("Transforming texts from author %s to symbols" % texts)
        for index, text_file in enumerate(os.listdir(texts)):