def create_converters(r, in_size):
    """
    Build the text-to-symbol converter for a representation.

    :param r: representation name ("pos", "tag", "fw"; anything else falls
        back to word vectors)
    :param in_size: input size; -1 means no PCA reduction, otherwise the
        pre-trained PCA model "models/pca_<r>_<in_size>.p" is loaded
    :return: the converter instance
    """
    if in_size != -1:
        # Pickle data is binary: open in 'rb' (text mode breaks under
        # Python 3) and close the handle deterministically with a
        # context manager instead of leaking it.
        with open("models/pca_" + r + "_" + str(in_size) + ".p", 'rb') as pca_file:
            pca_model = pickle.load(pca_file)
    else:
        pca_model = None
    # end if

    # >> 1. Choose a text to symbol converter.
    if r == "pos":
        converter = RCNLPPosConverter(resize=-1, pca_model=pca_model, fill_in=True)
    elif r == "tag":
        converter = RCNLPTagConverter(resize=-1, pca_model=pca_model, fill_in=True)
    elif r == "fw":
        converter = RCNLPFuncWordConverter(resize=-1, pca_model=pca_model, fill_in=True)
    else:
        converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model)
    # end if

    return converter
# end create_converters
def generate_reservoir_states(the_flow, filename, remove_startup=0):
    """
    Compute reservoir states for one text file.

    :param the_flow: reservoir flow; a callable mapping an input array to
        an array of reservoir states
    :param filename: path of the text file to read and convert
    :param remove_startup: number of leading (start-up) states to discard
    :return: reservoir states with the first ``remove_startup`` rows removed
    """
    # Convert the text to Temporal Vector Representation. Use a context
    # manager so the file handle is closed deterministically (the original
    # io.open(...).read() leaked it until GC).
    converter = RCNLPPosConverter()
    with io.open(filename, 'r') as text_file:
        doc_array = converter(text_file.read())

    # Generate the reservoir state, dropping the transient start-up states.
    states = the_flow(doc_array)[remove_startup:]
    return states
# end generate_reservoir_states
reps['letter'] = [-1] # For each representations for r in reps.keys(): # For each size for in_size in reps[r]: print("For representation %s size %d" % (r, in_size)) if in_size != -1: pca_model = pickle.load(open("models/pca_" + r + "_" + str(in_size) + ".p", 'r')) else: pca_model = None # end if # >> 1. Choose a text to symbol converter. if r == "pos": converter = RCNLPPosConverter(resize=-1, pca_model=pca_model) elif r == "tag": converter = RCNLPTagConverter(resize=-1, pca_model=pca_model) elif r == "fw": converter = RCNLPFuncWordConverter(resize=-1, pca_model=pca_model) elif r == "letter": converter = LetterConverter(resize=-1, pca_model=pca_model) else: converter = RCNLPWordVectorConverter(resize=-1, pca_model=pca_model) # end if # >> 3. Array for results average_success_rate = np.array([]) # For each samples for s in range(0, args.samples):
# Experiment logging: record globals and locals so the run configuration
# is saved alongside the results.
logging = RCNLPLogging(exp_name=ex_name,
                       exp_inst=ex_instance,
                       exp_value=RCNLPLogging.generate_experience_name(locals()))
logging.save_globals()
logging.save_variables(locals())

# PCA model (optional). Pickle data is binary, so open in 'rb' (text mode
# breaks under Python 3) and close the handle with a context manager
# instead of leaking it.
pca_model = None
if args.pca_model != "":
    with open(args.pca_model, 'rb') as pca_file:
        pca_model = pickle.load(pca_file)
# end if

# >> 1. Choose a text to symbol converter.
if args.converter == "pos":
    converter = RCNLPPosConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "tag":
    converter = RCNLPTagConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "fw":
    converter = RCNLPFuncWordConverter(resize=args.in_components, pca_model=pca_model)
else:
    converter = RCNLPWordVectorConverter(resize=args.in_components, pca_model=pca_model)
# end if

# >> 3. Array for results
doc_success_rate_avg = np.array([])
sen_success_rate_avg = np.array([])
doc_success_rate_std = np.array([])
import io
import argparse

from core.converters.RCNLPPosConverter import RCNLPPosConverter

if __name__ == "__main__":
    # Argument parser
    parser = argparse.ArgumentParser(
        description=
        "RCNLP - Authorship attribution with Part-Of-Speech to Echo State Network"
    )

    # Argument
    parser.add_argument("--file", type=str, help="Input text file")
    parser.add_argument("--lang", type=str, help="Language (ar, en, es, pt)", default='en')
    args = parser.parse_args()

    # Convert the input text to its POS representation. A context manager
    # closes the file handle (the original open().read() leaked it).
    converter = RCNLPPosConverter()
    with io.open(args.file, 'r') as input_file:
        doc_array = converter(input_file.read())

    # Inspect the representation.
    print(doc_array)
    print(doc_array.shape)
    print(doc_array[0])
    RCNLPPosConverter.display_representations(doc_array)

    # Build an (inputs, outputs) data set from the representation.
    data_set = RCNLPPosConverter.generate_data_set_inputs(doc_array, 2, 0)
    print(data_set)
# end if
parser.add_argument("--lang", type=str, help="Language (ar, en, es, pt)", default='en')
args = parser.parse_args()

# Logging
logging = RCNLPLogging(exp_name=ex_name,
                       exp_inst=ex_instance,
                       exp_value=RCNLPLogging.generate_experience_name(locals()))
logging.save_globals()
logging.save_variables(locals())

# NOTE(review): each io.open(...) handle below is never closed — relies on
# GC; consider with-blocks when this script is next touched.

# Reduce POS
pca_reduction(RCNLPPosConverter()(io.open(args.text, 'r').read()),
              title="POS",
              ncomponents=args.poscomponents)

# Reduce Tags
pca_reduction(RCNLPTagConverter()(io.open(args.text, 'r').read()),
              title="Tags",
              ncomponents=args.tagcomponents)

# Reduce Word vectors
pca_reduction(RCNLPWordVectorConverter()(io.open(args.text, 'r').read()),
              title="Word vectors",
              ncomponents=args.wvcomponents)

# Reduce FW
pca_reduction(RCNLPFuncWordConverter()(io.open(args.text, 'r').read()),
def clustering_states(args, texts1, texts2, ex_name, ex_instance, size,
                      input_scaling, leak_rate, spectral_radius,
                      input_sparsity, w_sparsity, logging, save_graph=False,
                      pca_model=None, flow=None):
    """
    Cluster the reservoir states of two authors' texts and score the result.

    Converts both text sets to input representations, feeds them through an
    echo state network, clusters the resulting states with 2-means
    (optionally after a PCA reduction), and returns the V-measure of the
    author assignment.

    :param args: parsed command-line arguments (reads args.converter,
        args.in_components, args.show_states, args.homogene,
        args.out_components, args.pca_images)
    :param texts1: texts of the first author
    :param texts2: texts of the second author
    :param ex_name: experiment name, used for plot titles/logging
    :param ex_instance: experiment instance, used for plot titles/logging
    :param size: reservoir size
    :param input_scaling: reservoir input scaling
    :param leak_rate: reservoir leak rate
    :param spectral_radius: reservoir spectral radius
    :param input_sparsity: reservoir input sparsity
    :param w_sparsity: reservoir weight sparsity
    :param logging: experiment logger used to save plots and results
    :param save_graph: if True, save state plots through the logger
    :param pca_model: optional pre-trained PCA model for the converter
    :param flow: optional pre-built reservoir; built here when None
    :return: V-measure score of the two-cluster author assignment
    """
    # >> 1. Convert the text to symbolic or continuous representations
    if args.converter == "pos":
        converter = RCNLPPosConverter(resize=args.in_components, pca_model=pca_model)
    elif args.converter == "tag":
        converter = RCNLPTagConverter(resize=args.in_components, pca_model=pca_model)
    elif args.converter == "fw":
        converter = RCNLPFuncWordConverter(resize=args.in_components, pca_model=pca_model)
    else:
        converter = RCNLPWordVectorConverter(resize=args.in_components, pca_model=pca_model)
    # end if

    # >> 2. Create an echo state network (only when one was not supplied)
    if flow is None:
        flow = create_reservoir(converter.get_n_inputs(), size, input_scaling,
                                leak_rate, spectral_radius, input_sparsity,
                                w_sparsity)
    # end if

    # >> 3. Generate Temporal Representations
    # Generate "temporal representations" for first author
    a1_states, a1_index, a1_n_samples = generate_documents_states(
        converter, flow, texts1, args)
    if save_graph:
        generate_plot(logging, ex_name, ex_instance,
                      "Temporal representations for Author 1", "Time",
                      "Neurons", a1_states[:args.show_states],
                      transpose=True, cmap='Greys')
    # end if

    # Generate "temporal representations" for second author
    a2_states, a2_index, a2_n_samples = generate_documents_states(
        converter, flow, texts2, args)
    if save_graph:
        generate_plot(logging, ex_name, ex_instance,
                      "Temporal representations for Author 2", "Time",
                      "Neurons", a2_states[:args.show_states],
                      transpose=True, cmap='Greys')
    # end if

    # >> 4. Complete states.
    # complete_states always keeps every sample of both authors; it is the
    # set that gets assigned to clusters below.
    complete_states = np.vstack((a1_states, a2_states))
    if save_graph:
        generate_plot(logging, ex_name, ex_instance,
                      "Complete joined Reservoir states", "Time", "Neurons",
                      complete_states, transpose=True, cmap='Greys')
    # end if

    # Get average and std dev
    logging.save_results("Average neural activations",
                         np.average(complete_states))
    logging.save_results("Std dev of neural activations",
                         np.std(complete_states))

    # Same size for each authors in needed
    # Truncate the larger author's states so both contribute equally.
    if args.homogene:
        if a1_states.shape[0] > a2_states.shape[0]:
            a1_states = a1_states[:a2_states.shape[0]]
        elif a2_states.shape[0] > a1_states.shape[0]:
            a2_states = a2_states[:a1_states.shape[0]]
        # end if
    # end if

    # >> 5. Join states.
    # join_states may be smaller than complete_states when homogene
    # truncation applied; it is used only to FIT PCA/k-means.
    join_states = np.vstack((a1_states, a2_states))
    if save_graph:
        generate_plot(logging, ex_name, ex_instance,
                      "Joined Reservoir states", "Time", "Neurons",
                      join_states, transpose=True, cmap='Greys')
    # end if

    # >> 6. Clustering
    if args.out_components != -1:
        # PCA: reduce state dimensionality before clustering
        pca = PCA(n_components=args.out_components)
        pca.fit(join_states)

        # Generate PCA image of principal components
        if args.pca_images and save_graph:
            # Generate PCA
            a1_states_pca = pca.transform(a1_states)
            a2_states_pca = pca.transform(a2_states)
            # Plot each pair of consecutive components (0-1, 1-2, ..., 7-8).
            for c in np.arange(0, 8):
                save_pca_image(logging, a1_states_pca, a2_states_pca, c, c + 1)
            # end for
        # end if

        # Reduce whole states
        join_states_reduced = pca.transform(join_states)

        # Get centroids for the whole components
        centroids, _ = kmeans(join_states_reduced, 2)

        # Assign each sample to a cluster (in PCA space), then split the
        # assignments back into the two authors by sample count.
        idx, _ = vq(pca.transform(complete_states), centroids)
        a1_idx = idx[:a1_n_samples]
        a2_idx = idx[a1_n_samples:]
    else:
        # Get centroids for the whole components
        centroids, _ = kmeans(join_states, 2)

        # Assign each sample to a cluster
        idx, _ = vq(complete_states, centroids)
        a1_idx = idx[:a1_n_samples]
        a2_idx = idx[a1_n_samples:]
    # end if

    # Compute average precision
    return get_v_measure_score(a1_idx, a2_idx, a1_index, a2_index)
# Record the run configuration for reproducibility.
logging.save_globals()
logging.save_variables(locals())

# PCA model (optional). Pickle data is binary, so open in 'rb' (text mode
# breaks under Python 3) and close the handle with a context manager
# instead of leaking it.
pca_model = None
if args.pca_model != "":
    with open(args.pca_model, 'rb') as pca_file:
        pca_model = pickle.load(pca_file)
# end if

# Results
state_results = np.array([])
doc_results = np.array([])

# >> 1. Convert the text to symbolic or continuous representations
if args.converter == "pos":
    converter = RCNLPPosConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "tag":
    converter = RCNLPTagConverter(resize=args.in_components, pca_model=pca_model)
elif args.converter == "fw":
    converter = RCNLPFuncWordConverter(resize=args.in_components, pca_model=pca_model)
else:
    converter = RCNLPWordVectorConverter(resize=args.in_components, pca_model=pca_model)
# end if

# Best score observed across the reservoir instances tried below.
max_score = 0.0

# Iterate over reservoirs
# Argument parser parser = argparse.ArgumentParser(description="RCNLP - Create PCA model of symbolic representations.") # Argument parser.add_argument("--texts", type=str, help="Text directory.") parser.add_argument("--startup", type=int, help="Number of start-up states to remove.", default=20) parser.add_argument("--components", type=int, help="Number of principal component to reduce inputs.", required=True) parser.add_argument("--converter", type=str, help="The text converter to use (fw, pos, tag, wv).") parser.add_argument("--lang", type=str, help="Language model", default='en') parser.add_argument("--samples", type=int, help="Number of authors to take", default=20) parser.add_argument("--output", type=str, help="Output model file", default='pca_output.p') args = parser.parse_args() # >> 1. Convert the text to symbolic or continuous representations if args.converter == "pos": converter = RCNLPPosConverter() elif args.converter == "tag": converter = RCNLPTagConverter() elif args.converter == "fw": converter = RCNLPFuncWordConverter() elif args.converter == "letter": converter = LetterConverter() else: converter = RCNLPWordVectorConverter() # end if # Get texts for i in np.arange(0, args.samples): # Choose authors and text authors_id = np.random.randint(1, 50) texts = os.path.join(args.texts, str(authors_id))