def retrieval_demo(): """Function intended to illustrate retrieval in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Retrieval' print 'Graph type: Dependency' print 'Centrality: PageRank' print print '> Reading data..' desc_path = '../data/air/problem_descriptions_dependencies' sol_path = '../data/air/solutions_preprocessed' problems, _ = data.read_files(desc_path) solutions, _ = data.read_files(sol_path) print '> Creating solution representations..' metric = freq_representation.FrequencyMetrics.TF_IDF sol_vectors = freq_representation.text_to_vector(solutions, metric) print '> Creating problem description representations..' dicts = [] for i, doc in enumerate(problems): print ' ',str(i)+'/'+str(len(problems)) g = graph_representation.construct_dependency_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.PAGERANK) dicts.append(d) desc_vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors) print ' score:', score print
def classification_comparison_freq(dataset='reuters'): print '> Reading data..', dataset training_path = '../data/' + dataset + '/training_preprocessed' training_docs, training_labels = data.read_files(training_path) test_path = '../data/' + dataset + '/test_preprocessed' test_docs, test_labels = data.read_files(test_path) results = {} for metric in freq_representation.get_metrics(): print ' ', metric, training_dicts = freq_representation.text_to_dict( training_docs, metric) test_dicts = freq_representation.text_to_dict(test_docs, metric) print ' dicst -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors( training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) reps = {'training': training_rep, 'test': test_rep} labels = {'training': training_labels, 'test': test_labels} score = evaluation.evaluate_classification(reps, labels, mode='split') results[metric] = score print score pp.pprint(results) s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str( results) + '\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def test_retrieval(orders=[1,2,3],order_weights=[1.0,1.53,1.51]): """ Test retrieval using different combinations of higher orders and weightings of these. The list *orders* define which higher order relations to include. The relative importance of the orders are defined by *order_weights*. """ print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) filenames = data.get_file_names(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] for i, text in enumerate(description_texts): print ' '+str(i)+"/"+str(len(description_texts)) g = graph_representation.construct_cooccurrence_network(text, orders=orders, order_weights=order_weights, doc_id='output/higher_order/air/'+labels[i]+'/'+filenames[i]) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_retrieval(rep, solution_vectors) print 'orders:', orders print 'score:', score fname = 'output/higher_order/results/retr' with open(fname, 'a+') as f: s = reduce(lambda x,y:str(x)+str(y), orders) f.write(str(s)+' '+str(score)+'\n') return score
def retrieval_demo(): """Function intended to illustrate retrieval in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Retrieval' print 'Graph type: Dependency' print 'Centrality: PageRank' print print '> Reading data..' desc_path = '../data/air/problem_descriptions_dependencies' sol_path = '../data/air/solutions_preprocessed' problems, _ = data.read_files(desc_path) solutions, _ = data.read_files(sol_path) print '> Creating solution representations..' metric = freq_representation.FrequencyMetrics.TF_IDF sol_vectors = freq_representation.text_to_vector(solutions, metric) print '> Creating problem description representations..' dicts = [] for i, doc in enumerate(problems): print ' ', str(i) + '/' + str(len(problems)) g = graph_representation.construct_dependency_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.PAGERANK) dicts.append(d) desc_vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors) print ' score:', score print
def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector( solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i % 1 == 0: print ' document', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_cooccurrence_network( text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res') return results
def classification_comparison_freq(dataset='reuters'): print '> Reading data..', dataset training_path = '../data/'+dataset+'/training_preprocessed' training_docs, training_labels = data.read_files(training_path) test_path = '../data/'+dataset+'/test_preprocessed' test_docs, test_labels = data.read_files(test_path) results = {} for metric in freq_representation.get_metrics(): print ' ', metric, training_dicts = freq_representation.text_to_dict(training_docs, metric) test_dicts = freq_representation.text_to_dict(test_docs, metric) print ' dicst -> vectors' keys = set() for d in training_dicts + test_dicts: keys = keys.union(d.keys()) print ' vocabulary size:', len(keys) training_rep = graph_representation.dicts_to_vectors(training_dicts, keys) test_rep = graph_representation.dicts_to_vectors(test_dicts, keys) reps = {'training':training_rep, 'test':test_rep} labels = {'training':training_labels, 'test':test_labels} score = evaluation.evaluate_classification(reps, labels, mode='split') results[metric] = score print score pp.pprint(results) s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/classification') return results
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False): """ Experiment used for comparative evaluation of different network representations on retrieval. graph_type = 'co-occurrence' | 'dependency' `icc` determines whether to use _inverse corpus centrality_ in the vector representations. """ def make_dicts(docs, icc=None): rep = [] for i, doc in enumerate(docs): if i%100==0: print ' graph',str(i)+'/'+str(len(docs)) g = gfuns[graph_type](doc) d = graph_representation.graph_to_dict(g, metrics[graph_type], icc) rep.append(d) return rep postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'} gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network, 'dependency':graph_representation.construct_dependency_network} metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE, 'dependency':graph.GraphMetrics.EIGENVECTOR} print '--', graph_type print '> Reading data..', dataset path = '../data/'+dataset+'/problem_descriptions'+postfix[graph_type] docs, labels = data.read_files(path) print '> Creating solution representations..' solutions_path = '../data/'+dataset+'/solutions_preprocessed' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) icc = None if use_icc: print '> Calculating ICC..' m = metrics[graph_type].split()[0] print graph_type if graph_type == 'co-occurrence': p = 'output/centralities/co-occurrence/'+dataset+'/problem_descriptions/window/'+m+'.cent' elif graph_type == 'dependency': p = 'output/centralities/dependency/'+dataset+'/problem_descriptions/'+m+'.cent' print ' fetching', p icc = data.pickle_from_file(p) print ' icc:', type(icc) print '> Creating problem description representations..' dicts = make_dicts(docs, icc) descriptions_rep = graph_representation.dicts_to_vectors(dicts)#, remove_stop_words=True) print '> Evaluating..' 
results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep) print results s = 'retrieval comparison ' if use_icc: s += 'USING TC-ICC' s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/retrieval') return results
def edge_direction_evaluation(direction): """ Evaluate impact of using different edge directions on dependency networks. Values for *direction*: ``forward``, ``backward``, and ``undirected``. """ results = {'_edge-direction':direction} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' rep = [] for i, text in enumerate(texts): if i%100==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_dependency_network(text, direction=direction) metric = graph.GraphMetrics.CLOSENESS d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' score:', score results['classification'] = score print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating representations..' rep = [] for i, text in enumerate(description_texts): if i%100==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, direction=direction) metric = graph.GraphMetrics.EIGENVECTOR d = graph_representation.graph_to_dict(g, metric) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' 
score = evaluation.evaluate_retrieval(rep, solution_vectors) print ' score:', score results['retrieval'] = score data.pickle_to_file(results, 'output/dependencies/stop_words_retr_'+direction) pp.pprint(results) return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted': weighted, '_evaluation': 'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i % 100 == 0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file( rep, 'output/dependencies/exp1_retr_tmp_' + str(i) + '_' + postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix) pp.pprint(results) return results
def print_common_hub_words(rem_stop_words): """ Print a list of the most common hub words in the created networks. Purpose of experiment was to show that hub words typically are stop words. The *rem_stop_words* determine whether stop words are removed before creating the networks. """ results = {'_removing stop-words': rem_stop_words} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' fd = nltk.probability.FreqDist() for i, text in enumerate(texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_dependency_network( text, remove_stop_words=rem_stop_words) hubs = graph.get_hubs(g, 10) for h in hubs: fd.inc(h[0]) g = None # just to make sure.. results['tasa'] = fd.keys() print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) print '> Creating representations..' fd = nltk.probability.FreqDist() for i, text in enumerate(description_texts): if i % 100 == 0: print ' ', str(i) + '/' + str(len(description_texts)) g = graph_representation.construct_dependency_network( text, remove_stop_words=rem_stop_words) hubs = graph.get_hubs(g, 10) for h in hubs: fd.inc(h[0]) g = None # just to make sure.. results['air'] = fd.keys() if rem_stop_words: modifier = 'without' else: modifier = 'with' data.pickle_to_file( results, 'output/dependencies/common_hubs_' + modifier + 'stop_words') pp.pprint(results) return results
def evaluate_tc_icc_retrieval(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'air/problem_descriptions' context = 'window' solutions_path = '../data/air/solutions_preprocessed' path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating solution representations..' solutions_texts, labels = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Creating problem description representations..' for i, text in enumerate(description_texts): if i%1==0: print ' document',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_cooccurrence_network(text, already_preprocessed=True, context='window') for metric in graph_metrics: if not icc[metric]: continue #~ print ' ',metric d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solutions_rep) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/retrieval.res') return results
def centrality_weights_retrieval(weighted=True): """ Evaluate whether edge weights are beneficial to the depdendency network represenation for the retrieval task. """ results = {'_is_weighted':weighted, '_evaluation':'retrieval'} graph_metrics = graph_representation.get_metrics(weighted) print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating graph representations..' for i, text in enumerate(description_texts): if i%10==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, weighted=weighted) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. if i%100==0: if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(rep, 'output/dependencies/exp1_retr_tmp_'+str(i)+'_'+postfix) print '> Creating vector representations..' for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric] = score if weighted: postfix = '_weighted' else: postfix = '_unweighted' data.pickle_to_file(results, 'output/dependencies/exp1_retr'+postfix) pp.pprint(results) return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i % 10 == 0: print i g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_' + str(window_size)) pp.pprint(results) return results
def print_common_hub_words(rem_stop_words): """ Print a list of the most common hub words in the created networks. Purpose of experiment was to show that hub words typically are stop words. The *rem_stop_words* determine whether stop words are removed before creating the networks. """ results = {'_removing stop-words':rem_stop_words} print '------ CLASSIFICATION EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/tasa/TASA900_dependencies' texts, labels = data.read_files(descriptions_path) print '> Creating representations..' fd = nltk.probability.FreqDist() for i, text in enumerate(texts): if i%100==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words) hubs = graph.get_hubs(g, 10) for h in hubs: fd.inc(h[0]) g = None # just to make sure.. results['tasa'] = fd.keys() print '------ RETRIEVAL EVALUATION --------' print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_dependencies' description_texts, labels = data.read_files(descriptions_path) print '> Creating representations..' fd = nltk.probability.FreqDist() for i, text in enumerate(description_texts): if i%100==0: print ' ',str(i)+'/'+str(len(description_texts)) g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words) hubs = graph.get_hubs(g, 10) for h in hubs: fd.inc(h[0]) g = None # just to make sure.. results['air'] = fd.keys() if rem_stop_words: modifier = 'without' else: modifier = 'with' data.pickle_to_file(results, 'output/dependencies/common_hubs_'+modifier+'stop_words') pp.pprint(results) return results
def do_retrieval_experiments( descriptions='air/problem_descriptions', solutions='air/solutions', graph_types=['co-occurrence', 'dependency', 'random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on the retrieval task. Toggle comparison with frequency-based methods using *use_frequency*. """ results = { '_solutions': solutions, '_descriptions': descriptions, '_evaluation': 'retrieval' } print '> Evaluation type: retrieval' print '> Reading cases..' descriptions_path = '../data/' + descriptions descriptiondata = data.read_data(descriptions_path, graph_types) solutions_path = '../data/' + solutions + '_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector( solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' for gtype in graph_types: print ' ', gtype docs, labels = descriptiondata[gtype] graphs = graph_representation.create_graphs(docs, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_retrieval( vectors, solution_vectors) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric docs, labels = data.read_files(descriptions_path + '_preprocessed') vectors = freq_representation.text_to_vector(docs, metric) results['freq'][metric] = evaluation.evaluate_retrieval( vectors, solution_vectors) print pp.pprint(results) return results
def do_context_size_evaluation_retrieval(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the retrieval task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' descriptions_path = '../data/air/problem_descriptions_preprocessed' description_texts, labels = data.read_files(descriptions_path) solutions_path = '../data/air/solutions_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for i, text in enumerate(description_texts): if i%10==0: print i g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_retrieval(vectors, solution_vectors) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/retr_context_'+str(window_size)) pp.pprint(results) return results
def corpus_dependency_properties(dataset = 'air/problem_descriptions'): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset+'_dependencies' (documents, labels) = data.read_files(corpus_path) props = {} giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i%10==0: print ' ',str(i)+'/'+str(len(documents)) g = graph_representation.construct_dependency_network(text,remove_stop_words=True) giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k,v in p.iteritems(): if i==0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: props_total[key+'_mean'] = numpy.mean(props[key]) props_total[key+'_std'] = numpy.std(props[key]) data.pickle_to_file(giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words') data.pickle_to_file(props, 'output/properties/dependency/docs_air_all_no_stop_words') data.pickle_to_file(props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
def print_degree_distributions(dataset, context): """ Extracts degree distribution values from networks, and print them to cvs-file. **warning** overwrites if file exists. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset+'_text' (documents, labels) = data.read_files(corpus_path) degsfile = open('output/properties/cooccurrence/degrees_docs_'+dataset.replace('/','.'), 'w') giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i%10==0: print ' ',str(i)+'/'+str(len(documents)) g = graph_representation.construct_cooccurrence_network(text,context=context) giant.add_edges_from(g.edges()) degs = nx.degree(g).values() degs = [str(d) for d in degs] degsfile.write(','.join(degs)+'\n') degsfile.close() print '> Writing giant\'s distribution' with open('output/properties/cooccurrence/degrees_giant_'+dataset.replace('/','.'), 'w') as f: ds = nx.degree(giant).values() ds = [str(d) for d in ds] f.write(','.join(ds))
def corpus_properties(dataset, context): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset+'_text' (documents, labels) = data.read_files(corpus_path) props = {} #~ giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i%10==0: print ' ',str(i)+'/'+str(len(documents)) g = graph_representation.construct_cooccurrence_network(text,context=context) #~ giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k,v in p.iteritems(): if i==0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: print ' ',key props_total[key+'_mean'] = numpy.mean(props[key]) props_total[key+'_std'] = numpy.std(props[key]) data_name = dataset.replace('/','.') #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name) data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name) data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
def do_context_sentence_evaluation_classification(): """ Experiment evaluating performance of sentences as contexts for co-occurrence networks in the classification task. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) print '> Evaluating..' graphs = [] results = {} for text in texts: g = graph_representation.construct_cooccurrence_network(text, context='sentence') graphs.append(g) for metric in graph_representation.get_metrics(): print ' ', metric vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True) score = evaluation.evaluate_classification(vectors, labels) results[metric+' (sentence)'] = score data.pickle_to_file(results, 'output/class_context_sentence') pp.pprint(results) return results
def word_vectors(num_words=20000, maxlen=200):
    """
    Build padded index sequences for the 'train' and 'test' splits.

    Both splits are loaded with read_files. A Tokenizer is fitted on the
    training texts only and then applied to both splits; the resulting
    sequences are padded with sequence.pad_sequences.

    num_words -- forwarded to Tokenizer(num_words=...)
    maxlen    -- forwarded to pad_sequences(maxlen=...)

    The defaults reproduce the previously hard-coded values.

    Returns (x_train, y_train, x_test, y_test), where the y values are the
    `.values` of the 'label' column of each split.
    """
    train = read_files('train')
    test = read_files('test')

    # The vocabulary is fitted on training text only.
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train['text'])
    train_seq = tokenizer.texts_to_sequences(train['text'])
    test_seq = tokenizer.texts_to_sequences(test['text'])

    # Original note: shape (25000, 200) for both splits with the default
    # maxlen -- the row count depends on the data, TODO confirm.
    x_train = sequence.pad_sequences(train_seq, maxlen=maxlen)
    y_train = train['label']
    x_test = sequence.pad_sequences(test_seq, maxlen=maxlen)
    y_test = test['label']
    return x_train, y_train.values, x_test, y_test.values
def solution_similarity_stats(dataset='air/solutions_preprocessed'): """ Plots histogram of solution-solution similarity distribution of a dataset. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset (documents, labels) = data.read_files(corpus_path) print '> Creating vector representations..' vectors = freq_representation.text_to_vector( documents, freq_representation.FrequencyMetrics.TF_IDF) print '> Calculating similarities..' distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine') diag = numpy.diag([2.0] * len(distances), 0) # move similarities of "self" to -1 distances = distances + diag similarities = 1.0 - distances similarities = similarities.ravel() similarities = [s for s in similarities if s >= 0] print plotter.histogram(similarities, 'similarity', '# matches', '', bins=150) print print max(similarities) print min(similarities) print float(sum(similarities)) / len(similarities) num = len([sim for sim in similarities if sim < 0.23]) print 'fraction sims < .23:', float(num) / len(similarities)
def test_scale_free(): import random import data import graph_representation import plfit import numpy corpus_path = '../data/air/problem_descriptions_text' (documents, labels) = data.read_files(corpus_path) g = graph_representation.construct_cooccurrence_network(documents[0],context='sentence') degree_sequence=sorted(nx.degree(g).values(),reverse=True) # degree sequence dmax=max(degree_sequence) degree_sequence = numpy.array(degree_sequence) print degree_sequence pl = plfit.plfit(degree_sequence) p,ksv = pl.test_pl() print print print print seq = [random.randrange(0,100) for i in range(len(degree_sequence))] degree_sequence = numpy.array(seq) print degree_sequence pl = plfit.plfit(degree_sequence) p,ksv = pl.test_pl() print print print print
def test_scale_free(): import random import data import graph_representation import plfit import numpy corpus_path = '../data/air/problem_descriptions_text' (documents, labels) = data.read_files(corpus_path) g = graph_representation.construct_cooccurrence_network(documents[0], context='sentence') degree_sequence = sorted(nx.degree(g).values(), reverse=True) # degree sequence dmax = max(degree_sequence) degree_sequence = numpy.array(degree_sequence) print degree_sequence pl = plfit.plfit(degree_sequence) p, ksv = pl.test_pl() print print print print seq = [random.randrange(0, 100) for i in range(len(degree_sequence))] degree_sequence = numpy.array(seq) print degree_sequence pl = plfit.plfit(degree_sequence) p, ksv = pl.test_pl() print print print print
def dataset_stats(dataset): """ Print and plot statistics for a given dataset. A histogram is plotted with the document length distribution of the data. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset (documents, labels) = data.read_files(corpus_path) file_names = data.get_file_names(corpus_path) lengths = [] empty = 0 for i, d in enumerate(documents): d = preprocess.tokenize_tokens(d) lengths.append(len(d)) if len(d) == 0: print file_names[i], 'is empty' empty += 1 lengths = numpy.array(lengths) print '# documents:', len(documents) print '# empty documents:', empty print '# words:', sum(lengths) print 'length avg:', lengths.mean() print 'length stddev:', lengths.std() print print 'document lengths (sorted):', sorted(lengths) plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
def dataset_stats(dataset): """ Print and plot statistics for a given dataset. A histogram is plotted with the document length distribution of the data. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset (documents, labels) = data.read_files(corpus_path) file_names = data.get_file_names(corpus_path) lengths = [] empty = 0 for i,d in enumerate(documents): d = preprocess.tokenize_tokens(d) lengths.append(len(d)) if len(d)==0: print file_names[i],'is empty' empty += 1 lengths = numpy.array(lengths) print '# documents:',len(documents) print '# empty documents:',empty print '# words:',sum(lengths) print 'length avg:',lengths.mean() print 'length stddev:',lengths.std() print print 'document lengths (sorted):',sorted(lengths) plotter.histogram(lengths,'# tokens','# documents','',bins=80)
def print_degree_distributions(dataset, context): """ Extracts degree distribution values from networks, and print them to cvs-file. **warning** overwrites if file exists. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset + '_text' (documents, labels) = data.read_files(corpus_path) degsfile = open( 'output/properties/cooccurrence/degrees_docs_' + dataset.replace('/', '.'), 'w') giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i % 10 == 0: print ' ', str(i) + '/' + str(len(documents)) g = graph_representation.construct_cooccurrence_network( text, context=context) giant.add_edges_from(g.edges()) degs = nx.degree(g).values() degs = [str(d) for d in degs] degsfile.write(','.join(degs) + '\n') degsfile.close() print '> Writing giant\'s distribution' with open( 'output/properties/cooccurrence/degrees_giant_' + dataset.replace('/', '.'), 'w') as f: ds = nx.degree(giant).values() ds = [str(d) for d in ds] f.write(','.join(ds))
def test_document_lengths(dataset='mir'): print '> Reading data..', dataset path = '../data/' + dataset + '/problem_descriptions_preprocessed' docs, _ = data.read_files(path) names = data.get_file_names(path) print "PROBLEM DESCRIPTIONS" for i, d in enumerate(docs): if not d: print names[i], "is empty" path = '../data/' + dataset + '/solutions_preprocessed' docs, _ = data.read_files(path) names = data.get_file_names(path) print "SOLUTIONS" for i, d in enumerate(docs): if not d: print names[i], "is empty"
def do_context_sentence_evaluation_classification(): """ Experiment evaluating performance of sentences as contexts for co-occurrence networks in the classification task. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) print '> Evaluating..' graphs = [] results = {} for text in texts: g = graph_representation.construct_cooccurrence_network( text, context='sentence') graphs.append(g) for metric in graph_representation.get_metrics(): print ' ', metric vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True) score = evaluation.evaluate_classification(vectors, labels) results[metric + ' (sentence)'] = score data.pickle_to_file(results, 'output/class_context_sentence') pp.pprint(results) return results
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]): """ Test classification using different combinations of higher orders and weightings of these. The list *orders* define which higher order relations to include. The relative importance of the orders are defined by *order_weights*. """ print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) filenames = data.get_file_names(path) print '> Creating representations..' rep = [] for i, text in enumerate(texts): print ' '+str(i)+"/"+str(len(texts)) g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i]) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print 'orders:', orders print 'score:', score fname = 'output/higher_order/results/class' with open(fname, 'a+') as f: s = reduce(lambda x,y:str(x)+str(y), orders) f.write(str(s)+' '+str(score)+'\n') return score
def classification_demo(): """Function intended to illustrate classification in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Classification' print 'Graph type: Co-occurrence w/2-word window context' print 'Centrality: Weighted degree' print print '> Reading data..' corpus_path = '../data/tasa/TASA900_preprocessed' docs, labels = data.read_files(corpus_path) print '> Creating representations..' dicts = [] for i, doc in enumerate(docs): print ' ', str(i) + '/' + str(len(docs)) g = graph_representation.construct_cooccurrence_network(doc) d = graph_representation.graph_to_dict( g, graph.GraphMetrics.WEIGHTED_DEGREE) dicts.append(d) vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_classification(vectors, labels) print ' score:', score print
def solution_similarity_stats(dataset='air/solutions_preprocessed'): """ Plots histogram of solution-solution similarity distribution of a dataset. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset (documents, labels) = data.read_files(corpus_path) print '> Creating vector representations..' vectors = freq_representation.text_to_vector(documents, freq_representation.FrequencyMetrics.TF_IDF) print '> Calculating similarities..' distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine') diag = numpy.diag([2.0]*len(distances),0) # move similarities of "self" to -1 distances = distances + diag similarities = 1.0 - distances similarities = similarities.ravel() similarities = [s for s in similarities if s >= 0] print plotter.histogram(similarities,'similarity','# matches','',bins=150) print print max(similarities) print min(similarities) print float(sum(similarities))/len(similarities) num = len([sim for sim in similarities if sim < 0.23]) print 'fraction sims < .23:', float(num)/len(similarities)
def test_document_lengths(dataset='mir'): print '> Reading data..', dataset path = '../data/'+dataset+'/problem_descriptions_preprocessed' docs, _ = data.read_files(path) names = data.get_file_names(path) print "PROBLEM DESCRIPTIONS" for i, d in enumerate(docs): if not d: print names[i], "is empty" path = '../data/'+dataset+'/solutions_preprocessed' docs, _ = data.read_files(path) names = data.get_file_names(path) print "SOLUTIONS" for i, d in enumerate(docs): if not d: print names[i], "is empty"
def classification_demo(): """Function intended to illustrate classification in the experimental framework. Intended as a basis for new experiments for those not intimately familiar with the code. """ print 'Evaluation type: Classification' print 'Graph type: Co-occurrence w/2-word window context' print 'Centrality: Weighted degree' print print '> Reading data..' corpus_path = '../data/tasa/TASA900_preprocessed' docs, labels = data.read_files(corpus_path) print '> Creating representations..' dicts = [] for i, doc in enumerate(docs): print ' ',str(i)+'/'+str(len(docs)) g = graph_representation.construct_cooccurrence_network(doc) d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) dicts.append(d) vectors = graph_representation.dicts_to_vectors(dicts) print '> Evaluating..' score = evaluation.evaluate_classification(vectors, labels) print ' score:', score print
def do_retrieval_experiments(descriptions='air/problem_descriptions', solutions='air/solutions', graph_types=['co-occurrence','dependency','random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on the retrieval task. Toggle comparison with frequency-based methods using *use_frequency*. """ results = {'_solutions':solutions, '_descriptions':descriptions, '_evaluation':'retrieval'} print '> Evaluation type: retrieval' print '> Reading cases..' descriptions_path = '../data/'+descriptions descriptiondata = data.read_data(descriptions_path, graph_types) solutions_path = '../data/'+solutions+'_preprocessed' solution_texts, labels = data.read_files(solutions_path) solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' for gtype in graph_types: print ' ',gtype docs, labels = descriptiondata[gtype] graphs = graph_representation.create_graphs(docs, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric docs, labels = data.read_files(descriptions_path+'_preprocessed') vectors = freq_representation.text_to_vector(docs, metric) results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors) print pp.pprint(results) return results
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/' + corpus + '_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i % 10 == 0: print ' ', str(i) + '/' + str(len(texts)) g = graph_representation.construct_cooccurrence_network( text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file( results, 'output/tc_icc/cooccurrence/' + corpus + '/classification.res') return results
def test(): import data dataset = 'test/freq' corpus_path = '../data/'+dataset (documents, labels) = data.read_files(corpus_path) vectors = {} print '> Building vector representations..' for metric in get_metrics(): print ' ', metric vectors[metric] = text_to_vector(documents, metric)
def test(): import data dataset = 'test/freq' corpus_path = '../data/' + dataset (documents, labels) = data.read_files(corpus_path) vectors = {} print '> Building vector representations..' for metric in get_metrics(): print ' ', metric vectors[metric] = text_to_vector(documents, metric)
def store_corpus_network(corpus, context): print '> Constructing corpus network for', corpus path = '../data/'+corpus+'_text' store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping' return texts, labels = data.read_files(path) gdoc = ' '.join(texts) giant = graph_representation.construct_cooccurrence_network(gdoc, context=context, already_preprocessed=False, verbose=True) print '> Serializing and saving..' data.pickle_to_file(giant, store_path)
def vectorization(vector_type):
    """
    Vectorize the 'train' and 'test' splits with the requested vectorizer.

    *vector_type* selects the vectorizer: 'CountVectorizer' (uni- and
    bigrams, min_df=3, max_df=0.9, max_features=2000), 'TfidfVectorizer'
    (default settings), or anything else for a HashingVectorizer with 200
    features. The vectorizer is fitted on the training texts only.

    Returns the tuple (x_train, y_train, x_test, y_test), the labels being
    the ``.values`` of the 'label' column.
    """
    train_set = read_files('train')
    test_set = read_files('test')

    if vector_type == 'TfidfVectorizer':
        vec = TfidfVectorizer()
    elif vector_type == 'CountVectorizer':
        vec = CountVectorizer(ngram_range=(1, 2), min_df=3,
                              max_df=0.9, max_features=2000)
    else:
        vec = HashingVectorizer(n_features=200)

    vec.fit(train_set['text'])
    x_train = vec.transform(train_set['text'])
    y_train = train_set['label'].values
    x_test = vec.transform(test_set['text'])
    y_test = test_set['label'].values
    return x_train, y_train, x_test, y_test
def evaluate_tc_icc_classification(): graph_metrics = graph_representation.get_metrics(True, exclude_flow=True) print '> Reading cases..' corpus = 'tasa/TASA900' #~ corpus = 'tasa/TASATest2' context = 'sentence' path = '../data/'+corpus+'_text' texts, labels = data.read_files(path) rep = {} icc = {} print '> Calculating ICCs..' for metric in graph_metrics: print ' ', metric rep[metric] = [] centralities = retrieve_centralities(corpus, context, metric) if centralities: icc[metric] = graph_representation.calculate_icc_dict(centralities) else: icc[metric] = None print '> Creating graph representations..' for i, text in enumerate(texts): if i%10==0: print ' ',str(i)+'/'+str(len(texts)) g = graph_representation.construct_cooccurrence_network(text, context=context) for metric in graph_metrics: print ' ', metric if not icc[metric]: continue d = graph_representation.graph_to_dict(g, metric, icc[metric]) rep[metric].append(d) g = None # just to make sure.. print '> Creating vector representations..' for metric in graph_metrics: if not icc[metric]: continue rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' results = {} for metric in graph_metrics: if not icc[metric]: results[metric] = None continue vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric] = score pp.pprint(results) data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res') return results
def retrieval_comparison_freq(dataset='mir'): print '> Reading data..', dataset path = '../data/'+dataset+'/problem_descriptions_preprocessed' docs, _ = data.read_files(path) print '> Creating solution representations..' solutions_path = '../data/'+dataset+'/solutions_preprocessed' solutions_docs, _ = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector(solutions_docs, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' results = {} for metric in freq_representation.get_metrics(): print ' ', metric, descriptions_rep = freq_representation.text_to_vector(docs, metric) score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep) results[metric] = score print score pp.pprint(results) s = 'retrieval comparison \nrepresentation: frequency\ndataset:'+dataset+' \nresult:\n'+str(results)+'\n\n\n' data.write_to_file(s, 'output/comparison/retrieval') return results
def store_corpus_network(corpus, context): print '> Constructing corpus network for', corpus path = '../data/' + corpus + '_text' store_path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping' return texts, labels = data.read_files(path) gdoc = ' '.join(texts) giant = graph_representation.construct_cooccurrence_network( gdoc, context=context, already_preprocessed=False, verbose=True) print '> Serializing and saving..' data.pickle_to_file(giant, store_path)
def retrieval_comparison_freq(dataset='mir'): print '> Reading data..', dataset path = '../data/' + dataset + '/problem_descriptions_preprocessed' docs, _ = data.read_files(path) print '> Creating solution representations..' solutions_path = '../data/' + dataset + '/solutions_preprocessed' solutions_docs, _ = data.read_files(solutions_path) solutions_rep = freq_representation.text_to_vector( solutions_docs, freq_representation.FrequencyMetrics.TF_IDF) print '> Evaluating..' results = {} for metric in freq_representation.get_metrics(): print ' ', metric, descriptions_rep = freq_representation.text_to_vector(docs, metric) score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep) results[metric] = score print score pp.pprint(results) s = 'retrieval comparison \nrepresentation: frequency\ndataset:' + dataset + ' \nresult:\n' + str( results) + '\n\n\n' data.write_to_file(s, 'output/comparison/retrieval') return results
def freq_classification(dataset='tasa/TASA900'): results = {'_dataset':dataset, '_evaluation':'classification'} corpus_path = '../data/'+dataset results['results'] = {} for metric in freq_representation.get_metrics(): print metric documents, labels = data.read_files(corpus_path+'_preprocessed') vectors = freq_representation.text_to_vector(documents, metric) r = evaluation.evaluate_classification(vectors, labels, mode='cross-validation') results['results'][metric] = r print ' ', r print pp.pprint(results) return results
def test_dependency_graph(): (docs, labels) = data.read_files('../data/tasa/TASA900_dependencies') graphs = [] for i, text in enumerate(docs): print i graphs.append(construct_dependency_network(text)) g = graphs[0] print g.nodes() print g.edges() print '#graphs:', len(graphs) pos = nx.spring_layout(g) graph.draw_with_centrality(g, layout=pos)
def test_vocabulary_size(path = '../data/air/problem_descriptions_preprocessed'): """ Print vocabulary sizes for documents in dataset. """ texts, labels = data.read_files(path) lengths = [] for text in texts: text = text.split(' ') l = len(list(set(text))) lengths.append(l) print ' ',l lengths = numpy.array(lengths) print 'avg', lengths.mean() print 'max', lengths.max() print 'min', lengths.min()
def freq_classification(dataset='tasa/TASA900'): results = {'_dataset': dataset, '_evaluation': 'classification'} corpus_path = '../data/' + dataset results['results'] = {} for metric in freq_representation.get_metrics(): print metric documents, labels = data.read_files(corpus_path + '_preprocessed') vectors = freq_representation.text_to_vector(documents, metric) r = evaluation.evaluate_classification(vectors, labels, mode='cross-validation') results['results'][metric] = r print ' ', r print pp.pprint(results) return results
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1, 11) + [20, 40, 80]: print '-- window size:', window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network( text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_' + str(window_size)) pp.pprint(results) return results
def store_corpus_network(corpus): print '> Constructing corpus network for', corpus path = '../data/'+corpus+'_dependencies' store_path = 'output/giants/dependency/'+corpus+'/graph.net' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping' return texts, labels = data.read_files(path) gdeps = {} for i, text in enumerate(texts): if i%1==0: print ' ',str(i)+'/'+str(len(texts)) d = pickle.loads(text) for dep in d.keys(): gdeps[dep] = gdeps.get(dep, []) + d[dep] giant = graph_representation.construct_dependency_network(gdeps,verbose=True,unpickle=False) print '> Serializing and saving..' data.pickle_to_file(giant, store_path)
def do_context_size_evaluation_classification(): """ Experiment evaluating performance of different context sizes for co-occurrence networks in the classification task. """ results = {} graph_metrics = graph_representation.get_metrics() for metric in graph_metrics: results[metric] = [] print '> Reading cases..' path = '../data/tasa/TASA900_preprocessed' texts, labels = data.read_files(path) for window_size in range(1,11)+[20,40,80]: print '-- window size:',window_size rep = {} for metric in graph_metrics: rep[metric] = [] print '> Creating representations..' # creating graphs and finding centralities for text in texts: g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True) for metric in graph_metrics: d = graph_representation.graph_to_dict(g, metric) rep[metric].append(d) g = None # just to make sure.. # creating representation vectors for metric in graph_metrics: rep[metric] = graph_representation.dicts_to_vectors(rep[metric]) print '> Evaluating..' for metric in graph_metrics: vectors = rep[metric] score = evaluation.evaluate_classification(vectors, labels) print ' ', metric, score results[metric].append(score) data.pickle_to_file(results, 'output/class_context_'+str(window_size)) pp.pprint(results) return results
def test_best_classification(): print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) rep = [] print '> Creating representations..' for i, text in enumerate(texts): if i%100==0: print ' ',i g = graph_representation.construct_cooccurrence_network(text, context='sentence') d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' ', score
def store_corpus_network(corpus): print '> Constructing corpus network for', corpus path = '../data/' + corpus + '_dependencies' store_path = 'output/giants/dependency/' + corpus + '/graph.net' if data.pickle_from_file(store_path, suppress_warning=True): print ' already present, skipping' return texts, labels = data.read_files(path) gdeps = {} for i, text in enumerate(texts): if i % 1 == 0: print ' ', str(i) + '/' + str(len(texts)) d = pickle.loads(text) for dep in d.keys(): gdeps[dep] = gdeps.get(dep, []) + d[dep] giant = graph_representation.construct_dependency_network(gdeps, verbose=True, unpickle=False) print '> Serializing and saving..' data.pickle_to_file(giant, store_path)
def test_best_classification(): print '> Reading cases..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) rep = [] print '> Creating representations..' for i, text in enumerate(texts): if i % 100 == 0: print ' ', i g = graph_representation.construct_cooccurrence_network( text, context='sentence') d = graph_representation.graph_to_dict( g, graph.GraphMetrics.WEIGHTED_DEGREE) rep.append(d) g = None # just to make sure.. rep = graph_representation.dicts_to_vectors(rep) print '> Evaluating..' score = evaluation.evaluate_classification(rep, labels) print ' ', score
def do_classification_experiments( dataset='tasa/TASA900', graph_types=['co-occurrence', 'dependency', 'random'], use_frequency=True): """ Experiment used for comparative evaluation of different network representations on classification. Toggle comparison with frequency-based methods using *use_frequency*. """ results = {'_dataset': dataset, '_evaluation': 'classification'} print '> Evaluation type: classification' print '> Reading data..', dataset corpus_path = '../data/' + dataset docdata = data.read_data(corpus_path, graph_types) print '> Evaluating..' for gtype in graph_types: print ' ', gtype documents, labels = docdata[gtype] graphs = graph_representation.create_graphs(documents, gtype) results[gtype] = {} for metric in graph_representation.get_metrics(): print ' -', metric vectors = graph_representation.graphs_to_vectors(graphs, metric) results[gtype][metric] = evaluation.evaluate_classification( vectors, labels) if use_frequency: print ' frequency' results['freq'] = {} for metric in freq_representation.get_metrics(): print ' -', metric documents, labels = data.read_files(corpus_path + '_preprocessed') vectors = freq_representation.text_to_vector(documents, metric) results['freq'][metric] = evaluation.evaluate_classification( vectors, labels) print pp.pprint(results) return results
def corpus_dependency_properties(dataset='air/problem_descriptions'): """ Identify and pickle to file various properties of the given dataset. These can alter be converted to pretty tables using :func:`~experiments.print_network_props`. """ print '> Reading data..', dataset corpus_path = '../data/' + dataset + '_dependencies' (documents, labels) = data.read_files(corpus_path) props = {} giant = nx.DiGraph() print '> Building networks..' for i, text in enumerate(documents): if i % 10 == 0: print ' ', str(i) + '/' + str(len(documents)) g = graph_representation.construct_dependency_network( text, remove_stop_words=True) giant.add_edges_from(g.edges()) p = graph.network_properties(g) for k, v in p.iteritems(): if i == 0: props[k] = [] props[k].append(v) g = None # just to make sure.. print '> Calculating means and deviations..' props_total = {} for key in props: props_total[key + '_mean'] = numpy.mean(props[key]) props_total[key + '_std'] = numpy.std(props[key]) data.pickle_to_file( giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words') data.pickle_to_file( props, 'output/properties/dependency/docs_air_all_no_stop_words') data.pickle_to_file( props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')