import pprint as pp

import data
import evaluation
import freq_representation
import graph_representation


def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print ' ', metric,
        # Represent each document as a term-weight dict under this metric.
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print ' dicts -> vectors'
        # Build a shared vocabulary over training and test documents.
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print ' vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = ('classification comparison \nrepresentation: frequency\nresult:\n'
         + str(results) + '\n\n\n')
    data.write_to_file(s, 'output/comparison/classification')
    return results
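# The following demo is a hypothetical usage sketch, not part of the original
# experiments. It assumes the preprocessed Reuters corpus sits under
# ../data/reuters/training_preprocessed and ../data/reuters/test_preprocessed,
# which is what the default arguments of classification_comparison_freq expect.
def _demo_classification_comparison_freq():
    # Run the comparison; the returned dict maps each frequency metric to
    # its classification score on the train/test split.
    scores = classification_comparison_freq(dataset='reuters')
    # Report the best-performing metric.
    best = max(scores, key=scores.get)
    print 'best frequency metric:', best, scores[best]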
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                             solutions='air/solutions',
                             graph_types=['co-occurrence', 'dependency', 'random'],
                             use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions': solutions,
               '_descriptions': descriptions,
               '_evaluation': 'retrieval'}
    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)
    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Evaluating..'
    for gtype in graph_types:
        print ' ', gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print ' -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    if use_frequency:
        print ' frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print ' -', metric
            docs, labels = data.read_files(descriptions_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    print
    pp.pprint(results)
    return results
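# Hypothetical usage sketch (an assumption, not from the original code): run
# the retrieval experiment on co-occurrence networks only, skipping the slower
# dependency parsing, with the frequency baselines enabled. Assumes the
# ../data/air corpora exist in the layout the defaults expect.
def _demo_retrieval_experiments():
    results = do_retrieval_experiments(graph_types=['co-occurrence'],
                                       use_frequency=True)
    # Results are nested: representation -> metric -> retrieval score.
    for rep in ['co-occurrence', 'freq']:
        for metric, score in results[rep].items():
            print rep, metric, score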
def freq_classification(dataset='tasa/TASA900'):
    """Evaluate each frequency metric on *dataset* using cross-validated classification."""
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    corpus_path = '../data/' + dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path + '_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors, labels,
                                               mode='cross-validation')
        results['results'][metric] = r
        print ' ', r
    print
    pp.pprint(results)
    return results
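# Hypothetical usage sketch for freq_classification, assuming the TASA900
# corpus is available as ../data/tasa/TASA900_preprocessed.
def _demo_freq_classification():
    results = freq_classification(dataset='tasa/TASA900')
    # Per-metric cross-validation scores live under the 'results' key.
    scores = results['results']
    best = max(scores, key=scores.get)
    print 'best frequency metric:', best, scores[best]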
def do_classification_experiments(dataset='tasa/TASA900',
                                  graph_types=['co-occurrence', 'dependency', 'random'],
                                  use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)
    print '> Evaluating..'
    for gtype in graph_types:
        print ' ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print ' -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)
    if use_frequency:
        print ' frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print ' -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)
    print
    pp.pprint(results)
    return results
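# Hypothetical usage sketch: classification over co-occurrence and random
# networks plus the frequency baselines. The demo name and the reduced
# graph_types list are assumptions for illustration; the data layout under
# ../data/tasa/TASA900 is the one the defaults expect.
def _demo_classification_experiments():
    results = do_classification_experiments(
        dataset='tasa/TASA900',
        graph_types=['co-occurrence', 'random'],
        use_frequency=True)
    # Nested results: representation -> metric -> classification score.
    for rep in ['co-occurrence', 'random', 'freq']:
        for metric, score in results[rep].items():
            print rep, metric, score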
def retrieval_comparison_freq(dataset='mir'):
    """Compare frequency metrics on the retrieval task for *dataset*."""
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print ' ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = ('retrieval comparison \nrepresentation: frequency\ndataset:' + dataset
         + ' \nresult:\n' + str(results) + '\n\n\n')
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
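# Hypothetical usage sketch for retrieval_comparison_freq on the 'mir'
# dataset; besides returning the per-metric scores, the function also appends
# them to output/comparison/retrieval.
def _demo_retrieval_comparison_freq():
    results = retrieval_comparison_freq(dataset='mir')
    for metric, score in sorted(results.items()):
        print metric, score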