# Module-level imports required by the functions below.
import numpy

# Project-local modules.
import data
import preprocess
import plotter
import graph
import graph_representation
import freq_representation
import evaluation


def test_document_lengths(dataset='mir'):
    """List any documents in a dataset that are empty after preprocessing."""
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print 'PROBLEM DESCRIPTIONS'
    for i, d in enumerate(docs):
        if not d:
            print names[i], 'is empty'
    path = '../data/' + dataset + '/solutions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print 'SOLUTIONS'
    for i, d in enumerate(docs):
        if not d:
            print names[i], 'is empty'


def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.

    A histogram of the document length distribution is plotted.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    documents, labels = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
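
# The `plotter` module is not shown in this file. Below is a minimal sketch of
# what the assumed histogram helper might look like, built on matplotlib; the
# signature (values, x-label, y-label, title, bins) is inferred from the call
# in dataset_stats above. The name `_histogram_sketch` is hypothetical.
def _histogram_sketch(values, xlabel, ylabel, title, bins=50):
    import matplotlib.pyplot as plt
    plt.hist(values, bins=bins)   # bucket the values into `bins` intervals
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()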
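
# construct_cooccurrence_network, used by the test functions below, is defined
# elsewhere in the project. Purely as an illustration of the idea named in the
# docstrings, one common way to realize weighted higher-order relations is to
# combine matrix powers of the first-order adjacency matrix:
# W = sum_k order_weights[k] * A^orders[k]. This sketch is an assumption, not
# necessarily what the project's function actually does.
def _combine_orders_sketch(A, orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    W = numpy.zeros_like(A, dtype=float)
    for k, w in zip(orders, order_weights):
        # A^k links nodes reachable in k steps; weight each order separately.
        W += w * numpy.linalg.matrix_power(A, k)
    return W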
def test_classification(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test classification using different combinations of higher orders
    and weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is given by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print ' ' + str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence', orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/tasa/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    # Append the score to file, keyed by the order combination (e.g. '123').
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        key = ''.join(str(o) for o in orders)
        f.write(key + ' ' + str(score) + '\n')
    return score
def test_retrieval(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test retrieval using different combinations of higher orders
    and weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is given by *order_weights*.
    """
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)
    filenames = data.get_file_names(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    # Discard the solutions' labels so the descriptions' labels, used for
    # doc_id below, are not overwritten; the two corpora are parallel.
    solution_texts, _ = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        print ' ' + str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(
            text, orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/air/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print 'orders:', orders
    print 'score:', score
    # Append the score to file, keyed by the order combination (e.g. '123').
    fname = 'output/higher_order/results/retr'
    with open(fname, 'a+') as f:
        key = ''.join(str(o) for o in orders)
        f.write(key + ' ' + str(score) + '\n')
    return score
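
# Hypothetical driver showing how these experiments might be run end to end.
# The order/weight combinations below are illustrative only, not results or
# settings from the original study.
if __name__ == '__main__':
    test_document_lengths('mir')
    dataset_stats('tasa/TASA900_text')
    for combo, weights in [([1], [1.0]),
                           ([1, 2], [1.0, 1.53]),
                           ([1, 2, 3], [1.0, 1.53, 1.51])]:
        test_classification(orders=combo, order_weights=weights)
        test_retrieval(orders=combo, order_weights=weights)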