Code Example #1
File: experiments.py  Project: himanshusapra9/TextNet
import data  # project-local TextNet module

def test_document_lengths(dataset='mir'):
    """Report which preprocessed documents in *dataset* are empty."""
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print 'PROBLEM DESCRIPTIONS'
    for i, d in enumerate(docs):
        if not d:
            print names[i], 'is empty'
    path = '../data/' + dataset + '/solutions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print 'SOLUTIONS'
    for i, d in enumerate(docs):
        if not d:
            print names[i], 'is empty'
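A hypothetical invocation, as a minimal sketch: 'mir' is the function's default, and 'air' is taken from the paths in the retrieval example further down; the relative ../data layout is assumed.

test_document_lengths('mir')   # default dataset
test_document_lengths('air')   # dataset used by test_retrieval below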
Code Example #2
File: experiments.py  Project: himanshusapra9/TextNet
import numpy

import data        # project-local TextNet modules
import plotter
import preprocess

def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.

    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    documents, labels = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
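As a usage sketch: the exact subdirectory argument is an assumption pieced together from the '../data/' + dataset path handling above and the directory names in the other examples.

dataset_stats('air/problem_descriptions_preprocessed')
dataset_stats('air/solutions_preprocessed')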
Code Example #3
import data                  # project-local TextNet modules
import evaluation
import graph
import graph_representation

def test_classification(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test classification using different combinations of higher orders and weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is defined by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print '    ' + str(i) + '/' + str(len(texts))
        # Build a co-occurrence network per document and reduce it to a
        # {term: weighted degree} feature dictionary.
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence', orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/tasa/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        s = ''.join(str(o) for o in orders)  # e.g. [1, 2, 3] -> '123'
        f.write(s + ' ' + str(score) + '\n')
    return score
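Since the function both returns and logs its score, a small driver can compare order settings directly. A minimal sketch, not part of experiments.py; it assumes *order_weights* pairs positionally with *orders*, which construct_cooccurrence_network (not shown) would have to support.

# Hypothetical comparison: first-order relations only versus first- plus
# second-order relations, using weights from the default signature.
base = test_classification(orders=[1], order_weights=[1.0])
extended = test_classification(orders=[1, 2], order_weights=[1.0, 1.53])
print 'gain from second-order relations:', extended - base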
Code Example #4
import data                  # project-local TextNet modules
import evaluation
import freq_representation
import graph
import graph_representation

def test_retrieval(orders=[1, 2, 3], order_weights=[1.0, 1.53, 1.51]):
    """
    Test retrieval using different combinations of higher orders and weightings of these.

    The list *orders* defines which higher-order relations to include.
    The relative importance of the orders is defined by *order_weights*.
    """
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)
    filenames = data.get_file_names(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    # NOTE: this overwrites *labels*; both directories are expected to
    # assign the same label to each file.
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        print '    ' + str(i) + '/' + str(len(description_texts))
        # Build a co-occurrence network per description and reduce it to a
        # {term: weighted degree} feature dictionary.
        g = graph_representation.construct_cooccurrence_network(
            text, orders=orders, order_weights=order_weights,
            doc_id='output/higher_order/air/' + labels[i] + '/' + filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/retr'
    with open(fname, 'a+') as f:
        s = ''.join(str(o) for o in orders)  # e.g. [1, 2, 3] -> '123'
        f.write(s + ' ' + str(score) + '\n')
    return score
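Both experiment functions append one 'orders score' line per run to their results files, so accumulated runs can be read back later. A minimal sketch, assuming the format written above and a score that serializes without spaces:

# Parse the lines appended by test_retrieval (e.g. '123 0.42').
with open('output/higher_order/results/retr') as f:
    for line in f:
        orders_str, score_str = line.split()
        print orders_str, '->', float(score_str)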