Esempio n. 1
0
def retrieval_demo():
    """Run a small retrieval experiment as a worked example.

    Problem descriptions are turned into dependency networks and each
    network is reduced to a dictionary using the PageRank metric; the
    solutions are represented as TF-IDF vectors.  Both sets of vectors are
    handed to ``evaluation.evaluate_retrieval`` and the score is printed.

    Meant as a starting point for new experiments by readers who do not
    know the code base in detail.
    """
    for banner in ('Evaluation type: Retrieval',
                   'Graph type:      Dependency',
                   'Centrality:      PageRank',
                   ''):
        print(banner)

    print('> Reading data..')
    problems, _ = data.read_files(
        '../data/air/problem_descriptions_dependencies')
    solutions, _ = data.read_files('../data/air/solutions_preprocessed')

    print('> Creating solution representations..')
    sol_vectors = freq_representation.text_to_vector(
        solutions, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating problem description representations..')
    total = len(problems)
    centrality_dicts = []
    for index, document in enumerate(problems):
        # Progress is reported for every single document.
        print('    %d/%d' % (index, total))
        network = graph_representation.construct_dependency_network(document)
        centrality_dicts.append(
            graph_representation.graph_to_dict(
                network, graph.GraphMetrics.PAGERANK))
    desc_vectors = graph_representation.dicts_to_vectors(centrality_dicts)

    print('> Evaluating..')
    score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors)
    print('    score: %s' % (score,))
    print('')
Esempio n. 2
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(
            training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(
            training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Esempio n. 3
0
def test_retrieval(orders=None, order_weights=None):
    """
    Test retrieval using different combinations of higher orders and
    weightings of these.

    The list *orders* defines which higher order relations to include
    (``[1, 2, 3]`` when omitted).  The relative importance of the orders
    is defined by *order_weights* (``[1.0, 1.53, 1.51]`` when omitted).

    Problem descriptions are represented through the weighted degree of
    their co-occurrence networks, solutions through TF-IDF vectors.  The
    score from ``evaluation.evaluate_retrieval`` is appended, prefixed by
    the concatenated *orders*, to ``output/higher_order/results/retr`` and
    then returned.
    """
    # Build the defaults per call: the lists are handed to external code,
    # so a shared default object could leak changes between invocations.
    if orders is None:
        orders = [1, 2, 3]
    if order_weights is None:
        order_weights = [1.0, 1.53, 1.51]

    print('> Reading cases..')
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, description_labels = data.read_files(descriptions_path)
    filenames = data.get_file_names(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    # The solution labels are not needed here; they must not replace the
    # description labels, which are paired with `filenames` below.
    solution_texts, _ = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating representations..')
    rep = []
    total = len(description_texts)
    for i, text in enumerate(description_texts):
        print('    %d/%d' % (i, total))
        doc_id = ('output/higher_order/air/' + description_labels[i]
                  + '/' + filenames[i])
        g = graph_representation.construct_cooccurrence_network(
            text, orders=orders, order_weights=order_weights, doc_id=doc_id)
        d = graph_representation.graph_to_dict(
            g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)

    print('> Evaluating..')
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print('orders: %s' % (orders,))
    print('score: %s' % (score,))

    # join() also copes with an empty *orders*, where reduce() without an
    # initial value would raise only after all the work has been done.
    orders_tag = ''.join(str(order) for order in orders)
    fname = 'output/higher_order/results/retr'
    with open(fname, 'a') as f:
        f.write(orders_tag + ' ' + str(score) + '\n')
    return score
Esempio n. 4
0
def retrieval_demo():
    """Run a small retrieval experiment as a worked example.

    Problem descriptions are turned into dependency networks and each
    network is reduced to a dictionary using the PageRank metric; the
    solutions are represented as TF-IDF vectors.  Both sets of vectors are
    handed to ``evaluation.evaluate_retrieval`` and the score is printed.

    Meant as a starting point for new experiments by readers who do not
    know the code base in detail.
    """
    for banner in ('Evaluation type: Retrieval',
                   'Graph type:      Dependency',
                   'Centrality:      PageRank',
                   ''):
        print(banner)

    print('> Reading data..')
    problems, _ = data.read_files(
        '../data/air/problem_descriptions_dependencies')
    solutions, _ = data.read_files('../data/air/solutions_preprocessed')

    print('> Creating solution representations..')
    sol_vectors = freq_representation.text_to_vector(
        solutions, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating problem description representations..')
    total = len(problems)
    centrality_dicts = []
    for index, document in enumerate(problems):
        # Progress is reported for every single document.
        print('    %d/%d' % (index, total))
        network = graph_representation.construct_dependency_network(document)
        centrality_dicts.append(
            graph_representation.graph_to_dict(
                network, graph.GraphMetrics.PAGERANK))
    desc_vectors = graph_representation.dicts_to_vectors(centrality_dicts)

    print('> Evaluating..')
    score = evaluation.evaluate_retrieval(desc_vectors, sol_vectors)
    print('    score: %s' % (score,))
    print('')
def evaluate_tc_icc_retrieval():
    """
    Evaluate retrieval on the AIR problem descriptions using co-occurrence
    networks whose centralities are combined with an ICC dictionary.

    For every metric from ``graph_representation.get_metrics(True,
    exclude_flow=True)`` the stored centralities are fetched with
    ``retrieve_centralities`` and converted by
    ``graph_representation.calculate_icc_dict``.  Metrics without
    centralities (or with an empty ICC dictionary) are skipped and get
    ``None`` as their result.

    Solutions are represented as TF-IDF vectors.  The dict of results
    (metric -> retrieval score or ``None``) is pretty-printed, pickled to
    ``output/tc_icc/cooccurrence/air/problem_descriptions/retrieval.res``
    and returned.
    """
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print('> Reading cases..')
    corpus = 'air/problem_descriptions'
    context = 'window'
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_preprocessed')

    rep = {}
    icc = {}
    print('> Calculating ICCs..')
    for metric in graph_metrics:
        print('    %s' % (metric,))
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        icc[metric] = (
            graph_representation.calculate_icc_dict(centralities)
            if centralities else None)

    print('> Creating solution representations..')
    solutions_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating problem description representations..')
    total = len(description_texts)
    for index, text in enumerate(description_texts):
        print('    document %d/%d' % (index, total))
        network = graph_representation.construct_cooccurrence_network(
            text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if icc[metric]:
                rep[metric].append(
                    graph_representation.graph_to_dict(
                        network, metric, icc[metric]))
        network = None  # drop the reference before the next document

    print('> Creating vector representations..')
    for metric in graph_metrics:
        if icc[metric]:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print('> Evaluating..')
    results = {}
    for metric in graph_metrics:
        if icc[metric]:
            score = evaluation.evaluate_retrieval(rep[metric], solutions_rep)
            print('    %s %s' % (metric, score))
            results[metric] = score
        else:
            # No ICC available for this metric: record it as not evaluated.
            results[metric] = None

    pp.pprint(results)
    data.pickle_to_file(
        results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
Esempio n. 6
0
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicst -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training':training_rep, 'test':test_rep}
        labels = {'training':training_labels, 'test':test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Esempio n. 7
0
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    Co-occurrence networks are scored with weighted degree, dependency
    networks with eigenvector centrality.

    `use_icc` determines whether to use _inverse corpus centrality_ in the
    vector representations; when set, a pickled centrality file for the
    chosen graph type is loaded and passed on to
    ``graph_representation.graph_to_dict``.

    The result of ``evaluation.evaluate_retrieval`` is printed, written as
    a text summary through ``data.write_to_file`` (target
    ``output/comparison/retrieval``) and returned.
    """
    # Per-graph-type settings: data directory suffix, network constructor
    # and the centrality metric used for the representation.
    data_suffix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    builders = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network,
    }
    centrality_metric = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.EIGENVECTOR,
    }

    print('-- %s' % (graph_type,))
    print('> Reading data.. %s' % (dataset,))
    docs, _ = data.read_files(
        '../data/' + dataset + '/problem_descriptions' + data_suffix[graph_type])

    print('> Creating solution representations..')
    solutions_texts, _ = data.read_files(
        '../data/' + dataset + '/solutions_preprocessed')
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print('> Calculating ICC..')
        # Only the first whitespace-separated token of the metric name is
        # used in the centrality file name.
        metric_name = centrality_metric[graph_type].split()[0]
        print(graph_type)
        centrality_dir = {
            'co-occurrence': ('output/centralities/co-occurrence/' + dataset
                              + '/problem_descriptions/window/'),
            'dependency': ('output/centralities/dependency/' + dataset
                           + '/problem_descriptions/'),
        }[graph_type]
        icc_path = centrality_dir + metric_name + '.cent'
        print('    fetching %s' % (icc_path,))
        icc = data.pickle_from_file(icc_path)
        print('    icc: %s' % (type(icc),))

    print('> Creating problem description representations..')
    build_graph = builders[graph_type]
    metric = centrality_metric[graph_type]
    total = len(docs)
    dicts = []
    for index, doc in enumerate(docs):
        if index % 100 == 0:
            print('    graph %d/%d' % (index, total))
        dicts.append(
            graph_representation.graph_to_dict(build_graph(doc), metric, icc))
    descriptions_rep = graph_representation.dicts_to_vectors(dicts)

    print('> Evaluating..')
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print(results)

    summary = 'retrieval comparison '
    if use_icc:
        summary += 'USING TC-ICC'
    summary += ('\nrepresentation: ' + graph_type
                + '\nresult: ' + str(results) + '\n\n\n')
    data.write_to_file(summary, 'output/comparison/retrieval')
    return results
Esempio n. 8
0
def edge_direction_evaluation(direction):
    """
    Evaluate impact of using different edge directions on dependency networks.

    Values for *direction*: ``forward``, ``backward``, and ``undirected``.

    Two evaluations are run with the same *direction*: classification on
    the TASA900 dependency data using closeness centrality, and retrieval
    on the AIR problem descriptions using eigenvector centrality against
    TF-IDF solution vectors.  Both scores are stored in a dict together
    with the direction, pickled, pretty-printed and returned.
    """
    def centrality_vectors(documents, metric):
        # Build one dependency network per document, reduce it to a
        # dictionary with *metric*, and vectorise all dictionaries together.
        dicts = []
        for index, document in enumerate(documents):
            if index % 100 == 0:
                print('    %d/%d' % (index, len(documents)))
            network = graph_representation.construct_dependency_network(
                document, direction=direction)
            dicts.append(graph_representation.graph_to_dict(network, metric))
        return graph_representation.dicts_to_vectors(dicts)

    results = {'_edge-direction': direction}

    print('------ CLASSIFICATION EVALUATION --------')
    print('> Reading cases..')
    texts, labels = data.read_files('../data/tasa/TASA900_dependencies')

    print('> Creating representations..')
    class_vectors = centrality_vectors(texts, graph.GraphMetrics.CLOSENESS)

    print('> Evaluating..')
    class_score = evaluation.evaluate_classification(class_vectors, labels)
    print('   score: %s' % (class_score,))
    results['classification'] = class_score

    print('------ RETRIEVAL EVALUATION --------')
    print('> Reading cases..')
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_dependencies')
    solution_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating representations..')
    retr_vectors = centrality_vectors(
        description_texts, graph.GraphMetrics.EIGENVECTOR)

    print('> Evaluating..')
    retr_score = evaluation.evaluate_retrieval(retr_vectors, solution_vectors)
    print('   score: %s' % (retr_score,))
    results['retrieval'] = retr_score

    # NOTE(review): the file name mentions stop words although this
    # experiment varies the edge direction -- confirm it is intended.
    data.pickle_to_file(
        results, 'output/dependencies/stop_words_retr_' + direction)

    pp.pprint(results)
    return results
Esempio n. 9
0
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the dependency
    network representation for the retrieval task.

    Dependency networks are built for the AIR problem descriptions with
    ``weighted`` passed through to the network constructor.  Every metric
    from ``graph_representation.get_metrics(weighted)`` yields one
    representation, which is scored against TF-IDF solution vectors.

    Intermediate dictionaries are pickled every 100 documents; the final
    scores (plus the ``_is_weighted`` and ``_evaluation`` entries) are
    pickled, pretty-printed and returned.
    """
    results = {'_is_weighted': weighted, '_evaluation': 'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)
    postfix = '_weighted' if weighted else '_unweighted'

    print('> Reading cases..')
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_dependencies')

    solution_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = dict((metric, []) for metric in graph_metrics)

    print('> Creating graph representations..')
    total = len(description_texts)
    for index, text in enumerate(description_texts):
        if index % 10 == 0:
            print('    %d/%d' % (index, total))
        network = graph_representation.construct_dependency_network(
            text, weighted=weighted)
        for metric in graph_metrics:
            rep[metric].append(
                graph_representation.graph_to_dict(network, metric))
        network = None  # drop the reference before the next document
        if index % 100 == 0:
            # Periodic checkpoint of everything collected so far.
            checkpoint = ('output/dependencies/exp1_retr_tmp_' + str(index)
                          + '_' + postfix)
            data.pickle_to_file(rep, checkpoint)

    print('> Creating vector representations..')
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print('> Evaluating..')
    for metric in graph_metrics:
        score = evaluation.evaluate_retrieval(rep[metric], solution_vectors)
        print('    %s %s' % (metric, score))
        results[metric] = score

    data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix)

    pp.pprint(results)
    return results
Esempio n. 10
0
def print_common_hub_words(rem_stop_words):
    """
    Print a list of the most common hub words in the created networks.
    Purpose of experiment was to show that hub words typically are stop words.

    The *rem_stop_words* flag determines whether stop words are removed
    before creating the networks.

    For both the TASA900 and the AIR dependency data, the first element of
    each of the hubs returned by ``graph.get_hubs(g, 10)`` is counted in a
    frequency distribution.  The keys of the two distributions are stored
    under ``'tasa'`` and ``'air'``, pickled, pretty-printed and returned.
    """
    def hub_words(heading, corpus_path):
        # One pass over a corpus: count the hub words of every network.
        print(heading)
        print('> Reading cases..')
        texts, _ = data.read_files(corpus_path)

        print('> Creating representations..')
        counts = nltk.probability.FreqDist()
        for index, text in enumerate(texts):
            if index % 100 == 0:
                print('    %d/%d' % (index, len(texts)))
            network = graph_representation.construct_dependency_network(
                text, remove_stop_words=rem_stop_words)
            for hub in graph.get_hubs(network, 10):
                counts.inc(hub[0])
        return counts.keys()

    results = {'_removing stop-words': rem_stop_words}
    results['tasa'] = hub_words(
        '------ CLASSIFICATION EVALUATION --------',
        '../data/tasa/TASA900_dependencies')
    results['air'] = hub_words(
        '------ RETRIEVAL EVALUATION --------',
        '../data/air/problem_descriptions_dependencies')

    modifier = 'without' if rem_stop_words else 'with'
    # The modifier is joined directly to 'stop_words' (no separator).
    data.pickle_to_file(
        results, 'output/dependencies/common_hubs_' + modifier + 'stop_words')

    pp.pprint(results)
    return results
def evaluate_tc_icc_retrieval():
    """
    Evaluate retrieval on the AIR problem descriptions using co-occurrence
    networks whose centralities are combined with an ICC dictionary.

    For every metric from ``graph_representation.get_metrics(True,
    exclude_flow=True)`` the stored centralities are fetched with
    ``retrieve_centralities`` and converted by
    ``graph_representation.calculate_icc_dict``.  Metrics without
    centralities (or with an empty ICC dictionary) are skipped and get
    ``None`` as their result.

    Solutions are represented as TF-IDF vectors.  The dict of results
    (metric -> retrieval score or ``None``) is pretty-printed, pickled to
    ``output/tc_icc/cooccurrence/air/problem_descriptions/retrieval.res``
    and returned.
    """
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print('> Reading cases..')
    corpus = 'air/problem_descriptions'
    context = 'window'
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_preprocessed')

    rep = {}
    icc = {}
    print('> Calculating ICCs..')
    for metric in graph_metrics:
        print('    %s' % (metric,))
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        icc[metric] = (
            graph_representation.calculate_icc_dict(centralities)
            if centralities else None)

    print('> Creating solution representations..')
    solutions_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Creating problem description representations..')
    total = len(description_texts)
    for index, text in enumerate(description_texts):
        print('    document %d/%d' % (index, total))
        network = graph_representation.construct_cooccurrence_network(
            text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if icc[metric]:
                rep[metric].append(
                    graph_representation.graph_to_dict(
                        network, metric, icc[metric]))
        network = None  # drop the reference before the next document

    print('> Creating vector representations..')
    for metric in graph_metrics:
        if icc[metric]:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print('> Evaluating..')
    results = {}
    for metric in graph_metrics:
        if icc[metric]:
            score = evaluation.evaluate_retrieval(rep[metric], solutions_rep)
            print('    %s %s' % (metric, score))
            results[metric] = score
        else:
            # No ICC available for this metric: record it as not evaluated.
            results[metric] = None

    pp.pprint(results)
    data.pickle_to_file(
        results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
Esempio n. 12
0
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the dependency
    network representation for the retrieval task.

    Dependency networks are built for the AIR problem descriptions with
    ``weighted`` passed through to the network constructor.  Every metric
    from ``graph_representation.get_metrics(weighted)`` yields one
    representation, which is scored against TF-IDF solution vectors.

    Intermediate dictionaries are pickled every 100 documents; the final
    scores (plus the ``_is_weighted`` and ``_evaluation`` entries) are
    pickled, pretty-printed and returned.
    """
    results = {'_is_weighted': weighted, '_evaluation': 'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)
    postfix = '_weighted' if weighted else '_unweighted'

    print('> Reading cases..')
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_dependencies')

    solution_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = dict((metric, []) for metric in graph_metrics)

    print('> Creating graph representations..')
    total = len(description_texts)
    for index, text in enumerate(description_texts):
        if index % 10 == 0:
            print('    %d/%d' % (index, total))
        network = graph_representation.construct_dependency_network(
            text, weighted=weighted)
        for metric in graph_metrics:
            rep[metric].append(
                graph_representation.graph_to_dict(network, metric))
        network = None  # drop the reference before the next document
        if index % 100 == 0:
            # Periodic checkpoint of everything collected so far.
            checkpoint = ('output/dependencies/exp1_retr_tmp_' + str(index)
                          + '_' + postfix)
            data.pickle_to_file(rep, checkpoint)

    print('> Creating vector representations..')
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print('> Evaluating..')
    for metric in graph_metrics:
        score = evaluation.evaluate_retrieval(rep[metric], solution_vectors)
        print('    %s %s' % (metric, score))
        results[metric] = score

    data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix)

    pp.pprint(results)
    return results
Esempio n. 13
0
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.

    Window sizes 1 through 10 plus 20, 40 and 80 are tried.  For each
    size, co-occurrence networks are built for the AIR problem
    descriptions and every metric from
    ``graph_representation.get_metrics()`` is scored against TF-IDF
    solution vectors.  The scores are appended per metric, in window-size
    order; the accumulated results are pickled after each window size and
    finally pretty-printed and returned.
    """
    graph_metrics = graph_representation.get_metrics()
    results = dict((metric, []) for metric in graph_metrics)

    print('> Reading cases..')
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_preprocessed')

    solution_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    window_sizes = list(range(1, 11)) + [20, 40, 80]
    for window_size in window_sizes:
        print('-- window size: %s' % (window_size,))

        rep = dict((metric, []) for metric in graph_metrics)
        print('> Creating representations..')

        # One graph per document, shared by all metrics.
        for index, text in enumerate(description_texts):
            if index % 10 == 0:
                print(index)
            network = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                rep[metric].append(
                    graph_representation.graph_to_dict(network, metric))
            network = None  # drop the reference before the next document

        # Turn the collected dictionaries into vectors, per metric.
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print('> Evaluating..')
        for metric in graph_metrics:
            score = evaluation.evaluate_retrieval(
                rep[metric], solution_vectors)
            print('    %s %s' % (metric, score))
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))

    pp.pprint(results)
    return results
Esempio n. 14
0
def print_common_hub_words(rem_stop_words):
    """
    Print a list of the most common hub words in the created networks.
    Purpose of experiment was to show that hub words typically are stop words.

    The *rem_stop_words* flag determines whether stop words are removed
    before creating the networks.

    For both the TASA900 and the AIR dependency data, the first element of
    each of the hubs returned by ``graph.get_hubs(g, 10)`` is counted in a
    frequency distribution.  The keys of the two distributions are stored
    under ``'tasa'`` and ``'air'``, pickled, pretty-printed and returned.
    """
    def hub_words(heading, corpus_path):
        # One pass over a corpus: count the hub words of every network.
        print(heading)
        print('> Reading cases..')
        texts, _ = data.read_files(corpus_path)

        print('> Creating representations..')
        counts = nltk.probability.FreqDist()
        for index, text in enumerate(texts):
            if index % 100 == 0:
                print('    %d/%d' % (index, len(texts)))
            network = graph_representation.construct_dependency_network(
                text, remove_stop_words=rem_stop_words)
            for hub in graph.get_hubs(network, 10):
                counts.inc(hub[0])
        return counts.keys()

    results = {'_removing stop-words': rem_stop_words}
    results['tasa'] = hub_words(
        '------ CLASSIFICATION EVALUATION --------',
        '../data/tasa/TASA900_dependencies')
    results['air'] = hub_words(
        '------ RETRIEVAL EVALUATION --------',
        '../data/air/problem_descriptions_dependencies')

    modifier = 'without' if rem_stop_words else 'with'
    # The modifier is joined directly to 'stop_words' (no separator).
    data.pickle_to_file(
        results, 'output/dependencies/common_hubs_' + modifier + 'stop_words')

    pp.pprint(results)
    return results
Esempio n. 15
0
def do_retrieval_experiments(
        descriptions='air/problem_descriptions',
        solutions='air/solutions',
        graph_types=None,
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    *descriptions* and *solutions* are dataset paths relative to
    ``../data/``.  *graph_types* lists the network types to evaluate and
    defaults to ``['co-occurrence', 'dependency', 'random']``.

    Toggle comparison with frequency-based methods using *use_frequency*.

    Returns a dict holding the experiment settings (keys starting with
    ``_``) and, per graph type (and ``'freq'`` when enabled), a dict
    mapping each metric to its retrieval score.
    """
    # Fresh list per call: the value is handed to external code, so a
    # shared default object could leak changes between invocations.
    if graph_types is None:
        graph_types = ['co-occurrence', 'dependency', 'random']

    results = {
        '_solutions': solutions,
        '_descriptions': descriptions,
        '_evaluation': 'retrieval'
    }

    print('> Evaluation type: retrieval')
    print('> Reading cases..')
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, _ = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print('> Evaluating..')
    for gtype in graph_types:
        print('    %s' % (gtype,))
        docs, _ = descriptiondata[gtype]
        # Graphs are built once per type and reused for every metric.
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print('    - %s' % (metric,))
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)
    if use_frequency:
        print('    frequency')
        results['freq'] = {}
        freq_metrics = freq_representation.get_metrics()
        if freq_metrics:
            # The preprocessed descriptions do not depend on the metric, so
            # they are read once instead of once per metric.
            docs, _ = data.read_files(descriptions_path + '_preprocessed')
        for metric in freq_metrics:
            print('    - %s' % (metric,))
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)

    print('')
    pp.pprint(results)
    return results
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.

    Window sizes 1 through 10 plus 20, 40 and 80 are tried.  For each
    size, co-occurrence networks are built for the AIR problem
    descriptions and every metric from
    ``graph_representation.get_metrics()`` is scored against TF-IDF
    solution vectors.  The scores are appended per metric, in window-size
    order; the accumulated results are pickled after each window size and
    finally pretty-printed and returned.
    """
    graph_metrics = graph_representation.get_metrics()
    results = dict((metric, []) for metric in graph_metrics)

    print('> Reading cases..')
    description_texts, _ = data.read_files(
        '../data/air/problem_descriptions_preprocessed')

    solution_texts, _ = data.read_files('../data/air/solutions_preprocessed')
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    window_sizes = list(range(1, 11)) + [20, 40, 80]
    for window_size in window_sizes:
        print('-- window size: %s' % (window_size,))

        rep = dict((metric, []) for metric in graph_metrics)
        print('> Creating representations..')

        # One graph per document, shared by all metrics.
        for index, text in enumerate(description_texts):
            if index % 10 == 0:
                print(index)
            network = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                rep[metric].append(
                    graph_representation.graph_to_dict(network, metric))
            network = None  # drop the reference before the next document

        # Turn the collected dictionaries into vectors, per metric.
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print('> Evaluating..')
        for metric in graph_metrics:
            score = evaluation.evaluate_retrieval(
                rep[metric], solution_vectors)
            print('    %s %s' % (metric, score))
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))

    pp.pprint(results)
    return results
Esempio n. 17
0
def corpus_dependency_properties(dataset = 'air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can later be converted to pretty tables using
    :func:`~experiments.print_network_props`.

    A dependency network (stop words removed) is built for every document
    under ``../data/<dataset>_dependencies``.  The per-document values
    from ``graph.network_properties`` are collected per property, the
    edges of all networks are merged into one directed graph, and the mean
    and standard deviation of each property are computed.

    Three pickles are written below ``output/properties/dependency/``.
    Their file names are fixed and do not depend on *dataset*.
    """
    print('> Reading data.. %s' % (dataset,))
    corpus_path = '../data/' + dataset + '_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print('> Building networks..')
    total = len(documents)
    for i, text in enumerate(documents):
        if i % 10 == 0:
            print('    %d/%d' % (i, total))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.items():
            # setdefault instead of initialising on the first document only:
            # a property first reported by a later document no longer raises
            # KeyError.
            props.setdefault(k, []).append(v)
        g = None  # drop the reference before the next document

    print('> Calculating means and deviations..')
    props_total = {}
    for key in props:
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data.pickle_to_file(giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
def print_degree_distributions(dataset, context):
    """
    Extracts degree distribution values from networks, and print them to
    cvs-file.

    **warning** overwrites if file exists.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_text'
    (documents, labels) = data.read_files(corpus_path)

    degsfile = open('output/properties/cooccurrence/degrees_docs_'+dataset.replace('/','.'), 'w')

    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text,context=context)
        giant.add_edges_from(g.edges())
        degs = nx.degree(g).values()
        degs = [str(d) for d in degs]
        degsfile.write(','.join(degs)+'\n')
    degsfile.close()

    print '> Writing giant\'s distribution'
    with open('output/properties/cooccurrence/degrees_giant_'+dataset.replace('/','.'), 'w') as f:
        ds = nx.degree(giant).values()
        ds = [str(d) for d in ds]
        f.write(','.join(ds))
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text,context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ',key
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data_name = dataset.replace('/','.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name)
    data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric+' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
Esempio n. 21
0
def word_vectors():
    """Return padded token-id sequences and labels for the train and test sets.

    The tokenizer keeps at most 20000 words and is fitted on the training
    texts only; every sequence is padded/truncated to length 200.

    Returns ``(x_train, y_train, x_test, y_test)`` where the labels are the
    ``.values`` of the ``'label'`` column.
    """
    train, test = read_files('train'), read_files('test')

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(train['text'])

    def _to_padded(frame):
        # texts -> integer sequences -> fixed-length rows
        token_ids = tokenizer.texts_to_sequences(frame['text'])
        return sequence.pad_sequences(token_ids, maxlen=200)

    x_train = _to_padded(train)
    x_test = _to_padded(test)
    return x_train, train['label'].values, x_test, test['label'].values
Esempio n. 22
0
def solution_similarity_stats(dataset='air/solutions_preprocessed'):
    """
    Plots histogram of solution-solution similarity distribution of a dataset.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)

    print '> Creating vector representations..'
    vectors = freq_representation.text_to_vector(
        documents, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Calculating similarities..'
    distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine')
    diag = numpy.diag([2.0] * len(distances),
                      0)  # move similarities of "self" to -1
    distances = distances + diag
    similarities = 1.0 - distances
    similarities = similarities.ravel()
    similarities = [s for s in similarities if s >= 0]
    print plotter.histogram(similarities,
                            'similarity',
                            '# matches',
                            '',
                            bins=150)
    print
    print max(similarities)
    print min(similarities)
    print float(sum(similarities)) / len(similarities)
    num = len([sim for sim in similarities if sim < 0.23])
    print 'fraction sims < .23:', float(num) / len(similarities)
Esempio n. 23
0
def test_scale_free():
    import random
    import data
    import graph_representation
    import plfit
    import numpy
    corpus_path = '../data/air/problem_descriptions_text'
    (documents, labels) = data.read_files(corpus_path)
    g = graph_representation.construct_cooccurrence_network(documents[0],context='sentence')
    degree_sequence=sorted(nx.degree(g).values(),reverse=True) # degree sequence
    dmax=max(degree_sequence)

    degree_sequence = numpy.array(degree_sequence)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p,ksv = pl.test_pl()
    print
    print
    print
    print

    seq = [random.randrange(0,100) for i in range(len(degree_sequence))]
    degree_sequence = numpy.array(seq)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p,ksv = pl.test_pl()
    print
    print
    print
    print
Esempio n. 24
0
def test_scale_free():
    import random
    import data
    import graph_representation
    import plfit
    import numpy
    corpus_path = '../data/air/problem_descriptions_text'
    (documents, labels) = data.read_files(corpus_path)
    g = graph_representation.construct_cooccurrence_network(documents[0],
                                                            context='sentence')
    degree_sequence = sorted(nx.degree(g).values(),
                             reverse=True)  # degree sequence
    dmax = max(degree_sequence)

    degree_sequence = numpy.array(degree_sequence)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
    print
    print
    print

    seq = [random.randrange(0, 100) for i in range(len(degree_sequence))]
    degree_sequence = numpy.array(seq)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
    print
    print
    print
Esempio n. 25
0
def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.
    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
Esempio n. 26
0
def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.
    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i,d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d)==0:
            print file_names[i],'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:',len(documents)
    print '# empty documents:',empty
    print '# words:',sum(lengths)
    print 'length avg:',lengths.mean()
    print 'length stddev:',lengths.std()
    print
    print 'document lengths (sorted):',sorted(lengths)
    plotter.histogram(lengths,'# tokens','# documents','',bins=80)
Esempio n. 27
0
def print_degree_distributions(dataset, context):
    """
    Extracts degree distribution values from networks, and print them to
    cvs-file.

    **warning** overwrites if file exists.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)

    degsfile = open(
        'output/properties/cooccurrence/degrees_docs_' +
        dataset.replace('/', '.'), 'w')

    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        giant.add_edges_from(g.edges())
        degs = nx.degree(g).values()
        degs = [str(d) for d in degs]
        degsfile.write(','.join(degs) + '\n')
    degsfile.close()

    print '> Writing giant\'s distribution'
    with open(
            'output/properties/cooccurrence/degrees_giant_' +
            dataset.replace('/', '.'), 'w') as f:
        ds = nx.degree(giant).values()
        ds = [str(d) for d in ds]
        f.write(','.join(ds))
Esempio n. 28
0
def test_document_lengths(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print "PROBLEM DESCRIPTIONS"
    for i, d in enumerate(docs):
        if not d:
            print names[i], "is empty"
    path = '../data/' + dataset + '/solutions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print "SOLUTIONS"
    for i, d in enumerate(docs):
        if not d:
            print names[i], "is empty"
Esempio n. 29
0
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs,
                                                         metric,
                                                         verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric + ' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
Esempio n. 30
0
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
    """
    Test classification using different combinations of higher orders and weightings of these.

    The list *orders* define which higher order relations to include.
    The relative importance of the orders are defined by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print '    '+str(i)+"/"+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        s = reduce(lambda x,y:str(x)+str(y), orders)
        f.write(str(s)+' '+str(score)+'\n')
    return score
Esempio n. 31
0
def classification_demo():
    """Function intended to illustrate classification in the experimental framework.

    Intended as a basis for new experiments for those not intimately
    familiar with the code.
    """
    print 'Evaluation type: Classification'
    print 'Graph type:      Co-occurrence w/2-word window context'
    print 'Centrality:      Weighted degree'
    print
    print '> Reading data..'
    corpus_path = '../data/tasa/TASA900_preprocessed'
    docs, labels = data.read_files(corpus_path)

    print '> Creating representations..'
    dicts = []
    for i, doc in enumerate(docs):
        print '   ', str(i) + '/' + str(len(docs))
        g = graph_representation.construct_cooccurrence_network(doc)
        d = graph_representation.graph_to_dict(
            g, graph.GraphMetrics.WEIGHTED_DEGREE)
        dicts.append(d)
    vectors = graph_representation.dicts_to_vectors(dicts)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(vectors, labels)
    print '    score:', score
    print
Esempio n. 32
0
def solution_similarity_stats(dataset='air/solutions_preprocessed'):
    """
    Plots histogram of solution-solution similarity distribution of a dataset.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    (documents, labels) = data.read_files(corpus_path)

    print '> Creating vector representations..'
    vectors = freq_representation.text_to_vector(documents, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Calculating similarities..'
    distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine')
    diag = numpy.diag([2.0]*len(distances),0) # move similarities of "self" to -1
    distances = distances + diag
    similarities = 1.0 - distances
    similarities = similarities.ravel()
    similarities = [s for s in similarities if s >= 0]
    print plotter.histogram(similarities,'similarity','# matches','',bins=150)
    print
    print max(similarities)
    print min(similarities)
    print float(sum(similarities))/len(similarities)
    num = len([sim for sim in similarities if sim < 0.23])
    print 'fraction sims < .23:', float(num)/len(similarities)
Esempio n. 33
0
def test_document_lengths(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print "PROBLEM DESCRIPTIONS"
    for i, d in enumerate(docs):
        if not d:
            print names[i], "is empty"
    path = '../data/'+dataset+'/solutions_preprocessed'
    docs, _ = data.read_files(path)
    names = data.get_file_names(path)
    print "SOLUTIONS"
    for i, d in enumerate(docs):
        if not d:
            print names[i], "is empty"
Esempio n. 34
0
def classification_demo():
    """Function intended to illustrate classification in the experimental framework.

    Intended as a basis for new experiments for those not intimately
    familiar with the code.
    """
    print 'Evaluation type: Classification'
    print 'Graph type:      Co-occurrence w/2-word window context'
    print 'Centrality:      Weighted degree'
    print
    print '> Reading data..'
    corpus_path = '../data/tasa/TASA900_preprocessed'
    docs, labels = data.read_files(corpus_path)

    print '> Creating representations..'
    dicts = []
    for i, doc in enumerate(docs):
        print '   ',str(i)+'/'+str(len(docs))
        g = graph_representation.construct_cooccurrence_network(doc)
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        dicts.append(d)
    vectors = graph_representation.dicts_to_vectors(dicts)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(vectors, labels)
    print '    score:', score
    print
Esempio n. 35
0
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                                solutions='air/solutions',
                                graph_types=['co-occurrence','dependency','random'],
                                use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions':solutions,
                '_descriptions':descriptions,
                '_evaluation':'retrieval'}

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/'+descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/'+solutions+'_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
Esempio n. 36
0
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/' + corpus + '_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results,
        'output/tc_icc/cooccurrence/' + corpus + '/classification.res')
    return results
def test():
    import data
    dataset = 'test/freq'
    corpus_path = '../data/'+dataset
    (documents, labels) = data.read_files(corpus_path)

    vectors = {}
    print '> Building vector representations..'
    for metric in get_metrics():
        print '   ', metric
        vectors[metric] = text_to_vector(documents, metric)
Esempio n. 38
0
def test():
    import data
    dataset = 'test/freq'
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)

    vectors = {}
    print '> Building vector representations..'
    for metric in get_metrics():
        print '   ', metric
        vectors[metric] = text_to_vector(documents, metric)
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_text'
    store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Esempio n. 40
0
def vectorization(vector_type):
    """Vectorise the train and test texts with the requested vectorizer.

    *vector_type* selects ``'CountVectorizer'`` or ``'TfidfVectorizer'``;
    any other value falls back to a ``HashingVectorizer``.  The vectorizer
    is fitted on the training texts only.

    Returns ``(x_train, y_train, x_test, y_test)``.
    """
    def _make_vectorizer():
        if vector_type == 'CountVectorizer':
            return CountVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, max_features=2000)
        if vector_type == 'TfidfVectorizer':
            return TfidfVectorizer()
        return HashingVectorizer(n_features=200)

    train = read_files('train')
    test = read_files('test')

    vectorizer = _make_vectorizer()
    vectorizer.fit(train['text'])

    x_train, y_train = vectorizer.transform(train['text']), train['label'].values
    x_test, y_test = vectorizer.transform(test['text']), test['label'].values
    return x_train, y_train, x_test, y_test
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/'+corpus+'_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res')
    return results
Esempio n. 42
0
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/'+dataset+'/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:'+dataset+' \nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Esempio n. 43
0
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_text'
    store_path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(
        gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Esempio n. 44
0
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:' + dataset + ' \nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Esempio n. 45
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    corpus_path = '../data/'+dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path+'_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors, labels, mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
def test_dependency_graph():
    (docs, labels) = data.read_files('../data/tasa/TASA900_dependencies')
    graphs = []
    for i, text in enumerate(docs):
        print i
        graphs.append(construct_dependency_network(text))

    g = graphs[0]
    print g.nodes()
    print g.edges()

    print '#graphs:', len(graphs)

    pos = nx.spring_layout(g)
    graph.draw_with_centrality(g, layout=pos)
Esempio n. 47
0
def test_vocabulary_size(path = '../data/air/problem_descriptions_preprocessed'):
    """
    Print vocabulary sizes for documents in dataset.
    """
    texts, labels = data.read_files(path)
    lengths = []
    for text in texts:
        text = text.split(' ')
        l = len(list(set(text)))
        lengths.append(l)
        print '   ',l
    lengths = numpy.array(lengths)
    print 'avg', lengths.mean()
    print 'max', lengths.max()
    print 'min', lengths.min()
Esempio n. 48
0
def test_dependency_graph():
    (docs, labels) = data.read_files('../data/tasa/TASA900_dependencies')
    graphs = []
    for i, text in enumerate(docs):
        print i
        graphs.append(construct_dependency_network(text))

    g = graphs[0]
    print g.nodes()
    print g.edges()

    print '#graphs:', len(graphs)

    pos = nx.spring_layout(g)
    graph.draw_with_centrality(g, layout=pos)
Esempio n. 49
0
def freq_classification(dataset='tasa/TASA900'):
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    corpus_path = '../data/' + dataset
    results['results'] = {}
    for metric in freq_representation.get_metrics():
        print metric
        documents, labels = data.read_files(corpus_path + '_preprocessed')
        vectors = freq_representation.text_to_vector(documents, metric)
        r = evaluation.evaluate_classification(vectors,
                                               labels,
                                               mode='cross-validation')
        results['results'][metric] = r
        print '   ', r
        print
    pp.pprint(results)
    return results
Esempio n. 50
0
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results,
                            'output/class_context_' + str(window_size))

    pp.pprint(results)
    return results
Esempio n. 51
0
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_dependencies'
    store_path = 'output/giants/dependency/'+corpus+'/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,verbose=True,unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/class_context_'+str(window_size))

    pp.pprint(results)
    return results
def test_best_classification():
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    rep = []
    print '> Creating representations..'
    for i, text in enumerate(texts):
        if i%100==0: print '   ',i
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   ', score
Esempio n. 54
0
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_dependencies'
    store_path = 'output/giants/dependency/' + corpus + '/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,
                                                              verbose=True,
                                                              unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Esempio n. 55
0
def test_best_classification():
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    rep = []
    print '> Creating representations..'
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', i
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence')
        d = graph_representation.graph_to_dict(
            g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   ', score
Esempio n. 56
0
def do_classification_experiments(
        dataset='tasa/TASA900',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(
                vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(
                vectors, labels)

    print
    pp.pprint(results)
    return results
Esempio n. 57
0
def corpus_dependency_properties(dataset='air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            if i == 0: props[k] = []
            props[k].append(v)
        g = None  # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data.pickle_to_file(
        giant,
        'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(
        props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(
        props_total,
        'output/properties/dependency/docs_air_all_no_stop_words_total')