def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path = '../data/air/solutions_preprocessed'
    path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        if i % 1 == 0:
            print '    document', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(
            text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]: continue
            #~ print '   ',metric
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs,
                                                         metric,
                                                         verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric + ' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric+' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the retrieval task.
    """
    results = {'_is_weighted': weighted, '_evaluation': 'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(description_texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None  # just to make sure..
        if i % 100 == 0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(
                rep,
                'output/dependencies/exp1_retr_tmp_' + str(i) + '_' + postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solution_vectors)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix)

    pp.pprint(results)
    return results
def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path  = '../data/air/solutions_preprocessed'
    path            = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        if i%1==0: print '    document',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]: continue
            #~ print '   ',metric
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/retrieval.res')
    return results
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/' + corpus + '_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results,
        'output/tc_icc/cooccurrence/' + corpus + '/classification.res')
    return results
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the retrieval task.
    """
    results = {'_is_weighted':weighted, '_evaluation':'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(description_texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None # just to make sure..
        if i%100==0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(rep, 'output/dependencies/exp1_retr_tmp_'+str(i)+'_'+postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solution_vectors)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_retr'+postfix)

    pp.pprint(results)
    return results
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i % 10 == 0: print i
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))

    pp.pprint(results)
    return results
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/'+corpus+'_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res')
    return results
# Example #10
# 0
def do_retrieval_experiments(
        descriptions='air/problem_descriptions',
        solutions='air/solutions',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {
        '_solutions': solutions,
        '_descriptions': descriptions,
        '_evaluation': 'retrieval'
    }

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/' + descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/' + solutions + '_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(
                vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i%10==0: print i
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_'+str(window_size))

    pp.pprint(results)
    return results
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results,
                            'output/class_context_' + str(window_size))

    pp.pprint(results)
    return results
# Example #13
# 0
def do_retrieval_experiments(descriptions='air/problem_descriptions',
                                solutions='air/solutions',
                                graph_types=['co-occurrence','dependency','random'],
                                use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on the retrieval task.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_solutions':solutions,
                '_descriptions':descriptions,
                '_evaluation':'retrieval'}

    print '> Evaluation type: retrieval'
    print '> Reading cases..'
    descriptions_path = '../data/'+descriptions
    descriptiondata = data.read_data(descriptions_path, graph_types)

    solutions_path = '../data/'+solutions+'_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        docs, labels = descriptiondata[gtype]
        graphs = graph_representation.create_graphs(docs, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            docs, labels = data.read_files(descriptions_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(docs, metric)
            results['freq'][metric] = evaluation.evaluate_retrieval(vectors, solution_vectors)

    print
    pp.pprint(results)
    return results
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/class_context_'+str(window_size))

    pp.pprint(results)
    return results
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file('MemoryError while claculating '+metric+' on '+corpus+':\n'+str(e)+'\n\n', 'output/log/errors')
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file(
                'MemoryError while claculating ' + metric + ' on ' + corpus +
                ':\n' + str(e) + '\n\n', 'output/log/errors')
# Example #17
# 0
def do_classification_experiments(
        dataset='tasa/TASA900',
        graph_types=['co-occurrence', 'dependency', 'random'],
        use_frequency=True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset': dataset, '_evaluation': 'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ', gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(
                vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path + '_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(
                vectors, labels)

    print
    pp.pprint(results)
    return results
# Example #18
# 0
def do_classification_experiments(dataset='tasa/TASA900',
                                    graph_types = ['co-occurrence','dependency','random'],
                                    use_frequency = True):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    Toggle comparison with frequency-based methods using *use_frequency*.
    """
    results = {'_dataset':dataset,
                '_evaluation':'classification'}
    print '> Evaluation type: classification'
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    docdata = data.read_data(corpus_path, graph_types)

    print '> Evaluating..'
    for gtype in graph_types:
        print '   ',gtype
        documents, labels = docdata[gtype]
        graphs = graph_representation.create_graphs(documents, gtype)
        results[gtype] = {}
        for metric in graph_representation.get_metrics():
            print '    -', metric
            vectors = graph_representation.graphs_to_vectors(graphs, metric)
            results[gtype][metric] = evaluation.evaluate_classification(vectors, labels)
    if use_frequency:
        print '    frequency'
        results['freq'] = {}
        for metric in freq_representation.get_metrics():
            print '    -', metric
            documents, labels = data.read_files(corpus_path+'_preprocessed')
            vectors = freq_representation.text_to_vector(documents, metric)
            results['freq'][metric] = evaluation.evaluate_classification(vectors, labels)

    print
    pp.pprint(results)
    return results