# Shared imports assumed by the experiment functions below. The project
# modules (data, evaluation, freq_representation, graph, graph_representation)
# come from the surrounding experimental framework; pp is assumed to be a
# pprint.PrettyPrinter instance.
import pprint

import networkx as nx
import numpy

import data
import evaluation
import freq_representation
import graph
import graph_representation

pp = pprint.PrettyPrinter(indent=4)


def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric+' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
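
# A minimal follow-up sketch: reload the scores pickled by the function above
# and list them best-first. Assumes data.pickle_from_file is the read
# counterpart of the data.pickle_to_file call used throughout this module.
def summarize_sentence_context_results():
    results = data.pickle_from_file('output/class_context_sentence')
    ranked = sorted(results.iteritems(), key=lambda kv: kv[1], reverse=True)
    for name, score in ranked:
        print '   ', name, score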
def classification_demo():
    """Function intended to illustrate classification in the experimental framework.

    Intended as a basis for new experiments for those not intimately
    familiar with the code.
    """
    print 'Evaluation type: Classification'
    print 'Graph type:      Co-occurrence w/2-word window context'
    print 'Centrality:      Weighted degree'
    print
    print '> Reading data..'
    corpus_path = '../data/tasa/TASA900_preprocessed'
    docs, labels = data.read_files(corpus_path)

    print '> Creating representations..'
    dicts = []
    for i, doc in enumerate(docs):
        print '   ',str(i)+'/'+str(len(docs))
        g = graph_representation.construct_cooccurrence_network(doc)
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        dicts.append(d)
    vectors = graph_representation.dicts_to_vectors(dicts)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(vectors, labels)
    print '    score:', score
    print
def test_retrieval(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
    """
    Test retrieval using different combinations of higher orders and weightings of these.

    The list *orders* define which higher order relations to include.
    The relative importance of the orders are defined by *order_weights*.
    """
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)
    filenames = data.get_file_names(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        print '    '+str(i)+"/"+str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(text, orders=orders, order_weights=order_weights, doc_id='output/higher_order/air/'+labels[i]+'/'+filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/retr'
    with open(fname, 'a+') as f:
        s = reduce(lambda x,y:str(x)+str(y), orders)
        f.write(str(s)+' '+str(score)+'\n')
    return score
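
# A small driver sketch for the higher-order experiments: run test_retrieval
# (test_classification works the same way) over a few order combinations and
# collect the scores. The combinations and weights below are illustrative only.
def sweep_orders():
    scores = {}
    for orders in [[1], [1, 2], [1, 2, 3]]:
        weights = [1.0, 1.53, 1.51][:len(orders)]
        scores[tuple(orders)] = test_retrieval(orders=orders, order_weights=weights)
    pp.pprint(scores)
    return scores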
def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path = '../data/air/solutions_preprocessed'
    path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        if i % 1 == 0:
            print '    document', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(
            text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]: continue
            #~ print '   ',metric
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
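
# For reference, a sketch of the weighting this experiment relies on:
# graph_to_dict(g, metric, icc) is assumed to combine each term's document
# centrality (TC) with its inverse corpus centrality (ICC), analogous to how
# TF is combined with IDF. The helper below is a hypothetical stand-in, not
# the framework's own implementation.
def _tc_icc_weights(term_centralities, icc):
    weighted = {}
    for term, tc in term_centralities.iteritems():
        # terms absent from the corpus-level centralities get zero weight
        weighted[term] = tc * icc.get(term, 0.0)
    return weighted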
def test_classification(orders=[1,2,3],order_weights=[1.0,1.53,1.51]):
    """
    Test classification using different combinations of higher orders and weightings of these.

    The list *orders* define which higher order relations to include.
    The relative importance of the orders are defined by *order_weights*.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)
    filenames = data.get_file_names(path)
    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        print '    '+str(i)+"/"+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context='sentence', orders=orders, order_weights=order_weights, doc_id='output/higher_order/tasa/'+labels[i]+'/'+filenames[i])
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
    rep = graph_representation.dicts_to_vectors(rep)
    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print 'orders:', orders
    print 'score:', score
    fname = 'output/higher_order/results/class'
    with open(fname, 'a+') as f:
        s = reduce(lambda x,y:str(x)+str(y), orders)
        f.write(str(s)+' '+str(score)+'\n')
    return score
def print_degree_distributions(dataset, context):
    """
    Extracts degree distribution values from networks, and print them to
    cvs-file.

    **warning** overwrites if file exists.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)

    degsfile = open(
        'output/properties/cooccurrence/degrees_docs_' +
        dataset.replace('/', '.'), 'w')

    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        giant.add_edges_from(g.edges())
        degs = nx.degree(g).values()
        degs = [str(d) for d in degs]
        degsfile.write(','.join(degs) + '\n')
    degsfile.close()

    print '> Writing giant\'s distribution'
    with open(
            'output/properties/cooccurrence/degrees_giant_' +
            dataset.replace('/', '.'), 'w') as f:
        ds = nx.degree(giant).values()
        ds = [str(d) for d in ds]
        f.write(','.join(ds))
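
# A minimal sketch of consuming the per-document degree file written above:
# parse the comma-separated degrees line by line and print a coarse histogram
# using numpy. The default path is an example only.
def inspect_degree_file(path='output/properties/cooccurrence/degrees_docs_tasa.TASA900'):
    degrees = []
    with open(path) as f:
        for line in f:
            degrees.extend(int(d) for d in line.strip().split(',') if d)
    counts, edges = numpy.histogram(degrees, bins=10)
    for count, edge in zip(counts, edges):
        print '    >=', edge, ':', count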
def test_scale_free():
    import random
    import data
    import graph_representation
    import plfit
    import numpy
    corpus_path = '../data/air/problem_descriptions_text'
    (documents, labels) = data.read_files(corpus_path)
    g = graph_representation.construct_cooccurrence_network(documents[0],
                                                            context='sentence')
    degree_sequence = sorted(nx.degree(g).values(),
                             reverse=True)  # degree sequence
    dmax = max(degree_sequence)

    degree_sequence = numpy.array(degree_sequence)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
    print
    print
    print

    seq = [random.randrange(0, 100) for i in range(len(degree_sequence))]
    degree_sequence = numpy.array(seq)
    print degree_sequence
    pl = plfit.plfit(degree_sequence)
    p, ksv = pl.test_pl()
    print
    print
    print
    print
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can later be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text,context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ',key
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data_name = dataset.replace('/','.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name)
    data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
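
# A minimal sketch of turning the pickled summary statistics back into a
# readable listing (a lightweight stand-in for experiments.print_network_props).
# The default dataset name is an example only.
def print_corpus_properties(dataset='tasa/TASA900'):
    data_name = dataset.replace('/', '.')
    totals = data.pickle_from_file('output/properties/cooccurrence/stats_tot_' + data_name)
    for key in sorted(totals):
        if key.endswith('_mean'):
            prop = key[:-len('_mean')]
            print '   ', prop, ':', totals[key], '+/-', totals.get(prop + '_std')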
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/' + corpus + '_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results,
        'output/tc_icc/cooccurrence/' + corpus + '/classification.res')
    return results
def term_centrality_study(doc='air/reports_text/2005/a05a0059.html', num=20):
    def _print_terms(cents, rep, num):
        ts = _top_cents(cents, num)
        terms = []
        for t in ts:
            terms.append(t[0])
        print rep + ' & ' + ', '.join(terms) + ' \\\\'

    def _top_cents(cents, num):
        return sorted(cents.iteritems(),
                      key=operator.itemgetter(1),
                      reverse=True)[0:num]

    def _calc_cents(g, metric, gcents=None):
        if gcents: icc = graph_representation.calculate_icc_dict(gcents)
        else: icc = None
        return graph_representation.graph_to_dict(g, metric, icc)

    import operator
    import dependency_experiments
    import co_occurrence_experiments

    dataset = 'air/reports'
    path = '../data/' + doc
    doc = data.read_file(path)

    metric = graph.GraphMetrics.DEGREE
    context = 'window'
    g = graph_representation.construct_cooccurrence_network(doc,
                                                            context=context)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Co-occurrence TC', num)
    gcents = co_occurrence_experiments.retrieve_centralities(
        dataset, context, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Co-occurrence TC-ICC', num)

    metric = graph.GraphMetrics.EIGENVECTOR
    deps = data._text_to_dependencies(doc)
    g = graph_representation.construct_dependency_network(deps)
    cents = _calc_cents(g, metric)
    _print_terms(cents, 'Dependency TC', num)
    gcents = dependency_experiments.retrieve_centralities(dataset, metric)
    cents = _calc_cents(g, metric, gcents)
    _print_terms(cents, 'Dependency TC-ICC', num)

    fdict = freq_representation.text_to_dict(
        [doc], freq_representation.FrequencyMetrics.TF_IDF)[0]
    _print_terms(fdict, 'TF-IDF', num)

    fdict = freq_representation.text_to_dict(
        [doc], freq_representation.FrequencyMetrics.TF)[0]
    _print_terms(fdict, 'TF', num)
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_text'
    store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
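
# A minimal sketch of loading a corpus network stored by store_corpus_network
# and reporting its size; the default corpus and context are examples only.
def inspect_corpus_network(corpus='tasa/TASA900', context='sentence'):
    store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    giant = data.pickle_from_file(store_path, suppress_warning=True)
    if not giant:
        print '    no stored network at', store_path
        return
    print '    nodes:', len(giant)
    print '    edges:', giant.number_of_edges()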
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i % 10 == 0: print i
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))

    pp.pprint(results)
    return results
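
# A minimal sketch of reading back the retrieval results pickled per window
# size above and reporting the best window for each metric. The window sizes
# listed must match those used in the experiment.
def best_window_sizes(window_sizes=range(1, 11) + [20, 40, 80]):
    results = data.pickle_from_file('output/retr_context_' + str(window_sizes[-1]))
    for metric, scores in results.iteritems():
        best = max(range(len(scores)), key=lambda j: scores[j])
        print '   ', metric, '-> window', window_sizes[best], 'score', scores[best]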
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results,
                            'output/class_context_' + str(window_size))

    pp.pprint(results)
    return results
def test_best_classification():
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    rep = []
    print '> Creating representations..'
    for i, text in enumerate(texts):
        if i%100==0: print '   ',i
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        d = graph_representation.graph_to_dict(g, graph.GraphMetrics.WEIGHTED_DEGREE)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   ', score
def complete_network(path='../data/air/problem_descriptions_text'):
    """
    Create and pickle to file a giant co-occurrence network for all documents
    in the dataset pointed to by *path*.
    """
    print '> Reading cases..'
    texts, labels = data.read_files(path)

    print '> Creating graph..'
    g = None
    for i, text in enumerate(texts):
        if i%10==0: print str(i)+'/'+str(len(texts))
        tmp = graph_representation.construct_cooccurrence_network(text, context='sentence', already_preprocessed=False)
        if g is None:
            g = tmp
        else:
            g.add_nodes_from(tmp.nodes())
            g.add_edges_from(tmp.edges())

    data.pickle_to_file(g, 'output/complete_networks/air_descriptions.pkl')

    pp.pprint(g)
    return g