def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(
            text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs,
                                                         metric,
                                                         verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric + ' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_cooccurrence_network(text,context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ',key
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data_name = dataset.replace('/','.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props, 'output/properties/cooccurrence/stats_'+data_name)
    data.pickle_to_file(props_total, 'output/properties/cooccurrence/stats_tot_'+data_name)
def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path = '../data/air/solutions_preprocessed'
    path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        if i % 1 == 0:
            print '    document', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(
            text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]: continue
            #~ print '   ',metric
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results, 'output/tc_icc/cooccurrence/' + corpus + '/retrieval.res')
    return results
def do_context_sentence_evaluation_classification():
    """
    Experiment evaluating performance of sentences as contexts for
    co-occurrence networks in the classification task.
    """
    print '> Reading cases..'
    path = '../data/tasa/TASA900_text'
    texts, labels = data.read_files(path)

    print '> Evaluating..'
    graphs = []
    results = {}
    for text in texts:
        g = graph_representation.construct_cooccurrence_network(text, context='sentence')
        graphs.append(g)
    for metric in graph_representation.get_metrics():
        print '   ', metric
        vectors = graph_representation.graphs_to_vectors(graphs, metric, verbose=True)
        score = evaluation.evaluate_classification(vectors, labels)
        results[metric+' (sentence)'] = score

    data.pickle_to_file(results, 'output/class_context_sentence')

    pp.pprint(results)
    return results
def edge_direction_evaluation(direction):
    """
    Evaluate impact of using different edge directions on dependency networks.

    Values for *direction*: ``forward``, ``backward``, and ``undirected``.
    """
    results = {'_edge-direction':direction}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, direction=direction)
        metric  = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, direction=direction)
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    results['retrieval'] = score

    data.pickle_to_file(results, 'output/dependencies/stop_words_retr_'+direction)

    pp.pprint(results)
    return results
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the retrieval task.
    """
    results = {'_is_weighted': weighted, '_evaluation': 'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(description_texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None  # just to make sure..
        if i % 100 == 0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(
                rep,
                'output/dependencies/exp1_retr_tmp_' + str(i) + '_' + postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solution_vectors)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_retr' + postfix)

    pp.pprint(results)
    return results
def evaluate_tc_icc_retrieval():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'air/problem_descriptions'
    context = 'window'
    solutions_path  = '../data/air/solutions_preprocessed'
    path            = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating solution representations..'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating problem description representations..'
    for i, text in enumerate(description_texts):
        if i%1==0: print '    document',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_cooccurrence_network(text, already_preprocessed=True, context='window')
        for metric in graph_metrics:
            if not icc[metric]: continue
            #~ print '   ',metric
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solutions_rep)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/retrieval.res')
    return results
def construct_cooccurrence_network(doc,
                                   window_size=2,
                                   direction='undirected',
                                   context='sentence',
                                   already_preprocessed=False,
                                   orders=[],
                                   order_weights=[1.0, 1.0, 1.0],
                                   doc_id=None,
                                   verbose=False):
    """Construct co-occurrence network from text.

    *direction* must be 'forward', 'backward' or 'undirected', while  *context*
    can be 'window' or 'sentence'.

    If *context* is 'window', *already_preprocessed* indicate whether *doc*
    already have been processed. Sentence contexts require unpreocessed *doc*s.

    Any value for *window_size* is ignored if *context* is 'sentence'.

    A DiGraph is created regardless of direction parameter, but with 'undirected',
    edges are created in both directions.
    """
    doc = _cooccurrence_preprocessing(doc, context, already_preprocessed)
    if context is 'sentence':
        matrix, term_list = _sentence_cooccurrence_matrix(
            doc, direction, verbose)
    elif context is 'window':
        matrix, term_list = _window_cooccurrence_matrix(
            doc, direction, window_size, verbose)
    g = nx.DiGraph()
    g.add_nodes_from(term_list)
    if len(orders) == 0:
        graph.add_edges_from_matrix(g, matrix, term_list)
    else:
        if doc_id is not None and os.path.exists(doc_id):
            first, second, third = data.pickle_from_file(doc_id)
        else:
            first, second, third = _higher_order_matrix(matrix.todense())
            if doc_id is not None:
                data.pickle_to_file((first, second, third), doc_id)
    if 1 in orders:
        graph.add_edges_from_matrix(g,
                                    first,
                                    term_list,
                                    rel_weight=order_weights[0])
    if 2 in orders:
        graph.add_edges_from_matrix(g,
                                    second,
                                    term_list,
                                    rel_weight=order_weights[1])
    if 3 in orders:
        graph.add_edges_from_matrix(g,
                                    third,
                                    term_list,
                                    rel_weight=order_weights[2])
    return g
def print_common_hub_words(rem_stop_words):
    """
    Print a list of the most common hub words in the created networks.
    Purpose of experiment was to show that hub words typically are stop words.

    The *rem_stop_words* determine whether stop words are removed before creating
    the networks.
    """
    results = {'_removing stop-words': rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    fd = nltk.probability.FreqDist()
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        hubs = graph.get_hubs(g, 10)
        for h in hubs:
            fd.inc(h[0])
        g = None  # just to make sure..

    results['tasa'] = fd.keys()

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    fd = nltk.probability.FreqDist()
    for i, text in enumerate(description_texts):
        if i % 100 == 0:
            print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        hubs = graph.get_hubs(g, 10)
        for h in hubs:
            fd.inc(h[0])
        g = None  # just to make sure..

    results['air'] = fd.keys()

    if rem_stop_words:
        modifier = 'without'
    else:
        modifier = 'with'
    data.pickle_to_file(
        results, 'output/dependencies/common_hubs_' + modifier + 'stop_words')

    pp.pprint(results)
    return results
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/' + corpus + '_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None  # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(
        results,
        'output/tc_icc/cooccurrence/' + corpus + '/classification.res')
    return results
def centrality_weights_retrieval(weighted=True):
    """
    Evaluate whether edge weights are beneficial to the depdendency
    network represenation for the retrieval task.
    """
    results = {'_is_weighted':weighted, '_evaluation':'retrieval'}
    graph_metrics = graph_representation.get_metrics(weighted)

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    rep = {}
    for metric in graph_metrics:
        rep[metric] = []

    print '> Creating graph representations..'
    for i, text in enumerate(description_texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, weighted=weighted)
        for metric in graph_metrics:
            d = graph_representation.graph_to_dict(g, metric)
            rep[metric].append(d)
        g = None # just to make sure..
        if i%100==0:
            if weighted:
                postfix = '_weighted'
            else:
                postfix = '_unweighted'
            data.pickle_to_file(rep, 'output/dependencies/exp1_retr_tmp_'+str(i)+'_'+postfix)

    print '> Creating vector representations..'
    for metric in graph_metrics:
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    for metric in graph_metrics:
        vectors = rep[metric]
        score = evaluation.evaluate_retrieval(vectors, solution_vectors)
        print '   ', metric, score
        results[metric] = score

    if weighted:
        postfix = '_weighted'
    else:
        postfix = '_unweighted'
    data.pickle_to_file(results, 'output/dependencies/exp1_retr'+postfix)

    pp.pprint(results)
    return results
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_text'
    store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
def print_common_hub_words(rem_stop_words):
    """
    Print a list of the most common hub words in the created networks.
    Purpose of experiment was to show that hub words typically are stop words.

    The *rem_stop_words* determine whether stop words are removed before creating
    the networks.
    """
    results = {'_removing stop-words':rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    fd = nltk.probability.FreqDist()
    for i, text in enumerate(texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        hubs = graph.get_hubs(g, 10)
        for h in hubs:
            fd.inc(h[0])
        g = None # just to make sure..

    results['tasa'] = fd.keys()

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    fd = nltk.probability.FreqDist()
    for i, text in enumerate(description_texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        hubs = graph.get_hubs(g, 10)
        for h in hubs:
            fd.inc(h[0])
        g = None # just to make sure..

    results['air'] = fd.keys()

    if rem_stop_words:
        modifier = 'without'
    else:
        modifier = 'with'
    data.pickle_to_file(results, 'output/dependencies/common_hubs_'+modifier+'stop_words')

    pp.pprint(results)
    return results
def evaluate_tc_icc_classification():
    graph_metrics = graph_representation.get_metrics(True, exclude_flow=True)

    print '> Reading cases..'
    corpus = 'tasa/TASA900'
    #~ corpus = 'tasa/TASATest2'
    context = 'sentence'
    path = '../data/'+corpus+'_text'
    texts, labels = data.read_files(path)

    rep = {}
    icc = {}
    print '> Calculating ICCs..'
    for metric in graph_metrics:
        print '   ', metric
        rep[metric] = []
        centralities = retrieve_centralities(corpus, context, metric)
        if centralities:
            icc[metric] = graph_representation.calculate_icc_dict(centralities)
        else:
            icc[metric] = None

    print '> Creating graph representations..'
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_cooccurrence_network(text, context=context)
        for metric in graph_metrics:
            print '   ', metric
            if not icc[metric]: continue
            d = graph_representation.graph_to_dict(g, metric, icc[metric])
            rep[metric].append(d)
        g = None # just to make sure..

    print '> Creating vector representations..'
    for metric in graph_metrics:
        if not icc[metric]: continue
        rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

    print '> Evaluating..'
    results = {}
    for metric in graph_metrics:
        if not icc[metric]:
            results[metric] = None
            continue
        vectors = rep[metric]
        score = evaluation.evaluate_classification(vectors, labels)
        print '   ', metric, score
        results[metric] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/tc_icc/cooccurrence/'+corpus+'/classification.res')
    return results
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i % 10 == 0: print i
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_' + str(window_size))

    pp.pprint(results)
    return results
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_text'
    store_path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(
        gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
def do_context_size_evaluation_retrieval():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the retrieval task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_preprocessed'
    description_texts, labels = data.read_files(descriptions_path)

    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for i, text in enumerate(description_texts):
            if i%10==0: print i
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_retrieval(vectors, solution_vectors)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/retr_context_'+str(window_size))

    pp.pprint(results)
    return results
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1, 11) + [20, 40, 80]:
        print '-- window size:', window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(
                text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None  # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results,
                            'output/class_context_' + str(window_size))

    pp.pprint(results)
    return results
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_dependencies'
    store_path = 'output/giants/dependency/'+corpus+'/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,verbose=True,unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
def do_context_size_evaluation_classification():
    """
    Experiment evaluating performance of different context sizes for
    co-occurrence networks in the classification task.
    """
    results = {}
    graph_metrics = graph_representation.get_metrics()
    for metric in graph_metrics:
        results[metric] = []

    print '> Reading cases..'
    path = '../data/tasa/TASA900_preprocessed'
    texts, labels = data.read_files(path)

    for window_size in range(1,11)+[20,40,80]:
        print '-- window size:',window_size

        rep = {}
        for metric in graph_metrics:
            rep[metric] = []
        print '> Creating representations..'

        # creating graphs and finding centralities
        for text in texts:
            g = graph_representation.construct_cooccurrence_network(text, window_size=window_size, already_preprocessed=True)
            for metric in graph_metrics:
                d = graph_representation.graph_to_dict(g, metric)
                rep[metric].append(d)
            g = None # just to make sure..

        # creating representation vectors
        for metric in graph_metrics:
            rep[metric] = graph_representation.dicts_to_vectors(rep[metric])

        print '> Evaluating..'
        for metric in graph_metrics:
            vectors = rep[metric]
            score = evaluation.evaluate_classification(vectors, labels)
            print '   ', metric, score
            results[metric].append(score)

        data.pickle_to_file(results, 'output/class_context_'+str(window_size))

    pp.pprint(results)
    return results
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_dependencies'
    store_path = 'output/giants/dependency/' + corpus + '/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,
                                                              verbose=True,
                                                              unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file('MemoryError while claculating '+metric+' on '+corpus+':\n'+str(e)+'\n\n', 'output/log/errors')
Exemple #23
0
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file(
                'MemoryError while claculating ' + metric + ' on ' + corpus +
                ':\n' + str(e) + '\n\n', 'output/log/errors')
def corpus_dependency_properties(dataset = 'air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset+'_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i%10==0: print '   ',str(i)+'/'+str(len(documents))
        g = graph_representation.construct_dependency_network(text,remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k,v in p.iteritems():
            if i==0: props[k] = []
            props[k].append(v)
        g = None # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        props_total[key+'_mean'] = numpy.mean(props[key])
        props_total[key+'_std'] = numpy.std(props[key])

    data.pickle_to_file(giant, 'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(props_total, 'output/properties/dependency/docs_air_all_no_stop_words_total')
def complete_network(path='../data/air/problem_descriptions_text'):
    """
    Create and pickle to file a giant co-occurrence network for all documents
    in the dataset pointed to by *path*.
    """
    print '> Reading cases..'
    texts, labels = data.read_files(path)

    print '> Creating graph..'
    g = None
    for i, text in enumerate(texts):
        if i%10==0: print str(i)+'/'+str(len(texts))
        tmp = graph_representation.construct_cooccurrence_network(text, context='sentence', already_preprocessed=False)
        if g is None:
            g = tmp
        else:
            g.add_nodes_from(tmp.nodes())
            g.add_edges_from(tmp.edges())

    data.pickle_to_file(g, 'output/complete_networks/air_descriptions.pkl')

    pp.pprint(g)
    return g
def corpus_properties(dataset, context):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_text'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    #~ giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_cooccurrence_network(
            text, context=context)
        #~ giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            if i == 0: props[k] = []
            props[k].append(v)
        g = None  # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        print '   ', key
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data_name = dataset.replace('/', '.')
    #~ data.pickle_to_file(giant, 'output/properties/cooccurrence/giant_'+data_name)
    data.pickle_to_file(props,
                        'output/properties/cooccurrence/stats_' + data_name)
    data.pickle_to_file(
        props_total, 'output/properties/cooccurrence/stats_tot_' + data_name)
def construct_cooccurrence_network(doc, window_size=2, direction='undirected', context='sentence', already_preprocessed=False, orders=[], order_weights=[1.0,1.0,1.0],doc_id=None,verbose=False):
    """Construct co-occurrence network from text.

    *direction* must be 'forward', 'backward' or 'undirected', while  *context*
    can be 'window' or 'sentence'.

    If *context* is 'window', *already_preprocessed* indicate whether *doc*
    already have been processed. Sentence contexts require unpreocessed *doc*s.

    Any value for *window_size* is ignored if *context* is 'sentence'.

    A DiGraph is created regardless of direction parameter, but with 'undirected',
    edges are created in both directions.
    """
    doc = _cooccurrence_preprocessing(doc, context, already_preprocessed)
    if context is 'sentence':
        matrix, term_list = _sentence_cooccurrence_matrix(doc, direction, verbose)
    elif context is 'window':
        matrix, term_list = _window_cooccurrence_matrix(doc, direction, window_size, verbose)
    g = nx.DiGraph()
    g.add_nodes_from(term_list)
    if len(orders)==0:
        graph.add_edges_from_matrix(g, matrix, term_list)
    else:
        if doc_id is not None and os.path.exists(doc_id):
            first, second, third = data.pickle_from_file(doc_id)
        else:
            first, second, third = _higher_order_matrix(matrix.todense())
            if doc_id is not None:
                data.pickle_to_file((first,second,third), doc_id)
    if 1 in orders:
        graph.add_edges_from_matrix(g, first, term_list, rel_weight=order_weights[0])
    if 2 in orders:
        graph.add_edges_from_matrix(g, second, term_list, rel_weight=order_weights[1])
    if 3 in orders:
        graph.add_edges_from_matrix(g, third, term_list, rel_weight=order_weights[2])
    return g
Exemple #29
0
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths,
                            'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths,
                      'sentence length (tokens)',
                      '# sentences',
                      bins=70)
def complete_network(path='../data/air/problem_descriptions_text'):
    """
    Create and pickle to file a giant co-occurrence network for all documents
    in the dataset pointed to by *path*.
    """
    print '> Reading cases..'
    texts, labels = data.read_files(path)

    print '> Creating graph..'
    g = None
    for i, text in enumerate(texts):
        if i % 10 == 0: print str(i) + '/' + str(len(texts))
        tmp = graph_representation.construct_cooccurrence_network(
            text, context='sentence', already_preprocessed=False)
        if g is None:
            g = tmp
        else:
            g.add_nodes_from(tmp.nodes())
            g.add_edges_from(tmp.edges())

    data.pickle_to_file(g, 'output/complete_networks/air_descriptions.pkl')

    pp.pprint(g)
    return g
def corpus_dependency_properties(dataset='air/problem_descriptions'):
    """
    Identify and pickle to file various properties of the given dataset.
    These can alter be converted to pretty tables using
    :func:`~experiments.print_network_props`.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset + '_dependencies'
    (documents, labels) = data.read_files(corpus_path)

    props = {}
    giant = nx.DiGraph()
    print '> Building networks..'
    for i, text in enumerate(documents):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(documents))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=True)
        giant.add_edges_from(g.edges())
        p = graph.network_properties(g)
        for k, v in p.iteritems():
            if i == 0: props[k] = []
            props[k].append(v)
        g = None  # just to make sure..

    print '> Calculating means and deviations..'
    props_total = {}
    for key in props:
        props_total[key + '_mean'] = numpy.mean(props[key])
        props_total[key + '_std'] = numpy.std(props[key])

    data.pickle_to_file(
        giant,
        'output/properties/dependency/corpus_network_air_all_no_stop_words')
    data.pickle_to_file(
        props, 'output/properties/dependency/docs_air_all_no_stop_words')
    data.pickle_to_file(
        props_total,
        'output/properties/dependency/docs_air_all_no_stop_words_total')
def edge_direction_evaluation(direction):
    """
    Evaluate impact of using different edge directions on dependency networks.

    Values for *direction*: ``forward``, ``backward``, and ``undirected``.
    """
    results = {'_edge-direction': direction}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, direction=direction)
        metric = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    for i, text in enumerate(description_texts):
        if i % 100 == 0:
            print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, direction=direction)
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    results['retrieval'] = score

    data.pickle_to_file(results,
                        'output/dependencies/stop_words_retr_' + direction)

    pp.pprint(results)
    return results
def evaluate_dep_types():
    """
    Leave-one-out evaluation of the various dependency types from the stanford parser.
    """
    exclude_list = [
        'dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp', 'attr', 'ccomp',
        'xcomp', 'complm', 'dobj', 'iobj', 'pobj', 'mark', 'rel', 'nsubj',
        'nsubjpass', 'csubj', 'csubjpass', 'cc', 'conj', 'expl', 'abbrev',
        'amod', 'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
        'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod', 'quantmod',
        'tmod', 'nn', 'npadvmod', 'num', 'number', 'prep', 'poss',
        'possessive', 'prt', 'parataxis', 'punct', 'ref', 'xsubj', 'pcomp',
        'prepc'
    ]
    results = {'classification': [], 'retrieval': []}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None  # just to make sure..
        full_graph = None
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(
            rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_classification(rep[exclude_label], labels)
        print '  ', exclude_label, score
        results['classification'].append(score)

    data.pickle_to_file(results, 'output/dependencies/types_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None  # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(
            rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_retrieval(rep[exclude_label],
                                              solution_vectors)
        print '  ', exclude_label, score
        results['retrieval'].append(score)

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_eval')

    return results
def evaluate_dep_type_sets():
    """
    Evaluation of various sets of dependency relations.

    Each set is excluded from the representation, and the performance recorded.
    The best strategy is to exclude those dependencies which removal lead to the
    greatest imporovement for the representation.
    """
    strategies = {
        'defensive': ['agent', 'advcl', 'parataxis'],
        'aggressive': [
            'agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp',
            'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl',
            'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss',
            'prepc'
        ],
        'compromise_1': [
            'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass',
            'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc'
        ],
        'compromise_2': [
            'agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass',
            'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc',
            'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct',
            'ref'
        ]
    }
    results = {'classification': {}, 'retrieval': {}}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i % 10 == 0: print '   ', str(i) + '/' + str(len(texts))
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(
                text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None  # just to make sure. I don't trust this damn garbage collector...
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_classification(rep[strategy], labels)
        print '  ', strategy, score
        results['classification'][strategy] = score

    data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(
                text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None  # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors)
        print '  ', strategy, score
        results['retrieval'][strategy] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_set_eval')

    return results
def stop_word_evaluation(rem_stop_words):
    """
    Experiment for determining what effect removing stop words have on
    dependency networks.
    """
    results = {'_removing stop-words': rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(texts):
        if i % 100 == 0: print '   ', str(i) + '/' + str(len(texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    print '(the networks had a total of', total_nodes, 'nodes)'
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(
        solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(description_texts):
        if i % 100 == 0:
            print '   ', str(i) + '/' + str(len(description_texts))
        g = graph_representation.construct_dependency_network(
            text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None  # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    print '(the networks had a total of', total_nodes, 'nodes)'
    results['retrieval'] = score

    if rem_stop_words:
        postfix = '_without'
    else:
        postfix = '_with'
    data.pickle_to_file(results,
                        'output/dependencies/stop_words_retr' + postfix)

    pp.pprint(results)
    return results
def stop_word_evaluation(rem_stop_words):
    """
    Experiment for determining what effect removing stop words have on
    dependency networks.
    """
    results = {'_removing stop-words':rem_stop_words}

    print '------ CLASSIFICATION EVALUATION --------'

    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric  = graph.GraphMetrics.CLOSENESS
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_classification(rep, labels)
    print '   score:', score
    print '(the networks had a total of',total_nodes,'nodes)'
    results['classification'] = score

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Creating representations..'
    rep = []
    total_nodes = 0
    for i, text in enumerate(description_texts):
        if i%100==0: print '   ',str(i)+'/'+str(len(description_texts))
        g = graph_representation.construct_dependency_network(text, remove_stop_words=rem_stop_words)
        total_nodes += len(g.nodes())
        metric = graph.GraphMetrics.EIGENVECTOR
        d = graph_representation.graph_to_dict(g, metric)
        rep.append(d)
        g = None # just to make sure..
    rep = graph_representation.dicts_to_vectors(rep)

    print '> Evaluating..'
    score = evaluation.evaluate_retrieval(rep, solution_vectors)
    print '   score:', score
    print '(the networks had a total of',total_nodes,'nodes)'
    results['retrieval'] = score

    if rem_stop_words:
        postfix = '_without'
    else:
        postfix = '_with'
    data.pickle_to_file(results, 'output/dependencies/stop_words_retr'+postfix)

    pp.pprint(results)
    return results
def evaluate_dep_type_sets():
    """
    Evaluation of various sets of dependency relations.

    Each set is excluded from the representation, and the performance recorded.
    The best strategy is to exclude those dependencies which removal lead to the
    greatest imporovement for the representation.
    """
    strategies = {
            'defensive': ['agent', 'advcl', 'parataxis'],
            'aggressive': ['agent', 'advcl', 'parataxis', 'dep', 'aux', 'ccomp', 'xcomp', 'dobj', 'pobj', 'nsubj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'preconj', 'advmod', 'neg', 'rcmod', 'tmod', 'poss', 'prepc'],
            'compromise_1': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc'],
            'compromise_2': ['agent', 'advcl', 'parataxis', 'aux', 'xcomp', 'pobj', 'nsubjpass', 'cc', 'abbrev', 'purpcl', 'predet', 'neg', 'tmod', 'poss', 'prepc', 'attr', 'csubj', 'csubjpass', 'number', 'possessive', 'punct', 'ref']
        }
    results = {'classification':{}, 'retrieval':{}}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric  = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None # just to make sure. I don't trust this damn garbage collector...
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_classification(rep[strategy], labels)
        print '  ', strategy, score
        results['classification'][strategy] = score

    data.pickle_to_file(results, 'output/dependencies/types_set_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for strategy in strategies:
        rep[strategy] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for strategy in strategies:
            g = graph_representation.construct_dependency_network(text, exclude=strategies[strategy])
            d = graph_representation.graph_to_dict(g, metric)
            rep[strategy].append(d)
            g = None # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for strategy in strategies:
        rep[strategy] = graph_representation.dicts_to_vectors(rep[strategy])
    print '> Evaluating..'
    for strategy in strategies:
        score = evaluation.evaluate_retrieval(rep[strategy], solution_vectors)
        print '  ', strategy, score
        results['retrieval'][strategy] = score

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_set_eval')

    return results
def evaluate_dep_types():
    """
    Leave-one-out evaluation of the various dependency types from the stanford parser.
    """
    exclude_list = ['dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp',
                    'attr', 'ccomp', 'xcomp', 'complm', 'dobj', 'iobj',
                    'pobj', 'mark', 'rel', 'nsubj', 'nsubjpass', 'csubj',
                    'csubjpass', 'cc', 'conj', 'expl', 'abbrev', 'amod',
                    'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
                    'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod',
                    'quantmod', 'tmod', 'nn', 'npadvmod', 'num', 'number',
                    'prep', 'poss', 'possessive', 'prt', 'parataxis',
                    'punct', 'ref', 'xsubj', 'pcomp', 'prepc']
    results = {'classification':[], 'retrieval':[]}

    print '------ CLASSIFICATION EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/tasa/TASA900_dependencies'
    texts, labels = data.read_files(descriptions_path)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric  = graph.GraphMetrics.CLOSENESS
    for i, text in enumerate(texts):
        if i%10==0: print '   ',str(i)+'/'+str(len(texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None # just to make sure..
        full_graph = None
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_classification(rep[exclude_label], labels)
        print '  ', exclude_label, score
        results['classification'].append(score)

    data.pickle_to_file(results, 'output/dependencies/types_eval_tmp')

    print '------ RETRIEVAL EVALUATION --------'
    print '> Reading cases..'
    descriptions_path = '../data/air/problem_descriptions_dependencies'
    description_texts, labels = data.read_files(descriptions_path)
    solutions_path = '../data/air/solutions_preprocessed'
    solution_texts, labels = data.read_files(solutions_path)
    solution_vectors = freq_representation.text_to_vector(solution_texts, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Creating representations..'
    rep = {}
    for exclude_label in exclude_list:
        rep[exclude_label] = []
    metric = graph.GraphMetrics.EIGENVECTOR
    for i, text in enumerate(description_texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(description_texts))
        full_graph = graph_representation.construct_dependency_network(text)
        for exclude_label in exclude_list:
            g = graph.reduce_edge_set(full_graph, exclude_label)
            d = graph_representation.graph_to_dict(g, metric)
            rep[exclude_label].append(d)
            g = None # just to make sure..
        full_graph = None
        #~ if i%100==0: data.pickle_to_file(rep, 'output/dependencies/types_eval_rep_'+str(i))
    for exclude_label in exclude_list:
        rep[exclude_label] = graph_representation.dicts_to_vectors(rep[exclude_label])
    print '> Evaluating..'
    for exclude_label in exclude_list:
        score = evaluation.evaluate_retrieval(rep[exclude_label], solution_vectors)
        print '  ', exclude_label, score
        results['retrieval'].append(score)

    pp.pprint(results)
    data.pickle_to_file(results, 'output/dependencies/types_eval')

    return results