Example #1
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(
            training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicts -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(
            training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
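The key-union loop above fixes one shared vocabulary so that training and test vectors are aligned. As a rough illustration only, a dicts_to_vectors helper along the following lines would fit the calls in this example; this is a minimal sketch under that assumption, not the actual graph_representation implementation:

import numpy

def dicts_to_vectors(dicts, keys=None):
    # Derive the vocabulary from the dicts themselves if none is supplied,
    # mirroring the call without explicit keys seen in later examples.
    if keys is None:
        keys = set()
        for d in dicts:
            keys = keys.union(d.keys())
    keys = list(keys)  # fix an ordering so all vectors share the same axes
    vectors = numpy.zeros((len(dicts), len(keys)))
    for i, d in enumerate(dicts):
        for j, key in enumerate(keys):
            vectors[i, j] = d.get(key, 0.0)
    return vectors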
Example #2
def plot_classification_evaluations_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [
        [0.5678, 0.5455],  # .., -2
        [0.5694, 0.5333],
        [0.5889, 0.5056]
    ]
    ys = {
        .3: '0.0',
        .35: '...',
        .4: '0.4',
        .5: '0.5',
        .6: '0.6',
        .7: '0.7',
        .8: '0.8'
    }
    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                yscale=3,
                                color='black',
                                legend=legend,
                                legend_sep=0.6,
                                low_cut=0.3,
                                y_tics=ys,
                                tick=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/eval_classification.tex',
        mode='w')
Example #3
def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print '    dicts -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print '    vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training':training_rep, 'test':test_rep}
        labels = {'training':training_labels, 'test':test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Example #4
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    `use_icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    def make_dicts(docs, icc=None):
        rep = []
        for i, doc in enumerate(docs):
            if i%100==0: print '    graph',str(i)+'/'+str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'}
    gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network,
                'dependency':graph_representation.construct_dependency_network}
    metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE,
                'dependency':graph.GraphMetrics.EIGENVECTOR}

    print '--', graph_type
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions'+postfix[graph_type]
    docs, labels = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/'+dataset+'/solutions_preprocessed'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print '> Calculating ICC..'
        m = metrics[graph_type].split()[0]
        print graph_type
        if graph_type == 'co-occurrence':
            p = 'output/centralities/co-occurrence/'+dataset+'/problem_descriptions/window/'+m+'.cent'
        elif graph_type == 'dependency':
            p = 'output/centralities/dependency/'+dataset+'/problem_descriptions/'+m+'.cent'
        print '    fetching', p
        icc = data.pickle_from_file(p)
        print '    icc:', type(icc)

    print '> Creating problem description representations..'
    dicts = make_dicts(docs, icc)
    descriptions_rep = graph_representation.dicts_to_vectors(dicts)#, remove_stop_words=True)

    print '> Evaluating..'
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print results
    s = 'retrieval comparison '
    if use_icc: s += 'USING TC-ICC'
    s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Example #5
def plot_context_evaluation():
    options = {
        'Degree (window)': 'mark=*,blue',
        'PageRank (window)': 'mark=*,red',
        'Degree (sentence)': 'dashed,blue',
        'PageRank (sentence)': 'dashed,red'
    }
    retr_results = {
        'Degree (window)': [
            0.22290305491606582, 0.2239404496699994, 0.22351183191703122,
            0.22293583927185456, 0.2216027852882311, 0.22232860216650002,
            0.22230162622918934, 0.22287683186704185, 0.22266252053221772,
            0.22237418794670616
        ],
        'PageRank (window)': [
            0.21772129149181993, 0.21884861149427587, 0.22063142971295358,
            0.21893898241891538, 0.21973766615441442, 0.22054672890564322,
            0.22099589130745473, 0.22129686184085004, 0.22148942934157456,
            0.22147928890310792
        ],
        'PageRank (sentence)': [0.22056586008664569] * 10,
        'Degree (sentence)': [0.21784622825075944] * 10
    }
    fig = plotter.tikz_plot(retr_results,
                            options,
                            xlabel='Context size',
                            ylabel='Performance',
                            legend=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/co-occ_context_eval_retr.tex',
        mode='w')

    class_results = {
        'Degree (window)': [
            0.52777777777777779, 0.53333333333333333, 0.53611111111111109,
            0.53333333333333333, 0.53888888888888886, 0.54166666666666663,
            0.53611111111111109, 0.52777777777777779, 0.53055555555555556,
            0.53055555555555556
        ],
        'PageRank (window)': [
            0.55833333333333335, 0.55000000000000004, 0.55277777777777781,
            0.54166666666666663, 0.5444444444444444, 0.54722222222222228,
            0.54722222222222228, 0.53888888888888886, 0.53888888888888886,
            0.53611111111111109
        ],
        'Degree (sentence)': [0.57499999999999996] * 10,
        'PageRank (sentence)': [0.56666666666666665] * 10
    }
    fig = plotter.tikz_plot(class_results,
                            options,
                            xlabel='Context size',
                            ylabel='Performance',
                            legend=True)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/co-occ_context_eval_class.tex',
        mode='w')
Example #6
def plot_context_evaluation():
    options = {
        'Degree (window)': 'mark=*,blue',
        'PageRank (window)': 'mark=*,red',
        'Degree (sentence)': 'dashed,blue',
        'PageRank (sentence)': 'dashed,red'}
    retr_results = {
        'Degree (window)': [0.22290305491606582,
                            0.2239404496699994,
                            0.22351183191703122,
                            0.22293583927185456,
                            0.2216027852882311,
                            0.22232860216650002,
                            0.22230162622918934,
                            0.22287683186704185,
                            0.22266252053221772,
                            0.22237418794670616],
        'PageRank (window)': [0.21772129149181993,
                            0.21884861149427587,
                            0.22063142971295358,
                            0.21893898241891538,
                            0.21973766615441442,
                            0.22054672890564322,
                            0.22099589130745473,
                            0.22129686184085004,
                            0.22148942934157456,
                            0.22147928890310792],
        'PageRank (sentence)': [0.22056586008664569]*10,
        'Degree (sentence)': [0.21784622825075944]*10}
    fig = plotter.tikz_plot(retr_results, options, xlabel='Context size', ylabel='Performance', legend=False)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/co-occ_context_eval_retr.tex',mode='w')

    class_results = {
        'Degree (window)': [0.52777777777777779,
                           0.53333333333333333,
                           0.53611111111111109,
                           0.53333333333333333,
                           0.53888888888888886,
                           0.54166666666666663,
                           0.53611111111111109,
                           0.52777777777777779,
                           0.53055555555555556,
                           0.53055555555555556],
        'PageRank (window)': [0.55833333333333335,
                              0.55000000000000004,
                              0.55277777777777781,
                              0.54166666666666663,
                              0.5444444444444444,
                              0.54722222222222228,
                              0.54722222222222228,
                              0.53888888888888886,
                              0.53888888888888886,
                              0.53611111111111109],
        'Degree (sentence)':[0.57499999999999996]*10,
        'PageRank (sentence)':[0.56666666666666665]*10}
    fig = plotter.tikz_plot(class_results, options, xlabel='Context size', ylabel='Performance', legend=True)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/co-occ_context_eval_class.tex',mode='w')
Example #7
def plot_classification_evaluations_experiment():
    import data
    labels = ['Frequency','Co-occurrence','Dependency']
    legend = ['local','global']
    d = [[0.5678,0.5455], # .., -2
         [0.5694,0.5333],
         [0.5889,0.5056]]
    ys = {.3:'0.0',.35:'...',.4:'0.4',.5:'0.5',.6:'0.6',.7:'0.7',.8:'0.8'}
    fig = plotter.tikz_barchart(d, labels, scale = 3.5, yscale=3, color='black', legend=legend, legend_sep=0.6, low_cut=0.3, y_tics=ys, tick=False)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/eval_classification.tex',mode='w')
Example #8
def plot_retrieval_evaluations_experiment():
    import data
    labels = ['Frequency','Co-occurrence','Dependency']
    legend = ['local','global']
    d = [[0.2240,0.2459],
         [0.2227,0.2559],
         [0.2020,0.2048]]
    ys = {0:'0.0', .1:'0.1', .2:'0.2', .3:'0.3', .4:'0.4'}
    fig = plotter.tikz_barchart(d, labels, scale = 3.5, yscale=3, color='black', legend=None, y_tics=ys, tick=False)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/eval_retrieval.tex',mode='w')
Example #9
def plot_classification_comparison_experiment():
    import data
    labels = ['Frequency','Co-occurrence','Dependency']
    legend = ['local','global']
    d = [[0.6693,0.6375],
         [0.6880,0.6875],
         [0.6827,0.6763]]
    ys = {.4:'0.0',.45:'...',.5:'0.5',.6:'0.6',.7:'0.7',.8:'0.8'}
    fig = plotter.tikz_barchart(d, labels, scale = 3.5, yscale=3, color='black', legend=legend, legend_sep=0.6, low_cut=0.4, y_tics=ys, tick=False)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/comp_classification.tex',mode='w')
Example #10
def process_delete(isbn, title):
    # step 1: put all books from the CSV into a list
    all_books = read_books()
    # step 2: find the book to be deleted
    book_to_delete = find_book_generic(isbn)
    # step 3: find that book in the all_books list & delete it
    for index in range(len(all_books)):
        if book_to_delete['isbn'] == all_books[index]['isbn']:
            del all_books[index]
            break
    # step 4: write the list back to the file
    write_to_file(all_books)
    return redirect(url_for('read_book'))
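The find_book_generic helper is assumed by this example rather than shown. A minimal sketch consistent with how it is used here (books as dicts with an 'isbn' key, loaded via read_books) could look like the following; the real helper may also match on other fields:

def find_book_generic(isbn):
    # Scan the CSV-backed book list for the first entry with a matching ISBN.
    for book in read_books():
        if book['isbn'] == isbn:
            return book
    return None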
Example #11
def plot_retrieval_evaluations_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [[0.2240, 0.2459], [0.2227, 0.2559], [0.2020, 0.2048]]
    ys = {0: '0.0', .1: '0.1', .2: '0.2', .3: '0.3', .4: '0.4'}
    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                yscale=3,
                                color='black',
                                legend=None,
                                y_tics=ys,
                                tick=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/eval_retrieval.tex',
        mode='w')
Example #12
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file('MemoryError while calculating '+metric+' on '+corpus+':\n'+str(e)+'\n\n', 'output/log/errors')
Example #13
def plot_classification_comparison_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [[0.6693, 0.6375], [0.6880, 0.6875], [0.6827, 0.6763]]
    ys = {.4: '0.0', .45: '...', .5: '0.5', .6: '0.6', .7: '0.7', .8: '0.8'}
    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                yscale=3,
                                color='black',
                                legend=legend,
                                legend_sep=0.6,
                                low_cut=0.4,
                                y_tics=ys,
                                tick=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/comp_classification.tex',
        mode='w')
Example #14
def run_base_generation(candidates, bases):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        candidates = candidates
        noise_swaps = int(candidates / 4)
        directory = "cluster_" + str(bases)
        preference_order = model.Preference([x for x in range(0, candidates)])
        random_order = preference_order.generate_random_preference_order()
        profile = data.replicate_preference_order(random_order, int(voters / bases))
        for base in range(bases - 1):
            random_order = preference_order.generate_random_preference_order()
            profile = profile + data.replicate_preference_order(random_order, int(voters / bases))
        data.apply_noise(profile, noise_swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, noise_swaps) + ".txt"),
            profile, cost_vector)
Example #15
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file(
                'MemoryError while calculating ' + metric + ' on ' + corpus +
                ':\n' + str(e) + '\n\n', 'output/log/errors')
Example #16
def run_similar_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        candidates = candidates
        bases = 2
        similar_swaps = int(candidates / 2)
        noise_swaps = int(candidates / 4)
        directory = "cluster_2_similar"
        preference_order = model.Preference([x for x in range(0, candidates)])
        first_order = preference_order.generate_random_preference_order()
        profile = get_similar_profile(bases, first_order, similar_swaps, noise_swaps, voters)

        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(os.path.join(directory,
                                        "{}_v{}:c{}:b{}:s{}"
                                        .format(i, voters, candidates, bases, noise_swaps) + ".txt"),
                           profile,
                           cost_vector)
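These generator functions all start from create_cost_distribution(candidates, 0), which is not shown in this listing. Going by the '--cost' help text in the argparse example further below ('0 = Normal distribution with mean=100, and std=15'), a plausible sketch is the following; clamping costs to a positive integer is an added assumption:

import random

def create_cost_distribution(candidates, distribution):
    # Distribution 0: independent normal costs with mean 100 and std 15,
    # clamped to a positive integer per candidate (assumption).
    if distribution == 0:
        return [max(1, int(round(random.gauss(100, 15)))) for _ in range(candidates)]
    raise ValueError('unknown cost distribution: %d' % distribution)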
Example #17
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/'+dataset+'/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:'+dataset+' \nresult:\n'+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Example #18
def run_polar_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        candidates = candidates
        bases = 2
        noise_swaps = int(candidates / 4)
        directory = "cluster_2_polar"
        preference_order = model.Preference([x for x in range(0, candidates)])
        first_order = preference_order.generate_random_preference_order()
        second_order = first_order.reverse_preference_order()
        first_profile = data.replicate_preference_order(first_order, int(voters / bases))
        second_profile = data.replicate_preference_order(second_order, int(voters / bases))
        profile = first_profile + second_profile
        data.apply_noise(profile, noise_swaps, 1)

        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(os.path.join(directory,
                                        "{}_v{}:c{}:b{}:s{}"
                                        .format(i, voters, candidates, bases, noise_swaps) + ".txt"),
                           profile,
                           cost_vector)
Example #19
def run_random_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        candidates = candidates
        bases = voters
        swaps = 0
        directory = "random"
        preference_order = model.Preference([x for x in range(0, candidates)])
        list_of_preferences = []
        for x in range(voters):
            list_of_preferences.append(preference_order.generate_random_preference_order())
        profile = model.Profile(number_of_candidates=candidates,
                                number_of_voters=voters,
                                preference_list=list_of_preferences)
        data.apply_noise(profile, swaps, 1)

        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(os.path.join(directory,
                                        "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, swaps) + ".txt"),
                           profile,
                           cost_vector)
Example #20
def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print '   ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score
    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:' + dataset + ' \nresult:\n' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Example #21
def plot_exp1():
    """
    Plotting the results of the weight evaluation experiment.
    """
    legend = ['unweighted', 'weighted']
    labels = ['Degree','Closeness','Current-flow closeness','Betweenness','Current-flow betweenness','Load','Eigenvector','PageRank','HITS authorities','HITS hubs']

    # classification
    d = [[0.52500000000000002,0.49444444444444446], # Degree
        [0.57499999999999996,0.57499999999999996], # Closeness
        [0.56944444444444442,0.58333333333333337], # Current-flow closeness
        [0.36388888888888887,0.36944444444444446], # Betweenness
        [0.23333333333333334,0.20833333333333334], # Current-flow betweenness
        [0.35555555555555557,0.36666666666666664], # Load
        [0.49722222222222223,0.45555555555555555], # Eigenvector
        [0.52777777777777779,0.51111111111111107], # PageRank
        [0.49722222222222223,0.45555555555555555], # HITS authorities
        [0.49722222222222223,0.45555555555555555]] # HITS hubs
    ys = {0:'0.0',.1:'0.1',.2:'0.2', .3:'0.3',.4:'0.4',.5:'0.5',.6:'0.6'}
    fig = plotter.tikz_barchart(d, labels, scale = 3.5, yscale=2.8, color='black', legend=legend, legend_sep=1.0, tick=False, y_tics=ys)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/dependency_eval_class.tex',mode='w')

    # retrieval
    d = [[0.18149811054435275,0.18821229318222113], # Degree
        [0.17184314735361236,0.18216618328598347], # Closeness
        [0.14606637651984622,0.13586098100141117], # Betweenness
        [0.17399729543537901,0.17613717518129621], # Current-flow closeness
        [0.042019078720146409,0.042019078720146409], # Current-flow betweenness
        [0.14700372822743263,0.15104493506838745], # Load
        [0.19854658693196564,0.17540014008712554], # Eigenvector
        [0.17725358882165362,0.17252331100724849], # PageRank
        [0.19854658693196564,0.17540014008712554], # HITS authorities
        [0.19854658693196564,0.17540014008712554]] # HITS hubs
    ys = {0:'0.0',.05:'0.05', .1:'0.1',.15:'0.15', .2:'0.2'}
    fig = plotter.tikz_barchart(d, labels, scale = 3.5, yscale=8, color='black', legend=legend, legend_sep=1.0, tick=False, grid_step=0.05, y_tics=ys)
    data.write_to_file(fig,'../../masteroppgave/report/imgs/tikz/dependency_eval_retr.tex',mode='w')
Example #22
def run_fraction_generation_1(candidates, fractions):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        candidates = candidates
        noise_swaps = int(candidates / 4)
        population = [int(voters * fraction) for fraction in fractions]
        fraction_string = "_".join([str(fraction) for fraction in fractions])
        directory = "fraction_" + fraction_string
        preference_order = model.Preference([x for x in range(0, candidates)])

        random_order = preference_order.generate_random_preference_order()
        profile = (data.replicate_preference_order(random_order, population[0]))
        for fraction in population[1:]:
            random_order = preference_order.generate_random_preference_order()
            new_profile = data.replicate_preference_order(random_order, fraction)
            profile = profile + new_profile
        data.apply_noise(profile, noise_swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, len(fractions), noise_swaps) + ".txt"),
            profile, cost_vector)
Example #23
def plot_centrality_evaluations():
    import data
    labels = ['~~~~~Degree','Closeness','Current-flow closeness','Betweenness','Current-flow betweenness','Load','Eigenvector','PageRank','HITS Authorities','HITS Hubs']

    d = [
            [0.5694444444444444,0.5333333333333333],#[0.5555555555555556,0.5333333333333333],
            [0.525,0.5166666666666667],
            [0.5194444444444445,0.5111111111111111],
            [0.4361111111111111,0.43333333333333335],
            [0.42777777777777776,0.4187],#[0.42777777777777776,0.05],
            [0.4361111111111111,0.4222222222222222],
            [0.5183333333333333,0.5055555555555555],
            [0.5573333333333333,0.5433333333333333],
            [0.5083333333333333,0.5083333333333333],
            [0.5083333333333333,0.5083333333333333]]
    ys = {.3:'0.0',.35:'...',.4:'0.4',.5:'0.5',.6:'0.6',.7:'0.7',.8:'0.8'}
    fig = plotter.tikz_barchart(d, None, scale = 3.5, yscale=3, color='black', legend = ['TC','TC-ICC'], legend_sep=0.6, low_cut=0.3, y_tics=ys, tick=False)
    data.write_to_file(fig,'../../masteroppgave/paper/parts/tikz_bar_co-occurrence.tex',mode='w')

    d = [
            [0.52500000000000002,0.5028],
            [0.58894242452424244,0.5056],#[0.57499999999999996,0.5056],
            [0.56944444444444442,0.5028],
            [0.36388888888888887,0.3806],
            [0.23333333333333334,0.2263],#[0.23333333333333334,0.05],
            [0.35555555555555557,0.3778],
            [0.49722222222222223,0.4667],
            [0.52777777777777779,0.4833],
            [0.49722222222222223,0.4611],
            [0.49722222222222223,0.4611]]
    ys = {.0:'0.0',.1:'',.2:'0.2',.3:'',.4:'0.4',.5:'',.6:'0.6',.7:'',.8:'0.8'}
    fig = plotter.tikz_barchart(d, None, scale = 3.5, yscale=1.6, color='black', y_tics=ys, tick=False)
    data.write_to_file(fig,'../../masteroppgave/paper/parts/tikz_bar_dependency.tex',mode='w')

    fig = plotter.tikz_barchart(d, labels, scale = 3.5, color='black', labels_only=True)
    data.write_to_file(fig,'../../masteroppgave/paper/parts/tikz_bar_labels.tex',mode='w')
Example #24
def main():
    from argparse import RawTextHelpFormatter
    parser = argparse.ArgumentParser(description='Computes the winner of given profile',
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('--preferences', help='Path to a preferences file\n'
                                              'If "--write" is set, the generated profile is written there; otherwise preferences are read from it.')
    parser.add_argument('--write', action='store_true', help="Set if the generated profile should be saved to a file")
    parser.add_argument('--generate', action='store_true', help="Run generate-profiles code block")
    parser.add_argument('--read', action='store_true', help="Run read-profiles code block")
    parser.add_argument('--cost', type=int, default=1, help='The cost distribution to use over candidates\n'
                                                            '0 = Normal distribution with mean=100, and std=15\n')
    parser.add_argument('--rule', type=int, default=0, help='The rule to decide the winner\n'
                                                            '0 = budget-plurality\n'
                                                            '1 = budget-borda\n'
                                                            '2 = copeland\n'
                                                            '3 = knapsack\n'
                                                            '4 = theta rule\n')
    parser.add_argument('--axiom', type=int, default=0, help='The axiom to check the rule against\n'
                                                             '0 = Unanimity\n'
                                                             '1 = Committee Monotonicity\n'
                                                             '2 = Theta Minority\n'
                                                             '3 = Regret\n'
                                                             '4 = Copeland Axiom\n'
                                                             '5 = Gini Coefficient\n')
    parser.add_argument('--budget', type=int, default=10, help='The total budget to be used')
    parser.add_argument('--voters', type=int, default=10, help='The number of voters')
    parser.add_argument('--candidates', type=int, default=10, help='The number of candidates')
    parser.add_argument('--base', type=int, default=3, help='The number of base preference orders')
    parser.add_argument('--swaps', type=int, default=1, help='The number of swaps to do for each preference order')
    parser.add_argument('--noise', type=int, default=2, help='The noise parameter')
    # group = parser.add_mutually_exclusive_group()
    # group.add_argument('-v', '--verbose', action='store_true')
    # group.add_argument('-q', '--quiet', action='store_true')
    args = parser.parse_args()

    if args.generate:
        run_fraction_generation_1(10, [0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
        run_fraction_generation_1(20, [0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
        return

    if args.read:
        read_data_set("fraction_0.8_0.05_0.05_0.05_0.05", 10)
        read_data_set("fraction_0.8_0.05_0.05_0.05_0.05", 20)
        return

    cost_vector = create_cost_distribution(args.candidates, 0)
    if not args.write:
        profile = data.read_from_file(args.preferences)
        rule = initialize_rule(args.rule)
        winner_set = rule.get_winners(profile, args.budget, cost_vector)
        axiom = initialize_axiom(args.axiom)
        satisfied = axiom.is_satisfied(rule, profile, args.budget, cost_vector)
        if satisfied:
            print(rule.name + " satisfies " + axiom.name)
            if axiom.has_value():
                print("value: " + str(axiom.get_value()))
        else:
            print(rule.name + " does not satisfy " + axiom.name)
            #        total_cost = 0
            #        for winner in winner_set:
            #            print(str(winner) + " " + str(cost_vector[winner]))
            #           total_cost += cost_vector[winner]
            #        print(" ".join([str(total_cost), str(args.budget)]))

    else:
        profile = data.create_noisy_data(args.voters, args.candidates, args.base, args.swaps, args.noise)
        data.write_to_file(args.preferences, profile, cost_vector)
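Assuming this module is saved as main.py (a hypothetical name), the argparse setup above supports invocations such as:

# Generate a noisy profile and write it, together with its cost vector, to a file:
#     python main.py --write --preferences prefs.txt --voters 100 --candidates 10
# Read the profile back and test whether budget-borda (rule 1) satisfies Unanimity (axiom 0):
#     python main.py --preferences prefs.txt --rule 1 --axiom 0 --budget 50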
Example #25
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        rep = []
        for i, doc in enumerate(docs):
            if i%100==0: print '    graph',str(i)+'/'+str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'}
    gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network,
                'dependency':graph_representation.construct_dependency_network}
    metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE,
                'dependency':graph.GraphMetrics.CLOSENESS}

    print '--', graph_type
    print '> Reading data..', dataset
    training_path = '../data/'+dataset+'/training'+postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/'+dataset+'/test'+postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type == 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(dataset+'/training', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(dataset+'/training', metrics[graph_type])

        if graph_type == 'co-occurrence':
            icc_test = co_occurrence_experiments.retrieve_centralities(dataset+'/test', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_test = dependency_experiments.retrieve_centralities(dataset+'/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print '    dicts -> vectors'
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print '    vocabulary size:', len(keys)

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training':training_rep, 'test':test_rep}
    labels = {'training':training_labels, 'test':test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results
    s = 'classification comparison '
    if icc: s += 'USING TC-ICC'
    s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Example #26
def retrieval_comparison_graph(dataset='air',
                               graph_type='co-occurrence',
                               use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    `use_icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    def make_dicts(docs, icc=None):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0: print '    graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network
    }
    metrics = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.EIGENVECTOR
    }

    print '--', graph_type
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions' + postfix[graph_type]
    docs, labels = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print '> Calculating ICC..'
        m = metrics[graph_type].split()[0]
        print graph_type
        if graph_type == 'co-occurrence':
            p = 'output/centralities/co-occurrence/' + dataset + '/problem_descriptions/window/' + m + '.cent'
        elif graph_type == 'dependency':
            p = 'output/centralities/dependency/' + dataset + '/problem_descriptions/' + m + '.cent'
        print '    fetching', p
        icc = data.pickle_from_file(p)
        print '    icc:', type(icc)

    print '> Creating problem description representations..'
    dicts = make_dicts(docs, icc)
    descriptions_rep = graph_representation.dicts_to_vectors(
        dicts)  #, remove_stop_words=True)

    print '> Evaluating..'
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print results
    s = 'retrieval comparison '
    if use_icc: s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
Example #27
def test_write(self):
    import os
    file_name = "test_out.txt"
    # 'profile' is assumed to be defined elsewhere in the test module.
    data.write_to_file(file_name, profile)
    os.remove(file_name)
Example #28
def classification_comparison_graph(dataset='reuters',
                                    graph_type='co-occurrence',
                                    icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0: print '    graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network
    }
    metrics = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.CLOSENESS
    }

    print '--', graph_type
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training' + postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test' + postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type == 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(
                dataset + '/training', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(
                dataset + '/training', metrics[graph_type])

        if graph_type == 'co-occurrence':
            icc_test = co_occurrence_experiments.retrieve_centralities(
                dataset + '/test', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_test = dependency_experiments.retrieve_centralities(
                dataset + '/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print '    dicts -> vectors'
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print '    vocabulary size:', len(keys)

    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training': training_rep, 'test': test_rep}
    labels = {'training': training_labels, 'test': test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results
    s = 'classification comparison '
    if icc: s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results
Example #29
def plot_centrality_evaluations():
    import data
    labels = [
        '~~~~~Degree', 'Closeness', 'Current-flow closeness', 'Betweenness',
        'Current-flow betweenness', 'Load', 'Eigenvector', 'PageRank',
        'HITS Authorities', 'HITS Hubs'
    ]

    d = [
        [0.5694444444444444,
         0.5333333333333333],  #[0.5555555555555556,0.5333333333333333],
        [0.525, 0.5166666666666667],
        [0.5194444444444445, 0.5111111111111111],
        [0.4361111111111111, 0.43333333333333335],
        [0.42777777777777776, 0.4187],  #[0.42777777777777776,0.05],
        [0.4361111111111111, 0.4222222222222222],
        [0.5183333333333333, 0.5055555555555555],
        [0.5573333333333333, 0.5433333333333333],
        [0.5083333333333333, 0.5083333333333333],
        [0.5083333333333333, 0.5083333333333333]
    ]
    ys = {
        .3: '0.0',
        .35: '...',
        .4: '0.4',
        .5: '0.5',
        .6: '0.6',
        .7: '0.7',
        .8: '0.8'
    }
    fig = plotter.tikz_barchart(d,
                                None,
                                scale=3.5,
                                yscale=3,
                                color='black',
                                legend=['TC', 'TC-ICC'],
                                legend_sep=0.6,
                                low_cut=0.3,
                                y_tics=ys,
                                tick=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/paper/parts/tikz_bar_co-occurrence.tex',
        mode='w')

    d = [
        [0.52500000000000002, 0.5028],
        [0.58894242452424244, 0.5056],  #[0.57499999999999996,0.5056],
        [0.56944444444444442, 0.5028],
        [0.36388888888888887, 0.3806],
        [0.23333333333333334, 0.2263],  #[0.23333333333333334,0.05],
        [0.35555555555555557, 0.3778],
        [0.49722222222222223, 0.4667],
        [0.52777777777777779, 0.4833],
        [0.49722222222222223, 0.4611],
        [0.49722222222222223, 0.4611]
    ]
    ys = {
        .0: '0.0',
        .1: '',
        .2: '0.2',
        .3: '',
        .4: '0.4',
        .5: '',
        .6: '0.6',
        .7: '',
        .8: '0.8'
    }
    fig = plotter.tikz_barchart(d,
                                None,
                                scale=3.5,
                                yscale=1.6,
                                color='black',
                                y_tics=ys,
                                tick=False)
    data.write_to_file(
        fig,
        '../../masteroppgave/paper/parts/tikz_bar_dependency.tex',
        mode='w')

    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                color='black',
                                labels_only=True)
    data.write_to_file(fig,
                       '../../masteroppgave/paper/parts/tikz_bar_labels.tex',
                       mode='w')
Example #30
def plot_exp1():
    """
    Plotting the results of the weight evaluation experiment.
    """
    legend = ['unweighted', 'weighted']
    labels = [
        'Degree', 'Closeness', 'Current-flow closeness', 'Betweenness',
        'Current-flow betweenness', 'Load', 'Eigenvector', 'PageRank',
        'HITS authorities', 'HITS hubs'
    ]

    # classification
    d = [
        [0.52500000000000002, 0.49444444444444446],  # Degree
        [0.57499999999999996, 0.57499999999999996],  # Closeness
        [0.56944444444444442, 0.58333333333333337],  # Current-flow closeness
        [0.36388888888888887, 0.36944444444444446],  # Betweenness
        [0.23333333333333334, 0.20833333333333334],  # Current-flow betweenness
        [0.35555555555555557, 0.36666666666666664],  # Load
        [0.49722222222222223, 0.45555555555555555],  # Eigenvector
        [0.52777777777777779, 0.51111111111111107],  # PageRank
        [0.49722222222222223, 0.45555555555555555],  # HITS authorities
        [0.49722222222222223, 0.45555555555555555]
    ]  # HITS hubs
    ys = {
        0: '0.0',
        .1: '0.1',
        .2: '0.2',
        .3: '0.3',
        .4: '0.4',
        .5: '0.5',
        .6: '0.6'
    }
    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                yscale=2.8,
                                color='black',
                                legend=legend,
                                legend_sep=1.0,
                                tick=False,
                                y_tics=ys)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/dependency_eval_class.tex',
        mode='w')

    # retrieval
    d = [
        [0.18149811054435275, 0.18821229318222113],  # Degree
        [0.17184314735361236, 0.18216618328598347],  # Closeness
        [0.14606637651984622, 0.13586098100141117],  # Betweenness
        [0.17399729543537901, 0.17613717518129621],  # Current-flow closeness
        [0.042019078720146409,
         0.042019078720146409],  # Current-flow betweenness
        [0.14700372822743263, 0.15104493506838745],  # Load
        [0.19854658693196564, 0.17540014008712554],  # Eigenvector
        [0.17725358882165362, 0.17252331100724849],  # PageRank
        [0.19854658693196564, 0.17540014008712554],  # HITS authorities
        [0.19854658693196564, 0.17540014008712554]
    ]  # HITS hubs
    ys = {0: '0.0', .05: '0.05', .1: '0.1', .15: '0.15', .2: '0.2'}
    fig = plotter.tikz_barchart(d,
                                labels,
                                scale=3.5,
                                yscale=8,
                                color='black',
                                legend=legend,
                                legend_sep=1.0,
                                tick=False,
                                grid_step=0.05,
                                y_tics=ys)
    data.write_to_file(
        fig,
        '../../masteroppgave/report/imgs/tikz/dependency_eval_retr.tex',
        mode='w')