def classification_comparison_freq(dataset='reuters'):
    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training_preprocessed'
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test_preprocessed'
    test_docs, test_labels = data.read_files(test_path)

    results = {}
    for metric in freq_representation.get_metrics():
        print ' ', metric,
        training_dicts = freq_representation.text_to_dict(training_docs, metric)
        test_dicts = freq_representation.text_to_dict(test_docs, metric)
        print ' dicts -> vectors'
        keys = set()
        for d in training_dicts + test_dicts:
            keys = keys.union(d.keys())
        print ' vocabulary size:', len(keys)
        training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
        test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)
        reps = {'training': training_rep, 'test': test_rep}
        labels = {'training': training_labels, 'test': test_labels}
        score = evaluation.evaluate_classification(reps, labels, mode='split')
        results[metric] = score
        print score

    pp.pprint(results)
    s = 'classification comparison \nrepresentation: frequency\nresult:\n' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results

def plot_classification_evaluations_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [[0.5678, 0.5455],  # .., -2
         [0.5694, 0.5333],
         [0.5889, 0.5056]]
    ys = {.3: '0.0', .35: '...', .4: '0.4', .5: '0.5', .6: '0.6', .7: '0.7', .8: '0.8'}
    fig = plotter.tikz_barchart(d, labels, scale=3.5, yscale=3, color='black',
                                legend=legend, legend_sep=0.6, low_cut=0.3, y_tics=ys, tick=False)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/eval_classification.tex', mode='w')

def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    `use_icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    def make_dicts(docs, icc=None):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0:
                print ' graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {'co-occurrence': graph_representation.construct_cooccurrence_network,
             'dependency': graph_representation.construct_dependency_network}
    metrics = {'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
               'dependency': graph.GraphMetrics.EIGENVECTOR}

    print '--', graph_type

    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions' + postfix[graph_type]
    docs, labels = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print '> Calculating ICC..'
        m = metrics[graph_type].split()[0]
        print graph_type
        if graph_type == 'co-occurrence':
            p = 'output/centralities/co-occurrence/' + dataset + '/problem_descriptions/window/' + m + '.cent'
        elif graph_type == 'dependency':
            p = 'output/centralities/dependency/' + dataset + '/problem_descriptions/' + m + '.cent'
        print ' fetching', p
        icc = data.pickle_from_file(p)
        print ' icc:', type(icc)

    print '> Creating problem description representations..'
    dicts = make_dicts(docs, icc)
    descriptions_rep = graph_representation.dicts_to_vectors(dicts)  # , remove_stop_words=True)

    print '> Evaluating..'
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print results

    s = 'retrieval comparison '
    if use_icc:
        s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results

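# A minimal usage sketch for the retrieval comparison above. It assumes it runs in the
# same module as retrieval_comparison_graph and that the relative '../data/<dataset>/...'
# layout and the pickled centrality files under output/centralities/ exist; the dataset
# name 'air' comes from the function default, the loop itself is illustrative only.
def run_retrieval_comparison_sketch(dataset='air'):
    all_results = {}
    for graph_type in ('co-occurrence', 'dependency'):
        # plain TC vectors
        all_results[(graph_type, 'tc')] = retrieval_comparison_graph(dataset, graph_type, use_icc=False)
        # TC-ICC vectors (requires the precomputed *.cent files)
        all_results[(graph_type, 'tc-icc')] = retrieval_comparison_graph(dataset, graph_type, use_icc=True)
    return all_results
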
def plot_context_evaluation():
    options = {
        'Degree (window)': 'mark=*,blue',
        'PageRank (window)': 'mark=*,red',
        'Degree (sentence)': 'dashed,blue',
        'PageRank (sentence)': 'dashed,red'}

    retr_results = {
        'Degree (window)': [0.22290305491606582, 0.2239404496699994, 0.22351183191703122,
                            0.22293583927185456, 0.2216027852882311, 0.22232860216650002,
                            0.22230162622918934, 0.22287683186704185, 0.22266252053221772,
                            0.22237418794670616],
        'PageRank (window)': [0.21772129149181993, 0.21884861149427587, 0.22063142971295358,
                              0.21893898241891538, 0.21973766615441442, 0.22054672890564322,
                              0.22099589130745473, 0.22129686184085004, 0.22148942934157456,
                              0.22147928890310792],
        'PageRank (sentence)': [0.22056586008664569] * 10,
        'Degree (sentence)': [0.21784622825075944] * 10}
    fig = plotter.tikz_plot(retr_results, options, xlabel='Context size', ylabel='Performance', legend=False)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/co-occ_context_eval_retr.tex', mode='w')

    class_results = {
        'Degree (window)': [0.52777777777777779, 0.53333333333333333, 0.53611111111111109,
                            0.53333333333333333, 0.53888888888888886, 0.54166666666666663,
                            0.53611111111111109, 0.52777777777777779, 0.53055555555555556,
                            0.53055555555555556],
        'PageRank (window)': [0.55833333333333335, 0.55000000000000004, 0.55277777777777781,
                              0.54166666666666663, 0.5444444444444444, 0.54722222222222228,
                              0.54722222222222228, 0.53888888888888886, 0.53888888888888886,
                              0.53611111111111109],
        'Degree (sentence)': [0.57499999999999996] * 10,
        'PageRank (sentence)': [0.56666666666666665] * 10}
    fig = plotter.tikz_plot(class_results, options, xlabel='Context size', ylabel='Performance', legend=True)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/co-occ_context_eval_class.tex', mode='w')

def plot_retrieval_evaluations_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [[0.2240, 0.2459],
         [0.2227, 0.2559],
         [0.2020, 0.2048]]
    ys = {0: '0.0', .1: '0.1', .2: '0.2', .3: '0.3', .4: '0.4'}
    fig = plotter.tikz_barchart(d, labels, scale=3.5, yscale=3, color='black',
                                legend=None, y_tics=ys, tick=False)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/eval_retrieval.tex', mode='w')

def plot_classification_comparison_experiment():
    import data
    labels = ['Frequency', 'Co-occurrence', 'Dependency']
    legend = ['local', 'global']
    d = [[0.6693, 0.6375],
         [0.6880, 0.6875],
         [0.6827, 0.6763]]
    ys = {.4: '0.0', .45: '...', .5: '0.5', .6: '0.6', .7: '0.7', .8: '0.8'}
    fig = plotter.tikz_barchart(d, labels, scale=3.5, yscale=3, color='black',
                                legend=legend, legend_sep=0.6, low_cut=0.4, y_tics=ys, tick=False)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/comp_classification.tex', mode='w')

def process_delete(isbn, title):
    # step 1. put all books from csv into a list
    all_books = read_books()
    # step 2. find the book to be deleted
    book_to_delete = find_book_generic(isbn)
    # step 3. find the item to be deleted in the all_books list & delete it
    for index in range(len(all_books)):
        if book_to_delete['isbn'] == all_books[index]['isbn']:
            del all_books[index]
            break
    # step 4. write back the list to the file
    write_to_file(all_books)
    return redirect(url_for('read_book'))

def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print ' already present, skipping:', metric
            continue
        else:
            print ' calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file('MemoryError while calculating ' + metric + ' on ' + corpus + ':\n' + str(e) + '\n\n',
                               'output/log/errors')

def run_base_generation(candidates, bases):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        noise_swaps = int(candidates / 4)
        directory = "cluster_" + str(bases)
        preference_order = model.Preference([x for x in range(0, candidates)])
        random_order = preference_order.generate_random_preference_order()
        profile = data.replicate_preference_order(random_order, int(voters / bases))
        for base in range(bases - 1):
            random_order = preference_order.generate_random_preference_order()
            profile = profile + data.replicate_preference_order(random_order, int(voters / bases))
        data.apply_noise(profile, noise_swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, noise_swaps) + ".txt"),
            profile, cost_vector)

def run_similar_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        bases = 2
        similar_swaps = int(candidates / 2)
        noise_swaps = int(candidates / 4)
        directory = "cluster_2_similar"
        preference_order = model.Preference([x for x in range(0, candidates)])
        first_order = preference_order.generate_random_preference_order()
        profile = get_similar_profile(bases, first_order, similar_swaps, noise_swaps, voters)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, noise_swaps) + ".txt"),
            profile, cost_vector)

def retrieval_comparison_freq(dataset='mir'):
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions_preprocessed'
    docs, _ = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_docs, _ = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_docs, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Evaluating..'
    results = {}
    for metric in freq_representation.get_metrics():
        print ' ', metric,
        descriptions_rep = freq_representation.text_to_vector(docs, metric)
        score = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
        results[metric] = score
        print score

    pp.pprint(results)
    s = 'retrieval comparison \nrepresentation: frequency\ndataset:' + dataset + ' \nresult:\n' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results

def run_polar_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        bases = 2
        noise_swaps = int(candidates / 4)
        directory = "cluster_2_polar"
        preference_order = model.Preference([x for x in range(0, candidates)])
        first_order = preference_order.generate_random_preference_order()
        second_order = first_order.reverse_preference_order()
        first_profile = data.replicate_preference_order(first_order, int(voters / bases))
        second_profile = data.replicate_preference_order(second_order, int(voters / bases))
        profile = first_profile + second_profile
        data.apply_noise(profile, noise_swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, noise_swaps) + ".txt"),
            profile, cost_vector)

def run_random_generation(candidates):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        bases = voters
        swaps = 0
        directory = "random"
        preference_order = model.Preference([x for x in range(0, candidates)])
        list_of_preferences = []
        for x in range(voters):
            list_of_preferences.append(preference_order.generate_random_preference_order())
        profile = model.Profile(number_of_candidates=candidates, number_of_voters=voters,
                                preference_list=list_of_preferences)
        data.apply_noise(profile, swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, bases, swaps) + ".txt"),
            profile, cost_vector)

def plot_exp1():
    """
    Plotting the results of the weight evaluation experiment.
    """
    legend = ['unweighted', 'weighted']
    labels = ['Degree', 'Closeness', 'Current-flow closeness', 'Betweenness', 'Current-flow betweenness',
              'Load', 'Eigenvector', 'PageRank', 'HITS authorities', 'HITS hubs']

    # classification
    d = [[0.52500000000000002, 0.49444444444444446],   # Degree
         [0.57499999999999996, 0.57499999999999996],   # Closeness
         [0.56944444444444442, 0.58333333333333337],   # Current-flow closeness
         [0.36388888888888887, 0.36944444444444446],   # Betweenness
         [0.23333333333333334, 0.20833333333333334],   # Current-flow betweenness
         [0.35555555555555557, 0.36666666666666664],   # Load
         [0.49722222222222223, 0.45555555555555555],   # Eigenvector
         [0.52777777777777779, 0.51111111111111107],   # PageRank
         [0.49722222222222223, 0.45555555555555555],   # HITS authorities
         [0.49722222222222223, 0.45555555555555555]]   # HITS hubs
    ys = {0: '0.0', .1: '0.1', .2: '0.2', .3: '0.3', .4: '0.4', .5: '0.5', .6: '0.6'}
    fig = plotter.tikz_barchart(d, labels, scale=3.5, yscale=2.8, color='black',
                                legend=legend, legend_sep=1.0, tick=False, y_tics=ys)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/dependency_eval_class.tex', mode='w')

    # retrieval
    d = [[0.18149811054435275, 0.18821229318222113],     # Degree
         [0.17184314735361236, 0.18216618328598347],     # Closeness
         [0.14606637651984622, 0.13586098100141117],     # Betweenness
         [0.17399729543537901, 0.17613717518129621],     # Current-flow closeness
         [0.042019078720146409, 0.042019078720146409],   # Current-flow betweenness
         [0.14700372822743263, 0.15104493506838745],     # Load
         [0.19854658693196564, 0.17540014008712554],     # Eigenvector
         [0.17725358882165362, 0.17252331100724849],     # PageRank
         [0.19854658693196564, 0.17540014008712554],     # HITS authorities
         [0.19854658693196564, 0.17540014008712554]]     # HITS hubs
    ys = {0: '0.0', .05: '0.05', .1: '0.1', .15: '0.15', .2: '0.2'}
    fig = plotter.tikz_barchart(d, labels, scale=3.5, yscale=8, color='black',
                                legend=legend, legend_sep=1.0, tick=False, grid_step=0.05, y_tics=ys)
    data.write_to_file(fig, '../../masteroppgave/report/imgs/tikz/dependency_eval_retr.tex', mode='w')

def run_fraction_generation_1(candidates, fractions):
    for i in range(100):
        print(str(i))
        cost_vector = create_cost_distribution(candidates, 0)
        voters = 5000
        noise_swaps = int(candidates / 4)
        population = [int(voters * fraction) for fraction in fractions]
        fraction_string = "_".join([str(fraction) for fraction in fractions])
        directory = "fraction_" + fraction_string
        preference_order = model.Preference([x for x in range(0, candidates)])
        random_order = preference_order.generate_random_preference_order()
        profile = data.replicate_preference_order(random_order, population[0])
        for fraction in population[1:]:
            random_order = preference_order.generate_random_preference_order()
            new_profile = data.replicate_preference_order(random_order, fraction)
            profile = profile + new_profile
        data.apply_noise(profile, noise_swaps, 1)
        if not os.path.exists(directory):
            os.makedirs(directory)
        data.write_to_file(
            os.path.join(directory, "{}_v{}:c{}:b{}:s{}".format(i, voters, candidates, 5, noise_swaps) + ".txt"),
            profile, cost_vector)

def plot_centrality_evaluations():
    import data
    labels = ['~~~~~Degree', 'Closeness', 'Current-flow closeness', 'Betweenness', 'Current-flow betweenness',
              'Load', 'Eigenvector', 'PageRank', 'HITS Authorities', 'HITS Hubs']

    d = [[0.5694444444444444, 0.5333333333333333],    # [0.5555555555555556,0.5333333333333333],
         [0.525, 0.5166666666666667],
         [0.5194444444444445, 0.5111111111111111],
         [0.4361111111111111, 0.43333333333333335],
         [0.42777777777777776, 0.4187],               # [0.42777777777777776,0.05],
         [0.4361111111111111, 0.4222222222222222],
         [0.5183333333333333, 0.5055555555555555],
         [0.5573333333333333, 0.5433333333333333],
         [0.5083333333333333, 0.5083333333333333],
         [0.5083333333333333, 0.5083333333333333]]
    ys = {.3: '0.0', .35: '...', .4: '0.4', .5: '0.5', .6: '0.6', .7: '0.7', .8: '0.8'}
    fig = plotter.tikz_barchart(d, None, scale=3.5, yscale=3, color='black',
                                legend=['TC', 'TC-ICC'], legend_sep=0.6, low_cut=0.3, y_tics=ys, tick=False)
    data.write_to_file(fig, '../../masteroppgave/paper/parts/tikz_bar_co-occurrence.tex', mode='w')

    d = [[0.52500000000000002, 0.5028],
         [0.58894242452424244, 0.5056],               # [0.57499999999999996,0.5056],
         [0.56944444444444442, 0.5028],
         [0.36388888888888887, 0.3806],
         [0.23333333333333334, 0.2263],               # [0.23333333333333334,0.05],
         [0.35555555555555557, 0.3778],
         [0.49722222222222223, 0.4667],
         [0.52777777777777779, 0.4833],
         [0.49722222222222223, 0.4611],
         [0.49722222222222223, 0.4611]]
    ys = {.0: '0.0', .1: '', .2: '0.2', .3: '', .4: '0.4', .5: '', .6: '0.6', .7: '', .8: '0.8'}
    fig = plotter.tikz_barchart(d, None, scale=3.5, yscale=1.6, color='black', y_tics=ys, tick=False)
    data.write_to_file(fig, '../../masteroppgave/paper/parts/tikz_bar_dependency.tex', mode='w')

    fig = plotter.tikz_barchart(d, labels, scale=3.5, color='black', labels_only=True)
    data.write_to_file(fig, '../../masteroppgave/paper/parts/tikz_bar_labels.tex', mode='w')

def main():
    from argparse import RawTextHelpFormatter
    parser = argparse.ArgumentParser(description='Computes the winner of given profile',
                                     formatter_class=RawTextHelpFormatter)
    parser.add_argument('--preferences',
                        help='A filepath either containing preferences or it does not exist\n'
                             'If the param "write" is used then the generated preferences will be outputted there.')
    parser.add_argument('--write', action='store_true',
                        help="Set if the generated profile should be saved to a file")
    parser.add_argument('--generate', action='store_true', help="Run generate-profiles code block")
    parser.add_argument('--read', action='store_true', help="Run read-profiles code block")
    parser.add_argument('--cost', type=int, default=1,
                        help='The cost distribution to use over candidates\n'
                             '0 = Normal distribution with mean=100, and std=15\n')
    parser.add_argument('--rule', type=int, default=0,
                        help='The rule to decide the winner\n'
                             '0 = budget-plurality\n'
                             '1 = budget-borda\n'
                             '2 = copeland\n'
                             '3 = knapsack\n'
                             '4 = theta rule\n')
    parser.add_argument('--axiom', type=int, default=0,
                        help='The axiom to check the rule against\n'
                             '0 = Unanimity\n'
                             '1 = Committee Monotonicity\n'
                             '2 = Theta Minority\n'
                             '3 = Regret\n'
                             '4 = Copeland Axiom\n'
                             '5 = Gini Coefficient\n')
    parser.add_argument('--budget', type=int, default=10, help='The total budget to be used')
    parser.add_argument('--voters', type=int, default=10, help='The number of voters')
    parser.add_argument('--candidates', type=int, default=10, help='The number of candidates')
    parser.add_argument('--base', type=int, default=3, help='The number of base preference orders')
    parser.add_argument('--swaps', type=int, default=1, help='The number of swaps to do for each preference order')
    parser.add_argument('--noise', type=int, default=2, help='The noise parameter')
    # group = parser.add_mutually_exclusive_group()
    # group.add_argument('-v', '--verbose', action='store_true')
    # group.add_argument('-q', '--quiet', action='store_true')
    args = parser.parse_args()

    if args.generate:
        run_fraction_generation_1(10, [0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
        run_fraction_generation_1(20, [0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
        return
    if args.read:
        read_data_set("fraction_0.8_0.05_0.05_0.05_0.05", 10)
        read_data_set("fraction_0.8_0.05_0.05_0.05_0.05", 20)
        return

    cost_vector = create_cost_distribution(args.candidates, 0)
    if not args.write:
        profile = data.read_from_file(args.preferences)
        rule = initialize_rule(args.rule)
        winner_set = rule.get_winners(profile, args.budget, cost_vector)
        axiom = initialize_axiom(args.axiom)
        satisfied = axiom.is_satisfied(rule, profile, args.budget, cost_vector)
        if satisfied:
            print(rule.name + " satisfies " + axiom.name)
            if axiom.has_value():
                print("value: " + str(axiom.get_value()))
        else:
            print(rule.name + " does not satisfy " + axiom.name)
        # total_cost = 0
        # for winner in winner_set:
        #     print(str(winner) + " " + str(cost_vector[winner]))
        #     total_cost += cost_vector[winner]
        # print(" ".join([str(total_cost), str(args.budget)]))
    else:
        profile = data.create_noisy_data(args.voters, args.candidates, args.base, args.swaps, args.noise)
        data.write_to_file(args.preferences, profile, cost_vector)

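# Example invocations of the CLI defined in main() above, as a hedged sketch. The script
# filename 'main.py' and the file 'prefs.txt' are assumptions; the flags themselves
# (--preferences, --write, --rule, --axiom, --budget, --voters, --candidates, --base,
# --swaps, --noise) come from the argparse definitions above.
#
#   # generate a noisy profile and write it (plus the cost vector) to prefs.txt
#   python main.py --preferences prefs.txt --write --voters 50 --candidates 10 --base 3 --swaps 1 --noise 2
#
#   # read prefs.txt back, run budget-borda (rule 1) and check Unanimity (axiom 0)
#   python main.py --preferences prefs.txt --rule 1 --axiom 0 --budget 10
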
def classification_comparison_graph(dataset='reuters', graph_type='co-occurrence', icc=None):
    """
    Experiment used for comparative evaluation of different network
    representations on classification.

    graph_type = 'co-occurrence' | 'dependency'

    `icc` determines whether to use _inverse corpus centrality_ in the vector representations.
    """
    import co_occurrence_experiments
    import dependency_experiments

    def make_dicts(docs, icc):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0:
                print ' graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {'co-occurrence': graph_representation.construct_cooccurrence_network,
             'dependency': graph_representation.construct_dependency_network}
    metrics = {'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
               'dependency': graph.GraphMetrics.CLOSENESS}

    print '--', graph_type

    print '> Reading data..', dataset
    training_path = '../data/' + dataset + '/training' + postfix[graph_type]
    training_docs, training_labels = data.read_files(training_path)
    test_path = '../data/' + dataset + '/test' + postfix[graph_type]
    test_docs, test_labels = data.read_files(test_path)

    icc_training = None
    icc_test = None
    if icc:
        print '> Calculating ICC..'
        if graph_type == 'co-occurrence':
            icc_training = co_occurrence_experiments.retrieve_centralities(dataset + '/training', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_training = dependency_experiments.retrieve_centralities(dataset + '/training', metrics[graph_type])
        if graph_type == 'co-occurrence':
            icc_test = co_occurrence_experiments.retrieve_centralities(dataset + '/test', 'sentence', metrics[graph_type])
        elif graph_type == 'dependency':
            icc_test = dependency_experiments.retrieve_centralities(dataset + '/test', metrics[graph_type])

    print '> Creating representations..'
    training_dicts = make_dicts(training_docs, icc_training)
    test_dicts = make_dicts(test_docs, icc_test)

    print ' dicts -> vectors'
    keys = set()
    for d in training_dicts + test_dicts:
        keys = keys.union(d.keys())
    keys = list(keys)
    print ' vocabulary size:', len(keys)
    training_rep = graph_representation.dicts_to_vectors(training_dicts, keys)
    test_rep = graph_representation.dicts_to_vectors(test_dicts, keys)

    print '> Evaluating..'
    reps = {'training': training_rep, 'test': test_rep}
    labels = {'training': training_labels, 'test': test_labels}
    results = evaluation.evaluate_classification(reps, labels, mode='split')
    print results

    s = 'classification comparison '
    if icc:
        s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/classification')
    return results

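# A minimal sketch of driving the classification comparison above for both network types,
# with and without TC-ICC. It assumes the centralities consumed by the
# retrieve_centralities() calls have already been computed (e.g. via store_centralities());
# the loop and the result keys are illustrative, not part of the original experiments.
def run_classification_comparison_sketch(dataset='reuters'):
    all_results = {}
    for graph_type in ('co-occurrence', 'dependency'):
        # plain TC vectors
        all_results[(graph_type, 'tc')] = classification_comparison_graph(dataset, graph_type, icc=None)
        # TC-ICC vectors (any truthy value for `icc` triggers the ICC lookup)
        all_results[(graph_type, 'tc-icc')] = classification_comparison_graph(dataset, graph_type, icc=True)
    return all_results
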
def test_write(self):
    import os
    file_name = "test_out.txt"
    data.write_to_file(file_name, profile)
    os.remove(file_name)
