def main(): """ Calculate spearman correlation coefficient of two lists. """ # Get the arguments args = docopt("""Calculate spearman correlation coefficient of two lists. Usage: rank_correlation.py <file1> <file2> <target_col> <value_col> <file1> = file1 with list of targets + values <file2> = file2 with list of targets + values <target_col> = column number of targets <value_col> = column number of values """) file1 = args['<file1>'] file2 = args['<file2>'] target_col = int(args['<target_col>']) value_col = int(args['<value_col>']) non_values = [-999.0, -888.0] with codecs.open(file1) as f_in: file1_dict = dict([ tuple((line.strip().split('\t')[target_col], line.strip().split('\t')[value_col])) for line in f_in ]) with codecs.open(file2) as f_in: file2_dict = dict([ tuple((line.strip().split('\t')[target_col], line.strip().split('\t')[value_col])) for line in f_in ]) file1_entropies = [] file2_entropies = [] for target in file1_dict: if target in file2_dict: if not float(file1_dict[target]) == non_values[0] and not float( file2_dict[target]) == non_values[0]: file1_entropies.append(float(file1_dict[target])) file2_entropies.append(float(file2_dict[target])) scoring_utils.score = score_mod score, p = scoring_utils.score(file1_entropies, file2_entropies, "spearman") print 'The spearman correlation coefficient between\n %s \nand \n %s \nis %.5f (p-value: %.5f)' % ( file1, file2, score, p)
def evaluate_sim(in_file, columns, corr_measures): if not len(columns) == 2: raise ValueError("Column description unrecognized!") col0 = int(columns[0]) - 1 col1 = int(columns[1]) - 1 gold = [] prediction = [] with open(in_file) as in_stream: for line in in_stream: if not line.strip() == "": elems = line.strip().split() gold.append(float(elems[col0])) prediction.append(float(elems[col1])) for corr_measure in corr_measures: print "CORRELATION:%s" % corr_measure corr = scoring_utils.score(gold, prediction, corr_measure) print "\t%f" % corr
# Compare WordNet-based similarity measures against gastrovec vector
# similarities over all pairs of ingredients (limited to sys.argv[1] items).
# NOTE(review): the *_scores lists and the helper functions (limit, getss,
# *_sim, gastrovec) are presumed defined earlier in the file — confirm.
ingredients = []
with open("../vector_processing/ingredients_in_wordnet") as f:
    for line in limit(f, int(sys.argv[1])):
        ingredients.append(line.strip())

for (a, b) in combinations(ingredients, 2):
    a_, b_ = getss(a), getss(b)
    wn_scores.append(wn_sim(a_, b_))
    res_scores.append(res_sim(a_, b_))
    jcn_scores.append(jcn_sim(a_, b_))
    lin_scores.append(lin_sim(a_, b_))
    wup_scores.append(wup_sim(a_, b_))
    lch_scores.append(lch_sim(a_, b_))
    vs_scores.append(vs_sim(a, b, gastrovec))

print("Path distance:")
print(scoring_utils.score(wn_scores, vs_scores, "spearman"))
print("JCN distance:")
print(scoring_utils.score(jcn_scores, vs_scores, "spearman"))
print("LIN distance:")
# BUG FIX: the original passed jcn_scores here, so the LIN row merely
# repeated the JCN correlation; score lin_scores instead.
print(scoring_utils.score(lin_scores, vs_scores, "spearman"))
print("RES distance:")
print(scoring_utils.score(res_scores, vs_scores, "spearman"))
print("WUP distance:")
print(scoring_utils.score(wup_scores, vs_scores, "spearman"))
print("LCH distance:")
print(scoring_utils.score(lch_scores, vs_scores, "spearman"))
# Compose the held-out test phrases with the previously trained composition
# model (comp_model, space, per_space, train_data come from earlier in the
# file) and score each model against the ML08 gold-standard similarities.
print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

# Load the similarity test pairs and their gold scores (third column).
print "Reading similarity test data..."
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
pred = composed_space.get_sims(test_pairs, CosSimilarity())  #use this composed space to assign similarities

print "Scoring lexical function..."
print scoring_utils.score(gold, pred, "spearman")

# Retrain with a Full Additive model and score it on the same test pairs.
print "Training Full Additive composition model..."
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")

# Retrain with a Weighted Additive model; its alpha/beta weights are
# learned from the training data, so print them for inspection.
print "Training Weighted Additive composition model..."
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
print "alpha, beta:", comp_model.alpha, comp_model.beta
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")
#ex20.py #------- from composes.utils import io_utils from composes.utils import scoring_utils from composes.similarity.cos import CosSimilarity #read in a space my_space = io_utils.load("data/out/ex01.pkl") #compute similarities of a list of word pairs fname = "data/in/word_sims.txt" word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1]) predicted = my_space.get_sims(word_pairs, CosSimilarity()) #compute correlations gold = io_utils.read_list(fname, field=2) print "Spearman" print scoring_utils.score(gold, predicted, "spearman") print "Pearson" print scoring_utils.score(gold, predicted, "pearson")
# Compose the held-out test phrases with the previously trained composition
# model (comp_model, space, per_space, train_data come from earlier in the
# file) and evaluate against the ML08 gold-standard similarities.
print "Composing phrases..."
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2])
composed_space = comp_model.compose(test_phrases, space)

# Load the similarity test pairs and their gold scores (third column).
print "Reading similarity test data..."
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1])
gold = io_utils.read_list(test_similarity_file, field=2)

print "Computing similarity with lexical function..."
pred = composed_space.get_sims(test_pairs, CosSimilarity())  #use this composed space to assign similarities

print "Scoring lexical function..."
print scoring_utils.score(gold, pred, "spearman")

# Retrain with a Full Additive model and score it on the same test pairs.
print "Training Full Additive composition model..."
comp_model = FullAdditive(learner = RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print scoring_utils.score(gold, pred, "spearman")

# Retrain with a Weighted Additive model; its alpha/beta weights are
# learned from the training data. NOTE(review): unlike the earlier models,
# this chunk ends after computing pred without printing its score —
# possibly truncated here, or the score line lives past this chunk.
print "Training Weighted Additive composition model..."
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
print "alpha, beta:", comp_model.alpha, comp_model.beta
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
# ex20.py # ------- from composes.utils import io_utils from composes.utils import scoring_utils from composes.similarity.cos import CosSimilarity # read in a space my_space = io_utils.load("data/out/ex01.pkl") # compute similarities of a list of word pairs fname = "data/in/word_sims.txt" word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1]) predicted = my_space.get_sims(word_pairs, CosSimilarity()) # compute correlations gold = io_utils.read_list(fname, field=2) print "Spearman" print scoring_utils.score(gold, predicted, "spearman") print "Pearson" print scoring_utils.score(gold, predicted, "pearson")
# print(composed_space.cooccurrence_matrix)

# Collect the cosine similarity between each pair's two forms (pair[1] in
# my_space, pair[2] in composed_space), grouped under the pair's label
# (pair[0]). my_space / composed_space / train_data come from earlier in
# the file.
cos_sim = {}
for pair in train_data:
    cos = my_space.get_sim(pair[1], pair[2], CosSimilarity(),
                           space2=composed_space)
    # setdefault replaces the original if/else first-insert dance.
    cos_sim.setdefault(pair[0], []).append(cos)

import numpy

# Write the mean similarity per label. Use a context manager so the file
# is closed even if a write raises (the original leaked the handle then).
with open("./data/out/sim_test_word.txt", 'w') as f:
    for key, value in cos_sim.items():
        f.write(key + '\t' + str(numpy.mean(value)) + '\n')

#Evaluation of similarity scores
fname = "./data/in/stem_affix/gold_affix.txt"
gold = io_utils.read_list(fname, field=3)
print(gold)
print("Spearman")
# NOTE(review): cos_sim is a dict of label -> list of floats, yet every
# other call site in this codebase passes scoring_utils.score two flat
# numeric lists. Passing the dict here looks wrong (the per-label means
# written above were probably intended, in an order aligned with `gold`).
# Left unchanged pending confirmation of scoring_utils.score's contract.
print(scoring_utils.score(gold, cos_sim, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, cos_sim, "pearson"))