Esempio n. 1
0
def _read_column_map(path, target_col, value_col):
    """Map column ``target_col`` to column ``value_col`` for each line of ``path``.

    Lines are stripped and split on tabs; both column arguments are used
    directly as zero-based list indices.  When a target occurs more than once
    the last occurrence wins.  Values are returned as unparsed strings.
    """
    mapping = {}
    with codecs.open(path) as f_in:
        for line in f_in:
            # Split once per line instead of once per extracted column.
            fields = line.strip().split('\t')
            mapping[fields[target_col]] = fields[value_col]
    return mapping


def main():
    """
    Calculate spearman correlation coefficient of two lists.

    Parses the command line with docopt, reads a target -> value mapping from
    each of the two tab-separated files, keeps the targets present in both
    files whose values are not one of the ``non_values`` sentinels, and prints
    the correlation coefficient and p-value for the two value lists.
    """

    # Get the arguments
    args = docopt("""Calculate spearman correlation coefficient of two lists.

    Usage:
        rank_correlation.py <file1> <file2> <target_col> <value_col>

        <file1> = file1 with list of targets + values
        <file2> = file2 with list of targets + values
        <target_col> = column number of targets
        <value_col> = column number of values

    """)

    file1 = args['<file1>']
    file2 = args['<file2>']
    target_col = int(args['<target_col>'])
    value_col = int(args['<value_col>'])
    # Sentinel values that mark a target as having no usable measurement.
    non_values = [-999.0, -888.0]

    file1_dict = _read_column_map(file1, target_col, value_col)
    file2_dict = _read_column_map(file2, target_col, value_col)

    file1_entropies = []
    file2_entropies = []
    for target in file1_dict:
        if target not in file2_dict:
            continue
        # Every entry of non_values disqualifies the pair, not just the
        # first one; a sentinel on either side drops the target.
        value1 = float(file1_dict[target])
        if value1 in non_values:
            continue
        value2 = float(file2_dict[target])
        if value2 in non_values:
            continue
        file1_entropies.append(value1)
        file2_entropies.append(value2)

    # NOTE(review): score_mod is not defined in this block; it replaces
    # scoring_utils.score module-wide and, given the unpacking below, must
    # return a (coefficient, p-value) pair -- confirm against its definition.
    scoring_utils.score = score_mod
    score, p = scoring_utils.score(file1_entropies, file2_entropies,
                                   "spearman")

    # Single parenthesised argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print('The spearman correlation coefficient between\n %s \nand \n %s \nis %.5f (p-value: %.5f)' % (
        file1, file2, score, p))
def evaluate_sim(in_file, columns, corr_measures):
    """Print correlation scores between two numeric columns of a text file.

    Args:
        in_file: path of a whitespace-separated file; blank lines are skipped.
        columns: two 1-based column numbers; the first selects the gold
            values and the second the predicted values.
        corr_measures: iterable of measure names, each handed to
            ``scoring_utils.score`` as its third argument.

    Raises:
        ValueError: if ``columns`` does not hold exactly two entries, or a
            selected field cannot be parsed as a float.
        IndexError: if a non-blank line has fewer fields than requested.
    """
    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    # Column numbers are 1-based on input; convert to list indices.
    gold_idx = int(columns[0]) - 1
    pred_idx = int(columns[1]) - 1

    gold = []
    prediction = []
    with open(in_file) as in_stream:
        for line in in_stream:
            # split() without arguments ignores surrounding whitespace and
            # yields an empty list for blank lines.
            elems = line.split()
            if not elems:
                continue
            gold.append(float(elems[gold_idx]))
            prediction.append(float(elems[pred_idx]))

    for corr_measure in corr_measures:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("CORRELATION:%s" % corr_measure)
        corr = scoring_utils.score(gold, prediction, corr_measure)
        print("\t%f" % corr)
Esempio n. 3
0
def evaluate_sim(in_file, columns, corr_measures):
    """Report how well one column of ``in_file`` correlates with another.

    ``columns`` must contain exactly two 1-based column numbers: the gold
    column followed by the prediction column.  Every non-blank line of
    ``in_file`` is split on whitespace and the two selected fields are parsed
    as floats.  For each name in ``corr_measures`` the result of
    ``scoring_utils.score(gold, prediction, name)`` is printed.

    Raises:
        ValueError: when ``columns`` does not have length two, or a selected
            field is not a valid float.
        IndexError: when a non-blank line is shorter than a selected column.
    """
    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    # Translate the 1-based column numbers into list indices.
    col0, col1 = (int(column) - 1 for column in columns)

    gold = []
    prediction = []
    with open(in_file) as in_stream:
        for raw_line in in_stream:
            stripped = raw_line.strip()
            if stripped == "":
                continue
            elems = stripped.split()
            gold.append(float(elems[col0]))
            prediction.append(float(elems[col1]))

    for corr_measure in corr_measures:
        # Parenthesised single argument keeps the output identical on
        # Python 2 while also being valid Python 3.
        print("CORRELATION:%s" % corr_measure)
        corr = scoring_utils.score(gold, prediction, corr_measure)
        print("\t%f" % corr)
Esempio n. 4
0
# Ingredient names, one stripped line each, read from the ingredient list.
ingredients = []

with open("../vector_processing/ingredients_in_wordnet") as f:
    # NOTE(review): limit() is defined elsewhere; presumably it caps the
    # number of lines consumed at int(sys.argv[1]) -- confirm.
    for line in limit(f,int(sys.argv[1])):
        ingredients.append(line.strip())

# Score every unordered pair of ingredients with each similarity measure.
# The *_scores lists are expected to exist already; they are only appended to
# here, so they stay index-aligned with each other.
for (a,b) in combinations(ingredients,2):
    a_,b_=getss(a), getss(b)
    wn_scores.append(wn_sim(a_,b_))
    res_scores.append(res_sim(a_,b_))
    jcn_scores.append(jcn_sim(a_,b_))
    lin_scores.append(lin_sim(a_,b_))
    wup_scores.append(wup_sim(a_,b_))
    lch_scores.append(lch_sim(a_,b_))
    # The vector-space measure takes the raw names, not the getss() results.
    vs_scores.append(vs_sim(a,b,gastrovec))

# Correlate each measure's scores with the vector-space scores.
print("Path distance:")
print(scoring_utils.score(wn_scores,vs_scores,"spearman"))
print("JCN distance:")
print(scoring_utils.score(jcn_scores,vs_scores,"spearman"))
print("LIN distance:")
# Score the LIN list here; reusing jcn_scores would repeat the JCN result.
print(scoring_utils.score(lin_scores,vs_scores,"spearman"))
print("RES distance:")
print(scoring_utils.score(res_scores,vs_scores,"spearman"))
print("WUP distance:")
print(scoring_utils.score(wup_scores,vs_scores,"spearman"))
print("LCH distance:")
print(scoring_utils.score(lch_scores,vs_scores,"spearman"))
Esempio n. 5
0
# Every print below takes a single parenthesised argument so the output is
# the same under the Python 2 print statement and the Python 3 print function.

# Compose the test phrases with the composition model trained earlier.
print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0, 1, 2])
composed_space = comp_model.compose(test_phrases, space)

# Fields 0 and 1 of the similarity file hold the pair to compare,
# field 2 the gold value the predictions are scored against.
print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0, 1])
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())

# use this composed space to assign similarities
print("Scoring lexical function...")
print(scoring_utils.score(gold, pred, "spearman"))

# Repeat compose -> predict -> score with a Full Additive model.
print("Training Full Additive composition model...")
comp_model = FullAdditive(learner=RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print(scoring_utils.score(gold, pred, "spearman"))

# Repeat once more with a Weighted Additive model and show its weights.
print("Training Weighted Additive composition model...")
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
# %s applies str() to each value, matching what a multi-item print emits.
print("alpha, beta: %s %s" % (comp_model.alpha, comp_model.beta))
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print(scoring_utils.score(gold, pred, "spearman"))
Esempio n. 6
0
#ex20.py
#-------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

# Load the pickled semantic space.
my_space = io_utils.load("data/out/ex01.pkl")

# Cosine similarity for each word pair (fields 0 and 1 of the input file).
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

# Correlate the predictions with the gold values in field 2 of the same file.
# Single-argument print(...) gives identical output on Python 2 and Python 3.
gold = io_utils.read_list(fname, field=2)
print("Spearman")
print(scoring_utils.score(gold, predicted, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, predicted, "pearson"))
Esempio n. 7
0
# All prints use one parenthesised argument, which produces the same text
# with the Python 2 print statement and the Python 3 print function.

# Build a composed space for the test phrases with the current model.
print("Composing phrases...")
test_phrases_file = data_path + "ML08nvs_test.txt"
test_phrases = io_utils.read_tuple_list(test_phrases_file, fields=[0,1,2])
composed_space = comp_model.compose(test_phrases, space)

# Pairs to compare come from fields 0-1, the gold values from field 2.
print("Reading similarity test data...")
test_similarity_file = data_path + "ML08data_new.txt"
test_pairs = io_utils.read_tuple_list(test_similarity_file, fields=[0,1])
gold = io_utils.read_list(test_similarity_file, field=2)

print("Computing similarity with lexical function...")
pred = composed_space.get_sims(test_pairs, CosSimilarity())

# use this composed space to assign similarities
print("Scoring lexical function...")
print(scoring_utils.score(gold, pred, "spearman"))


# Same evaluation for a Full Additive model.
print("Training Full Additive composition model...")
comp_model = FullAdditive(learner = RidgeRegressionLearner(param=2))
comp_model.train(train_data, space, per_space)
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
print(scoring_utils.score(gold, pred, "spearman"))

# Same evaluation for a Weighted Additive model, reporting its weights.
print("Training Weighted Additive composition model...")
comp_model = WeightedAdditive()
comp_model.train(train_data, space, per_space)
# %s applies str() to each value, matching what a multi-item print emits.
print("alpha, beta: %s %s" % (comp_model.alpha, comp_model.beta))
composed_space = comp_model.compose(test_phrases, space)
pred = composed_space.get_sims(test_pairs, CosSimilarity())
# Score this model too; otherwise its predictions are computed and discarded.
print(scoring_utils.score(gold, pred, "spearman"))
Esempio n. 8
0
# ex20.py
# -------
from composes.utils import io_utils
from composes.utils import scoring_utils
from composes.similarity.cos import CosSimilarity

# Load the pickled semantic space from disk.
my_space = io_utils.load("data/out/ex01.pkl")

# Predict a cosine similarity for every word pair listed in the input file
# (the pair is taken from fields 0 and 1).
fname = "data/in/word_sims.txt"
word_pairs = io_utils.read_tuple_list(fname, fields=[0, 1])
predicted = my_space.get_sims(word_pairs, CosSimilarity())

# Compare the predictions with the gold values stored in field 2.
# print(...) with one argument behaves the same on Python 2 and Python 3.
gold = io_utils.read_list(fname, field=2)
print("Spearman")
print(scoring_utils.score(gold, predicted, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, predicted, "pearson"))
# print(composed_space.cooccurrence_matrix)

# Group cosine similarities by the first element of each training tuple.
# Element 1 is looked up in my_space and element 2 in composed_space.

cos_sim = {}
for pair in train_data:
    cos = my_space.get_sim(
        pair[1], pair[2], CosSimilarity(), space2=composed_space)
    # setdefault creates the list on first sight of a key, then appends.
    cos_sim.setdefault(pair[0], []).append(cos)

import numpy

# Write one "<key>\t<mean similarity>" line per key of cos_sim.  The with
# block closes the file even if numpy.mean or a write raises part-way through.
with open("./data/out/sim_test_word.txt", 'w') as f:
    for key, value in cos_sim.items():
        f.write(key + '\t' + str(numpy.mean(value)) + '\n')

# Evaluation of similarity scores: read the gold values from field 3 of the
# gold file, echo them, then print Spearman and Pearson scores.

fname = "./data/in/stem_affix/gold_affix.txt"
gold = io_utils.read_list(fname, field=3)
print(gold)
print("Spearman")
# NOTE(review): cos_sim is a dict (key -> list of cosines), whereas the other
# score() calls in this file pass a flat list of predictions.  Confirm that
# score() accepts a dict; otherwise pass the per-key means, ordered to match
# the rows of the gold file.
print(scoring_utils.score(gold, cos_sim, "spearman"))
print("Pearson")
print(scoring_utils.score(gold, cos_sim, "pearson"))