def evaluate(eval_data_dir, embeddings_filename): test_treebank_filename = '{}/{}'.format(eval_data_dir, get_test_treebank_filename()) train_treebank_filename = '{}/{}'.format(eval_data_dir, get_train_treebank_filename()) test_arcstd_filename = '{}/{}'.format(eval_data_dir, get_test_arcstd_filename()) train_arcstd_filename = '{}/{}'.format(eval_data_dir, get_train_arcstd_filename()) relevant_embeddings_filename = get_relevant_embeddings_filename( test_treebank_filename, train_treebank_filename, embeddings_filename) word_vecs = read_word_vectors(relevant_embeddings_filename) assert len(word_vecs) > 0 embeddings_dimensionality = len(word_vecs.itervalues().next()) coverage = compute_coverage(test_treebank_filename, word_vecs) if coverage == 0: print 'coverage = 0!!' return (0.0, 0.0) score = parsing_wrapper(train_arcstd_filename, test_arcstd_filename, relevant_embeddings_filename, embeddings_dimensionality) os.remove(relevant_embeddings_filename) return ( score, coverage, )
def all_word_sim(word_vec_file, word_sim_dir): word_vecs = read_word_vectors(word_vec_file) print '=================================================================================' print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' total_rho = 0 for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else: not_found += 1 total_size += 1 rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict)) total_rho += rho print "%6s" % str(i + 1), "%20s" % filename, "%15s" % str(total_size), print "%15s" % str(not_found), print "%15.4f" % rho print "Sum of scores: %15.4f" % total_rho
def evaluate(eval_data_dir, embeddings_filename):
    """Run the QVEC-CCA evaluation and return (score, coverage)."""
    gold_path = '{}/{}'.format(eval_data_dir, get_qvec_gold_filename())
    filtered_path = get_relevant_embeddings_filename(gold_path, embeddings_filename)
    vectors = read_word_vectors(filtered_path)
    # Coverage is how much of the gold vocabulary the embeddings contain.
    coverage = compute_coverage(gold_path, vectors)
    score = qvec_cca_wrapper(gold_path, filtered_path)
    os.remove(filtered_path)
    return (score, coverage)
def evaluate(eval_data_dir, embeddings_filename):
    """Word-translation evaluation: precision@1 over gold pairs, plus coverage."""
    gold_path = '{}/{}'.format(eval_data_dir, get_word_translation_gold_filename())
    word_types, word_pairs = get_relevant_word_types(gold_path)
    filtered_path = get_relevant_embeddings_filename(word_types, embeddings_filename)
    vectors = read_word_vectors(filtered_path)
    coverage = compute_coverage(gold_path, vectors)
    # k=1: a translation counts only when the top-ranked candidate is correct.
    score = compute_precision_at_k(word_pairs, vectors, 1)
    os.remove(filtered_path)
    return (score, coverage)
def evaluate(eval_data_dir, embeddings_filename):
    """QVEC evaluation; the raw score is normalized by embedding dimensionality."""
    gold_path = '{}/{}'.format(eval_data_dir, get_qvec_gold_filename())
    filtered_path = get_relevant_embeddings_filename(gold_path, embeddings_filename)
    vectors = read_word_vectors(filtered_path)
    coverage = compute_coverage(gold_path, vectors)
    raw_score = qvec_wrapper(gold_path, filtered_path)
    os.remove(filtered_path)
    # Dimensionality read from an arbitrary vector (assumed uniform).
    dim = len(vectors[next(iter(vectors))])
    return (raw_score / dim, coverage)
def evaluate(eval_data_dir, embeddings_filename):
    """Run QVEC against the gold file; return the dimension-normalized score and coverage."""
    gold_file = "{}/{}".format(eval_data_dir, get_qvec_gold_filename())
    embeddings_subset = get_relevant_embeddings_filename(gold_file, embeddings_filename)
    vecs = read_word_vectors(embeddings_subset)
    cov = compute_coverage(gold_file, vecs)
    qvec_score = qvec_wrapper(gold_file, embeddings_subset)
    os.remove(embeddings_subset)
    # Pick any word to read off the embedding dimensionality (assumed uniform).
    any_word = next(iter(vecs))
    n_dims = len(vecs[any_word])
    return (qvec_score / n_dims, cov)
def evaluate(eval_data_dir, embeddings_filename): eval_data_filename = '{}/{}'.format(eval_data_dir, get_wordsim_gold_filename()) relevant_embeddings_filename = get_relevant_embeddings_filename(eval_data_filename, embeddings_filename) word_vecs = read_word_vectors(relevant_embeddings_filename) manual_dict, auto_dict, coverage = compute_similarities_and_coverage(eval_data_filename, word_vecs) print 'size of manual/auto dicts: ', len(manual_dict), len(auto_dict) if coverage == 0 or min(len(manual_dict), len(auto_dict)) < 2: return (0.0, 0.0) ranked_manual_dict, ranked_auto_dict = assign_ranks(manual_dict), assign_ranks(auto_dict) score = spearmans_rho(ranked_manual_dict, ranked_auto_dict) os.remove(relevant_embeddings_filename) return (score, coverage,)
def evaluate(eval_data_dir, embeddings_filename): test_treebank_filename = '{}/{}'.format(eval_data_dir, get_test_treebank_filename()) train_treebank_filename = '{}/{}'.format(eval_data_dir, get_train_treebank_filename()) test_arcstd_filename = '{}/{}'.format(eval_data_dir, get_test_arcstd_filename()) train_arcstd_filename = '{}/{}'.format(eval_data_dir, get_train_arcstd_filename()) relevant_embeddings_filename = get_relevant_embeddings_filename(test_treebank_filename, train_treebank_filename, embeddings_filename) word_vecs = read_word_vectors(relevant_embeddings_filename) assert len(word_vecs) > 0; embeddings_dimensionality = len(word_vecs.itervalues().next()) coverage = compute_coverage(test_treebank_filename, word_vecs) if coverage == 0: print 'coverage = 0!!' return (0.0, 0.0) score = parsing_wrapper(train_arcstd_filename, test_arcstd_filename, relevant_embeddings_filename, embeddings_dimensionality) os.remove(relevant_embeddings_filename) return (score, coverage,)
def evaluate(eval_data_dir, embeddings_filename):
    """Precision@1 on the word-translation gold pairs, plus vocabulary coverage."""
    gold_filename = '{}/{}'.format(eval_data_dir, get_word_translation_gold_filename())
    types, pairs = get_relevant_word_types(gold_filename)
    subset_filename = get_relevant_embeddings_filename(types, embeddings_filename)
    vecs = read_word_vectors(subset_filename)
    cov = compute_coverage(gold_filename, vecs)
    p_at_1 = compute_precision_at_k(pairs, vecs, 1)
    os.remove(subset_filename)
    return (p_at_1, cov)
def evaluate(eval_data_dir, embeddings_filename): eval_data_filename = '{}/{}'.format(eval_data_dir, get_wordsim_gold_filename()) relevant_embeddings_filename = get_relevant_embeddings_filename( eval_data_filename, embeddings_filename) word_vecs = read_word_vectors(relevant_embeddings_filename) manual_dict, auto_dict, coverage = compute_similarities_and_coverage( eval_data_filename, word_vecs) print 'size of manual/auto dicts: ', len(manual_dict), len(auto_dict) if coverage == 0 or min(len(manual_dict), len(auto_dict)) < 2: return (0.0, 0.0) ranked_manual_dict, ranked_auto_dict = assign_ranks( manual_dict), assign_ranks(auto_dict) score = spearmans_rho(ranked_manual_dict, ranked_auto_dict) os.remove(relevant_embeddings_filename) return ( score, coverage, )
def word_sim(word_vec_file, word_sim_file): word_vecs = read_word_vectors(word_vec_file) print '=================================================================================' print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(word_sim_file, 'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else: not_found += 1 total_size += 1 print "%15s" % str(total_size), "%15s" % str(not_found), print "%15.4f" % spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
import sys import os from read_write import read_word_vectors from ranking import * if __name__ == '__main__': word_vec_file = sys.argv[1] word_sim_dir = sys.argv[2] try: top_vocab = int(float(sys.argv[3])) #accepts answers in 1eX notation. if top_vocab < 0: top_vocab = 1e6 except IndexError: top_vocab = 1e6 word_vecs = read_word_vectors(word_vec_file, int(top_vocab)) print '=================================================================================' print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else:
# -*- coding:utf-8 -*- import sys import os from read_write import read_word_vectors from ranking import * if __name__ == '__main__': word_vec_file = sys.argv[1] word_sim_dir = sys.argv[2] word_vecs = read_word_vectors(word_vec_file, False) print( '=================================================================================' ) print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho") print( '=================================================================================' ) for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else:
from read_write import read_word_vectors, read_word_vectors_orig
from ent_eval import *
import os
import sys
import time
import argparse

# Default monolingual lexical-entailment test set (baroni2012 TSV).
default_data = "../xling-entailment/data/monoling_entailment/baroni2012/data_lex_test.tsv"

if __name__ == "__main__":
    # Entailment-evaluation driver: loads the embedding model given by --m and
    # scores it against the dataset given by --d using ComplexScorer.
    parser = argparse.ArgumentParser(description='Short sample app')
    parser.add_argument('--m', action="store", dest="model", required=True)
    parser.add_argument('--d', action="store", dest="datapath", default=default_data)
    opts = parser.parse_args(sys.argv[1:])
    scorer = ComplexScorer(datapath=opts.datapath)
    start = time.time()
    # Unpacks two dicts -- presumably the real and imaginary parts of
    # complex-valued embeddings; TODO confirm against read_write.
    re_vecs, im_vecs = read_word_vectors(opts.model)
    # re_vecs,im_vecs=read_word_vectors_orig(opts.model+".real"),read_word_vectors_orig(opts.model+".imag")
    end = time.time()
    print "elapsed in loading", end - start
    missed, scores = scorer.compute_scores([re_vecs, im_vecs])
    print "missed", missed
    scorer.get_best_perf(scores)
for line in f: line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs1 and word2 in word_vecs1 and word1 in word_vecs2 and word2 in word_vecs2: auto_dict1[(word1, word2)] = cosine_sim(word_vecs1[word1], word_vecs1[word2]) auto_dict2[(word1, word2)] = cosine_sim(word_vecs2[word1], word_vecs2[word2]) else: not_found += 1 total_size += 1 return auto_dict1,auto_dict2,not_found,total_size if __name__=="__main__": word_vec1_file = sys.argv[1] word_vec2_file = sys.argv[2] word_sim_file = sys.argv[3] word_vecs1 = read_word_vectors(word_vec1_file) word_vecs2 = read_word_vectors(word_vec2_file) print '=================================================================================' print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' manual_dict,auto1_dict,not_found,total_size = compute_vs_gold(open(word_sim_file,'r'),word_vecs1) A=spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto1_dict)) print "%15s" % str(total_size), "%15s" % str(not_found), print "%15.4f" % A manual_dict,auto2_dict,not_found,total_size = compute_vs_gold(open(word_sim_file,'r'),word_vecs2) B=spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto2_dict)) print "%15s" % str(total_size), "%15s" % str(not_found), print "%15.4f" % B
format='png') if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--e', required=True, help="embedding file") parser.add_argument('--s', required=True, help="synonym lexicon file") parser.add_argument('--a', required=True, help="antonym lexicon file") parser.add_argument( '--d', required=False, help="Distance metric. Choose between euclidean and cosine") args = parser.parse_args(sys.argv[1:]) name = args.e wordVecs = read_write.read_word_vectors(args.e) synonyms = read_write.read_lexicon(args.s) antonyms = read_write.read_lexicon(args.a) print("now calculating distances") if (args.d is not None): syn_mean_dist, syn_norm, syns = calculate_lex_distance( synonyms, wordVecs, args.d) ant_mean_dist, ant_norm, ants = calculate_lex_distance( antonyms, wordVecs, args.d) else: syn_mean_dist, syn_norm, syns = calculate_lex_distance( synonyms, wordVecs) ant_mean_dist, ant_norm, ants = calculate_lex_distance( antonyms, wordVecs) print("Calculated distances...")
import sys from read_write import read_word_vectors from findMatch import cosine_sim from ranking import spearmans_rho from ranking import assign_ranks if __name__=='__main__': wordVectorFile = sys.argv[1] wordVectors = read_word_vectors(wordVectorFile) print '=================================================================================' print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' DIR = '/usr1/corpora/usr0-corpora/word-sim/' FILES = ['EN-MC-30.txt', 'EN-MTurk-287.txt', 'EN-RG-65.txt', 'EN-RW-STANFORD.txt', 'EN-WS-353-ALL.txt', 'EN-WS-353-REL.txt', 'EN-WS-353-SIM.txt', 'EN-MEN-TR-3k.txt', 'EN-YP-130.txt', 'EN-MTurk-771.txt'] for i, FILE in enumerate(FILES): manualDict, autoDict = ({}, {}) notFound, totalSize = (0, 0) for line in open(DIR+FILE,'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in wordVectors and word2 in wordVectors: manualDict[(word1, word2)] = float(val) autoDict[(word1, word2)] = cosine_sim(wordVectors[word1], wordVectors[word2]) else: notFound += 1 totalSize += 1 print "%6s" % str(i+1), "%20s" % FILE, "%15s" % str(totalSize), print "%15s" % str(notFound), print "%15.4f" % spearmans_rho(assign_ranks(manualDict), assign_ranks(autoDict))
import sys import os from read_write import read_word_vectors from ranking import * if __name__=='__main__': word_vec_file = sys.argv[1] word_sim_dir = sys.argv[2] word_vecs = read_word_vectors(word_vec_file) print '=================================================================================' print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' total_rho = 0 for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename),'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else: not_found += 1 total_size += 1 rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict)) total_rho += rho print "%6s" % str(i+1), "%20s" % filename, "%15s" % str(total_size),
word_vec_file = sys.argv[1] word_sim_dir = sys.argv[2] dimension = int(sys.argv[3]) oov = int(sys.argv[3]) print(sys.argv) v = [] for i, filename in enumerate(os.listdir(word_sim_dir)): for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split() v.append(word1) v.append(word2) word_vecs = read_word_vectors(word_vec_file, v, dimension, oov) print( '=================================================================================' ) print("%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho") print( '=================================================================================' ) for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split()
import sys import os from read_write import read_word_vectors from ranking import * if __name__ == '__main__': word_vec_file = sys.argv[1] word_sim_dir = sys.argv[2] word_vecs = read_word_vectors(word_vec_file) print '=================================================================================' print "%6s" % "Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" print '=================================================================================' for i, filename in enumerate(os.listdir(word_sim_dir)): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) for line in open(os.path.join(word_sim_dir, filename), 'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else: not_found += 1 total_size += 1 print "%6s" % str(i + 1), "%20s" % filename, "%15s" % str(total_size), print "%15s" % str(not_found), print "%15.4f" % spearmans_rho(assign_ranks(manual_dict),
word_vec_files = [f for f in os.listdir(word_vec_dir) if not f.startswith('.')] # Don't read .DS_Store! num_embeddings = len(word_vec_files) word_sim_files = [f for f in os.listdir(word_sim_dir) if not f.startswith('.')] # Don't read .DS_Store! num_benchmarks = len(word_sim_files) header = ["File #","Word_embedding"] + word_sim_files scores = np.zeros((num_embeddings,num_benchmarks)) for i,word_vec_file in enumerate(word_vec_files): root,ext = os.path.splitext(word_vec_file) word_vecs = read_word_vectors(os.path.join(word_vec_dir,word_vec_file)) print "%6s" % str(i+1), "%30s" % root, for j, word_sim_file in enumerate(word_sim_files): manual_dict, auto_dict = ({}, {}) not_found, total_size = (0, 0) not_found_words = [] for line in open(os.path.join(word_sim_dir, word_sim_file),'r'): line = line.strip().lower() word1, word2, val = line.split() if word1 in word_vecs and word2 in word_vecs: manual_dict[(word1, word2)] = float(val) auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) else: