def checkpoint(self, epoch, sess):
    """
    Computes intrinsic scores for the current embeddings and dumps the embedding
    matrix to disk whenever the average score improves.

    Parameters
    ----------
    epoch:	Current epoch number
    sess:	TensorFlow session object

    Returns
    -------
    """
    embed_matrix, context_matrix = sess.run([self.embed_matrix, self.context_matrix])

    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    self.logger.info('Current Score: {}'.format(curr_int))

    if curr_int > self.best_int_avg:
        self.logger.info("Saving embedding matrix")
        # Use a context manager so the embedding file is closed after writing
        with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
            for wid, wrd in self.id2voc.items():
                f.write('{} {}\n'.format(wrd, ' '.join([str(round(v, 6)) for v in embed_matrix[wid].tolist()])))
        self.saver.save(sess=sess, save_path=self.save_path)
        self.best_int_avg = curr_int
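A minimal sketch of how this checkpoint hook could be driven from a TF1-style training loop; the Model class, its run_epoch method, and the params object are hypothetical stand-ins, not part of the snippet above.

import tensorflow as tf

model = Model(params)                          # hypothetical wrapper exposing checkpoint()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(params.max_epochs):     # params.max_epochs is assumed
        model.run_epoch(sess, epoch)           # hypothetical training step
        model.checkpoint(epoch, sess)          # evaluate and dump embeddings if improved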
def main():
    embeddings, name = get_embeddings()
    results = evaluate_on_all(embeddings)
    out_fname = "{}.csv".format(name)
    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
def run_job(j):
    fn, kwargs = j
    outf = path.join(opts.output_dir, fn + "_" + "_".join(str(k) + "=" + str(v) for k, v in iteritems(kwargs))) + ".csv"
    logger.info("Processing " + outf)
    if not path.exists(outf):
        w = getattr(embeddings, fn)(**kwargs)
        res = evaluate_on_all(w)
        res.to_csv(outf)
def benchmark(filepath, savepath, save_file_name):
    from web.embeddings import load_embedding
    w = load_embedding(filepath, format='word2vec')
    # out_fname = os.path.join(savepath) + save_file_name + "_results.csv"
    results = evaluate.evaluate_on_all(w)
    # print(results['AP'].item())
    for ind in results.keys():
        print(ind, results[ind].item())
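An illustrative call to the helper above; the paths are placeholders, and note that savepath and save_file_name are only consumed by the commented-out CSV export.

benchmark('embeddings/vectors.txt', 'results/', 'my_run')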
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes the average intrinsic evaluation score for the given embeddings.

    Parameters
    ----------
    embed_matrix:	Mapping from word id to embedding vector
    voc2id:		Mapping from word to word id

    Returns
    -------
    Mean of the (rounded) benchmark scores returned by evaluate_on_all
    """
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    return curr_int
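A self-contained usage sketch for evaluate(), assuming Embedding and evaluate_on_all come from the word-embeddings-benchmarks package (web), as the surrounding snippets do; the random 50-dimensional vectors are stand-ins, so the resulting score is meaningless, and evaluate_on_all downloads the benchmark datasets on first use.

import numpy as np
from web.embedding import Embedding        # used inside evaluate()
from web.evaluate import evaluate_on_all   # used inside evaluate()

voc2id = {'king': 0, 'queen': 1, 'man': 2, 'woman': 3}
embed_matrix = {wid: np.random.randn(50) for wid in voc2id.values()}
print('average intrinsic score:', evaluate(embed_matrix, voc2id))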
_, ext = os.path.splitext(fname)
if ext == ".bin":
    format = "word2vec_bin"
elif ext == ".txt":
    format = "word2vec"
elif ext == ".pkl":
    format = "dict"
# 'dict' is included so the .pkl branch above does not trip the assertion
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict'], "Unrecognized format"

load_kwargs = {}
if format == "glove":
    # GloVe text files carry no header, so infer vocabulary size and dimensionality from the file
    load_kwargs['vocab_size'] = sum(1 for line in open(fname))
    load_kwargs['dim'] = len(next(open(fname)).split()) - 1

w = load_embedding(fname, format=format, normalize=True, lower=False, clean_words=options.clean_words,
                   load_kwargs=load_kwargs)

out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w)

logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
elif ext == ".txt":
    format = "word2vec"
elif ext == ".pkl":
    format = "dict"
# 'dict' is included so the .pkl branch above does not trip the assertion
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict'], "Unrecognized format"

load_kwargs = {}
if format == "glove":
    load_kwargs['vocab_size'] = sum(1 for line in open(fname))
    load_kwargs['dim'] = len(next(open(fname)).split()) - 1

print("loading embedding...")
start = time.time()
w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
                   load_kwargs=load_kwargs)
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

out_fname = options.output if options.output else "results.csv"

print("Evaluating on embedding...")
start = time.time()
results = evaluate_on_all(w, options.entity, options.fastText_ML)
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

logger.info("Saving results...")
results.to_csv(out_fname)
            for c in set(compositional_dict[w]):
                q_c += word_vectors[c]
            q_c = q_c / len(word_vectors[c])
            word_vectors[w] += q_c
            word_vectors[w] = word_vectors[w] / 2
        except KeyError:
            pass
        # print("before", w, Y_prev[w])
        Y[w] -= word_vectors[w]
        # print("after", w, Y[w])
    return Y

Y = enhance_euclid(word_vectors, syn_dict, ant_dict, compositional_dict)
results_retrofitted = evaluate_on_all(Y)
print(results_retrofitted)
'''
{'AP': {0: 0.63681592039800994},
 'BLESS': {0: 0.82000000000000006},
 'Battig': {0: 0.41693748805199771},
 'ESSLI_1a': {0: 0.77272727272727271},
 'ESSLI_2b': {0: 0.82500000000000007},
 'ESSLI_2c': {0: 0.64444444444444449},
 'Google': {0: 0.10284486287351617},
 'MEN': {0: 0.73746469698055173},
 'MSR': {0: 0.086999999999999994},
 'MTurk': {0: 0.63318199788472018},
 'RG65': {0: 0.76952497886121318},
 'RW': {0: 0.36701861264054064},
# assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"
# load_kwargs = {}
# if format == "glove":
#     load_kwargs['vocab_size'] = sum(1 for line in open(fname))
#     load_kwargs['dim'] = len(next(open(fname)).split()) - 1
# w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
#                    load_kwargs=load_kwargs)

if os.path.exists("results.csv"):
    os.remove("results.csv")

out_fname = options.output if options.output else "results.csv"
results_sum = pd.DataFrame()
temporary_file = open("results.txt", 'w')
for word_embedding_name, w in iteritems(pretrained_word_embeddings):
    results = evaluate.evaluate_on_all(w, word_embedding_name)
    logger.info("Saving results... {}".format(word_embedding_name))
    print("results:", results)
    temporary_file.write(str(results) + '\n')
    results_sum = results_sum.append(results)
results_sum.to_csv(out_fname)
temporary_file.close()

# results = evaluate_on_all(w)
# print(results)
# results.to_csv(out_fname)
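The loop above iterates over a pretrained_word_embeddings dict built elsewhere; a hedged sketch of how it could be assembled with fetchers from web.embeddings (the fetcher names and keyword arguments shown are assumptions that should be checked against the installed version of the package):

from web.embeddings import fetch_GloVe, fetch_SG_GoogleNews

# Assumed fetcher signatures; both download the vectors on first call.
pretrained_word_embeddings = {
    "glove-wiki-6B-300d": fetch_GloVe(corpus="wiki-6B", dim=300),
    "word2vec-googlenews": fetch_SG_GoogleNews(),
}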
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict', 'dict_poly'], "Unrecognized format"

load_kwargs = {}
if format == "glove":
    vocab_size = sum(1 for line in open(fname))
    dim = len(next(open(fname)).split()) - 1
    load_kwargs = {'dim': dim, 'vocab_size': vocab_size}

w = load_embedding(fname,
                   format=format,
                   normalize=options.normalize,
                   clean_words=options.clean_words,
                   lower=options.lowercase,
                   lowercase_if_OOV=options.lower_or_lemma,
                   lemmatize_if_OOV=options.lower_or_lemma,
                   load_kwargs=load_kwargs)

out_fname = options.output if options.output else "results.csv"
if options.multi_prototype:
    results = evaluate_on_all_multi(w, options.model)
else:
    results = evaluate_on_all(w, only_sim_rel=options.only_sim_rel)

logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
if ext == ".bin":
    format = "word2vec_bin"
elif ext == ".txt":
    format = "word2vec"
elif ext == ".pkl":
    format = "dict"
# assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

load_kwargs = {}
if format == "glove":
    vocab_size = sum(1 for line in open(fname))
    dim = len(next(open(fname)).split()) - 1
    load_kwargs = {"vocab_size": vocab_size, "dim": dim}

w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
                   load_kwargs=load_kwargs, nonorm=options.nonorm, nocheat=options.nocheat)

out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w, options.nocheat)

logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
fname = os.path.join(_get_dataset_dir(), fname)

format = options.format
if not format:
    _, ext = os.path.splitext(fname)
    if ext == ".bin":
        format = "word2vec_bin"
    elif ext == ".txt":
        format = "word2vec"
    elif ext == ".pkl":
        format = "dict"
# 'dict' is included so the .pkl branch above does not trip the assertion
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict'], "Unrecognized format"

load_kwargs = {}
if format == "glove":
    # Pass the inferred vocabulary size and dimensionality on to the GloVe loader
    vocab_size = sum(1 for line in open(fname))
    dim = len(next(open(fname)).split()) - 1
    load_kwargs = {'vocab_size': vocab_size, 'dim': dim}

w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
                   load_kwargs=load_kwargs)

out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w)

logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
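A programmatic equivalent of the option-driven script bodies above, sketched under the assumption that a word2vec-format text file exists at the placeholder path; load_embedding and evaluate_on_all are used exactly as in the snippets.

from web.embeddings import load_embedding
from web.evaluate import evaluate_on_all

# Placeholder path; swap in a real word2vec-format text file.
w = load_embedding("embeddings/my_vectors.txt", format="word2vec",
                   normalize=True, lower=True, clean_words=False)
results = evaluate_on_all(w)
results.to_csv("results.csv")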