Ejemplo n.º 1
0
	def checkpoint(self, epoch, sess):
		"""
		Computes intrinsic scores for the current embeddings and, when their
		average beats the best average seen so far, dumps the embeddings to a
		text file and saves the session through self.saver.

		Parameters
		----------
		epoch:		Current epoch number (not used by this method)
		sess:		Tensorflow session object

		Returns
		-------
		None
		"""
		# Both matrices are fetched as before; only the embedding matrix is used below.
		embed_matrix, _ = sess.run([self.embed_matrix, self.context_matrix])
		voc2vec = {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
		embedding = Embedding.from_dict(voc2vec)
		results = evaluate_on_all(embedding)
		# Keep the first value of every result, rounded to 4 decimals
		results = {key: round(val[0], 4) for key, val in results.items()}
		curr_int = np.mean(list(results.values()))
		self.logger.info('Current Score: {}'.format(curr_int))

		if curr_int > self.best_int_avg:
			self.logger.info("Saving embedding matrix")
			# The context manager flushes and closes the file even if a write fails;
			# previously the handle was never closed.
			with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
				for wid, wrd in self.id2voc.items():
					vec = ' '.join(str(round(v, 6)) for v in embed_matrix[wid].tolist())
					f.write('{} {}\n'.format(wrd, vec))

			self.saver.save(sess=sess, save_path=self.save_path)
			self.best_int_avg = curr_int
Ejemplo n.º 2
0
def main():
    """Fetch embeddings, run evaluate_on_all on them, print the results and save them as ``<name>.csv``."""
    vectors, label = get_embeddings()
    scores = evaluate_on_all(vectors)

    # The output file is named after the label returned with the embeddings
    csv_path = "{}.csv".format(label)

    logger.info("Saving results...")
    print(scores)
    scores.to_csv(csv_path)
Ejemplo n.º 3
0
def run_job(j):
    """Run one evaluation job and write its results to a CSV file.

    ``j`` is a ``(fn, kwargs)`` pair: ``fn`` names an attribute of ``embeddings``
    that is called with ``**kwargs``. The output file name is built from ``fn``
    and the keyword arguments; the job is skipped when that file already exists.
    """
    fn, kwargs = j

    # Encode every keyword argument as "key=value" in the file name
    pairs = [str(key) + "=" + str(value) for key, value in iteritems(kwargs)]
    outf = path.join(opts.output_dir, fn + "_" + "_".join(pairs)) + ".csv"
    logger.info("Processing " + outf)

    if path.exists(outf):
        return

    fetch = getattr(embeddings, fn)
    evaluate_on_all(fetch(**kwargs)).to_csv(outf)
def run_job(j):
    """Evaluate the embedding described by ``j`` unless its CSV output already exists.

    ``j`` unpacks into ``(fn, kwargs)``; ``getattr(embeddings, fn)`` is called
    with ``**kwargs`` and the result of ``evaluate_on_all`` on it is written
    to a CSV file under ``opts.output_dir``.
    """
    fn, kwargs = j

    # File name: <fn>_<k1>=<v1>_<k2>=<v2>....csv
    suffix = "_".join([str(name) + "=" + str(val) for name, val in iteritems(kwargs)])
    outf = path.join(opts.output_dir, fn + "_" + suffix) + ".csv"
    logger.info("Processing " + outf)

    already_done = path.exists(outf)
    if not already_done:
        factory = getattr(embeddings, fn)
        vectors = factory(**kwargs)
        scores = evaluate_on_all(vectors)
        scores.to_csv(outf)
Ejemplo n.º 5
0
def benchmark(filepath, savepath, save_file_name):
    """Load embeddings from ``filepath`` in 'word2vec' format and print every evaluation result.

    ``savepath`` and ``save_file_name`` are accepted but not used by this body;
    nothing is written to disk.
    """
    from web.embeddings import load_embedding

    vectors = load_embedding(filepath, format='word2vec')
    scores = evaluate.evaluate_on_all(vectors)

    # One line per result key: the key followed by its scalar value
    for key in scores.keys():
        value = scores[key].item()
        print(key, value)
Ejemplo n.º 6
0
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes the mean intrinsic score of the given embeddings.

    Parameters
    ----------
    embed_matrix:    Container of vectors, indexed by word id
    voc2id:          Mapping from word to its id in embed_matrix

    Returns
    -------
    Mean, over all results returned by evaluate_on_all, of each result's
    first value rounded to 4 decimals
    """
    # NOTE: the return annotation used to be ``np.float``, an alias of the
    # builtin ``float`` that was removed in NumPy 1.24; evaluating the ``def``
    # raised AttributeError there. The builtin is equivalent.
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    # Keep the first value of every result, rounded to 4 decimals
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    return curr_int
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        assert format in ['word2vec_bin', 'word2vec', 'glove',
                          'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            load_kwargs['vocab_size'] = sum(1 for line in open(fname))
            load_kwargs['dim'] = len(next(open(fname)).split()) - 1

        w = load_embedding(fname,
                           format=format,
                           normalize=True,
                           lower=False,
                           clean_words=options.clean_words,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
Ejemplo n.º 8
0
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        assert format in ['word2vec_bin', 'word2vec',
                          'glove', 'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            load_kwargs['vocab_size'] = sum(1 for line in open(fname))
            load_kwargs['dim'] = len(next(open(fname)).split()) - 1

        print ("loading embedding...")
        start = time.time()
        w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
                           load_kwargs=load_kwargs)
        elapsed_time = time.time() - start
        print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

    out_fname = options.output if options.output else "results.csv"

    print ("Evaluating on embedding...")
    start = time.time()
    results = evaluate_on_all(w, options.entity, options.fastText_ML)
    elapsed_time = time.time() - start
    print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")

    logger.info("Saving results...")
    results.to_csv(out_fname)
Ejemplo n.º 9
0
    
                for c in set(compositional_dict[w]):
                    q_c += word_vectors[c]
                q_c = q_c / len(word_vectors[c])
                word_vectors[w] += q_c
                word_vectors[w] = word_vectors[w] / 2

            except KeyError:
                pass
            #print("before",w, Y_prev[w])
        Y[w] -= word_vectors[w]
            #print("after",w, Y[w])
    return Y

# Run enhance_euclid on the word vectors together with syn_dict, ant_dict and
# compositional_dict (presumably synonym / antonym / compositional lookups —
# TODO confirm), then evaluate the vectors it returns.
Y = enhance_euclid(word_vectors,syn_dict,ant_dict,compositional_dict)
results_retrofitted = evaluate_on_all(Y)
# Show the evaluation results on stdout
print(results_retrofitted)

'''
{'AP': {0: 0.63681592039800994},
 'BLESS': {0: 0.82000000000000006},
 'Battig': {0: 0.41693748805199771},
 'ESSLI_1a': {0: 0.77272727272727271},
 'ESSLI_2b': {0: 0.82500000000000007},
 'ESSLI_2c': {0: 0.64444444444444449},
 'Google': {0: 0.10284486287351617},
 'MEN': {0: 0.73746469698055173},
 'MSR': {0: 0.086999999999999994},
 'MTurk': {0: 0.63318199788472018},
 'RG65': {0: 0.76952497886121318},
 'RW': {0: 0.36701861264054064},
Ejemplo n.º 10
0
    #     assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

    #     load_kwargs = {}
    #     if format == "glove":
    #         load_kwargs['vocab_size'] = sum(1 for line in open(fname))
    #         load_kwargs['dim'] = len(next(open(fname)).split()) - 1

    #     w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
    #                        load_kwargs=load_kwargs)

    if os.path.exists("results.csv"):
        os.remove("results.csv")
    out_fname = options.output if options.output else "results.csv"

    results_sum = pd.DataFrame()

    temporary_file = open("results.txt", 'w')
    for word_embedding_name, w in iteritems(pretrained_word_embeddings):
        results = evaluate.evaluate_on_all(w, word_embedding_name)
        logger.info("Saving results... {}".format(word_embedding_name))
        print("results:", results)
        temporary_file.write(str(results) + '\n')
        results_sum = results_sum.append(results)
    results_sum.to_csv(out_fname)
    temporary_file.close()

    # results = evaluate_on_all(w)
    # print(results)
    # results.to_csv(out_fname)
Ejemplo n.º 11
0
        assert format in [
            'word2vec_bin', 'word2vec', 'glove', 'bin', 'dict', 'dict_poly'
        ], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1
            load_kwargs = {'dim': dim, 'vocab_size': vocab_size}

        w = load_embedding(fname,
                           format=format,
                           normalize=options.normalize,
                           clean_words=options.clean_words,
                           lower=options.lowercase,
                           lowercase_if_OOV=options.lower_or_lemma,
                           lemmatize_if_OOV=options.lower_or_lemma,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    if options.multi_prototype:
        results = evaluate_on_all_multi(w, options.model)
    else:
        results = evaluate_on_all(w, only_sim_rel=options.only_sim_rel)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
Ejemplo n.º 12
0
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

#		assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1
            load_kwargs = {"vocab_size": vocab_size, "dim": dim}
        w = load_embedding(fname,
                           format=format,
                           normalize=True,
                           lower=True,
                           clean_words=options.clean_words,
                           load_kwargs=load_kwargs,
                           nonorm=options.nonorm,
                           nocheat=options.nocheat)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w, options.nocheat)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
            fname = os.path.join(_get_dataset_dir(), fname)

        format = options.format

        if not format:
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1

        w = load_embedding(fname, format=format, normalize=True, lower=True, clean_words=options.clean_words,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)