# Tests for the word-embeddings-benchmarks ("web") package.
import pickle
import tempfile
from os import path

import numpy as np

from web.datasets.analogy import fetch_google_analogy
from web.datasets.categorization import fetch_ESSLI_2c
from web.datasets.similarity import fetch_SimLex999
from web.datasets.utils import _fetch_file
from web.embedding import Embedding
from web.evaluate import (evaluate_analogy, evaluate_categorization,
                          evaluate_on_semeval_2012_2, evaluate_on_WordRep,
                          evaluate_similarity)
from web.utils import standardize_string
from web.vocabulary import Vocabulary


def test_save():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    # Round-trip through both the binary and the text word2vec formats.
    dirpath = tempfile.mkdtemp()
    Embedding.to_word2vec(w, path.join(dirpath, "tmp.bin"), binary=True)
    Embedding.to_word2vec(w, path.join(dirpath, "tmp.txt"), binary=False)
    w2 = Embedding.from_word2vec(path.join(dirpath, "tmp.bin"), binary=True)
    w3 = Embedding.from_word2vec(path.join(dirpath, "tmp.txt"), binary=False)

    # The binary round trip should be exact; the text format loses a little
    # float precision, so compare within a tolerance.
    assert np.array_equal(w.vectors, w2.vectors)
    assert w.vocabulary.words == w2.vocabulary.words
    assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
    assert w.vocabulary.words == w3.vocabulary.words

def test_analogy_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()

    # Evaluate on a fixed random subsample of 1000 questions.
    ids = np.random.RandomState(777).choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    assert results['accuracy']['all'] >= 0.65
    assert results['accuracy']['semantic'] >= 0.7
    assert results['accuracy']['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    assert results['accuracy']['all'] >= 0.7
    assert results['accuracy']['semantic'] >= 0.75
    assert results['accuracy']['syntactic'] >= 0.64

    # With a large candidate neighborhood, the multiplicative objective should
    # do at least as well as the additive one in every category.
    results_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)
    results_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)
    assert results_mul['accuracy']['all'] >= results_add['accuracy']['all']
    assert results_mul['accuracy']['syntactic'] >= results_add['accuracy']['syntactic']
    assert results_mul['accuracy']['semantic'] >= results_add['accuracy']['semantic']

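# The "add" and "mul" methods compared above correspond to the 3CosAdd and
# 3CosMul analogy objectives of Levy & Goldberg (2014). A minimal 3CosAdd
# sketch, assuming W holds unit-normalized row vectors and word2id maps words
# to row indices (both hypothetical names, not part of the web API):
def _example_3cosadd(W, word2id, a, b, c):
    # For unit vectors, argmax_d cos(d, b) - cos(d, a) + cos(d, c)
    # reduces to a dot product with (b - a + c).
    scores = W.dot(W[word2id[b]] - W[word2id[a]] + W[word2id[c]])
    for w in (a, b, c):
        scores[word2id[w]] = -np.inf  # exclude the question words themselves
    return scores.argmax()
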
def test_categorization():
    data = fetch_ESSLI_2c()
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    assert evaluate_categorization(w, data.X, data.y, seed=777, method="all") >= 0.2

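# evaluate_categorization clusters the embedding vectors and scores the
# clustering against the gold categories; with method="all" it tries several
# clustering variants and keeps the best score. A minimal cluster-purity
# sketch (my own illustration, not the library's implementation):
def _example_cluster_purity(labels_true, labels_pred):
    from collections import Counter
    correct = 0
    for cluster in set(labels_pred):
        # Count how many members of this cluster share its majority label.
        members = [t for t, p in zip(labels_true, labels_pred) if p == cluster]
        correct += Counter(members).most_common(1)[0][1]
    return correct / float(len(labels_true))
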
def test_standardize():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)

    # Cleaning drops words that standardize to the empty string.
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        if standardize_string(word, lower=False, clean_words=True):
            assert np.array_equal(
                w[word], w2[standardize_string(word, lower=False, clean_words=True)])

    # The in-place variant should yield the same vocabulary and vectors.
    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        if standardize_string(word, lower=False, clean_words=True):
            assert np.array_equal(
                w[word], w3[standardize_string(word, lower=False, clean_words=True)])

def test_save_2():
    dirpath = tempfile.mkdtemp()
    w = ["a", "b", "c"]
    vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
    e = Embedding(Vocabulary(w), vectors)
    Embedding.to_word2vec(e, path.join(dirpath, "test.bin"), binary=True)
    e2 = Embedding.from_word2vec(path.join(dirpath, "test.bin"), binary=True)
    assert np.array_equal(e2.vectors, vectors)

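# A minimal sketch of the Embedding/Vocabulary API exercised above: indexing
# an Embedding by word returns the corresponding vector row.
def _example_embedding_lookup():
    e = Embedding(Vocabulary(["a", "b", "c"]),
                  np.array([[1., 2.], [2., 3.], [3., 4.]]))
    assert np.array_equal(e["b"], np.array([2., 3.]))
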
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs=None):
    """
    Loads embeddings from a file.

    Parameters
    ----------
    fname: string
      Path to the file containing the embedding.

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'.

    normalize: bool, default: True
      If true, normalizes all vectors to unit length.

    lower: bool, default: False
      If true, lowercases all words.

    clean_words: bool, default: False
      If true, keeps only alphanumeric characters, "_" and "-".
      Warning: shouldn't be applied to embeddings with non-ascii characters.

    load_kwargs:
      Additional parameters passed to the load function. Mostly useful for the
      'glove' format, where you should pass vocab_size and dim.
    """
    load_kwargs = load_kwargs or {}
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname, "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w

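# Usage sketch for load_embedding. The GloVe file name and its vocab_size/dim
# below are assumptions (nothing here ships with the library); load_kwargs is
# forwarded to Embedding.from_glove, which needs both values for this format.
def _example_load_glove():
    w = load_embedding("glove.6B.300d.txt", format="glove", normalize=True,
                       lower=True, clean_words=False,
                       load_kwargs={"vocab_size": 400000, "dim": 300})
    return w["king"]  # a unit-length 300-d vector, since normalize=True
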
def test_similarity():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(dict(zip(w.vocabulary.words, w.vectors)), data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, \
        "evaluate_similarity should return the same result for a dict and an Embedding instance"

def test_similarity_norm():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    result_1 = evaluate_similarity(w, data.X, data.y)
    result_2 = evaluate_similarity(w_norm, data.X, data.y)

    assert result_2 > 0
    assert result_1 == result_2, \
        "evaluate_similarity should return the same result for normalized and unnormalized embeddings"

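# Why the two results above match: evaluate_similarity scores word pairs by
# cosine similarity (and correlates them with the human ratings), and cosine
# is invariant to rescaling each vector to unit length. A quick check of that
# invariance (my own illustration, independent of the web API):
def _example_cosine_scale_invariance():
    v1, v2 = np.array([1., 2., 3.]), np.array([4., 5., 6.])
    def cos(a, b):
        return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
    assert np.isclose(cos(v1, v2),
                      cos(v1 / np.linalg.norm(v1), v2 / np.linalg.norm(v2)))
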
def test_wordrep_solver():
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    # max_pairs=2 keeps the otherwise very large WordRep benchmark fast.
    P = evaluate_on_WordRep(w, max_pairs=2)
    assert P['accuracy']['all'] >= 0

def test_semeval_solver():
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    results = evaluate_on_semeval_2012_2(w)
    assert results['all'] >= 0, "Should have some results on SemEval2012"

# Fragment of a training/evaluation script. The gensim model constructor that
# these keyword arguments belong to is truncated in the source:
#     ... size=dimension, window=word_window, workers=3, negative=negative_sample)
from gensim.models import FastText  # assuming gensim's FastText implementation

model.build_vocab(sentences=sents)
model.train(sentences=sents, total_examples=len(sents), epochs=EPOCHS)
model.save(model_filename)
# model.vocabulary.save(vocab_filename)
print("Model saved at " + model_filename)

if embedding == 'w2v':
    # Embedding.from_word2vec parses the word2vec text/binary format, not
    # gensim's native save() format, so export the vectors first (the ".w2v"
    # suffix here is an assumption).
    model.wv.save_word2vec_format(model_filename + ".w2v")
    embedding_model = Embedding.from_word2vec(model_filename + ".w2v", binary=False)
else:
    embedding_model = FastText.load(model_filename)

for dset in USE_DATASETS:
    dataset = get_dataset(dset)
    # Collect every word appearing in the dataset's word pairs.
    vocab = set()
    for pair in dataset.X:
        vocab.add(pair[0])
        vocab.add(pair[1])
    if dset in ['Google', 'MSR']:
        evaluate_w2v_analogy(dset, dataset, sents, vocab, embedding_model,
                             dimension, word_window, word_count)
    else:
        # (similarity-dataset branch truncated in the source)
        ...