Beispiel #1
0
def test_save():
    """Round-trip a downloaded embedding through word2vec binary and text files.

    The embedding reloaded from the binary file must have exactly the same
    vectors and vocabulary words as the source; the one reloaded from the
    text file must have the same words and vectors whose summed absolute
    difference from the source is below 1e-5.
    """
    import shutil  # local import: only needed to clean up the temp directory

    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    dirpath = tempfile.mkdtemp()
    try:
        bin_path = path.join(dirpath, "tmp.bin")
        txt_path = path.join(dirpath, "tmp.txt")
        # Called on the class (as test_save_2 does) since the embedding is
        # already passed explicitly as the first argument.
        Embedding.to_word2vec(w, bin_path, binary=True)
        Embedding.to_word2vec(w, txt_path, binary=False)
        w2 = Embedding.from_word2vec(bin_path, binary=True)
        w3 = Embedding.from_word2vec(txt_path, binary=False)
        assert np.array_equal(w.vectors, w2.vectors)
        assert w.vocabulary.words == w2.vocabulary.words
        assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
        assert w.vocabulary.words == w3.vocabulary.words
    finally:
        # mkdtemp() leaves deletion to the caller; remove the directory even
        # when an assertion fails, without masking the original error.
        shutil.rmtree(dirpath, ignore_errors=True)
Beispiel #2
0
def test_save():
    """Save an embedding as word2vec binary and text, reload both, and compare.

    Binary reload: vectors must be array-equal and vocabulary words equal.
    Text reload: vocabulary words equal and the summed absolute vector
    difference must stay below 1e-5.
    """
    import shutil  # local import: used only for temp-directory cleanup

    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    dirpath = tempfile.mkdtemp()
    try:
        targets = {
            True: path.join(dirpath, "tmp.bin"),
            False: path.join(dirpath, "tmp.txt"),
        }
        # Write the binary file first, then the text file, as before.
        Embedding.to_word2vec(w, targets[True], binary=True)
        Embedding.to_word2vec(w, targets[False], binary=False)
        w2 = Embedding.from_word2vec(targets[True], binary=True)
        w3 = Embedding.from_word2vec(targets[False], binary=False)
        assert np.array_equal(w.vectors, w2.vectors)
        assert w.vocabulary.words == w2.vocabulary.words
        assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
        assert w.vocabulary.words == w3.vocabulary.words
    finally:
        # The directory returned by mkdtemp() is never deleted automatically.
        shutil.rmtree(dirpath, ignore_errors=True)
Beispiel #3
0
def test_analogy_solver():
    """Check evaluate_analogy accuracy floors on a fixed 1000-item sample.

    Draws 1000 rows (seed 777, without replacement) from the data returned by
    fetch_google_analogy(), then asserts minimum accuracies for the default
    method and for method="mul", and finally that "mul" is at least as
    accurate as "add" in every group when k=400.
    """
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    # Fixed seed so every run evaluates the same subset.
    rng = np.random.RandomState(777)
    ids = rng.choice(range(data.X.shape[0]), 1000, replace=False)
    X = data.X[ids]
    y = data.y[ids]
    category = data.category_high_level[ids]

    default_acc = evaluate_analogy(w=w, X=X, y=y, category=category)['accuracy']
    for group, floor in (('all', 0.65), ('semantic', 0.7), ('syntactic', 0.63)):
        assert default_acc[group] >= floor

    mul_acc = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")['accuracy']
    for group, floor in (('all', 0.7), ('semantic', 0.75), ('syntactic', 0.64)):
        assert mul_acc[group] >= floor

    shared = dict(w=w, X=X, y=y, category=category, k=400)
    mul_k_acc = evaluate_analogy(method="mul", **shared)['accuracy']
    add_k_acc = evaluate_analogy(method="add", **shared)['accuracy']
    for group in ('all', 'syntactic', 'semantic'):
        assert mul_k_acc[group] >= add_k_acc[group]
Beispiel #4
0
def test_categorization():
    """Check that evaluate_categorization scores at least 0.2 on fetch_ESSLI_2c() data."""
    data = fetch_ESSLI_2c()
    file_name = _fetch_file(
        "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1",
        "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    # seed is fixed so the score is reproducible between runs.
    score = evaluate_categorization(w, data.X, data.y, seed=777, method="all")
    assert score >= 0.2
Beispiel #5
0
def test_standardize():
    """Check standardize_words in both copy (inplace=False) and in-place modes.

    After standardizing with lower=False, clean_words=True the embedding must
    hold 95 words, and every original word whose standardized form is
    non-empty must keep its vector under the standardized key.
    """
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        key = standardize_string(word, lower=False, clean_words=True)
        if key:
            assert np.array_equal(w[word], w2[key])

    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        # Guard on the exact key that is looked up; previously the guard
        # omitted clean_words=True while the lookup passed it.
        key = standardize_string(word, lower=False, clean_words=True)
        if key:
            assert np.array_equal(w[word], w3[key])
Beispiel #6
0
def test_save_2():
    """Round-trip a hand-built 3-word, 2-dimensional embedding through a binary file.

    The vectors read back by Embedding.from_word2vec must be array-equal to
    the ones that were written.
    """
    import shutil  # local import: only needed to clean up the temp directory

    dirpath = tempfile.mkdtemp()
    try:
        w = ["a", "b", "c"]
        vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
        e = Embedding(Vocabulary(w), vectors)
        bin_path = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, bin_path, binary=True)
        e2 = Embedding.from_word2vec(bin_path, binary=True)
        assert np.array_equal(e2.vectors, vectors)
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(dirpath, ignore_errors=True)
Beispiel #7
0
def test_standardize():
    """Verify standardize_words keeps vectors aligned with standardized keys.

    Covers the copying call (inplace=False) and the in-place call, both with
    lower=False and clean_words=True; each result must contain 95 words.
    """
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        standardized = standardize_string(word, lower=False, clean_words=True)
        # Words that standardize to an empty string have no key to compare.
        if standardized:
            assert np.array_equal(w[word], w2[standardized])

    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        # Use the same arguments for the guard and the lookup key; the guard
        # used to be computed without clean_words=True.
        standardized = standardize_string(word, lower=False, clean_words=True)
        if standardized:
            assert np.array_equal(w[word], w3[standardized])
Beispiel #8
0
def test_save_2():
    """Write a tiny embedding as binary word2vec and check the vectors reload unchanged."""
    import shutil  # local import: used only for temp-directory cleanup

    dirpath = tempfile.mkdtemp()
    try:
        w = ["a", "b", "c"]
        vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
        e = Embedding(Vocabulary(w), vectors)
        target = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, target, binary=True)
        e2 = Embedding.from_word2vec(target, binary=True)
        assert np.array_equal(e2.vectors, vectors)
    finally:
        # The directory created by mkdtemp() is not removed automatically.
        shutil.rmtree(dirpath, ignore_errors=True)
Beispiel #9
0
def load_embedding(fname,
                   format="word2vec_bin",
                   normalize=True,
                   lower=False,
                   clean_words=False,
                   load_kwargs=None):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vectors to unit length
      (calls ``normalize_words(inplace=True)``)

    lower: bool, default: False
      Forwarded as ``lower`` to ``standardize_words``, which is called
      when either ``lower`` or ``clean_words`` is true

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs: dict or None, default: None
      Additional parameters passed to load function. Only forwarded for the
      'glove' format, where you should pass vocab_size and dim.
      None is treated as an empty dict.

    Returns
    -------
    w:
      Object returned by the matching ``Embedding.from_*`` constructor,
      after the optional normalization and standardization steps.

    Raises
    ------
    AssertionError
      If ``format`` is not one of the supported values.
    """
    # Kept as an assert so callers still see AssertionError for a bad format.
    assert format in ['word2vec_bin', 'word2vec', 'glove',
                      'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **(load_kwargs or {}))
    elif format == "dict":
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        # The context manager closes the handle, which was previously leaked.
        with open(fname, "rb") as f:
            d = pickle.load(f, encoding='latin1')
        w = Embedding.from_dict(d)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
def test_similarity():
    """Check evaluate_similarity gives the same score for an Embedding and a plain dict.

    The dict maps each vocabulary word to its vector; the score on the data
    returned by fetch_SimLex999() must be positive and identical for both inputs.
    """
    file_name = _fetch_file(
        "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1",
        "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    score_from_embedding = evaluate_similarity(w, data.X, data.y)
    word_to_vector = dict(zip(w.vocabulary.words, w.vectors))
    score_from_dict = evaluate_similarity(word_to_vector, data.X, data.y)

    assert score_from_dict > 0
    assert score_from_embedding == score_from_dict, "evaluate_similarity should return same result for dict and Embedding instance"
Beispiel #11
0
def test_similarity_norm():
    """Check evaluate_similarity is unchanged by normalize_words().

    Scores the original embedding and the result of w.normalize_words() on
    the data from fetch_SimLex999(); the score must be positive and equal.
    """
    file_name = _fetch_file(
        "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1",
        "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    raw_score = evaluate_similarity(w, data.X, data.y)
    normalized_score = evaluate_similarity(w_norm, data.X, data.y)

    assert normalized_score > 0
    assert raw_score == normalized_score, "evaluate_similarity should return same result for normalized and unnormalized words"
Beispiel #12
0
def test_similarity_norm():
    """Compare similarity scores of an embedding before and after normalize_words().

    Both scores are computed on fetch_SimLex999() data and must be equal;
    the normalized score must also be greater than zero.
    """
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    w = Embedding.from_word2vec(_fetch_file(url, "test"), binary=True)
    w_norm = w.normalize_words()
    data = fetch_SimLex999()

    # Evaluate the unnormalized embedding first, then the normalized one.
    scores = [evaluate_similarity(emb, data.X, data.y) for emb in (w, w_norm)]
    unnormalized, normalized = scores

    assert normalized > 0
    assert unnormalized == normalized, "evaluate_similarity should return same result for normalized and unnormalized words"
Beispiel #13
0
def test_analogy_solver():
    """Assert accuracy floors for evaluate_analogy on a seeded 1000-row sample.

    Three checks, in order: the default method, method="mul", and a
    group-by-group comparison of "mul" against "add" with k=400.
    """
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()
    # Seeded generator keeps the sampled rows identical between runs.
    sampler = np.random.RandomState(777)
    ids = sampler.choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    results = evaluate_analogy(w=w, X=X, y=y, category=category)
    accuracy = results['accuracy']
    assert accuracy['all'] >= 0.65
    assert accuracy['semantic'] >= 0.7
    assert accuracy['syntactic'] >= 0.63

    results = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")
    accuracy = results['accuracy']
    assert accuracy['all'] >= 0.7
    assert accuracy['semantic'] >= 0.75
    assert accuracy['syntactic'] >= 0.64

    common = {"w": w, "X": X, "y": y, "category": category, "k": 400}
    results_mul = evaluate_analogy(method="mul", **common)
    results_add = evaluate_analogy(method="add", **common)
    accuracy_mul = results_mul['accuracy']
    accuracy_add = results_add['accuracy']
    assert accuracy_mul['all'] >= accuracy_add['all']
    assert accuracy_mul['syntactic'] >= accuracy_add['syntactic']
    assert accuracy_mul['semantic'] >= accuracy_add['semantic']
def test_categorization():
    """Require evaluate_categorization (seed=777, method="all") to reach 0.2 on fetch_ESSLI_2c() data."""
    data = fetch_ESSLI_2c()
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    w = Embedding.from_word2vec(_fetch_file(url, "test"), binary=True)
    result = evaluate_categorization(w, data.X, data.y, seed=777, method="all")
    assert result >= 0.2
Beispiel #15
0
def test_wordrep_solver():
    """Smoke test: evaluate_on_WordRep with max_pairs=2 reports a non-negative overall accuracy."""
    file_name = _fetch_file(
        "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1",
        "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    overall_accuracy = evaluate_on_WordRep(w, max_pairs=2)['accuracy']['all']
    assert overall_accuracy >= 0
Beispiel #16
0
def test_semeval_solver():
    """Smoke test: evaluate_on_semeval_2012_2 returns a non-negative 'all' score."""
    file_name = _fetch_file(
        "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1",
        "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    overall = evaluate_on_semeval_2012_2(w)['all']
    assert overall >= 0, "Should have some results on SemEval2012"
Beispiel #17
0
def test_semeval_solver():
    """Run evaluate_on_semeval_2012_2 on a small embedding and require results['all'] >= 0."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    w = Embedding.from_word2vec(_fetch_file(url, "test"), binary=True)
    scores = evaluate_on_semeval_2012_2(w)
    assert scores['all'] >= 0, "Should have some results on SemEval2012"
Beispiel #18
0
def test_wordrep_solver():
    """Run evaluate_on_WordRep (max_pairs=2) and require a non-negative ['accuracy']['all']."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    w = Embedding.from_word2vec(_fetch_file(url, "test"), binary=True)
    report = evaluate_on_WordRep(w, max_pairs=2)
    accuracy = report['accuracy']
    assert accuracy['all'] >= 0
Beispiel #19
0
                                         size=dimension,
                                         window=word_window,
                                         workers=3,
                                         negative=negative_sample)
                        model.build_vocab(sentences=sents)
                        model.train(sentences=sents,
                                    total_examples=len(sents),
                                    epochs=EPOCHS)
                        model.save(model_filename)
                        # model.vocabulary.save(vocab_filename)

                    print("Model saved at " + model_filename)

                embedding_model = []
                if embedding == 'w2v':
                    embedding_model = Embedding.from_word2vec(model_filename)
                else:
                    embedding_model = FastText.load(model_filename)

                for dset in USE_DATASETS:
                    dataset = get_dataset(dset)
                    vocab = set()
                    for pair in dataset.X:
                        vocab.add(pair[0])
                        vocab.add(pair[1])

                    if dset in ['Google', 'MSR']:
                        evaluate_w2v_analogy(dset, dataset, sents, vocab,
                                             embedding_model, dimension,
                                             word_window, word_count)
                    else: