Example #1
def test_inplace_transform_word_OrderedVocabulary():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    cw = OrderedVocabulary(words=['dog', 'cat', '  cat'])

    e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]]))
    pe = e.transform_words(lambda x: x.strip(), inplace=True)

    assert pe is e and pe == e

    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    # 'dog'
    assert [0, 0, 11] in pe.vectors.tolist()
    # 'cat'
    assert [0, 11, 12] in pe.vectors.tolist()

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 11])

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 11, 12])

    assert type(pe.vocabulary) == OrderedVocabulary
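
Example #1 and the later snippets omit their imports. A minimal sketch of what they appear to assume, based on the word-embeddings-benchmarks ("web") package layout (module paths are an assumption, not shown in the snippets themselves):

import logging
import sys
import tempfile
from os import path

import numpy as np
from six import iteritems

# Assumed module layout of the word-embeddings-benchmarks ("web") package.
from web.embedding import Embedding
from web.vocabulary import Vocabulary, OrderedVocabulary, CountedVocabulary
from web.evaluate import evaluate_similarity
from web.datasets.similarity import (fetch_WS353, fetch_RG65, fetch_RW,
                                     fetch_MTurk, fetch_MEN, fetch_SimLex999)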
Example #2
def test_inplace_transform_word_CountedVocabulary():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    cw = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])

    e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]]))
    pe = e.transform_words(lambda x: x.strip(), inplace=True)

    assert pe is e and pe == e

    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    # 'dog'
    assert [0, 0, 11] in pe.vectors.tolist()
    # 'cat'
    assert [0, 11, 12] in pe.vectors.tolist()

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words

    l = pe.vocabulary.getstate()
    d = {l[0][i]: l[1][i] for i in range(len(l[0]))}

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 11])
    assert d['dog'] == 60

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 11, 12])
    assert d['cat'] == 50

    assert type(pe.vocabulary) == CountedVocabulary
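
Judging from Examples #2 and #5, CountedVocabulary.getstate() appears to return parallel sequences of words and counts. The helper below is a hypothetical convenience wrapper for the dict-building pattern used in those tests, not part of the library itself:

def counts_as_dict(vocab):
    # getstate() appears to yield (words, counts) as parallel sequences;
    # zip them into a {word: count} mapping.
    state = vocab.getstate()
    return dict(zip(state[0], state[1]))

# e.g. counts_as_dict(pe.vocabulary)['dog'] == 60 in the test above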
Example #3
def test_noinplace_transform_word_prefer_shortestword2_Vocabulary():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    cw = Vocabulary(words=['dog', 'cat', '    pikatchu   ', 'pikatchu', ' cat '])
    e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]]))
    pe = e.transform_words(lambda x: x.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    # 'dog'
    assert [0, 0, 1] in pe.vectors.tolist()
    # 'cat'
    assert [0, 1, 11] in pe.vectors.tolist()
    # pikatchu
    assert [0, 12, 13] in pe.vectors.tolist()

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words
    assert 'pikatchu' in pe.vocabulary.words

    # pikatchu
    assert pe.vocabulary.words[2] == 'pikatchu'
    assert np.array_equal(pe.vectors[2], [0, 12, 13])

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 1])

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 1, 11])

    assert type(pe.vocabulary) == Vocabulary
Example #4
def test_save_2():
    dirpath = tempfile.mkdtemp()
    w = ["a", "b", "c"]
    vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
    e = Embedding(Vocabulary(w), vectors)
    Embedding.to_word2vec(e, path.join(dirpath, "test.bin"), binary=True)
    e2 = Embedding.from_word2vec(path.join(dirpath, "test.bin"), binary=True)
    assert np.array_equal(e2.vectors, vectors)
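
A similar round trip should also work through the plain-text word2vec format; a minimal sketch, assuming to_word2vec/from_word2vec accept binary=False for the text format the same way they accept binary=True above:

import tempfile
from os import path

import numpy as np

from web.embedding import Embedding
from web.vocabulary import Vocabulary

# Write and re-read the embedding in the text word2vec format.
dirpath = tempfile.mkdtemp()
vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
e = Embedding(Vocabulary(["a", "b", "c"]), vectors)
Embedding.to_word2vec(e, path.join(dirpath, "test.txt"), binary=False)
e2 = Embedding.from_word2vec(path.join(dirpath, "test.txt"), binary=False)
assert np.array_equal(e2.vectors, vectors)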
Example #5
def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary():
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    cw = CountedVocabulary(
        word_count=[('dog', 60), ('cat', 50), ('    pikatchu   ', 10), ('pikatchu', 10), (' cat ', 5)])

    e = Embedding(vocabulary=cw, vectors=np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]]))
    pe = e.transform_words(lambda x: x.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    # 'dog'
    assert [0, 0, 1] in pe.vectors.tolist()
    # 'cat'
    assert [0, 1, 11] in pe.vectors.tolist()
    # pikatchu
    assert [0, 12, 13] in pe.vectors.tolist()

    assert 'cat' in pe.vocabulary.words
    assert 'dog' in pe.vocabulary.words
    assert 'pikatchu' in pe.vocabulary.words

    l = pe.vocabulary.getstate()
    d = {l[0][i]: l[1][i] for i in range(len(l[0]))}

    # pikatchu
    assert pe.vocabulary.words[2] == 'pikatchu'
    assert np.array_equal(pe.vectors[2], [0, 12, 13])
    assert d['pikatchu'] == 10

    # dog
    assert pe.vocabulary.words[0] == 'dog'
    assert np.array_equal(pe.vectors[0], [0, 0, 1])
    assert d['dog'] == 60

    # cat
    assert pe.vocabulary.words[1] == 'cat'
    assert np.array_equal(pe.vectors[1], [0, 1, 11])
    assert d['cat'] == 50

    assert type(pe.vocabulary) == CountedVocabulary
Example #6
def web_tests(emb):
    """
    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }

    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        # Evaluate each task once and reuse the score for both the result dict and the log line.
        score = evaluate_similarity(web_emb, data.X, data.y)
        similarity_results[name] = score
        logging.info("Spearman correlation of scores on {} {}".format(name, score))
    return similarity_results
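
A usage sketch for web_tests; the toy word vectors below are made up purely for illustration, and the fetch_* calls download the benchmark datasets on first use:

import logging
import sys

import numpy as np

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Hypothetical 3-dimensional toy vectors keyed by word, just to show the call shape;
# real use would pass trained embeddings of a realistic dimensionality.
toy_emb = {
    "dog": np.array([0.1, 0.2, 0.3]),
    "cat": np.array([0.2, 0.1, 0.4]),
    "car": np.array([0.9, 0.8, 0.7]),
}

scores = web_tests(toy_emb)
for task, rho in sorted(scores.items()):
    print(task, rho)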