def test_inplace_transform_word_OrderedVocabulary():
    """Check in-place ``transform_words`` on an ``OrderedVocabulary`` embedding.

    Stripping whitespace makes ``' cat'`` collide with ``'cat'``. The test
    expects the result to be the same object as the input, to contain only
    two words ('dog' at index 0, 'cat' at index 1) with the vectors that
    originally belonged to 'dog' and 'cat', and to keep the vocabulary type.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = OrderedVocabulary(words=['dog', 'cat', ' cat'])
    emb = Embedding(
        vocabulary=vocab,
        vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]]),
    )

    result = emb.transform_words(lambda x: x.strip(), inplace=True)

    # inplace=True must hand back the very same (and equal) object.
    assert result is emb and result == emb

    # The colliding ' cat' entry is expected to be gone.
    assert len(result.vocabulary) == 2
    assert len(result.vectors) == 2

    # Surviving vectors: first the one for 'dog', then the one for 'cat'.
    for surviving_vector in ([0, 0, 11], [0, 11, 12]):
        assert surviving_vector in result.vectors.tolist()

    for surviving_word in ('cat', 'dog'):
        assert surviving_word in result.vocabulary.words

    # Position-by-position check: (index, word, vector).
    expected_layout = (
        (0, 'dog', [0, 0, 11]),
        (1, 'cat', [0, 11, 12]),
    )
    for index, word, vector in expected_layout:
        assert result.vocabulary.words[index] == word
        assert np.array_equal(result.vectors[index], vector)

    # The vocabulary must keep its exact class, not merely a compatible one.
    assert type(result.vocabulary) is OrderedVocabulary
def test_inplace_transform_word_CountedVocabulary():
    """Check in-place ``transform_words`` on a ``CountedVocabulary`` embedding.

    Stripping whitespace makes ``' cat '`` collide with ``'cat'``. The test
    expects the result to be the same object as the input, to contain only
    'dog' (index 0, count 60) and 'cat' (index 1, count 50) with their
    original vectors, and to keep the vocabulary type.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = CountedVocabulary(
        word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])
    emb = Embedding(
        vocabulary=vocab,
        vectors=np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]]),
    )

    result = emb.transform_words(lambda x: x.strip(), inplace=True)

    # inplace=True must hand back the very same (and equal) object.
    assert result is emb and result == emb

    # The colliding ' cat ' entry is expected to be gone.
    assert len(result.vocabulary) == 2
    assert len(result.vectors) == 2

    # Surviving vectors: first the one for 'dog', then the one for 'cat'.
    for surviving_vector in ([0, 0, 11], [0, 11, 12]):
        assert surviving_vector in result.vectors.tolist()

    for surviving_word in ('cat', 'dog'):
        assert surviving_word in result.vocabulary.words

    # getstate() is indexed as [0] -> words, [1] -> counts (same positions).
    state = result.vocabulary.getstate()
    state_words, state_counts = state[0], state[1]
    count_of = {}
    for position in range(len(state_words)):
        count_of[state_words[position]] = state_counts[position]

    # Position-by-position check: (index, word, vector, count).
    expected_layout = (
        (0, 'dog', [0, 0, 11], 60),
        (1, 'cat', [0, 11, 12], 50),
    )
    for index, word, vector, count in expected_layout:
        assert result.vocabulary.words[index] == word
        assert np.array_equal(result.vectors[index], vector)
        assert count_of[word] == count

    # The vocabulary must keep its exact class, not merely a compatible one.
    assert type(result.vocabulary) is CountedVocabulary
def test_noinplace_transform_word_prefer_shortestword2_Vocabulary():
    """Check non-in-place ``transform_words`` on a plain ``Vocabulary``.

    Stripping whitespace makes ``' pikatchu '`` collide with ``'pikatchu'``
    and ``' cat '`` collide with ``'cat'``. The test expects three words to
    remain ('dog', 'cat', 'pikatchu' at indices 0, 1, 2), each carrying the
    vector that belonged to the already-stripped spelling, and the
    vocabulary type to stay ``Vocabulary``.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = Vocabulary(
        words=['dog', 'cat', ' pikatchu ', 'pikatchu', ' cat '])
    emb = Embedding(
        vocabulary=vocab,
        vectors=np.asanyarray([[0, 0, 1],
                               [0, 1, 11],
                               [0, 11, 12],
                               [0, 12, 13],
                               [0, 13, 14]]),
    )

    result = emb.transform_words(lambda x: x.strip(), inplace=False)

    # Two of the five entries collide after stripping.
    assert len(result.vocabulary) == 3
    assert len(result.vectors) == 3

    # Surviving vectors, in order: 'dog', 'cat', 'pikatchu'.
    for surviving_vector in ([0, 0, 1], [0, 1, 11], [0, 12, 13]):
        assert surviving_vector in result.vectors.tolist()

    for surviving_word in ('cat', 'dog', 'pikatchu'):
        assert surviving_word in result.vocabulary.words

    # Position-by-position check: (index, word, vector).
    expected_layout = (
        (2, 'pikatchu', [0, 12, 13]),
        (0, 'dog', [0, 0, 1]),
        (1, 'cat', [0, 1, 11]),
    )
    for index, word, vector in expected_layout:
        assert result.vocabulary.words[index] == word
        assert np.array_equal(result.vectors[index], vector)

    # The vocabulary must keep its exact class, not merely a compatible one.
    assert type(result.vocabulary) is Vocabulary
def test_save_2():
    """Round-trip an embedding through the binary word2vec format.

    Writes a three-word, two-dimensional embedding with
    ``Embedding.to_word2vec(..., binary=True)``, reads it back with
    ``Embedding.from_word2vec(..., binary=True)`` and checks that the
    loaded vectors equal the ones that were saved.

    The scratch directory is removed afterwards, even when the test fails,
    so repeated runs do not leave files behind in the temp location.
    """
    # Function-scope import: only needed for cleaning up the temp directory.
    import shutil

    w = ["a", "b", "c"]
    vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
    e = Embedding(Vocabulary(w), vectors)

    dirpath = tempfile.mkdtemp()
    try:
        bin_path = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, bin_path, binary=True)
        e2 = Embedding.from_word2vec(bin_path, binary=True)
        assert np.array_equal(e2.vectors, vectors)
    finally:
        # mkdtemp() leaves deletion to the caller; best-effort cleanup so a
        # cleanup problem never masks the real test outcome.
        shutil.rmtree(dirpath, ignore_errors=True)
def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary():
    """Check non-in-place ``transform_words`` on a ``CountedVocabulary``.

    Stripping whitespace makes ``' pikatchu '`` collide with ``'pikatchu'``
    and ``' cat '`` collide with ``'cat'``. The test expects three words to
    remain ('dog', 'cat', 'pikatchu' at indices 0, 1, 2) with counts 60, 50
    and 10, each carrying the vector that belonged to the already-stripped
    spelling, and the vocabulary type to stay ``CountedVocabulary``.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = CountedVocabulary(
        word_count=[('dog', 60),
                    ('cat', 50),
                    (' pikatchu ', 10),
                    ('pikatchu', 10),
                    (' cat ', 5)])
    emb = Embedding(
        vocabulary=vocab,
        vectors=np.asanyarray([[0, 0, 1],
                               [0, 1, 11],
                               [0, 11, 12],
                               [0, 12, 13],
                               [0, 13, 14]]),
    )

    result = emb.transform_words(lambda x: x.strip(), inplace=False)

    # Two of the five entries collide after stripping.
    assert len(result.vocabulary) == 3
    assert len(result.vectors) == 3

    # Surviving vectors, in order: 'dog', 'cat', 'pikatchu'.
    for surviving_vector in ([0, 0, 1], [0, 1, 11], [0, 12, 13]):
        assert surviving_vector in result.vectors.tolist()

    for surviving_word in ('cat', 'dog', 'pikatchu'):
        assert surviving_word in result.vocabulary.words

    # getstate() is indexed as [0] -> words, [1] -> counts (same positions).
    state = result.vocabulary.getstate()
    state_words, state_counts = state[0], state[1]
    count_of = {}
    for position in range(len(state_words)):
        count_of[state_words[position]] = state_counts[position]

    # Position-by-position check: (index, word, vector, count).
    expected_layout = (
        (2, 'pikatchu', [0, 12, 13], 10),
        (0, 'dog', [0, 0, 1], 60),
        (1, 'cat', [0, 1, 11], 50),
    )
    for index, word, vector, count in expected_layout:
        assert result.vocabulary.words[index] == word
        assert np.array_equal(result.vectors[index], vector)
        assert count_of[word] == count

    # The vocabulary must keep its exact class, not merely a compatible one.
    assert type(result.vocabulary) is CountedVocabulary
def web_tests(emb):
    """
    Score an embedding on the word-embeddings-benchmarks similarity tasks.

    The tasks evaluated are WS353, RG65, RW, MTurk, MEN and SimLex999. Each
    task is scored exactly once with ``evaluate_similarity`` and the score
    is both stored in the result and written to the log at INFO level.

    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }

    # keys() and values() of the same unmodified dict iterate in matching
    # order, so word i lines up with vector i.
    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        # Evaluate once and reuse the score for the log line; evaluating a
        # second time just for logging doubles the benchmark work.
        score = evaluate_similarity(web_emb, data.X, data.y)
        similarity_results[name] = score
        logging.info("Spearman correlation of scores on {} {}".format(
            name, score))
    return similarity_results