def test_most_similar_simple(self):
    # Smoke test: tokenize a single document, build word2vec embeddings from
    # it (min_count=1, fixed seed), and check that looking up "one" yields a
    # one-dimensional result holding exactly one entry.
    tokenized = preprocessing.tokenize(pd.Series(["one one one"]))
    embeddings = representation.word2vec(tokenized, min_count=1, seed=1)

    query_word = "one"
    similar = representation.most_similar(embeddings, query_word)

    expected_shape = (1,)
    self.assertEqual(similar.shape, expected_shape)
def test_incorrect_index_most_similar(self):
    # The index of the most_similar result must differ from the default
    # index pandas assigns when a frame is built from the same values with
    # index=None.
    embeddings = pd.DataFrame([[1.0], [2.0]], index=["word1", "word2"])
    similar = representation.most_similar(embeddings, "word1")

    # Same values, but with the word labels dropped.
    default_indexed = pd.DataFrame(embeddings.values, index=None)

    indexes_match = similar.index.equals(default_indexed.index)
    self.assertFalse(indexes_match)
def test_correct_index_most_similar(self):
    # The index of the most_similar result must equal the index of a frame
    # rebuilt from the input's values and the input's own index.
    embeddings = pd.DataFrame([[1.0], [2.0]], index=["word1", "word2"])
    similar = representation.most_similar(embeddings, "word1")

    # Reference frame that carries the input's index unchanged.
    reference = pd.DataFrame(embeddings.values, embeddings.index)

    result_index = similar.index
    self.assertTrue(result_index.equals(reference.index))
def test_most_similar_raise_with_not_in_index(self):
    # Asking for a word that is absent from the embedding frame's index must
    # raise a ValueError whose message matches "index".
    embeddings = pd.DataFrame(data=[1], index=["one"])
    missing_word = "two"

    # Callable form of assertRaisesRegex: the assertion invokes
    # most_similar(embeddings, missing_word) itself.
    self.assertRaisesRegex(
        ValueError,
        r"index",
        representation.most_similar,
        embeddings,
        missing_word,
    )
def test_most_similar_raise_with_series(self):
    # Passing a Series where the embeddings are expected must raise a
    # ValueError whose message matches "Pandas" or "pandas".
    not_a_frame = pd.Series({"one": 1})
    query_word = "one"

    # Callable form of assertRaisesRegex: the assertion invokes
    # most_similar(not_a_frame, query_word) itself.
    self.assertRaisesRegex(
        ValueError,
        r"Pandas|pandas",
        representation.most_similar,
        not_a_frame,
        query_word,
    )