def test_compute_similar_words_from_word(self) -> None:
    """Check the (word, score) pairs returned for the query word "てすと".

    Words are compared exactly; scores are compared with a tolerance so the
    test does not depend on bit-exact floating-point results.
    """
    file_path = str(
        Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
    )
    word_embeds = WordEmbedding(file_path)

    word = "てすと"
    expected = [("サンプル", 0.7506074093675397)]
    actual = word_embeds.compute_similar_words_from_word(word)

    # Comparing the word lists also pins the number and order of results.
    self.assertEqual(
        [similar_word for similar_word, _ in actual],
        [similar_word for similar_word, _ in expected],
    )
    # Float scores: approximate comparison instead of exact equality.
    for (_, actual_score), (_, expected_score) in zip(actual, expected):
        self.assertAlmostEqual(actual_score, expected_score)
def test_property_shape(self) -> None:
    """Verify ``len()``, ``dim`` and ``shape`` for the sample word2vec file."""
    samples_dir = Path(__file__).resolve().parent / "samples"
    embedding = WordEmbedding(str(samples_dir / "sample.word2vec.txt"))

    # Expected for the sample file: 2 entries, dimension 10.
    self.assertEqual(len(embedding), 2)
    self.assertEqual(embedding.dim, 10)
    self.assertEqual(embedding.shape, (2, 10))
def test_error_unknown_words(self) -> None:
    """Calling the embedding with a list containing "unk" must raise ValueError."""
    file_path = str(
        Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
    )
    word_embeds = WordEmbedding(file_path)
    words = ["てすと", "unk"]

    # NOTE: ``msg`` is only the failure message reported when no ValueError
    # is raised; it is NOT matched against the exception's own text.
    with self.assertRaises(ValueError, msg="unknown word: 'unk'"):
        word_embeds(words)
def test_extract_word_embeddings(self) -> None:
    """Check vectors returned when calling the embedding with a list and a str."""
    samples_dir = Path(__file__).resolve().parent / "samples"
    embedding = WordEmbedding(str(samples_dir / "sample.word2vec.txt"))

    # Expected vectors for the two words looked up below.
    tesuto_vector = [
        0.31882, 0.89289, 0.90071, 0.45753, 0.37083,
        0.64955, 0.34075, 0.70048, 0.89085, 0.13621,
    ]
    sample_vector = [
        0.79375, 0.44464, 0.07644, 0.35242, 0.03996,
        0.68827, 0.97103, 0.77324, 0.72781, 0.69158,
    ]

    # A list of words is compared against a 2-D array, one row per word.
    looked_up_many = embedding(["てすと", "サンプル"])
    np.testing.assert_almost_equal(
        looked_up_many, np.array([tesuto_vector, sample_vector])
    )

    # A single word is compared against a 1-D array.
    looked_up_one = embedding("てすと")
    np.testing.assert_almost_equal(looked_up_one, np.array(tesuto_vector))
def test_compute_cosine_similarity(self) -> None:
    """Check ``compute_cosine_similarity`` output for a fixed query vector."""
    samples_dir = Path(__file__).resolve().parent / "samples"
    embedding = WordEmbedding(str(samples_dir / "sample.word2vec.txt"))

    query_vector = np.array(
        [
            0.29902, 0.90019, 0.89964, 0.50753, 0.38001,
            0.59495, 0.29175, 0.69909, 0.90185, 0.09687,
        ]
    )
    expected_similarities = np.array([0.9987114080207757, 0.7286216119815097])

    np.testing.assert_almost_equal(
        embedding.compute_cosine_similarity(query_vector),
        expected_similarities,
    )
def test_compute_similar_words_from_vec(self) -> None:
    """Check the (word, score) pairs returned for a fixed query vector.

    Words (and their order) are compared exactly; scores are compared with a
    tolerance so the test does not depend on bit-exact floating-point results.
    """
    file_path = str(
        Path(__file__).resolve().parent / "samples" / "sample.word2vec.txt"
    )
    word_embeds = WordEmbedding(file_path)

    embed = np.array(
        [
            0.29902, 0.90019, 0.89964, 0.50753, 0.38001,
            0.59495, 0.29175, 0.69909, 0.90185, 0.09687,
        ]
    )
    expected = [("てすと", 0.9987114080207757), ("サンプル", 0.7286216119815097)]
    actual = word_embeds.compute_similar_words_from_vec(embed)

    # Comparing the word lists also pins the number and order of results.
    self.assertEqual(
        [similar_word for similar_word, _ in actual],
        [similar_word for similar_word, _ in expected],
    )
    # Float scores: approximate comparison instead of exact equality.
    for (_, actual_score), (_, expected_score) in zip(actual, expected):
        self.assertAlmostEqual(actual_score, expected_score)
def test_property_vocab(self) -> None:
    """Verify ``vocab``, ``to_word``, ``to_index`` and ``is_known``."""
    samples_dir = Path(__file__).resolve().parent / "samples"
    embedding = WordEmbedding(str(samples_dir / "sample.word2vec.txt"))

    # Vocabulary contents.
    self.assertSetEqual(embedding.vocab, {"てすと", "サンプル"})

    # Index <-> word lookups.
    self.assertEqual(embedding.to_word(1), "サンプル")
    self.assertEqual(embedding.to_index("てすと"), 0)

    # Membership checks for a present and an absent word.
    self.assertTrue(embedding.is_known("てすと"))
    self.assertFalse(embedding.is_known("test"))
def test_load_glove_format_file(self) -> None:
    """Constructing a WordEmbedding from the sample GloVe file must succeed."""
    glove_path = Path(__file__).resolve().parent / "samples" / "sample.glove.txt"
    embedding = WordEmbedding(str(glove_path))
    self.assertIsInstance(embedding, WordEmbedding)