import numpy as np

from paddlenlp.embeddings import TokenEmbedding

# TestTokenEmbedding (the base class), get_vocab_list, and check_output_equal
# are assumed to be provided by the surrounding test module.


class TestTokenEmbeddingSimilarity(TestTokenEmbedding):
    def setUp(self):
        super().setUp()
        self.config["extended_vocab_path"] = self.test_data_file
        self.config["keep_extended_vocab_only"] = True

    def get_dot(self, vec_a, vec_b):
        return np.sum(vec_a * vec_b)

    def get_cosine(self, vec_a, vec_b):
        return self.get_dot(vec_a, vec_b) / np.sqrt(
            self.get_dot(vec_a, vec_a) * self.get_dot(vec_b, vec_b))

    def get_random_word_vec(self, vocab_list):
        # pick two (possibly identical) random words and fetch their vectors
        vocab_size = len(vocab_list)
        ids = np.random.randint(vocab_size, size=2)
        word_a, word_b = vocab_list[ids[0]], vocab_list[ids[1]]
        vec_a, vec_b = self.embedding.search([word_a, word_b])
        return word_a, word_b, vec_a, vec_b

    def test_cosine_sim(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.cosine_sim(word_a, word_b)
        expected_result = self.get_cosine(vec_a, vec_b)
        self.check_output_equal(result, expected_result)

    def test_dot(self):
        self.embedding = TokenEmbedding(**self.config)
        vocab_list = get_vocab_list(self.config["extended_vocab_path"])
        word_a, word_b, vec_a, vec_b = self.get_random_word_vec(vocab_list)
        result = self.embedding.dot(word_a, word_b)
        expected_result = self.get_dot(vec_a, vec_b)
        self.check_output_equal(result, expected_result)
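# A standalone sanity check of the identity the tests above assert:
# cosine_sim(a, b) == dot(a, b) / (|a| * |b|). This is a minimal sketch run
# against the pretrained embedding used below; the two sample words are
# chosen only for illustration and nothing here belongs to the test suite.
import numpy as np
from paddlenlp.embeddings import TokenEmbedding

emb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")
vec_a, vec_b = emb.search(["狗", "猫"])  # search() returns one row per word

# cosine similarity from first principles
manual = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
assert np.isclose(manual, emb.cosine_sim("狗", "猫"))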
import jiagu

text = "厦门明天会不会下雨"  # sample input; any Chinese text works here

words = jiagu.seg(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

from paddlenlp.datasets import ChnSentiCorp

train_ds, dev_ds, test_ds = ChnSentiCorp.get_datasets(["train", "dev", "test"])

from paddlenlp.embeddings import TokenEmbedding

wordemb = TokenEmbedding("w2v.baidu_encyclopedia.target.word-word.dim300")

print(wordemb.cosine_sim("苹果", "香蕉"))  # related words: high similarity
print(wordemb.cosine_sim("艺术", "火车"))  # unrelated words: low similarity
print(wordemb.cosine_sim("狗", "香蕉"))

# pairwise similarities over a small set of tokens
for token1 in ["狗", "猫", "香蕉"]:
    for token2 in ["狗", "猫", "香蕉"]:
        print(wordemb.cosine_sim(token1, token2))

# search() takes a list of words and returns one vector per word; a plain
# string is looked up as a single token (here likely out of vocabulary),
# not split into per-character words.
vv = wordemb.search(["狗", "猫", "香蕉"])
vv2 = wordemb.search("狗猫香蕉")
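# The nested loop above computes each pair one call at a time; the same
# pairwise similarities can be read off in one shot from the matrix that
# search() returns. A minimal sketch, reusing the wordemb object from above:
import numpy as np

tokens = ["狗", "猫", "香蕉"]
vecs = wordemb.search(tokens)  # shape (3, embedding_dim)
unit = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)  # row-normalize
print(unit @ unit.T)  # 3x3 cosine-similarity matrix, 1.0 on the diagonal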