def similarity_calculation(str_arr, str_2):
    """Score every string in `str_arr` against `str_2`.

    :param str_arr: iterable of candidate strings
    :param str_2: the reference string, stripped once up front
    :return: list of similarity scores, one per item in `str_arr`
    """
    sim = Similarity()
    str_2 = strip_word(str_2)
    # NOTE: the base strings could be pre-processed and exported in advance
    # as an optimization (original author's comment, translated).
    return [sim.get_score(strip_word(item), str_2) for item in str_arr]
def __init__(self,
             stopwords_path=config.stopwords_path,
             person_name_path=config.person_name_path,
             place_name_path=config.place_name_path,
             common_char_path=config.common_char_path,
             segment_sep=config.segment_sep):
    """Load the word lexicons and set up the vector / WMD-similarity backends."""
    lexicons = (
        ('stopwords', stopwords_path),
        ('person_names', person_name_path),
        ('place_names', place_name_path),
        ('common_chars', common_char_path),
    )
    for attr, path in lexicons:
        setattr(self, attr, self.load_set_file(path))
    self.segment_sep = segment_sep
    self.vec = Vector()
    self.sim = Similarity(similarity_type=SimType.WMD)
def test_oov_sim():
    """Print similarity scores for sentence pairs containing OOV tokens."""
    from text2vec import Similarity
    sim = Similarity()
    pairs = [
        (',', '花'),
        (',画画', '花画画'),
        (',', '花画画'),
        (',机票', '特价机票'),
        ('机票', '特价机票'),
        ('机票', '特价的机票'),
    ]
    for a, b in pairs:
        s = sim.get_score(a, b)
        print(a, b, s)
def __init__(self,
             stopwords_path=config.stopwords_path,
             person_name_path=config.person_name_path,
             place_name_path=config.place_name_path,
             common_char_path=config.common_char_path,
             segment_sep=config.segment_sep):
    """Load the word lexicons and set up a cosine similarity backend on SBERT embeddings."""
    lexicons = (
        ('stopwords', stopwords_path),
        ('person_names', person_name_path),
        ('place_names', place_name_path),
        ('common_chars', common_char_path),
    )
    for attr, path in lexicons:
        setattr(self, attr, self.load_set_file(path))
    self.segment_sep = segment_sep
    self.sim = Similarity(similarity_type=SimType.COSINE, embedding_type=EmbType.SBERT)
    self.sim.load_model()
def test_sents_score(self):
    """Sentence-pair similarity values via word2vec embeddings."""
    sim = Similarity()
    pairs = [(a, b), (a, c), (a, d), (b, c), (b, d), (c, d)]
    for x, y in pairs:
        print(sim.get_score(x, y), x, y)
    print("{:.3f}".format(sim.get_score(a, b)))
    expected = ["1.000", "1.000", "0.903", "1.000", "0.903", "0.903"]
    for (x, y), want in zip(pairs, expected):
        self.assertTrue("{:.3f}".format(sim.get_score(x, y)) == want)
def test_oov_sim(self):
    """Similarity behaviour for OOV (out-of-vocabulary) words."""
    sim = Similarity()
    # Fully-OOV pair must score exactly 0.0.
    a = ','
    b = '花'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(s == 0.0)
    # Remaining pairs are checked to three decimal places.
    cases = [
        (',画画', '花画画', 0.822),
        (',', '花画画', 0.000),
        (',机票', '特价机票', 0.884),
        ('机票', '特价机票', 0.884),
        ('特价机票', '特价的机票', 1.000),
    ]
    for a, b, want in cases:
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertTrue(round(s, 3) == want)
def test_encode(self):
    """WMD encode output shape and pairwise WMD scores."""
    sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)
    a = '如何更换花呗绑定银行卡'
    b = '花呗更改绑定银行卡'
    c = '我什么时候开通了花呗'
    emb = sim.encode(a)
    print(a, emb)
    print(emb.shape)
    self.assertEqual(emb.shape, (200, ))
    for x, y, want in ((a, b, "0.746"), (a, c, "0.573"), (b, c, "0.563")):
        s = sim.get_score(x, y)
        print(x, y, s)
        self.assertTrue("{:.3f}".format(s) == want)
def test_sents_score_bert(self):
    """Sentence-pair similarity value via BERT embeddings."""
    sim = Similarity(embedding_type='bert')
    formatted = "{:.3f}".format(sim.get_score(a, b))
    print(formatted)
    self.assertTrue(formatted == "0.915")
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description: print backend model info for each embedding type
"""
from text2vec import Similarity

a = '湖北人爱吃鱼'
b = '甘肃人不爱吃鱼'

# Score once with each embedding backend, then dump the model info.
for emb_type in ('w2v', 'bert'):
    ss = Similarity(embedding_type=emb_type)
    ss.get_score(a, b)
    print(ss.model.info())
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: WMD similarity demo on word2vec embeddings
"""
from text2vec import Similarity, EmbType, SimType

sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)

a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

emb = sim.encode(a)
print(a, emb)
print(emb.shape)

# Score every sentence pair.
for x, y in ((a, b), (a, c), (b, c)):
    score = sim.get_score(x, y)
    print(x, y, score)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pairwise similarity demo plus SearchSimilarity construction
"""
import sys

sys.path.append('..')
from text2vec import Similarity

if __name__ == '__main__':
    a = '如何更换花呗绑定银行卡'
    b = '花呗更改绑定银行卡'
    c = '我什么时候开通了花呗'

    sim = Similarity()
    # Score every sentence pair.
    for first, second in ((a, b), (a, c), (b, c)):
        print(first, second, sim.get_score(first, second))

    from text2vec import SearchSimilarity

    corpus = [a, b, c]
    print(corpus)
    search_sim = SearchSimilarity(corpus=corpus)
""" @author:XuMing([email protected]) @description: """ import sys sys.path.append('..') from text2vec import Vector from text2vec import Similarity if __name__ == '__main__': vec = Vector(embedding_type='bert') char = '卡' emb = vec.encode(char) # <class 'numpy.ndarray'> (128, 3072) 128=seq_len, 3072=768*4 print(type(emb), emb.shape) print(char, emb) word = '银行卡' print(word, vec.encode(word)) a = '如何更换花呗绑定银行卡' emb = vec.encode(a) print(a, emb) print(emb.shape) sim = Similarity(embedding_type='bert') b = '花呗更改绑定银行卡' print(sim.get_score(a, b))
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description: similarity spot-checks for dissimilar vs near-identical pairs
"""
from text2vec import Similarity

sim = Similarity()


def _report(a, b):
    # Print the pair alongside its similarity score.
    print(a, b, sim.get_score(a, b))


def test_sim_diff():
    """A clearly dissimilar sentence pair."""
    _report(
        '研究团队面向国家重大战略需求追踪国际前沿发展借鉴国际人工智能研究领域的科研模式有效整合创新资源解决复',
        '英汉互译比较语言学',
    )


def test_sim_same():
    """A near-identical sentence pair."""
    _report('汉英翻译比较语言学', '英汉互译比较语言学')
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: pairwise similarity demo followed by an interactive REPL
"""
from text2vec import Similarity

sim = Similarity()

if __name__ == '__main__':
    a = '什么是智能手环'
    b = '智能手环是个啥'
    c = '智能手环有什么用'
    d = '智能手环能干什么'

    # Score every unordered pair, in (a,b), (a,c), ... order.
    queries = [a, b, c, d]
    for i, x in enumerate(queries):
        for y in queries[i + 1:]:
            print(sim.get_score(x, y), x, y)

    print("example:")
    # Interactive loop: score user-supplied pairs until interrupted.
    while True:
        a = input('input1:')
        b = input('input2:')
        print(sim.get_score(a, b), a, b)
class TextFeature(object):
    """Extract term-level and sentence-level features from a query string."""

    def __init__(self,
                 stopwords_path=config.stopwords_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 common_char_path=config.common_char_path,
                 segment_sep=config.segment_sep):
        """Load word lexicons and set up the vector / WMD-similarity backends."""
        self.stopwords = self.load_set_file(stopwords_path)
        self.person_names = self.load_set_file(person_name_path)
        self.place_names = self.load_set_file(place_name_path)
        self.common_chars = self.load_set_file(common_char_path)
        self.segment_sep = segment_sep
        self.vec = Vector()
        self.sim = Similarity(similarity_type=SimType.WMD)

    @staticmethod
    def load_set_file(path):
        """Load a lexicon file into a set.

        Lines starting with '#' are skipped as comments; only the first
        whitespace-separated token of each non-empty line is kept.
        """
        words = set()
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for w in f:
                w = w.strip()
                if w.startswith('#'):
                    continue
                if w:
                    words.add(w.split()[0])
        return words

    def is_stopword(self, word):
        """True if `word` is in the stopword lexicon."""
        return word in self.stopwords

    def is_name(self, word):
        """True if `word` is a known person or place name."""
        names = self.person_names | self.place_names
        return word in names

    def is_entity(self, pos, entity_pos=('ns', 'n', 'vn', 'v')):
        """True if the POS tag `pos` is in the entity-like tag set."""
        return pos in entity_pos

    def is_common_char(self, c):
        """True if character `c` is in the common-character lexicon."""
        return c in self.common_chars

    def is_common_char_string(self, word):
        """True if every character of `word` is a common character."""
        return all(self.is_common_char(c) for c in word)

    def get_feature(self, query, is_word_segmented=False):
        """
        Get text feature
        :param query: input sentence
        :param is_word_segmented: if True, `query` is already segmented with
            `self.segment_sep`; otherwise it is word-segmented here
        :return: list, list: term features, sentence features
        """
        term_features = []
        if is_word_segmented:
            word_seq = query.split(self.segment_sep)
        else:
            word_seq = word_segment(query, cut_type='word', pos=False)
        logger.debug('%s' % word_seq)
        # sentence-level features
        sentence_features = AttrDict(
            query_length=len(query),
            term_size=len(word_seq),
        )
        # term-level features
        idx = 0      # character offset of the term inside `query`
        offset = 0   # term index inside `word_seq`
        for word in word_seq:
            emb = self.vec.encode(word)
            # A shallow copy suffices: the list holds immutable strings,
            # so the original deepcopy was pure overhead.
            word_list = list(word_seq)
            if word in word_list:
                # NOTE(review): removes the FIRST occurrence, which may not be
                # the current position when the term appears more than once.
                word_list.remove(word)
            del_word_query = ''.join(word_list)
            # Similarity of the query with this term deleted — a proxy for
            # how important the term is to the sentence.
            del_term_sim_score = self.sim.get_score(query, del_word_query)
            term_features.append(
                AttrDict(
                    term=word,
                    term_length=len(word),
                    idx=idx,
                    offset=offset,
                    is_number=is_number_string(word),
                    is_chinese=is_chinese_string(word),
                    is_alphabet=is_alphabet_string(word),
                    is_stopword=self.is_stopword(word),
                    is_name=self.is_name(word),
                    # is_entity=self.is_entity(pos),
                    is_common_char=self.is_common_char_string(word),
                    embedding_sum=np.sum(emb),
                    del_term_score=del_term_sim_score,
                ))
            idx += len(word)
            offset += 1
        return term_features, sentence_features
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: WMD similarity demo on word2vec embeddings
"""
from text2vec import Similarity

sim = Similarity(embedding_type='w2v', similarity_type='wmd')
a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'
emb = sim.encode(a)
print(emb)
# Fix: Similarity exposes get_score(), not score() — every other call site
# in this project uses get_score, so sim.score(...) would raise AttributeError.
s = sim.get_score(a, b)
print(a, b, s)
s = sim.get_score(a, c)
print(a, c, s)
s = sim.get_score(b, c)
print(b, c, s)