from text2vec import Similarity


def similarity_calculation(str_arr, str_2):
    sim = Similarity()
    str_2 = strip_word(str_2)
    result = []
    for item in str_arr:
        # The base items could be preprocessed and exported ahead of time as an optimization
        item = strip_word(item)
        result.append(sim.get_score(item, str_2))
    return result
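# A minimal usage sketch (assumes strip_word is the text-cleaning helper defined
# elsewhere in this module; the candidate sentences below are illustrative only):
if __name__ == '__main__':
    base = ['如何更换花呗绑定银行卡', '我什么时候开通了花呗']
    print(similarity_calculation(base, '花呗更改绑定银行卡'))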
def test_oov_sim():
    from text2vec import Similarity
    sim = Similarity()
    a = ','
    b = '花'
    s = sim.get_score(a, b)
    print(a, b, s)
    a = ',画画'
    b = '花画画'
    s = sim.get_score(a, b)
    print(a, b, s)
    a = ','
    b = '花画画'
    s = sim.get_score(a, b)
    print(a, b, s)
    a = ',机票'
    b = '特价机票'
    s = sim.get_score(a, b)
    print(a, b, s)
    a = '机票'
    b = '特价机票'
    s = sim.get_score(a, b)
    print(a, b, s)
    a = '机票'
    b = '特价的机票'
    s = sim.get_score(a, b)
    print(a, b, s)
def test_encode(self):
    """Test WMD encode results for text"""
    sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)
    a = '如何更换花呗绑定银行卡'
    b = '花呗更改绑定银行卡'
    c = '我什么时候开通了花呗'
    emb = sim.encode(a)
    print(a, emb)
    print(emb.shape)
    self.assertEqual(emb.shape, (200,))
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue("{:.3f}".format(s) == "0.746")
    s = sim.get_score(a, c)
    print(a, c, s)
    self.assertTrue("{:.3f}".format(s) == "0.573")
    s = sim.get_score(b, c)
    print(b, c, s)
    self.assertTrue("{:.3f}".format(s) == "0.563")
def test_sents_score(self):
    """Test sentence similarity scores - word2vec"""
    # a, b, c, d are the test sentences defined at module level in this file
    sim = Similarity()
    print(sim.get_score(a, b), a, b)
    print(sim.get_score(a, c), a, c)
    print(sim.get_score(a, d), a, d)
    print(sim.get_score(b, c), b, c)
    print(sim.get_score(b, d), b, d)
    print(sim.get_score(c, d), c, d)
    print("{:.3f}".format(sim.get_score(a, b)))
    self.assertTrue("{:.3f}".format(sim.get_score(a, b)) == "1.000")
    self.assertTrue("{:.3f}".format(sim.get_score(a, c)) == "1.000")
    self.assertTrue("{:.3f}".format(sim.get_score(a, d)) == "0.903")
    self.assertTrue("{:.3f}".format(sim.get_score(b, c)) == "1.000")
    self.assertTrue("{:.3f}".format(sim.get_score(b, d)) == "0.903")
    self.assertTrue("{:.3f}".format(sim.get_score(c, d)) == "0.903")
def test_oov_sim(self):
    """Test OOV word similarity"""
    sim = Similarity()
    a = ','
    b = '花'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(s == 0.0)
    a = ',画画'
    b = '花画画'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(round(s, 3) == 0.822)
    a = ','
    b = '花画画'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(round(s, 3) == 0.000)
    a = ',机票'
    b = '特价机票'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(round(s, 3) == 0.884)
    a = '机票'
    b = '特价机票'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(round(s, 3) == 0.884)
    a = '特价机票'
    b = '特价的机票'
    s = sim.get_score(a, b)
    print(a, b, s)
    self.assertTrue(round(s, 3) == 1.000)
def test_sents_score_bert(self):
    """Test sentence similarity scores - bert"""
    sim = Similarity(embedding_type='bert')
    print("{:.3f}".format(sim.get_score(a, b)))
    self.assertTrue("{:.3f}".format(sim.get_score(a, b)) == "0.915")
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description:
"""
from text2vec import Similarity

a = '湖北人爱吃鱼'
b = '甘肃人不爱吃鱼'

ss = Similarity(embedding_type='w2v')
ss.get_score(a, b)
print(ss.model.info())

ss = Similarity(embedding_type='bert')
ss.get_score(a, b)
print(ss.model.info())
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
from text2vec import Similarity, EmbType, SimType

sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)
a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

emb = sim.encode(a)
print(a, emb)
print(emb.shape)

s = sim.get_score(a, b)
print(a, b, s)
s = sim.get_score(a, c)
print(a, c, s)
s = sim.get_score(b, c)
print(b, c, s)
""" @author:XuMing([email protected]) @description: """ import sys sys.path.append('..') from text2vec import Vector from text2vec import Similarity if __name__ == '__main__': vec = Vector(embedding_type='bert') char = '卡' emb = vec.encode(char) # <class 'numpy.ndarray'> (128, 3072) 128=seq_len, 3072=768*4 print(type(emb), emb.shape) print(char, emb) word = '银行卡' print(word, vec.encode(word)) a = '如何更换花呗绑定银行卡' emb = vec.encode(a) print(a, emb) print(emb.shape) sim = Similarity(embedding_type='bert') b = '花呗更改绑定银行卡' print(sim.get_score(a, b))
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
from text2vec import Similarity

sim = Similarity()

if __name__ == '__main__':
    a = '什么是智能手环'
    b = '智能手环是个啥'
    c = '智能手环有什么用'
    d = '智能手环能干什么'
    print(sim.get_score(a, b), a, b)
    print(sim.get_score(a, c), a, c)
    print(sim.get_score(a, d), a, d)
    print(sim.get_score(b, c), b, c)
    print(sim.get_score(b, d), b, d)
    print(sim.get_score(c, d), c, d)

    print("example:")
    while True:
        a = input('input1:')
        b = input('input2:')
        print(sim.get_score(a, b), a, b)
class TextFeature(object):
    def __init__(self,
                 stopwords_path=config.stopwords_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 common_char_path=config.common_char_path,
                 segment_sep=config.segment_sep):
        self.stopwords = self.load_set_file(stopwords_path)
        self.person_names = self.load_set_file(person_name_path)
        self.place_names = self.load_set_file(place_name_path)
        self.common_chars = self.load_set_file(common_char_path)
        self.segment_sep = segment_sep
        self.vec = Vector()
        self.sim = Similarity(similarity_type=SimType.WMD)

    @staticmethod
    def load_set_file(path):
        words = set()
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for w in f:
                w = w.strip()
                if w.startswith('#'):
                    continue
                if w:
                    words.add(w.split()[0])
        return words

    def is_stopword(self, word):
        return word in self.stopwords

    def is_name(self, word):
        names = self.person_names | self.place_names
        return word in names

    def is_entity(self, pos, entity_pos=('ns', 'n', 'vn', 'v')):
        return pos in entity_pos

    def is_common_char(self, c):
        return c in self.common_chars

    def is_common_char_string(self, word):
        return all(self.is_common_char(c) for c in word)

    def get_feature(self, query, is_word_segmented=False):
        """
        Get text feature
        :param query:
        :param is_word_segmented:
        :return: list, list: term features, sentence features
        """
        term_features = []
        if is_word_segmented:
            word_seq = query.split(self.segment_sep)
        else:
            word_seq = word_segment(query, cut_type='word', pos=False)
        logger.debug('%s' % word_seq)
        # sentence
        sentence_features = AttrDict(
            query_length=len(query),
            term_size=len(word_seq),
        )
        # term
        idx = 0
        offset = 0
        for word in word_seq:
            emb = self.vec.encode(word)
            word_list = deepcopy(word_seq)
            if word in word_list:
                word_list.remove(word)
            del_word_query = ''.join(word_list)
            del_term_sim_score = self.sim.get_score(query, del_word_query)
            term_features.append(
                AttrDict(
                    term=word,
                    term_length=len(word),
                    idx=idx,
                    offset=offset,
                    is_number=is_number_string(word),
                    is_chinese=is_chinese_string(word),
                    is_alphabet=is_alphabet_string(word),
                    is_stopword=self.is_stopword(word),
                    is_name=self.is_name(word),
                    # is_entity=self.is_entity(pos),
                    is_common_char=self.is_common_char_string(word),
                    embedding_sum=np.sum(emb),
                    del_term_score=del_term_sim_score,
                ))
            idx += len(word)
            offset += 1
        return term_features, sentence_features
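# A minimal usage sketch for TextFeature (assumes the default resource paths in
# config point to valid word-list files; the query string is illustrative only):
if __name__ == '__main__':
    tf = TextFeature()
    term_feats, sent_feats = tf.get_feature('如何更换花呗绑定银行卡')
    print(sent_feats)
    for t in term_feats:
        print(t)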
from text2vec import Similarity

a = "我喜欢一个女孩子。"
b = "我讨厌一个女孩子。"

sim = Similarity()
s = sim.get_score(a, b)
print(s)