Esempio n. 1
0
def similarity_calculation(str_arr, str_2):
    """Score every string in ``str_arr`` against ``str_2``.

    Both the candidates and ``str_2`` are passed through ``strip_word``
    before being handed to ``Similarity.get_score``.

    :param str_arr: iterable of candidate strings
    :param str_2: the string each candidate is compared with
    :return: list of scores, in the iteration order of ``str_arr``
    """
    scorer = Similarity()
    target = strip_word(str_2)
    # Optimization idea: the candidates could be stripped ahead of time and
    # stored for reuse instead of being re-processed on every call.
    return [
        scorer.get_score(strip_word(candidate), target)
        for candidate in str_arr
    ]
Esempio n. 2
0
def test_oov_sim():
    """Print the similarity score of several short string pairs.

    Nothing is asserted; each pair is scored with a default ``Similarity``
    instance and printed as ``a b score``.
    """
    from text2vec import Similarity

    sim = Similarity()
    pairs = (
        (',', '花'),
        (',画画', '花画画'),
        (',', '花画画'),
        (',机票', '特价机票'),
        ('机票', '特价机票'),
        ('机票', '特价的机票'),
    )
    for a, b in pairs:
        s = sim.get_score(a, b)
        print(a, b, s)
Esempio n. 3
0
    def test_encode(self):
        """Test the encode result and pairwise scores with W2V embeddings and WMD similarity."""
        sim = Similarity(embedding_type=EmbType.W2V,
                         similarity_type=SimType.WMD)
        a = '如何更换花呗绑定银行卡'
        b = '花呗更改绑定银行卡'
        c = '我什么时候开通了花呗'

        emb = sim.encode(a)
        print(a, emb)
        print(emb.shape)
        self.assertEqual(emb.shape, (200, ))

        # Scores are compared after formatting to three decimals.
        # assertEqual is used instead of assertTrue(x == y) so that a failing
        # run reports the score that was actually produced.
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual("{:.3f}".format(s), "0.746")

        s = sim.get_score(a, c)
        print(a, c, s)
        self.assertEqual("{:.3f}".format(s), "0.573")

        s = sim.get_score(b, c)
        print(b, c, s)
        self.assertEqual("{:.3f}".format(s), "0.563")
Esempio n. 4
0
    def test_sents_score(self):
        """Test similarity scores between sentences (word2vec).

        The sentences ``a``, ``b``, ``c`` and ``d`` are not defined in this
        method; they are looked up in the enclosing scope.
        """
        sim = Similarity()

        print(sim.get_score(a, b), a, b)
        print(sim.get_score(a, c), a, c)
        print(sim.get_score(a, d), a, d)
        print(sim.get_score(b, c), b, c)
        print(sim.get_score(b, d), b, d)
        print(sim.get_score(c, d), c, d)

        print("{:.3f}".format(sim.get_score(a, b)))
        # assertEqual (not assertTrue(x == y)) so that a failure shows the
        # formatted score that was actually computed.
        self.assertEqual("{:.3f}".format(sim.get_score(a, b)), "1.000")
        self.assertEqual("{:.3f}".format(sim.get_score(a, c)), "1.000")
        self.assertEqual("{:.3f}".format(sim.get_score(a, d)), "0.903")
        self.assertEqual("{:.3f}".format(sim.get_score(b, c)), "1.000")
        self.assertEqual("{:.3f}".format(sim.get_score(b, d)), "0.903")
        self.assertEqual("{:.3f}".format(sim.get_score(c, d)), "0.903")
Esempio n. 5
0
    def test_oov_sim(self):
        """Test similarity scores for OOV words."""
        sim = Similarity()

        # Every check uses assertEqual instead of assertTrue(x == y) so that
        # a failing run prints the score that was actually returned.
        a = ','
        b = '花'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(s, 0.0)

        a = ',画画'
        b = '花画画'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(round(s, 3), 0.822)

        a = ','
        b = '花画画'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(round(s, 3), 0.000)

        a = ',机票'
        b = '特价机票'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(round(s, 3), 0.884)

        a = '机票'
        b = '特价机票'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(round(s, 3), 0.884)

        a = '特价机票'
        b = '特价的机票'
        s = sim.get_score(a, b)
        print(a, b, s)
        self.assertEqual(round(s, 3), 1.000)
Esempio n. 6
0
 def test_sents_score_bert(self):
     """Test the similarity score between sentences (bert).

     ``a`` and ``b`` are not defined in this method; they are looked up in
     the enclosing scope.
     """
     sim = Similarity(embedding_type='bert')
     print("{:.3f}".format(sim.get_score(a, b)))
     # assertEqual reports the actual formatted score on failure, which
     # assertTrue(x == y) would hide.
     self.assertEqual("{:.3f}".format(sim.get_score(a, b)), "0.915")
Esempio n. 7
0
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description: 
"""
from text2vec import Similarity

a = '湖北人爱吃鱼'
b = '甘肃人不爱吃鱼'

# Same routine for both embedding types: build the scorer, score the pair
# once (the score itself is discarded), then print the model description.
# NOTE(review): the get_score call presumably forces the model to load before
# info() is read -- confirm against text2vec.
for emb_type in ('w2v', 'bert'):
    ss = Similarity(embedding_type=emb_type)
    ss.get_score(a, b)
    print(ss.model.info())
Esempio n. 8
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from text2vec import Similarity, EmbType, SimType

sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)
a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

# Encode one sentence and show the vector together with its shape.
emb = sim.encode(a)
print(a, emb)
print(emb.shape)

# Score each of the three sentence pairs and print it next to its inputs.
for left, right in ((a, b), (a, c), (b, c)):
    s = sim.get_score(left, right)
    print(left, right, s)
Esempio n. 9
0
"""
@author:XuMing([email protected])
@description: 
"""

import sys

sys.path.append('..')
from text2vec import Vector
from text2vec import Similarity

if __name__ == '__main__':
    vec = Vector(embedding_type='bert')

    # --- single character ---
    char = '卡'
    emb = vec.encode(char)
    # Original author's note on this output:
    # <class 'numpy.ndarray'> (128, 3072) 128=seq_len, 3072=768*4
    print(type(emb), emb.shape)
    print(char, emb)

    # --- word ---
    word = '银行卡'
    word_emb = vec.encode(word)
    print(word, word_emb)

    # --- full sentence ---
    a = '如何更换花呗绑定银行卡'
    emb = vec.encode(a)
    print(a, emb)
    print(emb.shape)

    # --- sentence similarity, also with bert embeddings ---
    b = '花呗更改绑定银行卡'
    sim = Similarity(embedding_type='bert')
    print(sim.get_score(a, b))
Esempio n. 10
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from text2vec import Similarity
sim = Similarity()

if __name__ == '__main__':
    a = '什么是智能手环'
    b = '智能手环是个啥'
    c = '智能手环有什么用'
    d = '智能手环能干什么'
    # Score every unordered pair of the four sample questions.
    for left, right in ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d)):
        print(sim.get_score(left, right), left, right)

    print("example:")
    # Interactive loop: keep scoring user-supplied pairs until the input
    # stream ends (EOF) or the user interrupts with Ctrl-C, then leave the
    # loop quietly instead of terminating with a traceback.
    while True:
        try:
            a = input('input1:')
            b = input('input2:')
        except (EOFError, KeyboardInterrupt):
            break
        print(sim.get_score(a, b), a, b)
Esempio n. 11
0
class TextFeature(object):
    """Extract per-term and per-sentence features from a query string.

    The constructor loads four word lists (stop words, person names, place
    names, common characters) and creates the ``Vector`` encoder and the WMD
    ``Similarity`` scorer that :meth:`get_feature` uses.
    """

    def __init__(self,
                 stopwords_path=config.stopwords_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 common_char_path=config.common_char_path,
                 segment_sep=config.segment_sep):
        """
        Load the word lists and build the encoder and the scorer
        :param stopwords_path: path of the stop-word list file
        :param person_name_path: path of the person-name list file
        :param place_name_path: path of the place-name list file
        :param common_char_path: path of the common-character list file
        :param segment_sep: separator used to split an already segmented query
        """
        self.stopwords = self.load_set_file(stopwords_path)
        self.person_names = self.load_set_file(person_name_path)
        self.place_names = self.load_set_file(place_name_path)
        self.common_chars = self.load_set_file(common_char_path)
        self.segment_sep = segment_sep
        self.vec = Vector()
        self.sim = Similarity(similarity_type=SimType.WMD)

    @staticmethod
    def load_set_file(path):
        """
        Load a word list file into a set
        Blank lines and lines starting with ``#`` are skipped; from every
        other line only the first whitespace-separated token is kept.
        :param path: path of a UTF-8 encoded text file
        :return: set of str
        """
        words = set()
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                words.add(line.split()[0])
        return words

    def is_stopword(self, word):
        """Return True if ``word`` is in the stop-word list."""
        return word in self.stopwords

    def is_name(self, word):
        """Return True if ``word`` is in the person-name or place-name list."""
        # Two direct membership tests; building the union of both sets on
        # every call would cost time proportional to the size of the lists.
        return word in self.person_names or word in self.place_names

    def is_entity(self, pos, entity_pos=('ns', 'n', 'vn', 'v')):
        """Return True if the tag ``pos`` is one of ``entity_pos``."""
        return pos in entity_pos

    def is_common_char(self, c):
        """Return True if the character ``c`` is in the common-character list."""
        return c in self.common_chars

    def is_common_char_string(self, word):
        """Return True if every character of ``word`` is a common character."""
        return all(self.is_common_char(c) for c in word)

    def get_feature(self, query, is_word_segmented=False):
        """
        Get text feature
        :param query: str, the text to analyse
        :param is_word_segmented: bool, if True ``query`` is split on
            ``segment_sep`` instead of being passed to ``word_segment``
        :return: list of AttrDict (one per term), AttrDict (sentence features)
        """
        if is_word_segmented:
            word_seq = query.split(self.segment_sep)
        else:
            word_seq = word_segment(query, cut_type='word', pos=False)
        logger.debug('%s', word_seq)

        # sentence
        sentence_features = AttrDict(
            query_length=len(query),
            term_size=len(word_seq),
        )

        # term
        # Output keys are kept as they are for compatibility:
        #   idx    -- summed length of all terms that precede this one
        #   offset -- zero-based position of the term in word_seq
        term_features = []
        idx = 0
        for offset, word in enumerate(word_seq):
            emb = self.vec.encode(word)
            # Remove exactly this occurrence of the term. list.remove() would
            # always drop the first equal element, which is the wrong one
            # when the same term appears more than once in the query.
            # NOTE(review): for a pre-segmented query, `query` still contains
            # segment_sep while the text below is joined without it --
            # confirm this is intended.
            remaining = word_seq[:offset] + word_seq[offset + 1:]
            del_word_query = ''.join(remaining)
            del_term_sim_score = self.sim.get_score(query, del_word_query)
            term_features.append(
                AttrDict(
                    term=word,
                    term_length=len(word),
                    idx=idx,
                    offset=offset,
                    is_number=is_number_string(word),
                    is_chinese=is_chinese_string(word),
                    is_alphabet=is_alphabet_string(word),
                    is_stopword=self.is_stopword(word),
                    is_name=self.is_name(word),
                    # is_entity=self.is_entity(pos),
                    is_common_char=self.is_common_char_string(word),
                    embedding_sum=np.sum(emb),
                    del_term_score=del_term_sim_score,
                ))
            idx += len(word)

        return term_features, sentence_features
Esempio n. 12
0
from text2vec import Similarity

# Build the scorer with its default settings.
sim = Similarity()

# Two sentences that differ in a single verb ("like" vs. "dislike").
a = "我喜欢一个女孩子。"
b = "我讨厌一个女孩子。"

# Score the pair and show the result.
s = sim.get_score(a, b)
print(s)