Example #1
0
def similarity_calculation(str_arr, str_2):
    """Score every string of ``str_arr`` against ``str_2``.

    Both sides are passed through ``strip_word`` before they are handed to
    ``Similarity.get_score``.

    :param str_arr: iterable of candidate strings
    :param str_2: the string every candidate is compared with
    :return: list of scores, in the same order as ``str_arr``
    """
    scorer = Similarity()
    target = strip_word(str_2)
    # Possible optimisation: the candidates could be pre-processed ahead of
    # time and exported for reuse instead of being stripped on every call.
    return [scorer.get_score(strip_word(candidate), target)
            for candidate in str_arr]
Example #2
0
 def __init__(self,
              stopwords_path=config.stopwords_path,
              person_name_path=config.person_name_path,
              place_name_path=config.place_name_path,
              common_char_path=config.common_char_path,
              segment_sep=config.segment_sep):
     """Load the word-set files and create the vector/similarity helpers.

     :param stopwords_path: file read into ``self.stopwords``
     :param person_name_path: file read into ``self.person_names``
     :param place_name_path: file read into ``self.place_names``
     :param common_char_path: file read into ``self.common_chars``
     :param segment_sep: stored unchanged as ``self.segment_sep``
     """
     read_set = self.load_set_file
     self.stopwords = read_set(stopwords_path)
     self.person_names = read_set(person_name_path)
     self.place_names = read_set(place_name_path)
     self.common_chars = read_set(common_char_path)
     self.segment_sep = segment_sep
     # Default Vector, plus a Similarity configured with the WMD type.
     self.vec = Vector()
     self.sim = Similarity(similarity_type=SimType.WMD)
Example #3
0
def test_oov_sim():
    """Print the similarity score of several short string pairs.

    Some of the left-hand strings consist of, or start with, a comma.
    Nothing is asserted; each pair is printed together with its score.
    """
    from text2vec import Similarity

    scorer = Similarity()
    pairs = (
        (',', '花'),
        (',画画', '花画画'),
        (',', '花画画'),
        (',机票', '特价机票'),
        ('机票', '特价机票'),
        ('机票', '特价的机票'),
    )
    for left, right in pairs:
        print(left, right, scorer.get_score(left, right))
Example #4
0
 def __init__(self,
              stopwords_path=config.stopwords_path,
              person_name_path=config.person_name_path,
              place_name_path=config.place_name_path,
              common_char_path=config.common_char_path,
              segment_sep=config.segment_sep
              ):
     """Load the word-set files and prepare the similarity model.

     :param stopwords_path: file read into ``self.stopwords``
     :param person_name_path: file read into ``self.person_names``
     :param place_name_path: file read into ``self.place_names``
     :param common_char_path: file read into ``self.common_chars``
     :param segment_sep: stored unchanged as ``self.segment_sep``
     """
     read_set = self.load_set_file
     self.stopwords = read_set(stopwords_path)
     self.person_names = read_set(person_name_path)
     self.place_names = read_set(place_name_path)
     self.common_chars = read_set(common_char_path)
     self.segment_sep = segment_sep
     # Cosine similarity over SBERT embeddings; load_model() is called
     # right away, during construction.
     self.sim = Similarity(similarity_type=SimType.COSINE,
                           embedding_type=EmbType.SBERT)
     self.sim.load_model()
Example #5
0
    def test_sents_score(self):
        """Check the similarity score of sentence pairs (word2vec).

        The sentences ``a``, ``b``, ``c`` and ``d`` are not defined in this
        method; they are taken from the enclosing scope.
        """
        sim = Similarity()

        # (left, right, expected score formatted to three decimals)
        cases = [
            (a, b, "1.000"),
            (a, c, "1.000"),
            (a, d, "0.903"),
            (b, c, "1.000"),
            (b, d, "0.903"),
            (c, d, "0.903"),
        ]

        # Score every pair exactly once and reuse the value for both the
        # printed report and the assertion below.
        scores = []
        for left, right, _ in cases:
            score = sim.get_score(left, right)
            scores.append(score)
            print(score, left, right)

        print("{:.3f}".format(scores[0]))
        for (_, _, expected), score in zip(cases, scores):
            # assertEqual reports both values when the check fails.
            self.assertEqual("{:.3f}".format(score), expected)
Example #6
0
    def test_oov_sim(self):
        """Check similarity scores for inputs containing an OOV word."""
        sim = Similarity()

        a = ','
        b = '花'
        s = sim.get_score(a, b)
        print(a, b, s)
        # This pair is compared without rounding: the score must be 0.0.
        self.assertEqual(s, 0.0)

        # (left, right, expected score rounded to three decimals)
        rounded_cases = [
            (',画画', '花画画', 0.822),
            (',', '花画画', 0.000),
            (',机票', '特价机票', 0.884),
            ('机票', '特价机票', 0.884),
            ('特价机票', '特价的机票', 1.000),
        ]
        for a, b, expected in rounded_cases:
            s = sim.get_score(a, b)
            print(a, b, s)
            # assertEqual reports both values when the check fails.
            self.assertEqual(round(s, 3), expected)
Example #7
0
    def test_encode(self):
        """Check the encode result and pair scores of a W2V/WMD Similarity."""
        sim = Similarity(embedding_type=EmbType.W2V,
                         similarity_type=SimType.WMD)
        a = '如何更换花呗绑定银行卡'
        b = '花呗更改绑定银行卡'
        c = '我什么时候开通了花呗'

        emb = sim.encode(a)
        print(a, emb)
        print(emb.shape)
        self.assertEqual(emb.shape, (200, ))

        # (left, right, expected score formatted to three decimals)
        score_cases = (
            (a, b, "0.746"),
            (a, c, "0.573"),
            (b, c, "0.563"),
        )
        for left, right, expected in score_cases:
            s = sim.get_score(left, right)
            print(left, right, s)
            # assertEqual reports both values when the check fails.
            self.assertEqual("{:.3f}".format(s), expected)
Example #8
0
 def test_sents_score_bert(self):
     """Check the sentence similarity score with the 'bert' embedding type.

     The sentences ``a`` and ``b`` are not defined in this method; they are
     taken from the enclosing scope.
     """
     sim = Similarity(embedding_type='bert')
     # Score the pair once and reuse the formatted value for both the
     # printed output and the assertion.
     formatted = "{:.3f}".format(sim.get_score(a, b))
     print(formatted)
     # assertEqual reports both values when the check fails.
     self.assertEqual(formatted, "0.915")
Example #9
0
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description: 
"""
from text2vec import Similarity

# Sentence pair scored under each embedding type below.
a = '湖北人爱吃鱼'
b = '甘肃人不爱吃鱼'

# For each embedding type: build a Similarity, score the pair (the score is
# discarded) and print the model info.
# NOTE(review): presumably get_score is called first so that ss.model is
# loaded before info() is read - confirm.
for embedding_type in ('w2v', 'bert'):
    ss = Similarity(embedding_type=embedding_type)
    ss.get_score(a, b)
    print(ss.model.info())
Example #10
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from text2vec import Similarity, EmbType, SimType

# Similarity configured with the W2V embedding type and the WMD similarity type.
sim = Similarity(embedding_type=EmbType.W2V, similarity_type=SimType.WMD)
a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

# Encode one sentence and show the result together with its shape.
emb = sim.encode(a)
print(a, emb)
print(emb.shape)

# Score and print every pair of the three sentences.
for left, right in ((a, b), (a, c), (b, c)):
    s = sim.get_score(left, right)
    print(left, right, s)
Example #11
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
import sys

sys.path.append('..')
from text2vec import Similarity

if __name__ == '__main__':
    a = '如何更换花呗绑定银行卡'
    b = '花呗更改绑定银行卡'
    c = '我什么时候开通了花呗'

    # Score and print every pair of the three sentences.
    sim = Similarity()
    for left, right in ((a, b), (a, c), (b, c)):
        s = sim.get_score(left, right)
        print(left, right, s)

    from text2vec import SearchSimilarity

    # Build a SearchSimilarity over the same three sentences.
    corpus = [a, b, c]
    print(corpus)
    search_sim = SearchSimilarity(corpus=corpus)
Example #12
0
"""
@author:XuMing([email protected])
@description: 
"""

import sys

sys.path.append('..')
from text2vec import Vector
from text2vec import Similarity

if __name__ == '__main__':
    vec = Vector(embedding_type='bert')

    # Single character.
    char = '卡'
    char_emb = vec.encode(char)
    # <class 'numpy.ndarray'> (128, 3072) 128=seq_len, 3072=768*4
    print(type(char_emb), char_emb.shape)
    print(char, char_emb)

    # Word.
    word = '银行卡'
    word_emb = vec.encode(word)
    print(word, word_emb)

    # Sentence.
    a = '如何更换花呗绑定银行卡'
    sent_emb = vec.encode(a)
    print(a, sent_emb)
    print(sent_emb.shape)

    # Similarity between two sentences with the same embedding type.
    sim = Similarity(embedding_type='bert')
    b = '花呗更改绑定银行卡'
    score = sim.get_score(a, b)
    print(score)
Example #13
0
# -*- coding: utf-8 -*-
"""
@author:XuMing<*****@*****.**>
@description: 
"""

from text2vec import Similarity

# Module-level Similarity built with default arguments; the test functions
# in this module read it as a global.
sim = Similarity()


def test_sim_diff():
    """Print the score of two sentences with different content.

    Uses the module-level ``sim``; nothing is asserted.
    """
    long_text = '研究团队面向国家重大战略需求追踪国际前沿发展借鉴国际人工智能研究领域的科研模式有效整合创新资源解决复'
    short_text = '英汉互译比较语言学'
    score = sim.get_score(long_text, short_text)
    print(long_text, short_text, score)


def test_sim_same():
    """Print the score of two near-identical sentences.

    Uses the module-level ``sim``; nothing is asserted.
    """
    first = '汉英翻译比较语言学'
    second = '英汉互译比较语言学'
    score = sim.get_score(first, second)
    print(first, second, score)
Example #14
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from text2vec import Similarity
# Similarity with default arguments, created when the module is loaded.
sim = Similarity()

if __name__ == '__main__':
    a = '什么是智能手环'
    b = '智能手环是个啥'
    c = '智能手环有什么用'
    d = '智能手环能干什么'

    # Print the score of every pair of the four sample sentences.
    sample_pairs = ((a, b), (a, c), (a, d), (b, c), (b, d), (c, d))
    for left, right in sample_pairs:
        print(sim.get_score(left, right), left, right)

    print("example:")
    # Interactive mode: keep reading two inputs and printing their score.
    # The loop has no exit condition of its own.
    while True:
        a = input('input1:')
        b = input('input2:')
        score = sim.get_score(a, b)
        print(score, a, b)
Example #15
0
class TextFeature(object):
    """Extract per-term and per-sentence features from a query string."""

    def __init__(self,
                 stopwords_path=config.stopwords_path,
                 person_name_path=config.person_name_path,
                 place_name_path=config.place_name_path,
                 common_char_path=config.common_char_path,
                 segment_sep=config.segment_sep):
        """Load the word-set files and create the vector/similarity helpers.

        :param stopwords_path: file read into ``self.stopwords``
        :param person_name_path: file read into ``self.person_names``
        :param place_name_path: file read into ``self.place_names``
        :param common_char_path: file read into ``self.common_chars``
        :param segment_sep: separator used to split pre-segmented queries
        """
        self.stopwords = self.load_set_file(stopwords_path)
        self.person_names = self.load_set_file(person_name_path)
        self.place_names = self.load_set_file(place_name_path)
        self.common_chars = self.load_set_file(common_char_path)
        self.segment_sep = segment_sep
        self.vec = Vector()
        self.sim = Similarity(similarity_type=SimType.WMD)

    @staticmethod
    def load_set_file(path):
        """Read a UTF-8 text file into a set of words.

        Blank lines and lines starting with ``#`` are skipped. For every
        other line only the first whitespace-separated token is kept.

        :param path: path of the file to read
        :return: set of str
        """
        words = set()
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                words.add(line.split()[0])
        return words

    def is_stopword(self, word):
        """Return True if ``word`` is in the stopword set."""
        return word in self.stopwords

    def is_name(self, word):
        """Return True if ``word`` is a known person name or place name."""
        # Two membership tests instead of building a union set on every call.
        return word in self.person_names or word in self.place_names

    def is_entity(self, pos, entity_pos=('ns', 'n', 'vn', 'v')):
        """Return True if the tag ``pos`` is one of ``entity_pos``."""
        return pos in entity_pos

    def is_common_char(self, c):
        """Return True if the character ``c`` is in the common-char set."""
        return c in self.common_chars

    def is_common_char_string(self, word):
        """Return True if every character of ``word`` is a common char.

        An empty ``word`` yields True.
        """
        return all(self.is_common_char(c) for c in word)

    def get_feature(self, query, is_word_segmented=False):
        """
        Get text feature

        Each term feature carries, among others:
        ``idx`` - sum of the lengths of the preceding terms,
        ``offset`` - position of the term in the term sequence,
        ``del_term_score`` - similarity between the query and the query
        rebuilt without this term.

        :param query: str, the text to analyse
        :param is_word_segmented: if True, ``query`` is split on
            ``self.segment_sep``; otherwise it is passed to ``word_segment``
        :return: list, AttrDict: term features, sentence features
        """
        if is_word_segmented:
            word_seq = query.split(self.segment_sep)
        else:
            word_seq = word_segment(query, cut_type='word', pos=False)
        logger.debug('%s', word_seq)

        # sentence
        sentence_features = AttrDict(
            query_length=len(query),
            term_size=len(word_seq),
        )

        # term
        term_features = []
        idx = 0
        for offset, word in enumerate(word_seq):
            emb = self.vec.encode(word)
            # Drop the term at THIS position. Removing by value would delete
            # the first equal term instead, which is the wrong one whenever
            # a term occurs more than once in the query.
            remaining = list(word_seq)
            del remaining[offset]
            del_word_query = ''.join(remaining)
            del_term_sim_score = self.sim.get_score(query, del_word_query)
            term_features.append(
                AttrDict(
                    term=word,
                    term_length=len(word),
                    idx=idx,
                    offset=offset,
                    is_number=is_number_string(word),
                    is_chinese=is_chinese_string(word),
                    is_alphabet=is_alphabet_string(word),
                    is_stopword=self.is_stopword(word),
                    is_name=self.is_name(word),
                    # is_entity=self.is_entity(pos),
                    is_common_char=self.is_common_char_string(word),
                    embedding_sum=np.sum(emb),
                    del_term_score=del_term_sim_score,
                ))
            idx += len(word)

        return term_features, sentence_features
Example #16
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

from text2vec import Similarity
# Similarity configured with the 'w2v' embedding type and 'wmd' similarity type.
sim = Similarity(embedding_type='w2v', similarity_type='wmd')
a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

# Encode one sentence and show the result.
emb = sim.encode(a)
print(emb)

# Score and print every pair of the three sentences.
for left, right in ((a, b), (a, c), (b, c)):
    s = sim.score(left, right)
    print(left, right, s)