Example #1
from similarity.jaccard import Jaccard

class Baseline(object):
    """Baseline string matcher.

    Uses 3-gram Jaccard similarity; the commented-out alternative is
    normalized Levenshtein (https://pypi.org/project/strsim/#normalized-levenshtein).
    """
    def __init__(self):
        self.matcher = Jaccard(3)  # NormalizedLevenshtein()

    def similarity(self, seq1: str, seq2: str) -> float:
        return self.matcher.similarity(seq1, seq2)

    def distance(self, seq1: str, seq2: str) -> float:
        return self.matcher.distance(seq1, seq2)

def met_jaccard(s1, s2, n):
    """Return the Jaccard similarity of s1 and s2 over n-grams."""
    jac = Jaccard(n)
    return jac.similarity(s1, s2)
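
A minimal usage sketch of the two helpers above (the example strings are illustrative, not from the original source):

baseline = Baseline()
print(baseline.similarity('kitten', 'sitting'))  # 3-gram Jaccard similarity in [0, 1]
print(baseline.distance('kitten', 'sitting'))    # 1 - similarity

print(met_jaccard('kitten', 'sitting', 2))       # Jaccard over 2-grams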
Example #3
    # Requires jieba plus the strsim metric classes imported at module level;
    # `folder_path` and the `CharSub` substitution-cost class are assumed to be
    # defined elsewhere in the original module.
    def similarity(self, question, answer):
        """Return a list of 18 similarity/distance scores for the two texts."""
        # Load the domain-specific stopword list (one word per line).
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = []
        for sw in stopword:
            stopwords.append(sw.strip())

        # Tokenize both texts with jieba and drop stopwords.
        meaningful_words1 = []
        meaningful_words2 = []
        for word in jieba.cut(str(question)):
            if word not in stopwords:
                meaningful_words1.append(word)
        for word in jieba.cut(str(answer)):
            if word not in stopwords:
                meaningful_words2.append(word)
        s2 = ''.join(meaningful_words1)
        s3 = ''.join(meaningful_words2)
        # One matcher per metric; shingle/gram sizes follow the original code.
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # Collect all 18 scores in a fixed order.
        line_sim = []
        line_sim.append(a1.similarity(s2, s3))  # cosine similarity
        line_sim.append(a1.distance(s2, s3))    # cosine distance
        line_sim.append(b1.distance(s2, s3))    # Damerau distance
        line_sim.append(c1.distance(s2, s3))    # Jaccard distance
        line_sim.append(c1.similarity(s2, s3))  # Jaccard similarity
        line_sim.append(d1.distance(s2, s3))    # Jaro-Winkler distance
        line_sim.append(d1.similarity(s2, s3))  # Jaro-Winkler similarity
        line_sim.append(e1.distance(s2, s3))    # Levenshtein distance
        line_sim.append(f1.distance(s2, s3))    # longest common subsequence
        line_sim.append(g1.distance(s2, s3))    # metric LCS
        line_sim.append(h1.distance(s2, s3))    # 2-gram distance
        line_sim.append(i1.distance(s2, s3))    # normalized Levenshtein distance
        line_sim.append(i1.similarity(s2, s3))  # normalized Levenshtein similarity
        line_sim.append(j1.distance(s2, s3))    # optimal string alignment
        line_sim.append(k1.distance(s2, s3))    # q-gram distance
        line_sim.append(l1.distance(s2, s3))    # Sorensen-Dice distance
        line_sim.append(l1.similarity(s2, s3))  # Sorensen-Dice similarity
        line_sim.append(m1.distance(s2, s3))    # weighted Levenshtein distance

        return line_sim
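
For reference, the same feature vector can be built more compactly from a (matcher, method) table. This is a hypothetical, behavior-equivalent sketch (the METRICS name and feature_vector helper are illustrative; the pairs mirror the order above, and the strsim classes plus CharSub are assumed importable):

METRICS = [
    (Cosine(1), 'similarity'), (Cosine(1), 'distance'),
    (Damerau(), 'distance'),
    (Jaccard(1), 'distance'), (Jaccard(1), 'similarity'),
    (JaroWinkler(), 'distance'), (JaroWinkler(), 'similarity'),
    (Levenshtein(), 'distance'),
    (LongestCommonSubsequence(), 'distance'),
    (MetricLCS(), 'distance'),
    (NGram(2), 'distance'),
    (NormalizedLevenshtein(), 'distance'), (NormalizedLevenshtein(), 'similarity'),
    (OptimalStringAlignment(), 'distance'),
    (QGram(1), 'distance'),
    (SorensenDice(2), 'distance'), (SorensenDice(2), 'similarity'),
    (WeightedLevenshtein(character_substitution=CharSub()), 'distance'),
]

def feature_vector(s2, s3):
    # Look up each bound method by name and apply it to the cleaned strings.
    return [getattr(matcher, method)(s2, s3) for matcher, method in METRICS]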
Example #4
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'
s2 = '中国'

normalized_levenshtein = NormalizedLevenshtein()
print('Normalized Levenshtein: ', normalized_levenshtein.distance(s1, s2))

jaccard_distance = Jaccard(1)
print('Jaccard: ', jaccard_distance.distance(s1, s2))

# sklearn's jaccard_similarity_score was removed (see sklearn.metrics.jaccard_score):
# print(jaccard_similarity_score(list(s1), list(s2)))
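
As a sanity check (hand-computed, not output captured from the original source): the Levenshtein distance between the two strings is 5 edits over a maximum length of 7, so the normalized Levenshtein distance is 5/7 ≈ 0.714; with single-character shingles the strings share the 2 characters {中, 国} out of 7 distinct ones, so the Jaccard distance is 1 - 2/7 = 5/7 ≈ 0.714 as well.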
Example #6
from sklearn.preprocessing import normalize
import numpy as np

# strsim metric classes; module paths follow the package's naming convention
# (verify against the installed version)
from similarity.cosine import Cosine
from similarity.damerau import Damerau
from similarity.jaccard import Jaccard
from similarity.jarowinkler import JaroWinkler
from similarity.levenshtein import Levenshtein
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.qgram import QGram
from similarity.sorensen_dice import SorensenDice

# Initialize the matchers once at import time
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

# Five functions, each mapping a pair of strings to a similarity in [0, 1];
# distance-based metrics are flipped with 1 - distance.
similarity_functions = [
    norm_levenshtein.similarity, lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b), cos.similarity, dice.similarity
]


def mono_vector0(tup1, tup2):
    """Join each tuple of tokens into a string and score it with every metric."""
    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()
    return [f(str1, str2) for f in similarity_functions]
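
A quick usage sketch (the tuples are made-up sample data):

scores = mono_vector0(('John', 'Smith'), ('Jon', 'Smith'))
print(scores)  # five floats in [0, 1], one per entry of similarity_functions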
Example #7
import numpy as np
from difflib import SequenceMatcher
from similarity.jaccard import Jaccard
import en_core_web_lg  # only needed for similarity='spacy'


def string_similarity(
    string_list,
    source_str=None,
    similarity='seq_matcher',
):
    '''
    Compute similarity between strings
    
    Input
    -----
    string_list: list of str
    
    source_str: str, default None - if specified, the similarities will
        be computed between source_str and every str in string_list

    similarity: str, {'jaccard', 'spacy', 'seq_matcher'}, specifying which
        similarity measure will be used

        'jaccard': Jaccard distance over 2-shingles (a distance, not a
            similarity; see the NOTE below)
        'spacy': vector similarity (cosine) based on en_core_web_lg;
            see the spaCy documentation:
            https://spacy.io/usage/vectors-similarity

            Note: this option is quite slow, so there should be a good
            reason for opting for it
        'seq_matcher': uses the quick_ratio method of the SequenceMatcher
            class; see https://docs.python.org/2.4/lib/sequence-matcher.html

    NOTE: all the above metrics are normalized in the range [0, 1] with 0=low
    and 1=high similarity, EXCEPT for 'jaccard', which returns a distance,
    so 0=high and 1=low similarity.

    Roundoff errors and vector operations may give rise to slight deviations
    from this range.
           
    Output
    ------ 
    all_similarities: ndarray of shape (N, N) if source_str is None, or
        shape (N,) if source_str is not None
        
        Contains floats denoting the similarity between strings such that:
        if source_str is None: 
        all_similarities[i,j] = similarity between string_list[i] and string_list[j]   
        
        if source_str is not None:
        all_similarities[i] = similarity between source_str and string_list[i] 
    
    Examples
    --------
    s = ['Today I waited and stared to the ocean.', 
         'The owl of Minerva flies only after dusk',
         'ice scream','When the sword wakes, time sleeps',
         'bike',
         'pancak']
    s_source = 'pancake'
    
    #With 'jaccard'
    similarity = string_similarity(s,
                                   source_str = s_source,
                                   similarity = 'jaccard',
                                   )
    print(similarity)
    array([0.96969697, 1.        , 1.        , 0.93103448, 0.875     ,
       0.16666667])
    
    Note that since the Jaccard index is used, all that matters is the
    characters of the strings being compared, not their semantics. Thus,
    the lowest value (for Jaccard, the better match) is observed with the
    last string, 'pancak'.
    
    #With 'spacy'    
    similarity = string_similarity(s,
                                   source_str = s_source,
                                   similarity = 'spacy',
                                   )
    print(similarity)
    array([0.20109747, 0.23185522, 0.33395686, 0.18453109, 0.1748583 ,
       0.        ])
    
    Note that this will generate a warning, since there is no word vector
    for 'pancak' (a non-existent word), and thus its similarity is 0.

    Note that the highest similarity is observed with 'ice scream' due to the
    semantic nature of this measure.
    
    #With 'seq_matcher'
    
    similarity = string_similarity(s,
                               source_str = s_source,
                               similarity = 'seq_matcher',
                               )
    print(similarity)
    array([0.2173913 , 0.21276596, 0.35294118, 0.25      , 0.36363636,
       0.92307692])
    
    Similarity is highest with 'pancak'.
    '''
    if source_str is None:
        all_similarities = np.zeros((len(string_list), len(string_list)))
        if similarity == 'spacy': nlp = en_core_web_lg.load()
        if similarity == 'jaccard': jaccard = Jaccard(2)
        for i, source in enumerate(string_list):
            print(i)  # crude progress indicator over the N x N loop
            if similarity == 'spacy':
                token1 = nlp(source)
                current_similarities = [
                    token1.similarity(nlp(target)) for target in string_list
                ]
                all_similarities[i, :] = current_similarities
            if similarity == 'seq_matcher':
                current_similarities = [
                    SequenceMatcher(None, source, target).quick_ratio()
                    for target in string_list
                ]
                all_similarities[i, :] = current_similarities
            if similarity == 'jaccard':
                current_similarities = [
                    jaccard.distance(source, target) for target in string_list
                ]
                all_similarities[i, :] = current_similarities

    if source_str is not None:
        all_similarities = np.zeros((len(string_list), ))
        if similarity == 'seq_matcher':
            all_similarities = [
                SequenceMatcher(None, source_str, target).quick_ratio()
                for target in string_list
            ]
        if similarity == 'spacy':
            nlp = en_core_web_lg.load()
            token1 = nlp(source_str)
            all_similarities = [
                token1.similarity(nlp(target)) for target in string_list
            ]
        if similarity == 'jaccard':
            jaccard = Jaccard(2)
            all_similarities = [
                jaccard.distance(source_str, target) for target in string_list
            ]

    all_similarities = np.asarray(all_similarities)

    return all_similarities
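
Since the 'jaccard' branch returns distances while the other two branches return similarities, a caller wanting a uniform "higher is better" scale can flip it. A small sketch reusing the docstring's example data:

s = ['Today I waited and stared to the ocean.',
     'The owl of Minerva flies only after dusk',
     'ice scream', 'When the sword wakes, time sleeps',
     'bike', 'pancak']
jaccard_dist = string_similarity(s, source_str='pancake', similarity='jaccard')
jaccard_sim = 1.0 - jaccard_dist  # flip the Jaccard distance into a similarity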