from similarity.jaccard import Jaccard


class Baseline(object):
    """Baseline string matcher.

    Originally NormalizedLevenshtein
    (https://pypi.org/project/strsim/#normalized-levenshtein);
    currently backed by character-trigram Jaccard, per the swap below.
    """

    def __init__(self):
        self.matcher = Jaccard(3)  # NormalizedLevenshtein()

    def similarity(self, seq1: str, seq2: str) -> float:
        return self.matcher.similarity(seq1, seq2)

    def distance(self, seq1: str, seq2: str) -> float:
        return self.matcher.distance(seq1, seq2)
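
# Minimal usage sketch (hypothetical strings, not from the original source):
# for Jaccard, distance == 1 - similarity, so the two prints complement each other.
baseline = Baseline()
print(baseline.similarity('similarity', 'similarly'))  # in [0, 1], 1 = identical
print(baseline.distance('similarity', 'similarly'))    # 1 - similarity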
def met_jaccard(s1, s2, n):
    jac = Jaccard(n)
    return jac.similarity(s1, s2)
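
# Minimal usage sketch (hypothetical strings, assuming Jaccard is imported as
# above): a larger shingle size n makes the measure stricter, since matches
# require longer shared substrings.
print(met_jaccard('pancake', 'pancak', 2))  # high: 5 of 6 unique bigrams shared
print(met_jaccard('pancake', 'pancak', 4))  # lower: 3 of 4 unique 4-grams shared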
# Method of a matcher class. Requires jieba and the strsim classes (Cosine,
# Damerau, Jaccard, JaroWinkler, Levenshtein, LongestCommonSubsequence,
# MetricLCS, NGram, NormalizedLevenshtein, OptimalStringAlignment, QGram,
# SorensenDice, WeightedLevenshtein) at module level; CharSub and folder_path
# are defined elsewhere in the source.
def similarity(self, question, answer):
    """Return an 18-element vector of similarity/distance features for a
    question/answer pair, computed on stopword-filtered jieba segmentations."""
    # '上证专用停用词.txt' = Shanghai Stock Exchange (SSE) dedicated stopword list
    stopwords = [sw.strip('\n').strip(' ')
                 for sw in self.read_from(folder_path + '上证专用停用词.txt')]

    # Segment both texts with jieba and drop stopwords
    meaningful_words1 = [w for w in jieba.cut(str(question)) if w not in stopwords]
    meaningful_words2 = [w for w in jieba.cut(str(answer)) if w not in stopwords]
    s2 = ''.join(meaningful_words1)
    s3 = ''.join(meaningful_words2)

    cosine = Cosine(1)
    jaccard = Jaccard(1)
    jaro_winkler = JaroWinkler()
    norm_lev = NormalizedLevenshtein()
    sorensen = SorensenDice(2)

    # Feature order matters: downstream consumers expect this exact layout.
    line_sim = [
        cosine.similarity(s2, s3),
        cosine.distance(s2, s3),
        Damerau().distance(s2, s3),
        jaccard.distance(s2, s3),
        jaccard.similarity(s2, s3),
        jaro_winkler.distance(s2, s3),
        jaro_winkler.similarity(s2, s3),
        Levenshtein().distance(s2, s3),
        LongestCommonSubsequence().distance(s2, s3),
        MetricLCS().distance(s2, s3),
        NGram(2).distance(s2, s3),
        norm_lev.distance(s2, s3),
        norm_lev.similarity(s2, s3),
        OptimalStringAlignment().distance(s2, s3),
        QGram(1).distance(s2, s3),
        sorensen.distance(s2, s3),
        sorensen.similarity(s2, s3),
        WeightedLevenshtein(character_substitution=CharSub()).distance(s2, s3),
    ]
    return line_sim
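
# Standalone sketch of the same preprocessing (hypothetical toy stopword list
# standing in for the file read above), showing two of the eighteen features.
import jieba
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard

toy_stopwords = ['的', '了', '是']  # common function words ("of", "-ed", "is")
q = ''.join(w for w in jieba.cut('上证指数是多少')    # "what is the SSE index?"
            if w not in toy_stopwords)
a = ''.join(w for w in jieba.cut('今日上证指数上涨')  # "the SSE index rose today"
            if w not in toy_stopwords)
print(Cosine(1).similarity(q, a), Jaccard(1).distance(q, a))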
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'  # "the People's Republic of China"
s2 = '中国'            # "China"

normalized_levenshtein = NormalizedLevenshtein()
print('Levenshtein: ', normalized_levenshtein.distance(s1, s2))

jaccard_distance = Jaccard(1)
print('Jaccard: ', jaccard_distance.distance(s1, s2))
# print(jaccard_similarity_score(list(s1), list(s2)))
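
# Worked check (assuming the usual normalization by the longer string's
# length): both printed values should be 5/7 ≈ 0.714. The edit distance is 5
# deletions ('华人民共和') over a max length of 7, and Jaccard(1) compares the
# character sets {中,华,人,民,共,和,国} and {中,国}: distance = 1 - 2/7 = 5/7.
assert abs(normalized_levenshtein.distance(s1, s2) - 5 / 7) < 1e-9
assert abs(jaccard_distance.distance(s1, s2) - 5 / 7) < 1e-9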
import numpy as np
from sklearn.preprocessing import normalize
# strsim imports (assuming the same package layout as the snippets above)
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.damerau import Damerau
from similarity.optimal_string_alignment import OptimalStringAlignment
from similarity.jarowinkler import JaroWinkler
from similarity.longest_common_subsequence import LongestCommonSubsequence
from similarity.metric_lcs import MetricLCS
from similarity.ngram import NGram
from similarity.qgram import QGram
from similarity.sorensen_dice import SorensenDice
from similarity.cosine import Cosine
from similarity.jaccard import Jaccard

# Initialize at import time
levenshtein = Levenshtein()
norm_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
ngram = NGram()
qgram = QGram()
dice = SorensenDice()
cos = Cosine(5)
jaccard = Jaccard(5)

# Five measures, all oriented so that higher = more similar
# (distances are flipped with 1 - d)
similarity_functions = [
    norm_levenshtein.similarity,
    lambda a, b: 1 - metric_lcs.distance(a, b),
    lambda a, b: 1 - ngram.distance(a, b),
    cos.similarity,
    dice.similarity,
]


def mono_vector0(tup1, tup2):
    """Join each token tuple into a lowercase string and score the pair with
    every function in similarity_functions."""
    str1 = ' '.join(tup1).lower()
    str2 = ' '.join(tup2).lower()
    simv = [f(str1, str2) for f in similarity_functions]
    return simv
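
# Minimal usage sketch (hypothetical token tuples): strings identical after
# lowercasing should score 1.0 on every oriented measure.
vec = mono_vector0(('Barack', 'Obama'), ('barack', 'obama'))
print(vec)  # expected: [1.0, 1.0, 1.0, 1.0, 1.0]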
import numpy as np
from difflib import SequenceMatcher
from similarity.jaccard import Jaccard
import en_core_web_lg


def string_similarity(
    string_list,
    source_str=None,
    similarity='seq_matcher',
):
    '''
    Compute similarity between strings

    Input
    -----
    string_list: list of str

    source_str: str, default None
        - if specified, then the similarities will be computed between
          source_str and all str in string_list

    similarity: str, {'jaccard', 'spacy', 'seq_matcher'}, specifying which
        similarity measure will be used
        'jaccard': jaccard similarity
        'spacy': vector similarity (cosine) will be used based on
            en_core_web_lg
            see spaCy documentation:
            https://spacy.io/usage/vectors-similarity
            Note: this process is quite slow, so there should be a good
            reason for opting for it
        'seq_matcher': uses the quick_ratio method of the SequenceMatcher
            class
            see https://docs.python.org/2.4/lib/sequence-matcher.html

        NOTE: all the above metrics are normalized in the range [0, 1] with
        0=low and 1=high similarity, EXCEPT for 'jaccard' where 0=high and
        1=low similarity. Roundoff errors and vector operations may give
        rise to slight deviations from this range.

    Output
    ------
    all_similarities: ndarray of shape (N, N) if source_str is None,
        or shape (N,) if source_str is not None
        Contains floats denoting the similarity between strings such that:
        if source_str is None:
            all_similarities[i, j] = similarity between string_list[i]
            and string_list[j]
        if source_str is not None:
            all_similarities[i] = similarity between source_str and
            string_list[i]

    Examples
    --------
    s = ['Today I waited and stared to the ocean.',
         'The owl of Minerva flies only after dusk',
         'ice scream',
         'When the sword wakes, time sleeps',
         'bike',
         'pancak']
    s_source = 'pancake'

    # With 'jaccard'
    similarity = string_similarity(s,
                                   source_str=s_source,
                                   similarity='jaccard',
                                   )
    print(similarity)
    array([0.96969697, 1.        , 1.        , 0.93103448, 0.875     ,
           0.16666667])
    Note that since the jaccard index is used, all that matters are the
    characters of the strings being compared, not their semantics. Thus the
    lowest value (for jaccard, the better match) is observed with the last
    string, 'pancak'.

    # With 'spacy'
    similarity = string_similarity(s,
                                   source_str=s_source,
                                   similarity='spacy',
                                   )
    print(similarity)
    array([0.20109747, 0.23185522, 0.33395686, 0.18453109, 0.1748583 ,
           0.        ])
    Note that this will generate a warning, since there is no word vector
    for 'pancak' (a non-existent word) and thus the similarity is 0
    (non-existent). Note that the highest similarity is observed with
    'ice scream' due to the semantic nature of the measure.

    # With 'seq_matcher'
    similarity = string_similarity(s,
                                   source_str=s_source,
                                   similarity='seq_matcher',
                                   )
    print(similarity)
    array([0.2173913 , 0.21276596, 0.35294118, 0.25      , 0.36363636,
           0.92307692])
    Similarity is highest with 'pancak'.
    '''
    if source_str is None:
        all_similarities = np.zeros((len(string_list), len(string_list)))
        if similarity == 'spacy':
            nlp = en_core_web_lg.load()
        if similarity == 'jaccard':
            jaccard = Jaccard(2)
        for i, source in enumerate(string_list):
            if similarity == 'spacy':
                token1 = nlp(source)
                current_similarities = [
                    token1.similarity(nlp(target))
                    for target in string_list
                ]
                all_similarities[i, :] = current_similarities
            if similarity == 'seq_matcher':
                current_similarities = [
                    SequenceMatcher(None, source, target).quick_ratio()
                    for target in string_list
                ]
                all_similarities[i, :] = current_similarities
            if similarity == 'jaccard':
                current_similarities = [
                    jaccard.distance(source, target)
                    for target in string_list
                ]
                all_similarities[i, :] = current_similarities

    if source_str is not None:
        all_similarities = np.zeros((len(string_list),))
        if similarity == 'seq_matcher':
            all_similarities = [
                SequenceMatcher(None, source_str, target).quick_ratio()
                for target in string_list
            ]
        if similarity == 'spacy':
            nlp = en_core_web_lg.load()
            token1 = nlp(source_str)
            all_similarities = [
                token1.similarity(nlp(target))
                for target in string_list
            ]
        if similarity == 'jaccard':
            jaccard = Jaccard(2)
            all_similarities = [
                jaccard.distance(source_str, target)
                for target in string_list
            ]
        all_similarities = np.asarray(all_similarities)

    return all_similarities
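
# Minimal usage sketch (hypothetical strings): the pairwise (N, N) form. Note
# that the 'jaccard' branch returns distances, so its diagonal is 0.0, whereas
# 'seq_matcher' and 'spacy' put 1.0 (perfect similarity) on the diagonal.
s = ['ice scream', 'ice cream', 'pancake']
mat = string_similarity(s, similarity='seq_matcher')
print(mat.shape)   # (3, 3)
print(mat[0, 1])   # high: the two strings differ by a single character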