def choose_the_best_clause(union, question):
    """Pick the ``(article, clause)`` reference that best matches ``question``.

    Every entry of ``union`` is read as ``entry[0]`` (a sortable score) and
    ``entry[1]`` (an ``(article, clause)`` pair used to index ``corpus``).

    When the module-level ``USE_COSINE`` flag is truthy, the text of each
    candidate clause is fetched from ``corpus`` and compared with
    ``question`` using ``Cosine(1).similarity``. The pair with the strictly
    highest similarity wins (the earliest candidate wins ties). If no
    candidate scores above 0 -- which includes an empty ``union`` -- an
    empty list is returned.

    Otherwise ``union`` is sorted IN PLACE by descending ``entry[0]`` and the
    pair of the top entry is returned. An empty ``union`` yields an empty
    list, matching the cosine branch, instead of raising ``IndexError``.

    Args:
        union: Mutable list of ``(score, (article, clause))`` entries.
        question: Question text the clauses are compared against.

    Returns:
        The selected ``entry[1]`` value, or ``[]`` when nothing was selected.
    """
    if USE_COSINE:
        cos = Cosine(1)
        best_ref = []
        best_score = 0
        for ref in union:
            article, clause_no = ref[1]
            clause_text = corpus[article]['clauses'][clause_no]['text']
            score = cos.similarity(clause_text, question)
            # Strict comparison: the first candidate keeps its place on ties.
            if best_score < score:
                best_score = score
                best_ref = ref[1]
        return best_ref

    # Nothing to rank: mirror the cosine branch's "no result" value.
    if not union:
        return []

    # The in-place sort is kept deliberately; callers may rely on ``union``
    # being ordered by descending score after this call.
    union.sort(key=lambda x: x[0], reverse=True)
    return union[0][1]
def similarity(self, question, answer):
    """Score ``question`` against ``answer`` with a battery of string metrics.

    Both inputs are converted with ``str()``, segmented with ``jieba.cut``,
    filtered against the stopword file, and re-joined without separators.
    The two resulting strings are then fed to every metric below.

    Args:
        question: First text to compare.
        answer: Second text to compare.

    Returns:
        list: 18 values, in this fixed order:
            0  Cosine(1) similarity
            1  Cosine(1) distance
            2  Damerau distance
            3  Jaccard(1) distance
            4  Jaccard(1) similarity
            5  JaroWinkler distance
            6  JaroWinkler similarity
            7  Levenshtein distance
            8  LongestCommonSubsequence distance
            9  MetricLCS distance
            10 NGram(2) distance
            11 NormalizedLevenshtein distance
            12 NormalizedLevenshtein similarity
            13 OptimalStringAlignment distance
            14 QGram(1) distance
            15 SorensenDice(2) distance
            16 SorensenDice(2) similarity
            17 WeightedLevenshtein distance (substitution costs from CharSub)
    """
    # NOTE(review): the stopword file is re-read on every call; consider
    # caching it on the instance if this method is on a hot path.
    stopword_lines = self.read_from(folder_path + '上证专用停用词.txt')
    # A set gives O(1) membership tests; the original list was scanned once
    # per token. Stripping keeps the original two-step semantics (newline
    # first, then plain spaces) so whitespace-only stopwords are preserved.
    stopwords = {line.strip('\n').strip(' ') for line in stopword_lines}

    question_text = ''.join(
        word for word in jieba.cut(str(question)) if word not in stopwords
    )
    answer_text = ''.join(
        word for word in jieba.cut(str(answer)) if word not in stopwords
    )

    # Build every metric object up front, in the same order as before.
    cosine = Cosine(1)
    damerau = Damerau()
    jaccard = Jaccard(1)
    jaro_winkler = JaroWinkler()
    levenshtein = Levenshtein()
    lcs = LongestCommonSubsequence()
    metric_lcs = MetricLCS()
    ngram = NGram(2)
    normalized_levenshtein = NormalizedLevenshtein()
    osa = OptimalStringAlignment()
    qgram = QGram(1)
    sorensen_dice = SorensenDice(2)
    weighted_levenshtein = WeightedLevenshtein(character_substitution=CharSub())

    # The order of this list defines the layout of the returned vector;
    # downstream consumers index into it positionally, so do not reorder.
    scorers = [
        cosine.similarity,
        cosine.distance,
        damerau.distance,
        jaccard.distance,
        jaccard.similarity,
        jaro_winkler.distance,
        jaro_winkler.similarity,
        levenshtein.distance,
        lcs.distance,
        metric_lcs.distance,
        ngram.distance,
        normalized_levenshtein.distance,
        normalized_levenshtein.similarity,
        osa.distance,
        qgram.distance,
        sorensen_dice.distance,
        sorensen_dice.similarity,
        weighted_levenshtein.distance,
    ]
    return [scorer(question_text, answer_text) for scorer in scorers]
# Demo: compare two English sentences with three string-similarity metrics
# and print each result directly under its own label.
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine

lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)

str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

# Each label is printed immediately before the value it describes.
# Previously the Levenshtein distance had no label, and the
# "Levenshtein distance:" label preceded the normalized similarity.
print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))