def testQratioForceAscii(self):
    """Check how ``force_ascii`` changes QRatio for a string with one accented char.

    The first string is "ABCD" followed by U+00C1; the second is plain "ABCD".
    """
    accented = "ABCD\u00C1"
    plain = "ABCD"

    # With force_ascii=True the test expects a perfect score.
    forced_score = fuzzywuzzy.QRatio(accented, plain, force_ascii=True)
    self.assertEqual(forced_score, 100)

    # With force_ascii=False the test expects the score to drop below 100.
    unforced_score = fuzzywuzzy.QRatio(accented, plain, force_ascii=False)
    self.assertLess(unforced_score, 100)
def testQRatioUnicodeString(self):
    """Check QRatio on non-ASCII input, with and without ``force_ascii=False``.

    A lone U+00C1 compared with "ABCD" under the default options is expected
    to score 0. Cyrillic and Chinese pairs that share characters are expected
    to score non-zero when ``force_ascii=False`` is passed.
    """
    # Default options: the test expects no similarity at all.
    default_score = fuzzywuzzy.QRatio("\u00C1", "ABCD")
    self.assertEqual(0, default_score)

    overlapping_pairs = (
        # Cyrillic.
        (
            "\u043f\u0441\u0438\u0445\u043e\u043b\u043e\u0433",
            "\u043f\u0441\u0438\u0445\u043e\u0442\u0435\u0440\u0430\u043f\u0435\u0432\u0442",
        ),
        # Chinese.
        (
            "\u6211\u4e86\u89e3\u6570\u5b66",
            "\u6211\u5b66\u6570\u5b66",
        ),
    )
    for left, right in overlapping_pairs:
        pair_score = fuzzywuzzy.QRatio(left, right, force_ascii=False)
        self.assertNotEqual(0, pair_score)
def extract_features(q1, q2):
    """Build the advanced feature list for a pair of questions.

    Both questions are first passed through ``preprocess``. The result is the
    output of ``get_token_features`` followed by five similarity values, in
    this order: token_set_ratio, token_sort_ratio, fuzz_ratio (computed with
    ``fuzz.QRatio``), fuzz_partial_ratio, longest_substr_ratio.

    NOTE(review): the original comment lists the token features as cwc_min,
    cwc_min, csc_min, csc_max, ctc_min, ctc_max, last_word_eq, first_word_eq,
    abs_len_diff, mean_len -- the second entry is presumably cwc_max; confirm
    against ``get_token_features``.
    """
    # Clean both questions before computing any feature (per the original
    # comment: html tags, punctuation, stemming, stopwords, contractions).
    clean_q1 = preprocess(q1)
    clean_q2 = preprocess(q2)

    # Token-level features come first.
    features = []
    features.extend(get_token_features(clean_q1, clean_q2))

    # Fuzzy-matching features, evaluated left to right in the documented order.
    features += [
        fuzz.token_set_ratio(clean_q1, clean_q2),
        fuzz.token_sort_ratio(clean_q1, clean_q2),
        fuzz.QRatio(clean_q1, clean_q2),
        fuzz.partial_ratio(clean_q1, clean_q2),
        get_longest_substr_ratio(clean_q1, clean_q2),
    ]
    return features
def testQuickRatioNotEqual(self):
    """Check that QRatio of the ``s1`` and ``s3`` fixtures is not 100."""
    score = fuzzywuzzy.QRatio(self.s1, self.s3)
    self.assertNotEqual(score, 100)
def testQuickRatioCaseInsensitive(self):
    """Check that QRatio of the ``s1`` and ``s2`` fixtures is exactly 100.

    NOTE(review): going by the test name, s1 and s2 presumably differ only in
    letter case -- the fixtures are defined outside this view; confirm.
    """
    score = fuzzywuzzy.QRatio(self.s1, self.s2)
    self.assertEqual(score, 100)
def testQuickRatioEqual(self):
    """Check that QRatio of the ``s1`` and ``s1a`` fixtures is exactly 100.

    NOTE(review): s1a is presumably an equal copy of s1 -- the fixtures are
    defined outside this view; confirm.
    """
    score = fuzzywuzzy.QRatio(self.s1, self.s1a)
    self.assertEqual(score, 100)