def fp_ratio(s1, s2, force_ascii=True, full_process=True):
    """Score two strings with fuzz.ratio, optionally boosted by fuzz.partial_ratio.

    Both inputs are normalised with ``utils.full_process`` unless
    ``full_process`` is false, in which case they are compared as given.
    If either resulting string is rejected by ``utils.validate_string``
    the score is 0.

    The substring comparison is only consulted when the longer string is
    at least 1.5 times the length of the shorter one; its result is scaled
    by 0.9 before being compared with the whole-string score.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to ``utils.full_process``
    :param full_process: when false, skip ``utils.full_process``
    :return: a similarity measure between 0 and 100
    """
    if full_process:
        left = utils.full_process(s1, force_ascii=force_ascii)
        right = utils.full_process(s2, force_ascii=force_ascii)
    else:
        left, right = s1, s2

    # The second string is only validated when the first one passed.
    if not (utils.validate_string(left) and utils.validate_string(right)):
        return 0

    whole_score = fuzz.ratio(left, right)

    longest = max(len(left), len(right))
    shortest = min(len(left), len(right))
    if float(longest) / shortest < 1.5:
        # Similar lengths: the whole-string score stands on its own.
        return utils.intr(whole_score)

    # Lengths differ noticeably: let a discounted substring match compete.
    substring_score = fuzz.partial_ratio(left, right) * .9
    return utils.intr(max(whole_score, substring_score))
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring as a number between 0 and 100.

    The shorter input is compared against windows of the longer input,
    each window being as long as the shorter input (or less, when the
    window runs past the end of the longer input). One window is tried
    per matching block reported by ``SequenceMatcher``.

    :param s1: first sequence
    :param s2: second sequence
    :return: 100 as soon as a window scores above 0.995, otherwise the best
        window ratio multiplied by 100 and passed through ``utils.intr``
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    # On equal lengths the first argument is treated as the shorter one.
    shorter, longer = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    window = len(shorter)

    # Each matching block has the form (idx_in_shorter, idx_in_longer, size).
    # The best partial match lines up with at least one of those blocks,
    #   e.g. shorter = "abcd", longer = "XXXbcdeEEE"
    #   block = (1, 3, 3)
    #   best score === ratio("abcd", "Xbcd")
    window_scores = []
    for block in SequenceMatcher(None, shorter, longer).get_matching_blocks():
        # Shift the window so the block lines up; never start before index 0.
        start = max(block[1] - block[0], 0)
        candidate = longer[start:start + window]

        score = SequenceMatcher(None, shorter, candidate).ratio()
        if score > .995:
            return 100
        window_scores.append(score)

    return utils.intr(100 * max(window_scores))
def WRatio(s1, s2, force_ascii=True):
    """Return a weighted similarity measure between 0 and 100.

    Both strings are normalised with ``utils.full_process``; if either
    result is rejected by ``utils.validate_string`` the score is 0.
    Otherwise the plain ``ratio`` is compared against token-based scores:

    * longer/shorter length ratio below 1.5: ``token_sort_ratio`` and
      ``token_set_ratio``, each scaled by 0.95;
    * otherwise: ``partial_ratio`` scaled by the partial factor, and the
      partial token scorers scaled by 0.95 and then by the partial factor.
      The partial factor is 0.90, or 0.6 when the length ratio exceeds 8.

    The largest candidate is passed through ``utils.intr`` and returned.

    NOTE(review): a later ``def WRatio`` appears in this file; if both are
    at module level, that later definition replaces this one at import.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to ``utils.full_process`` and to the
        token-based scorers
    :return: a similarity measure between 0 and 100
    """
    left = utils.full_process(s1, force_ascii=force_ascii)
    right = utils.full_process(s2, force_ascii=force_ascii)

    # The second string is only validated when the first one passed.
    if not (utils.validate_string(left) and utils.validate_string(right)):
        return 0

    token_scale = .95
    whole_score = ratio(left, right)

    longest = max(len(left), len(right))
    shortest = min(len(left), len(right))
    length_ratio = float(longest) / shortest

    if length_ratio < 1.5:
        # Similar lengths: compare whole strings only.
        sort_score = token_sort_ratio(left, right, force_ascii=force_ascii) * token_scale
        set_score = token_set_ratio(left, right, force_ascii=force_ascii) * token_scale
        return utils.intr(max(whole_score, sort_score, set_score))

    # One string is much, much shorter than the other: discount harder.
    partial_scale = .6 if length_ratio > 8 else .90

    substring_score = partial_ratio(left, right) * partial_scale
    partial_sort_score = partial_token_sort_ratio(left, right, force_ascii=force_ascii) \
        * token_scale * partial_scale
    partial_set_score = partial_token_set_ratio(left, right, force_ascii=force_ascii) \
        * token_scale * partial_scale
    return utils.intr(max(whole_score, substring_score,
                          partial_sort_score, partial_set_score))
def WRatio(s1, s2, force_ascii=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit (return 0) if utils.validate_string rejects either
       processed string
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Pick the scaling factors from the string lengths. The defaults are:
       partial comparisons enabled, partial scale 0.90, token scale 0.60.
       The checks below run one after another, so a later check overrides
       what an earlier one set:

        * length difference of at most 1: use partials, partial scale 0.95,
          token scale 0.65
        * length difference of at most 2 and the longer string has more than
          6 characters: do not use partials
        * length difference of at least 3 and the longer string has more
          than 6 characters: use partials, partial scale 0.85
        * longer/shorter length ratio above 2: use partials, partial
          scale 0.65
        * longer/shorter length ratio above 8: partial scale 0.60

    #. Run the other ratio functions

        * if using partial ratio functions call partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio and
          scale all of these by the partial scale
        * otherwise call token_sort_ratio and token_set_ratio
        * all token based comparisons are scaled by the token scale
          (on top of any partial scalars)

    #. Take the highest value from these results, pass it through
       utils.intr and return it.

    NOTE(review): an earlier ``def WRatio`` appears in this file; if both
    are at module level, this later definition is the one left bound to
    the name after import.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: Allow only ascii characters (forwarded to
        utils.full_process)
    :type force_ascii: bool
    :return: the highest of the scaled scores, after utils.intr
    """

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    # scale applied to every token-based score
    unbase_scale = .60
    # scale applied to every partial (substring-based) score
    partial_scale = .90

    base = fuzz.ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # NOTE(review): the length checks below are read as independent,
    # sequential ifs (not nested), so later checks override earlier ones.

    # lengths differ by at most one character: keep partials, mild scaling
    if abs(len(p2) - len(p1)) <= 1:
        try_partial = True
        partial_scale = 0.95
        unbase_scale = 0.65
    # close lengths on a longer string: don't use partials
    # (this also overrides the previous check when both apply)
    if abs(len(p2) - len(p1)) <= 2 and max(len(p2), len(p1)) > 6:
        try_partial = False
    # clearly different lengths on a longer string: use partials
    if abs(len(p2) - len(p1)) >= 3 and max(len(p2), len(p1)) > 6:
        try_partial = True
        partial_scale = 0.85
    # one string is more than twice as long as the other
    if len_ratio > 2:
        try_partial = True
        partial_scale = 0.65
    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .60

    if try_partial:
        partial = fuzz.partial_ratio(p1, p2) * partial_scale
        # strings are already processed above, hence full_process=False
        ptsor = fuzz.partial_token_sort_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale
        ptser = fuzz.partial_token_set_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale

        return utils.intr(max(base, partial, ptsor, ptser))
    else:
        tsor = fuzz.token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = fuzz.token_set_ratio(p1, p2, full_process=False) * unbase_scale

        return utils.intr(max(base, tsor, tser))
def ratio(s1, s2):
    """Return the SequenceMatcher ratio of the two sequences, scaled by 100.

    :param s1: first sequence
    :param s2: second sequence
    :return: ``100 * SequenceMatcher(None, s1, s2).ratio()`` passed through
        ``utils.intr``
    """
    s1, s2 = utils.make_type_consistent(s1, s2)
    similarity = SequenceMatcher(None, s1, s2).ratio()
    return utils.intr(100 * similarity)
def _blend_match_score(topic_lost, desc_lost, topic_found, desc_found, img):
    """Blend the text and image similarity of one lost/found pair into one score."""
    ratio_topic = fuzz.partial_token_sort_ratio(topic_lost, topic_found,
                                                force_ascii=True, full_process=1)
    ratio_desc = fuzz.partial_token_sort_ratio(desc_lost, desc_found,
                                               force_ascii=True, full_process=1)
    # presumably ``img`` is an image similarity in the 0..1 range, brought
    # onto the same 0..100 scale as the text ratios -- TODO confirm
    ratio_img = 100 * img
    # Text average weighted by 1.8, image term by 0.2, then the sum is halved.
    return utils.intr(((((ratio_topic + ratio_desc) / 2) * 1.8) + ratio_img * 0.2) / 2)


def compare2thing(lost, found, choice):
    """Score one item against every item on the other side and rank them.

    With ``choice == 1`` the first lost item (``lost[0]``) is compared with
    every entry of ``found``; with ``choice == 2`` the first found item
    (``found[0]``) is compared with every entry of ``lost``. Each candidate
    gets a record ``{'keyDB', 'topic', 'per'}`` stored under its index, and
    the records are handed to ``topTen`` together with the list the single
    item came from.

    :param lost: indexable collection of lost items; each item is indexed
        with ``"topic"`` and ``"description"`` (plus ``'key'`` and ``'img'``
        when the lost items are the candidates)
    :param found: indexable collection of found items, same layout
    :param choice: 1 to match ``lost[0]`` against ``found``, 2 to match
        ``found[0]`` against ``lost``
    :return: whatever ``topTen`` returns, or ``None`` for any other
        ``choice``
    :raises IndexError: if the collection holding the single item is empty
    """
    if choice == 1:
        single, candidates, queried = lost[0], found, lost
    elif choice == 2:
        single, candidates, queried = found[0], lost, found
    else:
        # Unknown mode: nothing is compared.
        return None

    single_is_lost = choice == 1
    single_topic = single["topic"]
    single_desc = single["description"]

    list_ratios = {}
    # Index-based access is kept on purpose: it works for any container that
    # supports len() and integer indexing, exactly like the original loop.
    for i in range(len(candidates)):
        candidate = candidates[i]
        topic = candidate["topic"]
        desc = candidate["description"]
        key = candidate['key']
        img = candidate['img']

        # The lost-side text always goes first, whichever side is the candidate.
        if single_is_lost:
            score = _blend_match_score(single_topic, single_desc, topic, desc, img)
        else:
            score = _blend_match_score(topic, desc, single_topic, single_desc, img)

        list_ratios[i] = {
            'keyDB': key,
            "topic": topic,
            "per": score
        }

    return topTen(queried, list_ratios)