def fp_ratio(s1, s2, force_ascii=True, full_process=True):
    """Score the similarity of two sequences on a 0-100 scale.

    Combines fuzz.ratio with a damped fuzz.partial_ratio; the partial
    comparison is only consulted when the processed strings differ
    noticeably in length.

    :param s1: first sequence
    :param s2: second sequence
    :param force_ascii: strip non-ascii characters during processing
    :param full_process: run utils.full_process on both inputs first
    :return: integer score in [0, 100]; 0 if either string is invalid
    """
    if full_process:
        p1 = utils.full_process(s1, force_ascii=force_ascii)
        p2 = utils.full_process(s2, force_ascii=force_ascii)
    else:
        p1, p2 = s1, s2

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    base = fuzz.ratio(p1, p2)

    # Partial matching is only worthwhile when one string is clearly
    # longer; the 0.9 damping ensures only a full match can reach 100.
    length_skew = float(max(len(p1), len(p2))) / min(len(p1), len(p2))
    if length_skew >= 1.5:
        partial = fuzz.partial_ratio(p1, p2) * 0.9
        return utils.intr(max(base, partial))
    return utils.intr(base)
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio: full-process both inputs, then take a plain ratio.

    :param s1: first sequence
    :param s2: second sequence
    :param force_ascii: strip non-ascii characters during processing
    :return: ratio(processed s1, processed s2); 0 if either processed
        string fails validation
    """
    processed = [utils.full_process(s, force_ascii=force_ascii) for s in (s1, s2)]
    if not all(utils.validate_string(p) for p in processed):
        return 0
    return ratio(processed[0], processed[1])
def match_name(search_param: str, name: str) -> bool:
    """Fuzzy name match, adapted from fuzzywuzzy.token_set_ratio.

    :param search_param: user-supplied search text
    :param name: candidate name to match against
    :return: True if the names are considered a match
    """
    NAME_MATCH_THRESHOLD = 80

    def _clean(raw: str) -> str:
        # Normalize, apply the module translation table, then the
        # special-case substring replacements.
        text = utils.full_process(raw).translate(translation_table)
        for old, new in SPECIAL_REPLACE.items():
            text = text.replace(old, new)
        return text

    p1 = _clean(search_param)
    p2 = _clean(name)
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return False

    # Doesn't seem to be needed now but might be useful in the future
    # ALTERNATIVE_DIVIDER = ["-", "="]
    # for divider in ALTERNATIVE_DIVIDER:
    #     p1 = p1.replace(divider, " ")
    #     p2 = p2.replace(divider, " ")

    # Direct substring hit is an immediate match.
    if p1 in p2:
        return True

    # Token-set decomposition: shared tokens plus each side's leftovers.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())
    shared = " ".join(sorted(tokens1 & tokens2))
    combined_1to2 = (shared + " " + " ".join(sorted(tokens1 - tokens2))).strip()
    combined_2to1 = (shared + " " + " ".join(sorted(tokens2 - tokens1))).strip()
    shared = shared.strip()

    # Use the shared-token string first so "Okita Souji (Alter)" works as
    # expected: an "a b c" search_param matches "a b c d e" but not vice versa.
    if shared:
        return fuzz.ratio(shared, combined_1to2) > NAME_MATCH_THRESHOLD
    return fuzz.ratio(combined_2to1, combined_1to2) > NAME_MATCH_THRESHOLD
def _token_set(s1, s2, partial=True, force_ascii=True, use_avg=False):
    """Token-set similarity of s1 and s2.

    Splits both processed strings into token sets, builds
    <sorted intersection> + <sorted remainder> strings for each side,
    and compares the three resulting strings pairwise.

    :param s1: first sequence (must not be None)
    :param s2: second sequence (must not be None)
    :param partial: use fuzz.partial_ratio instead of fuzz.ratio
    :param force_ascii: strip non-ascii characters during processing
    :param use_avg: return the mean of the pairwise scores rather than
        the maximum
    :raises TypeError: if s1 or s2 is None
    :return: similarity score; 0 if either processed string is invalid
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # full_process is re-applied here to mirror upstream fuzzywuzzy.
    tokens1 = set(utils.full_process(p1).split())
    tokens2 = set(utils.full_process(p2).split())

    sorted_sect = " ".join(sorted(tokens1 & tokens2))
    combined_1to2 = (sorted_sect + " " + " ".join(sorted(tokens1 - tokens2))).strip()
    combined_2to1 = (sorted_sect + " " + " ".join(sorted(tokens2 - tokens1))).strip()
    sorted_sect = sorted_sect.strip()

    ratio_func = fuzz.partial_ratio if partial else fuzz.ratio
    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    ]
    return statistics.mean(pairwise) if use_avg else max(pairwise)
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Find all alphanumeric tokens in each string and compare as sets.

    Constructs, for each side, a string of the form
    <sorted_intersection><sorted_remainder>, then takes ratios of the
    three resulting strings pairwise — this controls for unordered
    partial matches.

    :param s1: first sequence
    :param s2: second sequence
    :param partial: use partial_ratio instead of ratio
    :param force_ascii: strip non-ascii characters during processing
    :return: best pairwise score; 0 if either processed string is invalid
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    # full_process is re-applied here to mirror upstream fuzzywuzzy.
    tokens1 = set(utils.full_process(p1).split())
    tokens2 = set(utils.full_process(p2).split())

    intersection = tokens1.intersection(tokens2)
    remainder1 = tokens1.difference(tokens2)
    remainder2 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(intersection))
    combined_1to2 = " ".join([sorted_sect, " ".join(sorted(remainder1))]).strip()
    combined_2to1 = " ".join([sorted_sect, " ".join(sorted(remainder2))]).strip()
    sorted_sect = sorted_sect.strip()

    if partial:
        compare = partial_ratio
    else:
        compare = ratio

    return max(
        compare(sorted_sect, combined_1to2),
        compare(sorted_sect, combined_2to1),
        compare(combined_1to2, combined_2to1),
    )
def WRatio(s1, s2, force_ascii=True):
    """Return a 0-100 similarity measure combining several algorithms.

    Uses plain ratio plus either token-based comparisons (similar-length
    strings) or length-damped partial comparisons (dissimilar lengths).

    :param s1: first sequence
    :param s2: second sequence
    :param force_ascii: strip non-ascii characters during processing
    :return: integer score in [0, 100]; 0 if either string is invalid
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    unbase_scale = 0.95
    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # Similar lengths: skip partials, use token-based comparisons only.
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    # One string much much shorter than the other: damp partials harder.
    partial_scale = 0.6 if len_ratio > 8 else 0.9

    partial = partial_ratio(p1, p2) * partial_scale
    ptsor = (partial_token_sort_ratio(p1, p2, force_ascii=force_ascii)
             * unbase_scale * partial_scale)
    ptser = (partial_token_set_ratio(p1, p2, force_ascii=force_ascii)
             * unbase_scale * partial_scale)
    return utils.intr(max(base, partial, ptsor, ptser))
def WRatio(s1, s2, force_ascii=True):
    """
    Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Compare the lengths of the strings and pick scale factors and
       whether to use partial comparisons; later length checks
       deliberately override earlier ones (see comments below)
    #. Run the other ratio functions

        * if using partials: partial_ratio, partial_token_sort_ratio and
          partial_token_set_ratio, all scaled by the chosen factors
        * otherwise: token_sort_ratio and token_set_ratio
        * token-based comparisons are additionally scaled by unbase_scale

    #. Take the highest of these results, round it and return it as an
       integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :return:
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    base = fuzz.ratio(p1, p2)

    shorter, longer = sorted((len(p1), len(p2)))
    diff = longer - shorter  # == abs(len(p2) - len(p1))
    len_ratio = float(longer) / shorter

    # Defaults; the checks below override them IN ORDER, so a later
    # condition intentionally wins over an earlier one.
    try_partial = True
    unbase_scale = 0.60
    partial_scale = 0.90

    if diff <= 1:
        try_partial, partial_scale, unbase_scale = True, 0.95, 0.65
    if diff <= 2 and longer > 6:
        # Overrides the diff <= 1 branch for longer strings.
        try_partial = False
    if diff >= 3 and longer > 6:
        try_partial, partial_scale = True, 0.85
    if len_ratio > 2:
        try_partial, partial_scale = True, 0.65
    if len_ratio > 8:
        # One string is much much shorter than the other.
        partial_scale = 0.60

    if try_partial:
        partial = fuzz.partial_ratio(p1, p2) * partial_scale
        ptsor = (fuzz.partial_token_sort_ratio(p1, p2, full_process=False)
                 * unbase_scale * partial_scale)
        ptser = (fuzz.partial_token_set_ratio(p1, p2, full_process=False)
                 * unbase_scale * partial_scale)
        return utils.intr(max(base, partial, ptsor, ptser))

    tsor = fuzz.token_sort_ratio(p1, p2, full_process=False) * unbase_scale
    tser = fuzz.token_set_ratio(p1, p2, full_process=False) * unbase_scale
    return utils.intr(max(base, tsor, tser))