Example #1
0
def fp_ratio(s1, s2, force_ascii=True, full_process=True):
    """
    Score the similarity of two strings between 0 and 100, using
    fuzz.ratio and, for strings of clearly different length,
    fuzz.partial_ratio.

    :param s1: first string to compare
    :param s2: second string to compare
    :param force_ascii: forwarded to utils.full_process
    :param full_process: when true, both inputs are run through
        utils.full_process before scoring; otherwise they are used as given
    :return: 0 if either (processed) string fails utils.validate_string,
        otherwise the best score passed through utils.intr
    """
    left = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
    right = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

    # Short-circuit: the second string is only validated when the first passes.
    if not (utils.validate_string(left) and utils.validate_string(right)):
        return 0

    # Weight applied to the partial score so that it ranks below a full match.
    partial_weight = .9

    whole_score = fuzz.ratio(left, right)
    shorter_len, longer_len = sorted((len(left), len(right)))

    # Strings of similar length are scored on the plain ratio only.
    if float(longer_len) / shorter_len < 1.5:
        return utils.intr(whole_score)

    weighted_partial = fuzz.partial_ratio(left, right) * partial_weight
    return utils.intr(max(whole_score, weighted_partial))
def partial_ratio(s1, s2):
    """Return the ratio of the most similar substring
    as a number between 0 and 100.

    The shorter input is compared against windows of the longer input that
    have the shorter input's length; the best window score is returned.
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    # On a length tie s1 is treated as the shorter string.
    needle, haystack = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
    window = len(needle)

    # Each matching block has the form (idx_in_needle, idx_in_haystack, len).
    # The best partial match lines up with at least one of those blocks,
    #   e.g. needle = "abcd", haystack = XXXbcdeEEE
    #   block = (1, 3, 3)
    #   best score === ratio("abcd", "Xbcd")
    window_scores = []
    for needle_idx, haystack_idx, _size in SequenceMatcher(
            None, needle, haystack).get_matching_blocks():
        # Shift the window left so the block aligns, clamped at the start.
        start = max(haystack_idx - needle_idx, 0)
        candidate = haystack[start:start + window]

        similarity = SequenceMatcher(None, needle, candidate).ratio()
        if similarity > .995:
            # Close enough to perfect: stop searching.
            return 100
        window_scores.append(similarity)

    return utils.intr(100 * max(window_scores))
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    Both inputs are run through utils.full_process first; 0 is returned if
    either result fails utils.validate_string. The plain ratio is then
    compared with token-based scores (scaled by 0.95). When one string is
    at least 1.5 times as long as the other, the partial variants are used
    instead and additionally scaled by 0.9 (0.6 when the length ratio
    exceeds 8). The highest value is returned via utils.intr.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    # Short-circuit: p2 is only validated when p1 passes.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    # Token-based scores are always discounted relative to the plain ratio.
    unbase_scale = .95

    base = ratio(p1, p2)
    shorter_len, longer_len = sorted((len(p1), len(p2)))
    len_ratio = float(longer_len) / shorter_len

    # Similar lengths: compare whole strings only, no partial matching.
    if len_ratio < 1.5:
        tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    # A much, much shorter string makes partial matches far less meaningful.
    partial_scale = .6 if len_ratio > 8 else .90

    partial = partial_ratio(p1, p2) * partial_scale
    ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale
    ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \
        * unbase_scale * partial_scale

    return utils.intr(max(base, partial, ptsor, ptser))
Example #4
0
def WRatio(s1, s2, force_ascii=True):
    """
    Return a measure of the sequences' similarity between 0 and 100, using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit (return 0) if either result fails utils.validate_string
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Pick the scales from the string lengths; later rules override
       earlier ones:
        * defaults: partial scale 0.90, token ("unbase") scale 0.60
        * lengths differ by at most 1: partial scale 0.95, token scale 0.65
        * lengths differ by 3 or more and the longer string has more than
          6 characters: partial scale 0.85
        * one string is more than 2 times as long as the other:
          partial scale 0.65
        * one string is more than 8 times as long as the other:
          partial scale 0.60

    #. Run the other ratio functions
        * if the lengths differ by at most 2 and the longer string has more
          than 6 characters, call token_sort_ratio and token_set_ratio,
          scaled by the token scale
        * otherwise call partial_ratio (scaled by the partial scale),
          partial_token_sort_ratio and partial_token_set_ratio
          (scaled by the token scale and then the partial scale)

    #. Take the highest value from these results
       and return it through utils.intr.

    :param s1:
    :param s2:
    :param force_ascii: forwarded to utils.full_process
    :type force_ascii: bool
    :return:
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    # Short-circuit: p2 is only validated when p1 passes.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    base = fuzz.ratio(p1, p2)

    longest = max(len(p1), len(p2))
    shortest = min(len(p1), len(p2))
    len_ratio = float(longest) / shortest
    length_gap = abs(len(p2) - len(p1))

    # Partials are skipped only for longer strings of nearly equal length.
    # A length ratio above 2 cannot coincide with that case (a gap of at
    # most 2 with the longest at 7 or more bounds the ratio by 7/5), so no
    # later rule turns partials back on.
    use_partials = not (length_gap <= 2 and longest > 6)

    # Scale cascade: each rule below overrides the ones above it.
    partial_scale = .90
    unbase_scale = .60
    if length_gap <= 1:
        partial_scale = 0.95
        unbase_scale = 0.65
    if length_gap >= 3 and longest > 6:
        partial_scale = 0.85
    if len_ratio > 2:
        partial_scale = 0.65
    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .60

    if not use_partials:
        tsor = fuzz.token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = fuzz.token_set_ratio(p1, p2, full_process=False) * unbase_scale
        return utils.intr(max(base, tsor, tser))

    partial = fuzz.partial_ratio(p1, p2) * partial_scale
    ptsor = fuzz.partial_token_sort_ratio(p1, p2, full_process=False) \
        * unbase_scale * partial_scale
    ptser = fuzz.partial_token_set_ratio(p1, p2, full_process=False) \
        * unbase_scale * partial_scale

    return utils.intr(max(base, partial, ptsor, ptser))
Example #5
0
def ratio(s1, s2):
    """Return the SequenceMatcher similarity of s1 and s2, scaled by 100
    and passed through utils.intr.

    Both inputs are first normalised with utils.make_type_consistent.
    """
    s1, s2 = utils.make_type_consistent(s1, s2)

    similarity = SequenceMatcher(None, s1, s2).ratio()
    return utils.intr(100 * similarity)
Example #6
0
def _match_percentage(lost_item, found_item, img_score):
    """Blend the text similarity of a lost/found pair with an image score.

    The topic and description of the two records are each compared with
    fuzz.partial_token_sort_ratio (lost record first, found record second);
    the result is ((mean_text_score * 1.8) + (100 * img_score) * 0.2) / 2,
    passed through utils.intr.

    :param lost_item: record with "topic" and "description" entries
    :param found_item: record with "topic" and "description" entries
    :param img_score: image score of the candidate record; it is multiplied
        by 100 here (presumably a 0-1 similarity - confirm against caller)
    """
    topic_score = fuzz.partial_token_sort_ratio(lost_item["topic"],
                                                found_item["topic"],
                                                force_ascii=True,
                                                full_process=True)
    desc_score = fuzz.partial_token_sort_ratio(lost_item["description"],
                                               found_item["description"],
                                               force_ascii=True,
                                               full_process=True)
    image_score = 100 * img_score

    return utils.intr(((((topic_score + desc_score) / 2) * 1.8) +
                       image_score * 0.2) / 2)


def compare2thing(lost, found, choice):
    """Score one reference record against every candidate and rank them.

    Each record is expected to provide "topic", "description", 'key' and
    'img' entries (the reference record's 'key' and 'img' are not read
    here).

    :param lost: indexable collection of lost-item records
    :param found: indexable collection of found-item records
    :param choice: 1 compares lost[0] against every entry of found and
        returns topTen(lost, scores); 2 compares found[0] against every
        entry of lost and returns topTen(found, scores)
    :return: the result of topTen; for any other choice no branch visible
        here applies and the function falls through (returns None)
    """
    # Maps candidate index -> {'keyDB', "topic", "per"} for topTen.
    scores = {}
    if choice == 1:
        reference = lost[0]
        # Indexed access is kept (rather than iterating the collection
        # directly) so any integer-indexable container keeps working.
        for i in range(len(found)):
            candidate = found[i]
            scores[i] = {
                'keyDB': candidate['key'],
                "topic": candidate["topic"],
                "per": _match_percentage(reference, candidate,
                                         candidate['img']),
            }
        return topTen(lost, scores)

    elif choice == 2:
        reference = found[0]
        for i in range(len(lost)):
            candidate = lost[i]
            scores[i] = {
                'keyDB': candidate['key'],
                "topic": candidate["topic"],
                # Argument order matters: the lost record always goes first.
                "per": _match_percentage(candidate, reference,
                                         candidate['img']),
            }
        return topTen(found, scores)