Example #1
0
def WRatio(s1, s2):
    """Return an integer similarity score built from several ratio variants.

    Both inputs are run through ``utils.full_process``; if either processed
    string fails ``utils.validate_string`` the score is 0.  Otherwise the
    plain ``ratio`` is compared against scaled token-based (or partial)
    ratios and the largest value, truncated to ``int``, is returned.
    """
    first = utils.full_process(s1)
    second = utils.full_process(s2)
    if not (utils.validate_string(first) and utils.validate_string(second)):
        return 0

    # Every non-plain ratio is discounted by this factor.
    token_weight = .95

    plain_score = ratio(first, second)
    longest = max(len(first), len(second))
    shortest = min(len(first), len(second))
    length_ratio = float(longest) / shortest

    # Strings of similar length: skip the partial matchers entirely.
    if length_ratio < 1.5:
        sorted_score = token_sort_ratio(first, second) * token_weight
        set_score = token_set_ratio(first, second) * token_weight
        return int(max(plain_score, sorted_score, set_score))

    # One string is much, much shorter than the other: trust partials less.
    partial_weight = .6 if length_ratio > 8 else .90

    partial_score = partial_ratio(first, second) * partial_weight
    partial_sorted = partial_token_sort_ratio(first, second) * token_weight * partial_weight
    partial_set = partial_token_set_ratio(first, second) * token_weight * partial_weight
    return int(max(plain_score, partial_score, partial_sorted, partial_set))
Example #2
0
def WRatio(s1, s2):
    """Return an integer similarity score built from several ratio variants.

    Returns 0 when either raw input, or either string produced by
    ``utils.full_process``, fails ``utils.validate_string``.  Otherwise the
    plain ``ratio`` is compared against scaled token-based (or partial)
    ratios and the largest value, truncated to ``int``, is returned.
    """
    if not utils.validate_string(s1):
        return 0
    if not utils.validate_string(s2):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # Processing may leave nothing behind even though the raw input was
    # valid; re-validate so the length ratio below never divides by a
    # zero-length string.
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    unbase_scale = .95
    partial_scale = .90

    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if len_ratio < 1.5:
        try_partial = False

    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .6

    if try_partial:
        partial = partial_ratio(p1, p2) * partial_scale
        ptsor = partial_token_sort_ratio(p1, p2) * unbase_scale * partial_scale
        ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale

        return int(max(base, partial, ptsor, ptser))
    else:
        tsor = token_sort_ratio(p1, p2) * unbase_scale
        tser = token_set_ratio(p1, p2) * unbase_scale

        return int(max(base, tsor, tser))
Example #3
0
def QRatio(s1, s2):
    """Return ``ratio`` of the two strings after ``utils.full_process``.

    Returns 0 when either raw input, or either processed string, fails
    ``utils.validate_string``.
    """
    if not utils.validate_string(s1):
        return 0
    if not utils.validate_string(s2):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # A valid raw string can still fail validation once processed; treat
    # that the same way as invalid input instead of comparing it.
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    return ratio(p1, p2)
Example #4
0
def QRatio(s1, s2):
    """Quick ratio: compare the fully processed forms of ``s1`` and ``s2``.

    The score is 0 if either argument is rejected by
    ``utils.validate_string``, either before or after
    ``utils.full_process`` is applied; otherwise it is ``ratio(p1, p2)``.
    """
    # Reject unusable raw input before handing it to the processor.
    if not (utils.validate_string(s1) and utils.validate_string(s2)):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # Processing can turn a valid raw string into one that no longer
    # validates; score that as "no match" rather than comparing it.
    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    return ratio(p1, p2)
Example #5
0
def QRatio(s1, s2, force_ascii=True):
    """Return ``ratio`` of the processed strings, or 0 if either is invalid.

    ``force_ascii`` is forwarded to ``utils.full_process`` for both inputs.
    """
    processed = [
        utils.full_process(raw, force_ascii=force_ascii) for raw in (s1, s2)
    ]

    # Validation happens on the processed text, first string first.
    if not all(utils.validate_string(text) for text in processed):
        return 0

    return ratio(*processed)
Example #6
0
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio of two strings after ``utils.full_process``.

    Both inputs are processed (passing ``force_ascii`` through); when either
    processed string fails ``utils.validate_string`` the result is 0,
    otherwise it is ``ratio`` of the processed pair.
    """
    left = utils.full_process(s1, force_ascii=force_ascii)
    right = utils.full_process(s2, force_ascii=force_ascii)

    both_usable = utils.validate_string(left) and utils.validate_string(right)
    return ratio(left, right) if both_usable else 0
Example #7
0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Compare two strings as unordered sets of tokens.

    - process both strings and split them into whitespace-separated tokens
    - treat the tokens as sets
    - construct two strings of the form:
        <sorted_intersection><sorted_remainder>
    - take ratios of the intersection and those two strings
    - controls for unordered partial matches

    ``partial`` selects ``partial_ratio`` (True) or ``ratio`` (False) as the
    comparison function; ``force_ascii`` is forwarded to
    ``utils.full_process``.

    Returns the largest of the three pairwise ratios, or 0 when either
    processed string fails ``utils.validate_string``.

    Raises TypeError if ``s1`` or ``s2`` is None.
    """

    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens -- p1/p2 have already been through utils.full_process
    # above, so they are split directly instead of being processed again.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)

    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    # strip: the joining space is left dangling when either side is empty
    combined_1to2 = (sorted_sect + " " + sorted_1to2).strip()
    combined_2to1 = (sorted_sect + " " + sorted_2to1).strip()
    sorted_sect = sorted_sect.strip()

    ratio_func = partial_ratio if partial else ratio

    pairwise = [
        ratio_func(sorted_sect, combined_1to2),
        ratio_func(sorted_sect, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    ]
    return max(pairwise)
Example #8
0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Find all whitespace-separated tokens in each processed string...
        - treat them as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those two strings
        - controls for unordered partial matches

    ``partial`` chooses between ``partial_ratio`` and ``ratio``;
    ``force_ascii`` is passed to ``utils.full_process``.  Returns the best of
    the three pairwise scores, or 0 if a processed string does not validate.
    Raises TypeError when ``s1`` or ``s2`` is None.
    """

    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens; the strings were processed just above, so a second
    # utils.full_process pass is not needed before splitting.
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    shared = " ".join(sorted(tokens1 & tokens2))
    only_in_1 = " ".join(sorted(tokens1 - tokens2))
    only_in_2 = " ".join(sorted(tokens2 - tokens1))

    # Strip removes the dangling separator when one of the parts is empty.
    combined_1to2 = (shared + " " + only_in_1).strip()
    combined_2to1 = (shared + " " + only_in_2).strip()
    shared = shared.strip()

    if partial:
        ratio_func = partial_ratio
    else:
        ratio_func = ratio

    return max(
        ratio_func(shared, combined_1to2),
        ratio_func(shared, combined_2to1),
        ratio_func(combined_1to2, combined_2to1),
    )
Example #9
0
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    Both strings are processed with ``utils.full_process`` (honouring
    ``force_ascii``); 0 is returned when either processed string fails
    ``utils.validate_string``.
    """
    first = utils.full_process(s1, force_ascii=force_ascii)
    second = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(first) and utils.validate_string(second)):
        return 0

    # Discount applied to every score that is not the plain ratio.
    token_weight = .95

    plain_score = ratio(first, second)
    length_ratio = (
        float(max(len(first), len(second))) / min(len(first), len(second))
    )

    # Similar lengths: partial matching is not attempted.
    if length_ratio < 1.5:
        sorted_score = token_sort_ratio(
            first, second, force_ascii=force_ascii) * token_weight
        set_score = token_set_ratio(
            first, second, force_ascii=force_ascii) * token_weight
        return int(max(plain_score, sorted_score, set_score))

    # One string is much, much shorter than the other: discount harder.
    partial_weight = .6 if length_ratio > 8 else .90

    partial_score = partial_ratio(first, second) * partial_weight
    partial_sorted = partial_token_sort_ratio(
        first, second, force_ascii=force_ascii) * token_weight * partial_weight
    partial_set = partial_token_set_ratio(
        first, second, force_ascii=force_ascii) * token_weight * partial_weight

    return int(max(plain_score, partial_score, partial_sorted, partial_set))
Example #10
0
def WRatio(s1, s2, force_ascii=True):
    """Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    The best of several candidate scores (plain ratio plus scaled token or
    partial-token ratios) is returned as an ``int``; 0 is returned when
    either processed string fails ``utils.validate_string``.
    """
    a = utils.full_process(s1, force_ascii=force_ascii)
    b = utils.full_process(s2, force_ascii=force_ascii)

    for processed in (a, b):
        if not utils.validate_string(processed):
            return 0

    # The plain ratio is always a candidate and is never scaled.
    scores = [ratio(a, b)]

    len_a, len_b = len(a), len(b)
    len_ratio = float(max(len_a, len_b)) / min(len_a, len_b)

    unbase_scale = 0.95

    if len_ratio < 1.5:
        # Similar lengths: use whole-string token ratios, no partials.
        scores.append(token_sort_ratio(a, b, force_ascii=force_ascii) * unbase_scale)
        scores.append(token_set_ratio(a, b, force_ascii=force_ascii) * unbase_scale)
    else:
        # Partials are scaled down, more so when one string is much,
        # much shorter than the other.
        partial_scale = 0.6 if len_ratio > 8 else 0.90
        scores.append(partial_ratio(a, b) * partial_scale)
        scores.append(
            partial_token_sort_ratio(a, b, force_ascii=force_ascii) * unbase_scale * partial_scale
        )
        scores.append(
            partial_token_set_ratio(a, b, force_ascii=force_ascii) * unbase_scale * partial_scale
        )

    return int(max(scores))
def _token_print_set(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Token-set comparison variant that prints its intermediate token sets.

    Find all whitespace-separated tokens in each string...
        - treat them as a set
        - print the intersection and the two one-sided differences
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>

    When ``full_process`` is true the inputs are first run through
    ``utils.full_process`` (with ``force_ascii`` forwarded); otherwise they
    are used as given.  Returns 0 if either resulting string fails
    ``utils.validate_string``.

    NOTE(review): as listed here the body stops after the strip step -- no
    ratio is computed, ``partial`` is never read and there is no final
    return.  The listing looks truncated; confirm against the full source.
    """

    # Optionally normalise the inputs; callers that already processed them
    # can pass full_process=False to skip this step.
    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)
    
    # Debug output: shared tokens, then tokens unique to each side.
    print(intersection)
    print(diff1to2)
    print(diff2to1)

    # Sort so the joined strings do not depend on set iteration order.
    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    combined_1to2 = sorted_sect + " " + sorted_1to2
    combined_2to1 = sorted_sect + " " + sorted_2to1

    # strip (removes the dangling separator when one of the parts is empty)
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()
Example #12
0
 def test_validate_strings(self):
     """``utils.validate_string`` rejects None, empty and non-string values
     and accepts non-empty strings."""
     tester = None
     self.assertFalse(utils.validate_string(tester))
     tester = ""
     self.assertFalse(utils.validate_string(tester))
     tester = 0.00123
     self.assertFalse(utils.validate_string(tester))
     # No ``L`` suffix: that spelling is a syntax error on Python 3, while
     # Python 2 already treats a literal this large as a long.
     tester = 999999999999999999999999999999999
     self.assertFalse(utils.validate_string(tester))
     tester = 'a'
     self.assertTrue(utils.validate_string(tester))
     tester = "This is a perfectly valid string"
     self.assertTrue(utils.validate_string(tester))
     # Control characters and ANSI escape sequences must not invalidate it.
     tester = "This \n\ris \n\r\ra \n\r\t\033[49m\n \033[31mperfectly\033[39m \r\nvalid string"
     self.assertTrue(utils.validate_string(tester))
Example #13
0
 def test_validate_strings(self):
     """Check which values ``utils.validate_string`` rejects and accepts."""
     # Values expected to be rejected: None, the empty string, a float and
     # a very large integer.  The integer carries no ``L`` suffix because
     # that form does not parse on Python 3; Python 2 promotes a literal
     # of this size to long on its own.
     tester = None
     self.assertFalse(utils.validate_string(tester))
     tester = ""
     self.assertFalse(utils.validate_string(tester))
     tester = 0.00123
     self.assertFalse(utils.validate_string(tester))
     tester = 999999999999999999999999999999999
     self.assertFalse(utils.validate_string(tester))

     # Values expected to be accepted: any non-empty string, including one
     # containing control characters and ANSI escape sequences.
     tester = 'a'
     self.assertTrue(utils.validate_string(tester))
     tester = "This is a perfectly valid string"
     self.assertTrue(utils.validate_string(tester))
     tester = "This \n\ris \n\r\ra \n\r\t\033[49m\n \033[31mperfectly\033[39m \r\nvalid string"
     self.assertTrue(utils.validate_string(tester))