Beispiel #1
0
def WRatio(s1, s2):
    """Return a weighted similarity score for two strings as an int.

    Both inputs are passed through ``utils.full_process``. If either processed
    value fails ``utils.validate_string`` the score is 0. Otherwise the plain
    ``ratio`` competes with scaled token-based scores (partial variants when
    the lengths differ by a factor of 1.5 or more) and the largest value is
    returned, truncated with ``int``.
    """
    left = utils.full_process(s1)
    right = utils.full_process(s2)
    if not (utils.validate_string(left) and utils.validate_string(right)):
        return 0

    # weight applied to every score other than the plain ratio
    token_weight = .95

    plain = ratio(left, right)
    lengths = (len(left), len(right))
    length_factor = float(max(lengths)) / min(lengths)

    # strings of similar length: skip the partial scorers entirely
    if length_factor < 1.5:
        sort_score = token_sort_ratio(left, right) * token_weight
        set_score = token_set_ratio(left, right) * token_weight
        return int(max(plain, sort_score, set_score))

    # partial scores are discounted, and discounted harder when one string
    # is more than eight times longer than the other
    partial_weight = .6 if length_factor > 8 else .90

    partial_score = partial_ratio(left, right) * partial_weight
    sort_score = partial_token_sort_ratio(left, right) * token_weight * partial_weight
    set_score = partial_token_set_ratio(left, right) * token_weight * partial_weight
    return int(max(plain, partial_score, sort_score, set_score))
Beispiel #2
0
def WRatio(s1, s2):
    """Return a weighted similarity score for two strings as an int.

    Arguments:
        s1, s2 -- the values to compare; each is checked with
                  ``utils.validate_string`` and then passed through
                  ``utils.full_process``.

    Returns 0 when either input fails validation, before or after
    processing. Otherwise returns the largest of the plain ``ratio`` and the
    scaled token-based scores (partial variants when the lengths differ by a
    factor of 1.5 or more), truncated with ``int``.
    """
    if not utils.validate_string(s1):
        return 0
    if not utils.validate_string(s2):
        return 0

    p1 = utils.full_process(s1)
    p2 = utils.full_process(s2)

    # Validate again after processing: if full_process hands back an empty
    # string, min(len(p1), len(p2)) is 0 and the length ratio below would
    # raise ZeroDivisionError.
    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    unbase_scale = .95
    partial_scale = .90

    base = ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if len_ratio < 1.5:
        try_partial = False

    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .6

    if try_partial:
        partial = partial_ratio(p1, p2) * partial_scale
        ptsor = partial_token_sort_ratio(p1, p2) * unbase_scale * partial_scale
        ptser = partial_token_set_ratio(p1, p2) * unbase_scale * partial_scale

        return int(max(base, partial, ptsor, ptser))

    tsor = token_sort_ratio(p1, p2) * unbase_scale
    tser = token_set_ratio(p1, p2) * unbase_scale

    return int(max(base, tsor, tser))
Beispiel #3
0
def QRatio(s1, s2):
    """Return the plain ``ratio`` of the two inputs after processing.

    Returns 0, without processing anything, if either raw input fails
    ``utils.validate_string``.
    """
    for raw in (s1, s2):
        if not utils.validate_string(raw):
            return 0

    return ratio(utils.full_process(s1), utils.full_process(s2))
Beispiel #4
0
def QRatio(s1, s2):
    """Quick ratio: compare the processed forms of two inputs.

    The raw inputs are checked with ``utils.validate_string`` first; if
    either check fails the result is 0. Otherwise both inputs go through
    ``utils.full_process`` and the result of ``ratio`` is returned.
    """
    if not (utils.validate_string(s1) and utils.validate_string(s2)):
        return 0

    processed = [utils.full_process(raw) for raw in (s1, s2)]
    return ratio(*processed)
Beispiel #5
0
def QRatio(s1, s2, force_ascii=True):
    """Return the plain ``ratio`` of the two processed inputs.

    Arguments:
        s1, s2      -- the values to compare
        force_ascii -- forwarded to ``utils.full_process`` for both inputs

    Returns 0 if either processed value fails ``utils.validate_string``.
    """
    left = utils.full_process(s1, force_ascii=force_ascii)
    right = utils.full_process(s2, force_ascii=force_ascii)

    both_valid = utils.validate_string(left) and utils.validate_string(right)
    return ratio(left, right) if both_valid else 0
Beispiel #6
0
def QRatio(s1, s2, force_ascii=True):
    """Quick ratio of two inputs after ``utils.full_process``.

    ``force_ascii`` is passed straight through to ``utils.full_process``.
    The processed values are then checked with ``utils.validate_string``;
    a failed check yields 0, otherwise the result of ``ratio`` is returned.
    """
    processed = []
    for raw in (s1, s2):
        processed.append(utils.full_process(raw, force_ascii=force_ascii))

    # all() stops at the first processed value that fails validation
    if not all(utils.validate_string(value) for value in processed):
        return 0

    return ratio(*processed)
Beispiel #7
0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Score two strings by comparing their sets of whitespace-separated tokens.

    The tokens shared by both strings and the tokens unique to each are
    sorted and joined into three strings:
        <sorted_intersection>
        <sorted_intersection> <sorted_remainder of s1>
        <sorted_intersection> <sorted_remainder of s2>
    These are compared pairwise and the highest score is returned.

    Arguments:
        s1, s2      -- the values to compare
        partial     -- compare with ``partial_ratio`` when true, else ``ratio``
        force_ascii -- forwarded to the first ``utils.full_process`` pass

    Returns 0 if either processed value fails ``utils.validate_string``.
    Raises TypeError if ``s1`` or ``s2`` is None.
    """
    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(p1) and utils.validate_string(p2)):
        return 0

    # tokenise (the processed strings are run through full_process once more)
    tokens1 = set(utils.full_process(p1).split())
    tokens2 = set(utils.full_process(p2).split())

    shared = " ".join(sorted(tokens1 & tokens2))
    only_in_1 = " ".join(sorted(tokens1 - tokens2))
    only_in_2 = " ".join(sorted(tokens2 - tokens1))

    # strip removes the stray space left when one of the parts is empty
    sorted_sect = shared.strip()
    combined_1to2 = (shared + " " + only_in_1).strip()
    combined_2to1 = (shared + " " + only_in_2).strip()

    compare = partial_ratio if partial else ratio

    return max(
        compare(sorted_sect, combined_1to2),
        compare(sorted_sect, combined_2to1),
        compare(combined_1to2, combined_2to1),
    )
Beispiel #8
0
def _token_set(s1, s2, partial=True, force_ascii=True):
    """Compare two strings as unordered sets of tokens.

    Steps:
        - process both inputs and split them into sets of tokens
        - build "<sorted intersection>" plus, for each input,
          "<sorted intersection> <sorted tokens unique to that input>"
        - score the three strings pairwise and return the best score

    ``partial`` selects ``partial_ratio`` (true) or ``ratio`` (false) as the
    scoring function; ``force_ascii`` is forwarded to the first
    ``utils.full_process`` pass. Returns 0 when a processed value fails
    ``utils.validate_string``; raises TypeError when an input is None.
    """

    def _sorted_join(tokens):
        # Deterministic, space-separated rendering of a token set.
        return " ".join(sorted(tokens))

    if s1 is None:
        raise TypeError("s1 is None")
    if s2 is None:
        raise TypeError("s2 is None")

    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1) or not utils.validate_string(p2):
        return 0

    # pull tokens (second full_process pass kept as in the original flow)
    first_tokens = set(utils.full_process(p1).split())
    second_tokens = set(utils.full_process(p2).split())

    common_part = _sorted_join(first_tokens.intersection(second_tokens))
    first_rest = _sorted_join(first_tokens.difference(second_tokens))
    second_rest = _sorted_join(second_tokens.difference(first_tokens))

    # joined with a single space, then stripped in case a part was empty
    first_combined = " ".join([common_part, first_rest]).strip()
    second_combined = " ".join([common_part, second_rest]).strip()
    common_part = common_part.strip()

    if partial:
        scorer = partial_ratio
    else:
        scorer = ratio

    candidates = (
        (common_part, first_combined),
        (common_part, second_combined),
        (first_combined, second_combined),
    )
    return max([scorer(a, b) for a, b in candidates])
Beispiel #9
0
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Find the best matches for a query among a collection of choices.

    Arguments:
        query       -- an object representing the thing we want to find
        choices     -- an iterable of objects to score against the query; it
                        does not need to support len(), so generators work
        processor   -- f(CHOICE) --> PROCESSED, applied to every choice; the
                        result is what gets handed to the scorer. For example,
                        "processor = lambda x: x[0]" would score the first
                        element of each choice.
                        By default, utils.full_process is used.
        scorer      -- f(QUERY, PROCESSED) --> score; called with the query
                        first and the processed choice second.
                        By default, WRatio is used.
        limit       -- maximum number of results to return; it is used as a
                        slice bound, so None returns every choice

    Returns a list of (choice, score) tuples holding the original,
    unprocessed choices, ordered from highest to lowest score. Choices with
    equal scores keep their input order. Returns [] when choices is None or
    empty.
    """
    if choices is None:
        return []

    # default, turn whatever the choice is into a workable string
    if processor is None:
        processor = utils.full_process

    # default: wratio
    if scorer is None:
        scorer = WRatio

    scored = [(choice, scorer(query, processor(choice))) for choice in choices]

    # list.sort is stable, including with reverse=True, so ties stay in input order
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:limit]
Beispiel #10
0
def extract(query, choices, processor=None, scorer=None, limit=5):
    """Score every choice against the query and return the top matches.

    Arguments:
        query       -- an object representing the thing we want to find
        choices     -- a sized collection of objects to score
        processor   -- f(CHOICE) --> PROCESSED, whose output is passed to the
                        scorer; defaults to utils.full_process
        scorer      -- called as scorer(query, processed_choice) and expected
                        to return a sortable score; defaults to WRatio
        limit       -- slice bound applied to the sorted results

    Returns a list of (choice, score) tuples, best score first, or [] when
    choices is None or has length 0.
    """
    if choices is None:
        return []
    if len(choices) == 0:
        return []

    # fall back to the default processor / scorer only when none was supplied
    process = processor if processor is not None else (lambda x: utils.full_process(x))
    score_of = scorer if scorer is not None else WRatio

    scored = [(choice, score_of(query, process(choice))) for choice in choices]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:limit]
Beispiel #11
0
def _process_and_sort(s, force_ascii):
    """Process ``s`` and return its whitespace-separated tokens in sorted order.

    ``force_ascii`` is forwarded to ``utils.full_process``. The sorted tokens
    are joined with single spaces and the result is stripped.
    """
    processed = utils.full_process(s, force_ascii=force_ascii)

    ordered = processed.split()
    ordered.sort()

    return u" ".join(ordered).strip()
Beispiel #12
0
def _process_and_sort(s, force_ascii):
    """Return the processed form of ``s`` with its tokens sorted.

    The input goes through ``utils.full_process`` (with ``force_ascii``
    passed along), is split on whitespace, and the sorted tokens are
    re-joined with single spaces.
    """
    cleaned = utils.full_process(s, force_ascii=force_ascii)
    ordered_tokens = sorted(cleaned.split())
    joined = " ".join(ordered_tokens)
    return joined.strip()
Beispiel #13
0
def WRatio(s1, s2, force_ascii=True):
    """Return a weighted similarity score for two strings as an int.

    Arguments:
        s1, s2      -- the values to compare
        force_ascii -- forwarded to ``utils.full_process`` and to the
                       token-based scorers

    Returns 0 if either processed value fails ``utils.validate_string``.
    Otherwise returns the largest of the plain ``ratio`` and the scaled
    token-based scores, truncated with ``int``.
    """
    left = utils.full_process(s1, force_ascii=force_ascii)
    right = utils.full_process(s2, force_ascii=force_ascii)

    if not (utils.validate_string(left) and utils.validate_string(right)):
        return 0

    # weight applied to every score other than the plain ratio
    token_weight = .95

    plain = ratio(left, right)
    lengths = (len(left), len(right))
    length_factor = float(max(lengths)) / min(lengths)

    # similar lengths: only the full-string token scorers are consulted
    if length_factor < 1.5:
        sort_score = token_sort_ratio(left, right, force_ascii=force_ascii) * token_weight
        set_score = token_set_ratio(left, right, force_ascii=force_ascii) * token_weight
        return int(max(plain, sort_score, set_score))

    # partial scores get an extra discount, larger when one string is more
    # than eight times the length of the other
    partial_weight = .6 if length_factor > 8 else .90

    partial_score = partial_ratio(left, right) * partial_weight
    sort_score = (
        partial_token_sort_ratio(left, right, force_ascii=force_ascii)
        * token_weight * partial_weight
    )
    set_score = (
        partial_token_set_ratio(left, right, force_ascii=force_ascii)
        * token_weight * partial_weight
    )
    return int(max(plain, partial_score, sort_score, set_score))
Beispiel #14
0
def WRatio(s1, s2, force_ascii=True):
    """Combine several scorers into one weighted similarity score (an int).

    Both inputs are processed with ``utils.full_process`` (honouring
    ``force_ascii``); if either processed value fails
    ``utils.validate_string`` the score is 0. The plain ``ratio`` is always
    a candidate. Depending on how different the processed lengths are,
    either the full or the partial token scorers contribute scaled
    candidates. The best candidate is truncated with ``int`` and returned.
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1) or not utils.validate_string(p2):
        return 0

    unbase_scale = 0.95

    candidates = [ratio(p1, p2)]

    shorter, longer = sorted((len(p1), len(p2)))
    len_ratio = float(longer) / shorter

    if len_ratio < 1.5:
        # lengths are close: partial matching is skipped
        candidates.append(token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale)
        candidates.append(token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale)
    else:
        # one string is much, much shorter than the other -> heavier discount
        partial_scale = 0.6 if len_ratio > 8 else 0.90

        candidates.append(partial_ratio(p1, p2) * partial_scale)
        candidates.append(
            partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale * partial_scale
        )
        candidates.append(
            partial_token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale * partial_scale
        )

    return int(max(candidates))
def _token_print_set(s1, s2, partial=True, force_ascii=True, full_process=True):
    """Split two strings into token sets and print how the sets compare.

    Described steps:
        - treat the whitespace-separated tokens of each string as a set
        - construct two strings of the form:
            <sorted_intersection><sorted_remainder>
        - take ratios of those two strings
        - controls for unordered partial matches

    Arguments:
        s1, s2       -- the values to compare
        partial      -- not read anywhere in the visible body
        force_ascii  -- forwarded to utils.full_process when full_process is true
        full_process -- when true, both inputs are run through
                        utils.full_process; when false they are used as given

    Returns 0 if either (possibly processed) value fails
    utils.validate_string.

    NOTE(review): the visible body stops after building the combined strings.
    No ratio is computed and nothing is returned on the main path, so the
    function implicitly returns None there. The snippet looks truncated;
    confirm against the full source.
    """

    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # pull tokens
    tokens1 = set(p1.split())
    tokens2 = set(p2.split())

    # tokens shared by both inputs, and tokens unique to each side
    intersection = tokens1.intersection(tokens2)
    diff1to2 = tokens1.difference(tokens2)
    diff2to1 = tokens2.difference(tokens1)
    
    # debug output: writes the three token sets to stdout
    print(intersection)
    print(diff1to2)
    print(diff2to1)

    # sort so the joined strings do not depend on set iteration order
    sorted_sect = " ".join(sorted(intersection))
    sorted_1to2 = " ".join(sorted(diff1to2))
    sorted_2to1 = " ".join(sorted(diff2to1))

    combined_1to2 = sorted_sect + " " + sorted_1to2
    combined_2to1 = sorted_sect + " " + sorted_2to1

    # strip (removes the stray space left when one of the parts is empty)
    sorted_sect = sorted_sect.strip()
    combined_1to2 = combined_1to2.strip()
    combined_2to1 = combined_2to1.strip()
Beispiel #16
0
 def testCaseInsensitive(self):
     # The raw fixture strings must not score as a perfect match...
     raw_score = ratio(self.s1, self.s2)
     self.assertNotEqual(raw_score, 100)

     # ...but their processed forms must score exactly 100.
     processed_score = ratio(utils.full_process(self.s1), utils.full_process(self.s2))
     self.assertEqual(processed_score, 100)
Beispiel #17
0
 def testCaseInsensitive(self):
     # Unprocessed, the two fixture strings are expected to differ.
     self.assertNotEqual(ratio(self.s1, self.s2), 100)

     # After utils.full_process they are expected to compare as identical.
     processed = [utils.full_process(text) for text in (self.s1, self.s2)]
     self.assertEqual(ratio(*processed), 100)
Beispiel #18
0
 def test_fullProcess(self):
     # Smoke test: full_process must accept every fixture string without raising.
     for sample in self.mixed_strings:
         utils.full_process(sample)
Beispiel #19
0
def QRatio(s1, s2):
    """Return ``ratio`` of the two inputs after ``utils.full_process``.

    No validation is performed here; both inputs are processed and compared
    as they are.
    """
    return ratio(utils.full_process(s1), utils.full_process(s2))
Beispiel #20
0
 def test_fullProcess(self):
     # No assertions: the test passes as long as processing never raises
     # for any of the mixed fixture strings.
     for text in self.mixed_strings:
         utils.full_process(text)