def fuzzy_equal_for_diff(diff_x, diff_y, is_equal_ratio):
    """
    Does a quick estimation to determine if the two strings (diff_x and diff_y)
    are fuzzy equal.

    Not using fuzzy_equal() directly to compare results of applying diff()
    because of CSRF tokens and other randomly generated tokens which were
    breaking the comparison.

    This function splits both inputs with split_by_sep(), passes the chunks
    through remove_hashes() (expected to drop those randomly generated
    strings), re-joins the remaining chunks with new lines and then does the
    compare.

    :param diff_x: Result of running diff() on responses A and B
    :param diff_y: Result of running diff() on responses B and C
    :param is_equal_ratio: The ratio to use when comparing the responses (0 to 1)
    :return: True if the two results of applying the diff() function are
             fuzzy equal (applying split_by_sep technique)
    """
    # Fast path: identical inputs are trivially equal, there is no need to
    # pay for splitting, hash removal and the fuzzy comparison.
    if diff_x == diff_y:
        return True

    split_x = split_by_sep(diff_x)
    split_y = split_by_sep(diff_y)

    split_x = remove_hashes(split_x)
    split_y = remove_hashes(split_y)

    # fuzzy_equal() is called with strings, so rebuild one from each list
    x = '\n'.join(split_x)
    y = '\n'.join(split_y)

    return fuzzy_equal(x, y, threshold=is_equal_ratio)
Example #2
0
def fuzzy_equal_for_diff(diff_x, diff_y, is_equal_ratio):
    """
    Quickly estimate whether two diff() outputs are "fuzzy equal".

    fuzzy_equal() is not applied to the raw diff() results because CSRF
    tokens and other randomly generated tokens were breaking the comparison.
    Instead both inputs are split with split_by_sep(), passed through
    remove_hashes() (expected to drop those random strings), re-joined with
    new lines and only then compared.

    :param diff_x: Result of running diff() on responses A and B
    :param diff_y: Result of running diff() on responses B and C
    :param is_equal_ratio: The ratio to use when comparing the responses (0 to 1)
    :return: True if the two diff() results are fuzzy equal (applying the
             split_by_sep technique)
    """
    # Identical inputs need no further processing
    if diff_x == diff_y:
        return True

    tokens_x, tokens_y = split_by_sep(diff_x), split_by_sep(diff_y)
    tokens_x, tokens_y = remove_hashes(tokens_x), remove_hashes(tokens_y)

    return fuzzy_equal('\n'.join(tokens_x),
                       '\n'.join(tokens_y),
                       threshold=is_equal_ratio)
Example #3
0
    def test_split_by_sep_perf(self):
        """
        Call split_by_sep() many times over a small set of sample inputs.

        There are no assertions: the test only fails if split_by_sep()
        raises an exception.
        """
        iterations = 1000
        samples = [
            unittest.__doc__,
            re.__doc__,
            '',
            'hello world<bye bye!',
        ]

        for _ in xrange(iterations):
            for sample in samples:
                split_by_sep(sample)
Example #4
0
def relative_distance(a_str, b_str):
    """
    Measures the "similarity" of two strings.

    Both strings are tokenized with split_by_sep() and the token sequences
    are compared using difflib's quick_ratio(). Depends on the algorithm we
    finally implement, but usually a return value greater than 0.75 means the
    strings are very similar.

    :param a_str: A string object
    :param b_str: A string object
    :return: A float with the similarity ratio (higher means more similar)
    """
    tokens_a = split_by_sep(a_str)
    tokens_b = split_by_sep(b_str)

    matcher = difflib.SequenceMatcher(None, tokens_a, tokens_b)
    return matcher.quick_ratio()
Example #5
0
def relative_distance(a_str, b_str):
    """
    Measures the "similarity" of the strings.

    The inputs are split into chunks with split_by_sep() and the resulting
    sequences are handed to difflib.SequenceMatcher; the value returned is
    its quick_ratio(). Depends on the algorithm we finally implement, but
    usually a return value over 0.7 means the strings are close matches.

    :param a_str: A string object
    :param b_str: A string object
    :return: A float with the similarity ratio (higher means more similar)
    """
    sequence_matcher = difflib.SequenceMatcher(None,
                                               split_by_sep(a_str),
                                               split_by_sep(b_str))
    similarity = sequence_matcher.quick_ratio()
    return similarity
Example #6
0
def relative_distance(a_str, b_str):
    """
    Measures the "similarity" of the strings.

    The common case compares the sets of space-separated words of each
    string: the size of the intersection divided by the size of the larger
    set. Depends on the algorithm we finally implement, but usually a return
    value over 0.6 means the strings are close matches.

    :param a_str: A string object
    :param b_str: A string object
    :return: A float with the similarity ratio (higher means more similar)
    """
    words_a = set(a_str.split(' '))
    words_b = set(b_str.split(' '))

    count_a = len(words_a)
    count_b = len(words_b)

    if min(count_a, count_b) > 1:
        # Both strings have at least two different words: compare the sets
        shared_words = words_a & words_b
        return 1.0 * len(shared_words) / max(count_a, count_b)

    #
    #   This is a rare case, where the http response body is one long
    #   non-space separated string. Fall back to comparing the sequences
    #   returned by split_by_sep().
    #
    matcher = difflib.SequenceMatcher(None, split_by_sep(a_str),
                                      split_by_sep(b_str))
    return matcher.quick_ratio()
Example #7
0
def relative_distance_boolean(a_str, b_str, threshold=0.6):
    """
    Indicates if the strings to compare are similar enough.

    This is an optimized check of whether the similarity ratio of the two
    strings (difflib quick_ratio() over their split_by_sep() sequences)
    reaches the threshold, i.e. ratio >= threshold. Several cheap checks
    are tried first so the expensive comparison is only run when needed.

    :param a_str: A string object
    :param b_str: A string object
    :param threshold: Float value indicating the expected "similarity". Must be
                      0 <= threshold <= 1.0
    :return: A boolean value
    """
    # Trivial thresholds: everything matches at 0, only equality at 1.0
    if threshold == 0:
        return True

    if threshold == 1.0:
        return a_str == b_str

    # Order the strings so that the second one is the longer of both
    if len(b_str) < len(a_str):
        shorter, longer = b_str, a_str
    else:
        shorter, longer = a_str, b_str

    shorter_len = len(shorter)
    longer_len = len(longer)

    # If any string is empty they are only similar when both are empty
    if longer_len == 0 or shorter_len == 0:
        return shorter_len == longer_len

    if longer_len == shorter_len and shorter == longer and threshold <= 1.0:
        return True

    # Cheap upper bound: if it is below the threshold we are done
    max_similarity = upper_bound_similarity(shorter, longer)
    if threshold > max_similarity:
        return False

    # Bad, we can't optimize anything here
    matcher = difflib.SequenceMatcher(None, split_by_sep(shorter),
                                      split_by_sep(longer))
    return threshold <= matcher.quick_ratio()
Example #8
0
 def test_split_by_sep_utf8(self):
     """split_by_sep() must accept a unicode string with non-ASCII chars."""
     # This call shouldn't raise UnicodeDecodeError
     split_by_sep(u'ąęż')
Example #9
0
 def test_split_by_sep_2(self):
     """Both '<' and new line characters act as separators."""
     expected = ['hello world', 'bye', 'bye!']
     self.assertEqual(split_by_sep('hello world<bye\nbye!'), expected)