def matches(self, query):
    """
    Check if the `query` HTTP response body looks like the failed login
    page represented by this instance.

    :param query: An HTTP response body
    :return: True if `query` is equal to (or close enough to) the failed
             login bodies stored in `self.body_a` and `self.body_b`,
             False otherwise.
    """
    # Exact matches are the cheapest check, so they go first
    if self.body_a == query or self.body_b == query:
        return True

    # They are really different, no need to calculate diff()
    if not fuzzy_equal(self.body_a, query, 0.60):
        return False

    # The diff between the two known bodies is calculated only once and
    # then kept in the instance for the following calls
    if self.diff_a_b is None:
        self.diff_a_b, _ = diff(self.body_a, self.body_b)

    _, query_side_diff = diff(self.body_a, query)

    # A very small diff is accepted right away in order to prevent issues
    # with CSRF tokens, which might be part of the HTTP response body, are
    # random (not removed by clean_body) and will "break" the diff.
    #
    # Anything larger has to be similar to the diff of the known bodies.
    if len(query_side_diff) < 64 or fuzzy_equal(self.diff_a_b, query_side_diff, 0.9):
        return True

    return False
def test_xml(self):
    """
    Comment the @SkipTest and then run:

    nosetests --with-timer -s -v -x w3af/core/controllers/misc/tests/test_diff.py

    Reads the `source.xml` and `target.xml` fixtures from `self.DATA` and
    runs diff() on their contents. There are no assertions: the test is
    only useful to see how long diff() takes.
    """
    source_path = os.path.join(self.DATA, 'source.xml')
    target_path = os.path.join(self.DATA, 'target.xml')

    # open() inside a `with` block closes the file handles as soon as the
    # content was read, file(...).read() left that to the garbage collector
    with open(source_path) as source_file:
        a = source_file.read()

    with open(target_path) as target_file:
        b = target_file.read()

    # This takes ~2.5 seconds on my workstation
    diff(a, b)
def test_diff_large_different_responses(self):
    """
    Same here, this test took 8 seconds to run, and now it takes 0.4704s!

    Two large bodies are created, line number `i` holds `i` times the
    letter "A". The only difference between them is one line, which holds
    "B" instead of "A" in the second body. diff() has to return exactly
    that line for each body, and has to do it in less than one second.
    """
    _max = 10000
    changed_line = _max - 3

    # str.join() creates each body in one step instead of growing a string
    # of several megabytes with += on every loop iteration
    large_file_1 = ''.join('A' * i + '\n' for i in range(_max))
    large_file_2 = ''.join(('B' if i == changed_line else 'A') * i + '\n'
                           for i in range(_max))

    # Only the diff() call is measured
    start = time.time()
    body1, body2 = diff(large_file_1, large_file_2)
    spent = time.time() - start

    self.assertGreater(1.0, spent)
    self.assertEqual(body1, 'A' * changed_line)
    self.assertEqual(body2, 'B' * changed_line)
def test_xml(self):
    """
    Before using https://pypi.org/project/diff-match-patch/ this test took
    around 2 seconds to run. Now it only takes 0.0056 sec!

    nosetests --with-timer -s -v -x w3af/core/controllers/misc/tests/test_diff.py

    Reads the `source.xml` and `target.xml` fixtures from `self.DATA` and
    asserts that diff() on their contents takes less than one second.
    """
    source_path = os.path.join(self.DATA, 'source.xml')
    target_path = os.path.join(self.DATA, 'target.xml')

    # open() inside a `with` block closes the file handles as soon as the
    # content was read, file(...).read() left that to the garbage collector
    with open(source_path) as source_file:
        a = source_file.read()

    with open(target_path) as target_file:
        b = target_file.read()

    # Reading the fixtures is not part of the measured time
    start = time.time()
    diff(a, b)
    spent = time.time() - start

    self.assertGreater(1.0, spent)
def test_xml(self):
    """
    Before using https://pypi.org/project/diff-match-patch/ this test took
    around 2 seconds to run. Now it only takes 0.0056 sec!

    nosetests --with-timer -s -v -x w3af/core/controllers/misc/tests/test_diff.py

    Reads the `source.xml` and `target.xml` fixtures from `self.DATA` and
    asserts that diff() on their contents takes less than 15 seconds.
    """
    source_path = os.path.join(self.DATA, 'source.xml')
    target_path = os.path.join(self.DATA, 'target.xml')

    # open() inside a `with` block closes the file handles as soon as the
    # content was read, file(...).read() left that to the garbage collector
    with open(source_path) as source_file:
        a = source_file.read()

    with open(target_path) as target_file:
        b = target_file.read()

    # Reading the fixtures is not part of the measured time
    start = time.time()
    diff(a, b)
    spent = time.time() - start

    self.assertGreater(15.0, spent)
def equal_with_limit(self, body1, body2, compare_diff=False):
    """
    Determines if two pages are equal using a ratio.

    :param body1: The first response body
    :param body2: The second response body
    :param compare_diff: When set, only the parts of the response bodies
                         which are different (the output of diff()) are
                         compared.
    :return: The result of fuzzy_equal() for the compared strings, using
             `self._eq_limit` as the limit.
    """
    first, second = body1, body2

    if compare_diff:
        first, second = diff(body1, body2)

    return fuzzy_equal(first, second, self._eq_limit)
def equal_with_limit(self, body1, body2, compare_diff=False):
    """
    Determines if two pages are equal using a ratio.

    :param body1: The first response body
    :param body2: The second response body
    :param compare_diff: When set, the output of diff() for both bodies is
                         compared instead of the complete bodies.
    :return: The result of relative_distance_boolean() for the compared
             strings, using `self._eq_limit` as the limit.
    """
    # Choose what to compare: the complete bodies or only their diff
    pair = diff(body1, body2) if compare_diff else (body1, body2)
    first, second = pair

    similar = relative_distance_boolean(first, second, self._eq_limit)

    self.debug('Result: %s' % similar)
    return similar
def equal_with_limit(self, body1, body2, compare_diff=False):
    """
    Determines if two pages are equal using a ratio.

    :param body1: The first response body
    :param body2: The second response body
    :param compare_diff: When set, the two bodies are replaced by the
                         output of diff() before the comparison.
    :return: The result of relative_distance_boolean() for the compared
             strings, using `self._eq_limit` as the limit.
    """
    left = body1
    right = body2

    if compare_diff:
        left, right = diff(left, right)

    outcome = relative_distance_boolean(left, right, self._eq_limit)

    # The outcome is sent to the debug log before it is returned
    message = "Result: %s" % outcome
    self.debug(message)

    return outcome
def equal_with_limit(self, body1, body2, compare_diff=False):
    """
    Determines if two pages are equal using a ratio.

    :param body1: The first response body
    :param body2: The second response body
    :param compare_diff: When set, the output of diff() for both bodies is
                         compared instead of the complete bodies.
    :return: The result of relative_distance_boolean() for the compared
             strings, using `self._eq_limit` as the limit.
    """
    if compare_diff:
        first, second = diff(body1, body2)
    else:
        first, second = body1, body2

    similar = relative_distance_boolean(first, second, self._eq_limit)

    # The limit and the result are logged. None is sent as the second
    # argument of self.debug(), just like the callers of this code expect
    message = 'Strings are similar enough with limit %s? %s' % (self._eq_limit,
                                                                similar)
    self.debug(message, None)

    return similar
def test_large_equal_responses(self):
    # A large body is compared with itself: diff() has to return two empty
    # strings and has to do it in less than one second.
    #
    # str.join() creates the body in one step instead of growing a string
    # of several megabytes with += on every loop iteration
    large_file = ''.join('A' * i + '\n' for i in range(10000))

    # The timer is stopped right after diff(), the assertions are not part
    # of what we want to measure
    start = time.time()
    body1, body2 = diff(large_file, large_file)
    spent = time.time() - start

    self.assertEqual(body1, '')
    self.assertEqual(body2, '')
    self.assertGreater(1.0, spent)
def equal_with_limit(self, body1, body2, compare_diff=False):
    """
    Determines if two pages are equal using a ratio.

    The result of the comparison and the time spent in this method are
    sent to the debug log.

    :param body1: The first response body
    :param body2: The second response body
    :param compare_diff: When set, the output of diff() for both bodies is
                         compared instead of the complete bodies.
    :return: The result of relative_distance_boolean() for the compared
             strings, using `self._eq_limit` as the limit.
    """
    began = time.time()

    first, second = body1, body2
    if compare_diff:
        first, second = diff(body1, body2)

    similar = relative_distance_boolean(first, second, self._eq_limit)

    if similar:
        verdict = 'ARE'
    else:
        verdict = 'ARE NOT'

    self.debug('Strings %s similar enough (limit: %s)' % (verdict, self._eq_limit))

    # The measured time includes the debug call above
    elapsed = time.time() - began
    self.debug('Took %.2f seconds to run equal_with_limit' % elapsed)

    return similar
def test_special_chars(self):
    # The first line of each input holds a tab or a "<", both are expected
    # back in the diff() output without any change
    expected = ('X\t', 'A<')
    result = diff('X\tB\nC', 'A<B\nC')

    self.assertEqual(result, expected)
def test_start(self):
    # Only the first line is different between the two inputs
    result = diff('X\nB\nC', 'A\nB\nC')

    self.assertEqual(result, ('X', 'A'))
def test_start(self):
    # The inputs have a different first word and then the same text
    a = 'yes 123abc'
    b = 'no 123abc'

    expected = ('yes', 'no')

    self.assertEqual(diff(a, b), expected)
def test_all_no_sep(self):
    # There are no new lines in the inputs, only the character in the
    # middle is different
    result = diff('ABC', 'AXC')

    self.assertEqual(result, ('B', 'X'))
def test_nono(self):
    # "123abc" is in both inputs, the text around it is different
    a = '123abc yes'
    b = 'no 123abc no'

    expected = ('yes', 'no no')

    self.assertEqual(diff(a, b), expected)
def _handle_large_http_responses(self, http_response, query, known_404, debugging_id):
    """
    Decide if a large HTTP response is a 404 by comparing diffs instead of
    complete response bodies.

    When HTTP response bodies are large the fuzzy_equal() will generate
    404 false positives. This is explained in a comment above, (search for
    "{header-4000bytes}").

    This method will handle that case by using three HTTP responses instead
    of two (which is the most common case). The three HTTP responses are:

        * known_404: The forced 404 generated by this class

        * query: The HTTP response we want to know if it is a 404

        * Another forced 404, which is requested by this method only when
          `known_404.diff` was not calculated before

    The two forced 404 are compared with diff(), the known 404 is compared
    with the query using diff(), and the two results are sent to
    fuzzy_equal_for_diff() to determine if the query is a 404.

    :param http_response: The HTTP response under analysis, it is used to
                          request the second forced 404 and to write the
                          debug message
    :param query: The object which holds the `body` and `normalized_path`
                  of the response under analysis
    :param known_404: The forced 404 response. The calculated diff is saved
                      in its `diff` and `diff_with_id` attributes
    :param debugging_id: An identifier which is written to the debug log
    :return: True if the query response is a 404!
    """
    # Make the algorithm easier to read
    first_404 = known_404

    # When the diff attribute is already set, at some point during the
    # execution of this scan we sent the second HTTP request and calculated
    # the diff. That result is used again in order to prevent more HTTP
    # requests from being sent to the server, and also to reduce CPU usage.
    if first_404.diff is None:
        # There is no previous knowledge that we can use: send the second
        # request and calculate the diff
        #
        # Send exclude=[first_404.url] to prevent the function from sending
        # an HTTP request to the same forced 404 URL
        second_404 = send_request_generate_404(self._uri_opener,
                                               http_response,
                                               debugging_id,
                                               exclude=[first_404.url])

        first_404.diff, _ = diff(first_404.body, second_404.body)
        first_404.diff_with_id = second_404.id
        self._404_responses[query.normalized_path] = first_404

    diff_of_404s = first_404.diff
    _, diff_of_query = diff(first_404.body, query.body)

    looks_like_404 = fuzzy_equal_for_diff(diff_of_404s, diff_of_query, IS_EQUAL_RATIO)

    # The two debug messages receive the same arguments, only the text
    # is different
    if looks_like_404:
        msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
               ' [similarity_ratio > %s with diff of 404]'
               ' [Request IDs: %s]')
        verdict = True
    else:
        msg = ('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
               ' [similarity_ratio < %s with diff of 404]'
               ' [Request IDs: %s]')
        verdict = False

    args = (http_response.get_url(),
            http_response.id,
            http_response.get_code(),
            len(http_response.get_body()),
            debugging_id,
            IS_EQUAL_RATIO,
            ', '.join([str(http_response.id),
                       str(first_404.id),
                       str(first_404.diff_with_id)]))

    om.out.debug(msg % args)
    return verdict
def test_middle(self):
    # Only the line in the middle is different between the two inputs
    result = diff('A\nB\nC', 'A\nX\nC')

    self.assertEqual(result, ('B', 'X'))
def test_empty(self):
    # Two empty inputs have to return two empty strings
    nothing = ''
    result = diff(nothing, nothing)

    self.assertEqual(result, ('', ''))
def _handle_large_http_responses(self, http_response, query, known_404, debugging_id):
    """
    Use the diff of two forced 404 responses to decide if a large HTTP
    response is a 404.

    When HTTP response bodies are large the fuzzy_equal() will generate
    404 false positives. This is explained in a comment above, (search for
    "{header-4000bytes}").

    This method will handle that case by using three HTTP responses instead
    of two (which is the most common case). The three HTTP responses are:

        * known_404: The forced 404 generated by this class

        * query: The HTTP response we want to know if it is a 404

        * Another forced 404, which is requested by this method only when
          `known_404.diff` was not calculated before

    The method will diff the two 404 responses, and one 404 response with
    the query response, then compare the results using
    fuzzy_equal_for_diff() to determine if the query is a 404.

    :param http_response: The HTTP response under analysis, it is used to
                          request the second forced 404 and to write the
                          debug message
    :param query: The object which holds the `body` and `normalized_path`
                  of the response under analysis
    :param known_404: The forced 404 response. The calculated diff is saved
                      in its `diff` and `diff_with_id` attributes
    :param debugging_id: An identifier which is written to the debug log
    :return: True if the query response is a 404!
    """
    def log_verdict(template):
        # Write one of the two debug messages, the arguments are the same
        # for both of them
        args = (http_response.get_url(),
                http_response.id,
                http_response.get_code(),
                len(http_response.get_body()),
                debugging_id,
                IS_EQUAL_RATIO,
                ', '.join([
                    str(http_response.id),
                    str(base_404.id),
                    str(base_404.diff_with_id)
                ]))
        om.out.debug(template % args)

    # Make the algorithm easier to read
    base_404 = known_404

    need_second_404 = base_404.diff is None

    if need_second_404:
        # Need to send the second request and calculate the diff, there is
        # no previous knowledge that we can use
        #
        # Send exclude=[base_404.url] to prevent the function from sending
        # an HTTP request to the same forced 404 URL
        extra_404 = send_request_generate_404(self._uri_opener,
                                              http_response,
                                              debugging_id,
                                              exclude=[base_404.url])

        base_404.diff, _ = diff(base_404.body, extra_404.body)
        base_404.diff_with_id = extra_404.id
        self._404_responses[query.normalized_path] = base_404

    # When the second request was not needed, the diff was calculated and
    # saved as an attribute at some point during the execution of this
    # scan. Using it again prevents more HTTP requests from being sent to
    # the server and reduces the CPU usage.
    diff_x = base_404.diff
    _, diff_y = diff(base_404.body, query.body)

    if fuzzy_equal_for_diff(diff_x, diff_y, IS_EQUAL_RATIO):
        log_verdict('"%s" (id:%s, code:%s, len:%s, did:%s) is a 404'
                    ' [similarity_ratio > %s with diff of 404]'
                    ' [Request IDs: %s]')
        return True

    log_verdict('"%s" (id:%s, code:%s, len:%s, did:%s) is NOT a 404'
                ' [similarity_ratio < %s with diff of 404]'
                ' [Request IDs: %s]')
    return False
def test_middle(self):
    # Only the fourth character is different between the two inputs
    a = '123456'
    b = '123a56'

    expected = ('4', 'a')

    self.assertEqual(diff(a, b), expected)
def test_middle_not_aligned(self):
    # The line in the middle is different and does not have the same
    # length in the two inputs
    result = diff('A\nB\nC', 'A\nXY\nC')

    self.assertEqual(result, ('B', 'XY'))