    def test_race_condition_with_timeout(self):
        urls = []
        search_string = 'success'

        path = '/timed_out_success'
        self.httpd.responses[path] = sleep_func(search_string, 4)
        urls.append(self.path_to_url(path))

        path = '/slow_success'
        self.httpd.responses[path] = sleep_func(search_string, 2)
        urls.append(self.path_to_url(path))

        for i in range(5):
            path = '/not_success_{0}'.format(i)
            self.httpd.responses[path] = 'nothing to see here'
            urls.append(self.path_to_url(path))

        path = '/fast_success'
        self.httpd.responses[path] = search_string
        expected_result = self.path_to_url(path)
        urls.append(expected_result)

        callback = contains_callback(search_string)
        result = solution.crawler(iter(urls), callback, 3)

        self.assertEqual(result, expected_result)
    def test_if_concurrent_with_sleep_in_server(self):
        '''
        Every slow response takes ~0.5 seconds. If the solution is not
        concurrent and tries the URLs one after another, it would run for
        more than 2.5 seconds, since there are 5 slow URLs in the work
        queue before the successful one.
        '''
        urls = []
        concurrency = 5

        def with_sleep(hndl):
            hndl.send_response(404)
            hndl.end_headers()
            time.sleep(0.5)
            hndl.wfile.write(bytes('not what you are searching for',
                                   'UTF-8'))

        for i in range(concurrency):
            path = '/not-there-{0}'.format(i)
            self.httpd.responses[path] = with_sleep
            urls.append(self.path_to_url(path))

        success_path = '/bingo'
        success_url = self.path_to_url(success_path)
        self.httpd.responses[success_path] = ('I made it in less than '
                                              '5 seconds')
        urls.append(success_url)

        callback = contains_callback('less than 5')

        start = time.time()
        result = solution.crawler(iter(urls), callback, concurrency)
        took = time.time() - start

        self.assertEqual(result, success_url)

        msg = 'This crawler is probably not a concurrent one.'
        self.assertTrue(took < 1, msg)
    def test_timeout_value(self):
        urls = []
        concurrency = 2

        def with_sleep(hndl):
            hndl.send_response(200)
            hndl.end_headers()
            time.sleep(4)
            hndl.wfile.write(bytes('I made it in less than 5 seconds',
                                   'UTF-8'))

        for i in range(concurrency):
            path = '/timeout-{0}'.format(i)
            self.httpd.responses[path] = with_sleep
            urls.append(self.path_to_url(path))

        success_path = '/bingo'
        success_url = self.path_to_url(success_path)
        self.httpd.responses[success_path] = ('I made it in less than '
                                              '5 seconds')
        urls.append(success_url)

        callback = contains_callback('less than 5')

        start = time.time()
        result = solution.crawler(iter(urls), callback, concurrency)
        took = time.time() - start

        self.assertEqual(result, success_url, 'Crawler did not time out')

        msg = 'This crawler probably does not time out slow responses.'
        self.assertTrue(took < 5, msg)
    def test_with_non_2xx_responses(self):
        search_for = 'some string'
        server_response = 'There should be some string in here'

        def create_err_handler(status_code):
            def error_handler(hndl):
                hndl.send_response(status_code)
                hndl.end_headers()
            return error_handler

        def return_409_and_the_string(hndl):
            hndl.send_response(409)
            hndl.end_headers()
            hndl.wfile.write(bytes(server_response, 'UTF-8'))

        self.httpd.responses = {
            '/500': create_err_handler(500),
            '/504': create_err_handler(504),
            '/401': create_err_handler(401),
            '/416': create_err_handler(416),
            '/409': return_409_and_the_string,
            '/bingo': server_response,
        }
        urls = [self.path_to_url(path) for path in self.httpd.responses]

        result = solution.crawler(iter(urls), contains_callback(search_for),
                                  5)

        self.assertEqual(result, self.path_to_url('/bingo'))
    def test_wrong_arguments_passed(self):
        with self.assertRaises(Exception):
            solution.crawler(iter([]), always_false, -1)

        with self.assertRaises(Exception):
            solution.crawler(iter([]), always_false, 0)

        with self.assertRaises(Exception):
            solution.crawler(iter(['http://google.com']), None, 3)

        with self.assertRaises(Exception):
            solution.crawler(None, always_false, 3)
    def test_no_result(self):
        contains_luck = contains_callback('You are lucky, FMI python 2015')
        urls = map(self.path_to_url, ["/1", "/2", "wrong.url!", "/3", "/5"])

        result = solution.crawler(iter(urls), contains_luck, 4)

        self.assertIsNone(result, ('Did not expect the string "you are '
                                   'lucky" in any of the URLs'))
    def test_with_many_wrong_urls(self):
        success_path = '/success'
        self.httpd.responses[success_path] = 'first not wrong url'

        urls = [self.path_to_url('/not-found-{0}'.format(i))
                for i in range(15)]
        urls += ["wrong.url!", "python://almost-an-URL",
                 "http://almost-a-domain"]
        success_url = self.path_to_url(success_path)
        urls.append(success_url)

        callback = contains_callback('not wrong')
        result = solution.crawler(iter(urls), callback, 5)

        self.assertEqual(result, success_url)
    def test_simple_operation(self):
        self.httpd.responses = {
            '/path1': 'Hello, lonely! How are you today?'
        }
        success_url = self.path_to_url('/path1')
        work_urls = [
            self.path_to_url('/path-not-here'),
            success_url,
            self.path_to_url('/ops'),
        ]
        callback = contains_callback('How are you today?')

        result = solution.crawler(iter(work_urls), callback, 3)

        self.assertEqual(result, success_url)
    def test_crawler_starts_no_more_than_expected_workers(self):
        urls = []
        for i in range(5):
            path = '/not_success_{0}'.format(i)
            self.httpd.responses[path] = sleep_func('nothing to see here',
                                                    0.100)
            urls.append(self.path_to_url(path))

        start = time.time()
        result = solution.crawler(iter(urls), always_false, 4)
        took = time.time() - start

        self.assertIsNone(result)

        # If it finished in less than 200ms, more than 4 workers must have
        # been running: 5 URLs at 100ms each need at least two rounds when
        # only 4 workers are available.
        self.assertTrue(took >= 0.200,
                        'crawler spawned more workers than expected')
    def test_when_callback_never_returns_true(self):
        self.httpd.responses = {
            '/500': 'cuddly',
            '/504': 'little',
            '/401': 'cat',
            '/416': 'which',
            '/409': 'eats',
            '/bingo': 'mice',
            '/355': 'but',
            '/356': 'likes',
            '/357': 'fish',
            '/358': 'too',
        }
        urls = [self.path_to_url(path) for path in self.httpd.responses]

        result = solution.crawler(iter(urls), always_false, 3)

        self.assertIsNone(result)
    def test_only_one_worker(self):
        self.httpd.responses = {
            '/500': 'cuddly',
            '/504': 'little',
            '/401': 'cat',
            '/416': 'which',
            '/409': 'eats',
            '/bingo': 'mice',
            '/355': 'but',
            '/356': 'likes',
            '/357': 'fish',
            '/358': 'too',
        }
        urls = [self.path_to_url(path) for path in self.httpd.responses]

        result = solution.crawler(iter(urls), contains_callback('mice'), 1)

        self.assertEqual(result, self.path_to_url('/bingo'))
    def test_race_condition(self):
        urls = []
        search_string = 'success'

        path = '/failure'
        self.httpd.responses[path] = 'failure'
        urls.append(self.path_to_url(path))

        path = '/slow_success'
        self.httpd.responses[path] = sleep_func(search_string, 1)
        urls.append(self.path_to_url(path))

        path = '/fast_success'
        self.httpd.responses[path] = sleep_func(search_string, 0.5)
        expected_result = self.path_to_url(path)
        urls.append(expected_result)

        callback = contains_callback(search_string)
        result = solution.crawler(iter(urls), callback, 3)

        self.assertEqual(result, expected_result)
    def test_crawling_stops_after_successful_callback(self):
        search_for = 'Ame-no-Murakumo-no-Tsurugi'
        server_response = ('Ame-no-Murakumo-no-Tsurugi was given to the '
                           'warrior')
        success_path = '/kusanagi'
        touched = []
        not_touched = []
        urls = []

        for i in range(10):
            path = '/touched-{0}'.format(i)
            self.httpd.responses[path] = 'Nothing to see here'
            urls.append(self.path_to_url(path))
            touched.append(path)

        self.httpd.responses[success_path] = server_response
        urls.append(self.path_to_url(success_path))
        touched.append(success_path)

        for i in range(10):
            path = '/not-touched-{0}'.format(i)
            self.httpd.responses[path] = 'Nothing to see here'
            urls.append(self.path_to_url(path))
            not_touched.append(path)

        result = solution.crawler(iter(urls), contains_callback(search_for),
                                  2)

        self.assertEqual(result, self.path_to_url(success_path))

        for path in touched:
            self.assertTrue(self.httpd.is_touched(path),
                            'crawler did not check {0}'.format(path))

        # Up to 2 (workers_count) URLs may already be in flight when the
        # successful callback fires, so only the rest are required to stay
        # untouched.
        for path in not_touched[2:]:
            self.assertFalse(
                self.httpd.is_touched(path),
                'crawler did check `{0}` when it was not '
                'supposed to'.format(path))
    def test_empty_iterator(self):
        result = solution.crawler(iter([]), always_true, 4)
        self.assertIsNone(result)
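
# ----------------------------------------------------------------------
# The tests above rely on a few helpers defined elsewhere in this module,
# and on `self.httpd`, a stub HTTP server whose `responses` dict maps a
# path either to a body string or to a handler callable, and whose
# `is_touched(path)` reports whether that path was ever requested. Below
# is a minimal sketch of what the helpers might look like, inferred purely
# from how the tests use them; the exact definitions in the real test
# module may differ.

def contains_callback(needle):
    # Builds a callback reporting whether `needle` occurs in the response
    # body the crawler hands to it.
    def callback(body):
        return needle in body
    return callback


def sleep_func(text, seconds):
    # Builds a request handler that stalls for `seconds` before answering
    # with `text`, simulating a slow server.
    def handler(hndl):
        hndl.send_response(200)
        hndl.end_headers()
        time.sleep(seconds)
        hndl.wfile.write(bytes(text, 'UTF-8'))
    return handler


def always_false(body):
    return False


def always_true(body):
    return True


# For reference, one possible thread-based crawler consistent with the
# contract these tests check: it validates its arguments, runs at most
# `workers_count` workers, skips broken URLs and non-2xx responses,
# applies a per-request timeout (REQUEST_TIMEOUT below is an assumed
# value), stops fetching once a body satisfies the callback, and returns
# the first matching URL. This is only a sketch of the expected behaviour,
# not the `solution` module under test.

import threading
import urllib.request

REQUEST_TIMEOUT = 3  # seconds; assumed, it only has to abandon 4s sleepers


def crawler(urls, callback, workers_count):
    if urls is None or callback is None or workers_count < 1:
        raise ValueError('crawler needs urls, a callback and workers >= 1')

    lock = threading.Lock()
    found = threading.Event()
    result = [None]

    def worker():
        while not found.is_set():
            with lock:
                url = next(urls, None)  # workers share one iterator
            if url is None:
                return
            try:
                with urllib.request.urlopen(url,
                                            timeout=REQUEST_TIMEOUT) as resp:
                    if resp.status // 100 != 2:
                        continue
                    body = resp.read().decode('utf-8', errors='replace')
            except Exception:
                # Invalid URLs, HTTP errors and timeouts are all skipped.
                continue
            if callback(body):
                with lock:
                    # The first worker to finish wins the race.
                    if result[0] is None:
                        result[0] = url
                found.set()

    threads = [threading.Thread(target=worker) for _ in range(workers_count)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return result[0]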