def test_crawl_001(self):
    """Must call `api.get_crawl_queue()`.

    The queue API is mocked to return an empty list, the crawler is run
    under a bounded timeout, and the test then checks that the mocked
    queue API was invoked.
    """
    queue_api = 'api.get_crawl_queue'
    smock.mock(queue_api, returns=[])

    # Passing `False` as the exception makes the `with` block swallow the
    # timeout, so execution simply continues once the deadline is reached.
    with eventlet.Timeout(DEFAULT_TIMEOUT, False):
        self.client.crawl()

    queue_was_requested = smock.is_called(queue_api)
    self.assertTrue(queue_was_requested)
def test_get_001(self):
    """Must call `storage.meta.query_new_random`.

    The call is observed through the `StorageConnection.query_new_random`
    mock, which always returns an empty list.
    """
    storage_query = 'StorageConnection.query_new_random'

    # POST request asking the manager for up to 10 queue items.
    request = webob.Request.blank('/')
    request.method = 'POST'
    request.body = "limit=10"

    smock.mock(storage_query, returns=[])
    self.manager.active = True
    self.manager.crawl_queue(request)

    self.assertTrue(smock.is_called(storage_query))
def test_get_003(self):
    """Must return no more than requested items.

    Sends a POST with ``limit=2`` while the mocked storage is able to hand
    out three items (one per `query_new_random` call) and checks that the
    manager returns at most two of them.
    """
    req = webob.Request.blank('/')
    req.method = 'POST'
    req.body = "limit=2"

    # Each mocked storage call pops one item; once the pool is exhausted
    # the mock answers with an empty list.
    pending = [
        {'url': "http://url1/", 'visited': None},
        {'url': "http://url2/", 'visited': None},
        {'url': "http://url3/", 'visited': None},
    ]
    smock.mock('StorageConnection.query_new_random',
               returns_func=lambda *a, **kw: [pending.pop()] if pending else [])

    # Activate the manager before querying, exactly as the sibling tests
    # test_get_001 and test_get_002 do.  What the flag gates inside Manager
    # is not visible from this file; without it the `<= 2` check below may
    # be satisfied by an empty result and prove nothing.
    self.manager.active = True
    result = self.manager.crawl_queue(req)

    self.assertTrue(len(result) <= 2,
                    u"Manager returned more items than requested.")
def setUp(self):
    """Install manager settings, build the `Manager` and mock storage init.

    Overrides the `prefetch`, `postreport`, `storage` and `api` entries of
    `settings`, stores a fresh `Manager` on `self.manager`, and replaces
    `StorageConnection.__init__` with a mock returning None.
    """
    settings.api = {'max_queue_limit': 100}
    settings.storage = {'max_connections': 1}
    settings.prefetch = {
        'queue_size': 10,
        'get_timeout': 0.01,
        'single_limit': 5,
        'cache_timeout': 60,
    }
    settings.postreport = {
        'queue_size': 10,
        'flush_size': 1,
        'flush_delay': 0.01,
    }

    # NOTE(review): the Manager is created *before* StorageConnection.__init__
    # is mocked.  Presumably Manager() does not open a storage connection
    # eagerly -- confirm, otherwise the mock is installed too late.
    self.manager = Manager()

    # StorageConnection mock
    smock.mock('StorageConnection.__init__', returns=None)
def test_crawl_004(self):
    """Must handle invalid URL with empty port number.

    The crawl queue is mocked to hand out a URL whose port part is empty
    (``http://localhost:/...``).  The mocked `api.report_result` checks
    that the report for that URL carries an 'Error' result, and the test
    finally verifies that a report was actually made.
    """
    bad_url = "http://localhost:/test_crawl_004_link"

    def mock_get_crawl_queue(_limit):
        # Hands out the same malformed URL on every call.
        return [{'url': bad_url, 'visited': None, 'links': []}]

    def mock_report_result(report):
        self.assertTrue('Error' in report['result'])
        self.assertEqual(report['url'], bad_url)

    smock.mock('api.get_crawl_queue', returns_func=mock_get_crawl_queue)
    smock.mock('api.report_result', returns_func=mock_report_result)
    # This test installs no `httplib2.Http.request` mock of its own.

    # Passing `False` as the exception makes the `with` block swallow the
    # timeout instead of propagating it.
    with eventlet.Timeout(DEFAULT_TIMEOUT, False):
        self.client.crawl()

    # The checks above only run if the crawler reports something.  Assert
    # that it did (as test_crawl_002 does), so the test cannot pass merely
    # because no report was ever produced.
    self.assertTrue(smock.is_called('api.report_result'))
def test_get_002(self):
    """Must return list of items fetched from storage.

    The mocked storage hands out three items, one per call, and the test
    compares the sorted expected items with the sorted manager result.
    """
    expected = [
        {'url': "http://url1/", 'visited': None},
        {'url': "http://url2/", 'visited': None},
        {'url': "http://url3/", 'visited': None},
    ]
    remaining = list(expected)

    def hand_out_one(*_args, **_kwargs):
        # One item per storage call; an empty list once all are consumed.
        if remaining:
            return [remaining.pop()]
        return []

    request = webob.Request.blank('/')
    request.method = 'POST'
    request.body = "limit=10"

    smock.mock('StorageConnection.query_new_random', returns_func=hand_out_one)
    self.manager.active = True
    fetched = self.manager.crawl_queue(request)

    self.assertEqual(sorted(expected), sorted(fetched))
def test_crawl_003(self):
    """Must make no more than 5 simultaneous connections to single server.

    The limit actually checked is `self.client.max_connections_per_host`.
    The queue is pre-filled with twice that many copies of one item, every
    mocked HTTP request records the current active-connection count and
    then sleeps briefly, and the recorded peak is compared to the limit.
    """
    REQUEST_PAUSE = 0.05
    NUM_ITEMS = self.client.max_connections_per_host * 2
    queued_item = {'url': "http://localhost/test_crawl_003_link", 'visited': None, 'links': []}
    # Mutable holder so the nested mock can update the observed peak.
    peak = {'max_count': 0}

    def slow_request_mock(url, *args, **kwargs): # pylint: disable-msg=W0613
        # NOTE(review): the queued URL names `localhost` while the count is
        # queried for '127.0.0.1' -- presumably the crawler tracks
        # connections by resolved address; confirm.
        active_now = self.client.get_active_connections_count('127.0.0.1')
        if active_now > peak['max_count']:
            peak['max_count'] = active_now
        # Keep the "connection" open long enough for others to overlap.
        eventlet.sleep(REQUEST_PAUSE)
        return make_http_response(404), ""

    smock.mock('api.get_crawl_queue', returns=[])
    smock.mock('api.report_result', returns=None)
    smock.mock('httplib2.Http.request', returns_func=slow_request_mock)

    # prepopulate the queue
    for _ in xrange(NUM_ITEMS):
        self.client.queue.put(queued_item)

    with eventlet.Timeout(DEFAULT_TIMEOUT, False):
        self.client.crawl()

    queue_drained = self.client.queue.empty()
    self.assertTrue(queue_drained,
                    u"Crawler didn't consume all queue in allocated time.")

    stopped_in_time = self.client.graceful_stop(timeout=NUM_ITEMS * REQUEST_PAUSE)
    self.assertTrue(stopped_in_time,
                    u"Crawler didn't stop in allocated time.")

    self.assertTrue(peak['max_count'] > 0,
                    u"No connections started at all.")
    self.assertTrue(peak['max_count'] <= self.client.max_connections_per_host,
                    u"Started too many connections.")
def test_put_001(self):
    """Must accept one valid report item.

    Builds a PUT request whose body is a JSON-encoded report, mocks the
    storage methods listed below, and passes the request to
    `Manager.report_result`.  The test has no explicit assertion; it fails
    only if that call raises.
    """
    url = "http://localhost/manager-test_put_001-url"
    report = {
        'url': url,
        'visited': datetime.now().strftime(TIME_FORMAT),
        'status_code': 200,
        'content': "test content",
    }

    request = webob.Request.blank('/')
    request.method = 'PUT'
    request.body = json.dumps(report)

    # Mocked storage calls and their canned return values, installed in
    # this order.
    storage_mocks = [
        ('StorageConnection.save_content', None),
        ('StorageConnection.query_all_by_url_one', {'url': url, 'visited': None}),
        ('StorageConnection.save', None),
        ('StorageConnection.update', None),
    ]
    for target, canned_value in storage_mocks:
        smock.mock(target, returns=canned_value)

    self.manager.report_result(request)
def test_crawl_002(self):
    """Must call `httplib2.Http.request` and `api.report_result`.

    The crawl queue is mocked to hand out a single URL and the HTTP layer
    to answer 404 with an empty body.  The mocked `api.report_result`
    checks the reported URL, and the test then asserts that both the HTTP
    mock and the report mock were called.
    """
    link = "http://localhost/test_crawl_002_link"
    http_target = 'httplib2.Http.request'
    report_target = 'api.report_result'

    def fake_queue(_limit):
        # Hands out the same single item on every call.
        return [{'url': link, 'visited': None, 'links': []}]

    def check_report(report):
        self.assertEqual(report['url'], link)

    smock.mock('api.get_crawl_queue', returns_func=fake_queue)
    smock.mock(report_target, returns_func=check_report)
    smock.mock(http_target, returns=(make_http_response(404), ""))

    # Passing `False` as the exception makes the `with` block swallow the
    # timeout instead of propagating it.
    with eventlet.Timeout(DEFAULT_TIMEOUT, False):
        self.client.crawl()

    self.assertTrue(smock.is_called(http_target))
    self.assertTrue(smock.is_called(report_target))
def setUp(self):
    """Configure settings, build the `Crawler` and install default mocks.

    Sets `manager_url`, `socket_timeout` and `identity` on `settings`,
    creates `self.client`, initialises the per-test bookkeeping attributes,
    and mocks `api.get_crawl_queue`, `api.report_result` and
    `httplib2.Http.request`.  The HTTP mock answers from `self.responses`
    or `self.handlers` and otherwise follows `self.on_unexpected_uri`
    ('fail', '200', '404' or 'call').
    """
    settings.manager_url = "fake-url"
    settings.socket_timeout = 10
    settings.identity = {
        'name': "HeroshiBot",
        'user_agent': "HeroshiBot/100.500 (lalala)",
    }

    self.client = Crawler(queue_size=2000, max_connections=20)

    # Per-test bookkeeping consulted or filled by the HTTP mock below.
    self.uris = []
    self.responses = {}
    self.handlers = {}
    self.requested = []
    self.used_run_crawler = False

    def unset_unexpected_uri_func(url):
        return self.fail(u"`self.on_unexpected_uri_func` is unset.")

    def dummy_page(url):
        return (make_http_response(200), "Dummy page at %s." % (url,))

    def not_found_page(url):
        return (make_http_response(404), "Not found: %s." % (url,))

    self.on_unexpected_uri = 'fail'
    self.on_unexpected_uri_func = unset_unexpected_uri_func
    # Attribute names keep their original spelling ("hanlder"); code outside
    # this method may refer to them.
    self.default_hanlder_200 = dummy_page
    self.default_hanlder_404 = not_found_page

    def mock_httplib2_request(url, *args, **kwargs):
        self.requested.append(url)

        # Canned (status code, content) pairs take precedence.
        if url in self.responses:
            status, body = self.responses[url]
            return make_http_response(status), body

        # Then per-URL handler callables.
        if url in self.handlers:
            return self.handlers[url](url)

        # Anything else is "unexpected"; behaviour depends on the policy.
        policy = self.on_unexpected_uri
        if policy == '200':
            return self.default_hanlder_200(url)
        if policy == '404':
            return self.default_hanlder_404(url)
        if policy == 'call':
            # NOTE(review): unlike every other branch this returns the bare
            # make_http_response(...) result, not a (response, content)
            # pair -- confirm that is intended.
            return make_http_response(*self.on_unexpected_uri_func(url))

        if policy == 'fail':
            # NOTE(review): the message mentions `self.uris`, but lookups
            # above use `self.responses` / `self.handlers` -- confirm wording.
            self.fail(u"Unknown URL requested: %s. You didn't register it in `self.uris`." % (url,))
        else:
            self.fail(u"Unknown URL requested: %s. And no code for `self.on_unexpected_uri`: %s." % (url, self.on_unexpected_uri))

        # Safety net for any path that neither returned nor failed above.
        self.fail(u"httplib2_request mock supposed to return somewhere earlier.")

    smock.mock('api.get_crawl_queue', returns=[])
    smock.mock('api.report_result', returns=None)
    smock.mock('httplib2.Http.request', returns_func=mock_httplib2_request)