def test_start_crawler(self):
    """Initialization (no offset): one full page must schedule both the
    backward (descending) continuation and the forward crawling task."""
    server_id = "a" * 32
    cookies = {'SERVER_ID': server_id}
    # A completely full page (API_LIMIT items) with both page pointers set.
    feed_page = {
        'data': [{"id": uid} for uid in range(API_LIMIT)],
        'next_page': {'offset': -1},
        'prev_page': {'offset': 1},
    }
    with patch("crawler.tasks.requests") as requests_mock:
        requests_mock.get.return_value = Mock(
            status_code=200,
            cookies=Mock(get_dict=Mock(return_value=cookies)),
            json=Mock(return_value=feed_page),
        )
        process_feed.apply_async = Mock()

        process_feed()

        # The initial request carries no cookies and asks for a descending feed.
        requests_mock.get.assert_called_once_with(
            FEED_URL_TEMPLATE.format(
                host=PUBLIC_API_HOST,
                version=API_VERSION,
                resource="tenders",
            ),
            params=dict(
                feed="changes",
                limit=API_LIMIT,
                descending="1",
                mode="_all_",
                opt_fields=",".join(TENDER_OPT_FIELDS),
            ),
            cookies={},
            timeout=(CONNECT_TIMEOUT, READ_TIMEOUT),
        )
        expected_calls = [
            # backward continuation keeps descending="1"
            call(kwargs={
                'resource': 'tenders',
                'mode': '_all_',
                'offset': -1,
                'descending': '1',
                'cookies': {'SERVER_ID': server_id},
            }),
            # forward task starts from prev_page offset with a countdown
            call(countdown=60, kwargs={
                'resource': 'tenders',
                'mode': '_all_',
                'offset': 1,
                'cookies': {'SERVER_ID': server_id},
            }),
        ]
        self.assertEqual(
            process_feed.apply_async.call_args_list,
            expected_calls,
            msg="Both forward and backward crawling",
        )
def test_proceed_forward_crawler_few_results(self):
    """Forward crawl with a short page (fewer than API_LIMIT items):
    the task must re-schedule itself with a wait countdown."""
    server_id = "a" * 32
    cookies = {'SERVER_ID': server_id}
    with patch("crawler.tasks.requests") as requests_mock:
        # One item short of a full page, so more results may appear later.
        requests_mock.get.return_value = Mock(
            status_code=200,
            json=Mock(return_value={
                'data': [{"id": uid} for uid in range(API_LIMIT - 1)],
                'next_page': {'offset': 2},
                'prev_page': {'offset': 0},
            }),
            cookies=Mock(get_dict=Mock(return_value=cookies)),
        )
        process_feed.apply_async = Mock()

        process_feed(offset=1, cookies=cookies)

        # The provided cookies and offset are forwarded to the API request.
        requests_mock.get.assert_called_once_with(
            FEED_URL_TEMPLATE.format(
                host=PUBLIC_API_HOST,
                version=API_VERSION,
                resource="tenders",
            ),
            params=dict(
                feed="changes",
                offset=1,
                mode="_all_",
                limit=API_LIMIT,
                opt_fields=",".join(TENDER_OPT_FIELDS),
            ),
            cookies={'SERVER_ID': server_id},
            timeout=(CONNECT_TIMEOUT, READ_TIMEOUT),
        )
        # Only the delayed forward continuation is scheduled.
        process_feed.apply_async.assert_called_once_with(
            kwargs={
                'resource': 'tenders',
                'mode': '_all_',
                'offset': 2,
                'cookies': {'SERVER_ID': server_id},
            },
            countdown=WAIT_MORE_RESULTS_COUNTDOWN,
        )
def test_proceed_empty_forward_crawler(self):
    """Re-initialization on an empty feed: the task must schedule itself
    again with try_count incremented and no offset."""
    server_id = "a" * 32
    cookies = {'SERVER_ID': server_id}
    with patch("crawler.tasks.requests") as requests_mock:
        # Empty result set; next_page offset is an empty string.
        requests_mock.get.return_value = Mock(
            status_code=200,
            json=Mock(return_value={
                'data': [],
                'next_page': {'offset': ''},
            }),
            cookies=Mock(get_dict=Mock(return_value=cookies)),
        )
        process_feed.apply_async = Mock()

        process_feed(offset='', try_count=1, cookies=cookies)

        # An empty offset is treated as initialization: descending="1", no offset param.
        requests_mock.get.assert_called_once_with(
            FEED_URL_TEMPLATE.format(
                host=PUBLIC_API_HOST,
                version=API_VERSION,
                resource="tenders",
            ),
            params=dict(
                feed="changes",
                descending="1",
                mode="_all_",
                limit=API_LIMIT,
                opt_fields=",".join(TENDER_OPT_FIELDS),
            ),
            cookies={'SERVER_ID': server_id},
            timeout=(CONNECT_TIMEOUT, READ_TIMEOUT),
        )
        # The retry carries an incremented try_count and keeps the cookies.
        process_feed.apply_async.assert_called_once_with(
            countdown=60,
            kwargs={
                'resource': 'tenders',
                'mode': '_all_',
                'cookies': {'SERVER_ID': server_id},
                'try_count': 2,
            },
        )
def process_feed(self, resource="tenders", offset=None, descending=None, mode="_all_", cookies=None, try_count=0):
    """Crawl one page of the public API changes feed and schedule follow-ups.

    Without an ``offset`` the task initializes: it crawls backward
    (``descending="1"``) and additionally schedules a forward crawl from
    ``prev_page``. With an ``offset`` it continues in the given direction.

    Returns the HTTP status code of the feed response.
    """
    logger.info("Start task {}".format(self.request.id), extra={
        "MESSAGE_ID": "START_TASK_MSG",
        "TASK_ID": self.request.id
    })
    config = resources.configs.get(resource)
    cookies = cookies or {}

    # No offset means initialization: start from the newest items backward.
    if not offset:
        descending = "1"

    url = FEED_URL_TEMPLATE.format(
        host=PUBLIC_API_HOST,
        version=API_VERSION,
        resource=resource,
    )
    params = dict(feed="changes", limit=API_LIMIT, mode=mode)
    if config.opt_fields:
        params["opt_fields"] = ",".join(config.opt_fields)
    if descending:
        params["descending"] = descending
    if offset:
        params["offset"] = offset

    try:
        response = requests.get(
            url,
            params=params,
            cookies=cookies,
            timeout=(CONNECT_TIMEOUT, READ_TIMEOUT),
        )
    except RETRY_REQUESTS_EXCEPTIONS as exc:
        logger.exception(exc, extra={"MESSAGE_ID": "FEED_RETRY_EXCEPTION"})
        raise self.retry(exc=exc)

    if response.status_code == 200:
        # Keep the server-affinity cookies for all follow-up tasks.
        if response.cookies:
            cookies = response.cookies.get_dict()
        response_json = response.json()

        # Run every configured handler (e.g. TENDER_HANDLERS, CONTRACT_HANDLERS,
        # FRAMEWORK_HANDLERS) on each feed item; a failing handler is logged
        # and must not block the rest.
        for item in response_json["data"]:
            for handler in config.handlers:
                try:
                    handler(item)
                except Exception as e:
                    logger.exception(e, extra={"MESSAGE_ID": "FEED_HANDLER_EXCEPTION"})

        # Schedule the next page of this crawl direction.
        next_page_kwargs = dict(
            resource=resource,
            mode=mode,
            offset=response_json["next_page"]["offset"],
            cookies=cookies,
        )
        if descending:
            next_page_kwargs["descending"] = descending

        if len(response_json["data"]) < API_LIMIT:
            # A short page: backward crawling is exhausted; forward crawling
            # waits a bit for new results.
            if descending:
                logger.info("Stopping backward crawling",
                            extra={"MESSAGE_ID": "FEED_BACKWARD_FINISH"})
            else:
                if offset == next_page_kwargs["offset"]:
                    # Same offset again: bump try_count so the task
                    # isn't dropped by the unique_lock decorator.
                    next_page_kwargs["try_count"] = try_count + 1
                else:
                    # New offset: reset try_count so duplicate tasks
                    # converge and unique_lock can do its job.
                    next_page_kwargs.pop("try_count", None)
                process_feed.apply_async(
                    kwargs=next_page_kwargs,
                    countdown=WAIT_MORE_RESULTS_COUNTDOWN,
                )
        else:
            process_feed.apply_async(kwargs=next_page_kwargs)

        # On initialization, also kick off the forward crawling task.
        if not offset:
            process_kwargs = dict(resource=resource, mode=mode, cookies=cookies)
            if response_json.get("prev_page", {}).get("offset"):
                process_kwargs["offset"] = response_json["prev_page"]["offset"]
            else:
                logger.debug("Initialization on an empty feed result",
                             extra={"MESSAGE_ID": "FEED_INIT_EMPTY"})
                process_kwargs["try_count"] = try_count + 1
            process_feed.apply_async(
                kwargs=process_kwargs,
                countdown=WAIT_MORE_RESULTS_COUNTDOWN,
            )

    elif response.status_code == 412:
        # Precondition failed: retry with the server-provided cookies.
        logger.warning(
            "Precondition failed with cookies {}".format(cookies),
            extra={"MESSAGE_ID": "FEED_PRECONDITION_FAILED"})
        retry_kwargs = dict(**self.request.kwargs)
        retry_kwargs["cookies"] = response.cookies.get_dict()
        raise self.retry(kwargs=retry_kwargs)

    elif response.status_code == 404:
        # Offset expired/invalid.
        logger.warning(
            "Offset {} failed with cookies {}".format(offset, cookies),
            extra={"MESSAGE_ID": "FEED_OFFSET_FAILED"})
        if not descending or not offset:
            # Forward process only: drop the stale offset and reinitialize.
            logger.info("Feed process reinitialization",
                        extra={"MESSAGE_ID": "FEED_REINITIALIZATION"})
            retry_kwargs = {
                k: v
                for k, v in self.request.kwargs.items()
                if k != "offset"
            }
            raise self.retry(kwargs=retry_kwargs)

    else:
        logger.warning(
            "Unexpected status code {}: {}".format(response.status_code, response.text),
            extra={"MESSAGE_ID": "FEED_UNEXPECTED_STATUS"})
        raise self.retry(countdown=get_request_retry_countdown(response))

    return response.status_code