def parse_binary_search(self, response, minimum=None, maximum=None):
    offset = int(get_parameter_value(response.request.url, 'offset'))
    first_offset = response.request.meta['first']

    if minimum and maximum:
        self.logger.info(f'Starting binary search for {first_offset:,} within [{minimum:,}, {maximum:,}]')
    elif self.is_http_success(response):
        minimum = response.request.meta['minimum']
        maximum = offset
    else:
        minimum = offset + 1
        maximum = response.request.meta['maximum']

    # If the search succeeded, parse the response as usual. We use a threshold, because getting the exact
    # millisecond requires 27 requests.
    if minimum + THRESHOLD >= maximum:
        self.logger.info(f'New offset found after {first_offset:,} at {maximum:,}!')
        if offset == maximum:
            # If the last request used the offset, we can reuse its response.
            yield from self.parse(response)
        else:
            url = replace_parameters(response.request.url, offset=maximum)
            yield self._build_request(url, self.parse, {})
    else:
        url = replace_parameters(response.request.url, offset=(minimum + maximum) // 2)
        yield self._build_request(url, self.parse_binary_search,
                                  {'minimum': minimum, 'maximum': maximum, 'first': first_offset})
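# A standalone sketch (not part of the spider) of why THRESHOLD exists: converging on the
# exact millisecond by binary search over one day of offsets takes ceil(log2(86,400,000)) = 27
# requests, whereas stopping once the window is within a threshold needs far fewer. The
# constants and the `count_probes` helper below are illustrative, not the spider's own values.

MILLISECONDS_PER_DAY = 86_400_000

def count_probes(minimum, maximum, threshold):
    """Count binary-search iterations until the window is within `threshold`."""
    probes = 0
    while minimum + threshold < maximum:
        probes += 1
        minimum = (minimum + maximum) // 2  # pessimistic: keep the larger half of the window
    return probes

assert count_probes(0, MILLISECONDS_PER_DAY, 1) == 27  # exact millisecond
assert count_probes(0, MILLISECONDS_PER_DAY, 1_000) == 17  # within one second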
def parse_list(self, response):
    data = response.json()
    # The last page returns an empty JSON object.
    if not data:
        return

    for item in data['data']:
        url = replace_parameters(response.request.url, offset=None) + item['ocid']
        yield self.build_request(url, formatter=components(-2))

    url = replace_parameters(response.request.url, offset=data['offset'])
    yield self.build_request(url, formatter=join(components(-1), parameters('offset')), callback=self.parse_list)
def parse_list(self, response):
    base_url = 'http://public.eprocurement.systems/ocds/tenders/'
    data = json.loads(response.text)
    # The last page returns an empty JSON object.
    if not data:
        return

    for item in data['data']:
        yield self.build_request(base_url + item['ocid'], formatter=components(-1))

    url = replace_parameters(response.request.url, offset=data['offset'])
    yield self.build_request(url, formatter=parameters('offset'), callback=self.parse_list)
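# The `formatter` arguments above decide each request's file name. A minimal sketch of how
# helpers like `components`, `parameters`, and `join` could be implemented, assuming they
# mirror this project's util module (the real implementations may differ in separators
# and edge cases):

from urllib.parse import parse_qs, urlsplit

def components(start, stop=None):
    """Return a formatter that joins a slice of the URL's path components."""
    def formatter(url):
        path = urlsplit(url).path.strip('/')
        return '-'.join(path.split('/')[start:stop])
    return formatter

def parameters(*keys):
    """Return a formatter that joins the values of the given query string parameters."""
    def formatter(url):
        query = parse_qs(urlsplit(url).query)
        return '-'.join(value for key in keys for value in query[key])
    return formatter

def join(*formatters):
    """Return a formatter that concatenates the results of other formatters."""
    def formatter(url):
        return '-'.join(f(url) for f in formatters)
    return formatter

# For example, components(-1)('http://host/ocds/tenders/ocds-abc-123') returns
# 'ocds-abc-123', and parameters('offset')('http://host/api?offset=5') returns '5'.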
def parse_pages(self, response):
    content = json.loads(response.text)

    # Download each file listed on this page.
    for url in self.get_files_to_download(content):
        yield self.build_request(url, formatter=components(-1), dont_filter=True)

    # Request the next page, until the last page is reached.
    pagination = content['pagination']
    if pagination['current_page'] < pagination['total_pages']:
        page = pagination['current_page'] + 1
        url = replace_parameters(response.request.url, page=page)
        yield self.build_request(url, formatter=parameters('fecha_desde', 'page'), dont_filter=True,
                                 callback=self.parse_pages)
def parse_date_range(self, response):
    offset = int(get_parameter_value(response.request.url, 'offset'))
    # Scrapy uses `datetime.datetime.utcnow()`, so we don't need to worry about time zones.
    start_time = int(self.crawler.stats.get_value('start_time').timestamp() * 1000)
    # We use the first offset to calculate the new offset, and in log messages.
    first_offset = response.request.meta.get('first', offset)
    # The exponent for the exponential search.
    exponent = response.request.meta.get('exponent', -1) + 1

    # If this offset succeeded, do a binary search from the previous offset to this offset.
    if self.is_http_success(response):
        yield from self.parse_binary_search(response, response.request.meta['prev'], offset)
    # If this offset failed and reached a limit, stop.
    elif offset >= start_time or exponent > EXPONENT_LIMIT:
        self.logger.info(f'No offset found after {first_offset:,} within {2 ** EXPONENT_LIMIT} days.')
        yield self.build_file_error_from_response(response)
    # Otherwise, continue.
    else:
        new_offset = min(first_offset + MILLISECONDS_PER_DAY * 2 ** exponent, start_time)
        url = replace_parameters(response.request.url, offset=new_offset)
        yield self._build_request(url, self.parse_date_range,
                                  {'prev': offset, 'exponent': exponent, 'first': first_offset})
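# A sketch of the schedule parse_date_range follows before handing off to the binary
# search: probe first_offset + MILLISECONDS_PER_DAY * 2**k for k = 0, 1, 2, ..., capped at
# the crawl's start time. The constants here are illustrative stand-ins for the spider's own.

MILLISECONDS_PER_DAY = 86_400_000
EXPONENT_LIMIT = 9  # hypothetical: give up beyond 2**9 = 512 days

def exponential_offsets(first_offset, start_time):
    """Yield the offsets an exponential search would probe, nearest first."""
    for exponent in range(EXPONENT_LIMIT + 1):
        offset = min(first_offset + MILLISECONDS_PER_DAY * 2 ** exponent, start_time)
        yield offset
        if offset >= start_time:
            return

# Doubling the step reaches any gap of up to 2**EXPONENT_LIMIT days in at most
# EXPONENT_LIMIT + 1 requests; the first success bounds the binary search from above.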
def _build_url(self, params):
    # Merge the per-request parameters with the spider's additional parameters.
    url_params = params.copy()
    url_params.update(self.additional_params)
    return util.replace_parameters(self.base_url, **url_params)
def _set_base_url(self, url):
    # Strip the pagination parameters, so that later requests can set them explicitly.
    self.base_url = util.replace_parameters(url, page=None, limit=None, offset=None)
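# _build_url and _set_base_url both lean on util.replace_parameters. A minimal sketch of
# how it and get_parameter_value could be written with urllib.parse, assuming a value of
# None removes the parameter (as _set_base_url relies on) and any other value replaces or
# adds it (as _build_url relies on):

from urllib.parse import parse_qs, urlencode, urlsplit

def replace_parameters(url, **kwargs):
    """Return the URL with the given query string parameters replaced, added or removed."""
    parsed = urlsplit(url)
    query = parse_qs(parsed.query)
    for key, value in kwargs.items():
        if value is None:
            query.pop(key, None)
        else:
            query[key] = [value]
    return parsed._replace(query=urlencode(query, doseq=True)).geturl()

def get_parameter_value(url, key):
    """Return the first value of the given query string parameter, or None."""
    return parse_qs(urlsplit(url).query).get(key, [None])[0]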
def test_replace_parameters(url, value, expected):
    assert replace_parameters(url, page=value) == expected
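# The test above is presumably parametrized. A hypothetical, self-contained version with
# illustrative cases, exercising replacement, addition, and None-removal of the `page`
# parameter (the URLs and expected values are examples, not the project's fixtures):

import pytest

@pytest.mark.parametrize('url, value, expected', [
    ('http://example.com/api?page=1', 2, 'http://example.com/api?page=2'),
    ('http://example.com/api', 2, 'http://example.com/api?page=2'),
    ('http://example.com/api?page=1', None, 'http://example.com/api'),
])
def test_replace_parameters(url, value, expected):
    assert replace_parameters(url, page=value) == expected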