def _next_request(self, spider):
    """Run one scheduling step for ``spider``.

    Does nothing when ``self.heart`` is falsy or the engine is paused.
    Otherwise it:

    1. keeps calling ``_next_request_from_scheduler`` until that returns a
       falsy value or ``_needs_slowdown()`` reports true;
    2. pulls at most one request from ``heart.start_requests`` and hands it
       to ``self.crawl``;
    3. calls ``_spider_idle`` when ``spider_is_idle()`` is true and
       ``heart.close_if_idle`` is set.

    :param spider: spider object forwarded to the scheduler, ``crawl`` and
        the idle hook.
    """
    if not self.heart:
        return
    if self.paused:
        return

    heart = self.heart

    # Drain the scheduler for as long as it yields work and no slowdown
    # is requested.
    while not self._needs_slowdown():
        if not self._next_request_from_scheduler(spider):
            break

    if heart.start_requests and not self._needs_slowdown():
        try:
            request = next(heart.start_requests)
        except StopIteration:
            # Start requests exhausted: stop polling the iterator.
            heart.start_requests = None
        except Exception:
            heart.start_requests = None
            # ``exc_info=True`` already attaches the active exception.
            # ``exc_info`` must NOT also be passed through ``extra``: it is a
            # reserved LogRecord attribute and logging raises KeyError
            # ("Attempt to overwrite 'exc_info' in LogRecord") if it is.
            logger.error('Error while obtaining start requests',
                         exc_info=True, extra={'spider': spider})
        else:
            self.crawl(request, spider)

    if self.spider_is_idle() and heart.close_if_idle:
        self._spider_idle(spider)
def enqueue_scrape(self, response, request, spider):
    """Register a response/request pair on the slot and trigger scraping.

    Calls ``self.slot.add_response_request(response, request)`` followed by
    ``self._scrape_next(spider, slot)``. Any exception raised by either call
    is logged (with traceback) and swallowed, so it does not reach the caller.

    :param response: downloaded response to be scraped.
    :param request: request that produced ``response``.
    :param spider: spider the pair belongs to; attached to the log record.
    """
    slot = self.slot
    try:
        slot.add_response_request(response, request)
        self._scrape_next(spider, slot)
    except Exception as e:
        # Pass the exception itself as ``err``. ``logger.exception()`` returns
        # None, so using its result here rendered "None" in the message and
        # emitted a second, duplicate log record.
        logger.error('Scraper bug processing %(request)s %(err)s',
                     {'request': request, 'err': e},
                     exc_info=True,
                     extra={'spider': spider})
async def task(_request, _spider):
    """Download ``_request`` and print every item its callback yields.

    ``self``, ``Response`` and ``logger`` are taken from the enclosing scope.
    The response is fetched through ``self.downloader.fetch``; if the result
    is not a ``Response`` an error is logged and the task ends. Any exception
    raised while fetching or running the callback is logged with its
    traceback and not re-raised.

    :param _request: request to download; its ``callback`` is invoked with
        the response and its result is iterated.
    :param _spider: spider passed through to the downloader.
    """
    try:
        response = await self.downloader.fetch(_request, _spider)
        if not isinstance(response, Response):
            logger.error('The Download data was not a Response object')
            return
        for item in _request.callback(response):
            print(item)
    except Exception:
        # ``logger.exception`` returns None, so wrapping it in ``print`` only
        # wrote a stray "None" to stdout. Log once, with the traceback and a
        # message that identifies the failing request.
        logger.exception('Error while processing %(request)s',
                         {'request': _request})
def handle_parser_output(self, result, request, response, spider):
    """Dispatch a single value produced by a spider callback.

    * ``Request``: scheduled through ``self.starter.engine.crawl``.
    * ``BaseItem`` or ``dict``: ``self.slot.itemproc_size`` is incremented,
      the item goes through ``self.itemmw.process_item`` and the outcome is
      handed to ``self._itemproc_finished``.
    * ``None``: ignored.
    * anything else: an error naming the offending type is logged.

    :param result: the value produced by the callback.
    :param request: request whose callback produced ``result`` (used in the
        error message).
    :param response: response passed on to ``_itemproc_finished``.
    :param spider: spider the result belongs to.
    """
    if isinstance(result, Request):
        self.starter.engine.crawl(request=result, spider=spider)
        return

    if isinstance(result, (BaseItem, dict)):
        self.slot.itemproc_size += 1
        processed = self.itemmw.process_item(result, spider)
        self._itemproc_finished(processed, result, response, spider)
        return

    if result is not None:
        logger.error(
            'Spider must return Request, BaseItem, dict or None, got %(typename)r in %(request)s',
            {'request': request, 'typename': type(result).__name__},
            extra={'spider': spider},
        )
def _get_middleware_list_from_settings(cls, settings):
    """Return the ``ITEM_PIPELINES`` keys ordered by ascending priority.

    Entries whose value is falsy (``None``, ``0``, ...) are skipped. Every
    remaining value must be an ``int``; otherwise an error is logged and
    ``ErrorSettings`` is raised for the first offending entry. Entries with
    equal priority keep their order from the settings mapping.

    :param settings: mapping that provides an ``'ITEM_PIPELINES'`` dict of
        ``{key: priority}``.
    :return: list of keys sorted by priority, lowest first.
    :raises ErrorSettings: if a truthy priority is not an ``int``.
    """
    priorities = {}
    for path, priority in settings['ITEM_PIPELINES'].items():
        if not priority:
            continue
        if not isinstance(priority, int):
            logger.error(
                'There is an error in your settings file.\nThe variable ITEM_PIPELINES error.'
            )
            raise ErrorSettings('Settings ITEM_PIPELINES error: ' + str((path, priority)))
        priorities[path] = priority

    ordered = sorted(priorities.items(), key=lambda entry: entry[1])
    return [path for path, _ in ordered]
def _get_handler(self, scheme):
    """Return the download handler for ``scheme``, or ``None`` if unavailable.

    Lookup order:

    1. an instance already stored in ``self._handlers`` is returned as-is;
    2. a scheme recorded in ``self._not_configured`` yields ``None``;
    3. a scheme missing from ``self._schemes`` is recorded as not configured
       and yields ``None``;
    4. otherwise the class path from ``self._schemes`` is loaded with
       ``load_object``, instantiated with ``self._starter.settings`` and
       stored in ``self._handlers``.

    A failure in step 4 is remembered in ``self._not_configured`` (with the
    exception text as the reason) and yields ``None``. Exceptions other than
    ``NotConfigured`` are also logged with their traceback.

    :param scheme: scheme key to look up (e.g. a URL scheme).
    :return: the handler instance, or ``None``.
    """
    if scheme in self._handlers:
        return self._handlers[scheme]

    if scheme in self._not_configured:
        return None

    if scheme not in self._schemes:
        self._not_configured[scheme] = 'no handler available for that scheme'
        return None

    class_path = self._schemes[scheme]
    try:
        handler_cls = load_object(class_path)
        handler = handler_cls(self._starter.settings)
    except NotConfigured as exc:
        self._not_configured[scheme] = str(exc)
        return None
    except Exception as exc:
        logger.error('Loading "%(class_path)s" for scheme "%(scheme)s"',
                     {"class_path": class_path, "scheme": scheme},
                     exc_info=True, extra={'crawler': self._starter})
        self._not_configured[scheme] = str(exc)
        return None

    # Both failure branches returned above, so reaching here means success.
    self._handlers[scheme] = handler
    return self._handlers[scheme]