Esempio n. 1
0
 def _next_request(self, spider):
     """Advance the crawl for ``spider`` by one step.

     Returns immediately when ``self.heart`` is falsy or the engine is
     paused. Otherwise:

     1. Calls ``_next_request_from_scheduler`` repeatedly until it returns a
        falsy value or ``_needs_slowdown()`` becomes true.
     2. If start requests remain and no slowdown is needed, pulls at most one
        request from ``heart.start_requests`` and passes it to ``self.crawl``.
        When the iterator is exhausted or raises, ``heart.start_requests`` is
        set to ``None`` so it is not consulted again.
     3. Calls ``_spider_idle`` when ``spider_is_idle()`` and
        ``heart.close_if_idle`` are both truthy.

     :param spider: the spider the requests belong to; forwarded to the
         helper methods and attached to log records.
     """
     heart = self.heart
     if not heart or self.paused:
         return

     while not self._needs_slowdown():
         if not self._next_request_from_scheduler(spider):
             break

     if heart.start_requests and not self._needs_slowdown():
         try:
             request = next(heart.start_requests)
         except StopIteration:
             heart.start_requests = None
         except Exception:
             heart.start_requests = None
             # NOTE: 'exc_info' must not be passed inside ``extra``. logging
             # rejects extra keys that clash with LogRecord attributes and
             # raises KeyError, which would escape this handler.
             # ``exc_info=True`` already attaches the active traceback.
             logger.error('Error while obtaining start requests',
                          exc_info=True,
                          extra={'spider': spider})
         else:
             self.crawl(request, spider)

     if self.spider_is_idle() and heart.close_if_idle:
         self._spider_idle(spider)
Esempio n. 2
0
 def enqueue_scrape(self, response, request, spider):
     """Queue a downloaded response for scraping and start processing it.

     Adds the ``(response, request)`` pair to ``self.slot`` and then calls
     ``_scrape_next``. Any exception raised by either step is logged (with
     traceback) and swallowed, so this method never raises to its caller.

     :param response: the download result to scrape.
     :param request: the request that produced ``response``.
     :param spider: the spider the request belongs to; attached to the log
         record via ``extra``.
     """
     slot = self.slot
     try:
         slot.add_response_request(response, request)
         self._scrape_next(spider, slot)
     except Exception as e:
         # Pass the exception itself as the format argument.
         # ``logger.exception(e)`` returns None, so using it here rendered
         # "None" in the message and emitted a second, stray record.
         logger.error('Scraper bug processing %(request)s %(err)s',
                      {'request': request, 'err': e},
                      exc_info=True,
                      extra={'spider': spider})
Esempio n. 3
0
 async def task(_request, _spider):
     """Download ``_request`` and print every item its callback produces.

     ``self`` is not a parameter: it is resolved from the enclosing scope,
     which is not visible in this snippet. The download result must be a
     ``Response``; anything else is logged as an error and ignored. Any
     exception raised while downloading or running the callback is logged
     with its traceback and swallowed.

     :param _request: request to fetch; its ``callback`` is called with the
         response and is expected to return an iterable (or ``None``).
     :param _spider: spider passed through to ``self.downloader.fetch``.
     """
     try:
         response = await self.downloader.fetch(_request, _spider)
         if not isinstance(response, Response):
             logger.error('The Download data was not a Response object')
             return
         # A callback that returns nothing yields None; treat that as
         # "no output" instead of failing with a TypeError on iteration.
         for item in _request.callback(response) or ():
             print(item)
     except Exception as e:
         # ``logger.exception`` returns None, so it must not be wrapped in
         # ``print`` (that only emitted a stray "None" on stdout).
         logger.exception(e)
Esempio n. 4
0
 def handle_parser_output(self, result, request, response, spider):
     """Dispatch a single value produced by a spider callback.

     * ``Request``: scheduled through ``self.starter.engine.crawl``.
     * ``BaseItem`` or ``dict``: counted in ``self.slot.itemproc_size``, run
       through ``self.itemmw.process_item`` and then reported to
       ``_itemproc_finished``.
     * ``None``: ignored.
     * anything else: logged as an error naming the offending type.

     :param result: the value yielded/returned by the callback.
     :param request: the request whose callback produced ``result`` (used
         only in the error message).
     :param response: the response that was parsed.
     :param spider: the spider being run.
     """
     if isinstance(result, Request):
         # Follow-up request: hand it back to the engine.
         self.starter.engine.crawl(request=result, spider=spider)
         return

     if isinstance(result, (BaseItem, dict)):
         # Scraped item: bump the in-flight counter, then run the pipeline.
         self.slot.itemproc_size += 1
         processed = self.itemmw.process_item(result, spider)
         self._itemproc_finished(processed, result, response, spider)
         return

     if result is None:
         return

     log_args = {'request': request, 'typename': type(result).__name__}
     logger.error('Spider must return Request, BaseItem, dict or None, got %(typename)r in %(request)s',
                  log_args, extra={'spider': spider})
Esempio n. 5
0
 def _get_middleware_list_from_settings(cls, settings):
     """Return the ``ITEM_PIPELINES`` keys ordered by ascending priority.

     Reads the ``ITEM_PIPELINES`` mapping from ``settings``. Entries whose
     value is falsy (for example ``None`` or ``0``) are skipped. Every
     remaining value must be an ``int``; keys with equal values keep their
     original relative order.

     :param settings: mapping-like object providing ``'ITEM_PIPELINES'``.
     :returns: list of the mapping's keys, lowest value first.
     :raises ErrorSettings: if a truthy value is not an ``int``.
     """
     priorities = {}
     for path, order in settings['ITEM_PIPELINES'].items():
         if not order:
             # Falsy priority: entry is left out of the result.
             continue
         if not isinstance(order, int):
             logger.error(
                 'There is an error in your settings file.\nThe variable ITEM_PIPELINES error.'
             )
             raise ErrorSettings('Settings ITEM_PIPELINES error: ' +
                                 str((path, order)))
         priorities[path] = order
     # sorted() is stable, so ties stay in insertion order.
     return sorted(priorities, key=priorities.get)
Esempio n. 6
0
 def _get_handler(self, scheme):
     """Return the download handler for ``scheme``, creating it on first use.

     Handlers that were already built come from ``self._handlers``. Schemes
     already recorded in ``self._not_configured`` give ``None`` right away.
     For any other scheme, the class path in ``self._schemes`` is loaded with
     ``load_object`` and instantiated with ``self._starter.settings``; the
     instance is stored in ``self._handlers`` for later calls.

     :param scheme: key used to look up the handler (e.g. a URL scheme).
     :returns: the handler instance, or ``None`` when the scheme has no
         configured handler or the handler failed to load. The failure
         reason is stored in ``self._not_configured[scheme]``.
     """
     if scheme in self._handlers:
         return self._handlers[scheme]
     if scheme in self._not_configured:
         return None

     if scheme not in self._schemes:
         self._not_configured[scheme] = 'no handler available for that scheme'
         return None

     class_path = self._schemes[scheme]
     try:
         handler = load_object(class_path)(self._starter.settings)
     except NotConfigured as exc:
         # The handler opted out; remember why, without logging.
         self._not_configured[scheme] = str(exc)
         return None
     except Exception as exc:
         # Unexpected failure: log with traceback, then mark as unavailable.
         log_args = {"class_path": class_path, "scheme": scheme}
         logger.error('Loading "%(class_path)s" for scheme "%(scheme)s"', log_args,
                      exc_info=True, extra={'crawler': self._starter})
         self._not_configured[scheme] = str(exc)
         return None

     self._handlers[scheme] = handler
     return self._handlers[scheme]