Example #1
0
 def close(self, reason):
     """Shut the crawl down and stop the event loop.

     Clears the ``crawling`` flag, closes the engine if one is set, logs
     the whole seconds elapsed since ``self.start_time``, hands ``reason``
     to the stats collector and finally calls ``loop.stop()``.

     :param reason: value forwarded to ``self.stats.close_spider``.
     """
     self.crawling = False

     engine = self.engine
     if engine is not None:
         engine.close()

     # The current time is truncated to whole seconds before subtracting.
     elapsed = int(time.time()) - self.start_time
     logger.info('total time: ' + str(elapsed))

     self.stats.close_spider(self.spider, reason=reason)
     loop.stop()
Example #2
0
 def pop_request(self):
     """Take the next request off ``self.q`` without blocking.

     :returns: the dequeued request, or ``None`` when the non-blocking
         ``get`` raised; in that case the exception text is logged at
         INFO level.
     """
     try:
         return self.q.get(block=False)
     except Exception as err:
         # Any failure of the non-blocking get is reported as "no request".
         logger.info('No request object in the queue now!\t' + str(err))
         return None
Example #3
0
 def from_settings(cls, settings, starter=None):
     """Instantiate the configured middlewares and wrap them in ``cls``.

     Each class path returned by ``cls._get_middleware_list_from_settings``
     is loaded and built with the first available constructor:
     ``from_starter(starter)`` (only when ``starter`` is truthy), then
     ``from_settings(settings)``, then a plain no-argument call.

     A component that raises ``NotConfigured`` is skipped; a warning is
     logged only when the exception carries arguments.

     :param settings: settings object handed to the components.
     :param starter: optional object passed to ``from_starter`` and attached
         to the log records as ``extra``.
     :returns: ``cls(*middlewares)`` built from the enabled components.
     """
     instances = []
     enabled_paths = []
     for path in cls._get_middleware_list_from_settings(settings):
         try:
             component = load_object(path)
             if starter and hasattr(component, 'from_starter'):
                 instance = component.from_starter(starter)
             elif hasattr(component, 'from_settings'):
                 instance = component.from_settings(settings)
             else:
                 instance = component()
         except NotConfigured as exc:
             # Silent skip unless the component explained why it is disabled.
             if exc.args:
                 logger.warning(
                     "Disabled %(clsname)s: %(eargs)s",
                     {
                         'clsname': path.split('.')[-1],
                         'eargs': exc.args[0],
                     },
                     extra={'starter': starter},
                 )
         else:
             instances.append(instance)
             enabled_paths.append(path)

     logger.info(
         "Enabled %(componentname)ss:\n%(enabledlist)s",
         {
             'componentname': cls.component_name,
             'enabledlist': pprint.pformat(enabled_paths),
         },
         extra={'starter': starter},
     )
     return cls(*instances)
Example #4
0
 def close_spider(self, spider, reason='cancelled'):
     """Begin closing the spider and schedule the starter shutdown.

     If ``self.heart`` already reports a truthy ``closing`` value, that
     value is returned and nothing else happens. Otherwise the heart is
     closed, the reason is logged and ``self.starter.close(reason)`` is
     scheduled through ``loop.call_later`` with a delay of 3.

     :param spider: spider attached to the log record as ``extra``.
     :param reason: text logged and forwarded to ``self.starter.close``.
     :returns: ``heart.closing`` when a close is already in progress,
         otherwise ``None``.
     """
     pulse = self.heart
     if not pulse.closing:
         pulse.close()
         logger.info(
             "Closing spider (%(reason)s)",
             {'reason': reason},
             extra={'spider': spider},
         )
         # NOTE: stats are not closed here; the direct
         # self.starter.stats.close_spider(...) call is disabled.
         loop.call_later(3, self.starter.close, reason)
         return None
     return pulse.closing
Example #5
0
 def start(self, sender):
     """Remember the engine and schedule the telnet server on ``loop``.

     Stores ``sender`` as ``self.engine``, logs the host and the first
     entry of ``self.portrange``, logs the engine's type, and submits
     ``self.telnet_server.create_server(...)`` to ``loop`` with
     ``asyncio.run_coroutine_threadsafe``.

     :param sender: object stored as ``self.engine`` and passed to the
         telnet server.
     """
     self.engine = sender
     port = self.portrange[0]

     logger.info(
         "Telnet console listening on %(host)s:%(port)d",
         {'host': self.host, 'port': port},
         extra={'starter': self.starter},
     )
     logger.info(type(self.engine))

     server_coro = self.telnet_server.create_server(
         port, self.engine, self.telnet_vars)
     asyncio.run_coroutine_threadsafe(server_coro, loop)
Example #6
0
 def process_spider_exception(self, response, exception, spider):
     """Swallow ``HttpError`` exceptions raised while processing a response.

     :param response: response that triggered the exception (logged).
     :param exception: the exception being handled.
     :param spider: spider attached to the log record as ``extra``.
     :returns: an empty list when ``exception`` is an ``HttpError``;
         ``None`` for any other exception type.
     """
     if not isinstance(exception, HttpError):
         return None

     # NOTE: the 'httperror/response_ignored_*' stats counters are
     # currently disabled; only the log line below is emitted.
     logger.info(
         "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
         {'response': response},
         extra={'spider': spider},
     )
     return []
Example #7
0
 def process_item(self, item, spider):
     """Run ``item`` through every registered ``process_item`` method.

     Each method receives the result of the previous one. The chain stops
     at the first ``DropItem``, whether raised by a method or raised here
     because a method returned something that is neither a ``BaseItem``
     nor a ``dict``. The drop is logged at INFO level.

     :param item: item handed to the first method.
     :param spider: spider passed along to every method.
     :returns: ``True`` when every method accepted the item, ``False`` as
         soon as one of them dropped it.
     """
     for handler in self.methods['process_item']:
         try:
             item = handler(item, spider)
             if not isinstance(item, (BaseItem, dict)):
                 raise DropItem(
                     'DropItem: process_item do not return BaseItem or dict'
                 )
         except DropItem as dropped:
             logger.info('DropItem: %s' % dropped)
             return False
     return True
Example #8
0
 def start(self):
     """Create the spider and engine, then start crawling.

     Marks the crawl as running and records the start time in whole
     seconds. When ``self.is_check_emmory`` is truthy a ``CheckMemory``
     instance is started with the argument 60 (presumably an interval in
     seconds; confirm against ``CheckMemory``). A ``KeyboardInterrupt``
     raised during any of these steps is logged and turned into
     ``self.close('KeyboardInterrupt')``.

     :raises AssertionError: if a crawl is already in progress.
     """
     assert not self.crawling, "Crawling already taking place"
     self.crawling = True
     self.start_time = int(time.time())

     try:
         if self.is_check_emmory:
             CheckMemory().start(60)
         # The spider is stored before the engine is created, as in the
         # original ordering.
         self.spider = self._create_spider()
         self.engine = self._create_engine()
         self.engine.start(self.spider)
     except KeyboardInterrupt as interrupt:
         logger.info(interrupt)
         self.close('KeyboardInterrupt')
Example #9
0
 def close_spider(self, spider, reason):
     """Optionally dump the collected stats, then persist them.

     :param spider: spider attached to the log record and passed to
         ``self._persist_stats``.
     :param reason: accepted for interface compatibility; not used here.
     """
     if self._dump:
         report = "Dumping Quixote stats:\n" + pprint.pformat(self._stats)
         logger.info(report, extra={'spider': spider})
     self._persist_stats(self._stats, spider)
Example #10
0
 def _check_if_closing(self, spider, slot):
     """Log a completion message once a closing slot has gone idle.

     :param spider: unused here; the callback that would have received it
         (``slot.closing.callback(spider)``) is disabled.
     :param slot: object exposing ``closing`` and ``is_idle()``.
     """
     # ``is_idle()`` is consulted only when ``closing`` is truthy.
     if not slot.closing or not slot.is_idle():
         return
     logger.info('Scraper close successfully.')