def close(self, reason):
    self.crawling = False
    if self.engine is not None:
        self.engine.close()
    logger.info('total time: %d', int(time.time()) - self.start_time)
    self.stats.close_spider(self.spider, reason=reason)
    loop.stop()
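# A minimal sketch of the shutdown pattern close() relies on: a module-level
# event loop that run_forever() blocks on until someone calls loop.stop().
# The loop/thread layout below is an assumption for illustration, not this
# project's actual wiring; when stopping from another thread, the safe call
# is loop.call_soon_threadsafe(loop.stop).
import asyncio
import threading

loop = asyncio.new_event_loop()

def _run_loop():
    asyncio.set_event_loop(loop)
    loop.run_forever()  # blocks until loop.stop() runs

t = threading.Thread(target=_run_loop, daemon=True)
t.start()

loop.call_soon_threadsafe(loop.stop)  # request shutdown from this thread
t.join()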
def pop_request(self):
    try:
        request = self.q.get(block=False)
    except Exception as e:
        request = None
        logger.info('No request in the queue right now: %s', e)
    return request
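# A self-contained sketch of the non-blocking pop wrapped above, assuming
# self.q is a standard queue.Queue: with block=False an empty queue raises
# queue.Empty (a subclass of Exception, which is why the broad handler
# above catches it).
import queue

q = queue.Queue()
q.put('request-1')

print(q.get(block=False))    # -> 'request-1'
try:
    q.get(block=False)       # queue is empty now
except queue.Empty:
    print('queue is empty')  # this branch runs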
@classmethod
def from_settings(cls, settings, starter=None):
    mwlist = cls._get_middleware_list_from_settings(settings)
    middlewares = []
    enabled = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            if starter and hasattr(mwcls, 'from_starter'):
                mw = mwcls.from_starter(starter)
            elif hasattr(mwcls, 'from_settings'):
                mw = mwcls.from_settings(settings)
            else:
                mw = mwcls()
            middlewares.append(mw)
            enabled.append(clspath)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                logger.warning("Disabled %(clsname)s: %(eargs)s",
                               {'clsname': clsname, 'eargs': e.args[0]},
                               extra={'starter': starter})
    logger.info("Enabled %(componentname)ss:\n%(enabledlist)s",
                {'componentname': cls.component_name,
                 'enabledlist': pprint.pformat(enabled)},
                extra={'starter': starter})
    return cls(*middlewares)
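# A hypothetical middleware illustrating the contract the loader above
# expects: optional from_starter/from_settings constructors, and raising
# NotConfigured to opt out of the enabled list. The class name and the
# RETRY_* settings keys are invented for this sketch.
from quixote.exceptions import NotConfigured  # assumed import path

class RetryMiddleware:
    def __init__(self, max_retries):
        self.max_retries = max_retries

    @classmethod
    def from_settings(cls, settings):
        # Raising NotConfigured makes the loader skip (disable) this middleware.
        if not settings.get('RETRY_ENABLED', True):
            raise NotConfigured('retries disabled in settings')
        return cls(max_retries=settings.get('RETRY_TIMES', 3))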
def close_spider(self, spider, reason='cancelled'):
    """Close (cancel) spider and clear all its outstanding requests."""
    heart = self.heart
    if heart.closing:
        return heart.closing
    heart.close()
    logger.info("Closing spider (%(reason)s)",
                {'reason': reason}, extra={'spider': spider})
    # self.starter.stats.close_spider(spider, reason=reason)
    loop.call_later(3, self.starter.close, reason)
def start(self, sender):
    self.engine = sender
    logger.info("Telnet console listening on %(host)s:%(port)d",
                {'host': self.host, 'port': self.portrange[0]},
                extra={'starter': self.starter})
    logger.info('engine type: %s', type(self.engine))
    asyncio.run_coroutine_threadsafe(
        self.telnet_server.create_server(self.portrange[0], self.engine,
                                         self.telnet_vars),
        loop)
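# run_coroutine_threadsafe is the standard way to submit a coroutine to a
# loop running in another thread, which is what start() does above. A
# self-contained sketch of the same pattern; the coroutine is a stand-in
# for telnet_server.create_server(...), not the real telnet server.
import asyncio
import threading

loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def serve(port):
    print('serving on port %d' % port)  # stand-in for the real server setup
    return port

future = asyncio.run_coroutine_threadsafe(serve(2323), loop)
print(future.result(timeout=5))  # block until the coroutine completes -> 2323
loop.call_soon_threadsafe(loop.stop)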
def process_spider_exception(self, response, exception, spider):
    if isinstance(exception, HttpError):
        # spider.starter.stats.inc_value('httperror/response_ignored_count')
        # spider.starter.stats.inc_value(
        #     'httperror/response_ignored_status_count/%s' % response.status
        # )
        logger.info(
            "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
            {'response': response},
            extra={'spider': spider},
        )
        return []
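# For orientation, one plausible shape for the HttpError caught above,
# mirroring Scrapy's convention of attaching the offending response to the
# exception; this layout is an assumption, not this project's definition.
class HttpError(Exception):
    """Raised for responses whose HTTP status is not handled or allowed."""

    def __init__(self, response, *args, **kwargs):
        self.response = response
        super().__init__(*args, **kwargs)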
def process_item(self, item, spider):
    res = True
    for method in self.methods['process_item']:
        try:
            item = method(item, spider)
            if not isinstance(item, (BaseItem, dict)):
                raise DropItem(
                    'process_item did not return a BaseItem or dict'
                )
        except DropItem as e:
            logger.info('DropItem: %s', e)
            res = False
            break
    return res
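# A hypothetical pipeline stage this runner would call; the class name and
# the 'price' field are illustrative only. Returning the (possibly modified)
# item keeps it flowing through the chain; raising DropItem aborts it.
from quixote.exceptions import DropItem  # assumed import path

class PriceValidationPipeline:
    def process_item(self, item, spider):
        if item.get('price', 0) <= 0:
            raise DropItem('missing or non-positive price: %r' % item)
        item['price'] = round(float(item['price']), 2)
        return item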
def start(self):
    assert not self.crawling, "Crawling already taking place"
    self.crawling = True
    self.start_time = int(time.time())
    try:
        if self.is_check_emmory:
            # Optionally start the periodic memory monitor (60-second interval).
            cm = CheckMemory()
            cm.start(60)
        self.spider = self._create_spider()
        self.engine = self._create_engine()
        self.engine.start(self.spider)
    except KeyboardInterrupt as e:
        logger.info(e)
        self.close('KeyboardInterrupt')
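# CheckMemory is defined elsewhere in the codebase; this is only a sketch of
# what a periodic monitor with a start(interval) API might look like, built
# on the standard library (resource is Unix-only). The real implementation
# may differ.
import logging
import resource
import threading

logger = logging.getLogger(__name__)

class CheckMemory:
    def start(self, interval):
        # Log peak RSS (kilobytes on Linux, bytes on macOS), then reschedule.
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info('peak memory usage: %d', peak)
        timer = threading.Timer(interval, self.start, args=(interval,))
        timer.daemon = True
        timer.start()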
def close_spider(self, spider, reason):
    if self._dump:
        logger.info("Dumping Quixote stats:\n%s", pprint.pformat(self._stats),
                    extra={'spider': spider})
    self._persist_stats(self._stats, spider)
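# _persist_stats lives elsewhere in the codebase; a hypothetical JSON backend
# for it, assuming the stats dict is JSON-serializable (default=str covers
# datetimes and similar values).
import json

def persist_stats(stats, path):
    with open(path, 'w') as f:
        json.dump(stats, f, indent=2, default=str)

persist_stats({'item_scraped_count': 42}, 'spider_stats.json')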
def _check_if_closing(self, spider, slot):
    if slot.closing and slot.is_idle():
        # slot.closing.callback(spider)
        logger.info('Scraper closed successfully.')