def process_response(self, request, response, spider): stats.inc_value("crawled_url_num", spider=spider) stats.inc_value("crawled_page_size", count=len(response.body), spider=spider) # log.msg('download page:%s, size:%s, content-length:%s'\ # %(response.url, len(response.body), # response.headers.get('Content-Length'))) return response
def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
    referer = request.headers.get('Referer', None)
    msg = "Spider error processing <%s> (referer: <%s>)" % (request.url, referer)
    log.err(_failure, msg, spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)

def process_request(self, request, spider):
    stats.inc_value('downloader/request_count')
    stats.inc_value('downloader/request_count', spider=spider)
    stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
    reqlen = len(request_httprepr(request))
    stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
    stats.inc_value('downloader/request_bytes', reqlen)

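All of these snippets call a shared `stats` object through `inc_value` / `set_value` / `get_value`. A minimal sketch of such a collector, assuming dict-backed per-spider buckets; the names and signatures follow the calls in these snippets, not any particular Scrapy release:

# Minimal dict-backed stats collector matching the inc_value / set_value /
# get_value calls used throughout these snippets (a sketch; the real
# scrapy.stats object may differ).
class SimpleStatsCollector(object):
    def __init__(self):
        # one bucket per spider, plus a None bucket for global counters
        self._stats = {}

    def inc_value(self, key, count=1, start=0, spider=None):
        bucket = self._stats.setdefault(spider, {})
        bucket[key] = bucket.get(key, start) + count

    def set_value(self, key, value, spider=None):
        self._stats.setdefault(spider, {})[key] = value

    def get_value(self, key, default=None, spider=None):
        return self._stats.get(spider, {}).get(key, default)

stats = SimpleStatsCollector()
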
def process_response(self, request, response, spider):
    if (self.is_cacheable(request)
            and self.is_cacheable_response(response)
            and 'cached' not in response.flags):
        self.storage.store_response(spider, request, response)
        stats.inc_value('httpcache/store', spider=spider)
    return response

def stats_spider_closing(self, spider, reason):
    stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
    stats.inc_value('spider_count/%s' % reason, spider=spider)

def stats_spider_opened(self, spider):
    stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
    stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
    stats.inc_value('spider_count/opened')

def process_response(self, request, response, spider):
    stats.inc_value('downloader/response_count')
    stats.inc_value('downloader/response_count', spider=spider)
    stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
    reslen = len(response_httprepr(response))
    stats.inc_value('downloader/response_bytes', reslen, spider=spider)
    stats.inc_value('downloader/response_bytes', reslen)
    return response

def process_response(self, request, response, spider):
    doc = request.meta['terms']
    if self.index.appendif(doc, response.url, 0.0):
        return response
    stats.inc_value('downloader/near_duplicates')
    raise IgnoreRequest

def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
        return
    log.err(_failure, "Spider error processing %s" % request, spider=spider)
    send_catch_log(signal=signals.spider_error, failure=_failure, response=response,
                   spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)

def process_response(self, request, response, spider):
    stats.inc_value('crawled_url_num', spider=spider)
    stats.inc_value('crawled_page_size', count=len(response.body), spider=spider)
    # log.msg('download page:%s, size:%s, content-length:%s'
    #         % (response.url, len(response.body),
    #            response.headers.get('Content-Length')))
    return response

def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    log.err(_failure, "Spider error processing %s" % request, spider=spider)
    send_catch_log(signal=signals.spider_error, failure=_failure, response=response,
                   spider=spider)
    stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
                    spider=spider)

def process_item(self, item, spider):
    self.file.write('%s\n' % item['domain'])
    domain = Domain.get(item['domain'])
    if not domain:
        stats.inc_value('found_new_domain')
        log.msg("Found new domain: %s" % item['domain'])
        Domain.objects.create(domain_name=item['domain'],
                              monitoring_status='new',
                              metadata_status='new')
    return item  # a pipeline must return the item so later stages receive it

def parse(self, response):
    self.log('Response:%s, type:%s' % (response.url, type(response)))
    self.parser_manager.process_response(response, self)
    stats.inc_value('crawled_url_num', spider=self)
    stats.inc_value('crawled_page_size', count=len(response.body), spider=self)

def _dqpush(self, request):
    if self.dqs is None:
        return
    try:
        reqd = request_to_dict(request, self.spider)
        self.dqs.push(reqd, request.priority)
    except ValueError:  # non-serializable request
        return
    else:
        stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
        return True

def process_request(self, request, spider):
    if not self.is_cacheable(request):
        return
    response = self.storage.retrieve_response(spider, request)
    if response and self.is_cacheable_response(response):
        response.flags.append("cached")
        stats.inc_value("httpcache/hit", spider=spider)
        return response
    stats.inc_value("httpcache/miss", spider=spider)
    if self.ignore_missing:
        raise IgnoreRequest("Ignored request not in cache: %s" % request)

def process_request(self, request, spider):
    if not self.is_cacheable(request):
        return
    response = self.storage.retrieve_response(spider, request)
    if response and self.is_cacheable_response(response):
        response.flags.append('cached')
        stats.inc_value('httpcache/hit', spider=spider)
        return response
    stats.inc_value('httpcache/miss', spider=spider)
    if self.ignore_missing:
        raise IgnoreRequest("Ignored request not in cache: %s" % request)

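The cache middlewares above delegate to a `storage` object exposing `retrieve_response` / `store_response`. A minimal in-memory sketch of that interface (the `DictCacheStorage` name and URL-keyed dict are assumptions; real backends persist to disk):

# Hypothetical in-memory backend for the storage interface used by the
# httpcache middleware snippets above (a sketch, not Scrapy's real storage).
class DictCacheStorage(object):
    def __init__(self):
        self._cache = {}

    def retrieve_response(self, spider, request):
        # returns None on a miss, which the middleware counts as httpcache/miss
        return self._cache.get((spider.name, request.url))

    def store_response(self, spider, request, response):
        self._cache[(spider.name, request.url)] = response
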
def _filter(request):
    if isinstance(request, Request):
        depth = response.request.meta['depth'] + 1
        request.meta['depth'] = depth
        if self.maxdepth and depth > self.maxdepth:
            log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url),
                    level=log.DEBUG, spider=spider)
            return False
        elif self.stats:
            stats.inc_value('request_depth_count/%s' % depth, spider=spider)
            if depth > stats.get_value('request_depth_max', 0, spider=spider):
                stats.set_value('request_depth_max', depth, spider=spider)
    return True

def parse_item(self, response):
    item = RecipebotItem()
    doc = response.meta['terms']
    # decide if the page is interesting
    if not self.sim.is_relevant(doc):
        stats.inc_value('recipe/filtered_out')  # probably not a recipe page
        return
    item['url'] = response.url
    return item

def parse_item(self, response):
    item = ScifibotItem()
    # clean body
    orig_body = response.body_as_unicode()
    body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
    body = remove_tags(remove_comments(body))
    tokens = tokenize(body.lower())
    # decide if the page is interesting
    if not is_relevant(tokens):
        stats.inc_value('scifi/filtered_out')  # probably not a sci-fi page
        return
    item['keywords'] = tokens
    item['page'] = orig_body
    item['url'] = response.url
    return item

def new_callback(*args, **kwargs):
    tbefore = time()
    mbefore = self._memusage()
    r = function(*args, **kwargs)
    mafter = self._memusage()
    ct = time() - tbefore
    tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
    sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
    stats.set_value('profiling/total_callback_time', tcc + ct, spider=spider)
    if ct > sct:
        stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
        stats.set_value('profiling/slowest_callback_name', function.__name__,
                        spider=spider)
        stats.set_value('profiling/slowest_callback_url', args[0].url,
                        spider=spider)
    if self._memusage:
        stats.inc_value('profiling/total_mem_allocated_in_callbacks',
                        count=mafter - mbefore, spider=spider)
    return r

def parse_item(self, response):
    item = RecipebotItem()
    body = response.meta['body']
    result = self.detector.extract(body)
    if len(result) == 0:
        stats.inc_value('recipe/filtered_out')  # probably not a recipe page
        return
    item['url'] = response.url
    item['ingredients'] = []
    # loop variable renamed from 'item', which shadowed the item being built
    for candidate in result:
        if candidate[2] >= 0.25:
            item['ingredients'].append(candidate[0])
    return item

def process_spider_output(self, response, result, spider):
    def _filter(request):
        if isinstance(request, Request):
            depth = response.request.meta['depth'] + 1
            request.meta['depth'] = depth
            if self.maxdepth and depth > self.maxdepth:
                log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url),
                        level=log.DEBUG, spider=spider)
                return False
            elif self.stats:
                stats.inc_value('request_depth_count/%s' % depth, spider=spider)
                if depth > stats.get_value('request_depth_max', 0, spider=spider):
                    stats.set_value('request_depth_max', depth, spider=spider)
        return True

    # base case (depth=0)
    if self.stats and 'depth' not in response.request.meta:
        response.request.meta['depth'] = 0
        stats.inc_value('request_depth_count/0', spider=spider)

    return (r for r in result or () if _filter(r))

def process_exception(self, request, exception, spider):
    ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
    stats.inc_value('downloader/exception_count')
    stats.inc_value('downloader/exception_count', spider=spider)
    stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)

def item_dropped(self, item, spider, exception):
    reason = exception.__class__.__name__
    stats.inc_value('item_dropped_count', spider=spider)
    stats.inc_value('item_dropped_reasons_count/%s' % reason, spider=spider)

def _mqpush(self, request):
    stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
    self.mqs.push(request, -request.priority)

def __len__(self):
    return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

def _dqpush(self, request):
    if self.dqs is None:
        return
    try:
        reqd = request_to_dict(request, self.spider)
        self.dqs.push(reqd, -request.priority)
    except ValueError as e:  # non-serializable request
        if self.logunser:
            log.msg("Unable to serialize request: %s - reason: %s"
                    % (request, str(e)), level=log.ERROR, spider=self.spider)
        return
    else:
        stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
        return True

def _mqpush(self, request):
    stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
    self.mqs.push(request, -request.priority)

def _dqpop(self):
    if self.dqs:
        d = self.dqs.pop()
        if d:
            return request_from_dict(d, self.spider)

def _newmq(self, priority):
    return self.mqclass()

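The scheduler pushes with `-request.priority` because the underlying queue pops the smallest key first (heapq semantics), so negating makes higher-priority requests come out first. A minimal sketch of such a queue, assuming a counter as tie-breaker to keep equal-priority requests FIFO (the class name is hypothetical):

# Minimal priority queue with heapq semantics: pop() returns the entry with
# the smallest key, so callers negate request.priority to get highest-first.
import heapq

class SimplePriorityQueue(object):
    def __init__(self):
        self._heap = []
        self._count = 0  # tie-breaker: equal priorities stay FIFO

    def push(self, obj, priority=0):
        heapq.heappush(self._heap, (priority, self._count, obj))
        self._count += 1

    def pop(self):
        # returns None when empty, matching how _dqpop checks the result
        if self._heap:
            return heapq.heappop(self._heap)[2]

    def __len__(self):
        return len(self._heap)
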
def inc_stats(self, spider, status):
    stats.inc_value('image_count', spider=spider)
    stats.inc_value('image_status_count/%s' % status, spider=spider)

def item_dropped(self, item, spider, exception): reason = exception.__class__.__name__ stats.inc_value("item_dropped_count", spider=spider) stats.inc_value("item_dropped_reasons_count/%s" % reason, spider=spider) stats.inc_value("item_dropped_count")
def item_scraped(self, item, spider):
    stats.inc_value('item_scraped_count', spider=spider)
    stats.inc_value('item_scraped_count')

def item_passed(self, item, spider):
    stats.inc_value('item_passed_count', spider=spider)
    stats.inc_value('item_passed_count')

def inc_stats(self, spider, status): stats.inc_value("image_count", spider=spider) stats.inc_value("image_status_count/%s" % status, spider=spider)
def item_scraped(self, item, spider):
    stats.inc_value('item_scraped_count', spider=spider)

def parse(self, response):
    self.log('Response:%s, type:%s' % (response.url, type(response)))
    self.parser_manager.process_response(response, self)
    stats.inc_value('crawled_url_num', spider=self)
    stats.inc_value('crawled_page_size', count=len(response.body), spider=self)

def process_response(self, request, response, spider):
    if (self.is_cacheable(request) and self.is_cacheable_response(response)
            and "cached" not in response.flags):
        self.storage.store_response(spider, request, response)
        stats.inc_value("httpcache/store", spider=spider)
    return response

def stats_spider_opened(self, spider): stats.set_value("start_time", datetime.datetime.utcnow(), spider=spider) stats.set_value("envinfo/host", stats.get_value("envinfo/host"), spider=spider) stats.inc_value("spider_count/opened")
def stats_spider_closing(self, spider, reason): stats.set_value("finish_time", datetime.datetime.utcnow(), spider=spider) stats.set_value("finish_status", "OK" if reason == "finished" else reason, spider=spider) stats.inc_value("spider_count/%s" % reason, spider=spider)
def process_exception(self, request, exception, spider):
    ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
    stats.inc_value('downloader/exception_count', spider=spider)
    stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)

def item_passed(self, item, spider): stats.inc_value("item_passed_count", spider=spider) stats.inc_value("item_passed_count")