def logerror(failure, recv):
    # dont_log and spider are expected from the enclosing scope
    if dont_log is None or not isinstance(failure.value, dont_log):
        log.spider_log(failure, "Error caught on signal handler: %s" % recv,
                       level=log.ERROR, spider=spider)
    return failure

def _debug_cookie(self, request, spider):
    if self.debug:
        cl = request.headers.getlist('Cookie')
        if cl:
            msg = "Sending cookies to: %s" % request + os.linesep
            msg += os.linesep.join("Cookie: %s" % c for c in cl)
            log.spider_log(msg, spider=spider, level=log.DEBUG)

def _debug_set_cookie(self, response, spider):
    if self.debug:
        cl = response.headers.getlist('Set-Cookie')
        if cl:
            msg = "Received cookies from: %s" % response + os.linesep
            msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
            log.spider_log(msg, spider=spider, level=log.DEBUG)

def dump_stacktrace(self, signum, frame):
    stackdumps = self._thread_stacks()
    enginestatus = format_engine_status(self.crawler.engine)
    liverefs = format_live_refs()
    msg = "Dumping stack trace and engine status" \
          "\n{0}\n{1}\n{2}".format(enginestatus, liverefs, stackdumps)
    log.spider_log(msg)

def process_request(self, request, spider):
    useragent = self._useragent
    rp = self.robot_parser(request, spider)
    if rp and not rp.can_fetch(useragent, request.url):
        log.spider_log("Forbidden by robots.txt: " + request.url,
                       level=log.DEBUG, spider=spider)
        raise IgnoreRequest

def create_spider_for_request(spidermanager, request, default_spider=None,
                              log_none=False, log_multiple=False,
                              **spider_kwargs):
    """Create a spider to handle the given Request.

    This will look for the spiders that can handle the given request (using
    the spider manager) and return a (new) Spider if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spider passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spidermanager.find_by_request(request)
    if len(snames) == 1:
        return spidermanager.create(snames[0], **spider_kwargs)
    if len(snames) > 1 and log_multiple:
        log.spider_log('More than one spider can handle: ' + request.url +
                       ': ' + ', '.join(snames), level=log.ERROR)
    if len(snames) == 0 and log_none:
        log.spider_log('Unable to find spider that handles: ' + request.url,
                       level=log.ERROR)
    return default_spider

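# Usage sketch (hypothetical, not part of the original module): resolving a
# spider for an ad-hoc request. `spidermanager` is any object exposing the
# find_by_request/create API used above; the Request import path is assumed.
def _example_resolve_spider(spidermanager, fallback_spider):
    from scrapy.http import Request  # assumed import path for this project
    req = Request('http://example.com/some/page')
    return create_spider_for_request(spidermanager, req,
                                     default_spider=fallback_spider,
                                     log_multiple=True, log_none=True)
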
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and
    returns Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', None)
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver, signal=signal, sender=sender,
                                   *arguments, **named)
            if isinstance(response, Deferred):
                log.spider_log("Cannot return deferreds from signal handler: %s"
                               % receiver, level=log.ERROR, spider=spider)
        except dont_log:
            # exceptions the caller asked not to log are wrapped silently
            result = Failure()
        except Exception:
            result = Failure()
            log.spider_log(result, "Error caught on signal handler: %s" % receiver,
                           level=log.ERROR, spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses

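# Example (hypothetical): dispatching through send_catch_log. A receiver that
# raises does not abort dispatch; its exception is logged and returned as a
# twisted Failure in the (receiver, result) list. The signal object and
# handler are made up for illustration.
def _example_send_catch_log():
    from pydispatch import dispatcher  # the pydispatcher package used above
    my_signal = object()

    def handler(signal, sender, **named):
        raise ValueError("boom")  # wrapped in a Failure; dispatch continues

    dispatcher.connect(handler, signal=my_signal)
    for receiver, result in send_catch_log(signal=my_signal):
        print("%s -> %s" % (receiver, result))  # result is a Failure here
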
def log(self, spider):
    items = self.stats.get_value('item_scraped_count', 0)
    pages = self.stats.get_value('response_received_count', 0)
    irate = (items - self.itemsprev) * self.multiplier
    prate = (pages - self.pagesprev) * self.multiplier
    self.pagesprev, self.itemsprev = pages, items
    msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
          % (pages, prate, items, irate)
    log.spider_log(msg, spider=spider)

def process_spider_exception(self, response, exception, spider):
    if isinstance(exception, HttpError):
        log.spider_log(
            "Ignoring response " + response.url +
            ": HTTP status code is not handled or not allowed",
            level=log.DEBUG, spider=spider,
        )
        return []

def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = request.headers.get('Referer')
        log.spider_log('File (unknown-error): Error downloading %s from %s '
                       'referred in <%s>: %s'
                       % (self.MEDIA_NAME, request.url, referer, failure.value),
                       level=log.WARNING, spider=info.spider)
    raise FileException

def _storage_supported(self, uri):
    scheme = urlparse(uri).scheme
    if scheme in self.storages:
        try:
            self._get_storage(uri)
            return True
        except NotConfigured:
            log.spider_log("Disabled feed storage scheme: %s" % scheme,
                           level=log.ERROR)
    else:
        log.spider_log("Unknown feed storage scheme: %s" % scheme,
                       level=log.ERROR)

def _filter(request):
    if isinstance(request, Request) and len(request.url) > self.maxlength:
        log.spider_log("Ignoring link (url length > %d): %s"
                       % (self.maxlength, request.url),
                       level=log.DEBUG, spider=spider)
        return False
    else:
        return True

def item_completed(self, results, item, info):
    """Called per item when all media requests have been processed"""
    if self.LOG_FAILED_RESULTS:
        msg = '%s found errors processing %s' % (self.__class__.__name__, item)
        for ok, value in results:
            if not ok:
                log.spider_log(value, msg, spider=info.spider, level=log.ERROR)
    return item

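# Hedged sketch: `results` is a list of (success, value) tuples, e.g.
#   [(True, {'url': ..., 'path': ..., 'checksum': ...}), (False, Failure(...))]
# A subclass could drop items whose media all failed instead of only logging
# them. `MediaPipeline` and the DropItem import path are assumptions, not
# names confirmed by this module.
class StrictMediaPipeline(MediaPipeline):
    LOG_FAILED_RESULTS = True

    def item_completed(self, results, item, info):
        from scrapy.exceptions import DropItem  # assumed import path
        if results and not any(ok for ok, _ in results):
            raise DropItem("all media downloads failed for %s" % item)
        return item
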
def log(self, request, spider):
    if self.debug:
        log.spider_log("Filtered duplicate request: " + request.url,
                       level=log.DEBUG, spider=spider)
    elif self.logdupes:
        fmt = ("Filtered duplicate request: " + request.url +
               " - no more duplicates will be shown"
               " (see DUPEFILTER_DEBUG to show all duplicates)")
        log.spider_log(fmt, level=log.DEBUG, spider=spider)
        self.logdupes = False
    spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)

def _check_warning(self):
    if self.warned:  # warn only once
        return
    if self.get_virtual_size() > self.warning:
        self.crawler.stats.set_value('memusage/warning_reached', 1)
        mem = self.warning / 1024 / 1024
        log.spider_log("Memory usage reached %dM" % mem, level=log.WARNING)
        if self.notify_mails:
            subj = "%s warning: memory usage reached %dM at %s" % \
                   (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            self.crawler.stats.set_value('memusage/warning_notified', 1)
        self.warned = True

def process_response(self, request, response, spider):
    if not response.body:
        return response
    for fmt, func in self._formats.iteritems():
        new_response = func(response)
        if new_response:
            log.spider_log('Decompressed response with format: ' + fmt,
                           level=log.DEBUG, spider=spider)
            return new_response
    return response

def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    log.spider_log(_failure, "Spider error processing %s" % request,
                   spider=spider)
    self.signals.send_catch_log(signal=signals.spider_error, failure=_failure,
                                response=response, spider=spider)
    self.engine.stats.inc_value(
        "spider_exceptions/%s" % _failure.value.__class__.__name__,
        spider=spider)
    self.engine.send_request_result(request, str(exc))

def process_spider_output(self, response, result, spider):
    for x in result:
        if isinstance(x, Request):
            if x.dont_filter or self.should_follow(x, spider):
                yield x
            else:
                domain = urlparse_cached(x).hostname
                if domain and domain not in self.domains_seen:
                    self.domains_seen.add(domain)
                    log.spider_log("Filtered offsite request to %s: %s"
                                   % (domain, x.url),
                                   level=log.DEBUG, spider=spider)
                    self.stats.inc_value('offsite/domains', spider=spider)
                self.stats.inc_value('offsite/filtered', spider=spider)
        else:
            yield x

def media_downloaded(self, response, request, info):
    referer = request.headers.get('Referer')

    if response.status != 200:
        log.spider_log('File (code: %s): Error downloading file from %s '
                       'referred in <%s>' % (response.status, request, referer),
                       level=log.WARNING, spider=info.spider)
        raise FileException('download-error')

    if not response.body:
        log.spider_log('File (empty-content): Empty file from %s referred '
                       'in <%s>: no-content' % (request.url, referer),
                       level=log.WARNING, spider=info.spider)
        raise FileException('empty-content')

    status = 'cached' if 'cached' in response.flags else 'downloaded'
    log.spider_log('File (%s): Downloaded file from %s referred in <%s>'
                   % (status, request.url, referer),
                   level=log.DEBUG, spider=info.spider)
    self.inc_stats(info.spider, status)

    try:
        path = self.file_path(request, response=response, info=info)
        checksum = self.file_downloaded(response, request, info)
    except FileException as exc:
        msg = 'File (error): Error processing file from %s referred in <%s>: %s' \
              % (request.url, referer, exc)
        log.spider_log(msg, level=log.WARNING, spider=info.spider)
        raise
    except Exception as exc:
        msg = 'File (unknown-error): Error processing file from %(request)s ' \
              'referred in <%(referer)s>'
        log.spider_log(msg % {'request': request, 'referer': referer},
                       spider=info.spider)
        raise FileException(str(exc))

    return {'url': request.url, 'path': path, 'checksum': checksum}

def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider
    """
    if isinstance(output, Request):
        self.engine.add_request(request, output)
    elif isinstance(output, BaseItem):
        self.slot.itemproc_size += 1
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, request, response, spider)
        return dfd
    elif output is None:
        self.engine.send_request_result()
    else:
        typename = type(output).__name__
        msg = ('Spider must return Request, BaseItem or None, got %s in %s'
               % (typename, request.url))
        log.spider_log(msg, level=log.ERROR, spider=spider)
        self.engine.send_request_result(request, msg)

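# Illustration (hypothetical spider callback): the three outputs accepted by
# _process_spidermw_output -- a Request (scheduled via engine.add_request), a
# BaseItem (queued on the item processor), or None. `MyItem` is a made-up
# BaseItem subclass:
#
#     def parse(self, response):
#         yield Request(response.url + '?page=2')
#         yield MyItem(url=response.url)
#         # yielding nothing / returning None is also valid
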
def close_spider(self, spider):
    slot = self.slot
    if not slot.itemcount and not self.store_empty:
        return
    slot.exporter.finish_exporting()
    logfmt = "%%s %s feed (%d items) in: %s" % (self.format,
                                                slot.itemcount, slot.uri)
    d = defer.maybeDeferred(slot.storage.store, slot.file)
    d.addCallback(lambda _: log.spider_log(logfmt % "Stored", spider=spider))
    d.addErrback(log.spider_log, logfmt % "Error storing", spider=spider)
    return d

def _filter(request):
    if isinstance(request, Request):
        depth = response.meta['depth'] + 1
        request.meta['depth'] = depth
        if self.prio:
            request.priority -= depth * self.prio
        if self.maxdepth and depth > self.maxdepth:
            log.spider_log("Ignoring link (depth > %d): %s"
                           % (self.maxdepth, request.url),
                           level=log.DEBUG, spider=spider)
            return False
        elif self.stats:
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/%s' % depth,
                                     spider=spider)
            self.stats.max_value('request_depth_max', depth, spider=spider)
    return True

def _check_limit(self):
    if self.get_virtual_size() > self.limit:
        self.crawler.stats.set_value('memusage/limit_reached', 1)
        mem = self.limit / 1024 / 1024
        log.spider_log("Memory usage exceeded %dM. Shutting down Scrapy..." % mem,
                       level=log.ERROR)
        if self.notify_mails:
            subj = "%s terminated: memory usage exceeded %dM at %s" % \
                   (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
            self._send_report(self.notify_mails, subj)
            self.crawler.stats.set_value('memusage/limit_notified', 1)
        open_spiders = self.crawler.engine.open_spiders
        if open_spiders:
            for spider in open_spiders:
                self.crawler.engine.close_spider(spider, 'memusage_exceeded')
        else:
            self.crawler.stop()

def _log_download_errors(spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated through here)
    """
    if isinstance(download_failure, Failure) \
            and not download_failure.check(IgnoreRequest):
        if download_failure.frames:
            log.spider_log('Error downloading %s' % request,
                           spider=spider, level=log.ERROR)
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                log.spider_log('Error downloading %s: %s' % (request.url, errmsg),
                               level=log.ERROR, spider=spider)
    if spider_failure is not download_failure:
        return spider_failure

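# Hedged, minimal check of the silencing rule above: when the spider-side
# failure *is* the download failure, the error is logged here and nothing is
# propagated; a distinct spider failure passes through untouched.
def _example_log_download_errors():
    from twisted.python.failure import Failure
    try:
        raise IOError("connection lost")
    except IOError:
        download_failure = Failure()
    # same Failure object on both sides: logged, then silenced (returns None)
    assert _log_download_errors(download_failure, download_failure,
                                request=None, spider=None) is None
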
def _itemproc_finished(self, output, item, request, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            log.spider_log('scrape error: %s: in: %s' % (ex, response.url),
                           spider=spider, level=log.ERROR)
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            log.spider_log('Error processing %s' % item,
                           spider=spider, level=log.ERROR)
            self.engine.send_request_result(request, ex)
    else:
        log.spider_log('scrape ok in: %s' % response.url, spider=spider)
        self.engine.send_request_result(request)
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)

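# Example (hypothetical item pipeline): the two outcomes handled by
# _itemproc_finished. Raising DropItem arrives as Failure(DropItem) and is
# logged as 'scrape error'; returning the item arrives as plain output and is
# logged as 'scrape ok'. The DropItem import path is an assumption.
class RequirePricePipeline(object):
    def process_item(self, item, spider):
        from scrapy.exceptions import DropItem  # assumed import path
        if not item.get('price'):
            raise DropItem("missing price in %s" % item)
        return item
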
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get(
            'redirect_urls', []) + [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.spider_log("Redirecting %s to %s from %s"
                       % (reason, redirected, request.url),
                       level=log.DEBUG, spider=spider)
        return redirected
    else:
        log.spider_log("Discarding %s: max redirections reached" % request.url,
                       level=log.DEBUG, spider=spider)
        raise IgnoreRequest("max redirections reached")

def _parse_sitemap(self, response):
    if response.url.endswith('/robots.txt'):
        for url in sitemap_urls_from_robots(response.body):
            yield Request(url, callback=self._parse_sitemap)
    else:
        body = self._get_sitemap_body(response)
        if body is None:
            log.spider_log("Ignoring invalid sitemap: " + response.url,
                           level=log.WARNING, spider=self)
            return
        s = Sitemap(body)
        if s.type == 'sitemapindex':
            for loc in iterloc(s, self.sitemap_alternate_links):
                if any(x.search(loc) for x in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif s.type == 'urlset':
            for loc in iterloc(s):
                for r, c in self._cbs:
                    if r.search(loc):
                        yield Request(loc, callback=c)
                        break

def _onsuccess(result):
    # request, info and path are expected from the enclosing scope
    if not result:
        return  # returning None forces download
    last_modified = result.get('last_modified', None)
    if not last_modified:
        return  # returning None forces download
    age_seconds = time.time() - last_modified
    age_days = age_seconds / 60 / 60 / 24
    if age_days > self.EXPIRES:
        return  # returning None forces download
    referer = request.headers.get('Referer')
    log.spider_log('File (uptodate): Downloaded %s from %s referred in <%s>'
                   % (self.MEDIA_NAME, request.url, referer),
                   level=log.DEBUG, spider=info.spider)
    self.inc_stats(info.spider, 'uptodate')
    checksum = result.get('checksum', None)
    return {'url': request.url, 'path': path, 'checksum': checksum}

def process_response(self, request, response, spider):
    if not isinstance(response, HtmlResponse) or response.status != 200:
        return response

    if request.method != 'GET':
        # other HTTP methods are either not safe or don't have a body
        return response

    if 'ajax_crawlable' in request.meta:  # prevent loops
        return response

    if not self._has_ajax_crawlable_variant(response):
        return response

    # ants already handles #! links properly
    ajax_crawl_request = request.replace(url=request.url + '#!')
    log.spider_log("Downloading AJAX crawlable %s instead of %s"
                   % (ajax_crawl_request, request.url),
                   level=log.DEBUG, spider=spider)
    ajax_crawl_request.meta['ajax_crawlable'] = True
    return ajax_crawl_request

def from_settings(cls, settings, crawler=None):
    mwlist = cls._get_mwlist_from_settings(settings)
    middlewares = []
    for clspath in mwlist:
        try:
            mwcls = load_object(clspath)
            if crawler and hasattr(mwcls, 'from_crawler'):
                mw = mwcls.from_crawler(crawler)
            elif hasattr(mwcls, 'from_settings'):
                mw = mwcls.from_settings(settings)
            else:
                mw = mwcls()
            middlewares.append(mw)
        except NotConfigured as e:
            if e.args:
                clsname = clspath.split('.')[-1]
                log.spider_log("Disabled %s: %s" % (clsname, e.args[0]),
                               level=log.WARNING)
    enabled = [x.__class__.__name__ for x in middlewares]
    log.spider_log("Enabled %s: %s" % (cls.component_name, ', '.join(enabled)),
                   level=log.INFO)
    return cls(*middlewares)