def _debug_cookie(self, request):
    if self.debug:
        cl = request.headers.getlist('Cookie')
        if cl:
            msg = 'Sending cookies to: %s' % request + os.linesep
            msg += os.linesep.join('Cookie: %s' % c for c in cl)
            log.msg(msg, level=log.DEBUG)

def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = attr_val
                    if isinstance(url, unicode):
                        try:
                            url = to_str(url, response_encoding)
                        except UnicodeEncodeError:
                            # fall back to utf-8 when the response encoding
                            # cannot represent the url
                            url = to_str(url, 'utf-8')
                    url = requote_url(url)
                    url = correct_relative_path(url)
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url, etype=type(e),
                        error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links

def start_listening(self):
    self.port = listen_tcp(self.portrange, self.host, self)
    h = self.port.getHost()
    log.msg(format='Telnet console listening on %(host)s:%(port)d',
            level=log.DEBUG, host=h.host, port=h.port)

def _get_middlewares(self, mw_classes):
    if mw_classes is None:
        mwlist = []
        for clspath in self._get_mwlist():
            mwlist.append(load_object(clspath))
    else:
        mwlist = mw_classes
    self.mwlist = mwlist

    middlewares = []
    for mwcls in mwlist:
        try:
            # middlewares disabled through enabled_setting
            if hasattr(mwcls, 'enabled_setting'):
                enabled_setting = mwcls.enabled_setting
            else:
                enabled_setting = ('%s_ENABLED' %
                                   camelcase_to_capital(mwcls.__name__))
            if not self.settings.get_bool(enabled_setting, True):
                raise NotConfigured()
            mw = mwcls(self.engine)
            mw.enabled_setting = enabled_setting
            middlewares.append(mw)
        except NotConfigured as e:
            log.msg(format='Disabled %(clsname)s: %(error)s',
                    level=log.DEBUG, clsname=mwcls, error=e)

    enabled = [x.__class__.__name__ for x in middlewares]
    log.msg(format='Enabled %(componentname)ss: %(enabledlist)s',
            level=log.DEBUG, componentname=self.component_name,
            enabledlist=', '.join(enabled))
    return middlewares

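# Illustrative sketch (assumption, not from the original source): the
# enabled_setting name above is derived from the middleware class name, so
# camelcase_to_capital() is assumed to turn CamelCase into UPPER_SNAKE_CASE.
# The helper and class name below are hypothetical.
import re

def camelcase_to_capital_sketch(name):
    # 'RetryMiddleware' -> 'RETRY_MIDDLEWARE'
    return re.sub(r'(?<!^)(?=[A-Z])', '_', name).upper()

assert ('%s_ENABLED' % camelcase_to_capital_sketch('RetryMiddleware') ==
        'RETRY_MIDDLEWARE_ENABLED')
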
def send(self, signal=Any, *args, **kwargs):
    sender = kwargs.pop('sender', self.sender)
    # exceptions listed in `dont_log` are captured as failures, but not logged
    dont_log = kwargs.pop('dont_log', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver, signal=signal, sender=sender,
                                   *args, **kwargs)
            if isinstance(response, defer.Deferred):
                log.msg(format='Cannot return deferreds from signal handler: %(receiver)s',
                        level=log.ERROR, receiver=receiver)
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, 'Error caught on signal handler: %s' % receiver)
        else:
            result = response
        responses.append((receiver, result))
    return responses

def _handle_pipeline_result(self, result):
    if result is None:
        pass
    elif isinstance(result, Request):
        self.download(result)
    else:
        assert isinstance(result, (Response, Failure))
        request = result.request
        if isinstance(result, Response):
            flags = ' %s' % result.flags if result.flags else ''
            log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                    level=log.DEBUG, url=result.url, status=result.status,
                    flags=flags)
            self.signals.send(signal=signals.response_received,
                              response=result)
        else:
            self.signals.send(signal=signals.failure_received,
                              failure=result)
        dfd = defer_result(result, clock=self.clock)
        dfd.addCallbacks(request.callback or self.spider.parse,
                         request.errback)
        dfd.addCallbacks(self._handle_spider_output,
                         self._handle_spider_error,
                         callbackKeywords={'request': request},
                         errbackKeywords={'request': request})
        return dfd

def _debug_set_cookie(self, response):
    if self.debug:
        cl = response.headers.getlist('Set-Cookie')
        if cl:
            msg = 'Received cookies from: %s' % response + os.linesep
            msg += os.linesep.join('Set-Cookie: %s' % c for c in cl)
            log.msg(msg, level=log.DEBUG)

def _stop(_):
    # nested shutdown callback: `self` and `reason` come from the
    # enclosing scope
    self.processing.cancel()
    self.downloader.close()
    self.request_queue.close()
    self.response_queue.close()
    log.msg(format='Engine stopped (%(reason)s)', reason=reason)
    self.signals.send(signal=signals.engine_stopped, reason=reason)
    self.stats.dump_stats()

def log(self):
    downloaded_speed = \
        (self.downloaded - self.downloaded_prev) * self.multiplier
    self.downloaded_prev = self.downloaded
    log.msg(format='Crawled %(down)d pages (at %(speed)d pages/min).',
            level=log.INFO, down=self.downloaded, speed=downloaded_speed)

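# Worked example (assumption, not from the original source): if log() runs on
# a fixed interval, self.multiplier is assumed to be 60.0 / interval so the
# per-interval delta is reported in pages per minute. The interval value below
# is hypothetical.
interval = 15.0                      # seconds between log() calls
multiplier = 60.0 / interval         # -> 4.0
assert 25 * multiplier == 100.0      # 25 pages in 15s ~ 100 pages/min
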
def _redirect(self, redirected, request, reason):
    if len(redirected.history) < self.max_redirect_times:
        redirected.history.append(request.url)
        redirected.priority = request.priority + self.priority_adjust
        log.msg(format='Redirecting (%(reason)s) to %(redirected)s from %(request)s',
                level=log.DEBUG, request=request, redirected=redirected,
                reason=reason)
        return redirected
    else:
        log.msg(format='Discarding %(request)s: max redirections reached',
                level=log.DEBUG, request=request)
        return None

def _parse_sitemap(self, response):
    requests = []
    if response.url.endswith('/robots.txt'):
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
    else:
        sitemap_body = get_sitemap_body(response)
        if sitemap_body is None:
            log.msg(format='Invalid sitemap %(url)s', level=log.WARNING,
                    url=response.url)
            return []
        sitemap_type = get_sitemap_type(sitemap_body)
        if sitemap_type == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=response.url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(sitemap_body))
        elif sitemap_type == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=response.url)
            self._site_urls.extend(iter_urls_from_sitemap(sitemap_body))
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=response.url, stype=sitemap_type)
    return requests

def create_spider_by_url(self, url, default_spider=None, spargs={}):
    spiders = self.get_spiders_by_url(url)
    if len(spiders) == 1:
        return self.create_spider_by_name(spiders[0])
    elif len(spiders) > 1:
        log.msg(format='More than one spider can handle: %(url)s - %(snames)s',
                level=log.ERROR, url=url, snames=', '.join(spiders))
    elif len(spiders) == 0:
        log.msg(format='Unable to find spider that handles: %(url)s',
                level=log.ERROR, url=url)
    return default_spider

def _retry(self, request, reason):
    retries = request.meta.get('retry_times', 0) + 1
    if retries <= self.max_retry_times:
        log.msg(format='Retrying %(request)s (failed %(retries)d times): %(reason)s',
                level=log.DEBUG, request=request, retries=retries,
                reason=reason)
        retry_req = request.copy()
        retry_req.meta['retry_times'] = retries
        retry_req.meta['DUPLICATE_FILTER_ENABLED'] = False
        retry_req.priority = request.priority + self.priority_adjust
        return retry_req
    else:
        log.msg(format='Gave up retrying %(request)s (failed %(retries)d times): %(reason)s',
                level=log.DEBUG, request=request, retries=retries,
                reason=reason)

def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = requote_url(to_str(to_unicode(attr_val, 'utf-8'),
                                             response_encoding))
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url, etype=type(e),
                        error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links

def _extract_links_from_html(self, html, response_encoding):
    links = []
    for el, attr, attr_val, pos in html.iterlinks():
        if self.tag_func(el.tag):
            if self.attr_func(attr):
                try:
                    url = requote_url(to_str(to_unicode(attr_val, 'utf-8'),
                                             response_encoding))
                    url = correct_relative_path(url)
                    text = el.text or u''
                    text = to_unicode(text, 'utf-8')
                    nofollow = (el.attrib.get('rel') == 'nofollow')
                except Exception as e:
                    log.msg(
                        format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                        level=log.WARNING, url=html.base_url, etype=type(e),
                        error=e)
                else:
                    links.append(Link(url=url, text=text, nofollow=nofollow))
    return links

def new_tor_identity(self):
    '''Set a new tor identity through the tor control port.'''
    if self.tor_connection is None or self.tor_password is None:
        log.msg(format='Unable to set new tor identity.',
                level=log.WARNING)
        return
    s = socket.socket()
    s.connect(self.tor_connection)
    s.send('AUTHENTICATE "%s"\r\n' % self.tor_password)
    resp = s.recv(1024)
    if resp.startswith('250'):
        s.send('signal NEWNYM\r\n')
        resp = s.recv(1024)
        if resp.startswith('250'):
            log.msg(format='New tor identity set', level=log.INFO)
        else:
            log.msg(format='Error requesting new tor identity (NEWNYM): %(resp)s',
                    level=log.WARNING, resp=resp)
    else:
        log.msg(format='Error authenticating to tor control port: %(resp)s',
                level=log.WARNING, resp=resp)

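# Usage sketch (assumptions, not from the original source): tor_connection is
# taken to be a (host, port) tuple for Tor's control port and tor_password the
# password configured via HashedControlPassword in torrc. The middleware class
# name and the values below are hypothetical examples.
#
#   mw = TorProxyMiddleware()
#   mw.tor_connection = ('127.0.0.1', 9051)
#   mw.tor_password = 'control-password'
#   mw.new_tor_identity()   # sends AUTHENTICATE followed by "signal NEWNYM"
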
def _signal_shutdown(self, signum, _):
    install_shutdown_handlers(self._signal_kill)
    signame = signal_names[signum]
    log.msg(format='Received %(signame)s, shutting down gracefully. Send again to force.',
            level=log.INFO, signame=signame)
    reactor.callFromThread(self.stop)

def process_request(self, request):
    if request.proxy is None:
        request.proxy = self.tor_proxy
        log.msg(format='Using tor for request %(request)s',
                level=log.DEBUG, request=request)
    return request

def test_msg_ignore_level(self):
    log.msg('Hello', level=log.DEBUG)
    log.msg('World', level=log.INFO)
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: World')

def test_msg_encoding(self):
    # u'\xa3' (pound sign) is expected to be written out as its utf-8
    # encoding '\xc2\xa3'
    log.msg(u'Price: \xa3100')
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: Price: \xc2\xa3100')

def test_msg_wrong_level(self):
    log.msg('Hello', level=9999)
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] NOLEVEL: Hello')

def test_msg_level2(self):
    log.msg('Hello', log.WARNING)
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] WARNING: Hello')

def test_format(self):
    log.msg(format='%(hi)s', hi='Hello')
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: Hello')

def test_msg_basic(self):
    log.msg('Hello')
    self.assertEqual(self.lw.get_first_line(),
                     '[crawlmi] INFO: Hello')

def dump_stats(self):
    if self._dump:
        log.msg('Dumping crawlmi stats:\n' + pprint.pformat(self._stats))

def _signal_kill(self, signum, _):
    install_shutdown_handlers(signal.SIG_IGN)
    signame = signal_names[signum]
    log.msg(format='Received %(signame)s twice, forcing unclean shutdown.',
            level=log.INFO, signame=signame)
    reactor.callFromThread(self._stop_reactor)