Exemple #1
0
 def _debug_cookie(self, request):
     if self.debug:
         cl = request.headers.getlist('Cookie')
         if cl:
             msg = 'Sending cookies to: %s' % request + os.linesep
             msg += os.linesep.join('Cookie: %s' % c for c in cl)
             log.msg(msg, level=log.DEBUG)
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for el, attr, attr_val, pos in html.iterlinks():
         if self.tag_func(el.tag):
             if self.attr_func(attr):
                 try:
                     url = attr_val
                     if isinstance(url, unicode):
                         try:
                             url = to_str(url, response_encoding)
                         except UnicodeEncodeError:
                             # fallback
                             url = to_str(url, 'utf-8')
                     url = requote_url(url)
                     url = correct_relative_path(url)
                     text = el.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (el.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url, etype=type(e),
                         error=e)
                 else:
                     links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Exemple #3
0
 def start_listening(self):
     """Open the listening port and log the address it was bound to.

     The port returned by ``listen_tcp`` is stored on ``self.port``.
     """
     self.port = listen_tcp(self.portrange, self.host, self)
     address = self.port.getHost()
     log.msg(
         format='Telnet console listening on %(host)s:%(port)d',
         level=log.DEBUG,
         host=address.host,
         port=address.port)
Exemple #4
0
    def _get_middlewares(self, mw_classes):
        """Instantiate the enabled middlewares and return them as a list.

        When ``mw_classes`` is None the classes are loaded from the paths
        returned by ``self._get_mwlist()``; otherwise ``mw_classes`` is used
        as given. The resulting class list is stored on ``self.mwlist``.

        A middleware is skipped (and logged at DEBUG level) when its
        enabling setting is false or when any step of its setup raises
        ``NotConfigured``.

        :param mw_classes: iterable of middleware classes, or None
        :return: list of middleware instances, in class-list order
        """
        if mw_classes is None:
            mwlist = [load_object(path) for path in self._get_mwlist()]
        else:
            mwlist = mw_classes
        self.mwlist = mwlist

        middlewares = []
        for mwcls in mwlist:
            try:
                # A class may name its own on/off setting; otherwise the
                # name is derived from the class name.
                if hasattr(mwcls, 'enabled_setting'):
                    setting_name = mwcls.enabled_setting
                else:
                    setting_name = '%s_ENABLED' % camelcase_to_capital(
                        mwcls.__name__)
                if not self.settings.get_bool(setting_name, True):
                    raise NotConfigured()

                instance = mwcls(self.engine)
                instance.enabled_setting = setting_name
                middlewares.append(instance)
            except NotConfigured as exc:
                log.msg(format='Disabled %(clsname)s: %(error)s',
                        level=log.DEBUG, clsname=mwcls, error=exc)

        enabled_names = ', '.join(
            instance.__class__.__name__ for instance in middlewares)
        log.msg(format='Enabled %(componentname)ss: %(enabledlist)s',
                level=log.DEBUG,
                componentname=self.component_name,
                enabledlist=enabled_names)
        return middlewares
 def send(self, signal=Any, *args, **kwargs):
     """Call every live receiver of ``signal`` and collect the outcomes.

     Two keyword arguments are consumed here and not forwarded:
     ``sender`` (defaults to ``self.sender``) and ``dont_log`` (exception
     class(es) that are captured as a ``Failure`` without being logged).

     A receiver that raises any other ``Exception`` is logged and its
     ``Failure`` is recorded as its outcome. A receiver that returns a
     Deferred triggers an ERROR log, but the Deferred is still recorded.

     :return: list of ``(receiver, result_or_Failure)`` tuples
     """
     sender = kwargs.pop('sender', self.sender)
     dont_log = kwargs.pop('dont_log', None)
     outcomes = []
     for handler in liveReceivers(getAllReceivers(sender, signal)):
         try:
             returned = robustApply(handler, signal=signal, sender=sender,
                                    *args, **kwargs)
             if isinstance(returned, defer.Deferred):
                 log.msg(
                     format='Cannot return deferreds from signal handler: %(receiver)s',
                     level=log.ERROR,
                     receiver=handler)
         except dont_log:
             # Expected failure: capture it silently.
             outcome = Failure()
         except Exception:
             outcome = Failure()
             log.err(outcome,
                     'Error caught on signal handler: %s' % handler)
         else:
             outcome = returned
         outcomes.append((handler, outcome))
     return outcomes
Exemple #6
0
 def _handle_pipeline_result(self, result):
     if result is None:
         pass
     elif isinstance(result, Request):
         self.download(result)
     else:
         assert isinstance(result, (Response, Failure))
         request = result.request
         if isinstance(result, Response):
             flags = ' %s' % result.flags if result.flags else ''
             log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                     level=log.DEBUG,
                     url=result.url,
                     status=result.status,
                     flags=flags)
             self.signals.send(signal=signals.response_received,
                               response=result)
         else:
             self.signals.send(signal=signals.failure_received,
                               failure=result)
         dfd = defer_result(result, clock=self.clock)
         dfd.addCallbacks(request.callback or self.spider.parse,
                          request.errback)
         dfd.addCallbacks(self._handle_spider_output,
                          self._handle_spider_error,
                          callbackKeywords={'request': request},
                          errbackKeywords={'request': request})
         return dfd
Exemple #7
0
 def _handle_pipeline_result(self, result):
     if result is None:
         pass
     elif isinstance(result, Request):
         self.download(result)
     else:
         assert isinstance(result, (Response, Failure))
         request = result.request
         if isinstance(result, Response):
             flags = ' %s' % result.flags if result.flags else ''
             log.msg(format='Crawled %(url)s [%(status)s]%(flags)s',
                     level=log.DEBUG, url=result.url, status=result.status,
                     flags=flags)
             self.signals.send(signal=signals.response_received,
                               response=result)
         else:
             self.signals.send(signal=signals.failure_received,
                               failure=result)
         dfd = defer_result(result, clock=self.clock)
         dfd.addCallbacks(request.callback or self.spider.parse,
                          request.errback)
         dfd.addCallbacks(
             self._handle_spider_output,
             self._handle_spider_error,
             callbackKeywords={'request': request},
             errbackKeywords={'request': request})
         return dfd
Exemple #8
0
 def _debug_set_cookie(self, response):
     if self.debug:
         cl = response.headers.getlist('Set-Cookie')
         if cl:
             msg = 'Received cookies from: %s' % response + os.linesep
             msg += os.linesep.join('Set-Cookie: %s' % c for c in cl)
             log.msg(msg, level=log.DEBUG)
Exemple #9
0
 def _stop(_):
     """Tear the engine down and report that it has stopped.

     ``self`` and ``reason`` are free variables taken from the enclosing
     scope, which is not visible here. The single positional argument is
     ignored (presumably the result of a preceding deferred - confirm
     against the caller).
     """
     # Cancel processing first, then close the downloader and both queues.
     self.processing.cancel()
     self.downloader.close()
     self.request_queue.close()
     self.response_queue.close()
     log.msg(format='Engine stopped (%(reason)s)', reason=reason)
     # Notify listeners before the final stats dump.
     self.signals.send(signal=signals.engine_stopped, reason=reason)
     self.stats.dump_stats()
Exemple #10
0
 def _stop(_):
     """Run the engine shutdown sequence.

     Uses ``self`` and ``reason`` from the enclosing scope (not visible
     here); the positional argument is unused.
     """
     # Stop processing, then release the downloader and the two queues.
     self.processing.cancel()
     self.downloader.close()
     self.request_queue.close()
     self.response_queue.close()
     log.msg(format='Engine stopped (%(reason)s)', reason=reason)
     # The engine_stopped signal is sent before the stats are dumped.
     self.signals.send(signal=signals.engine_stopped, reason=reason)
     self.stats.dump_stats()
Exemple #11
0
 def log(self):
     """Log the total page count and the crawl speed since the last call.

     The speed is the number of pages downloaded since the previous call
     scaled by ``self.multiplier``; ``self.downloaded_prev`` is then
     updated to the current total.
     """
     pages_since_last = self.downloaded - self.downloaded_prev
     speed = pages_since_last * self.multiplier
     self.downloaded_prev = self.downloaded
     log.msg(
         format='Crawled %(down)d pages (at %(speed)d pages/min).',
         level=log.INFO, down=self.downloaded, speed=speed)
Exemple #12
0
 def _redirect(self, redirected, request, reason):
     """Return ``redirected`` if the redirect limit allows it, else None.

     When accepted, the originating url is appended to
     ``redirected.history`` and the priority is set to the original
     request's priority plus ``self.priority_adjust``.

     :param redirected: the request to follow
     :param request: the request that was redirected
     :param reason: value interpolated into the log message
     """
     if not (len(redirected.history) < self.max_redirect_times):
         log.msg(format='Discarding %(request)s: max redirections reached',
                 level=log.DEBUG, request=request)
         return None

     redirected.history.append(request.url)
     redirected.priority = request.priority + self.priority_adjust
     log.msg(
         format='Redirecting (%(reason)s) to %(redirected)s from %(request)s',
         level=log.DEBUG,
         request=request,
         redirected=redirected,
         reason=reason)
     return redirected
Exemple #13
0
    def _parse_sitemap(self, response):
        """Collect urls from a robots.txt or sitemap response.

        Urls are appended to ``self._sitemap_urls`` (robots.txt entries and
        sitemap-index entries) or ``self._site_urls`` (urlset entries).
        Invalid or unrecognized sitemaps are logged at WARNING level.

        :return: a list that is always empty in this implementation
        """
        requests = []

        if response.url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return requests

        body = get_sitemap_body(response)
        if body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING, url=response.url)
            return []

        kind = get_sitemap_type(body)
        if kind == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=response.url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(body))
        elif kind == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=response.url)
            self._site_urls.extend(iter_urls_from_sitemap(body))
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=response.url, stype=kind)
        return requests
Exemple #14
0
    def _parse_sitemap(self, response):
        """Harvest urls from a robots.txt file or a sitemap document.

        robots.txt entries and ``sitemapindex`` entries go to
        ``self._sitemap_urls``; ``urlset`` entries go to
        ``self._site_urls``. A sitemap without a usable body or of an
        unknown type is reported at WARNING level.

        :return: an empty list (no requests are created here)
        """
        requests = []
        url = response.url

        if url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return requests

        sitemap_body = get_sitemap_body(response)
        if sitemap_body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING,
                    url=response.url)
            return []

        sitemap_type = get_sitemap_type(sitemap_body)
        if sitemap_type == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG,
                    url=response.url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(sitemap_body))
            return requests
        if sitemap_type == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG,
                    url=response.url)
            self._site_urls.extend(iter_urls_from_sitemap(sitemap_body))
            return requests

        log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                level=log.WARNING,
                url=response.url,
                stype=sitemap_type)
        return requests
Exemple #15
0
    def create_spider_by_url(self, url, default_spider=None, spargs={}):
        """Create the spider that handles ``url``, or fall back to a default.

        A spider is created only when exactly one spider name matches the
        url. Zero or multiple matches are logged at ERROR level and
        ``default_spider`` is returned instead.

        NOTE(review): ``spargs`` is accepted but never used in this body.

        :param url: url to look a spider up for
        :param default_spider: value returned when no unique match exists
        :return: the created spider or ``default_spider``
        """
        spiders = self.get_spiders_by_url(url)
        matches = len(spiders)

        if matches == 1:
            return self.create_spider_by_name(spiders[0])

        if matches > 1:
            log.msg(
                format="More than one spider can handle: %(url)s - %(snames)s",
                level=log.ERROR,
                url=url,
                snames=", ".join(spiders),
            )
        else:
            log.msg(
                format="Unable to find spider that handles: %(url)s",
                level=log.ERROR,
                url=url,
            )
        return default_spider
Exemple #16
0
    def _retry(self, request, reason):
        """Return a copy of ``request`` to retry, or None when giving up.

        The attempt number is ``request.meta['retry_times']`` (0 when
        absent) plus one. While it does not exceed ``self.max_retry_times``
        a copy is returned with the counter stored in its meta, the
        duplicate filter disabled and the priority adjusted by
        ``self.priority_adjust``.

        :param request: the failed request
        :param reason: value interpolated into the log message
        """
        retries = request.meta.get('retry_times', 0) + 1

        if not (retries <= self.max_retry_times):
            log.msg(format='Gave up retrying %(request)s (failed %(retries)d times): %(reason)s',
                    level=log.DEBUG, request=request, retries=retries,
                    reason=reason)
            return None

        log.msg(format='Retrying %(request)s (failed %(retries)d times): %(reason)s',
                level=log.DEBUG, request=request, retries=retries,
                reason=reason)
        retry_req = request.copy()
        retry_req.meta['retry_times'] = retries
        # The retried request must not be dropped as a duplicate.
        retry_req.meta['DUPLICATE_FILTER_ENABLED'] = False
        retry_req.priority = request.priority + self.priority_adjust
        return retry_req
    def create_spider_by_url(self, url, default_spider=None, spargs={}):
        """Return a spider for ``url`` when exactly one spider handles it.

        With no match or several matches an ERROR is logged and
        ``default_spider`` is returned.

        NOTE(review): ``spargs`` is not used anywhere in this body.

        :param url: url to look a spider up for
        :param default_spider: fallback return value
        :return: the created spider or ``default_spider``
        """
        candidates = self.get_spiders_by_url(url)
        count = len(candidates)

        if count == 1:
            return self.create_spider_by_name(candidates[0])

        if count > 1:
            log.msg(
                format='More than one spider can handle: %(url)s - %(snames)s',
                level=log.ERROR,
                url=url,
                snames=', '.join(candidates))
        else:
            log.msg(format='Unable to find spider that handles: %(url)s',
                    level=log.ERROR,
                    url=url)
        return default_spider
Exemple #18
0
 def _redirect(self, redirected, request, reason):
     """Follow a redirect unless the redirect limit has been reached.

     If ``redirected.history`` is shorter than ``self.max_redirect_times``
     the source url is recorded in the history, the priority is derived
     from the original request plus ``self.priority_adjust`` and
     ``redirected`` is returned. Otherwise the request is discarded.

     :return: ``redirected`` or None
     """
     limit_reached = not (
         len(redirected.history) < self.max_redirect_times)
     if limit_reached:
         log.msg(format='Discarding %(request)s: max redirections reached',
                 level=log.DEBUG,
                 request=request)
         return None

     redirected.history.append(request.url)
     redirected.priority = request.priority + self.priority_adjust
     log.msg(
         format='Redirecting (%(reason)s) to %(redirected)s from %(request)s',
         level=log.DEBUG, request=request, redirected=redirected,
         reason=reason)
     return redirected
Exemple #19
0
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for e, a, l, p in html.iterlinks():
         if self.tag_func(e.tag):
             if self.attr_func(a):
                 try:
                     url = requote_url(to_str(to_unicode(l, 'utf-8'), response_encoding))
                     text = e.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (e.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format='Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING, url=html.base_url, etype=type(e),
                         error=e)
                 else:
                     links.append(Link(url=url, text=text, nofollow=nofollow))
     return links
Exemple #20
0
 def send(self, signal=Any, *args, **kwargs):
     """Deliver ``signal`` to its live receivers and gather their results.

     ``sender`` (default ``self.sender``) and ``dont_log`` are popped from
     ``kwargs`` before the remaining arguments are passed to each receiver
     through ``robustApply``. Exceptions matching ``dont_log`` are
     captured without logging; any other ``Exception`` is captured and
     logged. Returning a Deferred from a receiver is reported at ERROR
     level, though the value is still recorded.

     :return: list of ``(receiver, result_or_Failure)`` tuples
     """
     sender = kwargs.pop('sender', self.sender)
     dont_log = kwargs.pop('dont_log', None)
     collected = []
     live = liveReceivers(getAllReceivers(sender, signal))
     for callee in live:
         try:
             value = robustApply(callee, signal=signal, sender=sender,
                                 *args, **kwargs)
             if isinstance(value, defer.Deferred):
                 log.msg(format='Cannot return deferreds from signal handler: %(receiver)s',
                         level=log.ERROR, receiver=callee)
         except dont_log:
             result = Failure()
         except Exception:
             result = Failure()
             log.err(result, 'Error caught on signal handler: %s' % callee)
         else:
             result = value
         collected.append((callee, result))
     return collected
Exemple #21
0
 def _extract_links_from_html(self, html, response_encoding):
     links = []
     for e, a, l, p in html.iterlinks():
         if self.tag_func(e.tag):
             if self.attr_func(a):
                 try:
                     url = requote_url(
                         to_str(to_unicode(l, 'utf-8'), response_encoding))
                     url = correct_relative_path(url)
                     text = e.text or u''
                     text = to_unicode(text, 'utf-8')
                     nofollow = (e.attrib.get('rel') == 'nofollow')
                 except Exception as e:
                     log.msg(
                         format=
                         'Error occurred while extracting links from %(url)s. Error (%(etype)s): %(error)s',
                         level=log.WARNING,
                         url=html.base_url,
                         etype=type(e),
                         error=e)
                 else:
                     links.append(
                         Link(url=url, text=text, nofollow=nofollow))
     return links
Exemple #22
0
    def _retry(self, request, reason):
        """Build the retry request for ``request``, or give up.

        The retry counter is read from ``request.meta['retry_times']``
        (default 0) and incremented. Within ``self.max_retry_times`` a
        copy of the request is returned carrying the new counter, with the
        duplicate filter switched off and the priority shifted by
        ``self.priority_adjust``. Beyond the limit only a log entry is
        written and None is returned.

        :param request: the failed request
        :param reason: value interpolated into the log message
        """
        retries = request.meta.get('retry_times', 0) + 1
        may_retry = retries <= self.max_retry_times

        if not may_retry:
            log.msg(
                format='Gave up retrying %(request)s (failed %(retries)d times): %(reason)s',
                level=log.DEBUG, request=request, retries=retries,
                reason=reason)
            return None

        log.msg(
            format='Retrying %(request)s (failed %(retries)d times): %(reason)s',
            level=log.DEBUG, request=request, retries=retries,
            reason=reason)
        retry_req = request.copy()
        retry_req.meta['retry_times'] = retries
        retry_req.meta['DUPLICATE_FILTER_ENABLED'] = False
        retry_req.priority = request.priority + self.priority_adjust
        return retry_req
Exemple #23
0
    def new_tor_identity(self):
        '''Sets new tor identity.

        Connects to ``self.tor_connection``, authenticates with
        ``self.tor_password`` and sends the ``NEWNYM`` signal. A reply is
        treated as success when it starts with ``250``. The outcome is
        only logged; nothing is returned.

        The socket is always closed, including when connecting, sending
        or receiving raises (those exceptions still propagate).
        '''
        if self.tor_connection is None or self.tor_password is None:
            log.msg(format='Unable to set new tor identity.',
                    level=log.WARNING)
            return

        s = socket.socket()
        try:
            s.connect(self.tor_connection)
            s.send('AUTHENTICATE "%s"\r\n' % self.tor_password)
            resp = s.recv(1024)
            if resp.startswith('250'):
                s.send('signal NEWNYM\r\n')
                resp = s.recv(1024)
                if resp.startswith('250'):
                    log.msg(format='New tor identity set', level=log.INFO)
                else:
                    log.msg(format='Error 1 when setting new tor identity: %(resp)s',
                            level=log.WARNING, resp=resp)
            else:
                log.msg(format='Error 2 when setting new tor identity: %(resp)s',
                        level=log.WARNING, resp=resp)
        finally:
            # Previously the socket was never closed, leaking a descriptor
            # per call.
            s.close()
Exemple #24
0
 def _signal_shutdown(self, signum, _):
     """Handle the first shutdown signal by stopping gracefully.

     ``self._signal_kill`` is installed as the new handler before
     ``self.stop`` is scheduled on the reactor thread. The second
     positional argument is ignored.
     """
     install_shutdown_handlers(self._signal_kill)
     log.msg(
         format='Received %(signame)s, shutting down gracefully. Send again to force.',
         level=log.INFO,
         signame=signal_names[signum])
     reactor.callFromThread(self.stop)
Exemple #25
0
 def start_listening(self):
     """Bind the listening port, keep it on ``self.port`` and log it."""
     self.port = listen_tcp(self.portrange, self.host, self)
     bound = self.port.getHost()
     log.msg(format='Telnet console listening on %(host)s:%(port)d',
             level=log.DEBUG,
             host=bound.host,
             port=bound.port)
Exemple #26
0
 def log(self):
     """Report the page total and the speed since the previous report.

     Speed is the growth of ``self.downloaded`` since the last call
     multiplied by ``self.multiplier``. ``self.downloaded_prev`` is
     refreshed before logging.
     """
     delta = self.downloaded - self.downloaded_prev
     downloaded_speed = delta * self.multiplier
     self.downloaded_prev = self.downloaded
     log.msg(format='Crawled %(down)d pages (at %(speed)d pages/min).',
             level=log.INFO,
             down=self.downloaded,
             speed=downloaded_speed)
Exemple #27
0
 def process_request(self, request):
     """Assign ``self.tor_proxy`` to requests that have no proxy set.

     Requests that already have a proxy are returned untouched.

     :return: the same ``request`` object
     """
     if request.proxy is not None:
         return request
     request.proxy = self.tor_proxy
     log.msg(format='Using tor for request %(request)s',
             level=log.DEBUG,
             request=request)
     return request
Exemple #28
0
 def test_msg_ignore_level(self):
     """The first captured line is the INFO message, not the DEBUG one."""
     log.msg('Hello', level=log.DEBUG)
     log.msg('World', level=log.INFO)
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] INFO: World')
Exemple #29
0
 def test_msg_encoding(self):
     """A unicode message is captured as the expected encoded byte string."""
     log.msg(u'Price: \xa3100')
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] INFO: Price: \xc2\xa3100')
Exemple #30
0
 def test_msg_wrong_level(self):
     """An unknown numeric level is rendered as ``NOLEVEL``."""
     log.msg('Hello', level=9999)
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] NOLEVEL: Hello')
Exemple #31
0
 def test_msg_level2(self):
     """The level may be passed as the second positional argument."""
     log.msg('Hello', log.WARNING)
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] WARNING: Hello')
Exemple #32
0
 def test_format(self):
     """A ``format`` string is interpolated with the keyword arguments."""
     log.msg(format='%(hi)s', hi='Hello')
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] INFO: Hello')
Exemple #33
0
 def test_msg_basic(self):
     """A plain message with no explicit level is captured as INFO."""
     log.msg('Hello')
     first_line = self.lw.get_first_line()
     self.assertEqual(first_line, '[crawlmi] INFO: Hello')
Exemple #34
0
 def dump_stats(self):
     """Log a pretty-printed dump of ``self._stats`` if ``self._dump`` is set."""
     if not self._dump:
         return
     formatted = pprint.pformat(self._stats)
     log.msg('Dumping crawlmi stats:\n' + formatted)
Exemple #35
0
 def _signal_kill(self, signum, _):
     """Handle a repeated shutdown signal by forcing the reactor to stop.

     Further signals are ignored (``signal.SIG_IGN`` is installed) before
     ``self._stop_reactor`` is scheduled on the reactor thread. The second
     positional argument is ignored.
     """
     install_shutdown_handlers(signal.SIG_IGN)
     log.msg(
         format='Received %(signame)s twice, forcing unclean shutdown.',
         level=log.INFO,
         signame=signal_names[signum])
     reactor.callFromThread(self._stop_reactor)
Exemple #36
0
 def dump_stats(self):
     """Write the collected stats to the log when dumping is enabled.

     Does nothing when ``self._dump`` is falsy.
     """
     if not self._dump:
         return
     report = 'Dumping crawlmi stats:\n' + pprint.pformat(self._stats)
     log.msg(report)