Ejemplo n.º 1
0
 def process_request(self, request, spider):
     """Drop requests that the site's robots.txt rules forbid.

     Raises IgnoreRequest when the parsed robots rules disallow the
     configured user agent from fetching ``request.url``.
     """
     parser = self.robot_parser(request, spider)
     if not parser:
         return
     if parser.can_fetch(self._useragent, request.url):
         return
     log.msg(format="Forbidden by robots.txt: %(request)s",
             level=log.DEBUG, request=request)
     raise IgnoreRequest
Ejemplo n.º 2
0
 def _debug_set_cookie(self, response, spider):
     """In debug mode, log every Set-Cookie header received in *response*."""
     if not self.debug:
         return
     cookies = response.headers.getlist('Set-Cookie')
     if not cookies:
         return
     lines = ["Received cookies from: %s" % response]
     lines.extend("Set-Cookie: %s" % c for c in cookies)
     log.msg(os.linesep.join(lines), spider=spider, level=log.DEBUG)
Ejemplo n.º 3
0
 def _debug_cookie(self, request, spider):
     """In debug mode, log every Cookie header about to be sent with *request*."""
     if not self.debug:
         return
     cookies = request.headers.getlist('Cookie')
     if not cookies:
         return
     lines = ["Sending cookies to: %s" % request]
     lines.extend("Cookie: %s" % c for c in cookies)
     log.msg(os.linesep.join(lines), spider=spider, level=log.DEBUG)
Ejemplo n.º 4
0
    def _retry(self, request, reason, spider):
        """Return a retry copy of *request*, or None once retries are exhausted.

        The retry count is tracked in ``request.meta['retry_times']``; the
        copy bypasses the dupe filter and has its priority adjusted.
        """
        attempt = request.meta.get('retry_times', 0) + 1

        if attempt > self.max_retry_times:
            log.msg(format="Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                    level=log.DEBUG, spider=spider, request=request,
                    retries=attempt, reason=reason)
            return

        log.msg(format="Retrying %(request)s (failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=attempt, reason=reason)
        retry = request.copy()
        retry.meta['retry_times'] = attempt
        retry.dont_filter = True
        retry.priority = request.priority + self.priority_adjust
        return retry
Ejemplo n.º 5
0
    def _redirect(self, redirected, request, spider, reason):
        """Propagate redirect bookkeeping onto *redirected*, or give up.

        Tracks hop count, remaining TTL and the URL trail in ``meta``;
        raises IgnoreRequest once the configured limit is exceeded.
        """
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        hops = request.meta.get('redirect_times', 0) + 1

        if not ttl or hops > self.max_redirect_times:
            log.msg(format="Discarding %(request)s: max redirections reached",
                    level=log.DEBUG, spider=spider, request=request)
            raise IgnoreRequest("max redirections reached")

        redirected.meta['redirect_times'] = hops
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = \
            request.meta.get('redirect_urls', []) + [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.msg(format="Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                level=log.DEBUG, spider=spider, request=request,
                redirected=redirected, reason=reason)
        return redirected
Ejemplo n.º 6
0
    def from_settings(cls, settings, signals=None, stats=None):
        """Instantiate the manager with every configured, enabled middleware.

        Middlewares raising NotConfigured are skipped; those that supply a
        reason in ``args`` are reported at WARNING level.
        """
        enabled = []
        for clspath in cls._get_mwlist_from_settings(settings):
            try:
                mwcls = load_object(clspath)
                if hasattr(mwcls, 'from_settings'):
                    enabled.append(mwcls.from_settings(settings, signals, stats))
                else:
                    enabled.append(mwcls())
            except NotConfigured as e:
                # Only report middlewares that explain why they are disabled.
                if e.args:
                    log.msg(format="Disabled %(clsname)s: %(eargs)s",
                            level=log.WARNING,
                            clsname=clspath.split('.')[-1], eargs=e.args[0])

        names = ', '.join(mw.__class__.__name__ for mw in enabled)
        log.msg(format="Enabled %(componentname)ss: %(enabledlist)s", level=log.INFO,
                componentname=cls.component_name, enabledlist=names)
        return cls(*enabled)
Ejemplo n.º 7
0
    def process_response(self, request, response, spider):
        """Swap a GET request for its '#!' AJAX-crawlable variant when offered.

        Leaves the response untouched unless it is a 200 HTML page, the
        request was a plain GET, we are not already in an AJAX-crawl loop,
        and the page advertises an AJAX-crawlable variant.
        """
        eligible = (
            isinstance(response, HtmlResponse)
            and response.status == 200
            # other HTTP methods are either not safe or don't have a body
            and request.method == 'GET'
            # prevent loops
            and 'ajax_crawlable' not in request.meta
            and self._has_ajax_crawlable_variant(response)
        )
        if not eligible:
            return response

        # scrapy already handles #! links properly
        ajax_request = request.replace(url=request.url + '#!')
        log.msg(format="Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                level=log.DEBUG, spider=spider,
                ajax_crawl_request=ajax_request, request=request)
        ajax_request.meta['ajax_crawlable'] = True
        return ajax_request
Ejemplo n.º 8
0
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.

    Extra keyword arguments consumed from ``named``:
        dont_log -- exception class (or tuple of classes) to swallow without
                    logging; the receiver still yields a Failure result.
        spider   -- passed through for log context (not popped; receivers
                    also see it).

    Returns a list of ``(receiver, result)`` pairs where ``result`` is the
    receiver's return value or a twisted Failure.
    """
    # Normalize to a (possibly empty) tuple: ``except ():`` legally catches
    # nothing, whereas the previous ``except None:`` raised a TypeError at
    # catch time and masked the receiver's real error.
    dont_log = named.pop('dont_log', None) or ()
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver, signal=signal, sender=sender,
                *arguments, **named)
            if isinstance(response, Deferred):
                # Plain send_catch_log cannot wait on a Deferred result.
                log.msg(format="Cannot return deferreds from signal handler: %(receiver)s",
                        level=log.ERROR, spider=spider, receiver=receiver)
        except dont_log:
            # Caller explicitly asked for these exceptions not to be logged.
            result = Failure()
        except Exception:
            result = Failure()
            log.err(result, "Error caught on signal handler: %s" % receiver,
                spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses
Ejemplo n.º 9
0
 def close_spider(self, spider, reason):
     """On spider close, optionally dump collected stats, then persist them."""
     if self._dump:
         formatted = pprint.pformat(self._stats)
         log.msg("Dumping Scrapy stats:\n" + formatted, spider=spider)
     self._persist_stats(self._stats, spider)