Example #1
 def logerror(failure, recv):
     if dont_log is None or not isinstance(failure.value, dont_log):
         log.spider_log(failure,
                        "Error caught on signal handler: %s" % recv,
                        level=log.ERROR,
                        spider=spider)
     return failure
Example #2
 def _debug_cookie(self, request, spider):
     if self.debug:
         cl = request.headers.getlist('Cookie')
         if cl:
             msg = "Sending cookies to: %s" % request + os.linesep
             msg += os.linesep.join("Cookie: %s" % c for c in cl)
             log.spider_log(msg, spider=spider, level=log.DEBUG)
Example #3
 def _debug_set_cookie(self, response, spider):
     if self.debug:
         cl = response.headers.getlist('Set-Cookie')
         if cl:
             msg = "Received cookies from: %s" % response + os.linesep
             msg += os.linesep.join("Set-Cookie: %s" % c for c in cl)
             log.spider_log(msg, spider=spider, level=log.DEBUG)
Example #4
File: debug.py, Project: zymITsky/ants
 def dump_stacktrace(self, signum, frame):
     stackdumps = self._thread_stacks()
     enginestatus = format_engine_status(self.crawler.engine)
     liverefs = format_live_refs()
     msg = "Dumping stack trace and engine status" \
         "\n{0}\n{1}\n{2}".format(enginestatus, liverefs, stackdumps)
     log.spider_log(msg)
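For context, `self._thread_stacks()` above presumably collects a formatted stack trace for every live thread. A minimal sketch of such a helper, built only on the standard library (the function name and output format here are assumptions, not the ants implementation):

import sys
import threading
import traceback

def _thread_stacks():
    # Map thread ids to their human-readable names.
    id2name = dict((t.ident, t.name) for t in threading.enumerate())
    dumps = []
    for thread_id, frame in sys._current_frames().items():
        name = id2name.get(thread_id, 'unknown')
        stack = ''.join(traceback.format_stack(frame))
        dumps.append("# Thread: %s (%d)\n%s" % (name, thread_id, stack))
    return "\n".join(dumps)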
Example #5
 def process_request(self, request, spider):
     useragent = self._useragent
     rp = self.robot_parser(request, spider)
     if rp and not rp.can_fetch(useragent, request.url):
         log.spider_log("Forbidden by robots.txt:" + request.url,
                        level=log.DEBUG)
         raise IgnoreRequest
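The `rp` returned by `self.robot_parser(...)` is assumed to expose the same `can_fetch()` interface as the standard library's robots.txt parser. A self-contained sketch of that check (the names and the inline robots.txt are illustrative only):

try:
    from urllib import robotparser   # Python 3
except ImportError:
    import robotparser               # Python 2

ROBOTS_TXT = "User-agent: *\nDisallow: /private/"

rp = robotparser.RobotFileParser()
rp.parse(ROBOTS_TXT.splitlines())
rp.can_fetch("mybot", "http://example.com/private/page")  # -> False
rp.can_fetch("mybot", "http://example.com/public/page")   # -> True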
Example #6
File: spider.py, Project: zymITsky/ants
def create_spider_for_request(spidermanager, request, default_spider=None, \
                              log_none=False, log_multiple=False, **spider_kwargs):
    """Create a spider to handle the given Request.

    This will look for the spiders that can handle the given request (using
    the spider manager) and return a (new) Spider if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spider passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spidermanager.find_by_request(request)
    if len(snames) == 1:
        return spidermanager.create(snames[0], **spider_kwargs)

    if len(snames) > 1 and log_multiple:
        log.spider_log(format='More than one spider can handle:' +
                       request.url + ':' + ', '.join(snames),
                       level=log.ERROR)

    if len(snames) == 0 and log_none:
        log.spider_log(format='Unable to find spider that handles:' +
                       request.url,
                       level=log.ERROR)

    return default_spider
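To make the three outcomes described in the docstring concrete, a hypothetical caller might look like this; the `_FakeSpiderManager` below is purely illustrative and only provides the two methods `create_spider_for_request()` actually uses:

class _FakeSpiderManager(object):
    def __init__(self, names):
        self.names = names
    def find_by_request(self, request):
        return self.names            # spider names claiming this request
    def create(self, name, **kwargs):
        return "<spider %s>" % name  # stand-in for a real Spider instance

# one match  -> new spider:      create_spider_for_request(_FakeSpiderManager(['shop']), request)
# no match   -> default_spider:  create_spider_for_request(_FakeSpiderManager([]), request, log_none=True)
# many match -> default_spider:  create_spider_for_request(_FakeSpiderManager(['a', 'b']), request, log_multiple=True)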
Example #7
def send_catch_log(signal=Any, sender=Anonymous, *arguments, **named):
    """Like pydispatcher.robust.sendRobust but it also logs errors and returns
    Failures instead of exceptions.
    """
    dont_log = named.pop('dont_log', None) or ()  # () so the bare "except dont_log" below catches nothing when unset
    spider = named.get('spider', None)
    responses = []
    for receiver in liveReceivers(getAllReceivers(sender, signal)):
        try:
            response = robustApply(receiver,
                                   signal=signal,
                                   sender=sender,
                                   *arguments,
                                   **named)
            if isinstance(response, Deferred):
                log.spider_log("Cannot return deferreds from signal handler:" +
                               receiver,
                               level=log.ERROR,
                               spider=spider)
        except dont_log:
            result = Failure()
        except Exception:
            result = Failure()
            log.spider_log(result,
                           "Error caught on signal handler: %s" % receiver,
                           level=log.ERROR,
                           spider=spider)
        else:
            result = response
        responses.append((receiver, result))
    return responses
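Stripped of the pydispatcher and Twisted plumbing, the pattern above is: call every receiver, convert exceptions into logged results, and return (receiver, result) pairs. A simplified, self-contained illustration of that idea (not the ants API):

def call_receivers_catching(receivers, **named):
    results = []
    for receiver in receivers:
        try:
            result = receiver(**named)
        except Exception as exc:
            result = exc   # send_catch_log wraps this in a twisted Failure instead
        results.append((receiver, result))
    return results

# call_receivers_catching([lambda **kw: kw['spider'], lambda **kw: 1 / 0], spider='demo')
# -> [(<lambda>, 'demo'), (<lambda>, ZeroDivisionError(...))]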
Example #8
File: logstats.py, Project: zymITsky/ants
 def log(self, spider):
     items = self.stats.get_value('item_scraped_count', 0)
     pages = self.stats.get_value('response_received_count', 0)
     irate = (items - self.itemsprev) * self.multiplier
     prate = (pages - self.pagesprev) * self.multiplier
     self.pagesprev, self.itemsprev = pages, items
     msg = "Crawled %d pages (at %d pages/min), scraped %d items (at %d items/min)" \
         % (pages, prate, items, irate)
     log.spider_log(msg, spider=spider)
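The `self.multiplier` used above is presumably derived from how often `log()` is called: sampling every `interval` seconds and multiplying the per-call delta by `60 / interval` yields a per-minute rate. A quick sketch of that arithmetic (the `interval` name and the 60-second default mirror Scrapy's LOGSTATS_INTERVAL and are assumptions here):

interval = 60.0                  # seconds between calls to log()
multiplier = 60.0 / interval     # converts a per-interval delta to per-minute

pages_prev, pages_now = 120, 190
prate = (pages_now - pages_prev) * multiplier   # 70.0 pages/min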
Example #9
 def process_spider_exception(self, response, exception, spider):
     if isinstance(exception, HttpError):
         log.spider_log(
             "Ignoring response " + response.url +
             ": HTTP status code is not handled or not allowed",
             level=log.DEBUG,
             spider=spider,
         )
         return []
Example #10
    def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = request.headers.get('Referer')
            log.spider_log('File (unknown-error): Error downloading ' +
                           self.MEDIA_NAME + ' from ' + request.url +
                           ' referred in <' + referer + '>: ' + str(failure.value),
                           level=log.WARNING,
                           spider=info.spider)

        raise FileException
Example #11
File: feedexport.py, Project: zymITsky/ants
 def _storage_supported(self, uri):
     scheme = urlparse(uri).scheme
     if scheme in self.storages:
         try:
             self._get_storage(uri)
             return True
         except NotConfigured:
             log.spider_log("Disabled feed storage scheme: %s" % scheme, log.ERROR)
     else:
         log.spider_log("Unknown feed storage scheme: %s" % scheme, log.ERROR)
Example #12
File: urllength.py, Project: zymITsky/ants
 def _filter(request):
     if isinstance(request,
                   Request) and len(request.url) > self.maxlength:
         log.spider_log("Ignoring link (url length > " +
                        str(self.maxlength) + "): " + request.url,
                        level=log.DEBUG,
                        spider=spider)
         return False
     else:
         return True
Example #13
 def item_completed(self, results, item, info):
     """Called per item when all media requests has been processed"""
     if self.LOG_FAILED_RESULTS:
         msg = '%s found errors processing %s' % (self.__class__.__name__,
                                                 item)
         for ok, value in results:
             if not ok:
                 log.spider_log(value, msg,
                                spider=info.spider,
                                level=log.ERROR)
     return item
Example #14
File: dupefilter.py, Project: zymITsky/ants
    def log(self, request, spider):
        if self.debug:
            log.spider_log("Filtered duplicate request:" + request.url,
                           level=log.DEBUG,
                           spider=spider)
        elif self.logdupes:
            fmt = ("Filtered duplicate request: " + request.url +
                   " - no more duplicates will be shown"
                   " (see DUPEFILTER_DEBUG to show all duplicates)")
            log.spider_log(fmt, level=log.DEBUG, spider=spider)
            self.logdupes = False

        spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider)
Example #15
File: memusage.py, Project: zymITsky/ants
 def _check_warning(self):
     if self.warned:  # warn only once
         return
     if self.get_virtual_size() > self.warning:
         self.crawler.stats.set_value('memusage/warning_reached', 1)
         mem = self.warning / 1024 / 1024
         log.spider_log("Memory usage reached " + str(mem) + "M",
                        level=log.WARNING)
         if self.notify_mails:
             subj = "%s warning: memory usage reached %dM at %s" % \
                    (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
             self._send_report(self.notify_mails, subj)
             self.crawler.stats.set_value('memusage/warning_notified', 1)
         self.warned = True
Example #16
    def process_response(self, request, response, spider):
        if not response.body:
            return response

        for fmt, func in self._formats.iteritems():
            new_response = func(response)
            if new_response:
                log.spider_log(
                    'Decompressed response with format:' + fmt,
                    level=log.DEBUG,
                    spider=spider,
                )
                return new_response
        return response
Example #17
File: scrapy.py, Project: zymITsky/ants
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.engine.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     log.spider_log(_failure,
                    "Spider error processing %s" % request,
                    spider=spider)
     self.signals.send_catch_log(signal=signals.spider_error,
                                 failure=_failure,
                                 response=response,
                                  spider=spider)
     self.engine.stats.inc_value("spider_exceptions/%s" %
                                 _failure.value.__class__.__name__,
                                 spider=spider)
     self.engine.send_request_result(request, str(exc))
Example #18
 def process_spider_output(self, response, result, spider):
     for x in result:
         if isinstance(x, Request):
             if x.dont_filter or self.should_follow(x, spider):
                 yield x
             else:
                 domain = urlparse_cached(x).hostname
                 if domain and domain not in self.domains_seen:
                     self.domains_seen.add(domain)
                     log.spider_log("Filtered offsite request to " +
                                    str(domain) + ":" + x.url,
                                    level=log.DEBUG,
                                    spider=spider)
                     self.stats.inc_value('offsite/domains', spider=spider)
                 self.stats.inc_value('offsite/filtered', spider=spider)
         else:
             yield x
Example #19
    def media_downloaded(self, response, request, info):
        referer = request.headers.get('Referer')

        if response.status != 200:
            log.spider_log('File (code: ' + str(response.status) +
                           '): Error downloading file from ' + request.url +
                           ' referred in <' + referer + '>',
                           level=log.WARNING,
                           spider=info.spider)
            raise FileException('download-error')

        if not response.body:
            log.spider_log('File (empty-content): Empty file from ' +
                           request.url + ' referred in <' + referer +
                           '>: no-content',
                           level=log.WARNING,
                           spider=info.spider)
            raise FileException('empty-content')

        status = 'cached' if 'cached' in response.flags else 'downloaded'
        log.spider_log('File (' + status + '): Downloaded file from ' +
                       request.url + ' referred in <' + referer + '>',
                       level=log.DEBUG,
                       spider=info.spider)
        self.inc_stats(info.spider, status)

        try:
            path = self.file_path(request, response=response, info=info)
            checksum = self.file_downloaded(response, request, info)
        except FileException as exc:
            msg = ('File (error): Error processing file from ' + request.url +
                   ' referred in <' + referer + '>: ' + str(exc))
            log.spider_log(msg, level=log.WARNING, spider=info.spider)
            raise
        except Exception as exc:
            msg = 'File (unknown-error): Error processing file from %(request)s referred in <%(referer)s>'
            log.spider_log(msg % {
                'request': request,
                'referer': referer
            },
                           spider=info.spider)
            raise FileException(str(exc))

        return {'url': request.url, 'path': path, 'checksum': checksum}
Example #20
File: scrapy.py, Project: zymITsky/ants
 def _process_spidermw_output(self, output, request, response, spider):
     """Process each Request/Item (given in the output parameter) returned
     from the given spider
     """
     if isinstance(output, Request):
         self.engine.add_request(request, output)
     elif isinstance(output, BaseItem):
         self.slot.itemproc_size += 1
         dfd = self.itemproc.process_item(output, spider)
         dfd.addBoth(self._itemproc_finished, output, request, response,
                     spider)
         return dfd
     elif output is None:
         self.engine.send_request_result()
     else:
         typename = type(output).__name__
         msg = 'Spider must return Request, BaseItem or None, got ' + typename + ' in ' + request.url
         log.spider_log(msg, level=log.ERROR, spider=spider)
         self.engine.send_request_result(request, msg)
Example #21
File: feedexport.py, Project: zymITsky/ants
 def close_spider(self, spider):
     slot = self.slot
     if not slot.itemcount and not self.store_empty:
         return
     slot.exporter.finish_exporting()
     logfmt = "%%s %s feed (%d items) in: %s" % (self.format, slot.itemcount, slot.uri)
     d = defer.maybeDeferred(slot.storage.store, slot.file)
     d.addCallback(lambda _: log.spider_log(logfmt % "Stored", spider=spider))
     d.addErrback(log.spider_log, logfmt % "Error storing", spider=spider)
     return d
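The callback/errback pair above is standard Twisted usage: `maybeDeferred` wraps the (possibly synchronous) store call, success runs the callback and any exception runs the errback. A self-contained sketch of the same pattern, with a stand-in `store()` instead of the ants storage API:

from twisted.internet import defer

def store(f):
    return "stored %r" % f        # stand-in for slot.storage.store(slot.file)

d = defer.maybeDeferred(store, "items.json")
d.addCallback(lambda _: "Stored feed")                 # success path
d.addErrback(lambda failure: "Error storing feed")     # failure path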
Example #22
File: depth.py, Project: zymITsky/ants
 def _filter(request):
     if isinstance(request, Request):
         depth = response.meta['depth'] + 1
         request.meta['depth'] = depth
         if self.prio:
             request.priority -= depth * self.prio
         if self.maxdepth and depth > self.maxdepth:
             log.spider_log("Ignoring link (depth > %(" +
                            str(self.maxdepth) + ")d):" + request.url,
                            level=log.DEBUG,
                            spider=spider)
             return False
         elif self.stats:
             if self.verbose_stats:
                 self.stats.inc_value('request_depth_count/%s' % depth,
                                      spider=spider)
             self.stats.max_value('request_depth_max',
                                  depth,
                                  spider=spider)
     return True
Example #23
File: memusage.py, Project: zymITsky/ants
    def _check_limit(self):
        if self.get_virtual_size() > self.limit:
            self.crawler.stats.set_value('memusage/limit_reached', 1)
            mem = self.limit / 1024 / 1024
            log.spider_log(
                "Memory usage exceeded %dM. Shutting down Scrapy..." % mem,
                level=log.ERROR)
            if self.notify_mails:
                subj = "%s terminated: memory usage exceeded %dM at %s" % \
                       (self.crawler.settings['BOT_NAME'], mem, socket.gethostname())
                self._send_report(self.notify_mails, subj)
                self.crawler.stats.set_value('memusage/limit_notified', 1)

            open_spiders = self.crawler.engine.open_spiders
            if open_spiders:
                for spider in open_spiders:
                    self.crawler.engine.close_spider(spider,
                                                     'memusage_exceeded')
            else:
                self.crawler.stop()
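For reference, `self.get_virtual_size()` can be implemented with nothing but the standard library `resource` module; the sketch below mirrors Scrapy's own memusage extension, and the ants version is assumed to behave similarly:

import sys
import resource

def get_virtual_size():
    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != 'darwin':
        size *= 1024   # ru_maxrss is in KB on Linux, already bytes on macOS
    return size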
Example #24
File: scrapy.py, Project: zymITsky/ants
    def _log_download_errors(spider_failure, download_failure, request,
                             spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated through here)
        """
        if isinstance(download_failure, Failure) \
                and not download_failure.check(IgnoreRequest):
            if download_failure.frames:
                log.spider_log('Error downloading %s' % request,
                               spider=spider,
                               level=log.ERROR)
            else:
                errmsg = download_failure.getErrorMessage()
                if errmsg:
                    log.spider_log('Error downloading ' + request.url + ':' +
                                   errmsg,
                                   level=log.ERROR,
                                   spider=spider)

        if spider_failure is not download_failure:
            return spider_failure
Example #25
File: scrapy.py, Project: zymITsky/ants
 def _itemproc_finished(self, output, item, request, response, spider):
     """ItemProcessor finished for the given ``item`` and returned ``output``
     """
     self.slot.itemproc_size -= 1
     if isinstance(output, Failure):
         ex = output.value
         if isinstance(ex, DropItem):
             log.spider_log('scrape error:' + ex.message + ':in:' +
                            response.url,
                            spider=spider,
                            level=log.ERROR)
             return self.signals.send_catch_log_deferred(
                 signal=signals.item_dropped,
                 item=item,
                 response=response,
                 spider=spider,
                 exception=output.value)
         else:
             log.spider_log('Error processing %s' % item,
                            spider=spider,
                            level=log.ERROR)
         self.engine.send_request_result(request, ex)
     else:
         log.spider_log('scrape ok in:' + response.url, spider=spider)
         self.engine.send_request_result(request)
         return self.signals.send_catch_log_deferred(
             signal=signals.item_scraped,
             item=output,
             response=response,
             spider=spider)
Example #26
File: redirect.py, Project: zymITsky/ants
    def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get(
                'redirect_urls', []) + [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            log.spider_log("Redirecting " + reason + " to " + redirected +
                           " from " + request.url,
                           level=log.DEBUG,
                           spider=spider)
            return redirected
        else:
            log.spider_log(format="Discarding " + request.url +
                           ": max redirections reached",
                           level=log.DEBUG,
                           spider=spider)
            raise IgnoreRequest("max redirections reached")
Example #27
    def _parse_sitemap(self, response):
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.body):
                yield Request(url, callback=self._parse_sitemap)
        else:
            body = self._get_sitemap_body(response)
            if body is None:
                log.spider_log("Ignoring invalid sitemap:" + response.url,
                               level=log.WARNING,
                               spider=self)
                return

            s = Sitemap(body)
            if s.type == 'sitemapindex':
                for loc in iterloc(s, self.sitemap_alternate_links):
                    if any(x.search(loc) for x in self._follow):
                        yield Request(loc, callback=self._parse_sitemap)
            elif s.type == 'urlset':
                for loc in iterloc(s):
                    for r, c in self._cbs:
                        if r.search(loc):
                            yield Request(loc, callback=c)
                            break
Example #28
        def _onsuccess(result):
            if not result:
                return  # returning None force download

            last_modified = result.get('last_modified', None)
            if not last_modified:
                return  # returning None force download

            age_seconds = time.time() - last_modified
            age_days = age_seconds / 60 / 60 / 24
            if age_days > self.EXPIRES:
                return  # returning None force download

            referer = request.headers.get('Referer')
            log.spider_log('File (uptodate): Downloaded ' + self.MEDIA_NAME +
                           ' from ' + request.url + ' referred in <' +
                           referer + '>',
                           level=log.DEBUG,
                           spider=info.spider)
            self.inc_stats(info.spider, 'uptodate')

            checksum = result.get('checksum', None)
            return {'url': request.url, 'path': path, 'checksum': checksum}
Example #29
File: ajaxcrawl.py, Project: zymITsky/ants
    def process_response(self, request, response, spider):

        if not isinstance(response, HtmlResponse) or response.status != 200:
            return response

        if request.method != 'GET':
            # other HTTP methods are either not safe or don't have a body
            return response

        if 'ajax_crawlable' in request.meta:  # prevent loops
            return response

        if not self._has_ajax_crawlable_variant(response):
            return response

        # ants already handles #! links properly
        ajax_crawl_request = request.replace(url=request.url + '#!')
        log.spider_log("Downloading AJAX crawlable " + ajax_crawl_request +
                       " instead of " + request.url,
                       level=log.DEBUG,
                       spider=spider)

        ajax_crawl_request.meta['ajax_crawlable'] = True
        return ajax_crawl_request
Example #30
    def from_settings(cls, settings, crawler=None):
        mwlist = cls._get_mwlist_from_settings(settings)
        middlewares = []
        for clspath in mwlist:
            try:
                mwcls = load_object(clspath)
                if crawler and hasattr(mwcls, 'from_crawler'):
                    mw = mwcls.from_crawler(crawler)
                elif hasattr(mwcls, 'from_settings'):
                    mw = mwcls.from_settings(settings)
                else:
                    mw = mwcls()
                middlewares.append(mw)
            except NotConfigured as e:
                if e.args:
                    clsname = clspath.split('.')[-1]
                    log.spider_log("Disabled " + clsname + ":" + e.args[0],
                                   level=log.WARNING)

        enabled = [x.__class__.__name__ for x in middlewares]
        log.spider_log("Enabled " + cls.component_name + ":" +
                       ', '.join(enabled),
                       level=log.INFO)
        return cls(*middlewares)
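The `load_object()` call above resolves a dotted path from the settings into a middleware class. A minimal sketch of such a loader (Scrapy ships an equivalent in scrapy.utils.misc; the ants version is assumed to work the same way):

from importlib import import_module

def load_object(path):
    module_path, _, name = path.rpartition('.')
    module = import_module(module_path)
    return getattr(module, name)

# load_object('collections.OrderedDict') -> <class 'collections.OrderedDict'>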