Example #1
 def process_response(self, request, response, spider):
     stats.inc_value("crawled_url_num", spider=spider)
     stats.inc_value("crawled_page_size", count=len(response.body), spider=spider)
     # log.msg('download page:%s, size:%s, content-length:%s'\
     #    %(response.url, len(response.body),
     #    response.headers.get('Content-Length')))
     return response
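Note: the snippets in this listing use the legacy module-level stats object from early Scrapy releases, where calls are keyed by an explicit spider= argument. For reference, a minimal sketch of the same counters in the modern crawler-scoped style (Scrapy >= 1.0; the middleware class name here is hypothetical) might look like this:

    class PageStatsMiddleware:
        def __init__(self, stats):
            self.stats = stats

        @classmethod
        def from_crawler(cls, crawler):
            # stats are reached through the crawler, not a global module
            return cls(crawler.stats)

        def process_response(self, request, response, spider):
            self.stats.inc_value('crawled_url_num')
            self.stats.inc_value('crawled_page_size', count=len(response.body))
            return response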
Example #2
 def handle_spider_error(self, _failure, request, spider, propagated_failure=None):
     referer = request.headers.get('Referer', None)
     msg = "Spider error processing <%s> (referer: <%s>)" % \
         (request.url, referer)
     log.err(_failure, msg, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
         spider=spider)
Example #3
 def process_request(self, request, spider):
     stats.inc_value('downloader/request_count')
     stats.inc_value('downloader/request_count', spider=spider)
     stats.inc_value('downloader/request_method_count/%s' % request.method, spider=spider)
     reqlen = len(request_httprepr(request))
     stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
     stats.inc_value('downloader/request_bytes', reqlen)
Example #4
 def process_response(self, request, response, spider):
     if (self.is_cacheable(request)
         and self.is_cacheable_response(response)
         and 'cached' not in response.flags):
         self.storage.store_response(spider, request, response)
         stats.inc_value('httpcache/store', spider=spider)
     return response
Example #5
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('finish_status',
                     'OK' if reason == 'finished' else reason,
                     spider=spider)
     stats.inc_value('spider_count/%s' % reason, spider=spider)
Example #6
 def stats_spider_opened(self, spider):
     stats.set_value('start_time',
                     datetime.datetime.utcnow(),
                     spider=spider)
     stats.set_value('envinfo/host',
                     stats.get_value('envinfo/host'),
                     spider=spider)
     stats.inc_value('spider_count/opened')
Example #7
 def process_response(self, request, response, spider):
     stats.inc_value('downloader/response_count')
     stats.inc_value('downloader/response_count', spider=spider)
     stats.inc_value('downloader/response_status_count/%s' % response.status, spider=spider)
     reslen = len(response_httprepr(response))
     stats.inc_value('downloader/response_bytes', reslen, spider=spider)
     stats.inc_value('downloader/response_bytes', reslen)
     return response
Example #8
 def process_request(self, request, spider):
     stats.inc_value('downloader/request_count')
     stats.inc_value('downloader/request_count', spider=spider)
     stats.inc_value('downloader/request_method_count/%s' % request.method,
                     spider=spider)
     reqlen = len(request_httprepr(request))
     stats.inc_value('downloader/request_bytes', reqlen, spider=spider)
     stats.inc_value('downloader/request_bytes', reqlen)
Example #9
    def process_response(self, request, response, spider):
        doc = request.meta['terms']
        if self.index.appendif(doc, response.url, 0.0):
            return response

        stats.inc_value('downloader/near_duplicates')

        raise IgnoreRequest
Example #10
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
         return
     log.err(_failure, "Spider error processing %s" % request, spider=spider)
     send_catch_log(signal=signals.spider_error, failure=_failure, response=response, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, spider=spider)
Example #11
 def process_response(self, request, response, spider):
     stats.inc_value('crawled_url_num', spider=spider)
     stats.inc_value('crawled_page_size',
                     count=len(response.body),
                     spider=spider)
     #log.msg('download page:%s, size:%s, content-length:%s'\
     #    %(response.url, len(response.body),
     #    response.headers.get('Content-Length')))
     return response
Example #12
 def handle_spider_error(self, _failure, request, response, spider):
     exc = _failure.value
     if isinstance(exc, CloseSpider):
         self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
         return
     log.err(_failure, "Spider error processing %s" % request, spider=spider)
     send_catch_log(signal=signals.spider_error, failure=_failure, response=response,
         spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__,
         spider=spider)
Example #13
    def process_item(self, item, spider):
        self.file.write('%s\n' % item['domain'])

        domain = Domain.get(item['domain'])
        if not domain:
            stats.inc_value('found_new_domain')
            log.msg("Found new domain: %s" % item['domain'])
            Domain.objects.create(domain_name=item['domain'],
                                  monitoring_status='new',
                                  metadata_status='new')
        # item pipelines are expected to hand the item back
        return item
Example #14
 def process_response(self, request, response, spider):
     stats.inc_value('downloader/response_count')
     stats.inc_value('downloader/response_count', spider=spider)
     stats.inc_value('downloader/response_status_count/%s' %
                     response.status,
                     spider=spider)
     reslen = len(response_httprepr(response))
     stats.inc_value('downloader/response_bytes', reslen, spider=spider)
     stats.inc_value('downloader/response_bytes', reslen)
     return response
Example #15
    def parse(self, response):
        self.log('Response:%s, type:%s' % (response.url, type(response)))
        self.parser_manager.process_response(response, self)

        stats.inc_value('crawled_url_num', spider=self)
        stats.inc_value(
            'crawled_page_size',
            count=len(response.body),
            spider=self,
        )
Example #16
 def handle_spider_error(self,
                         _failure,
                         request,
                         spider,
                         propagated_failure=None):
     referer = request.headers.get('Referer', None)
     msg = "Spider error processing <%s> (referer: <%s>)" % \
         (request.url, referer)
     log.err(_failure, msg, spider=spider)
     stats.inc_value("spider_exceptions/%s" % _failure.value.__class__.__name__, \
         spider=spider)
Example #17
 def _dqpush(self, request):
     if self.dqs is None:
         return
     try:
         reqd = request_to_dict(request, self.spider)
         self.dqs.push(reqd, request.priority)
     except ValueError: # non serializable request
         return
     else:
         stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
         return True
Example #18
    def process_request(self, request, spider):
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response and self.is_cacheable_response(response):
            response.flags.append("cached")
            stats.inc_value("httpcache/hit", spider=spider)
            return response

        stats.inc_value("httpcache/miss", spider=spider)
        if self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
Example #19
    def process_request(self, request, spider):
        if not self.is_cacheable(request):
            return
        response = self.storage.retrieve_response(spider, request)
        if response and self.is_cacheable_response(response):
            response.flags.append('cached')
            stats.inc_value('httpcache/hit', spider=spider)
            return response

        stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
Example #20
 # inner closure from a process_spider_output method (Example #25 shows the
 # enclosing context); `response`, `spider`, and `self` come from that scope
 def _filter(request):
     if isinstance(request, Request):
         depth = response.request.meta['depth'] + 1
         request.meta['depth'] = depth
         if self.maxdepth and depth > self.maxdepth:
             log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url),
                 level=log.DEBUG, spider=spider)
             return False
         elif self.stats:
             stats.inc_value('request_depth_count/%s' % depth, spider=spider)
             if depth > stats.get_value('request_depth_max', 0, spider=spider):
                 stats.set_value('request_depth_max', depth, spider=spider)
     return True
Example #21
    def parse_item(self, response):
        item = RecipebotItem()

        doc = response.meta['terms']

        # decide if the page is interesting
        if not self.sim.is_relevant(doc):
            stats.inc_value('recipe/filtered_out') # probably not recipe page
            return

        item['url'] = response.url

        return item
Example #22
    def parse_item(self, response):
        item = ScifibotItem()
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body,
            which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        tokens = tokenize(body.lower())
        # decide if the page is interesting
        if not is_relevant(tokens):
            stats.inc_value('scifi/filtered_out') # probably not scifi page
            return

        item['keywords'] = tokens
        item['page'] = orig_body
        item['url'] = response.url
        return item
Example #23
 def new_callback(*args, **kwargs):
     tbefore = time()
     # only sample memory when a _memusage hook is available (it is also
     # checked before recording the allocation delta below)
     mbefore = self._memusage() if self._memusage else None
     r = function(*args, **kwargs)
     mafter = self._memusage() if self._memusage else None
     ct = time() - tbefore
     tcc = stats.get_value('profiling/total_callback_time', 0, spider=spider)
     sct = stats.get_value('profiling/slowest_callback_time', 0, spider=spider)
     stats.set_value('profiling/total_callback_time', tcc + ct, spider=spider)
     if ct > sct:
         stats.set_value('profiling/slowest_callback_time', ct, spider=spider)
         stats.set_value('profiling/slowest_callback_name', function.__name__,
             spider=spider)
         stats.set_value('profiling/slowest_callback_url', args[0].url,
             spider=spider)
     if self._memusage:
         stats.inc_value('profiling/total_mem_allocated_in_callbacks',
             count=mafter - mbefore, spider=spider)
     return r
Example #24
    def parse_item(self, response):
        item = RecipebotItem()

        body = response.meta['body']

        result = self.detector.extract(body)

        if len(result) == 0:
            stats.inc_value('recipe/filtered_out') # probably not recipe page
            return

        item['url'] = response.url

        item['ingredients'] = []
        # use a distinct loop variable so the item dict is not shadowed;
        # each entry is assumed to be (ingredient_text, ..., confidence)
        for candidate in result:
            if candidate[2] >= 0.25:
                item['ingredients'].append(candidate[0])

        return item
Example #25
    def process_spider_output(self, response, result, spider):
        def _filter(request):
            if isinstance(request, Request):
                depth = response.request.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.maxdepth and depth > self.maxdepth:
                    log.msg("Ignoring link (depth > %d): %s " % (self.maxdepth, request.url), \
                        level=log.DEBUG, spider=spider)
                    return False
                elif self.stats:
                    stats.inc_value('request_depth_count/%s' % depth, spider=spider)
                    if depth > stats.get_value('request_depth_max', 0, spider=spider):
                        stats.set_value('request_depth_max', depth, spider=spider)
            return True

        # base case (depth=0)
        if self.stats and 'depth' not in response.request.meta: 
            response.request.meta['depth'] = 0
            stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
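Depth filtering like the middleware above also ships with Scrapy itself; a custom variant would be enabled through the SPIDER_MIDDLEWARES setting, roughly as in this sketch (module path and order value are assumptions for illustration):

    # settings.py
    SPIDER_MIDDLEWARES = {
        'myproject.middlewares.DepthStatsMiddleware': 900,  # hypothetical path
    }
    DEPTH_LIMIT = 5  # presumably the source of self.maxdepth above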
Example #26
 def process_exception(self, request, exception, spider):
     ex_class = "%s.%s" % (exception.__class__.__module__,
                           exception.__class__.__name__)
     stats.inc_value('downloader/exception_count')
     stats.inc_value('downloader/exception_count', spider=spider)
     stats.inc_value('downloader/exception_type_count/%s' % ex_class,
                     spider=spider)
Example #27
 def item_dropped(self, item, spider, exception):
     reason = exception.__class__.__name__
     stats.inc_value('item_dropped_count', spider=spider)
     stats.inc_value('item_dropped_reasons_count/%s' % reason,
                     spider=spider)
Example #28
 def _mqpush(self, request):
     stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
     self.mqs.push(request, -request.priority)
Example #29
    def __len__(self):
        return len(self.dqs) + len(self.mqs) if self.dqs else len(self.mqs)

    def _dqpush(self, request):
        if self.dqs is None:
            return
        try:
            reqd = request_to_dict(request, self.spider)
            self.dqs.push(reqd, -request.priority)
        except ValueError as e:  # non serializable request
            if self.logunser:
                log.msg("Unable to serialize request: %s - reason: %s" %
                    (request, str(e)), level=log.ERROR, spider=self.spider)
            return
        else:
            stats.inc_value('scheduler/disk_enqueued', spider=self.spider)
            return True

    def _mqpush(self, request):
        stats.inc_value('scheduler/memory_enqueued', spider=self.spider)
        self.mqs.push(request, -request.priority)

    def _dqpop(self):
        if self.dqs:
            d = self.dqs.pop()
            if d:
                return request_from_dict(d, self.spider)

    def _newmq(self, priority):
        return self.mqclass()
Example #30
 def inc_stats(self, spider, status):
     stats.inc_value('image_count', spider=spider)
     stats.inc_value('image_status_count/%s' % status, spider=spider)
Example #31
 def item_dropped(self, item, spider, exception):
     reason = exception.__class__.__name__
     stats.inc_value("item_dropped_count", spider=spider)
     stats.inc_value("item_dropped_reasons_count/%s" % reason, spider=spider)
     stats.inc_value("item_dropped_count")
Example #32
 def item_scraped(self, item, spider):
     stats.inc_value('item_scraped_count', spider=spider)
     stats.inc_value('item_scraped_count')
Example #33
 def item_passed(self, item, spider):
     stats.inc_value('item_passed_count', spider=spider)
     stats.inc_value('item_passed_count')
Example #34
 def inc_stats(self, spider, status):
     stats.inc_value("image_count", spider=spider)
     stats.inc_value("image_status_count/%s" % status, spider=spider)
Example #35
 def item_scraped(self, item, spider):
     stats.inc_value('item_scraped_count', spider=spider)
Example #36
    def parse(self, response):
        self.log('Response:%s, type:%s' % (response.url, type(response)))
        self.parser_manager.process_response(response, self)

        stats.inc_value('crawled_url_num', spider=self)
        stats.inc_value('crawled_page_size', count=len(response.body), spider=self)
Example #37
 def process_response(self, request, response, spider):
     if (self.is_cacheable(request) and self.is_cacheable_response(response)
             and 'cached' not in response.flags):
         self.storage.store_response(spider, request, response)
         stats.inc_value('httpcache/store', spider=spider)
     return response
Example #38
 def stats_spider_closing(self, spider, reason):
     stats.set_value('finish_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('finish_status', 'OK' if reason == 'finished' else reason, spider=spider)
     stats.inc_value('spider_count/%s' % reason, spider=spider)
Example #39
 def stats_spider_opened(self, spider):
     stats.set_value('start_time', datetime.datetime.utcnow(), spider=spider)
     stats.set_value('envinfo/host', stats.get_value('envinfo/host'), spider=spider)
     stats.inc_value('spider_count/opened')
Example #40
 def process_response(self, request, response, spider):
     if (self.is_cacheable(request) and self.is_cacheable_response(response)
             and "cached" not in response.flags):
         self.storage.store_response(spider, request, response)
         stats.inc_value("httpcache/store", spider=spider)
     return response
Example #41
 def stats_spider_opened(self, spider):
     stats.set_value("start_time", datetime.datetime.utcnow(), spider=spider)
     stats.set_value("envinfo/host", stats.get_value("envinfo/host"), spider=spider)
     stats.inc_value("spider_count/opened")
Example #42
 def inc_stats(self, spider, status):
     stats.inc_value('image_count', spider=spider)
     stats.inc_value('image_status_count/%s' % status, spider=spider)
Example #43
 def stats_spider_closing(self, spider, reason):
     stats.set_value("finish_time", datetime.datetime.utcnow(), spider=spider)
     stats.set_value("finish_status", "OK" if reason == "finished" else reason, spider=spider)
     stats.inc_value("spider_count/%s" % reason, spider=spider)
Example #44
 def process_exception(self, request, exception, spider):
     ex_class = "%s.%s" % (exception.__class__.__module__, exception.__class__.__name__)
     stats.inc_value('downloader/exception_count', spider=spider)
     stats.inc_value('downloader/exception_type_count/%s' % ex_class, spider=spider)
Example #45
 def item_passed(self, item, spider):
     stats.inc_value("item_passed_count", spider=spider)
     stats.inc_value("item_passed_count")