コード例 #1
0
    def _redirect(self, redirected, request, spider, reason):
        """Prepare ``redirected`` as the follow-up to ``request``, or give up.

        The redirect counters and URL history stored in ``request.meta`` are
        carried over to ``redirected``, together with the ``dont_filter`` flag
        and a priority shifted by ``self.priority_adjust``.

        After that, either the spider's ``redirectprocessor_class`` (when
        truthy) is instantiated and its ``process`` method called, or, for
        reasons 301/302/307, the original URL is reported through
        ``career.report_link`` as an ``'http_error'``.

        Returns ``redirected``.
        Raises ``IgnoreRequest`` when ``redirect_ttl`` is falsy or the number
        of redirects would exceed ``self.max_redirect_times``.
        """
        hops_left = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        hop_number = request.meta.get('redirect_times', 0) + 1

        # Guard clause: redirect budget exhausted, drop the request.
        if not (hops_left and hop_number <= self.max_redirect_times):
            log.msg("Discarding %s: max redirections reached" % request,
                    spider=spider, level=log.DEBUG)
            raise IgnoreRequest

        # Carry redirect bookkeeping over to the new request.
        redirected.meta['redirect_times'] = hop_number
        redirected.meta['redirect_ttl'] = hops_left - 1
        visited = request.meta.get('redirect_urls', [])
        redirected.meta['redirect_urls'] = visited + [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust

        message = "Redirecting (%s) to %s from %s" % (reason, redirected, request)
        log.msg(message, spider=spider, level=log.DEBUG)

        if spider.redirectprocessor_class:
            # The spider supplies its own redirect handling.
            handler = spider.redirectprocessor_class()
            handler.process(redirected, request, spider, reason)
        elif reason in (301, 302, 307):
            # Report redirect status; usually the original link is not valid.
            origin = request.meta['domain']
            career.report_link(origin, 'http_error', request.url, reason)

        return redirected
コード例 #2
0
 def process_spider_input(self, response, spider):
     """Report responses whose HTTP status is neither 2xx nor explicitly allowed.

     A status counts as allowed when it appears in the request's
     ``handle_httpstatus_list`` meta entry or, if that key is absent, in the
     spider's ``handle_httpstatus_list`` attribute (empty when missing).
     Any other status is reported through ``career.report_link`` as an
     ``'http_error'``.

     Always returns ``None``.
     """
     code = response.status
     if 200 <= code < 300:  # common case
         return

     request_meta = response.request.meta
     # The per-request setting takes precedence over the spider-wide one.
     permitted = (
         request_meta['handle_httpstatus_list']
         if 'handle_httpstatus_list' in request_meta
         else getattr(spider, 'handle_httpstatus_list', ())
     )

     if code not in permitted:
         origin = response.request.meta['domain']
         # report to monitor
         career.report_link(origin, 'http_error', response.request.url, response.status)
コード例 #3
0
ファイル: base.py プロジェクト: qpwang/CareerTalkCrawler
    def is_item_valid(self, item, level=0):
        '''
        Report whether ``item`` is acceptable at the given validation ``level``.

        Validation is currently switched off: every item is accepted at every
        level. The previous body returned True before any of its field-count
        or key checks could run, so those unreachable checks (which also
        relied on the Python 2-only ``has_key``) were removed without changing
        behaviour.

        item:  the scraped item; not inspected while validation is disabled.
        level: requested strictness; not inspected while validation is disabled.

        Returns True.
        '''
        return True