def _redirect(self, redirected, request, spider, reason):
    """Prepare ``redirected`` as the follow-up of ``request`` or give up.

    The redirect budget is tracked in ``request.meta``: ``redirect_ttl`` is
    seeded from ``self.max_redirect_times`` when absent, and
    ``redirect_times`` counts the hops taken so far.

    :param redirected: request to be issued in place of ``request``; its
        ``meta``, ``dont_filter`` and ``priority`` are updated in place.
    :param request: the request that was answered with a redirect.
    :param spider: spider owning the request; its
        ``redirectprocessor_class`` attribute is read.
    :param reason: value interpolated into the log message and compared
        against 301/302/307 (presumably the HTTP status or a textual
        label - confirm with the callers).
    :returns: ``redirected`` once its bookkeeping has been filled in.
    :raises IgnoreRequest: when the TTL is exhausted or the hop count
        exceeds ``self.max_redirect_times``.
    """
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    hops = request.meta.get('redirect_times', 0) + 1

    # Guard clause: out of budget, so drop the request.
    if not ttl or hops > self.max_redirect_times:
        log.msg("Discarding %s: max redirections reached" % request,
                spider=spider, level=log.DEBUG)
        raise IgnoreRequest

    new_meta = redirected.meta
    new_meta['redirect_times'] = hops
    new_meta['redirect_ttl'] = ttl - 1
    new_meta['redirect_urls'] = \
        request.meta.get('redirect_urls', []) + [request.url]

    redirected.dont_filter = request.dont_filter
    redirected.priority = request.priority + self.priority_adjust

    message = "Redirecting (%s) to %s from %s" % (reason, redirected, request)
    log.msg(message, spider=spider, level=log.DEBUG)

    processor_class = spider.redirectprocessor_class
    if processor_class:
        # The spider supplied its own redirect handling.
        processor_class().process(redirected, request, spider, reason)
    elif reason in (301, 302, 307):
        # Report the redirect status; usually such a link is not valid.
        career.report_link(request.meta['domain'], 'http_error',
                           request.url, reason)

    return redirected
def process_spider_input(self, response, spider):
    """Report responses whose HTTP status is neither 2xx nor allowed.

    The allow-list is taken from ``handle_httpstatus_list`` in the request
    meta when that key is present, otherwise from the spider attribute of
    the same name (defaulting to an empty tuple).

    :param response: response being handed to the spider; ``status`` and
        ``request`` (``meta``, ``url``) are read.
    :param spider: spider that may define ``handle_httpstatus_list``.
    :returns: always ``None``.

    NOTE(review): after reporting, nothing is raised and the method simply
    returns, so the response is not rejected here - confirm this is
    intended.
    """
    status = response.status
    if 200 <= status < 300:  # common case
        return

    request = response.request
    meta = request.meta
    allowed_statuses = (
        meta['handle_httpstatus_list']
        if 'handle_httpstatus_list' in meta
        else getattr(spider, 'handle_httpstatus_list', ())
    )

    if status not in allowed_statuses:
        # Report to the monitor.
        career.report_link(meta['domain'], 'http_error', request.url, status)
def is_item_valid(self, item, level=0):
    """Decide whether ``item`` carries enough data for the given ``level``.

    Per the original note, the source and the source link are always
    present, so an item only counts as loaded when it holds more than
    those two values.

    Levels:
      * ``0``  - no check at all, always valid.
      * ``1``  - valid when more than two values are loaded.
      * other  - additionally requires at least nine loaded values and the
        key fields ``name``, ``version`` and ``download_link``; a missing or
        empty ``images`` / ``icon_link`` is reported but does not
        invalidate the item.

    :param item: mapping-like item exposing ``_values`` (the loaded
        fields) and key lookup / membership tests.
    :param level: strictness of the validation, see above.
    :returns: ``True`` when the item passes the checks, else ``False``.

    Problems are reported through ``career.report_link`` with the tags
    ``mass_null``, ``missing_key``, ``missing_image`` and ``missing_icon``.
    """
    if level == 0:
        return True
    if len(item._values) <= 2:
        return False
    if level == 1:
        return True

    # Check the item count.
    if len(item._values) < 9:
        career.report_link(item['source'], 'mass_null', item['source_link'])
        return False

    # Check the key fields. ``in`` replaces ``has_key()``, which no longer
    # exists on Python 3 mappings.
    required_fields = ('name', 'version', 'download_link')
    if any(field not in item for field in required_fields):
        career.report_link(item['source'], 'missing_key', item['source_link'])
        return False

    # Image and icon are only warnings, so they never make the item invalid.
    optional_fields = (('images', 'missing_image'),
                       ('icon_link', 'missing_icon'))
    for field, problem in optional_fields:
        if field not in item or item[field] == '':
            career.report_link(item['source'], problem, item['source_link'])

    return True