def parse_directory(self, response):
    links = response.xpath(self.dir_link_xpath).extract()
    for link in links:
        if link.startswith('http'):
            if 'linkedin.com/pub' in link:
                # LinkedIn public-profile links go straight into the user set.
                r = get_redis()
                r.sadd('linkedin:user', link)
                continue
            # Everything else found on the directory page is queued as a new task.
            new_task(link)
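
# get_redis() is used throughout these spiders but not shown here. A minimal sketch,
# assuming it simply hands back one shared redis.StrictRedis client; the host, port
# and db values are placeholders, not configuration taken from this project.
import redis

_redis_client = None

def get_redis():
    # Lazily create a single client and reuse it across calls.
    global _redis_client
    if _redis_client is None:
        _redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)
    return _redis_client
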
def __init__(self, *args, **kwargs):
    super(BlogSpider, self).__init__(*args, **kwargs)
    cnt = 0
    r = get_redis()
    r.delete('blog:tasks:fingerprint')
    with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
        for line in fi:
            cnt += 1
            if cnt > 100:
                break
            new_task(line.strip())
def __init__(self, *args, **kwargs):
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    filepath = os.path.join(get_main_path(),
                            'spiders/linkedin/pub_china_user_list')
    r = get_redis()
    r.delete('linkedin:tasks:fingerprint')
    with open(filepath) as fi:
        cnt = 0
        for line in fi:
            new_task(line.strip())
            cnt += 1
            if cnt > 100:
                break
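
# new_task() is called above but defined elsewhere. A hedged sketch of what it might do,
# given that the spiders clear a '<name>:tasks:fingerprint' set before seeding and the
# middleware later removes finished URLs from a task set (task_key). The key names and
# the use of the raw URL as the fingerprint are assumptions for illustration only.
def new_task(url, task_key='linkedin:tasks'):
    r = get_redis()
    # sadd() returns 1 only for URLs not seen before, so already-queued URLs are skipped.
    if r.sadd(task_key + ':fingerprint', url):
        r.sadd(task_key, url)
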
def _process_spider_output(self, response, result, spider):
    if response.status == 404:
        # A 404 can mean two things: a genuine "not found" page from the target
        # site, or a broken proxy answering on its behalf.
        flag = False
        for string_404_list in self.settings.get('HTML_404_STRING'):
            flag = all(
                [string in response.body for string in string_404_list])
            if flag:
                break
        if flag:
            # Real 404 from the site: drop the task.
            log.msg('404 of %s' % response.url, level=log.DEBUG)
            return
        else:
            # 404 produced by the proxy: discard the proxy and re-queue the task.
            proxy = response.meta.get('proxy')
            if proxy:
                # The proxy pool stores addresses without the scheme prefix.
                if proxy.startswith('http://'):
                    proxy = proxy[len('http://'):]
                metric.rem_proxy(proxy)
                log.msg('404 of proxy: ' + proxy)
            new_error_task(response.meta['origin_url'])
    elif response.status == 200:
        r = metric.get_redis()
        r.srem(task_key, response.url)
        print "delete", task_key, response.url
        for item in result:
            if isinstance(item, scrapy.Item):
                # Successfully parsed item.
                if not item:
                    log.err('None item')
                yield item
            elif isinstance(item, scrapy.http.Response):
                # Spider parsing error -- the proxy may have returned a bogus
                # 200 response, so just retry the original URL.
                if 'exception' in item.meta:
                    log.err(str(item.meta['exception']) + ': ' + item.url)
                    log.msg(item.body)
                    new_error_task(response.meta['origin_url'])
                else:
                    log.err('bug: should be a bad response, but got %r' % item)
            elif isinstance(item, scrapy.http.Request):
                yield item
            else:
                log.err('bug: should be an Item or Response, but got %r' % item)
    else:
        # 503, 204, 500... and so on: retry unless the status is 403 or 400.
        if response.status not in (403, 400):
            new_error_task(response.meta['origin_url'])
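
# HTML_404_STRING is read from the Scrapy settings above: each entry is a list of strings
# that must all appear in the response body for the page to count as a real 404. A
# plausible settings.py entry, guessed from the Stack Overflow check used in parse()
# further down; the exact values in the project may differ.
HTML_404_STRING = [
    ['StackExchange.ready', 'Page Not Found'],  # Stack Overflow's "question not found" page
]
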
def get_header(self):
    r = metric.get_redis()

    @wait_exponential_multiplier(100, time=0.01)
    def _get():
        header = r.srandmember(self.header_pool_key)
        return header

    header = _get()
    if not header:
        raise self.exc()
    return header
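
# wait_exponential_multiplier is applied above but defined elsewhere. A hedged sketch of
# one way such a retry decorator could work: call the wrapped function until it returns
# something truthy, up to `retries` attempts, sleeping `time` seconds with an exponential
# backoff. The exact semantics in this project may differ.
import functools
import time as time_module

def wait_exponential_multiplier(retries, time=0.01):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            delay = time
            for _ in xrange(retries):
                result = func(*args, **kwargs)
                if result:
                    return result
                time_module.sleep(delay)
                delay = min(delay * 2, 1.0)  # double the wait, capped at one second
            return None
        return wrapper
    return decorator
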
def get_proxy(self):
    r = metric.get_redis()
    proxy = None
    cnt = 0
    # TODO: change the hard-coded retry limit to a setting
    while not proxy and cnt < 100:
        time.sleep(0.01)
        cnt += 1
        proxy = r.srandmember(self.redis_key)
    if not proxy:
        log.msg('Not enough proxies in the proxy pool!')
    return proxy
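
# A sketch of how get_proxy() would typically be used from the same downloader
# middleware: attach the chosen proxy to each outgoing request via request.meta['proxy'],
# which is where the 404 handler above later reads it back from. The method placement
# and the 'http://' prefix are assumptions, not code from this project.
def process_request(self, request, spider):
    proxy = self.get_proxy()
    if proxy:
        request.meta['proxy'] = 'http://' + proxy
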
def parse(self, response):
    if response.status in self.handle_httpstatus_list:
        return
    if 'StackExchange.ready' in response.body and "Page Not Found" in response.body:
        return
    response.selector.remove_namespaces()

    ids = extract_text_null(self, 'id', response)
    ranks = extract_text_null(self, 'rank', response)
    titles = extract_text_null(self, 'title', response)
    tags = response.xpath(self.tag_xpath).extract()
    author_names = extract_text_null(self, 'author_name', response)
    author_uris = extract_text_null(self, 'author_uri', response)
    links = response.xpath(self.link_xpath).extract()
    publisheds = extract_text_null(self, 'published', response)
    updateds = extract_text_null(self, 'updated', response)
    contents = extract_text_null(self, 'content', response)

    # The first entry of each list describes the question itself;
    # the remaining entries are its answers.
    item = StackoverflowQuestionItem()
    item['uid'] = response.url.rstrip('/').split('/')[-1]
    item['rank'] = ranks[0]
    item['title'] = titles[0]
    item['tags'] = tags
    item['author_name'] = author_names[0]
    item['author_uri'] = author_uris[0]
    item['author_uid'] = author_uris[0].split('/')[-1]
    item['link'] = links[0]
    item['published'] = publisheds[0]
    item['updated'] = updateds[0]
    item['content'] = contents[0]
    item['answers'] = []

    pipeline = metric.get_redis().pipeline()
    for i in xrange(1, len(ids)):
        answer = {}
        answer['uid'] = ids[i].split('#')[-1]
        pipeline.hincrby(':'.join([metric.metric_key, 'answer']), answer['uid'], 1)
        answer['rank'] = ranks[i]
        answer['author_name'] = author_names[i]
        answer['author_uri'] = author_uris[i]
        answer['author_uid'] = author_uris[i].split('/')[-1]
        answer['link'] = links[i]
        answer['published'] = publisheds[i]
        answer['updated'] = updateds[i]
        answer['content'] = contents[i]
        item['answers'].append(answer)
    pipeline.execute()
    return item
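
# StackoverflowQuestionItem is populated above but declared elsewhere. A minimal sketch
# of the item class, assuming one scrapy.Field per key assigned in parse():
import scrapy

class StackoverflowQuestionItem(scrapy.Item):
    uid = scrapy.Field()
    rank = scrapy.Field()
    title = scrapy.Field()
    tags = scrapy.Field()
    author_name = scrapy.Field()
    author_uri = scrapy.Field()
    author_uid = scrapy.Field()
    link = scrapy.Field()
    published = scrapy.Field()
    updated = scrapy.Field()
    content = scrapy.Field()
    answers = scrapy.Field()  # list of per-answer dicts built in parse()
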
def spider_exit(self, response, request, spider):
    print "receive signals"
    r = metric.get_redis()
    # Treat a missing exit key as "keep running".
    if int(r.get(mysettings.EXIT_KEY) or 0):
        self.crawler.engine.close_spider(spider, 'receive redis exit signals')
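
# spider_exit's (response, request, spider) signature matches Scrapy's response_received
# signal. A hedged sketch of how the handler might be wired up from the extension's
# from_crawler(); the class name is illustrative and the handler is stubbed here only to
# keep the sketch self-contained (the full version is shown above).
from scrapy import signals

class ExitExtension(object):
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        # Check the Redis exit flag on every downloaded response.
        crawler.signals.connect(ext.spider_exit, signal=signals.response_received)
        return ext

    def spider_exit(self, response, request, spider):
        pass  # see the full handler above
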