Example #1
def parse_directory(self, response):
    # Extract every link from the directory page.
    links = response.xpath(self.dir_link_xpath).extract()
    for link in links:
        if link.startswith('http'):
            if 'linkedin.com/pub' in link:
                # Public profile: record it in the Redis user set and stop here.
                r = get_redis()
                r.sadd('linkedin:user', link)
                continue
            # Any other absolute link becomes a new crawl task.
            new_task(link)
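
Several of these examples rely on two project helpers, get_redis() and new_task(), whose definitions are not shown on this page. A minimal sketch of what they might look like, assuming a redis-py connection and a Redis-set-backed task queue (the default key name 'linkedin:tasks' is an assumption):

import redis

_redis_client = None

def get_redis():
    # Hypothetical: reuse a single redis-py connection for the whole process.
    global _redis_client
    if _redis_client is None:
        _redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)
    return _redis_client

def new_task(url, task_key='linkedin:tasks'):
    # Hypothetical: the task queue is assumed to be a Redis set of URLs,
    # so SADD makes re-queuing the same URL a no-op.
    get_redis().sadd(task_key, url)
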
Example #3
def __init__(self, *args, **kwargs):
    super(BlogSpider, self).__init__(*args, **kwargs)
    cnt = 0
    r = get_redis()
    # Drop the old de-duplication fingerprints before re-seeding tasks.
    r.delete('blog:tasks:fingerprint')
    # Seed tasks from the blog list file (cpath is a module-level path), capped at ~100 URLs.
    with open(os.path.join(cpath, 'github_user_blog_list')) as fi:
        for line in fi:
            cnt += 1
            if cnt > 100:
                break
            new_task(line.strip())
Example #4
def __init__(self, *args, **kwargs):
    super(LinkedinSpider, self).__init__(*args, **kwargs)
    filepath = os.path.join(get_main_path(),
                            'spiders/linkedin/pub_china_user_list')
    r = get_redis()
    # Drop the old de-duplication fingerprints before re-seeding tasks.
    r.delete('linkedin:tasks:fingerprint')
    # Seed tasks from the public-profile list file, capped at ~100 URLs.
    with open(filepath) as fi:
        cnt = 0
        for line in fi:
            new_task(line.strip())
            cnt += 1
            if cnt > 100:
                break
Example #5
def _process_spider_output(self, response, result, spider):
    if response.status == 404:
        # Decide whether this is a real "page not found" from stackoverflow:
        # the page matches if every string of one configured signature is present.
        flag = False
        for string_404_list in self.settings.get('HTML_404_STRING'):
            flag = all(
                [string in response.body for string in string_404_list])
            if flag:
                break
        #flag = 'StackExchange.ready' in response.body and "Page Not Found" in response.body

        if flag:
            log.msg('404 of %s' % response.url, level=log.DEBUG)
            #metric.add_metric(int(response.meta['origin_url'].split('/')[-1]), '404')
            return
        else:
            # 404 caused by a bad proxy: drop the proxy and re-queue the task.
            proxy = response.meta['proxy']
            #if proxy and proxy.startswith('http://'):
            if proxy:
                # Strip the scheme prefix (lstrip strips characters, not a prefix).
                metric.rem_proxy(proxy.replace('http://', '', 1))
            log.msg('404 of proxy: %s' % proxy)
            #yield scrapy.Request(response.meta['origin_url'])
            new_error_task(response.meta['origin_url'])
    elif response.status == 200:
        r = metric.get_redis()
        r.srem(task_key, response.url)
        print "delete", task_key, response.url
        for item in result:
            # successful item
            if isinstance(item, scrapy.Item):
                #metric.add_metric(item['uid'])
                if not item:
                    log.err('None item')
                yield item
            elif isinstance(item, scrapy.http.Response):
                # Spider parsing error: the proxy may have returned a bogus 200
                # response, so just retry the original URL.
                if 'exception' in item.meta:
                    log.err(str(item.meta['exception']) + ': ' + item.url)
                    log.msg(item.body)
                    #yield scrapy.Request(item.meta['origin_url'])
                    new_error_task(response.meta['origin_url'])
                else:
                    log.err('bug: should be a bad response, but got %r' %
                            item)
            elif isinstance(item, scrapy.http.Request):
                yield item
            else:
                log.err('bug: should be an Item or a Response, but got %r' %
                        item)
    else:
        # 503, 204, 500 ... and so on: re-queue unless the status is 403 or 400.
        if response.status not in (403, 400):
            #yield scrapy.Request(response.meta['origin_url'])
            new_error_task(response.meta['origin_url'])
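
A helper like _process_spider_output above is typically called from a spider middleware's process_spider_output hook, and that middleware has to be enabled in the project settings. A minimal sketch, where the module path 'myproject.middlewares' and the class name Cleaner are assumptions, while HTML_404_STRING mirrors the custom setting read in the code (a list of string groups; a page counts as a real 404 when every string of one group appears in the body):

# settings.py (sketch): module path and class name are assumptions.
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.Cleaner': 543,
}

# Signatures checked via self.settings.get('HTML_404_STRING'); the example
# values come from the commented-out check in the middleware itself.
HTML_404_STRING = [
    ['StackExchange.ready', 'Page Not Found'],
]
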
Example #6
def get_header(self):
    r = metric.get_redis()

    @wait_exponential_multiplier(100, time=0.01)
    def _get():
        # Pick a random header from the shared pool in Redis.
        header = r.srandmember(self.header_pool_key)
        return header

    header = _get()
    if not header:
        raise self.exc()
    return header
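
wait_exponential_multiplier is a project decorator whose definition is not shown here. A plausible sketch of what it might do, retrying the wrapped function with an exponentially growing sleep; treating the positional argument as a maximum number of attempts is an assumption, not the project's actual semantics:

import time as time_module
from functools import wraps

def wait_exponential_multiplier(max_tries, time=0.01):
    # Hypothetical reconstruction: retry the wrapped function until it returns
    # a truthy value, sleeping exponentially longer between attempts.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            delay = time
            result = None
            for _ in range(max_tries):
                result = func(*args, **kwargs)
                if result:
                    break
                time_module.sleep(delay)
                delay *= 2
            return result
        return wrapper
    return decorator
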
Example #7
def get_proxy(self):
    r = metric.get_redis()
    proxy = None
    cnt = 0
    # TODO change hard code to settings
    while not proxy and cnt < 100:
        time.sleep(0.01)
        cnt += 1
        # Pick a random proxy from the shared pool in Redis.
        proxy = r.srandmember(self.redis_key)
        if not proxy:
            log.msg('Not enough proxies in the proxy pool!')
    return proxy
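
get_proxy only reads from the Redis set behind self.redis_key; something else has to keep that pool filled. A minimal sketch of such a refresher, assuming a key name of 'proxy:pool' and plain host:port strings (both assumptions):

def refresh_proxy_pool(r, proxies, pool_key='proxy:pool'):
    # Hypothetical helper: atomically replace the proxy pool with a fresh list.
    pipe = r.pipeline()
    pipe.delete(pool_key)
    for proxy in proxies:
        pipe.sadd(pool_key, proxy)
    pipe.execute()

# Example usage with the shared connection from the earlier sketch:
# refresh_proxy_pool(get_redis(), ['1.2.3.4:8080', '5.6.7.8:3128'])
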
Example #8
def parse(self, response):
    if response.status in self.handle_httpstatus_list:
        return

    # Stackoverflow's own "Page Not Found" page can come back with a 200 status.
    if 'StackExchange.ready' in response.body and "Page Not Found" in response.body:
        return

    response.selector.remove_namespaces()

    ids = extract_text_null(self, 'id', response)
    ranks = extract_text_null(self, 'rank', response)
    titles = extract_text_null(self, 'title', response)
    tags = response.xpath(self.tag_xpath).extract()
    author_names = extract_text_null(self, 'author_name', response)
    author_uris = extract_text_null(self, 'author_uri', response)
    links = response.xpath(self.link_xpath).extract()
    publisheds = extract_text_null(self, 'published', response)
    updateds = extract_text_null(self, 'updated', response)
    contents = extract_text_null(self, 'content', response)

    # The first entry of each list describes the question itself.
    item = StackoverflowQuestionItem()
    item['uid'] = response.url.rstrip('/').split('/')[-1]
    item['rank'] = ranks[0]
    item['title'] = titles[0]
    item['tags'] = tags
    item['author_name'] = author_names[0]
    item['author_uri'] = author_uris[0]
    item['author_uid'] = author_uris[0].split('/')[-1]
    item['link'] = links[0]
    item['published'] = publisheds[0]
    item['updated'] = updateds[0]
    item['content'] = contents[0]
    item['answers'] = []

    # The remaining entries are answers; count each one in the metrics hash.
    pipeline = metric.get_redis().pipeline()
    for i in xrange(1, len(ids)):
        answer = {}
        answer['uid'] = ids[i].split('#')[-1]
        pipeline.hincrby(':'.join([metric.metric_key, 'answer']),
                         answer['uid'], 1)
        answer['rank'] = ranks[i]
        answer['author_name'] = author_names[i]
        answer['author_uri'] = author_uris[i]
        answer['author_uid'] = author_uris[i].split('/')[-1]
        answer['link'] = links[i]
        answer['published'] = publisheds[i]
        answer['updated'] = updateds[i]
        answer['content'] = contents[i]
        item['answers'].append(answer)
    pipeline.execute()

    return item
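
extract_text_null is another project helper whose definition is not included on this page. Judging only from how it is called, a rough guess at its shape, assuming the spider stores one XPath per field name as self.<name>_xpath and that the "_null" suffix means missing nodes come back as empty strings (all of this is an assumption):

def extract_text_null(spider, name, response):
    # Hypothetical reconstruction: look up the field's XPath on the spider
    # (e.g. spider.rank_xpath for name='rank') and return the extracted text
    # as a list of strings, empty string for nodes with no text.
    xpath = getattr(spider, name + '_xpath')
    return [text or '' for text in response.xpath(xpath).extract()]
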
Example #11
def spider_exit(self, response, request, spider):
    print "receive signals"
    r = metric.get_redis()
    # An external process sets EXIT_KEY in Redis to ask the crawl to stop.
    if int(r.get(mysettings.EXIT_KEY)):
        self.crawler.engine.close_spider(spider, 'receive redis exit signals')
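
A handler with this (response, request, spider) signature matches Scrapy's response_received signal, so wiring it up would typically happen in an extension's from_crawler. A minimal sketch; the class name and the choice of signal are assumptions based on the handler's arguments:

from scrapy import signals

class RedisExitExtension(object):
    # Hypothetical extension that owns the spider_exit handler above.
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler)
        # Call spider_exit after every downloaded response so the Redis
        # EXIT_KEY flag gets checked regularly.
        crawler.signals.connect(ext.spider_exit, signal=signals.response_received)
        return ext

    def spider_exit(self, response, request, spider):
        # Body as in Example #11 above.
        pass
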