Example #1
0
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt > 2000:
         return []
     post_urls = redis.get_urls(self.redis_conn, 2000,
                                self.store_req_urls_redis_key)
     print "get post_urls num: ", len(post_urls)
     reqs = []
     for post_url in post_urls:
         #http://zhuanlan.zhihu.com/queen
         #http://zhuanlan.zhihu.com/api/columns/queen/posts?limit=10&offset=0
         print post_url
         post_key = urlparse.urlparse(post_url).path.split('/')[1]
         url = 'http://zhuanlan.zhihu.com/api/columns/%s/posts?limit=10&offset=0' % (
             post_key)
         req = Request(url,
                       callback=self.parse,
                       meta={
                           'offset': 0,
                           'post_url': post_url
                       })
         reqs.append(req)
         #http://zhuanlan.zhihu.com/api/columns/queen
         url_post_info = 'http://zhuanlan.zhihu.com/api/columns/%s' % (
             post_key)
         req_post_info = Request(url_post_info,
                                 callback=self.parse_post_info,
                                 meta={'post_url': post_url})
         reqs.append(req_post_info)
         self.total += 2
     return reqs
Example #2
0
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt > 5000:
         return []
     datas = redis.get_urls(self.redis_conn, 2000)
     print "get authors num: ", len(datas)
     reqs = []
     cookie = get_cookie()
     for data in datas:
         author_id, hash_id, follownum = data.split('`$`')
         print 'author_id: ', author_id
         print 'hash_id: ', hash_id
         print 'follownum: ', follownum
         #作者关注
         url = 'http://www.zhihu.com/people/%s/followees' % (author_id)
         req = Request(url,
                       callback=self.parse_author_follow,
                       cookies=cookie,
                       meta={
                           'hash_id': hash_id,
                           'follownum': follownum
                       })
         reqs.append(req)
         self.total += 1
     return reqs
Example #3
0
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt > 5000:
         return []
     author_ids = redis.get_urls(self.redis_conn, 2000)
     print "get authors num: ", len(author_ids)
     reqs_pool = []
     for author_id in author_ids:
         #作者信息
         info_url = 'http://www.zhihu.com/people/%s/about' % (author_id)
         info_req = Request(info_url,
                            callback=self.parse_author_info,
                            headers={'User-Agent': user_agent},
                            cookies=cookie,
                            meta={'author_id': author_id})
         reqs_pool.append(info_req)
         #作者提问
         #             ask_url = 'http://www.zhihu.com/people/%s/asks'%(author_id)
         #             ask_req = Request(ask_url,callback=self.parse_author_ask,meta={'author_id':author_id})
         #             self.reqs_pool.append(ask_req)
         #作者回答
         #             answer_url = 'http://www.zhihu.com/people/%s/answers'%(author_id)
         #             answer_req = Request(answer_url,callback=self.parse_author_answer,meta={'author_id':author_id})
         #             self.reqs_pool.append(answer_req)
         #作者专栏文章
         post_url = 'http://www.zhihu.com/people/%s/posts' % (author_id)
         post_req = Request(post_url,
                            callback=self.parse_author_post,
                            meta={'author_id': author_id})
         reqs_pool.append(post_req)
     return reqs_pool
    def make_requests(self):
        cnt = self.get_spider_pending_cnt(self.total)
        if cnt>5000:
            return []
        author_ids = redis.get_urls(self.redis_conn, 2000)
        print "get authors num: ",len(author_ids)
        reqs_pool = []
        for author_id in author_ids:
            #作者信息
            info_url = 'http://www.zhihu.com/people/%s/about'%(author_id)
            info_req = Request(info_url,callback=self.parse_author_info,headers={'User-Agent':user_agent},cookies=cookie,meta={'author_id':author_id})
            reqs_pool.append(info_req)
            #作者提问
#             ask_url = 'http://www.zhihu.com/people/%s/asks'%(author_id)
#             ask_req = Request(ask_url,callback=self.parse_author_ask,meta={'author_id':author_id})
#             self.reqs_pool.append(ask_req)
            #作者回答
#             answer_url = 'http://www.zhihu.com/people/%s/answers'%(author_id)
#             answer_req = Request(answer_url,callback=self.parse_author_answer,meta={'author_id':author_id})
#             self.reqs_pool.append(answer_req)
            #作者专栏文章
            post_url = 'http://www.zhihu.com/people/%s/posts'%(author_id)
            post_req = Request(post_url,callback=self.parse_author_post,meta={'author_id':author_id})
            reqs_pool.append(post_req)
        return reqs_pool
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt>3000:
         return []
     author_ids = redis.get_urls(self.redis_conn, 2000,key='zhihu_update_post')
     print "get authors num: ",len(author_ids)
     reqs_pool = []
     for author_id in author_ids:
         #作者专栏文章
         post_url = 'http://www.zhihu.com/people/%s/posts'%(author_id)
         post_req = Request(post_url,callback=self.parse_author_post,meta={'author_id':author_id})
         reqs_pool.append(post_req)
     return reqs_pool
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     print 'cnt: ',cnt
     if cnt>5000:
         return []
     author_ids = redis.get_urls(self.redis_conn, 2000, key=self.rediskey)
     print "get authors num: ",len(author_ids)
     reqs_pool = []
     for author_id in author_ids:
         #作者主页
         author_url = 'http://www.zhihu.com/people/%s'%(author_id)
         author_req = Request(author_url,callback=self.parse_author_info,meta={'author_id':author_id})
         reqs_pool.append(author_req)
         self.total += 1
     return reqs_pool
Example #7
0
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt > 3000:
         return []
     author_ids = redis.get_urls(self.redis_conn, 2000, key=self.key)
     print "get authors num: ", len(author_ids)
     reqs_pool = []
     for author_id in author_ids:
         #作者专栏文章
         post_url = 'http://www.zhihu.com/people/%s/posts' % (author_id)
         post_req = Request(post_url,
                            callback=self.parse_author_post,
                            meta={'author_id': author_id})
         reqs_pool.append(post_req)
         self.total += 1
     return reqs_pool
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     if cnt>5000:
         return []
     datas = redis.get_urls(self.redis_conn, 2000)
     print "get authors num: ",len(datas)
     reqs = []
     cookie = get_cookie()
     for data in datas:
         author_id,hash_id,follownum = data.split('`$`')
         #作者关注
         url = 'http://www.zhihu.com/people/%s/followees'%(author_id)
         req = Request(url,callback=self.parse_author_follow,cookies=cookie,meta={'hash_id':hash_id,'follownum':follownum})
         reqs.append(req)
         self.total += 1
     return reqs
 def make_requests(self):
     cnt = self.get_spider_pending_cnt(self.total)
     print 'cnt: ', cnt
     if cnt > 5000:
         return []
     author_ids = redis.get_urls(self.redis_conn, 2000, key='zhihu_answer')
     print "get authors num: ", len(author_ids)
     reqs_pool = []
     for author_id in author_ids:
         #作者回答
         answer_url = 'http://www.zhihu.com/people/%s/answers' % (author_id)
         answer_req = Request(answer_url,
                              callback=self.parse_author_answer,
                              meta={'author_id': author_id})
         reqs_pool.append(answer_req)
         self.total += 1
     return reqs_pool
    def make_requests(self):
        cnt = self.get_spider_pending_cnt(self.total)
        if cnt>2000:
            return []
        post_urls = redis.get_urls(self.redis_conn, 2000,self.store_req_urls_redis_key)
        print "get post_urls num: ",len(post_urls)
        reqs = []
        for post_url in post_urls:
            #http://zhuanlan.zhihu.com/queen
            #http://zhuanlan.zhihu.com/api/columns/queen/posts?limit=10&offset=0
	    print post_url
            post_key = urlparse.urlparse(post_url).path.split('/')[1]
            url = 'http://zhuanlan.zhihu.com/api/columns/%s/posts?limit=10&offset=0'%(post_key)
            req = Request(url,callback=self.parse,meta={'offset':0,'post_url':post_url})
            reqs.append(req)
            #http://zhuanlan.zhihu.com/api/columns/queen
            url_post_info = 'http://zhuanlan.zhihu.com/api/columns/%s'%(post_key)
            req_post_info = Request(url_post_info,callback=self.parse_post_info,meta={'post_url':post_url})
            reqs.append(req_post_info)
            self.total += 2
        return reqs