def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    if cnt > 2000:
        return []
    post_urls = redis.get_urls(self.redis_conn, 2000, self.store_req_urls_redis_key)
    print "get post_urls num: ", len(post_urls)
    reqs = []
    for post_url in post_urls:
        # http://zhuanlan.zhihu.com/queen
        # http://zhuanlan.zhihu.com/api/columns/queen/posts?limit=10&offset=0
        print post_url
        # The column slug is the first path segment of the column URL.
        post_key = urlparse.urlparse(post_url).path.split('/')[1]
        # First page of the column's posts; parse() pages onward via meta['offset'].
        url = 'http://zhuanlan.zhihu.com/api/columns/%s/posts?limit=10&offset=0' % post_key
        req = Request(url, callback=self.parse,
                      meta={'offset': 0, 'post_url': post_url})
        reqs.append(req)
        # Column metadata: http://zhuanlan.zhihu.com/api/columns/queen
        url_post_info = 'http://zhuanlan.zhihu.com/api/columns/%s' % post_key
        req_post_info = Request(url_post_info, callback=self.parse_post_info,
                                meta={'post_url': post_url})
        reqs.append(req_post_info)
        self.total += 2  # two requests queued per column
    return reqs
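# A minimal sketch of the parse() callback the column requests above expect.
# Assumptions (not confirmed by the source): the API returns a JSON array of
# post objects, and paging works by advancing meta['offset'] in steps of the
# limit (10) until a short page comes back.
import json

def parse(self, response):
    posts = json.loads(response.body)
    for post in posts:
        pass  # hand each post dict to the item pipeline / storage layer here
    if len(posts) == 10:  # a full page suggests more posts remain
        offset = response.meta['offset'] + 10
        post_url = response.meta['post_url']
        post_key = urlparse.urlparse(post_url).path.split('/')[1]
        next_url = ('http://zhuanlan.zhihu.com/api/columns/%s/posts'
                    '?limit=10&offset=%d' % (post_key, offset))
        yield Request(next_url, callback=self.parse,
                      meta={'offset': offset, 'post_url': post_url})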
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    if cnt > 5000:
        return []
    datas = redis.get_urls(self.redis_conn, 2000)
    print "get authors num: ", len(datas)
    reqs = []
    cookie = get_cookie()
    for data in datas:
        # Queue entries are '`$`'-delimited: author id, hash id, follower count.
        author_id, hash_id, follownum = data.split('`$`')
        print 'author_id: ', author_id
        print 'hash_id: ', hash_id
        print 'follownum: ', follownum
        # The author's followees
        url = 'http://www.zhihu.com/people/%s/followees' % author_id
        req = Request(url, callback=self.parse_author_follow, cookies=cookie,
                      meta={'hash_id': hash_id, 'follownum': follownum})
        reqs.append(req)
        self.total += 1
    return reqs
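# Every variant here drains its work queue through redis.get_urls. A minimal
# sketch of that helper, assuming it batch-pops up to `num` entries from a
# Redis list; the default key name 'zhihu_author' and the lpop loop are
# assumptions -- only the call signature appears in the code above.
def get_urls(redis_conn, num, key='zhihu_author'):
    urls = []
    for _ in xrange(num):
        item = redis_conn.lpop(key)
        if item is None:  # queue drained before the batch filled
            break
        urls.append(item)
    return urls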
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    if cnt > 5000:
        return []
    author_ids = redis.get_urls(self.redis_conn, 2000)
    print "get authors num: ", len(author_ids)
    reqs_pool = []
    cookie = get_cookie()  # was referenced but never assigned in the original
    for author_id in author_ids:
        # Author profile
        info_url = 'http://www.zhihu.com/people/%s/about' % author_id
        info_req = Request(info_url, callback=self.parse_author_info,
                           headers={'User-Agent': user_agent}, cookies=cookie,
                           meta={'author_id': author_id})
        reqs_pool.append(info_req)
        # Author questions (disabled)
        # ask_url = 'http://www.zhihu.com/people/%s/asks' % author_id
        # ask_req = Request(ask_url, callback=self.parse_author_ask, meta={'author_id': author_id})
        # reqs_pool.append(ask_req)
        # Author answers (disabled)
        # answer_url = 'http://www.zhihu.com/people/%s/answers' % author_id
        # answer_req = Request(answer_url, callback=self.parse_author_answer, meta={'author_id': author_id})
        # reqs_pool.append(answer_req)
        # Author column posts
        post_url = 'http://www.zhihu.com/people/%s/posts' % author_id
        post_req = Request(post_url, callback=self.parse_author_post,
                           meta={'author_id': author_id})
        reqs_pool.append(post_req)
        self.total += 2  # missing in the original; the sibling variants count issued requests
    return reqs_pool
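# get_cookie() is called above but never shown. A plausible sketch, assuming
# logged-in Zhihu cookies are kept in a local JSON file as a name->value dict;
# the file name and format are assumptions (a Redis-backed cookie pool would
# slot in the same way).
import json

def get_cookie():
    with open('cookies.json') as f:
        return json.load(f)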
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    if cnt > 3000:
        return []
    author_ids = redis.get_urls(self.redis_conn, 2000, key='zhihu_update_post')
    print "get authors num: ", len(author_ids)
    reqs_pool = []
    for author_id in author_ids:
        # Author column posts
        post_url = 'http://www.zhihu.com/people/%s/posts' % author_id
        post_req = Request(post_url, callback=self.parse_author_post,
                           meta={'author_id': author_id})
        reqs_pool.append(post_req)
        self.total += 1  # missing in the original; the throttle check relies on it
    return reqs_pool
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    print 'cnt: ', cnt
    if cnt > 5000:
        return []
    author_ids = redis.get_urls(self.redis_conn, 2000, key=self.rediskey)
    print "get authors num: ", len(author_ids)
    reqs_pool = []
    for author_id in author_ids:
        # Author homepage
        author_url = 'http://www.zhihu.com/people/%s' % author_id
        author_req = Request(author_url, callback=self.parse_author_info,
                             meta={'author_id': author_id})
        reqs_pool.append(author_req)
        self.total += 1
    return reqs_pool
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    if cnt > 3000:
        return []
    author_ids = redis.get_urls(self.redis_conn, 2000, key=self.key)
    print "get authors num: ", len(author_ids)
    reqs_pool = []
    for author_id in author_ids:
        # Author column posts
        post_url = 'http://www.zhihu.com/people/%s/posts' % author_id
        post_req = Request(post_url, callback=self.parse_author_post,
                           meta={'author_id': author_id})
        reqs_pool.append(post_req)
        self.total += 1
    return reqs_pool
def make_requests(self):
    cnt = self.get_spider_pending_cnt(self.total)
    print 'cnt: ', cnt
    if cnt > 5000:
        return []
    author_ids = redis.get_urls(self.redis_conn, 2000, key='zhihu_answer')
    print "get authors num: ", len(author_ids)
    reqs_pool = []
    for author_id in author_ids:
        # Author answers
        answer_url = 'http://www.zhihu.com/people/%s/answers' % author_id
        answer_req = Request(answer_url, callback=self.parse_author_answer,
                             meta={'author_id': author_id})
        reqs_pool.append(answer_req)
        self.total += 1
    return reqs_pool
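# Each variant gates on get_spider_pending_cnt(self.total) before pulling more
# work, so the scheduler never holds more than a few thousand requests. A
# minimal sketch of that backpressure check, assuming pending = requests
# issued so far minus responses already received; the stats-based accounting
# is an assumption -- the source only shows the method's name and argument.
def get_spider_pending_cnt(self, total):
    # 'response_received_count' is maintained by Scrapy's CoreStats extension
    done = self.crawler.stats.get_value('response_received_count', 0)
    return total - done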