def parse_user(self, response):
    """Parse a user's basic profile from the JSON API response.

    Deserializes the response body, fills a ZhihuUserItem field-by-field,
    then schedules the followees and fans list requests for this user.

    :param response: scrapy Response whose body is the user-profile JSON.
    :return: generator yielding the item, then two scrapy.Request objects.
    """
    result = json.loads(response.text)
    item = ZhihuUserItem()
    # Copy every field declared on the Item straight from the JSON payload.
    # (Membership test + direct indexing replaces the original redundant
    # `in result.keys()` check followed by a second `.get()` lookup.)
    for field in item.fields:
        if field in result:
            item[field] = result[field]
    # The JSON key is 'id' but the Item declares 'userid', so map it explicitly.
    if 'id' in result:
        item['userid'] = result['id']
    item['updatetime'] = datetime.datetime.now().isoformat(' ')
    yield item
    # Schedule the followees-list request for this user.
    yield scrapy.Request(
        url=self.followees_url.format(
            username=result.get('url_token'),
            followees_query=self.followees_query,
            offset=0,
            limit=20),
        callback=self.parse_followees)
    # Schedule the followers (fans) list request for this user.
    yield scrapy.Request(
        url=self.fans_url.format(
            username=result.get('url_token'),
            fans_query=self.fans_query,
            offset=0,
            limit=20),
        callback=self.parse_fans)
def parse_user(self, response):
    """Parse a user's profile HTML page into a ZhihuUserItem.

    Extracts counters (agrees/thanks/collections) by regex from the raw
    HTML, maps profile-header icons to sex/career/education, loads the
    remaining fields via XPath, yields the item, and finally schedules the
    user's answer-list API request when the user has answers.

    :param response: scrapy Response for a user profile page; expects
        ``response.meta['author_id']`` to be set by the caller.
    """
    item_loader = ItemLoader(item=ZhihuUserItem(), response=response)
    # Regexes pull the "获得 N 次赞同 / 次感谢 / 次收藏" counters out of the HTML.
    vote_pat = '<div class="IconGraf".*?</div>.*?获得.*?>([0-9,]+)<.*?次赞同.*?</div>'
    thanks_pat = '次赞同.*?<div.*?获得(.*?)次感谢'
    collection_pat = '次感谢.*?>([0-9,]+).*?次收藏'
    get_vote_num = re.compile(vote_pat, re.DOTALL).findall(response.text)
    get_thanks = re.compile(thanks_pat, re.DOTALL).findall(response.text)
    get_collection = re.compile(collection_pat, re.DOTALL).findall(response.text)
    answer_num = response.xpath('//li[@aria-controls="Profile-answers"]/a/span/text()').extract_first()
    # Pair each profile-header label with the SVG <path d="..."> code that
    # identifies which icon (sex / job / education) the label belongs to.
    inf_pat = '<div class="ProfileHeader-iconWrapper".*?</div>.*?>(.*?)<.*?(</div>|<div>)'
    code_pat = '<div class="ProfileHeader-iconWrapper".*?<svg.*?<path d="(.*?)".*?>'
    inf = re.compile(inf_pat, re.DOTALL).findall(response.text)
    code = re.compile(code_pat, re.DOTALL).findall(response.text)
    # BUG FIX: the original condition was `if inf and code is not None:`.
    # re.findall() never returns None, so that clause was always true and an
    # empty `code` list was not filtered out. Test truthiness of both lists.
    if inf and code:
        inf_code = list(zip([i[0] for i in inf], code))
        for label, icon in inf_code:
            if icon == MAN_CODE:
                item_loader.add_value('sex', "男")
            elif icon == WOMAN_CODE:
                item_loader.add_value('sex', "女")
            elif icon == JOB_CODE:
                item_loader.add_value('career', label)
            elif icon == EDUCATION:
                item_loader.add_value('educational_experience', label)
    item_loader.add_value('user_url', response.url)
    item_loader.add_value('url_id', get_md5(response.url))
    item_loader.add_xpath('name', '//h1[@class="ProfileHeader-title"]/span[1]/text()')
    item_loader.add_xpath('introduce_yourself', '//h1[@class="ProfileHeader-title"]/span[2]/text()')
    item_loader.add_value('get_vote_num', get_vote_num)
    item_loader.add_value('get_thanks', get_thanks)
    item_loader.add_value('get_collection', get_collection)
    item_loader.add_xpath('followers', '//div[@class="Card FollowshipCard"]/div/a[2]/div/strong/text()')
    item_loader.add_xpath('following', '//div[@class="Card FollowshipCard"]/div/a[1]/div/strong/text()')
    item_loader.add_value('answer_num', answer_num)
    item_loader.add_xpath('questions_num', '//li[@aria-controls="Profile-asks"]/a/span/text()')
    item_loader.add_xpath('articles_num', '//li[@aria-controls="Profile-posts"]/a/span/text()')
    item_loader.add_xpath('columns_num', '//li[@aria-controls="Profile-columns"]/a/span/text()')
    item_loader.add_xpath('ideal_num', '//li[@aria-controls="Profile-pins"]/a/span/text()')
    author_item = item_loader.load_item()
    yield author_item
    # Only crawl the answer API when this user actually has answers.
    author_id = response.meta.get("author_id", "")
    if author_id is not None and answer_num is not None and answer_num != '0':
        user_answer_url = self.user_answer_api.format(author_id, 0, 20)
        yield Request(url=user_answer_url, callback=self.recursion_question)
def parse_user(self, response):
    """Deserialize a profile-JSON response into a ZhihuUserItem, then queue
    the followees and followers list requests for the same user.
    """
    data = json.loads(response.text)
    item = ZhihuUserItem()
    # Fill only the fields the Item declares and the JSON actually contains.
    for key in item.fields:
        if key in data.keys():
            item[key] = data.get(key)
    yield item
    token = data.get('url_token')
    followees_request = Request(
        self.follows_url.format(
            user=token, include=self.follows_query, limit=20, offset=0),
        self.parse_follows)
    followers_request = Request(
        self.followers_url.format(
            user=token, include=self.followers_query, limit=20, offset=0),
        self.parse_followers)
    yield followees_request
    yield followers_request
def parse_user(self, response):
    """Scrape a user's profile HTML page into a ZhihuUserItem, then paginate
    through the user's followee and follower lists.

    Python 2 code (print statement, xrange, integer `/` division).

    :param response: scrapy Response for a ``/people/<name>/about`` page.
    :return: generator yielding the item, then one Request per 20-entry page
        of the followees and followers list endpoints.
    """
    selector = Selector(response)
    user = ZhihuUserItem()
    # The username is the second-to-last URL segment (".../people/<name>/about").
    user['_id'] = user['username'] = response.url.split('/')[-2]
    user['url'] = response.url
    user['nickname'] = ''.join(
        selector.xpath(
            "//div[@class='title-section ellipsis']/a[@class='name']/text()"
        ).extract())
    user['location'] = ''.join(
        selector.xpath("//span[@class='location item']/@title").extract())
    user['industry'] = ''.join(
        selector.xpath("//span[@class='business item']/@title").extract())
    # The gender icon's CSS class ends with the gender name; stripping the
    # common prefix leaves just that suffix.
    user['sex'] = ''.join(
        selector.xpath(
            '//div[@class="item editable-group"]/span/span[@class="item"]/i/@class'
        ).extract()).replace("zg-icon gender ", "")
    user['description'] = ''.join(
        selector.xpath(
            "//span[@class='description unfold-item']/span/text()").
        extract()).strip().replace("\n", '')
    user['view_num'] = ''.join(
        selector.xpath(
            "//span[@class='zg-gray-normal']/strong/text()").extract())
    user['update_time'] = str(datetime.now())
    # Employment history: one <li> per job, data in attributes.
    user['jobs'] = []
    job_nodes = selector.xpath(
        '//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li'
    )
    for node in job_nodes:
        company = ''.join(node.xpath('@data-title').extract())
        title = ''.join(node.xpath('@data-sub-title').extract())
        user['jobs'].append({'company': company, 'title': title})
    # Education history: same attribute layout, third profile module.
    user['educations'] = []
    edu_nodes = selector.xpath(
        '//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li'
    )
    for node in edu_nodes:
        school = ''.join(node.xpath('@data-title').extract())
        major = ''.join(node.xpath('@data-sub-title').extract())
        user['educations'].append({'school': school, 'major': major})
    # Linked weibo accounts, distinguished by URL prefix.
    user['sinaweibo'] = ''
    user['tencentweibo'] = ''
    for node in selector.xpath(
            "//a[@class='zm-profile-header-user-weibo']/@href").extract():
        if node.startswith('http://weibo.com'):
            user['sinaweibo'] = node
        elif node.startswith('http://t.qq.com'):
            user['tencentweibo'] = node
    # NOTE(review): the two indexings below raise IndexError when the page
    # layout changes or the login wall is hit — no guard like the len()
    # checks used for the later statistics. Confirm intended.
    statistics = selector.xpath(
        "//a[@class='item']/strong/text()").extract()
    followee_num = user['followee_num'] = statistics[0]
    follower_num = user['follower_num'] = statistics[1]
    statistics = selector.xpath(
        "//div[@class='zm-profile-module-desc']/span/strong/text()"
    ).extract()
    if len(statistics) == 4:
        user['agree_num'] = statistics[0]
        user['thank_num'] = statistics[1]
        user['fav_num'] = statistics[2]
        user['share_num'] = statistics[3]
    statistics = selector.xpath(
        "//div[@class='profile-navbar clearfix']/a/span/text()").extract()
    if len(statistics) == 6:
        # Index 0 is presumably the "home" tab label and is skipped.
        user['ask_num'] = statistics[1]
        user['answer_num'] = statistics[2]
        user['post_num'] = statistics[3]
        user['collection_num'] = statistics[4]
        user['log_num'] = statistics[5]
    # CSRF token and the user's hash_id, both required by the follow-list
    # AJAX endpoints below.
    _xsrf = ''.join(
        selector.xpath('//input[@name="_xsrf"]/@value').extract())
    hash_id = ''.join(
        selector.xpath(
            '//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id'
        ).extract())
    yield user
    self.user_names.append(user['username'])
    print 'NEW:%s' % user['username']
    # Followees: 20 per page; ceil(num / 20) pages (py2 integer division).
    num = int(followee_num) if followee_num else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        params = json.dumps({
            "hash_id": hash_id,
            "order_by": "created",
            "offset": i * 20
        })
        payload = {"method": "next", "params": params, "_xsrf": _xsrf}
        yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?" +
                      urlencode(payload),
                      callback=self.parse_follow_url)
    # Followers: same pagination against the followers endpoint.
    num = int(follower_num) if follower_num else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        params = json.dumps({
            "hash_id": hash_id,
            "order_by": "created",
            "offset": i * 20
        })
        payload = {"method": "next", "params": params, "_xsrf": _xsrf}
        yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?" +
                      urlencode(payload),
                      callback=self.parse_follow_url)
def parse_user(self, response):
    """Scrape a user profile page into a ZhihuUserItem and fan out requests
    for the user's questions, answers, followees and followers.

    Python 2 code (print statement, xrange, integer `/` division).

    :param response: scrapy Response for a ``/people/<name>`` or
        ``/people/<name>/about`` page.
    """
    print 'parsing user: %s' % response.url
    selector = Selector(response)
    user = ZhihuUserItem()
    # Username position in the URL depends on whether the '/about' suffix
    # is present.
    if(response.url.endswith('about')):
        user['_id'] = user['username'] = response.url.split('/')[-2]
    else:
        user['_id'] = user['username'] = response.url.split('/')[-1]
    user['url'] = response.url
    user['nickname'] = ''.join(
        selector.xpath("//div[@class='title-section ellipsis']/a[@class='name']/text()").extract())
    user['location'] = ''.join(selector.xpath("//span[@class='location item']/@title").extract())
    user['industry'] = ''.join(selector.xpath("//span[@class='business item']/@title").extract())
    # Gender icon class ends with the gender name after the common prefix.
    user['sex'] = ''.join(
        selector.xpath('//div[@class="item editable-group"]/span/span[@class="item"]/i/@class').extract()).replace(
        "zg-icon gender ", "")
    user['description'] = ''.join(
        selector.xpath("//span[@class='description unfold-item']/span/text()").extract()).strip().replace("\n", '')
    user['view_num'] = ''.join(selector.xpath("//span[@class='zg-gray-normal']/strong/text()").extract())
    user['update_time'] = str(datetime.now())
    # Employment entries: one <li> per job, values in data-* attributes.
    user['jobs'] = []
    job_nodes = selector.xpath(
        '//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li')
    for node in job_nodes:
        company = ''.join(node.xpath('@data-title').extract())
        title = ''.join(node.xpath('@data-sub-title').extract())
        user['jobs'].append({'company': company, 'title': title})
    # Education entries: same layout, third profile module.
    user['educations'] = []
    edu_nodes = selector.xpath(
        '//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li')
    for node in edu_nodes:
        school = ''.join(node.xpath('@data-title').extract())
        major = ''.join(node.xpath('@data-sub-title').extract())
        user['educations'].append({'school': school, 'major': major})
    # Linked weibo accounts, distinguished by URL prefix.
    user['sinaweibo'] = ''
    user['tencentweibo'] = ''
    for node in selector.xpath("//a[@class='zm-profile-header-user-weibo']/@href").extract():
        if node.startswith('http://weibo.com'):
            user['sinaweibo'] = node
        elif node.startswith('http://t.qq.com'):
            user['tencentweibo'] = node
    # NOTE(review): statistics[0]/[1] raise IndexError when the xpath matches
    # nothing (layout change / login wall) — unguarded, unlike the len()
    # checks below. Confirm intended.
    statistics = selector.xpath("//a[@class='item']/strong/text()").extract()
    followee_num = user['followee_num'] = statistics[0]
    follower_num = user['follower_num'] = statistics[1]
    statistics = selector.xpath("//div[@class='zm-profile-module-desc']/span/strong/text()").extract()
    if len(statistics) == 4:
        user['agree_num'] = statistics[0]
        user['thank_num'] = statistics[1]
        user['fav_num'] = statistics[2]
        user['share_num'] = statistics[3]
    statistics = selector.xpath("//div[@class='profile-navbar clearfix']/a/span/text()").extract()
    if len(statistics) == 6:
        # Index 0 is presumably the "home" tab label and is skipped.
        user['ask_num'] = statistics[1]
        user['answer_num'] = statistics[2]
        user['post_num'] = statistics[3]
        user['collection_num'] = statistics[4]
        user['log_num'] = statistics[5]
    # CSRF token and hash_id needed by the follow-list AJAX endpoints below.
    _xsrf = ''.join(selector.xpath('//input[@name="_xsrf"]/@value').extract())
    hash_id = ''.join(
        selector.xpath('//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id').extract())
    # questions: 20 per page, ceil(num / 20) pages (py2 integer division).
    num = int(user['ask_num']) if "ask_num" in user.keys() else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        url = host + "/people/" + user["username"] + '/asks?page=%d' % (i + 1)
        yield Request(url, callback=self.parse_ask)
    # answers: same pagination against the answers pages.
    num = int(user['answer_num']) if "answer_num" in user.keys() else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        yield Request(host + "/people/" + user["username"] + '/answers?page=%d' % (i + 1),
                      callback=self.parse_ans)
    self.user_names.append(user['username'])
    print 'User parsed: %s' % user['username']
    yield user
    # Followees pagination via the AJAX list endpoint.
    num = int(followee_num) if followee_num else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        params = json.dumps({"hash_id": hash_id, "order_by": "created", "offset": i * 20})
        payload = {"method": "next", "params": params, "_xsrf": _xsrf}
        yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?"
                      + urlencode(payload), callback=self.parse_follow_url)
    # Followers pagination via the AJAX list endpoint.
    num = int(follower_num) if follower_num else 0
    page_num = num / 20
    page_num += 1 if num % 20 else 0
    for i in xrange(page_num):
        params = json.dumps({"hash_id": hash_id, "order_by": "created", "offset": i * 20})
        payload = {"method": "next", "params": params, "_xsrf": _xsrf}
        yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?"
                      + urlencode(payload), callback=self.parse_follow_url)
def parse(self, response):
    """Single dispatch callback for the whole crawl (Python 2 code).

    Routes by URL: the start page seeds one profile request; '/about' pages
    are scraped into a ZhihuUserItem and paginated follow-list requests;
    followees list pages are mined for new profile links. Failures dump the
    response body under error_pages/ for offline inspection.
    """
    # Small random delay to throttle the crawl.
    time.sleep(random.random())
    if response.url == host:
        # Seed: start from one known profile page.
        yield Request("http://www.zhihu.com/people/raymond-wang/about",
                      headers = self.headers, cookies = self.cookies)
    else:
        # Last URL segment decides what kind of page this is.
        typeinfo = response.url.split('/')[-1]
        selector = Selector(response)
        if typeinfo.startswith('about'):
            try:
                user = ZhihuUserItem()
                # Username is the second-to-last segment of ".../people/<name>/about".
                user['_id']=user['username']=response.url.split('/')[-2]
                user['url']= response.url
                user['nickname'] = ''.join(selector.xpath("//div[@class='title-section ellipsis']/a[@class='name']/text()").extract())
                user['location'] = ''.join(selector.xpath("//span[@class='location item']/@title").extract())
                user['industry'] = ''.join(selector.xpath("//span[@class='business item']/@title").extract())
                # Gender icon class ends with the gender name after the prefix.
                user['sex'] = ''.join(selector.xpath('//div[@class="item editable-group"]/span/span[@class="item"]/i/@class').extract()).replace("zg-icon gender ","")
                user['description'] = ''.join(selector.xpath("//span[@class='description unfold-item']/span/text()").extract()).strip().replace("\n",'')
                user['view_num'] = ''.join(selector.xpath("//span[@class='zg-gray-normal']/strong/text()").extract())
                user['update_time'] = str(datetime.now())
                # Employment entries from data-* attributes of each <li>.
                user['jobs'] = []
                job_nodes = selector.xpath('//div[@class="zm-profile-module zg-clear"][1]/div/ul[@class="zm-profile-details-items"]/li')
                for node in job_nodes:
                    company = ''.join(node.xpath('@data-title').extract())
                    title = ''.join(node.xpath('@data-sub-title').extract())
                    user['jobs'].append({'company': company, 'title':title})
                # Education entries, same layout (third profile module).
                user['educations'] = []
                edu_nodes = selector.xpath('//div[@class="zm-profile-module zg-clear"][3]/div/ul[@class="zm-profile-details-items"]/li')
                for node in edu_nodes:
                    school = ''.join(node.xpath('@data-title').extract())
                    major = ''.join(node.xpath('@data-sub-title').extract())
                    user['educations'].append({'school':school, 'major':major})
                # Linked weibo accounts, distinguished by URL prefix.
                # NOTE(review): unlike the sibling spiders, 'sinaweibo' /
                # 'tencentweibo' are never initialized here, so they stay
                # unset when no link is present — confirm the Item tolerates that.
                for node in selector.xpath("//a[@class='zm-profile-header-user-weibo']/@href").extract():
                    if node.startswith('http://weibo.com'):
                        user['sinaweibo'] = node
                    elif node.startswith('http://t.qq.com'):
                        user['tencentweibo'] = node
                # NOTE(review): statistics[0]/[1] raise IndexError on layout
                # change; the except below will swallow it and dump the page.
                statistics = selector.xpath("//a[@class='item']/strong/text()").extract()
                followee_num =user['followee_num'] = statistics[0]
                follower_num = user['follower_num']= statistics[1]
                statistics = selector.xpath("//div[@class='zm-profile-module-desc']/span/strong/text()").extract()
                if len(statistics) ==4:
                    user['agree_num'] = statistics[0]
                    user['thank_num'] = statistics[1]
                    user['fav_num'] = statistics[2]
                    user['share_num'] = statistics[3]
                statistics = selector.xpath("//div[@class='profile-navbar clearfix']/a/span/text()").extract()
                if len(statistics) ==6:
                    user['ask_num'] = statistics[1]
                    user['answer_num'] = statistics[2]
                    user['post_num'] = statistics[3]
                    user['collection_num'] = statistics[4]
                    user['log_num'] = statistics[5]
                # CSRF token and hash_id required by the follow-list AJAX endpoints.
                _xsrf=''.join(selector.xpath('//input[@name="_xsrf"]/@value').extract())
                hash_id=''.join(selector.xpath('//div[@class="zm-profile-header-op-btns clearfix"]/button/@data-id').extract())
                print 'NEW:%s' % user['username']
                yield user
                self.user_names.append(user['username'])
                print 'NEW:%s' % user['username']
                base_url = '/'.join(response.url.split('/')[:-1])
                # NOTE(review): this aliases (not copies) self.headers, so the
                # Referer mutation below leaks into the shared dict — confirm intended.
                headers = self.headers
                headers['Referer'] = response.url
                # followees: 20 per page, ceil(num / 20) pages (py2 int division).
                num = int(followee_num) if followee_num else 0
                page_num = num/20
                page_num += 1 if num%20 else 0
                for i in xrange(page_num):
                    params = json.dumps({"hash_id":hash_id,"order_by":"created","offset":i*20})
                    payload = {"method":"next", "params": params, "_xsrf":_xsrf,"username":user['username']}
                    yield Request("http://www.zhihu.com/node/ProfileFolloweesListV2?"+urlencode(payload), headers = headers, cookies = self.cookies)
                # followers: same pagination against the followers endpoint.
                num = int(follower_num) if follower_num else 0
                page_num = num/20
                page_num += 1 if num%20 else 0
                for i in xrange(page_num):
                    params = json.dumps({"hash_id":hash_id,"order_by":"created","offset":i*20})
                    payload = {"method":"next", "params": params, "_xsrf":_xsrf,"username":user['username']}
                    yield Request("http://www.zhihu.com/node/ProfileFollowersListV2?"+urlencode(payload), headers = headers, cookies = self.cookies)
                # Disabled question/answer pagination kept as a string literal.
                '''
                # questions
                num = int(user['ask_num']) if user['ask_num'] else 0
                page_num = num/20
                page_num += 1 if num%20 else 0
                for i in xrange(page_num):
                    if i > 0:
                        headers['Referer'] = base_url + '/asks?page=%d' % (i-1)
                    else:
                        headers['Referer'] = base_url + '/asks'
                    yield Request(base_url + '/asks?page=%d' % (i+1), headers = headers, cookies = self.cookies)
                # answers
                num = int(user['answer_num']) if user['answer_num'] else 0
                page_num = num/20
                page_num += 1 if num%20 else 0
                for i in xrange(page_num):
                    if i > 0:
                        headers['Referer'] = base_url + '/answers?page=%d' % (i-1)
                    else:
                        headers['Referer'] = base_url + '/answers'
                    yield Request(base_url + '/answers?page=%d' % (i+1), headers = headers, cookies = self.cookies)
                '''
            except Exception, e:
                # Best effort: save the failing page for offline debugging.
                open('error_pages/about_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                print '='*10 + str(e)
        elif typeinfo.startswith('followees') or typeinfo.startswith('ProfileFolloweesListV2'):
            # Followees list page: harvest profile links and queue unseen users.
            followees = []
            try:
                links = selector.xpath('//div[@class="zm-list-content-medium"]/h2/a/@href').extract()
                for link in links:
                    username_tmp = link.split('/')[-1]
                    followees.append(username_tmp)
                    if username_tmp in self.user_names:
                        # Already crawled; skip.
                        print 'GET:' + '%s' % username_tmp
                        continue
                    headers = self.headers; headers['Referer'] = response.url
                    yield Request(link+'/about', headers = headers, cookies=self.cookies)
                # Owner of this followees list, recovered from the query string.
                username=urlparse.parse_qs(urlparse.urlparse(response.url).query,True)['username'][0]
                #yield ZhihuFolloweesItem(_id=username,username = username,followees = followees)
            except Exception, e:
                # Best effort: save the failing page for offline debugging.
                open('error_pages/followees_' + response.url.split('/')[-2]+'.html', 'w').write(response.body)
                print '='*10 + str(e)