def mainpage_parse(self,response): #inspect_response(response,self) if response == None: yield self.start_requests() exit(1) sel = Selector(response) login_user = {} login_user['toCrawl_user_id'] = response.meta['user_id'] login_user['toCrawl_user_nick'] = self.get_property(response,"onick") login_user['login_user_id'] = self.get_property(response,"uid") login_user['page_id'] = self.get_property(response,"page_id") login_user['domain'] = self.get_property(response,"domain") # if uid's mainpage does note exist, pass to next user in queue if login_user['login_user_id'] is None or login_user['page_id'] is None or login_user['domain'] is None: log.msg(' user toCrawl id:%s does not exist! Please try next one' % login_user['toCrawl_user_id'],level=log.DEBUG) next_uid = self.forward_crawling_redis() if next_uid: self.redis_server.lpush(self.ids_problem_name,next_uid) self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1) trypage_url = QueryFactory.mainpage_query(next_uid) mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid}) yield mainpage_request else: log.msg(' Queue is empty, task to terminate.',level=log.INFO) else: print '\n',login_user,'\n' login_user_profile_url = QueryFactory.info_query(page_id=login_user['page_id'], domain=login_user['domain']) log.msg(' user toCrawl id: %s, user login id: %s' \ % (login_user['toCrawl_user_id'],login_user['login_user_id']), level=log.INFO) request = Request(url=login_user_profile_url,callback=self.user_info_parse,meta={'login_user':login_user}) yield request
def weibo_pages_num(self,response): #inspect_response(response,self) if response == None: yield self.start_requests() exit(1) # pares the current js response self.weibo_parse(response) login_user = response.meta['login_user'] # load response in json form html_block_soup = self.json_load_response(response) # get the tag containing the max num of page page_list_tag = html_block_soup.find('div',{'action-type':'feed_list_page_morelist'}) MAX_PAGE_NUM = 50 if page_list_tag: total_num_pages = int(re.search(r'\d+',page_list_tag.a.string).group(0)) if total_num_pages > MAX_PAGE_NUM: total_num_pages = MAX_PAGE_NUM else: total_num_pages = 1 # warp weibo page urls to crawl weibo_page_urls = self.wrap_weibo_pages_urls(domain=login_user['domain'], page_id=login_user['page_id'], num_page=total_num_pages ) print '\n\n Number of user weibos pages: ',total_num_pages,'\n\n' # test part weibo parser user_weibo_page_url = QueryFactory.weibo_js_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=2 )[0] # first request to get the total number of user weibos' pages #request = Request(url=user_weibo_page_url,callback=self.user_weibo_parse,meta={'login_user':login_user}) #yield request # send requests contained in the weibo pages urls for page_url in weibo_page_urls: yield Request(url=page_url,callback=self.weibo_parse,meta={'login_user':login_user}) # insert id crawled into redis self.redis_server.lrem(self.ids_processing_name,login_user['toCrawl_user_id'],num=-1) self.redis_server.lpush(self.ids_crawled_name ,login_user['toCrawl_user_id']) # TODO next_uid = self.forward_crawling_redis() if next_uid: trypage_url = QueryFactory.mainpage_query(next_uid) mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':next_uid}) yield mainpage_request else: log.msg(' Queue is empty, task to terminate.',level=log.INFO)
def wrap_weibo_pages_urls(self, domain, page_id, num_page):
    """Collect the deduplicated set of weibo-page urls for pages 1..num_page.

    :param domain: user's domain property from the main page.
    :param page_id: user's page_id property from the main page.
    :param num_page: total number of weibo pages to cover.
    :returns: set of url strings produced by QueryFactory.weibo_js_query.
    """
    urls = set()
    for page_num in range(1, num_page + 1):
        urls.update(QueryFactory.weibo_js_query(domain=domain,
                                                page_id=page_id,
                                                page_num=page_num))
    return urls
def user_weibo_pages_num(self, response):
    """Older page-count variant (compare ``weibo_pages_num``).

    Parses the js response, reads the 'W_pages_layer S-FIXED' pagination
    tag for the total page count, and requests page 2 for parsing.

    :param response: scrapy Response with ``meta['login_user']``; may be None.
    """
    #inspect_response(response,self)
    if response:
        # parse the current js response
        self.user_weibo_parse(response)
        login_user = response.meta['login_user']
        # load response in json form
        html_block_soup = self.json_load_response(response)
        # tag containing the max number of pages
        page_list_tag = html_block_soup.find('div', {'class': 'W_pages_layer S-FIXED'})
        if page_list_tag:
            total_num_pages = int(re.search(r'\d+', page_list_tag.a.string).group(0))
        else:
            total_num_pages = 1
        # BUG FIX: wrap_weibo_pages_urls requires `domain` (no default);
        # the original call omitted it and raised TypeError at runtime.
        weibo_page_urls = self.wrap_weibo_pages_urls(domain=login_user['domain'],
                                                     page_id=login_user['page_id'],
                                                     num_page=total_num_pages)
        # NOTE(review): weibo_page_urls is built but never yielded here --
        # confirm whether these requests should be scheduled as in
        # weibo_pages_num.
        user_weibo_page_url = QueryFactory.weibo_page_num_query(page_id=login_user['page_id'],
                                                                page_num=2)
        # request to get the next page of user weibos
        request = Request(url=user_weibo_page_url, callback=self.user_weibo_parse, meta={'login_user': login_user})
        yield request
    else:
        # NOTE(review): return value of start_requests() is discarded;
        # nothing gets scheduled on this path -- confirm intent.
        self.start_requests()
def user_info_parse(self,response): user_profile_translation = { u"性别" : 'sex', u"简介" : 'description', u"注册时间" : 'signed_time', u"所在地" : 'location', u"生日" : 'birthday', u"公司" : 'company', u"大学" : 'university', u"标签" : 'personal_tags' } if response == None: yield self.start_requests() exit(1) login_user = response.meta['login_user'] user = UserProfileItem() # fulfill the user Item user['user_id'] = login_user['toCrawl_user_id'] user['screen_name'] = login_user['toCrawl_user_nick'] user_tags_dict = self.get_userinfo_by_html(response) for property_name in user_profile_translation: user[user_profile_translation[property_name]] = user_tags_dict.get(property_name,'') if user.get('signed_time'): user['signed_time'] = datetime.strptime(user['signed_time'],'%Y-%m-%d') print '\n\n User Profile:\n' for user_item in dict(user).items(): print '\t',user_item[0],' : ',user_item[1] print "\n\n" yield user # url to get total number of weibos' pages user_weibo_page_url = QueryFactory.weibo_page_num_query(domain = login_user['domain'], page_id = login_user['page_id'], page_num=1 ) # first request to get the total number of user weibos' pages request = Request(url=user_weibo_page_url,callback=self.weibo_pages_num,meta={'login_user':login_user}) yield request
def mainpage_parse(self,response):
    """Parse a user's main page and request the profile-info page.

    NOTE(review): this is the SECOND definition of ``mainpage_parse`` in
    this file; being defined later it shadows the earlier version (which
    additionally handles missing page properties and uses ``domain``
    instead of ``pid``).  Confirm which definition is meant to survive.
    """
    if response:
        sel = Selector(response)  # NOTE(review): unused local
        login_user = {}
        login_user['toCrawl_user_id'] = response.meta['user_id']
        login_user['login_user_id'] = self.get_property(response,"uid")
        login_user['page_id'] = self.get_property(response,"page_id")
        login_user['pid'] = self.get_property(response,"pid")
        print '\n',login_user,'\n'
        login_user_profile_url = QueryFactory.info_query(page_id=login_user['page_id'],
                                                         pid=login_user['pid'])
        log.msg(' user toCrawl id: %s, user login id: %s' \
                % (login_user['toCrawl_user_id'],login_user['login_user_id']),
                level=log.INFO)
        request = Request(url=login_user_profile_url,callback=self.user_info_parse,meta={'login_user':login_user})
        yield request
    else:
        # NOTE(review): this yields the whole start_requests() result (a
        # generator/list), not a single Request -- confirm scrapy handles it.
        yield self.start_requests()
def login_parse(self,response):
    """Handle the weibo SSO login callback (list-queue variant).

    Extracts the JSON payload from the ``feedBackUrlCallBack(...)``
    wrapper; on success pops one uid from ``self.id_toCrawl_list`` and
    requests its main page.

    NOTE(review): a later ``login_parse`` definition in this file (the
    redis-queue variant) shadows this one, making this version dead code
    at class level -- confirm which is intended.
    """
    if response.body.find('feedBackUrlCallBack') != -1:
        data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1))
        userinfo = data.get('userinfo', '')
        if len(userinfo):
            user_id = userinfo.get('uniqueid')
            screen_name = userinfo.get('displayname')
            log.msg('user login displayname: %s, user login id: %s' % (screen_name,user_id), level=log.INFO)
            # NOTE(review): assert is stripped under -O; raise instead if
            # this sanity check matters in production
            assert screen_name in self.username
            self.logined = True
            #mainpage_url = QueryFactory.mainpage_query(user_id)
            # get 1 id from the list toCrawl
            id_toCrawl = self.id_toCrawl_list.pop()
            trypage_url = QueryFactory.mainpage_query(id_toCrawl)
            mainpage_request = Request(url=trypage_url,callback=self.mainpage_parse,meta={'user_id':id_toCrawl})
            yield mainpage_request
        else:
            # empty userinfo => login rejected; report server-side reason
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))
def login_parse(self, response):
    """Handle the weibo SSO login callback (redis-queue variant).

    Extracts the JSON payload from the ``feedBackUrlCallBack(...)``
    wrapper.  On success, pops the first uid from the redis toCrawl
    queue, pushes it onto the processing queue, and requests its main
    page; on failure, logs the server-side errno/reason.
    """
    if response.body.find('feedBackUrlCallBack') != -1:
        data = json.loads(re.search(r'feedBackUrlCallBack\((.*?)\)', response.body, re.I).group(1))
        userinfo = data.get('userinfo', '')
        if userinfo:  # idiom: truthiness instead of len(...) -- '' and {} are both falsy
            user_id = userinfo.get('uniqueid')
            screen_name = userinfo.get('displayname')
            log.msg('user login displayname: %s, user login id: %s' % (screen_name, user_id), level=log.INFO)
            #assert screen_name in self.username
            self.logined = True
            # load first user_id toCrawl from the Queue
            id_toCrawl = self.redis_server.rpop(self.ids_toCrawl_name)
            if id_toCrawl:
                # push user_id toCrawl into the Processing Queue
                self.redis_server.lpush(self.ids_processing_name, id_toCrawl)
                trypage_url = QueryFactory.mainpage_query(id_toCrawl)
                mainpage_request = Request(url=trypage_url, callback=self.mainpage_parse, meta={'user_id': id_toCrawl})
                yield mainpage_request
        else:
            # empty userinfo => login rejected; report server-side reason
            self.log('login failed: errno=%s, reason=%s' % (data.get('errno', ''), data.get('reason', '')))