def get_thirdpage(self, url, uid):
    """Download and parse the third part of the current page.

    Updates the request parameters in ``self.body`` (``count`` '15',
    ``pagebar`` '1', ``pre_page`` equal to the current ``page``), marks
    ``self.num`` as 3, downloads ``url`` and passes the content to
    ``self.wbmsg.get_content``.

    Args:
        url: Address handed to ``self.download``.
        uid: Not used by this method; kept for interface compatibility
            with its callers.

    Returns:
        True when the download produced nothing, otherwise the value
        returned by ``self.wbmsg.get_content``.
    """
    self.body['count'] = '15'
    self.body['pagebar'] = '1'
    self.body['pre_page'] = self.body['page']
    self.num = 3
    content = self.download(url)
    # Identity check: None is a singleton, and "== None" can be fooled by
    # objects that override __eq__.
    if content is None:
        # A failed download is deliberately reported as True so that the
        # caller's page loop keeps going.
        return True
    result = self.wbmsg.get_content(content)
    utility.iprint(self.get_uid() + ':获取第' + str(self.body['page']) + '页微博成功')
    return result
def test():
    """Process every keyword in ``GV.task_list`` sequentially.

    For each non-empty keyword taken from the task list, a single
    ``getWeiboPage`` instance is pointed at the keyword and asked to
    fetch its messages. A failure for one keyword is logged and the loop
    moves on to the next task.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        WBcontent.set_keyword(keyword)
        utility.iprint('handle id:%s' % WBcontent.get_keyword())
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # Pass the pieces as logging arguments instead of concatenating
            # them: concatenation raises inside this handler when the task
            # is not a plain string, which would abort the whole loop.
            logging.exception("%s用户信息解析出错: %s", keyword, e)
def test():
    """Run the crawler over all queued keywords in the current thread.

    Keywords are taken from ``GV.task_list`` until it reports empty.
    Falsy entries are skipped. An exception raised while handling one
    keyword is logged with its traceback and does not stop the run.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if keyword:
            WBcontent.set_keyword(keyword)
            utility.iprint('handle id:%s' % WBcontent.get_keyword())
            try:
                WBcontent.get_msg(WBcontent.get_keyword())
            except Exception as e:
                # Lazy logging arguments: building the message with "+"
                # fails with a TypeError/Unicode error for non-str tasks,
                # and that error would escape from this handler.
                logging.exception("%s用户信息解析出错: %s", keyword, e)
def test():
    """Process every uid in ``GV.task_list`` sequentially.

    A single ``getWeiboPage`` instance (constructed with ``GV.dict_klg``)
    is reused for all tasks. Falsy uids are skipped; a failure while
    handling one uid is logged and the next task is taken.
    """
    WBcontent = getWeiboPage.getWeiboPage(GV.dict_klg)
    while not GV.task_list.empty():
        uid = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not uid:
            continue
        WBcontent.set_uid(uid)
        utility.iprint('handle id:%s' % WBcontent.get_uid())
        try:
            WBcontent.get_msg(WBcontent.get_uid())
        except Exception as e:
            # The uid is passed as a logging argument so that a non-string
            # uid (e.g. an int) cannot raise inside the handler.
            logging.exception("%s用户信息解析出错: %s", uid, e)
def controller():
    """Start the worker threads and supervise them until all tasks are done.

    Asks for the desired number of workers, starts that many ``worker``
    threads and then checks once a minute:

    * while tasks remain and fewer than ``num`` workers are alive, the
      missing workers are started again;
    * once the task list is empty and only one thread is left, the
      function returns.
    """
    # NOTE(review): on Python 2 the builtin input() evaluates whatever the
    # user types; int() only guarantees the result is usable as a count
    # (and makes the call work on Python 3, where input() returns a str).
    num = int(input('input threads number:'))
    for i in range(1, num + 1):
        worker('T' + str(i)).start()

    while True:
        time.sleep(60)
        count = threading.active_count()
        utility.iprint('还有 %d 个活动线程' % count)
        # active_count() includes the thread running this function, so the
        # number of live workers is one less (assumes no other threads are
        # running in the process).
        alive_workers = count - 1
        tasks_left = not GV.task_list.empty()
        if alive_workers < num and tasks_left:
            # The original compared "count < num", which ignored the main
            # thread and therefore never replaced a single dead worker.
            for j in range(alive_workers, num):
                worker('T' + str(j)).start()
        elif not tasks_left and count <= 1:
            break
def controller():
    """Launch the requested number of workers and keep the pool filled.

    The function polls every 60 seconds. Workers that have exited while
    tasks are still queued are replaced; the function returns when the
    task list is empty and no thread other than the caller is alive.
    """
    # NOTE(review): Python 2's input() evaluates the typed text. int() is
    # applied so the value is a usable thread count on Python 2 and 3.
    num = int(input('input threads number:'))
    for i in range(1, num + 1):
        worker('T' + str(i)).start()

    while True:
        time.sleep(60)
        count = threading.active_count()
        utility.iprint('还有 %d 个活动线程' % count)
        queue_empty = GV.task_list.empty()
        # "count" also counts the supervising thread itself, hence the -1
        # (assumes only this thread and the workers are running).
        running = count - 1
        if running < num and not queue_empty:
            # Fixes an off-by-one: comparing the raw count against num
            # skipped the restart when exactly one worker had died.
            for j in range(running, num):
                worker('T' + str(j)).start()
        elif queue_empty and count <= 1:
            break
def run(self):
    """Worker loop: crawl keywords from ``GV.task_list`` until it is empty.

    The loop also ends when ``self.thread_stop`` becomes true. Falsy
    tasks are skipped. An exception raised for one keyword is logged with
    its traceback and the worker continues with the next task.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    # NOTE(review): empty() followed by get() is not atomic. If task_list
    # is a blocking queue shared by several workers, another worker can
    # take the last item in between and this get() would then wait
    # forever -- confirm and consider a non-blocking get.
    while not GV.task_list.empty() and not self.thread_stop:
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue
        WBcontent.set_keyword(keyword)
        utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_keyword()))
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # The exception text is a logging argument, not part of the
            # format string: a '%' inside it (e.g. a URL-encoded address)
            # would otherwise break the message formatting.
            logging.exception("%s 用户信息解析出错:%s", WBcontent.get_keyword(), e)
def run(self):
    """Worker loop: crawl uids from ``GV.task_list`` until it is empty.

    The loop also ends when ``self.thread_stop`` becomes true. Falsy
    uids are skipped. A failure for one uid is logged with its traceback
    and the worker moves on to the next task.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    # NOTE(review): empty() and get() are two separate calls; with several
    # workers sharing a blocking queue the final get() may block forever
    # when another worker wins the last item -- confirm.
    while not GV.task_list.empty() and not self.thread_stop:
        uid = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not uid:
            continue
        WBcontent.set_uid(uid)
        utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_uid()))
        try:
            WBcontent.get_msg(WBcontent.get_uid())
        except Exception as e:
            # Keep the exception text out of the format string so that a
            # '%' in it cannot corrupt the log call.
            logging.exception("%s 用户信息解析出错:%s", WBcontent.get_uid(), e)
def run(self):
    """Thread body: handle queued keywords one after another.

    Keywords are taken from ``GV.task_list`` while it is not empty and
    ``self.thread_stop`` is false. Each non-empty keyword is crawled with
    a ``getWeiboPage`` instance owned by this worker; errors are logged
    and do not terminate the thread.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    # NOTE(review): the empty()/get() pair is racy when several workers
    # share the task list; a worker that loses the race for the last item
    # may block in get() indefinitely -- confirm the queue type.
    while not GV.task_list.empty() and not self.thread_stop:
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if keyword:
            WBcontent.set_keyword(keyword)
            utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_keyword()))
            try:
                WBcontent.get_msg(WBcontent.get_keyword())
            except Exception as e:
                # Exception text goes in as an argument; concatenating it
                # into the format string breaks on any '%' it contains.
                logging.exception("%s 用户信息解析出错:%s", WBcontent.get_keyword(), e)
class getWeiboPage:
    """Crawl the weibo timeline of a single user (uid), page by page.

    ``download``, ``get_url``, ``get_firstpage``, ``get_secondpage`` and
    ``get_thirdpage`` are called by this class but are not defined in this
    part of the file.
    """

    def __init__(self):
        self.charset = 'utf-8'
        self.page_num = 1   # total number of weibo pages
        self.flag = 0       # marks whether the page count has been fetched
        self.num = 0        # marks which part of a page is being handled
        self.version = -1   # page layout: common user 0, company 1, Sina official 2
        self.comm_wbmsg = CommonWeiboMsg.CommonWeiboMsg()
        self.comp_wbmsg = CompanyWeiboMsg.CompanyWeiboMsg()
        self.offi_wbmsg = OfficeWeiboMsg.OfficeWeiboMsg()
        self.wbmsg = None

    def select_version(self):
        """Pick the parser matching ``self.version`` and bind it to the uid.

        ``self.wbmsg`` becomes None when the version is unknown.
        """
        parsers = {
            0: self.comm_wbmsg,
            1: self.comp_wbmsg,
            2: self.offi_wbmsg,
        }
        parser = parsers.get(self.version)
        if parser is not None:
            parser.init_user(self.uid)
        self.wbmsg = parser

    def set_uid(self, puid):
        self.uid = puid

    def get_uid(self):
        return self.uid

    def _text_between(self, text, begin_tag, end_tag):
        """Return the text between the two markers, or None if either is missing."""
        begin = text.find(begin_tag)
        if begin == -1:
            return None
        begin += len(begin_tag)
        end = text.find(end_tag, begin)
        if end == -1:
            return None
        return text[begin:end]

    def _store_total(self, text):
        """Store ``text`` in ``self.page_num`` if it is a number; report success."""
        if text is not None and text.isdigit():
            self.page_num = int(text)
            return True
        return False

    def preprocess(self, uid):
        """Prepare crawling of ``uid``.

        Downloads the profile page, extracts the page id, determines the
        number of pages and the page layout, selects the parser and sets
        the request parameters used for loading pages.

        Returns:
            True on success, False otherwise.
        """
        # Request parameters used while fetching the profile page.
        self.body = {
            '__rnd': '',
            '_k': '',
            '_t': '0',
            'count': '15',
            'end_id': '',
            'max_id': '',
            'page': 1,
            'pagebar': '',
            'pre_page': '0',
            'uid': uid,
        }
        url = 'http://weibo.com/u/' + uid + '?profile_ftype=1'
        content = self.download(url)
        if content is None:
            logging.info('%s 页面加载失败', url)
            return False

        # Extract the page id. The original added len(tag) to the result of
        # find() before comparing with -1, so a missing tag was never
        # detected and an arbitrary slice was stored as page id.
        page_id = self._text_between(content, "$CONFIG['page_id']='", "'")
        if page_id is None:
            logging.info('%s page_id解析失败', uid)
            return False
        self.page_id = page_id

        # Total number of pages and layout version.
        if not self.get_totallpage_num(content, uid):
            logging.info('%s 微博总页数解析失败', uid)
            return False
        self.select_version()

        # Request parameters used for loading the weibo pages.
        self.body = {
            'is_search': '0',
            'visible': '0',
            'is_tag': '0',
            'profile_ftype': 1,
            'pagebar': '',
            'pre_page': '0',
            'page': 1,
        }
        return True

    def get_msg(self, uid):
        """Fetch all weibo pages of one uid.

        Stops at the first page part whose handler returns a falsy value.
        """
        self.flag = 0
        self.uid = uid
        if not self.preprocess(uid):
            return
        url = self.get_url()
        for page in range(1, self.page_num + 1):
            self.body['page'] = page
            # Each page is loaded in three parts; "and" keeps the original
            # order and stops at the first failing part.
            if not (self.get_firstpage(url, uid)
                    and self.get_secondpage(url, uid)
                    and self.get_thirdpage(url, uid)):
                break

    def user_exist(self, content):
        """Return False when the page is the error page, True otherwise."""
        return content.find('<title>错误提示') == -1

    def totalpage_office(self, content):
        """Read the weibo count from a Sina official page.

        Returns True and stores the count in ``self.page_num`` on success,
        False otherwise.
        """
        start = content.find('<table class="W_tc"')
        if start == -1:
            return False
        stop = content.find('<\\/table>', start)
        if stop == -1:
            return False
        slug = content[start:stop]
        # Both markers must be present; previously a missing marker led to
        # a slice starting at an unrelated position.
        count = self._text_between(slug, 'mod=weibo"><strong class="">', '<\\/strong>')
        return self._store_total(count)

    def totalpage_company(self, content):
        """Read the weibo count from a company page.

        Returns True and stores the count in ``self.page_num`` on success,
        False otherwise.
        """
        start = content.find('class="user_atten clearfix">')
        if start == -1:
            return False
        stop = content.find('<\\/ul>', start)
        if stop == -1:
            return False
        slug = content[start:stop]
        end = slug.find('<\\/strong><span>微博')
        if end == -1:
            return False
        begin_tag = '<strong>'
        # The count is inside the last <strong> before the end marker.
        begin = slug.rfind(begin_tag, 0, end)
        if begin == -1:
            return False
        return self._store_total(slug[begin + len(begin_tag):end])

    def totalpage_common(self, content):
        """Read the weibo count from a common user page.

        Returns True and stores the count in ``self.page_num`` on success,
        False otherwise.
        """
        count = self._text_between(content, '<strong node-type="weibo">', '<\\/strong>')
        return self._store_total(count)

    def get_totallpage_num(self, content, uid):
        """Determine the number of pages and the layout version.

        The three layout parsers are tried in order (common, company,
        official); the first that succeeds decides ``self.version``.

        Returns:
            True on success, False otherwise.
        """
        try:
            if not self.user_exist(content):
                logging.info('%s 用户不存在', uid)
                return False
            if self.totalpage_common(content):
                self.version = 0
            elif self.totalpage_company(content):
                self.version = 1
            elif self.totalpage_office(content):
                self.version = 2
            else:
                logging.warning('%s 无法解析微博总页数', uid)
                self.version = -1
                return False
        except Exception as e:
            # Exception text is passed as an argument: a '%' inside it must
            # not be interpreted as part of the format string.
            logging.exception("%s 获取总页数失败: %s", uid, e)
            # Keep the offending page for later inspection.
            writer = utility.createFile('error', uid)
            try:
                writer.write(content)
            finally:
                writer.close()
            return False

        # At this point page_num holds the weibo count; convert it to pages
        # of 45 entries. "//" keeps the result an integer on Python 3.
        # NOTE(review): an exact multiple of 45 yields one page too many;
        # kept as in the original.
        self.page_num = self.page_num // 45 + 1
        utility.iprint(self.get_uid() + ':微博总共有 ' + str(self.page_num) + ' 页')
        logging.info("%s 共有 %d 页微博", uid, self.page_num)
        return True
class getWeiboPage:
    """Crawl weibo search results for a single keyword, page by page.

    ``get_url`` and ``handle_one_page`` are called by this class but are
    not defined in this part of the file.
    """

    def __init__(self):
        self.charset = 'utf-8'
        self.wbmsg = None

    def set_keyword(self, keyword):
        self.keyword = keyword

    def get_keyword(self):
        return self.keyword

    def preprocess(self, sort='time'):
        """Build the request parameters and the parser for the keyword.

        Args:
            sort: 'time' sorts by time, 'hot' by popularity; any other
                value leaves the sort parameter out (combined ranking).

        Returns:
            Always True.
        """
        # Parameters passed along with the url.
        self.body = {'category': '4', 'page': '1'}
        if sort in ('time', 'hot'):
            self.body['xsort'] = sort
        self.wbmsg = SearchWeiboMsg.SearchWeiboMsg(self.keyword)
        self.page_num = 1   # total number of result pages
        self.flag = 0       # marks whether the page count has been fetched
        return True

    def get_msg(self, keyword, sort='time'):
        """Fetch all result pages for one keyword.

        The first page is always handled; the following pages are handled
        until one of them reports failure.
        """
        self.keyword = keyword
        if not self.preprocess(sort):
            return
        self.flag = 0
        url = self.get_url()
        self.handle_one_page(url)
        for page in range(2, self.page_num + 1):
            self.body['page'] = page
            if not self.handle_one_page(url):
                break

    def keyword_exist(self, content):
        """Return False when the page is the "no result" page, True otherwise."""
        return content.find('class="search_noresult">') == -1

    def _text_between(self, text, begin_tag, end_tag):
        """Return the text between the two markers, or None if either is missing."""
        begin = text.find(begin_tag)
        if begin == -1:
            return None
        begin += len(begin_tag)
        end = text.find(end_tag, begin)
        if end == -1:
            return None
        return text[begin:end]

    def get_totallpage_num(self, content):
        """Determine the number of result pages from the result counter.

        The digits found between the counter markers are read as the
        number of results; this is converted to pages of 20 entries and
        capped at 50 pages.

        Returns:
            True on success, False when the keyword has no results or the
            counter cannot be read.
        """
        try:
            if not self.keyword_exist(content):
                logging.info('%s 关键词不存在', self.get_keyword())
                return False
            # A missing begin OR end marker is treated as a failure.
            slug = self._text_between(content, 'class="W_textc">', '<\\/span>')
            if slug is None:
                logging.info('%s 关键词总页数获取失败', self.get_keyword())
                return False
            # Decode only when the page is still a byte string, so text
            # input works too (the original unicode() call rejected it and
            # does not exist on Python 3).
            if isinstance(slug, bytes):
                slug = slug.decode('utf-8')
            digits = ''.join(ch for ch in slug if utility.is_number(ch))
            total = int(digits)
        except Exception as e:
            # Exception text is an argument, not part of the format string.
            logging.exception("%s 获取总页数失败: %s", self.get_keyword(), e)
            return False

        # Round up to whole pages of 20 results, at most 50 pages.
        pages, remainder = divmod(total, 20)
        if remainder != 0:
            pages += 1
        self.page_num = min(pages, 50)
        utility.iprint(self.get_keyword() + ':微博总共有 ' + str(self.page_num) + ' 页')
        # The keyword is user-supplied and may contain '%', so it is passed
        # as an argument instead of being glued into the format string.
        logging.info("%s 共有 %d 页微博", self.get_keyword(), self.page_num)
        return True