コード例 #1
0
	def get_thirdpage(self, url, uid):
		"""Download and parse what is presumably the third segment of the current page.

		Sets the request parameters in ``self.body`` (``count``, ``pagebar``,
		``pre_page``), marks ``self.num = 3``, downloads ``url`` through
		``self.download`` and hands the content to ``self.wbmsg.get_content``.

		Args:
			url: address passed unchanged to ``self.download``.
			uid: not used by this method; the id that is printed comes from
				``self.get_uid()``.

		Returns:
			``True`` when the download yields ``None``; otherwise whatever
			``self.wbmsg.get_content`` returns.
		"""
		self.body['count'] = '15'
		self.body['pagebar'] = '1'
		self.body['pre_page'] = self.body['page']
		self.num = 3

		content = self.download(url)
		if content is None:
			# NOTE(review): a failed download is reported as success, so a
			# caller looping over pages keeps going. Kept as-is because it
			# looks like deliberate best-effort behaviour -- confirm.
			return True

		result = self.wbmsg.get_content(content)
		utility.iprint(self.get_uid() + ':获取第' + str(self.body['page']) + '页微博成功')
		return result
コード例 #2
0
ファイル: main2.py プロジェクト: bollwang/weibo_search
def test():
	"""Process every keyword in ``GV.task_list`` sequentially.

	One ``getWeiboPage`` instance is reused for all keywords. Falsy
	entries are skipped. An exception raised while handling one keyword is
	logged with its traceback and the loop moves on to the next keyword.
	"""
	WBcontent = getWeiboPage.getWeiboPage()
	while not GV.task_list.empty():
		keyword = GV.task_list.get()
		utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
		if not keyword:
			continue

		WBcontent.set_keyword(keyword)
		utility.iprint('handle id:%s' % WBcontent.get_keyword())
		try:
			WBcontent.get_msg(WBcontent.get_keyword())
		except Exception as e:
			# Logging arguments instead of string concatenation: works for
			# non-str keywords and renders the same message text.
			logging.exception("%s用户信息解析出错: %s", keyword, e)
コード例 #3
0
ファイル: main2.py プロジェクト: wesavetheworld/weibo_search
def test():
    """Process every keyword in ``GV.task_list`` sequentially.

    One ``getWeiboPage`` instance is reused for all keywords. Falsy
    entries are skipped. An exception raised while handling one keyword is
    logged with its traceback and the loop moves on to the next keyword.
    """
    WBcontent = getWeiboPage.getWeiboPage()
    while not GV.task_list.empty():
        keyword = GV.task_list.get()
        utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
        if not keyword:
            continue

        WBcontent.set_keyword(keyword)
        utility.iprint('handle id:%s' % WBcontent.get_keyword())
        try:
            WBcontent.get_msg(WBcontent.get_keyword())
        except Exception as e:
            # Logging arguments instead of string concatenation: works for
            # non-str keywords and renders the same message text.
            logging.exception("%s用户信息解析出错: %s", keyword, e)
コード例 #4
0
def test():
	"""Process every uid in ``GV.task_list`` sequentially.

	One ``getWeiboPage`` instance (constructed with ``GV.dict_klg``) is
	reused for all uids. Falsy entries are skipped. An exception raised
	while handling one uid is logged with its traceback and the loop moves
	on to the next uid.
	"""
	WBcontent = getWeiboPage.getWeiboPage(GV.dict_klg)
	while not GV.task_list.empty():
		uid = GV.task_list.get()
		utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
		if not uid:
			continue

		WBcontent.set_uid(uid)
		utility.iprint('handle id:%s' % WBcontent.get_uid())
		try:
			WBcontent.get_msg(WBcontent.get_uid())
		except Exception as e:
			# Logging arguments instead of string concatenation: works for
			# non-str uids and renders the same message text.
			logging.exception("%s用户信息解析出错: %s", uid, e)
コード例 #5
0
ファイル: main2.py プロジェクト: bollwang/weibo_search
def controller():
	"""Start worker threads and keep them topped up until the task list is drained.

	Prompts for the number of worker threads, starts that many ``worker``
	threads, then wakes up every 60 seconds to:

	* start replacement workers while tasks remain and fewer than the
	  requested number of workers are alive;
	* return once the task list is empty and at most one thread is left.

	Raises:
		ValueError: if the text typed at the prompt is not an integer.
	"""
	# Python 2's input() evaluates whatever the user types; prefer raw_input
	# there and fall back to Python 3's input(), which returns plain text.
	try:
		read_line = raw_input
	except NameError:
		read_line = input
	num = int(read_line('input threads number:'))

	for i in range(1, num + 1):
		worker('T' + str(i)).start()

	while True:
		time.sleep(60)
		count = threading.active_count()
		utility.iprint('还有 %d 个活动线程' % count)

		# active_count() includes the thread running this function, so the
		# number of live workers is count - 1 (assuming no unrelated threads
		# exist, which the refill range below already relied on).
		alive_workers = count - 1
		tasks_left = not GV.task_list.empty()
		if alive_workers < num and tasks_left:
			# The old test `count < num` only fired once two workers had
			# died; compare the worker count so a single loss is refilled.
			for j in range(alive_workers, num):
				worker('T' + str(j)).start()
		elif not tasks_left and count <= 1:
			break
コード例 #6
0
ファイル: main2.py プロジェクト: wesavetheworld/weibo_search
def controller():
    """Start worker threads and keep them topped up until the task list is drained.

    Prompts for the number of worker threads, starts that many ``worker``
    threads, then wakes up every 60 seconds to:

    * start replacement workers while tasks remain and fewer than the
      requested number of workers are alive;
    * return once the task list is empty and at most one thread is left.

    Raises:
        ValueError: if the text typed at the prompt is not an integer.
    """
    # Python 2's input() evaluates whatever the user types; prefer raw_input
    # there and fall back to Python 3's input(), which returns plain text.
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    num = int(read_line('input threads number:'))

    for i in range(1, num + 1):
        worker('T' + str(i)).start()

    while True:
        time.sleep(60)
        count = threading.active_count()
        utility.iprint('还有 %d 个活动线程' % count)

        # active_count() includes the thread running this function, so the
        # number of live workers is count - 1 (assuming no unrelated threads
        # exist, which the refill range below already relied on).
        alive_workers = count - 1
        tasks_left = not GV.task_list.empty()
        if alive_workers < num and tasks_left:
            # The old test `count < num` only fired once two workers had
            # died; compare the worker count so a single loss is refilled.
            for j in range(alive_workers, num):
                worker('T' + str(j)).start()
        elif not tasks_left and count <= 1:
            break
コード例 #7
0
ファイル: main2.py プロジェクト: bollwang/weibo_search
	def run(self):
		"""Thread body: handle keywords from ``GV.task_list`` until it is empty.

		The loop also ends when ``self.thread_stop`` becomes true. Falsy
		entries are skipped. An exception raised while handling one keyword
		is logged with its traceback and the next keyword is taken.
		"""
		WBcontent = getWeiboPage.getWeiboPage()
		while not GV.task_list.empty() and not self.thread_stop:
			# NOTE(review): if GV.task_list is a queue shared by several
			# workers, another thread can take the last item between empty()
			# and this blocking get(), leaving this thread waiting forever.
			# Consider a non-blocking get -- confirm the queue type first.
			keyword = GV.task_list.get()
			utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
			if not keyword:
				continue

			WBcontent.set_keyword(keyword)
			utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_keyword()))
			try:
				WBcontent.get_msg(WBcontent.get_keyword())
			except Exception as e:
				# The error text is passed as an argument: concatenating it
				# into the format string broke logging whenever it held '%'.
				logging.exception("%s 用户信息解析出错:%s", WBcontent.get_keyword(), e)
コード例 #8
0
	def run(self):
		"""Thread body: handle uids from ``GV.task_list`` until it is empty.

		The loop also ends when ``self.thread_stop`` becomes true. Falsy
		entries are skipped. An exception raised while handling one uid is
		logged with its traceback and the next uid is taken.
		"""
		WBcontent = getWeiboPage.getWeiboPage()
		while not GV.task_list.empty() and not self.thread_stop:
			# NOTE(review): if GV.task_list is a queue shared by several
			# workers, another thread can take the last item between empty()
			# and this blocking get(), leaving this thread waiting forever.
			# Consider a non-blocking get -- confirm the queue type first.
			uid = GV.task_list.get()
			utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
			if not uid:
				continue

			WBcontent.set_uid(uid)
			utility.iprint('Thread %s handle id:%s' % (self.t_name, WBcontent.get_uid()))
			try:
				WBcontent.get_msg(WBcontent.get_uid())
			except Exception as e:
				# The error text is passed as an argument: concatenating it
				# into the format string broke logging whenever it held '%'.
				logging.exception("%s 用户信息解析出错:%s", WBcontent.get_uid(), e)
コード例 #9
0
ファイル: main2.py プロジェクト: wesavetheworld/weibo_search
    def run(self):
        """Thread body: handle keywords from ``GV.task_list`` until it is empty.

        The loop also ends when ``self.thread_stop`` becomes true. Falsy
        entries are skipped. An exception raised while handling one keyword
        is logged with its traceback and the next keyword is taken.
        """
        WBcontent = getWeiboPage.getWeiboPage()
        while not GV.task_list.empty() and not self.thread_stop:
            # NOTE(review): if GV.task_list is a queue shared by several
            # workers, another thread can take the last item between empty()
            # and this blocking get(), leaving this thread waiting forever.
            # Consider a non-blocking get -- confirm the queue type first.
            keyword = GV.task_list.get()
            utility.iprint("还剩下 %d 个任务" % GV.task_list.qsize())
            if not keyword:
                continue

            WBcontent.set_keyword(keyword)
            utility.iprint('Thread %s handle id:%s' %
                           (self.t_name, WBcontent.get_keyword()))
            try:
                WBcontent.get_msg(WBcontent.get_keyword())
            except Exception as e:
                # The error text is passed as an argument: concatenating it
                # into the format string broke logging whenever it held '%'.
                logging.exception("%s 用户信息解析出错:%s",
                                  WBcontent.get_keyword(), e)
コード例 #10
0
class getWeiboPage:
	"""Crawl the weibo posts of one user, identified by uid.

	``get_msg`` is the entry point: it runs ``preprocess`` (profile
	download, page_id extraction, post-count parsing, parser selection) and
	then fetches every page in three steps.

	The methods ``download``, ``get_url``, ``get_firstpage``,
	``get_secondpage`` and ``get_thirdpage`` are called here but defined
	outside this excerpt.
	"""

	# self.version -> name of the attribute holding the matching parser.
	_PARSER_ATTR_BY_VERSION = {0: 'comm_wbmsg', 1: 'comp_wbmsg', 2: 'offi_wbmsg'}

	def __init__(self):
		self.charset = 'utf-8'
		self.uid = None			# set by set_uid() / get_msg()
		self.page_num = 1		# total number of weibo pages
		self.flag = 0			# marks whether the page count was already fetched
		self.num = 0			# which sub-page within one page is being handled
		self.version = -1		# layout: common user 0, company user 1, Sina official 2
		self.comm_wbmsg = CommonWeiboMsg.CommonWeiboMsg()
		self.comp_wbmsg = CompanyWeiboMsg.CompanyWeiboMsg()
		self.offi_wbmsg = OfficeWeiboMsg.OfficeWeiboMsg()
		self.wbmsg = None

	def select_version(self):
		"""Point ``self.wbmsg`` at the parser matching ``self.version``.

		The chosen parser is initialised with ``self.uid`` first. For an
		unknown version ``self.wbmsg`` is reset to ``None``.
		"""
		attr = self._PARSER_ATTR_BY_VERSION.get(self.version)
		if attr is None:
			self.wbmsg = None
			return
		parser = getattr(self, attr)
		parser.init_user(self.uid)
		self.wbmsg = parser

	def set_uid(self, puid):
		"""Store the uid to work on."""
		self.uid = puid

	def get_uid(self):
		"""Return the uid currently being processed."""
		return self.uid

	def preprocess(self, uid):
		"""Prepare crawling for ``uid``.

		Downloads the profile page, extracts ``page_id``, determines the
		page count and layout version, selects the parser and finally
		resets ``self.body`` to the parameters used for page requests.

		Returns:
			True on success, False if any step fails.
		"""
		# Parameters in place while the profile page is downloaded.
		self.body = {
			'__rnd': '',
			'_k': '',
			'_t': '0',
			'count': '15',
			'end_id': '',
			'max_id': '',
			'page': 1,
			'pagebar': '',
			'pre_page': '0',
			'uid': uid,
		}
		url = 'http://weibo.com/u/' + uid + '?profile_ftype=1'
		content = self.download(url)
		if content is None:
			logging.info('%s 页面加载失败', url)
			return False

		# Extract page_id. The -1 test must happen before len(tag) is
		# added, otherwise a missing tag can never be detected.
		tag = "$CONFIG['page_id']='"
		tag_pos = content.find(tag)
		if tag_pos == -1:
			logging.info('%s page_id解析失败', uid)
			return False
		pos1 = tag_pos + len(tag)
		pos2 = content.find("'", pos1)
		if pos2 == -1:
			logging.info('%s page_id解析失败', uid)
			return False
		self.page_id = content[pos1:pos2]

		# Total page count and layout version.
		if not self.get_totallpage_num(content, uid):
			logging.info('%s 微博总页数解析失败', uid)
			return False
		self.select_version()

		# Parameters used for the page requests that follow.
		self.body = {
			'is_search': '0',
			'visible': '0',
			'is_tag': '0',
			'profile_ftype': 1,
			'pagebar': '',
			'pre_page': '0',
			'page': 1,
		}
		return True

	def get_msg(self, uid):
		"""Fetch all weibo pages of ``uid``.

		Each page is fetched in three steps; the loop stops as soon as one
		step returns a falsy value. Returns None in every case.
		"""
		self.flag = 0
		self.uid = uid
		if not self.preprocess(uid):
			return
		url = self.get_url()
		for page in range(1, self.page_num + 1):
			self.body['page'] = page
			# `and` short-circuits, so a later step is not attempted once an
			# earlier one has failed.
			if not (self.get_firstpage(url, uid)
					and self.get_secondpage(url, uid)
					and self.get_thirdpage(url, uid)):
				break

	def user_exist(self, content):
		"""Return False when ``content`` carries the error-page title, else True."""
		return content.find('<title>错误提示') == -1

	@staticmethod
	def _digits_between(text, begin_tag, end_tag):
		"""Return the integer between ``begin_tag`` and ``end_tag``, or None.

		None is returned when either tag is missing or the text between
		them is not made of digits only.
		"""
		start = text.find(begin_tag)
		if start == -1:
			return None
		start += len(begin_tag)
		end = text.find(end_tag, start)
		if end == -1:
			return None
		value = text[start:end]
		return int(value) if value.isdigit() else None

	# The closing tags searched for below contain a literal backslash
	# ("<\/table>"); presumably the HTML arrives embedded in a script string.

	def totalpage_office(self, content):
		"""Parse the post count from the Sina-official layout.

		Stores the count in ``self.page_num``. Returns True on success,
		False otherwise.
		"""
		pos1 = content.find('<table class="W_tc"')
		if pos1 == -1:
			return False
		pos2 = content.find('<\\/table>', pos1)
		if pos2 == -1:
			return False
		total = self._digits_between(
			content[pos1:pos2], 'mod=weibo"><strong class="">', '<\\/strong>')
		if total is None:
			return False
		self.page_num = total
		return True

	def totalpage_company(self, content):
		"""Parse the post count from the company-user layout.

		Stores the count in ``self.page_num``. Returns True on success,
		False otherwise.
		"""
		pos1 = content.find('class="user_atten clearfix">')
		if pos1 == -1:
			return False
		pos2 = content.find('<\\/ul>', pos1)
		if pos2 == -1:
			return False
		slug = content[pos1:pos2]

		# The number sits directly in front of the end tag; search backwards
		# from there for the <strong> that opens it.
		end = slug.find('<\\/strong><span>微博')
		if end == -1:
			return False
		bTag = '<strong>'
		start = slug.rfind(bTag, 0, end)
		if start == -1:
			return False
		temp = slug[start + len(bTag):end]
		if not temp.isdigit():
			return False
		self.page_num = int(temp)
		return True

	def totalpage_common(self, content):
		"""Parse the post count from the common-user layout.

		Stores the count in ``self.page_num``. Returns True on success,
		False otherwise.
		"""
		total = self._digits_between(
			content, '<strong node-type="weibo">', '<\\/strong>')
		if total is None:
			return False
		self.page_num = total
		return True

	def get_totallpage_num(self, content, uid):
		"""Determine layout version and number of pages from the profile page.

		The three layout parsers are tried in the order common, company,
		official; the first that succeeds fixes ``self.version`` (0, 1, 2).
		The parsed post count is then converted into a page count.

		Returns:
			True on success. False if the user does not exist, no parser
			matched (``self.version`` becomes -1) or an exception occurred
			(in which case ``content`` is dumped via ``utility.createFile``).
		"""
		try:
			if not self.user_exist(content):
				logging.info('%s 用户不存在', uid)
				return False
			if self.totalpage_common(content):
				self.version = 0
			elif self.totalpage_company(content):
				self.version = 1
			elif self.totalpage_office(content):
				self.version = 2
			else:
				logging.warning('%s 无法解析微博总页数', uid)
				self.version = -1
				return False
		except Exception as e:
			logging.exception("%s 获取总页数失败: %s", uid, e)
			writer = utility.createFile('error', uid)
			try:
				writer.write(content)
			finally:
				# Close the dump file even when the write itself fails.
				writer.close()
			return False

		# Floor division keeps page_num an int on Python 3 (range() needs
		# one) and matches Python 2's "/" on ints. 45 is presumably the
		# number of posts per page (three segments of 15).
		# NOTE(review): an exact multiple of 45 yields one extra page.
		self.page_num = self.page_num // 45 + 1
		utility.iprint(self.get_uid() + ':微博总共有 ' + str(self.page_num) + ' 页')
		logging.info("%s 共有 %d 页微博", uid, self.page_num)
		return True
コード例 #11
0
class getWeiboPage:
	"""Crawl weibo search results for one keyword.

	``get_msg`` is the entry point: it runs ``preprocess`` and then handles
	result pages one by one.

	The methods ``get_url`` and ``handle_one_page`` are called here but
	defined outside this excerpt.
	"""

	def __init__(self):
		self.charset = 'utf-8'
		self.keyword = None		# set by set_keyword() / get_msg()
		self.wbmsg = None

	def set_keyword(self, keyword):
		"""Store the keyword to search for."""
		self.keyword = keyword

	def get_keyword(self):
		"""Return the keyword currently being processed."""
		return self.keyword

	def preprocess(self, sort='time'):
		"""Build the request parameters and the parser for ``self.keyword``.

		Args:
			sort: 'time' sorts by time, 'hot' by popularity; any other
				value leaves the ``xsort`` parameter out.

		Returns:
			Always True.
		"""
		self.body = {
			'category': '4',
			'page': '1',
		}
		if sort in ('time', 'hot'):
			self.body['xsort'] = sort

		self.wbmsg = SearchWeiboMsg.SearchWeiboMsg(self.keyword)
		self.page_num = 1		# total number of result pages
		self.flag = 0			# marks whether the page count was already fetched
		return True

	def get_msg(self, keyword, sort='time'):
		"""Handle all result pages for ``keyword``.

		The first page is always handled and its result is ignored. Pages
		2..``self.page_num`` follow until one of them returns a falsy value.
		"""
		self.keyword = keyword
		if not self.preprocess(sort):
			return

		self.flag = 0
		url = self.get_url()
		self.handle_one_page(url)
		for page in range(2, self.page_num + 1):
			self.body['page'] = page
			if not self.handle_one_page(url):
				break

	def keyword_exist(self, content):
		"""Return False when ``content`` holds the no-result marker, else True."""
		return content.find('class="search_noresult">') == -1

	def get_totallpage_num(self, content):
		"""Work out the number of result pages from a search result page.

		The digits found between ``class="W_textc">`` and the closing span
		are read as the result count, converted to pages of 20 results and
		capped at 50 pages. The outcome is stored in ``self.page_num``.

		Returns:
			True on success. False when the keyword has no results, the
			count marker is missing or parsing raised an exception.
		"""
		try:
			if not self.keyword_exist(content):
				logging.info('%s 关键词不存在', self.get_keyword())
				return False

			bTag = 'class="W_textc">'
			# The end tag holds a literal backslash; presumably the HTML
			# arrives embedded in a script string.
			eTag = '<\\/span>'
			pos1 = content.find(bTag)
			if pos1 == -1:
				logging.info('%s 关键词总页数获取失败', self.get_keyword())
				return False

			pos1 += len(bTag)
			pos2 = content.find(eTag, pos1)
			if pos2 != -1:
				slug = content[pos1:pos2]
				# Decode only byte strings: unicode() does not exist on
				# Python 3 and rejected already-decoded text on Python 2.
				if isinstance(slug, bytes):
					slug = slug.decode('utf-8')
				count = ''.join(ch for ch in slug if utility.is_number(ch))
				# int('') raises when no digit was found; handled below.
				self.page_num = int(count)
		except Exception as e:
			logging.exception("%s 获取总页数失败: %s", self.get_keyword(), e)
			return False

		# Round up to whole pages of 20 results. divmod keeps the value an
		# int on Python 3 as well.
		pages, remainder = divmod(self.page_num, 20)
		if remainder:
			pages += 1
		# At most 50 pages are fetched.
		self.page_num = min(pages, 50)

		utility.iprint(self.get_keyword() + ':微博总共有 ' + str(self.page_num) + ' 页')
		# The keyword is passed as an argument: a search term containing '%'
		# would otherwise corrupt the format string.
		logging.info("%s 共有 %d 页微博", self.get_keyword(), self.page_num)
		return True