# module-level imports assumed by this method: re, time, random, urllib2
def one_id_text(self, one_user, a, b, s):
    """
    Crawl one headhunter's Weibo posts.

    :param one_user: uid of the headhunter account
    :param a: first page of posts to crawl
    :param b: end page of posts to crawl (exclusive)
    :param s: semaphore guarding WeiboPage.all_hunterlist against
              concurrent appends when running in multiple processes
    :return: all_dict -- the key is the uid; the value is a list of
             (time, post id, post text, likes, forward url, forward count,
              comment url, comment count, news source) tuples
    """
    all_dict = {}
    writing_time = []
    weibo = []
    weibo_id = []
    praise = []
    forward = []
    comment = []
    forward_url = []
    comment_url = []
    blog_origin = []
    try:
        host_url = "http://weibo.cn/u/" + str(one_user)
        url_request = urllib2.Request(host_url, headers=self.header)
        response = urllib2.urlopen(url_request, timeout=30)
        text = response.read()
        page_num = re.compile('跳页" />.*?/(.*?)页')  # total page count shown in the pager
        num = page_num.findall(text)
        for nm in num:  # clamp the requested end page to the user's actual page count
            pm = int(nm)
            if b > pm:
                b = pm
        for k in xrange(a, b):  # fetch the posts page by page
            if k % 5 == 4:
                time.sleep(random.randint(0, 5))  # throttle every fifth request
            print "page", k
            url = "http://weibo.cn/u/" + str(one_user) + "?page=" + str(k)  # the headhunter's profile page
            req = urllib2.Request(url=url, headers=self.header)
            homepage = urllib2.urlopen(req).read()
            # captures: post id, post text, likes, forward url, forward count,
            # comment url, comment count, timestamp
            base_patterns = re.compile(
                'class="c" id="(.*?)">.*?<span class="ctt">(.*?)</span>.*?>赞\[(\d+)]</a> <a href="(.*?)">转发\[(\d+)]</a> <a href="(.*?)" class="cc">评论\[(\d+)]</a>.*?<span class="ct">(.*?) ',
                re.M)
            self.base_page = base_patterns.findall(homepage)
            if len(self.base_page) > 0:
                print "login successful"
            else:
                print "login failed"
            self.add_text()  # keep only today's posts (fills self.dictwt)
            if len(self.dictwt) > 0:
                print "user", one_user, "posted today"
                for item0, item1 in self.dictwt.items():  # item0 = post id (the key), item1 = the remaining captured fields
                    weibo_id.append(item0)
                    post_time = self.cleaned_wbtime(item1[6])
                    blog_origin_patterns = re.compile('>(http://t.cn/.*?)</a></span>')
                    blog_origin_one = blog_origin_patterns.findall(item1[0])
                    if len(blog_origin_one) == 0:
                        blog_origin.append(str(None))  # no t.cn news-source link in this post
                    else:
                        blog_origin.append(blog_origin_one[0])
                    content = self.cleaned_weibo(item1[0])
                    writing_time.append(post_time)
                    weibo.append(content)
                    praise.append(item1[1])
                    forward_url.append(item1[2])
                    forward.append(item1[3])
                    comment_url.append(item1[4])
                    comment.append(item1[5])
            else:
                print "user", one_user, "did not post today"
        if len(weibo) > 0:
            all_list = zip(writing_time, weibo_id, weibo, praise, forward_url,
                           forward, comment_url, comment, blog_origin)
            all_dict[one_user] = all_list
            s.acquire()
            WeiboPage.all_hunterlist.append(all_dict)  # shared accumulator for all headhunters' posts
            s.release()
        self.dictwt.clear()  # reset this user's buffer before the next user
    except Exception as e:
        # a bare "except: pass" here would silently hide network and parsing failures
        print "error while crawling user", one_user, ":", e
    write_hunter_txt(all_dict.keys(), all_dict.values())
    return all_dict
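The `base_patterns` regex does the heavy lifting here, so it helps to see its eight capture groups against one post row. The HTML below is a hypothetical, simplified weibo.cn row built only to exercise the pattern; real pages carry more markup, and since the pattern is compiled without `re.S`, the row must sit on a single line.

```python
# -*- coding: utf-8 -*-
import re

# Hypothetical, simplified weibo.cn post row (real markup is noisier).
sample = ('<div class="c" id="M_abc123"><span class="ctt">招聘Java工程师</span>'
          '<a href="http://weibo.cn/attitude/x">赞[3]</a>'
          ' <a href="http://weibo.cn/repost/x">转发[2]</a>'
          ' <a href="http://weibo.cn/comment/x" class="cc">评论[1]</a>'
          '&nbsp;<span class="ct">10月18日 10:00 来自网页</span></div>')

base_patterns = re.compile(
    'class="c" id="(.*?)">.*?<span class="ctt">(.*?)</span>'
    '.*?>赞\[(\d+)]</a> <a href="(.*?)">转发\[(\d+)]</a>'
    ' <a href="(.*?)" class="cc">评论\[(\d+)]</a>.*?<span class="ct">(.*?) ',
    re.M)

for tu in base_patterns.findall(sample):
    # tu[0] is the post id used as the dict key; tu[1:] maps to item1[0..6]
    print "post id:", tu[0]
    print "text:", tu[1], "| likes:", tu[2], "| forwards:", tu[4], "| comments:", tu[6]
    print "forward url:", tu[3], "| comment url:", tu[5]
    print "date token:", tu[7]
```

The trailing space at the end of the pattern is what truncates the last group to the date token alone (e.g. `10月18日`), which is then normalized by `cleaned_wbtime`.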
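The `s` parameter only matters when several uids are crawled concurrently. Below is a minimal driver sketch, not part of the original code: `hunter_uids`, `worker`, and the no-argument `WeiboPage()` constructor are all assumptions. It uses threads rather than separate processes, because `WeiboPage.all_hunterlist` is an in-process class attribute and would not actually be shared across multiprocessing workers.

```python
import threading

# hypothetical uid list; the real project presumably loads these elsewhere
hunter_uids = [1234567890, 2345678901]
sem = threading.Semaphore(1)  # serializes WeiboPage.all_hunterlist.append

def worker(uid):
    page = WeiboPage()                 # assumed no-argument constructor
    page.one_id_text(uid, 1, 11, sem)  # crawl pages 1..10 for this uid

threads = [threading.Thread(target=worker, args=(uid,)) for uid in hunter_uids]
for t in threads:
    t.start()
for t in threads:
    t.join()
print WeiboPage.all_hunterlist  # one dict per uid that posted today
```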