def get_repost_list(html, mid):
    """
    Parse the repost entries out of a repost page.

    Every element carrying ``action-type="feed_list_item"`` is turned into a
    ``WeiboRepost``. An entry that fails to parse is logged through
    ``parser.error`` and left out of the result; it does not abort the rest.

    :param html: page source
    :param mid: weibo mid, stored on every repost as ``root_weibo_id``
    :return: list of repost infos (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            # Looked up once and reused for the parent lookup further down.
            text_node = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'})
            repost_cont = text_node.text.strip().split('//@')
            # Only the part before the first '//@' is kept. The GBK round trip
            # drops every character GBK cannot encode.
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the to-crawl queue (seed_ids)
            # Skips the first three characters of the usercard attribute
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text

            # The same anchor carries both the time (title) and the link (href).
            from_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = REPOST_URL.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            try:
                # We can't get the parent's uid. We can get the parent's
                # nickname, but the name can be changed.
                temp = text_node.find(attrs={'extra-data': 'type=atname'})
                if temp:
                    # Skips the first five characters of usercard
                    # (presumably a 'name=' prefix -- confirm).
                    wb_repost.parent_user_name = temp.get('usercard')[5:]
                else:
                    wb_repost.parent_user_name = ''
            except Exception as e:
                parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                wb_repost.parent_user_name = ''
        except Exception as e:
            # Best-effort parsing: log and skip this entry only.
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
def get_repost_list(html, mid):
    """
    Parse the repost entries out of a repost page.

    Each ``action-type="feed_list_item"`` element becomes one ``WeiboRepost``.
    Entries that raise while being parsed are reported via ``parser.error``
    and skipped, so one broken entry does not stop the others.

    :param html: page source
    :param mid: weibo mid, stored on every repost as ``root_weibo_id``
    :return: list of repost infos (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            # Found once; the parent lookup below reuses this node.
            text_node = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'})
            repost_cont = text_node.text.strip().split('//@')
            # Keep the text before the first '//@'; the GBK encode/decode with
            # 'ignore' removes characters GBK cannot represent.
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # First three characters of usercard are dropped
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text

            # One anchor provides both the repost time and the repost link.
            from_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = REPOST_URL.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Save the current repost user's name and id as the middle result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            try:
                # We can't get the parent's uid. We can get the parent's
                # nickname, but the name can be changed.
                temp = text_node.find(attrs={'extra-data': 'type=atname'})
                if temp:
                    # First five characters of usercard are dropped
                    # (presumably a 'name=' prefix -- confirm).
                    wb_repost.parent_user_name = temp.get('usercard')[5:]
                else:
                    wb_repost.parent_user_name = ''
            except Exception as e:
                parser.error("error occurred when parsing the parent's name ,the detail is {}".format(e))
                wb_repost.parent_user_name = ''
        except Exception as e:
            # Best-effort parsing: log and skip this entry only.
            parser.error('repost parse error occurred,the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
def crawl_repost_page(mid, uid):
    """
    Crawl the repost pages of one weibo, resolve parent user ids and save.

    Page 1 is fetched first; its first element is passed to
    ``repost.get_total_page`` and its second element is the list of reposts.
    Nothing is saved when page 1 has no reposts. Remaining pages are crawled
    up to the smaller of the total page count and ``get_max_repost_page()``.

    :param mid: weibo mid whose reposts are crawled
    :param uid: uid whose profile is used as the fallback (root) parent
    :return: None
    """
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user, _ = user_get.get_profile(uid)

    # Never go past the last existing page nor past the configured maximum.
    limit = min(limit, total_page + 1)

    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # The objects are mutated in place, so no write-back into the list is needed.
    for repost_obj in repost_datas:
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # when it comes to errors, set the args to default(root)
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id

    weibo_repost.save_reposts(repost_datas)
def crawl_repost_page(mid, uid):
    """
    Crawl the repost pages of one weibo, resolve parent user ids and save.

    Page 1 is fetched first and the weibo is then flagged through
    ``wb_data.set_weibo_repost_crawled``. Nothing is saved when page 1 has no
    reposts. Remaining pages are crawled up to the smaller of the total page
    count and ``get_max_repost_page()``.

    :param mid: weibo mid whose reposts are crawled
    :param uid: uid whose profile is used as the fallback (root) parent
    :return: None
    """
    limit = get_max_repost_page() + 1
    first_repost_data = crawl_repost_by_page(mid, 1)
    wb_data.set_weibo_repost_crawled(mid)
    total_page = repost.get_total_page(first_repost_data[0])
    repost_datas = first_repost_data[1]

    if not repost_datas:
        return

    root_user = user_get.get_profile(uid)

    # Never go past the last existing page nor past the configured maximum.
    limit = min(limit, total_page + 1)

    # TODO: weigh whether dispatching these pages through a network call is
    # worthwhile here, e.g.
    # app.send_task('tasks.comment.crawl_comment_by_page', args=(mid, page_num), queue='comment_page_crawler',
    #               routing_key='comment_page_info')
    for page_num in range(2, limit):
        cur_repost_datas = crawl_repost_by_page(mid, page_num)[1]
        if cur_repost_datas:
            repost_datas.extend(cur_repost_datas)

    # Fill in the parent user_id to make visualization easier. The objects are
    # mutated in place, so no write-back into the list is needed.
    for repost_obj in repost_datas:
        user_id = IdNames.fetch_uid_by_name(repost_obj.parent_user_name)
        if not user_id:
            # Fall back to the root user's uid and user name.
            repost_obj.parent_user_id = root_user.uid
            repost_obj.parent_user_name = root_user.name
        else:
            repost_obj.parent_user_id = user_id

    weibo_repost.save_reposts(repost_datas)
def test_get_name(self):
    """Look up a uid by a fixed user name through IdNames and print it."""
    from db.redis_db import IdNames

    uid = IdNames.fetch_uid_by_name('腐剧基地')
    print(uid)
def get_repost_list(html, mid):
    """
    Get the repost list from a repost page.

    Each ``action-type="feed_list_item"`` element becomes one ``WeiboRepost``.
    Entries that raise while being parsed are reported via ``parser.error``
    and skipped, so one broken entry does not stop the others.

    :param html: page source
    :param mid: weibo mid, stored on every repost as ``root_weibo_id``
    :return: list of reposts (empty when ``get_html_cont`` yields nothing)
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            # Found once; the parent lookup below reuses this node.
            text_node = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'})
            repost_cont = text_node.text.strip().split('//@')
            # Keep the text before the first '//@'; the GBK encode/decode with
            # 'ignore' removes characters GBK cannot represent.
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode(
                'gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO: add wb_repost.user_id to the to-crawl queue (seed_ids)
            # First three characters of usercard are dropped
            # (presumably an 'id=' prefix -- confirm against the page markup).
            wb_repost.user_id = repost.find(attrs={
                'class': 'WB_face W_fl'
            }).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text

            # One anchor provides both the repost time and the repost link.
            from_link = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a')
            wb_repost.repost_time = from_link.get('title')
            wb_repost.weibo_url = repost_url.format(from_link.get('href'))
            wb_repost.root_weibo_id = mid

            # Store the current repost's user id and user name in redis as an
            # intermediate result.
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            try:
                # The first match is the immediate upper-level user. Its uid is
                # not available here, only the nickname; nicknames can change,
                # so the uid still has to be resolved before saving.
                temp = text_node.find(attrs={'extra-data': 'type=atname'})
                if temp:
                    # First five characters of usercard are dropped
                    # (presumably a 'name=' prefix -- confirm).
                    wb_repost.parent_user_name = temp.get('usercard')[5:]
                else:
                    wb_repost.parent_user_name = ''
            except Exception as e:
                parser.error('解析上层用户名发生错误,具体信息是{}'.format(e))
                wb_repost.parent_user_name = ''
        except Exception as e:
            # Best-effort parsing: log and skip this entry only.
            # NOTE(review): the message text mentions "comment" although this
            # parses reposts; left unchanged to keep log output stable.
            parser.error('解析评论失败,具体信息是{}'.format(e))
        else:
            repost_list.append(wb_repost)

    return repost_list
def test_store_and_fetch_name_id(self):
    """Store a name/id pair via IdNames, then check the lookup by name returns that id."""
    IdNames.store_id_name(FAKE_STR, FAKE_ID)

    fetched_uid = IdNames.fetch_uid_by_name(FAKE_STR)

    assert fetched_uid == FAKE_ID