def _crawl_loop(page, page_counter, mid, uid, user_name, spread_other_and_caches, spread_others,
                spread_other_caches):
    while page > 0 and page_counter < page_max:
        ajax_url = base_url.format(mid=mid, currpage=page)
        repost_info = get_page(ajax_url, False)
        try:
            repost_json = json.loads(repost_info)
            repost_html = repost_json['data']['html']
        except Exception as why:
            # if an exception occurs, skip the weibo info behind this ajax_url
            parser.error('failed to parse the repost info of {url} as json, the error is {why}'.format(
                url=ajax_url, why=why))
        else:
            repost_urls = parse_status.get_reposturls(repost_html)

            # ordering logic for the repost nodes
            # todo verify whether the spread effect stays the same without fetching repost info through repost_urls
            for repost_url in repost_urls:
                repost_cont = status.get_status_info(repost_url, uid, user_name, mid)

                if repost_cont is not None:
                    spread_other_and_caches.append(repost_cont)
                    for soac in spread_other_and_caches:
                        if soac.get_so().id != '':
                            spread_others.append(soac.get_so())
                            spread_other_caches.append(soac.get_soc())
        finally:
            print('currently crawling page {}'.format(page))
            page -= 1
            page_counter += 1

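A minimal sketch of how _crawl_loop might be driven: look up the total number of repost pages first (as the _get_total_page helper further down does) and then walk the pages backwards. The driver name crawl_reposts_of is hypothetical; base_url, page_max and the collaborating modules are assumed to be the same module-level objects _crawl_loop itself relies on.

def crawl_reposts_of(wb_mid, uid, user_name):
    # hypothetical driver, assuming the module-level base_url/page_max used inside _crawl_loop
    spread_others, spread_other_caches, spread_other_and_caches = [], [], []
    total_page = _get_total_page(wb_mid)  # number of repost pages, 0 on failure
    if total_page > 0:
        # start from the last page and move backwards, visiting at most page_max pages
        _crawl_loop(total_page, 0, wb_mid, uid, user_name,
                    spread_other_and_caches, spread_others, spread_other_caches)
    return spread_others, spread_other_caches
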
def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html: page source
    :param wb_id: weibo id
    :return: list of WeiboComment
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(
        attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            wb_comment.comment_id = comment['comment_id']
            # TODO push wb_comment.user_id into the crawling queue (seed_ids)
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # todo format the date
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('failed to parse the comment, the error is {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list

def get_comment_list(html, wb_id):
    """
    Get the comment list
    :param html: page source
    :param wb_id: weibo id
    :return: list of WeiboComment
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    comment_list = list()
    comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(
        attrs={'class': 'list_li S_line1 clearfix'})

    for comment in comments:
        wb_comment = WeiboComment()
        try:
            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
            wb_comment.comment_id = comment['comment_id']
            wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
            # todo format the date
            wb_comment.create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
            wb_comment.weibo_id = wb_id
        except Exception as e:
            parser.error('failed to parse the comment, the error is {}'.format(e))
        else:
            comment_list.append(wb_comment)
    return comment_list

def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', '').get('page', '').get('totalpage', 1)
    except Exception as e:
        parser.error('failed to get the total page count, the error is {}'.format(e))
        page_count = 1
    return page_count

def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template: {}'.format(feed_infos))
    return int(info_num)

def get_feed_info(feed_infos, goal):
    info_num = None
    for info in feed_infos:
        if goal in info.text:
            info_num = info.text.replace(goal, '')
            break
    if info_num is None:
        parser.error('unexpected template when parsing: {}'.format(feed_infos))
    return int(info_num)

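To make the contract of get_feed_info concrete, here is a small hand-made illustration (my own construction, not markup captured from weibo): each li element in feed_infos carries text such as '转发 12', and the function strips the goal word and converts the remainder to an int.

from bs4 import BeautifulSoup

# purely illustrative markup standing in for a feed_action list
demo = BeautifulSoup('<ul><li>收藏</li><li>转发 12</li><li>评论 3</li></ul>', 'html.parser')
feed_infos = demo.find_all('li')
print(get_feed_info(feed_infos, '转发'))  # -> 12 (int() tolerates the leading space)
print(get_feed_info(feed_infos, '评论'))  # -> 3
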
def get_total_page(html):
    try:
        page_count = json.loads(html).get('data', '').get('page', '').get('totalpage', 1)
    except Exception as e:
        parser.error('errors occurred when parsing the total page of reposts, the error is {}'.format(e))
        page_count = 1
    return page_count

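The response shape that both copies of get_total_page above expect can be illustrated with a hand-written payload; the payload below is an assumption based purely on the keys the function accesses, not a captured weibo response.

sample = '{"data": {"page": {"totalpage": 7}}}'
print(get_total_page(sample))      # -> 7
print(get_total_page('not json'))  # the parse error is logged and the fallback value 1 is returned
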
def get_mid(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, 'html.parser')
    try:
        return soup.find(attrs={'action-type': 'feed_list_item'})['mid']
    except TypeError:
        mid_pattern = r'mid=(\d+)'
        mid_matcher = re.search(mid_pattern, html)
        return mid_matcher.group(1) if mid_matcher else ''
    except Exception as e:
        parser.error('an exception occurred in get_mid(), the error is {e}'.format(e=e))

def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('failed to get the user id, the page source is {}'.format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('failed to get the weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)
    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('failed to parse the device, the error is {}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont

def get_commentcounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        comments = soup.find(attrs={'node-type': 'comment_btn_text'}).find('span').find(
            'em').find_next_sibling().text
        if comments == '评论':
            return 0
        counts = int(comments)
        return counts
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0

def get_upperusername(html, defaultname):
    cont = _get_statushtml(html)
    if 'type=atname' in cont:
        try:
            soup = BeautifulSoup(cont, 'html.parser')
            content = soup.find(attrs={'node-type': 'feed_list_content'}).find(
                attrs={'render': 'ext', 'extra-data': 'type=atname'}).text
            return content[1:]
        except AttributeError:
            return defaultname
        except Exception as e:
            parser.error(e)
            return defaultname
    else:
        return defaultname

def get_likecounts(html):
    cont = _get_statushtml(html)
    soup = BeautifulSoup(cont, "html.parser")
    try:
        if is_root(html):
            likes = soup.find(attrs={'node-type': 'like_status'}).find_all('em')[1].text
        else:
            likes = soup.find_all(attrs={'node-type': 'like_status'})[1].find_all('em')[1].text
        if likes == '赞':
            return 0
        else:
            return int(likes)
    except (ValueError, AttributeError) as e:
        parser.error(e)
        return 0

def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('an error occurred in _get_statushtml(), the error is {e}'.format(e=e))
            parser.error('the page source is {page}'.format(page=html))
    return cont

def handle_error(*keys):
    try:
        return func(*keys)
    except Exception as e:
        parser.error(e)

        if return_type == 5:
            return None
        elif return_type == 4:
            return {}
        elif return_type == 3:
            return False
        elif return_type == 2:
            return []
        elif return_type == 1:
            return ''
        else:
            return 0

def _get_statushtml(html):
    soup = BeautifulSoup(html, "html.parser")
    scripts = soup.find_all('script')
    pattern = re.compile(r'FM.view\((.*)\)')
    cont = ''
    for script in scripts:
        try:
            m = pattern.search(script.string)
            if m and 'pl.content.weiboDetail.index' in script.string:
                all_info = m.group(1)
                # TODO note that an exception may occur here
                cont = json.loads(all_info)['html']
        except TypeError:
            return ''
        except Exception as e:
            parser.error('an error occurred in _get_statushtml(), the error is {e}'.format(e=e))
            parser.error('the page source is {page}'.format(page=html))
    return cont

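For illustration, the input _get_statushtml looks for is a script tag that calls FM.view(...) with a JSON payload whose html field holds the rendered weibo detail. The snippet below is a simplified, hand-made stand-in for such a page, not real weibo output.

fake_page = '<script>FM.view({"ns":"pl.content.weiboDetail.index","html":"<p>weibo detail</p>"})</script>'
print(_get_statushtml(fake_page))  # -> <p>weibo detail</p>
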
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to crawl the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('failed to parse the repost info of {url} as json, the error is {why}'.format(
            url=ajax_url, why=why))
        return 0
    else:
        return total_page

def handle_error(*keys):
    try:
        return func(*keys)
    except Exception as e:
        parser.error(e)
        return return_value

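Both handle_error variants above read func together with return_type or return_value from an enclosing scope, so they are presumably the inner functions of a decorator factory. The sketch below shows what such a wrapper could look like for the return_value variant; the name parse_decorator and the demo usage are my assumption, not necessarily the project's actual API.

import functools


def parse_decorator(return_value):
    """Hypothetical decorator factory: run the parsing function, log any exception, return a fallback."""
    def decorator(func):
        @functools.wraps(func)
        def handle_error(*keys):
            try:
                return func(*keys)
            except Exception as e:
                parser.error(e)
                return return_value
        return handle_error
    return decorator


@parse_decorator('')
def get_nickname_demo(html):
    # hypothetical usage: any parsing error makes the call return '' instead of raising
    return BeautifulSoup(html, 'html.parser').find(attrs={'class': 'username'}).text
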
def _get_current_reposts(url, session, weibo_mid):
    """
    Modified crawling entry. Since weibo enforces strict rate limits, only the current weibo and its
    child weibos are crawled for now; the root weibo is not crawled.
    """
    spread_other_caches = list()
    spread_others = list()
    spread_other_and_caches = list()

    html = get_page(url, session, headers)
    reposts = status_parse.get_repostcounts(html)
    comments = status_parse.get_commentcounts(html)

    # update the repost and comment counts in the weibo_search_data table
    weibosearch_dao.update_repost_comment(mid=weibo_mid, reposts=reposts, comments=comments)

    if not basic.is_404(html):
        root_url = url
        mid = status_parse.get_mid(html)
        user_id = status_parse.get_userid(html)
        user_name = status_parse.get_username(html)
        post_time = status_parse.get_statustime(html)
        device = status_parse.get_statussource(html)
        comments_count = status_parse.get_commentcounts(html)
        reposts_count = status_parse.get_repostcounts(html)
        root_user = user.get_profile(user_id, session, headers)

        spread_original_dao.save(root_user, mid, post_time, device, reposts_count, comments_count, root_url)
        crawler.info('this weibo has {counts} reposts'.format(counts=reposts_count))

        if reposts_count > 0:
            base_url = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'
            soc = SpreadOtherCache()
            soc.set_id(user_id)
            soc.set_name(user_name)
            spread_other_caches.append(soc)
            page = 1

            ajax_url = base_url.format(mid=mid, currpage=page)
            source = get_page(ajax_url, session, headers, False)
            crawler.info('the repost info url is: ' + ajax_url)

            try:
                repost_json = json.loads(source)
                total_page = int(repost_json['data']['page']['totalpage'])
            except Exception as why:
                parser.error('failed to parse the repost info of {url} as json, the error is {why}'.format(
                    url=ajax_url, why=why))
            else:
                page = total_page
                page_counter = 0

                while page > 0 and page_counter < page_max:
                    ajax_url = base_url.format(mid=mid, currpage=page)
                    repost_info = get_page(ajax_url, session, headers, False)
                    try:
                        repost_json = json.loads(repost_info)
                        repost_html = repost_json['data']['html']
                    except Exception as why:
                        parser.error('failed to parse the repost info of {url} as json, the error is {why}'.format(
                            url=ajax_url, why=why))
                    else:
                        repost_urls = status_parse.get_reposturls(repost_html)

                        # ordering logic for the repost nodes
                        for repost_url in repost_urls:
                            repost_cont = status.get_status_info(repost_url, session, user_id, user_name,
                                                                 headers, mid)
                            if repost_cont is not None:
                                spread_other_and_caches.append(repost_cont)
                                for soac in spread_other_and_caches:
                                    if soac.get_so().id != '':
                                        spread_others.append(soac.get_so())
                                        spread_other_caches.append(soac.get_soc())
                    finally:
                        print('currently crawling page {currpage}'.format(currpage=page))
                        page -= 1
                        page_counter += 1

            for so in spread_others:
                if so.verify_type == '':
                    so.verify_type = 0

                for i in spread_other_caches:
                    if so.upper_user_name == i.get_name():
                        so.upper_user_id = i.get_id()
                        break
                else:
                    so.upper_user_id = user_id

            spread_others = list(set(spread_others))
            spread_other_dao.save(spread_others)
            crawler.info('{num} repost records were fetched, repost crawling of this weibo is done'.format(
                num=len(spread_others)))
    else:
        crawler.info('{url} is a 404 page'.format(url=url))

def get_repost_list(html, mid):
    """
    Get the repost list
    :param html: page source
    :param mid: weibo mid
    :return: list of WeiboRepost
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'}).text.strip().split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO push wb_repost.user_id into the crawling queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = repost_url.format(
                repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # store the current repost user's id and name in redis as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # the first @name is the upper-level user; only the nickname is available here, not
                    # the uid, and nicknames can change, so the uid still has to be resolved before storing
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("failed to parse the parent user's name, the error is {}".format(e))
                    wb_repost.parent_user_name = ''
        except Exception as e:
            parser.error('failed to parse the repost, the error is {}'.format(e))
        else:
            repost_list.append(wb_repost)
    return repost_list

def get_weibo_info_detail(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = r'id=(\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('failed to get the user id, the page source is {}'.format(html))
        return None

    weibo_pattern = r'mid=(\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('failed to get the weibo id, the page source is {}'.format(html))
        return None

    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)
    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()

    # test for weibo_pic capture
    # check whether this weibo carries pictures before doing any further processing
    weibo_pic = []
    try:
        have_pic = 1
        pic_list = each.find_all(attrs={'action-type': 'fl_pics'})
    except Exception:
        have_pic = 0

    if have_pic == 1:
        for pic in pic_list:
            wb_pic = WeiboPic()
            wb_pic.uid = wb_data.uid
            wb_pic.weibo_id = wb_data.weibo_id
            wb_pic.pic_url = pic.find('img').get('src')
            # wb_pic.url_hash = md5Encode(wb_pic.pic_url)
            wb_pic.url_hash = re.match('.*/thumb150/(.*).jpg', wb_pic.pic_url).group(1)
            wb_pic.dl_flag = 0
            wb_pic.judge_flag = 0
            weibo_pic.append(wb_pic)
    # end

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('failed to parse the device, the error is {}'.format(e))
        wb_data.device = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0

    return wb_data, is_all_cont, weibo_pic

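The url_hash above is taken from the thumb150 segment of the picture URL instead of hashing the whole URL (the commented-out md5Encode call). With a made-up URL of that shape, the regex behaves as follows:

import re

# purely illustrative URL; only the /thumb150/<name>.jpg shape matters here
pic_url = 'http://ww1.sinaimg.cn/thumb150/abc123def456.jpg'
print(re.match('.*/thumb150/(.*).jpg', pic_url).group(1))  # -> abc123def456
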
def get_weibo_info(each, html):
    wb_data = WeiboData()

    user_cont = each.find(attrs={'class': 'face'})
    user_info = user_cont.find('a')
    m = re.match(USER_PATTERN, user_info.img.get('usercard'))
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user's id, the page source is {}".format(html))
        return None

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('fail to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        try:
            user_cont = each.find(attrs={'class': 'face'})
            user_info = user_cont.find('a')
            m = re.match(user_pattern, user_info.img.get('usercard'))
            if m:
                wb_data.uid = m.group(1)
            else:
                parser.warning('failed to get the user id, the page source is {}'.format(html))
                return None
        except Exception as why:
            parser.error('failed to parse the user info, the error is {}, the page source is {}'.format(why, html))
            return None

        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]

        try:
            wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
        except Exception as e:
            parser.error('failed to parse the weibo url, the error is {}, the page source is {}'.format(e, html))
            return None

        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['title']
        except Exception as why:
            parser.error('failed to parse feed_action, the error is {}, the page source is {}'.format(why, html))
            wb_data.device = ''
        else:
            try:
                wb_data.repost_num = int(feed_action.find(attrs={'action-type': 'feed_list_forward'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.repost_num = 0
            try:
                wb_data.comment_num = int(feed_action.find(attrs={'action-type': 'feed_list_comment'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.comment_num = 0
            try:
                wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
            except (AttributeError, ValueError):
                wb_data.praise_num = 0

        try:
            wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
        except Exception as why:
            parser.error('failed to parse the weibo content, the error is {}, the page source is {}'.format(why, html))
            return None
    except Exception as why:
        parser.error('failed to parse the whole weibo, the error is {}, the page source is {}'.format(why, html))
        return None
    else:
        return wb_data

def get_weibo_info(each, html):
    wb_data = WeiboData()

    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(user_pattern, user_info.img.get('usercard'))
        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning('failed to get the user id, the page source is {}'.format(html))
            return None
    except Exception as why:
        parser.error('failed to parse the user info, the error is {}, the page source is {}'.format(why, html))
        return None

    wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('failed to parse the weibo url, the error is {}, the page source is {}'.format(e, html))
        return None

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('failed to parse feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('failed to parse the weibo content, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont

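The date attribute read in the variants above is a millisecond epoch timestamp, which is why it is divided by 1000 before datetime.fromtimestamp. A quick worked example with an arbitrary value (the rendered time depends on the local timezone):

from datetime import datetime

raw = '1500000000000'  # hypothetical value of the 'date' attribute, in milliseconds
create_time = datetime.fromtimestamp(int(raw) / 1000)
print(create_time.strftime('%Y-%m-%d %H:%M'))  # -> '2017-07-14 10:40' on a UTC+8 machine
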
def get_repost_list(html, mid):
    """
    Get repost details
    :param html: page source
    :param mid: weibo mid
    :return: list of repost infos
    """
    cont = get_html_cont(html)
    if not cont:
        return list()

    soup = BeautifulSoup(cont, 'html.parser')
    repost_list = list()
    reposts = soup.find_all(attrs={'action-type': 'feed_list_item'})

    for repost in reposts:
        wb_repost = WeiboRepost()
        try:
            repost_cont = repost.find(attrs={'class': 'WB_text'}).find(
                attrs={'node-type': 'text'}).text.strip().split('//@')
            wb_repost.repost_cont = repost_cont[0].encode('gbk', 'ignore').decode('gbk', 'ignore')
            wb_repost.weibo_id = repost['mid']
            # TODO push wb_repost.user_id into the crawling queue (seed_ids)
            wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
            wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(
                attrs={'class': 'WB_text'}).find('a').text
            wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
            wb_repost.weibo_url = repost_url.format(
                repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href'))
            parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
            wb_repost.root_weibo_id = mid

            # save the current repost user's name and id as an intermediate result
            IdNames.store_id_name(wb_repost.user_name, wb_repost.user_id)

            if not parents:
                wb_repost.parent_user_name = ''
            else:
                try:
                    # we can't get the parent's uid here, only the nickname, and nicknames can be changed
                    temp = parents.find(attrs={'extra-data': 'type=atname'})
                    if temp:
                        wb_repost.parent_user_name = temp.get('usercard')[5:]
                    else:
                        wb_repost.parent_user_name = ''
                except Exception as e:
                    parser.error("error occurred when parsing the parent's name, the detail is {}".format(e))
                    wb_repost.parent_user_name = ''
        except Exception as e:
            parser.error('repost parse error occurred, the detail is {}'.format(e))
        else:
            repost_list.append(wb_repost)
    return repost_list

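A hedged sketch of how these parsers might be combined when crawling the repost pages of one weibo. It assumes the two-argument get_page form used by _crawl_loop and _get_total_page, reuses the ajax URL template that appears in _get_current_reposts, and assumes get_html_cont can extract the html field from the raw ajax response; the function name crawl_reposts is hypothetical.

REPOST_AJAX_URL = 'http://weibo.com/aj/v6/mblog/info/big?ajwvr=6&id={mid}&page={currpage}'


def crawl_reposts(mid):
    # fetch the first repost page to learn the page count, then parse every page
    first_page = get_page(REPOST_AJAX_URL.format(mid=mid, currpage=1), False)
    total_page = get_total_page(first_page)

    all_reposts = []
    for cur_page in range(1, total_page + 1):
        html = get_page(REPOST_AJAX_URL.format(mid=mid, currpage=cur_page), False)
        all_reposts.extend(get_repost_list(html, mid))
    return all_reposts
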