def _em_count(each, action_type):
    """Return the int inside the second <em> of the node matching *action_type*.

    Falls back to 0 when the node is missing, has fewer than two <em>
    children, or the text is not numeric — same best-effort behavior the
    original inline try/except blocks had.
    """
    try:
        return int(each.find(attrs={'action-type': action_type}).find_all('em')[1].text)
    except Exception:
        return 0


def get_weibo_info_detail(each, html):
    """Parse one feed-list weibo node into a WeiboData object.

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``(wb_data, is_all_cont)`` on success, ``None`` when the
             user id or weibo id cannot be extracted.  ``is_all_cont`` is
             0 when the content is truncated (a '展开全文' link exists),
             otherwise 1.
    """
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    # uid is embedded in the avatar anchor's query string, e.g. "...id=12345&..."
    m = re.search(r'id=(\d+)&', user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None
    # mid=... appears somewhere in the node's raw markup
    m = re.search(r'mid=(\d+)', str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None
    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    # hrefs are usually relative; make them absolute
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)
    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()
    # '展开全文' ("expand full text") means the visible content is truncated
    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''
    # repost / comment / praise all share the same "second <em>" layout
    wb_data.repost_num = _em_count(each, 'fl_forward')
    wb_data.comment_num = _em_count(each, 'fl_comment')
    wb_data.praise_num = _em_count(each, 'fl_like')
    return wb_data, is_all_cont
def get_weibo_info_detail(each, html):
    """Parse one feed-list weibo node, including attached pictures.

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``(wb_data, is_all_cont, weibo_pic)`` on success, ``None``
             when the user id or weibo id cannot be extracted.
             ``weibo_pic`` is a (possibly empty) list of WeiboPic;
             ``is_all_cont`` is 0 when a '展开全文' link is present.
    """
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_info = str(user_cont.find('a'))
    user_pattern = 'id=(\\d+)&'
    m = re.search(user_pattern, user_info)
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning('未提取到用户id,页面源码是{}'.format(html))
        return None
    weibo_pattern = 'mid=(\\d+)'
    m = re.search(weibo_pattern, str(each))
    if m:
        wb_data.weibo_id = m.group(1)
    else:
        parser.warning('未提取到页面的微博id,页面源码是{}'.format(html))
        return None
    time_url = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = time_url.get('title', '')
    wb_data.weibo_url = time_url.get('href', '')
    if 'weibo.com' not in wb_data.weibo_url:
        wb_data.weibo_url = 'http://weibo.com{}'.format(wb_data.weibo_url)
    wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
        attrs={'node-type': 'feed_list_content'}).text.strip()

    # Collect attached pictures.  find_all() never raises — it returns []
    # when there are no pictures — so the old have_pic try/except flag was
    # dead logic; iterating the (possibly empty) result covers both cases.
    weibo_pic = []
    for pic in each.find_all(attrs={'action-type': 'fl_pics'}):
        img = pic.find('img')
        pic_url = img.get('src') if img is not None else None
        if not pic_url:
            continue  # no <img>/src — nothing to record for this entry
        # url hash is the filename between .../thumb150/ and .jpg
        # (dot escaped — the old pattern's bare '.jpg' matched any char).
        m_hash = re.match(r'.*/thumb150/(.*)\.jpg', pic_url)
        if m_hash is None:
            continue  # unexpected URL layout: skip instead of crashing the parse
        wb_pic = WeiboPic()
        wb_pic.uid = wb_data.uid
        wb_pic.weibo_id = wb_data.weibo_id
        wb_pic.pic_url = pic_url
        wb_pic.url_hash = m_hash.group(1)
        wb_pic.dl_flag = 0
        wb_pic.judge_flag = 0
        weibo_pic.append(wb_pic)

    # '展开全文' ("expand full text") means the visible content is truncated
    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    try:
        wb_data.device = each.find(attrs={'class': 'WB_from'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception as e:
        parser.error('本次解析设备出错,具体是{}'.format(e))
        wb_data.device = ''
    try:
        wb_data.repost_num = int(
            each.find(attrs={'action-type': 'fl_forward'}).find_all('em')[1].text)
    except Exception:
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(
            each.find(attrs={'action-type': 'fl_comment'}).find_all('em')[1].text)
    except Exception:
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(
            each.find(attrs={'action-type': 'fl_like'}).find_all('em')[1].text)
    except Exception:
        wb_data.praise_num = 0
    return wb_data, is_all_cont, weibo_pic
def get_weibo_info(each, html):
    """Parse one search-result weibo node into a WeiboData object.

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``(wb_data, is_all_cont)`` on success, ``None`` when any
             mandatory field (uid, weibo id, url, content) cannot be
             extracted.  ``is_all_cont`` is 0 when the content is
             truncated (a '展开全文' link exists), otherwise 1.
    """
    wb_data = WeiboData()
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        # user_pattern is a module-level regex (defined outside this view)
        m = re.match(user_pattern, user_info.img.get('usercard'))
        if m:
            wb_data.uid = m.group(1)
        else:
            parser.warning('未提取到用户id,页面源码是{}'.format(html))
            return None
    except Exception as why:
        parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
        return None
    # Guarded like the user section above: any link in this chain can be
    # None (AttributeError/TypeError) on an unexpected layout, and the
    # previously unguarded version crashed the whole parse.
    try:
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]  # strip the leading "mid=" prefix
    except (AttributeError, TypeError, IndexError) as why:
        parser.error('解析微博id出错,出错原因是{},页面源码是{}'.format(why, html))
        return None
    try:
        wb_data.weibo_url = each.find(
            attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
        return None
    try:
        wb_data.device = each.find(attrs={
            'class': 'feed_from'
        }).find(attrs={
            'rel': 'nofollow'
        }).text
    except AttributeError:
        wb_data.device = ''
    try:
        create_time = each.find(
            attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # 时间戳单位不同 (ms -> s)
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(
                feed_action.find(attrs={
                    'action-type': 'feed_list_like'
                }).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0
    try:
        wb_data.weibo_cont = each.find(attrs={
            'class': 'comment_txt'
        }).text.strip()
    except Exception as why:
        parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
        return None
    # '展开全文' ("expand full text") means the visible content is truncated
    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
def get_weibo_info_detail(each, html):
    """Extract a weibo's fields (detail layout, with image/video URLs).

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``(wb_data, is_all_cont)`` on success, ``None`` when the uid
             or weibo id cannot be found; ``is_all_cont`` is 0 when the
             content is truncated ('展开全文' link present), otherwise 1.
    """
    wb_data = WeiboData()

    # uid lives in the avatar anchor's query string
    anchor_markup = str(each.find(attrs={'class': 'face'}).find('a'))
    uid_match = re.search('id=(\\d+)&', anchor_markup)
    if not uid_match:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    wb_data.uid = uid_match.group(1)

    # mid appears somewhere in the node's raw markup
    mid_match = re.search('mid=(\\d+)', str(each))
    if not mid_match:
        parser.warning("fail to get weibo's id,the page source {}".format(html))
        return None
    wb_data.weibo_id = mid_match.group(1)

    date_node = each.find(attrs={'node-type': 'feed_list_item_date'})
    wb_data.create_time = date_node.get('title', '')
    wb_data.weibo_url = date_node.get('href', '')
    if ROOT_URL not in wb_data.weibo_url:
        wb_data.weibo_url = '{}://{}{}'.format(PROTOCOL, ROOT_URL, wb_data.weibo_url)

    def url_filter(url):
        # prepend the scheme only when neither scheme nor origin is present
        if PROTOCOL in url or ORIGIN in url:
            return url
        return ':'.join([PROTOCOL, url])

    try:
        media_node = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'})
        img_markup = str(media_node.find_all('img'))
        wb_data.weibo_img = ';'.join(
            url_filter(u) for u in re.findall(r"src=\"(.+?)\"", img_markup))
    except Exception:
        wb_data.weibo_img = ''

    try:
        media_node = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_media_prev'})
        li_markup = str(media_node.find_all('li'))
        raw_video = re.findall(r"video_src=(.+?)&", li_markup)[0]
        wb_data.weibo_video = url_filter(urllib.parse.unquote(raw_video))
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_content'}).find(
            attrs={'node-type': 'feed_list_content'}).text.strip()
    except Exception:
        wb_data.weibo_cont = ''

    # truncated content carries an '展开全文' ("expand full text") link
    is_all_cont = 0 if '展开全文' in str(each) else 1

    try:
        wb_data.device = each.find(attrs={'class': 'WB_from S_txt2'}).find(
            attrs={'action-type': 'app_source'}).text
    except Exception:
        wb_data.device = ''

    # repost / comment / praise all live in the second <em> of their node
    for attr_name, action in (('repost_num', 'fl_forward'),
                              ('comment_num', 'fl_comment'),
                              ('praise_num', 'fl_like')):
        try:
            value = int(each.find(attrs={'action-type': action}).find_all('em')[1].text)
        except Exception:
            value = 0
        setattr(wb_data, attr_name, value)

    return wb_data, is_all_cont
def get_weibo_info(each, html):
    """Parse one search-result weibo node (English-log variant).

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``(wb_data, is_all_cont)`` on success, ``None`` when any
             mandatory field (uid, weibo id, url, content) cannot be
             extracted.  ``is_all_cont`` is 0 when the content is
             truncated (a '展开全文' link exists), otherwise 1.
    """
    wb_data = WeiboData()
    # Guard the whole user chain: each.find() can return None and the
    # <img>/usercard attribute can be absent, which previously crashed the
    # parse with AttributeError/TypeError instead of skipping the entry.
    try:
        user_cont = each.find(attrs={'class': 'face'})
        user_info = user_cont.find('a')
        m = re.match(USER_PATTERN, user_info.img.get('usercard'))
    except (AttributeError, TypeError):
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    if m:
        wb_data.uid = m.group(1)
    else:
        parser.warning("fail to get user'sid, the page source is{}".format(html))
        return None
    try:
        # strip the leading "mid=" prefix from action-data
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None
    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('fail to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    def url_filter(url):
        # prepend the scheme only when neither scheme nor origin is present
        return ':'.join([PROTOCOL, url]) if PROTOCOL not in url and ORIGIN not in url else url

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''
    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''
    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # 时间戳单位不同 (ms -> s)
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('failt to get feed_action, the error is {},the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0
    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('fail to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None
    # truncated content carries an '展开全文' ("expand full text") link
    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
def get_weibo_info(each, html):
    """Parse one weibo node into a WeiboData object (nested-try variant).

    Unlike the sibling variants in this file, this one returns only
    ``wb_data`` (no ``is_all_cont`` flag) and wraps the entire extraction
    in an outer catch-all so any unexpected error is logged and turns
    into a ``None`` return.

    :param each: bs4 element for a single weibo entry
    :param html: raw page source, used only in failure logs
    :return: ``wb_data`` on success, ``None`` on any extraction failure
    """
    wb_data = WeiboData()
    try:
        # --- user id ---------------------------------------------------
        try:
            user_cont = each.find(attrs={'class': 'face'})
            user_info = user_cont.find('a')
            # user_pattern is a module-level regex (defined outside this view)
            m = re.match(user_pattern, user_info.img.get('usercard'))
            if m:
                wb_data.uid = m.group(1)
            else:
                parser.warning('未提取到用户id,页面源码是{}'.format(html))
                return None
        except Exception as why:
            parser.error('解析用户信息出错,出错原因:{},页面源码是{}'.format(why, html))
            return None
        # --- weibo id / url -------------------------------------------
        # NOTE(review): this chain is unguarded by its own handler; if any
        # link is None the outer catch-all below absorbs the AttributeError.
        # The [4:] slice presumably strips a "mid=" prefix — confirm.
        wb_data.weibo_id = each.find(attrs={
            'class': 'WB_screen'
        }).find('a').get('action-data')[4:]
        try:
            wb_data.weibo_url = each.find(
                attrs={'node-type': 'feed_list_item_date'})['href']
        except Exception as e:
            parser.error('解析微博url出错,出错原因是{},页面源码是{}'.format(e, html))
            return None
        # --- feed action counters + create_time -----------------------
        try:
            feed_action = each.find(attrs={'class': 'feed_action'})
            wb_data.create_time = each.find(
                attrs={'node-type': 'feed_list_item_date'})['title']
        except Exception as why:
            parser.error('解析feed_action出错,出错原因:{},页面源码是{}'.format(why, html))
            # NOTE(review): on failure this clears `device`, not the three
            # counters below — looks like a copy-paste leftover; confirm
            # against the other get_weibo_info variants before changing.
            wb_data.device = ''
        else:
            try:
                wb_data.repost_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_forward'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.repost_num = 0
            try:
                wb_data.comment_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_comment'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.comment_num = 0
            try:
                wb_data.praise_num = int(
                    feed_action.find(attrs={
                        'action-type': 'feed_list_like'
                    }).find('em').text)
            except (AttributeError, ValueError):
                wb_data.praise_num = 0
        # --- content ---------------------------------------------------
        try:
            wb_data.weibo_cont = each.find(attrs={
                'class': 'comment_txt'
            }).text.strip()
        except Exception as why:
            parser.error('解析微博内容出错:{}, 页面源码是{}'.format(why, html))
            return None
    except Exception as why:
        # outer catch-all: any unhandled error above ends up here
        parser.error('整条解析出错,原因为:{}, 页面源码是{}'.format(why, html))
        return None
    else:
        return wb_data