def get_headimg(html):
    """
    Get the head img url of the current user
    :param html: page source
    :return: head img url
    """
    soup = BeautifulSoup(_get_header(html), 'html.parser')
    try:
        headimg = url_filter(soup.find(attrs={'class': 'photo_wrap'}).find(attrs={'class': 'photo'})['src'])
    except AttributeError:
        headimg = ''
    return headimg
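
# A minimal, self-contained illustration of the extraction performed by
# get_headimg. NOTE: `_get_header` and `url_filter` are module helpers not
# shown here; the sketch inlines hypothetical stand-ins so it can run on its
# own, and the sample markup is invented test data.
def _demo_get_headimg():
    from bs4 import BeautifulSoup
    sample = '<div class="photo_wrap"><img class="photo" src="https://example.com/head.jpg"/></div>'
    soup = BeautifulSoup(sample, 'html.parser')  # stand-in for _get_header(html)
    src = soup.find(attrs={'class': 'photo_wrap'}).find(attrs={'class': 'photo'})['src']
    assert src == 'https://example.com/head.jpg'  # url_filter omitted in this sketch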
def get_weibo_clip(feed_content, is_repost):
    """
    Get the video attached to a weibo
    :param feed_content: html under a single weibo's feed_content
    :param is_repost: whether the weibo is a repost
    :return: url of the attached video
    """
    clip = ""
    if is_repost:
        return clip
    else:
        try:
            li = str(feed_content.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
            extracted_url = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
            return url_filter(extracted_url)
        except Exception:
            return clip
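
# Hedged sketch of how the video_src extraction above behaves on a sample
# <li> string; the markup is hypothetical test data, not captured page source.
def _demo_video_src_extraction():
    li = '[<li video_src=https%3A%2F%2Fvideo.example.com%2Fclip.mp4&other=1></li>]'
    extracted = urllib.parse.unquote(re.findall(r"video_src=(.+?)&", li)[0])
    assert extracted == 'https://video.example.com/clip.mp4'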
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data containing a video has been found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; raw values contain noise such as "今天XXX" and "X分钟前"
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').text.split(' ')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[3].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont
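
# Hedged illustration of the count parsing above: on this page layout the
# action cell text looks roughly like "转发 123", so the trailing token is the
# count. The sample strings are hypothetical.
def _demo_count_parsing():
    assert int('转发 123'.split(' ')[-1]) == 123
    # a weibo with no reposts renders just "转发"; int('转发') raises ValueError,
    # which the except branches above turn into a count of 0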
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    usercard = user_cont.find('img').get('usercard', '')
    # this only works for a logged-in user
    if not usercard:
        return None
    wb_data.uid = usercard.split('&')[0][3:]

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the raw timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
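
# get_feed_info is used above but defined elsewhere in the project. A minimal
# sketch of the assumed behavior: find the action cell whose text contains the
# goal word ("转发" or "评论") and parse the trailing count. This is an
# assumption about the helper, not the project's actual implementation.
def get_feed_info_sketch(feed_infos, goal):
    for info in feed_infos:
        if goal in info.text:
            # e.g. "转发 12" -> 12; a bare "转发" yields '' and raises ValueError,
            # which callers above translate into a count of 0
            return int(info.text.replace(goal, '').strip())
    # a missing cell behaves like the AttributeError the callers also catch
    raise AttributeError('no "{}" cell in feed_infos'.format(goal))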
def test_url_filter(url, expect):
    assert url_filter(url) == expect
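
# As written, test_url_filter needs parametrization to run under pytest. A
# hedged sketch of how it might be driven; the url/expect pair below is a
# hypothetical placeholder, since real expectations depend on url_filter's
# actual normalization rules.
import pytest

@pytest.mark.parametrize('url, expect', [
    ('https://example.com/a.jpg', 'https://example.com/a.jpg'),  # hypothetical
])
def test_url_filter_sketch(url, expect):
    assert url_filter(url) == expect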
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'card-feed'})
    user_avator = user_cont.find(attrs={'class': 'avator'})
    usercard = user_avator.find('a').get('href', '')
    # this only works for a logged-in user
    if not usercard:
        return None
    wb_data.uid = usercard.split('?')[0][12:]

    try:
        wb_data.weibo_id = each.find(attrs={'title': '赞'}).get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'class': 'content'}).find(attrs={'class': 'from'}).find('a').get('href', '')[2:]
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text.strip()
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'class': 'content'}).find(attrs={'class': 'from'}).find('a').text.strip()
        if "年" not in create_time and "月" in create_time:
            # the page omits the year for dates within the current year
            create_time = datetime.datetime.now().strftime("%Y年") + create_time
        elif "今天" in create_time:
            # e.g. "今天12:34": substitute today's date so strptime below can parse it
            create_time = create_time.replace("今天", datetime.datetime.now().strftime("%Y年%m月%d日 "))
        create_time = datetime.datetime.strptime(create_time, "%Y年%m月%d日 %H:%M")
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")
    except Exception:
        traceback.print_exc()
        wb_data.create_time = ''

    try:
        feed_action = each.find(attrs={'class': 'card-act'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        try:
            # prefer the expanded full text when the weibo is truncated
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception:
            wb_data.weibo_cont = each.find(attrs={'class': 'txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
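
# Hedged demo of the date normalization above, pinned to a fixed reference
# date instead of now() so the assertion is stable; the sample string is a
# hypothetical page value.
def _demo_create_time_normalization():
    ref = datetime.datetime(2019, 6, 1)
    s = "今天12:34".replace("今天", ref.strftime("%Y年%m月%d日 "))
    assert datetime.datetime.strptime(s, "%Y年%m月%d日 %H:%M") == datetime.datetime(2019, 6, 1, 12, 34)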
def get_weibo_info(each, html):
    wb_data = WeiboData()
    try:
        wb_data.weibo_id = each['mid']
    except (AttributeError, IndexError, TypeError):
        parser.error('Failed to get weibo id, the page source is {}'.format(html))
        return None

    imgs = list()
    imgs_url = list()
    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = list(map(url_filter, re.findall(r"src=\"(.+?)\"", imgs)))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    if IMG_ALLOW and imgs and imgs_url:
        app.send_task('tasks.downloader.download_img_task', args=(wb_data.weibo_id, imgs_url),
                      queue='download_queue', routing_key='for_download')
        wb_data.weibo_img_path = IMG_PATH
    else:
        wb_data.weibo_img_path = ''

    # TODO: no test data containing a video has been found yet
    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        # TODO: normalize the date; raw values contain noise such as "今天XXX" and "X分钟前"
        wb_data.create_time = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'}).text.strip()
        wb_data.weibo_url = 'https:' + each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href']
        wb_data.uid = each.find(attrs={'class': 'from'}).find(attrs={'target': '_blank'})['href'].split('/')[3]
    except (AttributeError, KeyError):
        wb_data.create_time = ''
        wb_data.weibo_url = ''
        wb_data.uid = ''

    try:
        wb_data.repost_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[0].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.repost_num = 0
    try:
        wb_data.comment_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[1].find('a').text.split('/')[-1])
    except (AttributeError, ValueError):
        wb_data.comment_num = 0
    try:
        wb_data.praise_num = int(each.find(attrs={'class': 'card-act'}).find_all('li')[2].find('a').find('em').text)
    except (AttributeError, ValueError):
        wb_data.praise_num = 0

    if '展开全文' in str(each):
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content_full'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    else:
        is_all_cont = 1
        try:
            wb_data.weibo_cont = each.find(attrs={'node-type': 'feed_list_content'}).text.strip()
        except Exception as why:
            parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
            return None
    return wb_data, is_all_cont
def get_weibo_info(each, html):
    wb_data = WeiboData()
    user_cont = each.find(attrs={'class': 'face'})
    user_href = user_cont.find('a').get('href', '')
    if not user_href:
        parser.warning('Failed to get user id')
        return None
    wb_data.uid = parse_url(user_href).path[3:]

    try:
        wb_data.weibo_id = each.find(attrs={'class': 'WB_screen'}).find('a').get('action-data')[4:]
    except (AttributeError, IndexError, TypeError):
        return None

    try:
        wb_data.weibo_url = each.find(attrs={'node-type': 'feed_list_item_date'})['href']
    except Exception as e:
        parser.error('Failed to get weibo url, the error is {}, the source page is {}'.format(e, html))
        return None

    try:
        imgs = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('li'))
        imgs_url = map(url_filter, re.findall(r"src=\"(.+?)\"", imgs))
        wb_data.weibo_img = ';'.join(imgs_url)
    except Exception:
        wb_data.weibo_img = ''

    try:
        a_tag = str(each.find(attrs={'node-type': 'feed_list_media_prev'}).find_all('a'))
        extracted_url = urllib.parse.unquote(re.findall(r"full_url=(.+?)&", a_tag)[0])
        wb_data.weibo_video = url_filter(extracted_url)
    except Exception:
        wb_data.weibo_video = ''

    try:
        wb_data.device = each.find(attrs={'class': 'feed_from'}).find(attrs={'rel': 'nofollow'}).text
    except AttributeError:
        wb_data.device = ''

    try:
        create_time = each.find(attrs={'node-type': 'feed_list_item_date'})['date']
    except (AttributeError, KeyError):
        wb_data.create_time = ''
    else:
        create_time = int(create_time) / 1000  # the raw timestamp is in milliseconds
        create_time = datetime.fromtimestamp(create_time)
        wb_data.create_time = create_time.strftime("%Y-%m-%d %H:%M")

    try:
        feed_action = each.find(attrs={'class': 'feed_action'})
    except Exception as why:
        parser.error('Failed to get feed_action, the error is {}, the page source is {}'.format(why, each))
    else:
        feed_infos = feed_action.find_all('li')
        try:
            wb_data.repost_num = get_feed_info(feed_infos, '转发')
        except (AttributeError, ValueError):
            wb_data.repost_num = 0
        try:
            wb_data.comment_num = get_feed_info(feed_infos, '评论')
        except (AttributeError, ValueError):
            wb_data.comment_num = 0
        try:
            wb_data.praise_num = int(feed_action.find(attrs={'action-type': 'feed_list_like'}).find('em').text)
        except (AttributeError, ValueError):
            wb_data.praise_num = 0

    try:
        wb_data.weibo_cont = each.find(attrs={'class': 'comment_txt'}).text.strip()
    except Exception as why:
        parser.error('Failed to get weibo cont, the error is {}, the page source is {}'.format(why, html))
        return None

    if '展开全文' in str(each):
        is_all_cont = 0
    else:
        is_all_cont = 1
    return wb_data, is_all_cont
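
# Hedged illustration of the uid extraction above. parse_url is assumed to
# behave like urllib.parse.urlparse (returning an object with a .path); the
# profile URL is a hypothetical example.
def _demo_uid_from_href():
    from urllib.parse import urlparse
    assert urlparse('https://weibo.com/u/1234567890').path[3:] == '1234567890'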