def test_get_article_detail(self):
    """Check ``WechatSogouStructuring.get_article_detail`` on saved pages.

    Four HTML fixtures under ``fake_data_path`` are parsed. For each one the
    test asserts the number of entries in ``content_img_list`` and that
    certain markers are absent from ``content_html``. Two fixtures also
    check that the tags found with an ``http`` ``src`` carry no ``data-src``
    attribute.
    """

    def load_detail(fixture_name):
        # Read one fixture page and run it through the parser under test.
        file_name = os.path.join(fake_data_path, fixture_name)
        with io.open(file_name, encoding='utf-8') as f:
            return WechatSogouStructuring.get_article_detail(f.read())

    def assert_markers_absent(detail, markers):
        # assert_not_in reports the offending member on failure, unlike
        # assert_true over a pre-evaluated boolean.
        html = detail['content_html']
        for marker in markers:
            assert_not_in(marker, html, html)

    def assert_src_without_data_src(detail, tag_name, expected_count):
        # Tags must expose an http(s) ``src`` and no ``data-src`` attribute.
        soup = BeautifulSoup(detail['content_html'], 'lxml')
        tags = soup.find_all(tag_name, src=re.compile(r'http'))
        assert_equal(len(tags), expected_count, tags)
        for tag in tags:
            assert_is_none(tag.attrs.get('data-src'))

    # Page with background images: only data-wxurl and qqmusic are checked.
    article_detail = load_detail('article_detail_backgroud-image.html')
    assert_equal(len(article_detail['content_img_list']), 29, article_detail)
    assert_markers_absent(article_detail, ('data-wxurl', 'qqmusic'))
    # Images have a src attribute and no data-src attribute.
    assert_src_without_data_src(article_detail, "img", 29)

    # Page with an embedded voice message.
    article_detail = load_detail('article_detail_mpvoice.html')
    assert_equal(len(article_detail['content_img_list']), 9, article_detail)
    assert_markers_absent(article_detail, ('data-wxurl', 'qqmusic', 'mpvoice'))

    # Page with embedded QQ music.
    article_detail = load_detail('article_detail_qqmusic.html')
    assert_equal(len(article_detail['content_img_list']), 2, article_detail)
    assert_markers_absent(article_detail, ('data-wxurl', 'qqmusic', 'mpvoice'))

    # Page with an iframe.
    article_detail = load_detail('article_detail_iframe.html')
    assert_equal(len(article_detail['content_img_list']), 6, article_detail)
    assert_markers_absent(article_detail, ('data-wxurl', 'qqmusic', 'mpvoice'))
    # The iframe has a src attribute and no data-src attribute.
    assert_src_without_data_src(article_detail, "iframe", 1)
def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None,
                        identify_image_callback=None, hosting_callback=None, raw=False):
    """Fetch an article's original content so it outlives the temporary link.

    Parameters
    ----------
    url : str or unicode
        Link to the original article (a temporary link).
    raw : bool
        True: return the page HTML exactly as fetched.
        False: return the processed result.
    del_qqmusic : bool
        True: delete QQ music embedded in the article.
        False: keep QQ music embedded in the article.
    del_mpvoice : bool
        True: delete voice messages embedded in the article.
        False: keep voice messages embedded in the article.
    unlock_callback : callable
        Function used when a captcha appears while fetching the article
        detail; see unlock_callback_example.
    identify_image_callback : callable
        Function that solves the captcha: takes the captcha's binary data
        and returns its text; see identify_image_callback_example.
    hosting_callback : callable
        Callback for hosting the collected article's images elsewhere
        (e.g. Qiniu or Aliyun): takes the WeChat image source URL and
        returns the hosted URL.

    Returns
    -------
    The fetched page text when ``raw`` is true. Otherwise the value
    produced by ``WechatSogouStructuring.get_article_detail`` (passed
    through the image-hosting step when ``hosting_callback`` is truthy),
    which carries:

    content_html
        The article content.
    content_img_list
        List of images in the article.

    Raises
    ------
    WechatSogouException
        The fetched page contains the "link expired" notice.
    WechatSogouRequestsException
        Listed by the original documentation; not raised directly in this
        method -- presumably comes from the request helper (to confirm).
    """
    response = self.__get_by_unlock(url,
                                    unlock_platform=self.__unlock_wechat,
                                    unlock_callback=unlock_callback,
                                    identify_image_callback=identify_image_callback)
    # The encoding is set before the text is read.
    response.encoding = 'utf-8'
    page = response.text

    if '链接已过期' in page:
        raise WechatSogouException('get_article_content 链接 [{}] 已过期'.format(url))

    if raw:
        return page

    detail = WechatSogouStructuring.get_article_detail(page,
                                                       del_qqmusic=del_qqmusic,
                                                       del_voice=del_mpvoice)
    if not hosting_callback:
        return detail
    return self.__hosting_wechat_img(detail, hosting_callback)
def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None,
                        identify_image_callback=None, hosting_callback=None, raw=False):
    """Retrieve the original article content before its temporary link expires.

    Parameters
    ----------
    url : str or unicode
        Link to the original article (a temporary link).
    raw : bool
        True: return the original HTML.
        False: return the processed HTML result.
    del_qqmusic : bool
        True: remove QQ music inserted in the article.
        False: keep QQ music inserted in the article.
    del_mpvoice : bool
        True: remove voice messages inserted in the article.
        False: keep voice messages inserted in the article.
    unlock_callback : callable
        Function invoked when a captcha shows up while handling the
        article detail; see unlock_callback_example.
    identify_image_callback : callable
        Captcha recogniser: receives the captcha's binary data and returns
        the text; see identify_image_callback_example.
    hosting_callback : callable
        Callback that hosts the collected article's images on another
        service (e.g. Qiniu or Aliyun): receives the WeChat image source
        URL and returns the hosted URL.

    Returns
    -------
    With ``raw`` true, the fetched page text. Otherwise the result of
    ``WechatSogouStructuring.get_article_detail`` -- additionally run
    through the image-hosting step when ``hosting_callback`` is truthy --
    containing:

    content_html
        The article content.
    content_img_list
        List of images in the article.

    Raises
    ------
    WechatSogouException
        The fetched page contains the "link expired" notice.
    WechatSogouRequestsException
        Named by the original documentation; this method does not raise it
        itself -- presumably it originates in the request helper (verify).
    """
    fetched = self.__get_by_unlock(
        url,
        unlock_platform=self.__unlock_wechat,
        unlock_callback=unlock_callback,
        identify_image_callback=identify_image_callback)
    # Set the encoding first, then read the text.
    fetched.encoding = 'utf-8'
    html = fetched.text

    expired = '链接已过期' in html
    if expired:
        raise WechatSogouException(
            'get_article_content 链接 [{}] 已过期'.format(url))
    if raw:
        return html

    parsed = WechatSogouStructuring.get_article_detail(
        html, del_qqmusic=del_qqmusic, del_voice=del_mpvoice)
    # The hosting step runs only when a callback was supplied.
    return self.__hosting_wechat_img(parsed, hosting_callback) if hosting_callback else parsed