Example #1
0
    def test_get_article_detail(self):
        """Run get_article_detail over four saved article pages and check the result.

        For every fixture the number of entries in ``content_img_list`` is
        compared with a known count, and ``content_html`` must not contain
        the substrings ``data-wxurl`` and ``qqmusic`` (``mpvoice`` as well for
        the last three fixtures). The first fixture additionally checks the
        ``<img>`` tags and the last one the ``<iframe>`` tags: each matched tag
        has an ``http`` ``src`` and no ``data-src`` attribute.
        """

        def parse_fixture(fixture_name):
            # Read the saved page as UTF-8 text, then hand it to the parser.
            path = os.path.join(fake_data_path, fixture_name)
            with io.open(path, encoding='utf-8') as handle:
                page = handle.read()
            return WechatSogouStructuring.get_article_detail(page)

        # Fixture 1: article_detail_backgroud-image.html
        detail = parse_fixture('article_detail_backgroud-image.html')
        assert_equal(len(detail['content_img_list']), 29, detail)
        html = detail['content_html']
        assert_true('data-wxurl' not in html, html)
        assert_true('qqmusic' not in html, html)
        # Images must expose src and must not keep a data-src attribute.
        soup = BeautifulSoup(html, 'lxml')
        img_tags = soup.find_all("img", src=re.compile(r'http'))
        assert_equal(len(img_tags), 29, img_tags)
        for tag in img_tags:
            assert_is_none(tag.attrs.get('data-src'))

        # Fixture 2: article_detail_mpvoice.html
        detail = parse_fixture('article_detail_mpvoice.html')
        assert_equal(len(detail['content_img_list']), 9, detail)
        html = detail['content_html']
        assert_true('data-wxurl' not in html, html)
        assert_true('qqmusic' not in html, html)
        assert_true('mpvoice' not in html, html)

        # Fixture 3: article_detail_qqmusic.html
        detail = parse_fixture('article_detail_qqmusic.html')
        assert_equal(len(detail['content_img_list']), 2, detail)
        html = detail['content_html']
        assert_true('data-wxurl' not in html, html)
        assert_true('qqmusic' not in html, html)
        assert_true('mpvoice' not in html, html)

        # Fixture 4: article_detail_iframe.html
        detail = parse_fixture('article_detail_iframe.html')
        assert_equal(len(detail['content_img_list']), 6, detail)
        html = detail['content_html']
        assert_not_in('data-wxurl', html, html)
        assert_not_in('qqmusic', html, html)
        assert_not_in('mpvoice', html, html)

        # Iframes must expose src and must not keep a data-src attribute.
        soup = BeautifulSoup(html, 'lxml')
        iframe_tags = soup.find_all("iframe", src=re.compile(r'http'))
        assert_equal(len(iframe_tags), 1, iframe_tags)
        for tag in iframe_tags:
            assert_is_none(tag.attrs.get('data-src'))
Example #2
0
    def get_article_content(self,
                            url,
                            del_qqmusic=True,
                            del_mpvoice=True,
                            unlock_callback=None,
                            identify_image_callback=None,
                            hosting_callback=None,
                            raw=False):
        """Fetch the original article so it survives the temporary link expiring.

        Parameters
        ----------
        url : str or unicode
            Link to the original article (a temporary link).
        raw : bool
            True: return the raw html.
            False: return the processed html.
        del_qqmusic: bool
            True: remove QQ music inserted in the article.
            False: keep QQ music inserted in the article.
        del_mpvoice: bool
            True: remove voice messages inserted in the article.
            False: keep voice messages inserted in the article.
        unlock_callback : callable
            Called when a captcha shows up while fetching the article detail,
            see unlock_callback_example.
        identify_image_callback : callable
            Solves the captcha while fetching the article detail; takes the
            captcha image as binary data and returns its text, see
            identify_image_callback_example.
        hosting_callback: callable
            Callback for hosting the collected article images elsewhere
            (e.g. Qiniu or Aliyun); takes the WeChat image source address and
            returns the hosted address.

        Returns
        -------
        content_html
            The article content.
        content_img_list
            List of images in the article.

        When ``raw`` is true the response text is returned as-is instead.

        Raises
        ------
        WechatSogouException
            The fetched page contains the "link expired" notice.
        WechatSogouRequestsException
            Listed by the original documentation; presumably raised by the
            request helper -- not visible here, confirm against it.
        """

        resp = self.__get_by_unlock(
            url,
            unlock_platform=self.__unlock_wechat,
            unlock_callback=unlock_callback,
            identify_image_callback=identify_image_callback)

        # Force UTF-8 before the body is decoded through resp.text.
        resp.encoding = 'utf-8'
        page = resp.text

        if '链接已过期' in page:
            raise WechatSogouException(
                'get_article_content 链接 [{}] 已过期'.format(url))

        if raw:
            return page

        detail = WechatSogouStructuring.get_article_detail(
            page, del_qqmusic=del_qqmusic, del_voice=del_mpvoice)

        # Without a hosting callback the structured result is returned untouched.
        if not hosting_callback:
            return detail
        return self.__hosting_wechat_img(detail, hosting_callback)
Example #3
0
    def get_article_content(self, url, del_qqmusic=True, del_mpvoice=True, unlock_callback=None,
                            identify_image_callback=None, hosting_callback=None, raw=False):
        """Download an article page and return it raw or in structured form.

        Parameters
        ----------
        url : str or unicode
            Link to the original article (a temporary link).
        raw : bool
            True: return the raw html.
            False: return the processed html.
        del_qqmusic: bool
            True: QQ music embedded in the article is removed.
            False: QQ music embedded in the article is kept.
        del_mpvoice: bool
            True: voice messages embedded in the article are removed.
            False: voice messages embedded in the article are kept.
        unlock_callback : callable
            Handler invoked when a captcha appears while fetching the article
            detail, see unlock_callback_example.
        identify_image_callback : callable
            Captcha solver used while fetching the article detail; receives
            the captcha binary data and returns the text, see
            identify_image_callback_example.
        hosting_callback: callable
            Callback used to host the article images on another service
            (e.g. Qiniu or Aliyun); receives the WeChat image source address
            and returns the hosted address.

        Returns
        -------
        content_html
            The article content.
        content_img_list
            List of images in the article.

        With ``raw`` set, the response text itself is returned.

        Raises
        ------
        WechatSogouException
            The downloaded page carries the "link expired" notice.
        WechatSogouRequestsException
            Named by the original documentation; presumably comes from the
            request helper -- not visible here, verify against it.
        """

        unlock_options = dict(unlock_platform=self.__unlock_wechat,
                              unlock_callback=unlock_callback,
                              identify_image_callback=identify_image_callback)
        response = self.__get_by_unlock(url, **unlock_options)

        # Decode the body as UTF-8 regardless of what the response declared.
        response.encoding = 'utf-8'
        html = response.text

        link_expired = '链接已过期' in html
        if link_expired:
            raise WechatSogouException('get_article_content 链接 [{}] 已过期'.format(url))

        if raw:
            return html

        content_info = WechatSogouStructuring.get_article_detail(html,
                                                                 del_qqmusic=del_qqmusic,
                                                                 del_voice=del_mpvoice)
        # Optional post-processing step driven by the caller's hosting callback.
        if hosting_callback:
            content_info = self.__hosting_wechat_img(content_info, hosting_callback)
        return content_info
Example #4
0
    def test_get_article_detail(self):
        """Verify get_article_detail against four stored article pages.

        Each stored page is parsed and the result is checked for the expected
        length of ``content_img_list`` and for the absence of the substrings
        ``data-wxurl`` and ``qqmusic`` in ``content_html`` (plus ``mpvoice``
        for every page but the first). For the first page the ``<img>`` tags,
        and for the last page the ``<iframe>`` tags, are also inspected: the
        tags with an ``http`` ``src`` are counted and none may have
        ``data-src``.
        """
        http_src = re.compile(r'http')

        def structured(fixture_file):
            # Load one stored page (UTF-8) and return the parser's output.
            with io.open(os.path.join(fake_data_path, fixture_file),
                         encoding='utf-8') as fh:
                raw_page = fh.read()
            return WechatSogouStructuring.get_article_detail(raw_page)

        # --- article_detail_backgroud-image.html ---
        parsed = structured('article_detail_backgroud-image.html')
        assert_equal(len(parsed['content_img_list']), 29, parsed)
        body = parsed['content_html']
        for marker in ('data-wxurl', 'qqmusic'):
            assert_true(marker not in body, body)
        # Images carry a src attribute and no data-src attribute.
        images = BeautifulSoup(body, 'lxml').find_all("img", src=http_src)
        assert_equal(len(images), 29, images)
        for image in images:
            assert_is_none(image.attrs.get('data-src'))

        # --- article_detail_mpvoice.html ---
        parsed = structured('article_detail_mpvoice.html')
        assert_equal(len(parsed['content_img_list']), 9, parsed)
        body = parsed['content_html']
        for marker in ('data-wxurl', 'qqmusic', 'mpvoice'):
            assert_true(marker not in body, body)

        # --- article_detail_qqmusic.html ---
        parsed = structured('article_detail_qqmusic.html')
        assert_equal(len(parsed['content_img_list']), 2, parsed)
        body = parsed['content_html']
        for marker in ('data-wxurl', 'qqmusic', 'mpvoice'):
            assert_true(marker not in body, body)

        # --- article_detail_iframe.html ---
        parsed = structured('article_detail_iframe.html')
        assert_equal(len(parsed['content_img_list']), 6, parsed)
        body = parsed['content_html']
        for marker in ('data-wxurl', 'qqmusic', 'mpvoice'):
            assert_not_in(marker, body, body)

        # Iframes carry a src attribute and no data-src attribute.
        frames = BeautifulSoup(body, 'lxml').find_all("iframe", src=http_src)
        assert_equal(len(frames), 1, frames)
        for frame in frames:
            assert_is_none(frame.attrs.get('data-src'))