Example #1
0
    def get_gzh_by_search(text):
        """Extract official-account (gzh) info from account-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou official-account search result page.

        Returns
        -------
        list[dict]
            One dict per account with keys:
            'open_id' (unique account id), 'profile_url' (link to the
            latest-10-posts page), 'headimage' (avatar URL),
            'wechat_name' (display name), 'wechat_id' (WeChat id),
            'post_perm' (posts in the last month, -1 if unknown),
            'view_perm' (views in the last month, -1 if unknown),
            'qrcode' (QR-code URL), 'introduction' (description),
            'authentication' (verification info).
        """
        perm_map = WechatSogouStructuring.__get_post_view_perm(text)

        doc = etree.HTML(text)
        results = []
        for item in doc.xpath('//ul[@class="news-list2"]/li'):
            profile_url = get_first_of_element(item, 'div/div[1]/a/@href')
            avatar = format_image_url(get_first_of_element(item, 'div/div[1]/a/img/@src'))
            display_name = get_elem_text(get_first_of_element(item, 'div/div[2]/p[1]'))
            id_line = get_elem_text(get_first_of_element(item, 'div/div[2]/p[2]'))
            qr = get_first_of_element(item, 'div/div[3]/span/img[1]/@src')
            about = get_elem_text(get_first_of_element(item, 'dl[1]/dd'))
            verified = get_first_of_element(item, 'dl[2]/dd/text()')

            # red_beg/red_end are Sogou's keyword-highlight markers; strip
            # them from the human-readable fields.
            results.append({
                'open_id': avatar.split('/')[-1],
                'profile_url': profile_url,
                'headimage': avatar,
                'wechat_name': display_name.replace('red_beg', '').replace('red_end', ''),
                'wechat_id': id_line.replace('微信号:', ''),
                'qrcode': qr,
                'introduction': about.replace('red_beg', '').replace('red_end', ''),
                'authentication': verified,
                # Sentinels; replaced below when per-account stats exist.
                'post_perm': -1,
                'view_perm': -1,
            })

        # Merge monthly post/view counts parsed elsewhere from the page.
        if perm_map:
            for entry in results:
                if entry['open_id'] in perm_map:
                    pair = perm_map[entry['open_id']].split(',')
                    if len(pair) == 2:
                        entry['post_perm'] = int(pair[0])
                        entry['view_perm'] = int(pair[1])
        return results
Example #2
0
    def get_gzh_by_search(text):
        """Extract official-account (gzh) info from account-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou official-account search result page.

        Returns
        -------
        list[dict]
            One dict per account: 'open_id' (unique account id),
            'profile_url' (latest-10-posts page link), 'headimage'
            (avatar URL), 'wechat_name' (display name), 'wechat_id'
            (WeChat id), 'post_perm' (posts in the last month; currently
            always 0, see TODO), 'qrcode' (QR-code URL), 'introduction'
            (description), 'authentication' (verification info).
        """
        root = etree.HTML(text)
        accounts = []
        for node in root.xpath('//ul[@class="news-list2"]/li'):
            profile_url = get_first_of_element(node, 'div/div[1]/a/@href')
            avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
            raw_name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
            raw_info = get_elem_text(get_first_of_element(node, 'div/div[2]/p[2]'))
            # TODO monthly post count: <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
            monthly_posts = 0
            qr = get_first_of_element(node, 'div/div[3]/span/img[1]/@src')
            about = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
            verified = get_first_of_element(node, 'dl[2]/dd/text()')

            # red_beg/red_end are Sogou's keyword-highlight markers.
            accounts.append({
                'open_id': avatar.split('/')[-1],
                'profile_url': profile_url,
                'headimage': avatar,
                'wechat_name': raw_name.replace('red_beg', '').replace('red_end', ''),
                'wechat_id': raw_info.replace('微信号:', ''),
                'post_perm': monthly_posts,
                'qrcode': qr,
                'introduction': about.replace('red_beg', '').replace('red_end', ''),
                'authentication': verified
            })
        return accounts
Example #3
0
    def get_gzh_by_search(text):
        """Extract official-account (gzh) info from account-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou official-account search result page.

        Returns
        -------
        list[dict]
            {
                'open_id': '',        # unique account id
                'profile_url': '',    # latest-10-posts page link
                'headimage': '',      # avatar URL
                'wechat_name': '',    # display name
                'wechat_id': '',      # WeChat id
                'post_perm': '',      # posts in the last month (currently 0, see TODO)
                'qrcode': '',         # QR-code URL
                'introduction': '',   # description
                'authentication': ''  # verification info
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('//ul[@class="news-list2"]/li')
        relist = []
        for li in lis:
            # Guard every xpath result before indexing: a malformed or
            # partially rendered <li> used to raise IndexError on the
            # unchecked [0] accesses below, while qrcode/authentication
            # were already guarded. Missing fields now fall back to ''.
            url = li.xpath('div/div[1]/a/@href')
            headimage = li.xpath('div/div[1]/a/img/@src')
            name_elem = li.xpath('div/div[2]/p[1]')
            wechat_name = get_elem_text(name_elem[0]) if name_elem else ''
            info_elem = li.xpath('div/div[2]/p[2]')
            info = get_elem_text(info_elem[0]) if info_elem else ''
            post_perm = 0  # TODO monthly post count <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
            qrcode = li.xpath('div/div[3]/span/img[1]/@src')
            intro_elem = li.xpath('dl[1]/dd')
            introduction = get_elem_text(intro_elem[0]) if intro_elem else ''
            authentication = li.xpath('dl[2]/dd/text()')
            relist.append({
                'open_id': headimage[0].split('/')[-1] if headimage else '',
                'profile_url': url[0] if url else '',
                'headimage': headimage[0] if headimage else '',
                # red_beg/red_end are Sogou's keyword-highlight markers.
                'wechat_name': wechat_name.replace('red_beg', '').replace('red_end', ''),
                'wechat_id': info.replace('微信号:', ''),
                'post_perm': post_perm,
                'qrcode': qrcode[0] if qrcode else '',
                'introduction': introduction.replace('red_beg', '').replace('red_end', ''),
                'authentication': authentication[0] if authentication else ''
            })
        return relist
Example #4
0
 def test_get_elem_text(self):
     """get_elem_text should join the text of all descendant nodes."""
     markup = '''
     <div>
         <div>111</div>
         <div>222</div>
     </div>
     '''
     root = etree.HTML(markup)
     extracted = get_elem_text(root)
     assert_equal(extracted, '111222')
Example #5
0
 def test_get_elem_text(self):
     """get_elem_text should join the text of all descendant nodes."""
     html = '''
     <div>
         <div>111</div>
         <div>222</div>
     </div>
     '''
     elem = etree.HTML(html)
     # Child texts are concatenated in document order, whitespace stripped.
     assert_equal(get_elem_text(elem), '111222')
Example #6
0
    def get_gzh_by_search(text):
        """Extract official-account (gzh) info from account-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou official-account search result page.

        Returns
        -------
        list[dict]
            One dict per account with keys: 'open_id' (unique account id),
            'profile_url' (latest-10-posts page link), 'headimage'
            (avatar URL), 'wechat_name' (display name), 'wechat_id'
            (WeChat id), 'post_perm' (posts in the last month, -1 if
            unknown), 'view_perm' (views in the last month, -1 if
            unknown), 'qrcode' (QR-code URL), 'introduction'
            (description), 'authentication' (verification info).
        """
        stats = WechatSogouStructuring.__get_post_view_perm(text)

        tree = etree.HTML(text)
        gzh_list = []
        for node in tree.xpath('//ul[@class="news-list2"]/li'):
            link = get_first_of_element(node, 'div/div[1]/a/@href')
            avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
            display_name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
            id_line = get_elem_text(get_first_of_element(node, 'div/div[2]/p[2]'))
            qr = get_first_of_element(node, 'div/div[3]/span/img[1]/@src')
            about = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
            verified = get_first_of_element(node, 'dl[2]/dd/text()')
            # Strip the red_beg/red_end highlight markers Sogou injects
            # around matched keywords.
            gzh_list.append({
                'open_id': avatar.split('/')[-1],
                'profile_url': link,
                'headimage': avatar,
                'wechat_name': display_name.replace('red_beg', '').replace('red_end', ''),
                'wechat_id': id_line.replace('微信号:', ''),
                'qrcode': qr,
                'introduction': about.replace('red_beg', '').replace('red_end', ''),
                'authentication': verified,
                'post_perm': -1,  # sentinel until stats are merged below
                'view_perm': -1,
            })

        # Fold in monthly post/view counts when available for the account.
        if stats:
            for gzh in gzh_list:
                if gzh['open_id'] in stats:
                    pair = stats[gzh['open_id']].split(',')
                    if len(pair) == 2:
                        gzh['post_perm'] = int(pair[0])
                        gzh['view_perm'] = int(pair[1])
        return gzh_list
Example #7
0
    def get_article_by_search(text):
        """Extract the article list from article-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou article search result page.

        Returns
        -------
        list[dict]
            {
                'article': {
                    'title': '',     # article title
                    'url': '',       # article link
                    'imgs': [],      # list of article image URLs
                    'abstract': '',  # article abstract
                    'time': ''       # article publish timestamp
                },
                'gzh': {
                    'profile_url': '',  # account's latest-10-posts page link
                    'headimage': '',    # avatar
                    'wechat_name': '',  # account name
                    'isv': '',          # whether the account is verified
                }
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('//ul[@class="news-list"]/li')

        articles = []
        for li in lis:
            # Two result layouts: entries whose first div holds the image
            # link, and plain-text entries (handled in the else branch).
            url = get_first_of_element(li, 'div[1]/a/@href')
            if url:
                title = get_first_of_element(li, 'div[2]/h3/a')
                imgs = li.xpath('div[1]/a/img/@src')
                abstract = get_first_of_element(li, 'div[2]/p')
                time = get_first_of_element(li,
                                            'div[2]/div/span/script/text()')
                gzh_info = li.xpath('div[2]/div/a')[0]
            else:
                url = get_first_of_element(li, 'div/h3/a/@href')
                title = get_first_of_element(li, 'div/h3/a')
                imgs = []
                spans = li.xpath('div/div[1]/a')
                for span in spans:
                    img = span.xpath('span/img/@src')
                    if img:
                        # Append the URL string (not the xpath result list)
                        # so both branches produce a flat list[str].
                        imgs.append(img[0])
                abstract = get_first_of_element(li, 'div/p')
                time = get_first_of_element(li,
                                            'div/div[2]/span/script/text()')
                gzh_info = li.xpath('div/div[2]/a')[0]

            # red_beg/red_end are Sogou's keyword-highlight markers.
            if title is not None:
                title = get_elem_text(title).replace("red_beg", "").replace(
                    "red_end", "")
            if abstract is not None:
                abstract = get_elem_text(abstract).replace("red_beg",
                                                           "").replace(
                                                               "red_end", "")

            # Raw string: the unescaped form emits invalid-escape-sequence
            # warnings (SyntaxWarning on modern CPython).
            time = re.findall(r'timeConvert\(\'(.*?)\'\)', time)
            time = list_or_empty(time, int)
            profile_url = get_first_of_element(gzh_info, '@href')
            headimage = get_first_of_element(gzh_info, '@data-headimage')
            wechat_name = get_first_of_element(gzh_info, 'text()')
            gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

            articles.append({
                'article': {
                    'title': title,
                    'url': url,
                    'imgs': imgs,
                    'abstract': abstract,
                    'time': time
                },
                'gzh': {
                    'profile_url': profile_url,
                    'headimage': headimage,
                    'wechat_name': wechat_name,
                    'isv': gzh_isv,
                }
            })
        return articles
Example #8
0
    def get_article_by_search(text):
        """Extract the article list from article-search result HTML.

        Parameters
        ----------
        text : str or unicode
            Raw HTML of a Sogou article search result page.

        Returns
        -------
        list[dict]
            {
                'article': {
                    'title': '',     # article title
                    'url': '',       # article link
                    'imgs': [],      # list of article image URLs
                    'abstract': '',  # article abstract
                    'time': ''       # article publish timestamp
                },
                'gzh': {
                    'profile_url': '',  # account's latest-10-posts page link
                    'headimage': '',    # avatar
                    'wechat_name': '',  # account name
                    'isv': '',          # whether the account is verified
                }
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('//ul[@class="news-list"]/li')

        articles = []
        for li in lis:
            # Two result layouts: entries whose first div holds the image
            # link, and plain-text entries (handled in the else branch).
            url = get_first_of_element(li, 'div[1]/a/@href')
            if url:
                title = get_first_of_element(li, 'div[2]/h3/a')
                imgs = li.xpath('div[1]/a/img/@src')
                abstract = get_first_of_element(li, 'div[2]/p')
                time = get_first_of_element(li, 'div[2]/div/span/script/text()')
                gzh_info = li.xpath('div[2]/div/a')[0]
            else:
                url = get_first_of_element(li, 'div/h3/a/@href')
                title = get_first_of_element(li, 'div/h3/a')
                imgs = []
                spans = li.xpath('div/div[1]/a')
                for span in spans:
                    img = span.xpath('span/img/@src')
                    if img:
                        # Append the URL string (not the xpath result list)
                        # so both branches produce a flat list[str].
                        imgs.append(img[0])
                abstract = get_first_of_element(li, 'div/p')
                time = get_first_of_element(li, 'div/div[2]/span/script/text()')
                gzh_info = li.xpath('div/div[2]/a')[0]

            # red_beg/red_end are Sogou's keyword-highlight markers.
            if title is not None:
                title = get_elem_text(title).replace("red_beg", "").replace("red_end", "")
            if abstract is not None:
                abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "")

            # Raw string: the unescaped form emits invalid-escape-sequence
            # warnings (SyntaxWarning on modern CPython).
            time = re.findall(r'timeConvert\(\'(.*?)\'\)', time)
            time = list_or_empty(time, int)
            profile_url = get_first_of_element(gzh_info, '@href')
            headimage = get_first_of_element(gzh_info, '@data-headimage')
            wechat_name = get_first_of_element(gzh_info, 'text()')
            gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

            articles.append({
                'article': {
                    'title': title,
                    'url': url,
                    # NOTE(review): format_image_url is given the whole list
                    # here, while other call sites pass a single URL string —
                    # confirm the helper accepts lists.
                    'imgs': format_image_url(imgs),
                    'abstract': abstract,
                    'time': time
                },
                'gzh': {
                    'profile_url': profile_url,
                    'headimage': headimage,
                    'wechat_name': wechat_name,
                    'isv': gzh_isv,
                }
            })
        return articles