Beispiel #1
0
    def get_gzh_by_search(text):
        """Extract official-account entries from an account-search result page.

        Parameters
        ----------
        text : str or unicode
            Text returned by the official-account search.

        Returns
        -------
        list[dict]
            One dict per ``<li>`` under ``ul.news-list2``:
            {
                'open_id': '',  # unique account id (last path segment of the avatar URL)
                'profile_url': '',  # link to the page of the 10 most recent broadcasts
                'headimage': '',  # avatar
                'wechat_name': '',  # account name
                'wechat_id': '',  # WeChat id
                'post_perm': '',  # broadcasts in the last month, -1 when not available
                'view_perm': '',  # reads in the last month, -1 when not available
                'qrcode': '',  # QR code
                'introduction': '',  # introduction
                'authentication': ''  # verification
            }
        """
        def drop_markers(value):
            # Remove the literal 'red_beg' / 'red_end' markers from the text.
            return value.replace('red_beg', '').replace('red_end', '')

        perms_by_open_id = WechatSogouStructuring.__get_post_view_perm(text)

        accounts = []
        for item in etree.HTML(text).xpath('//ul[@class="news-list2"]/li'):
            profile_url = get_first_of_element(item, 'div/div[1]/a/@href')
            avatar_src = get_first_of_element(item, 'div/div[1]/a/img/@src')
            avatar = format_image_url(avatar_src)
            name_node = get_first_of_element(item, 'div/div[2]/p[1]')
            name = get_elem_text(name_node)
            id_node = get_first_of_element(item, 'div/div[2]/p[2]')
            id_text = get_elem_text(id_node)
            qrcode = get_first_of_element(item, 'div/div[3]/span/img[1]/@src')
            intro_node = get_first_of_element(item, 'dl[1]/dd')
            intro = get_elem_text(intro_node)
            verification = get_first_of_element(item, 'dl[2]/dd/text()')

            # The open id is the final path component of the avatar URL.
            account = {'open_id': avatar.split('/')[-1]}
            account['profile_url'] = profile_url
            account['headimage'] = avatar
            account['wechat_name'] = drop_markers(name)
            account['wechat_id'] = id_text.replace('微信号:', '')
            account['qrcode'] = qrcode
            account['introduction'] = drop_markers(intro)
            account['authentication'] = verification
            # Placeholders; replaced below when perm data exists for this id.
            account['post_perm'] = -1
            account['view_perm'] = -1
            accounts.append(account)

        if not perms_by_open_id:
            return accounts

        for account in accounts:
            open_id = account['open_id']
            if open_id not in perms_by_open_id:
                continue
            # Expected form is "<posts>,<views>"; anything else is ignored.
            counts = perms_by_open_id[open_id].split(',')
            if len(counts) != 2:
                continue
            account['post_perm'] = int(counts[0])
            account['view_perm'] = int(counts[1])
        return accounts
Beispiel #2
0
    def get_gzh_by_search(text):
        """Extract official-account entries from an account-search result page.

        Parameters
        ----------
        text : str or unicode
            Text returned by the official-account search.

        Returns
        -------
        list[dict]
            One dict per ``<li>`` under ``ul.news-list2``:
            {
                'open_id': '',  # unique account id (last path segment of the avatar URL)
                'profile_url': '',  # link to the page of the 10 most recent broadcasts
                'headimage': '',  # avatar
                'wechat_name': '',  # account name
                'wechat_id': '',  # WeChat id
                'post_perm': '',  # broadcasts in the last month (currently always 0)
                'qrcode': '',  # QR code
                'introduction': '',  # introduction
                'authentication': ''  # verification
            }
        """
        entries = []
        for node in etree.HTML(text).xpath('//ul[@class="news-list2"]/li'):
            link = get_first_of_element(node, 'div/div[1]/a/@href')
            avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
            name_elem = get_first_of_element(node, 'div/div[2]/p[1]')
            name = get_elem_text(name_elem)
            id_elem = get_first_of_element(node, 'div/div[2]/p[2]')
            id_text = get_elem_text(id_elem)
            # TODO monthly post count <script>var account_anti_url = "/websearch/weixin/pc/anti_account.jsp?.......";</script>
            monthly_posts = 0
            qr_src = get_first_of_element(node, 'div/div[3]/span/img[1]/@src')
            intro_elem = get_first_of_element(node, 'dl[1]/dd')
            intro = get_elem_text(intro_elem)
            verification = get_first_of_element(node, 'dl[2]/dd/text()')

            # The open id is the final path component of the avatar URL.
            entry = {'open_id': avatar.split('/')[-1]}
            entry['profile_url'] = link
            entry['headimage'] = avatar
            # Strip the literal 'red_beg' / 'red_end' markers from the name.
            name = name.replace('red_beg', '')
            entry['wechat_name'] = name.replace('red_end', '')
            entry['wechat_id'] = id_text.replace('微信号:', '')
            entry['post_perm'] = monthly_posts
            entry['qrcode'] = qr_src
            # Same marker clean-up for the introduction text.
            intro = intro.replace('red_beg', '')
            entry['introduction'] = intro.replace('red_end', '')
            entry['authentication'] = verification
            entries.append(entry)
        return entries
Beispiel #3
0
    def get_gzh_info_by_history(text):
        """Extract official-account details from a history-messages page.

        Parameters
        ----------
        text : str or unicode
            Text of the history-messages page.

        Returns
        -------
        dict
            {
                'wechat_name': '',  # account name
                'wechat_id': '',  # WeChat id
                'introduction': '',  # description
                'authentication': '',  # verification
                'headimage': ''  # avatar
            }
        """
        area = get_first_of_element(
            etree.HTML(text), '//div[@class="profile_info_area"]')

        # XPath of each field relative to the profile area, in lookup order:
        # avatar, name, WeChat id line, description, verification.
        field_paths = (
            'div[1]/span/img/@src',
            'div[1]/div/strong/text()',
            'div[1]/div/p/text()',
            'ul/li[1]/div/text()',
            'ul/li[2]/div/text()',
        )
        avatar, name, id_line, description, principal = [
            get_first_of_element(area, path) for path in field_paths
        ]

        info = {'wechat_name': name.strip()}
        # Drop the label prefix, then any newlines left at either end.
        wechat_id = id_line.replace('微信号: ', '')
        info['wechat_id'] = wechat_id.strip('\n')
        info['introduction'] = description
        info['authentication'] = principal
        info['headimage'] = avatar
        return info
Beispiel #4
0
    def get_gzh_info_by_history(text):
        """Extract official-account details from a history-messages page.

        Parameters
        ----------
        text : str or unicode
            Text of the history-messages page.

        Returns
        -------
        dict
            {
                'wechat_name': '',  # account name
                'wechat_id': '',  # WeChat id
                'introduction': '',  # description
                'authentication': '',  # verification
                'headimage': ''  # avatar
            }
        """
        tree = etree.HTML(text)
        info_area = get_first_of_element(tree, '//div[@class="profile_info_area"]')

        def lookup(relative_path):
            # First match of ``relative_path`` inside the profile info area.
            return get_first_of_element(info_area, relative_path)

        avatar = lookup('div[1]/span/img/@src')
        name = lookup('div[1]/div/strong/text()')
        id_line = lookup('div[1]/div/p/text()')
        description = lookup('ul/li[1]/div/text()')
        principal = lookup('ul/li[2]/div/text()')

        # Remove the label prefix first, then newlines at either end.
        wechat_id = id_line.replace('微信号: ', '')
        wechat_id = wechat_id.strip('\n')

        result = {'wechat_name': name.strip()}
        result['wechat_id'] = wechat_id
        result['introduction'] = description
        result['authentication'] = principal
        result['headimage'] = avatar
        return result
Beispiel #5
0
    def get_gzh_by_search(text):
        """Extract official-account entries from an account-search result page.

        Parameters
        ----------
        text : str or unicode
            Text returned by the official-account search.

        Returns
        -------
        list[dict]
            One dict per ``<li>`` under ``ul.news-list2``:
            {
                'open_id': '',  # unique account id (last path segment of the avatar URL)
                'profile_url': '',  # link to the page of the 10 most recent broadcasts
                'headimage': '',  # avatar
                'wechat_name': '',  # account name
                'wechat_id': '',  # WeChat id
                'post_perm': '',  # broadcasts in the last month, -1 when not available
                'view_perm': '',  # reads in the last month, -1 when not available
                'qrcode': '',  # QR code
                'introduction': '',  # introduction
                'authentication': ''  # verification
            }
        """
        perm_lookup = WechatSogouStructuring.__get_post_view_perm(text)

        def parse_item(node):
            # Build the result dict for a single <li> of the result list.
            link = get_first_of_element(node, 'div/div[1]/a/@href')
            avatar = get_first_of_element(node, 'div/div[1]/a/img/@src')
            name = get_elem_text(get_first_of_element(node, 'div/div[2]/p[1]'))
            id_text = get_elem_text(
                get_first_of_element(node, 'div/div[2]/p[2]'))
            qr_src = get_first_of_element(node, 'div/div[3]/span/img[1]/@src')
            intro = get_elem_text(get_first_of_element(node, 'dl[1]/dd'))
            verification = get_first_of_element(node, 'dl[2]/dd/text()')

            # The open id is the final path component of the avatar URL.
            open_id = avatar.split('/')[-1]
            # Strip the literal 'red_beg' / 'red_end' markers from the texts.
            clean_name = name.replace('red_beg', '').replace('red_end', '')
            wechat_id = id_text.replace('微信号:', '')
            clean_intro = intro.replace('red_beg', '').replace('red_end', '')
            return {
                'open_id': open_id,
                'profile_url': link,
                'headimage': avatar,
                'wechat_name': clean_name,
                'wechat_id': wechat_id,
                'qrcode': qr_src,
                'introduction': clean_intro,
                'authentication': verification,
                # Placeholders; replaced below when perm data exists.
                'post_perm': -1,
                'view_perm': -1,
            }

        tree = etree.HTML(text)
        results = [
            parse_item(node)
            for node in tree.xpath('//ul[@class="news-list2"]/li')
        ]

        if perm_lookup:
            for record in results:
                key = record['open_id']
                if key not in perm_lookup:
                    continue
                # Expected form is "<posts>,<views>"; other shapes are ignored.
                pieces = perm_lookup[key].split(',')
                if len(pieces) == 2:
                    posts, views = pieces
                    record['post_perm'] = int(posts)
                    record['view_perm'] = int(views)
        return results
Beispiel #6
0
    def get_gzh_article_by_hot(text):
        """Extract account and article info from a home-page hot-search listing.

        Parameters
        ----------
        text : str or unicode
            Text of one page of the home-page hot-search results.

        Returns
        -------
        list[dict]
            One dict per ``/html/body/li`` element:
            {
                'gzh': {
                    'headimage': str,  # account avatar
                    'wechat_name': str,  # account name
                },
                'article': {
                    'url': str,  # temporary article link
                    'title': str,  # article title
                    'abstract': str,  # article abstract
                    'time': int,  # push time, 10-digit timestamp; the raw
                                  # attribute value when it is not an integer,
                                  # None when the attribute is absent
                    'open_id': str,  # open id
                    'main_img': str  # cover image
                }
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('/html/body/li')
        gzh_article_list = []
        for li in lis:
            url = get_first_of_element(li, 'div[1]/h4/a/@href')
            title = get_first_of_element(li, 'div[1]/h4/a/div/text()')
            abstract = get_first_of_element(li, 'div[1]/p[1]/text()')
            # div[1]/p[2] carries both the account <span> (open id, avatar,
            # name) and the timestamp <span>.
            xpath_time = get_first_of_element(li, 'div[1]/p[2]')
            open_id = get_first_of_element(xpath_time, 'span/@data-openid')
            headimage = get_first_of_element(xpath_time,
                                             'span/@data-headimage')
            gzh_name = get_first_of_element(xpath_time, 'span/text()')
            raw_times = xpath_time.xpath('a/span/@data-lastmodified')
            main_img = get_first_of_element(li, 'div[2]/a/img/@src')

            if not raw_times:
                # xpath() returns an empty list when nothing matches; indexing
                # it would raise IndexError and abort parsing of the whole
                # page, so a missing timestamp is reported as None instead.
                send_time = None
            else:
                try:
                    send_time = int(raw_times[0])
                except ValueError:
                    # Keep the raw attribute text when it is not an integer.
                    send_time = raw_times[0]

            gzh_article_list.append({
                'gzh': {
                    'headimage': headimage,
                    'wechat_name': gzh_name,
                },
                'article': {
                    'url': url,
                    'title': title,
                    'abstract': abstract,
                    'time': send_time,
                    'open_id': open_id,
                    'main_img': main_img
                }
            })

        return gzh_article_list
Beispiel #7
0
    def get_article_by_search(text):
        """Extract the article list from an article-search result page.

        Parameters
        ----------
        text : str or unicode
            Text returned by the article search.

        Returns
        -------
        list[dict]
            One dict per ``<li>`` under ``ul.news-list``:
            {
                'article': {
                    'title': '',  # article title
                    'url': '',  # article link
                    'imgs': '',  # list of article images
                    'abstract': '',  # article abstract
                    'time': ''  # article push time
                },
                'gzh': {
                    'profile_url': '',  # link to the account's 10 most recent broadcasts
                    'headimage': '',  # avatar
                    'wechat_name': '',  # account name
                    'isv': '',  # whether the account is verified
                }
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('//ul[@class="news-list"]/li')

        articles = []
        for li in lis:
            url = get_first_of_element(li, 'div[1]/a/@href')
            if url:
                # Layout where the link (and image) live in div[1] and the
                # text content in div[2].
                title = get_first_of_element(li, 'div[2]/h3/a')
                imgs = li.xpath('div[1]/a/img/@src')
                abstract = get_first_of_element(li, 'div[2]/p')
                time_script = get_first_of_element(
                    li, 'div[2]/div/span/script/text()')
                gzh_info = li.xpath('div[2]/div/a')[0]
            else:
                # Fallback layout: everything sits under a single div.
                url = get_first_of_element(li, 'div/h3/a/@href')
                title = get_first_of_element(li, 'div/h3/a')
                imgs = []
                spans = li.xpath('div/div[1]/a')
                for span in spans:
                    img = span.xpath('span/img/@src')
                    if img:
                        # NOTE(review): ``img`` is the list returned by
                        # xpath(), so this branch produces a list of lists,
                        # unlike the flat list above. Kept unchanged so the
                        # return shape stays backward-compatible.
                        imgs.append(img)
                abstract = get_first_of_element(li, 'div/p')
                time_script = get_first_of_element(
                    li, 'div/div[2]/span/script/text()')
                gzh_info = li.xpath('div/div[2]/a')[0]

            # Remove the literal 'red_beg' / 'red_end' markers from the texts.
            if title is not None:
                title = get_elem_text(title).replace("red_beg", "").replace(
                    "red_end", "")
            if abstract is not None:
                abstract = get_elem_text(abstract).replace(
                    "red_beg", "").replace("red_end", "")

            # Raw string so the regex escapes are not treated as (invalid)
            # string escapes. ``or ''`` keeps re.findall from raising
            # TypeError when the script text is missing; the resulting empty
            # match list is handled by list_or_empty like any other no-match.
            matches = re.findall(r"timeConvert\('(.*?)'\)", time_script or '')
            push_time = list_or_empty(matches, int)
            profile_url = get_first_of_element(gzh_info, '@href')
            headimage = get_first_of_element(gzh_info, '@data-headimage')
            wechat_name = get_first_of_element(gzh_info, 'text()')
            gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

            articles.append({
                'article': {
                    'title': title,
                    'url': url,
                    'imgs': imgs,
                    'abstract': abstract,
                    'time': push_time
                },
                'gzh': {
                    'profile_url': profile_url,
                    'headimage': headimage,
                    'wechat_name': wechat_name,
                    'isv': gzh_isv,
                }
            })
        return articles
Beispiel #8
0
    def get_gzh_article_by_hot(text):
        """Extract account and article info from a home-page hot-search listing.

        Parameters
        ----------
        text : str or unicode
            Text of one page of the home-page hot-search results.

        Returns
        -------
        list[dict]
            One dict per ``/html/body/li`` element:
            {
                'gzh': {
                    'headimage': str,  # account avatar
                    'wechat_name': str,  # account name
                },
                'article': {
                    'url': str,  # temporary article link
                    'title': str,  # article title
                    'abstract': str,  # article abstract
                    'time': int,  # push time, 10-digit timestamp; the raw
                                  # attribute value when it is not an integer,
                                  # None when the attribute is absent
                    'open_id': str,  # open id
                    'main_img': str  # cover image
                }
            }
        """
        page = etree.HTML(text)
        gzh_article_list = []
        for li in page.xpath('/html/body/li'):
            url = get_first_of_element(li, 'div[1]/h4/a/@href')
            title = get_first_of_element(li, 'div[1]/h4/a/div/text()')
            abstract = get_first_of_element(li, 'div[1]/p[1]/text()')
            # div[1]/p[2] carries both the account <span> (open id, avatar,
            # name) and the timestamp <span>.
            xpath_time = get_first_of_element(li, 'div[1]/p[2]')
            open_id = get_first_of_element(xpath_time, 'span/@data-openid')
            headimage = get_first_of_element(xpath_time, 'span/@data-headimage')
            gzh_name = get_first_of_element(xpath_time, 'span/text()')
            timestamps = xpath_time.xpath('a/span/@data-lastmodified')
            main_img = get_first_of_element(li, 'div[2]/a/img/@src')

            if timestamps:
                try:
                    send_time = int(timestamps[0])
                except ValueError:
                    # Keep the raw attribute text when it is not an integer.
                    send_time = timestamps[0]
            else:
                # xpath() returns an empty list when nothing matches; indexing
                # it would raise IndexError and abort parsing of the whole
                # page, so a missing timestamp is reported as None instead.
                send_time = None

            gzh_article_list.append({
                'gzh': {
                    'headimage': headimage,
                    'wechat_name': gzh_name,
                },
                'article': {
                    'url': url,
                    'title': title,
                    'abstract': abstract,
                    'time': send_time,
                    'open_id': open_id,
                    'main_img': main_img
                }
            })

        return gzh_article_list
Beispiel #9
0
    def get_article_by_search(text):
        """Extract the article list from an article-search result page.

        Parameters
        ----------
        text : str or unicode
            Text returned by the article search.

        Returns
        -------
        list[dict]
            One dict per ``<li>`` under ``ul.news-list``:
            {
                'article': {
                    'title': '',  # article title
                    'url': '',  # article link
                    'imgs': '',  # list of article images (passed through format_image_url)
                    'abstract': '',  # article abstract
                    'time': ''  # article push time
                },
                'gzh': {
                    'profile_url': '',  # link to the account's 10 most recent broadcasts
                    'headimage': '',  # avatar
                    'wechat_name': '',  # account name
                    'isv': '',  # whether the account is verified
                }
            }
        """
        page = etree.HTML(text)
        lis = page.xpath('//ul[@class="news-list"]/li')

        articles = []
        for li in lis:
            url = get_first_of_element(li, 'div[1]/a/@href')
            if url:
                # Layout where the link (and image) live in div[1] and the
                # text content in div[2].
                title = get_first_of_element(li, 'div[2]/h3/a')
                imgs = li.xpath('div[1]/a/img/@src')
                abstract = get_first_of_element(li, 'div[2]/p')
                time_script = get_first_of_element(li, 'div[2]/div/span/script/text()')
                gzh_info = li.xpath('div[2]/div/a')[0]
            else:
                # Fallback layout: everything sits under a single div.
                url = get_first_of_element(li, 'div/h3/a/@href')
                title = get_first_of_element(li, 'div/h3/a')
                imgs = []
                spans = li.xpath('div/div[1]/a')
                for span in spans:
                    img = span.xpath('span/img/@src')
                    if img:
                        # NOTE(review): ``img`` is the list returned by
                        # xpath(), so this branch produces a list of lists,
                        # unlike the flat list above. Kept unchanged so the
                        # return shape stays backward-compatible.
                        imgs.append(img)
                abstract = get_first_of_element(li, 'div/p')
                time_script = get_first_of_element(li, 'div/div[2]/span/script/text()')
                gzh_info = li.xpath('div/div[2]/a')[0]

            # Remove the literal 'red_beg' / 'red_end' markers from the texts.
            if title is not None:
                title = get_elem_text(title).replace("red_beg", "").replace("red_end", "")
            if abstract is not None:
                abstract = get_elem_text(abstract).replace("red_beg", "").replace("red_end", "")

            # Raw string so the regex escapes are not treated as (invalid)
            # string escapes. ``or ''`` keeps re.findall from raising
            # TypeError when the script text is missing; the resulting empty
            # match list is handled by list_or_empty like any other no-match.
            matches = re.findall(r"timeConvert\('(.*?)'\)", time_script or '')
            push_time = list_or_empty(matches, int)
            profile_url = get_first_of_element(gzh_info, '@href')
            headimage = get_first_of_element(gzh_info, '@data-headimage')
            wechat_name = get_first_of_element(gzh_info, 'text()')
            gzh_isv = get_first_of_element(gzh_info, '@data-isv', int)

            articles.append({
                'article': {
                    'title': title,
                    'url': url,
                    'imgs': format_image_url(imgs),
                    'abstract': abstract,
                    'time': push_time
                },
                'gzh': {
                    'profile_url': profile_url,
                    'headimage': headimage,
                    'wechat_name': wechat_name,
                    'isv': gzh_isv,
                }
            })
        return articles