Esempio n. 1
0
    def deal_comment(self, req_url, text):
        """
        Parse the comments of an article and hand them to the data pipeline.

        :param req_url: URL of the comment request; its ``__biz`` and
            ``comment_id`` query parameters are copied into every row.
        :param text: response body, decoded with ``tools.get_json``.
        :return: None
        """
        payload = tools.get_json(text)

        biz = tools.get_param(req_url, "__biz")
        # comment_id links the comments to their article
        comment_id = tools.get_param(req_url, "comment_id")

        rows = []
        for entry in payload.get("elected_comment", []):
            row = dict(
                __biz=biz,
                comment_id=comment_id,
                nick_name=entry.get("nick_name"),
                logo_url=entry.get("logo_url"),
                content=entry.get("content"),
                create_time=tools.timestamp_to_date(entry.get("create_time")),
                content_id=entry.get("content_id"),
                like_num=entry.get("like_num"),
                is_top=entry.get("is_top"),
                spider_time=tools.get_current_date(),
            )
            rows.append(row)

        # Nothing to store when the response carried no comments.
        if not rows:
            return

        data_pipeline.save_article_commnet(rows)
Esempio n. 2
0
    def deal_comment(self, req_url, text):
        """
        Turn the ``elected_comment`` entries of a comment response into rows
        and save them.

        :param req_url: URL of the comment request (source of ``__biz`` and
            ``comment_id``)
        :param text: response body, decoded with ``tools.get_json``
        :return: None
        """
        data = tools.get_json(text)

        account_biz = tools.get_param(req_url, '__biz')
        # comment_id ties each comment to its article
        article_comment_id = tools.get_param(req_url, 'comment_id')
        comments = data.get('elected_comment', [])

        def build_row(comment):
            # One storage row per comment entry.
            return dict(
                __biz=account_biz,
                comment_id=article_comment_id,
                nick_name=comment.get('nick_name'),
                logo_url=comment.get('logo_url'),
                content=comment.get('content'),
                create_time=tools.timestamp_to_date(comment.get('create_time')),
                content_id=comment.get('content_id'),
                like_num=comment.get('like_num'),
                is_top=comment.get('is_top'),
                spider_time=tools.get_current_date()
            )

        comment_rows = [build_row(comment) for comment in comments]
        if comment_rows:
            data_pipeline.save_article_commnet(comment_rows)
Esempio n. 3
0
    def deal_article(self, req_url, text):
        """
        Parse an article page and store the extracted record.

        :param req_url: article URL; the ``sn`` and ``__biz`` query
            parameters are read from it.
        :param text: HTML of the article page. When it is empty the task for
            ``sn`` is updated with state -1 and nothing is parsed.
        :return: None when ``text`` is empty, otherwise the result of
            ``self._task_manager.get_task()``.
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        ).extract_first(default="")
        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default="").strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default="").strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default="").strip()

        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)
        biz = tools.get_param(req_url, "__biz")

        # Body text with the markup stripped.
        plain_text = remove_tags(content).strip()

        article_data = {
            "data_type": account,
            "title": title,
            "data_address": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "text": plain_text,
            "spider_name": 'wechat',
            "collection_mode": 'spider',
            "data_source_type": '微信公众号',
            "sn": sn,
            "collection_time": tools.get_current_date(),
        }

        # Store the record; the task is updated with state 1 only when
        # save_article returns something other than None.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
Esempio n. 4
0
        def parse_article_info(article_info, release_time):
            """
            Cache one article entry and queue its URL for crawling.

            Entries whose URL does not start with ``http://mp.weixin.qq.com/``
            are ignored. When the article is already stored, or
            ``ONLY_TODAY_MSG`` is set and ``release_time`` compares lower than
            ``tools.get_current_date('%Y-%m-%d')``,
            ``self._is_need_get_more`` is set to False and the entry is
            dropped.

            :param article_info: dict describing one article; may be empty
                or None
            :param release_time: release time of the message the article
                belongs to
            :return: None
            """
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))
            title = article_info.get('title')
            summary = article_info.get('digest')
            author = article_info.get('author')

            # Articles deleted by the publisher carry no URL or other details,
            # so missing/null fields are treated as '' rather than letting
            # None.replace() raise.
            url = (article_info.get('content_url') or '').replace(
                '\\', '').replace('amp;', '')
            # Link of the quoted (source) article.
            source_url = (article_info.get('source_url') or '').replace(
                '\\', '')
            cover = (article_info.get('cover') or '').replace('\\', '')

            # No URL (deleted article, so no mid either) or a non-article URL
            # such as a shop link: nothing to store.
            if not url.startswith('http://mp.weixin.qq.com/'):
                return

            # mid: id of the image-text message; messages published on the
            # same day share it.
            mid = tools.get_param(url, 'mid') or tools.get_param(
                url, 'appmsgid')
            # idx: position of the image-text message, starting at 1.
            idx = tools.get_param(url, 'idx') or tools.get_param(
                url, 'itemidx')
            # mid and idx concatenated identify one article, e.g.
            # mid = 2650492260, idx = 1 -> article_id = 26504922601
            article_id = mid + idx

            # Stop when the article is already stored or is too old.
            already_stored = WechatAction._wechat_service.is_exist(
                'wechat_article', article_id)
            if already_stored or (
                    ONLY_TODAY_MSG
                    and release_time < tools.get_current_date('%Y-%m-%d')):
                self._is_need_get_more = False
                return  # do not go further; later articles are discarded

            __biz = tools.get_param(url, '__biz')  # links to the account

            # Cache the article info.
            WechatAction._article_info[article_id] = {
                'article_id': int(article_id),
                'title': title,
                'summary': summary,
                'release_time': release_time,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'account': '',
                'author': author,
                '__biz': __biz,
                'read_num': None,
                'like_num': None,
                'content': '',
                'comment': [],
                'record_time': tools.get_current_date()
            }

            # Add the article URL to the crawl queue.
            WechatAction._todo_urls.append(url)
Esempio n. 5
0
    def get_read_watched_count(self, data, req_url):
        """
        Record the read and like counts of a cached article.

        When the response reports that comments are disabled, no comment
        request will follow, so the cached article is stored right away.

        :param data: response body, for example::

            {
                "advertisement_num":0,
                "advertisement_info":[

                ],
                "appmsgstat":{
                    "show":true,
                    "is_login":true,
                    "liked":false,
                    "read_num":38785,
                    "like_num":99,
                    "ret":0,
                    "real_read_num":0
                },
                "comment_enabled":1,
                "reward_head_imgs":[

                ],
                "only_fans_can_comment":false,
                "is_ios_reward_open":0,
                "base_resp":{
                    "wxtoken":3465907592
                }
            }
        :param req_url: request URL carrying the ``mid`` and ``idx``
            parameters
        :return: None
        """
        log.debug('获取观看和点赞量')

        clean_url = req_url.replace('amp;', '')

        # mid: id of the image-text message (shared by messages published
        # the same day); idx: position of the message, starting at 1.
        # Concatenated they identify one article, e.g. mid = 2650492260,
        # idx = 1 -> article_id = 26504922601
        article_id = (tools.get_param(clean_url, 'mid')
                      + tools.get_param(clean_url, 'idx'))

        payload = tools.get_json(data)
        stats = payload.get('appmsgstat', {})

        # Cache the read and like counts on the article record.
        cached_article = WechatAction._article_info[article_id]
        cached_article['read_num'] = stats.get('read_num')
        cached_article['like_num'] = stats.get('like_num')

        if payload.get('comment_enabled'):
            return

        # No comment section: get_comment will not be requested, so store
        # the article now.
        self._wechat_service.add_article_info(
            WechatAction._article_info.pop(article_id))
Esempio n. 6
0
    def deal_article(self, req_url, text):
        """
        Parse an article page and store the extracted record.

        :param req_url: article URL; the ``sn`` and ``__biz`` query
            parameters are read from it.
        :param text: HTML of the article page. When it is empty the task for
            ``sn`` is updated with state -1 and nothing is parsed.
        :return: None when ``text`` is empty, otherwise the result of
            ``self._task_manager.get_task()``.
        """
        sn = tools.get_param(req_url, 'sn')

        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        )
        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default='').strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default='').strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default='').strip()

        # Raw strings: "\d" in a plain literal is an invalid escape sequence.
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None

        # Image URLs found inside the content node.
        pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
        biz = tools.get_param(req_url, '__biz')

        # Values embedded in the page's inline JavaScript.
        digest = selector.re_first('var msg_desc = "(.*?)"')
        cover = (selector.re_first('var cover = "(.*?)";')
                 or selector.re_first('msg_cdn_url = "(.*?)"'))
        source_url = selector.re_first("var msg_source_url = '(.*?)';")

        content_html = content.extract_first(default='')
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            'account': account,
            'title': title,
            'url': req_url,
            'author': author,
            'publish_time': publish_time,
            '__biz': biz,
            'digest': digest,
            'cover': cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date(),
        }

        # Store the record; the task is updated with state 1 only when
        # save_article returns something other than None.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
Esempio n. 7
0
    def get_article_content(self, data, req_url):
        """
        Cache the content of an article page, then open the next queued page.

        :param data: HTML of the article page; when empty nothing is done
        :param req_url: article URL carrying ``mid``/``appmsgid`` and
            ``idx``/``itemidx``
        :return: None when ``data`` is empty, otherwise the result of
            ``self.__open_next_page()``
        """
        log.debug('获取文章内容')

        # Articles flagged as unverified return no content on the first
        # request and redirect to https://mp.weixin.qq.com/mp/rumor;
        # there is no article content to handle in that case.
        if not data:
            return None

        req_url = req_url.replace('amp;', '')
        # mid: id of the image-text message (shared by messages published
        # the same day).
        mid = (tools.get_param(req_url, 'mid')
               or tools.get_param(req_url, 'appmsgid'))
        # idx: position of the image-text message, starting at 1.
        idx = (tools.get_param(req_url, 'idx')
               or tools.get_param(req_url, 'itemidx'))
        # mid and idx concatenated identify one article, e.g.
        # mid = 2650492260, idx = 1 -> article_id = 26504922601
        article_id = mid + idx
        # Remember the current article id so the comment handler can find
        # the matching article.
        WechatAction._current_aritcle_id = article_id
        print('当前id' + WechatAction._current_aritcle_id)

        title_regex = '<title>(.*?)</title>'
        content = tools.get_info(
            data, '(<div class="rich_media_content ".*?)<script nonce',
            fetch_one=True)
        if content:
            # Cache the article content, then the account name taken from
            # the page title.
            article = WechatAction._article_info[article_id]
            article['content'] = content
            article['account'] = tools.get_info(
                data, title_regex, fetch_one=True)
        else:
            # Unverified article: the read/like counts will not be
            # requested, so store it right away with the page title as
            # content.
            title = tools.get_info(data, title_regex, fetch_one=True)
            WechatAction._article_info[article_id]['content'] = title

            print('被验证不实的文章,不会请求观看点赞数,此时直接入库')
            WechatAction._wechat_service.add_article_info(
                WechatAction._article_info.pop(article_id))

        # If the next page is an article-list link, refresh its appmsg_token
        # so the list link does not expire.
        pending_urls = WechatAction._todo_urls
        if len(pending_urls) == 1 and '/mp/profile_ext' in pending_urls[-1]:
            appmsg_token = tools.get_info(
                data, 'appmsg_token = "(.*?)"', fetch_one=True).strip()
            pending_urls[-1] = tools.replace_str(
                pending_urls[-1], 'appmsg_token=.*?&',
                'appmsg_token=%s&' % appmsg_token)

        return self.__open_next_page()
Esempio n. 8
0
    def get_comment(self, data, req_url):
        """
        Attach the featured comments to the cached article and store it.

        :param data: response body of the comment request
        :param req_url: request URL carrying ``appmsgid`` and ``idx``
        :return: None
        """
        log.debug('获取评论信息')

        clean_url = req_url.replace('amp;', '')
        # appmsgid: id of the image-text message (shared by messages
        # published the same day); idx: position of the message, starting
        # at 1. Concatenated they identify one article, e.g.
        # mid = 2650492260, idx = 1 -> article_id = 26504922601
        article_id = (tools.get_param(clean_url, 'appmsgid')
                      + tools.get_param(clean_url, 'idx'))

        # Featured comments.
        elected = tools.get_json(data).get('elected_comment', [])

        # Cache the comments on the article, then move the article from the
        # cache into storage.
        WechatAction._article_info[article_id]['comment'] = elected
        WechatAction._wechat_service.add_article_info(
            WechatAction._article_info.pop(article_id))
Esempio n. 9
0
    def __parse_account_info(self, data, req_url):
        """
        Extract the account profile from the page source and save it.

        :param data: page source searched with the regexes below
        :param req_url: request URL; its ``__biz`` parameter is stored with
            the account
        :return: None
        """
        __biz = tools.get_param(req_url, "__biz")

        account = tools.get_info(
            data, 'id="nickname">(.*?)</strong>', fetch_one=True).strip()

        head_url = tools.get_info(
            data, 'profile_avatar">.*?<img src="(.*?)"', fetch_one=True)

        summary = tools.get_info(
            data, 'class="profile_desc">(.*?)</p>', fetch_one=True).strip()

        # Verification info (absent when the history messages are opened
        # directly from a followed account).
        verify = tools.get_info(
            data, '<i class="icon_verify success">.*?</i>(.*?)</span>',
            fetch_one=True)
        verify = verify.strip() if verify else ""

        # QR code. The "||" must be escaped in the pattern; the raw string
        # keeps "\|" from being an invalid string escape.
        qr_code = tools.get_info(
            data, r'var username = "" \|\| "(.*?)";', fetch_one=True)
        # NOTE(review): this concatenation and the opening of account_data
        # were reconstructed from a corrupted line; confirm against the
        # upstream source.
        qr_code = "http://open.weixin.qq.com/qr/code?username=" + qr_code

        account_data = {
            "__biz": __biz,
            "account": account,
            "head_url": head_url,
            "summary": summary,
            "qr_code": qr_code,
            "verify": verify,
            "spider_time": tools.get_current_date(),
        }

        if account_data:
            data_pipeline.save_account(account_data)
Esempio n. 10
0
    def deal_article_dynamic_info(self, req_data, text):
        """
        Collect an article's dynamic counters (reads, likes, comments) and
        save them.

        :param req_data: data of the POST request, as a str; ``sn`` and
            ``__biz`` are read from it
        :param text: response body, decoded with ``tools.get_json``
        :return: None
        """
        payload = tools.get_json(text)

        sn = tools.get_param(req_data, "sn")
        # "%3D" in the __biz value is replaced with "=".
        biz = tools.get_param(req_data, "__biz").replace("%3D", "=")
        read_num = payload.get("appmsgstat", {}).get("read_num")
        like_num = payload.get("appmsgstat", {}).get("like_num")
        comment_count = payload.get("comment_count")

        record = dict(
            sn=sn,
            __biz=biz,
            read_num=read_num,
            like_num=like_num,
            comment_count=comment_count,
            spider_time=tools.get_current_date(),
        )

        if not record:
            return

        data_pipeline.save_article_dynamic(record)
Esempio n. 11
0
    def deal_article_dynamic_info(self, req_data, text):
        """
        Read an article's dynamic info (read count, like count, comment
        count) from a response and pass it to the data pipeline.

        :param req_data: data of the POST request, in str form
        :param text: response body
        :return: None
        """
        data = tools.get_json(text)

        def stat(name):
            # Counter from the "appmsgstat" object, None when absent.
            return data.get('appmsgstat', {}).get(name)

        dynamic_row = dict(
            sn=tools.get_param(req_data, 'sn'),
            # "%3D" in the __biz value is replaced with "=".
            __biz=tools.get_param(req_data, '__biz').replace('%3D', '='),
            read_num=stat('read_num'),
            like_num=stat('like_num'),
            comment_count=data.get('comment_count'),
            spider_time=tools.get_current_date()
        )

        if dynamic_row:
            data_pipeline.save_article_dynamic(dynamic_row)
Esempio n. 12
0
    def __parse_account_info(self, data, req_url):
        """
        Extract the account profile from the page source and store it when
        the account is not stored yet.

        :param data: page source searched with the regexes below
        :param req_url: request URL; its ``__biz`` parameter identifies the
            account and is remembered as the current account
        :return: None
        """
        __biz = tools.get_param(req_url, '__biz')
        WechatAction._current_account_biz = __biz

        account = tools.get_info(
            data, 'id="nickname">(.*?)</strong>', fetch_one=True).strip()

        head_url = tools.get_info(
            data, 'profile_avatar">.*?<img src="(.*?)"', fetch_one=True)

        summary = tools.get_info(
            data, 'class="profile_desc">(.*?)</p>', fetch_one=True).strip()

        # Verification info (absent when the history messages are opened
        # directly from a followed account).
        verify = tools.get_info(
            data, '<i class="icon_verify success">.*?</i>(.*?)</span>',
            fetch_one=True)
        verify = verify.strip() if verify else ''

        # QR code. The "||" must be escaped in the pattern; the raw string
        # keeps "\|" from being an invalid string escape.
        qr_code = tools.get_info(
            data, r'var username = "" \|\| "(.*?)";', fetch_one=True)
        # NOTE(review): this concatenation and the opening of account_info
        # were reconstructed from a corrupted line; confirm against the
        # upstream source.
        qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

        account_info = {
            '__biz': __biz,
            'account': account,
            'head_url': head_url,
            'summary': summary,
            'qr_code': qr_code,
            'verify': verify,
            # Take (and remove) the account id cached for this __biz,
            # falling back to '' when none is cached.
            'account_id': WechatAction._account_info.pop(__biz)
            if __biz in WechatAction._account_info else '',
            'record_time': tools.get_current_date(),
        }

        if not WechatAction._wechat_service.is_exist('wechat_account', __biz):
            WechatAction._wechat_service.add_account_info(account_info)
Esempio n. 13
0
        def parse_article_info(article_info, comm_msg_info):
            """
            Build the record for one article entry.

            :param article_info: dict describing one article; may be empty
                or None
            :param comm_msg_info: dict holding the message-level ``type``
                and ``datetime`` values
            :return: the article dict, or None when ``article_info`` is
                empty or the article URL carries no ``sn`` parameter
            """
            if not article_info:
                return None

            # log.debug(tools.dumps_json(article_info))

            # Missing or null URL-like fields are treated as "" so that
            # .replace() cannot fail on None.
            url = (article_info.get("content_url") or "").replace(
                "\\", "").replace("amp;", "")
            # Link of the quoted (source) article.
            source_url = (article_info.get("source_url") or "").replace(
                "\\", "")
            cover = (article_info.get("cover") or "").replace("\\", "")

            # Named msg_type to avoid shadowing the builtin ``type``.
            msg_type = comm_msg_info.get("type")
            publish_time = tools.timestamp_to_date(
                comm_msg_info.get("datetime"))
            sn = tools.get_param(url, "sn")

            if not sn:
                return None

            # Article info to be cached by the caller.
            return {
                "title": article_info.get("title"),
                "digest": article_info.get("digest"),
                "url": url,
                "source_url": source_url,
                "cover": cover,
                "subtype": article_info.get("subtype"),
                "is_multi": article_info.get("is_multi"),
                "author": article_info.get("author"),
                "copyright_stat": article_info.get("copyright_stat"),
                "duration": article_info.get("duration"),
                "del_flag": article_info.get("del_flag"),
                "type": msg_type,
                "publish_time": publish_time,
                "sn": sn,
                "__biz": __biz,
                "spider_time": tools.get_current_date(),
            }
Esempio n. 14
0
        def parse_article_info(article_info, comm_msg_info):
            """
            Build the record for one article entry.

            :param article_info: dict describing one article; may be empty
                or None
            :param comm_msg_info: dict holding the message-level ``type``
                and ``datetime`` values
            :return: the article dict, or None when ``article_info`` is
                empty or the article URL carries no ``sn`` parameter
            """
            if not article_info:
                return None

            # log.debug(tools.dumps_json(article_info))

            def unescaped(key):
                # Strip the JSON backslash escapes from a URL-like field.
                # Missing or null values become '' so .replace() cannot
                # fail on None.
                return (article_info.get(key) or '').replace('\\', '')

            title = article_info.get('title')
            digest = article_info.get('digest')
            url = unescaped('content_url').replace('amp;', '')
            source_url = unescaped('source_url')  # link of the quoted article
            cover = unescaped('cover')
            subtype = article_info.get('subtype')
            is_multi = article_info.get('is_multi')
            author = article_info.get('author')
            copyright_stat = article_info.get('copyright_stat')
            duration = article_info.get('duration')
            del_flag = article_info.get('del_flag')
            # Named message_type to avoid shadowing the builtin ``type``.
            message_type = comm_msg_info.get('type')
            publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
            sn = tools.get_param(url, 'sn')

            if not sn:
                return None

            # Article info to be cached by the caller.
            article_data = {
                'title': title,
                'digest': digest,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'subtype': subtype,
                'is_multi': is_multi,
                'author': author,
                'copyright_stat': copyright_stat,
                'duration': duration,
                'del_flag': del_flag,
                'type': message_type,
                'publish_time': publish_time,
                'sn': sn,
                '__biz': __biz,
                'spider_time': tools.get_current_date(),
            }

            return article_data
Esempio n. 15
0
    def __parse_account_info(self, data, req_url):
        """
        Extract the account profile from the page source and save it.

        :param data: page source searched with the regexes below
        :param req_url: request URL; its ``__biz`` parameter is stored with
            the account
        :return: None
        """
        __biz = tools.get_param(req_url, '__biz')

        regex = 'id="nickname">(.*?)</strong>'
        account = tools.get_info(data, regex, fetch_one=True).strip()

        regex = 'profile_avatar">.*?<img src="(.*?)"'
        head_url = tools.get_info(data, regex, fetch_one=True)

        regex = 'class="profile_desc">(.*?)</p>'
        summary = tools.get_info(data, regex, fetch_one=True).strip()

        # Verification info (absent when the history messages are opened
        # directly from a followed account).
        regex = '<i class="icon_verify success">.*?</i>(.*?)</span>'
        verify = tools.get_info(data, regex, fetch_one=True)
        verify = verify.strip() if verify else ''

        # QR code. The "||" must be escaped in the pattern; the raw string
        # keeps "\|" from being an invalid string escape.
        regex = r'var username = "" \|\| "(.*?)";'
        qr_code = tools.get_info(data, regex, fetch_one=True)
        # NOTE(review): this concatenation and the opening of account_data
        # were reconstructed from a corrupted line; confirm against the
        # upstream source.
        qr_code = 'http://open.weixin.qq.com/qr/code?username=' + qr_code

        account_data = {
            '__biz': __biz,
            'account': account,
            'head_url': head_url,
            'summary': summary,
            'qr_code': qr_code,
            'verify': verify,
            'spider_time': tools.get_current_date(),
        }

        if account_data:
            data_pipeline.save_account(account_data)
Esempio n. 16
0
    def __parse_article_list(self, article_list, req_url):
        r'''
        Parse one page of the article list and queue the new articles.

        Only messages whose ``comm_msg_info.type`` is 49 are handled. For
        each one the first article (``app_msg_ext_info``) and the additional
        articles (``multi_app_msg_item_list``) are cached and their URLs
        queued, until ``self._is_need_get_more`` becomes False. Finally a
        callback that updates the account's article count is queued.

        :param article_list: article list info as a str, for example::

            {
                "list":[
                    {
                        "comm_msg_info":{
                            "id":1000000513,
                            "type":49,
                            "datetime":1511354167,
                            "fakeid":"3082125093",
                            "status":2,
                            "content":""
                        },
                        "app_msg_ext_info":{
                            "title":"...",
                            "digest":"...",
                            "content":"",
                            "fileid":505083208,
                            "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&amp;mid=2652566858&amp;idx=1&amp;sn=...&amp;chksm=...&amp;scene=27#wechat_redirect",
                            "source_url":"http:\/\/www.cnblogs.com\/xybaby\/p\/7488216.html",
                            "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/...\/0?wx_fmt=jpeg",
                            "subtype":9,
                            "is_multi":1,
                            "multi_app_msg_item_list":[
                                {
                                    "title":"...",
                                    "digest":"...",
                                    "content":"",
                                    "fileid":505083209,
                                    "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&amp;mid=2652566858&amp;idx=2&amp;sn=...&amp;chksm=...&amp;scene=27#wechat_redirect",
                                    "source_url":"https:\/\/mp.weixin.qq.com\/s\/_uD9jY4nXQQ6CtA__dsN8w?scene=25#wechat_redirect",
                                    "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/...\/0?wx_fmt=jpeg",
                                    "author":"",
                                    "copyright_stat":101,
                                    "del_flag":1
                                }
                            ],
                            "author":"",
                            "copyright_stat":100,
                            "del_flag":1
                        }
                    }
                ]
            }
        :param req_url: request URL; its ``__biz`` parameter is used for the
            article-count callback
        :return: None
        '''
        # log.debug(tools.dumps_json(article_list))

        def parse_article_info(article_info, release_time):
            """Cache one article entry from the JSON and queue its URL.

            Sets ``self._is_need_get_more`` to False when the article is
            already stored or is older than allowed.
            """
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))
            title = article_info.get('title')
            summary = article_info.get('digest')
            author = article_info.get('author')

            # Articles deleted by the publisher carry no URL or other
            # details, so missing/null fields are treated as '' rather than
            # letting None.replace() raise.
            url = (article_info.get('content_url') or '').replace('\\', '').replace('amp;', '')
            # Link of the quoted (source) article.
            source_url = (article_info.get('source_url') or '').replace('\\', '')
            cover = (article_info.get('cover') or '').replace('\\', '')

            # No URL (deleted article, so no mid either) or a non-article
            # URL such as a shop link: nothing to store.
            if not url.startswith('http://mp.weixin.qq.com/'):
                return

            # mid: id of the image-text message; messages published on the
            # same day share it.
            mid = tools.get_param(url, 'mid') or tools.get_param(url, 'appmsgid')
            # idx: position of the image-text message, starting at 1.
            idx = tools.get_param(url, 'idx') or tools.get_param(url, 'itemidx')
            # mid and idx concatenated identify one article, e.g.
            # mid = 2650492260, idx = 1 -> article_id = 26504922601
            article_id = mid + idx

            # Stop when the article is already stored or is too old.
            if WechatAction._wechat_service.is_exist('wechat_article', article_id) or (
                    ONLY_TODAY_MSG and release_time < tools.get_current_date('%Y-%m-%d')):
                self._is_need_get_more = False
                return  # do not go further; later articles are discarded

            __biz = tools.get_param(url, '__biz')  # links to the account

            # Cache the article info.
            WechatAction._article_info[article_id] = {
                'article_id': int(article_id),
                'title': title,
                'summary': summary,
                'release_time': release_time,
                'url': url,
                'source_url': source_url,
                'cover': cover,
                'account': '',
                'author': author,
                '__biz': __biz,
                'read_num': None,
                'like_num': None,
                'content': '',
                'comment': [],
                'record_time': tools.get_current_date(),
            }

            # Add the article URL to the crawl queue.
            WechatAction._todo_urls.append(url)

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        for article in article_list.get('list', []):
            comm_msg_info = article.get('comm_msg_info', {})
            # 49 is the common image-text message. Text, voice and video
            # messages use other formats and are not collected here.
            if comm_msg_info.get('type') != 49:
                continue

            release_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))

            # An account can publish several image-text messages at once.
            # First image-text message.
            app_msg_ext_info = article.get('app_msg_ext_info') or {}
            parse_article_info(app_msg_ext_info, release_time)
            if not self._is_need_get_more:
                break

            # Additional image-text messages published with it; the key may
            # be absent, so fall back to an empty list.
            for multi_app_msg_item in app_msg_ext_info.get('multi_app_msg_item_list') or []:
                parse_article_info(multi_app_msg_item, release_time)
                if not self._is_need_get_more:
                    break

            # The break above only leaves the inner loop; leave the outer
            # loop too so no further message is parsed after the stop signal.
            if not self._is_need_get_more:
                break

        # Queue the callback that marks the account as done.
        __biz = tools.get_param(req_url, '__biz')  # links to the account
        WechatAction._todo_urls.append(lambda: WechatAction._wechat_service.update_account_article_num(__biz))
Esempio n. 17
0
    def get_article_list(self, data, req_url):
        '''
        Handle an article-list response and open the next queued page.

        Responses come in two forms:
            1. The first view of the history messages returns HTML that also
               contains the account info.
            2. Pulling down for more returns JSON.
        In both cases the article list itself is JSON with the same layout.

        Crawling approach:
            1. For the HTML form, parse the account info and the embedded
               article list, then build the URL of the next (JSON) page.
            2. For the JSON form, parse the article list and build the URL
               of the next page from ``next_offset``.

        Any exception raised while parsing is logged and the next page is
        opened regardless.

        :param data: response body (HTML or JSON text)
        :param req_url: URL of the request that produced ``data``
        :return: the result of ``self.__open_next_page()``
        '''
        next_page_template = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'

        def queue_next_page(offset, appmsg_token):
            # Build the "load more history" URL and add it to the crawl
            # queue; __biz and pass_ticket come from the current URL.
            WechatAction._todo_urls.append(
                next_page_template.format(
                    __biz=tools.get_param(req_url, '__biz'),
                    offset=offset,
                    pass_ticket=tools.get_param(req_url, 'pass_ticket'),
                    appmsg_token=appmsg_token))

        try:
            # Check for a banned account: banned accounts have no article
            # list.
            if 'list' in data:
                if 'action=home' in req_url:
                    # HTML form: parse the account info first.
                    self.__parse_account_info(data, req_url)

                    # Parse the article list embedded in the HTML.
                    regex = "msgList = '(.*?})';"
                    article_list = tools.get_info(data, regex, fetch_one=True)
                    article_list = article_list.replace('&quot;', '"')
                    self.__parse_article_list(article_list)

                    # Are there more articles? '0' means no more; otherwise
                    # queue the next page when more are still wanted.
                    # Raw string: "\d" in a plain literal is an invalid
                    # escape sequence.
                    regex = r"can_msg_continue = '(\d)'"
                    can_msg_continue = tools.get_info(data,
                                                      regex,
                                                      fetch_one=True)
                    if can_msg_continue != '0' and self._is_need_get_more:
                        # appmsg_token is taken from the HTML.
                        regex = 'appmsg_token = "(.*?)";'
                        appmsg_token = tools.get_info(data,
                                                      regex,
                                                      fetch_one=True)
                        queue_next_page(10, appmsg_token)

                else:  # JSON form
                    data = tools.get_json(data)
                    article_list = data.get('general_msg_list', {})
                    self.__parse_article_list(article_list)

                    # Are there more articles? A falsy value means no more;
                    # otherwise queue the next page when more are wanted.
                    can_msg_continue = data.get('can_msg_continue')
                    if can_msg_continue and self._is_need_get_more:
                        # appmsg_token is taken from the URL, the offset
                        # from the JSON.
                        appmsg_token = tools.get_param(req_url, 'appmsg_token')
                        queue_next_page(data.get('next_offset', 0),
                                        appmsg_token)

            else:  # the account of this __biz has been banned
                pass
        except Exception as e:
            log.error(e)

        return self.__open_next_page()
Esempio n. 18
0
    def deal_article_list(self, req_url, text):
        """
        Parse one page of an account's article history and pick the next task.

        Two response shapes are handled:
            1. The first history page (``action=home`` in ``req_url``) is HTML
               that also carries the account information; the article list
               is embedded in it as an HTML-escaped string (``msgList``).
            2. Later "load more" pages are JSON, with the article list under
               the ``general_msg_list`` key.
        A response body that does not contain "list" is treated as coming
        from a banned account, and the account is marked as a zombie.

        Any exception raised while parsing is logged and swallowed; the
        method then falls back to requesting a task without a URL.

        :param req_url: URL of the request; ``__biz``, ``pass_ticket`` and
            (for JSON pages) ``appmsg_token`` are read from its query string.
        :param text: raw response body (HTML or JSON text).
        :return: the result of ``self._task_manager.get_task`` -- called with
            the next-page URL when another page should be fetched, otherwise
            called without arguments.
        """
        # Single copy of the "load more" endpoint shared by both branches.
        next_page_url_template = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json"
        # The HTML page has no next_offset field; the second page starts at 10.
        first_page_offset = 10

        try:
            __biz = tools.get_param(req_url, "__biz")

            if "list" not in text:
                # Banned accounts have no article list in the response.
                self._task_manager.sign_account_is_zombie(__biz)

            elif "action=home" in req_url:
                # First page (HTML): account info plus the embedded list.
                self.__parse_account_info(text, req_url)

                article_list = tools.get_info(
                    text, r"msgList = '(.*?})';", fetch_one=True)
                article_list = article_list.replace("&quot;", '"')
                # NOTE(review): the return value is only used for its
                # truthiness and for the progress tip; presumably it is the
                # publish time reached so far -- confirm in
                # __parse_article_list.
                publish_time = self.__parse_article_list(
                    article_list, __biz, is_first_page=True)

                can_msg_continue = tools.get_info(
                    text, r"can_msg_continue = '(\d)'", fetch_one=True)
                if can_msg_continue == "0":
                    # Reached the bottom of the list on the very first page.
                    log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    if not new_last_publish_time:
                        # Nothing recorded for this account: stop monitoring.
                        log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                        self._task_manager.sign_account_is_zombie(__biz)
                    else:
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)

                elif publish_time:
                    # More pages exist: appmsg_token lives in the HTML,
                    # pass_ticket in the request URL.
                    appmsg_token = tools.get_info(
                        text, r'appmsg_token = "(.*?)";', fetch_one=True)
                    pass_ticket = tools.get_param(req_url, "pass_ticket")

                    next_page_url = next_page_url_template.format(
                        __biz=__biz,
                        offset=first_page_offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                            first_page_offset, publish_time),
                    )

            else:
                # "Load more" page (JSON).
                data = tools.get_json(text)
                article_list = data.get("general_msg_list", {})
                publish_time = self.__parse_article_list(article_list, __biz)

                if not data.get("can_msg_continue"):
                    # Reached the bottom of the list.
                    log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    self._task_manager.update_account_last_publish_time(
                        __biz, new_last_publish_time)

                elif publish_time:
                    # More pages exist: every parameter except the offset is
                    # carried over from the current request URL.
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    appmsg_token = tools.get_param(req_url, "appmsg_token")
                    offset = data.get("next_offset", 0)

                    next_page_url = next_page_url_template.format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                            offset, publish_time),
                    )

        except Exception as e:
            log.exception(e)

        # No further page for this account (or parsing failed): ask for
        # whatever task comes next.
        return self._task_manager.get_task()
Esempio n. 19
0
    def deal_article(self, req_url, text):
        """
        Parse an article page, store the extracted fields and fetch the next task.

        :param req_url: URL of the article request; ``sn`` and ``__biz`` are
            read from its query string and the URL itself is stored as the
            article's ``url``.
        :param text: HTML of the article page. When empty, the article task
            is updated with state ``-1`` and nothing is parsed.
        :return: ``None`` when ``text`` is empty, otherwise the result of
            ``self._task_manager.get_task()``.
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            # NOTE(review): -1 presumably marks the task as failed -- confirm
            # against the task manager.
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        # The article body appears under one of several container variants.
        content = selector.xpath(
            '//div[@class="rich_media_content "]'
            '|//div[@class="rich_media_content"]'
            '|//div[@class="share_media"]'
        )
        content_html = content.extract_first(default="")
        pics_url = content.xpath(".//img/@src|.//img/@data-src").extract()

        title = selector.xpath(
            '//h2[@class="rich_media_title"]/text()'
        ).extract_first(default="").strip()
        account = selector.xpath(
            '//a[@id="js_name"]/text()'
        ).extract_first(default="").strip()
        author = selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()'
        ).extract_first(default="").strip()

        # A 10-digit value is taken as the publish timestamp and converted
        # with tools.timestamp_to_date; no match leaves publish_time as None.
        raw_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(raw_timestamp) if raw_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)

        # Remaining metadata is read from inline JavaScript variables.
        digest = selector.re_first(r'var msg_desc = "(.*?)"')
        cover = (selector.re_first(r'var cover = "(.*?)";')
                 or selector.re_first(r'msg_cdn_url = "(.*?)"'))
        source_url = selector.re_first(r"var msg_source_url = '(.*?)';")
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            "account": account,
            "title": title,
            "url": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": tools.get_param(req_url, "__biz"),
            "digest": digest,
            "cover": cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date(),
        }

        # Update the task state only when the pipeline returns a non-None
        # result; otherwise the task state is left untouched.
        if data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()