Example #1
    def __init__(self, remote_url, local_save_path, project_path, main_lnk_paths, sync_files = [], ignore_files = []):
        '''
        @summary: initializer for the code updater
        ---------
        @param remote_url: remote code release url
        @param local_save_path: local path the code is downloaded to
        @param project_path: local project path
        @param main_lnk_paths: paths of the shortcuts that launch the local project
        @param sync_files: files to sync; .* means sync everything
        @param ignore_files: files to ignore
        ---------
        @result:
        '''

        self._remote_url = remote_url
        self._local_save_path = local_save_path
        self._project_path = project_path
        self._main_lnk_paths = main_lnk_paths
        self._sync_files = sync_files
        self._ignore_files = ignore_files

        self._remote_zip_url = ''
        self._tag = ''
        self._zip_path = ''
        self._unpack_path = ''

        self._project_name = tools.get_info(remote_url, '/([^/]*?)/releases', fetch_one = True)
        self._tag_json = tools.get_json(tools.read_file(VERSION_FILE)) or {}
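For reference, a minimal usage sketch of this initializer; the class name CodeUpdater and all argument values below are assumptions for illustration, not taken from the original project:

    # Hypothetical instantiation (names and paths are made up):
    updater = CodeUpdater(
        remote_url='https://github.com/example/project/releases',
        local_save_path='D:/updates/',
        project_path='D:/workspace/project/',
        main_lnk_paths=['C:/Users/me/Desktop/project.lnk'],
        sync_files=['.*'],           # sync everything
        ignore_files=['config.py'],  # but keep the local config
    )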
Example #2
    def deal_comment(self, req_url, text):
        data = tools.get_json(text)

        __biz = tools.get_param(req_url, '__biz')

        comment_id = tools.get_param(req_url, 'comment_id')  # links the comment to its article
        elected_comment = data.get('elected_comment', [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get('nick_name'),
                logo_url=comment.get('logo_url'),
                content=comment.get('content'),
                create_time=tools.timestamp_to_date(comment.get('create_time')),
                content_id=comment.get('content_id'),
                like_num=comment.get('like_num'),
                is_top=comment.get('is_top'),
                spider_time=tools.get_current_date()
            )
            for comment in elected_comment
        ]

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
Example #3
    def deal_comment(self, req_url, text):
        """
        解析评论
        :param req_url:
        :param text:
        :return:
        """

        data = tools.get_json(text)

        __biz = tools.get_param(req_url, "__biz")

        comment_id = tools.get_param(req_url, "comment_id")  # links the comment to its article
        elected_comment = data.get("elected_comment", [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get("nick_name"),
                logo_url=comment.get("logo_url"),
                content=comment.get("content"),
                create_time=tools.timestamp_to_date(
                    comment.get("create_time")),
                content_id=comment.get("content_id"),
                like_num=comment.get("like_num"),
                is_top=comment.get("is_top"),
                spider_time=tools.get_current_date(),
            ) for comment in elected_comment
        ]

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
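These handlers rely on tools.get_param to pull a single query parameter out of a URL (or a raw query/POST-data string). A rough regex-based equivalent is sketched below; it is an assumption about the helper's behavior (note it returns the still percent-encoded value, which is why callers replace "%3D" themselves), not the project's actual implementation:

    import re

    def get_param(url, key):
        # Return the raw value of `key` from a URL or query string, '' if absent.
        match = re.search(r'(?:^|[?&]){}=([^&#]*)'.format(re.escape(key)), url)
        return match.group(1) if match else ''

    # get_param('...?__biz=MzA4MjEyNTA5Mw%3D%3D&comment_id=123', 'comment_id')  # -> '123'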
Example #4
    def get_read_watched_count(self, data, req_url):
        '''
        @summary:
        ---------
        @param data:
        {
            "advertisement_num":0,
            "advertisement_info":[

            ],
            "appmsgstat":{
                "show":true,
                "is_login":true,
                "liked":false,
                "read_num":38785,
                "like_num":99,
                "ret":0,
                "real_read_num":0
            },
            "comment_enabled":1,
            "reward_head_imgs":[

            ],
            "only_fans_can_comment":false,
            "is_ios_reward_open":0,
            "base_resp":{
                "wxtoken":3465907592
            }
        }
        @param req_url:
        ---------
        @result:
        '''

        log.debug('获取观看和点赞量')

        req_url = req_url.replace('amp;', '')

        # 2018-04-13: a WeChat update removed the mid and idx params from the URL, so article_id can no longer be assembled here
        # mid = tools.get_param(req_url, 'mid')  # message id; articles pushed on the same day share the same id
        # idx = tools.get_param(req_url, 'idx')  # position of the article within the push, starting at 1
        # article_id = mid + idx  # concatenating mid and idx identifies a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601

        article_id = WechatAction._current_aritcle_id  # take it directly from the cached value

        data = tools.get_json(data)
        read_num = data.get('appmsgstat', {}).get('read_num')
        like_num = data.get('appmsgstat', {}).get('like_num')

        # Cache the article's read and like counts
        WechatAction._article_info[article_id]['read_num'] = read_num
        WechatAction._article_info[article_id]['like_num'] = like_num

        # if not data.get('comment_enabled'):  # no comment section, so get_comment will not be requested; save to the database right away
        WechatAction._wechat_service.add_article_info(
            WechatAction._article_info.pop(article_id))
Example #5
    def __init__(self, table):
        self._record_time = tools.get_json(
            tools.read_file(SYNC_TIME_FILE)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._es = ES()
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)
Example #6
    def get_comment(self, data, req_url):
        log.debug('获取评论信息')

        req_url = req_url.replace('amp;', '')
        mid = tools.get_param(req_url, 'appmsgid')  # message id; articles pushed on the same day share the same id
        idx = tools.get_param(req_url, 'idx')  # position of the article within the push, starting at 1
        article_id = mid + idx  # concatenating mid and idx identifies a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601

        data = tools.get_json(data)
        comment = data.get('elected_comment', [])  # featured comments

        # Cache the article's comment info
        WechatAction._article_info[article_id]['comment'] = comment

        WechatAction._wechat_service.add_article_info(WechatAction._article_info.pop(article_id))
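Since tools.get_param returns strings, article_id here is a string concatenation rather than an arithmetic sum, matching the worked example in the comment above (values are illustrative only):

    mid = '2650492260'  # appmsgid shared by all articles pushed that day
    idx = '1'           # position within the push, starting at 1
    article_id = mid + idx
    assert article_id == '26504922601'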
Example #7
    def __init__(self, table):
        self._sync_time_file = SYNC_TIME_FILE + table + '.txt'
        self._record_time = tools.get_json(
            tools.read_file(self._sync_time_file)) or {}
        self._compare_keywords = CompareKeywords()
        self._summary = Summary()
        self._emotion = Emotion()
        self._word_cloud = WordCloud()
        self._yqtj_es = ES(YQTJ)
        self._data_pool_es = ES(DATA_POOL)
        self._hot_sync = HotSync()
        self._vip_checked = VipChecked()
        self._province_filter = ProvinceFilter()
        self._table = table
        self._per_record_time_key = '{table}_record_time'.format(
            table=self._table)

        self._vip_checked.start()
        self._compare_keywords.start()
Example #8
    def deal_article_dynamic_info(self, req_data, text):
        """
        取文章动态信息 阅读 点赞 评论
        :param req_data: post 请求的data str格式
        :param text:
        :return:
        """
        data = tools.get_json(text)

        dynamic_data = dict(
            sn=tools.get_param(req_data, "sn"),
            __biz=tools.get_param(req_data, "__biz").replace("%3D", "="),
            read_num=data.get("appmsgstat", {}).get("read_num"),
            like_num=data.get("appmsgstat", {}).get("like_num"),
            comment_count=data.get("comment_count"),
            spider_time=tools.get_current_date(),
        )

        if dynamic_data:
            data_pipeline.save_article_dynamic(dynamic_data)
Example #9
    def deal_article_dynamic_info(self, req_data, text):
        """
        取文章动态信息 阅读 点赞 评论
        :param req_data: post 请求的data str格式
        :param text:
        :return:
        """
        data = tools.get_json(text)

        dynamic_data = dict(
            sn=tools.get_param(req_data, 'sn'),
            __biz=tools.get_param(req_data, '__biz').replace('%3D', '='),
            read_num=data.get('appmsgstat', {}).get('read_num'),
            like_num=data.get('appmsgstat', {}).get('like_num'),
            comment_count=data.get('comment_count'),
            spider_time=tools.get_current_date()
        )

        if dynamic_data:
            data_pipeline.save_article_dynamic(dynamic_data)
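The .replace('%3D', '=') call simply undoes the percent-encoding of the trailing '=' padding in the base64 __biz value pulled from the POST data. urllib.parse.unquote would achieve the same thing more generally; the value below is hypothetical and only illustrates the equivalence:

    from urllib.parse import unquote

    raw_biz = 'MzA4MjEyNTA5Mw%3D%3D'  # made-up __biz value as it might appear in req_data
    assert raw_biz.replace('%3D', '=') == unquote(raw_biz) == 'MzA4MjEyNTA5Mw=='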
Example #10
    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        '''
        @summary: parse the article list
        ---------
        @param article_list: article list info, str
        ---------
        @result: publish_time / None (publish_time: keep fetching older messages; None: stop)
        '''

        # log.debug(tools.dumps_json(article_list))

        # Parse the article info inside the json
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get('title')
            digest = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
            source_url = article_info.get('source_url').replace('\\', '')  # link of the quoted source article
            cover = article_info.get('cover').replace('\\', '')
            subtype = article_info.get('subtype')
            is_multi = article_info.get('is_multi')
            author = article_info.get('author')
            copyright_stat = article_info.get('copyright_stat')
            duration = article_info.get('duration')
            del_flag = article_info.get('del_flag')
            type = comm_msg_info.get('type')
            publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
            sn = tools.get_param(url, 'sn')

            if sn:
                # Cache the article info
                article_data = {
                    'title': title,
                    'digest': digest,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'subtype': subtype,
                    'is_multi': is_multi,
                    'author': author,
                    'copyright_stat': copyright_stat,
                    'duration': duration,
                    'del_flag': del_flag,
                    'type': type,
                    'publish_time': publish_time,
                    'sn': sn,
                    '__biz': __biz,
                    'spider_time': tools.get_current_date()
                }

                return article_data

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True
        article_list = article_list.get('list', [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get('comm_msg_info', {})

            publish_timestamp = comm_msg_info.get('datetime')
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # Record the latest publish time
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
                is_first_article = False

                if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):  # on the first page, check whether the latest article is recent; if nothing was published recently, treat it as a zombie account
                    log.info('公众号 {} 为僵尸账号 不再监控'.format(__biz))
                    self._task_manager.sign_account_is_zombie(__biz, publish_time)
                    is_need_get_more = False
                    break

            # Compare times: if we reach the publish time recorded last run, stop
            is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
            if is_reach:
                log.info('采集到上次发布时间 公众号 {} 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break

            elif is_reach is None:
                log.info('公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号'.format(__biz))
                return

            article_type = comm_msg_info.get('type')
            if article_type != 49:  # type 49 is the common image-and-text message; other types (text, voice, video) have a different format and are not collected here
                continue

            # Check whether the publish time is within the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info('公众号 {} 超过采集时间范围 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info('公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集'.format(__biz, publish_time))
                continue

            # Within the time range

            # A WeChat official account can push several image-and-text messages at once
            # The first image-and-text message
            app_msg_ext_info = article.get('app_msg_ext_info', {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # Additional image-and-text messages pushed the same day
            multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list') or []
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time
Example #11
    def get_article_list(self, data, req_url):
        '''
        @summary: fetch the article list
        Two response formats:
            1. The first view of the message history returns html, which also contains the account info
            2. Pulling down to show more returns json
        In both cases the article list itself is json with the same format
        Crawl strategy:
        1. For the first format, parse the article list directly and build the json-format url of the next page
        2. For the second format, do the same using the parameters taken from the json
        ---------
        @param data:
        @param req_url:
        ---------
        @result:
        '''
        try:
            # Check whether the account is banned; banned accounts have no article list
            if 'list' in data:
                # Article list from the html-format response
                if 'action=home' in req_url:
                    # Parse the account info
                    self.__parse_account_info(data, req_url)

                    # Parse the article list
                    regex = "msgList = '(.*?})';"
                    article_list = tools.get_info(data, regex, fetch_one=True)
                    article_list = article_list.replace('&quot;', '"')
                    self.__parse_article_list(article_list)

                    # Check whether there are more articles; if none, move on to the next account, otherwise pull down to show more
                    regex = r"can_msg_continue = '(\d)'"
                    can_msg_continue = tools.get_info(data,
                                                      regex,
                                                      fetch_one=True)
                    if can_msg_continue == '0':  # no more articles
                        pass
                    elif self._is_need_get_more:
                        # Below: build the "pull down for more history" url and jump to it
                        # Get appmsg_token from the html
                        regex = 'appmsg_token = "(.*?)";'
                        appmsg_token = tools.get_info(data,
                                                      regex,
                                                      fetch_one=True)

                        # Get the other parameters from the url
                        __biz = tools.get_param(req_url, '__biz')
                        pass_ticket = tools.get_param(req_url, 'pass_ticket')

                        next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                            __biz=__biz,
                            offset=10,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token)
                        WechatAction._todo_urls.append(next_page_url)

                else:  # json format
                    data = tools.get_json(data)
                    article_list = data.get('general_msg_list', {})
                    self.__parse_article_list(article_list)

                    # Check whether there are more articles; if none, move on to the next account, otherwise pull down to show more
                    can_msg_continue = data.get('can_msg_continue')
                    if not can_msg_continue:  # no more articles
                        pass
                    elif self._is_need_get_more:
                        # Below: build the "pull down for more history" url and jump to it
                        # Get the parameters from the url
                        __biz = tools.get_param(req_url, '__biz')
                        pass_ticket = tools.get_param(req_url, 'pass_ticket')
                        appmsg_token = tools.get_param(req_url, 'appmsg_token')

                        # Get offset from the json
                        offset = data.get('next_offset', 0)

                        next_page_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json'.format(
                            __biz=__biz,
                            offset=offset,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token)
                        WechatAction._todo_urls.append(next_page_url)

            else:  # this __biz account has been banned
                pass
        except Exception as e:
            log.error(e)

        return self.__open_next_page()
Example #12
    def __parse_article_list(self, article_list):
        '''
        @summary: parse the article list
        ---------
        @param article_list: article list info, str
        {
            "list":[
                {
                    "comm_msg_info":{
                        "id":1000000513,
                        "type":49,
                        "datetime":1511354167,
                        "fakeid":"3082125093",
                        "status":2,
                        "content":""
                    },
                    "app_msg_ext_info":{
                        "title":"Python 内存优化",
                        "digest":"实际项目中,pythoner更加关注的是Python的性能问题。本文,关注的是Python的内存优化,一般说来,如果不发生内存泄露,运行在服务端的Python代码不用太关心内存,但是如果运行在客户端,那还是有优化的必要。",
                        "content":"",
                        "fileid":505083208,
                        "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=1&sn=d2a76f4a601f94d8acc7b436d18e9648&chksm=8464dd00b313541684c14f974325ea6ae725ffc901fd9888cc00d1acdd13619de3297a5d9a35&scene=27#wechat_redirect",
                        "source_url":"http:\/\/www.cnblogs.com\/xybaby\/p\/7488216.html",
                        "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQVbRHy3FhzwMHEvCvtzXVicHTaPEu8jZ2pgkCAgBqEHugYMvzg3tpoww\/0?wx_fmt=jpeg",
                        "subtype":9,
                        "is_multi":1,
                        "multi_app_msg_item_list":[
                            {
                                "title":"面向对象:With the wonder of your love, the sun above always shines",
                                "digest":"With the wonder of your love, the sun above always shines",
                                "content":"",
                                "fileid":505083209,
                                "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&mid=2652566858&idx=2&sn=97f223783da7748080f8103654447c99&chksm=8464dd00b313541601938565a41487ea76209331fd6f4c8996a2ff5572f4fd465de9fa4cbaac&scene=27#wechat_redirect",
                                "source_url":"https:\/\/mp.weixin.qq.com\/s\/_uD9jY4nXQQ6CtA__dsN8w?scene=25#wechat_redirect",
                                "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQ5ukvwH1GPq5zlWxv05WvRiaw6BiaeyGRD1w17nAPGTlQgEvvDuZnB9HA\/0?wx_fmt=jpeg",
                                "author":"",
                                "copyright_stat":101,
                                "del_flag":1
                            }
                        ],
                        "author":"",
                        "copyright_stat":100,
                        "del_flag":1
                    }
                }
            ]
        }
        ---------
        @result:
        '''

        # log.debug(tools.dumps_json(article_list))

        # Parse the article info inside the json
        def parse_article_info(article_info, release_time):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))
            title = article_info.get('title')
            summary = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace(
                'amp;', '')
            source_url = article_info.get('source_url').replace('\\',
                                                                '')  # link of the quoted source article
            cover = article_info.get('cover').replace('\\', '')
            author = article_info.get('author')
            if url and url.startswith(
                    'http://mp.weixin.qq.com/'
            ):  # articles deleted by the publisher have no url or other info, so mid is unavailable and they are not saved; shop-style urls are not saved either
                mid = tools.get_param(url, 'mid') or tools.get_param(
                    url, 'appmsgid')  # message id; articles pushed on the same day share the same id
                idx = tools.get_param(url, 'idx') or tools.get_param(
                    url, 'itemidx')  # position of the article within the push, starting at 1
                article_id = mid + idx  # concatenating mid and idx identifies a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601

                # Check whether the article already exists in the database
                if WechatAction._wechat_service.is_exist(
                        'wechat_article',
                        article_id) or (ONLY_TODAY_MSG and release_time <
                                        tools.get_current_date('%Y-%m-%d')):
                    self._is_need_get_more = False
                    return  # stop here and discard the remaining articles

                __biz = tools.get_param(url, '__biz')  # used to link back to the official account

                # Cache the article info
                WechatAction._article_info[article_id] = {
                    'article_id': int(article_id),
                    'title': title,
                    'summary': summary,
                    'release_time': release_time,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'account': '',
                    'author': author,
                    '__biz': __biz,
                    'read_num': None,
                    'like_num': None,
                    'content': '',
                    'comment': [],
                    'record_time': tools.get_current_date()
                }

                # Add the article url to the queue of urls to crawl
                WechatAction._todo_urls.append(url)

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list = article_list.get('list', [])
        for article in article_list:
            article_type = article.get('comm_msg_info', {}).get('type')
            if article_type != 49:  # type 49 is the common image-and-text message; other types (text, voice, video) have a different format and are not collected here
                continue

            release_time = article.get('comm_msg_info', {}).get('datetime')
            release_time = tools.timestamp_to_date(release_time)

            # A WeChat official account can push several image-and-text messages at once
            # The first image-and-text message
            app_msg_ext_info = article.get('app_msg_ext_info', {})
            parse_article_info(app_msg_ext_info, release_time)

            if not self._is_need_get_more:
                break

            # Additional image-and-text messages pushed the same day
            multi_app_msg_item_list = app_msg_ext_info.get(
                'multi_app_msg_item_list') or []
            for multi_app_msg_item in multi_app_msg_item_list:
                parse_article_info(multi_app_msg_item, release_time)

                if not self._is_need_get_more:
                    break
Example #13
    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        """
        @summary: 解析文章列表
        ---------
        @param article_list: 文章列表信息 str
        ---------
        @result: True / None (True: 继续向下抓取; None: 停止向下抓取)
        """

        # log.debug(tools.dumps_json(article_list))

        # Parse the article info inside the json
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get("title")
            digest = article_info.get("digest")
            url = article_info.get("content_url").replace("\\", "").replace(
                "amp;", "")
            source_url = article_info.get("source_url").replace("\\",
                                                                "")  # link of the quoted source article
            cover = article_info.get("cover").replace("\\", "")
            subtype = article_info.get("subtype")
            is_multi = article_info.get("is_multi")
            author = article_info.get("author")
            copyright_stat = article_info.get("copyright_stat")
            duration = article_info.get("duration")
            del_flag = article_info.get("del_flag")
            type = comm_msg_info.get("type")
            publish_time = tools.timestamp_to_date(
                comm_msg_info.get("datetime"))
            sn = tools.get_param(url, "sn")

            if sn:
                # Cache the article info
                article_data = {
                    "title": title,
                    "digest": digest,
                    "url": url,
                    "source_url": source_url,
                    "cover": cover,
                    "subtype": subtype,
                    "is_multi": is_multi,
                    "author": author,
                    "copyright_stat": copyright_stat,
                    "duration": duration,
                    "del_flag": del_flag,
                    "type": type,
                    "publish_time": publish_time,
                    "sn": sn,
                    "__biz": __biz,
                    "spider_time": tools.get_current_date(),
                }

                return article_data

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True
        article_list = article_list.get("list", [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get("comm_msg_info", {})

            publish_timestamp = comm_msg_info.get("datetime")
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # Record the latest publish time
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(
                    __biz, publish_time)
                is_first_article = False

                if publish_timestamp and self._task_manager.is_zombie_account(
                        publish_timestamp):  # on the first page, check whether the latest article is recent; if nothing was published recently, treat it as a zombie account
                    log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                    self._task_manager.sign_account_is_zombie(
                        __biz, publish_time)
                    is_need_get_more = False
                    break

            # Compare times: if we reach the publish time recorded last run, stop
            is_reach = self._task_manager.is_reach_last_article_publish_time(
                __biz, publish_time)
            if is_reach:
                log.info("采集到上次发布时间 公众号 {} 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break

            elif is_reach is None:
                log.info(
                    "公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号".format(__biz))
                return

            article_type = comm_msg_info.get("type")
            if article_type != 49:  # type 49 is the common image-and-text message; other types (text, voice, video) have a different format and are not collected here
                continue

            # Check whether the publish time is within the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(
                __biz, publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info("公众号 {} 超过采集时间范围 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info("公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集".format(
                    __biz, publish_time))
                continue

            # Within the time range

            # A WeChat official account can push several image-and-text messages at once
            # The first image-and-text message
            app_msg_ext_info = article.get("app_msg_ext_info", {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # Additional image-and-text messages pushed the same day
            multi_app_msg_item_list = app_msg_ext_info.get(
                "multi_app_msg_item_list") or []
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item,
                                                  comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time
Example #14
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
    "Host": "weixin.sogou.com",
    "Connection": "keep-alive",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)

    html, request = tools.get_html_by_requests(root_url, headers = headers, proxies = proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    print(root_url)
    log.debug('取文章链接' + check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Official account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one = True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one = True)
    account_url = account_url.replace('&amp;',"&")
    log.debug('account_url = ' + account_url)

    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # using a proxy triggers a captcha, so skip proxies for now

    html, request = tools.get_html_by_requests(account_url, headers = headers, proxies = proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
        '''%(check_info, account_url, request.headers))
    # print(html)

    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one = True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', {})
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title' : title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;',"&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one = True)

        # Get the images inside content, download them, then replace the original image urls in the content
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + (image[image.find('wx_fmt=') + len('wx_fmt='):(image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1 else None)] if 'wx_fmt=' in image else 'jpg')
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

        # Articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # Download the image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;',"&")

            content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one = True)

            # Get the images inside content, download them, then replace the original image urls in the content
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(date_format = '%Y-%m-%d') + "/" + tools.get_current_date(date_format = '%Y%m%d%H%M%S.%f') + '.' + (image[image.find('wx_fmt=') + len('wx_fmt='):(image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) if image.find('&', image.find('wx_fmt=') + len('wx_fmt=')) != -1 else None)] if 'wx_fmt=' in image else 'jpg')
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
Example #15
    def deal_article_list(self, req_url, text):
        """
        @summary: 获取文章列表
        分为两种
            1、第一次查看历史消息 返回的是html格式 包含公众号信息
            2、下拉显示更多时 返回json格式
        但是文章列表都是json格式 且合适相同
        抓取思路:
        1、如果是第一种格式,直接解析文章内容,拼接下一页json格式的地址
        2、如果是第二种格式,
        ---------
        @param data:
        ---------
        @result:
        """
        try:
            # Check whether the account is banned; banned accounts have no article list
            __biz = tools.get_param(req_url, "__biz")

            if "list" in text:
                # Article list from the html-format response
                if "action=home" in req_url:
                    # Parse the account info
                    self.__parse_account_info(text, req_url)

                    # Parse the article list
                    regex = "msgList = '(.*?})';"
                    article_list = tools.get_info(text, regex, fetch_one=True)
                    article_list = article_list.replace("&quot;", '"')
                    publish_time = self.__parse_article_list(
                        article_list, __biz, is_first_page=True)

                    # Check whether there are more articles; if none, move on to the next account, otherwise pull down to show more
                    regex = r"can_msg_continue = '(\d)'"
                    can_msg_continue = tools.get_info(text,
                                                      regex,
                                                      fetch_one=True)
                    if can_msg_continue == "0":  # no more articles
                        log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                            __biz)
                        if not new_last_publish_time:
                            # Mark it as a zombie account
                            log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                            self._task_manager.sign_account_is_zombie(__biz)
                        else:
                            self._task_manager.update_account_last_publish_time(
                                __biz, new_last_publish_time)

                    elif publish_time:
                        # Below: build the "pull down for more history" url and jump to it
                        # Get appmsg_token from the html
                        regex = 'appmsg_token = "(.*?)";'
                        appmsg_token = tools.get_info(text,
                                                      regex,
                                                      fetch_one=True)

                        # Get the other parameters from the url
                        __biz = tools.get_param(req_url, "__biz")
                        pass_ticket = tools.get_param(req_url, "pass_ticket")

                        next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                            __biz=__biz,
                            offset=10,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token,
                        )
                        return self._task_manager.get_task(
                            next_page_url,
                            tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                                10, publish_time),
                        )

                else:  # json format
                    text = tools.get_json(text)
                    article_list = text.get("general_msg_list", {})
                    publish_time = self.__parse_article_list(
                        article_list, __biz)

                    # Check whether there are more articles; if none, move on to the next account, otherwise pull down to show more
                    can_msg_continue = text.get("can_msg_continue")
                    if not can_msg_continue:  # no more articles
                        log.info("抓取到列表底部 无更多文章,公众号 {} 抓取完毕".format(__biz))
                        new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                            __biz)
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)
                        pass

                    elif publish_time:
                        # Below: build the "pull down for more history" url and jump to it
                        # Get the parameters from the url
                        __biz = tools.get_param(req_url, "__biz")
                        pass_ticket = tools.get_param(req_url, "pass_ticket")
                        appmsg_token = tools.get_param(req_url, "appmsg_token")

                        # Get offset from the json
                        offset = text.get("next_offset", 0)

                        next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                            __biz=__biz,
                            offset=offset,
                            pass_ticket=pass_ticket,
                            appmsg_token=appmsg_token,
                        )
                        return self._task_manager.get_task(
                            next_page_url,
                            tip="正在抓取列表 next_offset {} 抓取到 {}".format(
                                offset, publish_time),
                        )

            else:  # this __biz account has been banned
                self._task_manager.sign_account_is_zombie(__biz)
                pass

        except Exception as e:
            log.exception(e)

        return self._task_manager.get_task()
Example #16
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def get_tkey(t):
        def ror(val, key):
            i = 0
            while (i < key):
                val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
                i += 1

            return val

        key = 185025305
        val = ror(t, key % 17)
        val = val ^ key
        return val
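    # get_tkey builds the player token expected by the playJson endpoint:
    # ror() performs a 32-bit rotate-right of the timestamp by (key % 17) bit
    # positions, and the rotated value is then XOR'ed with the fixed key
    # 185025305. It is called below as get_tkey(int(time.time())) when
    # assembling episode_download_url_json.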

    def getdownload(episode_download_url_json):
        episode_json = tools.get_json_by_requests(episode_download_url_json)
        #print(episode_download_url_json)
        episode_download_url = tools.get_json_value(episode_json,
                                                    'msgs.playurl.domain')
        episode_download_url = episode_download_url and episode_download_url[
            0] or ''
        #print('-----',episode_download_url)
        episode_download_url_definition = tools.get_json_value(
            episode_json, 'msgs.playurl.dispatch.1080p')
        episode_download_url_definition = episode_download_url_definition and episode_download_url_definition[
            0] or ''
        print(episode_download_url_definition, '*********')
        episode_download_url = episode_download_url + episode_download_url_definition
        episode_download_url += "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}".format(
            random.random(), '1080p')
        episode_download_url_json = tools.get_json_by_requests(
            episode_download_url)
        episode_download_url = tools.get_json_value(episode_download_url_json,
                                                    'location')
        return episode_download_url

    if depth == 0:
        cs_regex = 'cs(.*?)_'
        o_regex = 'cs.*?_o(.*?)_p'
        cs = tools.get_info(source_url, cs_regex)
        cs_value = cs and cs[0] or ''
        o = tools.get_info(source_url, o_regex)
        o_value = o and o[0] or ''
        #print('1'+o_value+'2','***', cs_value)
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)
    if depth == 1:
        page = '1'
        # TV series
        if 'cs=2' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                #print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/tv/' + program_url + '.html'
                    episode = tools.get_json_value(info, 'nowEpisodes')

                    directors = tools.get_json_value(info, 'directory')
                    #print(type(directors))
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug(
                        '''
                                    depth                       = %s
                                    program_name                = %s
                                    program_url                 = %s
                                    image_url                   = %s
                                    episode                     = %s
                                    directors                   = %s
                                    actors                      = %s
                                    summary                     = %s
                                    release_time                = %s
                                 ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(\d*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:

                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                        episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time())))

                        episode_json = tools.get_json_by_requests(
                            episode_download_url_json)

                        episode_image_url = tools.get_json_value(
                            episode_json, 'msgs.playurl.pic')

                        episode_name = tools.get_json_value(
                            episode_json, 'msgs.playurl.title')

                        episode_num_regex = "(\d*?)"
                        episode_num = tools.get_info(episode_name,
                                                     episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(
                            episode_download_url_json)

                        time_length = ''

                        episode_summary = ''

                        download_status = ''

                        log.debug(
                            '''
                                                   depth                       = %s
                                                   episode_num                 = %s
                                                   time_length                 = %s
                                                   episode_name                = %s
                                                   episode_url                 = %s
                                                   download_url                = %s
                                                   episode_summary             = %s
                                                   episode_image_url           = %s

                                                ''' %
                            (depth, episode_num, time_length, episode_name,
                             episode_url, episode_download_url,
                             episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id,
                            episode_num, time_length, episode_name,
                            download_status, episode_download_url, episode_url,
                            episode_summary, episode_image_url, '')

                page = str(int(page) + 1)

                if not json_list:
                    return False

        # Sports
        if 'cs=4' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://sports.le.com/video/' + episode_url_num + '.html'

                    # Get the vid, i.e. the numeric video id
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    program_url = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
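                    # aid appears to be the album id; it is passed below as the program record's _id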
                    aid = tools.get_json_value(info, 'aid')
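                    # 102 presumably means "not downloaded yet" (101 is set after a successful download elsewhere in this file)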
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                    ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                            ''' %
                              (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')
                page = str(int(page) + 1)

                if not video_list:
                    return False

        # Variety shows
        if 'cs=11' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # Get the vid, i.e. the numeric video id
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                    ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                    ''' % (depth, episode_num, time_length, episode_name,
                           episode_url, episode_download_url, episode_summary,
                           episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')
                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Music
        if 'cs=9' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')

                    episode_num = tools.get_json_value(info, 'name')
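                    # Music titles appear to start with a track number followed by ":"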
                    regex_episode_num = '(\d*?):'
                    episode_num = tools.get_info(episode_num,
                                                 regex_episode_num)
                    episode_num = ''.join(episode_num)

                    episode_summary = tools.get_json_value(info, 'description')

                    episode_image_url = tools.get_json_value(
                        info, 'images.1080*608')

                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                    # Get the vid, i.e. the numeric video id
                    episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                        episode_url_num, get_tkey(int(time.time())))

                    episode_download_url = getdownload(
                        episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')

                    summary = ''
                    actors = tools.get_json_value(info, 'actor').values()
                    actors = ' '.join(actors)
                    #print('**********', actors)

                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug(
                        '''
                                depth                       = %s
                                program_name                = %s
                                program_url                 = %s
                                image_url                   = %s
                                episode                     = %s
                                directors                   = %s
                                actors                      = %s
                                summary                     = %s
                                release_time                = %s
                                aid                         = %s
                              ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time,
                        _id=aid)
                    log.debug('''
                               depth                       = %s
                               episode_num                 = %s
                               time_length                 = %s
                               episode_name                = %s
                               episode_url                 = %s
                               download_url                = %s
                               episode_summary             = %s
                               episode_image_url           = %s
                            ''' %
                              (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url,
                               episode_summary, episode_image_url))
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id,
                        episode_num, time_length, episode_name,
                        download_status, episode_download_url, episode_url,
                        episode_summary, episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False
    # #     # Get the article info on the current page
    # #     # Title
    # #
    # #
    # #     everyone_html = tools.get_html_by_requests(program_url)
    # #
    # #     regexs_directors = '<span class="editor" style="color:#333;">(.*?)</span>'
    # #     directors = tools.get_info(everyone_html, regexs_directors)
    # #     directors = directors and directors[0] or ''
    # #
    # #     # Summary
    # #     regexs_summary = '<p class="summaryList_long">(.*?)</p>'
    # #     summary = tools.get_info(everyone_html, regexs_summary)
    # #     summary = summary and summary[0] or ''
    # #
    # #     # Release time
    # #     regexs_release_time = ' <dt>发布时间:</dt>.*?<dd>(.*?)</dd>'
    # #     release_time = tools.get_info(everyone_html, regexs_release_time)
    # #     release_time = release_time and release_time[0] or ''
    # #
    # #     # Download URL
    # #     regexs_download_url = 'videoUrl=(.*?)"'
    # #     download_url = tools.get_info(everyone_html, regexs_download_url)
    # #     download_url = download_url and download_url[0] or ''
    # #
    # #     download_status = 102
    # #     time_length = ''
    # #
    # #
    # #     if download_url:
    # #         program_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url, image_url = image_url,
    # #                              episode = episode, directors = directors, actors = '', summary = summary,
    # #                              release_time = release_time)
    # #
    # #         sto_path = '/video/' + program_name + '.mp4'
    # #         is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
    # #         download_status = 101 if is_download else 102
    # #
    # #         base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode, time_length, program_name, download_status,
    # #                          download_url, program_url, summary, image_url, sto_path)
    # #
    # #
    # # # Mark source_url as done
    # # base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)

        # Movies
        if 'cs=1' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                #print(source_url)
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/movie/' + program_url + '.html'
                    episode = ' '

                    directors = tools.get_json_value(info, 'directory')
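                    # 'directory' appears to hold the directors as a JSON object; join its values into a comma-separated string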
                    directors = ','.join(tools.get_json(directors).values())

                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())

                    summary = tools.get_json_value(info, 'description')

                    release_time = tools.get_json_value(info, 'releaseDate')
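                    # releaseDate is a millisecond epoch timestamp; convert it to a YYYY-MM-DD date string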
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug(
                        '''
                                    depth                       = %s
                                    program_name                = %s
                                    program_url                 = %s
                                    image_url                   = %s
                                    episode                     = %s
                                    directors                   = %s
                                    actors                      = %s
                                    summary                     = %s
                                    release_time                = %s
                                 ''' %
                        (depth, program_name, program_url, image_url, episode,
                         directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info',
                        site_id,
                        program_name,
                        program_url,
                        image_url=image_url,
                        episode=episode,
                        directors=directors,
                        actors=actors,
                        summary=summary,
                        release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
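                    # vids is a comma-separated list of episode ids; append a trailing comma so the regex also captures the last id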
                    episode_url = episode_url + ','
                    regex = '(.*?),'
                    episode_urls = tools.get_info(episode_url, regex)

                    for episode_url_num in episode_urls:

                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'

                        episode_download_url_json = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'.format(
                            episode_url_num, get_tkey(int(time.time())))
                        print(episode_download_url_json)
                        episode_json = tools.get_json_by_requests(
                            episode_download_url_json)

                        episode_image_url = tools.get_json_value(
                            episode_json, 'msgs.playurl.pic')

                        episode_name = tools.get_json_value(
                            episode_json, 'msgs.playurl.title')

                        episode_num_regex = "第(.*?)期"
                        episode_num = tools.get_info(episode_name,
                                                     episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''

                        episode_download_url = getdownload(
                            episode_download_url_json)

                        time_length = ''

                        episode_summary = ''

                        download_status = ''

                        log.debug(
                            '''
                                                   depth                       = %s
                                                   episode_num                 = %s
                                                   time_length                 = %s
                                                   episode_name                = %s
                                                   episode_url                 = %s
                                                   download_url                = %s
                                                   episode_summary             = %s
                                                   episode_image_url           = %s

                                                ''' %
                            (depth, episode_num, time_length, episode_name,
                             episode_url, episode_download_url,
                             episode_summary, episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id,
                            episode_num, time_length, episode_name,
                            download_status, episode_download_url, episode_url,
                            episode_summary, episode_image_url, '')

                page = str(int(page) + 1)

                if not json_list:
                    return False


def parser_episode_info(url_info):

    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_mongo_id = remark['program_mongo_id']
    classify = remark['classify']

    if classify == '电视剧':
        episode_url = root_url
        episode_name = remark['program_name'] + '_' + remark['episode_num']
        episode_num = remark['episode_num']
        # vid  http://v.qq.com/x/cover/nuijxf6k13t6z9b/u0023ickfto.html
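        # the vid is the last path segment of the URL, without the ".html" suffix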
        vid = root_url[root_url.rfind('/') + 1:root_url.rfind('.')]
        download_url = get_download_url(vid)
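        # get_download_url presumably returns a list of segment URLs; join them with a marker for storage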
        download_url = '^_^'.join(download_url)
        image_url = 'http://puui.qpic.cn/vpic/0/%s_160_90_3.jpg/0' % vid

        log.debug(
            '''
                episode_name  %s
                episode_num   %s
                episode_url   %s
                image_url     %s
                download_url  %s
                ''' %
            (episode_name, episode_num, episode_url, image_url, download_url))

        base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                             site_id,
                                             program_mongo_id,
                                             episode_num=episode_num,
                                             time_length='',
                                             episode_name=episode_name,
                                             download_status='',
                                             download_url=download_url,
                                             episode_url=episode_url,
                                             summary='',
                                             image_url=image_url,
                                             sto_path='')

    elif classify == '综艺':
        # Use you-get's qq module to resolve the download URL
        html, request = tools.get_html_by_requests(root_url)
        if not html:
            base_parser.update_url('PROGRAM_urls', root_url,
                                   Constance.EXCEPTION)
            return

        episodes_json = html[len('QZOutputJson='):-1]
        # print(episodes_json)
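        # the raw response is JSONP-like ("QZOutputJson=...;"); with the prefix and (presumably) a trailing ";" stripped above, parse the remaining JSON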
        episodes_json = tools.get_json(episodes_json)
        episodes = episodes_json.get('PlaylistItem',
                                     {}).get('videoPlayList', [])
        for episode in episodes:
            try:
                image_url = episode['pic']
                episode_url = episode[
                    'playUrl']  # http://v.qq.com/x/cover/3e70vfbgrss48n8/e00237dneke.html
                episode_name = episode['title']
                vid = episode_url[episode_url.rfind('/') +
                                  1:episode_url.rfind('.html')]
                print('qq_download_by_vid  begin')
                import time
                b = time.time()
                download_url = qq.qq_download_by_vid(vid)
                print('qq_download_by_vid end time = %s' %
                      str(time.time() - b))
                download_url = '^_^'.join(download_url)

                log.debug('''
                    episode_name  %s
                    image_url     %s
                    episode_url   %s
                    download_url  %s
                    ''' % (episode_name, image_url, episode_url, download_url))

                base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                                     site_id,
                                                     program_mongo_id,
                                                     episode_num='',
                                                     time_length='',
                                                     episode_name=episode_name,
                                                     download_status='',
                                                     download_url=download_url,
                                                     episode_url=episode_url,
                                                     summary='',
                                                     image_url=image_url,
                                                     sto_path='')
            except Exception as e:
                log.error(e)

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
Example #18
0
    def _get_per_record_time(self):
        news_record_time = tools.get_json(self._record_time).get('news_record_time')

        return news_record_time