Example No. 1
    def deal_comment(self, req_url, text):
        data = tools.get_json(text)

        __biz = tools.get_param(req_url, '__biz')

        comment_id = tools.get_param(req_url, 'comment_id')  # links the comments to their article
        elected_comment = data.get('elected_comment', [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get('nick_name'),
                logo_url=comment.get('logo_url'),
                content=comment.get('content'),
                create_time=tools.timestamp_to_date(comment.get('create_time')),
                content_id=comment.get('content_id'),
                like_num=comment.get('like_num'),
                is_top=comment.get('is_top'),
                spider_time=tools.get_current_date()
            )
            for comment in elected_comment
        ]

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
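
All of these snippets lean on a shared tools helper module that is not shown on this page. Below is only a rough sketch of the helpers they call most often, written under the assumption that each one behaves the way its call sites suggest; the project's real implementations may differ.

import datetime
import json
import re
import time
from urllib import parse


def get_param(url, key):
    # Return a single query-string parameter from a URL, or None if it is absent.
    values = parse.parse_qs(parse.urlparse(url).query).get(key)
    return values[0] if values else None


def get_json(text):
    # Parse a JSON string, returning {} when the text is empty or malformed.
    try:
        return json.loads(text)
    except (TypeError, ValueError):
        return {}


def get_current_timestamp():
    # Current Unix timestamp in seconds.
    return int(time.time())


def get_current_date(date_format='%Y-%m-%d %H:%M:%S'):
    # Current local time formatted as a string.
    return datetime.datetime.now().strftime(date_format)


def timestamp_to_date(timestamp, time_format='%Y-%m-%d %H:%M:%S'):
    # Convert a Unix timestamp (seconds, given as int/float/str) to a date string.
    return datetime.datetime.fromtimestamp(int(timestamp)).strftime(time_format)


def get_info(html, regexs, fetch_one=False):
    # Apply one regex (str) or several (list) to a text. With fetch_one=True the
    # first captured group of the first match is returned ('' when nothing matches);
    # otherwise all matches from all regexes are collected into a list.
    if isinstance(regexs, str):
        regexs = [regexs]
    infos = []
    for regex in regexs:
        if fetch_one:
            match = re.search(regex, html, re.S)
            if match:
                return match.group(1)
        else:
            infos.extend(re.findall(regex, html, re.S))
    return '' if fetch_one else infos
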
Example No. 2
def deal_comment(reply):
    if not reply: return

    comment_id = reply.get('id')
    pre_id = reply.get('replyId')
    content = reply.get('content')
    article_id = reply.get('mainContentId')
    release_time = reply.get('addTime')
    release_time = tools.timestamp_to_date(release_time)
    head_url = reply.get('userInfo', {}).get('icon')
    consumer = reply.get('userInfo', {}).get('uname')
    gender = int(reply.get('userInfo', {}).get('gender') or 0)  # default to 0 when gender is missing
    up_count = reply.get('likes')

    # TODO: emotion is a random placeholder until real sentiment analysis is added
    emotion = random.randint(0, 2)
    hot_id = comment_id

    log.debug('''
        评论id:  %s
        父id      %s
        文章id    %s
        发布人:  %s
        头像地址  %s
        性别      %s
        内容:    %s
        点赞量    %s
        发布时间  %s
        ''' % (comment_id, pre_id, article_id, consumer, head_url, gender,
               content, up_count, release_time))

    return self_base_parser.add_comment(comment_id, pre_id, article_id,
                                        consumer, head_url, gender, content,
                                        up_count, release_time, emotion,
                                        hot_id)
Example No. 3
    def deal_comment(self, req_url, text):
        """
        解析评论
        :param req_url:
        :param text:
        :return:
        """

        data = tools.get_json(text)

        __biz = tools.get_param(req_url, "__biz")

        comment_id = tools.get_param(req_url, "comment_id")  # links the comments to their article
        elected_comment = data.get("elected_comment", [])

        comment_datas = [
            dict(
                __biz=__biz,
                comment_id=comment_id,
                nick_name=comment.get("nick_name"),
                logo_url=comment.get("logo_url"),
                content=comment.get("content"),
                create_time=tools.timestamp_to_date(
                    comment.get("create_time")),
                content_id=comment.get("content_id"),
                like_num=comment.get("like_num"),
                is_top=comment.get("is_top"),
                spider_time=tools.get_current_date(),
            ) for comment in elected_comment
        ]

        if comment_datas:
            data_pipeline.save_article_commnet(comment_datas)
Example No. 4
    def is_have_new_article(self, account_id='', account=''):
        '''
        @summary: check whether the official account has published an article today
        ---------
        @param account_id:
        @param account:
        ---------
        @result:
        '''

        account_block = self.__get_account_blocks(account_id, account)
        if account_block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        regex = "timeConvert\('(\d*?)'\)"
        release_time = tools.get_info(account_block, regex, fetch_one=True)

        if release_time:
            release_time = int(release_time)
            release_time = tools.timestamp_to_date(release_time)
            log.debug("最近发文时间 %s" % release_time)

            if release_time >= tools.get_current_date('%Y-%m-%d'):
                return constance.UPDATE
            else:
                return constance.NOT_UPDATE

        else:
            return constance.ERROR
Example No. 5
    def __open_next_page(self):
        '''
        @summary: jump to the history-articles page
        ---------
        @param __biz:
        @param pass_ticket:
        @param appmsg_token:
        @param offset:
        ---------
        @result:
        '''
        is_done = False  # finished one full round
        is_all_done = False  # everything done (all accounts' posts for today have been collected)

        if WechatAction._todo_urls:
            url = WechatAction._todo_urls.popleft()
        else:
            # finished one account: update its article count
            WechatAction._wechat_service.update_account_article_num(
                WechatAction._current_account_biz)

            # move on to the next account
            account_id, __biz, is_done, is_all_done = WechatAction._wechat_service.get_next_account(
            )
            WechatAction._account_info[__biz] = account_id or ''

            # url = 'http://mp.weixin.qq.com/mp/getmasssendmsg?__biz=%s#wechat_webview_type=1&wechat_redirect'%__biz
            url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect' % __biz
            log.debug('''
                下一个公众号 : %s
                ''' % url)

        # inject a JS snippet to trigger the automatic jump
        if is_all_done:  # today's articles are all crawled; crawl again the next day
            # sleep until the next day
            sleep_time = self.get_next_day_time_interval()
        elif is_done:  # finished one round; take a break
            sleep_time = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date(
                "%Y-%m-%d") + ' ' + SPIDER_START_TIME:
            # only today's articles are wanted and the current time is before the
            # configured start time, so sleep instead of crawling (accounts rarely
            # publish articles in the small hours)
            sleep_time = self.get_spider_start_time_interval()
        else:  # finished one article; wait a while before the next
            sleep_time = self.get_sleep_time()

        log.debug('''
            next_page_url : %s
            is_done:        %s
            is_all_done:    %s
            sleep_time:     %s
            next_start_time %s
            ''' % (url, is_done, is_all_done,
                   tools.seconds_to_h_m_s(sleep_time / 1000),
                   tools.timestamp_to_date(tools.get_current_timestamp() +
                                           sleep_time / 1000)))
        next_page = "<script>setTimeout(function(){window.location.href='%s';},%d);</script>" % (
            url, sleep_time)
        return next_page
Example No. 6
    def deal_article(self, req_url, text):
        """
        解析文章
        :param req_url:
        :param text:
        :return:
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
        ).extract_first(default="")
        title = (selector.xpath('//h2[@class="rich_media_title"]/text()').
                 extract_first(default="").strip())
        account = (selector.xpath('//a[@id="js_name"]/text()').extract_first(
            default="").strip())
        author = (selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()').
                  extract_first(default="").strip())

        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(
            publish_timestamp) if publish_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)
        biz = tools.get_param(req_url, "__biz")

        text = remove_tags(content).strip()
        spider_name = 'wechat'
        collection_mode = 'spider'
        data_source_type = '微信公众号'

        article_data = {
            "data_type": account,
            "title": title,
            "data_address": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "text": text,
            "spider_name": spider_name,
            "collection_mode": collection_mode,
            "data_source_type": data_source_type,
            "sn": sn,
            "collection_time": tools.get_current_date(),
        }

        # persist to storage
        if article_data and data_pipeline.save_article(
                article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
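
The Selector and remove_tags names used by deal_article match the parsel and w3lib libraries. Assuming that is where they come from, the excerpt's module would need imports along these lines:

# Assumed imports for the excerpt above (the file header is not shown here).
from parsel import Selector          # Selector(text).xpath(...) / .re_first(...)
from w3lib.html import remove_tags   # strips HTML tags from the extracted content
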
Example No. 7
    def get_task(self, url=None, tip=''):
        """
        获取任务
        :param url: 指定url时,返回该url包装后的任务。否则先取公众号任务,无则取文章任务。若均无任务,则休眠一段时间之后再取
        :return:
        """

        sleep_time = random.randint(self._spider_interval_min,
                                    self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = '正在抓取列表'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(
                    __biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = '正在抓取详情'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('暂无任务 休眠 {}s'.format(sleep_time))
                    tip = '暂无任务 '

        if url:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(
                    tools.get_current_timestamp() + sleep_time),
                url=url,
                sleep_time_msec=sleep_time * 1000)
        else:
            next_page = "{tip} 休眠 {sleep_time}s 下次刷新时间 {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ',
                sleep_time=sleep_time,
                begin_spider_time=tools.timestamp_to_date(
                    tools.get_current_timestamp() + sleep_time),
                sleep_time_msec=sleep_time * 1000)

        return next_page
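
Note the doubled braces in the two templates above: {{ and }} are how str.format escapes the literal braces of the JavaScript callback, so only the named fields are substituted. A stripped-down illustration with made-up values:

template = ("{tip} sleep {sleep_time}s <script>setTimeout(function(){{"
            "window.location.href='{url}';}},{sleep_time_msec});</script>")
print(template.format(tip='fetching list', sleep_time=5,
                      url='https://example.com', sleep_time_msec=5000))
# fetching list sleep 5s <script>setTimeout(function(){window.location.href='https://example.com';},5000);</script>
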
Example No. 8
    def __open_next_page(self):
        '''
        @summary: jump to the history-articles page
        ---------
        @param __biz:
        @param pass_ticket:
        @param appmsg_token:
        @param offset:
        ---------
        @result:
        '''
        is_done = False  # finished one full round
        url = None

        while WechatAction._todo_urls:
            result = WechatAction._todo_urls.popleft()
            if callable(result):  # a callback that marks the account as finished
                result()  # run the callback
            else:
                url = result
                break

        if not url:
            # move on to the next account
            account = WechatAction._wechat_service.get_next_account()
            if account:
                account_id, __biz = account
                WechatAction._account_info[__biz] = account_id or ''

                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=%s&scene=124#wechat_redirect'%__biz
                log.debug('''
                    下一个公众号 : %s
                    '''%url)
            else:
                is_done = True

        # inject a JS snippet to trigger the automatic jump
        if is_done:  # finished one round; take a break
            sleep_time = self.get_wait_time()
        elif ONLY_TODAY_MSG and tools.get_current_date() < tools.get_current_date("%Y-%m-%d") + ' ' + SPIDER_START_TIME:
            # only today's articles are wanted and the current time is before the
            # configured start time, so sleep instead of crawling (accounts rarely
            # publish articles in the small hours)
            sleep_time = self.get_spider_start_time_interval()
        else:  # finished one article; wait a while before the next
            sleep_time = self.get_sleep_time()

        tip_sleep_time = tools.seconds_to_h_m_s(sleep_time / 1000)
        tip_next_start_time = tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time / 1000)
        if not url:
            url = 'http://localhost:6210/tip/wait?sleep_time={}&next_start_time={}'.format(tip_sleep_time, tip_next_start_time)

        log.debug('''
            next_page_url : %s
            is_done:        %s
            sleep_time:     %s
            next_start_time %s
            '''%(url, is_done, tip_sleep_time, tip_next_start_time))
        next_page = "休眠 %s 下次刷新时间 %s<script>setTimeout(function(){window.location.href='%s';},%d);</script>"%(tip_sleep_time, tip_next_start_time, url, sleep_time)
        return next_page
Example No. 9
def parser_next_page_article(video_id, wall_id, feed_id, sns_time, url):
    article_json_url = 'http://api-t.iqiyi.com/feed/get_feeds?authcookie=&device_id=pc_web&m_device_id=a11e6ea94270eaaa0b46be30af84fc54&agenttype=118&wallId={wall_id}&feedTypes=1%2C7%2C8%2C9&count=20&top=1&hasRecomFeed=1&feedId={feed_id}&needTotal=1&notice=1&version=1&upOrDown=1&snsTime={sns_time}&_={timestamp_m}'.format(wall_id = wall_id, feed_id = feed_id, sns_time = sns_time, timestamp_m = int(tools.get_current_timestamp() * 1000))
    print(article_json_url)
    article_json = tools.get_json_by_requests(article_json_url)

    wall_id = article_json.get('data', {}).get('wallId')
    # comment (feed) array
    feeds = article_json.get('data', {}).get('feeds', [])
    for feed in feeds:
        article_id = feed.get('commentId')

        head_url = feed.get('icon')

        name = feed.get('name')

        release_time = feed.get('releaseDate')
        release_time = tools.timestamp_to_date(release_time)

        title = feed.get('feedTitle')

        content = feed.get('description')

        image_urls = ','.join([img.get('url') for img in feed.get('pictures', [])])  # comma-separated

        watch_count = feed.get('uvCount')

        up_count = feed.get('agreeCount')

        comment_count = feed.get('commentCount')

        log.debug('''
            id:       %s
            节目id     %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            '''%(article_id, video_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count, program_id = video_id, gender = random.randint(0,1), url = url, info_type = 3, emotion = random.randint(0,2), collect = 0, source = '爱奇艺'):
            # parse the comments
            parser_comment(article_id, wall_id)
        else:
            break
    else:
        if feeds:
            feed_id = feeds[-1].get('feedId')
            sns_time = feeds[-1].get('snsTime')
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)
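
This parser, like several of the examples below, fetches JSON through tools.get_json_by_requests. A minimal sketch of what such a helper presumably looks like, assuming it is a thin wrapper around requests:

import requests


def get_json_by_requests(url, params=None, headers=None):
    # Fetch a URL and decode its JSON body, returning {} on any network or parse error.
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        return response.json()
    except Exception:
        return {}
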
Example No. 10
    def get_article_release_time(self, account_id='', account=''):
        account_block = self.__get_account_blocks(account_id, account)
        if account_block == constance.VERIFICATION_CODE:
            return constance.VERIFICATION_CODE

        regex = "timeConvert\('(\d*?)'\)"
        release_time = tools.get_info(account_block, regex, fetch_one=True)

        if release_time:
            release_time = int(release_time)
            release_time = tools.timestamp_to_date(release_time)

        return release_time
Example No. 11
    def deal_article(self, req_url, text):
        sn = tools.get_param(req_url, 'sn')

        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath('//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]')
        title = selector.xpath('//h2[@class="rich_media_title"]/text()').extract_first(default='').strip()
        account = selector.xpath('//a[@id="js_name"]/text()').extract_first(default='').strip()
        author = selector.xpath('//span[@class="rich_media_meta rich_media_meta_text"]//text()').extract_first(default='').strip()

        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(publish_timestamp) if publish_timestamp else None
        publish_time = tools.timestamp_to_date(publish_timestamp) if publish_timestamp else None

        pics_url = content.xpath('.//img/@src|.//img/@data-src').extract()
        biz = tools.get_param(req_url, '__biz')

        digest = selector.re_first('var msg_desc = "(.*?)"')
        cover = selector.re_first('var cover = "(.*?)";') or selector.re_first('msg_cdn_url = "(.*?)"')
        source_url = selector.re_first("var msg_source_url = '(.*?)';")

        content_html = content.extract_first(default='')
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            'account': account,
            'title': title,
            'url': req_url,
            'author': author,
            'publish_time': publish_time,
            '__biz': biz,
            'digest': digest,
            'cover': cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date()

        }

        # persist to storage
        if article_data and data_pipeline.save_article(article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
Example No. 12
def get_url(time_lenght = 60):
    '''
    @summary:
    ---------
    @param time_lenght: time window, in minutes
    ---------
    @result:
    '''

    current_date = tools.get_current_date()
    per_date = tools.read_file(STO_PER_SYNC_TIME) or tools.timestamp_to_date(tools.get_current_timestamp() - time_lenght * 60)

    tools.write_file(STO_PER_SYNC_TIME, current_date)

    root_url = 'http://192.168.60.38:8001/hotspot_al/interface/getCluesDataSearchInfo?pageNo=%d&pageSize=100&updateSTime={per_date}&updateETime={current_date}&sort=5&isDesc=0'.format(per_date = per_date, current_date = current_date)
    return root_url
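
Two substitution mechanisms are mixed in root_url: str.format fills in {per_date} and {current_date}, while pageNo=%d is left untouched, presumably for the caller to fill in later with %-formatting. An illustration with invented dates:

# Illustration only: after .format has filled in the dates, the %d survives for a
# later %-substitution of the page number by the caller (an assumption here).
template = 'http://192.168.60.38:8001/hotspot_al/interface/getCluesDataSearchInfo?pageNo=%d&pageSize=100&updateSTime={per_date}&updateETime={current_date}&sort=5&isDesc=0'
root_url = template.format(per_date='2019-01-01 00:00:00', current_date='2019-01-01 01:00:00')
first_page_url = root_url % 1  # -> ...pageNo=1&pageSize=100&...
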
Example No. 13
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get("title")
            digest = article_info.get("digest")
            url = article_info.get("content_url").replace("\\", "").replace(
                "amp;", "")
            source_url = article_info.get("source_url").replace("\\", "")  # link to the referenced article
            cover = article_info.get("cover").replace("\\", "")
            subtype = article_info.get("subtype")
            is_multi = article_info.get("is_multi")
            author = article_info.get("author")
            copyright_stat = article_info.get("copyright_stat")
            duration = article_info.get("duration")
            del_flag = article_info.get("del_flag")
            type = comm_msg_info.get("type")
            publish_time = tools.timestamp_to_date(
                comm_msg_info.get("datetime"))
            sn = tools.get_param(url, "sn")

            if sn:
                # cache the article info
                article_data = {
                    "title": title,
                    "digest": digest,
                    "url": url,
                    "source_url": source_url,
                    "cover": cover,
                    "subtype": subtype,
                    "is_multi": is_multi,
                    "author": author,
                    "copyright_stat": copyright_stat,
                    "duration": duration,
                    "del_flag": del_flag,
                    "type": type,
                    "publish_time": publish_time,
                    "sn": sn,
                    "__biz": __biz,
                    "spider_time": tools.get_current_date(),
                }

                return article_data
Example No. 14
    def is_have_new_article(self, __biz):
        '''
        @summary: check whether the official account has published an article today
        ---------
        @param __biz:
        ---------
        @result:
        '''

        log.debug('search keywords ' + __biz)

        url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
        params = {
            "lang": "zh_CN",
            "token": TOOKEN,
            "query": "",
            "f": "json",
            "count": "5",
            "action": "list_ex",
            "ajax": "1",
            "type": "9",
            "fakeid": __biz,
            "random": str(random.random()) + str(random.randint(1, 9)),
            "begin": "0"
        }

        articles_json = tools.get_json_by_requests(url,
                                                   params=params,
                                                   headers=HEADERS)
        # print(articles_json)

        # when the TOOKEN has expired the API returns {'base_resp': {'err_msg': 'invalid csrf token', 'ret': 200040}}
        article_list = articles_json.get('app_msg_list', [])
        for article in article_list:
            release_time = article.get('update_time')
            release_time = tools.timestamp_to_date(release_time)
            log.debug("最近发文时间 %s" % release_time)

            if release_time >= tools.get_current_date('%Y-%m-%d'):
                return constance.UPDATE
            else:
                return constance.NOT_UPDATE
        else:
            return constance.ERROR
Example No. 15
    def get_wait_check_account(self):
        '''
        @summary:
        ---------
        @param :
        ---------
        @result:
        '''
        # pick accounts whose crawl has finished and whose latest publish time is more
        # than two hours old, then check them again for new articles
        before_two_hours = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id,
                   t.domain,
                   t.name,
                   to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                   t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null or
                   t.last_article_release_time <=
                   to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(before_two_hours)

        accounts = self._oracledb.find(sql)

        # if there are no finished accounts and redis holds no crawl tasks, the non-603 rows in the database are probably lost tasks and need to be re-issued
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id,
                       t.domain,
                       t.name,
                       to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                       t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''

            accounts = self._oracledb.find(sql)

        return accounts
Example No. 16
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get('title')
            digest = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
            source_url = article_info.get('source_url').replace('\\', '')  # link to the referenced article
            cover = article_info.get('cover').replace('\\', '')
            subtype = article_info.get('subtype')
            is_multi = article_info.get('is_multi')
            author = article_info.get('author')
            copyright_stat = article_info.get('copyright_stat')
            duration = article_info.get('duration')
            del_flag = article_info.get('del_flag')
            type = comm_msg_info.get('type')
            publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
            sn = tools.get_param(url, 'sn')

            if sn:
                # cache the article info
                article_data = {
                    'title': title,
                    'digest': digest,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'subtype': subtype,
                    'is_multi': is_multi,
                    'author': author,
                    'copyright_stat': copyright_stat,
                    'duration': duration,
                    'del_flag': del_flag,
                    'type': type,
                    'publish_time': publish_time,
                    'sn': sn,
                    '__biz': __biz,
                    'spider_time': tools.get_current_date()
                }

                return article_data
Example No. 17
def parser(url_info):
    # url  = 'http://user.xiaoyouzb.net/v3/vod/small_recommend?nwtime=1571816563&sign=883f96aee2655d8885e7815de3423df7&type=1&cateId=13&pageNum=0&isFirst=N&_u=edac2c15598946bd9ba7bda78a83489c&version=4.7.0&platform=android&appx=yuntu&apppn=org.fungo.fungolive&enterprise=0&channel=tencent&market=32&os_version=8.0.0&device_model=MIX%25202&device_code=780493075490198&udid=77e2cb72797f20afdcaaa6265872cea9&androidId=220240afd2e0e640&source=android'
    root_url = url_info['url']
    cname = url_info['remark']["category_name"]
    headers = {
        "User-Agent": "yuntutv/4.7.0 (Android 8.0.0)",
        "Host": "user.xiaoyouzb.net"
    }
    json_data = tools.get_json_by_requests(root_url, headers=headers)
    data_infos = json_data["data"]
    for data_info in data_infos:
        publishTime = data_info["publishTime"]
        release_time = tools.timestamp_to_date(str(publishTime)[:-3])
        title = data_info["content"]
        content = data_info["content"]
        video_url = data_info["videoUrl"]
        img_url = data_info["coverUrl"]
        base_parser.save_info('content_info', site_id=SITE_ID, url=video_url, title=title, site_name=NAME,
                              content=content, release_time=release_time, image_url=img_url,
                              video_url=video_url, is_out_link=1, download_image=False, is_debug=False,
                              )
    base_parser.update_url('urls', root_url, Constance.DONE)
Example No. 18
    def monitor_cookies(self):
        '''
        @summary: monitor and manage cookies
        1. delete useless cookies: those unavailable more than the maximum number of times
        2. mark cookies that have been idle for 24 hours as available again
        ---------
        ---------
        @result:
        '''

        # delete useless cookies
        sql = 'delete from sogou_cookies where un_available_times > %d'%MAX_UN_AVAILABLE_TIMES
        self._sqlite3db.delete(sql)

        # mark cookies that have been idle for 24 hours as available again
        sql = '''
            update sogou_cookies set
                is_available = 1
            where un_available_time < '%s'
        '''%(tools.timestamp_to_date(tools.get_current_timestamp() - 24 * 60 * 60 ))

        self._sqlite3db.update(sql)
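
The two statements above imply a sogou_cookies table with at least an availability flag, a failure counter, and the time the cookie was last marked unavailable. A minimal sqlite schema consistent with them (an assumption; the real table is created elsewhere in the project):

import sqlite3

conn = sqlite3.connect('sogou_cookies.db')
conn.execute('''
    create table if not exists sogou_cookies (
        cookie              text,               -- assumed payload column, not referenced above
        is_available        integer default 1,  -- reset to 1 after 24 idle hours
        un_available_times  integer default 0,  -- failure counter checked against MAX_UN_AVAILABLE_TIMES
        un_available_time   text                -- 'YYYY-MM-DD HH:MM:SS' of the last failure
    )
''')
conn.commit()
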
Example No. 19
def parser(url_info):
    root_url = url_info['url']
    para = url_info["remark"]["para_template"]
    headers = url_info["remark"]["header_template"]
    response = requests.get(root_url, params=para, headers=headers)
    time.sleep(2)
    json_info = response.json()
    cate = url_info["remark"]["cate_name"]
    data_jsons = jsonpath(json_info, "$..items..data")
    if cate != '':
        for data_info in data_jsons:
            data_json = json.loads(data_info)
            title = jsonpath(data_json, "$..title")[0]
            img_str = glom(data_json, "coverUrl")
            img_json = json.loads(img_str)
            img_url = img_json["L"][0]
            content = jsonpath(data_json, "$..summary")[0]
            updateTime = jsonpath(data_json, "$..updateTime")[0]
            video_str = glom(data_json, "videoUrl")
            video_json = json.loads(video_str)
            video_url = video_json["source"]["hd"]
            release_time = tools.timestamp_to_date(str(updateTime)[:-3])
            base_parser.save_info(
                'content_info',
                site_id=SITE_ID,
                url=video_url,
                title=title,
                site_name=NAME,
                content=content,
                release_time=release_time,
                image_url=img_url,
                video_url=video_url,
                is_out_link=1,
                download_image=False,
                is_debug=False,
            )

    base_parser.update_url('urls', root_url, Constance.DONE)
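
The jsonpath and glom calls here match the jsonpath and glom packages on PyPI. Assuming those are the dependencies, the snippet would need roughly these imports:

# Assumed imports for the snippet above (the original file header is not shown).
import json
import time

import requests
from glom import glom            # glom(data, 'coverUrl') -> nested lookup
from jsonpath import jsonpath    # jsonpath(obj, '$..title') -> list of matches
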
Example No. 20
def extract_info(json_data):
    try:
        data_infos = json_data["Data"]["List"]
        for data_info in data_infos:
            data = data_info["Data"]
            title = data["Title"]
            content = data["Summary"]
            img_url = data["Photo"]
            url = data["Href"]
            print(url)
            video_url = data["VideoUrl"]
            time_regx = r"/(\d+)\.shtml"
            time_str = tools.get_info(url, time_regx, fetch_one=True)
            release_time = tools.timestamp_to_date(time_str[:-3])
            like_count = data_info["LikeCount"]
            comment_cnt = data_info["CommentCnt"]
            # print(title)
            # print(release_time)
            # print(content)
            # print(img_url)
            # print(video_url)
            # print(like_count)
            # print(comment_cnt)
            data_info = {
                "url": url,
                "title": title,
                "release_time": release_time,
                "content": content,
                "img_url": img_url,
                "video_url": video_url,
                "like_count": like_count,
                "comment_cnt": comment_cnt
            }
            save_info(data_info)
    except Exception as e:
        print(e)
Example No. 21
    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        '''
        @summary: parse the article list
        ---------
        @param article_list: article list info, str
        ---------
        @result: True / None (True: keep fetching older pages; None: stop)
        '''

        # log.debug(tools.dumps_json(article_list))

        # parse one article's info out of the json payload
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get('title')
            digest = article_info.get('digest')
            url = article_info.get('content_url').replace('\\', '').replace('amp;', '')
            source_url = article_info.get('source_url').replace('\\', '')  # link to the referenced article
            cover = article_info.get('cover').replace('\\', '')
            subtype = article_info.get('subtype')
            is_multi = article_info.get('is_multi')
            author = article_info.get('author')
            copyright_stat = article_info.get('copyright_stat')
            duration = article_info.get('duration')
            del_flag = article_info.get('del_flag')
            type = comm_msg_info.get('type')
            publish_time = tools.timestamp_to_date(comm_msg_info.get('datetime'))
            sn = tools.get_param(url, 'sn')

            if sn:
                # cache the article info
                article_data = {
                    'title': title,
                    'digest': digest,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'subtype': subtype,
                    'is_multi': is_multi,
                    'author': author,
                    'copyright_stat': copyright_stat,
                    'duration': duration,
                    'del_flag': del_flag,
                    'type': type,
                    'publish_time': publish_time,
                    'sn': sn,
                    '__biz': __biz,
                    'spider_time': tools.get_current_date()
                }

                return article_data

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True
        article_list = article_list.get('list', [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get('comm_msg_info', {})

            publish_timestamp = comm_msg_info.get('datetime')
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # record the newest publish time
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(__biz, publish_time)
                is_first_article = False

                if publish_timestamp and self._task_manager.is_zombie_account(publish_timestamp):  # on the first page, check the latest publish time; if the account has not published recently, treat it as a zombie account
                    log.info('公众号 {} 为僵尸账号 不再监控'.format(__biz))
                    self._task_manager.sign_account_is_zombie(__biz, publish_time)
                    is_need_get_more = False
                    break

            # compare times: stop once the previous crawl's last publish time is reached
            is_reach = self._task_manager.is_reach_last_article_publish_time(__biz, publish_time)
            if is_reach:
                log.info('采集到上次发布时间 公众号 {} 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break

            elif is_reach is None:
                log.info('公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号'.format(__biz))
                return

            article_type = comm_msg_info.get('type')
            if article_type != 49:  # 49 is the common image-and-text message type; text, voice and video messages have inconsistent formats and are not collected
                continue

            # check whether the publish time is inside the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info('公众号 {} 超过采集时间范围 采集完成'.format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(__biz)
                self._task_manager.update_account_last_publish_time(__biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info('公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集'.format(__biz, publish_time))
                continue

            # inside the crawl time range

            # an official account can push several image-and-text messages at once
            # the first image-and-text message
            app_msg_ext_info = article.get('app_msg_ext_info', {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # the additional image-and-text messages pushed in the same batch
            multi_app_msg_item_list = app_msg_ext_info.get('multi_app_msg_item_list') or []  # default to [] when the field is missing
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item, comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time
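
For reference, the article_list string that __parse_article_list receives looks roughly like the JSON built below; only the keys the code actually reads are included, and every value is invented for illustration.

import json

article_list = json.dumps({
    "list": [
        {
            "comm_msg_info": {"type": 49, "datetime": 1546272000},
            "app_msg_ext_info": {
                "title": "example title",
                "digest": "example digest",
                "content_url": "http://mp.weixin.qq.com/s?__biz=EXAMPLE==&mid=1&idx=1&sn=abc123",
                "source_url": "",
                "cover": "http://example.invalid/cover.jpg",
                "subtype": 9,
                "is_multi": 0,
                "author": "",
                "copyright_stat": 100,
                "duration": 0,
                "del_flag": 1,
                "multi_app_msg_item_list": []
            }
        }
    ]
})
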
Example No. 22
def monitor_task():
    task_manager = TaskManager()
    total_time = 0

    task_count = 0
    begin_time = None
    end_time = None
    spend_hours = None

    is_show_start_tip = False
    is_show_have_task = False

    while True:
        task_count = task_manager.get_task_count()
        if not task_count:
            if not is_show_start_tip:
                log.info('开始监控任务池...')
                is_show_start_tip = True

            total_time += CHECK_HAVE_TASK_SLEEP_TIME
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)
        else:
            if not is_show_have_task:
                log.info('任务池中有%s条任务,work可以正常工作' % task_count)
                is_show_have_task = True

            total_time = 0
            tools.delay_time(CHECK_HAVE_TASK_SLEEP_TIME)

        if total_time > MAX_NULL_TASK_TIME:
            is_show_start_tip = False
            is_show_have_task = False

            # one round finished: compute some statistics
            if begin_time:
                # timing statistics
                end_time = tools.timestamp_to_date(
                    tools.get_current_timestamp() - MAX_NULL_TASK_TIME)
                spend_time = tools.date_to_timestamp(
                    end_time) - tools.date_to_timestamp(begin_time)
                spend_hours = tools.seconds_to_h_m_s(spend_time)

                # url counts
                depth_count_info = task_manager.get_ever_depth_count(5)

                # article counts
                article_count_msg = statistic_article_count.get_article_count_msg(
                    begin_time, end_time)

                log.info(
                    '''
                    ------- 已做完一轮 --------
                    \r开始时间:%s
                    \r结束时间:%s
                    \r耗时:%s
                    \r网站数量:%s
                    \rurl数量信息:%s
                    \r文章数量信息:%s
                    ''' %
                    (begin_time, end_time, spend_hours, task_count,
                     tools.dumps_json(depth_count_info), article_count_msg))

            # clear the url fingerprints
            log.info('删除url指纹...')
            task_manager.clear_task()

            log.info('redis 中连续%s秒无任务,超过允许最大等待%s秒 开始添加任务' %
                     (total_time, MAX_NULL_TASK_TIME))
            # fetch tasks
            tasks = task_manager.get_task_from_oracle()
            if tasks:
                total_time = 0
                task_manager.add_task_to_redis(tasks)
                task_count = task_manager.get_task_count()
                if task_count:
                    begin_time = tools.get_current_date()
                    log.info('添加任务到redis中成功 共添加%s条任务。 work开始工作' % (task_count))
            else:
                log.error('未从oracle中取到任务')
Example No. 23
def parser(url_info):
    url = url_info['url']
    list_datas = tools.get_json_by_requests(url)
    list_datas = list_datas['list']

    for list_data in list_datas:
        title = list_data['title']
        watched_count = list_data['playsCounts']
        image_url = list_data['coverLarge']
        comment_count = list_data['commentsCount']
        charge_type = list_data['priceTypeId']
        is_finished = list_data['isFinished']
        article_type = list_data['tags']
        origin = list_data['provider']
        episodes = list_data['tracks']
        # uid = list_data['uid']
        author = list_data['nickname']
        album_id = list_data['albumId']
        abstract = list_data['intro']
        score = tools.get_json_value(list_data, 'score')
        # id = list_data['id']
        new_url_2 = 'http://mobile.ximalaya.com/mobile/v1/album/rich?albumId=%s' % album_id
        list_datas_2 = tools.get_json_by_requests(new_url_2)
        content = tools.get_json_value(list_datas_2, 'data.album.intro')
        release_time = tools.get_json_value(list_datas_2,
                                            'data.album.createdAt')
        release_time = tools.timestamp_to_date(release_time / 1000)
        update_time = tools.get_json_value(list_datas_2,
                                           'data.album.lastUptrackAt')
        update_time = tools.timestamp_to_date(update_time / 1000)
        subscribe_count = tools.get_json_value(list_datas_2,
                                               'data.album.subscribeCount')

        new_url_3 = 'http://mobile.ximalaya.com/mobile/v1/album/track?albumId=%s&device=android&isAsc=true&pageId=1&' \
                    'pageSize=5000&pre_page=1' % album_id

        list_datas_3 = tools.get_json_by_requests(new_url_3)
        lists = tools.get_json_value(list_datas_3, 'data.list')

        log.debug('''
                    书名:                 %s
                    作品类型:             %s
                    集数:                 %s
                    评分:                 %s   (免费作品均无评分)
                    订阅数:               %s
                    作者:                 %s
                    创建时间:             %s
                    最近更新日期:         %s
                    贴图:                 %s
                    播放次数:             %s
                    评论数:               %s   (免费作品均无评论)
                    收费类型:             %s   (0:免费,1:单期购买, 2:全集购买)
                    是否完结:             %s   (0、1:未完结, 2:完结)
                    提供者:               %s
                    简介:                 %s
                    完整介绍:             %s
                    ''' %
                  (title, article_type, episodes, score, subscribe_count,
                   author, release_time, update_time, image_url, watched_count,
                   comment_count, charge_type, is_finished, origin, abstract,
                   content))
        content_id = base_parser.add_wp_content_info(
            'WP_content_info',
            SITE_ID,
            title=title,
            article_type=article_type,
            episodes=episodes,
            score=score,
            subscribe_count=subscribe_count,
            author=author,
            release_time=release_time,
            update_time=update_time,
            image_url=image_url,
            watched_count=watched_count,
            comment_count=comment_count,
            charge_type=charge_type,
            is_finished=is_finished,
            origin=origin,
            abstract=abstract,
            content=content,
            data_type=DATA_TYPE)

        for track in lists:  # one track (episode) per entry; avoids shadowing the builtin list
            title = track['title']
            download_url = track['playPathAacv164']
            watched_count = track['playtimes']
            play_length = track['duration']
            comments_count = track['comments']
            create_time = track['createdAt']
            create_time = tools.timestamp_to_date(create_time / 1000)
            # log.debug('''
            #                    书名:                 %s
            #                    下载链接:             %s
            #                    播放次数:             %s
            #                    播放时长:             %s
            #                    评论数:                %s
            #                    创建时间:             %s
            #                    ''' % (
            # title, download_url, watched_count, play_length, comments_count, create_time))
            base_parser.add_wp_content_episode_info(
                'WP_content_episode_info',
                content_id=content_id,
                title=title,
                video_url=download_url,
                watched_count=watched_count,
                play_length=play_length,
                comments_count=comments_count,
                release_time=create_time,
                data_type=DATA_TYPE)

        base_parser.update_url('WP_urls', url, Constance.DONE)
Example No. 24
    def __parse_article_list(self, article_list, __biz, is_first_page=False):
        """
        @summary: 解析文章列表
        ---------
        @param article_list: 文章列表信息 str
        ---------
        @result: True / None (True: 继续向下抓取; None: 停止向下抓取)
        """

        # log.debug(tools.dumps_json(article_list))

        # parse one article's info out of the json payload
        def parse_article_info(article_info, comm_msg_info):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))

            title = article_info.get("title")
            digest = article_info.get("digest")
            url = article_info.get("content_url").replace("\\", "").replace(
                "amp;", "")
            source_url = article_info.get("source_url").replace("\\", "")  # link to the referenced article
            cover = article_info.get("cover").replace("\\", "")
            subtype = article_info.get("subtype")
            is_multi = article_info.get("is_multi")
            author = article_info.get("author")
            copyright_stat = article_info.get("copyright_stat")
            duration = article_info.get("duration")
            del_flag = article_info.get("del_flag")
            type = comm_msg_info.get("type")
            publish_time = tools.timestamp_to_date(
                comm_msg_info.get("datetime"))
            sn = tools.get_param(url, "sn")

            if sn:
                # cache the article info
                article_data = {
                    "title": title,
                    "digest": digest,
                    "url": url,
                    "source_url": source_url,
                    "cover": cover,
                    "subtype": subtype,
                    "is_multi": is_multi,
                    "author": author,
                    "copyright_stat": copyright_stat,
                    "duration": duration,
                    "del_flag": del_flag,
                    "type": type,
                    "publish_time": publish_time,
                    "sn": sn,
                    "__biz": __biz,
                    "spider_time": tools.get_current_date(),
                }

                return article_data

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list_data = []
        publish_time = None
        is_need_get_more = True
        article_list = article_list.get("list", [])
        is_first_article = True
        for article in article_list:
            comm_msg_info = article.get("comm_msg_info", {})

            publish_timestamp = comm_msg_info.get("datetime")
            publish_time = tools.timestamp_to_date(publish_timestamp)

            # record the newest publish time
            if is_first_page and is_first_article:
                self._task_manager.record_new_last_article_publish_time(
                    __biz, publish_time)
                is_first_article = False

                if publish_timestamp and self._task_manager.is_zombie_account(
                        publish_timestamp):  # on the first page, check the latest publish time; if the account has not published recently, treat it as a zombie account
                    log.info("公众号 {} 为僵尸账号 不再监控".format(__biz))
                    self._task_manager.sign_account_is_zombie(
                        __biz, publish_time)
                    is_need_get_more = False
                    break

            # compare times: stop once the previous crawl's last publish time is reached
            is_reach = self._task_manager.is_reach_last_article_publish_time(
                __biz, publish_time)
            if is_reach:
                log.info("采集到上次发布时间 公众号 {} 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break

            elif is_reach is None:
                log.info(
                    "公众号 {} 为爬虫启动时的手点公众号。不遍历历史消息,即将抓取监控池里的公众号".format(__biz))
                return

            article_type = comm_msg_info.get("type")
            if article_type != 49:  # 49 is the common image-and-text message type; text, voice and video messages have inconsistent formats and are not collected
                continue

            # check whether the publish time is inside the crawl time range
            publish_time_status = self._task_manager.is_in_crawl_time_range(
                __biz, publish_time)
            if publish_time_status == TaskManager.OVER_MIN_TIME_RANGE:
                log.info("公众号 {} 超过采集时间范围 采集完成".format(__biz))
                new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                    __biz)
                self._task_manager.update_account_last_publish_time(
                    __biz, new_last_publish_time)
                is_need_get_more = False
                break
            elif publish_time_status == TaskManager.NOT_REACH_TIME_RANGE:
                log.info("公众号 {} 当前采集到的时间 {} 未到采集时间范围 不采集".format(
                    __biz, publish_time))
                continue

            # inside the crawl time range

            # an official account can push several image-and-text messages at once
            # the first image-and-text message
            app_msg_ext_info = article.get("app_msg_ext_info", {})
            article_data = parse_article_info(app_msg_ext_info, comm_msg_info)
            if article_data:
                article_list_data.append(article_data)

            # the additional image-and-text messages pushed in the same batch
            multi_app_msg_item_list = app_msg_ext_info.get(
                "multi_app_msg_item_list") or []  # default to [] when the field is missing
            for multi_app_msg_item in multi_app_msg_item_list:
                article_data = parse_article_info(multi_app_msg_item,
                                                  comm_msg_info)
                if article_data:
                    article_list_data.append(article_data)

        if article_list_data:
            data_pipeline.save_article_list(article_list_data)

        if is_need_get_more:
            return publish_time
Example No. 25
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    column_id = remark

    headers = {
        'Host': 'is.snssdk.com',
        'Accept': ' */*',
        'X-SS-Cookie':
        '_ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'tt-request-time': '1489990271848',
        'Cookie':
        ' _ba=BA0.2-20170101-51e32-mV0oh6KwzUmWxXl227kO; install_id=8738029911; ttreq=1$b34d173d3544397b1ca82d19a58a7db80e2aef29; qh[360]=1; alert_coverage=33; _ga=GA1.2.1084363974.1479979043; login_flag=cd47dd57ff2f963719bc324163954696; sessionid=3554607744525de375854663cc7e355b; sid_guard="3554607744525de375854663cc7e355b|1489461314|2592000|Thu\054 13-Apr-2017 03:15:14 GMT"; sid_tt=3554607744525de375854663cc7e355b',
        'User-Agent': 'News/6.0.1 (iPhone; iOS 10.2.1; Scale/3.00)',
        'Accept-Language': ' zh-Hans-CN;q=1, en-CN;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': ' keep-alive'
    }

    json = tools.get_json_by_requests(root_url)

    if not json:
        base_parser.update_url('VAApp_urls', root_url, Constance.EXCEPTION)
        return

    datas = json['data']
    for data in datas:
        data = tools.get_json_value(data, 'content')

        title = tools.get_json_value(data, 'title')

        # skip items that already exist in the database
        if db.find('VAApp_content_info', {'title': title}):
            continue

        abstract = tools.get_json_value(data, 'abstract')
        abstract = abstract and abstract or tools.get_json_value(
            data, 'content')

        img_url = tools.get_json_value(data, 'image_list.url')
        img_url = img_url and img_url or tools.get_json_value(
            data, 'middle_image.url')
        img_url = img_url and img_url or tools.get_json_value(
            data, 'large_image_list.url')
        img_url = img_url and img_url.replace('.webp', '.jpg') or img_url

        original_url = tools.get_json_value(data, 'article_url')
        original_url = original_url and original_url or tools.get_json_value(
            data, 'share_url')

        release_time = tools.get_json_value(data, 'publish_time')
        release_time = release_time and release_time or tools.get_json_value(
            data, '1481012423')
        release_time = release_time and tools.timestamp_to_date(
            release_time) or release_time

        video_msg = tools.get_json_value(data, 'video_play_info')  # needs further processing
        video_main_url = tools.get_json_value(video_msg,
                                              'video_list.video_2.main_url')
        video_main_url = video_main_url and video_main_url or tools.get_json_value(
            video_msg, 'video_list.video_1.main_url')
        parse_video_url = tools.compile_js(PARSE_VIDEO_URL_JSFUNC)
        video_url = parse_video_url('base64decode', video_main_url)

        html = tools.get_html_auto_deal_code(original_url)
        regexs = [
            'class="article-content">(.*?)<div class="article-actions">',
            '<div class="content">(.*?)<div class="suggestion-list-con"',
            '<!-- 文章内容 -->(.*?)<!-- @end 文章内容 -->',
            'class="yi-content-text">(.*?)<div class="yi-normal"',
            '<p.*?>(.*?)</p>'
        ]

        if video_url:
            content = abstract
        else:
            content = ''.join(tools.get_info(html, regexs))
            content = tools.del_html_tag(content)

        if len(content) < len(abstract):
            content = abstract

        # sensitive events
        sensitive_id = ''
        sensitive_event_infos = oracledb.find(
            'select * from tab_mvms_sensitive_event')
        for sensitive_event_info in sensitive_event_infos:
            _id = sensitive_event_info[0]
            keyword1 = sensitive_event_info[3].split(
                ' ') if sensitive_event_info[3] else []
            keyword2 = sensitive_event_info[4].split(
                ' ') if sensitive_event_info[4] else []
            keyword3 = sensitive_event_info[5].split(
                ' ') if sensitive_event_info[5] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                sensitive_id = _id

        # violation events
        violate_id = ''
        vioation_knowledge_infos = oracledb.find(
            'select * from tab_mvms_violation_knowledge')
        for vioation_knowledge_info in vioation_knowledge_infos:
            _id = vioation_knowledge_info[0]
            keyword1 = vioation_knowledge_info[2].split(
                ' ') if vioation_knowledge_info[2] else []
            keyword2 = vioation_knowledge_info[3].split(
                ' ') if vioation_knowledge_info[3] else []
            keyword3 = vioation_knowledge_info[4].split(
                ' ') if vioation_knowledge_info[4] else []

            if base_parser.is_violate(title + content,
                                      key1=keyword1,
                                      key2=keyword2,
                                      key3=keyword3):
                violate_id = _id

        log.debug('''
            title:          %s
            abstract :      %s
            img_url :       %s
            original_url:   %s
            release_time :  %s
            video_main_url: %s
            video_url:      %s
            content :       %s
            column_id:      %d
            sensitive_id:   %d
            violate_id:     %d

            ''' % (title, abstract, img_url, original_url, release_time,
                   video_main_url, video_url, content, column_id, sensitive_id
                   and sensitive_id or 0, violate_id and violate_id or 0))

        # if this is the video column and the item contains no sensitive or violating content, do not download it
        if column_id == VIDEO:
            if not sensitive_id and not violate_id:
                continue

        # download
        base_path = FILE_LOCAL_PATH
        is_download = 0

        # download the image
        img_name = ''
        if img_url:
            img_name = 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(img_url, base_path, img_name)
            if not is_download:
                img_name = ''

        # download the video
        video_name = ''
        if video_url:
            video_name = 'videos/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.mp4'
            is_download = tools.download_file(video_url, base_path, video_name)
            if not is_download:
                video_name = ''

        if original_url:
            base_parser.add_va_app_content_info(
                'VAApp_content_info', SITE_ID, title, abstract, img_url,
                img_name, original_url, release_time, video_url, video_name,
                content, column_id, is_download, sensitive_id, violate_id,
                STORAGE_ID)

    base_parser.update_url('VAApp_urls', root_url, Constance.DONE)
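
tools.download_file is another project helper that is not shown here. A rough sketch of the behaviour the calls above rely on (fetch a binary resource, write it under base_path/file_name, report success or failure), assuming a plain requests download:

import os

import requests


def download_file(url, base_path, file_name):
    # Download url to base_path/file_name; return 1 on success, 0 on failure.
    local_path = os.path.join(base_path, file_name)
    try:
        if os.path.dirname(local_path):
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            f.write(response.content)
        return 1
    except Exception:
        return 0
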
Example No. 26
    def deal_article(self, req_url, text):
        """
        Parse an article
        :param req_url:
        :param text:
        :return:
        """
        sn = tools.get_param(req_url, "sn")

        if not text:
            self._task_manager.update_article_task_state(sn, -1)
            return None

        selector = Selector(text)

        content = selector.xpath(
            '//div[@class="rich_media_content "]|//div[@class="rich_media_content"]|//div[@class="share_media"]'
        )
        title = (selector.xpath('//h2[@class="rich_media_title"]/text()').
                 extract_first(default="").strip())
        account = (selector.xpath('//a[@id="js_name"]/text()').extract_first(
            default="").strip())
        author = (selector.xpath(
            '//span[@class="rich_media_meta rich_media_meta_text"]//text()').
                  extract_first(default="").strip())

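        # The publish time is embedded in the page source as a 10-digit Unix timestamp, e.g. n="1511354167"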
        publish_timestamp = selector.re_first(r'n="(\d{10})"')
        publish_timestamp = int(
            publish_timestamp) if publish_timestamp else None
        publish_time = (tools.timestamp_to_date(publish_timestamp)
                        if publish_timestamp else None)

        pics_url = content.xpath(".//img/@src|.//img/@data-src").extract()
        biz = tools.get_param(req_url, "__biz")

        digest = selector.re_first('var msg_desc = "(.*?)"')
        cover = selector.re_first('var cover = "(.*?)";') or selector.re_first(
            'msg_cdn_url = "(.*?)"')
        source_url = selector.re_first("var msg_source_url = '(.*?)';")

        content_html = content.extract_first(default="")
        comment_id = selector.re_first(r'var comment_id = "(\d+)"')

        article_data = {
            "account": account,
            "title": title,
            "url": req_url,
            "author": author,
            "publish_time": publish_time,
            "__biz": biz,
            "digest": digest,
            "cover": cover,
            "pics_url": pics_url,
            "content_html": content_html,
            "source_url": source_url,
            "comment_id": comment_id,
            "sn": sn,
            "spider_time": tools.get_current_date(),
        }

        # Save to the database
        if article_data and data_pipeline.save_article(
                article_data) is not None:
            self._task_manager.update_article_task_state(sn, 1)

        return self._task_manager.get_task()
Ejemplo n.º 27
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']['keyword']
    monitor_type = url_info['remark']['monitor_type']
    official_accounts_id = remark
    retry_times = url_info['retry_times']

    headers = {
        "Host": "weixin.sogou.com",
        "Connection": "keep-alive",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cookie": "ABTEST=8|1506658658|v1; IPLOC=CN1100; SUID=C5C47C7B642E940A0000000059CDC962; SUID=C5C47C7B1508990A0000000059CDC963; weixinIndexVisited=1; SUV=00F95AA57B7CC4C559CDC963CE316529; SNUID=2B2A9295EDE8B7A2BCECB605EE30F1BE; JSESSIONID=aaadcwpP9yaKs-PCMhz6v",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1"
    }

    # Get a proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()

    # Parse
    # print(proxies)
    # html, r = tools.get_html_by_requests('http://ip.chinaz.com/getip.aspx', headers = headers, proxies = proxies)
    # print(html)

    html, request = tools.get_html_by_requests(root_url, headers = headers, proxies = proxies)
    if not html:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # print(html)
    regex = '<input type=text name="c" value="" placeholder="(.*?)" id="seccodeInput">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    print(root_url)
    log.debug('取文章链接 %s' % check_info)

    if check_info:
        base_parser.update_url('urls', root_url, Constance.TODO, retry_times + 1)
        return

    # Official-account info block
    regex = '<!-- a -->(.*?)<!-- z -->'
    account_block = tools.get_info(html, regex, fetch_one = True)
    # url
    regex = '<a.*?account_name.*?href="(.*?)">'
    account_url = tools.get_info(account_block, regex, fetch_one = True)
    if not account_url:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    account_url = account_url.replace('&amp;', "&")
    log.debug('account_url = ' + account_url)

    headers = {
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
        "Upgrade-Insecure-Requests": "1",
        "Connection": "keep-alive"
    }

    # Proxy
    proxies = base_parser.get_proxies()
    headers["User-Agent"] = base_parser.get_user_agent()
    proxies = {}  # Using a proxy triggers verification codes, so it is disabled for now

    html, request = tools.get_html_by_requests(account_url, headers = headers, proxies = proxies)
    regex = '<input class="weui_input frm_input" id="input" placeholder="(.*?)" maxlength="4">'
    check_info = tools.get_info(html, regex, fetch_one = True)
    log.debug('''
        取文章详细内容 %s
        url %s
        request.headers %s
        '''%(check_info, account_url, request.headers))
    # print(html)

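    # The article list is embedded in the page as a JS variable: var msgList = {...}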
    regex = 'var msgList = (.*?});'
    article_json = tools.get_info(html, regex, fetch_one = True)
    article_json = tools.get_json(article_json)

    article_list = article_json.get('list', [])
    for article in article_list:
        title = tools.get_json_value(article, 'app_msg_ext_info.title')
        is_have = mongodb.find('WWA_wechat_article', {'title' : title})
        if is_have:
            log.debug(title + " 已存在")
            continue

        summary = tools.get_json_value(article, 'app_msg_ext_info.digest')
        image_url = tools.get_json_value(article, 'app_msg_ext_info.cover')

        sexy_image_url = []

        # Download the cover image
        local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
            date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                date_format='%Y%m%d%H%M%S.%f') + '.jpg'
        is_download = tools.download_file(image_url, local_image_url)
        local_image_url = local_image_url if is_download else ''
        sexy_image_url.append(local_image_url)

        article_url = tools.get_json_value(article, 'app_msg_ext_info.content_url')
        article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
        article_url = article_url.replace('&amp;',"&")

        release_time = tools.get_json_value(article, 'comm_msg_info.datetime')
        release_time = tools.timestamp_to_date(int(release_time)) if release_time else ''

        content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
        regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
        content = tools.get_info(content_html, regex, fetch_one = True)

        # Extract the images in the content, download them, then replace the original image URLs
        regex = '<img.*?data-src="(.*?)"'
        images = tools.get_info(content, regex)
        for image in images:
            # Derive the image extension from the wx_fmt query parameter, defaulting to jpg
            fmt_start = image.find('wx_fmt=')
            image_ext = image[fmt_start + len('wx_fmt='):].split('&')[0] if fmt_start != -1 else 'jpg'
            local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.' + image_ext
            is_download = tools.download_file(image, local_image_path)
            if is_download:
                content = content.replace(image, local_image_path)
                sexy_image_url.append(local_image_path)
            tools.delay_time(5)

        # Sensitive events
        sensitive_id = ''
        if monitor_type == 1 or monitor_type == 2:
            sensitive_event_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_sensitive_event t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[1].split(',') if sensitive_event_info[1] else []
                keyword2 = sensitive_event_info[2].split(',') if sensitive_event_info[2] else []
                keyword3 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

        # Violation events
        violate_id = ''
        if monitor_type == 0 or monitor_type == 2:
            vioation_knowledge_infos = oracledb.find('select t.id, t.keyword1, t.keyword2, t.keyword3 from tab_mvms_violation_knowledge t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[1].split(',') if vioation_knowledge_info[1] else []
                keyword2 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword3 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

        log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

        base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

        # Articles published on the same day
        oneday_article_list = article.get('app_msg_ext_info', {}).get('multi_app_msg_item_list', [])
        for article in oneday_article_list:
            title = tools.get_json_value(article, 'title')
            summary = tools.get_json_value(article, 'digest')
            image_url = tools.get_json_value(article, 'cover')

            sexy_image_url = []

            # Download the cover image
            local_image_url = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                    date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            is_download = tools.download_file(image_url, local_image_url)
            local_image_url = local_image_url if is_download else ''
            sexy_image_url.append(local_image_url)

            article_url = tools.get_json_value(article, 'content_url')
            article_url = tools.get_full_url('http://mp.weixin.qq.com', article_url)
            article_url = article_url.replace('&amp;',"&")

            content_html, request = tools.get_html_by_requests(article_url, headers = headers, proxies = proxies)
            regex = '(<div class="rich_media_content " id="js_content">.*?)<script nonce'
            content = tools.get_info(content_html, regex, fetch_one = True)

            # Extract the images in the content, download them, then replace the original image URLs
            regex = '<img.*?data-src="(.*?)"'
            images = tools.get_info(content, regex)
            for image in images:
                # Derive the image extension from the wx_fmt query parameter, defaulting to jpg
                fmt_start = image.find('wx_fmt=')
                image_ext = image[fmt_start + len('wx_fmt='):].split('&')[0] if fmt_start != -1 else 'jpg'
                local_image_path = FILE_LOCAL_PATH + 'images/' + tools.get_current_date(
                    date_format='%Y-%m-%d') + "/" + tools.get_current_date(
                        date_format='%Y%m%d%H%M%S.%f') + '.' + image_ext
                is_download = tools.download_file(image, local_image_path)
                if is_download:
                    content = content.replace(image, local_image_path)
                    sexy_image_url.append(local_image_path)
                tools.delay_time(5)

            # Sensitive events
            sensitive_id = ''
            sensitive_event_infos = oracledb.find('select * from tab_mvms_sensitive_event')
            for sensitive_event_info in sensitive_event_infos:
                _id = sensitive_event_info[0]
                keyword1 = sensitive_event_info[3].split(',') if sensitive_event_info[3] else []
                keyword2 = sensitive_event_info[4].split(',') if sensitive_event_info[4] else []
                keyword3 = sensitive_event_info[5].split(',') if sensitive_event_info[5] else []

                if base_parser.is_violate(title + content, key1 = keyword1, key2 = keyword2, key3 = keyword3):
                    sensitive_id = _id
                    break

            # Violation events
            violate_id = ''
            vioation_knowledge_infos = oracledb.find('select * from tab_mvms_violation_knowledge')
            for vioation_knowledge_info in vioation_knowledge_infos:
                _id = vioation_knowledge_info[0]
                keyword1 = vioation_knowledge_info[2].split(',') if vioation_knowledge_info[2] else []
                keyword2 = vioation_knowledge_info[3].split(',') if vioation_knowledge_info[3] else []
                keyword3 = vioation_knowledge_info[4].split(',') if vioation_knowledge_info[4] else []

                if base_parser.is_violate(title + tools.del_html_tag(content), key1=keyword1, key2=keyword2, key3=keyword3):
                    violate_id = _id
                    break

            log.debug('''
            标题         %s
            简介         %s
            图片地址     %s
            文章地址     %s
            发布时间     %s
            内容         %s
            本地贴图地址 %s
            违规状态     %s
            敏感事件     %s
            图片鉴别地址 %s
            '''%(title, summary, image_url, article_url, release_time, content, local_image_url, violate_id, sensitive_id, sexy_image_url))

            base_parser.add_wechat_content_info('WWA_wechat_article', site_id, official_accounts_id, title, summary, image_url, article_url, release_time, content, video_url = '', local_image_url = local_image_url, violate_status = violate_id, sensitive_id = sensitive_id, sexy_image_url = sexy_image_url)

    base_parser.update_url('WWA_wechat_article_url', root_url, Constance.DONE)
    tools.delay_time()
Ejemplo n.º 28
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Parse
    html, request = tools.get_html_by_requests(root_url, headers=HEADER)

    if not html:
        base_parser.update_url('urls', root_url, Constance.EXCEPTION)
        return

    news_box = tools.get_tag(html, name='div', attrs={'class': "news-box"})[0]

    news_list = tools.get_tag(news_box, name='li')
    for news in news_list:
        try:
            # Image
            image = tools.get_tag(news, name='img')[0]
            image = tools.get_json_value(image, 'src')

            # url
            url = tools.get_tag(news, name='h3')[0]
            try:
                url = tools.get_json_value(url.a, 'href')
            except:
                url = ''

            # Title
            title = tools.get_tag(news, name='h3')[0]
            title = tools.get_text(title)
            title = tools.del_html_tag(title)

            # Content
            content = tools.get_tag(news,
                                    name='p',
                                    attrs={'class': "txt-info"})[0]
            content = tools.get_text(content)
            content = tools.del_html_tag(content)

            # View count
            watched_count = ''

            # Source
            origin = tools.get_tag(news, name='div', attrs={'class': "s-p"})[0]
            origin = ''.join(tools.get_info(origin, '<a.*?>(.*?)<'))

            # Date
            release_time = tools.get_tag(news,
                                         name='div',
                                         attrs={'class': "s-p"})[0]
            release_time = tools.get_json_value(release_time, 't')
            release_time = tools.timestamp_to_date(int(release_time))

            # Determine whether there is a video, based on the play icon
            regex = '<div class="img-box">.*?<i></i>.*?</div>'
            play_icon = tools.get_info(news, regex)

        except:
            continue

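        # Check whether the title/content contain the configured search keyword groups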
        contained_key, contained_key_count = base_parser.get_contained_key(
            title, content, remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])

        log.debug(
            '''
            标题:   %s
            内容:   %s
            来源:   %s
            原文url:%s
            图片url:%s
            观看数: %s
            日期:   %s
            有视频: %d
            关键词: %s
            关键词数:%s
                  ''' %
            (title, content, origin, url, image, watched_count, release_time,
             play_icon and True or False, contained_key, contained_key_count))

        if not contained_key or not play_icon:
            continue

        base_parser.add_content_info('VA_content_info',
                                     SITE_ID,
                                     url,
                                     title,
                                     content,
                                     image_url=image,
                                     release_time=release_time,
                                     origin=origin,
                                     watched_count=watched_count,
                                     search_type=SEARCH_TYPE,
                                     keyword=contained_key,
                                     keyword_count=contained_key_count)

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
Ejemplo n.º 29
    def __parse_article_list(self, article_list):
        '''
        @summary: Parse the article list
        ---------
        @param article_list: article list info, str
        {
            "list":[
                {
                    "comm_msg_info":{
                        "id":1000000513,
                        "type":49,
                        "datetime":1511354167,
                        "fakeid":"3082125093",
                        "status":2,
                        "content":""
                    },
                    "app_msg_ext_info":{
                        "title":"Python 内存优化",
                        "digest":"实际项目中,pythoner更加关注的是Python的性能问题。本文,关注的是Python的内存优化,一般说来,如果不发生内存泄露,运行在服务端的Python代码不用太关心内存,但是如果运行在客户端,那还是有优化的必要。",
                        "content":"",
                        "fileid":505083208,
                        "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&amp;mid=2652566858&amp;idx=1&amp;sn=d2a76f4a601f94d8acc7b436d18e9648&amp;chksm=8464dd00b313541684c14f974325ea6ae725ffc901fd9888cc00d1acdd13619de3297a5d9a35&amp;scene=27#wechat_redirect",
                        "source_url":"http:\/\/www.cnblogs.com\/xybaby\/p\/7488216.html",
                        "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQVbRHy3FhzwMHEvCvtzXVicHTaPEu8jZ2pgkCAgBqEHugYMvzg3tpoww\/0?wx_fmt=jpeg",
                        "subtype":9,
                        "is_multi":1,
                        "multi_app_msg_item_list":[
                            {
                                "title":"面向对象:With the wonder of your love, the sun above always shines",
                                "digest":"With the wonder of your love, the sun above always shines",
                                "content":"",
                                "fileid":505083209,
                                "content_url":"http:\/\/mp.weixin.qq.com\/s?__biz=MzA4MjEyNTA5Mw==&amp;mid=2652566858&amp;idx=2&amp;sn=97f223783da7748080f8103654447c99&amp;chksm=8464dd00b313541601938565a41487ea76209331fd6f4c8996a2ff5572f4fd465de9fa4cbaac&amp;scene=27#wechat_redirect",
                                "source_url":"https:\/\/mp.weixin.qq.com\/s\/_uD9jY4nXQQ6CtA__dsN8w?scene=25#wechat_redirect",
                                "cover":"http:\/\/mmbiz.qpic.cn\/mmbiz_jpg\/fhujzoQe7TpODTuicia4geCiaIj1AbZwVQQ5ukvwH1GPq5zlWxv05WvRiaw6BiaeyGRD1w17nAPGTlQgEvvDuZnB9HA\/0?wx_fmt=jpeg",
                                "author":"",
                                "copyright_stat":101,
                                "del_flag":1
                            }
                        ],
                        "author":"",
                        "copyright_stat":100,
                        "del_flag":1
                    }
                }
            ]
        }
        ---------
        @result:
        '''

        # log.debug(tools.dumps_json(article_list))

        # Parse the article info from the JSON content
        def parse_article_info(article_info, release_time):
            if not article_info:
                return

            # log.debug(tools.dumps_json(article_info))
            title = article_info.get('title')
            summary = article_info.get('digest')
            url = (article_info.get('content_url') or '').replace(
                '\\', '').replace('amp;', '')
            source_url = (article_info.get('source_url')
                          or '').replace('\\', '')  # URL of the referenced article
            cover = (article_info.get('cover') or '').replace('\\', '')
            author = article_info.get('author')
            if url and url.startswith(
                    'http://mp.weixin.qq.com/'
            ):  # Articles deleted by their publisher have no url or other info, so mid cannot be taken and they are skipped; mall-type urls are skipped as well
                mid = tools.get_param(url, 'mid') or tools.get_param(
                    url, 'appmsgid')  # id of the image-text message; messages published the same day share one id
                idx = tools.get_param(url, 'idx') or tools.get_param(
                    url, 'itemidx')  # index of the message within that day, starting from 1
                article_id = mid + idx  # concatenating mid and idx identifies a unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601

                # Skip if the article already exists in the library, or (with ONLY_TODAY_MSG) if it was released before today
                if WechatAction._wechat_service.is_exist(
                        'wechat_article',
                        article_id) or (ONLY_TODAY_MSG and release_time <
                                        tools.get_current_date('%Y-%m-%d')):
                    self._is_need_get_more = False
                    return  # stop here and discard the remaining articles

                __biz = tools.get_param(url, '__biz')  # used to associate the article with its official account

                # Cache the article info
                WechatAction._article_info[article_id] = {
                    'article_id': int(article_id),
                    'title': title,
                    'summary': summary,
                    'release_time': release_time,
                    'url': url,
                    'source_url': source_url,
                    'cover': cover,
                    'account': '',
                    'author': author,
                    '__biz': __biz,
                    'read_num': None,
                    'like_num': None,
                    'content': '',
                    'comment': [],
                    'record_time': tools.get_current_date()
                }

                # Add the article url to the queue of urls to crawl
                WechatAction._todo_urls.append(url)

        # log.debug(tools.dumps_json(article_list))
        article_list = tools.get_json(article_list)

        article_list = article_list.get('list', [])
        for article in article_list:
            article_type = article.get('comm_msg_info', {}).get('type')
            if article_type != 49:  # 49 is the common image-text message; text, voice and video messages are not collected here because their formats differ
                continue

            release_time = article.get('comm_msg_info', {}).get('datetime')
            release_time = tools.timestamp_to_date(release_time)

            # An official account can publish several image-text messages at once
            # The first image-text message
            app_msg_ext_info = article.get('app_msg_ext_info', {})
            parse_article_info(app_msg_ext_info, release_time)

            if not self._is_need_get_more:
                break

            # Image-text messages attached on the same day
            multi_app_msg_item_list = app_msg_ext_info.get(
                'multi_app_msg_item_list') or []
            for multi_app_msg_item in multi_app_msg_item_list:
                parse_article_info(multi_app_msg_item, release_time)

                if not self._is_need_get_more:
                    break
Ejemplo n.º 30
def parser_comment_article(html, video_id, program_id, url, page=1):
    '''
    @summary: Comment section, e.g. http://www.iqiyi.com/a_19rrhcvhph.html
    ---------
    @param html:
    @param video_id:
    ---------
    @result:
    '''
    regex = 'data-qitancomment-tvid="(.*?)"'
    tvid = tools.get_info(html, regex, fetch_one=True)

    regex = 'data-qitancomment-qitanid="(.*?)"'
    aid = tools.get_info(html, regex, fetch_one=True)

    if not tvid and not aid:
        return

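    # iQIYI comment list API; results are paginated via the page parameter (page_size=10)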
    comment_url = 'http://api-t.iqiyi.com/qx_api/comment/get_video_comments?aid={aid}&albumid={video_id}&categoryid=15&cb=fnsucc&escape=true&is_video_page=true&need_reply=true&need_subject=true&need_total=1&page={page}&page_size=10&page_size_reply=3&qitan_comment_type=1&qitanid={aid}&qypid=01010011010000000000&reply_sort=hot&sort=add_time&tvid={tvid}'.format(
        aid=aid, video_id=video_id, tvid=tvid, page=page)

    comment_json = tools.get_json_by_requests(comment_url)
    comments = comment_json.get('data', {}).get('comments', [])
    for comment in comments:
        article_id = comment.get('contentId')
        title = comment.get('title')
        content = comment.get('content')
        image_urls = None

        release_time = comment.get('addTime')
        release_time = tools.timestamp_to_date(release_time)

        up_count = comment.get('counterList', {}).get('likes')
        watch_count = comment.get('counterList', {}).get('reads')
        comment_count = comment.get('counterList', {}).get('replies')

        name = comment.get('userInfo', {}).get('uname')
        head_url = comment.get('userInfo', {}).get('icon')
        gender = int(comment.get('userInfo', {}).get('gender') or 0)

        log.debug('''
            id:       %s
            节目id     %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, '', watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id,
                                        head_url,
                                        name,
                                        release_time,
                                        title,
                                        content,
                                        image_urls,
                                        watch_count,
                                        up_count,
                                        comment_count,
                                        program_id=program_id,
                                        gender=gender,
                                        url=url,
                                        info_type=3,
                                        emotion=random.randint(0, 2),
                                        collect=0,
                                        source='爱奇艺'):
            # Parse the replies
            reply_list = comment.get('replyList') or []
            parser_relpy_comment(reply_list)
        else:
            break

    else:
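        # for/else: reached only when the loop finished without break; recurse to fetch the next page of comments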
        if comments:
            parser_comment_article(html, video_id, program_id, url, page + 1)