コード例 #1
0
def parser_program(url_info):
    """Parse an MGTV program-list page and queue each program's episode list.

    For every ``v-item-v5`` list item on the page this extracts the program
    id, cover image, episode label, title and actors, downloads the program
    detail page to read the summary, stores the program through
    ``base_parser.add_program_info`` and registers the episode-list API URL
    (depth 1) through ``base_parser.add_url``.

    Args:
        url_info: Task record. The keys ``_id``, ``url`` and ``site_id`` are
            read. ``_id`` is replaced in place by its string form before the
            record is dumped to the log.

    Returns:
        None. The root URL is marked ``Constance.EXCEPTION`` when the page
        cannot be downloaded and ``Constance.DONE`` otherwise.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    site_id = url_info['site_id']

    html, _ = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    video_blocks = tools.get_info(html, '<li class="v-item-v5.*?">(.*?)</li>')
    for video_block in video_blocks:
        link = tools.get_info(video_block, '<a class="u-video" href="(.*?)"', fetch_one = True)

        # The program id is the path segment between "b/" and the last "/".
        # Skip the item when it cannot be extracted instead of building
        # URLs from an empty or garbage id.
        id_start = link.find('b/') if link else -1
        if id_start == -1:
            continue
        program_id = link[id_start + 2 : link.rfind('/')]
        if not program_id:
            continue
        program_url = 'http://www.mgtv.com/h/%s.html'%program_id

        image_url = tools.get_info(video_block, '<img class="u-image" src="(.*?)"', fetch_one = True)
        episode = tools.get_info(video_block, 'em class="u-time">(.*?)</em>', fetch_one = True)
        title = tools.get_info(video_block, '<a class="u-title".*?>(.*?)</a>', fetch_one = True)

        # Actors are the link texts inside the description span.
        actors_block = tools.get_info(video_block, '<span class="u-desc">(.*?)</span>', fetch_one = True)
        actors = tools.get_info(actors_block, '<a .*?>(.*?)</a>')
        actors = '/'.join(actors) if actors else '暂无'

        # The summary only exists on the program's detail page.
        detail_html, _ = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one = True) if detail_html else ''

        log.debug('''
            program_url %s
            image_url   %s
            episode     %s
            title       %s
            actors      %s
            summary     %s
            '''%(program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info('PROGRAM_info', site_id, title, program_url, image_url, episode, directors = '', actors = actors, summary = summary, release_time = '')

        # Episode-list endpoint; without a month parameter it returns the
        # data of the most recent month.
        # NOTE(review): this call uses the module-level SITE_ID while
        # add_program_info above uses url_info['site_id'] -- confirm the two
        # are always the same value.
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth = 1, remark = {'program_mongo_id' : program_mongo_id, 'program_id' : program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
コード例 #2
0
def parser(url_info):
    """Parse a ku6 program page and store the program and its episodes.

    The page embeds the URL of a JSONP episode list in a ``var url = "..."``
    statement. That list is downloaded, the program is stored through
    ``base_parser.add_program_info`` and every episode for which a download
    address can be resolved is stored through
    ``base_parser.add_program_episode_info``.

    Args:
        url_info: Task record. The keys ``_id``, ``url``, ``depth`` and
            ``site_id`` are read. ``_id`` is replaced in place by its string
            form before the record is dumped to the log.

    Returns:
        None. The source URL is marked ``Constance.EXCEPTION`` when the page
        cannot be downloaded and ``Constance.DONE`` otherwise.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    def first(matches):
        """Return the first match, or '' when there is none or it is empty."""
        return (matches[0] or '') if matches else ''

    html, _ = tools.get_html_by_requests(source_url, code='GBK')
    if not html:
        base_parser.update_url('PROGRAM_urls', source_url, Constance.EXCEPTION)
        return

    # Episode-list endpoint embedded in the page; a JSONP callback name with
    # the current timestamp is appended and the JS escaping backslashes are
    # stripped.
    episode_list_json_url = first(tools.get_info(html, 'var url = "(.*?)"'))
    episode_list_json_url = episode_list_json_url + '&cb=jsonp' + str(
        int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")

    # Title
    program_name = first(
        tools.get_info(html, '<meta name="keywords" content="(.*?)" />'))
    program_url = source_url

    episode_list_json_html, _ = tools.get_html_by_requests(
        episode_list_json_url)

    # Unwrap the JSONP callback to get at the JSON payload.
    episode_list_json = first(
        tools.get_info(episode_list_json_html, r'jsonp\d*?\((.*)\)'))
    episode_list_json = tools.dumps_json(episode_list_json)

    # Fall back to an empty list so a missing "data.list" does not crash.
    episode_list_json_value_list = tools.get_json_value(
        episode_list_json, 'data.list') or []

    episode = len(episode_list_json_value_list)

    summary = ''

    log.debug('''
                    depth                       = %s
                    program_name                = %s
                    program_url                 = %s
                    episode                     = %s
                    summary                     = %s

                 ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info('PROGRAM_info',
                                              site_id,
                                              program_name,
                                              program_url,
                                              image_url='',
                                              episode=episode,
                                              directors='',
                                              actors='',
                                              summary=summary,
                                              release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')
        episode_image_url = tools.get_json_value(episode_info, 'picurl')
        episode_url = tools.get_json_value(episode_info, 'podurl')
        episode_summary = tools.get_json_value(episode_info, 'desc')

        # The issue number is taken from the title ("第N期").
        episode_num = first(tools.get_info(episode_name, r'第(\d*?)期'))
        if episode_num:
            episode_num = '第' + episode_num + '期'

        # Resolve the download address through the player API.
        download_url_json_str = tools.get_json_value(episode_info, 'vid')
        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''

        # Only episodes with a resolvable download address are stored.
        if download_url:
            log.debug('''
                                depth                       = %s
                                episode_num                 = %s
                                time_length                 = %s
                                episode_name                = %s
                                episode_url                 = %s
                                download_url                = %s
                                episode_summary             = %s
                                episode_image_url           = %s

                             ''' % (depth + 1, episode_num, time_length,
                                    episode_name, episode_url, download_url,
                                    episode_summary, episode_image_url))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, episode_image_url, '')

    # Mark source_url as done.
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
コード例 #3
0
def parser(url_info):
    """Parse a Sohu program-list page and store every program with its episodes.

    For each ``<li>`` of the ``st-list`` block the program's detail page is
    downloaded and the program is stored through
    ``base_parser.add_program_info``. Episodes are then collected in one of
    two ways:

    * when the detail page lists years (``v-year``), from the
      ``VideoServlet`` JSON endpoint, once per year;
    * otherwise, from the plot section embedded in the detail page.

    Args:
        url_info: Task record. The keys ``_id``, ``url``, ``depth`` and
            ``site_id`` are read. ``_id`` is replaced in place by its string
            form before the record is dumped to the log.

    Returns:
        None. The source URL is marked ``Constance.EXCEPTION`` when the list
        page cannot be downloaded and ``Constance.DONE`` otherwise.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    def first(matches):
        """Return the first match, or '' when there is none or it is empty."""
        return (matches[0] or '') if matches else ''

    def with_scheme(url):
        """Prefix a protocol-relative URL with 'http:'; keep '' as ''."""
        return "http:" + url if url else ''

    html, _ = tools.get_html_by_requests(source_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', source_url, Constance.EXCEPTION)
        return

    lis = tools.get_info(html, '<ul class="st-list cfix">(.*?)<div class="ssPages area">')
    html_lis = tools.get_info(lis, '<li>(.*?)</li>')
    for html_li in html_lis:
        # URL of the program; an item without a link cannot be processed.
        url = first(tools.get_info(html_li, '<a href="(.*?)"'))
        if not url:
            continue
        program_url = "http:" + url

        # The detail page is downloaded once and reused for the summary,
        # the playlist id and the year list below.
        everyone_html, _ = tools.get_html_by_requests(program_url)

        # Program name
        program_name = first(tools.get_info(html_li, '<h3 class="lh-tit">.*?<a.*?>(.*?)</a>'))

        # Program release time
        release_time = first(tools.get_info(html_li, '发布时间:(.*?)</p>'))

        # Program summary
        summary = first(tools.get_info(
            everyone_html, '<span class="full_intro" style="display: none">(.*?)</span>'))
        summary = tools.del_html_tag(summary)

        # Program image
        image_url = with_scheme(first(tools.get_info(html_li, '<img.*?src="(.*?)".*?>')))

        program_id = base_parser.add_program_info('PROGRAM_info', site_id, program_name, program_url,
                                                  image_url=image_url,
                                                  episode='', directors='', actors='', summary=summary,
                                                  release_time=release_time)

        # Parameters of the per-episode JSON endpoint: playlistId and year.
        playlistId = ''.join(tools.get_info(everyone_html, r'var playlistId="(\d*?)";'))

        variety_years_html = tools.get_info(everyone_html, '<li class="v-year">(.*?)</li>')
        variety_years = tools.get_info(variety_years_html, r'<em>(\d*?)</em>')

        if playlistId and variety_years:
            for variety_year in variety_years:
                episode_json_url = 'http://tv.sohu.com/item/VideoServlet?callback=&source=sohu&id=' + \
                                   playlistId + '&year=' + variety_year + '&month=0&page=1'
                episode_json = tools.get_json_by_requests(episode_json_url)
                # Fall back to an empty list so a missing "videos" key does
                # not crash the loop.
                episode_json_infos = tools.get_json_value(episode_json, 'videos') or []

                for episode_json_info in episode_json_infos:
                    # Episode summary
                    episode_summary = tools.get_json_value(episode_json_info, 'videoDesc')
                    # Episode name
                    episode_name = tools.get_json_value(episode_json_info, 'title')
                    # Episode url
                    episode_url = tools.get_json_value(episode_json_info, 'url')
                    # Download addresses, joined with '^_^'
                    episode_download_url = you_get.get_video_url(episode_url)
                    if episode_download_url:
                        episode_download_url = '^_^'.join(episode_download_url)
                    # In this branch episodes without a download address
                    # are not stored.
                    if not episode_download_url:
                        continue
                    # Episode image
                    episode_image_url = tools.get_json_value(episode_json_info, 'pic10')
                    # Current episode number (the show date is used)
                    episode_num = tools.get_json_value(episode_json_info, 'showDate')

                    download_status = 102
                    time_length = ''

                    base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode_num, time_length, episode_name, download_status,
                                                 episode_download_url, episode_url, episode_summary, episode_image_url, sto_path='')
        elif playlistId:
            regexs = '<!-- start : juqing title -->(.*?)<!-- end : plot content -->'
            episode_infos = tools.get_info(everyone_html, regexs)
            for episode_info in episode_infos:
                # Episode name
                episode_name = first(tools.get_info(
                    episode_info, '<h4><.*?>(.*?)<span></span></a></h4>'))

                # Summary
                episode_summary = first(tools.get_info(
                    episode_info, '<p class="intro synopsis text">(.*?)</p>'))
                episode_summary = tools.del_html_tag(episode_summary)

                # Image url
                episode_image_url = with_scheme(first(tools.get_info(
                    episode_info, '<img src="(.*?)" width=".*?" height=".*?"')))

                # Episode number
                # NOTE(review): the pattern hard-codes one program title
                # ("画心师 第一季"); for any other program it will not match
                # and episode_num stays ''.
                episode_num = first(tools.get_info(
                    episode_info, '<h4><a href=.*?>画心师 第一季(.*?)<span></span></a></h4>'))

                # Episode url
                episode_url = with_scheme(first(tools.get_info(
                    episode_info, '<h4><a href="(.*?)" title=".*?" target="_blank">')))

                # Download addresses, joined with '^_^'
                episode_download_url = you_get.get_video_url(episode_url)
                if episode_download_url:
                    episode_download_url = '^_^'.join(episode_download_url)

                download_status = 102

                time_length = ''
                if episode_download_url:
                    log.debug('''
                                           depth                       = %s
                                           episode_num                 = %s
                                           time_length                 = %s
                                           episode_name                = %s
                                           episode_url                 = %s
                                           episode_download_url        = %s
                                           episode_summary             = %s
                                           episode_image_url           = %s

                                        ''' % (
                    depth+1, episode_num, time_length, episode_name, episode_url, episode_download_url, episode_summary,
                    episode_image_url))

                # NOTE(review): unlike the year-based branch above, the
                # episode is stored here even when no download address was
                # found -- confirm whether that is intended.
                base_parser.add_program_episode_info('PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                                                     time_length, episode_name, download_status,
                                                     episode_download_url, episode_url, episode_summary,
                                                     episode_image_url, sto_path='')

    # Mark source_url as done.
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
コード例 #4
0
def parser(url_info):
    """Store the fun.tv program "风行星风范" and the episodes of the media in the URL.

    The program metadata (name, actors, release time, cover, page URL) is
    hard-coded for media id 98097. Only the episode count is read from the
    funshion episode API. When the task URL is a ``fun.tv/vplay/...g-<id>``
    URL, the episodes of that media id are read from the same API and stored
    through ``base_parser.add_program_episode_info``.

    Args:
        url_info: Task record. The keys ``_id``, ``url``, ``depth`` and
            ``site_id`` are read. ``_id`` is replaced in place by its string
            form before the record is dumped to the log.

    Returns:
        None. The source URL is always marked ``Constance.DONE``.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    episode_api = 'http://pm.funshion.com/v5/media/episode?id=%s&cl=aphone&uc=5'

    def episodes_of(json_info):
        """Return the 'episodes' list of an API response, or [] if missing."""
        if not json_info:
            return []
        return tools.get_json_value(json_info, 'episodes') or []

    program_name = '风行星风范'
    actors = '姜武,秦海璐,黄海波,柳岩'
    release_time = '2011年07月23日'
    program_url = 'http://www.fun.tv/vplay/g-98097/'
    summary = ''
    image_url = 'http://img3.funshion.com/sdw?oid=cc09e4ab792d4008d86efcbbbf4c55dc&w=200&h=280'
    default_media_id = '98097'

    # Total number of episodes: the length of the "episodes" list, not the
    # number of top-level keys of the response.
    program_json = tools.get_json_by_requests(episode_api % default_media_id)
    episode = len(episodes_of(program_json))

    log.debug('''
                    depth                       = %s
                    program_name                = %s
                    program_url                 = %s
                    episode                     = %s
                    summary                     = %s
                    image_url                   = %s

                 ''' %
              (depth, program_name, program_url, episode, summary, image_url))

    program_id = base_parser.add_program_info('PROGRAM_info',
                                              site_id,
                                              program_name,
                                              program_url,
                                              image_url=image_url,
                                              episode=episode,
                                              directors='',
                                              actors=actors,
                                              summary='',
                                              release_time=release_time)

    # One numeric pattern both tests the URL and extracts the id, so the id
    # can never be None once the URL has been accepted.
    id_match = re.match(r'http://www.fun.tv/vplay/.*g-(\d+)', source_url)
    if id_match:
        media_id = id_match.group(1)
        # Reuse the response fetched above when it is for the same media.
        if media_id == default_media_id:
            json_info = program_json
        else:
            json_info = tools.get_json_by_requests(episode_api % media_id)

        for json_episode_info in episodes_of(json_info):
            episode_name = tools.get_json_value(json_episode_info, 'name')

            image_episode_info = tools.get_json_value(json_episode_info,
                                                      'still')

            episode_num = tools.get_json_value(json_episode_info, 'num')

            # Share URL of the episode, built from the media id of the task
            # URL and the episode number.
            episode_url = 'http://pm.funshion.com/v5/media/share?id=%s&num=%s' % (
                media_id, episode_num)

            time_length = ''
            episode_summary = ''
            download_status = ''

            download_url = ''

            log.debug(
                '''
                                    depth                       = %s
                                    episode_num                 = %s
                                    time_length                 = %s
                                    episode_name                = %s
                                    episode_url                 = %s
                                    download_url                = %s
                                    episode_summary             = %s
                                    image_episode_info          = %s

                                 ''' %
                (depth, episode_num, time_length, episode_name, episode_url,
                 download_url, episode_summary, image_episode_info))
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                time_length, episode_name, download_status, download_url,
                episode_url, episode_summary, image_episode_info, '')

    # Mark source_url as done.
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
コード例 #5
0
def parser_program_info(url_info):
    """Parse a program detail page and queue the URLs of its episodes.

    The program's name, cover, episode count, directors, actors, release
    time and summary are extracted from the page and stored through
    ``base_parser.add_program_info``. Depending on ``remark['classify']``
    the episode sources are then registered at depth 2:

    * ``'综艺'`` (variety show): one ``s.video.qq.com/get_playsource`` JSON
      URL per (year, month) combination found on the page;
    * ``'电视剧'`` (TV series): one URL per episode link found on the page.

    Args:
        url_info: Task record. The keys ``url``, ``site_id`` and ``remark``
            are read; ``remark`` must contain ``program_id`` and
            ``classify``.

    Returns:
        None. The root URL is marked ``Constance.EXCEPTION`` when the page
        cannot be downloaded and ``Constance.DONE`` otherwise.
    """
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    classify = remark['classify']

    html, _ = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    # Title
    regex = '<h1 class="video_title_cn" >.*?>(.*?)<'
    program_name = tools.get_info(html, regex, fetch_one=True)
    # Address
    program_url = root_url
    # Image address
    regex = '<img src="(.*?)".*?_stat="info:poster"/>'
    image_url = tools.get_info(html, regex, fetch_one=True)
    image_url = tools.get_full_url('http://', image_url)
    # Episode count: two alternative labels are tried.
    regex = ['更新期数:</span>.*?>(.*?)</span>', '总集数:</span>.*?>(.*?)</span>']
    episode = tools.get_info(html, regex, fetch_one=True)
    # Directors
    regex = '<span class="director">导演</span>.*?info:actor_name">(.*?)<'
    directors = tools.get_info(html, regex, split='/')
    # Actors; this pattern also matches the directors, which are then
    # removed again.
    # NOTE(review): the removal is substring based, so an actor whose name
    # ends with the director's name would be truncated -- verify.
    regex = '_stat="info:actor_name">(.*?)</span>'
    actors = tools.get_info(html, regex, split='/')
    actors = actors.replace(directors + '/', '') if directors else actors
    # Release time: two alternative labels are tried.
    regex = ['首播时间:</span>.*?>(.*?)<', '出品时间:</span>.*?>(.*?)<']
    release_time = tools.get_info(html, regex, fetch_one=True)
    # Summary
    regex = 'desc_txt">(.*?)</span>'
    summary = tools.get_info(html, regex, fetch_one=True)
    summary = tools.del_html_tag(summary)

    log.debug('''
        program_name    %s
        program_url     %s
        image_url       %s
        episode         %s
        directors       %s
        actors          %s
        release_time    %s
        summary         %s
        ''' % (program_name, program_url, image_url, episode, directors,
               actors, release_time, summary))

    program_mongo_id = base_parser.add_program_info('PROGRAM_info',
                                                    site_id,
                                                    program_name,
                                                    program_url,
                                                    image_url,
                                                    episode,
                                                    directors,
                                                    actors=actors,
                                                    summary=summary,
                                                    release_time=release_time)

    # Work out where the per-episode information lives.

    # Variety show: episodes come from a JSON endpoint per year and month.
    if classify == '综艺':
        # Months
        regex = r'_stat="series:tab".*?>(\d*)月'
        months = tools.get_info(html, regex)

        # Years
        regex = '<div class="year_slt_list">.*?data-value="(.*?)"'
        years = tools.get_info(html, regex)

        for year in years:
            for month in months:
                episode_url = 'http://s.video.qq.com/get_playsource?id=%s&plat=2&type=4&data_type=3&video_type=10&year=%s&month=%s&plname=qq&otype=json' % (
                    program_id, year, month)
                log.debug('%s分集json地址:%s' % (program_name, episode_url))
                base_parser.add_url('PROGRAM_urls',
                                    site_id,
                                    episode_url,
                                    depth=2,
                                    remark={
                                        'program_mongo_id': program_mongo_id,
                                        'classify': '综艺'
                                    })

    # TV series: each match carries (episode url, episode number).
    elif classify == '电视剧':
        regex = 'series:numbtn".*?<a href="(.*?)".*?<span itemprop="episodeNumber">(.*?)</span>'
        episode_msgs = tools.get_info(html, regex)
        for episode_msg in episode_msgs:
            episode_url = episode_msg[0]
            episode_num = episode_msg[1]
            log.debug('''
                episode_url  %s
                episode_num  %s
                ''' % (episode_url, episode_num))
            base_parser.add_url('PROGRAM_urls',
                                site_id,
                                episode_url,
                                depth=2,
                                remark={
                                    'program_mongo_id': program_mongo_id,
                                    'episode_num': episode_num,
                                    'program_name': program_name,
                                    'classify': '电视剧'
                                })

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
コード例 #6
0
def parser(url_info):
    """Parse a Youku show page (depth 0) or a single episode page (depth 1).

    * depth 0: reads the show's name, host, summary and cover from the show
      page, asks the ``ups.youku.com`` API for the video list of the most
      recent video, stores the program through
      ``base_parser.add_program_info`` and queues every video page at
      depth 1 with the program id as ``remark``.
    * depth 1: reads the episode title and play count, asks the same API for
      the stream information and stores the episode through
      ``base_parser.add_program_episode_info``.

    Any other depth is ignored.

    Args:
        url_info: Task record. The keys ``url``, ``depth`` and ``remark``
            are read; at depth 1 ``remark`` is used as the program id.

    Returns:
        None. The URL is marked ``Constance.DONE`` after either branch ran.
    """
    root_url = url_info['url']
    depth = url_info['depth']
    remark = url_info['remark']

    ups_api = 'https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip=&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts='

    def extract_video_id(url):
        """Return the video id of a v_show URL ('id_<id>=' or 'id_<id>.h')."""
        video_id = tools.get_info(url, ['id_(.+?)='], fetch_one=True)
        if not video_id:
            video_id = tools.get_info(url, [r'id_(.+?)\.h'], fetch_one=True)
        return video_id

    if depth == 0:
        html = tools.get_html_by_urllib(root_url)

        # The thumbnail block carries the title, the link to the most
        # recent video and the cover image.
        header_info = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        program_name = header_info.a['title']

        recent_video_url = 'http:' + header_info.a['href']
        recent_video_id = extract_video_id(recent_video_url)

        # The third "p-row" item is read for the host line.
        actors = tools.get_tag(html, 'li', {'class': 'p-row'})[2].get_text()
        actors = ''.join(tools.re.compile('主持人:(.+)').findall(actors))

        summary = tools.get_tag(html, 'span', {'class': 'text'}, find_all=False).get_text()
        summary = ''.join(tools.re.compile('简介:(.+)').findall(summary))

        image_url = header_info.img['src']

        list_url = ups_api % recent_video_id
        list_json = tools.get_json_by_requests(list_url)
        video_list = tools.get_json_value(list_json, 'data.videos.list')

        episode = tools.get_json_value(list_json, 'data.show.episode_total')

        log.debug('''
                      recent_video_url:  %s
                      recent_video_id:   %s
                      集数:              %s
                      主持人:            %s
                      封面地址:          %s
                      专辑地址:          %s
                      简介:              %s
                      节目名称:          %s
                      视频列表:          %s
                      list_url:          %s
                      ''' % (recent_video_url, recent_video_id, episode, actors, image_url, root_url, summary, program_name, video_list, list_url))

        program_id = base_parser.add_program_info('PROGRAM_info', SITE_ID, actors=actors, image_url=image_url, program_url=root_url,
                                     summary=summary, program_name=program_name, episode=episode)

        # Queue every video of the show; a missing list queues nothing.
        for vl in video_list or []:
            vl_id = tools.get_json_value(vl, 'encodevid')
            vl_url = 'http://v.youku.com/v_show/id_%s.html' % vl_id
            base_parser.add_url('PROGRAM_urls', SITE_ID, vl_url, depth=1, remark=program_id)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
    elif depth == 1:
        program_id = remark
        html, _ = tools.get_html_by_requests(root_url)
        episode_name = tools.get_tag(html, 'h1', find_all=False).get_text()

        # Play count ("vv"), with thousands separators removed; '' when the
        # play-info request or the match fails.
        videoId = tools.get_info(html, ['videoId:"(.+?)"'], fetch_one=True)
        play_info, _ = tools.get_html_by_requests('http://v.youku.com/action/getVideoPlayInfo?vid=%s&callback=tuijsonp5'% videoId)
        play_count = tools.get_info(play_info, ['"vv":"(.+?)"'], fetch_one=True) if play_info else ''
        play_count = play_count.replace(',', '') if play_count else ''

        recent_video_id = extract_video_id(root_url)
        list_url = ups_api % recent_video_id
        list_info = tools.get_json_by_requests(list_url)
        stream = tools.get_json_value(list_info, "data.stream")
        # NOTE(review): `layer` is not defined in this function; presumably
        # a module-level index selecting the stream quality -- confirm.
        download_url = stream[layer]['m3u8_url']

        time_length = tools.get_json_value(list_info, "data.video.seconds")

        episode_num = tools.get_json_value(list_info, "data.show.stage")

        image_url = tools.get_json_value(list_info, "data.video.logo")

        log.debug('''
                                     节目id:            %s
                                     当前集数:          %s
                                     本集时长:          %s
                                     播放次数:          %s
                                     节目名称:          %s
                                     下载地址:          %s
                                     节目链接:          %s
                                     图片地址:          %s
                                     ''' % (
                 program_id, episode_num, time_length, play_count, episode_name, download_url, root_url, image_url))

        base_parser.add_program_episode_info('PROGRAM_EPISODE_info', SITE_ID, program_id=program_id, episode_num=episode_num,
                                            time_length=time_length, episode_name=episode_name, download_url=download_url,
                                            episode_url=root_url, image_url=image_url, play_count=play_count)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
コード例 #7
0
# Player API request for one video; filled with (video id, tkey).
_LE_PLAY_JSON_URL = 'http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101&format=1&tkey={}&domain=www.le.com&dvtype=1000&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1'

# Query string appended to the dispatch URL; filled with (random value, rate id).
_LE_DISPATCH_QUERY = "&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051&rateid={}"

# Play-page URL prefix shared by every channel except sports.
_LE_VPLAY_PREFIX = 'http://www.le.com/ptv/vplay/'


def _le_first(values):
    """Return the first element of *values*, or '' if it is empty, None or falsy."""
    if not values:
        return ''
    return values[0] or ''


def _le_get_tkey(t):
    """Compute the ``tkey`` request parameter for the player API from a timestamp.

    The value is rotated right by one bit (within 32 bits) ``key % 17`` times
    (i.e. 8 times for the constant below) and then XOR-ed with the key.
    """
    key = 185025305
    val = t
    for _ in range(key % 17):
        val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
    return val ^ key


def _le_play_json_url(vid):
    """Build the player API URL for video id *vid*, signed with the current time."""
    return _LE_PLAY_JSON_URL.format(vid, _le_get_tkey(int(time.time())))


def _le_get_download_url(play_json_url, play_json=None):
    """Resolve the download location of a video.

    Args:
        play_json_url: player API URL of the video.
        play_json: response of *play_json_url* if the caller already fetched
            it; when empty it is requested here.

    Returns:
        The ``location`` value of the dispatch response.
    """
    if not play_json:
        play_json = tools.get_json_by_requests(play_json_url)

    domain = _le_first(tools.get_json_value(play_json, 'msgs.playurl.domain'))
    dispatch = _le_first(
        tools.get_json_value(play_json, 'msgs.playurl.dispatch.1080p'))

    dispatch_url = domain + dispatch
    dispatch_url += _LE_DISPATCH_QUERY.format(random.random(), '1080p')

    dispatch_json = tools.get_json_by_requests(dispatch_url)
    return tools.get_json_value(dispatch_json, 'location')


def _le_add_episode(site_id, depth, program_id, episode_num, episode_name,
                    download_status, download_url, episode_url,
                    episode_summary, episode_image_url):
    """Log one episode and store it in ``PROGRAM_EPISODE_info``."""
    time_length = ''  # the duration is not extracted by this parser

    log.debug('''
               depth                       = %s
               episode_num                 = %s
               time_length                 = %s
               episode_name                = %s
               episode_url                 = %s
               download_url                = %s
               episode_summary             = %s
               episode_image_url           = %s
            ''' % (depth, episode_num, time_length, episode_name, episode_url,
                   download_url, episode_summary, episode_image_url))

    # The trailing '' fills the last positional argument (presumably the local
    # storage path, since nothing is downloaded here -- confirm against base_parser).
    base_parser.add_program_episode_info(
        'PROGRAM_EPISODE_info', site_id, program_id, episode_num, time_length,
        episode_name, download_status, download_url, episode_url,
        episode_summary, episode_image_url, '')


def _le_crawl_albums(source_url, site_id, depth, program_url_prefix,
                     vids_regex, episode_num_regex, fixed_episode=None):
    """Crawl every page of a channel whose listing JSON has an ``album_list``.

    Each album is stored as a program, and each id in its ``vids`` field as an
    episode of that program.

    Args:
        source_url: listing URL ending in ``_p=``; the page number is appended.
        site_id: site id forwarded to ``base_parser``.
        depth: crawl depth, only used for logging.
        program_url_prefix: prefix of the album page URL.
        vids_regex: regex that splits the comma-terminated ``vids`` string.
        episode_num_regex: regex applied to the episode title.
        fixed_episode: episode count to store; when None it is read from the
            album's ``nowEpisodes`` field.

    Returns:
        False, once a page comes back without albums.
    """
    page = 1
    while True:
        page_json = tools.get_json_by_requests(source_url + str(page))
        albums = tools.get_json_value(page_json, 'album_list')
        if not albums:
            # Checked before iterating so a missing list ends the crawl
            # instead of raising inside the for loop.
            return False

        for info in albums:
            image_url = tools.get_json_value(info, 'images.1080*608')
            program_name = tools.get_json_value(info, 'name')
            program_url = (program_url_prefix +
                           tools.get_json_value(info, 'aid') + '.html')
            if fixed_episode is None:
                episode = tools.get_json_value(info, 'nowEpisodes')
            else:
                episode = fixed_episode

            directors = tools.get_json_value(info, 'directory')
            directors = ','.join(tools.get_json(directors).values())

            actors = tools.get_json_value(info, 'starring')
            actors = ' '.join(actors.values())

            summary = tools.get_json_value(info, 'description')

            # releaseDate is divided by 1000 before being handed to
            # time.localtime, i.e. it is treated as milliseconds.
            release_time = int(tools.get_json_value(info, 'releaseDate')) / 1000
            release_time = time.strftime("%Y-%m-%d",
                                         time.localtime(release_time))

            log.debug('''
                        depth                       = %s
                        program_name                = %s
                        program_url                 = %s
                        image_url                   = %s
                        episode                     = %s
                        directors                   = %s
                        actors                      = %s
                        summary                     = %s
                        release_time                = %s
                     ''' % (depth, program_name, program_url, image_url,
                            episode, directors, actors, summary, release_time))

            program_id = base_parser.add_program_info(
                'PROGRAM_info',
                site_id,
                program_name,
                program_url,
                image_url=image_url,
                episode=episode,
                directors=directors,
                actors=actors,
                summary=summary,
                release_time=release_time)

            vids = tools.get_json_value(info, 'vids') + ','
            for vid in tools.get_info(vids, vids_regex):
                if not vid:
                    # An empty id cannot address a video; skip it.
                    continue

                episode_url = _LE_VPLAY_PREFIX + vid + '.html'
                play_json_url = _le_play_json_url(vid)
                play_json = tools.get_json_by_requests(play_json_url)

                episode_image_url = tools.get_json_value(
                    play_json, 'msgs.playurl.pic')
                episode_name = tools.get_json_value(
                    play_json, 'msgs.playurl.title')
                episode_num = _le_first(
                    tools.get_info(episode_name, episode_num_regex))

                # Reuse the response fetched above instead of requesting the
                # same player JSON a second time.
                download_url = _le_get_download_url(play_json_url, play_json)

                _le_add_episode(site_id, depth, program_id, episode_num,
                                episode_name, '', download_url, episode_url,
                                '', episode_image_url)

        page += 1


def _le_crawl_videos(source_url, site_id, depth, episode_url_prefix,
                     episode_num_regex, read_actors=False):
    """Crawl every page of a channel whose listing JSON has a ``video_list``.

    Each video is stored as an episode; its album (``albumName`` / ``aid``) is
    stored as the owning program with otherwise empty metadata.

    Args:
        source_url: listing URL ending in ``_p=``; the page number is appended.
        site_id: site id forwarded to ``base_parser``.
        depth: crawl depth, only used for logging.
        episode_url_prefix: prefix of the play-page URL.
        episode_num_regex: regex applied to the video name; all matches are
            concatenated.
        read_actors: when True, the values of the video's ``actor`` mapping are
            concatenated into the program's actors field.

    Returns:
        False, once a page comes back without videos.
    """
    # NOTE(review): 102 presumably means "not downloaded" -- confirm against
    # the download status constants used by base_parser.
    download_status = 102

    page = 1
    while True:
        page_json = tools.get_json_by_requests(source_url + str(page))
        videos = tools.get_json_value(page_json, 'video_list')
        if not videos:
            return False

        for info in videos:
            episode_name = tools.get_json_value(info, 'name')
            episode_num = ''.join(
                tools.get_info(episode_name, episode_num_regex))
            episode_summary = tools.get_json_value(info, 'description')
            episode_image_url = tools.get_json_value(info, 'images.1080*608')

            vid = tools.get_json_value(info, 'vid')
            episode_url = episode_url_prefix + vid + '.html'
            download_url = _le_get_download_url(_le_play_json_url(vid))

            program_name = tools.get_json_value(info, 'albumName')
            aid = tools.get_json_value(info, 'aid')
            if read_actors:
                actors = ''.join(tools.get_json_value(info, 'actor').values())
            else:
                actors = ''

            # Program-level metadata is not read from the video listing.
            program_url = ''
            image_url = ''
            episode = ''
            directors = ''
            summary = ''
            release_time = ''

            log.debug('''
                        depth                       = %s
                        program_name                = %s
                        program_url                 = %s
                        image_url                   = %s
                        episode                     = %s
                        directors                   = %s
                        actors                      = %s
                        summary                     = %s
                        release_time                = %s
                        aid                         = %s
                     ''' % (depth, program_name, program_url, image_url,
                            episode, directors, actors, summary, release_time,
                            aid))

            program_id = base_parser.add_program_info(
                'PROGRAM_info',
                site_id,
                program_name,
                program_url,
                image_url=image_url,
                episode=episode,
                directors=directors,
                actors=actors,
                summary=summary,
                release_time=release_time,
                _id=aid)

            _le_add_episode(site_id, depth, program_id, episode_num,
                            episode_name, download_status, download_url,
                            episode_url, episode_summary, episode_image_url)

        page += 1


def parser(url_info):
    """Process one queued le.com URL.

    * depth 0: derive the ``cs`` and ``_o`` values from the URL and queue the
      matching ``chandata.json`` listing URL at depth 1.
    * depth 1: page through the listing and store programs and episodes. The
      channel is selected by the ``cs=`` value contained in the URL.

    Args:
        url_info: dict with at least ``_id``, ``url``, ``depth`` and
            ``site_id``; ``_id`` is converted to ``str`` in place.

    Returns:
        False when a depth-1 listing has been exhausted, otherwise None.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    if depth == 0:
        cs_value = _le_first(tools.get_info(source_url, 'cs(.*?)_'))
        o_value = _le_first(tools.get_info(source_url, 'cs.*?_o(.*?)_p'))
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)

    elif depth == 1:
        # The first matching marker wins. The order matters: 'cs=1' is a
        # substring of 'cs=11', so the movie check has to come last.

        # TV series
        if 'cs=2' in source_url:
            # The previous pattern "(\d*?)" could only match the empty string,
            # so no episode number was ever extracted; take the first run of
            # digits in the title instead.
            return _le_crawl_albums(
                source_url, site_id, depth,
                program_url_prefix='http://www.le.com/tv/',
                vids_regex=r'(\d*?),',
                episode_num_regex=r'(\d+)')

        # Sports
        if 'cs=4' in source_url:
            return _le_crawl_videos(
                source_url, site_id, depth,
                episode_url_prefix='http://sports.le.com/video/',
                episode_num_regex='第(.*?)期')

        # Variety shows
        if 'cs=11' in source_url:
            return _le_crawl_videos(
                source_url, site_id, depth,
                episode_url_prefix=_LE_VPLAY_PREFIX,
                episode_num_regex='第(.*?)期')

        # Music
        if 'cs=9' in source_url:
            return _le_crawl_videos(
                source_url, site_id, depth,
                episode_url_prefix=_LE_VPLAY_PREFIX,
                episode_num_regex=r'(\d*?):',
                read_actors=True)

        # Movies
        if 'cs=1' in source_url:
            return _le_crawl_albums(
                source_url, site_id, depth,
                program_url_prefix='http://www.le.com/movie/',
                vids_regex='(.*?),',
                episode_num_regex='第(.*?)期',
                fixed_episode=' ')
コード例 #8
0
def parser(url_info):
    """Process one list page: queue and store every program linked from it.

    For each ``<li>`` entry of the list block the linked URL is queued at
    ``depth + 1``, its detail page is fetched, and -- when the detail page
    exposes a ``videoUrl`` -- a program plus a single episode are stored.
    Finally the list page itself is marked as done.

    Args:
        url_info: dict with at least ``_id``, ``url``, ``depth`` and
            ``site_id``; ``_id`` is converted to ``str`` in place.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    def first(matches):
        """Return the first match, or '' when there is none."""
        if not matches:
            return ''
        return matches[0] or ''

    html, _ = tools.get_html_by_requests(source_url)
    if not html:
        # Same failure handling as the other parsers in this file: flag the
        # URL instead of silently marking an unfetched page as done.
        base_parser.update_url('PROGRAM_urls', source_url, Constance.EXCEPTION)
        return

    regexs = '<ul node-type="listWrap" class="lastUl">(.*?)<div class="bottom_more">'
    list_blocks = tools.get_info(html, regexs)
    html_lis = tools.get_info(list_blocks, '<li>(.*?)</li>')

    for html_li in html_lis:
        program_url = first(tools.get_info(html_li, '<a href="(.*?)">'))
        if not program_url:
            # Without a link there is no detail page to fetch and nothing
            # to store for this entry.
            continue
        base_parser.add_url('PROGRAM_urls', site_id, program_url, depth + 1)

        # Title
        program_name = first(
            tools.get_info(html_li, '<a href=".*?">(.*?)</a>'))

        # Cover image
        image_url = first(tools.get_info(html_li, 'data-src="(.*?)"'))

        episode = 1

        # get_html_by_requests returns an (html, response) pair; only the
        # markup may be handed to the regex helper.
        everyone_html, _ = tools.get_html_by_requests(program_url)
        everyone_html = everyone_html or ''

        regexs_directors = '<span class="editor" style="color:#333;">(.*?)</span>'
        directors = first(tools.get_info(everyone_html, regexs_directors))

        # Summary
        regexs_summary = '<p class="summaryList_long">(.*?)</p>'
        summary = first(tools.get_info(everyone_html, regexs_summary))

        # Release time
        regexs_release_time = ' <dt>发布时间:</dt>.*?<dd>(.*?)</dd>'
        release_time = first(
            tools.get_info(everyone_html, regexs_release_time))

        # Download URL
        regexs_download_url = 'videoUrl=(.*?)"'
        download_url = first(
            tools.get_info(everyone_html, regexs_download_url))

        # NOTE(review): 102 presumably means "not downloaded" -- confirm
        # against the download status constants used by base_parser.
        download_status = 102
        time_length = ''

        log.debug('''
                depth                       = %s
                program_name                = %s
                program_url                 = %s
                image_url                   = %s
                episode                     = %s
                directors                   = %s
                summary                     = %s
                release_time                = %s
             ''' % (depth, program_name, program_url, image_url, episode,
                    directors, summary, release_time))

        if not download_url:
            continue

        program_id = base_parser.add_program_info(
            'PROGRAM_info',
            site_id,
            program_name,
            program_url,
            image_url=image_url,
            episode=episode,
            directors=directors,
            actors='',
            summary=summary,
            release_time=release_time)

        # The video file itself is not downloaded here, so no storage path
        # is recorded.
        sto_path = ''

        base_parser.add_program_episode_info('PROGRAM_EPISODE_info',
                                             site_id, program_id, episode,
                                             time_length, program_name,
                                             download_status, download_url,
                                             program_url, summary,
                                             image_url, sto_path)

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
コード例 #9
0
def zhuanji_parser(url, remark):
    """Parse an album page and queue the episode links found on it.

    The page at ``url`` is fetched and its title is used as the program
    name; when no name can be extracted the URL is marked as
    ``Constance.EXCEPTION`` and nothing else happens.  Otherwise episode
    links are collected (first from a listing endpoint on
    cache.video.qiyi.com, then from a series of page containers), the
    program is stored through ``base_parser.add_program_info`` and every
    link is added to ``PROGRAM_urls`` at depth 1 with the returned program
    id as its remark.  Finally ``url`` is marked as ``Constance.DONE``.

    Args:
        url: Address of the album page.
        remark: Passed through unchanged as the program's cover image URL.
    """
    cover_url = remark
    html, _ = tools.get_html_by_requests(url)

    program_name = tools.get_info(
        html, ['<title>(.+?)-.+?</title>'], fetch_one=True)
    if not program_name:
        base_parser.update_url('PROGRAM_urls', url, Constance.EXCEPTION)
        return

    # Choose the listing endpoint: the "bodansub" id is preferred, the
    # page's sourceId is the fallback.  The fallback uses a looser link
    # pattern, exactly as the two endpoints were handled before.
    album_id = tools.get_info(
        html, ['data-bodansubid="(.+?)"'], fetch_one=True)
    if album_id:
        listing_url = 'http://cache.video.qiyi.com/jp/plst/%s/' % album_id
        link_pattern = 'http://www.iqiyi.com/v_.+?.html'
    else:
        album_id = tools.get_info(html, ['sourceId: (.+?),'], fetch_one=True)
        listing_url = 'http://cache.video.qiyi.com/jp/sdvlst/6/%s/' % album_id
        link_pattern = 'http://www.iqiyi.com/.+?.html'

    listing_text, _ = tools.get_html_by_requests(listing_url)
    video_list = tools.get_info(listing_text, [link_pattern])
    if album_id == '0':
        # An id of '0' is treated as "no listing": discard whatever matched.
        video_list = []

    # Page containers tried in order until one of them yields links.
    fallback_containers = (
        ('div', {'class': 'wrapper-piclist'}),
        ('div', {'class': 'piclist-wrapper'}),
        ('ul', {'class': 'juji-list'}),
        ('div', {'class': 'videoList'}),
    )
    for tag_name, attrs in fallback_containers:
        if video_list:
            break
        container = tools.get_tag(html, tag_name, attrs, find_all=False)
        links = tools.get_info(str(container),
                               ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(links))  # de-duplicate; order is not kept

    # Summary: the "show more" span first, then the expandable info block.
    intro_node = tools.get_tag(
        html, 'span', {'class': 'showMoreText'}, find_all=False)
    summary = (intro_node.get_text().replace('简介:', '')
               if intro_node else intro_node)
    if not summary:
        more_node = tools.get_tag(
            html, 'div', {'data-moreorless': 'moreinfo'}, find_all=False)
        summary = more_node.get_text() if more_node else more_node

    log.debug('''
                          封面地址:          %s
                          专辑地址:          %s
                          简介:              %s
                          节目名称:          %s
                          视频列表:          %s
                          ''' % (cover_url, url, summary, program_name,
                                 video_list))

    program_id = base_parser.add_program_info(
        'PROGRAM_info',
        SITE_ID,
        image_url=cover_url,
        program_url=url,
        summary=summary,
        program_name=program_name)

    # Queue each episode page, carrying the program id along as the remark.
    for episode_url in video_list:
        base_parser.add_url(
            'PROGRAM_urls', SITE_ID, episode_url, depth=1, remark=program_id)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)