def parser_program(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # Fetch and parse the list page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    regex = '<li class="v-item-v5.*?">(.*?)</li>'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="u-video" href="(.*?)"'
        program_url = tools.get_info(video_block, regex, fetch_one=True)
        program_id = program_url[program_url.find('b/') + 2:program_url.rfind('/')]
        program_url = 'http://www.mgtv.com/h/%s.html' % program_id

        regex = '<img class="u-image" src="(.*?)"'
        image_url = tools.get_info(video_block, regex, fetch_one=True)

        regex = 'em class="u-time">(.*?)</em>'
        episode = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<a class="u-title".*?>(.*?)</a>'
        title = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<span class="u-desc">(.*?)</span>'
        actors_block = tools.get_info(video_block, regex, fetch_one=True)
        regex = '<a .*?>(.*?)</a>'
        actors = tools.get_info(actors_block, regex)
        actors = '/'.join(actors) if actors else '暂无'

        detail_html, r = tools.get_html_by_requests(program_url)
        regex = '<em class="label">简介.*?<span>(.*?)</span>'
        summary = tools.get_info(detail_html, regex, fetch_one=True) if detail_html else ''

        log.debug('''
            program_url  %s
            image_url    %s
            episode      %s
            title        %s
            actors       %s
            summary      %s
            ''' % (program_url, image_url, episode, title, actors, summary))

        program_mongo_id = base_parser.add_program_info(
            'PROGRAM_info', site_id, title, program_url, image_url, episode,
            directors='', actors=actors, summary=summary, release_time='')

        # Episode-list api url; without a month parameter it defaults to the latest month
        episode_detail_url = 'http://pcweb.api.mgtv.com/variety/showlist?collection_id=' + program_id
        base_parser.add_url('PROGRAM_urls', SITE_ID, episode_detail_url, depth=1,
                            remark={'program_mongo_id': program_mongo_id, 'program_id': program_id})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
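# A minimal sketch of what the find/rfind slice above extracts, using a
# hypothetical mgtv href of the same shape (the real attribute value may differ):
program_url = 'http://www.mgtv.com/b/293186/3896502.html'  # hypothetical href
program_id = program_url[program_url.find('b/') + 2:program_url.rfind('/')]
assert program_id == '293186'  # the collection id between 'b/' and the last '/'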
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url, code='GBK')

    # The episode-list jsonp url is embedded in the page as: var url = "..."
    episode_list_regex = 'var url = "(.*?)"'
    episode_list_json = tools.get_info(html, episode_list_regex)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json_url = episode_list_json + '&cb=jsonp' + str(int(time.time()))
    episode_list_json_url = episode_list_json_url.replace("\\", "")

    # Program name (taken from the keywords meta tag)
    regexs_program_name = '<meta name="keywords" content="(.*?)" />'
    program_name = tools.get_info(html, regexs_program_name)
    program_name = program_name and program_name[0] or ''
    program_url = source_url

    # Strip the jsonp wrapper and parse the episode list
    episode_list_json_html, r = tools.get_html_by_requests(episode_list_json_url)
    regexs = r'jsonp\d*?\((.*)\)'
    episode_list_json = tools.get_info(episode_list_json_html, regexs)
    episode_list_json = episode_list_json and episode_list_json[0] or ''
    episode_list_json = tools.dumps_json(episode_list_json)
    episode_list_json_value_list = tools.get_json_value(episode_list_json, 'data.list')
    episode = len(episode_list_json_value_list)
    summary = ''

    log.debug('''
        depth        = %s
        program_name = %s
        program_url  = %s
        episode      = %s
        summary      = %s
        ''' % (depth, program_name, program_url, episode, summary))

    program_id = base_parser.add_program_info(
        'PROGRAM_info', site_id, program_name, program_url, image_url='',
        episode=episode, directors='', actors='', summary=summary, release_time='')

    for episode_info in episode_list_json_value_list:
        episode_name = tools.get_json_value(episode_info, 'title')
        episode_image_url = tools.get_json_value(episode_info, 'picurl')
        episode_url = tools.get_json_value(episode_info, 'podurl')
        episode_summary = tools.get_json_value(episode_info, 'desc')

        # Episode number, e.g. "第12期", parsed out of the title
        episode_num = tools.get_json_value(episode_info, 'title')
        episode_num_regex = r'第(\d*?)期'
        episode_num = tools.get_info(episode_num, episode_num_regex)
        episode_num = episode_num and episode_num[0] or ''
        if episode_num:
            episode_num = '第' + episode_num + '期'

        download_url_json_str = tools.get_json_value(episode_info, 'vid')
        download_url_json_url = 'http://v.ku6.com/fetchVideo4Player/' + download_url_json_str + '.html'
        download_url_json = tools.get_json_by_requests(download_url_json_url)
        download_url = tools.get_json_value(download_url_json, 'data.f')

        download_status = 102
        time_length = ''
        if download_url:
            # sto_path = '/video/' + program_name + '.mp4'
            # is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            # download_status = 101 if is_download else 102
            log.debug('''
                depth             = %s
                episode_num       = %s
                time_length       = %s
                episode_name      = %s
                episode_url       = %s
                download_url      = %s
                episode_summary   = %s
                episode_image_url = %s
                ''' % (depth + 1, episode_num, time_length, episode_name, episode_url,
                       download_url, episode_summary, episode_image_url))

            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num, time_length,
                episode_name, download_status, download_url, episode_url,
                episode_summary, episode_image_url, '')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
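# A standalone sketch of the jsonp unwrapping done above, assuming the same
# jsonp<timestamp>(...) wrapper shape the parser's regex expects:
import re

def strip_jsonp(text):
    # Pull the JSON payload out of a jsonp123(...) wrapper, mirroring the
    # regex used in the parser above.
    m = re.search(r'jsonp\d*?\((.*)\)', text, re.S)
    return m.group(1) if m else ''

# hypothetical wrapped response
assert strip_jsonp('jsonp1495503610({"data": {"list": []}})') == '{"data": {"list": []}}'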
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, r = tools.get_html_by_requests(source_url)
    regexs = '<ul class="st-list cfix">(.*?)<div class="ssPages area">'
    lis = tools.get_info(html, regexs)
    regexs = '<li>(.*?)</li>'
    html_lis = tools.get_info(lis, regexs)
    for html_li in html_lis:
        # Per-program url
        url_regex = '<a href="(.*?)"'
        url = tools.get_info(html_li, url_regex)
        url = url and url[0] or ''
        url = "http:" + url
        everyone_html, r = tools.get_html_by_requests(url)

        # Program name
        regexs_program_name = '<h3 class="lh-tit">.*?<a.*?>(.*?)</a>'
        program_name = tools.get_info(html_li, regexs_program_name)
        program_name = program_name and program_name[0] or ''

        # Program url
        program_url = url

        # Release time
        release_time_regex = '发布时间:(.*?)</p>'
        release_time = tools.get_info(html_li, release_time_regex)
        release_time = release_time and release_time[0] or ''

        # Summary
        regexs_summary = '<span class="full_intro" style="display: none">(.*?)</span>'
        summary = tools.get_info(everyone_html, regexs_summary)
        summary = summary and summary[0] or ''
        summary = tools.del_html_tag(summary)

        # Cover image
        img_url_regex = '<img.*?src="(.*?)".*?>'
        image_url = tools.get_info(html_li, img_url_regex)
        image_url = image_url and image_url[0] or ''
        image_url = "http:" + image_url

        program_id = base_parser.add_program_info(
            'PROGRAM_info', site_id, program_name, program_url, image_url=image_url,
            episode='', directors='', actors='', summary=summary,
            release_time=release_time)

        # The episode-list json needs the playlistId and variety_year url parameters
        everyone_html, r = tools.get_html_by_requests(program_url)
        playlistId_regex = r'var playlistId="(\d*?)";'
        playlistId = tools.get_info(everyone_html, playlistId_regex)
        playlistId = ''.join(playlistId)

        variety_years_regex = '<li class="v-year">(.*?)</li>'
        variety_years_html = tools.get_info(everyone_html, variety_years_regex)
        variety_years_regex = r'<em>(\d*?)</em>'
        variety_years = tools.get_info(variety_years_html, variety_years_regex)

        if playlistId and variety_years:
            for variety_year in variety_years:
                episode_json_url = ('http://tv.sohu.com/item/VideoServlet?callback=&source=sohu&id='
                                    + playlistId + '&year=' + variety_year + '&month=0&page=1')
                episode_json = tools.get_json_by_requests(episode_json_url)
                episode_json_infos = tools.get_json_value(episode_json, 'videos')
                for episode_json_info in episode_json_infos:
                    # Episode summary / name / url
                    episode_summary = tools.get_json_value(episode_json_info, 'videoDesc')
                    episode_name = tools.get_json_value(episode_json_info, 'title')
                    episode_url = tools.get_json_value(episode_json_info, 'url')
                    # Download address
                    episode_download_url = you_get.get_video_url(episode_url)
                    if episode_download_url:
                        episode_download_url = '^_^'.join(episode_download_url)
                    # Episode image and number
                    episode_image_url = tools.get_json_value(episode_json_info, 'pic10')
                    episode_num = tools.get_json_value(episode_json_info, 'showDate')

                    download_status = 102
                    time_length = ''
                    if episode_download_url:
                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, sto_path='')

        if playlistId and not variety_years:
            regexs = '<!-- start : juqing title -->(.*?)<!-- end : plot content -->'
            episode_infos = tools.get_info(everyone_html, regexs)
            for episode_info in episode_infos:
                # Episode name
                regex = '<h4><.*?>(.*?)<span></span></a></h4>'
                episode_name = tools.get_info(episode_info, regex)
                episode_name = episode_name and episode_name[0] or ''
                # Summary
                regex = '<p class="intro synopsis text">(.*?)</p>'
                episode_summary = tools.get_info(episode_info, regex)
                episode_summary = episode_summary and episode_summary[0] or ''
                episode_summary = tools.del_html_tag(episode_summary)
                # Episode image
                regex = '<img src="(.*?)" width=".*?" height=".*?"'
                episode_image_url = tools.get_info(episode_info, regex)
                episode_image_url = episode_image_url and episode_image_url[0] or ''
                episode_image_url = "http:" + episode_image_url
                # Episode number (note: this pattern is hard-coded to the show 画心师 第一季)
                regex = '<h4><a href=.*?>画心师 第一季(.*?)<span></span></a></h4>'
                episode_num = tools.get_info(episode_info, regex)
                episode_num = episode_num and episode_num[0] or ''
                # Episode url
                regex = '<h4><a href="(.*?)" title=".*?" target="_blank">'
                episode_url = tools.get_info(episode_info, regex)
                episode_url = episode_url and episode_url[0] or ''
                episode_url = "http:" + episode_url
                # Download address
                episode_download_url = you_get.get_video_url(episode_url)
                if episode_download_url:
                    episode_download_url = '^_^'.join(episode_download_url)

                download_status = 102
                time_length = ''
                if episode_download_url:
                    log.debug('''
                        depth                = %s
                        episode_num          = %s
                        time_length          = %s
                        episode_name         = %s
                        episode_url          = %s
                        episode_download_url = %s
                        episode_summary      = %s
                        episode_image_url    = %s
                        ''' % (depth + 1, episode_num, time_length, episode_name,
                               episode_url, episode_download_url, episode_summary,
                               episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, sto_path='')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def r1(pattern, text):
        m = re.search(pattern, text)
        if m:
            return m.group(1)

    # Program metadata, hard-coded for this one fun.tv program
    program_name = '风行星风范'
    actors = '姜武,秦海璐,黄海波,柳岩'
    release_time = '2011年07月23日'
    directors = ''
    program_url = 'http://www.fun.tv/vplay/g-98097/'
    summary = ''
    image_url = 'http://img3.funshion.com/sdw?oid=cc09e4ab792d4008d86efcbbbf4c55dc&w=200&h=280'
    media_id = '98097'  # renamed from `id` to avoid shadowing the builtin

    json_episode_info = tools.get_json_by_requests(
        'http://pm.funshion.com/v5/media/episode?id=' + media_id + '&cl=aphone&uc=5')
    episode = len(json_episode_info)

    log.debug('''
        depth        = %s
        program_name = %s
        program_url  = %s
        episode      = %s
        summary      = %s
        image_url    = %s
        ''' % (depth, program_name, program_url, episode, summary, image_url))

    program_id = base_parser.add_program_info(
        'PROGRAM_info', site_id, program_name, program_url, image_url=image_url,
        episode=episode, directors='', actors=actors, summary='',
        release_time=release_time)

    if re.match(r'http://www.fun.tv/vplay/.*g-(\w+)', source_url):
        media_id = r1(r'http://www.fun.tv/vplay/.*g-(\d+)', source_url)
        json_info = tools.get_json_by_requests(
            'http://pm.funshion.com/v5/media/episode?id=' + media_id + '&cl=aphone&uc=5')
        json_episodes_info = tools.get_json_value(json_info, 'episodes')
        for json_episode_info in json_episodes_info:
            vid = tools.get_json_value(json_episode_info, 'id')
            episode_name = tools.get_json_value(json_episode_info, 'name')
            image_episode_info = tools.get_json_value(json_episode_info, 'still')
            episode_url = tools.get_json_value(json_episode_info, 'num')
            # share url; uses the extracted media_id (98097 was hard-coded here)
            episode_url = 'http://pm.funshion.com/v5/media/share?id=' + media_id + '&num=' + episode_url
            episode_num = tools.get_json_value(json_episode_info, 'num')
            # Total episode count
            episode = len(json_episode_info)

            time_length = ''
            episode_summary = ''
            download_status = ''
            download_url = ''
            log.debug('''
                depth              = %s
                episode_num        = %s
                time_length        = %s
                episode_name       = %s
                episode_url        = %s
                download_url       = %s
                episode_summary    = %s
                image_episode_info = %s
                ''' % (depth, episode_num, time_length, episode_name, episode_url,
                       download_url, episode_summary, image_episode_info))

            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode_num, time_length,
                episode_name, download_status, download_url, episode_url,
                episode_summary, image_episode_info, '')

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
def parser_program_info(url_info):
    log.debug('Processing\n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    program_id = remark['program_id']
    classify = remark['classify']

    # Fetch and parse the program page
    html, request = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('PROGRAM_urls', root_url, Constance.EXCEPTION)
        return

    # Program name
    regex = '<h1 class="video_title_cn" >.*?>(.*?)<'
    program_name = tools.get_info(html, regex, fetch_one=True)

    # Program url
    program_url = root_url

    # Cover image
    regex = '<img src="(.*?)".*?_stat="info:poster"/>'
    image_url = tools.get_info(html, regex, fetch_one=True)
    image_url = tools.get_full_url('http://', image_url)

    # Episode count
    regex = ['更新期数:</span>.*?>(.*?)</span>', '总集数:</span>.*?>(.*?)</span>']
    episode = tools.get_info(html, regex, fetch_one=True)

    # Directors
    regex = '<span class="director">导演</span>.*?info:actor_name">(.*?)<'
    directors = tools.get_info(html, regex, split='/')

    # Actors (the pattern also matches directors, so strip them back out)
    regex = '_stat="info:actor_name">(.*?)</span>'
    actors = tools.get_info(html, regex, split='/')
    actors = actors.replace(directors + '/', '') if directors else actors

    # Release time
    regex = ['首播时间:</span>.*?>(.*?)<', '出品时间:</span>.*?>(.*?)<']
    release_time = tools.get_info(html, regex, fetch_one=True)

    # Summary
    regex = 'desc_txt">(.*?)</span>'
    summary = tools.get_info(html, regex, fetch_one=True)
    summary = tools.del_html_tag(summary)

    log.debug('''
        program_name  %s
        program_url   %s
        image_url     %s
        episode       %s
        directors     %s
        actors        %s
        release_time  %s
        summary       %s
        ''' % (program_name, program_url, image_url, episode, directors, actors,
               release_time, summary))

    program_mongo_id = base_parser.add_program_info(
        'PROGRAM_info', site_id, program_name, program_url, image_url, episode,
        directors, actors=actors, summary=summary, release_time=release_time)

    # Work out where the per-episode info lives.
    # Variety shows: episode lists are served as json, one url per year/month
    if classify == '综艺':
        # Months
        regex = r'_stat="series:tab".*?>(\d*)月'
        months = tools.get_info(html, regex)
        # Years
        regex = '<div class="year_slt_list">.*?data-value="(.*?)"'
        years = tools.get_info(html, regex)
        for year in years:
            for month in months:
                episode_url = ('http://s.video.qq.com/get_playsource?id=%s&plat=2&type=4'
                               '&data_type=3&video_type=10&year=%s&month=%s&plname=qq&otype=json'
                               % (program_id, year, month))
                log.debug('%s episode json url: %s' % (program_name, episode_url))
                base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                    remark={'program_mongo_id': program_mongo_id,
                                            'classify': '综艺'})

    # TV series: the page itself lists (episode url, episode number) pairs
    elif classify == '电视剧':
        regex = 'series:numbtn".*?<a href="(.*?)".*?<span itemprop="episodeNumber">(.*?)</span>'
        episode_msgs = tools.get_info(html, regex)
        for episode_msg in episode_msgs:
            episode_url = episode_msg[0]
            episode_num = episode_msg[1]
            log.debug('''
                episode_url  %s
                episode_num  %s
                ''' % (episode_url, episode_num))
            base_parser.add_url('PROGRAM_urls', site_id, episode_url, depth=2,
                                remark={'program_mongo_id': program_mongo_id,
                                        'episode_num': episode_num,
                                        'program_name': program_name,
                                        'classify': '电视剧'})

    base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
def parser(url_info):
    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    # root_url = 'http://list.youku.com/show/id_ze7cc3b8ed96711e68ce4.html'
    # depth = 0
    # headers = {'Host': 'cmstool.youku.com',
    #            'Referer': 'http://v.youku.com/v_show/id_XMjY2NzY3MTE4NA.html',
    #            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    #            'Cookie': '...'}

    if depth == 0:
        html = tools.get_html_by_urllib(root_url)

        header_info = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        program_name = header_info.a['title']
        recent_video_url = header_info.a['href']
        recent_video_url = 'http:' + recent_video_url
        recent_video_id = tools.get_info(recent_video_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(recent_video_url, [r'id_(.+?)\.h'], fetch_one=True)

        actors = tools.get_tag(html, 'li', {'class': 'p-row'})[2].get_text()
        actors = ''.join(tools.re.compile('主持人:(.+)').findall(actors))

        summary = tools.get_tag(html, 'span', {'class': 'text'}, find_all=False).get_text()
        summary = ''.join(tools.re.compile('简介:(.+)').findall(summary))

        image_url = tools.get_tag(html, 'div', {'class': 'p-thumb'}, find_all=False)
        image_url = image_url.img['src']

        list_url = ('https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip='
                    '&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id)
        list_json = tools.get_json_by_requests(list_url)
        video_list = tools.get_json_value(list_json, 'data.videos.list')
        episode = tools.get_json_value(list_json, 'data.show.episode_total')

        log.debug('''
            recent_video_url: %s
            recent_video_id:  %s
            episode count:    %s
            hosts:            %s
            cover url:        %s
            album url:        %s
            summary:          %s
            program name:     %s
            video list:       %s
            list_url:         %s
            ''' % (recent_video_url, recent_video_id, episode, actors, image_url,
                   root_url, summary, program_name, video_list, list_url))

        program_id = base_parser.add_program_info(
            'PROGRAM_info', SITE_ID, actors=actors, image_url=image_url,
            program_url=root_url, summary=summary, program_name=program_name,
            episode=episode)

        for vl in video_list:
            vl_id = tools.get_json_value(vl, 'encodevid')
            vl_url = 'http://v.youku.com/v_show/id_%s.html' % vl_id
            base_parser.add_url('PROGRAM_urls', SITE_ID, vl_url, depth=1, remark=program_id)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)

    elif depth == 1:
        program_id = remark
        html, res = tools.get_html_by_requests(root_url)

        episode_name = tools.get_tag(html, 'h1', find_all=False)
        episode_name = episode_name.get_text()

        videoId = tools.get_info(html, ['videoId:"(.+?)"'], fetch_one=True)
        play_count, res = tools.get_html_by_requests(
            'http://v.youku.com/action/getVideoPlayInfo?vid=%s&callback=tuijsonp5' % videoId)
        if not play_count:
            log.debug('empty play-count response for %s' % root_url)
        play_count = tools.get_info(play_count, ['"vv":"(.+?)"'], fetch_one=True)
        play_count = play_count.replace(',', '')

        # info_html, info_res = tools.get_html_by_requests(
        #     'http://cmstool.youku.com/cms/playlog/get?callback=tuijsonp7', headers)
        # image_url = tools.get_info(info_html, ['"thumburl":"(.+?)",'], fetch_one=True)
        # image_url = image_url.replace('\\', '')
        # episode_num = tools.get_info(info_html, ['"watchStage":"(.+?)",'], fetch_one=True)
        # episode_num = tools.to_chinese(episode_num)

        recent_video_id = tools.get_info(root_url, ['id_(.+?)='], fetch_one=True)
        if not recent_video_id:
            recent_video_id = tools.get_info(root_url, [r'id_(.+?)\.h'], fetch_one=True)

        list_url = ('https://ups.youku.com/ups/get.json?vid=%s==&ccode=0401&client_ip='
                    '&utid=Y5NrEThaR2MCAdOcjEogCug8&client_ts=' % recent_video_id)
        list_info = tools.get_json_by_requests(list_url)

        stream = tools.get_json_value(list_info, "data.stream")
        layer = 0  # assumption: take the first stream entry; `layer` was left undefined here
        download_url = stream[layer]['m3u8_url']
        time_length = tools.get_json_value(list_info, "data.video.seconds")
        episode_num = tools.get_json_value(list_info, "data.show.stage")
        image_url = tools.get_json_value(list_info, "data.video.logo")

        segs = stream[layer]['segs']
        cdn_url = []
        for video_url in segs:
            cdn_url.append(video_url['cdn_url'])

        log.debug('''
            program id:   %s
            episode num:  %s
            time length:  %s
            play count:   %s
            episode name: %s
            download url: %s
            episode url:  %s
            image url:    %s
            ''' % (program_id, episode_num, time_length, play_count, episode_name,
                   download_url, root_url, image_url))

        base_parser.add_program_episode_info(
            'PROGRAM_EPISODE_info', SITE_ID, program_id=program_id,
            episode_num=episode_num, time_length=time_length,
            episode_name=episode_name, download_url=download_url,
            episode_url=root_url, image_url=image_url, play_count=play_count)

        base_parser.update_url('PROGRAM_urls', root_url, Constance.DONE)
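# `layer` above simply indexes into data.stream. A sketch of a more explicit
# picker, assuming each stream entry carries a 'stream_type' field (the field
# name is an assumption, not confirmed from the api):
def pick_stream(stream_list, preferred=('mp4hd2', 'mp4hd', 'flvhd')):
    # Return the first stream whose (assumed) 'stream_type' matches the
    # preference order; fall back to the first entry, as layer = 0 does above.
    for quality in preferred:
        for s in stream_list:
            if s.get('stream_type') == quality:
                return s
    return stream_list[0] if stream_list else None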
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    def get_tkey(t):
        # tkey = 32-bit rotate-right of the timestamp by (key % 17) bits, xor'd with the key
        def ror(val, key):
            i = 0
            while i < key:
                val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
                i += 1
            return val

        key = 185025305
        val = ror(t, key % 17)
        val = val ^ key
        return val

    def getdownload(episode_download_url_json):
        # Resolve the actual download location from the playJson api response
        episode_json = tools.get_json_by_requests(episode_download_url_json)
        episode_download_url = tools.get_json_value(episode_json, 'msgs.playurl.domain')
        episode_download_url = episode_download_url and episode_download_url[0] or ''

        episode_download_url_definition = tools.get_json_value(
            episode_json, 'msgs.playurl.dispatch.1080p')
        episode_download_url_definition = (episode_download_url_definition and
                                           episode_download_url_definition[0] or '')

        episode_download_url = episode_download_url + episode_download_url_definition
        episode_download_url += ('&ctv=pc&m3v=1&termid=1&format=1&hwtype=un&ostype=Linux'
                                 '&tag=letv&sign=letv&expect=3&tn={}&pay=0&iscpn=f9051'
                                 '&rateid={}').format(random.random(), '1080p')
        episode_download_url_json = tools.get_json_by_requests(episode_download_url)
        episode_download_url = tools.get_json_value(episode_download_url_json, 'location')
        return episode_download_url

    # playJson api url template, shared by all branches below
    play_json_url = ('http://player-pc.le.com/mms/out/video/playJson?id={}&platid=1&splatid=101'
                     '&format=1&tkey={}&domain=www.le.com&dvtype=1000'
                     '&devid=49BDB62DC27B044CCD48E49CCF38EAAE3B095825&region=cn&source=1000&accessyx=1')

    if depth == 0:
        # Build the paged channel-data api url from the cs / _o parameters of source_url
        cs_regex = 'cs(.*?)_'
        o_regex = 'cs.*?_o(.*?)_p'
        cs = tools.get_info(source_url, cs_regex)
        cs_value = cs and cs[0] or ''
        o = tools.get_info(source_url, o_regex)
        o_value = o and o[0] or ''
        url = 'http://list.le.com/apin/chandata.json?cs=' + cs_value + '&_o=' + o_value + '&_p='
        base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)

    if depth == 1:
        page = '1'

        # TV series
        if 'cs=2' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/tv/' + program_url + '.html'
                    episode = tools.get_json_value(info, 'nowEpisodes')
                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())
                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())
                    summary = tools.get_json_value(info, 'description')
                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth = %s
                        program_name = %s
                        program_url = %s
                        image_url = %s
                        episode = %s
                        directors = %s
                        actors = %s
                        summary = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = r'(\d*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = play_json_url.format(
                            episode_url_num, get_tkey(int(time.time())))
                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')
                        # assumed: first run of digits in the title ("(\d*?)" alone matches the empty string)
                        episode_num_regex = r'(\d+)'
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''
                        episode_download_url = getdownload(episode_download_url_json)

                        time_length = ''
                        episode_summary = ''
                        download_status = ''
                        log.debug('''
                            depth = %s
                            episode_num = %s
                            time_length = %s
                            episode_name = %s
                            episode_url = %s
                            download_url = %s
                            episode_summary = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url, episode_summary,
                                   episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False

        # Sports
        if 'cs=4' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    # vid is the per-video num expected by the playJson api
                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://sports.le.com/video/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    program_url = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth = %s
                        program_name = %s
                        program_url = %s
                        image_url = %s
                        episode = %s
                        directors = %s
                        actors = %s
                        summary = %s
                        release_time = %s
                        aid = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth = %s
                        episode_num = %s
                        time_length = %s
                        episode_name = %s
                        episode_url = %s
                        download_url = %s
                        episode_summary = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url, episode_summary,
                               episode_image_url))

                    # store the episode-level fields, matching the other branches
                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                # video_list, not json_list (json_list is never set in this branch)
                if not video_list:
                    return False

        # Variety shows
        if 'cs=11' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = '第(.*?)期'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    # vid is the per-video num expected by the playJson api
                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = ''
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth = %s
                        program_name = %s
                        program_url = %s
                        image_url = %s
                        episode = %s
                        directors = %s
                        actors = %s
                        summary = %s
                        release_time = %s
                        aid = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth = %s
                        episode_num = %s
                        time_length = %s
                        episode_name = %s
                        episode_url = %s
                        download_url = %s
                        episode_summary = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url, episode_summary,
                               episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Music
        if 'cs=9' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                video_list = tools.get_json_value(json, 'video_list')
                for info in video_list:
                    episode_name = tools.get_json_value(info, 'name')
                    episode_num = tools.get_json_value(info, 'name')
                    regex_episode_num = r'(\d*?):'
                    episode_num = tools.get_info(episode_num, regex_episode_num)
                    episode_num = ''.join(episode_num)
                    episode_summary = tools.get_json_value(info, 'description')
                    episode_image_url = tools.get_json_value(info, 'images.1080*608')

                    # vid is the per-video num expected by the playJson api
                    episode_url_num = tools.get_json_value(info, 'vid')
                    episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                    episode_download_url_json = play_json_url.format(
                        episode_url_num, get_tkey(int(time.time())))
                    episode_download_url = getdownload(episode_download_url_json)

                    program_name = tools.get_json_value(info, 'albumName')
                    summary = ''
                    actors = tools.get_json_value(info, 'actor').values()
                    actors = ''.join(actors)
                    directors = ''
                    release_time = ''
                    image_url = ''
                    episode = ''
                    aid = tools.get_json_value(info, 'aid')
                    program_url = ''
                    download_status = 102
                    time_length = ''

                    log.debug('''
                        depth = %s
                        program_name = %s
                        program_url = %s
                        image_url = %s
                        episode = %s
                        directors = %s
                        actors = %s
                        summary = %s
                        release_time = %s
                        aid = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time, aid))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time,
                        _id=aid)

                    log.debug('''
                        depth = %s
                        episode_num = %s
                        time_length = %s
                        episode_name = %s
                        episode_url = %s
                        download_url = %s
                        episode_summary = %s
                        episode_image_url = %s
                        ''' % (depth, episode_num, time_length, episode_name,
                               episode_url, episode_download_url, episode_summary,
                               episode_image_url))

                    base_parser.add_program_episode_info(
                        'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                        time_length, episode_name, download_status,
                        episode_download_url, episode_url, episode_summary,
                        episode_image_url, '')

                page = str(int(page) + 1)
                if not video_list:
                    return False

        # Movies
        if 'cs=1' in source_url:
            while True:
                json = tools.get_json_by_requests(source_url + page)
                json_list = tools.get_json_value(json, 'album_list')
                for info in json_list:
                    image_url = tools.get_json_value(info, 'images.1080*608')
                    program_name = tools.get_json_value(info, 'name')
                    program_url = tools.get_json_value(info, 'aid')
                    program_url = 'http://www.le.com/movie/' + program_url + '.html'
                    episode = ' '
                    directors = tools.get_json_value(info, 'directory')
                    directors = ','.join(tools.get_json(directors).values())
                    actors = tools.get_json_value(info, 'starring')
                    actors = ' '.join(actors.values())
                    summary = tools.get_json_value(info, 'description')
                    release_time = tools.get_json_value(info, 'releaseDate')
                    release_time = int(release_time) / 1000
                    x = time.localtime(release_time)
                    release_time = time.strftime("%Y-%m-%d", x)

                    log.debug('''
                        depth = %s
                        program_name = %s
                        program_url = %s
                        image_url = %s
                        episode = %s
                        directors = %s
                        actors = %s
                        summary = %s
                        release_time = %s
                        ''' % (depth, program_name, program_url, image_url, episode,
                               directors, actors, summary, release_time))

                    program_id = base_parser.add_program_info(
                        'PROGRAM_info', site_id, program_name, program_url,
                        image_url=image_url, episode=episode, directors=directors,
                        actors=actors, summary=summary, release_time=release_time)

                    episode_url = tools.get_json_value(info, 'vids')
                    episode_url = episode_url + ','
                    regex = '(.*?),'
                    episode_urls = tools.get_info(episode_url, regex)
                    for episode_url_num in episode_urls:
                        episode_url = 'http://www.le.com/ptv/vplay/' + episode_url_num + '.html'
                        episode_download_url_json = play_json_url.format(
                            episode_url_num, get_tkey(int(time.time())))
                        episode_json = tools.get_json_by_requests(episode_download_url_json)
                        episode_image_url = tools.get_json_value(episode_json, 'msgs.playurl.pic')
                        episode_name = tools.get_json_value(episode_json, 'msgs.playurl.title')
                        episode_num_regex = '第(.*?)期'
                        episode_num = tools.get_info(episode_name, episode_num_regex)
                        episode_num = episode_num and episode_num[0] or ''
                        episode_download_url = getdownload(episode_download_url_json)

                        time_length = ''
                        episode_summary = ''
                        download_status = ''
                        log.debug('''
                            depth = %s
                            episode_num = %s
                            time_length = %s
                            episode_name = %s
                            episode_url = %s
                            download_url = %s
                            episode_summary = %s
                            episode_image_url = %s
                            ''' % (depth, episode_num, time_length, episode_name,
                                   episode_url, episode_download_url, episode_summary,
                                   episode_image_url))

                        base_parser.add_program_episode_info(
                            'PROGRAM_EPISODE_info', site_id, program_id, episode_num,
                            time_length, episode_name, download_status,
                            episode_download_url, episode_url, episode_summary,
                            episode_image_url, '')

                page = str(int(page) + 1)
                if not json_list:
                    return False
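# A self-contained check of the rotation used by get_tkey above: rotating a
# 32-bit value right 32 times is the identity, and the token is just that
# rotation (key % 17 steps) xor'd with the key. The sample timestamp is arbitrary.
def ror32(val, n):
    # rotate a 32-bit value right by n bits, one bit at a time
    for _ in range(n):
        val = (0x7fffffff & (val >> 1)) | ((val & 1) << 31)
    return val

key = 185025305
t = 1495503610                       # arbitrary sample timestamp
assert ror32(t, 32) == t             # full rotation is the identity
tkey = ror32(t, key % 17) ^ key      # same value the parser's get_tkey(t) returns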
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing\n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    regexs = '<ul node-type="listWrap" class="lastUl">(.*?)<div class="bottom_more">'
    lis = tools.get_info(html, regexs)
    regexs = '<li>(.*?)</li>'
    html_lis = tools.get_info(lis, regexs)
    for html_li in html_lis:
        url_regex = '<a href="(.*?)">'
        url = tools.get_info(html_li, url_regex)
        url = url and url[0] or ''
        if url:
            base_parser.add_url('PROGRAM_urls', site_id, url, depth + 1)

        # Program name
        regexs_program_name = '<a href=".*?">(.*?)</a>'
        program_name = tools.get_info(html_li, regexs_program_name)
        program_name = program_name and program_name[0] or ''
        program_url = url

        # Cover image
        regexs_image_url = 'data-src="(.*?)"'
        image_url = tools.get_info(html_li, regexs_image_url)
        image_url = image_url and image_url[0] or ''

        episode = 1

        # get_html_by_requests returns (html, request); unpack it
        everyone_html, r = tools.get_html_by_requests(program_url)

        regexs_directors = '<span class="editor" style="color:#333;">(.*?)</span>'
        directors = tools.get_info(everyone_html, regexs_directors)
        directors = directors and directors[0] or ''

        # Summary
        regexs_summary = '<p class="summaryList_long">(.*?)</p>'
        summary = tools.get_info(everyone_html, regexs_summary)
        summary = summary and summary[0] or ''

        # Release time
        regexs_release_time = ' <dt>发布时间:</dt>.*?<dd>(.*?)</dd>'
        release_time = tools.get_info(everyone_html, regexs_release_time)
        release_time = release_time and release_time[0] or ''

        # Download address
        regexs_download_url = 'videoUrl=(.*?)"'
        download_url = tools.get_info(everyone_html, regexs_download_url)
        download_url = download_url and download_url[0] or ''

        download_status = 102
        time_length = ''

        log.debug('''
            depth = %s
            program_name = %s
            program_url = %s
            image_url = %s
            episode = %s
            directors = %s
            summary = %s
            release_time = %s
            ''' % (depth, program_name, program_url, image_url, episode, directors,
                   summary, release_time))

        if download_url:
            program_id = base_parser.add_program_info(
                'PROGRAM_info', site_id, program_name, program_url,
                image_url=image_url, episode=episode, directors=directors,
                actors='', summary=summary, release_time=release_time)

            # sto_path = '/videos/' + program_name + '.mp4'
            # is_download = tools.download_file(download_url, FILE_LOCAL_PATH, sto_path)
            # download_status = 101 if is_download else 102
            sto_path = ''
            base_parser.add_program_episode_info(
                'PROGRAM_EPISODE_info', site_id, program_id, episode, time_length,
                program_name, download_status, download_url, program_url, summary,
                image_url, sto_path)

    # Mark source_url as done
    base_parser.update_url('PROGRAM_urls', source_url, Constance.DONE)
def zhuanji_parser(url, remark):
    image_url = remark

    html, res = tools.get_html_by_requests(url)
    program_name = tools.get_info(html, ['<title>(.+?)-.+?</title>'], fetch_one=True)
    if not program_name:
        base_parser.update_url('PROGRAM_urls', url, Constance.EXCEPTION)
        return

    album_list_id = tools.get_info(html, ['data-bodansubid="(.+?)"'], fetch_one=True)
    if album_list_id:
        video_list_url = 'http://cache.video.qiyi.com/jp/plst/%s/' % album_list_id
        list_json, res = tools.get_html_by_requests(video_list_url)
        video_list = tools.get_info(list_json, ['http://www.iqiyi.com/v_.+?.html'])
    else:
        album_list_id = tools.get_info(html, ['sourceId: (.+?),'], fetch_one=True)
        video_list_url = 'http://cache.video.qiyi.com/jp/sdvlst/6/%s/' % album_list_id
        list_json, res = tools.get_html_by_requests(video_list_url)
        video_list = tools.get_info(list_json, ['http://www.iqiyi.com/.+?.html'])

    if album_list_id == '0':
        video_list = []

    # Fall back through the known list containers until one yields links
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'wrapper-piclist'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'piclist-wrapper'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'ul', {'class': 'juji-list'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))
    if not video_list:
        video_list = tools.get_tag(html, 'div', {'class': 'videoList'}, find_all=False)
        video_list = tools.get_info(str(video_list), ['(http://www.iqiyi.com/v_.+?.html)'])
        video_list = list(set(video_list))

    summary = tools.get_tag(html, 'span', {'class': 'showMoreText'}, find_all=False)
    if summary:
        summary = summary.get_text().replace('简介:', '')
    if not summary:
        summary = tools.get_tag(html, 'div', {'data-moreorless': 'moreinfo'}, find_all=False)
        if summary:
            summary = summary.get_text()

    log.debug('''
        cover url:    %s
        album url:    %s
        summary:      %s
        program name: %s
        video list:   %s
        ''' % (image_url, url, summary, program_name, video_list))

    program_id = base_parser.add_program_info(
        'PROGRAM_info', SITE_ID, image_url=image_url, program_url=url,
        summary=summary, program_name=program_name)

    for link in video_list:
        base_parser.add_url('PROGRAM_urls', SITE_ID, link, depth=1, remark=program_id)

    base_parser.update_url('PROGRAM_urls', url, Constance.DONE)
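# The four fallback blocks above repeat the same extract-and-dedupe step.
# A sketch of how they could collapse into one loop over candidate containers,
# assuming the same tools.get_tag / tools.get_info helpers:
def find_video_list(html):
    # Try each known list container in order until one yields links,
    # mirroring the fallback chain in zhuanji_parser.
    candidates = [
        ('div', {'class': 'wrapper-piclist'}),
        ('div', {'class': 'piclist-wrapper'}),
        ('ul', {'class': 'juji-list'}),
        ('div', {'class': 'videoList'}),
    ]
    for tag, attrs in candidates:
        container = tools.get_tag(html, tag, attrs, find_all=False)
        links = tools.get_info(str(container), ['(http://www.iqiyi.com/v_.+?.html)'])
        if links:
            return list(set(links))
    return []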