def dianshiju_info(self, tv_infos, db_tv_names):
    """Fetch iQiyi TV-series metadata and upsert one TvInfo row per show.

    Args:
        tv_infos: mapping of show name to a three-item sequence
            ``[{'url': ...}, {'id': ...}, {'v_id': ...}]``.
        db_tv_names: names that already have a TvInfo row; those are
            updated, every other name is added.

    Shows whose response is rejected by ``tv_info_is_valid`` are logged
    and skipped.
    """
    url = u'http://cache.video.qiyi.com/jp/vi/{}/{}/'
    for name, tv_info in tv_infos.items():
        tv_id = tv_info[1]['id']
        vids = tv_info[2]['v_id']
        page = request(url.format(tv_id, vids))
        json_content = tv_info_is_valid(page)
        if not json_content:
            warning_message = u"《iqy {}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        fields = dict(
            name=name,
            tv_id=tv_id,
            description=json_content['info'],
            last_update_time=json_content['up'],
            all_number=json_content['es'],
            current_number=json_content['upOrder'],
            cast_member=json_content['ma'],
            platform=PLATFORM,
            label=json_content['tg'],
            # Only the first 32 characters of the strategy text are kept.
            update_info=json_content['qiyiPlayStrategy'][:32],
            detail_urls=tv_info[0]['url'],
            # Store the real v_id: it is the second URL component above,
            # and the stored vids value is what a DB-seeded crawl passes
            # back as 'v_id'.  The previous code saved tv_id here.
            vids=vids,
            type=u'电视剧',
            detail_titles=json_content['vn'],
            detail_episodes='',
        )
        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(**fields)
def start_qq(now): start = int(time.time()) print "qq开始抓取 .." qq_spi = SpiderQq() qq_db = SerializeQq(now) # spider tv_names tv_names = qq_spi.tv_names() tv_names = list(set(tv_names)) # spider tv_info qq_spi.tv_info(tv_names) # db tv_info db_tv_names = [_.name for _ in TvInfo.mget_by_platform(u'qq')] qq_db.tv_info(tv_names, db_tv_names) tv_names = tv_names + db_tv_names tv_names = list(set(tv_names)) # spider play db_tv_infos = TvInfo.mget_by_platform(u'qq') qq_spi.play_info(db_tv_infos) # db play_info qq_db.play_info(db_tv_infos) end = int(time.time()) print 'qq抓取完毕,耗时', utils.format_seconds(end - start)
def dianshiju_info(self, tv_infos, db_tv_names):
    """Fetch iQiyi TV-series metadata and upsert one TvInfo row per show.

    Args:
        tv_infos: mapping of show name to a three-item sequence
            ``[{'url': ...}, {'id': ...}, {'v_id': ...}]``.
        db_tv_names: names that already have a TvInfo row; those are
            updated, every other name is added.

    Shows whose response is rejected by ``tv_info_is_valid`` are logged
    and skipped.
    """
    url = u'http://cache.video.qiyi.com/jp/vi/{}/{}/'
    for name, tv_info in tv_infos.items():
        tv_id = tv_info[1]['id']
        vids = tv_info[2]['v_id']
        page = request(url.format(tv_id, vids))
        json_content = tv_info_is_valid(page)
        if not json_content:
            warning_message = u"《iqy {}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        fields = dict(
            name=name,
            tv_id=tv_id,
            description=json_content['info'],
            last_update_time=json_content['up'],
            all_number=json_content['es'],
            current_number=json_content['upOrder'],
            cast_member=json_content['ma'],
            platform=PLATFORM,
            label=json_content['tg'],
            # Only the first 32 characters of the strategy text are kept.
            update_info=json_content['qiyiPlayStrategy'][:32],
            detail_urls=tv_info[0]['url'],
            # Store the real v_id: it is the second URL component above,
            # and the stored vids value is what a DB-seeded crawl passes
            # back as 'v_id'.  The previous code saved tv_id here.
            vids=vids,
            type=u'电视剧',
            detail_titles=json_content['vn'],
            detail_episodes='',
        )
        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(**fields)
def zongyi_info(self, tv_infos, db_tv_names):
    """Upsert one TvInfo row per variety show from already-collected data.

    Each value of ``tv_infos`` is read positionally: ``[0]`` is a sequence
    of vid strings, ``[1]`` the current number, ``[2]`` the description
    and ``[3]`` the cast.  Names present in ``db_tv_names`` are updated;
    all others are added.  No network request is made here.
    """
    for name, tv_info in tv_infos.items():
        vid_list = tv_info[0]
        record = dict(
            name=name,
            tv_id=u'',
            description=tv_info[2],
            last_update_time=u'',
            # The episode total is simply the number of collected vids.
            all_number=len(vid_list),
            current_number=tv_info[1],
            cast_member=tv_info[3],
            platform=PLATFORM,
            label=u'',
            update_info=u'',
            detail_urls=u'',
            vids=",".join(vid_list),
            type=u'综艺',
            detail_titles=u'',
            detail_episodes=u'',
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)
def start_let(now): start = int(time.time()) print "let开始抓取 .." let_spi = SpiderLet() let_db = SerializeLet(now) # dianshiju dianshiju_urls_map = let_spi.dianshiju_urls_map() tv_infos = TvInfo.mget_by_platform(u'let') db_tv_names = [_.name for _ in tv_infos] db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'let', utils.format_time(time.time(), "%Y-%m-%d")) for tv_info in tv_infos: if not dianshiju_urls_map.get(tv_info.name) and tv_info.type == u'电视剧': # noqa dianshiju_urls_map[tv_info.name] = [tv_info.detail_urls, tv_info.tv_id, tv_info.cast_member, tv_info.label] # noqa let_db.dianshiju(dianshiju_urls_map, db_tv_names, db_play_info_map) # zongyi zongyi_urls_map = let_spi.zongyi_urls_map() for tv_info in tv_infos: if not zongyi_urls_map.get(tv_info.name) and tv_info.type == u'综艺': zongyi_urls_map[tv_info.name] = [tv_info.detail_urls, tv_info.label] # noqa let_db.zongyi(zongyi_urls_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'let抓取完毕,耗时', utils.format_seconds(end - start)
def start_let(now): start = int(time.time()) print "let开始抓取 .." let_spi = SpiderLet() let_db = SerializeLet(now) # dianshiju dianshiju_urls_map = let_spi.dianshiju_urls_map() tv_infos = TvInfo.mget_by_platform(u'let') db_tv_names = [_.name for _ in tv_infos] db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'let', utils.format_time(time.time(), "%Y-%m-%d")) for tv_info in tv_infos: if not dianshiju_urls_map.get( tv_info.name) and tv_info.type == u'电视剧': # noqa dianshiju_urls_map[tv_info.name] = [ tv_info.detail_urls, tv_info.tv_id, tv_info.cast_member, tv_info.label ] # noqa let_db.dianshiju(dianshiju_urls_map, db_tv_names, db_play_info_map) # zongyi zongyi_urls_map = let_spi.zongyi_urls_map() for tv_info in tv_infos: if not zongyi_urls_map.get(tv_info.name) and tv_info.type == u'综艺': zongyi_urls_map[tv_info.name] = [ tv_info.detail_urls, tv_info.label ] # noqa let_db.zongyi(zongyi_urls_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'let抓取完毕,耗时', utils.format_seconds(end - start)
def zongyi_info(self, tv_infos, db_tv_names):
    """Upsert one TvInfo row per variety show from already-collected data.

    Each value of ``tv_infos`` is read positionally: ``[0]`` is a sequence
    of vid strings, ``[1]`` the current number, ``[2]`` the description
    and ``[3]`` the cast.  Names present in ``db_tv_names`` are updated;
    all others are added.  No network request is made here.
    """
    for name, tv_info in tv_infos.items():
        vid_list = tv_info[0]
        record = dict(
            name=name,
            tv_id=u'',
            description=tv_info[2],
            last_update_time=u'',
            # The episode total is simply the number of collected vids.
            all_number=len(vid_list),
            current_number=tv_info[1],
            cast_member=tv_info[3],
            platform=PLATFORM,
            label=u'',
            update_info=u'',
            detail_urls=u'',
            vids=",".join(vid_list),
            type=u'综艺',
            detail_titles=u'',
            detail_episodes=u'',
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)
def start_iqy(now): start = int(time.time()) print "iqy开始抓取 .." iqy_spi = SpiderIqy() iqy_db = SerializeIqy(now) # dianshiju dianshiju_infos = iqy_spi.dianshiju_infos() tv_infos = TvInfo.mget_by_platform_and_type(u'iqy', u'电视剧') db_tv_names = [_.name for _ in tv_infos] for tv_info in tv_infos: if not dianshiju_infos.get(tv_info.name) and tv_info.type == u'电视剧': dianshiju_infos[tv_info.name] = \ [ {'url': [tv_info.detail_urls]}, {'id': tv_info.tv_id}, {'v_id': tv_info.vids}] iqy_db.dianshiju_info(dianshiju_infos, db_tv_names) # zongyi zongyi_infos = iqy_spi.zongyi_infos() tv_infos = TvInfo.mget_by_platform_and_type(u'iqy', u'综艺') db_tv_names = [_.name for _ in tv_infos] for tv_info in tv_infos: if not zongyi_infos.get(tv_info.name) and tv_info.type == u'综艺': zongyi_infos[tv_info.name] = \ [ tv_info.vids.split(","), tv_info.current_number, tv_info.description, tv_info.cast_member] iqy_db.zongyi_info(zongyi_infos, db_tv_names) # play_info db_tv_infos = TvInfo.mget_by_platform(u'iqy') db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'iqy', utils.format_time(time.time(), "%Y-%m-%d")) iqy_db.play_info(db_play_info_map, db_tv_infos) end = int(time.time()) print 'iqy抓取完毕,耗时', utils.format_seconds(end - start)
def start_sh(now): start = int(time.time()) print "sh开始抓取 .." sh_spi = SpiderSh() sh_db = SerializeSh(now) # db pids_map = sh_spi.pids_map() tv_infos = TvInfo.mget_by_platform(u'sh') db_tv_names = [_.name for _ in tv_infos] for tv_info in tv_infos: if not pids_map.get(tv_info.name): pids_map[tv_info.name] = tv_info.tv_id db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'sh', utils.format_time(time.time(), "%Y-%m-%d")) sh_db.info_and_play(pids_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'sh抓取完毕,耗时', utils.format_seconds(end - start)
def start_mg(now): start = int(time.time()) print "mg开始抓取 .." mg_spi = SpiderMg() mg_db = SerializeMg(now) # db pids_map = mg_spi.pids_map() tv_infos = TvInfo.mget_by_platform(u'mg') db_tv_names = [_.name for _ in tv_infos] reverse = {v: k for k, v in TV_TYPE_MAP.iteritems()} for tv_info in tv_infos: if not pids_map.get(tv_info.name): type_n = reverse[tv_info.type] pids_map[tv_info.name] = [tv_info.tv_id, type_n] db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'mg', utils.format_time(time.time(), "%Y-%m-%d")) mg_db.info_and_play(pids_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'mg抓取完毕,耗时', utils.format_seconds(end - start)
def start_yk(now): start = int(time.time()) print "yk开始抓取 .." yk_spi = SpiderYk() yk_db = SerializeYk(now) # spider urls_map tv_urls_map = yk_spi.tv_urls_map() # db info and play tv_infos = TvInfo.mget_by_platform(u'yk') db_tv_names = [_.name for _ in tv_infos] db_play_info_map = PlayInfo.mget_map_by_platform_and_time_after( 'yk', utils.format_time(time.time(), "%Y-%m-%d")) for tv_info in tv_infos: if not tv_urls_map.get(tv_info.name): tv_urls_map[tv_info.name] = tv_info.detail_urls yk_db.info_and_play(tv_urls_map, db_tv_names, db_play_info_map) end = int(time.time()) print 'yk抓取完毕,耗时', utils.format_seconds(end - start)
def tv_info(self, tv_names, db_tv_names):
    """Parse each show's saved qq info file and upsert its TvInfo row.

    For every name the file ``name + TV_INFO_FILE_FIX`` is read from
    ``TV_INFO_FILE_DIR + SAVE_FILE``.  Files rejected by
    ``tv_info_is_valid_qq`` are skipped without logging.  Names present in
    ``db_tv_names`` are updated; all others are added.
    """
    info_dir = TV_INFO_FILE_DIR + SAVE_FILE
    strip_tags = re.compile(u'<.+?>')
    episode_keys = ('id', 'url', 'title', 'episode_number')

    for name in tv_names:
        page = utils.read(info_dir, name + TV_INFO_FILE_FIX)
        json_content = tv_info_is_valid_qq(page)
        if not json_content:
            continue
        play_list = get_playlist(json_content)

        # Keep only the first run of CJK characters of the 'BC' field.
        tv_type = re.search(u'[\u4e00-\u9fa5]+', play_list['BC']).group()
        description = play_list['TX']
        last_update_time = play_list['AT']
        update_info = play_list['SS']
        tv_id = play_list['ID']
        label = play_list['BE']
        cast_member = strip_tags.sub(u'', play_list['BM'])

        # Prefer a d-d-d token from 'TT', fall back to the first run of
        # digits, and use '' when neither is present.
        title = play_list['TT']
        found = re.search('\d+-\d+-\d+', title) or re.search('\d+', title)
        current_number = found.group() if found else ''

        src_play_list = play_list['src_list']['vsrcarray'][0]['playlist']
        columns = ([], [], [], [])
        for episode in get_all_list(src_play_list):
            for column, key in zip(columns, episode_keys):
                column.append(episode[key])
        all_number = len(columns[0])
        vids, detail_urls, detail_titles, detail_episodes = [
            ",".join(column) for column in columns
        ]

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=vids,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Scrape mgtv show pages and store TvInfo plus today's PlayInfo.

    Args:
        pids_map: show name -> ``[pid, type_n]``; ``type_n`` is a
            TV_TYPE_MAP key.
        db_tv_names: names that already have a TvInfo row (updated
            instead of added).
        db_play_info_map: show name -> previously stored total play
            count, used to derive the per-day delta.

    A show is logged and skipped as soon as any of its requests fails
    validation; nothing is written for it in that case.
    """
    play_url = 'http://videocenter-2039197532.cn-north-1.elb.amazonaws.com.cn/dynamicinfo?callback=callback&cid={}'  # noqa
    info_url = 'http://www.mgtv.com/v/{type_n}/{pid}'
    year_url = 'http://www.mgtv.com/v/1/{}/s/json.year.js'
    number_url = 'http://www.mgtv.com/v/1/{pid}/s/json.{year}.js'

    for name, tv_infos in pids_map.items():
        raw_pid = tv_infos[0]
        type_n = tv_infos[1]
        pid = raw_pid.encode('utf8')
        tv_id = pid

        info = info_is_valid(
            request(info_url.format(type_n=type_n, pid=raw_pid)))
        if not info:
            utils.log(message=u"mg《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        # Everything below is pulled out of the page text with regexes.
        series = re.search(u'"lastseries" : ".+?"', info).group()
        current_number = series.split(':')[1][2:-1]
        description = re.search(u'简介</em>(.|\n)+?</span>', info).group()
        description = re.compile(u'<.+?>|简介|:|\s').sub(u'', description)
        tv_type = TV_TYPE_MAP[type_n]
        if tv_type == u'电视剧':
            cast_flag = u'主演'
        else:
            cast_flag = u'主持人'
        cast_member = re.search(
            u'{}</em>(.|\n)+?</p>'.format(cast_flag), info).group()
        cast_member = re.compile(u'<.+?>|主演|主持人|:').sub(u'', cast_member)
        label = re.search(u'类型</em>(.|\n)+?</p>', info).group()
        label = re.compile(u'<.+?>|类型|:|\s').sub(u'', label)

        if tv_type == u'电视剧':
            total_text = re.search(u'共<b>\d+?</b>集', info).group()
            all_number = re.search(u'\d+', total_text).group()
        else:
            # Other types: count the entries of the first listed year.
            year = year_json_is_valid(request(year_url.format(pid)))
            if not year:
                utils.log(
                    message=u"mg zongi《{}》year_info ,结果不准确\r\n".format(name))
                continue
            number_info = number_info_is_valid(
                request(number_url.format(pid=pid, year=int(year[0]))))
            if not number_info:
                utils.log(
                    message=u"mg zongyi《{}》number_info ,结果不准确\r\n".format(name))
                continue
            all_number = len(list(number_info))

        play_json = play_is_valid(request(play_url.format(pid)))
        if not play_json:
            utils.log(message=u"mg《{}》play_info ,结果不准确\r\n".format(name))
            continue

        # allVVStr is a number with an optional unit suffix; strip the
        # unit, then scale by it.
        all_play_counts_str = play_json['data']['allVVStr']
        all_play_counts = float(
            re.compile(u'万|亿').sub(u'', all_play_counts_str))  # noqa
        if u'万' in all_play_counts_str:
            all_play_counts *= 10000
        elif u'亿' in all_play_counts_str:
            all_play_counts *= 100000000

        # Growth since the stored total, floored at 0; 0 when there is no
        # stored total.
        pre_all_play_counts = db_play_info_map.get(name)
        day_play_counts = (
            pre_all_play_counts and
            max(all_play_counts - int(pre_all_play_counts), 0) or 0)

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time='',
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info='',
            detail_urls='',
            vids=tv_id,
            type=tv_type,
            detail_titles='',
            detail_episodes='',
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Scrape mgtv show pages and store TvInfo plus today's PlayInfo.

    Args:
        pids_map: show name -> ``[pid, type_n]``; ``type_n`` is a
            TV_TYPE_MAP key.
        db_tv_names: names that already have a TvInfo row (updated
            instead of added).
        db_play_info_map: show name -> previously stored total play
            count, used to derive the per-day delta.

    A show is logged and skipped as soon as any of its requests fails
    validation; nothing is written for it in that case.
    """
    play_url = 'http://videocenter-2039197532.cn-north-1.elb.amazonaws.com.cn/dynamicinfo?callback=callback&cid={}'  # noqa
    info_url = 'http://www.mgtv.com/v/{type_n}/{pid}'
    year_url = 'http://www.mgtv.com/v/1/{}/s/json.year.js'
    number_url = 'http://www.mgtv.com/v/1/{pid}/s/json.{year}.js'

    for name, tv_infos in pids_map.items():
        raw_pid = tv_infos[0]
        type_n = tv_infos[1]
        pid = raw_pid.encode('utf8')
        tv_id = pid

        info = info_is_valid(
            request(info_url.format(type_n=type_n, pid=raw_pid)))
        if not info:
            utils.log(message=u"mg《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        # Everything below is pulled out of the page text with regexes.
        series = re.search(u'"lastseries" : ".+?"', info).group()
        current_number = series.split(':')[1][2:-1]
        description = re.search(u'简介</em>(.|\n)+?</span>', info).group()
        description = re.compile(u'<.+?>|简介|:|\s').sub(u'', description)
        tv_type = TV_TYPE_MAP[type_n]
        if tv_type == u'电视剧':
            cast_flag = u'主演'
        else:
            cast_flag = u'主持人'
        cast_member = re.search(
            u'{}</em>(.|\n)+?</p>'.format(cast_flag), info).group()
        cast_member = re.compile(u'<.+?>|主演|主持人|:').sub(u'', cast_member)
        label = re.search(u'类型</em>(.|\n)+?</p>', info).group()
        label = re.compile(u'<.+?>|类型|:|\s').sub(u'', label)

        if tv_type == u'电视剧':
            total_text = re.search(u'共<b>\d+?</b>集', info).group()
            all_number = re.search(u'\d+', total_text).group()
        else:
            # Other types: count the entries of the first listed year.
            year = year_json_is_valid(request(year_url.format(pid)))
            if not year:
                utils.log(
                    message=u"mg zongi《{}》year_info ,结果不准确\r\n".format(name))
                continue
            number_info = number_info_is_valid(
                request(number_url.format(pid=pid, year=int(year[0]))))
            if not number_info:
                utils.log(
                    message=u"mg zongyi《{}》number_info ,结果不准确\r\n".format(name))
                continue
            all_number = len(list(number_info))

        play_json = play_is_valid(request(play_url.format(pid)))
        if not play_json:
            utils.log(message=u"mg《{}》play_info ,结果不准确\r\n".format(name))
            continue

        # allVVStr is a number with an optional unit suffix; strip the
        # unit, then scale by it.
        all_play_counts_str = play_json['data']['allVVStr']
        all_play_counts = float(
            re.compile(u'万|亿').sub(u'', all_play_counts_str))  # noqa
        if u'万' in all_play_counts_str:
            all_play_counts *= 10000
        elif u'亿' in all_play_counts_str:
            all_play_counts *= 100000000

        # Growth since the stored total, floored at 0; 0 when there is no
        # stored total.
        pre_all_play_counts = db_play_info_map.get(name)
        day_play_counts = (
            pre_all_play_counts and
            max(all_play_counts - int(pre_all_play_counts), 0) or 0)

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time='',
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info='',
            detail_urls='',
            vids=tv_id,
            type=tv_type,
            detail_titles='',
            detail_episodes='',
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, tv_urls_map, db_tv_names, db_play_info_map):
    """Scrape each Youku show page and store TvInfo plus today's PlayInfo.

    Args:
        tv_urls_map: show name -> show page URL.
        db_tv_names: names that already have a TvInfo row (updated
            instead of added).
        db_play_info_map: show name -> previously stored total play
            count, used to derive the per-day delta.

    A page that fails ``info_and_play_is_valid`` is requested once more
    after a 30 second pause; if it fails again the show is logged and
    skipped.  Variety shows with no parsable episode list are also logged
    and skipped.
    """
    number_url = 'http://www.youku.com/show_episode/{}.html?dt=json&divid={}'  # noqa

    for name, url in tv_urls_map.items():
        warning_message = u"yk 《{} 》结果不准确\r\n".format(name)
        page = request(url)
        content = info_and_play_is_valid(page, name)
        if not content:
            # One retry after a pause before giving up on this show.
            time.sleep(30)
            page = request(url)
            content = info_and_play_is_valid(page, name)
        if not content:
            utils.log(message=warning_message)
            continue

        last_update_time = ''
        label = ''
        update_info = ''
        detail_urls = url
        detail_titles = ''
        detail_episodes = ''

        # Drop the trailing '.html' of the 'id....html' URL fragment.
        tv_id = re.search(u'id.+?\.html', url).group()[:-5]
        title_str = re.search(
            u'<h1 class="title">(.|\n)+?</h1>', page).group()
        # [16:-1] strips the leading 'target="_blank">' and trailing '<'.
        tv_type = re.search(
            u'target="_blank">.+?<', title_str).group()[16:-1]

        cast_member_flag = u'主持人' if tv_type == u'综艺' else u'主演'
        cast_member_str = re.search(
            cast_member_flag + u':</label>(.|\n)+?</span>', page).group()
        cast_member = ",".join(
            re.search('">.+?<', anchor.group()).group()[2:-1]
            for anchor in re.finditer(u'<a.+?</a>', cast_member_str))

        description_str = re.search(
            u'<span class="short" id="show_info_short"(.|\n)+?</div>',
            content).group()  # noqa
        description = re.compile(u'<.*?>|查看详情>>').sub(u'', description_str)

        all_number = ''
        current_number = ''
        if tv_type == u'电视剧':
            number_str = re.search(
                u'class="basenotice"(.|\n)+?<', content).group()
            current_number = re.search(u'更新至\d+', number_str)
            all_number = re.search(u'共\d+', number_str).group()[1:]
            # Without an "updated to N" marker the current number falls
            # back to the total.
            current_number = (
                current_number and current_number.group()[3:] or all_number)

        if tv_type == u'综艺':
            # NOTE(review): the nesting of this branch was reconstructed
            # from whitespace-collapsed source; confirm it against the
            # original file.
            all_number = 0
            tmp_episode = []
            for show_call in re.finditer(
                    u'y\.episode\.show\(\'.+?\'\)', content):
                divid = re.search(u'\'.+?\'', show_call.group()).group()[1:-1]
                current_number_str = request(
                    number_url.format(tv_id.encode('utf8'),
                                      divid.encode('utf8')))  # noqa
                if not current_number_str:
                    warning_message = \
                        u"yk 《{} 》number结果不准确\r\n".format(name)
                    utils.log(message=warning_message)
                    continue
                tmp_episode = list(
                    re.finditer(u'<ul(.|\n)+?</ul>', current_number_str))
                all_number += len(tmp_episode)

            if not all_number:
                # Nothing came from the paged episode API: fall back to
                # the episode block embedded in the show page itself.
                tmp_episode = re.search(
                    u'<div id="episode">(.|\n)+?</div>', page).group()  # noqa
                tmp_episode = list(
                    re.finditer(u'<ul(.|\n)+?</ul>', tmp_episode))
                all_number = len(tmp_episode)

            # Explicit guard instead of the former bare ``except:`` around
            # ``tmp_episode[0]``: an empty list is the only way that
            # index can fail.
            if not tmp_episode:
                utils.log(message=warning_message)
                continue

            current_number = re.search(
                u'<label>.+?</label>', tmp_episode[0].group()).group()
            current_number = re.compile(u'<.+?>|期').sub(u'', current_number)

        all_play_counts = re.search(
            u'<label>总播放:</label>.+?\n', content).group()
        all_play_counts = int(
            re.compile(u'<label>总播放:</label>|,|\n').sub(
                u'', all_play_counts))

        # Growth since the stored total, floored at 0; 0 when there is no
        # stored total.
        pre_all_play_counts = db_play_info_map.get(name)
        day_play_counts = (
            pre_all_play_counts and
            max(all_play_counts - int(pre_all_play_counts), 0) or 0)

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def tv_info(self, tv_names, db_tv_names):
    """Parse each show's saved qq info file and upsert its TvInfo row.

    For every name the file ``name + TV_INFO_FILE_FIX`` is read from
    ``TV_INFO_FILE_DIR + SAVE_FILE``.  Files rejected by
    ``tv_info_is_valid_qq`` are skipped without logging.  Names present in
    ``db_tv_names`` are updated; all others are added.
    """
    info_dir = TV_INFO_FILE_DIR + SAVE_FILE
    strip_tags = re.compile(u'<.+?>')
    episode_keys = ('id', 'url', 'title', 'episode_number')

    for name in tv_names:
        page = utils.read(info_dir, name + TV_INFO_FILE_FIX)
        json_content = tv_info_is_valid_qq(page)
        if not json_content:
            continue
        play_list = get_playlist(json_content)

        # Keep only the first run of CJK characters of the 'BC' field.
        tv_type = re.search(u'[\u4e00-\u9fa5]+', play_list['BC']).group()
        description = play_list['TX']
        last_update_time = play_list['AT']
        update_info = play_list['SS']
        tv_id = play_list['ID']
        label = play_list['BE']
        cast_member = strip_tags.sub(u'', play_list['BM'])

        # Prefer a d-d-d token from 'TT', fall back to the first run of
        # digits, and use '' when neither is present.
        title = play_list['TT']
        found = re.search('\d+-\d+-\d+', title) or re.search('\d+', title)
        current_number = found.group() if found else ''

        src_play_list = play_list['src_list']['vsrcarray'][0]['playlist']
        columns = ([], [], [], [])
        for episode in get_all_list(src_play_list):
            for column, key in zip(columns, episode_keys):
                column.append(episode[key])
        all_number = len(columns[0])
        vids, detail_urls, detail_titles, detail_episodes = [
            ",".join(column) for column in columns
        ]

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=vids,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)
def dianshiju(self, urls_map, db_tv_names, db_play_info_map):
    """Scrape let TV-series pages and store TvInfo plus today's PlayInfo.

    Args:
        urls_map: show name -> ``[url, pid, cast_member, label]``.
        db_tv_names: names that already have a TvInfo row (updated
            instead of added).
        db_play_info_map: show name -> previously stored total play
            count, used to derive the per-day delta.

    The play-count request is attempted twice.  A show whose page or
    play-count response stays invalid is logged and skipped.
    """
    play_url = 'http://v.stat.letv.com/vplay/queryMmsTotalPCount?callback=callback&pid={}'  # noqa

    for name, tv_info in urls_map.items():
        url = tv_info[0]
        pid = tv_info[1]
        tv_id = pid
        cast_member = tv_info[2]
        label = tv_info[3]
        tv_type = u'电视剧'

        content = dianshiju_is_valid(request(url))
        if not content:
            warning_message = u"let《{}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        description = re.search(
            u'<p class="p7">(.|\n)+?</p>', content).group()
        description = re.compile(u'<.+?>').sub('', description)
        all_number = re.search(u'共\d+?集', content).group()
        all_number = re.search(u'\d+', all_number).group()
        # Without an "up to episode N" marker the current number falls
        # back to the total.
        current_number = re.search(u'至\d+?集', content)
        current_number = (
            current_number and
            re.search(u'\d+', current_number.group()).group() or all_number)

        json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            # Single retry of the play-count request.
            json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            warning_message = u"let《{}》play_info ,结果不准确\r\n".format(name)
            # This warning used to be built but never logged.
            utils.log(message=warning_message)
            continue

        all_play_counts = json_content.get('plist_play_count')
        # Growth since the stored total, floored at 0; 0 when there is no
        # stored total.
        pre_all_play_counts = db_play_info_map.get(name)
        day_play_counts = (
            pre_all_play_counts and
            max(all_play_counts - int(pre_all_play_counts), 0) or 0)

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=u'',
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=u'',
            detail_urls=url,
            vids=tv_id,
            type=tv_type,
            detail_titles=u'',
            detail_episodes=u'',
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Fetch sh playlist info and play counts; store TvInfo and PlayInfo.

    Args:
        pids_map: show name -> playlist id.
        db_tv_names: names that already have a TvInfo row (updated
            instead of added).
        db_play_info_map: show name -> previously stored total play
            count, used to derive the per-day delta.

    A show whose info or play-count response fails validation is logged
    and skipped; nothing is written for it.
    """
    play_url = "http://count.vrs.sohu.com/count/queryext.action?plids={}&callback=callback"  # noqa
    info_url = "http://pl.hd.sohu.com/videolist?playlistid={}&callback=callback"  # noqa

    for name, pid in pids_map.items():
        tv_id = pid
        json_content = info_is_valid(
            request(info_url.format(pid.encode("utf8"))))
        if not json_content:
            utils.log(message=u"sh《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        description = json_content["albumDesc"]
        current_number = json_content["updateSet"]
        total_set = json_content["totalSet"]
        # A total of u"0" (or an empty one) is replaced by the current
        # number.
        if total_set != u"0" and total_set:
            all_number = total_set
        else:
            all_number = current_number

        tv_type = TV_TYPE_MAP.get(json_content["cid"])
        # Variety shows list hosts, every other type lists actors.
        people_key = "hosts" if tv_type == u"综艺" else "actors"
        cast_member = u",".join(json_content[people_key])
        label = ",".join(json_content["categories"])
        update_info = json_content["updateNotification"]

        play_json = play_is_valid(
            request(play_url.format(pid.encode("utf8"))), pid)
        if not play_json:
            utils.log(message=u"sh《{}》play_info ,结果不准确\r\n".format(name))
            continue

        all_play_counts = play_json[pid]["total"]
        # Growth since the stored total, floored at 0; 0 when there is no
        # stored total.
        pre_all_play_counts = db_play_info_map.get(name)
        day_play_counts = (
            pre_all_play_counts and
            max(all_play_counts - int(pre_all_play_counts), 0) or 0)

        record = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time="",
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls="",
            vids=tv_id,
            type=tv_type,
            detail_titles="",
            detail_episodes="",
        )
        if name in db_tv_names:
            TvInfo.update(**record)
        else:
            TvInfo.add(**record)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def zongyi(self, urls_map, db_tv_names, db_play_info_map):
    """Crawl Le.com variety shows and store their details and play counts.

    For each entry of ``urls_map`` the show page is requested to find the
    album ``pid``; the description page, the episode list and the play
    counter are then requested for that pid.  Whenever a response fails
    validation or an expected fragment is missing, a warning is logged
    through ``utils.log`` (except for an invalid show page, which is
    skipped silently) and the loop moves on to the next show instead of
    raising.

    Args:
        urls_map: mapping of show name -> sequence whose item 0 is the
            show page URL and item 1 is the label stored with the show.
        db_tv_names: names already stored; a name found here is written
            with ``TvInfo.update``, any other with ``TvInfo.add``.
        db_play_info_map: mapping of show name -> previously stored total
            play count, used to derive the per-day difference.
    """
    number_url = 'http://api.le.com/mms/out/album/videos?id={}&cid=11&platform=pc&callback=callback'  # noqa
    play_url = 'http://v.stat.letv.com/vplay/queryMmsTotalPCount?callback=callback&pid={}'  # noqa
    description_url = 'http://www.le.com/zongyi/{}.html'

    # Several names may point at the same album; handle each pid only once.
    seen_pids = set()

    for name, tv_info in urls_map.items():
        url = tv_info[0]
        label = tv_info[1]

        pid_page = zongyi_is_valid(request(url))
        if not pid_page:
            continue
        pid_match = re.search(u'pid: (\\d+?),', pid_page)
        if pid_match is None:
            utils.log(message=u"let《{}》pid_info ,结果不准确\r\n".format(name))
            continue
        pid = pid_match.group(1)
        if pid in seen_pids:
            continue
        seen_pids.add(pid)
        tv_id = pid

        # --- description --------------------------------------------------
        d_page = description_is_valid(
            request(description_url.format(pid.encode('utf8'))))
        description_match = None
        if d_page:
            description_match = re.search(
                u'<p class="p7">(.|\n)+?</p>', d_page)
        if description_match is None:
            utils.log(
                message=u"let《{}》description_info ,结果不准确\r\n".format(name))
            continue
        # Strip the markup and keep only the text.
        description = re.sub(u'<.+?>', u'', description_match.group())

        # --- episode numbers and guests -----------------------------------
        n_json = number_utl_is_valid(request(number_url.format(pid)))
        episodes = n_json['data'] if n_json else None
        if not episodes:
            utils.log(
                message=u"let zongyi《{}》number_info ,结果不准确\r\n".format(name))
            continue
        all_number = n_json['total']
        current_number = episodes[0]['episode']

        # Each 'guest' value is a space separated list of names.  Collect
        # every distinct name once, in first-seen order, ignoring entries
        # without guests and empty tokens.
        guest_names = []
        seen_guests = set()
        for episode in episodes:
            guests = episode.get('guest')
            if not guests:
                continue
            for guest in guests.split(" "):
                if guest and guest not in seen_guests:
                    seen_guests.add(guest)
                    guest_names.append(guest)
        cast_member = " ".join(guest_names)

        # --- play counts --------------------------------------------------
        json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            utils.log(message=u"let《{}》play_info ,结果不准确\r\n".format(name))
            continue
        all_play_counts = json_content.get('plist_play_count')
        pre_all_play_counts = db_play_info_map.get(name)
        if pre_all_play_counts and all_play_counts is not None:
            # Never report a negative daily count.
            day_play_counts = max(
                all_play_counts - int(pre_all_play_counts), 0)
        else:
            day_play_counts = 0

        # --- persistence --------------------------------------------------
        tv_type = u'综艺'
        tv_fields = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=u'',
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=u'',
            detail_urls=url,
            vids=tv_id,
            type=tv_type,
            detail_titles=u'',
            detail_episodes=u'',
        )
        if name in db_tv_names:
            TvInfo.update(**tv_fields)
        else:
            TvInfo.add(**tv_fields)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Store show details and play statistics for each Sohu playlist.

    Two endpoints are requested per playlist id: one for the show details
    and one for the play counter.  A response that fails validation is
    reported through ``utils.log`` and the show is skipped without
    writing anything.

    Args:
        pids_map: mapping of show name -> playlist id.
        db_tv_names: names already stored; decides between
            ``TvInfo.update`` and ``TvInfo.add``.
        db_play_info_map: mapping of show name -> previously stored total
            play count.
    """
    play_url = 'http://count.vrs.sohu.com/count/queryext.action?plids={}&callback=callback'  # noqa
    info_url = 'http://pl.hd.sohu.com/videolist?playlistid={}&callback=callback'  # noqa

    for name, pid in pids_map.items():
        tv_id = pid

        details = info_is_valid(request(info_url.format(pid.encode('utf8'))))
        if not details:
            warning_message = u"sh《{}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        description = details['albumDesc']
        last_update_time = ''
        current_number = details['updateSet']
        # Fall back to the released count when the total is u'0' or falsy.
        all_number = details['totalSet']
        if all_number == u'0' or not all_number:
            all_number = current_number

        tv_type = TV_TYPE_MAP.get(details['cid'])
        # Variety shows carry hosts, all other types carry actors.
        if tv_type != u'综艺':
            people = details['actors']
        else:
            people = details['hosts']
        cast_member = u",".join(people)
        label = ",".join(details['categories'])
        update_info = details['updateNotification']
        detail_urls = ''
        detail_titles = ''
        detail_episodes = ''

        counters = play_is_valid(
            request(play_url.format(pid.encode('utf8'))), pid)
        if not counters:
            warning_message = u"sh《{}》play_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        all_play_counts = counters[pid]['total']
        earlier_total = db_play_info_map.get(name)
        day_play_counts = 0
        if earlier_total:
            # Clamp at zero so a shrinking total never yields a negative.
            day_play_counts = max(
                all_play_counts - int(earlier_total), 0) or 0

        common = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        if name in db_tv_names:
            TvInfo.update(**common)
        else:
            TvInfo.add(**common)

        PlayInfo.add(tv_id=tv_id,
                     tv_name=name,
                     day_play_counts=day_play_counts,
                     all_play_counts=all_play_counts,
                     time_at=self.now,
                     platform=PLATFORM,
                     type=tv_type)
def info_and_play(self, tv_urls_map, db_tv_names, db_play_info_map):
    """Crawl Youku show pages and store show details plus play counts.

    Each page is requested and validated; when validation fails the
    request is repeated once after a 30 second pause.  The page is then
    parsed with regular expressions.  A show whose page lacks any fragment
    the parser needs is logged through ``utils.log`` and skipped, so one
    unexpected page does not abort the whole loop.

    Args:
        tv_urls_map: mapping of show name -> show page URL.
        db_tv_names: names already stored; a name found here is written
            with ``TvInfo.update``, any other with ``TvInfo.add``.
        db_play_info_map: mapping of show name -> previously stored total
            play count, used to derive the per-day difference.
    """

    def first_match(pattern, text):
        """Return the text matched by ``pattern`` in ``text``, or None."""
        found = re.search(pattern, text)
        if found is None:
            return None
        return found.group()

    number_url = 'http://www.youku.com/show_episode/{}.html?dt=json&divid={}'  # noqa

    for name, url in tv_urls_map.items():
        warning_message = u"yk 《{} 》结果不准确\r\n".format(name)
        number_warning = u"yk 《{} 》number结果不准确\r\n".format(name)

        page = request(url)
        content = info_and_play_is_valid(page, name)
        if not content:
            # One retry after a pause before giving up on this show.
            time.sleep(30)
            page = request(url)
            content = info_and_play_is_valid(page, name)
        if not content:
            utils.log(message=warning_message)
            continue

        # --- fragments every show needs -----------------------------------
        id_str = first_match(u'id.+?\\.html', url)
        title_str = first_match(u'<h1 class="title">(.|\n)+?</h1>', page)
        type_str = None
        if title_str is not None:
            type_str = first_match(u'target="_blank">.+?<', title_str)
        description_str = first_match(
            u'<span class="short" id="show_info_short"(.|\n)+?</div>',
            content)
        play_str = first_match(u'<label>总播放:</label>.+?\n', content)
        if (id_str is None or type_str is None or
                description_str is None or play_str is None):
            utils.log(message=warning_message)
            continue

        tv_id = id_str[:-5]        # drop the trailing ".html"
        tv_type = type_str[16:-1]  # drop 'target="_blank">' and the '<'
        description = re.sub(u'<.*?>|查看详情>>', u'', description_str)

        try:
            all_play_counts = int(
                re.sub(u'<label>总播放:</label>|,|\n', u'', play_str))
        except ValueError:
            utils.log(message=warning_message)
            continue

        # --- cast ---------------------------------------------------------
        # Variety shows are labelled with hosts, other types with actors.
        cast_member_flag = u'主持人' if tv_type == u'综艺' else u'主演'
        cast_member_str = first_match(
            cast_member_flag + u':</label>(.|\n)+?</span>', page)
        if cast_member_str is None:
            utils.log(message=warning_message)
            continue
        cast_names = []
        for anchor in re.finditer(u'<a.+?</a>', cast_member_str):
            anchor_text = first_match('">.+?<', anchor.group())
            if anchor_text is not None:
                cast_names.append(anchor_text[2:-1])
        cast_member = ",".join(cast_names)

        # --- episode numbers ----------------------------------------------
        all_number = ''
        current_number = ''
        if tv_type == u'电视剧':
            number_str = first_match(
                u'class="basenotice"(.|\n)+?<', content)
            total_str = None
            if number_str is not None:
                total_str = first_match(u'共\\d+', number_str)
            if total_str is None:
                utils.log(message=number_warning)
                continue
            all_number = total_str[1:]
            updated_str = first_match(u'更新至\\d+', number_str)
            # A page without an "updated to" marker reports the total.
            current_number = updated_str[3:] if updated_str else all_number

        if tv_type == u'综艺':
            all_number = 0
            tmp_episode = []
            for show_call in re.finditer(
                    u'y\\.episode\\.show\\(\'.+?\'\\)', content):
                divid = re.search(
                    u'\'.+?\'', show_call.group()).group()[1:-1]
                episode_page = request(number_url.format(
                    tv_id.encode('utf8'), divid.encode('utf8')))
                if not episode_page:
                    utils.log(message=number_warning)
                    continue
                page_items = list(
                    re.finditer(u'<ul(.|\n)+?</ul>', episode_page))
                all_number += len(page_items)
                if page_items:
                    # Keep the most recent page that listed episodes so
                    # its first entry can supply current_number below.
                    tmp_episode = page_items
            if not all_number:
                # No episode pages were listed; fall back to the episode
                # block embedded in the show page itself.
                episode_div = first_match(
                    u'<div id="episode">(.|\n)+?</div>', page)
                if episode_div is not None:
                    tmp_episode = list(
                        re.finditer(u'<ul(.|\n)+?</ul>', episode_div))
                if not tmp_episode:
                    utils.log(message=warning_message)
                    continue
                all_number = len(tmp_episode)
            label_str = first_match(
                u'<label>.+?</label>', tmp_episode[0].group())
            if label_str is None:
                utils.log(message=number_warning)
                continue
            current_number = re.sub(u'<.+?>|期', u'', label_str)

        # --- daily play count ---------------------------------------------
        pre_all_play_counts = db_play_info_map.get(name)
        if pre_all_play_counts:
            # Never report a negative daily count.
            day_play_counts = max(
                all_play_counts - int(pre_all_play_counts), 0)
        else:
            day_play_counts = 0

        # --- persistence --------------------------------------------------
        tv_fields = dict(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time='',
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label='',
            update_info='',
            detail_urls=url,
            vids=tv_id,
            type=tv_type,
            detail_titles='',
            detail_episodes='',
        )
        if name in db_tv_names:
            TvInfo.update(**tv_fields)
        else:
            TvInfo.add(**tv_fields)

        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )