def request(url, interval=60, cycle_times=3, name='qq'):
    """Fetch ``url`` with retries and return the response body as text.

    At most ``cycle_times`` GET attempts are made. An attempt that raises,
    or that is answered with a status other than 200, counts as a failure
    and the next attempt follows. After an HTTP 500 a message is logged and
    the function waits ``interval`` seconds before the next attempt.

    :param url: address to fetch.
    :param interval: seconds to wait after an HTTP 500 answer.
    :param cycle_times: maximum number of attempts; with a value of zero or
        less no attempt is made at all.
    :param name: unused; kept so existing callers keep working.
    :return: the decoded body of the first 200 answer, or ``None`` when
        every attempt failed.
    """
    attempts_left = cycle_times
    # "> 0" (rather than plain truthiness) keeps a negative count from
    # looping without end.
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                # A declared GBK encoding is trusted; anything else is
                # decoded as UTF-8.
                declared = response.encoding
                if not (declared and declared.lower() == 'gbk'):
                    response.encoding = 'utf8'
                return response.text
            if response.status_code == 500:
                # Log first so the "will retry in N seconds" message is
                # written before the wait it announces.
                utils.log(message="网络出现错误,{}秒之后会重新抓取\r\n".format(interval))  # noqa
                time.sleep(interval)
        except Exception:
            # Deliberate best effort: a failed attempt is simply retried.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    # Every attempt failed: honour the documented "error returns None".
    return None
def dianshiju_info(self, tv_infos, db_tv_names):
    """Fetch details for every show in ``tv_infos`` and store them via ``TvInfo``.

    :param tv_infos: mapping of show name to an indexable record; the code
        reads ``[0]['url']``, ``[1]['id']`` and ``[2]['v_id']`` from it.
    :param db_tv_names: names already stored; a show found in it is updated
        through ``TvInfo.update``, any other show goes to ``TvInfo.add``.
    """
    endpoint = u'http://cache.video.qiyi.com/jp/vi/{}/{}/'
    for name, record in tv_infos.items():
        warning_message = u"《iqy {}》tv_info ,结果不准确\r\n".format(name)
        tv_id = record[1]['id']
        vids = record[2]['v_id']
        payload = tv_info_is_valid(request(endpoint.format(tv_id, vids)))
        if not payload:
            utils.log(message=warning_message)
            continue

        all_number = payload['es']
        current_number = payload['upOrder']
        description = payload['info']
        label = payload['tg']
        cast_member = payload['ma']
        update_info = payload['qiyiPlayStrategy'][:32]
        last_update_time = payload['up']
        detail_urls = record[0]['url']
        tv_type = u'电视剧'
        detail_titles = payload['vn']
        detail_episodes = ''

        # Both persistence calls take the same keyword set, so only the
        # callable differs. Note that ``vids`` receives ``tv_id``.
        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
def play_info(self, db_tv_infos):
    """Download play-count pages and write each valid one to a file.

    A record whose ``type`` is the TV-series label is fetched once by its
    ``tv_id``; a record with the variety-show label is fetched once per
    ``vids``/``detail_episodes`` pair. Pages rejected by
    ``play_info_is_valid_qq`` are logged and skipped.

    :param db_tv_infos: iterable of records exposing ``type``, ``name``,
        ``tv_id``, ``vids`` and ``detail_episodes``.
    """
    endpoint = 'http://data.video.qq.com/fcgi-bin/data?tid=70&&appid=10001007&appkey=e075742beb866145&callback=callback&low_login=1&idlist={}&otype=json'  # noqa
    target_dir = PLAY_INFO_FILE_DIR + SAVE_FILE
    utils.mkdir(target_dir)
    for show in db_tv_infos:
        if show.type == u'电视剧':
            warning_message = \
                u"qq Warning《{}》play_info ,结果不准确\r\n".format(show.name)
            page = request(endpoint.format(show.tv_id))
            if play_info_is_valid_qq(page):
                utils.write(target_dir,
                            show.name + PLAY_INFO_FILE_FIX, page)
            else:
                utils.log(message=warning_message)
        elif show.type == u'综艺':
            vid_list = show.vids.split(',')
            episode_list = show.detail_episodes.split(',')
            for vid, episode in zip(vid_list, episode_list):
                warning_message = \
                    u"qq《{}》第{}期play_info ,结果不准确\r\n".format(
                        show.name, episode)
                page = request(endpoint.format(vid))
                if play_info_is_valid_qq(page):
                    utils.write(target_dir,
                                show.name + episode + PLAY_INFO_FILE_FIX,
                                page)
                else:
                    utils.log(message=warning_message)
def play_info(self, db_play_info_map, db_tv_infos):
    """Sum the play counts of every vid of each show and record them.

    Each vid is fetched once and, if the answer is rejected by
    ``play_info_is_valid``, once more after a 30 second pause. A vid that
    fails twice is logged and left out of the total. One ``PlayInfo`` row
    is added per show, carrying the last vid of the show as ``tv_id``.

    :param db_play_info_map: mapping of show name to the previously stored
        total play count (may lack the show).
    :param db_tv_infos: iterable of records exposing ``name``, ``vids``
        (comma separated) and ``type``.
    """
    endpoint = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{}/'
    for show in db_tv_infos:
        running_total = 0
        for vid in show.vids.split(','):
            warning_message = \
                u"iqy《{}》{} play_info ,结果不准确\r\n".format(show.name, vid)
            payload = play_info_is_valid(request(endpoint.format(vid)))
            if not payload:
                # One delayed retry before giving up on this vid.
                time.sleep(30)
                payload = play_info_is_valid(request(endpoint.format(vid)))
            if not payload:
                utils.log(message=warning_message)
                continue
            running_total += int(payload.get('playCount'))

        all_play_counts = running_total
        previous_total = db_play_info_map.get(show.name)
        if previous_total:
            # Never report a negative daily delta.
            day_play_counts = \
                max(all_play_counts - int(previous_total), 0) or 0
        else:
            day_play_counts = 0
        PlayInfo.add(
            tv_id=vid,
            tv_name=show.name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=show.type,
        )
def play_info(self, db_play_info_map, db_tv_infos):
    """Record the summed play count of each show in ``db_tv_infos``.

    For every comma separated vid of a show the count page is requested;
    an answer rejected by ``play_info_is_valid`` is requested a second time
    after 30 seconds, and a second rejection is logged and skipped. The
    total over the remaining vids is stored through ``PlayInfo.add``
    together with the difference to the previously stored total.

    :param db_play_info_map: mapping of show name to the earlier total.
    :param db_tv_infos: iterable of records exposing ``name``, ``vids``
        and ``type``.
    """
    url_template = 'http://mixer.video.iqiyi.com/jp/mixin/videos/{}/'
    for record in db_tv_infos:
        total = 0
        for vid in record.vids.split(','):
            warning_message = \
                u"iqy《{}》{} play_info ,结果不准确\r\n".format(record.name, vid)
            target = url_template.format(vid)
            data = play_info_is_valid(request(target))
            if not data:
                time.sleep(30)
                data = play_info_is_valid(request(url_template.format(vid)))
            if not data:
                utils.log(message=warning_message)
                continue
            total += int(data.get('playCount'))

        all_play_counts = total
        earlier = db_play_info_map.get(record.name)
        day_play_counts = 0
        if earlier:
            # Clamp at zero so a shrinking total never yields a negative day.
            day_play_counts = max(all_play_counts - int(earlier), 0) or 0
        # ``vid`` is the last vid iterated for this show.
        PlayInfo.add(tv_id=vid,
                     tv_name=record.name,
                     day_play_counts=day_play_counts,
                     all_play_counts=all_play_counts,
                     time_at=self.now,
                     platform=PLATFORM,
                     type=record.type)
def request(url, interval=60, cycle_times=3, name='qq'):
    """Fetch ``url`` with retries and return the response body as text.

    At most ``cycle_times`` GET attempts are made. An attempt that raises,
    or that is answered with a status other than 200, counts as a failure
    and the next attempt follows. After an HTTP 500 a message is logged and
    the function waits ``interval`` seconds before the next attempt.

    :param url: address to fetch.
    :param interval: seconds to wait after an HTTP 500 answer.
    :param cycle_times: maximum number of attempts; with a value of zero or
        less no attempt is made at all.
    :param name: unused; kept so existing callers keep working.
    :return: the decoded body of the first 200 answer, or ``None`` when
        every attempt failed.
    """
    remaining = cycle_times
    # Comparing against zero keeps a negative count from looping forever.
    while remaining > 0:
        remaining -= 1
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                # Keep a declared GBK encoding; otherwise decode as UTF-8.
                declared = response.encoding
                if not (declared and declared.lower() == 'gbk'):
                    response.encoding = 'utf8'
                return response.text
            if response.status_code == 500:
                # The message announces the wait, so it is logged first.
                utils.log(
                    message="网络出现错误,{}秒之后会重新抓取\r\n".format(interval))  # noqa
                time.sleep(interval)
        except Exception:
            # Deliberate best effort: a failed attempt is simply retried.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    # All attempts failed: return None as the contract states, never a
    # Response object or an unbound name.
    return None
def dianshiju_info(self, tv_infos, db_tv_names):
    """Store the details of each show in ``tv_infos`` through ``TvInfo``.

    The detail page of a show is requested and checked with
    ``tv_info_is_valid``; a rejected page is logged and the show skipped.

    :param tv_infos: mapping of show name to an indexable record from
        which ``[0]['url']``, ``[1]['id']`` and ``[2]['v_id']`` are read.
    :param db_tv_names: names already stored; decides between
        ``TvInfo.update`` and ``TvInfo.add``.
    """
    url_template = u'http://cache.video.qiyi.com/jp/vi/{}/{}/'
    for name, entry in tv_infos.items():
        warning_message = u"《iqy {}》tv_info ,结果不准确\r\n".format(name)
        tv_id = entry[1]['id']
        vids = entry[2]['v_id']
        page = request(url_template.format(tv_id, vids))
        data = tv_info_is_valid(page)
        if not data:
            utils.log(message=warning_message)
            continue

        all_number = data['es']
        current_number = data['upOrder']
        description = data['info']
        label = data['tg']
        cast_member = data['ma']
        update_info = data['qiyiPlayStrategy'][:32]
        last_update_time = data['up']
        detail_urls = entry[0]['url']
        tv_type = u'电视剧'
        detail_titles = data['vn']
        detail_episodes = ''

        if name in db_tv_names:
            store = TvInfo.update
        else:
            store = TvInfo.add
        # ``vids`` is filled with ``tv_id``, exactly as both original calls did.
        store(name=name,
              tv_id=tv_id,
              description=description,
              last_update_time=last_update_time,
              all_number=all_number,
              current_number=current_number,
              cast_member=cast_member,
              platform=PLATFORM,
              label=label,
              update_info=update_info,
              detail_urls=detail_urls,
              vids=tv_id,
              type=tv_type,
              detail_titles=detail_titles,
              detail_episodes=detail_episodes)
def tv_info(self, tv_names):
    """Search for every name in ``tv_names`` and write valid result pages to disk.

    A page rejected by ``tv_info_is_valid_qq`` is logged and not written.

    :param tv_names: iterable of show names; each is UTF-8 encoded before
        it is placed in the query URL.
    """
    search_url = 'http://s.video.qq.com/search?comment=1&plat=2&otype=json&query={}&callback=callback'  # noqa
    target_dir = TV_INFO_FILE_DIR + SAVE_FILE
    utils.mkdir(target_dir)
    for name in tv_names:
        warning_message = u"qq Warning《{}》tv_info ,结果不准确\r\n".format(name)
        page = request(search_url.format(name.encode('utf8')))
        if tv_info_is_valid_qq(page):
            utils.write(target_dir, name + TV_INFO_FILE_FIX,
                        page.encode('utf8'))
        else:
            utils.log(message=warning_message)
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Store show details and play counts for every entry of ``pids_map``.

    For each show the playlist description and the play counter are
    requested. If either answer is rejected by its validator the show is
    logged and skipped. Otherwise the details go to ``TvInfo`` (update or
    add) and one ``PlayInfo`` row is added.

    :param pids_map: mapping of show name to playlist id.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    play_url = 'http://count.vrs.sohu.com/count/queryext.action?plids={}&callback=callback'  # noqa
    info_url = 'http://pl.hd.sohu.com/videolist?playlistid={}&callback=callback'  # noqa
    for name, pid in pids_map.items():
        tv_id = pid
        details = info_is_valid(request(info_url.format(pid.encode('utf8'))))
        if not details:
            utils.log(message=u"sh《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        description = details['albumDesc']
        last_update_time = ''
        current_number = details['updateSet']
        all_number = details['totalSet']
        # A total of u'0' (or any falsy total) falls back to the latest
        # episode number.
        if all_number == u'0' or not all_number:
            all_number = current_number
        tv_type = TV_TYPE_MAP.get(details['cid'])
        people_key = 'hosts' if tv_type == u'综艺' else 'actors'
        cast_member = u",".join(details[people_key])
        label = ",".join(details['categories'])
        update_info = details['updateNotification']
        detail_urls = ''
        detail_titles = ''
        detail_episodes = ''

        play_json = play_is_valid(
            request(play_url.format(pid.encode('utf8'))), pid)
        if not play_json:
            utils.log(message=u"sh《{}》play_info ,结果不准确\r\n".format(name))
            continue

        all_play_counts = play_json[pid]['total']
        earlier = db_play_info_map.get(name)
        if earlier:
            day_play_counts = max(all_play_counts - int(earlier), 0) or 0
        else:
            day_play_counts = 0

        # ``pid`` and ``tv_id`` are the same object, so one call covers both
        # original branches.
        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Scrape show details and play counts for every entry of ``pids_map``.

    Details are cut out of the show page with regular expressions. The
    episode total of a TV series is read from the page, for other types it
    is the number of items in the per-year listing. The play counter is
    read from a separate endpoint. Any rejected answer is logged and the
    show skipped.

    :param pids_map: mapping of show name to a record whose ``[0]`` is the
        show id and ``[1]`` the type key used in the URL and in
        ``TV_TYPE_MAP``.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    play_url = 'http://videocenter-2039197532.cn-north-1.elb.amazonaws.com.cn/dynamicinfo?callback=callback&cid={}'  # noqa
    info_url = 'http://www.mgtv.com/v/{type_n}/{pid}'
    year_url = 'http://www.mgtv.com/v/1/{}/s/json.year.js'
    number_url = 'http://www.mgtv.com/v/1/{pid}/s/json.{year}.js'
    for name, tv_infos in pids_map.items():
        pid = tv_infos[0].encode('utf8')
        tv_id = pid
        info = info_is_valid(
            request(info_url.format(type_n=tv_infos[1], pid=tv_infos[0])))
        if not info:
            utils.log(message=u"mg《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        last_update_time = ''
        update_info = ''
        detail_urls = ''
        detail_titles = ''
        detail_episodes = ''

        series_field = re.search(u'"lastseries" : ".+?"', info).group()
        # Keep only the text between the quotes after the colon.
        current_number = series_field.split(':')[1][2:-1]
        raw_description = re.search(u'简介</em>(.|\n)+?</span>', info).group()
        description = re.sub(u'<.+?>|简介|:|\s', u'', raw_description)
        tv_type = TV_TYPE_MAP[tv_infos[1]]
        if tv_type == u'电视剧':
            cast_flag = u'主演'
        else:
            cast_flag = u'主持人'
        raw_cast = re.search(
            u'{}</em>(.|\n)+?</p>'.format(cast_flag), info).group()
        cast_member = re.sub(u'<.+?>|主演|主持人|:', u'', raw_cast)
        raw_label = re.search(u'类型</em>(.|\n)+?</p>', info).group()
        label = re.sub(u'<.+?>|类型|:|\s', u'', raw_label)

        if tv_type == u'电视剧':
            total_text = re.search(u'共<b>\d+?</b>集', info).group()
            all_number = re.search(u'\d+', total_text).group()
        else:
            year = year_json_is_valid(request(year_url.format(pid)))
            if not year:
                utils.log(
                    message=u"mg zongi《{}》year_info ,结果不准确\r\n".format(name))
                continue
            number_info = number_info_is_valid(
                request(number_url.format(pid=pid, year=int(year[0]))))
            if not number_info:
                utils.log(
                    message=u"mg zongyi《{}》number_info ,结果不准确\r\n".format(name))  # noqa
                continue
            all_number = len(list(number_info))

        play_json = play_is_valid(request(play_url.format(pid)))
        if not play_json:
            utils.log(message=u"mg《{}》play_info ,结果不准确\r\n".format(name))
            continue

        all_play_counts_str = play_json['data']['allVVStr']
        # The counter is a number with an optional unit suffix.
        all_play_counts = float(re.sub(u'万|亿', u'', all_play_counts_str))
        if u'万' in all_play_counts_str:
            all_play_counts *= 10000
        elif u'亿' in all_play_counts_str:
            all_play_counts *= 100000000

        earlier = db_play_info_map.get(name)
        if earlier:
            day_play_counts = max(all_play_counts - int(earlier), 0) or 0
        else:
            day_play_counts = 0

        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Collect details and play counts for the shows in ``pids_map``.

    The show page supplies the latest episode, description, cast and
    label via regular expressions. The episode total comes from the page
    for a TV series and from the per-year listing otherwise. The total
    play count comes from the counter endpoint. A rejected answer is
    logged and ends the handling of that show.

    :param pids_map: mapping of show name to a record; ``[0]`` is the show
        id, ``[1]`` the type key for the URL and for ``TV_TYPE_MAP``.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total.
    """
    play_url = 'http://videocenter-2039197532.cn-north-1.elb.amazonaws.com.cn/dynamicinfo?callback=callback&cid={}'  # noqa
    info_url = 'http://www.mgtv.com/v/{type_n}/{pid}'
    year_url = 'http://www.mgtv.com/v/1/{}/s/json.year.js'
    number_url = 'http://www.mgtv.com/v/1/{pid}/s/json.{year}.js'
    for name, tv_infos in pids_map.items():
        pid = tv_infos[0].encode('utf8')
        tv_id = pid
        page = request(info_url.format(type_n=tv_infos[1], pid=tv_infos[0]))
        info = info_is_valid(page)
        if not info:
            warning_message = u"mg《{}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        last_update_time = update_info = ''
        detail_urls = detail_titles = detail_episodes = ''

        latest = re.search(u'"lastseries" : ".+?"', info).group()
        # Value between the quotes that follow the colon.
        current_number = latest.split(':')[1][2:-1]
        description_html = re.search(u'简介</em>(.|\n)+?</span>', info).group()
        description = re.sub(u'<.+?>|简介|:|\s', u'', description_html)
        tv_type = TV_TYPE_MAP[tv_infos[1]]
        is_series = tv_type == u'电视剧'
        cast_flag = u'主演' if is_series else u'主持人'
        cast_html = re.search(
            u'{}</em>(.|\n)+?</p>'.format(cast_flag), info).group()
        cast_member = re.sub(u'<.+?>|主演|主持人|:', u'', cast_html)
        label_html = re.search(u'类型</em>(.|\n)+?</p>', info).group()
        label = re.sub(u'<.+?>|类型|:|\s', u'', label_html)

        if is_series:
            total_html = re.search(u'共<b>\d+?</b>集', info).group()
            all_number = re.search(u'\d+', total_html).group()
        else:
            year = year_json_is_valid(request(year_url.format(pid)))
            if not year:
                warning_message = \
                    u"mg zongi《{}》year_info ,结果不准确\r\n".format(name)
                utils.log(message=warning_message)
                continue
            listing_url = number_url.format(pid=pid, year=int(year[0]))
            number_info = number_info_is_valid(request(listing_url))
            if not number_info:
                warning_message = \
                    u"mg zongyi《{}》number_info ,结果不准确\r\n".format(name)
                utils.log(message=warning_message)
                continue
            all_number = len(list(number_info))

        play_json = play_is_valid(request(play_url.format(pid)))
        if not play_json:
            warning_message = u"mg《{}》play_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        all_play_counts_str = play_json['data']['allVVStr']
        all_play_counts = float(re.sub(u'万|亿', u'', all_play_counts_str))
        # Scale by the unit suffix, if one is present.
        if u'万' in all_play_counts_str:
            all_play_counts *= 10000
        elif u'亿' in all_play_counts_str:
            all_play_counts *= 100000000

        previous_total = db_play_info_map.get(name)
        day_play_counts = 0
        if previous_total:
            day_play_counts = \
                max(all_play_counts - int(previous_total), 0) or 0

        if name in db_tv_names:
            store = TvInfo.update
        else:
            store = TvInfo.add
        store(name=name,
              tv_id=tv_id,
              description=description,
              last_update_time=last_update_time,
              all_number=all_number,
              current_number=current_number,
              cast_member=cast_member,
              platform=PLATFORM,
              label=label,
              update_info=update_info,
              detail_urls=detail_urls,
              vids=tv_id,
              type=tv_type,
              detail_titles=detail_titles,
              detail_episodes=detail_episodes)
        PlayInfo.add(tv_id=tv_id,
                     tv_name=name,
                     day_play_counts=day_play_counts,
                     all_play_counts=all_play_counts,
                     time_at=self.now,
                     platform=PLATFORM,
                     type=tv_type)
def info_and_play(self, tv_urls_map, db_tv_names, db_play_info_map):
    """Scrape details and the play count of every show page in ``tv_urls_map``.

    Each page is requested (and requested once more after 30 seconds when
    ``info_and_play_is_valid`` rejects it), the fields are cut out with
    regular expressions, and the result is stored through ``TvInfo``
    (update or add) and ``PlayInfo.add``.

    NOTE(review): this block was reconstructed from a source whose
    indentation was lost; the nesting inside the variety-show branch is the
    most plausible reading and should be confirmed against the original.

    :param tv_urls_map: mapping of show name to show page URL.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    for name, url in tv_urls_map.items():
        warning_message = u"yk 《{} 》结果不准确\r\n". \
            format(name)
        page = request(url)
        content = info_and_play_is_valid(page, name)
        if not content:
            # One delayed retry before the show is given up.
            time.sleep(30)
            page = request(url)
            content = info_and_play_is_valid(page, name)
            if not content:
                utils.log(message=warning_message)
                continue
        last_update_time = ''
        label = ''
        update_info = ''
        detail_urls = url
        detail_titles = ''
        detail_episodes = ''
        # Matched text up to ".html", with that 5-character suffix cut off.
        tv_id = re.search(u'id.+?\.html', url).group()[:-5]
        title_str = re.search(u'<h1 class="title">(.|\n)+?</h1>', page). \
            group()
        # [16:-1] drops the leading 'target="_blank">' and the trailing '<'.
        tv_type = re.search(u'target="_blank">.+?<', title_str). \
            group()[16:-1]
        cast_member = []
        cast_member_flag = u'主持人' if tv_type == u'综艺' else u'主演'
        cast_member_str = re.search(
            cast_member_flag + u':</label>(.|\n)+?</span>', page).group()
        for m in re.finditer(u'<a.+?</a>', cast_member_str):
            # [2:-1] keeps the link text between '">' and '<'.
            cast_member.append(
                re.search('">.+?<', m.group()).group()[2:-1])  # noqa
        cast_member = ",".join(cast_member)
        description_str = re. \
            search(u'<span class="short" id="show_info_short"(.|\n)+?</div>', content).group()  # noqa
        description = re.compile(u'<.*?>|查看详情>>').sub(u'', description_str)
        all_number = ''
        current_number = ''
        if tv_type == u'电视剧':
            number_str = re.search(u'class="basenotice"(.|\n)+?<', content).group()
            current_number = re.search(u'更新至\d+', number_str)
            all_number = re.search(u'共\d+', number_str).group()[1:]
            # Without an "updated to" marker the total is used instead.
            current_number = current_number and current_number.group(
            )[3:] or all_number  # noqa
        if tv_type == u'综艺':
            all_number = 0
            tmp_episode = []
            # Every episode tab referenced on the page is requested and its
            # <ul> blocks are counted.
            for _ in re.finditer(u'y\.episode\.show\(\'.+?\'\)', content):
                number_url = 'http://www.youku.com/show_episode/{}.html?dt=json&divid={}'  # noqa
                divid = re.search(u'\'.+?\'', _.group()).group()[1:-1]
                current_number_str = request(
                    number_url.format(tv_id.encode('utf8'),
                                      divid.encode('utf8')))  # noqa
                if not current_number_str:
                    warning_message = u"yk 《{} 》number结果不准确\r\n". \
                        format(name)
                    utils.log(message=warning_message)
                    continue
                tmp_episode = [
                    _ for _ in re.finditer(u'<ul(.|\n)+?</ul>',
                                           current_number_str)
                ]
                all_number += len(tmp_episode)
            # NOTE(review): presumably everything down to the assignment of
            # all_number belongs to this fallback (no tab yielded anything,
            # so the episode list embedded in the page is used) — confirm.
            if not all_number:
                tmp_episode = re.search(
                    u'<div id="episode">(.|\n)+?</div>', page).group()  # noqa
                tmp_episode = [
                    _ for _ in re.finditer(u'<ul(.|\n)+?</ul>', tmp_episode)
                ]
                if not tmp_episode:
                    utils.log(message=warning_message)
                    continue
                try:
                    if not tmp_episode[0]:
                        continue
                except:
                    continue
                all_number = len(tmp_episode)
            current_number = re.search(u'<label>.+?</label>',
                                       tmp_episode[0].group()).group()
            current_number = re.compile(u'<.+?>|期'). \
                sub(u'', current_number)
        all_play_counts = re.search(u'<label>总播放:</label>.+?\n', content) \
            .group()
        # Strip the label, the thousands separators and the line break.
        all_play_counts = (int)(
            re.compile(u'<label>总播放:</label>|,|\n').sub(
                u'', all_play_counts))
        pre_all_play_counts = db_play_info_map.get(name)
        # Daily delta, clamped at zero; zero when no earlier total exists.
        day_play_counts = pre_all_play_counts and \
            max(all_play_counts - (int)(pre_all_play_counts), 0) or 0
        if name in db_tv_names:
            TvInfo.update(
                name=name,
                tv_id=tv_id,
                description=description,
                last_update_time=last_update_time,
                all_number=all_number,
                current_number=current_number,
                cast_member=cast_member,
                platform=PLATFORM,
                label=label,
                update_info=update_info,
                detail_urls=detail_urls,
                vids=tv_id,
                type=tv_type,
                detail_titles=detail_titles,
                detail_episodes=detail_episodes, )
        else:
            TvInfo.add(
                name=name,
                tv_id=tv_id,
                description=description,
                last_update_time=last_update_time,
                all_number=all_number,
                current_number=current_number,
                cast_member=cast_member,
                platform=PLATFORM,
                label=label,
                update_info=update_info,
                detail_urls=detail_urls,
                vids=tv_id,
                type=tv_type,
                detail_titles=detail_titles,
                detail_episodes=detail_episodes, )
        PlayInfo.add(tv_id=tv_id,
                     tv_name=name,
                     day_play_counts=day_play_counts,
                     all_play_counts=all_play_counts,
                     time_at=self.now,
                     platform=PLATFORM,
                     type=tv_type)
def dianshiju(self, urls_map, db_tv_names, db_play_info_map):
    """Store details and play counts of the TV series listed in ``urls_map``.

    The show page supplies description and episode numbers via regular
    expressions; the play counter is requested separately and requested a
    second time when ``play_info_is_valid`` rejects the first answer. A
    show whose page or counter cannot be validated is logged and skipped.

    :param urls_map: mapping of show name to a record whose items are, in
        order, the page URL, the pid, the cast and the label.
    :param db_tv_names: names already stored; decides between
        ``TvInfo.update`` and ``TvInfo.add``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    play_url = 'http://v.stat.letv.com/vplay/queryMmsTotalPCount?callback=callback&pid={}'  # noqa
    for name, tv_info in urls_map.items():
        url = tv_info[0]
        pid = tv_info[1]
        tv_id = pid
        cast_member = tv_info[2]
        last_update_time = u''
        update_info = u''
        detail_urls = url
        tv_type = u'电视剧'
        detail_titles = u''
        detail_episodes = u''
        label = tv_info[3]

        content = dianshiju_is_valid(request(url))
        if not content:
            utils.log(message=u"let《{}》tv_info ,结果不准确\r\n".format(name))
            continue

        description = re.search(u'<p class="p7">(.|\n)+?</p>', content).group()
        description = re.sub(u'<.+?>', '', description)
        all_number = re.search(u'共\d+?集', content).group()
        all_number = re.search(u'\d+', all_number).group()
        updated_to = re.search(u'至\d+?集', content)
        if updated_to:
            current_number = re.search(u'\d+', updated_to.group()).group()
        else:
            # No "updated to" marker on the page: use the total instead.
            current_number = all_number

        json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            # One immediate retry before giving up on the counter.
            json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            # The warning used to be built here but never logged; every
            # sibling scraper logs it, so this one now does too.
            utils.log(message=u"let《{}》play_info ,结果不准确\r\n".format(name))
            continue

        all_play_counts = json_content.get('plist_play_count')
        pre_all_play_counts = db_play_info_map.get(name)
        if pre_all_play_counts:
            # Never report a negative daily delta.
            day_play_counts = max(
                all_play_counts - int(pre_all_play_counts), 0)
        else:
            day_play_counts = 0

        # Both persistence calls take the same keyword set (pid and tv_id
        # are the same value), so only the callable differs.
        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def zongyi(self, urls_map, db_tv_names, db_play_info_map):
    """Store details and play counts of the variety shows in ``urls_map``.

    The pid is read from the show page; a pid that was already handled in
    this call is skipped. Description, episode listing and play counter
    are requested separately, and a rejected answer is logged and ends the
    handling of that show. An unusable show page is skipped without a log.

    :param urls_map: mapping of show name to a record whose ``[0]`` is the
        page URL and ``[1]`` the label.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    seen_pids = []
    number_url = 'http://api.le.com/mms/out/album/videos?id={}&cid=11&platform=pc&callback=callback'  # noqa
    play_url = 'http://v.stat.letv.com/vplay/queryMmsTotalPCount?callback=callback&pid={}'  # noqa
    description_url = 'http://www.le.com/zongyi/{}.html'
    for name, tv_info in urls_map.items():
        url = tv_info[0]
        label = tv_info[1]
        pid_page = zongyi_is_valid(request(url))
        if not pid_page:
            continue
        pid_field = re.search(u'pid: \d+?,', pid_page).group()
        pid = re.search(u'\d+', pid_field).group()
        if pid in seen_pids:
            continue
        seen_pids.append(pid)
        tv_id = pid

        d_page = description_is_valid(
            request(description_url.format(pid.encode('utf8'))))
        if not d_page:
            utils.log(
                message=u"let《{}》description_info ,结果不准确\r\n".format(name))
            continue
        description_html = re.search(
            u'<p class="p7">(.|\n)+?</p>', d_page).group()
        description = re.sub(u'<.+?>', '', description_html)
        last_update_time = u''
        update_info = u''
        detail_urls = url
        tv_type = u'综艺'
        detail_titles = u''
        detail_episodes = u''

        n_json = number_utl_is_valid(request(number_url.format(pid)))
        if not n_json:
            utils.log(
                message=u"let zongyi《{}》number_info ,结果不准确\r\n".format(name))
            continue
        all_number = n_json['total']
        current_number = n_json['data'][0]['episode']
        guests = [episode.get('guest') for episode in n_json['data']]
        # Guests are space separated per episode; flatten and de-duplicate.
        unique_guests = list(set(" ".join(guests).split(" ")))
        cast_member = " ".join(unique_guests)

        json_content = play_info_is_valid(request(play_url.format(pid)))
        if not json_content:
            utils.log(message=u"let《{}》play_info ,结果不准确\r\n".format(name))
            continue
        all_play_counts = json_content.get('plist_play_count')
        earlier = db_play_info_map.get(name)
        if earlier:
            day_play_counts = max(all_play_counts - int(earlier), 0) or 0
        else:
            day_play_counts = 0

        save = TvInfo.update if name in db_tv_names else TvInfo.add
        save(
            name=name,
            tv_id=tv_id,
            description=description,
            last_update_time=last_update_time,
            all_number=all_number,
            current_number=current_number,
            cast_member=cast_member,
            platform=PLATFORM,
            label=label,
            update_info=update_info,
            detail_urls=detail_urls,
            vids=tv_id,
            type=tv_type,
            detail_titles=detail_titles,
            detail_episodes=detail_episodes,
        )
        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type,
        )
def info_and_play(self, pids_map, db_tv_names, db_play_info_map):
    """Persist details and play counts for each playlist in ``pids_map``.

    Two endpoints are queried per show: the playlist description and the
    play counter. When a validator rejects an answer, the show is logged
    and skipped; otherwise ``TvInfo`` is updated or extended and a
    ``PlayInfo`` row is added.

    :param pids_map: mapping of show name to playlist id.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total.
    """
    play_url = "http://count.vrs.sohu.com/count/queryext.action?plids={}&callback=callback"  # noqa
    info_url = "http://pl.hd.sohu.com/videolist?playlistid={}&callback=callback"  # noqa
    for name, pid in pids_map.items():
        tv_id = pid
        info = request(info_url.format(pid.encode("utf8")))
        album = info_is_valid(info)
        if not album:
            warning_message = u"sh《{}》tv_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        description = album["albumDesc"]
        last_update_time = ""
        current_number = album["updateSet"]
        all_number = album["totalSet"]
        # Keep the total unless it is u"0" or otherwise falsy.
        if not (all_number != u"0" and all_number):
            all_number = current_number
        tv_type = TV_TYPE_MAP.get(album["cid"])
        if tv_type == u"综艺":
            people = album["hosts"]
        else:
            people = album["actors"]
        cast_member = u",".join(people)
        label = ",".join(album["categories"])
        update_info = album["updateNotification"]
        detail_urls = ""
        detail_titles = ""
        detail_episodes = ""

        play = request(play_url.format(pid.encode("utf8")))
        play_json = play_is_valid(play, pid)
        if not play_json:
            warning_message = u"sh《{}》play_info ,结果不准确\r\n".format(name)
            utils.log(message=warning_message)
            continue

        all_play_counts = play_json[pid]["total"]
        previous_total = db_play_info_map.get(name)
        day_play_counts = 0
        if previous_total:
            day_play_counts = \
                max(all_play_counts - int(previous_total), 0) or 0

        if name in db_tv_names:
            store = TvInfo.update
        else:
            store = TvInfo.add
        # ``pid`` and ``tv_id`` name the same value, so one keyword set
        # serves both calls.
        store(name=name,
              tv_id=tv_id,
              description=description,
              last_update_time=last_update_time,
              all_number=all_number,
              current_number=current_number,
              cast_member=cast_member,
              platform=PLATFORM,
              label=label,
              update_info=update_info,
              detail_urls=detail_urls,
              vids=tv_id,
              type=tv_type,
              detail_titles=detail_titles,
              detail_episodes=detail_episodes)
        PlayInfo.add(tv_id=tv_id,
                     tv_name=name,
                     day_play_counts=day_play_counts,
                     all_play_counts=all_play_counts,
                     time_at=self.now,
                     platform=PLATFORM,
                     type=tv_type)
def info_and_play(self, tv_urls_map, db_tv_names, db_play_info_map):
    """Scrape details and the play count of every show page in ``tv_urls_map``.

    Each page is requested (and requested once more after 30 seconds when
    ``info_and_play_is_valid`` rejects it), the fields are cut out with
    regular expressions, and the result is stored through ``TvInfo``
    (update or add) and ``PlayInfo.add``.

    NOTE(review): this block was reconstructed from a source whose
    indentation was lost; the nesting inside the variety-show branch is the
    most plausible reading and should be confirmed against the original.

    :param tv_urls_map: mapping of show name to show page URL.
    :param db_tv_names: names already stored in ``TvInfo``.
    :param db_play_info_map: mapping of show name to the earlier total
        play count.
    """
    for name, url in tv_urls_map.items():
        warning_message = u"yk 《{} 》结果不准确\r\n". \
            format(name)
        page = request(url)
        content = info_and_play_is_valid(page, name)
        if not content:
            # One delayed retry before the show is given up.
            time.sleep(30)
            page = request(url)
            content = info_and_play_is_valid(page, name)
            if not content:
                utils.log(message=warning_message)
                continue
        last_update_time = ''
        label = ''
        update_info = ''
        detail_urls = url
        detail_titles = ''
        detail_episodes = ''
        # Matched text up to ".html", with that 5-character suffix cut off.
        tv_id = re.search(u'id.+?\.html', url).group()[:-5]
        title_str = re.search(u'<h1 class="title">(.|\n)+?</h1>', page). \
            group()
        # [16:-1] drops the leading 'target="_blank">' and the trailing '<'.
        tv_type = re.search(u'target="_blank">.+?<', title_str). \
            group()[16:-1]
        cast_member = []
        cast_member_flag = u'主持人' if tv_type == u'综艺' else u'主演'
        cast_member_str = re.search(
            cast_member_flag + u':</label>(.|\n)+?</span>', page).group()
        for m in re.finditer(u'<a.+?</a>', cast_member_str):
            # [2:-1] keeps the link text between '">' and '<'.
            cast_member.append(re.search('">.+?<', m.group()).group()[2:-1])  # noqa
        cast_member = ",".join(cast_member)
        description_str = re. \
            search(u'<span class="short" id="show_info_short"(.|\n)+?</div>', content).group()  # noqa
        description = re.compile(u'<.*?>|查看详情>>').sub(u'', description_str)
        all_number = ''
        current_number = ''
        if tv_type == u'电视剧':
            number_str = re.search(u'class="basenotice"(.|\n)+?<', content).group()
            current_number = re.search(u'更新至\d+', number_str)
            all_number = re.search(u'共\d+', number_str).group()[1:]
            # Without an "updated to" marker the total is used instead.
            current_number = current_number and current_number.group()[3:] or all_number  # noqa
        if tv_type == u'综艺':
            all_number = 0
            tmp_episode = []
            # Every episode tab referenced on the page is requested and its
            # <ul> blocks are counted.
            for _ in re.finditer(u'y\.episode\.show\(\'.+?\'\)', content):
                number_url = 'http://www.youku.com/show_episode/{}.html?dt=json&divid={}'  # noqa
                divid = re.search(u'\'.+?\'', _.group()).group()[1:-1]
                current_number_str = request(number_url.format(tv_id.encode('utf8'), divid.encode('utf8')))  # noqa
                if not current_number_str:
                    warning_message = u"yk 《{} 》number结果不准确\r\n". \
                        format(name)
                    utils.log(message=warning_message)
                    continue
                tmp_episode = [_ for _ in re.finditer(u'<ul(.|\n)+?</ul>', current_number_str)]
                all_number += len(tmp_episode)
            # NOTE(review): presumably everything down to the assignment of
            # all_number belongs to this fallback (no tab yielded anything,
            # so the episode list embedded in the page is used) — confirm.
            if not all_number:
                tmp_episode = re.search(u'<div id="episode">(.|\n)+?</div>', page).group()  # noqa
                tmp_episode = [_ for _ in re.finditer(u'<ul(.|\n)+?</ul>', tmp_episode)]
                if not tmp_episode:
                    utils.log(message=warning_message)
                    continue
                try:
                    if not tmp_episode[0]:
                        continue
                except:
                    continue
                all_number = len(tmp_episode)
            current_number = re.search(u'<label>.+?</label>', tmp_episode[0].group()).group()
            current_number = re.compile(u'<.+?>|期'). \
                sub(u'', current_number)
        all_play_counts = re.search(u'<label>总播放:</label>.+?\n', content) \
            .group()
        # Strip the label, the thousands separators and the line break.
        all_play_counts = (int)(re.compile(u'<label>总播放:</label>|,|\n')
                                .sub(u'', all_play_counts))
        pre_all_play_counts = db_play_info_map.get(name)
        # Daily delta, clamped at zero; zero when no earlier total exists.
        day_play_counts = pre_all_play_counts and \
            max(all_play_counts - (int)(pre_all_play_counts), 0) or 0
        if name in db_tv_names:
            TvInfo.update(name=name,
                          tv_id=tv_id,
                          description=description,
                          last_update_time=last_update_time,
                          all_number=all_number,
                          current_number=current_number,
                          cast_member=cast_member,
                          platform=PLATFORM,
                          label=label,
                          update_info=update_info,
                          detail_urls=detail_urls,
                          vids=tv_id,
                          type=tv_type,
                          detail_titles=detail_titles,
                          detail_episodes=detail_episodes,
                          )
        else:
            TvInfo.add(name=name,
                       tv_id=tv_id,
                       description=description,
                       last_update_time=last_update_time,
                       all_number=all_number,
                       current_number=current_number,
                       cast_member=cast_member,
                       platform=PLATFORM,
                       label=label,
                       update_info=update_info,
                       detail_urls=detail_urls,
                       vids=tv_id,
                       type=tv_type,
                       detail_titles=detail_titles,
                       detail_episodes=detail_episodes,
                       )
        PlayInfo.add(
            tv_id=tv_id,
            tv_name=name,
            day_play_counts=day_play_counts,
            all_play_counts=all_play_counts,
            time_at=self.now,
            platform=PLATFORM,
            type=tv_type
        )