def album_json_parse(self, response):
    """Collect an album's videos through its per-year JSON index.

    The response body is searched for a JSON array (the list of years).
    For every entry, ``<media url>/s/json.<year>.js`` is downloaded and
    passed to ``album_tag_json_resolve``.  When videos are found, the media
    page is fetched as well so ``media_resolve`` can complete the media item.

    Args:
        response: object exposing ``request.url``, ``request.meta`` and
            ``body``.  ``request.meta`` may carry ``item`` (the media/video
            container) and ``url`` (the URL that was originally requested).

    Returns:
        A list holding at most one media/video item.  It is empty when the
        request was redirected, no videos were found, or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'json url: %s' % request_url)
        request_meta = response.request.meta
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else None)
        url = request_meta['url'] if 'url' in request_meta else None
        if url != request_url:
            # Redirected, which means the JSON index does not exist.
            return items

        year_regex = re.compile(r'(\[.*\])')
        match_results = year_regex.search(response.body)
        if match_results:
            videoItems = []
            years = json.loads(match_results.groups()[0])
            for year in years:
                video_url = mediaItem['url'] + '/s/json.%s.js' % year
                result = Util.get_url_content(video_url)
                videoItems = videoItems + self.album_tag_json_resolve(
                    text=result, meta={'url': video_url})
            if videoItems:
                Util.set_ext_id(mediaItem, videoItems)
                # Visit the media page to collect the remaining details.
                result = Util.get_url_content(mediaItem['url'])
                if result:
                    mediaItem = self.media_resolve(
                        text=result,
                        meta={'item': mediaItem, 'url': mediaItem['url']})
                # NOTE(review): assumes only media_resolve depends on the
                # media page being fetched; confirm against the original
                # layout.
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def api_media_info(self, mediaItem):
    """Fill ``mediaItem`` in place with metadata from the album API.

    The first page of ``self.other_album_api`` is requested for
    ``mediaItem['cont_id']`` and the ``body.intro.desc`` object of the JSON
    answer is copied into the media item.

    Args:
        mediaItem: mapping-like media item; must contain ``cont_id``.

    Returns:
        None.  Nothing is changed when the API returns no content; any error
        is logged and swallowed.
    """
    try:
        api_url = self.other_album_api % (mediaItem['cont_id'], 1)
        result = Util.get_url_content(api_url)
        if not result:
            return
        desc = json.loads(result)['body']['intro']['desc']
        mediaItem['title'] = desc['nameCn']

        # API key -> media item key; every value is a comma separated list.
        # 'subCategory' used to be written back into ``desc`` instead of the
        # media item, so the type was silently lost.
        list_fields = (
            ('directory', 'director'),
            ('starring', 'actor'),
            ('subCategory', 'type'),
            ('area', 'district'),
        )
        for api_key, item_key in list_fields:
            if api_key in desc:
                values = desc[api_key].split(",")
                mediaItem[item_key] = Util.join_list_safely(values)

        if 'releaseDate' in desc:
            mediaItem['release_date'] = Util.str2date(
                str(desc['releaseDate']))
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def api_parse(self, mediaVideoItem):
    """Build a media/video item purely from the album API.

    ``api_media_info`` is asked to fill the media item first.  Only when that
    produced a ``title`` are the video pages of ``self.other_album_api``
    walked (page 1, 2, ...) until ``other_album_resolve`` yields nothing.

    Args:
        mediaVideoItem: container whose ``media`` entry holds a media item
            with a ``cont_id``.

    Returns:
        A list holding ``mediaVideoItem`` when at least one video was found,
        otherwise an empty list.  Errors are logged and swallowed.
    """
    items = []
    try:
        mediaItem = mediaVideoItem['media']
        logging.log(logging.INFO,
                    'api parse pid: %s' % mediaItem['cont_id'])
        self.api_media_info(mediaItem)
        if 'title' in mediaItem:
            videoItems = []
            pagenum = 1
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                     pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(
                    text=result,
                    meta={'url': videos_url, 'pagenum': pagenum})
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1
            if videoItems:
                if 'url' not in mediaItem:
                    # Fall back to the first video's URL for the media item.
                    mediaItem['url'] = videoItems[0]['url']
                Util.set_ext_id(mediaItem, videoItems)
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    # Previously the collected items were never handed back to the caller.
    return items
def album_media_json_resolve(self, text, mediaItem, request_url):
    """Turn one page of the album media API into video items.

    ``text`` is expected to contain ``=<json object>``.  The object's
    ``data.vlist`` entries become video items (entries whose ``type`` is 0
    are skipped), and ``data.pm`` / ``data.ic`` are stored on ``mediaItem``
    as ``vcount`` / ``latest``.  When the page looks full, the next page is
    downloaded and resolved recursively.

    Args:
        text: raw API answer.
        mediaItem: media item updated in place.
        request_url: URL ``text`` was fetched from (used for logging only).

    Returns:
        List of video items from this page and all following pages; empty
        when the API reports a non-success code or an error was logged.
    """
    items = []
    content = ''
    try:
        logging.log(logging.INFO, 'album media json url: %s' % request_url)
        match_results = re.compile(r'=(\{.*\})').search(text)
        if match_results:
            content = match_results.groups()[0]
        # An empty ``content`` makes json.loads raise, which ends up in the
        # diagnostic logging below.
        json_content = json.loads(content)
        if json_content['code'] != self.api_success_code:
            return items

        data_section = json_content['data']
        mediaItem['vcount'] = data_section['pm']
        mediaItem['latest'] = data_section['ic']
        for data in data_section['vlist']:
            # type: full feature = 1, trailer = 0; trailers are ignored.
            video_type = data['type']
            if str(video_type) != '0':
                videoItem = VideoItem()
                videoItem['intro'] = data['vt']
                videoItem['vnum'] = data['pd']
                videoItem['thumb_url'] = data['vpic']
                videoItem['title'] = data['vt']
                if not videoItem['title']:
                    videoItem['title'] = data['vn']
                videoItem['cont_id'] = data['id']
                videoItem['url'] = data['vurl']
                self.set_video_info(videoItem)
                items.append(videoItem)

        # Crawl the next page.  Presumably ``pn`` is the number of entries on
        # this page and ``pp`` the page size -- TODO confirm.
        current_count = int(data_section['pn'])
        page_count = int(data_section['pp'])
        if current_count != 0 and current_count == page_count:
            cont_id = data_section['aid']
            page = int(data_section['pg']) + 1
            url = self.album_media_api % (cont_id, page, cont_id, page)
            result = Util.get_url_content(url)
            items = items + self.album_media_json_resolve(
                result, mediaItem, url)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album media json url: %s' % request_url)
        logging.log(logging.INFO,
                    '================json content=================')
        logging.log(logging.INFO, text)
    # The recursive call above concatenates the result, so a list must
    # always be returned (the function used to fall through with None).
    return items
def album_tag_resolve(self, text, meta):
    """Extract the video items of an album tag page, following pagination.

    Args:
        text: HTML (or an HTML fragment) of the tag page.
        meta: dict; ``meta['url']`` is the URL ``text`` came from and is used
            to resolve relative links.

    Returns:
        List of video items found on this page and on every page reachable
        through ``hunantv_extract.next_page_extract``.  Empty when ``text``
        cannot be parsed or an error was logged.
    """
    items = []
    request_url = None
    try:
        request_url = meta['url'] if 'url' in meta else None
        logging.log(logging.INFO, 'album tag url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        try:
            response = Selector(text=text)
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'text to be parsed is not xml or html')
            return items

        sels = response.xpath(
            '//div[@class="play-index-con-box"]'
            '//ul[@class="clearfix ullist-ele"]/li')
        video_url_regex = re.compile(
            r'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html')
        for sel in sels:
            videoItem = VideoItem()
            hunantv_extract.video_info_extract(sel, videoItem)
            url = Util.get_absolute_url(videoItem['url'], prefix_url)
            videoItem['url'] = url
            # The video id is the last numeric path segment of the URL.
            match_results = video_url_regex.search(url)
            if match_results:
                videoItem['cont_id'] = match_results.groups()[0]
            # NOTE(review): assumes videos whose URL does not match are still
            # kept (only cont_id is conditional) -- confirm.
            self.set_video_info(videoItem)
            items.append(videoItem)

        # Next page.
        results = hunantv_extract.next_page_extract(response)
        if results:
            url = Util.get_absolute_url(results[0], prefix_url)
            result = Util.get_url_content(url)
            items = items + self.album_tag_resolve(text=result,
                                                   meta={'url': url})
    except Exception:
        # The outer ``try`` had no handler at all; log the same way the
        # sibling parsers in this file do.
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album tag url: %s' % request_url)
    # Callers (including the recursion above) concatenate the result.
    return items
def media_parse(self, response):
    """Parse a media page and collect its videos through the album APIs.

    Media details are extracted with ``letv_extract.media_info_extract`` from
    several page sections.  Depending on ``mediaItem['channel_id']`` the
    videos come from the first page of ``self.other_album_api`` (movies),
    from ``self.zongyi_album_api`` per listed year/month (variety shows), or
    from every page of ``self.other_album_api`` (TV series and anime).

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container).

    Returns:
        A list holding at most one media/video item; empty when no videos
        were found or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        request_meta = response.request.meta
        # Default to an empty container (as the sibling parsers do); the old
        # ``None`` default broke the membership test on the next line.
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())

        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//dl[@class="textInfo"]')
        if not sels:
            # TV series, variety shows and anime use the "textInfo" block;
            # movies use the "detail" block instead.
            sels = response.xpath('//div[@class="detail"]')
        letv_extract.media_info_extract(sels, mediaItem)

        # Collect the URLs of the full-length videos.
        videoItems = []
        channel_id = mediaItem['channel_id']
        if u'电影' == channel_id:
            # Movie: a single API page is enough.
            pagenum = 1
            videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                 pagenum)
            result = Util.get_url_content(videos_url)
            videoItems = self.other_album_resolve(
                text=result, meta={'url': videos_url, 'pagenum': pagenum})
        elif u'综艺' == channel_id:
            # Variety show: one API call per year/month tab.
            sels = response.xpath(
                '//div[@class="listTab"]//div[@data-statectn="n_click"]')
            if sels:
                for year_month_sel in sels.xpath('.//a'):
                    years = year_month_sel.xpath('./@list-year').extract()
                    months = year_month_sel.xpath('./@list-month').extract()
                    year = years[0] if years else None
                    month = months[0] if months else None
                    if year and month:
                        videos_url = self.zongyi_album_api % (
                            year, month, mediaItem['cont_id'])
                        result = Util.get_url_content(videos_url)
                        videoItems = videoItems + self.zongyi_album_resolve(
                            text=result,
                            meta={'url': videos_url,
                                  'year': year,
                                  'month': month})
        elif channel_id in [u'电视剧', u'动漫']:
            # TV series / anime: walk the pages until one comes back empty.
            pagenum = 1
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                     pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(
                    text=result,
                    meta={'url': videos_url, 'pagenum': pagenum})
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1

        if videoItems:
            # Set ext_id.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def video_parse(self, response):
    """Parse a play page into a media/video item.

    The media item is marked as paid when the page's URL prefix equals
    ``self.vip_prefix_url``.  A movie yields one video that mirrors the media
    item.  For everything else the ``pid`` / ``cat_id`` / ``id`` values found
    in the page scripts are used to page through ``self.album_api``.

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container).

    Returns:
        A list holding at most one media/video item; empty when no videos
        were found, a movie has no ``cont_id``, or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        request_meta = response.request.meta
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())
        mediaItem['paid'] = '1' if prefix_url == self.vip_prefix_url else '0'
        mediaItem['url'] = request_url
        pptv_extract.media_info_extract(response, mediaItem)

        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            # Movie: the media page itself is the only video.
            if 'cont_id' not in mediaItem or not mediaItem['cont_id']:
                return items
            videoItem = VideoItem()
            videoItem['url'] = mediaItem['url']
            videoItem['cont_id'] = mediaItem['cont_id']
            Util.copy_media_to_video(mediaItem, videoItem)
            self.set_video_info(videoItem)
            videoItems.append(videoItem)
        else:
            # pid & cid are needed to request the episode list of TV series,
            # variety shows and anime.
            sel = response.xpath('//script[@type="text/javascript"]')
            if sel:
                pids = sel.re(r'"pid"[ ]?:[ ]?(\d+)')
                cids = sel.re(r'"cat_id"[ ]?:[ ]?(\d+)')
                vids = sel.re(r'"id"[ ]?:[ ]?(\d+)')
                if pids and cids and vids:
                    pid = pids[0]
                    cid = cids[0]
                    vid = vids[0]
                    # The media's cont_id is the pid.
                    mediaItem['cont_id'] = pid
                    page = 1
                    while True:
                        meta = {'pid': pid, 'cid': cid,
                                'vid': vid, 'page': page}
                        url = self.album_api % (pid, cid, vid, page)
                        result = Util.get_url_content(url)
                        page_result = self.album_json_resolve(
                            result, mediaItem, meta)
                        if not page_result['items']:
                            # The auth album API fallback stays disabled for
                            # now because it does not provide video URLs.
                            break
                        videoItems = videoItems + page_result['items']
                        page = page + 1

        if videoItems:
            # Set ext_id.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def album_parse(self, response):
    """Parse an album (video list) page into a media/video item.

    Three layouts are handled: a page with tag navigation (every tag page is
    fetched and resolved), a plain video list (resolved directly from the
    page), and a page without any list, where the single video URL passed in
    ``request.meta['url']`` is used.  When videos are found, the media page
    is fetched so ``media_resolve`` can complete the media item.

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container) and ``url`` (a fallback video URL).

    Returns:
        A list holding at most one media/video item; empty when no videos
        were found or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        request_meta = response.request.meta
        video_url = request_meta['url'] if 'url' in request_meta else None
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())

        videoItems = []
        sels = response.xpath(
            '//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if sels:
            # Tag pages exist, e.g. http://list.hunantv.com/album/56.html
            for item in hunantv_extract.album_tag_extract(sels):
                url = Util.get_absolute_url(item['url'], prefix_url)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': url})
        else:
            # No tag pages, e.g. http://list.hunantv.com/album/2905.html
            video_sels = response.xpath(
                '//div[@class="page-videolist clearfix"]')
            if video_sels:
                result = video_sels.extract()[0]
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': request_url})
            elif video_url:
                # No usable list page, e.g.
                # http://www.hunantv.com/v/7/102831/f/1043648.html links to
                # an episode list whose URL is invalid.
                videoItem = VideoItem()
                Util.copy_media_to_video(mediaItem, videoItem)
                videoItem['url'] = video_url
                # NOTE(review): the copy is performed twice in the original;
                # kept as is because its effect on 'url' is not visible here.
                Util.copy_media_to_video(mediaItem, videoItem)
                video_url_regex = re.compile(
                    r'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/'
                    r'([\d]+)\.html')
                # The video id is the last numeric path segment of the URL.
                match_results = video_url_regex.search(video_url)
                if match_results:
                    videoItem['cont_id'] = match_results.groups()[0]
                    # NOTE(review): assumes the video is only kept when its
                    # id could be extracted -- confirm.
                    self.set_video_info(videoItem)
                    videoItems.append(videoItem)

        if videoItems:
            # Set ext_id.
            Util.set_ext_id(mediaItem, videoItems)
            # Visit the media page to collect the remaining details.
            result = Util.get_url_content(mediaItem['url'])
            if result:
                mediaItem = self.media_resolve(
                    text=result,
                    meta={'item': mediaItem, 'url': mediaItem['url']})
            # NOTE(review): assumes only media_resolve depends on the media
            # page being fetched -- confirm.
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'album url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def media_parse(self, response):
    """Parse a movie page into a media item with a single video.

    The playback URL is extracted first; pages without one are discarded.
    The media item is marked as paid when the video's URL prefix is listed in
    ``self.vip_prefix_urls``.  The linked plot and cast/crew pages are
    downloaded to enrich the media item.

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container).

    Returns:
        A list holding at most one media/video item; empty when the page has
        no playback URL or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        request_meta = response.request.meta
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())

        # Playback address.
        videoItems = []
        videoItem = VideoItem()
        Util.copy_media_to_video(mediaItem, videoItem)
        pic_sels = response.xpath('//div[@class="laMovPIC fl pr22"]')
        dy1905_extract.video_info_extract(pic_sels, videoItem)
        if 'url' not in videoItem:
            # Without a URL there is only film information and nothing to
            # play, so the page is dropped.
            logging.log(logging.INFO, '该影片找不到播放地址: %s' % request_url)
            return items
        url = Util.get_absolute_url(videoItem['url'], prefix_url)
        videoItem['url'] = url
        self.set_video_info(videoItem)
        videoItems.append(videoItem)

        # Media attributes: paid flag first.
        video_prefix_url = Util.prefix_url_parse(url)
        if video_prefix_url in self.vip_prefix_urls:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        dy1905_extract.media_info_extract(pic_sels, mediaItem)
        sels = response.xpath('//div[@class="laMovMAIN fl"]')
        dy1905_extract.media_info_extract(sels, mediaItem)

        # Plot and cast/crew sub pages.
        nav_sels = response.xpath(
            '//ul[@class="navSMb"]//li[@class="mdbpLeft2"]'
            '//div[@class="nowDefLine DefBOttom"]//a')
        for sel in nav_sels:
            labels = sel.xpath('./text()').extract()
            urls = sel.xpath('./@href').extract()
            if labels and urls:
                label = labels[0].strip()
                # Both prefixes must be unicode: comparing the unicode label
                # with a non-ASCII byte string raises UnicodeDecodeError on
                # Python 2.
                if label.startswith((u'剧情', u'演职人员')):
                    url = Util.get_absolute_url(urls[0], prefix_url)
                    result = Util.get_url_content(url)
                    dy1905_extract.media_more_info_resolve(result, mediaItem)

        # Make the media URL absolute.
        mediaItem['url'] = Util.get_absolute_url(mediaItem['url'],
                                                 prefix_url)

        if videoItems:
            # Set ext_id.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def media_parse(self, response):
    """Parse a media page and collect its videos through the site APIs.

    Media details are extracted from several page sections.  The resulting
    ``cont_id`` has the form ``<id>|<kind>``: kind ``source_id`` is resolved
    per year through ``self.source_year_api`` / ``self.source_media_api``,
    kind ``album_id`` through ``self.album_media_api``.

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container).  The media item must already contain
            ``channel_id``.

    Returns:
        A list holding at most one media/video item; empty when no videos
        were found or an error was logged.
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        request_meta = response.request.meta
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())
        videoItems = []

        # Regular media page.  Remember the channel id so it can be restored
        # after the extraction calls below.
        channel_id_fun = mediaItem['channel_id']
        sels = response.xpath('//div[@id="qitancommonarea"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//script[@type="text/javascript"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="mod_search_topic mb20"]')
        if not sels:
            sels = response.xpath('.//div[@id="block-B"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        # Special-feature media page.
        iqiyi_extract.media_info_extract(response, mediaItem)

        cont_id = mediaItem['cont_id'] if 'cont_id' in mediaItem else None
        title = mediaItem['title'] if 'title' in mediaItem else None
        if cont_id and title:
            cont_ids = cont_id.split('|')
            cont_id = cont_ids[0]
            cont_type = cont_ids[1]
            # The 'paid' lookup through self.vip_api is disabled, so 'paid'
            # is not set here.
            mediaItem['channel_id'] = channel_id_fun
            channel_id_site = iqiyi_extract.list_channels_id[channel_id_fun]
            if cont_type == 'source_id':
                # Years are always fetched through the same API; shows such
                # as "Happy Camp" or "Day Day Up" only expose this interface.
                url = self.source_year_api % (channel_id_site, cont_id)
                result = Util.get_url_content(url)
                years = self.source_year_json_resolve(result, url)
                for year in years:
                    url = self.source_media_api % (
                        channel_id_site, cont_id, year,
                        channel_id_site, cont_id, year)
                    result = Util.get_url_content(url)
                    videoItems = videoItems + self.source_media_json_resolve(
                        result, mediaItem, url)
            elif cont_type == 'album_id':
                # Everything else goes through the album API.
                page = 1
                url = self.album_media_api % (cont_id, page, cont_id, page)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_media_json_resolve(
                    result, mediaItem, url)

        # Special programmes (e.g. http://www.iqiyi.com/yule/cjkgbj.html)
        # are not crawled for now: without videos nothing is emitted.
        if videoItems:
            # Set ext_id.
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items
def video_parse(self, response):
    """Parse a play page; hand over to the media page or resolve videos.

    When the extracted media URL is classified as ``URL_TYPE_MEDIA`` a
    follow-up ``Request`` for ``media_parse`` is returned.  Otherwise the
    videos are collected here, exactly like ``media_parse`` does, based on
    the ``<id>|<kind>`` value stored in ``cont_id``.

    Args:
        response: page response; ``request.meta`` may carry ``item`` (the
            media/video container).  The media item must already contain
            ``channel_id``.

    Returns:
        A list holding either one ``Request``, one media/video item, or
        nothing (no videos found or an error was logged).
    """
    items = []
    request_url = None  # keeps the except handler safe if the lookup fails
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        request_meta = response.request.meta
        mediaVideoItem = (request_meta['item'] if 'item' in request_meta
                          else MediaVideoItem())
        mediaItem = (mediaVideoItem['media'] if 'media' in mediaVideoItem
                     else MediaItem())

        # Restore the channel id after the script extraction below.
        channel_id_fun = mediaItem['channel_id']
        sels = response.xpath('//script[@type="text/javascript"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        mediaItem['channel_id'] = channel_id_fun
        sels = response.xpath(
            '//div[@itemtype="http://schema.org/ShowEpisode"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)
        # Play page: used when the crawl enters directly through it.
        sels = response.xpath(
            '//div[@class="crumb_bar" or @class="mod-crumb_bar"]')
        iqiyi_extract.media_info_extract(sels, mediaItem)

        url = mediaItem['url'] if 'url' in mediaItem else ''
        url_type = iqiyi_extract.url_type_resolve(url)
        if url_type == URL_TYPE_MEDIA:
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=mediaItem['url'],
                                 callback=self.media_parse,
                                 meta={'item': mediaVideoItem}))
        else:
            cont_id = mediaItem['cont_id'] if 'cont_id' in mediaItem else None
            title = mediaItem['title'] if 'title' in mediaItem else None
            if cont_id and title:
                cont_ids = cont_id.split('|')
                cont_id = cont_ids[0]
                cont_type = cont_ids[1]
                # The 'paid' lookup through self.vip_api is disabled, so
                # 'paid' is not set here.
                videoItems = []
                if cont_type == 'source_id':
                    # ``channel_id_site`` was never defined in this method
                    # (NameError); it is derived the same way media_parse
                    # does.  It is looked up only in this branch so the
                    # album_id path stays independent of the channel table.
                    channel_id_site = iqiyi_extract.list_channels_id[
                        channel_id_fun]
                    # Years are always fetched through the same API; shows
                    # such as "Happy Camp" or "Day Day Up" only expose this
                    # interface.
                    url = self.source_year_api % (channel_id_site, cont_id)
                    result = Util.get_url_content(url)
                    years = self.source_year_json_resolve(result, url)
                    for year in years:
                        url = self.source_media_api % (
                            channel_id_site, cont_id, year,
                            channel_id_site, cont_id, year)
                        result = Util.get_url_content(url)
                        videoItems = (
                            videoItems + self.source_media_json_resolve(
                                result, mediaItem, url))
                elif cont_type == 'album_id':
                    # Everything else goes through the album API.
                    page = 1
                    url = self.album_media_api % (cont_id, page,
                                                  cont_id, page)
                    result = Util.get_url_content(url)
                    videoItems = videoItems + self.album_media_json_resolve(
                        result, mediaItem, url)

                # Special programmes (e.g.
                # http://www.iqiyi.com/yule/cjkgbj.html) are not crawled for
                # now: without videos nothing is emitted.
                if videoItems:
                    # Set ext_id.
                    Util.set_ext_id(mediaItem, videoItems)
                    self.set_media_info(mediaItem)
                    mediaVideoItem['media'] = mediaItem
                    mediaVideoItem['video'] = videoItems
                    # Replaces a stray debug ``print`` statement.
                    logging.log(logging.DEBUG,
                                'media video item: %s' % (mediaVideoItem,))
                    items.append(mediaVideoItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % request_url)
    # Previously the collected items were dropped on the normal path.
    return items