def load_video_urls(self):
    """Build the initial crawl requests from self.json_data.

    Supported commands in json_data['cmd']:
      - 'trig':   re-crawl untracked urls fetched from the manager for this site.
      - 'assign': crawl the explicitly assigned task urls.
      - 'test':   crawl a single channel list page.

    Returns:
        list of scrapy Request objects (empty on error or unknown cmd).
        The original built this list but never returned it, so Scrapy
        silently dropped every request.
    """
    items = []
    try:
        if self.json_data:
            cmd = self.json_data['cmd'] if 'cmd' in self.json_data else None
            if cmd == 'trig':
                stat = self.json_data['stat'] if 'stat' in self.json_data else None
                res = self.mgr.get_untrack_url(self.site_code, stat)
                for item in res:
                    mediaVideoItem = MediaVideoItem()
                    mediaVideoItem['sid'] = item['sid']
                    mediaVideoItem['untrack_id'] = item['untrack_id']
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = item['name']
                    mediaVideoItem['media'] = mediaItem
                    url = item['url']
                    items.append(Request(url=url,
                                         callback=self.video_parse,
                                         meta={'item': mediaVideoItem}))
            elif cmd == 'assign':
                tasks = self.json_data['task'] if 'task' in self.json_data else None
                for task in tasks:
                    mediaVideoItem = MediaVideoItem()
                    mediaVideoItem['sid'] = task['sid'] if 'sid' in task else None
                    mediaVideoItem['untrack_id'] = task['untrack_id'] if 'untrack_id' in task else None
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = task['name']
                    mediaVideoItem['media'] = mediaItem
                    url = task['url']
                    items.append(Request(url=url,
                                         callback=self.video_parse,
                                         meta={'item': mediaVideoItem}))
            elif cmd == 'test':
                channel_id = self.json_data['id'] if 'id' in self.json_data else None
                url = self.json_data['url'] if 'url' in self.json_data else None
                if url and channel_id:
                    list_channel = self.mgr.get_channel_name(channel_id)
                    if list_channel:
                        list_channel = list_channel['name']
                        items.append(Request(url=url,
                                             callback=self.list_parse,
                                             meta={'first': False,
                                                   'id': list_channel}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def video_parse(self, response):
    """Parse a hunantv play page.

    Extracts media info from the play page, derives the album (media) page
    url from the play url, and follows either the currently-playing album
    link or — when none exists — the per-year JSON api.

    Returns a list of scrapy Requests (empty on error; the original never
    returned it, so follow-ups were silently dropped).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else None
        # Detailed media info scraped from the play page itself
        sels = response.xpath('//script[@type="text/javascript"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play-xxmes clearfix"]')
        hunantv_extract.media_info_extract(sels, mediaItem)
        # Derive the media (album) page url from the play-page url
        url_express = '(http://www\.hunantv\.com/v/[\d]+/[\d]+)/[a-zA-Z]/[\d]+\.html'
        url_regex = re.compile(url_express)
        match_results = url_regex.search(request_url)
        if match_results:
            url_content = match_results.groups()[0]
            mediaItem['url'] = url_content
        # Look for the link of the feature currently playing
        url_exist = False
        sels = response.xpath('//div[@class="play-index-con-til clearfix"]//*[@class="mppl-til"]')
        for sel in sels:
            results = hunantv_extract.album_extract(sel)
            if results:
                item = results[0]
                url = item['url']
                url = Util.get_absolute_url(url, prefix_url)
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.album_parse,
                                     meta={'url': request_url,
                                           'item': mediaVideoItem}))
                url_exist = True
                break
        # No currently-playing link (e.g. "芒果捞星闻"): fall back to year api
        if 'url' in mediaItem and not url_exist:
            year_api = mediaItem['url'] + '/s/json.year.js'
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=year_api,
                                 callback=self.album_json_parse,
                                 meta={'item': mediaVideoItem,
                                       'url': year_api}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        # use response.request.url here: request_url may be unbound if the
        # exception fired before its assignment
        logging.log(logging.INFO, 'video url: %s' % response.request.url)
    return items
def play_parse(self, response):
    """Parse a play page, routing to the media page when one exists.

    If the page links to a media (album) page, follow it with media_parse.
    Movies have no media page, so media info is scraped here and the play
    url doubles as the media url; when a numeric video id can be extracted
    the api_media_info request is emitted, otherwise the bare item is.

    Returns a list of Requests/items (empty on error).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'play url: %s' % request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        route_url_list = response.xpath(
            '//div[@class="play-content"]//div[@class="v-panel-route"]/a/@href').extract()
        media_url = ''
        if route_url_list:
            media_url = route_url_list[-1]
        if media_url:
            # Media page exists: scrape media info there
            items.append(Request(url=media_url,
                                 callback=self.media_parse,
                                 meta={'url': request_url,
                                       'item': mediaVideoItem}))
        else:
            # Movies have no media page: scrape media info from the play page
            mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
            title_class = "v-info v-info-film e-follow"
            div_class = "v-meta v-meta-film"
            v_title = '//div[@class="%s"]//h1[@class="title"]/text()'
            title_list = response.xpath(v_title % title_class).extract()
            title = Util.join_list_safely(title_list)
            if title:
                mediaItem['title'] = title
            mediaItem = self.pack_media_info(response, mediaItem, title_class, div_class)
            # No media page, so the play url serves as the media url
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaVideoItem['media'] = mediaItem
            r = re.compile('.*/(\d+).html')
            m = r.match(mediaItem['url'])
            if m:
                vid = m.group(1)
                prefix_video_url = re.sub(vid, '%s', mediaItem['url'])
                items.append(self.api_media_info(mediaVideoItem, vid, prefix_video_url))
            else:
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        # response.request.url is always bound, unlike request_url
        logging.log(logging.INFO, 'play url: %s' % response.request.url)
    return items
def video_parse(self, response):
    """Parse a letv play page and follow the media-page link.

    Tries each known page layout in turn for the media link; when none is
    found, falls back to the direct api if a content id is already known.

    Returns a list of Requests (empty on error; the original never returned
    it). The original also had a dead `sels = None` line whose only effect
    was to make the following `if not sels:` always true — replaced by a
    direct first lookup.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        # Detail layout: TV series, variety shows, cartoons
        sels = response.xpath('//div[@data-statectn="play_info"]//ul[@class="intro_box"]')
        if not sels:
            # Info layout: ordinary films, cartoons
            sels = response.xpath('//div[@data-statectn="newplay_info"]//ul[@class="info_list"]')
        if not sels:
            # Paid films
            sels = response.xpath('//div[@class="Player"]//span[@class="video_info"]')
        if sels:
            results = letv_extract.media_extract(sels)
            if results:
                item = results[0]
                url = Util.get_absolute_url(item['url'], prefix_url)
                mediaItem['url'] = url
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=url,
                                     callback=self.media_parse,
                                     meta={'item': mediaVideoItem}))
        if not items:
            # No media-page link on the play page: try the api directly
            if 'cont_id' in mediaItem:
                self.api_parse(mediaVideoItem)
            else:
                logging.log(logging.INFO,
                            '该视频播放页找不到媒体页地址,也无法直接采用接口: %s' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % response.request.url)
    return items
def list_html_parse(self, response):
    """Parse one pptv channel list page and schedule the next page.

    Stops when the configured max_update_page is exceeded or when the page
    yields no entries (taken to mean there is no next page).

    Returns a list of Requests (empty on error; the original never
    returned it, so pagination silently stopped after the first page).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'list html url: %s' % request_url)
        page = response.request.meta['page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        postfix_url = response.request.meta['postfix_url'] if 'postfix_url' in response.request.meta else None
        if u'电影' == channel_id:
            # is_hj: "collection" flag — the crawler currently skips collections
            # is_virtual: whether the entry actually exists on this site
            sels = response.xpath('//a[@class="ui-list-ct" and @is_hj="0" and @is_virtual="0"]')
        else:
            sels = response.xpath('//a[@class="ui-list-ct" and @is_virtual="0"]')
        if sels:
            # Non-empty result means there may still be a next page
            for sel in sels:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                urls = sel.xpath('./@href').extract()
                mediaItem['url'] = urls[0]
                pptv_extract.media_info_extract(sel, mediaItem)
                mediaVideoItem['media'] = mediaItem
                items.append(Request(url=mediaItem['url'],
                                     callback=self.video_parse,
                                     meta={'item': mediaVideoItem}))
            # Next page
            page = page + 1
            url = self.list_prefix_url + '?' + postfix_url + '&page=%s' % page
            items.append(Request(url=url,
                                 callback=self.list_html_parse,
                                 meta={'page': page,
                                       'id': channel_id,
                                       'postfix_url': postfix_url}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id,
                   poster_url, text):
    """Assemble a MediaVideoItem for a wasu media page.

    cat_id/poster_url are re-read from response.meta (the parameters are
    effectively overridden). Returns the assembled MediaVideoItem, None when
    check_url rejects it, or None on early exception (the original had
    `return mvitem` after the except block with mvitem possibly unbound,
    which raised NameError instead — fixed by initializing mvitem = None).
    """
    mvitem = None
    try:
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0].strip()
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "wasu")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = mid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if not res:
                    return None
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def video_parse(self, response):
    """Parse a pps play page: extract media info into the carried item.

    The extraction mutates mediaItem in place; no follow-up requests are
    produced here. Returns an (empty) list for Scrapy — the original built
    `items` but never returned it.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        pps_extract.media_extract(response, mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        # response.request.url is always bound, unlike request_url
        logging.log(logging.INFO, 'video url: %s' % response.request.url)
    return items
def album_json_parse(self, response):
    """Parse the hunantv per-year JSON api and build the full media item.

    A redirect (url != request url) means the year api does not exist.
    Otherwise each listed year's episode JSON is fetched synchronously,
    the media page is fetched for the remaining info, and a complete
    MediaVideoItem is emitted.

    Returns a list of items (empty on error; the original never returned it).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'json url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else None
        url = response.request.meta['url'] if 'url' in response.request.meta else None
        if url != request_url:
            # Redirected: the year api does not exist for this media
            return items
        year_express = '(\[.*\])'
        year_regex = re.compile(year_express)
        match_results = year_regex.search(response.body)
        if match_results:
            videoItems = []
            year_content = match_results.groups()[0]
            years = json.loads(year_content)
            for year in years:
                video_url = mediaItem['url'] + '/s/json.%s.js' % year
                result = Util.get_url_content(video_url)
                videoItems = videoItems + self.album_tag_json_resolve(
                    text=result, meta={'url': video_url})
            if videoItems:
                Util.set_ext_id(mediaItem, videoItems)
                # Fetch the media page for the remaining media info
                result = Util.get_url_content(mediaItem['url'])
                if result:
                    mediaItem = self.media_resolve(text=result,
                                                   meta={'item': mediaItem,
                                                         'url': mediaItem['url']})
                self.set_media_info(mediaItem)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'json url: %s' % response.request.url)
    return items
def video_parse(self, response):
    """Parse a kankan play page and follow the media-page link.

    Marks the item paid when served from the vip host, extracts whatever
    media info the play page carries (several layouts are tried), then
    follows the first extracted media url with media_parse.

    Returns a list of Requests (empty on error; the original never
    returned it).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Vip host implies a paid title
        if prefix_url == self.vip_prefix_url:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        # e.g. http://vod.kankan.com/v/87/87998.shtml
        sels = response.xpath('//ul[@class="movieinfo"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//p[@id="movie_info_intro_l"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # Ordinary films, TV series, variety shows, cartoons
        sels = response.xpath('//div[@class="header_title"]')
        if sels:
            results = kankan_extract.media_extract(sels)
        else:
            # e.g. http://vip.kankan.com/vod/88365.html
            sels = response.xpath('//div[@class="movie_info"]')
            if sels:
                kankan_extract.media_info_extract(sels, mediaItem)
                results = kankan_extract.media_extract(sels)
            else:
                # e.g. http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01
                sels = response.xpath('//div[@class="aside"]//div[@class="intro"]')
                results = kankan_extract.media_extract(sels)
        # Only the first extracted media url is followed
        for item in results:
            mediaItem['url'] = item['url']
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=item['url'],
                                 callback=self.media_parse,
                                 meta={'item': mediaVideoItem}))
            break
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % response.request.url)
    return items
def parse_video_item(self, response, cat_id, url, title, playlistId):
    """Collect per-episode VideoItems for one media and wrap them in a MediaVideoItem.

    Non-movie channels enumerate the episode anchor list (two layouts are
    tried); movies use the single play url passed in. The returned
    MediaVideoItem always has 'media' and 'video' keys; 'video' may be empty.
    playlistId is accepted but unused here.
    """
    #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title))
    videoitems = []
    ep_item = MediaItem()
    item = MediaVideoItem()
    item["media"] = ep_item
    item["video"] = videoitems
    try:
        if int(cat_id) != int(self.movie_id):
            # Episode anchors; layout differs between page variants
            ul_list = response.xpath('//div[@class="episodes clearfix "]/a')
            if not ul_list:
                ul_list = response.xpath('//div[@class="episodes clearfix enc-episodes-detail"]/a')
            for li in ul_list:
                url = li.xpath('./@href').extract()
                ttitle = li.xpath('./@title').extract()
                snum = li.xpath('./text()').extract()
                if snum:
                    play_num = self.get_play_num(snum[0])
                    if int(cat_id) == int(self.variety_id):
                        # Variety shows take the episode number from the play page itself
                        play_num = self.getvnum(self.url_prefix + url[0])
                    if not ttitle:
                        ttitle = [play_num]
                    vitem = self.compose_vitem([self.url_prefix + url[0]], title, play_num)
                    if 'url' in vitem:
                        videoitems.append(vitem)
        elif int(cat_id) == int(self.movie_id):
            if url:
                vitem = self.compose_vitem([url], title, 1)
                if 'url' in vitem:
                    videoitems.append(vitem)
        if videoitems:
            item["video"] = videoitems
            item["media"]["url"] = response.request.url
            Util.set_ext_id(item["media"], item["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return item
def parse_media(self, response, **kwargs):
    """Parse a media page via the per-channel parser.

    kwargs must contain 'channel', used to select self.parser[channel]
    and the channel id mapping. Attaches ext_video and review url bundles
    when the parser finds them.

    Returns a list with the single MediaVideoItem, or an empty list when
    no media info was parsed or an exception occurred (the original fell
    off the except handler and implicitly returned None, which Scrapy
    pipelines then had to special-case).
    """
    items = []
    try:
        channel = kwargs['channel']
        logging.log(logging.INFO, response.request.url)
        mv = MediaVideoItem()
        mv["video"] = []
        media_info = self.parser[channel].parse_media_info(response)
        if media_info:
            media_info['site_id'] = self.site_id
            media_info['channel_id'] = self.channel_map[channel]
            media_info['url'] = Util.normalize_url(response.request.url)
            media_info['ext_id'] = Util.md5hash(media_info['url'])
            media_info['info_id'] = Util.md5hash(Util.summarize(media_info))
            mv["media"] = media_info
            ext_video = self.parser[channel].parse_video(response)
            if ext_video:
                mv["ext_video"] = {
                    'site_id': self.site_id,
                    'channel_id': media_info['channel_id'],
                    'urls': ext_video,
                    'media_ext_id': media_info['ext_id'],
                }
            reviews = self.parser[channel].parse_review(response)
            if reviews:
                mv["review"] = {
                    'urls': reviews,
                    'media_ext_id': media_info['ext_id'],
                }
            items.append(mv)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def list_html_parse(self, response):
    """Parse one pps channel list page and schedule its next page(s).

    Stops once max_update_page is exceeded. Returns a list of Requests
    (empty on error; the original never returned it, so both the detail
    crawls and the pagination were silently dropped).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'list html url: %s' % request_url)
        page = response.request.meta['page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        results = pps_extract.media_extract(response)
        for item in results:
            mediaVideoItem = MediaVideoItem()
            item['channel_id'] = channel_id
            url = item['url']
            url = Util.get_absolute_url(url, request_url)
            item['url'] = url
            mediaVideoItem['media'] = item
            items.append(Request(url=url,
                                 callback=self.video_parse,
                                 meta={'item': mediaVideoItem}))
        # Next page
        page = page + 1
        results = pps_extract.next_page_extract(response)
        for item in results:
            url = Util.get_absolute_url(item, request_url)
            items.append(Request(url=url,
                                 callback=self.list_html_parse,
                                 meta={'page': page,
                                       'id': channel_id}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def video_parse(self, response):
    """Parse a movie play page and follow the link to its media page.

    Three page layouts are tried in order: ordinary film, vip film,
    trailer film. Returns a list of Requests (empty on error; the
    original never returned it).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Play page — ordinary movie
        sels = response.xpath(
            '//div[@class="film-info clearfix"]//span[@class="summary"]/a/@href')
        if not sels:
            # Play page — vip movie
            sels = response.xpath(
                '//div[@class="f_song inner_resumeCon intro"]//div[@class="con"]/a/@href')
        if not sels:
            # Play page — trailer movie
            sels = response.xpath(
                '//div[@class="related-film clear"]//a[@class="rel-film-img"]/@href')
        if sels:
            url = sels.extract()[0]
            url = Util.get_absolute_url(url, prefix_url)
            mediaItem['url'] = url
            mediaVideoItem['media'] = mediaItem
            items.append(Request(url=url,
                                 callback=self.media_parse,
                                 meta={'item': mediaVideoItem}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'video url: %s' % response.request.url)
    return items
def media_parse(self, response):
    """Parse an album (media) page and emit the api_media_info request.

    Scrapes the media info from the media page, keeps the normalized media
    url, then extracts the numeric video id from the originating play url
    (meta['url']) to build the api request.

    Returns a list of items/Requests (empty on error; the original never
    returned it). The original's trailing log used `request_url`, which is
    only assigned mid-try — a NameError inside the except handler whenever
    the exception fired earlier; the handler now logs response.request.url.
    """
    items = []
    try:
        media_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % media_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Media page carries the media info
        title_class = "v-info v-info-album "
        div_class = "v-meta v-meta-album"
        v_title = '//div[@class="%s"]//h1[@class="title"]/span/text()'
        title_list = response.xpath(v_title % title_class).extract()
        title = Util.join_list_safely(title_list)
        if title:
            mediaItem['title'] = title
        mediaItem = self.pack_media_info(response, mediaItem, title_class, div_class)
        mediaItem['url'] = Util.normalize_url(media_url, self.site_code)
        # The originating play url carries the numeric video id
        request_url = response.meta['url']
        request_url = Util.normalize_url(request_url, self.site_code)
        r = re.compile('.*/(\d+).html')
        m = r.match(request_url)
        if m:
            vid = m.group(1)
            prefix_video_url = re.sub(vid, '%s', request_url)
            items.append(self.api_media_info(mediaVideoItem, vid, prefix_video_url))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.INFO, 'media url: %s' % response.request.url)
    return items
def parse_episode_info(self, response):
    """Parse a youku media page into a MediaVideoItem.

    Requires meta['cat_id'] and meta['poster_url']; optional meta keys
    untrack_id / sid / mid are passed through onto the item. The page id
    (content id) is mandatory — without it the page is skipped.

    Returns a list with the assembled item, or an empty list.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        page_id = self.get_youku_pageid(response.request.url)
        if not page_id:
            log.error('miss content id: %s' % response.request.url)
            return items
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        title = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'地区:').extract()
        type_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'类型:').extract()
        play_date = self.parse_play_date(response)
        total_num = self.parse_total_num(response)
        year_list = response.xpath(
            '//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()'
        ).extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        text = response.xpath('//div[@class="detail"]/span/text()').extract()
        ep_item = MediaItem()
        if title:
            ep_item["title"] = title[0].strip()
        if pers:
            ep_item["actor"] = pers
        # was `if dirs > 0:` — a str/int comparison (always true on py2,
        # TypeError on py3); a plain truthiness test was clearly intended
        if dirs:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        if total_num:
            ep_item["vcount"] = total_num
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "youku")
        if text:
            ep_item["intro"] = text[0].strip()
        ep_item["cont_id"] = page_id
        ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        video_list = self.parse_video_item(response, cat_id,
                                           ep_item["title"], page_id)
        mvitem['video'] = video_list
        Util.set_ext_id(mvitem["media"], mvitem["video"])
        items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def load_video_urls(self):
    """Build the initial crawl requests from self.json_data (1905 spider).

    Supported commands in json_data['cmd']:
      - 'trig':   re-crawl untracked urls for the configured site code.
                  (The site historically had two codes, 1905 and m1905;
                  the per-code loop was abandoned in favor of self.site_code.)
      - 'assign': crawl the explicitly assigned task urls.
      - 'test':   crawl a single channel list page at max depth + 1.

    Returns:
        list of scrapy Requests (empty on error). The original built the
        list but never returned it, so Scrapy silently dropped everything.
    """
    items = []
    try:
        if self.json_data:
            cmd = self.json_data['cmd'] if 'cmd' in self.json_data else None
            if cmd == 'trig':
                if self.site_code:
                    stat = self.json_data['stat'] if 'stat' in self.json_data else None
                    res = self.mgr.get_untrack_url(self.site_code, stat)
                    for item in res:
                        mediaVideoItem = MediaVideoItem()
                        mediaVideoItem['sid'] = item['sid']
                        mediaVideoItem['untrack_id'] = item['untrack_id']
                        mediaItem = MediaItem()
                        mediaItem['channel_id'] = item['name']
                        mediaVideoItem['media'] = mediaItem
                        url = item['url']
                        items.append(Request(url=url,
                                             callback=self.video_parse,
                                             meta={'item': mediaVideoItem}))
            elif cmd == 'assign':
                tasks = self.json_data['task'] if 'task' in self.json_data else None
                for task in tasks:
                    mediaVideoItem = MediaVideoItem()
                    mediaVideoItem['sid'] = task['sid'] if 'sid' in task else None
                    mediaVideoItem['untrack_id'] = task['untrack_id'] if 'untrack_id' in task else None
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = task['name']
                    mediaVideoItem['media'] = mediaItem
                    url = task['url']
                    items.append(Request(url=url,
                                         callback=self.video_parse,
                                         meta={'item': mediaVideoItem}))
            elif cmd == 'test':
                channel_id = self.json_data['id'] if 'id' in self.json_data else None
                url = self.json_data['url'] if 'url' in self.json_data else None
                if url and channel_id:
                    list_channel = self.mgr.get_channel_name(channel_id)
                    if list_channel:
                        list_channel = list_channel['name']
                        level = self.max_mark_depth + 1
                        items.append(Request(url=url,
                                             callback=self.list_parse,
                                             meta={'level': level,
                                                   'id': list_channel}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_parse(self, response):
    """Parse a kankan media page into a complete MediaVideoItem.

    Skips films whose play_type is in self.skip_types. Episode collection
    is channel-specific: variety shows and series/cartoons read the
    "fenji" episode lists; movies take the single "watch now" link.

    Returns a list of items (empty on error or when no episodes were
    found; the original never returned it).
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # Skip films whose play_type is in skip_types
        sels = response.xpath('//head//script')
        if sels:
            regex_express = 'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                play_type = match_result[0]
                if play_type in self.skip_types:
                    return items
        # Some urls redirect (e.g. movie.kankan.com -> data.movie.kankan.com),
        # so store the url actually reached
        mediaItem['url'] = request_url
        sels = response.xpath('//head')
        kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="info_list"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//ul[@class="detail_ul"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # Collect the episode list
        videoItems = []
        if u'综艺' == mediaItem['channel_id']:
            # Variety show
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        elif u'电影' == mediaItem['channel_id']:
            # Movie: taken from the "watch now" block
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            sels = response.xpath('//div[@class="section clearfix s2"]')
            if sels:
                urls = sels.xpath('.//a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = sels.xpath('.//a[@class="foc"]/img/@src').extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    videoItem['url'] = url
                if thumb_urls:
                    videoItem['thumb_url'] = thumb_urls[0]
                self.set_video_info(videoItem, mediaItem['channel_id'])
                videoItems.append(videoItem)
        else:
            # TV series / cartoons
            sels = response.xpath('//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not sels:
                sels = response.xpath('//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel, videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem, mediaItem['channel_id'])
                        videoItems.append(videoItem)
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        # response.request.url is always bound, unlike request_url
        logging.log(logging.INFO, 'media url: %s' % response.request.url)
    return items
def parse_single_episode(self, response):
    """Parse a sohu single-episode play page.

    If the breadcrumb points to a media page (and the page is not one of the
    special "pagetype" layouts), a parse_episode_info request is emitted.
    Otherwise the play page itself is parsed (paid movies cannot jump to a
    media page). As a last resort the playlist id is scraped from inline
    javascript and the api is used.

    Returns a list of Requests/items (may be empty).
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_single_episode: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        untrack_id = response.request.meta['untrack_id']
        sid = response.request.meta['sid']
        mid = response.request.meta['mid'] if 'mid' in response.request.meta else ""
        playtype_list = response.selector.re(
            re.compile(r'var pagetype = .*?(\D+)'))
        # New page layouts exist, e.g. http://tv.sohu.com/20100804/n273985736.shtml
        # http://my.tv.sohu.com/us/49390690/29200993.shtml cannot be crawled
        # with the current logic
        urls = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/div[@class="crumbs"]/a[last()]'
        )
        attributes = urls.xpath('./@*').extract()
        size = len(attributes)
        urls = urls.xpath('./@href').extract()
        if size == 1 and urls and not playtype_list:
            for iurl in urls:
                surl = Util.normalize_url(iurl, "sohu")
                if surl and "http" in surl:
                    items.append(
                        Request(url=surl,
                                callback=self.parse_episode_info,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': '',
                                    'page': 1,
                                    "untrack_id": untrack_id,
                                    "sid": sid,
                                    "mid": mid
                                }))
        # Paid movie: cannot jump to the media page, parse the play page itself
        else:
            mvitem = self.parse_episode_play(response, untrack_id, sid)
            if mid:
                mvitem['mid'] = mid
            if mvitem and "media" in mvitem and "url" in mvitem[
                    "media"] and "ext_id" in mvitem["media"]:
                if self.check_url(mvitem):
                    items.append(mvitem)
        # Nothing worked so far: fall back to the playlist api
        if not items:
            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            if untrack_id and sid:
                mvitem["untrack_id"] = untrack_id
                mvitem["sid"] = sid
            ep_item = MediaItem()
            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            mvitem["media"] = ep_item
            playlistId = ""
            # The playlist id appears under several javascript variable names
            playlistId_list = response.selector.re(
                re.compile(r'var playlistId.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'var PLAYLIST_ID.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'= playlistId.*?(\d+)'))
            if playlistId_list:
                playlistId = playlistId_list[0]
            items += self.api_episode_info(mvItem=mvitem,
                                           playlistId=playlistId,
                                           cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a baofeng media page into a MediaVideoItem.

    Requires meta['cat_id'] and meta['poster_url']; optional untrack_id/sid
    are passed through. Trailer pages (title contains u'预:') are skipped.
    The item is appended only when episodes were found and check_url
    accepts it.

    Returns a list with the assembled item, or an empty list.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        title_list = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'主演:').extract()
        type_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'类型:').extract()
        district_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'地区:').extract()
        year_info = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/text()'
            % u'地区:').extract()
        year = None
        if len(year_info) >= 2:
            year = self.get_year(year_info[1])
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        districts = Util.join_list_safely(district_list)
        text = response.xpath(
            '//div[@class="juqing briefTab"]/div/text()').extract()
        play_url = ""
        tplay_url = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
        ).extract()
        if tplay_url:
            play_url = self.url_prefix + tplay_url[0].strip()
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
            if ep_item["title"].find(u'预:') >= 0:
                # Trailer page: skip. (Was a py2-only bare `print`, which
                # breaks on py3 and bypasses the log; now routed via logging.)
                logging.log(logging.INFO,
                            u'预告片,url %s' % response.request.url)
                return items
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = districts
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "baofeng")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        videoid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if res:
                    items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a qq (v.qq.com) media detail page into a MediaVideoItem.

    Tries several page layouts for each field (title, performers, director,
    intro, type, year), resolves the playlist via the loadplaylist JSONP API
    and falls back to topic-page parsing when no sourceid is present.
    Returns a list with at most one MediaVideoItem.

    Bug fixed: the fallback intro extraction discarded its result (missing
    ``text =`` assignment).
    """
    items = []
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # title -- four known layouts
        title = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
        ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
            ).extract()
        # performers
        performer_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
        ).extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'主演:').extract()
        # director
        director_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
        ).extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'导演:').extract()
        # introduction text
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        if not text:
            # BUG FIX: the original dropped this fallback result on the floor
            text = response.xpath(
                '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
            ).extract()
        type_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
        ).extract()
        if not type_list:
            type_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'类型:').extract()
        year_info = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
        ).extract()
        if not year_info:
            year_info = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'年份:').extract()
        play_date = None
        if year_info:
            play_date = self.get_year(year_info[0])
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        pers = Util.join_list_safely(performer_list)
        # sourceid drives the playlist API below
        sourceid = ""
        sourceid_list = response.xpath('//div[@class="mod_bd sourceCont"]/@sourceid').extract()
        if sourceid_list:
            sourceid = sourceid_list[0]
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "qq")
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        url_pre = "http://s.video.qq.com/loadplaylist?vkey="
        url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"
        videoid = self.get_qq_showid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
        vurl = url_pre + str(sourceid) + url_tail
        tflag = "jQuery191048201349820010364_1425370006500"
        tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
        # no sourceid, e.g. topic pages
        if not tpitem:
            tpitem = self.parse_topic_play_list(response)
            videoids = response.xpath(
                '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
            ).extract()
            if videoids:
                mvitem["media"]["cont_id"] = videoids[0]
        if tpitem:
            mvitem["video"] = tpitem
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_play(self, response):
    """Parse a qq play page into a single MediaVideoItem (or None).

    Non-movie channels walk the episode list on the page; movies produce a
    single VideoItem for the page itself.

    Bugs fixed: the URL-normalization branch used two independent ``if``
    statements, so the ``else`` clobbered the tv-normalized URL; a duplicate
    dead ``Util.md5hash(Util.summarize(...))`` call was removed.
    """
    mvitem = None
    try:
        logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = ""
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # title -- two layouts
        title_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title').extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
            ).extract()
        performer_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
        ).extract()
        director_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        ep_item = MediaItem()
        videoitems = []
        if int(cat_id) != int(self.movie_id):
            # episodic content: one VideoItem per entry in the album list
            video_list = response.xpath(
                '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
            )
            for tvideo in video_list:
                lurl = tvideo.xpath('./a/@href').extract()
                lnum = tvideo.xpath('./a/span/text()').extract()
                vitem = VideoItem()
                if lnum and lurl:
                    vitem["vnum"] = lnum[0]
                    surl = "http://film.qq.com" + lurl[0]
                    vitem["os_id"] = self.os_id
                    vitem["site_id"] = self.site_id
                    # if/elif/else: the original's second plain `if` let the
                    # `else` overwrite the tv-normalized url
                    if cat_id == self.tv_id:
                        turl = Util.normalize_url(surl, "qq", "tv")
                    elif cat_id == self.cartoon_id:
                        turl = Util.normalize_url(surl, "qq", "cartoon")
                    else:
                        turl = Util.normalize_url(surl, "qq")
                    if turl:
                        vitem["ext_id"] = Util.md5hash(turl)
                        vitem["url"] = turl
                        vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    else:
                        continue
                    videoitems.append(vitem)
        else:
            # a movie is a single video built from the play page itself
            vitem = VideoItem()
            if title_list:
                vitem["title"] = title_list[0]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = Util.normalize_url(response.request.url, "qq")
            vitem["url"] = turl
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["cont_id"] = self.get_qq_showid(vitem["url"])
            videoitems.append(vitem)
        if len(title_list) > 0:
            ep_item["title"] = title_list[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if len(text) > 0:
            ep_item["intro"] = text[0]
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        videoid = self.get_qq_showid(response.request.url)
        ep_item["cont_id"] = videoid
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        mvitem["media"]["url"] = Util.normalize_url(response.request.url, "qq")
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def api_parse(self, response):
    """Resolve media + episode list through the site's app/web JSON APIs.

    Extracts pid/vid from the page's inline javascript, fetches media info
    from the app API (movies keyed by vid, everything else by pid), then the
    per-page episode lists from the web API.  Returns a list with at most
    one MediaVideoItem.

    Bug fixed: the original built ``items`` but never returned it, so the
    scrapy callback silently produced nothing.  A dead pre-branch
    ``app_api`` assignment (overwritten in every branch) was also removed.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'api prase url: %s' % request_url)
        meta = response.request.meta
        mediaVideoItem = meta['item'] if 'item' in meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        mediaItem['url'] = request_url
        # pid / vid live in inline javascript on the page
        script_sel = response.xpath('.//script[@type="text/javascript"]')
        pidl = script_sel.re('\"pid\"\D?(\d+)')
        vidl = script_sel.re('\"id\"\D?(\d+)')
        if pidl and vidl:
            pid = pidl[0]
            vid = vidl[0]
            ismovie = False
            isvariaty = False
            if u'电影' == mediaItem['channel_id']:
                # movies are addressed by the video id
                ismovie = True
                app_api = self.app_api % (self.get_auth(), vid)
                mediaItem['cont_id'] = str(vid)
            elif u'综艺' == mediaItem['channel_id']:
                isvariaty = True
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            else:
                app_api = self.app_api % (self.get_auth(), pid)
                mediaItem['cont_id'] = str(pid)
            xpara = self.get_xdata(url=app_api)
            mediaItem = self.resolve_media_info(xpara, mediaItem, ismovie=ismovie)
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaItem['site_id'] = self.site_id
            mediaItem['channel_id'] = self.channels_name_id[mediaItem['channel_id']]
            mediaItem['info_id'] = Util.md5hash(Util.summarize(mediaItem))
            max_page = self.get_max_page(xpara)
            video_list = []
            if ismovie:
                # a movie is one single "episode" built from the media info
                videoItem = VideoItem()
                videoItem['title'] = mediaItem['title'] if 'title' in mediaItem else None
                videoItem['thumb_url'] = mediaItem['poster_url'] if 'poster_url' in mediaItem else None
                videoItem['url'] = mediaItem['url'] if 'url' in mediaItem else None
                videoItem['os_id'] = self.os_id
                videoItem['site_id'] = self.site_id
                videoItem['ext_id'] = Util.md5hash(mediaItem['url']) if 'url' in mediaItem else None
                videoItem['vnum'] = mediaItem['vcount'] if 'vcount' in mediaItem else 1
                videoItem['cont_id'] = mediaItem['cont_id'] if 'cont_id' in mediaItem else None
                video_list.append(videoItem)
            else:
                for i in range(1, max_page):
                    web_api = self.web_api % (pid, i)
                    dpara = self.get_ddata(url=web_api)
                    video_list += self.resolve_video_item(dpara, page_num=i, isvariaty=isvariaty)
                if isvariaty:
                    video_list = self.revise_video_item(video_list, xpara)
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = video_list
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a sohu media detail page into a MediaVideoItem.

    The playlist id is scraped from inline javascript (three known variable
    spellings); page-level parsing is delegated to the parse_* helpers, and
    if page parsing yields nothing the episode API is used as a fallback.
    Returns a list with at most one MediaVideoItem.

    Dead locals of the original (lyears, videoitems, vurl, an initial
    year_list and an unused play_url extraction) were removed.
    """
    items = []
    try:
        logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        # playlist id appears under several variable spellings in the page js
        playlistId = ""
        playlistId_list = response.selector.re(re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        if not playlistId:
            logging.log(
                logging.INFO,
                "parse_episode_info error,not find playlistid,url:%s " % response.request.url)
            return items
        title_list = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = self.parse_district(response)
        type_list = self.parse_type_list(response)
        year_list = self.parse_year(response)
        year = None
        if year_list:
            year = year_list[0]
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        district = Util.join_list_safely(district_list)
        # introduction text
        text = response.xpath(
            '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()').extract()
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district:
            ep_item["district"] = district
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        playlistId = str(playlistId)
        ep_item["cont_id"] = playlistId
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        if untrack_id and sid:
            mvitem["untrack_id"] = untrack_id
            mvitem["sid"] = sid
        mvitem["media"] = ep_item
        ttvitem = []
        if title_list:
            ttvitem = self.parse_video_item(cat_id, playlistId)
        if ttvitem:
            mvitem['video'] = ttvitem
            mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if self.check_url(mvitem):
                items.append(mvitem)
        # fallback: page parsing produced nothing -> query the episode API
        if not items and playlistId:
            items += self.api_episode_info(mvitem, playlistId, cat_id=cat_id)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def album_parse(self, response):
    """Parse a hunantv album (episode-list) page.

    Three layouts are handled: a tag page (resolved tag by tag), a plain
    episode-list page, and -- when neither exists -- a single video built
    from the play url carried in request meta.  Returns a list with at most
    one MediaVideoItem.

    Bugs fixed: the original never returned ``items`` (the scrapy callback
    result was silently dropped) and called Util.copy_media_to_video twice.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'album url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        video_url = response.request.meta['url'] if 'url' in response.request.meta else None
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        videoItems = []
        sels = response.xpath('//div[@class="page-videolist-tag-main"]//p[@class="pa1-nav"]')
        if sels:
            # a tag page exists, e.g. http://list.hunantv.com/album/56.html
            results = hunantv_extract.album_tag_extract(sels)
            for item in results:
                url = Util.get_absolute_url(item['url'], prefix_url)
                result = Util.get_url_content(url)
                videoItems = videoItems + self.album_tag_resolve(text=result, meta={'url': url})
        else:
            # no tag page, e.g. http://list.hunantv.com/album/2905.html
            video_sels = response.xpath('//div[@class="page-videolist clearfix"]')
            if video_sels:
                result = video_sels.extract()[0]
                videoItems = videoItems + self.album_tag_resolve(
                    text=result, meta={'url': request_url})
            elif video_url:
                # no episode-list page at all (the album url on pages such as
                # http://www.hunantv.com/v/7/102831/f/1043648.html is invalid):
                # build a single video from the play url
                videoItem = VideoItem()
                Util.copy_media_to_video(mediaItem, videoItem)
                videoItem['url'] = video_url
                video_url_regex = re.compile(
                    'http://www\.hunantv\.com/v/[\d]+/[\d]+/[a-zA-Z]/([\d]+)\.html')
                match_results = video_url_regex.search(video_url)
                if match_results:
                    videoItem['cont_id'] = match_results.groups()[0]
                self.set_video_info(videoItem)
                videoItems.append(videoItem)
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            # visit the media page for the remaining metadata
            result = Util.get_url_content(mediaItem['url'])
            if result:
                mediaItem = self.media_resolve(
                    text=result, meta={'item': mediaItem, 'url': mediaItem['url']})
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'album url: %s' % request_url)
    return items
def parse_episode_play(self, response, untrack_id, sid):
    """Parse a sohu (vip) play page into a single MediaVideoItem (or None).

    Both playlistId and vid are scraped from inline javascript; if either is
    missing, None is returned.  The page itself becomes the one VideoItem.

    Consistency fix: performers/directors are now joined with
    Util.join_list_safely, matching parse_episode_info, instead of a bare
    ``"|".join``.
    """
    mvitem = None
    try:
        logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        # vip page layout
        title_list = response.xpath(
            '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' % u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' % u'主演:').extract()
        text = response.xpath(
            '//div[@class="info info-con"]/p[@class="intro"]/text()').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        # playlist id appears under several variable spellings in the page js
        playlistId = ""
        playlistId_list = response.selector.re(re.compile(r'var playlistId.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(re.compile(r'var PLAYLIST_ID.*?(\d+)'))
        if not playlistId_list:
            playlistId_list = response.selector.re(re.compile(r'= playlistId.*?(\d+)'))
        if playlistId_list:
            playlistId = playlistId_list[0]
        vid = ""
        vid_list = response.selector.re(re.compile(r'var vid.*?(\d+)'))
        if vid_list:
            vid = vid_list[0]
        if not playlistId or not vid:
            return mvitem
        ep_item = MediaItem()
        ep_item["cont_id"] = playlistId
        if title_list:
            ep_item["title"] = title_list[0]
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
        if text:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # the play page itself is the single episode
        vitem = VideoItem()
        vitem["title"] = ep_item["title"] if 'title' in ep_item else None
        vitem["url"] = ep_item["url"]
        vitem["vnum"] = "1"
        vitem["os_id"] = self.os_id
        vitem["ext_id"] = Util.md5hash(ep_item["url"])
        vitem["site_id"] = self.site_id
        vitem["cont_id"] = vid
        mvitem["video"] = [vitem]
        mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def list_json_parse(self, response):
    """Parse a page of the letv JSON list API.

    Emits one video_parse Request per entry and, while entries keep coming,
    a follow-up Request for the next page (bounded by max_update_page).

    Bug fixed: the original built ``items`` but never returned it, so the
    scrapy callback produced nothing.
    """
    items = []
    try:
        origin_url = response.request.meta['url']
        request_url = response.request.url
        logging.log(logging.INFO, 'json api url: %s' % request_url)
        page = response.request.meta['page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        list_json_postfix_url = response.request.meta[
            'postfix_url'] if 'postfix_url' in response.request.meta else None
        json_datas = json.loads(response.body)
        videos = []
        if json_datas:
            videos = json_datas['data_list'] if 'data_list' in json_datas else []
        if videos:
            # a non-empty page means there may be another page after it
            video_url = 'http://www.letv.com/ptv/vplay/%s.html'
            for item in videos:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                if 'rating' in item and item['rating']:
                    mediaItem['score'] = item['rating']
                subCategoryName = item['subCategoryName']
                mediaItem['type'] = subCategoryName.replace(',', ';')
                mediaVideoItem['media'] = mediaItem
                # releaseDate is a millisecond epoch timestamp
                release_date = item['releaseDate']
                if release_date:
                    release_date = float(release_date)
                    if release_date > 0:
                        release_date = release_date / 1000
                        release_date = time.localtime(release_date)
                        release_date = '%s-%s-%s' % (release_date.tm_year,
                                                     release_date.tm_mon,
                                                     release_date.tm_mday)
                        mediaItem['release_date'] = Util.str2date(release_date)
                vid = ''
                if 'vids' in item:
                    vids = item['vids']
                    vids = vids.split(',')
                    vid = vids[0]
                elif 'vid' in item:
                    vid = item['vid']
                if vid:
                    url = video_url % vid
                    items.append(
                        Request(url=url,
                                callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
            # next page
            page = page + 1
            url = self.list_json_prefix_url + list_json_postfix_url + 'p=%s' % page
            items.append(
                Request(url=url,
                        callback=self.list_json_parse,
                        meta={
                            'page': page,
                            'id': channel_id,
                            'postfix_url': list_json_postfix_url,
                            'url': url
                        }))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'json api url: %s' % request_url)
    logging.log(logging.INFO, 'origin url: %s' % origin_url)
    return items
def parse_play_list(self, cat_id, url, flag, response): item = None videoitems = [] try: ep_item = MediaItem() item = MediaVideoItem() item["media"] = ep_item item['video'] = videoitems info = None try: info = self.httpdownload.get_data(url) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems if not info or len(info) < 2: return videoitems msg = info bodylen = len(msg) - 1 index = msg.find(flag) + len(flag) + 1 info = msg[index:bodylen] jinfo = json.loads(info) if "video_play_list" not in jinfo: return videoitems itemlist = jinfo["video_play_list"]["playlist"] for titem in itemlist: if "episode_number" not in titem: continue info = titem["episode_number"] if info and titem["title"].find(u"预告") < 0 and url.find( "qq.com") >= 0: vitem = VideoItem() vitem["title"] = titem["title"] tvnum = string.replace(info, "-", "") #集数不是数字,是字符串,http://v.qq.com/detail/x/xk98t8hntls72f4.html tvnum_list = re.findall(r'[\D]+', tvnum) if not tvnum_list: vitem["vnum"] = string.replace(info, "-", "") else: continue vitem["os_id"] = self.os_id vitem["site_id"] = self.site_id turl = "" if int(cat_id) == int(self.tv_id) or int(cat_id) == int( self.cartoon_id): turl = Util.normalize_url(titem["url"], "qq", "tv") else: turl = Util.normalize_url(titem["url"], "qq") if turl: vitem["ext_id"] = Util.md5hash(turl) #vitem["cont_id"] = self.get_vid(response.body,turl) vitem["url"] = turl vitem["cont_id"] = self.get_qq_showid(vitem["url"]) else: continue videoitems.append(vitem) except Exception as e: logging.log(logging.ERROR, traceback.format_exc()) return videoitems
def list_parse(self, response): items = [] try: request_url = response.request.url logging.log(logging.INFO, 'url: %s' % request_url) prefix_url = Util.prefix_url_parse(request_url) first = response.request.meta[ 'first'] if 'first' in response.request.meta else False channel_id = response.request.meta[ 'id'] if 'id' in response.request.meta else None if first: sels = response.xpath('//div[@class="tab_box"]//a') for sel in sels: texts = sel.xpath('.//span/text()').extract() if texts: text = texts[0].replace(' ', '') if text == u'最新': urls = sel.xpath('./@href').extract() url = urls[0] items.append( Request(url=url, callback=self.list_parse, meta={'id': channel_id})) break else: page = response.request.meta[ 'page'] if 'page' in response.request.meta else 1 if page > self.max_update_page: return items #list列表 sels = response.xpath('//ul[@class="movielist"]/li') for sel in sels: results = kankan_extract.video_extract(sel) for item in results: mediaVideoItem = MediaVideoItem() mediaItem = MediaItem() mediaItem['channel_id'] = channel_id kankan_extract.media_info_extract(sel, mediaItem) mediaVideoItem['media'] = mediaItem items.append( Request(url=item['url'], callback=self.video_parse, meta={'item': mediaVideoItem})) break #下一页 sels = response.xpath('//p[@class="list-pager-v2"]') results = kankan_extract.next_page_extract(sels) page = page + 1 for item in results: url = Util.get_absolute_url(item, prefix_url) items.append( Request(url=url, callback=self.list_parse, meta={ 'page': page, 'id': channel_id })) break except Exception, e: logging.log(logging.ERROR, traceback.format_exc()) logging.log(logging.INFO, 'url: %s' % request_url)
def video_parse(self, response):
    """Parse a pptv play page into a MediaVideoItem.

    Movies become a single VideoItem from the page itself; other channels
    page through the album JSON API (pid/cid/vid scraped from inline
    javascript) until an empty page is returned.

    Bug fixed: the original built ``items`` but never returned it, so the
    scrapy callback produced nothing.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # vip-hosted pages are paid content
        if prefix_url == self.vip_prefix_url:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        mediaItem['url'] = request_url
        pptv_extract.media_info_extract(response, mediaItem)
        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            if 'cont_id' not in mediaItem or not mediaItem['cont_id']:
                return items
            videoItem = VideoItem()
            videoItem['url'] = mediaItem['url']
            videoItem['cont_id'] = mediaItem['cont_id']
            Util.copy_media_to_video(mediaItem, videoItem)
            self.set_video_info(videoItem)
            videoItems.append(videoItem)
        else:
            sel = response.xpath('//script[@type="text/javascript"]')
            # pid & cid drive the episode API for tv / variety / cartoon
            if sel:
                pids = sel.re('\"pid\"[ ]?:[ ]?(\d+)')
                cids = sel.re('\"cat_id\"[ ]?:[ ]?(\d+)')
                vids = sel.re('\"id\"[ ]?:[ ]?(\d+)')
                if pids and cids and vids:
                    pid = pids[0]
                    cid = cids[0]
                    vid = vids[0]
                    page = 1
                    # the media's cont_id is the program id
                    mediaItem['cont_id'] = pid
                    while True:
                        meta = {'pid': pid, 'cid': cid, 'vid': vid, 'page': page}
                        url = self.album_api % (pid, cid, vid, page)
                        result = Util.get_url_content(url)
                        page_result = self.album_json_resolve(result, mediaItem, meta)
                        if not page_result['items']:
                            # NOTE: a fallback through self.auth_album_api was
                            # disabled here because that endpoint does not
                            # return video urls
                            break
                        else:
                            videoItems = videoItems + page_result['items']
                            page = page + 1
        if videoItems:
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
    return items
def list_parse(self, response): items = [] try: request_url = response.request.url logging.log(logging.INFO, 'url: %s' % request_url) prefix_url = Util.prefix_url_parse(request_url) level = response.request.meta[ 'level'] if 'level' in response.request.meta else -1 if level == 0: #第一次进入list页面 sels = response.xpath( '//div[@id="hony-searchtag-condition"]/p') for list_channel in hunantv_extract.list_channels: list_postfix_urls = sels.xpath( './/a[normalize-space(text())="%s"]/@href' % list_channel).extract() if list_postfix_urls: list_postfix_url = list_postfix_urls[0] url = Util.get_absolute_url(list_postfix_url, prefix_url) items.append( Request(url=url, callback=self.list_parse, meta={'id': list_channel})) else: page = response.request.meta[ 'page'] if 'page' in response.request.meta else 1 channel_id = response.request.meta[ 'id'] if 'id' in response.request.meta else None if page > self.max_update_page: return items #获取播放地址 sels = response.xpath('//div[@class="play-index-con-box"]') results = hunantv_extract.video_extract(sels) for item in results: mediaVideoItem = MediaVideoItem() mediaItem = MediaItem() mediaItem['channel_id'] = channel_id video_sels = sels.xpath('.//a[@href="%s"]/..' % item['url']) hunantv_extract.media_info_extract(video_sels, mediaItem) mediaItem['poster_url'] = Util.get_absolute_url( mediaItem['poster_url'], prefix_url) url = Util.get_absolute_url(item['url'], prefix_url) mediaVideoItem['media'] = mediaItem items.append( Request(url=url, callback=self.video_parse, meta={'item': mediaVideoItem})) #下一页 results = hunantv_extract.next_page_extract(response) if results: result = results[0] result = Util.get_absolute_url(result, prefix_url) page = page + 1 items.append( Request(url=result, callback=self.list_parse, meta={ 'page': page, 'id': channel_id })) except Exception, e: logging.log(logging.ERROR, traceback.format_exc()) logging.log(logging.INFO, 'url: %s' % request_url)