def media_extract(response):
    """Extract kankan media-page URLs from a play page.

    Prefers the ``movie/<id>/introduction`` link; otherwise falls back to
    the first ``data.movie.kankan.com`` link. Returns a list of MediaItem
    with only 'url' set (empty on error).
    """
    items = []
    try:
        results = response.xpath(
            './/a/@href[re:test(., "http://movie\.kankan\.com/movie/[\d]+/introduction")]'
        ).extract()
        if results:
            #http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01#7927921
            #http://vip.kankan.com/vod/88365.html#7306075
            url = results[0]
            regex_pattern = re.compile(
                '(http://movie\.kankan\.com/movie/[\d]+)')
            match_results = regex_pattern.search(url)
            if match_results:
                mediaItem = MediaItem()
                mediaItem['url'] = match_results.groups()[0]
                items.append(mediaItem)
        else:
            #http://vod.kankan.com/v/86/86897.shtml#9895815
            results = response.xpath(
                './/a/@href[re:test(., "http://data\.movie\.kankan\.com/movie/[\d]+")]'
            ).extract()
            for item in results:
                mediaItem = MediaItem()
                mediaItem['url'] = item
                items.append(mediaItem)
                # deliberately keep only the first matched link
                break
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the result list was built but never returned; callers use it
    return items
def load_video_urls(self):
    """Build initial crawl requests from self.json_data.

    Supports three commands: 'trig' (re-crawl untracked urls obtained from
    the manager), 'assign' (explicit task list) and 'test' (single url).
    Returns the list of scrapy Requests (empty on error).
    """
    items = []
    try:
        if self.json_data:
            cmd = self.json_data['cmd'] if 'cmd' in self.json_data else None
            if cmd == 'trig':
                stat = self.json_data['stat'] if 'stat' in self.json_data else None
                res = self.mgr.get_untrack_url(self.site_code, stat)
                for item in res:
                    mediaVideoItem = MediaVideoItem()
                    mediaVideoItem['sid'] = item['sid']
                    mediaVideoItem['untrack_id'] = item['untrack_id']
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = item['name']
                    mediaVideoItem['media'] = mediaItem
                    url = item['url']
                    items.append(
                        Request(url=url,
                                callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
            elif cmd == 'assign':
                tasks = self.json_data['task'] if 'task' in self.json_data else None
                for task in tasks:
                    mediaVideoItem = MediaVideoItem()
                    mediaVideoItem['sid'] = task['sid'] if 'sid' in task else None
                    mediaVideoItem['untrack_id'] = task['untrack_id'] if 'untrack_id' in task else None
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = task['name']
                    mediaVideoItem['media'] = mediaItem
                    url = task['url']
                    items.append(
                        Request(url=url,
                                callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
            elif cmd == 'test':
                channel_id = self.json_data['id'] if 'id' in self.json_data else None
                url = self.json_data['url'] if 'url' in self.json_data else None
                if url and channel_id:
                    list_channel = self.mgr.get_channel_name(channel_id)
                    if list_channel:
                        list_channel = list_channel['name']
                        items.append(
                            Request(url=url,
                                    callback=self.list_parse,
                                    meta={'first': False,
                                          'id': list_channel}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the collected requests were never returned
    return items
def media_info_extract(response, mediaItem):
    """Fill mediaItem with title / staff / score scraped from a media page.

    A fresh MediaItem is created when None is passed; the (possibly new)
    item is returned so the caller can retrieve it either way.
    """
    try:
        if mediaItem == None:
            mediaItem = MediaItem()
        #媒体页 (media page)
        sels = response.xpath('.//div[@class="laMoCont"]')
        if sels:
            name_sels = sels.xpath('.//div[@class="laMovName"]')
            titles = name_sels.xpath(
                './/a[@class="laGrayS_f"]/text()').extract()
            if titles:
                mediaItem['title'] = titles[0]
            property_sels = sels.xpath(
                './/ol[@class="movStaff line_BSld"]//li')
            ignore = True
            for sel in property_sels:
                label_sels = sel.xpath('.//strong')
                info_sels = sel.xpath('.//a')
                dy1905_extract.text_infos_resolve(label_sels, info_sels,
                                                  mediaItem, ignore)
            scores = response.xpath(
                './/div[@class="laMoOther"]//div[@class="rating-dt"]//span[@class="score"]/text()'
            ).extract()
            if scores:
                # keep only the numeric part of the score text
                scores = re.findall(r'[\d.]+', scores[0])
                if scores:
                    mediaItem['score'] = scores[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: a MediaItem created here for a None argument was silently
    # discarded; return the item so the extracted info is usable
    return mediaItem
def media_more_info_resolve(text, mediaItem):
    """Parse extra media info (synopsis + cast/crew panes) from raw html text.

    NOTE(review): in the original source this function's outer ``try`` had
    no matching ``except`` (the tail of the definition appears truncated);
    the standard error-logging handler used by the sibling functions is
    restored here, and the item is returned for consistency.
    """
    try:
        try:
            response = Selector(text=text)
        except Exception as e:
            logging.log(logging.INFO, 'text to be parsed is not xml or html')
            logging.log(logging.ERROR, traceback.format_exc())
        if mediaItem == None:
            mediaItem = MediaItem()
        #剧情页面 (synopsis pane)
        intros = response.xpath(
            './/div[@class="conTABLE mt10"]//div[@class="w100d line_Slx pt15 dlP"]/text()'
        ).extract()
        if intros:
            mediaItem['intro'] = intros[0].strip()
        #演职人员页面 (cast & crew pane)
        label_sels = response.xpath(
            './/div[@class="conTABLE mt10"]//*[@class="now pr05 fb"]')
        info_sels = response.xpath(
            './/div[@class="conTABLE mt10"]//*[@class="laGrayQdd_f pt12 line_Sbotlx pb15"]'
        )
        index = 0
        size = len(info_sels)
        # labels and info blocks are parallel lists; pair them by position
        for label_sel in label_sels:
            if index < size:
                info_sel = info_sels[index].xpath(
                    './/a[@class="laBlueS_f" or @class="laBlueS_f fl"]')
                dy1905_extract.text_infos_resolve(label_sel, info_sel,
                                                  mediaItem)
                index = index + 1
            else:
                break
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mediaItem
def media_extract(response):
    """Collect hunantv media-page urls; returns a list of MediaItem."""
    items = []
    try:
        results = response.xpath(
            './/a/@href[re:test(., "http://www\.hunantv\.com/v/[\d]+/[\d]+")]'
        ).extract()
        for item in results:
            mediaItem = MediaItem()
            mediaItem['url'] = item
            items.append(mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the result list was built but never returned
    return items
def play_parse(self, response):
    """Parse a play page: follow the media page when one is linked,
    otherwise scrape movie info directly from the play page.

    Returns follow-up Requests / finished MediaVideoItems.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'play url: %s' % request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        route_url_list = response.xpath(
            '//div[@class="play-content"]//div[@class="v-panel-route"]/a/@href'
        ).extract()
        media_url = ''
        if route_url_list:
            # last breadcrumb link points at the media page
            media_url = route_url_list[-1]
        if media_url:
            # 有媒体页url,媒体页抓取媒体信息
            items.append(
                Request(url=media_url,
                        callback=self.media_parse,
                        meta={'url': request_url,
                              'item': mediaVideoItem}))
        else:
            # 电影没有媒体页,在播放页抓取媒体信息
            mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
            title_class = "v-info v-info-film e-follow"
            div_class = "v-meta v-meta-film"
            v_title = '//div[@class="%s"]//h1[@class="title"]/text()'
            title_list = response.xpath(v_title % title_class).extract()
            title = Util.join_list_safely(title_list)
            if title:
                mediaItem['title'] = title
            mediaItem = self.pack_media_info(response, mediaItem,
                                             title_class, div_class)
            # 没有媒体页,播放地址作为媒体地址
            mediaItem['url'] = Util.normalize_url(request_url, self.site_code)
            mediaVideoItem['media'] = mediaItem
            r = re.compile('.*/(\d+).html')
            m = r.match(mediaItem['url'])
            if m:
                vid = m.group(1)
                # template url with the numeric id replaced by '%s'
                prefix_video_url = re.sub(vid, '%s', mediaItem['url'])
                items.append(
                    self.api_media_info(mediaVideoItem, vid,
                                        prefix_video_url))
            else:
                items.append(mediaVideoItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'play url: %s' % request_url)
    # BUG FIX: collected requests/items were never returned to scrapy
    return items
def video_parse(self, response):
    """Locate the media-page link on a letv play page and follow it;
    fall back to the direct api when no link can be found."""
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        # NOTE(review): sels is reset on purpose so every page layout
        # below is probed in turn
        sels = None
        if not sels:
            #Detail电视剧、综艺、动漫
            sels = response.xpath(
                '//div[@data-statectn="play_info"]//ul[@class="intro_box"]')
        if not sels:
            #Info:普通影片,动漫
            sels = response.xpath(
                '//div[@data-statectn="newplay_info"]//ul[@class="info_list"]')
        if not sels:
            #收费影片
            sels = response.xpath(
                '//div[@class="Player"]//span[@class="video_info"]')
        if sels:
            results = letv_extract.media_extract(sels)
            if results:
                item = results[0]
                url = Util.get_absolute_url(item['url'], prefix_url)
                mediaItem['url'] = url
                mediaVideoItem['media'] = mediaItem
                items.append(
                    Request(url=url,
                            callback=self.media_parse,
                            meta={'item': mediaVideoItem}))
        if not items:
            #视频播放页找不到媒体页地址,尝试直接采用接口爬取
            if 'cont_id' in mediaItem:
                self.api_parse(mediaVideoItem)
            else:
                logging.log(
                    logging.INFO,
                    '该视频播放页找不到媒体页地址,也无法直接采用接口: %s' % request_url)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
    # BUG FIX: the follow-up requests were never returned
    return items
def list_html_parse(self, response):
    """Parse one page of a pptv channel list.

    Emits one video_parse request per media entry plus a request for the
    next page as long as entries keep coming back, stopping at
    self.max_update_page.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'list html url: %s' % request_url)
        page = response.request.meta['page'] if 'page' in response.request.meta else 1
        # stop paging beyond the configured update depth
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta['id'] if 'id' in response.request.meta else None
        postfix_url = response.request.meta['postfix_url'] if 'postfix_url' in response.request.meta else None
        if u'电影' == channel_id:
            '''
            is_hj:是否合集的标志,爬虫目前舍弃合集的链接
            is_virtual:本站点是否存在
            '''
            sels = response.xpath(
                '//a[@class="ui-list-ct" and @is_hj="0" and @is_virtual="0"]')
        else:
            sels = response.xpath(
                '//a[@class="ui-list-ct" and @is_virtual="0"]')
        if sels:
            #表明仍有下一页
            for sel in sels:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                urls = sel.xpath('./@href').extract()
                mediaItem['url'] = urls[0]
                pptv_extract.media_info_extract(sel, mediaItem)
                mediaVideoItem['media'] = mediaItem
                items.append(
                    Request(url=mediaItem['url'],
                            callback=self.video_parse,
                            meta={'item': mediaVideoItem}))
            #下一页
            page = page + 1
            url = self.list_prefix_url + '?' + postfix_url + '&page=%s' % page
            items.append(
                Request(url=url,
                        callback=self.list_html_parse,
                        meta={'page': page,
                              'id': channel_id,
                              'postfix_url': postfix_url}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: an early-exit ``return items`` exists above, but the normal
    # path never returned the collected requests
    return items
def api_media_info(self, mediaVideoItem, vid, prefix_video_url):
    """Fill mediaVideoItem['media'] / ['video'] from the media-info api.

    Returns mediaVideoItem — callers append the return value directly.
    """
    mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
    try:
        miu = self.media_info_url % vid
        jdata = self.httpdownload.get_data(miu)
        if not jdata:
            pass
        else:
            ddata = json.loads(jdata)
            assert int(ddata.get('code', 202)) == 200, "接口获取媒体信息失败"
            detail = ddata.get('data').get('detail')
            assert type(detail) == dict
            mediaItem['cont_id'] = str(detail.get('collectionId'))
            mediaItem['title'] = detail.get('collectionName')
            mediaItem['director'] = Util.join_list_safely(
                detail.get('director').split('/'))
            mediaItem['actor'] = Util.join_list_safely(
                detail.get('player').split('/'))
            mediaItem['release_date'] = Util.str2date(
                detail.get('publishTime'))
            mediaItem['vcount'] = int(detail.get('totalvideocount'))
            # latest episode number is embedded in free text, e.g. "第12期"
            latest = detail.get('lastseries')
            m = re.compile('\D*(\d+)\D*').match(latest)
            if m:
                mediaItem['latest'] = m.group(1)
            if mediaItem['vcount'] == 1:
                mediaItem['latest'] = 1
            mediaItem['paid'] = detail.get('isvip')
            mediaItem['intro'] = detail.get('desc')
            mediaItem['poster_url'] = detail.get('image')
            mediaItem['site_id'] = self.site_id
            mediaItem['channel_id'] = self.channels_name_id[
                mediaItem['channel_id']]
            info_id = Util.md5hash(Util.summarize(mediaItem))
            mediaItem['info_id'] = info_id
            vcount = mediaItem['vcount']
            if not vcount:
                vcount = 1
            else:
                vcount = int(vcount)
            video_list = self.api_video_list(vid, vcount, prefix_video_url,
                                             mediaItem['channel_id'])
            if video_list:
                Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['video'] = video_list
                mediaVideoItem['media'] = mediaItem
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
        logging.log(logging.ERROR, vid)
    # BUG FIX: callers do items.append(self.api_media_info(...)); without a
    # return they were appending None
    return mediaVideoItem
def compose_mvitem(self, response, title_list, pers, dirs, play_url, cat_id,
                   poster_url, text):
    """Assemble a MediaVideoItem (media info + episode list) for a wasu page.

    Returns the item, or None when the url check fails or an exception
    occurs before the item is built.
    """
    # BUG FIX: mvitem must be bound before the try; previously an early
    # exception made the final ``return mvitem`` raise NameError
    mvitem = None
    try:
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0].strip()
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "wasu")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = mid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                if not res:
                    return None
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def video_parse(self, response):
    """Delegate media extraction for a pps play page.

    mediaItem is filled in place by pps_extract.media_extract; the item
    list (currently always empty) is returned as scrapy expects.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        pps_extract.media_extract(response, mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
    # BUG FIX: return the item list so the callback contract is honoured
    return items
def video_parse(self, response):
    """Scrape media info from a kankan play page and follow its media page.

    Several page layouts (vod / vip) are probed in turn; only the first
    extracted media url is followed.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # the vip host implies a paid title
        if prefix_url == self.vip_prefix_url:
            mediaItem['paid'] = '1'
        else:
            mediaItem['paid'] = '0'
        #http://vod.kankan.com/v/87/87998.shtml
        sels = response.xpath('//ul[@class="movieinfo"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//p[@id="movie_info_intro_l"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        #普通电影,电视剧,综艺,动漫
        sels = response.xpath('//div[@class="header_title"]')
        if sels:
            results = kankan_extract.media_extract(sels)
        else:
            #http://vip.kankan.com/vod/88365.html
            sels = response.xpath('//div[@class="movie_info"]')
            if sels:
                kankan_extract.media_info_extract(sels, mediaItem)
                results = kankan_extract.media_extract(sels)
            else:
                #http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01
                sels = response.xpath(
                    '//div[@class="aside"]//div[@class="intro"]')
                results = kankan_extract.media_extract(sels)
        for item in results:
            mediaItem['url'] = item['url']
            mediaVideoItem['media'] = mediaItem
            items.append(
                Request(url=item['url'],
                        callback=self.media_parse,
                        meta={'item': mediaVideoItem}))
            # deliberately follow only the first extracted media url
            break
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
    # BUG FIX: the follow-up requests were never returned
    return items
def media_extract(response):
    """Pick the first picture-list entry; returns [MediaItem] or []."""
    items = []
    try:
        #list播放页
        sels = response.xpath(
            './/div[@class="site-piclist_pic"]//a[@class="site-piclist_pic_link"]'
        )
        if sels:
            mediaItem = MediaItem()
            urls = sels.xpath('./@href').extract()
            poster_urls = sels.xpath('./img/@src').extract()
            if urls:
                mediaItem['url'] = urls[0].strip()
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0].strip()
            items.append(mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the result list was built but never returned
    return items
def media_extract(response):
    """One MediaItem per list entry; returns the list.

    Fixes: poster_urls was indexed without an emptiness check, raising
    IndexError on entries with no <img>; and the result list was never
    returned.
    """
    items = []
    try:
        #list列表页
        sels = response.xpath(
            './/li[@class="fl line" or normalize-space(@class)="fl"]')
        for sel in sels:
            mediaItem = MediaItem()
            urls = sel.xpath('./a/@href').extract()
            poster_urls = sel.xpath('./a/img/@src').extract()
            if urls:
                mediaItem['url'] = urls[0]
                # BUG FIX: guard the poster lookup; some entries lack an img
                if poster_urls:
                    mediaItem['poster_url'] = poster_urls[0]
                items.append(mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_extract(response):
    """Find the '更多详情' / '影片详情' link on a letv play page.

    Returns a list with at most one MediaItem carrying the link url.
    """
    items = []
    try:
        #播放页的详情部分
        #http://www.letv.com/ptv/vplay/20655099.html#vid=20061199
        #http://www.letv.com/ptv/vplay/22299495.html
        results = response.xpath('.//a[contains(text(), "%s")]/@href'
                                 % u'更多详情').extract()
        if not results:
            #http://www.letv.com/ptv/vplay/1609062.html
            results = response.xpath('.//a[contains(text(), "%s")]/@href'
                                     % u'影片详情').extract()
        if results:
            url = results[0]
            mediaItem = MediaItem()
            mediaItem['url'] = url
            items.append(mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the result list was built but never returned
    return items
def parse_video_item(self, response, cat_id, url, title, playlistId):
    # Build a MediaVideoItem whose 'video' list holds one entry per episode
    # anchor found on the page; for movies the single play url passed in by
    # the caller is used instead.
    # NOTE(review): inside the loop the parameter 'url' is rebound to the
    # per-episode href list from xpath, shadowing the string parameter —
    # kept as-is because the movie branch still reads the original value.
    #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title))
    videoitems = []
    ep_item = MediaItem()
    item = MediaVideoItem()
    item["media"] = ep_item
    item["video"] = videoitems
    try:
        if int(cat_id) != int(self.movie_id):
            # series / variety / anime: iterate the episode anchors
            ul_list = response.xpath(
                '//div[@class="episodes clearfix "]/a')
            if not ul_list:
                ul_list = response.xpath(
                    '//div[@class="episodes clearfix enc-episodes-detail"]/a'
                )
            for li in ul_list:
                url = li.xpath('./@href').extract()
                ttitle = li.xpath('./@title').extract()
                snum = li.xpath('./text()').extract()
                if snum:
                    play_num = self.get_play_num(snum[0])
                    # variety shows number episodes from the target page
                    if int(cat_id) == int(self.variety_id):
                        play_num = self.getvnum(self.url_prefix + url[0])
                    if not ttitle:
                        ttitle = [play_num]
                    vitem = self.compose_vitem([self.url_prefix + url[0]],
                                               title, play_num)
                    if 'url' in vitem:
                        videoitems.append(vitem)
        elif int(cat_id) == int(self.movie_id):
            # movie: a single play url, episode number fixed to 1
            if url:
                vitem = self.compose_vitem([url], title, 1)
                if 'url' in vitem:
                    videoitems.append(vitem)
        if videoitems:
            item["video"] = videoitems
            item["media"]["url"] = response.request.url
            Util.set_ext_id(item["media"], item["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return item
def media_extract(response):
    """One MediaItem per list entry; the 'url' actually points at the
    play page (noted in the original comment). Returns the list."""
    items = []
    try:
        #list列表页
        sels = response.xpath(
            './/div[@class="content"]//ul[@class="p-list-syd"]//li[@class="p-item"]'
        )
        for sel in sels:
            mediaItem = MediaItem()
            #实际为播放地址,这里暂放在mediaItem中
            urls = sel.xpath('./a/@href').extract()
            poster_urls = sel.xpath('./a/img/@src').extract()
            # NOTE(review): no .extract() here, so scores[0] below stores a
            # Selector object rather than the score text — confirm intent
            scores = sel.xpath('./div[@class="score"]')
            if urls:
                mediaItem['url'] = urls[0]
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0]
            if scores:
                mediaItem['score'] = scores[0]
            items.append(mediaItem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: the result list was built but never returned
    return items
def video_parse(self, response):
    """Find the media-page link on a movie play page (normal / vip /
    trailer layouts) and follow it with media_parse."""
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'video url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        #播放页 - 普通电影
        sels = response.xpath(
            '//div[@class="film-info clearfix"]//span[@class="summary"]/a/@href'
        )
        if not sels:
            #播放页 - vip电影
            sels = response.xpath(
                '//div[@class="f_song inner_resumeCon intro"]//div[@class="con"]/a/@href'
            )
        if not sels:
            #播放页 - 预告片电影
            sels = response.xpath(
                '//div[@class="related-film clear"]//a[@class="rel-film-img"]/@href'
            )
        if sels:
            url = sels.extract()[0]
            url = Util.get_absolute_url(url, prefix_url)
            mediaItem['url'] = url
            mediaVideoItem['media'] = mediaItem
            items.append(
                Request(url=url,
                        callback=self.media_parse,
                        meta={'item': mediaVideoItem}))
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'video url: %s' % request_url)
    # BUG FIX: the follow-up requests were never returned
    return items
def media_parse(self, response):
    """Parse a media page and kick off the api lookup for its video list.

    The play url (passed via meta['url']) supplies the numeric vid used
    to build the api template url.
    """
    items = []
    # bound before the try so the final log can never hit an unbound name
    media_url = response.request.url
    try:
        logging.log(logging.INFO, 'media url: %s' % media_url)
        mediaVideoItem = response.request.meta['item'] if 'item' in response.request.meta else MediaVideoItem()
        mediaItem = mediaVideoItem['media'] if 'media' in mediaVideoItem else MediaItem()
        # 媒体页获取媒体信息
        title_class = "v-info v-info-album "
        div_class = "v-meta v-meta-album"
        v_title = '//div[@class="%s"]//h1[@class="title"]/span/text()'
        title_list = response.xpath(v_title % title_class).extract()
        title = Util.join_list_safely(title_list)
        if title:
            mediaItem['title'] = title
        mediaItem = self.pack_media_info(response, mediaItem, title_class,
                                         div_class)
        mediaItem['url'] = Util.normalize_url(media_url, self.site_code)
        request_url = response.meta['url']
        request_url = Util.normalize_url(request_url, self.site_code)
        r = re.compile('.*/(\d+).html')
        m = r.match(request_url)
        if m:
            vid = m.group(1)
            # template url with the numeric id replaced by '%s'
            prefix_video_url = re.sub(vid, '%s', request_url)
            items.append(
                self.api_media_info(mediaVideoItem, vid, prefix_video_url))
        else:
            pass
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    # BUG FIX: log the always-bound media_url (request_url was undefined
    # when an exception fired early) and return the collected items
    logging.log(logging.INFO, 'media url: %s' % media_url)
    return items
def parse_episode_info(self, response):
    """Parse a qq.com media page into a MediaVideoItem.

    Each field is probed against several page layouts (rich intro page,
    plain info box, banner-only topic page); the play list is then
    resolved through the loadplaylist json api, falling back to topic-page
    parsing when no sourceid exists.
    Returns a single-element list with the item, or [] on failure.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        #title - four alternative layouts, tried in order
        title = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
        ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
            ).extract()
        if not title or not title[0]:
            title = response.xpath(
                '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
            ).extract()
        #performer
        #performer_list = response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[2]/div[1]/a/span/text()').extract()
        performer_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
        ).extract()
        if not performer_list:
            performer_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'主演:').extract()
        #director
        #director_list=response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[3]/div[1]/a/span/text()').extract()
        director_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
        ).extract()
        if not director_list:
            director_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                % u'导演:').extract()
        #text (intro)
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        if not text:
            # BUG FIX: the fallback intro was computed but never assigned
            text = response.xpath(
                '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
            ).extract()
        type_list = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
        ).extract()
        if not type_list:
            type_list = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'类型:').extract()
        year_info = response.xpath(
            '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
        ).extract()
        if not year_info:
            year_info = response.xpath(
                '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                % u'年份:').extract()
        play_date = None
        if year_info:
            play_date = self.get_year(year_info[0])
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        pers = Util.join_list_safely(performer_list)
        #sourceid - drives the playlist api below
        sourceid = ""
        sourceid_list = response.xpath(
            '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
        if sourceid_list:
            sourceid = sourceid_list[0]
        videoitems = []
        ep_item = MediaItem()
        if len(title) > 0:
            ep_item["title"] = title[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["url"] = Util.normalize_url(response.request.url, "qq")
        ep_item["poster_url"] = poster_url
        if len(text) > 0:
            ep_item["intro"] = text[0]
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        vurl = ""
        url_pre = "http://s.video.qq.com/loadplaylist?vkey="
        url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"
        videoid = self.get_qq_showid(response.request.url)
        #videoid = self.get_vid(response.body,response.request.url)
        mvitem["media"]["cont_id"] = videoid
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        vurl = url_pre + str(sourceid) + url_tail
        tflag = "jQuery191048201349820010364_1425370006500"
        tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
        #没有sourceid,比如专题页面
        if not tpitem:
            tpitem = self.parse_topic_play_list(response)
            videoids = response.xpath(
                '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
            ).extract()
            if videoids:
                mvitem["media"]["cont_id"] = videoids[0]
        if tpitem:
            mvitem["video"] = tpitem
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            if self.check_url(mvitem):
                items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def parse_episode_info(self, response):
    """Parse a baofeng media page into a MediaVideoItem.

    Trailer pages (title containing '预:') are skipped. Returns a list
    with at most one item.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        title_list = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
        ).extract()
        director_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        performer_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'主演:').extract()
        type_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'类型:').extract()
        district_list = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/a/text()'
            % u'地区:').extract()
        year_info = response.xpath(
            '//div[@class="info clearfix"]/span[text()="%s"]/text()'
            % u'地区:').extract()
        year = None
        if len(year_info) >= 2:
            year = self.get_year(year_info[1])
        #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        districts = Util.join_list_safely(district_list)
        #text
        text = response.xpath(
            '//div[@class="juqing briefTab"]/div/text()').extract()
        #score — BUG FIX: '[class="score-num"]' tested a child *element*
        # named 'class'; an attribute test needs '@class'
        score = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[@class="score-num"]/strong/text()'
        ).extract()
        play_url = ""
        tplay_url = response.xpath(
            '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
        ).extract()
        if tplay_url:
            play_url = self.url_prefix + tplay_url[0].strip()
        videoitems = []
        ep_item = MediaItem()
        if title_list:
            ep_item["title"] = title_list[0]
            if ep_item["title"].find(u'预:') >= 0:
                # trailer page — replaced the py2 ``print`` with the
                # logging used everywhere else in this module
                logging.log(logging.INFO,
                            u'预告片,url %s' % response.request.url)
                return items
        ep_item["actor"] = pers
        ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = districts
        if year:
            ep_item["release_date"] = Util.str2date(year)
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "baofeng")
        if len(text) > 0:
            ep_item["intro"] = text[0].strip()
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        videoid = self.getshowid(response.request.url)
        mvitem["media"]["cont_id"] = videoid
        ttvitem = {}
        if title_list:
            ttvitem = self.parse_video_item(response, cat_id, play_url,
                                            title_list, None)
        if ttvitem:
            if 'video' in ttvitem and len(ttvitem['video']) > 0:
                mvitem['video'] = ttvitem['video']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                res = self.check_url(mvitem)
                #if self.check_url(mvitem):
                if res:
                    items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_info_extract(response, mediaItem):
    """Extract iqiyi media info from script blocks / album json / info panes.

    Mutates (and returns) mediaItem; a fresh MediaItem is created when
    None is given. Non-'video' (trailer) script blocks abort extraction.

    NOTE(review): in the original source the outer ``try`` had no matching
    ``except`` (tail of the definition lost); the standard logging handler
    used by the sibling extractors is restored here.
    """
    try:
        if mediaItem == None:
            mediaItem = MediaItem()
        #普通媒体页 (ordinary media page)
        release_dates = response.xpath('./@data-qitancomment-tvyear').extract()
        if release_dates:
            release_dates = re.findall(r'[\d]+', release_dates[0])
            if release_dates:
                release_date = ''.join(release_dates)
                release_date = Util.str2date(release_date)
                mediaItem['release_date'] = release_date
        class_names = response.xpath('./@type').extract()
        if class_names and 'text/javascript' == class_names[0]:
            #视频类型 video:正片 trailer:片花
            regex_express = "vType[ ]?:[ ]?[']?(\w+)[']"
            match_result = response.re(regex_express)
            if match_result:
                vType = match_result[0]
                if vType.strip() != 'video':
                    # trailers are not media pages — nothing to extract
                    return mediaItem
            #默认采用的是sourceId (sourceId is preferred)
            cont_id = '0'
            regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
            match_result = response.re(regex_express)
            if match_result:
                cont_id = match_result[0]
            if cont_id == '0':
                #其他采用的是albumId (fall back to albumId)
                regex_express = 'albumId[ ]?:[ ]?["]?(\d+)'
                match_result = response.re(regex_express)
                if match_result:
                    cont_id = match_result[0]
                mediaItem['cont_id'] = '%s|album_id' % (cont_id)
            else:
                mediaItem['cont_id'] = '%s|source_id' % (cont_id)
            regex_express = 'cid[ ]?:[ ]?(\d+)'
            match_result = response.re(regex_express)
            if match_result:
                cid = match_result[0]
                mediaItem['channel_id'] = cid
            regex_express = 'title[ ]?:[ ]?\"(.*)\"'
            match_result = response.re(regex_express)
            if match_result:
                title = match_result[0]
                mediaItem['title'] = title
            #特殊剧集页:http://www.iqiyi.com/dianshiju/18jbj.html#vfrm=2-4-0-1
            regex_express = 'albumInfo[ ]?=[ ]?(\{.*\})'
            match_result = response.re(regex_express)
            if match_result:
                json_content = match_result[0]
                try:
                    json_data = json.loads(json_content)
                    cont_ids = '0'
                    cont_ids = json_data['sourceId']
                    if cont_ids != '0':
                        cont_ids = '%s|source_id' % (cont_ids)
                        mediaItem['cont_id'] = cont_ids
                    else:
                        cont_ids = json_data['albumId']
                        cont_ids = '%s|album_id' % (cont_ids)
                        mediaItem['cont_id'] = cont_ids
                    districts = json_data['areas']
                    types = json_data['types']
                    directors = json_data['directors']
                    actors = json_data['mainActors']
                    writers = json_data['writer']
                    titles = json_data['tvName']
                    poster_urls = json_data['tvPictureUrl']
                    vcounts = json_data['episodeCounts']
                    latests = json_data['currentMaxEpisode']
                    release_dates = json_data['issueTime']
                    intros = json_data['tvDesc']
                    # areas/types come as nested json maps; keep the values
                    if districts:
                        districts_json = json.loads(districts)
                        districts = districts_json.values()
                        mediaItem['district'] = Util.join_list_safely(districts)
                    if types:
                        types_json = json.loads(types)
                        types = types_json.values()
                        mediaItem['type'] = Util.join_list_safely(types)
                    mediaItem['director'] = Util.join_list_safely(directors)
                    mediaItem['actor'] = Util.join_list_safely(actors)
                    mediaItem['writer'] = Util.join_list_safely(writers)
                    mediaItem['title'] = titles
                    mediaItem['poster_url'] = poster_urls
                    mediaItem['vcount'] = vcounts
                    mediaItem['latest'] = latests
                    release_dates = str(release_dates)
                    release_date = Util.str2date(release_dates)
                    mediaItem['release_date'] = release_date
                    mediaItem['intro'] = intros
                except Exception as e:
                    logging.log(logging.ERROR, traceback.format_exc())
                    logging.log(logging.INFO,
                                '=================json_content=================')
                    logging.log(logging.INFO, json_content)
        #普通媒体页 - 媒体信息域
        # (1) http://www.iqiyi.com/a_19rrgjaiqh.html#vfrm=2-4-0-1
        # 集数的情况很复杂,这里不予考虑
        sels = response.xpath('.//div[@class="result_pic pr"]')
        if sels:
            poster_urls = sels.xpath('.//a/img/@src').extract()
            if poster_urls:
                mediaItem['poster_url'] = poster_urls[0]
        sels = response.xpath('.//div[@class="result_detail"]')
        if sels:
            titles = sels.xpath(
                './/h1[@class="main_title"]//a/text()').extract()
            scores = sels.xpath(
                './/div[@class="topic_item topic_item-rt"]//span[@class="score_font"]//span/text()'
            ).extract()
            scores = ''.join(scores)
            scores = re.findall(r'[\d.]+', scores)
            if titles:
                mediaItem['title'] = titles[0]
            if scores:
                try:
                    mediaItem['score'] = float(scores[0])
                except Exception as e:
                    # non-numeric score text — leave the field unset
                    pass
            msg_sels = sels.xpath('.//div[@class="topic_item clearfix"]')
            for msg_sel in msg_sels:
                msg_more_sels = msg_sel.xpath('./div')
                for sel in msg_more_sels:
                    labels = sel.xpath('.//em/text()').extract()
                    infos = sel.xpath('.//em/a/text()').extract()
                    iqiyi_extract.text_infos_resolve(labels, infos, mediaItem)
            intros = sels.xpath(
                './/div[@class="topic_item clearfix"]//span[@data-moreorless="moreinfo"]/span/text()'
            ).extract()
            if not intros:
                intros = sels.xpath(
                    './/div[@class="topic_item clearfix"]//span[@data-moreorless="lessinfo"]/span/text()'
                ).extract()
            if intros:
                mediaItem['intro'] = intros[0]
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mediaItem
def media_parse(self, response):
    """Parse a letv media (album) page: fill media metadata, then fetch the
    album's episode list via the channel-specific album APIs.

    Expects request.meta['item'] to carry a MediaVideoItem whose 'media'
    field holds a partially-filled MediaItem (channel_id and cont_id must
    already be set -- both are read below).

    NOTE(review): `items` is built but no `return items` is visible in this
    chunk -- confirm the full file returns/yields it to Scrapy.
    NOTE(review): if 'item' is missing from meta, mediaVideoItem is None and
    the `'media' in mediaVideoItem` test raises TypeError (swallowed by the
    broad except) -- confirm callers always pass 'item'.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else None
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # metadata lives in several page fragments; each extractor call
        # fills whatever fields it can find
        sels = response.xpath('//script[@type="text/javascript"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="play"]')
        letv_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//dl[@class="textInfo"]')
        if sels:
            # TV series / variety / anime layout
            letv_extract.media_info_extract(sels, mediaItem)
        else:
            # movie layout
            sels = response.xpath('//div[@class="detail"]')
            letv_extract.media_info_extract(sels, mediaItem)
        # collect the feature-video URLs, per channel type
        videoItems = []
        if u'电影' == mediaItem['channel_id']:
            # movie: a single album-API page is enough
            pagenum = 1
            videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                 pagenum)
            result = Util.get_url_content(videos_url)
            page_items = self.other_album_resolve(text=result,
                                                  meta={
                                                      'url': videos_url,
                                                      'pagenum': pagenum
                                                  })
            videoItems = page_items
        # variety show: one album-API call per (year, month) tab
        elif u'综艺' == mediaItem['channel_id']:
            sels = response.xpath(
                '//div[@class="listTab"]//div[@data-statectn="n_click"]')
            if sels:
                year_month_sels = sels.xpath('.//a')
                for year_month_sel in year_month_sels:
                    years = year_month_sel.xpath('./@list-year').extract()
                    months = year_month_sel.xpath(
                        './@list-month').extract()
                    year = None
                    month = None
                    if years:
                        year = years[0]
                    if months:
                        month = months[0]
                    if year and month:
                        videos_url = self.zongyi_album_api % (
                            year, month, mediaItem['cont_id'])
                        result = Util.get_url_content(videos_url)
                        videoItems = videoItems + self.zongyi_album_resolve(
                            text=result,
                            meta={
                                'url': videos_url,
                                'year': year,
                                'month': month
                            })
        elif mediaItem['channel_id'] in [u'电视剧', u'动漫']:
            # TV series / anime: page through the album API until an empty
            # page signals the end
            pagenum = 1
            while True:
                videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                     pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(text=result,
                                                      meta={
                                                          'url': videos_url,
                                                          'pagenum': pagenum
                                                      })
                if not page_items:
                    break
                videoItems = videoItems + page_items
                pagenum = pagenum + 1
        if videoItems:
            # set ext_id linking media to its videos
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'media url: %s' % request_url)
def list_json_parse(self, response):
    """Parse one page of letv's list JSON API.

    Emits one video_parse Request per entry and, while entries keep coming
    and page <= max_update_page, a follow-up Request for the next page of
    the same API (prefix + postfix + 'p=<page>').

    NOTE(review): `items` is built but no `return items` is visible in this
    chunk -- confirm the full file returns/yields it to Scrapy.
    """
    items = []
    try:
        origin_url = response.request.meta['url']
        request_url = response.request.url
        logging.log(logging.INFO, 'json api url: %s' % request_url)
        page = response.request.meta[
            'page'] if 'page' in response.request.meta else 1
        if page > self.max_update_page:
            return items
        channel_id = response.request.meta[
            'id'] if 'id' in response.request.meta else None
        list_json_postfix_url = response.request.meta[
            'postfix_url'] if 'postfix_url' in response.request.meta else None
        json_datas = json.loads(response.body)
        videos = []
        if json_datas:
            videos = json_datas[
                'data_list'] if 'data_list' in json_datas else []
        if videos:
            # a non-empty page implies there may still be a next page
            video_url = 'http://www.letv.com/ptv/vplay/%s.html'
            for item in videos:
                mediaVideoItem = MediaVideoItem()
                mediaItem = MediaItem()
                mediaItem['channel_id'] = channel_id
                if 'rating' in item and item['rating']:
                    mediaItem['score'] = item['rating']
                subCategoryName = item['subCategoryName']
                mediaItem['type'] = subCategoryName.replace(',', ';')
                mediaVideoItem['media'] = mediaItem
                # releaseDate arrives as epoch milliseconds; format it as
                # 'Y-M-D' before handing to Util.str2date
                release_date = item['releaseDate']
                if release_date:
                    release_date = float(release_date)
                    if release_date > 0:
                        release_date = release_date / 1000
                        release_date = time.localtime(release_date)
                        release_date = '%s-%s-%s' % (release_date.tm_year,
                                                     release_date.tm_mon,
                                                     release_date.tm_mday)
                        mediaItem['release_date'] = Util.str2date(
                            release_date)
                # prefer the first id from 'vids'; fall back to 'vid'
                vid = ''
                if 'vids' in item:
                    vids = item['vids']
                    vids = vids.split(',')
                    vid = vids[0]
                elif 'vid' in item:
                    vid = item['vid']
                if vid:
                    url = video_url % vid
                    items.append(
                        Request(url=url,
                                callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
            # next page
            page = page + 1
            url = self.list_json_prefix_url + list_json_postfix_url + 'p=%s' % page
            items.append(
                Request(url=url,
                        callback=self.list_json_parse,
                        meta={
                            'page': page,
                            'id': channel_id,
                            'postfix_url': list_json_postfix_url,
                            'url': url
                        }))
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'json api url: %s' % request_url)
    logging.log(logging.INFO, 'origin url: %s' % origin_url)
def parse_episode_info(self, response):
    """Parse a Youku media detail page into a single MediaVideoItem.

    meta inputs: 'cat_id' and 'poster_url' (required); 'untrack_id', 'sid',
    'mid' (optional pass-throughs attached to the emitted item).

    Returns a list containing one MediaVideoItem on success, or an empty
    list when the youku page id cannot be resolved or an error occurs.
    """
    items = []
    try:
        logging.log(logging.INFO,
                    'parse_episode_info: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = response.request.meta['poster_url']
        page_id = self.get_youku_pageid(response.request.url)
        if not page_id:
            # consistency fix: use the module-wide logging style instead of
            # the legacy/undefined `log.error`
            logging.log(logging.ERROR,
                        'miss content id: %s' % response.request.url)
            return items
        untrack_id = ""
        sid = ""
        mid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        if "mid" in response.request.meta:
            mid = response.request.meta['mid']
        title = self.parse_title(response, cat_id)
        performer_list = self.parse_actor(response)
        director_list = self.parse_director(response)
        district_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'地区:').extract()
        type_list = response.xpath(
            '//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'
            % u'类型:').extract()
        play_date = self.parse_play_date(response)
        total_num = self.parse_total_num(response)
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        types = Util.join_list_safely(type_list)
        # intro text
        text = response.xpath('//div[@class="detail"]/span/text()').extract()
        ep_item = MediaItem()
        if title:
            ep_item["title"] = title[0].strip()
        if pers:
            ep_item["actor"] = pers
        # BUG FIX: original tested `dirs > 0` -- in Python 2 a str always
        # compares greater than an int, so an empty director string was
        # stored unconditionally.  Test truthiness instead.
        if dirs:
            ep_item["director"] = dirs
        if types:
            ep_item["type"] = types
        if district_list:
            ep_item["district"] = district_list[0].strip()
        if play_date:
            ep_item["release_date"] = Util.str2date(play_date)
        if total_num:
            ep_item["vcount"] = total_num
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["url"] = Util.normalize_url(response.request.url, "youku")
        if text:
            ep_item["intro"] = text[0].strip()
        ep_item["cont_id"] = page_id
        ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))
        mvitem = MediaVideoItem()
        if mid:
            mvitem['mid'] = mid
        mvitem["media"] = ep_item
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # NOTE(review): ep_item["title"] raises KeyError (caught below) when
        # the page has no title -- confirm this abort-on-missing-title is
        # the intended behavior.
        video_list = self.parse_video_item(response, cat_id,
                                           ep_item["title"], page_id)
        mvitem['video'] = video_list
        Util.set_ext_id(mvitem["media"], mvitem["video"])
        items.append(mvitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return items
def media_parse(self, response):
    """Parse a kankan media (album) page: fill media metadata, then collect
    the episode/video list according to the channel type.

    Expects request.meta['item'] to carry a MediaVideoItem (a fresh one is
    created otherwise); on success appends the completed item to `items`.

    NOTE(review): `items` is built but no `return items` is visible in this
    chunk -- confirm the full file returns/yields it to Scrapy.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'media url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        mediaVideoItem = response.request.meta[
            'item'] if 'item' in response.request.meta else MediaVideoItem(
            )
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        # drop media whose play_type (from the inline
        # `movieInfo.play_type = '...'` javascript) is in skip_types
        sels = response.xpath('//head//script')
        if sels:
            regex_express = 'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
            match_result = sels.re(regex_express)
            if match_result:
                play_type = match_result[0]
                if play_type in self.skip_types:
                    return items
        # some URLs redirect, so store the URL that was actually fetched
        # http://movie.kankan.com/movie/88365 -> http://data.movie.kankan.com/movie/88365
        mediaItem['url'] = request_url
        sels = response.xpath('//head')
        kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//div[@class="info_list"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        sels = response.xpath('//ul[@class="detail_ul"]')
        if sels:
            kankan_extract.media_info_extract(sels, mediaItem)
        # collect the media's episode list, per channel type
        videoItems = []
        if u'综艺' == mediaItem['channel_id']:
            # variety show: episodes grouped in fenji_<n>_<m> containers
            sels = response.xpath(
                '//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel,
                                                      videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem,
                                            mediaItem['channel_id'])
                        videoItems.append(videoItem)
        elif u'电影' == mediaItem['channel_id']:
            # movie: single feature video taken from the "watch now" box
            videoItem = VideoItem()
            Util.copy_media_to_video(mediaItem, videoItem)
            sels = response.xpath('//div[@class="section clearfix s2"]')
            if sels:
                urls = sels.xpath(
                    './/a[starts-with(@class, "foc")]/@href').extract()
                thumb_urls = sels.xpath(
                    './/a[@class="foc"]/img/@src').extract()
                if urls:
                    url = urls[0]
                    url = Util.get_absolute_url(url, prefix_url)
                    videoItem['url'] = url
                if thumb_urls:
                    videoItem['thumb_url'] = thumb_urls[0]
            self.set_video_info(videoItem, mediaItem['channel_id'])
            videoItems.append(videoItem)
        else:
            # TV series
            sels = response.xpath(
                '//div[@id[re:test(., "fenji_[\d]+_asc")]]')
            if not sels:
                # anime / TV series variant using <ul> containers
                sels = response.xpath(
                    '//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
            for sel in sels:
                video_sels = sel.xpath('.//li')
                for video_sel in video_sels:
                    videoItem = VideoItem()
                    videoItem['intro'] = mediaItem['channel_id']
                    kankan_extract.video_info_extract(video_sel,
                                                      videoItem)
                    if 'url' in videoItem:
                        url = videoItem['url']
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                        self.set_video_info(videoItem,
                                            mediaItem['channel_id'])
                        videoItems.append(videoItem)
        if videoItems:
            # set ext_id linking media to its videos
            Util.set_ext_id(mediaItem, videoItems)
            self.set_media_info(mediaItem)
            mediaVideoItem['media'] = mediaItem
            mediaVideoItem['video'] = videoItems
            items.append(mediaVideoItem)
            #self.count = self.count + 1
            #logging.log(logging.INFO, 'count: %s' % str(self.count))
        else:
            logging.log(logging.INFO, '%s: no videos' % request_url)
    except Exception, e:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'media url: %s' % request_url)
def parse_episode_play(self, response):
    """Parse a QQ video play page into one MediaVideoItem.

    meta inputs: 'cat_id' (required), 'untrack_id'/'sid' (optional).
    For episodic channels (not movie_id) the per-episode album list is
    walked; for movies a single VideoItem is built from the page itself.

    Returns the MediaVideoItem, or None if an error occurs before it is
    constructed.
    """
    mvitem = None
    try:
        logging.log(logging.INFO,
                    'parse_episode_play: %s' % response.request.url)
        cat_id = response.request.meta['cat_id']
        poster_url = ""
        untrack_id = ""
        sid = ""
        if "untrack_id" in response.request.meta:
            untrack_id = response.request.meta['untrack_id']
        if "sid" in response.request.meta:
            sid = response.request.meta['sid']
        # title (two page layouts)
        title_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title'
        ).extract()
        if not title_list:
            title_list = response.xpath(
                '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
            ).extract()
        # performers
        performer_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
        ).extract()
        # director
        director_list = response.xpath(
            '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
            % u'导演:').extract()
        pers = Util.join_list_safely(performer_list)
        dirs = Util.join_list_safely(director_list)
        # intro text
        text = response.xpath(
            '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
        ).extract()
        ep_item = MediaItem()
        videoitems = []
        # not a film: walk the episode album list
        if int(cat_id) != int(self.movie_id):
            video_list = response.xpath(
                '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
            )
            for tvideo in video_list:
                lurl = tvideo.xpath('./a/@href').extract()
                lnum = tvideo.xpath('./a/span/text()').extract()
                if not (lnum and lurl):
                    continue
                vitem = VideoItem()
                vitem["vnum"] = lnum[0]
                surl = "http://film.qq.com" + lurl[0]
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                # BUG FIX: original used `if tv: ...` followed by a separate
                # `if cartoon: ... else: ...`, so the tv-channel URL was
                # always overwritten by the final else branch.  Use one
                # if/elif/else chain.
                # NOTE(review): cat_id is compared without int() conversion
                # here, unlike the movie check above -- confirm meta always
                # carries the same type as self.tv_id/self.cartoon_id.
                if cat_id == self.tv_id:
                    turl = Util.normalize_url(surl, "qq", "tv")
                elif cat_id == self.cartoon_id:
                    turl = Util.normalize_url(surl, "qq", "cartoon")
                else:
                    turl = Util.normalize_url(surl, "qq")
                if not turl:
                    continue
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["url"] = turl
                vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                videoitems.append(vitem)
        else:
            # movie: single feature video built from the play page URL
            vitem = VideoItem()
            if title_list:
                vitem["title"] = title_list[0]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            turl = Util.normalize_url(response.request.url, "qq")
            vitem["url"] = turl
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["cont_id"] = self.get_qq_showid(vitem["url"])
            videoitems.append(vitem)
        if len(title_list) > 0:
            ep_item["title"] = title_list[0]
        if len(pers) > 0:
            ep_item["actor"] = pers
        if len(dirs) > 0:
            ep_item["director"] = dirs
        if len(text) > 0:
            ep_item["intro"] = text[0]
        ep_item["site_id"] = self.site_id
        ep_item["channel_id"] = cat_id
        ep_item["poster_url"] = poster_url
        ep_item["cont_id"] = self.get_qq_showid(response.request.url)
        mvitem = MediaVideoItem()
        mvitem["media"] = ep_item
        mvitem["video"] = videoitems
        mvitem["media"]["url"] = Util.normalize_url(response.request.url,
                                                    "qq")
        if untrack_id:
            mvitem["untrack_id"] = untrack_id
        if sid:
            mvitem["sid"] = sid
        # (the original computed this hash twice; the duplicate discarded
        # call was removed)
        mvitem["media"]["info_id"] = Util.md5hash(
            Util.summarize(mvitem["media"]))
        Util.set_ext_id(mvitem["media"], mvitem["video"])
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return mvitem
def list_parse(self, response):
    """Kankan list-page callback.

    On a 'first' pass it locates the tab labelled u'最新' and re-enters
    itself on that tab's href.  Otherwise it walks the movie list, emitting
    one video_parse request per row, and follows the pager until
    max_update_page is exceeded.
    """
    items = []
    try:
        request_url = response.request.url
        logging.log(logging.INFO, 'url: %s' % request_url)
        prefix_url = Util.prefix_url_parse(request_url)
        meta = response.request.meta
        first = meta['first'] if 'first' in meta else False
        channel_id = meta['id'] if 'id' in meta else None
        if first:
            # find the tab whose label (spaces stripped) is u'最新'
            for anchor in response.xpath('//div[@class="tab_box"]//a'):
                labels = anchor.xpath('.//span/text()').extract()
                if not labels:
                    continue
                if labels[0].replace(' ', '') != u'最新':
                    continue
                tab_url = anchor.xpath('./@href').extract()[0]
                items.append(
                    Request(url=tab_url,
                            callback=self.list_parse,
                            meta={'id': channel_id}))
                break
        else:
            page = meta['page'] if 'page' in meta else 1
            if page > self.max_update_page:
                return items
            # one request per list row (only the first extracted entry)
            for row in response.xpath('//ul[@class="movielist"]/li'):
                for video in kankan_extract.video_extract(row):
                    mediaVideoItem = MediaVideoItem()
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = channel_id
                    kankan_extract.media_info_extract(row, mediaItem)
                    mediaVideoItem['media'] = mediaItem
                    items.append(
                        Request(url=video['url'],
                                callback=self.video_parse,
                                meta={'item': mediaVideoItem}))
                    break
            # follow the pager (only the first candidate link)
            pager_sels = response.xpath('//p[@class="list-pager-v2"]')
            page = page + 1
            for candidate in kankan_extract.next_page_extract(pager_sels):
                next_url = Util.get_absolute_url(candidate, prefix_url)
                items.append(
                    Request(url=next_url,
                            callback=self.list_parse,
                            meta={
                                'page': page,
                                'id': channel_id
                            }))
                break
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
    logging.log(logging.INFO, 'url: %s' % request_url)
def media_info_extract(response, mediaItem):
    """Extract pptv media metadata from a selector into ``mediaItem``.

    Handles three page shapes: the list-API card markup, the normal play
    page (#mainContent) and the VIP play page (div.ptxt).  For episodic
    channels the per-episode suffix is stripped from the title.

    NOTE(review): reading mediaItem['channel_id'] raises KeyError when the
    field is unset; the broad except below then aborts the remaining
    extraction -- confirm callers always set channel_id first.
    """
    try:
        # idiom fix: identity comparison with None (was `== None`)
        if mediaItem is None:
            mediaItem = MediaItem()
        # --- list api page ---
        results = response.xpath(
            './/p[@class="ui-pic"]//img/@data-src2').extract()
        if results:
            mediaItem['poster_url'] = results[0]
        results = response.xpath(
            './/p[@class="ui-txt"]//span[@class="main-tt"]/text()').extract()
        if results:
            mediaItem['title'] = results[0]
        results = response.xpath(
            './/p[@class="ui-txt"]//em/text()').extract()
        if results:
            mediaItem['score'] = results[0]
        # --- normal play page ---
        sel = response.xpath('.//script[@type="text/javascript"]')
        if sel:
            cont_ids = sel.re('\"id\"[ ]?:[ ]?(\d+)')
            if cont_ids:
                mediaItem['cont_id'] = cont_ids[0]
        sel = response.xpath('.//div[@id="mainContent"]')
        if sel:
            titles = sel.xpath('.//*[@class="tit"]/text()').extract()
            scores = sel.xpath(
                './/div[@id="scoremark"]//em[@class="score"]/text()'
            ).extract()
            intros = sel.xpath('.//p[@class="longinfo"]/text()').extract()
            if titles:
                title = titles[0].strip()
                match_result = None
                # movie: keep the full title
                if u'电影' == mediaItem['channel_id']:
                    match_result = None
                # variety show, e.g. 都来爱梦-20121215 or
                # 时尚健康-20150430-包贝尔分享包氏火锅哲学
                elif u'综艺' == mediaItem['channel_id']:
                    regex_express = r'(.+)-[\d]+[-].+'
                    regex_pattern = re.compile(regex_express)
                    match_result = regex_pattern.search(title)
                    if not match_result:
                        regex_express = r'(.+)-[\d]+'
                        regex_pattern = re.compile(regex_express)
                        match_result = regex_pattern.search(title)
                    if not match_result:
                        regex_express = u'(.+)[((]第[\d]+集[))]'
                        regex_pattern = re.compile(regex_express)
                        match_result = regex_pattern.search(title)
                # TV series / anime: strip the (第N集) suffix
                else:
                    regex_express = u'(.+)[((]第[\d]+集[))]'
                    regex_pattern = re.compile(regex_express)
                    match_result = regex_pattern.search(title)
                if match_result:
                    mediaItem['title'] = match_result.groups()[0]
                else:
                    mediaItem['title'] = title
            if scores:
                score = scores[0].strip()
                mediaItem['score'] = score
            if intros:
                intro = intros[0].strip()
                mediaItem['intro'] = intro
            # label/value rows (director, actor, ...)
            msg_sels = sel.xpath(
                './/div[@class="intro-content intro-short"]//li')
            # renamed loop variable (was `sel`, shadowing the outer selector)
            for msg_sel in msg_sels:
                labels = msg_sel.xpath('./span/text()').extract()
                infos = msg_sel.xpath('./a/text()').extract()
                if not infos:
                    infos = msg_sel.xpath('./text()').extract()
                pptv_extract.text_infos_resolve(labels, infos, mediaItem)
        # --- vip play page ---
        sel = response.xpath('.//script[@type="text/javascript"]')
        if sel:
            cont_ids = sel.re('vid[ ]?:[ ]?["]?(\d+)')
            if cont_ids:
                mediaItem['cont_id'] = cont_ids[0]
        sel = response.xpath('.//div[@class="ptxt"]')
        if sel:
            titles = sel.xpath('./*/@title').extract()
            intros = sel.xpath(
                './/span[@class="thenext"]/text()').extract()
            if titles:
                mediaItem['title'] = titles[0].strip()
            if intros:
                mediaItem['intro'] = intros[0].strip()
            msg_sels = sel.xpath('./p')
            for msg_sel in msg_sels:
                labels = msg_sel.xpath('./em/text()').extract()
                infos = msg_sel.xpath('.//tt/text()').extract()
                pptv_extract.text_infos_resolve(labels, infos, mediaItem)
    except Exception:
        logging.log(logging.ERROR, traceback.format_exc())
def parse_play_list(self, cat_id, url, flag, response):
    """Download a QQ playlist JSON(P) document and build VideoItems.

    `url` is the playlist endpoint; `flag` marks where the JSON payload
    starts inside the JSONP wrapper (payload runs from just after `flag`
    plus one separator character to the last-but-one character).
    `response` is unused but kept for signature compatibility.

    Returns a list of VideoItem (possibly empty).  The dead
    MediaVideoItem/MediaItem construction of the original (built but never
    returned) was removed.
    """
    videoitems = []
    try:
        try:
            info = self.httpdownload.get_data(url)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            return videoitems
        if not info or len(info) < 2:
            return videoitems
        # strip the JSONP wrapper around the JSON body
        bodylen = len(info) - 1
        index = info.find(flag) + len(flag) + 1
        jinfo = json.loads(info[index:bodylen])
        if "video_play_list" not in jinfo:
            return videoitems
        itemlist = jinfo["video_play_list"]["playlist"]
        for titem in itemlist:
            if "episode_number" not in titem:
                continue
            episode_number = titem["episode_number"]
            # skip empty numbers, trailers (预告) and non-qq hosts
            if not (episode_number and titem["title"].find(u"预告") < 0
                    and url.find("qq.com") >= 0):
                continue
            # episode numbers are sometimes non-numeric strings, e.g.
            # http://v.qq.com/detail/x/xk98t8hntls72f4.html -- keep only
            # purely numeric ones (str.replace instead of the deprecated
            # string.replace module function)
            tvnum = episode_number.replace("-", "")
            if re.findall(r'[\D]+', tvnum):
                continue
            vitem = VideoItem()
            vitem["title"] = titem["title"]
            vitem["vnum"] = tvnum
            vitem["os_id"] = self.os_id
            vitem["site_id"] = self.site_id
            if int(cat_id) == int(self.tv_id) or int(cat_id) == int(
                    self.cartoon_id):
                turl = Util.normalize_url(titem["url"], "qq", "tv")
            else:
                turl = Util.normalize_url(titem["url"], "qq")
            if not turl:
                continue
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["url"] = turl
            vitem["cont_id"] = self.get_qq_showid(vitem["url"])
            videoitems.append(vitem)
    except Exception as e:
        logging.log(logging.ERROR, traceback.format_exc())
    return videoitems