Example #1
 def media_extract(response):
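     # Collect kankan media-page links from the response and wrap each one in a MediaItem.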
     items = []
     try:
         results = response.xpath(
             './/a/@href[re:test(., "http://movie\.kankan\.com/movie/[\d]+/introduction")]'
         ).extract()
         if results:
             #http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01#7927921
             #http://vip.kankan.com/vod/88365.html#7306075
             url = results[0]
             regex_pattern = re.compile(
                 '(http://movie\.kankan\.com/movie/[\d]+)')
             match_results = regex_pattern.search(url)
             if match_results:
                 mediaItem = MediaItem()
                 mediaItem['url'] = match_results.groups()[0]
                 items.append(mediaItem)
         else:
             #http://vod.kankan.com/v/86/86897.shtml#9895815
             results = response.xpath(
                 './/a/@href[re:test(., "http://data\.movie\.kankan\.com/movie/[\d]+")]'
             ).extract()
             for item in results:
                 mediaItem = MediaItem()
                 mediaItem['url'] = item
                 items.append(mediaItem)
                 break
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #2
 def load_video_urls(self):
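     # Build crawl Requests from the incoming JSON command ('trig', 'assign' or 'test').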
     items = []
     try:
         if self.json_data:
             cmd = self.json_data['cmd'] if 'cmd' in self.json_data else None
             if cmd == 'trig':
                 stat = self.json_data[
                     'stat'] if 'stat' in self.json_data else None
                 res = self.mgr.get_untrack_url(self.site_code, stat)
                 for item in res:
                     mediaVideoItem = MediaVideoItem()
                     mediaVideoItem['sid'] = item['sid']
                     mediaVideoItem['untrack_id'] = item['untrack_id']
                     mediaItem = MediaItem()
                     mediaItem['channel_id'] = item['name']
                     mediaVideoItem['media'] = mediaItem
                     url = item['url']
                     items.append(
                         Request(url=url,
                                 callback=self.video_parse,
                                 meta={'item': mediaVideoItem}))
             elif cmd == 'assign':
                 tasks = self.json_data[
                     'task'] if 'task' in self.json_data else None
                 for task in tasks:
                     mediaVideoItem = MediaVideoItem()
                     mediaVideoItem[
                         'sid'] = task['sid'] if 'sid' in task else None
                     mediaVideoItem['untrack_id'] = task[
                         'untrack_id'] if 'untrack_id' in task else None
                     mediaItem = MediaItem()
                     mediaItem['channel_id'] = task['name']
                     mediaVideoItem['media'] = mediaItem
                     url = task['url']
                     items.append(
                         Request(url=url,
                                 callback=self.video_parse,
                                 meta={'item': mediaVideoItem}))
             elif cmd == 'test':
                 channel_id = self.json_data[
                     'id'] if 'id' in self.json_data else None
                 url = self.json_data[
                     'url'] if 'url' in self.json_data else None
                 if url and channel_id:
                     list_channel = self.mgr.get_channel_name(channel_id)
                     if list_channel:
                         list_channel = list_channel['name']
                         items.append(
                             Request(url=url,
                                     callback=self.list_parse,
                                     meta={
                                         'first': False,
                                         'id': list_channel
                                     }))
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #3
 def media_info_extract(response, mediaItem):
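     # Fill mediaItem with the title, cast/crew and score scraped from the media page.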
     try:
         if mediaItem is None:
             mediaItem = MediaItem()
         #Media page
         sels = response.xpath('.//div[@class="laMoCont"]')
         if sels:
             name_sels = sels.xpath('.//div[@class="laMovName"]')
             titles = name_sels.xpath(
                 './/a[@class="laGrayS_f"]/text()').extract()
             if titles:
                 mediaItem['title'] = titles[0]
             property_sels = sels.xpath(
                 './/ol[@class="movStaff line_BSld"]//li')
             ignore = True
             for sel in property_sels:
                 label_sels = sel.xpath('.//strong')
                 info_sels = sel.xpath('.//a')
                 dy1905_extract.text_infos_resolve(label_sels, info_sels,
                                                   mediaItem, ignore)
         scores = response.xpath(
             './/div[@class="laMoOther"]//div[@class="rating-dt"]//span[@class="score"]/text()'
         ).extract()
         if scores:
             scores = re.findall(r'[\d.]+', scores[0])
             if scores:
                 mediaItem['score'] = scores[0]
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
Example #4
 def media_more_info_resolve(text, mediaItem):
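     # Parse a raw HTML fragment for extra media details (synopsis, cast and crew) into mediaItem.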
     try:
         try:
             response = Selector(text=text)
         except Exception as e:
             logging.log(logging.INFO,
                         'text to be parsed is not xml or html')
             logging.log(logging.ERROR, traceback.format_exc())
         if mediaItem is None:
             mediaItem = MediaItem()
         #Synopsis page
         intros = response.xpath(
             './/div[@class="conTABLE mt10"]//div[@class="w100d line_Slx pt15 dlP"]/text()'
         ).extract()
         if intros:
             mediaItem['intro'] = intros[0].strip()
         #Cast and crew page
         label_sels = response.xpath(
             './/div[@class="conTABLE mt10"]//*[@class="now pr05 fb"]')
         info_sels = response.xpath(
             './/div[@class="conTABLE mt10"]//*[@class="laGrayQdd_f pt12 line_Sbotlx pb15"]'
         )
         index = 0
         size = len(info_sels)
         for label_sel in label_sels:
             if index < size:
                 info_sel = info_sels[index].xpath(
                     './/a[@class="laBlueS_f" or @class="laBlueS_f fl"]')
                 dy1905_extract.text_infos_resolve(label_sel, info_sel,
                                                   mediaItem)
                 index = index + 1
             else:
                 break
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
Example #5
 def media_extract(response):
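     # Collect hunantv play-page links (http://www.hunantv.com/v/...) and wrap them in MediaItem objects.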
     items = []
     try:
         results = response.xpath('.//a/@href[re:test(., "http://www\.hunantv\.com/v/[\d]+/[\d]+")]').extract()
         for item in results:
             mediaItem = MediaItem()
             mediaItem['url'] = item 
             items.append(mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #6
    def play_parse(self, response):
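        # Parse a play page: follow the media-page link when present, otherwise scrape media info from the play page itself.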
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'play url: %s' % request_url)
            mediaVideoItem = response.request.meta[
                'item'] if 'item' in response.request.meta else MediaVideoItem(
                )

            route_url_list = response.xpath(
                '//div[@class="play-content"]//div[@class="v-panel-route"]/a/@href'
            ).extract()
            media_url = ''
            if route_url_list:
                media_url = route_url_list[-1]
            if media_url:
                # There is a media-page url: crawl media info from the media page
                items.append(
                    Request(url=media_url,
                            callback=self.media_parse,
                            meta={
                                'url': request_url,
                                'item': mediaVideoItem
                            }))
            else:
                # Movies have no media page: scrape media info from the play page itself
                mediaItem = mediaVideoItem[
                    'media'] if 'media' in mediaVideoItem else MediaItem()
                title_class = "v-info v-info-film e-follow"
                div_class = "v-meta v-meta-film"
                v_title = '//div[@class="%s"]//h1[@class="title"]/text()'
                title_list = response.xpath(v_title % title_class).extract()
                title = Util.join_list_safely(title_list)
                if title:
                    mediaItem['title'] = title
                mediaItem = self.pack_media_info(response, mediaItem,
                                                 title_class, div_class)
                # No media page, so the play url doubles as the media url
                mediaItem['url'] = Util.normalize_url(request_url,
                                                      self.site_code)
                mediaVideoItem['media'] = mediaItem
                r = re.compile('.*/(\d+).html')
                m = r.match(mediaItem['url'])
                if m:
                    vid = m.group(1)
                    prefix_video_url = re.sub(vid, '%s', mediaItem['url'])
                    items.append(
                        self.api_media_info(mediaVideoItem, vid,
                                            prefix_video_url))
                else:
                    items.append(mediaVideoItem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'play url: %s' % request_url)
        return items
Example #7
    def video_parse(self, response):
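        # Parse a letv video play page, locate the media-page url and fall back to the API when it is missing.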
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'video url: %s' % request_url)
            prefix_url = Util.prefix_url_parse(request_url)
            mediaVideoItem = response.request.meta[
                'item'] if 'item' in response.request.meta else MediaVideoItem(
                )
            mediaItem = mediaVideoItem[
                'media'] if 'media' in mediaVideoItem else MediaItem()
            sels = response.xpath('//script[@type="text/javascript"]')
            letv_extract.media_info_extract(sels, mediaItem)

            sels = None
            if not sels:
                #Detail: TV series, variety shows, anime
                sels = response.xpath(
                    '//div[@data-statectn="play_info"]//ul[@class="intro_box"]'
                )
            if not sels:
                #Info: regular films, anime
                sels = response.xpath(
                    '//div[@data-statectn="newplay_info"]//ul[@class="info_list"]'
                )
            if not sels:
                #Paid films
                sels = response.xpath(
                    '//div[@class="Player"]//span[@class="video_info"]')

            if sels:
                results = letv_extract.media_extract(sels)
                if results:
                    item = results[0]
                    url = Util.get_absolute_url(item['url'], prefix_url)
                    mediaItem['url'] = url
                    mediaVideoItem['media'] = mediaItem
                    items.append(
                        Request(url=url,
                                callback=self.media_parse,
                                meta={'item': mediaVideoItem}))

            if not items:
                #No media-page url found on the play page; try crawling via the API directly
                if 'cont_id' in mediaItem:
                    self.api_parse(mediaVideoItem)
                else:
                    logging.log(logging.INFO,
                                '该视频播放页找不到媒体页地址,也无法直接采用接口: %s' % request_url)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'video url: %s' % request_url)
        return items
Example #8
 def list_html_parse(self, response):
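     # Parse one page of a pptv list: emit a video-parse Request per entry plus a Request for the next page.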
     items = []
     try:
         request_url = response.request.url
         logging.log(logging.INFO, 'list html url: %s' % request_url)
         page = response.request.meta[
             'page'] if 'page' in response.request.meta else 1
         if page > self.max_update_page:
             return items
         channel_id = response.request.meta[
             'id'] if 'id' in response.request.meta else None
         postfix_url = response.request.meta[
             'postfix_url'] if 'postfix_url' in response.request.meta else None
         if u'电影' == channel_id:
             '''
                 is_hj: flag for compilation links, which the crawler currently discards
                 is_virtual: whether the item actually exists on this site
             '''
             sels = response.xpath(
                 '//a[@class="ui-list-ct" and @is_hj="0" and @is_virtual="0"]'
             )
         else:
             sels = response.xpath(
                 '//a[@class="ui-list-ct" and @is_virtual="0"]')
         if sels:
             #A non-empty result means there is still a next page
             for sel in sels:
                 mediaVideoItem = MediaVideoItem()
                 mediaItem = MediaItem()
                 mediaItem['channel_id'] = channel_id
                 urls = sel.xpath('./@href').extract()
                 mediaItem['url'] = urls[0]
                 pptv_extract.media_info_extract(sel, mediaItem)
                 mediaVideoItem['media'] = mediaItem
                 items.append(
                     Request(url=mediaItem['url'],
                             callback=self.video_parse,
                             meta={'item': mediaVideoItem}))
             #Next page
             page = page + 1
             url = self.list_prefix_url + '?' + postfix_url + '&page=%s' % page
             items.append(
                 Request(url=url,
                         callback=self.list_html_parse,
                         meta={
                             'page': page,
                             'id': channel_id,
                             'postfix_url': postfix_url
                         }))
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #9
    def api_media_info(self, mediaVideoItem, vid, prefix_video_url):
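        # Fill mediaItem from the media-info API for the given vid and attach the corresponding video list.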
        mediaItem = mediaVideoItem[
            'media'] if 'media' in mediaVideoItem else MediaItem()
        try:
            miu = self.media_info_url % vid
            jdata = self.httpdownload.get_data(miu)
            if not jdata:
                pass
            else:
                ddata = json.loads(jdata)
                assert int(ddata.get('code', 202)) == 200, "接口获取媒体信息失败"
                detail = ddata.get('data').get('detail')
                assert type(detail) == dict
                mediaItem['cont_id'] = str(detail.get('collectionId'))
                mediaItem['title'] = detail.get('collectionName')
                mediaItem['director'] = Util.join_list_safely(
                    detail.get('director').split('/'))
                mediaItem['actor'] = Util.join_list_safely(
                    detail.get('player').split('/'))
                mediaItem['release_date'] = Util.str2date(
                    detail.get('publishTime'))
                mediaItem['vcount'] = int(detail.get('totalvideocount'))
                latest = detail.get('lastseries')
                m = re.compile('\D*(\d+)\D*').match(latest)
                if m:
                    mediaItem['latest'] = m.group(1)
                if mediaItem['vcount'] == 1:
                    mediaItem['latest'] = 1
                mediaItem['paid'] = detail.get('isvip')
                mediaItem['intro'] = detail.get('desc')
                mediaItem['poster_url'] = detail.get('image')
                mediaItem['site_id'] = self.site_id
                mediaItem['channel_id'] = self.channels_name_id[
                    mediaItem['channel_id']]
                info_id = Util.md5hash(Util.summarize(mediaItem))
                mediaItem['info_id'] = info_id

                vcount = mediaItem['vcount']
                if not vcount:
                    vcount = 1
                else:
                    vcount = int(vcount)
                video_list = self.api_video_list(vid, vcount, prefix_video_url,
                                                 mediaItem['channel_id'])
                if video_list:
                    Util.set_ext_id(mediaItem, video_list)
                mediaVideoItem['video'] = video_list
                mediaVideoItem['media'] = mediaItem
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.ERROR, vid)
        return mediaVideoItem
Example #10
    def compose_mvitem(self, response, title_list, pers, dirs, play_url,
                       cat_id, poster_url, text):
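        # Assemble a MediaVideoItem (media info plus episode videos) for a wasu page.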
        try:
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            videoitems = []
            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0].strip()
            ep_item["actor"] = pers
            ep_item["director"] = dirs

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url, "wasu")

            if len(text) > 0:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item

            mid = self.getshowid(response.request.url)
            mvitem["media"]["cont_id"] = mid
            ttvitem = {}
            if title_list:
                ttvitem = self.parse_video_item(response, cat_id, play_url,
                                                title_list, None)
            if ttvitem:
                if 'video' in ttvitem and len(ttvitem['video']) > 0:
                    mvitem['video'] = ttvitem['video']
                    mvitem["media"]["info_id"] = Util.md5hash(
                        Util.summarize(mvitem["media"]))
                    Util.set_ext_id(mvitem["media"], mvitem["video"])
                    if untrack_id and sid:
                        mvitem["untrack_id"] = untrack_id
                        mvitem["sid"] = sid
                    res = self.check_url(mvitem)
                    if not res:
                        return None
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return mvitem
Example #11
 def video_parse(self, response):
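     # Parse a pps video play page and delegate media extraction to pps_extract.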
     items = []
     try:
         request_url = response.request.url
         logging.log(logging.INFO, 'video url: %s' % request_url)
         prefix_url = Util.prefix_url_parse(request_url)
         mediaVideoItem = response.request.meta[
             'item'] if 'item' in response.request.meta else MediaVideoItem(
             )
         mediaItem = mediaVideoItem[
             'media'] if 'media' in mediaVideoItem else MediaItem()
         pps_extract.media_extract(response, mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
         logging.log(logging.INFO, 'video url: %s' % request_url)
     return items
Example #12
 def video_parse(self, response):
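     # Parse a kankan play page: mark it paid or free, scrape media info and follow the media-page link.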
     items = []
     try:
         request_url = response.request.url
         logging.log(logging.INFO, 'video url: %s' % request_url)
         prefix_url = Util.prefix_url_parse(request_url)
         mediaVideoItem = response.request.meta[
             'item'] if 'item' in response.request.meta else MediaVideoItem(
             )
         mediaItem = mediaVideoItem[
             'media'] if 'media' in mediaVideoItem else MediaItem()
         if prefix_url == self.vip_prefix_url:
             mediaItem['paid'] = '1'
         else:
             mediaItem['paid'] = '0'
         #http://vod.kankan.com/v/87/87998.shtml
         sels = response.xpath('//ul[@class="movieinfo"]')
         if sels:
             kankan_extract.media_info_extract(sels, mediaItem)
         sels = response.xpath('//p[@id="movie_info_intro_l"]')
         if sels:
             kankan_extract.media_info_extract(sels, mediaItem)
         #Regular films, TV series, variety shows, anime
         sels = response.xpath('//div[@class="header_title"]')
         if sels:
             results = kankan_extract.media_extract(sels)
         else:
             #http://vip.kankan.com/vod/88365.html
             sels = response.xpath('//div[@class="movie_info"]')
             if sels:
                 kankan_extract.media_info_extract(sels, mediaItem)
                 results = kankan_extract.media_extract(sels)
             else:
                 #http://vip.kankan.com/vod/88169.html?fref=kk_search_sort_01
                 sels = response.xpath(
                     '//div[@class="aside"]//div[@class="intro"]')
                 results = kankan_extract.media_extract(sels)
         for item in results:
             mediaItem['url'] = item['url']
             mediaVideoItem['media'] = mediaItem
             items.append(
                 Request(url=item['url'],
                         callback=self.media_parse,
                         meta={'item': mediaVideoItem}))
             break
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
         logging.log(logging.INFO, 'video url: %s' % request_url)
     return items
Example #13
 def media_extract(response):
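     # Extract the first media url and poster image from the piclist block of the page.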
     items = []
     try:
         #List play page
         sels = response.xpath('.//div[@class="site-piclist_pic"]//a[@class="site-piclist_pic_link"]')
         if sels:
             mediaItem = MediaItem()
             urls = sels.xpath('./@href').extract()
             poster_urls = sels.xpath('./img/@src').extract()
             if urls:
                 mediaItem['url'] = urls[0].strip()
             if poster_urls:
                 mediaItem['poster_url'] = poster_urls[0].strip()
             items.append(mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #14
 def media_extract(response):
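     # Extract the url and poster image of every entry on a list page.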
     items = []
     try:
         #List page
         sels = response.xpath(
             './/li[@class="fl line" or normalize-space(@class)="fl"]')
         for sel in sels:
             mediaItem = MediaItem()
             urls = sel.xpath('./a/@href').extract()
             poster_urls = sel.xpath('./a/img/@src').extract()
             if urls:
                 mediaItem['url'] = urls[0]
             if poster_urls:
                 mediaItem['poster_url'] = poster_urls[0]
             items.append(mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #15
 def media_extract(response):
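     # Follow the "更多详情"/"影片详情" (details) link from a letv play page to its media page.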
     items = []
     try:
         #Details section of the play page
         #http://www.letv.com/ptv/vplay/20655099.html#vid=20061199
         #http://www.letv.com/ptv/vplay/22299495.html
         results = response.xpath('.//a[contains(text(), "%s")]/@href' %
                                  u'更多详情').extract()
         if not results:
             #http://www.letv.com/ptv/vplay/1609062.html
             results = response.xpath('.//a[contains(text(), "%s")]/@href' %
                                      u'影片详情').extract()
         if results:
             url = results[0]
             mediaItem = MediaItem()
             mediaItem['url'] = url
             items.append(mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #16
    def parse_video_item(self, response, cat_id, url, title, playlistId):
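        # Build the episode list for a series (or a single video for a movie) into a MediaVideoItem.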
        #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title))
        videoitems = []
        ep_item = MediaItem()
        item = MediaVideoItem()
        item["media"] = ep_item
        item["video"] = videoitems
        try:
            if int(cat_id) != int(self.movie_id):
                ul_list = response.xpath(
                    '//div[@class="episodes clearfix "]/a')
                if not ul_list:
                    ul_list = response.xpath(
                        '//div[@class="episodes clearfix enc-episodes-detail"]/a'
                    )
                for li in ul_list:
                    url = li.xpath('./@href').extract()
                    ttitle = li.xpath('./@title').extract()
                    snum = li.xpath('./text()').extract()
                    if snum:
                        play_num = self.get_play_num(snum[0])
                    if int(cat_id) == int(self.variety_id):
                        play_num = self.getvnum(self.url_prefix + url[0])
                    if not ttitle:
                        ttitle = [play_num]
                    vitem = self.compose_vitem([self.url_prefix + url[0]],
                                               title, play_num)
                    if 'url' in vitem:
                        videoitems.append(vitem)
            elif int(cat_id) == int(self.movie_id):
                if url:
                    vitem = self.compose_vitem([url], title, 1)
                    if 'url' in vitem:
                        videoitems.append(vitem)
            if videoitems:
                item["video"] = videoitems
                item["media"]["url"] = response.request.url
                Util.set_ext_id(item["media"], item["video"])
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return item
Example #17
 def media_extract(response):
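     # Extract the url, poster and score of every entry on a list page.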
     items = []
     try:
         #List page
         sels = response.xpath(
             './/div[@class="content"]//ul[@class="p-list-syd"]//li[@class="p-item"]'
         )
         for sel in sels:
             mediaItem = MediaItem()
             #This is actually a play url; it is stored in mediaItem for now
             urls = sel.xpath('./a/@href').extract()
             poster_urls = sel.xpath('./a/img/@src').extract()
             scores = sel.xpath('./div[@class="score"]')
             if urls:
                 mediaItem['url'] = urls[0]
             if poster_urls:
                 mediaItem['poster_url'] = poster_urls[0]
             if scores:
                 mediaItem['score'] = scores[0]
             items.append(mediaItem)
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
     return items
Example #18
 def video_parse(self, response):
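     # Parse a movie play page (regular, VIP or trailer layout) and follow the media-page link.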
     items = []
     try:
         request_url = response.request.url
         logging.log(logging.INFO, 'video url: %s' % request_url)
         prefix_url = Util.prefix_url_parse(request_url)
         mediaVideoItem = response.request.meta[
             'item'] if 'item' in response.request.meta else MediaVideoItem(
             )
         mediaItem = mediaVideoItem[
             'media'] if 'media' in mediaVideoItem else MediaItem()
         #Play page - regular film
         sels = response.xpath(
             '//div[@class="film-info clearfix"]//span[@class="summary"]/a/@href'
         )
         if not sels:
             #Play page - VIP film
             sels = response.xpath(
                 '//div[@class="f_song inner_resumeCon intro"]//div[@class="con"]/a/@href'
             )
         if not sels:
             #Play page - trailer
             sels = response.xpath(
                 '//div[@class="related-film clear"]//a[@class="rel-film-img"]/@href'
             )
         if sels:
             url = sels.extract()[0]
             url = Util.get_absolute_url(url, prefix_url)
             mediaItem['url'] = url
             mediaVideoItem['media'] = mediaItem
             items.append(
                 Request(url=url,
                         callback=self.media_parse,
                         meta={'item': mediaVideoItem}))
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
         logging.log(logging.INFO, 'video url: %s' % request_url)
     return items
Example #19
    def media_parse(self, response):
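        # Parse a media (album) page, pack the media info and hand off to the media-info API.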
        items = []
        try:
            media_url = response.request.url
            logging.log(logging.INFO, 'media url: %s' % media_url)

            mediaVideoItem = response.request.meta[
                'item'] if 'item' in response.request.meta else MediaVideoItem(
                )
            mediaItem = mediaVideoItem[
                'media'] if 'media' in mediaVideoItem else MediaItem()
            # Scrape media info from the media page
            title_class = "v-info v-info-album "
            div_class = "v-meta v-meta-album"
            v_title = '//div[@class="%s"]//h1[@class="title"]/span/text()'
            title_list = response.xpath(v_title % title_class).extract()
            title = Util.join_list_safely(title_list)
            if title:
                mediaItem['title'] = title
            mediaItem = self.pack_media_info(response, mediaItem, title_class,
                                             div_class)
            mediaItem['url'] = Util.normalize_url(media_url, self.site_code)
            request_url = response.meta['url']
            request_url = Util.normalize_url(request_url, self.site_code)
            r = re.compile('.*/(\d+).html')
            m = r.match(request_url)
            if m:
                vid = m.group(1)
                prefix_video_url = re.sub(vid, '%s', request_url)
                items.append(
                    self.api_media_info(mediaVideoItem, vid, prefix_video_url))
            else:
                pass
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'media url: %s' % media_url)
        return items
Example #20
    def parse_episode_info(self, response):
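        # Parse a qq media page: scrape title, cast, director, type and intro, then load the play list.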
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            #title
            title = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
            ).extract()
            if not title or not title[0]:
                title = response.xpath(
                    '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
                ).extract()
                if not title or not title[0]:
                    title = response.xpath(
                        '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
                    ).extract()
                    if not title or not title[0]:
                        title = response.xpath(
                            '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
                        ).extract()
            #performer
            #performer_list = response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[2]/div[1]/a/span/text()').extract()
            performer_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
            ).extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'主演:').extract()
            #director
            #director_list=response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[3]/div[1]/a/span/text()').extract()
            director_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
            ).extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'导演:').extract()
            #text
            text = response.xpath(
                '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
            ).extract()
            if not text:
                text = response.xpath(
                    '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
                ).extract()
            type_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
            ).extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'类型:').extract()
            year_info = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
            ).extract()
            if not year_info:
                year_info = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'年份:').extract()
            play_date = None
            if year_info:
                play_date = self.get_year(year_info[0])

            #
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            pers = Util.join_list_safely(performer_list)

            #sourceid
            sourceid = ""
            sourceid_list = response.xpath(
                '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
            if sourceid_list:
                sourceid = sourceid_list[0]

            videoitems = []

            ep_item = MediaItem()

            if len(title) > 0:
                ep_item["title"] = title[0]
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["url"] = Util.normalize_url(response.request.url, "qq")
            ep_item["poster_url"] = poster_url

            if len(text) > 0:
                ep_item["intro"] = text[0]

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems

            vurl = ""
            url_pre = "http://s.video.qq.com/loadplaylist?vkey="
            url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"

            videoid = self.get_qq_showid(response.request.url)
            #videoid = self.get_vid(response.body,response.request.url)
            mvitem["media"]["cont_id"] = videoid
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            vurl = url_pre + str(sourceid) + url_tail

            tflag = "jQuery191048201349820010364_1425370006500"
            tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
            #No sourceid (e.g. topic pages)
            if not tpitem:
                tpitem = self.parse_topic_play_list(response)
                videoids = response.xpath(
                    '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
                ).extract()
                if videoids:
                    mvitem["media"]["cont_id"] = videoids[0]
            if tpitem:
                mvitem["video"] = tpitem
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id:
                    mvitem["untrack_id"] = untrack_id
                if sid:
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Example #21
    def parse_episode_info(self, response):
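        # Parse a baofeng media page: scrape the media details, skip trailers and attach the episode list.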
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            year_list = []
            lyears = []

            title_list = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
            ).extract()
            director_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'导演:').extract()
            performer_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'主演:').extract()
            type_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'类型:').extract()
            district_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'地区:').extract()
            year_info = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/text()' %
                u'地区:').extract()
            year = None
            if len(year_info) >= 2:
                year = self.get_year(year_info[1])

            #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            districts = Util.join_list_safely(district_list)

            #text
            text = response.xpath(
                '//div[@class="juqing briefTab"]/div/text()').extract()
            #score
            score = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[class="score-num"]/strong/text()'
            ).extract()

            play_url = ""
            tplay_url = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
            ).extract()
            if tplay_url:
                play_url = self.url_prefix + tplay_url[0].strip()
            videoitems = []

            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0]
                if ep_item["title"].find(u'预:') >= 0:
                    print "预告片,url", response.request.url
                    return items
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = districts
            if year:
                ep_item["release_date"] = Util.str2date(year)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,
                                                "baofeng")

            if len(text) > 0:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item

            vurl = ""

            videoid = self.getshowid(response.request.url)
            mvitem["media"]["cont_id"] = videoid
            ttvitem = {}
            if title_list:
                ttvitem = self.parse_video_item(response, cat_id, play_url,
                                                title_list, None)
            if ttvitem:
                if 'video' in ttvitem and len(ttvitem['video']) > 0:
                    mvitem['video'] = ttvitem['video']
                    mvitem["media"]["info_id"] = Util.md5hash(
                        Util.summarize(mvitem["media"]))
                    Util.set_ext_id(mvitem["media"], mvitem["video"])
                    if untrack_id and sid:
                        mvitem["untrack_id"] = untrack_id
                        mvitem["sid"] = sid
                    res = self.check_url(mvitem)
                    #if self.check_url(mvitem):
                    if res:
                        items.append(mvitem)
                        pass
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
Example #22
 def media_info_extract(response, mediaItem):
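     # Fill mediaItem from an iqiyi media page: content ids from inline scripts plus title, score and other details.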
     try:
         if mediaItem is None:
             mediaItem = MediaItem()
         #Regular media page
         release_dates = response.xpath('./@data-qitancomment-tvyear').extract()
         if release_dates:
             release_dates = re.findall(r'[\d]+', release_dates[0]) 
             if release_dates:
                 release_date = ''.join(release_dates)
                 release_date = Util.str2date(release_date)
                 mediaItem['release_date'] = release_date 
         class_names = response.xpath('./@type').extract()
         if class_names and 'text/javascript' == class_names[0]:
             #Video type: 'video' = feature, 'trailer' = trailer clip
             regex_express = "vType[ ]?:[ ]?[']?(\w+)[']"
             match_result = response.re(regex_express)
             if match_result:
                 vType = match_result[0]
                 if vType.strip() != 'video':
                     return
             #sourceId is used by default
             cont_id = '0'
             regex_express = 'sourceId[ ]?:[ ]?["]?(\d+)'
             match_result = response.re(regex_express)
             if match_result:
                 cont_id = match_result[0]
             if cont_id == '0':
                 #Otherwise albumId is used
                 regex_express = 'albumId[ ]?:[ ]?["]?(\d+)'
                 match_result = response.re(regex_express)
                 if match_result:
                     cont_id = match_result[0]
                     mediaItem['cont_id'] = '%s|album_id' % (cont_id)                    
             else:
                 mediaItem['cont_id'] = '%s|source_id' % (cont_id)                    
             regex_express = 'cid[ ]?:[ ]?(\d+)'
             match_result = response.re(regex_express)
             if match_result:
                 cid = match_result[0]
                 mediaItem['channel_id'] = cid                    
             regex_express = 'title[ ]?:[ ]?\"(.*)\"'
             match_result = response.re(regex_express)
             if match_result:
                 title = match_result[0]
                 mediaItem['title'] = title
             #Special series page: http://www.iqiyi.com/dianshiju/18jbj.html#vfrm=2-4-0-1
             regex_express = 'albumInfo[ ]?=[ ]?(\{.*\})'
             match_result = response.re(regex_express)
             if match_result:
                 json_content = match_result[0]
                 try:
                     json_data = json.loads(json_content)
                     cont_ids = '0'
                     cont_ids = json_data['sourceId']
                     if cont_ids != '0':
                         cont_ids = '%s|source_id' % (cont_ids) 
                         mediaItem['cont_id'] = cont_ids
                     else:
                         cont_ids = json_data['albumId']
                         cont_ids = '%s|album_id' % (cont_ids) 
                         mediaItem['cont_id'] = cont_ids
                     districts = json_data['areas']
                     types = json_data['types']
                     directors = json_data['directors']
                     actors = json_data['mainActors']
                     writers = json_data['writer']
                     titles = json_data['tvName']
                     poster_urls = json_data['tvPictureUrl']
                     vcounts = json_data['episodeCounts'] 
                     latests = json_data['currentMaxEpisode']
                     release_dates = json_data['issueTime']
                     intros = json_data['tvDesc']
                     if districts:
                         districts_json = json.loads(districts)
                         districts = districts_json.values()
                         mediaItem['district'] = Util.join_list_safely(districts)
                     if types:
                         types_json = json.loads(types)
                         types = types_json.values()
                         mediaItem['type'] = Util.join_list_safely(types)
                     mediaItem['director'] = Util.join_list_safely(directors)
                     mediaItem['actor'] = Util.join_list_safely(actors)
                     mediaItem['writer'] = Util.join_list_safely(writers)
                     mediaItem['title'] = titles
                     mediaItem['poster_url'] = poster_urls
                     mediaItem['vcount'] = vcounts
                     mediaItem['latest'] = latests
                     release_dates = str(release_dates)
                     release_date = Util.str2date(release_dates)
                     mediaItem['release_date'] = release_date
                     mediaItem['intro'] = intros
                 except Exception, e:
                     logging.log(logging.ERROR, traceback.format_exc())
                     logging.log(logging.INFO, '=================json_content=================')
                     logging.log(logging.INFO, json_content)
         #Regular media page - media info section
         # (1) http://www.iqiyi.com/a_19rrgjaiqh.html#vfrm=2-4-0-1
         #   episode counts are too complicated to handle here
         sels = response.xpath('.//div[@class="result_pic pr"]')
         if sels:
             poster_urls = sels.xpath('.//a/img/@src').extract()
             if poster_urls:
                 mediaItem['poster_url'] = poster_urls[0]
         sels = response.xpath('.//div[@class="result_detail"]')
         if sels:
             titles = sels.xpath('.//h1[@class="main_title"]//a/text()').extract()
             scores = sels.xpath('.//div[@class="topic_item topic_item-rt"]//span[@class="score_font"]//span/text()').extract()
             scores = ''.join(scores)
             scores = re.findall(r'[\d.]+', scores) 
             if titles:
                 mediaItem['title'] = titles[0]
             if scores:
                 try:
                     mediaItem['score'] = float(scores[0])
                 except Exception, e:
                     pass
             msg_sels = sels.xpath('.//div[@class="topic_item clearfix"]')
             for msg_sel in msg_sels:
                 msg_more_sels = msg_sel.xpath('./div')
                 for sel in msg_more_sels:
                     labels = sel.xpath('.//em/text()').extract()
                     infos = sel.xpath('.//em/a/text()').extract()
                     iqiyi_extract.text_infos_resolve(labels, infos, mediaItem)
             intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="moreinfo"]/span/text()').extract()
             if not intros:
                 intros = sels.xpath('.//div[@class="topic_item clearfix"]//span[@data-moreorless="lessinfo"]/span/text()').extract()
             if intros:
                 mediaItem['intro'] = intros[0]
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
Example #23
    def media_parse(self, response):
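        # Parse a letv media page and fetch the full episode list through the album APIs, per channel type.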
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'media url: %s' % request_url)
            prefix_url = Util.prefix_url_parse(request_url)
            mediaVideoItem = response.request.meta[
                'item'] if 'item' in response.request.meta else None
            mediaItem = mediaVideoItem[
                'media'] if 'media' in mediaVideoItem else MediaItem()
            sels = response.xpath('//script[@type="text/javascript"]')
            letv_extract.media_info_extract(sels, mediaItem)
            sels = response.xpath('//div[@class="play"]')
            letv_extract.media_info_extract(sels, mediaItem)

            sels = response.xpath('//dl[@class="textInfo"]')
            if sels:
                #TV series, variety shows, anime
                letv_extract.media_info_extract(sels, mediaItem)
            else:
                #Films
                sels = response.xpath('//div[@class="detail"]')
                letv_extract.media_info_extract(sels, mediaItem)

            #Get the urls of the feature videos
            videoItems = []
            if u'电影' == mediaItem['channel_id']:
                pagenum = 1
                videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                     pagenum)
                result = Util.get_url_content(videos_url)
                page_items = self.other_album_resolve(text=result,
                                                      meta={
                                                          'url': videos_url,
                                                          'pagenum': pagenum
                                                      })
                videoItems = page_items
            #Variety shows
            elif u'综艺' == mediaItem['channel_id']:
                sels = response.xpath(
                    '//div[@class="listTab"]//div[@data-statectn="n_click"]')
                if sels:
                    year_month_sels = sels.xpath('.//a')
                    for year_month_sel in year_month_sels:
                        years = year_month_sel.xpath('./@list-year').extract()
                        months = year_month_sel.xpath(
                            './@list-month').extract()
                        year = None
                        month = None
                        if years:
                            year = years[0]
                        if months:
                            month = months[0]
                        if year and month:
                            videos_url = self.zongyi_album_api % (
                                year, month, mediaItem['cont_id'])
                            result = Util.get_url_content(videos_url)
                            videoItems = videoItems + self.zongyi_album_resolve(
                                text=result,
                                meta={
                                    'url': videos_url,
                                    'year': year,
                                    'month': month
                                })
            elif mediaItem['channel_id'] in [u'电视剧', u'动漫']:
                pagenum = 1
                while True:
                    videos_url = self.other_album_api % (mediaItem['cont_id'],
                                                         pagenum)
                    result = Util.get_url_content(videos_url)
                    page_items = self.other_album_resolve(text=result,
                                                          meta={
                                                              'url':
                                                              videos_url,
                                                              'pagenum':
                                                              pagenum
                                                          })
                    if not page_items:
                        break
                    videoItems = videoItems + page_items
                    pagenum = pagenum + 1

            if videoItems:
                #Set ext_id
                Util.set_ext_id(mediaItem, videoItems)

                self.set_media_info(mediaItem)

                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'media url: %s' % request_url)
        return items
Example #24
    def list_json_parse(self, response):
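        # Parse one page of the list JSON API: emit a video Request per entry plus a Request for the next page.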
        items = []
        try:
            origin_url = response.request.meta['url']
            request_url = response.request.url
            logging.log(logging.INFO, 'json api url: %s' % request_url)
            page = response.request.meta[
                'page'] if 'page' in response.request.meta else 1
            if page > self.max_update_page:
                return items
            channel_id = response.request.meta[
                'id'] if 'id' in response.request.meta else None
            list_json_postfix_url = response.request.meta[
                'postfix_url'] if 'postfix_url' in response.request.meta else None
            json_datas = json.loads(response.body)
            videos = []
            if json_datas:
                videos = json_datas[
                    'data_list'] if 'data_list' in json_datas else []
            if videos:
                #A non-empty result means there is still a next page
                video_url = 'http://www.letv.com/ptv/vplay/%s.html'
                for item in videos:
                    mediaVideoItem = MediaVideoItem()
                    mediaItem = MediaItem()
                    mediaItem['channel_id'] = channel_id
                    if 'rating' in item and item['rating']:
                        mediaItem['score'] = item['rating']
                    subCategoryName = item['subCategoryName']
                    mediaItem['type'] = subCategoryName.replace(',', ';')
                    mediaVideoItem['media'] = mediaItem
                    release_date = item['releaseDate']
                    if release_date:
                        release_date = float(release_date)
                        if release_date > 0:
                            release_date = release_date / 1000
                            release_date = time.localtime(release_date)
                            release_date = '%s-%s-%s' % (release_date.tm_year,
                                                         release_date.tm_mon,
                                                         release_date.tm_mday)
                            mediaItem['release_date'] = Util.str2date(
                                release_date)
                    vid = ''
                    if 'vids' in item:
                        vids = item['vids']
                        vids = vids.split(',')
                        vid = vids[0]
                    elif 'vid' in item:
                        vid = item['vid']
                    if vid:
                        url = video_url % vid
                        items.append(
                            Request(url=url,
                                    callback=self.video_parse,
                                    meta={'item': mediaVideoItem}))

                #Next page
                page = page + 1
                url = self.list_json_prefix_url + list_json_postfix_url + 'p=%s' % page
                items.append(
                    Request(url=url,
                            callback=self.list_json_parse,
                            meta={
                                'page': page,
                                'id': channel_id,
                                'postfix_url': list_json_postfix_url,
                                'url': url
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'json api url: %s' % request_url)
            logging.log(logging.INFO, 'origin url: %s' % origin_url)
        return items
Example #25
    def parse_episode_info(self,response):
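        # Parse a youku media page into a MediaVideoItem with its media info and video list.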
        try:
            logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            page_id = self.get_youku_pageid(response.request.url)
            if not page_id:
                log.error('miss content id: %s' % response.request.url)
                return

            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']
            items = []

            year_list = []

            title = self.parse_title(response, cat_id)
            performer_list = self.parse_actor(response)
            director_list = self.parse_director(response)
            district_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'地区:').extract()
            type_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'类型:').extract()
            play_date = self.parse_play_date(response)
            total_num = self.parse_total_num(response)

            year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)

            #text
            text = response.xpath('//div[@class="detail"]/span/text()').extract()

            videoitems = []

            ep_item = MediaItem()
            if title:
                ep_item["title"] = title[0].strip()
            if pers:
                ep_item["actor"] = pers
            if dirs:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = district_list[0].strip()
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)
            if total_num:
                ep_item["vcount"] = total_num

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,"youku")
            if text:
                ep_item["intro"] = text[0].strip()
            ep_item["cont_id"] = page_id
            ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))

            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            mvitem["media"] = ep_item;
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid

            video_list = self.parse_video_item(response, cat_id, ep_item["title"], page_id)
            mvitem['video'] = video_list
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            items.append(mvitem)

        except Exception as e: 
            logging.log(logging.ERROR, traceback.format_exc())
        return items
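
Exemple #25 collapses the XPath-extracted actor, director, and type lists into single strings with Util.join_list_safely, whose implementation is not shown in these excerpts. A plausible minimal equivalent, assuming it strips whitespace, drops empty entries, and joins with ';' (the separator used for 'type' elsewhere in these spiders):

def join_list_safely(values, sep=';'):
    """Hypothetical stand-in for Util.join_list_safely: strip, drop empties, join."""
    if not values:
        return ''
    return sep.join(v.strip() for v in values if v and v.strip())

# join_list_safely(['Drama ', '', 'Comedy']) -> 'Drama;Comedy'
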
Exemple #26
    def media_parse(self, response):
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'media url: %s' % request_url)
            prefix_url = Util.prefix_url_parse(request_url)
            mediaVideoItem = response.request.meta[
                'item'] if 'item' in response.request.meta else MediaVideoItem(
                )
            mediaItem = mediaVideoItem[
                'media'] if 'media' in mediaVideoItem else MediaItem()
            # filter out media whose play type is listed in skip_types
            sels = response.xpath('//head//script')
            if sels:
                regex_express = 'movieInfo\.play_type[ ]?=[ ]?\'(.*)\''
                match_result = sels.re(regex_express)
                if match_result:
                    play_type = match_result[0]
                    if play_type in self.skip_types:
                        return items
            # some URLs redirect, so the real (final) URL should be saved
            #http://movie.kankan.com/movie/88365 ->  http://data.movie.kankan.com/movie/88365
            mediaItem['url'] = request_url
            sels = response.xpath('//head')
            kankan_extract.media_info_extract(sels, mediaItem)
            sels = response.xpath('//div[@class="info_list"]')
            if sels:
                kankan_extract.media_info_extract(sels, mediaItem)
            sels = response.xpath('//ul[@class="detail_ul"]')
            if sels:
                kankan_extract.media_info_extract(sels, mediaItem)

            # collect the media's episode information
            videoItems = []
            if u'综艺' == mediaItem['channel_id']:
                # variety shows
                sels = response.xpath(
                    '//div[@id[re:test(., "fenji_[\d]+_[\d]+")]]')
                for sel in sels:
                    video_sels = sel.xpath('.//li')
                    for video_sel in video_sels:
                        videoItem = VideoItem()
                        videoItem['intro'] = mediaItem['channel_id']
                        kankan_extract.video_info_extract(video_sel, videoItem)
                        if 'url' in videoItem:
                            url = videoItem['url']
                            url = Util.get_absolute_url(url, prefix_url)
                            videoItem['url'] = url
                            self.set_video_info(videoItem,
                                                mediaItem['channel_id'])
                            videoItems.append(videoItem)
            elif u'电影' == mediaItem['channel_id']:
                # movies: take the URL from the "watch now" link
                videoItem = VideoItem()
                Util.copy_media_to_video(mediaItem, videoItem)
                sels = response.xpath('//div[@class="section clearfix s2"]')
                if sels:
                    urls = sels.xpath(
                        './/a[starts-with(@class, "foc")]/@href').extract()
                    thumb_urls = sels.xpath(
                        './/a[@class="foc"]/img/@src').extract()
                    if urls:
                        url = urls[0]
                        url = Util.get_absolute_url(url, prefix_url)
                        videoItem['url'] = url
                    if thumb_urls:
                        videoItem['thumb_url'] = thumb_urls[0]
                    self.set_video_info(videoItem, mediaItem['channel_id'])
                    videoItems.append(videoItem)
            else:
                # TV series
                sels = response.xpath(
                    '//div[@id[re:test(., "fenji_[\d]+_asc")]]')
                if not sels:
                    # anime, TV series
                    sels = response.xpath(
                        '//ul[@id[re:test(., "fenji_[\d]+_asc")]]')
                for sel in sels:
                    video_sels = sel.xpath('.//li')
                    for video_sel in video_sels:
                        videoItem = VideoItem()
                        videoItem['intro'] = mediaItem['channel_id']
                        kankan_extract.video_info_extract(video_sel, videoItem)
                        if 'url' in videoItem:
                            url = videoItem['url']
                            url = Util.get_absolute_url(url, prefix_url)
                            videoItem['url'] = url
                            self.set_video_info(videoItem,
                                                mediaItem['channel_id'])
                            videoItems.append(videoItem)
            if videoItems:
                # set ext_id
                Util.set_ext_id(mediaItem, videoItems)

                self.set_media_info(mediaItem)

                mediaVideoItem['media'] = mediaItem
                mediaVideoItem['video'] = videoItems
                items.append(mediaVideoItem)
                #self.count = self.count + 1
                #logging.log(logging.INFO, 'count: %s' % str(self.count))
            else:
                logging.log(logging.INFO, '%s: no videos' % request_url)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
            logging.log(logging.INFO, 'media url: %s' % request_url)
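
Exemple #26 skips paid or otherwise excluded titles by reading movieInfo.play_type out of an inline <script> in the page head. The same regular expression, reduced to a standalone function for clarity (the sample script text below is illustrative only):

import re

PLAY_TYPE_RE = re.compile(r"movieInfo\.play_type[ ]?=[ ]?'(.*)'")

def extract_play_type(script_text):
    """Return the play_type value assigned in an inline movieInfo script, or None."""
    match = PLAY_TYPE_RE.search(script_text)
    return match.group(1) if match else None

# extract_play_type("movieInfo.play_type = 'vip';") -> 'vip'
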
Exemple #27
    def parse_episode_play(self, response):
        mvitem = None
        try:
            logging.log(logging.INFO,
                        'parse_episode_play: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = ""
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            #items = []

            #title
            title_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title'
            ).extract()
            if not title_list:
                title_list = response.xpath(
                    '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
                ).extract()
            #performer
            performer_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
            ).extract()
            #director
            director_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
                % u'导演:').extract()
            #type_list = response.xpath('//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()' % u'导演:').extract()

            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)

            #text
            text = response.xpath(
                '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
            ).extract()

            ep_item = MediaItem()
            videoitems = []

            #not film
            if int(cat_id) != int(self.movie_id):
                #video list
                #video_list = response.xpath('//div[@class="mod_player_side_inner"]/div[2]/div[1]/div[1]/div[1]/div[1]/ul[1]/li')
                video_list = response.xpath(
                    '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
                )
                for tvideo in video_list:
                    lurl = tvideo.xpath('./a/@href').extract()
                    surl = ""
                    #lnum = tvideo.xpath('./a/@title').extract()
                    lnum = tvideo.xpath('./a/span/text()').extract()

                    vitem = VideoItem()
                    if lnum and lurl:
                        vitem["vnum"] = lnum[0]
                        surl = "http://film.qq.com" + lurl[0]
                        vitem["os_id"] = self.os_id
                        vitem["site_id"] = self.site_id
                        #vitem["cont_id"] = self.get_vid(response.body,surl)
                        turl = ""
                        if cat_id == self.tv_id:
                            turl = Util.normalize_url(surl, "qq", "tv")
                        elif cat_id == self.cartoon_id:
                            turl = Util.normalize_url(surl, "qq", "cartoon")
                        else:
                            turl = Util.normalize_url(surl, "qq")
                        if turl:
                            vitem["ext_id"] = Util.md5hash(turl)
                            vitem["url"] = turl
                        vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    else:
                        continue

                    videoitems.append(vitem)
            else:
                vitem = VideoItem()
                if title_list:
                    vitem["title"] = title_list[0]
                vitem["vnum"] = "1"
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                #vitem["cont_id"] = self.get_vid(response.body,response.request.url)
                turl = Util.normalize_url(response.request.url, "qq")
                vitem["url"] = turl
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                videoitems.append(vitem)

            if len(title_list) > 0:
                ep_item["title"] = title_list[0]
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if len(text) > 0:
                ep_item["intro"] = text[0]
            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url

            videoid = self.get_qq_showid(response.request.url)
            #videoid = self.get_vid(response.body,response.request.url)
            ep_item["cont_id"] = videoid

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems
            #mvitem["media"]["url"] = response.request.url
            mvitem["media"]["url"] = Util.normalize_url(
                response.request.url, "qq")
            #mvitem["ext_id"] = Util.md5hash(mvitem["media"]["url"])

            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.md5hash(Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            #items.append(mvitem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return mvitem
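
Exemple #27 derives cont_id from a normalized v.qq.com URL with get_qq_showid, which is defined elsewhere in the spider. Purely as an illustration, a hypothetical extractor that returns the last path segment without its .html suffix; the real helper may differ:

import re

def showid_from_url(url):
    """Hypothetical id extractor: last path segment, minus the trailing .html."""
    match = re.search(r'/([A-Za-z0-9]+)\.html', url)
    return match.group(1) if match else ''

# showid_from_url('http://v.qq.com/detail/x/xk98t8hntls72f4.html') -> 'xk98t8hntls72f4'
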
Exemple #28
 def list_parse(self, response):
     items = []
     try:
         request_url = response.request.url
         logging.log(logging.INFO, 'url: %s' % request_url)
         prefix_url = Util.prefix_url_parse(request_url)
         first = response.request.meta[
             'first'] if 'first' in response.request.meta else False
         channel_id = response.request.meta[
             'id'] if 'id' in response.request.meta else None
         if first:
             sels = response.xpath('//div[@class="tab_box"]//a')
             for sel in sels:
                 texts = sel.xpath('.//span/text()').extract()
                 if texts:
                     text = texts[0].replace(' ', '')
                     if text == u'最新':
                         urls = sel.xpath('./@href').extract()
                         url = urls[0]
                         items.append(
                             Request(url=url,
                                     callback=self.list_parse,
                                     meta={'id': channel_id}))
                         break
         else:
             page = response.request.meta[
                 'page'] if 'page' in response.request.meta else 1
             if page > self.max_update_page:
                 return items
             # list entries
             sels = response.xpath('//ul[@class="movielist"]/li')
             for sel in sels:
                 results = kankan_extract.video_extract(sel)
                 for item in results:
                     mediaVideoItem = MediaVideoItem()
                     mediaItem = MediaItem()
                     mediaItem['channel_id'] = channel_id
                     kankan_extract.media_info_extract(sel, mediaItem)
                     mediaVideoItem['media'] = mediaItem
                     items.append(
                         Request(url=item['url'],
                                 callback=self.video_parse,
                                 meta={'item': mediaVideoItem}))
                     break
             # next page
             sels = response.xpath('//p[@class="list-pager-v2"]')
             results = kankan_extract.next_page_extract(sels)
             page = page + 1
             for item in results:
                 url = Util.get_absolute_url(item, prefix_url)
                 items.append(
                     Request(url=url,
                             callback=self.list_parse,
                             meta={
                                 'page': page,
                                 'id': channel_id
                             }))
                 break
     except Exception as e:
         logging.log(logging.ERROR, traceback.format_exc())
         logging.log(logging.INFO, 'url: %s' % request_url)
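
The pagination links collected in Exemple #28 may be relative, so they are resolved against a prefix derived from the current request URL via Util.get_absolute_url. A plausible standard-library sketch of that step (the real helper's argument order and edge-case handling are unknown):

from urlparse import urljoin  # Python 2; use urllib.parse.urljoin on Python 3

def to_absolute_url(href, base_url):
    """Resolve a possibly-relative href against the page URL."""
    return urljoin(base_url, href)

# to_absolute_url('?p=2', 'http://movie.kankan.com/type/movie/')
#   -> 'http://movie.kankan.com/type/movie/?p=2'
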
Exemple #29
    def media_info_extract(response, mediaItem):
        try:
            if mediaItem is None:
                mediaItem = MediaItem()
            # list API page
            results = response.xpath('.//p[@class="ui-pic"]//img/@data-src2').extract()
            if results:
                mediaItem['poster_url'] = results[0]
            results = response.xpath('.//p[@class="ui-txt"]//span[@class="main-tt"]/text()').extract()
            if results:
                mediaItem['title'] = results[0]
            results = response.xpath('.//p[@class="ui-txt"]//em/text()').extract()
            if results:
                mediaItem['score'] = results[0]

            # regular playback page
            sel = response.xpath('.//script[@type="text/javascript"]')
            if sel:
                cont_ids = sel.re('\"id\"[ ]?:[ ]?(\d+)')  
                if cont_ids:
                    mediaItem['cont_id'] = cont_ids[0]
            sel = response.xpath('.//div[@id="mainContent"]')
            if sel:
                titles = sel.xpath('.//*[@class="tit"]/text()').extract()
                scores = sel.xpath('.//div[@id="scoremark"]//em[@class="score"]/text()').extract()
                intros = sel.xpath('.//p[@class="longinfo"]/text()').extract()
                if titles:
                    title = titles[0].strip()
                    match_result = None
                    # movies
                    if u'电影' == mediaItem['channel_id']:
                        match_result = None
                    # variety shows
                    # e.g. 都来爱梦-20121215     时尚健康-20150430-包贝尔分享包氏火锅哲学
                    elif u'综艺' == mediaItem['channel_id']:
                        regex_express = r'(.+)-[\d]+[-].+'
                        regex_pattern = re.compile(regex_express)
                        match_result = regex_pattern.search(title)
                        if not match_result:
                            regex_express = r'(.+)-[\d]+' 
                            regex_pattern = re.compile(regex_express)
                            match_result = regex_pattern.search(title)
                        if not match_result:
                            regex_express = u'(.+)[((]第[\d]+集[))]'
                            regex_pattern = re.compile(regex_express)
                            match_result = regex_pattern.search(title)
                    # TV series, anime
                    else:
                        regex_express = u'(.+)[((]第[\d]+集[))]'
                        regex_pattern = re.compile(regex_express)
                        match_result = regex_pattern.search(title)
                    if match_result:
                        mediaItem['title'] = match_result.groups()[0] 
                    else:
                        mediaItem['title'] = title
                if scores:
                    score = scores[0].strip()
                    mediaItem['score'] = score
                if intros:
                    intro = intros[0].strip()
                    mediaItem['intro'] = intro
                msg_sels = sel.xpath('.//div[@class="intro-content intro-short"]//li')
                for msg_sel in msg_sels:
                    labels = msg_sel.xpath('./span/text()').extract()
                    infos = msg_sel.xpath('./a/text()').extract()
                    if not infos:
                        infos = msg_sel.xpath('./text()').extract()
                    pptv_extract.text_infos_resolve(labels, infos, mediaItem)

            # VIP playback page
            sel = response.xpath('.//script[@type="text/javascript"]')
            if sel:
                cont_ids = sel.re('vid[ ]?:[ ]?["]?(\d+)')
                if cont_ids:
                    mediaItem['cont_id'] = cont_ids[0]
            sel = response.xpath('.//div[@class="ptxt"]')    
            if sel:
                titles = sel.xpath('./*/@title').extract()
                intros = sel.xpath('.//span[@class="thenext"]/text()').extract()
                if titles:
                    mediaItem['title'] = titles[0].strip()
                if intros:
                    mediaItem['intro'] = intros[0].strip() 
                msg_sels = sel.xpath('./p') 
                for msg_sel in msg_sels:
                    labels = msg_sel.xpath('./em/text()').extract()
                    infos = msg_sel.xpath('.//tt/text()').extract()
                    pptv_extract.text_infos_resolve(labels, infos, mediaItem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
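
The title handling in Exemple #29 strips date and episode suffixes from playback-page titles with channel-specific regular expressions. A compact standalone demonstration of those same patterns, applied to titles like the ones quoted in the comments above (the helper name is ours):

# -*- coding: utf-8 -*-
import re

TITLE_PATTERNS = [r'(.+)-[\d]+[-].+', r'(.+)-[\d]+', u'(.+)[((]第[\d]+集[))]']

def strip_episode_suffix(title):
    """Return the base programme name by trying the patterns above in order."""
    for expr in TITLE_PATTERNS:
        match = re.compile(expr).search(title)
        if match:
            return match.groups()[0]
    return title

# strip_episode_suffix(u'时尚健康-20150430-包贝尔分享包氏火锅哲学') -> u'时尚健康'
# strip_episode_suffix(u'都来爱梦-20121215') -> u'都来爱梦'
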
Exemple #30
    def parse_play_list(self, cat_id, url, flag, response):
        item = None
        videoitems = []
        try:
            ep_item = MediaItem()
            item = MediaVideoItem()
            item["media"] = ep_item
            item['video'] = videoitems

            info = None
            try:
                info = self.httpdownload.get_data(url)
            except Exception as e:
                logging.log(logging.ERROR, traceback.format_exc())
                return videoitems
            if not info or len(info) < 2:
                return videoitems

            msg = info
            bodylen = len(msg) - 1
            index = msg.find(flag) + len(flag) + 1
            info = msg[index:bodylen]
            jinfo = json.loads(info)
            if "video_play_list" not in jinfo:
                return videoitems
            itemlist = jinfo["video_play_list"]["playlist"]
            for titem in itemlist:
                if "episode_number" not in titem:
                    continue
                info = titem["episode_number"]
                if info and titem["title"].find(u"预告") < 0 and url.find(
                        "qq.com") >= 0:
                    vitem = VideoItem()
                    vitem["title"] = titem["title"]
                    tvnum = info.replace("-", "")
                    # the episode number may be a non-numeric string,
                    # e.g. http://v.qq.com/detail/x/xk98t8hntls72f4.html
                    tvnum_list = re.findall(r'[\D]+', tvnum)
                    if not tvnum_list:
                        vitem["vnum"] = tvnum
                    else:
                        continue
                    vitem["os_id"] = self.os_id
                    vitem["site_id"] = self.site_id
                    turl = ""
                    if int(cat_id) == int(self.tv_id) or int(cat_id) == int(
                            self.cartoon_id):
                        turl = Util.normalize_url(titem["url"], "qq", "tv")
                    else:
                        turl = Util.normalize_url(titem["url"], "qq")
                    if turl:
                        vitem["ext_id"] = Util.md5hash(turl)
                        #vitem["cont_id"] = self.get_vid(response.body,turl)
                        vitem["url"] = turl
                        vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    else:
                        continue
                    videoitems.append(vitem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems
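
parse_play_list in Exemple #30 receives a JSONP-like body: a flag prefix, an '=', the JSON payload, and one trailing character, which it slices away before json.loads. A minimal sketch of that unwrapping with an illustrative flag name (real responses and flag values are site-specific):

import json

def unwrap_jsonp(body, flag):
    """Strip '<flag>=' and the trailing character, then parse the JSON payload."""
    start = body.find(flag) + len(flag) + 1  # skip the flag and the '=' after it
    return json.loads(body[start:len(body) - 1])

# unwrap_jsonp('callback={"video_play_list": {"playlist": []}};', 'callback')
#   -> {u'video_play_list': {u'playlist': []}}
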