Ejemplo n.º 1
0
    def parse_episode(self, response):
        """Build the episode item for a page from the ``order`` in request meta.

        Every key/value pair of ``response.request.meta['order']`` is copied
        onto a new ``EpisodeItem``; the ``user`` key is stored as ``category``
        and the ``show_id`` key as ``owner_show_id``. The item's own
        ``show_id`` is produced by ``Util.get_qq_showid`` from the request URL.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        # Order keys that are stored under a differently named item field.
        renamed = {'user': 'category', 'show_id': 'owner_show_id'}
        try:
            request = response.request
            page_url = request.url
            logging.log(logging.INFO, 'episode:%s' % page_url)
            order = request.meta['order']

            episode = EpisodeItem()
            episode['show_id'] = Util.get_qq_showid(page_url)
            for key, value in order.items():
                episode[renamed.get(key, key)] = value

            # Spider-level fields are written last, so they override any
            # same-named key supplied by the order.
            episode['spider_id'] = self.spider_id
            episode['site_id'] = self.site_id
            episode['url'] = page_url
            episode['format_id'] = self.format_id
            return [episode]
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Ejemplo n.º 2
0
    def parse_episode(self, response):
        """Scrape an episode page into a single EpisodeItem.

        ``cat_name``, ``thumb``, ``audit``, ``lens`` and ``priority`` are read
        from the request meta; title, upload time and tags are scraped from
        the page, and the show id comes from ``Util.get_letv_showid``.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            lens = meta['lens']
            priority = meta['priority']

            show_id = Util.get_letv_showid(page_url)
            title = response.xpath(
                '//meta[@name="irTitle"]/@content').extract()
            upload_time = response.xpath(
                '//ul[@class="info_list"]//em[@id="video_time"]/text()'
            ).extract()
            keywords = response.xpath(
                '//meta[@name="keywords"]/@content').extract()

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0]
            if show_id:
                ep_item['show_id'] = show_id
            if keywords:
                tag_str = keywords[0]
                if title:
                    # Skip len(title) + 1 leading characters -- presumably the
                    # title plus one separator (TODO confirm against the page).
                    # Without a scraped title the keywords are used whole
                    # instead of failing on title[0].
                    tag_str = tag_str[len(title[0]) + 1:]
                # Tags are separated by spaces and/or commas.
                pieces = tag_str.replace(',', ' ').split(' ')
                tag_list = [piece.strip() for piece in pieces]
                # Consecutive separators yield empty pieces; drop them so the
                # joined value never contains "||".
                tag_list = [tag for tag in tag_list if tag]
                if tag_list:
                    ep_item['tag'] = "|".join(tag_list)
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 3
0
    def parse_episode(self, response):
        """Scrape an episode page into a single EpisodeItem.

        ``cat_name``, ``thumb``, ``audit``, ``lens`` and ``priority`` come from
        the request meta; the title and keywords are scraped from the page and
        the show id is derived with ``Util.get_sohu_showid``.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            lens = meta['lens']
            priority = meta['priority']

            show_id = Util.get_sohu_showid(page_url)
            tag = response.xpath('//meta[@name="keywords"]/@content').extract()
            title = response.xpath(
                '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()'
            ).extract()

            ep_item = EpisodeItem()

            # Optional values: only written when something was found.
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            for field, found in (('tag', tag), ('thumb_url', thumb_url)):
                if found:
                    ep_item[field] = found[0].strip()

            # Values that are always present on the item.
            always_set = (
                ('spider_id', self.spider_id),
                ('site_id', self.site_id),
                ('url', page_url),
                ('category', cat_name),
                ('format_id', '2'),
                ('audit', audit),
                ('priority', priority),
                ('duration', lens),
            )
            for field, value in always_set:
                ep_item[field] = value

            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 4
0
    def parse_media(self, response):
        """Scrape a media page into a single EpisodeItem.

        ``cat_id``, ``title`` and ``img`` are read from the request meta and
        the show id is the ``id`` query parameter of the request URL. Tags,
        upload time and play count are scraped from the page, each falling
        back to a default ('' / '' / 0) when nothing matches.

        Returns a list with the item, or an empty list when an exception was
        raised (the traceback is logged).
        """
        items = []
        try:
            request = response.request
            meta = request.meta
            cat_id = meta['cat_id']
            title = meta['title']
            thumb_url = meta['img']
            url = request.url
            show_id = urlparse.parse_qs(urlparse.urlparse(url).query)['id'][0]

            # Tags: every matching link text, joined with '|'.
            tag_texts = response.xpath(
                '//span[@class="c_org1"]/a/text()').extract()
            tag = "|".join(tag_texts).encode("UTF-8") if tag_texts else ''

            # Release time: the first 16 positions of the encoded text.
            time_texts = response.xpath(
                '//p[@class="c_gray0 lh3"]/span/text()').extract()
            upload_time = time_texts[0].encode("UTF-8")[0:16] if time_texts else ''

            # Play count: taken verbatim (stripped) from the first match.
            played_texts = response.xpath(
                '//p[@class="c_gray0 lh3"]/span/a/text()').extract()
            played = played_texts[0].strip() if played_texts else 0

            ep_item = EpisodeItem()
            field_values = (
                ('title', title),
                ('show_id', show_id),
                ('tag', tag),
                ('upload_time', upload_time),
                ('category', self._category),
                ('thumb_url', thumb_url),
                ('spider_id', self._spider_id),
                ('site_id', self._site_id),
                ('url', url),
                ('played', played),
                ('cat_id', cat_id),
            )
            for field, value in field_values:
                ep_item[field] = value

            items.append(ep_item)

        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())

        finally:
            return items
Ejemplo n.º 5
0
    def parse_second(self, response):
        """Turn a JSON video-info response into a request for the episode page.

        The response body is parsed as JSON and its ``data`` object supplies
        title (``t``), tags (``tag``), upload timestamp (``uploadtime``),
        thumbnail (``picpath``) and duration (``vtime``). ``kw_id`` and
        ``show_id`` are read from the request meta. The partially filled
        ``EpisodeItem`` travels in the meta of a request to
        ``http://v.ku6.com/show/<show_id>.html`` handled by ``parse_episode``.

        Returns a one-element list holding that request. Any exception is
        logged with its traceback, in which case the method returns None.
        """
        try:
            meta = response.request.meta
            kw_id = meta['kw_id']
            show_id = meta['show_id']

            info = json.loads(response.body)['data']
            # Tags arrive separated by spaces and/or commas; normalise to '|'.
            tag = (info['tag'] or '').replace(' ', '|').replace(',', '|').strip('|')
            upload_time = Util.timestamp2datetime(info['uploadtime'])
            # 'vtime' may hold several comma-separated values; keep the first.
            duration = str(info['vtime']).split(',')[0]

            turl = "http://v.ku6.com/show/" + show_id + ".html"

            ep_item = EpisodeItem()
            ep_item['show_id'] = show_id
            ep_item['url'] = turl
            # Truthiness (rather than len()) also tolerates None values, which
            # would otherwise raise and lose the whole request.
            optional = (
                ('title', info['t']),
                ('tag', tag),
                ('upload_time', upload_time),
                ('thumb_url', info['picpath']),
                ('duration', duration),
            )
            for field, value in optional:
                if value:
                    ep_item[field] = value
            ep_item['kw_id'] = kw_id
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id

            return [Request(url=turl,
                            callback=self.parse_episode,
                            meta={'item': ep_item})]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 6
0
    def parse_episode(self, response):
        """Build an EpisodeItem from an episode page, or fetch its duration first.

        ``cat_name``, ``thumb``, ``audit``, ``priority`` and ``lens`` are read
        from the request meta; show id, title and tags are scraped from the
        ``block-data-view`` element.

        * ``lens == 0``: no item is built here. If the page exposes a
          ``data-sid``, a request to ``getVideo.aspx`` is returned with the
          scraped values in its meta and ``parse_duration`` as callback;
          otherwise an empty list is returned.
        * otherwise: a one-element list holding the finished item is returned.

        Any exception is logged with its traceback, in which case the method
        returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            priority = meta['priority']
            lens = meta['lens']

            show_id = response.xpath('//div[@id="block-data-view"]/@data-aid').extract()
            title = response.xpath('//div[@id="block-data-view"]/@data-title').extract()
            tags = response.xpath('//div[@id="block-data-view"]/@data-tags').extract()

            if lens == 0:
                data_sid = response.xpath('//div[@id="area-part-view"]/div/a/@data-sid').extract()
                if not data_sid:
                    return []
                second_request = "http://www.acfun.tv/video/getVideo.aspx?id=" + data_sid[0].strip()
                # The raw extract() lists are forwarded; parse_duration picks
                # the first element of each.
                forwarded = {
                    'cat_name': cat_name,
                    'thumb': thumb_url,
                    'audit': audit,
                    'priority': priority,
                    'show_id': show_id,
                    'title': title,
                    'tags': tags,
                    'url': page_url,
                }
                return [Request(url=second_request,
                                callback=self.parse_duration,
                                meta=forwarded)]

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id[0].strip()
            if tags:
                ep_item['tag'] = tags[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = lens
            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 7
0
    def parse_episode(self, response):
        """Scrape an episode page into a single EpisodeItem.

        ``cat_name``, ``thumb``, ``audit``, ``priority`` and ``lens`` come from
        the request meta. Title, tags and upload time are scraped from the
        page; the show id is derived with ``Util.get_ifeng_showid``.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name, thumb_url = meta['cat_name'], meta['thumb']
            audit, priority = meta['audit'], meta['priority']
            lens = meta['lens']

            show_id = Util.get_ifeng_showid(page_url)
            title = response.xpath(
                '//head/meta[@property="og:title"]/@content').extract()
            tags = response.xpath('//div[@class="protag"]/a/text()').extract()
            upload_time = response.xpath(
                '//div[@class="vTit_wrap"]/div/p/span[@class="data"]/text()'
            ).extract()

            ep_item = EpisodeItem()

            # Scraped values are optional and only stored when present.
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                # All tag link texts are kept, separated by '|'.
                ep_item['tag'] = '|'.join(tags)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()
            if upload_time:
                ep_item['upload_time'] = upload_time[0]

            for field, value in (('spider_id', self.spider_id),
                                 ('site_id', self.site_id),
                                 ('url', page_url),
                                 ('category', cat_name),
                                 ('format_id', '2'),
                                 ('audit', audit),
                                 ('priority', priority),
                                 ('duration', lens)):
                ep_item[field] = value

            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 8
0
    def parse_episode(self, response):
        """Scrape an episode page into a single EpisodeItem.

        ``cat_name``, ``audit``, ``priority`` and ``thumb`` come from the
        request meta. Title and keywords are scraped from the page's meta
        tags; the show id is derived with ``Util.get_v1_showid``.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name = meta['cat_name']
            audit = meta['audit']
            priority = meta['priority']
            thumb = meta['thumb']

            show_id = Util.get_v1_showid(page_url)
            title = response.xpath('//meta[@name="title"]/@content').extract()
            tags = response.xpath(
                '//meta[@name="keywords"]/@content').extract()

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            if tags:
                ep_item['tag'] = tags[0].strip()
            # An empty thumb must not discard the whole item (indexing [0]
            # would raise), so it is treated as optional like the other
            # scraped values.
            if thumb:
                ep_item['thumb_url'] = thumb[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 9
0
    def parse_media(self, response, **kwargs):
        """Scrape a video page into at most one EpisodeItem.

        An item is only produced when a numeric ``vid`` is found in the page
        scripts; that id is stored as both ``video_id`` and ``show_id``.
        Title, tags, upload time, description, thumbnail and play count are
        optional and stored when found.

        ``**kwargs`` is accepted but not used by this method.

        Returns a list with zero or one item. Exceptions derived from
        ``Exception`` are logged with their traceback and whatever was
        collected so far is returned.
        """
        items = []
        try:
            scripts = response.xpath('//script')
            video_id = scripts.re(r"vid = '(\d+)'")
            if not video_id:
                return items

            title = response.xpath(
                '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()'
            ).extract()
            tag = response.xpath('//meta[@name="keywords"]/@content').extract()
            thumb = scripts.re(r",sCover: '(.*)'")
            upload = scripts.re(r",uploadTime: '(.*)'")
            description = response.xpath(
                '//p[@class="rel cfix"]/@title').extract()
            played = response.xpath(
                '//span[@class="vbtn vbtn-play"]/em/i/text()').extract()

            ep_item = EpisodeItem()
            ep_item['video_id'] = video_id[0]
            ep_item['show_id'] = video_id[0]
            if title:
                ep_item['title'] = title[0]
            if tag:
                ep_item['tag'] = tag[0].strip().replace(',', '|')
            if upload:
                # ":00" is appended to the scraped value -- presumably it
                # lacks a seconds component (TODO confirm against the page).
                ep_item['upload_time'] = upload[0] + ":00"
            if description:
                ep_item['description'] = description[0].strip()
            if thumb:
                ep_item['thumb_url'] = thumb[0]
            if played:
                ep_item['played'] = Util.normalize_played(played[0])

            ep_item['category'] = u"搞笑"
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url

            items.append(ep_item)
            # Read the title from the scraped list instead of the item: the
            # 'title' field is not set when nothing was scraped.
            log.msg("spider success, title:%s" % (title[0] if title else ''),
                    level=log.INFO)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        return items
Ejemplo n.º 10
0
    def parse_second(self, response):
        """Turn a wrapped JSON listing into one episode request per entry.

        The response body carries JSON between the markers ``try{window(`` and
        ``);}catch(e)``. For every entry of its ``data`` list an
        ``EpisodeItem`` is pre-filled (title, play count, upload time, URL,
        duration, subject id) and attached to a request for the entry's
        ``vUrl`` handled by ``parse_episode``.

        Returns the list of requests; an empty list when the markers are
        missing. Any exception is logged with its traceback, in which case
        the method returns None.
        """
        try:
            cat_id = response.request.meta['cat_id']
            items = []

            prefix = "try{window("
            suffix = ");}catch(e)"
            body = response.body
            begin = body.find(prefix)
            end = body.find(suffix, begin + len(prefix)) if begin >= 0 else -1
            if end < 0:
                # find() returned -1: slicing would feed garbage to the parser.
                log.msg('parse_second: payload markers not found in %s' %
                        response.request.url, level=log.ERROR)
                return items

            jmsg = json.loads(body[begin + len(prefix):end])
            for entry in jmsg["data"]:
                turl = entry["vUrl"]
                if not turl:
                    # A request cannot be built without a URL; skip this entry
                    # instead of losing every request collected so far.
                    continue

                title = entry["aName"]
                upload_time = entry["tvYear"]
                timelength = str(entry["timeLength"])

                ep_item = EpisodeItem()
                if title:
                    ep_item["title"] = title
                ep_item["played"] = str(entry["disCnt"])
                if upload_time:
                    ep_item["upload_time"] = upload_time
                ep_item["url"] = turl
                if timelength:
                    ep_item["duration"] = timelength
                ep_item['subject_id'] = cat_id

                items.append(
                    Request(url=turl,
                            callback=self.parse_episode,
                            meta={'item': ep_item}))

            return items

        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 11
0
    def parse_duration(self, response):
        """Finish an episode item once its duration is known from a JSON reply.

        The request meta supplies ``cat_name``, ``thumb``, ``audit``,
        ``priority``, ``title``, ``show_id``, ``tags`` and ``url``. The
        response body is parsed as JSON; its ``time`` value becomes the
        item's integer ``duration``.

        Returns a one-element list holding the item, or an empty list when
        the reply's ``success`` is falsy or the string 'false', or when
        ``time`` is missing/falsy. Any exception is logged with its traceback,
        in which case the method returns None.
        """
        try:
            meta = response.request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            priority = meta['priority']
            title = meta['title']
            show_id = meta['show_id']
            tags = meta['tags']
            url = meta['url']

            reply = json.loads(response.body)
            succeeded = reply.get('success')
            if not succeeded or succeeded == 'false':
                return []
            seconds = reply.get('time')
            if not seconds:
                return []

            ep_item = EpisodeItem()

            # These meta values are lists; the first element, stripped, is
            # stored when the list is non-empty.
            first_of = (
                ('title', title),
                ('show_id', show_id),
                ('tag', tags),
                ('thumb_url', thumb_url),
            )
            for field, values in first_of:
                if values:
                    ep_item[field] = values[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = url
            ep_item['category'] = cat_name
            ep_item['format_id'] = '2'
            ep_item['audit'] = audit
            ep_item['priority'] = priority
            ep_item['duration'] = int(seconds)
            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 12
0
    def parse_episode(self, response):
        """Convert a JSON video listing into one EpisodeItem per entry.

        ``cat_name``, ``audit`` and ``priority`` come from the request meta.
        The response body is parsed as JSON and every element of its ``list``
        value becomes an item; the item URL is built from the entry's ``aid``.

        The entry's ``duration`` is a colon-separated clock string and is
        stored in seconds. "SS", "MM:SS" and "HH:MM:SS" are all accepted; a
        missing or unparsable value is stored as 0.

        Returns the list of items (empty when ``list`` is absent). Any
        exception is logged with its traceback, in which case the method
        returns None.
        """
        try:
            request = response.request
            log.msg('parse_episode %s' % request.url)
            cat_name = request.meta['cat_name']
            audit = request.meta['audit']
            priority = request.meta['priority']

            items = []
            data = json.loads(response.body)
            for entry in data.get('list') or []:
                aid = entry.get('aid')

                seconds = 0
                clock = entry.get('duration')
                if clock:
                    try:
                        # Fold left to right so any number of ':' separated
                        # components works (the two-part form equals the
                        # original minutes * 60 + seconds).
                        for part in clock.split(':'):
                            seconds = seconds * 60 + int(part)
                    except ValueError:
                        # One malformed entry must not discard the whole page.
                        seconds = 0

                ep_item = EpisodeItem()
                ep_item['title'] = entry.get('title')
                ep_item['show_id'] = aid
                ep_item['thumb_url'] = entry.get('pic')
                ep_item['spider_id'] = self.spider_id
                ep_item['site_id'] = self.site_id
                ep_item['url'] = "http://www.bilibili.com/video/av%s/" % aid
                ep_item['category'] = cat_name
                ep_item['description'] = entry.get("description")
                ep_item['format_id'] = '2'
                ep_item['audit'] = audit
                ep_item['priority'] = priority
                ep_item['played'] = entry.get('play')
                ep_item['duration'] = seconds
                items.append(ep_item)

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 13
0
    def parse_episode(self, response):
        """Scrape an episode page into a single EpisodeItem.

        ``cat_name``, ``thumb``, ``audit`` and ``priority`` come from the
        request meta. The title and keywords are scraped from the page; the
        show id is derived with ``Util.get_tucao_showid``. No duration is set.

        Returns a one-element list holding the item. Any exception is logged
        with its traceback, in which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            log.msg('parse_episode %s' % page_url)

            meta = request.meta
            cat_name = meta['cat_name']
            thumb_url = meta['thumb']
            audit = meta['audit']
            priority = meta['priority']

            show_id = Util.get_tucao_showid(page_url)
            title = response.xpath(
                '//h1[@class="show_title"]/text()').extract()
            tags = response.xpath(
                '//meta[@name="keywords"]/@content').extract()

            ep_item = EpisodeItem()

            # Optional values: stored only when something was found.
            if title:
                ep_item['title'] = title[0].strip()
            if show_id:
                ep_item['show_id'] = show_id
            for field, found in (('tag', tags), ('thumb_url', thumb_url)):
                if found:
                    ep_item[field] = found[0].strip()

            for field, value in (('spider_id', self.spider_id),
                                 ('site_id', self.site_id),
                                 ('url', page_url),
                                 ('category', cat_name),
                                 ('format_id', '2'),
                                 ('audit', audit),
                                 ('priority', priority)):
                ep_item[field] = value

            return [ep_item]
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 14
0
    def parse_played(self, response):
        """Complete an episode item with the play count scraped from the page.

        The play count is the number following ``"browse":`` in the text of
        the body's ``<p>`` elements. All other fields are copied unchanged
        from ``response.meta``.

        Returns a list with the item, or an empty list when an exception was
        raised (the traceback is logged).
        """
        items = []
        try:
            log.msg(response.request.url, level=log.INFO)
            body = response.xpath('//body/p/text()')
            play_num = body.re(r'"browse":(\d*)}')[0]
            _item = response.meta

            ep_item = EpisodeItem()
            copied_fields = (
                'show_id',
                'title',
                'tag',
                'category',
                'upload_time',
                'spider_id',
                'site_id',
                'url',
                'description',
            )
            for field in copied_fields:
                ep_item[field] = _item[field]
            ep_item['played'] = int(play_num)
            items.append(ep_item)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Without this return the item built above was silently discarded.
        return items
Ejemplo n.º 15
0
    def parse_first(self, response):
        """Decode the JSON response body, print it, and return no items.

        NOTE(review): this callback is an unfinished stub. It previously
        carried a block of statements after an unconditional ``return`` that
        could never run and referenced names that were never defined
        (``show_id``, ``tag``, ``upload_time``, ``category``, ``thumb_url``,
        ``cat_name``, ``audit``, ``priority``); that block has been removed.

        Returns an empty list. Any exception is logged with its traceback, in
        which case the method returns None.
        """
        try:
            data = json.loads(response.body)
            # Debug output kept from the draft; the single-argument form
            # prints the same text under Python 2 and 3.
            print(data)
            # TODO: the removed draft intended to read "has_more", "message",
            # "max_behot_time" and to build one EpisodeItem per element of
            # data["data"] (title taken from the element's "title").
            return []
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 16
0
    def parse_episode_youku(self, response):
        """Scrape an episode page into an EpisodeItem or a follow-up request.

        ``pg_id``, ``cat_name``, ``site_id``, ``audit`` and ``priority`` come
        from the request meta. Owner, title, video id, tags, upload time and
        description are scraped from the page and stored when found.

        When the page links to a play-count URL, a request for it is returned
        with the item in its meta and ``parse_vpaction`` as callback;
        otherwise the item itself is returned. Either way the result is a
        one-element list. Any exception is logged with its traceback, in
        which case the method returns None.
        """
        try:
            request = response.request
            page_url = request.url
            logging.log(logging.INFO, "episode_youku:%s" % page_url)

            meta = request.meta
            pg_id = meta['pg_id']
            cat_name = meta['cat_name']
            site_id = meta['site_id']
            audit = meta['audit']
            priority = meta['priority']

            # Owner of the video, resolved from the profile link if present.
            owner_links = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = Util.get_owner(owner_links[0]) if owner_links else None

            # Video details.
            title_parts = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = response.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(page_url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title_parts:
                # The title may be spread over several text nodes.
                joined = "".join(title_parts).strip("\n").strip()
                ep_item['title'] = Util.strip_title(joined)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            ep_item['category'] = cat_name
            if upload:
                delta = Util.get_upload_time(upload[0])
                if delta:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), delta)
            if description:
                ep_item['description'] = description[0]

            for field, value in (('spider_id', self.spider_id),
                                 ('site_id', site_id),
                                 ('url', page_url),
                                 ('pg_id', pg_id),
                                 ('audit', audit),
                                 ('format_id', self.format_id),
                                 ('priority', priority)):
                ep_item[field] = value

            if not vp_url:
                return [ep_item]
            return [Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item})]
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Ejemplo n.º 17
0
    def parse_episode_iqiyi(self, response):
        """Scrape an episode page into an EpisodeItem or a follow-up request.

        ``pg_id``, ``cat_name``, ``site_id``, ``audit`` and ``priority`` come
        from the request meta. Title, upload time and tags are each looked up
        with a list of alternative XPaths; the first one that matches wins.

        When an ``albumId`` is found in the page, a request to
        ``self.playlength_url + albumId`` is returned with the item in its
        meta and ``parse_playlength`` as callback; otherwise the item itself
        is returned. Either way the result is a one-element list. Any
        exception is logged with its traceback, in which case the method
        returns None.
        """

        def first_match(queries):
            # Extracted nodes of the first XPath that matches anything.
            for query in queries:
                found = response.xpath(query).extract()
                if found:
                    return found
            return []

        try:
            request = response.request
            page_url = request.url
            # Labelled after this callback (matching "episode_youku:" in the
            # youku sibling); the old label named an unrelated callback.
            logging.log(logging.INFO, "episode_iqiyi:%s" % page_url)

            meta = request.meta
            pg_id = meta['pg_id']
            cat_name = meta['cat_name']
            site_id = meta['site_id']
            audit = meta['audit']
            priority = meta['priority']

            show_id = Util.get_iqiyi_showid(page_url)
            # Matches both "albumId:123" and "albumId: 123".
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            # NOTE(review): the class name "mod-play-t**s" contains literal
            # asterisks and looks masked; it is left unchanged -- confirm
            # against the live markup.
            title = first_match((
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
                '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()',
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()',
            ))
            upload_time = first_match((
                '//div[@class="crumb_bar"]/span[3]/span/text()',
                '//div[@class="crumb_bar"]/span[2]/span/text()',
            ))
            tag = first_match((
                '//span[@id="widget-videotag"]/descendant::*/text()',
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
                '//div[@class="crumb_bar"]/span[2]/a/text()',
            ))

            ep_item = EpisodeItem()
            if title:
                ep_item['title'] = "".join([t.strip() for t in title])
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join([t.strip() for t in tag])
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            ep_item['category'] = cat_name
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['url'] = page_url
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority

            if not albumid:
                return [ep_item]
            return [Request(url=self.playlength_url + albumid[0],
                            callback=self.parse_playlength,
                            meta={'item': ep_item, 'albumid': albumid[0]})]
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Ejemplo n.º 18
0
    def parse_episode(self, response):
        """Build an EpisodeItem from a video page, driven by ``cust_para``.

        ``cust_para`` is read from the request meta. When it contains
        ``need_check`` and ``self.content_is_forbidden`` accepts the title,
        an empty list is returned. Otherwise the result is a one-element
        list: a ``Request`` to ``self.vpaction_url + video_id`` (callback
        ``self.parse_vpaction``) when a video id was found in the page
        scripts, else the item itself. Exceptions are logged and the method
        then returns ``None``.
        """
        try:
            cust_para = response.request.meta['cust_para']
            page_url = response.request.url
            log.msg('%s: %s' % (page_url, cust_para))
            items = []

            #owner
            owner_links = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = Util.get_owner(
                owner_links[0]) if owner_links else None

            #video info
            title_parts = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            page_category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            js_blocks = response.xpath('//script[@type="text/javascript"]')
            video_id = js_blocks.re('videoId = \'(\d+)\'')
            tag = js_blocks.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(page_url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title_parts:
                ep_item['title'] = Util.strip_title("".join(title_parts))
                # The blacklist check only runs when the caller asked for it.
                if ('need_check' in cust_para
                        and self.content_is_forbidden(ep_item['title'])):
                    log.msg('video [ %s ] is in blacklist!' %
                            ep_item['show_id'])
                    return items

            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # A category supplied by the caller wins over the scraped one.
            if 'category' in cust_para:
                ep_item['category'] = cust_para['category']
            elif page_category:
                ep_item['category'] = page_category[0].replace(u'频道', '')
            if upload:
                delta = Util.get_upload_time(upload[0])
                if delta:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), delta)
            if description:
                ep_item['description'] = description[0]

            if 'priority' in cust_para:
                ep_item['priority'] = cust_para['priority']

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url

            if not video_id:
                items.append(ep_item)
            else:
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 19
0
    def parse_episode(self, response):
        """Build an EpisodeItem from a YouTube watch page.

        ``thumb_url``, ``upload_time`` and ``category`` are taken from the
        request meta; ``kw_id`` is taken from it too and defaults to 1.

        Returns a list holding, when the owner link is present, a ``Request``
        for the owner's ``/about`` page (callback ``self.parse_about``),
        followed by a ``Request`` to ``get_video_info`` (callback
        ``self.parse_other_info``) carrying the item in its meta. Exceptions
        are logged and the method then returns ``None``.
        """
        try:
            log.msg('%s' % response.request.url)
            meta = response.request.meta
            # NOTE(review): thumb_url and upload_time are indexed with [0]
            # below, so they are presumably lists -- confirm with the caller.
            thumb_url = meta['thumb_url']
            upload_time = meta['upload_time']
            category = meta['category']
            kw_id = meta.get('kw_id', 1)
            items = []

            #owner
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # The href is extracted separately from the id, so it can be
                # missing; without this guard an IndexError would discard the
                # whole episode in the catch-all handler below.
                if owner_url:
                    items.append(
                        Request(url=self.url_prefix + owner_url[0] + "/about",
                                callback=self.parse_about))

            #video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()

            #other info
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                ep_item['category'] = category
            if upload_time:
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                # Normalize once and fall back to '0' on a falsy result.
                pld = Util.normalize_played(played[0])
                ep_item['played'] = pld if pld else '0'

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            ep_item['kw_id'] = kw_id

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.groups()[0] if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.parse_other_info,
                        meta={'item': ep_item}))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 20
0
    def parse_episode(self, response):
        """Build an EpisodeItem from an iQiyi episode page.

        ``cat_id`` and ``thumb`` are read from the request meta. Title,
        category, upload time and tags are each scraped by trying a list of
        XPath queries in order and keeping the first non-empty result.

        Returns a one-element list: a ``Request`` to
        ``self.playlength_url + albumid`` (callback ``self.parse_playlength``)
        when an ``albumId`` is found in the page, otherwise the item itself.
        Exceptions are logged and the method then returns ``None``.
        """
        try:
            log.msg('parse_episode %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            thumb_url = response.request.meta['thumb']
            items = []

            def pick(*queries):
                # Evaluate the queries lazily, stopping at the first hit.
                texts = []
                for xp in queries:
                    texts = response.xpath(xp).extract()
                    if texts:
                        return texts
                return texts

            #show_id
            show_id = Util.get_iqiyi_showid(response.request.url)

            # The page may contain "albumId:326754200" or "albumId: 326754200".
            albumid = response.selector.re(re.compile(r'albumId: ?(\d+)'))

            #video info
            title = pick(
                '//div[@class="play-tit-l"]/h2/descendant-or-self::*/text()',
                '//div[@class="play-tit-l"]/h1/descendant-or-self::*/text()',
                '//div[@class="mod-play-t**s"]/h1/descendant-or-self::*/text()',
                '//div[@class="play-tit play-tit-oneRow play-tit-long"]/h1/descendant-or-self::*/text()',
            )
            category = pick(
                '//div[@class="crumb_bar"]/span[1]/span/a[2]/text()',
                '//div[@class="play-album-crumbs textOverflow"]/span[1]/a[2]/text()',
                '//div[@class="crumb_bar"]/span[1]/a[2]/text()',
                '//div[@class="mod-crumb_bar"]/span[1]/a[2]/text()',
            )
            upload_time = pick(
                '//div[@class="crumb_bar"]/span[3]/span/text()',
                '//div[@class="crumb_bar"]/span[2]/span/text()',
            )
            tag = pick(
                '//span[@id="widget-videotag"]/descendant::*/text()',
                '//span[@class="mod-tags_item vl-block"]/descendant::*/text()',
                '//div[@class="crumb_bar"]/span[2]/a/text()',
            )

            ep_item = EpisodeItem()

            # Optional fields are only set when something was scraped.
            if title:
                ep_item['title'] = "".join(piece.strip() for piece in title)
            if show_id:
                ep_item['show_id'] = show_id
            if tag:
                ep_item['tag'] = "|".join(piece.strip() for piece in tag)
            if upload_time:
                ep_item['upload_time'] = upload_time[0].strip()
            if category:
                ep_item['category'] = category[0].strip()
            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0].strip()

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['cat_id'] = cat_id

            if not albumid:
                items.append(ep_item)
            else:
                album = albumid[0]
                follow_up = Request(url=self.playlength_url + album,
                                    callback=self.parse_playlength,
                                    meta={'item': ep_item, 'albumid': album})
                items.append(follow_up)

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 21
0
    def parse_episode(self, response):
        """Build an EpisodeItem from a page's ``og:`` meta tags and player info.

        Title, category and description come from the ``og:*`` meta tags,
        with literal defaults when a tag is absent. The upload time is
        scraped from the player info block and falls back to
        ``response.meta['time']``. The play count defaults to ``"0"``.

        Returns a list with one element: the item itself when a play count
        was found on the page, otherwise a ``Request`` to
        ``self.url_num % video_id`` (callback ``self.parse_played``) that
        receives the item as its meta. If an exception occurs it is logged
        and whatever was collected so far (normally an empty list) is
        returned.
        """
        items = []
        try:
            log.msg(response.request.url, level=log.INFO)
            title = response.xpath(
                '//head/meta[@property="og:title"]/@content')
            title = title.extract()[0].strip() if title else ""
            category = response.xpath(
                '//head/meta[@property="og:category"]/@content')
            category = category.extract()[0].strip(
            ) if category else u"\u5a31\u4e50"
            description = response.xpath(
                '//head/meta[@property="og:description"]/@content')
            description = description.extract()[0].strip(
            ) if description else ""

            upload_time = response.xpath('//div[@class="playerinfo"]/p/text()')
            upload_time = upload_time.re(
                u'\u53d1\u5e03:(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})'
            ) if upload_time else ""
            upload_time = upload_time[0] if upload_time else ""
            # Fall back to the timestamp supplied with the request.
            upload_time = upload_time if upload_time else response.meta['time']

            play_num = response.xpath(
                '//div[@class="playerinfo"]/p/span[@id="numPlay"]/text()')
            play_num = play_num.re(
                u'\u64ad\u653e\u6570:(\d+)') if play_num else ""
            # When the page has no play count, it is requested separately.
            request_played = not play_num
            play_num = play_num[0] if play_num else "0"

            tags = response.xpath('//li[@class="vtags"]/a/text()')
            tags = tags.extract() if tags else []
            tag = '|'.join(tags)
            tag = tag if tag else u"\u5a31\u4e50"

            # The video id is the last URL path segment without its extension.
            video_id = response.request.url.split("/")[-1]
            video_id = video_id.split('.')[0]

            ep_item = EpisodeItem()
            ep_item['show_id'] = video_id.replace("-", "")
            ep_item['title'] = title
            ep_item['tag'] = tag
            ep_item['category'] = category
            ep_item['played'] = int(play_num)
            ep_item['upload_time'] = datetime.strptime(upload_time,
                                                       '%Y-%m-%d %H:%M:%S')
            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url
            ep_item['description'] = description

            if request_played:
                # The item itself is passed as the request meta.
                items.append(
                    Request(url=self.url_num % video_id,
                            callback=self.parse_played,
                            meta=ep_item))
            else:
                items.append(ep_item)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Without this return the collected items and requests were dropped.
        return items
Ejemplo n.º 22
0
    def video_parse(self, response):
        """Build an EpisodeItem from a YouTube watch page.

        ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` are optional
        request-meta values; each is copied onto the item only when truthy.

        Returns a list. It is empty when no show id can be derived from the
        URL. Otherwise it holds, when the owner link is present, a
        ``Request`` for the owner's ``/about`` page (callback
        ``self.video_about_parse``), followed by a ``Request`` to
        ``get_video_info`` (callback ``self.video_other_info_parse``)
        carrying the item in its meta. If an exception occurs it is logged
        and whatever was collected so far is returned.
        """
        items = []
        try:
            meta = response.request.meta
            kw_id = meta.get('kw_id')
            pg_id = meta.get('pg_id')
            cat_id = meta.get('cat_id')
            subject_id = meta.get('subject_id')

            show_id = Util.get_youtube_showid(response.request.url)
            if not show_id:
                return items

            #owner
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # The href is extracted separately from the id, so it can be
                # missing; guard it so one absent attribute does not abort
                # the whole episode with an IndexError.
                if owner_url:
                    items.append(
                        Request(url=self.youtube_url_prefix + owner_url[0] +
                                "/about",
                                callback=self.video_about_parse))

            #video info
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()
            category = response.xpath(
                '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()'
            ).extract()
            upload = response.xpath(
                '//meta[@itemprop="datePublished"]/@content').extract()
            # Thumbnail URL as published in the page markup.
            thumb_url = response.xpath(
                '//link[@itemprop="thumbnailUrl"]/@href').extract()
            #other info
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = show_id
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if description:
                ep_item['description'] = "\n".join(description)
            if played:
                # Normalize once and fall back to '0' on a falsy result.
                pld = Util.normalize_played(played[0])
                ep_item['played'] = pld if pld else '0'

            if kw_id:
                ep_item['kw_id'] = kw_id
            if pg_id:
                ep_item['pg_id'] = pg_id
            if cat_id:
                ep_item['cat_id'] = cat_id
            if subject_id:
                ep_item['subject_id'] = subject_id

            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if category:
                # e.g. https://www.youtube.com/watch?v=lwy4qwaByVQ
                ep_item['category'] = category[0].strip().replace('&', '|')
            if upload:
                upload_text = upload[0].strip()
                struct_time = None
                # time.strptime raises ValueError on a mismatch rather than
                # returning a falsy value, so each format is tried in turn.
                for date_format in ('%b %d, %Y', '%Y年%m月%d日'):
                    try:
                        struct_time = time.strptime(upload_text, date_format)
                        break
                    except ValueError:
                        continue
                if struct_time:
                    ep_item['upload_time'] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', struct_time)

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.groups()[0] if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.video_other_info_parse,
                        meta={'item': ep_item}))
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Without this return the collected requests were dropped.
        return items
Ejemplo n.º 23
0
    def parse_episode(self, response):
        """Build an EpisodeItem from a video page for a given ``cat_id``.

        Returns ``None`` when the video owner is listed in
        ``self.channel_exclude``. Otherwise returns a list holding, when an
        owner link is present, a ``Request`` for the owner page (callback
        ``self.parse_owner``), followed by either a ``Request`` to the
        play-count URL found on the page (callback ``self.parse_vpaction``)
        or, when no such URL exists, the item itself. Exceptions are logged
        and the method then returns ``None``.
        """
        try:
            page_url = response.request.url
            log.msg('%s' % page_url)
            cat_id = response.request.meta['cat_id']
            items = []
            sel = Selector(response)

            #owner
            owner_links = sel.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner_links:
                owner_href = owner_links[0]
                owner_show_id = Util.get_owner(owner_href)
                # Videos from excluded channels are dropped entirely.
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return
                items.append(Request(url=owner_href, callback=self.parse_owner))

            #video info
            title_parts = sel.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            category = sel.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            js_blocks = sel.xpath('//script[@type="text/javascript"]')
            video_id = js_blocks.re('videoId = \'(\d+)\'')
            tag = js_blocks.re('tags="(.+)"')
            upload = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = sel.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(page_url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title_parts:
                # Trim surrounding newlines and whitespace before cleaning.
                raw_title = "".join(title_parts).strip("\n").strip()
                ep_item['title'] = Util.strip_title(raw_title)
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                delta = Util.get_upload_time(upload[0])
                if delta:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), delta)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = page_url
            ep_item['cat_id'] = cat_id

            if not vp_url:
                items.append(ep_item)
            else:
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))

            return items
        except Exception as e:
            log.msg(traceback.format_exc(), level=log.ERROR)
Ejemplo n.º 24
0
    def video_parse(self, response):
        """Build an EpisodeItem from a video page, honouring exclusion lists.

        ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` are optional
        request-meta values (``None`` when absent) and are always copied
        onto the item.

        Returns a list. It is empty when the page category is in
        ``self.category_exclude``, when the owner is in
        ``self.channel_exclude``, or when no show id can be derived from the
        URL. Otherwise it has one element: a ``Request`` to
        ``self.vpaction_url + video_id`` (callback ``self.vpaction_parse``,
        item under the ``episode_item`` meta key) when a video id was found,
        else the item itself. If an exception occurs it is logged and
        whatever was collected so far is returned.
        """
        items = []
        try:
            meta = response.request.meta
            kw_id = meta.get('kw_id')
            pg_id = meta.get('pg_id')
            cat_id = meta.get('cat_id')
            subject_id = meta.get('subject_id')

            #check video's category
            category_str = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            category = None
            if category_str:
                category = category_str[0].replace(u'频道', '')
            if category and category in self.category_exclude:
                log.msg("video category excluded: %s" % category)
                return items

            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return items

            #episode info
            show_id = Util.get_showid(response.request.url)

            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')

            # An episode without a show id is not emitted at all.
            if not show_id:
                return items

            episode_item = EpisodeItem()
            episode_item['show_id'] = show_id
            if video_id:
                episode_item['video_id'] = video_id[0]
            if owner_show_id:
                episode_item['owner_show_id'] = owner_show_id
            if title:
                episode_item['title'] = Util.strip_title("".join(title))
            if tag:
                episode_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                episode_item['category'] = category
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    episode_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                episode_item['description'] = description[0]

            episode_item['spider_id'] = self.spider_id
            episode_item['site_id'] = self.site_id
            episode_item['url'] = response.request.url

            episode_item['kw_id'] = kw_id
            episode_item['pg_id'] = pg_id
            episode_item['cat_id'] = cat_id
            episode_item['subject_id'] = subject_id

            if video_id:
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.vpaction_parse,
                            meta={'episode_item': episode_item}))
            else:
                items.append(episode_item)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Without this return the collected item or request was dropped.
        return items
Ejemplo n.º 25
0
    def parse_episode(self, response):
        """Build an EpisodeItem from a video page and queue recommendations.

        ``recommend`` is read from the request meta. When it is truthy, the
        page category is also checked against ``self.cat_exclude``.

        Returns ``None`` when the owner is in ``self.channel_exclude`` or the
        category check rejects the video. Otherwise returns a list holding
        either a ``Request`` to ``self.vpaction_url + video_id`` (callback
        ``self.parse_vpaction``) or, when no video id was found, the item
        itself. When ``recommend`` is falsy and a video id is known, a
        ``Request`` to ``self.ykrec_url + video_id`` (callback
        ``self.parse_recommendation``) is appended as well. Exceptions are
        logged and the method then returns ``None``.
        """
        try:
            recommend = response.request.meta['recommend']
            log.msg('%s|recommend: %s' % (response.request.url, recommend))
            items = []

            #owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return

            #check recommended video's category
            category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            cat = None
            if category:
                cat = category[0].replace(u'频道', '')
            if recommend and cat and cat in self.cat_exclude:
                log.msg("video category excluded: %s" % cat)
                return

            #video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            video_id = scripts.re('videoId = \'(\d+)\'')
            tag = scripts.re('tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(response.request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if cat:
                ep_item['category'] = cat
            if upload:
                t = Util.get_upload_time(upload[0])
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = response.request.url

            if video_id:
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            #recommendation
            # The recommendation URL needs a video id. Without this guard a
            # page lacking one raised IndexError and the episode item queued
            # above was lost in the catch-all handler.
            if not recommend and video_id:
                items.append(
                    Request(url=self.ykrec_url + video_id[0],
                            callback=self.parse_recommendation))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)