Example #1
0
    def parse_episode_youku(self, response):
        """Extract one episode's metadata from a Youku video page.

        ``response.request.meta`` must carry ``pg_id``, ``cat_name``,
        ``site_id``, ``audit`` and ``priority``; they are copied onto the
        item unchanged (``cat_name`` is stored under ``category``).

        Args:
            response: Object exposing ``xpath()`` and ``request`` (presumably
                a Scrapy response -- confirm against the caller).

        Returns:
            A one-element list. If the page has an ``href`` two levels above
            ``span#videoTotalPV``, the element is a ``Request`` to that URL
            with callback ``self.parse_vpaction`` and the item stored in
            ``meta['item']``; otherwise it is the ``EpisodeItem`` itself.
            ``None`` if any exception is raised: the traceback is logged at
            ERROR level and the error is swallowed.
        """
        try:
            request = response.request
            logging.log(logging.INFO,
                        "episode_youku:%s" % request.url)
            meta = request.meta
            pg_id = meta['pg_id']
            cat_name = meta['cat_name']
            site_id = meta['site_id']
            audit = meta['audit']
            priority = meta['priority']

            # owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = Util.get_owner(owner[0]) if owner else None

            # video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escapes such as "\d" through unchanged.
            video_id = scripts.re(r"videoId = '(\d+)'")
            tag = scripts.re(r'tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = response.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                # The <h1> text arrives as several nodes; join them and trim
                # the surrounding whitespace before normalising the title.
                ep_item['title'] = Util.strip_title("".join(title).strip())
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            # The category comes from the request meta, not from the page.
            ep_item['category'] = cat_name
            if upload:
                upload_time = Util.get_upload_time(upload[0])
                if upload_time:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), upload_time)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = site_id
            ep_item['url'] = request.url
            ep_item['pg_id'] = pg_id
            ep_item['audit'] = audit
            ep_item['format_id'] = self.format_id
            ep_item['priority'] = priority

            if vp_url:
                # Defer emitting the item: parse_vpaction receives it through
                # the request meta.
                return [Request(url=vp_url[0],
                                callback=self.parse_vpaction,
                                meta={'item': ep_item})]
            return [ep_item]
        except Exception:
            # Best-effort callback: log the failure and emit nothing.
            logging.log(logging.ERROR, traceback.format_exc())
Example #2
0
    def parse_episode(self, response):
        """Extract one episode's metadata from a video page.

        ``response.request.meta['cust_para']`` is required. It is checked for
        three optional keys:

        * ``need_check`` -- when present and a title was found, the title is
          passed to ``self.content_is_forbidden``; a forbidden video is
          dropped.
        * ``category`` -- takes precedence over the category scraped from
          the page.
        * ``priority`` -- copied onto the item.

        Args:
            response: Object exposing ``xpath()`` and ``request`` (presumably
                a Scrapy response -- confirm against the caller).

        Returns:
            An empty list when the title is reported as forbidden. Otherwise
            a one-element list: a ``Request`` to
            ``self.vpaction_url + video_id`` (callback
            ``self.parse_vpaction``, item in ``meta['item']``) when a video
            id was found in the page scripts, else the ``EpisodeItem``
            itself. ``None`` if any exception is raised; the traceback is
            logged and the error is swallowed.
        """
        try:
            request = response.request
            cust_para = request.meta['cust_para']
            log.msg('%s: %s' % (request.url, cust_para))
            items = []

            # owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = Util.get_owner(owner[0]) if owner else None

            # video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escapes such as "\d" through unchanged.
            video_id = scripts.re(r"videoId = '(\d+)'")
            tag = scripts.re(r'tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
                # The blacklist check only runs when the caller asked for it;
                # a rejected video yields the (still empty) result list.
                if ('need_check' in cust_para
                        and self.content_is_forbidden(ep_item['title'])):
                    log.msg('video [ %s ] is in blacklist!' %
                            ep_item['show_id'])
                    return items

            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if 'category' in cust_para:
                ep_item['category'] = cust_para['category']
            elif category:
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                upload_time = Util.get_upload_time(upload[0])
                if upload_time:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), upload_time)
            if description:
                ep_item['description'] = description[0]

            if 'priority' in cust_para:
                ep_item['priority'] = cust_para['priority']

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = request.url

            if video_id:
                # Defer emitting the item: parse_vpaction receives it through
                # the request meta.
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception:
            # Best-effort callback: log the failure and emit nothing.
            log.msg(traceback.format_exc(), level=log.ERROR)
Example #3
0
    def parse_episode(self, response):
        """Extract one episode's metadata and queue a crawl of its owner page.

        ``response.request.meta['cat_id']`` is required and is copied onto
        the item.

        Args:
            response: Object wrapped in ``Selector`` for XPath queries and
                exposing ``request`` (presumably a Scrapy response --
                confirm against the caller).

        Returns:
            A list with, in order: a ``Request`` for the owner link (callback
            ``self.parse_owner``) when such a link is present, followed by
            either a ``Request`` to the ``href`` found two levels above
            ``span#videoTotalPV`` (callback ``self.parse_vpaction``, item in
            ``meta['item']``) or, when that link is absent, the
            ``EpisodeItem`` itself. ``None`` when the owner id is in
            ``self.channel_exclude`` or when any exception is raised (the
            traceback is logged and the error is swallowed).
        """
        try:
            request = response.request
            log.msg('%s' % request.url)
            cat_id = request.meta['cat_id']
            items = []
            sel = Selector(response)

            # owner
            owner = sel.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return
                items.append(Request(url=owner[0], callback=self.parse_owner))

            # video info
            title = sel.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::text()'
            ).extract()
            category = sel.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            scripts = sel.xpath('//script[@type="text/javascript"]')
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escapes such as "\d" through unchanged.
            video_id = scripts.re(r"videoId = '(\d+)'")
            tag = scripts.re(r'tags="(.+)"')
            upload = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = sel.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()
            vp_url = sel.xpath(
                '//span[@id="videoTotalPV"]/../../@href').extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                # The <h1> text arrives as several nodes; join them and trim
                # the surrounding whitespace before normalising the title.
                ep_item['title'] = Util.strip_title("".join(title).strip())
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                ep_item['category'] = category[0].replace(u'频道', '')
            if upload:
                upload_time = Util.get_upload_time(upload[0])
                if upload_time:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), upload_time)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = request.url
            ep_item['cat_id'] = cat_id

            if vp_url:
                # Defer emitting the item: parse_vpaction receives it through
                # the request meta.
                items.append(
                    Request(url=vp_url[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            return items
        except Exception:
            # Best-effort callback: log the failure and emit nothing.
            log.msg(traceback.format_exc(), level=log.ERROR)
Example #4
0
    def video_parse(self, response):
        """Extract one episode's metadata, honouring the exclusion lists.

        The optional ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id``
        entries of ``response.request.meta`` are copied onto the item
        (``None`` when absent).

        Args:
            response: Object exposing ``xpath()`` and ``request`` (presumably
                a Scrapy response -- confirm against the caller).

        Returns:
            ``None`` when the page category is in ``self.category_exclude``,
            the owner id is in ``self.channel_exclude``, or
            ``Util.get_showid`` yields a falsy show id. Otherwise a list
            holding either a ``Request`` to ``self.vpaction_url + video_id``
            (callback ``self.vpaction_parse``, item in
            ``meta['episode_item']``) or, when no video id was found, the
            ``EpisodeItem`` itself. If an exception is raised the traceback
            is logged and whatever was collected so far (normally an empty
            list) is returned.
        """
        items = []
        try:
            request = response.request
            meta = request.meta
            kw_id = meta.get('kw_id')
            pg_id = meta.get('pg_id')
            cat_id = meta.get('cat_id')
            subject_id = meta.get('subject_id')

            # check video's category
            category_str = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            category = None
            if category_str:
                category = category_str[0].replace(u'频道', '')
            if category and category in self.category_exclude:
                log.msg("video category excluded: %s" % category)
                return

            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return

            # episode info
            show_id = Util.get_showid(request.url)

            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            scripts = response.xpath('//script[@type="text/javascript"]')
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escapes such as "\d" through unchanged.
            video_id = scripts.re(r"videoId = '(\d+)'")
            tag = scripts.re(r'tags="(.+)"')

            episode_item = EpisodeItem()

            # An episode without a show id is not emitted at all.
            if not show_id:
                return
            episode_item['show_id'] = show_id
            if video_id:
                episode_item['video_id'] = video_id[0]
            if owner_show_id:
                episode_item['owner_show_id'] = owner_show_id
            if title:
                episode_item['title'] = Util.strip_title("".join(title))
            if tag:
                episode_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if category:
                episode_item['category'] = category
            if upload:
                upload_time = Util.get_upload_time(upload[0])
                if upload_time:
                    episode_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), upload_time)
            if description:
                episode_item['description'] = description[0]

            episode_item['spider_id'] = self.spider_id
            episode_item['site_id'] = self.site_id
            episode_item['url'] = request.url

            episode_item['kw_id'] = kw_id
            episode_item['pg_id'] = pg_id
            episode_item['cat_id'] = cat_id
            episode_item['subject_id'] = subject_id

            if video_id:
                # Defer emitting the item: vpaction_parse receives it through
                # the request meta.
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.vpaction_parse,
                            meta={'episode_item': episode_item}))
            else:
                items.append(episode_item)
        except Exception:
            # Best-effort callback: log the failure and keep going.
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Previously the collected results were never handed back, so the
        # item / follow-up request built above was silently discarded.
        return items
    def parse_episode(self, response):
        """Extract one episode's metadata and optionally queue recommendations.

        ``response.request.meta['recommend']`` is required. When it is truthy
        the page category is checked against ``self.cat_exclude``; when it is
        falsy a follow-up request for the recommendation list is queued.

        Args:
            response: Object exposing ``xpath()`` and ``request`` (presumably
                a Scrapy response -- confirm against the caller).

        Returns:
            ``None`` when the owner id is in ``self.channel_exclude``, when
            ``recommend`` is truthy and the category is in
            ``self.cat_exclude``, or when any exception is raised (the
            traceback is logged and the error is swallowed). Otherwise a
            list holding either a ``Request`` to
            ``self.vpaction_url + video_id`` (callback
            ``self.parse_vpaction``, item in ``meta['item']``) or, when no
            video id was found, the ``EpisodeItem`` itself. If ``recommend``
            is falsy and a video id was found, a ``Request`` to
            ``self.ykrec_url + video_id`` (callback
            ``self.parse_recommendation``) follows it.
        """
        try:
            request = response.request
            recommend = request.meta['recommend']
            log.msg('%s|recommend: %s' % (request.url, recommend))
            items = []

            # owner
            owner = response.xpath(
                '//div[@class="yk-userinfo"]/div[@class="user-name"]/a/@href'
            ).extract()
            owner_show_id = None
            if owner:
                owner_show_id = Util.get_owner(owner[0])
                if owner_show_id in self.channel_exclude:
                    log.msg("video owner excluded: %s" % owner_show_id)
                    return

            # check recommended video's category
            category = response.xpath(
                '//div[@class="base_info"]/div[@class="guide"]/div/a/text()'
            ).extract()
            cat = category[0].replace(u'频道', '') if category else None
            if recommend and cat and cat in self.cat_exclude:
                log.msg("video category excluded: %s" % cat)
                return

            # video info
            title = response.xpath(
                '//div[@class="base_info"]/h1/descendant-or-self::*/text()'
            ).extract()
            scripts = response.xpath('//script[@type="text/javascript"]')
            # Raw strings keep the regex escapes intact without relying on
            # Python passing unknown escapes such as "\d" through unchanged.
            video_id = scripts.re(r"videoId = '(\d+)'")
            tag = scripts.re(r'tags="(.+)"')
            upload = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@class="time"]/text()'
            ).extract()
            description = response.xpath(
                '//div[@class="yk-videoinfo"]/div[@id="text_long"]/text()'
            ).extract()

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_showid(request.url)
            if video_id:
                ep_item['video_id'] = video_id[0]
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = Util.strip_title("".join(title))
            if tag:
                ep_item['tag'] = Util.unquote(tag[0]).rstrip('|')
            if cat:
                ep_item['category'] = cat
            if upload:
                upload_time = Util.get_upload_time(upload[0])
                if upload_time:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), upload_time)
            if description:
                ep_item['description'] = description[0]

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = request.url

            if video_id:
                # Defer emitting the item: parse_vpaction receives it through
                # the request meta.
                items.append(
                    Request(url=self.vpaction_url + video_id[0],
                            callback=self.parse_vpaction,
                            meta={'item': ep_item}))
            else:
                items.append(ep_item)

            # recommendation
            # Guard on video_id: without one the URL cannot be built, and
            # indexing the empty list used to raise IndexError, which the
            # handler below swallowed -- discarding the item appended above.
            if not recommend and video_id:
                items.append(
                    Request(url=self.ykrec_url + video_id[0],
                            callback=self.parse_recommendation))

            return items
        except Exception:
            # Best-effort callback: log the failure and emit nothing.
            log.msg(traceback.format_exc(), level=log.ERROR)