def parse_owner(self, response):
        """Scrape one owner page into a ``UserItem``.

        The owner ids are read from the ``<head>`` scripts; the profile,
        intro and video-count sections are read when they are present.

        Returns a one-element list holding the item, or ``None`` when no
        ``ownerEncodeid`` match is found or when any exception is raised
        (the traceback is logged at ERROR level).
        """
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            owner = UserItem()

            # owner id
            head_scripts = response.xpath('/html/head/script')
            id_hits = head_scripts.re('ownerId = \"(\d+)\"')
            encoded_hits = head_scripts.re('ownerEncodeid = \'(.+)\'')
            if id_hits:
                owner['owner_id'] = id_hits[0]
            if not encoded_hits:
                # Nothing is emitted for a page without an encoded owner id.
                return
            owner['show_id'] = encoded_hits[0]

            # user profile
            profile = response.xpath('//div[@class="profile"]')
            if profile:
                names = profile.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                play_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fan_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if names:
                    owner['user_name'] = names[0]
                if play_counts:
                    cleaned = Util.normalize_vp(play_counts[0])
                    owner['played'] = Util.normalize_played(cleaned)
                if fan_counts:
                    owner['fans'] = Util.normalize_vp(fan_counts[0])

            # youku profile
            yk_profile = response.xpath('//div[@class="YK-profile"]')
            if yk_profile:
                intro_parts = yk_profile.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()
                if intro_parts:
                    owner['intro'] = ''.join(intro_parts)

            # count: stays None unless the home section yields a "(N)" match
            yk_home = response.xpath('//div[@class="YK-home"]')
            count_hits = []
            if yk_home:
                count_hits = yk_home.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()'
                ).re(u'\((\d+)\)')
            owner['vcount'] = count_hits[0] if count_hits else None

            owner['spider_id'] = self.spider_id
            owner['site_id'] = self.site_id
            owner['url'] = response.request.url

            return [owner]
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Beispiel #2
0
    def parse_page(self, response):
        """Turn every video entry of a listing page into an episode request.

        For each ``<li>`` under ``div.mod_cont`` the title, thumbnail,
        duration and play count are collected, merged with the ``cat``
        value carried in the request meta, and attached to a new
        ``Request`` for ``self.parse_episode`` under the ``'order'`` key.

        Returns the list of requests, or ``None`` when any exception is
        raised (the traceback is logged at ERROR level).
        """
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            # NOTE(review): `cat` is assumed to be a mapping of extra fields
            # (like the `order` dict used by similar callbacks) -- TODO confirm
            # against the request that schedules this callback.
            cat = response.request.meta['cat']
            items = []

            qq_v = response.xpath('//div[@class="mod_cont"]/ul/li')
            for v in qq_v:
                urls = v.xpath('./h6/a/@href').extract()
                # NOTE(review): '@text' selects an attribute named "text";
                # if the anchor's text node was intended this should be
                # 'text()' -- verify against the page markup.
                titles = v.xpath('./h6/a/@text').extract()
                thumb_urls = v.xpath('./a/img/@src').extract()
                durations = v.xpath(
                    './a/div/span[@class="mod_version"]/text()').extract()
                playeds = v.xpath('./p/span/text()').extract()

                title = titles[0] if titles else None
                thumb_url = thumb_urls[0] if thumb_urls else None
                duration = Util.get_qq_duration(
                    durations[0]) if durations else None
                played = Util.normalize_played(Util.normalize_vp(
                    playeds[0])) if playeds else None
                if urls:
                    r = Request(url=urls[0], callback=self.parse_episode)
                    d = {
                        'title': title,
                        'thumb_url': thumb_url,
                        'duration': duration,
                        'played': played
                    }
                    # Merge the value read from meta; the name `order` that
                    # was referenced here is not defined in this function
                    # and raised NameError on the first entry with a URL.
                    d.update(cat)
                    r.meta.update({'order': d})
                    items.append(r)
            return items
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
    def parse_vpaction(self, response):
        """Record the play count on the carried item and request its play length.

        Reads the item from ``response.request.meta['item']``, stores the
        normalised play count under ``'played'`` when the page provides
        one, and returns a ``Request`` for
        ``self.playlength_url + item['show_id']`` handled by
        ``self.parse_playlength`` with the item passed along in meta.

        Returns ``None`` when any exception is raised (the traceback is
        logged at ERROR level).
        """
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                # Strip the label so only the number text is normalised.
                item['played'] = Util.normalize_played(Util.normalize_vp(vp[0].replace('总播放:', '')))

            show_id = item['show_id']
            return Request(url=self.playlength_url + show_id,
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception:
            # Use the same stdlib logger as the INFO line above rather than
            # the legacy scrapy `log.msg` API.
            logging.log(logging.ERROR, traceback.format_exc())
Beispiel #4
0
 def vpaction_parse(self, response):
     """Record the play count on the episode item and queue a play-length request.

     Reads the item from ``response.request.meta['episode_item']``, stores
     the normalised play count under ``'played'`` when the page provides
     one, and, when the item has a truthy ``'show_id'``, appends a
     ``Request`` for ``self.playlength_url + show_id`` handled by
     ``self.playlength_parse``.

     Returns the list of queued requests. The list is empty when nothing
     was queued or when an exception was raised before queuing (the
     traceback is logged at ERROR level).
     """
     items = []
     try:
         episode_item = response.request.meta['episode_item']
         vp = response.xpath('//div[@id="videodetailInfo"]/ul/li').re(
             u'<label>总播放数:</label><span.*>(.+)</span>')
         if vp:
             episode_item['played'] = Util.normalize_vp(vp[0])
         show_id = episode_item['show_id']
         if show_id:
             items.append(
                 Request(url=self.playlength_url + show_id,
                         callback=self.playlength_parse,
                         meta={'episode_item': episode_item}))
     except Exception:
         # `except Exception, e` is Python-2-only syntax; the bound name was
         # unused, so it is dropped.
         log.msg(traceback.format_exc(), level=log.ERROR)
     # Hand the queued requests back; without this return they were
     # silently discarded.
     return items
Beispiel #5
0
    def parse_vpaction(self, response):
        """Store the play count on the carried item and chain to the play-length page.

        The item comes from ``response.request.meta['item']``. When the
        detail list yields a play-count match it is normalised into
        ``item['played']``. The returned ``Request`` targets
        ``self.playlength_url + item['show_id']`` with
        ``self.parse_playlength`` as callback and the item in meta.

        Returns ``None`` when any exception is raised (the traceback is
        logged at ERROR level).
        """
        try:
            episode = response.request.meta['item']

            detail_rows = response.xpath('//div[@id="videodetailInfo"]/ul/li')
            play_hits = detail_rows.re(
                u'<label>总播放数:</label><span.*>(.+)</span>')
            if play_hits:
                episode['played'] = Util.normalize_vp(play_hits[0])

            show_id = episode['show_id']
            follow_up = Request(
                url=self.playlength_url + show_id,
                callback=self.parse_playlength,
                meta={'item': episode})
            return follow_up
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Beispiel #6
0
    def parse_page(self, response):
        """Turn every video entry of a listing page into an episode request.

        For each ``li.list_item`` under ``ul#videolst_cont`` the title,
        thumbnail, duration, play count and upload time are collected,
        merged with the ``order`` mapping from the request meta, and
        attached to a new ``Request`` for ``self.parse_episode`` under
        the ``'order'`` meta key. Entries without a link are skipped.

        Returns the list of requests, or ``None`` when any exception is
        raised (the traceback is logged at ERROR level).
        """
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            order = response.request.meta['order']
            requests = []

            entries = response.xpath(
                '//ul[@id="videolst_cont"]/li[@class="list_item"]')
            for entry in entries:
                links = entry.xpath('./strong/a/@href').extract()
                title_texts = entry.xpath('./strong/a/text()').extract()
                thumbs = entry.xpath('./a/img/@src').extract()
                duration_texts = entry.xpath('./a/span/em/text()').extract()
                played_texts = entry.xpath(
                    './div/span[@class="figure_info_play"]/span/text()'
                ).extract()
                upload_texts = entry.xpath(
                    './div/span[@class="figure_info_time"]/text()').extract()

                # Every field falls back to None when its node is missing.
                title = None
                if title_texts:
                    title = title_texts[0]
                thumb_url = None
                if thumbs:
                    thumb_url = thumbs[0]
                duration = None
                if duration_texts:
                    duration = Util.get_qq_duration(duration_texts[0])
                played = None
                if played_texts:
                    played = Util.normalize_played(
                        Util.normalize_vp(played_texts[0]))
                upload_time = None
                if upload_texts:
                    upload_time = Util.get_qq_upload_time(upload_texts[0])

                if not links:
                    continue

                episode_request = Request(url=links[0],
                                          callback=self.parse_episode)
                details = dict(title=title,
                               thumb_url=thumb_url,
                               duration=duration,
                               played=played,
                               upload_time=upload_time)
                # Values from the incoming order override the scraped ones.
                details.update(order)
                episode_request.meta['order'] = details
                requests.append(episode_request)
            return requests
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
    def parse_vpaction(self, response):
        """Store the play count on the carried item and chain to the play-length page.

        The item comes from ``response.request.meta['item']``. When the
        ``player_info`` list has a ``sum`` entry, its text (with the label
        removed) is normalised into ``item['played']``. The returned
        ``Request`` targets ``self.playlength_url + item['show_id']`` with
        ``self.parse_playlength`` as callback and the item in meta.

        Returns ``None`` when any exception is raised (the traceback is
        logged at ERROR level).
        """
        try:
            carried = response.request.meta['item']
            page = Selector(response)

            totals = page.xpath(
                '//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if totals:
                # Drop the label prefix before normalising the number text.
                count_text = totals[0].replace('总播放:', '')
                carried['played'] = Util.normalize_played(
                    Util.normalize_vp(count_text))

            show_id = carried['show_id']
            next_request = Request(url=self.playlength_url + show_id,
                                   callback=self.parse_playlength,
                                   meta={'item': carried})
            return next_request
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Beispiel #8
0
    def parse(self, response):
        """Scrape an owner page into a ``UserItem`` and queue its videos page.

        The owner ids are read from the ``<head>`` scripts; the profile,
        intro and video-count sections are read when they are present
        (the count defaults to ``'0'``).

        Returns a two-element list: the item, then a ``Request`` for
        ``response.request.url + "videos"`` handled by
        ``self.parse_video_page`` with ``page`` 1 and the incoming
        ``cust_para`` in meta. Returns ``None`` when no ``ownerEncodeid``
        match is found or when any exception is raised (the traceback is
        logged at ERROR level).
        """
        try:
            log.msg(response.request.url, level=log.INFO)
            cust_para = response.request.meta['cust_para']

            owner = UserItem()

            # owner id
            head_scripts = response.xpath('/html/head/script')
            id_hits = head_scripts.re('ownerId = \"(\d+)\"')
            encoded_hits = head_scripts.re('ownerEncodeid = \'(.+)\'')
            if id_hits:
                owner['owner_id'] = id_hits[0]
            if not encoded_hits:
                # Nothing is emitted for a page without an encoded owner id.
                return
            owner['show_id'] = encoded_hits[0]

            # user profile
            profile = response.xpath('//div[@class="profile"]')
            if profile:
                names = profile.xpath(
                    './div[@class="info"]/div[@class="username"]/a[1]/@title'
                ).extract()
                play_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="vnum"]/em/text()'
                ).extract()
                fan_counts = profile.xpath(
                    './div[@class="state"]/ul/li[@class="snum"]/em/text()'
                ).extract()

                if names:
                    owner['user_name'] = names[0]
                if play_counts:
                    owner['played'] = Util.normalize_vp(play_counts[0])
                if fan_counts:
                    owner['fans'] = Util.normalize_vp(fan_counts[0])

            # youku profile
            yk_profile = response.xpath('//div[@class="YK-profile"]')
            if yk_profile:
                intro_parts = yk_profile.xpath(
                    './div[@class="userintro"]/div[@class="desc"]/p[2]/text()'
                ).extract()
                if intro_parts:
                    owner['intro'] = ''.join(intro_parts)

            # count: stays '0' unless the home section yields a "(N)" match
            yk_home = response.xpath('//div[@class="YK-home"]')
            count_hits = []
            if yk_home:
                count_hits = yk_home.xpath(
                    'div[1]/div/div/div/div[@class="title"]/span/a/text()').re(
                        u'\((\d+)\)')
            owner['vcount'] = count_hits[0] if count_hits else '0'

            owner['spider_id'] = self.spider_id
            owner['site_id'] = self.site_id

            # videos
            videos_request = Request(url=response.request.url + "videos",
                                     callback=self.parse_video_page,
                                     meta={'page': 1, 'cust_para': cust_para})

            return [owner, videos_request]

        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)