Exemple #1
0
    def video_about_parse(self, response):
        """Build a ``UserItem`` from a channel "about" page.

        The channel id is read from the ``channelId`` meta tag; without it no
        item is produced.  Name, subscriber count, view count and description
        are optional and only set when present in the page.

        Args:
            response: the response for the "about" page.

        Returns:
            list: a one-element list holding the ``UserItem``, or an empty
            list when the channel id is missing or parsing raised (the
            traceback is logged).
        """
        items = []
        try:
            show_id = response.xpath(
                '//meta[@itemprop="channelId"]/@content').extract()
            if not show_id:
                return items

            user_name = response.xpath(
                '//span[@class="qualified-channel-title-text"]/a/text()'
            ).extract()
            stats = response.xpath('//ul[@class="about-stats"]/li')
            fans = stats.re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*subscribers.*</li>',
                           re.S))
            played = stats.re(
                re.compile(r'<li.*>.*<b>([\d|,]*)</b>.*views.*</li>', re.S))
            intro = response.xpath(
                '//div[@class="about-description branded-page-box-padding"]/descendant-or-self::*/text()'
            ).extract()

            user_item = UserItem()
            user_item['show_id'] = show_id[0]
            if user_name:
                user_item['user_name'] = user_name[0]
            if fans:
                user_item['fans'] = Util.normalize_played(fans[0])
            if played:
                user_item['played'] = Util.normalize_played(played[0])
            if intro:
                user_item['intro'] = "".join(intro).strip()
            user_item['spider_id'] = self.spider_id
            user_item['site_id'] = self.site_id

            # Only drop the "/about" suffix when it is really there; blind
            # slicing would corrupt a URL that ends differently.
            url = response.request.url
            suffix = '/about'
            if url.endswith(suffix):
                url = url[:-len(suffix)]
            user_item['url'] = url

            items.append(user_item)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # The original fell off the end here, discarding the built item.
        return items
    def parse(self, response):
        """Turn a video list page into one episode request per listed video.

        ``audit``, ``cat_name``, ``kw_id`` and ``priority`` are read from the
        incoming request meta and forwarded, together with the thumbnail URL
        and play count scraped for each entry, to ``parse_episode``.

        Returns:
            list of ``Request`` objects, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            logging.log(logging.INFO, "parse:%s" % response.request.url)
            incoming = response.request.meta
            audit = incoming['audit']
            cat_name = incoming['cat_name']
            kw_id = incoming['kw_id']
            priority = incoming['priority']

            requests = []

            # One node per video entry in the list.
            for node in response.xpath('//div[@class="sk-vlist clearfix"]/div[@class="v"]'):
                hrefs = node.xpath('./div[@class="v-meta va"]/div[@class="v-meta-title"]/a/@href').extract()

                thumbs = node.xpath('./div[@class="v-link"]/a/@href').extract()
                thumb_url = thumbs[0] if thumbs else None
                # This bare host value is treated as "no thumbnail".
                if thumb_url == 'http://g1.ykimg.com/':
                    thumb_url = None

                counts = node.xpath('./div[@class="v-meta va"]/div[@class="v-meta-entry"]/div/label[text()="%s"]/../span/text()' % u'播放: ').extract()
                played = int(Util.normalize_played(counts[0])) if counts else None

                if not hrefs:
                    continue
                forwarded = {
                    'audit': audit,
                    'thumb_url': thumb_url,
                    'played': played,
                    'cat_name': cat_name,
                    'kw_id': kw_id,
                    'priority': priority,
                }
                requests.append(Request(url=hrefs[0], callback=self.parse_episode, meta=forwarded))
            return requests
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemple #3
0
    def parse_page(self, response):
        """Turn a listing page into one episode request per video entry.

        For every ``<li>`` of the listing the link, title, thumbnail, duration
        and play count are scraped; entries without a link are skipped.  The
        scraped values are merged with the incoming ``order`` mapping (when
        one is present in the request meta) and passed to ``parse_episode``
        under the ``order`` meta key.

        Returns:
            list of ``Request`` objects, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            meta = response.request.meta
            cat = meta['cat']
            # NOTE(review): the original merged an undefined name `order`
            # into the per-video dict, so the resulting NameError (swallowed
            # by the handler below) made any page with a link return nothing.
            # It is now taken from the request meta when available -- confirm
            # this matches what the request producer sends.
            order = meta.get('order') or {}
            items = []

            for v in response.xpath('//div[@class="mod_cont"]/ul/li'):
                urls = v.xpath('./h6/a/@href').extract()
                if not urls:
                    continue

                # `text()` selects the link text; the original `@text`
                # selected an attribute named "text" instead.
                titles = v.xpath('./h6/a/text()').extract()
                thumb_urls = v.xpath('./a/img/@src').extract()
                durations = v.xpath(
                    './a/div/span[@class="mod_version"]/text()').extract()
                playeds = v.xpath('./p/span/text()').extract()

                d = {
                    'title': titles[0] if titles else None,
                    'thumb_url': thumb_urls[0] if thumb_urls else None,
                    'duration': Util.get_qq_duration(
                        durations[0]) if durations else None,
                    'played': Util.normalize_played(Util.normalize_vp(
                        playeds[0])) if playeds else None,
                }
                d.update(order)

                r = Request(url=urls[0], callback=self.parse_episode)
                # NOTE(review): `cat` was read but never used; it is forwarded
                # here so the next callback can see it -- confirm it is wanted.
                r.meta.update({'order': d, 'cat': cat})
                items.append(r)
            return items
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
    def parse_owner(self, response):
        """Extract an uploader profile from an owner page.

        The owner ids are pulled out of the inline scripts in ``<head>``.
        Name, play count, fan count, introduction and video count are added
        when the corresponding page sections exist.

        Returns:
            A one-element list with the populated ``UserItem``; ``None`` when
            no encoded owner id is found or when an exception was raised (its
            traceback is logged).
        """
        try:
            logging.log(logging.INFO, "owner:%s" % response.request.url)
            profile_item = UserItem()

            # Owner identifiers are embedded in the head scripts.
            head_scripts = response.xpath('/html/head/script')
            owner_ids = head_scripts.re('ownerId = \"(\d+)\"')
            encoded_ids = head_scripts.re('ownerEncodeid = \'(.+)\'')
            if owner_ids:
                profile_item['owner_id'] = owner_ids[0]
            if not encoded_ids:
                # Without the encoded id nothing is emitted.
                return
            profile_item['show_id'] = encoded_ids[0]

            # Name and counters from the "profile" box.
            profile_box = response.xpath('//div[@class="profile"]')
            if profile_box:
                names = profile_box.xpath('./div[@class="info"]/div[@class="username"]/a[1]/@title').extract()
                play_counts = profile_box.xpath('./div[@class="state"]/ul/li[@class="vnum"]/em/text()').extract()
                fan_counts = profile_box.xpath('./div[@class="state"]/ul/li[@class="snum"]/em/text()').extract()

                if names:
                    profile_item['user_name'] = names[0]
                if play_counts:
                    profile_item['played'] = Util.normalize_played(Util.normalize_vp(play_counts[0]))
                if fan_counts:
                    profile_item['fans'] = Util.normalize_vp(fan_counts[0])

            # Free-text introduction from the "YK-profile" box.
            intro_box = response.xpath('//div[@class="YK-profile"]')
            if intro_box:
                intro_parts = intro_box.xpath('./div[@class="userintro"]/div[@class="desc"]/p[2]/text()').extract()
                if intro_parts:
                    profile_item['intro'] = ''.join(intro_parts)

            # Video count shown in parentheses in the "YK-home" title link.
            total_videos = None
            home_box = response.xpath('//div[@class="YK-home"]')
            if home_box:
                matches = home_box.xpath('div[1]/div/div/div/div[@class="title"]/span/a/text()').re(u'\((\d+)\)')
                if matches:
                    total_videos = matches[0]

            profile_item['vcount'] = total_videos
            profile_item['spider_id'] = self.spider_id
            profile_item['site_id'] = self.site_id
            profile_item['url'] = response.request.url

            return [profile_item]
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
    def parse_vpaction(self, response):
        """Record the total play count on the item, then request its length.

        The item carried in the request meta gets its ``played`` field from
        the player info list when that element is present.  A follow-up
        request to ``self.playlength_url + show_id`` is then returned, handled
        by ``parse_playlength`` and carrying the same item.

        Returns:
            The follow-up ``Request``, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            logging.log(logging.INFO, 'vpaction:%s' % response.request.url)
            item = response.request.meta['item']

            vp = response.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()').extract()
            if vp:
                # Unicode literal: the extracted text is unicode, and a
                # non-ASCII byte string here would force an implicit ASCII
                # decode under Python 2.
                count_text = vp[0].replace(u'总播放:', u'')
                item['played'] = Util.normalize_played(Util.normalize_vp(count_text))

            # Keep the item and the follow-up request under separate names.
            return Request(url=self.playlength_url + item['show_id'],
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
    def parse_media(self, response, **kwargs):
        """Build an ``EpisodeItem`` from a video page.

        The video id is taken from the inline scripts; without it no item is
        produced.  Title, tags, upload time, description, thumbnail and play
        count are optional and only set when found in the page.

        Args:
            response: the response for the video page.
            **kwargs: accepted for call compatibility; not used here.

        Returns:
            list: a one-element list holding the ``EpisodeItem``, or an empty
            list when no video id is found or parsing raised before the item
            was appended (the traceback is logged).
        """
        items = []
        try:
            title = response.xpath(
                '//div[@id="crumbsBar"]/div/div[@class="left"]/h2/text()'
            ).extract()
            tag = response.xpath('//meta[@name="keywords"]/@content').extract()
            scripts = response.xpath('//script')
            thumb = scripts.re(',sCover: \'(.*)\'')
            upload = scripts.re(',uploadTime: \'(.*)\'')
            description = response.xpath(
                '//p[@class="rel cfix"]/@title').extract()
            played = response.xpath(
                '//span[@class="vbtn vbtn-play"]/em/i/text()').extract()
            video_id = scripts.re('vid = \'(\d+)\'')

            if video_id:
                ep_item = EpisodeItem()
                ep_item['video_id'] = video_id[0]
                ep_item['show_id'] = video_id[0]
                if title:
                    ep_item['title'] = title[0]
                if tag:
                    ep_item['tag'] = tag[0].strip().replace(',', '|')
                if upload:
                    # The scraped value is extended with a seconds field.
                    ep_item['upload_time'] = upload[0] + ":00"
                if description:
                    ep_item['description'] = description[0].strip()
                if thumb:
                    ep_item['thumb_url'] = thumb[0]
                if played:
                    ep_item['played'] = Util.normalize_played(played[0])

                ep_item['category'] = u"搞笑"
                ep_item['spider_id'] = self.spider_id
                ep_item['site_id'] = self.site_id
                ep_item['url'] = response.request.url

                items.append(ep_item)
                # Log from the scraped list rather than ep_item['title'],
                # which is unset when the page has no title.
                log.msg("spider success, title:%s" % (title[0] if title else ''),
                        level=log.INFO)
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # A plain return (instead of `return` inside `finally`) lets
        # non-Exception interrupts propagate.
        return items
    def parse_page(self, response):
        """Collect sufficiently-played videos from a page and follow "next".

        Videos whose normalized play count reaches
        ``self.hottest_played_threshold`` are requested via ``parse_episode``.
        When the pager has a "next" link it is requested with this method as
        callback and the page number increased by one.

        Returns:
            list of ``Request`` objects; ``None`` when the page number exceeds
            ``self.max_search_page`` or when an exception was raised (its
            traceback is logged).
        """
        try:
            meta = response.request.meta
            log.msg('%s: %s' % (response.request.url, meta['page']))
            cat_id = meta['cat_id']
            # Normalize once: the value is compared as an int and must also
            # support `page + 1` below, which fails for a string.
            page = int(meta['page'])
            if page > int(self.max_search_page):
                return

            items = []
            sel = Selector(response)

            # Video entries.
            for v in sel.xpath('//div[@class="yk-col4"]'):
                url = v.xpath('./div/div[@class="v-link"]/a/@href').extract()
                pl = v.xpath(
                    './div/div[@class="v-meta va"]/div[@class="v-meta-entry"]/span/text()'
                ).extract()
                if not (url and pl):
                    continue
                pld = Util.normalize_played(pl[0])
                if int(pld) >= int(self.hottest_played_threshold):
                    items.append(
                        Request(url=url[0],
                                callback=self.parse_episode,
                                meta={'cat_id': cat_id}))

            # Pagination.
            next_page = sel.xpath(
                '//div[@class="yk-pager"]/ul/li[@class="next"]/a/@href'
            ).extract()
            if next_page:
                items.append(
                    Request(url=self.url_prefix + next_page[0],
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Exemple #8
0
    def parse_page(self, response):
        """Turn a video list page into one episode request per entry.

        Each list item's title, thumbnail, duration, play count and upload
        time are scraped, merged with the ``order`` mapping from the incoming
        request meta, and handed to ``parse_episode`` under the ``order`` meta
        key.  Entries without a link produce no request.

        Returns:
            list of ``Request`` objects, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            order = response.request.meta['order']
            requests = []

            entries = response.xpath(
                '//ul[@id="videolst_cont"]/li[@class="list_item"]')
            for entry in entries:
                links = entry.xpath('./strong/a/@href').extract()
                names = entry.xpath('./strong/a/text()').extract()
                images = entry.xpath('./a/img/@src').extract()
                lengths = entry.xpath('./a/span/em/text()').extract()
                play_counts = entry.xpath(
                    './div/span[@class="figure_info_play"]/span/text()'
                ).extract()
                dates = entry.xpath(
                    './div/span[@class="figure_info_time"]/text()').extract()

                # Every field defaults to None when its node is missing.
                info = dict.fromkeys(
                    ('title', 'thumb_url', 'duration', 'played',
                     'upload_time'))
                if names:
                    info['title'] = names[0]
                if images:
                    info['thumb_url'] = images[0]
                if lengths:
                    info['duration'] = Util.get_qq_duration(lengths[0])
                if play_counts:
                    info['played'] = Util.normalize_played(
                        Util.normalize_vp(play_counts[0]))
                if dates:
                    info['upload_time'] = Util.get_qq_upload_time(dates[0])

                if not links:
                    continue
                req = Request(url=links[0], callback=self.parse_episode)
                # Values from the incoming order override the scraped ones.
                info.update(order)
                req.meta['order'] = info
                requests.append(req)
            return requests
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
    def parse_vpaction(self, response):
        """Record the total play count on the item, then request its length.

        The item carried in the request meta gets its ``played`` field from
        the player info list when that element is present.  A follow-up
        request to ``self.playlength_url + show_id`` is then returned, handled
        by ``parse_playlength`` and carrying the same item.

        Returns:
            The follow-up ``Request``, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            item = response.request.meta['item']
            sel = Selector(response)

            vp = sel.xpath('//ul[@class="player_info"]/li[@class="sum"]/text()'
                           ).extract()
            if vp:
                # Unicode literal: the extracted text is unicode, and a
                # non-ASCII byte string here would force an implicit ASCII
                # decode under Python 2.
                count_text = vp[0].replace(u'总播放:', u'')
                item['played'] = Util.normalize_played(
                    Util.normalize_vp(count_text))

            # Keep the item and the follow-up request under separate names.
            return Request(url=self.playlength_url + item['show_id'],
                           callback=self.parse_playlength,
                           meta={'item': item})
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
    def parse_episode(self, response):
        """Build an ``EpisodeItem`` from a watch page and queue follow-ups.

        ``thumb_url``, ``upload_time`` and ``category`` are required in the
        request meta; ``kw_id`` is optional and defaults to 1.  When the page
        names an owner and gives an owner link, a request for the owner's
        "/about" page is queued (callback ``parse_about``).  The episode item
        itself is not returned directly: it travels in the meta of a
        ``get_video_info`` request handled by ``parse_other_info``.

        Returns:
            list of ``Request`` objects, or ``None`` when an exception was
            raised (its traceback is logged).
        """
        try:
            log.msg('%s' % response.request.url)
            meta = response.request.meta
            thumb_url = meta['thumb_url']
            upload_time = meta['upload_time']
            category = meta['category']
            kw_id = meta.get('kw_id', 1)
            items = []

            # Owner.
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # Guarded: a missing href must not abort the whole episode.
                if owner_url:
                    items.append(
                        Request(url=self.url_prefix + owner_url[0] + "/about",
                                callback=self.parse_about))

            # Video info.
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()

            # "sts" value embedded in the page body, forwarded to the
            # get_video_info request below.
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = Util.get_youtube_showid(response.request.url)
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if category:
                ep_item['category'] = category
            if upload_time:
                # NOTE(review): indexing implies the meta value is a sequence
                # of strings -- confirm against the request producer.
                t = Util.get_youtube_upload_time(upload_time[0].strip())
                if t:
                    ep_item['upload_time'] = Util.get_datetime_delta(
                        datetime.now(), t)
            if description:
                ep_item['description'] = "\n".join(description)
            if thumb_url:
                # NOTE(review): same assumption as upload_time above; a plain
                # string here would yield only its first character.
                ep_item['thumb_url'] = thumb_url[0]
            if played:
                # Normalize once; fall back to '0' when the result is empty.
                ep_item['played'] = Util.normalize_played(played[0]) or '0'

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)
            ep_item['kw_id'] = kw_id

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.groups()[0] if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.parse_other_info,
                        meta={'item': ep_item}))

            return items
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
Exemple #11
0
    def video_parse(self, response):
        """Build an ``EpisodeItem`` from a watch page and queue follow-ups.

        ``kw_id``, ``pg_id``, ``cat_id`` and ``subject_id`` are optional
        request-meta values copied onto the item when truthy.  When the page
        names an owner and gives an owner link, a request for the owner's
        "/about" page is queued (callback ``video_about_parse``).  The episode
        item travels in the meta of a ``get_video_info`` request handled by
        ``video_other_info_parse``.

        Returns:
            list: the queued ``Request`` objects; empty when no show id can be
            derived from the URL.  If parsing raises, the traceback is logged
            and whatever was queued so far is returned.
        """
        items = []
        try:
            meta = response.request.meta
            kw_id = meta.get('kw_id')
            pg_id = meta.get('pg_id')
            cat_id = meta.get('cat_id')
            subject_id = meta.get('subject_id')

            show_id = Util.get_youtube_showid(response.request.url)
            if not show_id:
                return items

            # Owner.
            owner = response.xpath(
                '//div[@class="yt-user-info"]/a/@data-ytid').extract()
            owner_url = response.xpath(
                '//div[@class="yt-user-info"]/a/@href').extract()
            owner_show_id = None
            if owner:
                owner_show_id = owner[0]
                # Guarded: a missing href must not abort the whole episode.
                if owner_url:
                    items.append(
                        Request(url=self.youtube_url_prefix + owner_url[0] +
                                "/about",
                                callback=self.video_about_parse))

            # Video info.
            title = response.xpath('//span[@id="eow-title"]/text()').extract()
            tag = response.xpath(
                './head/meta[@name="keywords"]/@content').extract()
            description = response.xpath(
                '//p[@id="eow-description"]/descendant-or-self::*/text()'
            ).extract()
            played = response.xpath(
                '//div[@class="watch-view-count"]/text()').extract()
            category = response.xpath(
                '//div[@id="watch-description"]//ul[@class="content watch-info-tag-list"]/li/a/text()'
            ).extract()
            upload = response.xpath(
                '//meta[@itemprop="datePublished"]/@content').extract()
            # Thumbnail as advertised by the page itself.
            thumb_url = response.xpath(
                '//link[@itemprop="thumbnailUrl"]/@href').extract()
            # "sts" value embedded in the page body, forwarded to the
            # get_video_info request below.
            sts = re.search(r'\"sts\": ?(\d+)', response.body)

            ep_item = EpisodeItem()
            ep_item['show_id'] = show_id
            if owner_show_id:
                ep_item['owner_show_id'] = owner_show_id
            if title:
                ep_item['title'] = title[0].strip()
            if tag:
                ep_item['tag'] = tag[0].replace(', ', '|')
            if description:
                ep_item['description'] = "\n".join(description)
            if played:
                # Normalize once; fall back to '0' when the result is empty.
                ep_item['played'] = Util.normalize_played(played[0]) or '0'

            if kw_id:
                ep_item['kw_id'] = kw_id
            if pg_id:
                ep_item['pg_id'] = pg_id
            if cat_id:
                ep_item['cat_id'] = cat_id
            if subject_id:
                ep_item['subject_id'] = subject_id

            if thumb_url:
                ep_item['thumb_url'] = thumb_url[0]
            if category:
                ep_item['category'] = category[0].strip().replace('&', '|')
            if upload:
                upload = upload[0].strip()
                # strptime raises ValueError on a mismatch instead of
                # returning a falsy value, so each known format is tried in
                # turn; an unparseable date simply leaves upload_time unset.
                struct_time = None
                for fmt in ('%b %d, %Y', u'%Y年%m月%d日'):
                    try:
                        struct_time = time.strptime(upload, fmt)
                        break
                    except ValueError:
                        continue
                if struct_time:
                    ep_item['upload_time'] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', struct_time)

            ep_item['spider_id'] = self.spider_id
            ep_item['site_id'] = self.site_id
            ep_item['url'] = Util.normalize_youtube_url(response.request.url)

            query = Util.encode({
                'video_id': ep_item['show_id'],
                'eurl': 'https://youtube.googleapis.com/v/' + ep_item['show_id'],
                'sts': sts.groups()[0] if sts else '',
            })
            items.append(
                Request(url='http://www.youtube.com/get_video_info?' + query,
                        callback=self.video_other_info_parse,
                        meta={'item': ep_item}))
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # The original fell off the end here, discarding every queued request.
        return items
Exemple #12
0
    def parse_video_page(self, response):
        """Queue episode requests for a video page plus its sibling pages.

        ``audit``, ``cat_name``, ``show_id`` and ``priority`` come from the
        incoming request meta and are forwarded on every request created
        here.  Each listed video becomes a ``parse_episode`` request carrying
        its thumbnail and play count.  When the page exposes an ``ajax_url``,
        two GET form requests for the sibling pages are queued with this
        method as callback.  No "next page" link is followed.

        Returns:
            list of requests, or ``None`` when an exception was raised (its
            traceback is logged).
        """
        try:
            logging.log(logging.INFO, 'page:%s' % response.request.url)
            incoming = response.request.meta
            audit = incoming['audit']
            cat_name = incoming['cat_name']
            show_id = incoming['show_id']
            priority = incoming['priority']

            page = 1
            queued = []

            # Videos listed on this page.
            for node in response.xpath('//div[@class="yk-col4"]/div'):
                hrefs = node.xpath('./div[@class="v-link"]/a/@href').extract()

                thumbs = node.xpath(
                    './div/div[@class="v-thumb"]/img/@src').extract()
                thumb_url = thumbs[0] if thumbs else None
                # This bare host value is treated as "no thumbnail".
                if thumb_url == 'http://g1.ykimg.com/':
                    thumb_url = None

                counts = node.xpath(
                    './div[@class="v-meta va"]/div[@class="v-meta-entry"]/span[@class="v-num"]/text()'
                ).extract()
                played = int(Util.normalize_played(counts[0])) if counts else None

                if not hrefs:
                    continue
                episode_meta = {
                    'audit': audit,
                    'thumb_url': thumb_url,
                    'played': played,
                    'cat_name': cat_name,
                    'show_id': show_id,
                    'priority': priority
                }
                queued.append(
                    Request(url=hrefs[0],
                            callback=self.parse_episode,
                            meta=episode_meta))

            # Paging parameters embedded in the page source.
            last_str = response.selector.re(u'\'last_str\':\'([^\']*)\'')
            ajax_url = response.selector.re(u'\'ajax_url\':\'([^\']*)\'')

            # Sibling pages are fetched through the ajax endpoint.
            if ajax_url:
                for page_num in (3 * page - 1, 3 * page):
                    marker = last_str[0] if last_str else u''
                    form = {
                        "v_page": str(page),
                        "page_num": str(page_num),
                        "page_order": "1",
                        "last_str": marker
                    }
                    sibling_meta = {
                        'audit': audit,
                        'cat_name': cat_name,
                        'show_id': show_id,
                        'priority': priority
                    }
                    queued.append(
                        FormRequest(url=self.url_prefix + ajax_url[0] +
                                    "fun_ajaxload/",
                                    formdata=form,
                                    method='GET',
                                    callback=self.parse_video_page,
                                    meta=sibling_meta))

            return queued
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())