Beispiel #1
0
    def video_set_parse_show_subnav(self, response):
        """Collect follow-up requests from a show sub-navigation response.

        Queues one request per user URL and per video URL that
        ``youku_url_extract`` finds in the response, plus one
        ``show_episode`` request per episode-list id found in the
        ``pgm-tab`` block of the page.

        The request ``meta`` may carry ``page``, ``kw_id``, ``pg_id``,
        ``cat_id`` and ``subject_id``; the four ids are forwarded unchanged
        (``None`` when absent) to every request created here, with ``page``
        reset to 1.

        Args:
            response: response whose ``request.meta``, ``request.url``,
                ``body`` and ``xpath()`` are read.

        Returns:
            list: the ``Request`` objects that were built.  Empty when
            ``page`` exceeds ``self.max_search_page``.  If an exception is
            raised it is logged and the requests collected before the
            failure are returned.
        """
        # Local import: this method did not use ``re`` before, and the
        # file's import block is not guaranteed to provide it.
        import re

        items = []
        try:
            meta = response.request.meta
            page = int(meta.get('page', 1))
            if page > int(self.max_search_page):
                return items

            # Context forwarded to every follow-up request.
            forwarded = {
                'page': 1,
                'kw_id': meta.get('kw_id'),
                'pg_id': meta.get('pg_id'),
                'cat_id': meta.get('cat_id'),
                'subject_id': meta.get('subject_id'),
            }

            url = response.request.url
            body = response.body

            # Users
            for user_url in youku_url_extract.user_url_extract(url, body) or []:
                items.append(
                    Request(url=user_url,
                            callback=self.user_parse,
                            meta=dict(forwarded)))

            # Videos (play pages)
            for video_url in youku_url_extract.video_url_extract(url, body) or []:
                items.append(
                    Request(url=video_url,
                            callback=self.video_parse,
                            meta=dict(forwarded)))

            # The show id was previously referenced without being defined,
            # which raised NameError and dropped every request above.
            # Sub-navigation URLs are requested as
            # http://www.youku.com/show_<tab>_<show_id>.html?..., so the id
            # is recovered from the request URL.
            show_id = None
            match = re.search(r'/show_[^/]+?_(id_\w+)\.html', url)
            if match:
                show_id = match.group(1)
            if not show_id:
                # Without a show id the episode URLs cannot be built.
                return items

            # Button click (expand the episode list)
            show_episode_url_format = 'http://www.youku.com/show_episode/' + show_id + '.html?dt=json&divid=%s&__rt=1&__ro=%s'
            episode_ids = response.xpath(
                '//div[@class="pgm-tab"]//div[normalize-space(@class)="pgm-list"]/ul/li/@data'
            ).extract()
            for episode_id in episode_ids:
                items.append(
                    Request(url=show_episode_url_format % (episode_id,
                                                           episode_id),
                            callback=self.page_parse,
                            meta=dict(forwarded)))
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Return whatever was collected, on both the normal and error paths.
        return items
Beispiel #2
0
    def search_parse(self, response):
        """Collect follow-up requests from a search result page.

        Queues one request per video-set URL, user URL and video URL that
        ``youku_url_extract`` finds in the response, and one request for
        the first "next page" link, which is handled by this same method
        with ``page`` incremented by one.

        The request ``meta`` may carry ``page``, ``kw_id``, ``pg_id``,
        ``cat_id`` and ``subject_id``; the four ids are forwarded unchanged
        (``None`` when absent) to every request created here.

        Args:
            response: response whose ``request.meta``, ``request.url``,
                ``body`` and ``xpath()`` are read.

        Returns:
            list: the ``Request`` objects that were built.  Empty when
            ``page`` exceeds ``self.max_search_page``.  If an exception is
            raised it is logged and the requests collected before the
            failure are returned.
        """
        items = []
        try:
            meta = response.request.meta
            page = int(meta.get('page', 1))
            if page > int(self.max_search_page):
                return items

            # Ids forwarded to every follow-up request.
            context = {
                'kw_id': meta.get('kw_id'),
                'pg_id': meta.get('pg_id'),
                'cat_id': meta.get('cat_id'),
                'subject_id': meta.get('subject_id'),
            }

            def follow(target_url, callback, page_no=1):
                # Each request gets its own meta dict.
                request_meta = dict(context)
                request_meta['page'] = page_no
                return Request(url=target_url,
                               callback=callback,
                               meta=request_meta)

            url = response.request.url
            body = response.body

            # Video sets (series)
            for set_url in youku_url_extract.video_set_url_extract(url, body) or []:
                items.append(follow(set_url, self.video_set_parse))

            # Users
            for user_url in youku_url_extract.user_url_extract(url, body) or []:
                items.append(follow(user_url, self.user_parse))

            # Videos (play pages)
            for video_url in youku_url_extract.video_url_extract(url, body) or []:
                items.append(follow(video_url, self.video_parse))

            # Next page: only the first matching link is followed.
            next_pages = response.xpath(
                '//div[@class="sk_pager"]/ul/li[@class="next"]/a/@href'
            ).extract()
            for next_url in next_pages[:1]:
                if next_url.startswith('/'):
                    # Site-relative link: prepend the configured prefix.
                    next_url = self.soku_url_prefix + next_url
                items.append(follow(next_url, self.search_parse, page + 1))
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Return whatever was collected, on both the normal and error paths.
        return items
Beispiel #3
0
    def video_set_parse(self, response):
        """Collect follow-up requests from a show (video set) page.

        Queues one request per user URL and per video URL that
        ``youku_url_extract`` finds in the response.  When the request URL
        has the form ``http://www.youku.com/show_page/<show_id>.html`` it
        additionally queues:

        * one request per sub-navigation tab (``li`` ids of the form
          ``subnav_<name>`` under ``#subnav_wrap``), handled by
          ``video_set_parse_show_subnav``;
        * one ``show_episode`` request per episode-list id found in the
          ``pgm-tab`` block, handled by ``page_parse``.

        The request ``meta`` may carry ``page``, ``kw_id``, ``pg_id``,
        ``cat_id`` and ``subject_id``; the four ids are forwarded unchanged
        (``None`` when absent) to every request created here, with ``page``
        reset to 1.

        Args:
            response: response whose ``request.meta``, ``request.url``,
                ``body`` and ``xpath()`` are read.

        Returns:
            list: the ``Request`` objects that were built.  Empty when
            ``page`` exceeds ``self.max_search_page``.  If an exception is
            raised it is logged and the requests collected before the
            failure are returned.
        """
        items = []
        try:
            meta = response.request.meta
            page = int(meta.get('page', 1))
            if page > int(self.max_search_page):
                return items

            # Context forwarded to every follow-up request.
            forwarded = {
                'page': 1,
                'kw_id': meta.get('kw_id'),
                'pg_id': meta.get('pg_id'),
                'cat_id': meta.get('cat_id'),
                'subject_id': meta.get('subject_id'),
            }

            url = response.request.url
            body = response.body

            # Users
            for user_url in youku_url_extract.user_url_extract(url, body) or []:
                items.append(
                    Request(url=user_url,
                            callback=self.user_parse,
                            meta=dict(forwarded)))

            # Videos (play pages)
            for video_url in youku_url_extract.video_url_extract(url, body) or []:
                items.append(
                    Request(url=video_url,
                            callback=self.video_parse,
                            meta=dict(forwarded)))

            # Example: http://www.youku.com/show_page/id_zdfee6f12875611e4b2ad.html
            show_id = None
            show_match = re.search(
                r'http://www\.youku\.com/show_page/(id_[\w]+)\.html.*', url)
            if show_match:
                show_id = show_match.group(1)
            if not show_id:
                # Tab and episode URLs are derived from the show id.
                return items

            # Tab click (sub-navigation tabs)
            show_url_format = 'http://www.youku.com/show_%s_' + show_id + '.html?dt=json&__rt=1&__ro=reload_%s'
            subnav_ids = response.xpath(
                '//div[@id="subnav_wrap"]//ul[@class="tb"]/li/@id').extract()
            subnav_pattern = re.compile(r'subnav_(.+)')  # compiled once
            for subnav_id in subnav_ids:
                subnav_match = subnav_pattern.search(subnav_id)
                if not subnav_match:
                    continue
                tab_name = subnav_match.group(1)
                items.append(
                    Request(url=show_url_format % (tab_name, tab_name),
                            callback=self.video_set_parse_show_subnav,
                            meta=dict(forwarded)))

            # Button click (expand the episode list)
            show_episode_url_format = 'http://www.youku.com/show_episode/' + show_id + '.html?dt=json&divid=%s&__rt=1&__ro=%s'
            episode_ids = response.xpath(
                '//div[@class="pgm-tab"]//div[normalize-space(@class)="pgm-list"]/ul/li/@data'
            ).extract()
            for episode_id in episode_ids:
                items.append(
                    Request(url=show_episode_url_format % (episode_id,
                                                           episode_id),
                            callback=self.page_parse,
                            meta=dict(forwarded)))
        except Exception:
            log.msg(traceback.format_exc(), level=log.ERROR)
        # Return whatever was collected, on both the normal and error paths.
        return items