Example #1
    def __init__(self, *args, **kwargs):
        super(V360Spider, self).__init__(*args, **kwargs)
        self.mgr = DbManager.instance()
        self.parser = {
            'movie': V360ParserMovie(),
            'tv': V360ParserTv(),
            'variaty': V360ParserVariaty(),
            'cartoon': V360ParserCartoon()
        }

        self.poster_filter_md5 = self.mgr.get_poster_filter_md5()
        '''
        if 'json_data' in kwargs:
            data = json.loads(kwargs['json_data'])
            task = []
            if data['cmd'] == 'trig':
                stat = data['stat'] if 'stat' in data else None
                task = self.mgr.get_untrack_url('360kan', stat)
            elif data['cmd'] == 'assign':
                task = data['task']
            self.start = [{'channel': t['code'], 'url': t['url'], 'type': URL_TYPE_PLAY} for t in task]
        else:
        '''
        self.start = [{'channel': 'movie', 'url': 'http://www.360kan.com/dianying/list.php', 'type': URL_TYPE_MAIN},
                      {'channel': 'tv', 'url': 'http://www.360kan.com/dianshi/list.php', 'type': URL_TYPE_MAIN},
                      {'channel': 'variaty', 'url': 'http://www.360kan.com/zongyi/list.php', 'type': URL_TYPE_MAIN},
                      {'channel': 'cartoon', 'url': 'http://www.360kan.com/dongman/list.php', 'type': URL_TYPE_MAIN},
                      #{'channel': 'variaty', 'url': 'http://www.360kan.com/va/Zsgoa6dv7JM8ED.html', 'type': URL_TYPE_MEDIA},
                      #{'channel': 'movie', 'url': 'http://www.360kan.com/m/f6bkZkUqcHr4TR.html', 'type': URL_TYPE_MEDIA}
                     ]
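
The commented-out block above shows how this spider could also be driven by an externally supplied json_data argument instead of the hard-coded start list. A minimal sketch of the two payload shapes that block parses, with illustrative values (the URL is taken from the commented-out entry above):

# Illustrative payloads for the commented-out json_data branch above (not from the source).
# 'trig' pulls untracked '360kan' URLs from the DB, optionally filtered by 'stat';
# 'assign' hands the spider an explicit task list of channel codes and play-page URLs.
trig_payload = {'cmd': 'trig', 'stat': 0}
assign_payload = {
    'cmd': 'assign',
    'task': [{'code': 'movie', 'url': 'http://www.360kan.com/m/f6bkZkUqcHr4TR.html'}],
}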
Example #2
class pps_spider(Spider):
    '''
        pps crawl flow:
        (1) list page -> play page -> media page (if it exists)
        (2) play page -> media page (if it exists)
    '''
    site_code = 'pps'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 3
    max_number = 100000

    #parameters passed in via json
    json_data = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(pps_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            logging.log(logging.INFO, 'pps is very unstable, so the crawler for this site has been abandoned')
            '''
            for list_channel in pps_extract.list_channels_url:
                url = pps_extract.list_channels_url[list_channel]
                items.append(Request(url=url, callback=self.list_parse, meta={'level':1, 'id':list_channel}))
            '''
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
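
Like most of the spiders in these examples, pps_spider accepts an optional json_data argument. With Scrapy this is normally passed on the command line via -a, which forwards it to __init__ as a keyword string; a sketch of the equivalent construction (the payload content itself is illustrative):

# scrapy crawl pps -a json_data='{"cmd": "trig"}'
# is roughly equivalent to constructing the spider directly with:
spider = pps_spider(json_data='{"cmd": "trig"}')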
Example #3
    def __init__(self, *args, **kwargs):
        super(H360Spider, self).__init__(*args, **kwargs)
        self.mgr = DbManager.instance()
        self.parser = {
            'movie': V360ParserMovie(),
            'tv': V360ParserTv(),
            'variaty': V360ParserVariaty(),
            'cartoon': V360ParserCartoon()
        }

        self.site_id = self.mgr.get_site(site_code=self.site_name)['site_id']
        self.os_id = self.mgr.get_os(os_name='web')
        self.channel_map = self.mgr.get_channel_map()
Example #4
class kankan_spider(Spider):
    '''
        kankan crawl flow:
            (1) list page -> play page -> media page
            (2) play page -> media page
        Since the "All" filter on kankan's list page already covers everything, there is no need to crawl every tag.
    '''
    site_code = 'kankan'
    name = site_code
    mgr = DbManager.instance()
    max_number = '100000'
    vip_prefix_url = 'http://vip.kankan.com'
    #parameters passed in via json
    json_data = None
    #for statistics
    #count = 0

    #type to skip: trailers
    skip_types = {'pre': u'预告片'}

    def __init__(self, json_data=None, *args, **kwargs):
        super(kankan_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                list_prefix_url = 'http://movie.kankan.com/type/%s/'
                for list_channel in kankan_extract.list_channels:
                    list_channel_pinyin = kankan_extract.list_channels_pinyin[
                        list_channel]
                    url = list_prefix_url % list_channel_pinyin
                    items.append(
                        Request(url=url,
                                callback=self.list_parse,
                                meta={
                                    'first': True,
                                    'id': list_channel
                                }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
Example #5
class pptv_spider(Spider):
    '''
        pptv crawl flow:
        (1) list page -> play page
        (2) play page
    '''
    site_code = 'pptv'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 6
    max_number = 100000
    list_prefix_url = 'http://list.pptv.com/channel_list.html'
    vip_prefix_url = 'http://ddp.vip.pptv.com'
    #old API, already abandoned
    #album_api = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&cid=%s&page=%s'
    #this endpoint is often unstable
    album_api = 'http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&cat_id=%s&highlight=%s&page=%s'
    #when album_api is unstable, use the next endpoint instead; it requires auth, a device-bound parameter
    auths = ["d410fafad87e7bbf6c6dd62434345818"]
    auth_album_api = "http://epg.api.pptv.com/detail.api?vid=%s&auth=%s"
    #parameters passed in via json
    json_data = None
    httpcli = HTTPDownload()
    app_api = "http://epg.api.pptv.com/detail.api?auth=%s&vid=%s"
    web_api = "http://v.pptv.com/show/videoList?&cb=videoList&pid=%s&page=%s"

    def __init__(self, json_data=None, *args, **kwargs):
        super(pptv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.pptv.com'
                items.append(
                    Request(url=url,
                            callback=self.list_parse,
                            meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
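
The comments above note that album_api is unreliable and that auth_album_api, which needs a device-bound auth token, is the fallback. A minimal sketch of that fallback, assuming HTTPDownload.get_data can be called with just a URL (the helper itself and its argument values are hypothetical):

# Hypothetical helper, not part of the original spider: query the videoList
# endpoint first, then fall back to the auth-bound epg endpoint.
def fetch_album(spider, pid, cat_id, highlight, page, vid):
    data = spider.httpcli.get_data(spider.album_api % (pid, cat_id, highlight, page))
    if not data:
        data = spider.httpcli.get_data(spider.auth_album_api % (vid, spider.auths[0]))
    return data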
Example #6
class dy1905_spider(Spider):
    '''
        dy1905 crawl flow:
            (1) list page -> media page (no need to go through the play page)
            (2) play page -> media page
        Since the "All" filter on dy1905's list page already covers everything, there is no need to crawl every tag.
    '''
    site_code = '1905'
    name = site_code
    mgr = DbManager.instance()
    max_number = 100000
    vip_prefix_urls = ['http://vip.1905.com', 'http://vip.m1905.com']
    max_mark_depth = 10
    #parameters passed in via json
    json_data = None

    #for statistics
    #count = 0

    def __init__(self, json_data=None, *args, **kwargs):
        super(dy1905_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                for list_channel in dy1905_extract.list_channels:
                    if list_channel == u'电影':
                        url = 'http://www.1905.com/mdb/film/list'
                        items.append(
                            Request(url=url,
                                    callback=self.list_parse,
                                    meta={
                                        'level': 0,
                                        'id': list_channel
                                    }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
Example #7
class letv_spider(Spider):
    '''
        letv crawl flow:
        (1) list page -> play page -> media page
        (2) play page -> media page
        Note: letv has to be crawled tag by tag.
    '''
    site_code = 'letv'
    name = site_code
    mgr = DbManager.instance()
    max_mark_depth = 5
    max_number = 100000
    list_json_prefix_url = 'http://list.letv.com/apin/chandata.json'
    zongyi_album_api = 'http://api.letv.com/mms/out/albumInfo/getVideoListByIdAndDate?&year=%s&month=%s&id=%s'
    # other_album_api = 'http://api.mob.app.letv.com/play/vlist?pid=%s&pagenum=%s'
    other_album_api = 'http://api.mob.app.letv.com/play/cards?pid=%s&version=6.2.2&pagenum=%s'
    #parameters passed in via json
    json_data = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(letv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.letv.com'
                items.append(
                    Request(url=url,
                            callback=self.list_parse,
                            meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
Example #8
class hunantv_spider(Spider):
    '''
        hunantv crawl flow:
        (1) list page -> play page -> full-video page [ -> media page]
        (2) play page -> full-video page
        Since the "All" filter on hunantv's list page already covers everything, there is no need to crawl every tag.
    '''
    site_code = 'hunantv'
    name = site_code
    mgr = DbManager.instance()
    max_number = 100000
    max_mark_depth = 10
    #parameters passed in via json
    json_data = None
    httpdownload = HTTPDownload()
    media_info_url = "http://m.api.hunantv.com/video/getbyid?videoId=%s"
    video_list_url = "http://m.api.hunantv.com/video/getList?videoId=%s&pageNum=%s"

    def __init__(self, json_data=None, *args, **kwargs):
        super(hunantv_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = 'http://list.hunantv.com'
                items.append(
                    Request(url=url,
                            callback=self.list_parse,
                            meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return items
Example #9
class baofeng_spider(Spider):
    name = "baofeng"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "baofeng"
    site_id = ""  #baofeng
    allowed_domains = ["www.baofeng.com", "g.hd.baofeng.com"]
    url_prefix = 'http://www.baofeng.com'
    site_name = Util.guess_site(url_prefix)

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    channel_map = {}
    channel_map = mgr.get_channel_map()
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')
    global_spider = True

    httpdownload = HTTPDownload()
    channel_info = {}
    test_page_url = None
    test_channel_id = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(baofeng_spider, self).__init__(*args, **kwargs)
        cat_urls = []
        tasks = None
        if json_data:
            data = json.loads(json_data)
            if "type" in data:
                spider_type = data["type"]
                if spider_type != "global":
                    self.global_spider = False
            tasks = []
            ttask = {}
            if "id" in data and "url" in data:
                ttask["id"] = data["id"]
                ttask["url"] = data["url"]
                ttask["sid"] = ""
                ttask["untrack_id"] = ""
                cat_urls.append(ttask)

            cmd = data["cmd"]
            if cmd == "assign":
                tasks = data["task"]
            elif cmd == "trig":
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            elif cmd == "test" and 'id' in data and 'url' in data:
                self.test_page_url = data["url"]
                self.test_channel_id = data["id"]

            if tasks:
                for task in tasks:
                    ttask = {}
                    ttask["url"] = task["url"]
                    code = task["code"]
                    ttask["id"] = self.channel_map[code]
                    ttask["untrack_id"] = task["untrack_id"]
                    ttask["sid"] = task["sid"]
                    cat_urls.append(ttask)

        self._cat_urls = []
        if cat_urls:
            self._cat_urls = cat_urls
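
    # The json_data shapes this constructor understands, reconstructed from the
    # parsing above (all values below are illustrative, not from the source):
    #   {"cmd": "assign", "task": [{"url": "http://www.baofeng.com/play/497/play-786997.html",
    #                               "code": "movie", "untrack_id": "", "sid": ""}]}
    #   {"cmd": "trig", "stat": 0}    -> untracked URLs are fetched from the DB
    #   {"cmd": "test", "id": "<channel id>", "url": "<play page url>"}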

    def start_requests(self):
        try:
            items = []
            cat_urls = []

            self.movie_id = str(self.mgr.get_channel('电影')["channel_id"])
            self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"])
            self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"])
            self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"])

            self.channel_info = {
                self.movie_id: u"电影",
                self.tv_id: u"电视剧",
                self.variety_id: u"综艺",
                self.cartoon_id: u"动漫"
            }

            if self.test_page_url:
                turl = Util.normalize_url(self.test_page_url, "baofeng")
                items.append(
                    Request(url=self.test_page_url,
                            callback=self.parse_page,
                            meta={
                                'cat_id': self.test_channel_id,
                                'page': 1
                            }))
                return items

            if not self._cat_urls:
                if self.global_spider:
                    cat_urls = [{
                        'url':
                        'http://www.baofeng.com/movie/682/list-sid-1-p-1.shtml',
                        'id': self.movie_id
                    }, {
                        'url':
                        'http://www.baofeng.com/tv/914/list-type-2-ishot-1-sid-1-p-1.shtml',
                        'id': self.tv_id
                    }, {
                        'url':
                        'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml',
                        'id': self.variety_id
                    }, {
                        'url':
                        'http://www.baofeng.com/comic/924/list-type-3-ishot-1-sid-1-p-1.shtml',
                        'id': self.cartoon_id
                    }]
                    #cat_urls = [{'url':'http://www.baofeng.com/enc/444/list-type-4-ishot-1-sid-1-p-1.shtml','id':self.variety_id}]
                for cat in cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_area,
                                meta={
                                    'cat_id': cat['id'],
                                    'page': 1
                                }))
                    #items.append(Request(url=cat['url'], callback=self.parse_page, meta={'cat_id': cat['id'],'page':1}))
            else:
                for cat in self._cat_urls:
                    turl = Util.normalize_url(cat['url'], "baofeng")
                    items.append(
                        Request(url=turl,
                                callback=self.parse_single_episode,
                                meta={
                                    'cat_id': cat["id"],
                                    'page': 1,
                                    "poster_url": "",
                                    "untrack_id": cat["untrack_id"],
                                    "sid": cat["sid"]
                                }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_area(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_area: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="selecter"]/div[1]/div[@class="clearfix rp"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_type,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_type(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_type: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="selecter"]/div[2]/div[@class="clearfix rp"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_time,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_time(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_time: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="selecter"]/div[3]/div[@class="clearfix rp"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_page,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_page(self, response):
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            logging.log(logging.INFO,
                        'parse_page: %s,%s' % (response.request.url, page))
            #if int(page) > int(self.max_update_page) and not self.global_spider:
            #    return

            items = []

            play_url = ""
            subs = response.xpath(
                '//div[@class="sort-list-r-mod02"]/ul[@class="sort-list-r-poster clearfix"]/li'
            )

            for sub in subs:
                play_url = sub.xpath('./div[1]/p[1]/a/@href').extract()
                pic_urls = sub.xpath('./div[1]/p[1]/a/img/@src').extract()
                #pic_urls = sub.xpath('./div[@class="hot-pic-like js-collect  shadow-cut"]/p[1]/a/img/@src').extract()
                pic_url = ""
                if pic_urls:
                    pic_url = pic_urls[0]
                if play_url:
                    rplay_url = play_url[0].strip()

                    items.append(
                        Request(url=self.url_prefix + rplay_url,
                                callback=self.parse_single_episode,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': pic_url,
                                    'untrack_id': '',
                                    'sid': ''
                                }))

            next_page = response.xpath(
                '//div[@class="sort-list-r-mod02"]/div[@class="pages"]/ul[@class="clearfix"]/li/a[text()="%s"]/@href'
                % u'下一页').extract()
            if next_page:
                snext_page = next_page[0].strip()
                if snext_page.find(self.url_prefix) < 0:
                    snext_page = self.url_prefix + snext_page
                items.append(
                    Request(url=snext_page,
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_single_episode(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = response.request.meta['untrack_id']
            sid = response.request.meta['sid']
            poster_url = response.request.meta['poster_url']
            urls = response.xpath(
                '//div[@class="play-nav-l-new"]/h1/a/@href').extract()
            if urls:
                for iurl in urls:
                    turl = self.url_prefix + iurl
                    surl = Util.normalize_url(turl, "baofeng")
                    if surl and self.site_name == Util.guess_site(surl):
                        #if turl and self.site_name == Util.guess_site(turl):
                        items.append(
                            Request(url=surl,
                                    callback=self.parse_episode_info,
                                    meta={
                                        'cat_id': cat_id,
                                        'poster_url': poster_url,
                                        'page': 1,
                                        "untrack_id": untrack_id,
                                        "sid": sid
                                    }))
            #paid movie; cannot jump to the media page
            else:
                pass

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_info(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            year_list = []
            lyears = []

            title_list = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/h3/a/@title'
            ).extract()
            director_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'导演:').extract()
            performer_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'主演:').extract()
            type_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'类型:').extract()
            district_list = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/a/text()' %
                u'地区:').extract()
            year_info = response.xpath(
                '//div[@class="info clearfix"]/span[text()="%s"]/text()' %
                u'地区:').extract()
            year = None
            if len(year_info) >= 2:
                year = self.get_year(year_info[1])

            #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            districts = Util.join_list_safely(district_list)

            #text
            text = response.xpath(
                '//div[@class="juqing briefTab"]/div/text()').extract()
            #score
            score = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[1]/div[@class="score"]/div[@class="score-num"]/strong/text()'
            ).extract()

            play_url = ""
            tplay_url = response.xpath(
                '//div[@class="aboutThis clearfix"]/div[@class="makeup"]/div[@class="sourcePlay"]/a[@id="moviePlayButton"]/@href'
            ).extract()
            if tplay_url:
                play_url = self.url_prefix + tplay_url[0].strip()
            videoitems = []

            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0]
                if ep_item["title"].find(u'预:') >= 0:
                    print "trailer, url", response.request.url
                    return items
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = districts
            if year:
                ep_item["release_date"] = Util.str2date(year)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,
                                                "baofeng")

            if len(text) > 0:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item

            vurl = ""

            videoid = self.getshowid(response.request.url)
            mvitem["media"]["cont_id"] = videoid
            ttvitem = {}
            if title_list:
                ttvitem = self.parse_video_item(response, cat_id, play_url,
                                                title_list, None)
            if ttvitem:
                if 'video' in ttvitem and len(ttvitem['video']) > 0:
                    mvitem['video'] = ttvitem['video']
                    mvitem["media"]["info_id"] = Util.md5hash(
                        Util.summarize(mvitem["media"]))
                    Util.set_ext_id(mvitem["media"], mvitem["video"])
                    if untrack_id and sid:
                        mvitem["untrack_id"] = untrack_id
                        mvitem["sid"] = sid
                    res = self.check_url(mvitem)
                    #if self.check_url(mvitem):
                    if res:
                        items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_video_item(self, response, cat_id, url, title, playlistId):
        #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title))
        videoitems = []
        ep_item = MediaItem()
        item = MediaVideoItem()
        item["media"] = ep_item
        item["video"] = videoitems
        try:
            if int(cat_id) != int(self.movie_id):
                ul_list = response.xpath(
                    '//div[@class="episodes clearfix "]/a')
                if not ul_list:
                    ul_list = response.xpath(
                        '//div[@class="episodes clearfix enc-episodes-detail"]/a'
                    )
                for li in ul_list:
                    url = li.xpath('./@href').extract()
                    ttitle = li.xpath('./@title').extract()
                    snum = li.xpath('./text()').extract()
                    if snum:
                        play_num = self.get_play_num(snum[0])
                    if int(cat_id) == int(self.variety_id):
                        play_num = self.getvnum(self.url_prefix + url[0])
                    if not ttitle:
                        ttitle = [play_num]
                    vitem = self.compose_vitem([self.url_prefix + url[0]],
                                               title, play_num)
                    if 'url' in vitem:
                        videoitems.append(vitem)
            elif int(cat_id) == int(self.movie_id):
                if url:
                    vitem = self.compose_vitem([url], title, 1)
                    if 'url' in vitem:
                        videoitems.append(vitem)
            if videoitems:
                item["video"] = videoitems
                item["media"]["url"] = response.request.url
                Util.set_ext_id(item["media"], item["video"])
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return item

    def compose_vitem(self, url_list, title_list, vnum):
        vitem = VideoItem()
        try:
            if not url_list:
                return vitem
            if title_list:
                vitem["title"] = title_list[0].strip()
            turl = Util.normalize_url(url_list[0], "baofeng")
            vitem["url"] = turl
            vitem["vnum"] = str(vnum)
            vitem["os_id"] = self.os_id
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["site_id"] = self.site_id
            vitem["cont_id"] = self.getshowid(turl)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return vitem

    def get_play_num(self, title):
        num = ""
        try:
            num_list = re.findall('([\d]+)', title)
            if num_list:
                num_size = len(num_list)
                num = num_list[num_size - 1]
        except Exception as e:
            pass
        return num

    def check_url(self, mvitem):
        res = True
        try:
            if 'video' in mvitem:
                for video in mvitem['video']:
                    if 'url' in video:
                        if Util.guess_site(video['url']) != self.site_name:
                            res = False
                            break
        except Exception as e:
            pass
        return res

    def is_same_site(self, url):
        res = True
        try:
            tsite = Util.guess_site(url)
            if tsite != self.site_name:
                res = False
        except Exception as e:
            res = False
        return res

    def getshowid(self, url):
        id = ""
        try:
            #http://www.baofeng.com/play/497/play-786997.html
            #r = re.compile(r'http://.+/id_([^_]+).*\.html')
            #r = re.compile(r'http://.*[]-([\d]+).html')
            #r = re.compile(r'http://.*[play|detail]-([\d]+).*html')
            r = re.compile(r'http://.*/\w+-(\d+).*')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def getvnum(self, url):
        id = ""
        try:
            #http://www.baofeng.com/play/363/play-786863-drama-10.html
            r = re.compile(r'http://.*-drama-(\d+).*')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def get_year(self, info):
        year = None
        try:
            r = re.compile(ur'(\d+)')  #first run of digits in the info string (the year)
            m = r.search(info)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return year
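
A standalone illustration (not part of the spider) of what the URL-id regexes above should return for the sample URLs quoted in their comments:

import re

# getshowid's pattern on the sample play URL from its comment
play_id = re.compile(r'http://.*/\w+-(\d+).*')
assert play_id.match('http://www.baofeng.com/play/497/play-786997.html').group(1) == '786997'

# getvnum's pattern on the sample drama URL from its comment
drama_num = re.compile(r'http://.*-drama-(\d+).*')
assert drama_num.match('http://www.baofeng.com/play/363/play-786863-drama-10.html').group(1) == '10'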
Example #10
class youku_spider(Spider):
    name = "youku"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "youku"
    allowed_domains = ["youku.com","v.youku.com"]
    url_prefix = 'http://www.youku.com'
    ua='Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'    

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    channel_map = mgr.get_channel_map() #code -> id
    channel_map_rev = dict([[str(v), k] for k, v in channel_map.items()]) #id -> code
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')

    httpdownload = HTTPDownload()
    cat_urls = []

    def __init__(self, json_data=None, *args, **kwargs):
        super(youku_spider, self).__init__(*args, **kwargs)

        if json_data:
            data = json.loads(json_data)
            tasks=[]
            cmd = data["cmd"]

            if cmd == "assign":
                #task from command
                tasks = data["task"]
            elif cmd == "trig":
                #task from untrack
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            elif cmd == 'carpet':
                tasks = self.mgr.get_video_url(self.site_code)
            elif cmd == "test" and 'id' in data and 'url' in data:
                #assign task by channel_id and url
                self.cat_urls.append({'id': data["id"], 'url': data["url"], 'sid': '', 'untrack_id': ''})

            for task in tasks:
                ttask={}
                ttask["url"] = task["url"]
                code = task["code"]
                ttask["id"] = self.channel_map[code]
                ttask["untrack_id"] = task["untrack_id"] if 'untrack_id' in task else None
                ttask["sid"] = task["sid"] if 'sid' in task else None
                ttask['mid'] = task['mid'] if 'mid' in task else None
                self.cat_urls.append(ttask)

    def start_requests(self):
        try:
            items = []
            if not self.cat_urls:
                cat_urls = [{'url':'http://www.youku.com/v_olist/c_85', 'id': self.channel_map['variaty']}]
                ''' 
                cat_urls = [{'url':'http://www.youku.com/v_olist/c_96', 'id': self.channel_map['movie']},
                        {'url':'http://www.youku.com/v_olist/c_97', 'id': self.channel_map['tv']},
                        {'url':'http://www.youku.com/v_olist/c_85', 'id': self.channel_map['variaty']},
                        {'url':'http://www.youku.com/v_olist/c_100', 'id':self.channel_map['cartoon']}]
                '''
                for cat in cat_urls:
                    items.append(Request(url=cat['url'], callback=self.parse_list, meta={'cat_id': cat['id'],'page':1}))
            else:
                for cat in self.cat_urls:
                    turl = Util.normalize_url(cat['url'],"youku")
                    items.append(Request(url=turl, callback=self.parse_single_episode, meta={'cat_id': cat["id"],'page':1,"untrack_id":cat["untrack_id"],"sid":cat["sid"],"mid":cat["mid"]}))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_single_episode(self,response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']

            urls = response.xpath('//div[@class="base_info"]/h1[@class="title"]/a/@href').extract()
            if urls:
                for iurl in urls:
                    surl = Util.normalize_url(iurl,"youku")
                    if surl:
                        items.append(Request(url=surl, callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':'','page':1,"untrack_id":untrack_id,"sid":sid,"mid":mid}))
            else:
                logging.log(logging.INFO, 'miss media page: %s' % response.request.url)
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_list(self,response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_list: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']

            area_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"地区").extract()
            type_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"类型").extract()
            year_list = response.xpath('//div[@class="yk-filter-panel"]/div/label[text()="%s"]/../ul/li/a/text() ' % u"时间").extract()
            s_list = ['1','2','4','5','6']
            d_list = ['1','2','4']
 
            for area in area_list:
                for type in type_list:
                    for s_sub in s_list:
                        url_pref = response.request.url + "_a_" + area + "_g_" + type + "_u_1" + "_s_" + s_sub +"_d_1"  + ".html"
                        items.append(Request(url=url_pref, callback=self.parse_page, meta={'cat_id': cat_id,'page':1}))
            
            titem = self.parse_page(response)
            if titem:
                items.extend(titem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return items 

    def parse_page(self,response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_page: %s' % response.request.url)

            page = response.request.meta['page']
            logging.log(logging.INFO, 'parse_page: %s,%s' % (str(page),response.request.url))
            #if int(page) > int(self.max_update_page) and self.global_spider:
            #    logging.log(logging.INFO, 'parse_page: %s,%s' % (str(page),response.request.url))
            #    return

            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            items = []

            subs = response.xpath('//div[@class="yk-row yk-v-80"]/div')

            for sub in subs:
                pic_urls = sub.xpath('./div[@class="p p-small"]/div[@class="p-thumb"]/img/@src').extract()
                play_url = sub.xpath('./div[@class="p p-small"]/div[@class="p-link"]/a/@href').extract()
                pic_url = ""
                if pic_urls:
                    pic_url = pic_urls[0]
                if play_url:
                    items.append(Request(url=play_url[0].strip(),callback=self.parse_episode_info,meta={'cat_id': cat_id,'poster_url':pic_url}))

            next_page = response.xpath("//div[@class='yk-pager']/ul[@class='yk-pages']/li[@title='%s']/a/@href" % u'下一页').extract()
            if next_page:
                snext_page = next_page[0].strip()
                if snext_page.find(self.url_prefix) < 0:
                    snext_page = self.url_prefix + snext_page
                items.append(Request(url=snext_page, callback=self.parse_page, meta={'page': page+1, 'cat_id': cat_id}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_info(self,response):
        try:
            logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            page_id = self.get_youku_pageid(response.request.url)
            if not page_id:
                log.error('miss content id: %s' % response.request.url)
                return

            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']
            items = []

            year_list = []

            title = self.parse_title(response,cat_id)
            performer_list = self.parse_actor(response)
            director_list = self.parse_director(response)
            district_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'地区:').extract()
            type_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'类型:').extract()
            play_date = self.parse_play_date(response)
            total_num = self.parse_total_num(response)

            year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)

            #text
            text = response.xpath('//div[@class="detail"]/span/text()').extract()

            videoitems = []

            ep_item = MediaItem()
            if title:
                ep_item["title"] = title[0].strip()
            if pers:
                ep_item["actor"] = pers
            if dirs:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = district_list[0].strip()
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)
            if total_num:
                ep_item["vcount"] = total_num

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url,"youku")
            if text:
                ep_item["intro"] = text[0].strip()
            ep_item["cont_id"] = page_id
            ep_item["info_id"] = Util.md5hash(Util.summarize(ep_item))

            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            mvitem["media"] = ep_item
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid

            video_list = self.parse_video_item(response, cat_id, ep_item["title"], page_id)
            mvitem['video'] = video_list
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            items.append(mvitem)

        except Exception as e: 
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_video_item_media(self,code,pn):
        videoitems = []
        try:
            getlist_url = "http://v.youku.com/x_getAjaxData?md=showlistnew&vid=%s&pl=100&pn=%d" % (code,pn)
            urllist_info = self.httpdownload.get_data(getlist_url,ua=self.ua)
            if urllist_info:
                try:
                    json_data = json.loads(urllist_info)
                except Exception as e:
                    return videoitems
                if json_data and "showlistnew" in json_data:
                    if json_data["showlistnew"]:
                        items = json_data["showlistnew"]["items"]
                        vnum_name = ""
                        if type(items)==list:
                            videoseq = set()
                            videostage = set()
                            for item in items:
                                if "preview" in item:
                                    continue
                                videoseq.add(item["show_videoseq"])
                                videostage.add(item["show_videostage"])
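                            # use whichever numbering field has more distinct values as the episode number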
                            if len(videoseq)>len(videostage):
                                vnum_name = "show_videoseq"
                            else:
                                vnum_name = "show_videostage"
                            for item in items:
                                if "preview" in item:
                                    continue
                                if "videoid" not in item:
                                    continue
                                vitem = VideoItem()
                                vitem["url"] = "http://v.youku.com/v_show/id_%s.html" % item["videoid"]
                                vitem["vnum"] = item[vnum_name]
                                vitem["title"] = item["title"]
                                vitem["os_id"] = self.os_id
                                vitem["ext_id"] = Util.md5hash(vitem["url"])
                                vitem["site_id"] = self.site_id
                                vitem["cont_id"] = item["videoid"]
                                videoitems.append(vitem)
                        elif type(items)==dict:                    
                            videoseq = set()
                            videostage = set()
                            for k in items:
                                item = items[k] 
                                if "preview" in item:
                                    continue
                                videoseq.add(item["show_videoseq"])
                                videostage.add(item["show_videostage"])
                            if len(videoseq)>len(videostage):
                                vnum_name = "show_videoseq"
                            else:
                                vnum_name = "show_videostage"
                            for k in items:
                                item = items[k]
                                if "preview" in item:
                                    continue
                                if "videoid" not in item:
                                    continue
                                vitem = VideoItem()
                                vitem["url"] = "http://v.youku.com/v_show/id_%s.html" % item["videoid"]
                                vitem["vnum"] = item[vnum_name]
                                vitem["title"] = item["title"]
                                vitem["os_id"] = self.os_id
                                vitem["ext_id"] = Util.md5hash(vitem["url"])
                                vitem["site_id"] = self.site_id
                                vitem["cont_id"] = item["videoid"]
                                videoitems.append(vitem)
                        else:
                            logging.log(logging.ERROR, getlist_url)
                            pass
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return videoitems

    def parse_video_item(self, response, cat_id, title, media_page_id):
        videoitems = []
        try:
            play_url = self.parse_play_url(response)
            if play_url:
                url = Util.normalize_url(play_url[0], "youku")
                cont_id = self.get_youku_showid(url)
                i=1
                while True:
                    item = self.parse_video_item_media(cont_id,i)
                    if item:
                        videoitems = videoitems + item
                        i = i+1
                    else:
                        break
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
            return videoitems

    def parse_title(self,response,cat_id):
        title = []
        try:
            #title = response.xpath('//div[@id="title_wrap"]/div[@id="title"]/h1/span[@class="name"]/text()').extract()
            title = response.xpath('//div[@id="title_wrap"]/div[@id="title"]/div[@class="base"]/h1/span[@class="name"]/text()').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return title

    def parse_actor(self,response):
        performer_list = []
        try:
            performer_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'主演:').extract()
            if not performer_list:
                performer_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'主持人:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return performer_list

    def parse_director(self,response):
        director_list = []
        try:
            director_list = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../a/text()'  % u'导演:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return director_list

    def parse_play_url(self,response):
        play_list = []
        try:
            play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"播放正片").extract()
            if not play_list:
                play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"播放").extract()
            if not play_list:
                play_list = response.xpath("//div[@class='showInfo poster_w yk-interact']/ul[@class='baseaction']/li[@class='action']/a/em[text()='%s']/../@href" % u"免费试看").extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return play_list

    def get_youku_pageid(self,url):
        id = ""
        try:
            #http://www.youku.com/show_page/id_zed6b4c7497b811e4b522.html
            r = re.compile(r'http://www.youku.com/show_page/id_([^_]+).*\.html')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def get_youku_showid(self,url):
        #http://v.youku.com/v_show/id_XNzUyMDUwOTAw.html
        id = ""
        try:
            r = re.compile(r'http://v.youku.com/v_show/id_([^/]+).*\.html')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def parse_play_date(self,response):
        res = []
        strdate = None
        try:
            res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()'  % u'优酷上映:').extract()
            if not res:
                res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()'  % u'优酷开播:').extract()
            if not res:
                res = response.xpath('//ul[@class="baseinfo"]/li/span/label[text()="%s"]/../text()'  % u'上映:').extract()
            if res:
                strdate = res[0]
        except Exception as e:
            pass
        return strdate

    def parse_total_num(self,response):
        res = None
        try:
            info_list = response.xpath('//div[@class="basenotice"]/text()').extract()
            for info in info_list:
                r = re.compile(ur'.*%s(\d+)%s.*' % (u'共',u'集'))
                m = r.search(info)
                if m:
                    return m.group(1)
        except Exception as e:
            pass
        return res
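
Similarly, a standalone illustration of the youku URL-id regexes above, using the sample URLs quoted in their comments:

import re

page_id = re.compile(r'http://www.youku.com/show_page/id_([^_]+).*\.html')
assert page_id.match('http://www.youku.com/show_page/id_zed6b4c7497b811e4b522.html').group(1) == 'zed6b4c7497b811e4b522'

show_id = re.compile(r'http://v.youku.com/v_show/id_([^/]+).*\.html')
assert show_id.match('http://v.youku.com/v_show/id_XNzUyMDUwOTAw.html').group(1) == 'XNzUyMDUwOTAw'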
Example #11
class qq_spider(Spider):
    name = "qq"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "qq"
    site_id = ""  #qq
    allowed_domains = ["v.qq.com", "film.qq.com", "s.video.qq.com"]
    url_prefix = 'http://v.qq.com'
    #used for guess_site
    site_name = Util.guess_site(url_prefix)

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    #site_code = str(mgr.get_site(site_name)["site_code"])
    channel_map = {}
    channel_map = mgr.get_channel_map()
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')
    global_spider = True
    httpdownload = HTTPDownload()

    channel_info = {}

    movie_id = ""
    tv_id = ""
    variety_id = ""
    cartoon_id = ""

    test_page_url = None
    test_channel_id = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(qq_spider, self).__init__(*args, **kwargs)
        cat_urls = []
        tasks = None
        if json_data:
            data = json.loads(json_data)
            if "type" in data:
                spider_type = data["type"]
                if spider_type != "global":
                    self.global_spider = False
            tasks = []
            if "id" in data and "url" in data:
                ttask = {}
                ttask["id"] = data["id"]
                ttask["url"] = data["url"]
                ttask["sid"] = ""
                ttask["untrack_id"] = ""
                cat_urls.append(ttask)

            cmd = data["cmd"]
            if cmd == "assign":
                tasks = data["task"]
            elif cmd == "trig":
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            elif cmd == "test" and 'id' in data and 'url' in data:
                self.test_page_url = data["url"]
                self.test_channel_id = data["id"]

            if tasks:
                for task in tasks:
                    ttask = {}
                    ttask["url"] = task["url"]
                    code = task["code"]
                    ttask["id"] = self.channel_map[code]
                    ttask["untrack_id"] = task["untrack_id"]
                    ttask["sid"] = task["sid"]
                    cat_urls.append(ttask)

        self._cat_urls = []
        if cat_urls:
            self._cat_urls = cat_urls

    def start_requests(self):
        items = []
        try:
            cat_urls = []

            self.movie_id = self.mgr.get_channel('电影')["channel_id"]
            self.tv_id = self.mgr.get_channel('电视剧')["channel_id"]
            self.variety_id = self.mgr.get_channel('综艺')["channel_id"]
            self.cartoon_id = self.mgr.get_channel('动漫')["channel_id"]

            self.channel_info = {
                self.movie_id: u"电影",
                self.tv_id: u"电视剧",
                self.variety_id: u"综艺",
                self.cartoon_id: u"动漫"
            }

            if self.test_page_url:
                turl = Util.normalize_url(self.test_page_url, "qq")
                items.append(
                    Request(url=self.test_page_url,
                            callback=self.parse_single_episode,
                            meta={
                                'cat_id': self.test_channel_id,
                                'page': 1
                            }))
                return items

            if not self._cat_urls:
                #cat_urls = [{'url':'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html','id':self.tv_id}]
                cat_urls = [{
                    'url':
                    'http://v.qq.com/movielist/10001/0/0/1/0/10/1/0.html',
                    'id': self.movie_id
                }, {
                    'url':
                    'http://v.qq.com/list/2_-1_-1_-1_0_1_1_10_-1_-1_0.html',
                    'id': self.tv_id
                }, {
                    'url': 'http://v.qq.com/variety/type/list_-1_0_0.html',
                    'id': self.variety_id
                }, {
                    'url':
                    'http://v.qq.com/cartlist/0/3_-1_-1_-1_-1_1_0_1_10.html',
                    'id': self.cartoon_id
                }]

                for cat in cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_type,
                                meta={
                                    'cat_id': cat['id'],
                                    'page': 1
                                }))
            else:
                for cat in self._cat_urls:
                    channel_id = str(cat["id"])
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_single_episode,
                                meta={
                                    'cat_id': channel_id,
                                    'page': 1,
                                    "untrack_id": cat["untrack_id"],
                                    "sid": cat["sid"]
                                }))

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_single_episode(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            urls = response.xpath(
                '//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href'
            ).extract()
            #cartoon is different
            if not urls:
                turls = response.xpath(
                    '//div[@class="mod_player_head cf"]/div[1]/div[1]/a/@href'
                ).extract()
                if turls:
                    tlen = len(turls)
                    urls = [turls[tlen - 1]]

            if urls:
                turl = self.url_prefix + urls[0]
                #print "turl",turl
                #turl = "http://v.qq.com/p/tv/detail/hqg/index.html"
                items.append(
                    Request(url=turl,
                            callback=self.parse_episode_info,
                            meta={
                                'cat_id': cat_id,
                                'poster_url': '',
                                'page': 1,
                                "untrack_id": untrack_id,
                                "sid": sid
                            }))
            else:
                ttitem = self.parse_episode_play(response)
                if ttitem and self.check_url(ttitem):
                    items.append(ttitem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_type(self, response):
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[1]/li/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_area,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))

            titem = self.parse(response)
            if titem:
                items.extend(titem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_area(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_area: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[2]/li/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_year,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_year(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_year: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="mod_indexs bor"]/div[@class="mod_cont"]/ul[3]/li/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_sort,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_sort(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_sort: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="mod_tab_sort"]/ul/li/a/@href').extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    # for each category, parse all of its sub-categories or types; called by parse_sort
    def parse(self, response):
        items = []
        try:
            page = response.request.meta['page']
            logging.log(logging.INFO,
                        'lev1: %s,%s' % (str(page), response.request.url))
            #if int(page) > int(self.max_update_page) and not self.global_spider:
            #    return

            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']

            play_url = ""
            subs = response.xpath(
                '//div[@class="grid_18"]/div[2]/div[@class="mod_cont"]/div[@class="mod_item"]'
            )
            # variety pages use a different layout
            if not subs:
                subs = response.xpath(
                    '//div[@class="grid_18"]/div[2]/div[@class="mod_cont"]/div[@class="mod_item pic_160"]'
                )

            for sub in subs:
                play_url = sub.xpath(
                    './div[@class="mod_txt"]/div[@class="mod_operate"]/a/@href'
                ).extract()
                if not play_url:
                    play_url = sub.xpath(
                        './div[@class="mod_txt"]/div[@class="mod_item_tit"]/h6/a/@href'
                    ).extract()
                pic_urls = sub.xpath(
                    './div[@class="mod_pic"]/a/img/@src').extract()
                pic_url = ""
                if pic_urls:
                    pic_url = pic_urls[0]
                if play_url:
                    items.append(
                        Request(url=play_url[0].strip(),
                                callback=self.parse_episode,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': pic_url
                                }))

            next_page = response.xpath(
                "//div[@class='mod_pagenav']/p/a[@title='%s']/@href" %
                u'下一页').extract()
            if next_page:
                snext_page = next_page[0].strip()
                if snext_page.find("v.qq.com") < 0:
                    snext_page = "http://v.qq.com" + snext_page
                items.append(
                    Request(url=snext_page,
                            callback=self.parse,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_play(self, response):
        mvitem = None
        try:
            logging.log(logging.INFO,
                        'parse_episode_play: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = ""
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            #items = []

            #title
            title_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="title_wrap"]/h3/a/@title'
            ).extract()
            if not title_list:
                title_list = response.xpath(
                    '//div[@class="intro_lt"]/div[@class="intro_title cf"]/p[@class="title_cn"]/text()'
                ).extract()
            #performer
            performer_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="actor"]/a/text()'
            ).extract()
            #director
            director_list = response.xpath(
                '//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()'
                % u'导演:').extract()
            #type_list = response.xpath('//div[@class="movie_info"]/div[@class="movie_detail"]/dl[@class="detail_list"]/dd[@class="type"]/span[text()="%s"]/a/text()' % u'导演:').extract()

            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)

            #text
            text = response.xpath(
                '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
            ).extract()

            ep_item = MediaItem()
            videoitems = []

            #not film
            if int(cat_id) != int(self.movie_id):
                #video list
                #video_list = response.xpath('//div[@class="mod_player_side_inner"]/div[2]/div[1]/div[1]/div[1]/div[1]/ul[1]/li')
                video_list = response.xpath(
                    '//div[@class="tabcont_warp tabcont_warp_yespadding"]/div[@class="tabcont_album"]/ul[@class="album_list cf"]/li'
                )
                i = 0
                for tvideo in video_list:
                    lurl = tvideo.xpath('./a/@href').extract()
                    surl = ""
                    #lnum = tvideo.xpath('./a/@title').extract()
                    lnum = tvideo.xpath('./a/span/text()').extract()

                    vitem = VideoItem()
                    if lnum and lurl:
                        vitem["vnum"] = lnum[0]
                        surl = "http://film.qq.com" + lurl[0]
                        vitem["os_id"] = self.os_id
                        vitem["site_id"] = self.site_id
                        #vitem["cont_id"] = self.get_vid(response.body,surl)
                        turl = ""
                        if int(cat_id) == int(self.tv_id):
                            turl = Util.normalize_url(surl, "qq", "tv")
                        elif int(cat_id) == int(self.cartoon_id):
                            turl = Util.normalize_url(surl, "qq", "cartoon")
                        else:
                            turl = Util.normalize_url(surl, "qq")
                        if turl:
                            vitem["ext_id"] = Util.md5hash(turl)
                            vitem["url"] = turl
                            vitem["cont_id"] = self.get_qq_showid(turl)
                        else:
                            continue
                    else:
                        continue

                    videoitems.append(vitem)
            else:
                vitem = VideoItem()
                if title_list:
                    vitem["title"] = title_list[0]
                vitem["vnum"] = "1"
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                #vitem["cont_id"] = self.get_vid(response.body,response.request.url)
                turl = Util.normalize_url(response.request.url, "qq")
                vitem["url"] = turl
                vitem["ext_id"] = Util.md5hash(turl)
                vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                videoitems.append(vitem)

            if len(title_list) > 0:
                ep_item["title"] = title_list[0]
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if len(text) > 0:
                ep_item["intro"] = text[0]
            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url

            videoid = self.get_qq_showid(response.request.url)
            #videoid = self.get_vid(response.body,response.request.url)
            ep_item["cont_id"] = videoid

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems
            #mvitem["media"]["url"] = response.request.url
            mvitem["media"]["url"] = Util.normalize_url(
                response.request.url, "qq")
            #mvitem["ext_id"] = Util.md5hash(mvitem["media"]["url"])

            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            Util.set_ext_id(mvitem["media"], mvitem["video"])
            #items.append(mvitem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return mvitem
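The ext_id and info_id fields above are produced by Util.md5hash over the normalized play-page URL and over Util.summarize of the media item. A minimal sketch of that dedup key, assuming Util.md5hash is simply a hex MD5 of the UTF-8 bytes (the real helper is not shown in this example):

import hashlib

def md5hash(text):
    # assumed equivalent of Util.md5hash: hex MD5 digest of the UTF-8 bytes
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()

# ext_id would then be the hash of the normalized play-page URL, e.g.
# md5hash(Util.normalize_url(play_url, "qq"))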

    # enter the play page first, then the media page; check whether the media page is reachable, and if not, parse the play-page info directly
    def parse_episode(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'lev2: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']

            urls = response.xpath(
                '//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href'
            ).extract()
            #cartoon is different
            if not urls:
                turls = response.xpath(
                    '//div[@class="mod_player_head cf"]/div[1]/div[1]/a/@href'
                ).extract()
                if turls:
                    tlen = len(turls)
                    urls = [turls[tlen - 1]]
            if urls:
                turl = self.url_prefix + urls[0]
                items.append(
                    Request(url=turl,
                            callback=self.parse_episode_info,
                            meta={
                                'cat_id': cat_id,
                                'poster_url': poster_url
                            }))
            # cannot jump to the media page, so parse the play page directly
            else:
                logging.log(logging.INFO,
                            'not jump to episode: %s' % response.request.url)
                titem = self.parse_episode_play(response)
                if titem and self.check_url(titem):
                    items.append(titem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_info(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            #title
            title = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/strong/a/text()'
            ).extract()
            if not title or not title[0]:
                title = response.xpath(
                    '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h1/strong/@title'
                ).extract()
                if not title or not title[0]:
                    title = response.xpath(
                        '//div[@class="mod_box mod_video_info"]/div[@class="mod_hd mod_hd_border"]/h2/strong/@title'
                    ).extract()
                    if not title or not title[0]:
                        title = response.xpath(
                            '//div[@class="mod_page_banner"]/div[@class="banner_pic"]/a/@title'
                        ).extract()
            #performer
            #performer_list = response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[2]/div[1]/a/span/text()').extract()
            performer_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_cast"]/a/span/text()'
            ).extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'主演:').extract()
            #director
            #director_list=response.xpath('//div[@class="mod_video_intro mod_video_intro_rich"]/div[2]/div[3]/div[1]/a/span/text()').extract()
            director_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line cf"]/div[@class="info_director"]/a/span/text()'
            ).extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/span/text()'
                    % u'导演:').extract()
            #text
            text = response.xpath(
                '//div[@class="movie_info_wrap"]/div[1]/d1[1]/dd[3]/p[@class="detail_all"]/text()'
            ).extract()
            if not text:
                text = response.xpath(
                    '//div[@class="mod_video_focus"]/div[@class="info_desc"]/span[@class="desc"]/text()'
                ).extract()
            type_list = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_info cf"]/div[@class="info_line info_line_tags cf"]/div[@class="info_tags"]/a/span/text()'
            ).extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'类型:').extract()
            year_info = response.xpath(
                '//div[@class="mod_video_intro mod_video_intro_rich"]/div[@class="video_title"]/span[@class="video_current_state"]/span[@class="current_state"]/text()'
            ).extract()
            if not year_info:
                year_info = response.xpath(
                    '//div[@class="video_info cf"]/div[@class="info_line cf"]/p/span[text()="%s"]/../span[@class="content"]/a/text()'
                    % u'年份:').extract()
            play_date = None
            if year_info:
                play_date = self.get_year(year_info[0])

            #
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            pers = Util.join_list_safely(performer_list)

            #sourceid
            sourceid = ""
            sourceid_list = response.xpath(
                '//div[@class="mod_bd sourceCont"]/@sourceid').extract()
            if sourceid_list:
                sourceid = sourceid_list[0]

            videoitems = []

            ep_item = MediaItem()

            if len(title) > 0:
                ep_item["title"] = title[0]
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if play_date:
                ep_item["release_date"] = Util.str2date(play_date)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["url"] = Util.normalize_url(response.request.url, "qq")
            ep_item["poster_url"] = poster_url

            if len(text) > 0:
                ep_item["intro"] = text[0]

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems

            vurl = ""
            url_pre = "http://s.video.qq.com/loadplaylist?vkey="
            url_tail = "&vtype=2&otype=json&video_type=2&callback=jQuery191048201349820010364_1425370006500&low_login=1"

            videoid = self.get_qq_showid(response.request.url)
            #videoid = self.get_vid(response.body,response.request.url)
            mvitem["media"]["cont_id"] = videoid
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))
            vurl = url_pre + str(sourceid) + url_tail

            tflag = "jQuery191048201349820010364_1425370006500"
            tpitem = self.parse_play_list(cat_id, vurl, tflag, response)
            # no sourceid available, e.g. on topic pages
            if not tpitem:
                tpitem = self.parse_topic_play_list(response)
                videoids = response.xpath(
                    '//div[@class="mod_episodes_info episodes_info"]/input[@name="cid"]/@value'
                ).extract()
                if videoids:
                    mvitem["media"]["cont_id"] = videoids[0]
            if tpitem:
                mvitem["video"] = tpitem
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if untrack_id:
                    mvitem["untrack_id"] = untrack_id
                if sid:
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_play_list(self, cat_id, url, flag, response):
        item = None
        videoitems = []
        try:
            ep_item = MediaItem()
            item = MediaVideoItem()
            item["media"] = ep_item
            item['video'] = videoitems

            info = None
            try:
                info = self.httpdownload.get_data(url)
            except Exception as e:
                logging.log(logging.ERROR, traceback.format_exc())
                return videoitems
            if not info or len(info) < 2:
                return videoitems

            msg = info
            bodylen = len(msg) - 1
            index = msg.find(flag) + len(flag) + 1
            info = msg[index:bodylen]
            jinfo = json.loads(info)
            if "video_play_list" not in jinfo:
                return videoitems
            itemlist = jinfo["video_play_list"]["playlist"]
            for titem in itemlist:
                if "episode_number" not in titem:
                    continue
                info = titem["episode_number"]
                if info and titem["title"].find(u"预告") < 0 and url.find(
                        "qq.com") >= 0:
                    vitem = VideoItem()
                    vitem["title"] = titem["title"]
                    tvnum = string.replace(info, "-", "")
                    # the episode number may be a non-numeric string, e.g.
                    # http://v.qq.com/detail/x/xk98t8hntls72f4.html
                    tvnum_list = re.findall(r'[\D]+', tvnum)
                    if not tvnum_list:
                        vitem["vnum"] = tvnum
                    else:
                        continue
                    vitem["os_id"] = self.os_id
                    vitem["site_id"] = self.site_id
                    turl = ""
                    if int(cat_id) == int(self.tv_id) or int(cat_id) == int(
                            self.cartoon_id):
                        turl = Util.normalize_url(titem["url"], "qq", "tv")
                    else:
                        turl = Util.normalize_url(titem["url"], "qq")
                    if turl:
                        vitem["ext_id"] = Util.md5hash(turl)
                        #vitem["cont_id"] = self.get_vid(response.body,turl)
                        vitem["url"] = turl
                        vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                    else:
                        continue
                    videoitems.append(vitem)

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems
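parse_play_list above peels the jQuery callback wrapper off the loadplaylist response before handing it to json.loads. A minimal sketch of that step, assuming the body has the usual callback({...}) shape:

import json

def strip_jsonp(body, flag):
    # skip everything up to and including "<flag>(" and drop the trailing ")"
    start = body.find(flag) + len(flag) + 1
    return json.loads(body[start:len(body) - 1])

# e.g. strip_jsonp(raw, "jQuery191048201349820010364_1425370006500")["video_play_list"]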

    def parse_topic_play_list(self, response):
        item = None
        videoitems = []
        try:
            subs = response.xpath(
                '//div[@class="mod_video_fragments"]/div[@class="mod_figures_1"]/ul/li'
            )
            for sub in subs:
                vitem = VideoItem()
                title = sub.xpath('./strong/a/text()').extract()
                vitem["os_id"] = self.os_id
                vitem["site_id"] = self.site_id
                turl = sub.xpath('./strong/a/@href').extract()
                if title and title[0].find(u"预告") < 0:
                    if turl and (turl[0].find(".com") < 0
                                 or turl[0].find("qq.com") >= 0):
                        vitem["title"] = title[0].strip()
                        vitem["vnum"] = self.get_num(vitem["title"])
                        sturl = turl[0]
                        if turl[0].find("qq.com") < 0:
                            sturl = self.url_prefix + turl[0]
                        vitem["url"] = Util.normalize_url(sturl, "qq", "tv")
                        vitem["ext_id"] = Util.md5hash(vitem["url"])
                        vitem["cont_id"] = self.get_qq_showid(vitem["url"])
                        videoitems.append(vitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

    def get_qq_showid(self, url):
        id = ""
        try:
            #http://v.qq.com/detail/j/jlw8mddv9wkv1a3.html
            #http://film.qq.com/cover/y/yuq5nnt2wwlwfle.html
            #r = re.compile(r'http://.+/id_([^_]+).*\.html')
            #r = re.compile(r'http://.+/.+/[0-9a-zA-Z]/([^_]+).*\.html')
            r = re.compile(r'http://[^/]*\.qq\.com/cover/.+?/([^/]*)\.html')
            m = r.match(url)
            if m:
                return m.group(1)
            else:
                r = re.compile(r'http://[^/]*\.qq\.com/[^/]*/.+?/([^/]*)\.html')
                m = r.match(url)
                if m:
                    return m.group(1)
        except Exception as e:
            pass
        return id
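For reference, a hedged illustration of what the two patterns in get_qq_showid capture on the sample URLs quoted in the comments (the ids shown are what the escaped regexes would match, not verified live output):

import re

COVER_RE = re.compile(r'http://[^/]*\.qq\.com/cover/.+?/([^/]*)\.html')
GENERIC_RE = re.compile(r'http://[^/]*\.qq\.com/[^/]*/.+?/([^/]*)\.html')

for u in ('http://film.qq.com/cover/y/yuq5nnt2wwlwfle.html',
          'http://v.qq.com/detail/j/jlw8mddv9wkv1a3.html'):
    m = COVER_RE.match(u) or GENERIC_RE.match(u)
    print(m.group(1) if m else '')  # yuq5nnt2wwlwfle, then jlw8mddv9wkv1a3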

    def get_vid(self, content, url):
        id = ""
        try:
            #url=http://v.qq.com/cover/k/krl2051za26trxu.html?vid=r0016fx050p"
            if url and url.find("vid") != -1:
                r = re.compile(r'.*[?&]vid=([^&]+)')
                m = r.search(url)
                if m:
                    id = m.group(1)
            if not id and len(content) > 0:
                #vid:"f0016l11uqt"
                #r = re.compile(r'vid:.([^"])"')
                r = re.compile(r'vid:.(.*)".*')
                m = r.search(content)
                if m:
                    id = m.group(1)
                if not id:
                    #r = re.compile(r".*vid.:.(.*)'.*")
                    r = re.compile(r".*vid.:.'(.*)'.*")
                    m = r.search(content)
                    if m:
                        id = m.group(1)

            if not id:
                id = self.get_qq_showid(url)
        except Exception as e:
            pass
        return id

    def convert_url(self, url):
        res = url
        try:
            pass
        except Exception as e:
            pass
        return res

    def check_all(self, mvitem):
        res = True
        try:
            if 'video' not in mvitem:
                res = False
            if 'video' in mvitem:
                if len(mvitem['video']) == 0:
                    res = False

            if res:
                res = self.check_url(mvitem)
        except Exception as e:
            pass
        return res

    def check_url(self, mvitem):
        res = True
        try:
            if 'video' in mvitem:
                for video in mvitem['video']:
                    if 'url' in video:
                        tres = self.is_same_site(video['url'])
                        if not tres:
                            res = False
                            break
        except Exception as e:
            pass
        return res

    def is_same_site(self, url):
        res = True
        try:
            tsite = Util.guess_site(url)
            if tsite != self.site_name:
                res = False
        except Exception as e:
            res = False
        return res

    def get_year(self, data):
        year = None
        try:
            #r = re.compile(r'.*([\d]+).*')
            #m = r.match(data)
            #m = r.search(data)
            #if m:
            #    print "get year",data,m.group(1)
            #    return m.group(1)
            tyear = re.findall(r'[\d]+', data)
            if tyear:
                return tyear[0]
        except Exception as e:
            pass
        return year

    def get_num(self, data):
        num = None
        try:
            #r = re.compile(r'.*(\d+).*')
            #m = r.search(data)
            #if m:
            #    return m.group(1)
            num = re.findall(r'[\d]+', data)
            if num:
                return num[0]
        except Exception as e:
            pass
        return num
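Both helpers above simply take the first run of digits from the raw text; a quick illustration with made-up inputs:

import re

re.findall(r'[\d]+', u'2015-06-12')[0]   # -> '2015' (get_year)
re.findall(r'[\d]+', u'第3集')[0]        # -> '3'    (get_num)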
Exemple #12
0
class sohu_spider(Spider):
    name = "sohu"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "sohu"  #sohu
    site_id = ""  #sohu
    allowed_domains = ["so.tv.sohu.com", "tv.sohu.com"]
    url_prefix = 'http://so.tv.sohu.com'
    #used for guess_site
    site_name = Util.guess_site(url_prefix)

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    channel_map = mgr.get_channel_map()
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')
    global_spider = True
    httpdownload = HTTPDownload()

    channel_info = {}
    movie_id = None
    tv_id = None
    variety_id = None
    cartoon_id = None

    test_page_url = None
    test_channel_id = None
    cmd_json = {}

    album_api = 'http://pl.hd.sohu.com/videolist?playlistid=%s&pagenum=%s'

    def __init__(self, json_data=None, *args, **kwargs):
        super(sohu_spider, self).__init__(*args, **kwargs)
        self._cat_urls = []
        tcat_urls = []
        if json_data:
            data = json.loads(json_data)
            if "type" in data:
                spider_type = data["type"]
                if spider_type != "global":
                    self.global_spider = False
            tasks = []
            if "id" in data and "url" in data:
                ttask = {}
                ttask["id"] = data["id"]
                ttask["url"] = data["url"]
                ttask["sid"] = ""
                ttask["untrack_id"] = ""
                self._cat_urls.append(ttask)

            cmd = data["cmd"]
            if cmd == "assign":
                tasks = data["task"]
            elif cmd == "trig":
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            elif cmd == 'carpet':
                tasks = self.mgr.get_video_url(self.site_code)
            elif cmd == "test" and 'id' in data and 'url' in data:
                self.test_page_url = data["url"]
                self.test_channel_id = data["id"]
            elif cmd == "episode" and 'id' in data and 'url' in data:
                self.cmd_json = data
            elif cmd == "debug":
                #tasks = [{"mid":"503669", "url":"http://tv.sohu.com/20151204/n429762764.shtml", "name":"综艺", "code":"variaty"}]
                #tasks = [{"mid":"510798", "url":"http://tv.sohu.com/20090824/n266189779.shtml", "name":"综艺", "code":"variaty"}]
                tasks = [{
                    "mid": "502525",
                    "url": "http://tv.sohu.com/20110617/n310505202.shtml",
                    "name": "综艺",
                    "code": "variaty"
                }]

            for task in tasks:
                ttask = {}
                ttask["url"] = task["url"]
                code = task["code"]
                ttask["id"] = self.channel_map[code]
                ttask["untrack_id"] = task[
                    "untrack_id"] if 'untrack_id' in task else None
                ttask["sid"] = task["sid"] if 'sid' in task else None
                ttask['mid'] = task['mid'] if 'mid' in task else None
                self._cat_urls.append(ttask)

    def start_requests(self):
        try:
            items = []

            self.movie_id = str(self.mgr.get_channel('电影')["channel_id"])
            self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"])
            self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"])
            self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"])

            self.channel_info = {
                self.movie_id: u"电影",
                self.tv_id: u"电视剧",
                self.variety_id: u"综艺",
                self.cartoon_id: u"动漫"
            }
            if self.test_page_url:
                turl = Util.normalize_url(self.test_page_url, "sohu")
                items.append(
                    Request(url=self.test_page_url,
                            callback=self.parse_page,
                            meta={
                                'cat_id': self.test_channel_id,
                                'page': 1
                            }))
                return items

            if self.cmd_json:
                items.append(
                    Request(url=self.cmd_json['url'],
                            callback=self.parse_episode_info,
                            meta={
                                'cat_id': self.cmd_json["id"],
                                'poster_url': ''
                            }))
                return items

            if not self._cat_urls:
                #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html','id':self.variety_id}]
                cat_urls = [{
                    'url':
                    'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html',
                    'id': self.movie_id
                }, {
                    'url':
                    'http://so.tv.sohu.com/list_p1101_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                    'id': self.tv_id
                }, {
                    'url':
                    'http://so.tv.sohu.com/list_p1106_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                    'id': self.variety_id
                }, {
                    'url':
                    'http://so.tv.sohu.com/list_p1115_p2_p3_p4_p5_p6_p73_p8_p9_p10_p11_p12_p13.html',
                    'id': self.cartoon_id
                }]
                #cat_urls = [{'url':'http://so.tv.sohu.com/list_p1100_p2_p3_p4_p5_p6_p73_p80_p9_2d1_p10_p11_p12_p13.html','id':self.movie_id}]

                for cat in cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_type,
                                meta={
                                    'cat_id': cat['id'],
                                    'page': 1
                                }))
            else:
                for cat in self._cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_single_episode,
                                meta={
                                    'cat_id': cat["id"],
                                    'page': 1,
                                    "untrack_id": cat["untrack_id"],
                                    "sid": cat["sid"],
                                    "mid": cat["mid"]
                                }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_single_episode(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = response.request.meta['untrack_id']
            sid = response.request.meta['sid']
            mid = response.request.meta[
                'mid'] if 'mid' in response.request.meta else ""
            playtype_list = response.selector.re(
                re.compile(r'var pagetype = .*?(\D+)'))
            # a new kind of page layout was found: http://tv.sohu.com/20100804/n273985736.shtml
            # http://my.tv.sohu.com/us/49390690/29200993.shtml cannot be crawled with the current logic
            urls = response.xpath(
                '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/div[@class="crumbs"]/a[last()]'
            )
            attributes = urls.xpath('./@*').extract()
            size = len(attributes)
            urls = urls.xpath('./@href').extract()
            if size == 1 and urls and not playtype_list:
                for iurl in urls:
                    surl = Util.normalize_url(iurl, "sohu")
                    if surl and "http" in surl:
                        items.append(
                            Request(url=surl,
                                    callback=self.parse_episode_info,
                                    meta={
                                        'cat_id': cat_id,
                                        'poster_url': '',
                                        'page': 1,
                                        "untrack_id": untrack_id,
                                        "sid": sid,
                                        "mid": mid
                                    }))
            # paid movies cannot jump to the media page
            else:
                mvitem = self.parse_episode_play(response, untrack_id, sid)
                if mid:
                    mvitem['mid'] = mid
                if mvitem and "media" in mvitem and "url" in mvitem[
                        "media"] and "ext_id" in mvitem["media"]:
                    if self.check_url(mvitem):
                        items.append(mvitem)

            if not items:
                mvitem = MediaVideoItem()
                if mid:
                    mvitem['mid'] = mid
                if untrack_id and sid:
                    mvitem["untrack_id"] = untrack_id
                    mvitem["sid"] = sid
                ep_item = MediaItem()
                ep_item["site_id"] = self.site_id
                ep_item["channel_id"] = cat_id
                mvitem["media"] = ep_item

                playlistId = ""
                playlistId_list = response.selector.re(
                    re.compile(r'var playlistId.*?(\d+)'))
                if not playlistId_list:
                    playlistId_list = response.selector.re(
                        re.compile(r'var PLAYLIST_ID.*?(\d+)'))
                if not playlistId_list:
                    playlistId_list = response.selector.re(
                        re.compile(r'= playlistId.*?(\d+)'))
                if playlistId_list:
                    playlistId = playlistId_list[0]
                    items += self.api_episode_info(mvItem=mvitem,
                                                   playlistId=playlistId,
                                                   cat_id=cat_id)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
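The three playlistId regexes above are repeated in several callbacks of this spider; a hypothetical helper (not part of the original code) that tries the same patterns in order could look like this:

import re

PLAYLIST_ID_PATTERNS = (
    re.compile(r'var playlistId.*?(\d+)'),
    re.compile(r'var PLAYLIST_ID.*?(\d+)'),
    re.compile(r'= playlistId.*?(\d+)'),
)

def extract_playlist_id(response):
    # return the first playlist id found in the page scripts, else ""
    for pattern in PLAYLIST_ID_PATTERNS:
        found = response.selector.re(pattern)
        if found:
            return found[0]
    return ""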

    def parse_type(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_typ: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="sort-type"]/dl[1]/dd[@class="sort-tag"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_area,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))

            titem = self.parse_page(response)
            if titem:
                items.extend(titem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_area(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_area: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="sort-type"]/dl[2]/dd[@class="sort-tag"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_sort,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_sort(self, response):
        items = []
        try:
            #logging.log(logging.INFO, 'parse_sort: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="sort-column area"]/div[@class="column-hd"]/p[@class="st-link"]/a/@href'
            ).extract()
            for sub in subs:
                items.append(
                    Request(url=self.url_prefix + sub,
                            callback=self.parse_page,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_page(self, response):
        try:
            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            #logging.log(logging.INFO, 'parse_page: %s,%s' % (response.request.url,page))

            #if int(page) > int(self.max_update_page) and not self.global_spider:
            #    return

            items = []

            play_url = ""
            subs = response.xpath('//div[@class="column-bd cfix"]/ul[1]/li')

            for sub in subs:
                play_url = sub.xpath(
                    './div[@class="st-pic"]/a/@href').extract()
                pic_urls = sub.xpath(
                    './div[@class="st-pic"]/a/img/@src').extract()
                pic_url = ""
                if pic_urls:
                    pic_url = pic_urls[0]
                if play_url:
                    items.append(
                        Request(url=play_url[0].strip(),
                                callback=self.parse_episode_info,
                                meta={
                                    'cat_id': cat_id,
                                    'poster_url': pic_url
                                }))

            next_page = response.xpath(
                "//div[@class='column-bd cfix']/div[1]/a[@title='%s']/@href" %
                u'下一页').extract()
            if next_page:
                snext_page = next_page[0].strip()
                if snext_page.find(self.url_prefix) < 0:
                    snext_page = self.url_prefix + snext_page
                items.append(
                    Request(url=snext_page,
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_info(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']

            year_list = []
            lyears = []

            playlistId = ""
            playlistId_list = response.selector.re(
                re.compile(r'var playlistId.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'var PLAYLIST_ID.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'= playlistId.*?(\d+)'))

            if playlistId_list:
                playlistId = playlistId_list[0]
            if not playlistId:
                logging.log(
                    logging.INFO,
                    "parse_episode_info error,not find playlistid,url:%s " %
                    response.request.url)
                return items

            title_list = self.parse_title(response, cat_id)
            performer_list = self.parse_actor(response)
            director_list = self.parse_director(response)
            district_list = self.parse_district(response)
            type_list = self.parse_type_list(response)
            #year_list = response.xpath('//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()').extract()
            year_list = self.parse_year(response)
            year = None
            if year_list:
                year = year_list[0]
            #pers = "|".join([t.strip() for t in performer_list])
            #dirs = "|".join([t.strip() for t in director_list])
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            types = Util.join_list_safely(type_list)
            district = Util.join_list_safely(district_list)

            #text
            text = response.xpath(
                '//div[@class="movieCont mod"]/p[1]/span[@class="full_intro"]/text()'
            ).extract()

            play_url = ""
            play_url = response.xpath(
                '//div[@class="cfix movie-info"]/div[2]/div[@class="cfix bot"]/a[@class="btn-playFea"]/@href'
            ).extract()
            videoitems = []

            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0]
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district:
                ep_item["district"] = district
            if year:
                ep_item["release_date"] = Util.str2date(year)

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url, "sohu")
            playlistId = str(playlistId)
            ep_item["cont_id"] = playlistId

            if len(text) > 0:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            if mid:
                mvitem['mid'] = mid
            if untrack_id and sid:
                mvitem["untrack_id"] = untrack_id
                mvitem["sid"] = sid
            mvitem["media"] = ep_item
            vurl = ""
            ttvitem = []
            if title_list:
                ttvitem = self.parse_video_item(cat_id, playlistId)
            if ttvitem:
                mvitem['video'] = ttvitem
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if self.check_url(mvitem):
                    items.append(mvitem)
            if not items and playlistId:
                items += self.api_episode_info(mvitem,
                                               playlistId,
                                               cat_id=cat_id)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def api_episode_info(self, mvItem=None, playlistId='', cat_id=''):
        # callers should ensure mvItem and playlistId are non-empty, and that mvItem carries mid or sid/untrack_id as well as channel_id and site_id
        items = []
        try:
            mvitem = mvItem
            ep_item = mvitem["media"]

            url = self.album_api % (playlistId, 1)
            logging.log(logging.INFO, 'api_episode_info, info url %s' % url)
            info = self.httpdownload.get_data(url)
            info = info.decode('gbk').encode('utf-8')
            info_json = json.loads(info)

            actor_list = info_json.get("mainActors")
            director_list = info_json.get("directors")
            type_list = info_json.get("categories")
            if "actor" not in ep_item and actor_list:
                ep_item["actor"] = Util.join_list_safely(actor_list)
            if "director" not in ep_item and director_list:
                ep_item["director"] = Util.join_list_safely(director_list)
            if "type" not in ep_item and type_list:
                ep_item["type"] = Util.join_list_safely(type_list)
            if "title" not in ep_item:
                ep_item["title"] = info_json.get("albumName")
            if "district" not in ep_item:
                ep_item["district"] = info_json.get("area")
            if "release_date" not in ep_item and info_json.get("publishYear"):
                ep_item["release_date"] = Util.str2date(
                    str(info_json.get("publishYear")))
            if "intro" not in ep_item:
                ep_item["intro"] = info_json.get("albumDesc")
            if "poster_url" not in ep_item or not str.strip(
                    str(ep_item["poster_url"])):
                ep_item["poster_url"] = info_json.get("pic240_330")
            if "cont_id" not in ep_item:
                ep_item["cont_id"] = playlistId

            ttvitem = []
            if ep_item['title']:
                mvitem['media'] = ep_item
                ttvitem = self.parse_video_item(cat_id, playlistId)
            if ttvitem:
                mvitem['video'] = ttvitem
                if "url" not in mvitem["media"]:
                    mvitem["media"]["url"] = ttvitem[0]['url']
                mvitem["media"]["info_id"] = Util.md5hash(
                    Util.summarize(mvitem["media"]))
                Util.set_ext_id(mvitem["media"], mvitem["video"])
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items
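api_episode_info drives everything off the videolist endpoint in album_api, whose JSON comes back GBK-encoded. A minimal standalone sketch of that call (urllib2 is used here only for illustration; the spider itself goes through self.httpdownload):

import json
import urllib2

def fetch_album(playlist_id, page=1):
    url = 'http://pl.hd.sohu.com/videolist?playlistid=%s&pagenum=%s' % (playlist_id, page)
    raw = urllib2.urlopen(url).read()
    # the endpoint returns GBK, so re-encode before parsing
    return json.loads(raw.decode('gbk').encode('utf-8'))

# fields read above: albumName, mainActors, directors, categories, area,
# publishYear, albumDesc, pic240_330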

    def parse_episode_play(self, response, untrack_id, sid):
        mvitem = None
        try:
            logging.log(logging.INFO,
                        'parse_episode_play: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            #vip
            title_list = response.xpath(
                '//div[@id="crumbsBar"]/div[@class="area cfix"]/div[@class="left"]/h2/@title'
            ).extract()
            director_list = response.xpath(
                '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
                u'导演:').extract()
            performer_list = response.xpath(
                '//div[@class="info info-con"]/ul/li[text()="%s"]/a/text()' %
                u'主演:').extract()
            text = response.xpath(
                '//div[@class="info info-con"]/p[@class="intro"]/text()'
            ).extract()
            pers = "|".join([t.strip() for t in performer_list])
            dirs = "|".join([t.strip() for t in director_list])
            playlistId = ""
            playlistId_list = response.selector.re(
                re.compile(r'var playlistId.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'var PLAYLIST_ID.*?(\d+)'))
            if not playlistId_list:
                playlistId_list = response.selector.re(
                    re.compile(r'= playlistId.*?(\d+)'))

            if playlistId_list:
                playlistId = playlistId_list[0]
            vid = ""
            vid_list = response.selector.re(re.compile(r'var vid.*?(\d+)'))
            if vid_list:
                vid = vid_list[0]
            if not playlistId or not vid:
                return mvitem

            ep_item = MediaItem()
            ep_item["cont_id"] = playlistId
            if title_list:
                ep_item["title"] = title_list[0]
            ep_item["actor"] = pers
            ep_item["director"] = dirs
            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["url"] = Util.normalize_url(response.request.url, "sohu")

            if text:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            if untrack_id:
                mvitem["untrack_id"] = untrack_id
            if sid:
                mvitem["sid"] = sid
            vitem = VideoItem()
            vitem["title"] = ep_item["title"] if 'title' in ep_item else None
            vitem["url"] = ep_item["url"]
            vitem["vnum"] = "1"
            vitem["os_id"] = self.os_id
            vitem["ext_id"] = Util.md5hash(ep_item["url"])
            vitem["site_id"] = self.site_id
            vitem["cont_id"] = vid
            videoitems = []
            videoitems.append(vitem)
            mvitem["video"] = videoitems
            mvitem["media"]["info_id"] = Util.md5hash(
                Util.summarize(mvitem["media"]))

            Util.set_ext_id(mvitem["media"], mvitem["video"])
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return mvitem

    def parse_title(self, response, cat_id):
        gtitle = []
        title = []
        try:
            title = response.xpath(
                '//div[@class="wrapper"]/div[1]/h2/text()').extract()
            gtitle = self.strip_title(cat_id, title)
            if not gtitle:
                title = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/h2/span/text()'
                ).extract()
                gtitle = self.strip_title(cat_id, title)
            if not gtitle:
                title = response.xpath(
                    '//div[@class="wrapper"]/div[1]/h2/span/text()').extract()
                gtitle = self.strip_title(cat_id, title)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return gtitle

    def strip_title(self, cat_id, title):
        gtitle = []
        try:
            if len(title):
                ttitle = title[0].strip()
                index = ttitle.find(self.channel_info[str(cat_id)])
                len1 = 0
                if index >= 0:
                    len1 = len(self.channel_info[str(cat_id)]) + 1
                else:
                    index = 0
                tinfo = ttitle[index + len1:]
                if len(tinfo) > 0:
                    gtitle.append(tinfo)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return gtitle
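strip_title drops the channel name (plus one separator character) from the front of the page title when it is present; an illustrative trace, assuming channel_info[cat_id] is u'电视剧' and a made-up title:

# strip_title(cat_id, [u'电视剧:某剧名'])  ->  [u'某剧名']  (prefix plus one character removed)
# strip_title(cat_id, [u'某剧名'])         ->  [u'某剧名']  (no prefix, title kept as-is)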

    def parse_actor(self, response):
        performer_list = []
        try:
            performer_list = response.xpath(
                '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()'
                % u'主演:').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="infoR"]/ul/li/span[text()="%s"]/../a/text()'
                    % u'主持人:').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'配音:').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'声优:').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'主演:').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()'
                    % u'主演:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return performer_list

    def parse_type_list(self, response):
        type_list = []
        try:
            type_list = response.xpath(
                '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()'
                % u'类型:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'类型:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()'
                    % u'类型:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return type_list

    def parse_district(self, response):
        type_list = []
        try:
            type_list = response.xpath(
                '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()'
                % u'地区:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'地区:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()'
                    % u'地区:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return type_list

    def parse_year(self, response):
        type_list = []
        try:
            type_list = response.xpath(
                '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()'
                % u'上映时间:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'上映时间:').extract()
            if not type_list:
                type_list = response.xpath(
                    '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../text()'
                    % u'上映时间:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return type_list

    def parse_director(self, response):
        director_list = []
        try:
            director_list = response.xpath(
                '//div[@class="movie-infoR"]/ul[@class="cfix mB20"]/li/span[text()="%s"]/../a/text()'
                % u'导演:').extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'导演:').extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@id="contentA"]/div[@class="right"]/div[@class="blockRA bord clear"]/div[@class="cont"]/p[text()="%s"]/a/text()'
                    % u'监督:').extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="drama-infoR"]/ul[@class="cfix"]/li/span[text()="%s"]/../a/text()'
                    % u'导演:').extract()
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return director_list

    def parse_video_item(self, cat_id, playlistId):
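        # walk the album API (self.album_api % (playlistId, page)) page by
        # page until a page comes back empty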
        logging.log(logging.INFO,
                    'parse_video_item , playlistId %s' % playlistId)
        videoitems = []
        try:
            # use the newer album API to enumerate episodes
            page = 1
            while True:
                page_items = self.parse_videos_info(cat_id, playlistId, page)
                if not page_items:
                    break
                videoitems = videoitems + page_items
                page = page + 1

        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

    def parse_videos_info(self, cat_id, playlistId, page):
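        # fetch one page of the album API; the payload is gbk-encoded JSON and
        # only videos with tvSType 1 or 36 (regular episodes) are kept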
        videoitems = []
        try:
            url = self.album_api % (playlistId, page)
            logging.log(logging.INFO, 'parse_videos_info, info url %s' % url)
            info = self.httpdownload.get_data(url)
            info = info.decode('gbk').encode('utf-8')
            info_json = json.loads(info)
            videos = info_json['videos']
            if int(cat_id) == int(self.variety_id):
                for video in videos:
                    tvSType = str(
                        video['tvSType']) if 'tvSType' in video else '-1'
                    if tvSType != '1' and tvSType != '36':
                        continue
                    # variety shows: use the air date as the episode number
                    play_num = self.get_play_num(video['showDate'])
                    if not play_num:
                        play_num = self.get_play_num_date(video['publishTime'])
                    vitem = self.compose_vitem([video['pageUrl']],
                                               [video['name']], play_num)
                    vitem['cont_id'] = video['vid']
                    vitem['thumb_url'] = video['smallPicUrl']
                    videoitems.append(vitem)
            else:
                for video in videos:
                    tvSType = str(
                        video['tvSType']) if 'tvSType' in video else '-1'
                    if tvSType != '1' and tvSType != '36':
                        continue
                    # non-variety: use the 'order' field as the episode number
                    play_num = self.get_play_num(video['order'])
                    vitem = self.compose_vitem([video['pageUrl']],
                                               [video['name']], play_num)
                    vitem['cont_id'] = video['vid']
                    vitem['thumb_url'] = video['smallPicUrl']
                    videoitems.append(vitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

    def parse_variety_info(self, playlistId, response):
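        # variety album: iterate the years listed on the page (falling back to
        # 2010-2015) and pull each year's episode list from the VideoServlet
        # interface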
        logging.log(logging.INFO,
                    'parse_variety_info, info url %s' % response.request.url)
        videoitems = []
        try:
            year_list = response.xpath(
                '//div[@class="mod plot"]/ul[@class="filter"]/li[@class="v-year"]/a/em/text()'
            ).extract()
            if not year_list:
                year_list = ["2015", "2014", "2013", "2012", "2011", "2010"]
            for year in year_list:
                turl1 = "http://tv.sohu.com/item/VideoServlet?source=sohu&id=" + str(
                    playlistId) + "&year=" + year + "&month=0&page=1"
                info = self.httpdownload.get_data(turl1)
                videolist = self.parse_play_list(info)
                if videolist:
                    for titem in videolist:
                        videoitems.append(titem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

    def parse_play_list(self, info):
        videoitems = []
        try:
            if not info or len(info) < len("{pageTotal: 1,videos:[]"):
                return None
            jinfo = {}
            try:
                jinfo = json.loads(info)
            except Exception as e:
                logging.log(logging.ERROR, traceback.format_exc())
            if "videos" not in jinfo:
                return videoitems

            itemlist = jinfo["videos"]
            for titem in itemlist:
                vitem = self.compose_vitem([titem["url"]], [titem["title"]],
                                           titem["showDate"])
                vitem['cont_id'] = titem['id']
                videoitems.append(vitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

    def get_carton_list(self, response):
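        # cartoon list page: each <li> carries the play url and a title whose
        # trailing number becomes the episode number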
        videoitems = []
        try:
            ul_list = response.xpath(
                '//div[@id="blockA"]/div[@id="allist"]/div[@id="list_asc"]/div[@class="pp similarLists"]/ul'
            )
            for ul in ul_list:
                li_list = ul.xpath('./li')
                for li in li_list:
                    url = li.xpath('./a/@href').extract()
                    ttitle = li.xpath('./span/strong/a/text()').extract()
                    play_num = self.get_play_num(ttitle[0])
                    vitem = self.compose_vitem(url, ttitle, play_num)
                    if 'url' in vitem:
                        videoitems.append(vitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return videoitems

    def compose_vitem(self, url_list, title_list, vnum):
        vitem = VideoItem()
        try:
            if not url_list:
                return vitem
            if title_list:
                vitem["title"] = title_list[0].strip()
            turl = Util.normalize_url(url_list[0], "sohu")
            vitem["url"] = turl
            vitem["vnum"] = str(vnum)
            vitem["os_id"] = self.os_id
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["site_id"] = self.site_id
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return vitem

    def get_sohu_showid(self, url):
        id = ""
        try:
            #http://tv.sohu.com/item/MTE4NTk2MA==.html
            #http://tv.sohu.com/item/MTE0NjQwNg==.html
            #r = re.compile(r'http://tv.sohu.com.+?/[^/]*./([^/]*)\.html')
            r = re.compile(r'http://tv.sohu.com/[^/].*/([^/].*)\.[s]?html')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def get_play_num(self, title):
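        # keep the last run of digits found in the title, e.g. u'第12集' -> '12'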
        num = ""
        try:
            num_list = re.findall('([\d]+)', title)
            if num_list:
                num_size = len(num_list)
                num = num_list[num_size - 1]
        except Exception as e:
            pass
        return num

    def get_play_num_date(self, title):
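        # concatenate every run of digits, e.g. '2015-01-30' -> '20150130'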
        num = ""
        try:
            num_list = re.findall('([\d]+)', title)
            if num_list:
                num = "".join(num_list)
        except Exception as e:
            pass
        return num

    def check_url(self, mvitem):
        res = True
        try:
            if 'video' in mvitem:
                for video in mvitem['video']:
                    if 'url' in video:
                        tres = self.is_same_site(video['url'])
                        if not tres:
                            res = False
                            break
        except Exception as e:
            pass
        return res

    def is_same_site(self, url):
        res = True
        try:
            tsite = Util.guess_site(url)
            if tsite != self.site_name:
                res = False
        except Exception as e:
            res = False
        return res

    def get_year(self, info):
        year = None
        try:
            r = re.compile(ur'.*%s.*(\d+).*' % (u'上映时间'))
            m = r.search(info)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return year
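
# A minimal standalone sketch (illustrative only, not part of the spider
# above): the same episode-number extraction idea used by get_play_num() and
# get_play_num_date(), pulled out so it can be run on its own.
import re

def last_digit_run(title):
    # same idea as get_play_num(): keep only the last run of digits,
    # e.g. '2015-01-30' -> '30'
    nums = re.findall(r'\d+', title)
    return nums[-1] if nums else ''

def joined_digit_runs(date_str):
    # same idea as get_play_num_date(): join every run of digits,
    # e.g. '2015-01-30' -> '20150130'
    return ''.join(re.findall(r'\d+', date_str))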
Exemple #13
0
    def __init__(self):
        self.__db_mgr = DbManager.instance()
Exemple #14
0
class tudou_spider(Spider):
    name = "tudou"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "tudou"
    site_id = ""   #tudou
    allowed_domains = ["www.tudou.com"]
    pre_url = "http://www.tudou.com/s3portal/service/pianku/data.action?pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId="
    tail_url = "&areaCode=110000&initials=&hotSingerId=&sortDesc=pubTime&pageNo="
    #used for guess_site 
    site_name = Util.guess_site("http://www.tudou.com")

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    channel_map = {}
    channel_map = mgr.get_channel_map()
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')
    id_map = {}
    httpdownload = HTTPDownload()
    cmd = None

    def __init__(self, json_data=None, *args, **kwargs):
        super(tudou_spider, self).__init__(*args, **kwargs)
        cat_urls = []
        tasks = []
        if json_data:
            data = json.loads(json_data)
            self.cmd = data["cmd"]
            if self.cmd == "assign":
                tasks = data["task"]
            elif self.cmd == "trig":
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            ttask={}
            if "id" in data and "url" in data:
                ttask["id"] = data["id"]
                ttask["url"] = data["url"]
                ttask["sid"] = ""
                ttask["untrack_id"] = ""
                cat_urls.append(ttask)
            if tasks:
                for task in tasks:
                    ttask={}
                    ttask["url"] = task["url"]
                    code = task["code"]
                    ttask["id"] = self.channel_map[code]
                    ttask["untrack_id"] = task["untrack_id"]
                    ttask["sid"] = task["sid"]
                    cat_urls.append(ttask)
            #cat_urls = data["cat_urls"]

        self._cat_urls = []
        if cat_urls:
            self._cat_urls = cat_urls

    def start_requests(self):
        try:
            items = []

            cat_urls = []

            movie_id = self.mgr.get_channel('电影')["channel_id"]
            tv_id = self.mgr.get_channel('电视剧')["channel_id"]
            variety_id = self.mgr.get_channel('综艺')["channel_id"]
            cartoon_id = self.mgr.get_channel('动漫')["channel_id"]

            self.id_map = {str(movie_id):"5",str(tv_id):"3",str(variety_id):"6",str(cartoon_id):"4"}
            # no url field needed: the list-API url is built from each tudou channel id
            if not self._cat_urls and not self.cmd:
                #cat_urls = [{'url':'','id':tv_id}]
                cat_urls = [{'url':'','id':movie_id},
                        {'url':'','id':tv_id},
                        {'url':'','id':variety_id},
                        {'url':'','id':cartoon_id}]

                for cat in cat_urls:
                    url = ""
                    type_id = ""
                    if cat['id'] == movie_id:
                        type_id = self.id_map[str(movie_id)]
                    elif cat['id'] == tv_id:
                        type_id = self.id_map[str(tv_id)]
                    elif cat['id'] == variety_id:
                        type_id = self.id_map[str(variety_id)]
                    elif cat['id'] == cartoon_id:
                        type_id = self.id_map[str(cartoon_id)]
                    url = self.pre_url + type_id + self.tail_url
                    page_num = int(self.get_page_num(url + "10000")) // 90 + 1
                    #page_num = 4
                    for i in range(page_num):
                        surl = self.pre_url + type_id + self.tail_url + str(i+1)
                        items.append(Request(url=surl, callback=self.parse, meta={'cat_id': cat['id'],'page':1}))
            else:
                for cat in self._cat_urls:
                    channel_id = str(cat["id"])
                    items.append(Request(url=cat['url'], callback=self.parse_single_episode, meta={'cat_id': channel_id,'page':1,"untrack_id":cat["untrack_id"],"sid":cat["sid"]}))

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def get_page_num(self,url):
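        # the pianku list API answers with JSON whose "total" field is the
        # overall album count for the channel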
        num = None
        try:
            info = self.httpdownload.get_data(url)
            jinfo = json.loads(info)
            num = jinfo["total"]
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return num

    def parse_single_episode(self,response):
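        # single play page: follow the breadcrumb (or, for cartoons, the
        # player-head link) back to the album page; if neither exists, fall
        # back to the h1 title link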
        items = []
        try:
            logging.log(logging.INFO, 'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            urls = response.xpath('//div[@class="breadcrumb"]/a[@class="breadcrumb_item"]/@href').extract()
            # cartoon pages use a different layout for the breadcrumb
            if not urls:
                urls = response.xpath('//div[@class="mod_player_head cf"]/div[1]/div[1]/a[3]/@href').extract()

            if urls:
                turl = self.url_prefix + urls[0]
                items.append(Request(url=turl, callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':'','page':1,"untrack_id":untrack_id,"sid":sid}))
            else:
                poster_url = ""
                title = ""
                actor = ""
                info_url = response.xpath('//div[@class="summary_main"]/div[@class="fix"]/h1[@class="kw"]/a/@href').extract()
                if info_url:
                    items.append(Request(url=info_url[0], callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,"actor":actor,"untrack_id":untrack_id,"sid":sid}))
                #items.append(Request(url=response.request.url, callback=self.parse_episode_play, meta={'cat_id': cat_id,'poster_url':'','page':1}))
                #response.request.meta['poster_url'] = ''
                #self.parse_episode_play(response)

            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse(self,response):
        try:
            logging.log(logging.INFO, 'parse: %s' % response.request.url)
            
            cat_id = response.request.meta['cat_id']
            #poster_url = response.request.meta['poster_url']
            items = []

            play_url = ""
            jinfo = json.loads(response.body)
            for tmedia in jinfo["items"]:
                title = tmedia["title"]
                actor_list = []
                for tactor in tmedia["actors"]:
                    actor_list.append(tactor["name"])
                actor = Util.join_list_safely(actor_list)
                #actor = "|".join([t.strip() for t in actor_list])
                poster_url = tmedia["picUrl_200x300"]
                play_url = tmedia["playUrl"]
                if "updateInfo" in tmedia and tmedia["updateInfo"].find("预告") >= 0:
                    continue
                else:
                    items.append(Request(url=play_url, callback=self.parse_episode_play, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,'actor':actor}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

        return items

    def parse_episode_play(self,response):
        try:
            logging.log(logging.INFO, 'parse_episode_play: %s' % response.request.url)

            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            title = response.request.meta['title']
            actor = response.request.meta['actor']
            
            items = []

            info_url = response.xpath('//div[@class="summary_main"]/div[@class="fix"]/h1[@class="kw"]/a/@href').extract()
            if info_url:
                items.append(Request(url=info_url[0], callback=self.parse_episode_info, meta={'cat_id': cat_id,'poster_url':poster_url,'title':title,"actor":actor}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc()) 

        return items

    def parse_episode_info(self,response):
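        # album page: fill in poster/title/actor when the list API did not
        # provide them, scrape type/district/year, then fetch the episode list
        # from the getAlbumvoInfo.action interface (skipping trailers)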
        try:
            logging.log(logging.INFO, 'parse_episode_info: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            title = response.request.meta['title']
            actor = response.request.meta['actor']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            items = []

            if not poster_url:
                poster_url_list = response.xpath('//div[@class="cover_img"]/div[@class="pack pack_album"]/div[@class="pic"]/img/@src').extract()
                if poster_url_list:
                    poster_url = poster_url_list[0]
            if not title:
                title_list = response.xpath('//div[@class="cover_info"]/h2/strong/@title').extract()
                if title_list:
                    title = title_list[0]
            if not actor:
                #actor_list = response.xpath('//div[@class="cover_keys"]/span/a/text()').extract()
                actor_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u' 主演:').extract()
                if actor_list:
                    actor = Util.join_list_safely(actor_list)
                    #actor = "|".join([t.strip() for t in actor_list])

            #performer
            pers = actor
            type_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'类型:\n').extract()
            district_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'地区:').extract()
            release_date_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'年代:').extract()
            types = None
            if type_list:
                types = Util.join_list_safely(type_list)
            
            #director
            director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'编导:').extract()
            if not director_list:
                director_list = response.xpath('//div[@class="cover_keys"]/span/span[text()="%s"]/../a/text()'  % u'导演:').extract()
            dirs = Util.join_list_safely(director_list)
            #dirs = "|".join([t.strip() for t in director_list])
            #text
            text = response.xpath('//div[@class="cover_info"]/div[@class="desc"]/p/text()').extract()

            #sourceid
            sourceid = self.get_tudou_showid(response.request.url)
            videoitems = []
            ep_item = MediaItem()

            if len(title) > 0:
                ep_item["title"] = title
            if len(pers) > 0:
                ep_item["actor"] = pers
            if len(dirs) > 0:
                ep_item["director"] = dirs
            if types:
                ep_item["type"] = types
            if district_list:
                ep_item["district"] = district_list[0].strip()
            if release_date_list:
                ep_item["release_date"] = Util.str2date(release_date_list[0])

            #ep_item["info_id"] = Util.md5hash(tinfo)
            ep_item["cont_id"] = sourceid
            ep_item["site_id"] = self.site_id
            ep_item["url"] = response.request.url
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            
            if len(text) > 0:
                ep_item["intro"] = text[0]

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item
            mvitem["video"] = videoitems

            lurl = "http://www.tudou.com/crp/getAlbumvoInfo.action?charset=utf-8&areaCode=110000&acode=" + str(sourceid)
            info = self.httpdownload.get_data(lurl)
            jinfo = json.loads(info)
            if "items" in jinfo:
                for sitem in jinfo["items"]:
                    vitem = VideoItem()
                    vitem["title"] = sitem["itemTitle"]
                    vitem["vnum"] = sitem["episode"]
                    vitem["os_id"] = self.os_id
                    trailer = sitem['trailer']
                    if not sitem["itemPlayUrl"]:
                        continue
                    # skip trailers
                    if trailer:
                        continue
                    turl = Util.normalize_url(sitem["itemPlayUrl"],"tudou")
                    vitem["url"] = turl
                    vitem["os_id"] = self.os_id
                    vitem["site_id"] = self.site_id
                    vitem["ext_id"] = Util.md5hash(turl)
                    vitem["cont_id"] = self.get_tudou_showid(turl)
                    #if "ext_id" not in mvitem["media"]:
                    #    mvitem["media"]["ext_id"] = vitem["ext_id"]
                    #vitem["media_ext_id"] = vitem["ext_id"]
                    mvitem["video"].append(vitem)

            if len(mvitem["video"]) > 0:
                Util.set_ext_id(mvitem["media"],mvitem["video"])
                mvitem["media"]["info_id"] = Util.md5hash(Util.summarize(mvitem["media"]))
                if untrack_id:
                    mvitem["untrack_id"] = untrack_id
                if sid:
                    mvitem["sid"] = sid
                if self.check_url(mvitem):
                    items.append(mvitem)
        except Exception as e: 
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def get_tudou_showid(self,url):
        id = ""
        try:
            #http://www.tudou.com/albumcover/ZPUPBy0CC6c.html
            r = re.compile(r'http://.+/.*/([^/].*)\.html')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def check_url(self,mvitem):
        res = True
        try:
            if 'video' in mvitem:
                for video in mvitem['video']:
                    if 'url' in video:
                        tres = self.is_same_site(video['url'])
                        if not tres:
                            res = False
                            break
        except Exception as e:
            pass
        return res

    def is_same_site(self,url):
        res = True
        try:
            tsite = Util.guess_site(url)
            if tsite != self.site_name:
                res = False
        except Exception as e:
            res = False
        return res
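
# Minimal sketch (illustrative only; build_tudou_list_urls is not part of the
# class above): how tudou_spider.start_requests() pages the pianku list API,
# 90 albums per page, by gluing pre_url + type_id + tail_url + page number.
def build_tudou_list_urls(type_id, total):
    pre_url = ("http://www.tudou.com/s3portal/service/pianku/data.action?"
               "pageSize=90&app=mainsitepc&deviceType=1&tags=&tagType=3&firstTagId=")
    tail_url = "&areaCode=110000&initials=&hotSingerId=&sortDesc=pubTime&pageNo="
    pages = int(total) // 90 + 1
    return [pre_url + type_id + tail_url + str(i + 1) for i in range(pages)]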
Exemple #15
0
class wasu_spider(Spider):
    name = "wasu"
    pipelines = ['CategoryPipeline', 'MysqlStorePipeline']
    site_code = "wasu"
    site_id = ""  #wasu
    allowed_domains = ["www.wasu.cn", "all.wasu.cn"]
    url_prefix = 'http://www.wasu.cn'
    site_name = Util.guess_site(url_prefix)

    mgr = DbManager.instance()
    os_id = mgr.get_os('web')["os_id"]
    site_id = str(mgr.get_site(site_code)["site_id"])
    channel_map = {}
    channel_map = mgr.get_channel_map()
    max_update_page = get_project_settings().get('MAX_UPDATE_PAGE')
    global_spider = True

    httpdownload = HTTPDownload()

    channel_info = {}
    test_page_url = None
    test_channel_id = None

    album_api = 'http://www.wasu.cn/Column/ajax_list?uid=%s&y=%s&mon=%s'

    def __init__(self, json_data=None, *args, **kwargs):
        super(wasu_spider, self).__init__(*args, **kwargs)
        cat_urls = []
        tasks = None
        if json_data:
            data = json.loads(json_data)
            if "type" in data:
                spider_type = data["type"]
                if spider_type != "global":
                    self.global_spider = False
            tasks = []
            ttask = {}
            if "id" in data and "url" in data:
                ttask["id"] = data["id"]
                ttask["url"] = data["url"]
                ttask["sid"] = ""
                ttask["untrack_id"] = ""
                cat_urls.append(ttask)

            cmd = data["cmd"]
            if cmd == "assign":
                tasks = data["task"]
            elif cmd == "trig":
                stat = data['stat'] if 'stat' in data else None
                tasks = self.mgr.get_untrack_url(self.site_code, stat)
            elif cmd == 'carpet':
                tasks = self.mgr.get_video_url(self.site_code)
            elif cmd == "test" and 'id' in data and 'url' in data:
                self.test_page_url = data["url"]
                self.test_channel_id = data["id"]
            if tasks:
                for task in tasks:
                    ttask = {}
                    ttask["url"] = task["url"]
                    code = task["code"]
                    ttask["id"] = self.channel_map[code]
                    ttask["untrack_id"] = task[
                        "untrack_id"] if 'untrack_id' in task else None
                    ttask["sid"] = task["sid"] if 'sid' in task else None
                    ttask['mid'] = task['mid'] if 'mid' in task else None
                    cat_urls.append(ttask)

        self._cat_urls = []
        if cat_urls:
            self._cat_urls = cat_urls

    def start_requests(self):
        try:
            items = []

            self.movie_id = str(self.mgr.get_channel('电影')["channel_id"])
            self.tv_id = str(self.mgr.get_channel('电视剧')["channel_id"])
            self.variety_id = str(self.mgr.get_channel('综艺')["channel_id"])
            self.cartoon_id = str(self.mgr.get_channel('动漫')["channel_id"])

            self.channel_info = {
                self.movie_id: u"电影",
                self.tv_id: u"电视剧",
                self.variety_id: u"综艺",
                self.cartoon_id: u"动漫"
            }

            if self.test_page_url:
                turl = Util.normalize_url(self.test_page_url, "wasu")
                items.append(
                    Request(url=self.test_page_url,
                            callback=self.parse_page,
                            meta={
                                'cat_id': self.test_channel_id,
                                'page': 1
                            }))
                return items

            if not self._cat_urls:
                if self.global_spider:
                    cat_urls = [{
                        'url': 'http://all.wasu.cn/index/cid/1',
                        'id': self.movie_id
                    }, {
                        'url': 'http://all.wasu.cn/index/cid/11',
                        'id': self.tv_id
                    }, {
                        'url': 'http://all.wasu.cn/index/cid/37',
                        'id': self.variety_id
                    }, {
                        'url': 'http://all.wasu.cn/index/cid/19',
                        'id': self.cartoon_id
                    }]
                for cat in cat_urls:
                    items.append(
                        Request(url=cat['url'],
                                callback=self.parse_type,
                                meta={
                                    'cat_id': cat['id'],
                                    'page': 1
                                }))
            else:
                for cat in self._cat_urls:
                    turl = Util.normalize_url(cat['url'], "wasu")
                    items.append(
                        Request(url=turl,
                                callback=self.parse_single_episode,
                                meta={
                                    'cat_id': cat["id"],
                                    'page': 1,
                                    "poster_url": "",
                                    "untrack_id": cat["untrack_id"],
                                    "sid": cat["sid"],
                                    "mid": cat["mid"]
                                }))
            return items
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())

    def parse_type(self, response):
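        # all.wasu.cn filter chain: parse_type -> parse_tag -> parse_area ->
        # parse_time -> parse_sort, each following one facet row of the
        # ws_all_span filter bar, before parse_page walks the result pages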
        items = []
        try:
            #logging.log(logging.INFO, 'parse_type: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="ws_all_span"]/ul/li[1]/a/@href').extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_tag,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_tag(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_tag: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="ws_all_span"]/ul/li[2]/a/@href').extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_area,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_area(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_area: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="ws_all_span"]/ul/li[3]/a/@href').extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_time,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_time(self, response):
        items = []
        try:
            logging.log(logging.INFO, 'parse_time: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="ws_all_span"]/ul/li[4]/a/@href').extract()
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_sort,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_sort(self, response):
        items = []
        # default sort is "recently updated"
        time_url = response.request.url
        try:
            logging.log(logging.INFO, 'parse_sort: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            subs = response.xpath(
                '//div[@class="pxfs"]/div[@class="l"]/ul/li/a/@href').extract(
                )
            # crawl the "recently updated" sort first
            subs.insert(0, time_url)
            for sub in subs:
                items.append(
                    Request(url=sub,
                            callback=self.parse_page,
                            meta={
                                'cat_id': cat_id,
                                'page': 1
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_page(self, response):
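        # one result page: each card yields a poster plus a play url; play-page
        # urls (/Play/show) go to parse_single_episode, album urls straight to
        # parse_episode_info; then follow the "next page" link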
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            page = response.request.meta['page']
            logging.log(logging.INFO,
                        'parse_page: %s,%s' % (response.request.url, page))
            #if int(page) > int(self.max_update_page) and not self.global_spider:
            #    return

            items = []

            play_url = ""
            subs = response.xpath('//div[@class="ws_row mb25"]/div')
            #if not subs:
            #    subs = response.xpath('./div/div[@class="ws_row mb25"]/div[@class=" col2 mb20"]/div[@class="hezhip]')

            for sub in subs:
                play_urls = sub.xpath(
                    './div/div[@class="v mb5"]/div[@class="v_link"]/a/@href'
                ).extract()
                pic_urls = sub.xpath(
                    './div/div[@class="v mb5"]/div[@class="v_img"]/img/@src'
                ).extract()
                if not play_urls:
                    play_urls = sub.xpath(
                        './div/div[@class="v mb5"]/div[@class="p_link"]/a/@href'
                    ).extract()
                if not pic_urls:
                    pic_urls = sub.xpath(
                        './div/div[@class="v mb5"]/div[@class="p_img"]/img/@src'
                    ).extract()
                pic_url = ""
                if pic_urls:
                    pic_url = pic_urls[0]
                if play_urls:
                    rplay_url = play_urls[0].strip()
                    if '/Play/show' in rplay_url:
                        #if int(cat_id) == int(self.movie_id):
                        items.append(
                            Request(url=rplay_url,
                                    callback=self.parse_single_episode,
                                    meta={
                                        'cat_id': cat_id,
                                        'poster_url': pic_url,
                                        'untrack_id': '',
                                        'sid': ''
                                    }))
                    else:
                        items.append(
                            Request(url=rplay_url,
                                    callback=self.parse_episode_info,
                                    meta={
                                        'cat_id': cat_id,
                                        'poster_url': pic_url,
                                        'untrack_id': '',
                                        'sid': ''
                                    }))

            next_page = response.xpath(
                '//div[@class="item_page"]/a[text()="%s"]/@href' %
                u'下一页').extract()
            page_prefix = "http://all.wasu.cn"
            if next_page:
                snext_page = next_page[0].strip()
                if snext_page.find(page_prefix) < 0:
                    snext_page = page_prefix + snext_page
                items.append(
                    Request(url=snext_page,
                            callback=self.parse_page,
                            meta={
                                'page': page + 1,
                                'cat_id': cat_id
                            }))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_single_episode(self, response):
        items = []
        try:
            logging.log(logging.INFO,
                        'parse_single_episode: %s' % response.request.url)
            cat_id = response.request.meta['cat_id']
            untrack_id = response.request.meta['untrack_id']
            sid = response.request.meta['sid']
            mid = response.request.meta[
                'mid'] if 'mid' in response.request.meta else ""
            poster_url = response.request.meta['poster_url']
            # parse the media (album) page info
            urls = response.xpath(
                '//div[@class="play_site mb10"]/div[1]/h3/a/@href').extract()
            if not urls:
                # the title does not link to the media page; go through the category links instead
                turls = response.xpath(
                    '//div[@class="play_site mb10"]/div[1]/div[@class="play_seat"]/a/@href'
                ).extract()
                for turl in turls:
                    tiurl = self.get_episode_url(turl)
                    if tiurl:
                        urls.append(tiurl)
            if urls:
                for iurl in urls:
                    if not Util.guess_site(iurl):
                        iurl = self.url_prefix + iurl
                    surl = Util.normalize_url(iurl, "wasu")
                    if surl and self.site_name == Util.guess_site(surl):
                        items.append(
                            Request(url=surl,
                                    callback=self.parse_episode_info,
                                    meta={
                                        'cat_id': cat_id,
                                        'poster_url': poster_url,
                                        'page': 1,
                                        "untrack_id": untrack_id,
                                        "sid": sid,
                                        "mid": mid
                                    }))
            else:
                # movie and cartoon-film videos have no media page, only a play page
                titems = self.parse_play_page(response)
                for item in titems:
                    if mid:
                        item['mid'] = mid
                    items.append(item)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_episode_info(self, response):
        items = []
        try:
            request_url = response.request.url
            logging.log(logging.INFO, 'parse_episode_info: %s' % request_url)
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            mid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']
            if "mid" in response.request.meta:
                mid = response.request.meta['mid']

            # the original page-scraping structure is kept here, although these
            # attributes could also be fetched from the API:
            # http://clientapi.wasu.cn/Phone/vodinfo/id/6786984
            title_list = response.xpath(
                '//div[@class="cloudotm1"]/p[1]/a/text()').extract()
            if not title_list:
                title_list = response.xpath(
                    '//div[@class="tele_txts"]/h4[1]/a/text()').extract()

            director_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'导演').extract()
            if not director_list:
                director_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'导演').extract()
            performer_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'演员').extract()
            if not performer_list:
                performer_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'演员').extract()
            area_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'地区').extract()
            if not area_list:
                area_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'地区').extract()
            tag_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'标签').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()'
                    % u'类型').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'标签').extract()
            if not tag_list:
                tag_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'类型').extract()
            year_list = response.xpath(
                '//div[@class="right_fl"]//*[contains(text(),"%s")]/a/text()' %
                u'年份').extract()
            if not year_list:
                year_list = response.xpath(
                    '//div[@class="tele_txts"]//*[contains(text(),"%s")]/a/text()'
                    % u'年份').extract()
            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            areas = Util.join_list_safely(area_list)
            tags = Util.join_list_safely(tag_list)

            #text
            text = response.xpath(
                '//div[@class="right_fl"]/p/span[@id="infoS"]/text()').extract(
                )
            if not text:
                text = response.xpath(
                    '//div[@class="tele_b_otm"]/p/span[@id="infoS"]/text()'
                ).extract()

            play_url = ""
            mvitem = self.compose_mvitem(response, title_list, pers, dirs,
                                         response.request.url, cat_id,
                                         poster_url, text)
            if mid:
                mvitem['mid'] = mid

            if mvitem and 'video' in mvitem and 'url' in mvitem['video'][
                    0] and mvitem['video'][0]['url']:
                mvitem['media']['type'] = tags
                mvitem['media']['district'] = areas
                if year_list:
                    mvitem['media']['release_date'] = Util.str2date(
                        year_list[0])
                tlen = len(mvitem['video'])
                logging.log(
                    logging.INFO, "++++url: %s video len: %d " %
                    (response.request.url, tlen))
                items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_play_page(self, response):
        items = []
        try:
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            title_list = response.xpath(
                '//div[@class="play_site mb10"]/div/h3/text()').extract()
            director_list = response.xpath(
                '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()'
                % u'导演:').extract()
            performer_list = response.xpath(
                '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/div/span[text()="%s"]/../../div[@class="r"]/a/text()'
                % u'主演:').extract()
            tag_list = response.xpath(
                '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()'
                % u'类型:').extract()
            area_list = response.xpath(
                '//div[@class="play_information play_intro"]/div[@class="play_information_t"]/div[@class="r"]/div/span[text()="%s"]/../a/text()'
                % u'地区:').extract()

            pers = Util.join_list_safely(performer_list)
            dirs = Util.join_list_safely(director_list)
            areas = Util.join_list_safely(area_list)
            tags = Util.join_list_safely(tag_list)

            text = response.xpath(
                '//div[@class="play_information play_intro"]/div[@class="play_information_b intro_down"]/div[@class="one"]/b/text()'
            ).extract()

            mvitem = self.compose_mvitem(response, title_list, pers, dirs,
                                         response.request.url, cat_id,
                                         poster_url, text)
            if mvitem:
                mvitem['media']['type'] = tags
                mvitem['media']['district'] = areas
                items.append(mvitem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return items

    def parse_video_item(self, response, cat_id, url, title, playlistId):
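        # variety albums are delegated to parse_variety(); other album pages
        # are read from the tab_box episode list (two markups), skipping items
        # flagged with the "yugao" (trailer) icon; a bare /Play/show url
        # becomes a single episode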
        #logging.log(logging.INFO, 'parse_video_item , info url %s,paly_url: %s,cat id %s,title %s' % (response.request.url,url,cat_id,title))
        videoitems = []
        ep_item = MediaItem()
        item = MediaVideoItem()
        item["media"] = ep_item
        item["video"] = videoitems
        try:
            if int(cat_id) == int(self.variety_id):
                tvideoitems = self.parse_variety(response)
                if tvideoitems:
                    for titem in tvideoitems:
                        videoitems.append(titem)
            elif '/Play/show' not in url:
                #if int(cat_id) != int(self.movie_id):
                #ul_list = response.xpath('//div[@class="teleplay_gather tab_box"]/div[@class="list_tabs_cont"]/ul/li')
                ul_list = response.xpath(
                    '//div[@class="teleplay_gather tab_box"]/div/ul/li')
                if ul_list:
                    #http://www.wasu.cn/Tele/index/id/6539647
                    for li in ul_list:
                        yugaopian = li.xpath('.//i[@class="yugao"]').extract()
                        if yugaopian:
                            continue
                        url = li.xpath('./a/@href').extract()
                        ttitle = li.xpath('./a/@title').extract()
                        snum = li.xpath('./a/text()').extract()
                        play_num = ""
                        if snum:
                            play_num = self.get_play_num(snum[0])
                        if int(cat_id) == int(self.variety_id):
                            play_num1 = self.getvnum(self.url_prefix + url[0])
                            if play_num1:
                                play_num = play_num1
                        if not ttitle:
                            ttitle = [play_num]
                        vitem = None
                        if self.site_name == Util.guess_site(url[0]):
                            vitem = self.compose_vitem([url[0]],
                                                       [title[0].strip()],
                                                       play_num)
                        else:
                            vitem = self.compose_vitem(
                                [self.url_prefix + url[0]], [title[0].strip()],
                                play_num)
                        if 'url' in vitem:
                            videoitems.append(vitem)
                if not ul_list:
                    #http://www.wasu.cn/Tele/index/id/6786984
                    ul_list = response.xpath(
                        '//div[@class="tab_box"]//div[ends-with(@class, "col2")]'
                    )
                    for li in ul_list:
                        yugaopian = li.xpath('.//i[@class="yugao"]').extract()
                        if yugaopian:
                            continue
                        url = li.xpath(
                            './div[@class="ws_des"]/p[1]/a/@href').extract()
                        ttitle = li.xpath(
                            './div[@class="ws_des"]/p[2]/span/text()').extract(
                            )
                        snum = li.xpath(
                            './div[@class="ws_des"]/p[1]/a/text()').extract()
                        play_num = ""
                        if snum:
                            play_num = self.get_play_num(snum[0])
                        if int(cat_id) == int(self.variety_id):
                            play_num1 = self.getvnum(self.url_prefix + url[0])
                            if play_num1:
                                play_num = play_num1
                        if not ttitle:
                            ttitle = [play_num]
                        vitem = None
                        if self.site_name == Util.guess_site(url[0]):
                            vitem = self.compose_vitem([url[0]],
                                                       [title[0].strip()],
                                                       play_num)
                        else:
                            vitem = self.compose_vitem(
                                [self.url_prefix + url[0]], [title[0].strip()],
                                play_num)
                        if 'url' in vitem:
                            videoitems.append(vitem)
            else:
                #elif int(cat_id) == int(self.movie_id):
                # play page with no media page
                if url:
                    vitem = self.compose_vitem([url], title, 1)
                    if 'url' in vitem:
                        videoitems.append(vitem)
            if videoitems:
                item["video"] = videoitems
                item["media"]["url"] = response.request.url
                Util.set_ext_id(item["media"], item["video"])
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return item

    def compose_mvitem(self, response, title_list, pers, dirs, play_url,
                       cat_id, poster_url, text):
        try:
            cat_id = response.request.meta['cat_id']
            poster_url = response.request.meta['poster_url']
            untrack_id = ""
            sid = ""
            if "untrack_id" in response.request.meta:
                untrack_id = response.request.meta['untrack_id']
            if "sid" in response.request.meta:
                sid = response.request.meta['sid']

            videoitems = []
            ep_item = MediaItem()
            if title_list:
                ep_item["title"] = title_list[0].strip()
            ep_item["actor"] = pers
            ep_item["director"] = dirs

            ep_item["site_id"] = self.site_id
            ep_item["channel_id"] = cat_id
            ep_item["poster_url"] = poster_url
            ep_item["url"] = Util.normalize_url(response.request.url, "wasu")

            if len(text) > 0:
                ep_item["intro"] = text[0].strip()

            mvitem = MediaVideoItem()
            mvitem["media"] = ep_item

            mid = self.getshowid(response.request.url)
            mvitem["media"]["cont_id"] = mid
            ttvitem = {}
            if title_list:
                ttvitem = self.parse_video_item(response, cat_id, play_url,
                                                title_list, None)
            if ttvitem:
                if 'video' in ttvitem and len(ttvitem['video']) > 0:
                    mvitem['video'] = ttvitem['video']
                    mvitem["media"]["info_id"] = Util.md5hash(
                        Util.summarize(mvitem["media"]))
                    Util.set_ext_id(mvitem["media"], mvitem["video"])
                    if untrack_id and sid:
                        mvitem["untrack_id"] = untrack_id
                        mvitem["sid"] = sid
                    res = self.check_url(mvitem)
                    if not res:
                        return None
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return mvitem

    def compose_vitem(self, url_list, title_list, vnum):
        vitem = VideoItem()
        try:
            if not url_list:
                return vitem
            if title_list:
                vitem["title"] = title_list[0].strip()
            turl = Util.normalize_url(url_list[0], "wasu")
            vitem["url"] = turl
            vitem["vnum"] = str(vnum)
            vitem["os_id"] = self.os_id
            vitem["ext_id"] = Util.md5hash(turl)
            vitem["site_id"] = self.site_id
            vitem["cont_id"] = self.getshowid(turl)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return vitem

    #parse the videos of a variety-show (entertainment) channel page
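    # Episodes are listed per year; for every (year, month) pair the Column/ajax_list
    # endpoint is queried and the returned HTML fragment is parsed for episode
    # title, url and number.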
    def parse_variety(self, response):
        videoitems = []
        try:
            #year list
            year_list = response.xpath(
                '//div[@id="play_year"]/div[@id="divselect"]/div[@class="play_sel"]/p/a/text()'
            ).extract()
            uid = self.getuid(response.request.url)
            cid = None
            month_list = [
                "12", "11", "10", "9", "8", "7", "6", "5", "4", "3", "2", "1"
            ]
            cid_url_list = response.xpath(
                '//div[@class="head1 mb10"]/a/@href').extract()
            for cid_url in cid_url_list:
                cid = self.getcid(cid_url)
                if cid:
                    break
            #http://www.wasu.cn/Column/ajax_list?uid=252&y=2015&mon=7&cid=39
            for year in year_list:
                for month in month_list:
                    if uid and year and month:
                        turl = 'http://www.wasu.cn/Column/ajax_list?uid=%s&y=%s&mon=%s&cid=%s' % (
                            uid, year, month, cid)
                        info = self.httpdownload.get_data(turl)
                        if not info:
                            continue
                        jinfo = json.loads(info)
                        if "con" in jinfo and jinfo["con"]:
                            tinfo = jinfo["con"].replace("\\/", "/")
                            tsel = Selector(text=tinfo).xpath(
                                '//div[@id="itemContainer"]/div[@class="col2 play_love"]'
                            )
                            for isel in tsel:
                                title = isel.xpath(
                                    './div[@class="v"]/div[@class="v_link"]/a/@title'
                                ).extract()
                                url = isel.xpath(
                                    './div[@class="v"]/div[@class="v_link"]/a/@href'
                                ).extract()
                                vnum = isel.xpath(
                                    './div[@class="v"]/div[@class="v_meta"]/div[@class="meta_tr"]/text()'
                                ).extract()
                                tvnum = vnum[0].strip()
                                svnum = tvnum.replace("-", "")
                                titem = self.compose_vitem(
                                    [self.url_prefix + url[0]], title, svnum)
                                if titem:
                                    videoitems.append(titem)
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        return videoitems

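    # Take the last run of digits in a title as the episode number,
    # e.g. a title ending in "...12" yields "12"; returns "" if the title has no digits.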
    def get_play_num(self, title):
        num = ""
        try:
            num_list = re.findall(r'(\d+)', title)
            if num_list:
                num_size = len(num_list)
                num = num_list[num_size - 1]
        except Exception as e:
            pass
        return num

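    # Return False if any episode URL in the item belongs to a site other than this spider's.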
    def check_url(self, mvitem):
        res = True
        try:
            if 'video' in mvitem:
                for video in mvitem['video']:
                    if 'url' in video:
                        if Util.guess_site(video['url']) != self.site_name:
                            res = False
                            break
        except Exception as e:
            pass
        return res

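    # True when the URL is recognized as belonging to this spider's site.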
    def is_same_site(self, url):
        res = True
        try:
            tsite = Util.guess_site(url)
            if tsite != self.site_name:
                res = False
        except Exception as e:
            res = False
        return res

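    # Extract the numeric show/media id from a wasu URL
    # (the digits after /id/ or after /show/<...>/).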
    def getshowid(self, url):
        id = ""
        try:
            #http://www.wasu.cn/Play/show/id/5871821
            #http://www.wasu.cn/Tele/index/id/6786984
            #http://www.wasu.cn/Column/show/column/252
            r = re.compile(r'http://.*/id/(\d+)[\?]?.*')
            m = r.match(url)
            if m:
                return m.group(1)
            else:
                r = re.compile(r'http://.*/show/.*/(\d+)[\?]?.*')
                m = r.match(url)
                if m:
                    return m.group(1)
        except Exception as e:
            pass
        return id

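    # Extract the episode number embedded in a "-drama-<n>" style URL.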
    def getvnum(self, url):
        id = ""
        try:
            r = re.compile(r'http://.*-drama-(\d+).*')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return id

    def getuid(self, url):
        uid = ""
        try:
            #http://www.wasu.cn/Column/show/column/252
            r = re.compile(r'.*/column/([\d]+)')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return uid

    def getcid(self, url):
        cid = ""
        try:
            #http://all.wasu.cn/index/cid/39
            r = re.compile(r'.*/cid/(\d+)')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return cid

    def getareaid(self, url):
        cid = ""
        try:
            #http://all.wasu.cn/index/cid/39
            r = re.compile(r'.*/area/(\d+)')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return cid

    def getyearid(self, url):
        cid = ""
        try:
            #http://all.wasu.cn/index/cid/39
            r = re.compile(r'.*/year/(\d+)')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return cid

    def get_episode_url(self, url):
        rurl = ""
        try:
            #http://www.wasu.cn/Play/show/id/5871821
            #http://www.wasu.cn/Column/show/column/252
            r = re.compile(r'(.*/show/.*/\d+)')
            m = r.match(url)
            if m:
                return m.group(1)
        except Exception as e:
            pass
        return rurl
Exemple #16
class iqiyi_spider(Spider):
    '''
        iqiyi browsing flow:
         (1) entering from a list page
            TV series, variety shows: list page -> media page
            movies: list page -> play page
            animation:
                (1) feature-film style: list page -> play page
                (2) regular: list page -> media page
         (2) entering from a play page
            (1) play page -> media page
            (2) play page only
        iqiyi crawling flow:
          (1) enter from a list page -> (check the URL type to decide media page vs. play page) extract this page's info, done
          (2) enter from a play page -> extract the play page info, check whether a media page exists -> media page
        Since an iqiyi list page can only be browsed up to 30 pages deep, the crawl uses this strategy:
          (1) subdivide the categories level by level into branches
          (2) once a branch's subdivided page count drops below 30, stop subdividing that branch
          (3) if a branch is fully subdivided and still has more than 30 pages, traverse it again under different
              sort orders, to minimise the content missed because pages beyond 30 cannot be reached
          ps. subdividing all the way down to the leaves makes the whole flow very deep, so branches are pruned early instead
    '''
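    # NOTE: hypothetical sketch, not part of the original spider -- it only illustrates
    # the pruning rule described in the docstring above: a category branch stops being
    # subdivided once all of its pages fit within the browseable limit.
    def _branch_needs_subdividing(self, total_pages):
        # keep splitting only while pages beyond the 30-page browse limit would be unreachable
        return int(total_pages) > int(self.max_broswe_page)
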
    site_code = 'iqiyi'
    name = site_code
    mgr = DbManager.instance()
    max_number = 100000
    #category subdivision forms a tree; for efficiency this spider prunes a branch (stops subdividing it) once its page count is below max_broswe_page
    max_broswe_page = '30'
    list_prefix_url = 'http://list.iqiyi.com'
    #http://cache.video.qiyi.com/jp/sdlst/6/1300000156/
    source_year_api = 'http://cache.video.qiyi.com/jp/sdlst/%s/%s/'
    #http://cache.video.qiyi.com/jp/sdvlst/6/1300001662/2014/?categoryId=6&sourceId=1300001662&tvYear=2014
    source_media_api = 'http://cache.video.qiyi.com/jp/sdvlst/%s/%s/%s/?categoryId=%s&sourceId=%s&tvYear=%s'
    #http://cache.video.qiyi.com/jp/avlist/202321801/1/?albumId=202321801&pageNo=1
    album_media_api = 'http://cache.video.qiyi.com/jp/avlist/%s/%s/?albumId=%s&pageNo=%s'
    vip_api = 'http://serv.vip.iqiyi.com/pay/movieBuy.action?aid=%s'
    api_success_code = u'A00000'
    max_mark_depth = 10
    #parameters passed in via JSON
    json_data = None

    #for statistics
    #count = 0

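    # An optional JSON string can be passed in to drive a targeted crawl
    # (consumed by load_video_urls in start_requests).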
    def __init__(self, json_data=None, *args, **kwargs):
        super(iqiyi_spider, self).__init__(*args, **kwargs)
        if json_data:
            self.json_data = json.loads(json_data)

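    # Entry point: with a JSON payload, seed requests from load_video_urls();
    # otherwise start a full crawl from the list index page at level 0.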
    def start_requests(self):
        items = []
        try:
            self.load_member_variable()
            if self.json_data:
                items = items + self.load_video_urls()
            else:
                url = self.list_prefix_url
                items.append(
                    Request(url=url,
                            callback=self.list_parse,
                            meta={'level': 0}))
        except Exception as e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally: