コード例 #1
0
ファイル: variety.py プロジェクト: ipaste/learning-path
    def detail_page(self, response):
        """Parse a variety-show detail page and queue its play-source requests.

        Reads the labelled metadata rows (genre, broadcast platform,
        country/region, hosts/guests), extracts the site id from the URL,
        and issues one ``self.crawl`` call per play-source api name found on
        the page, each handled by ``self.jsonYearList``.

        Args:
            response: the fetched page. ``response.save`` must provide the
                ``small_image`` and ``rating`` entries set by the caller.

        Returns:
            dict with the keys ``url``, ``meta_title``, ``title``,
            ``introduction``, ``poster_image``, ``categories``, ``platform``,
            ``location``, ``hosts``, ``small_image``, ``orig_id``, ``rating``
            and ``is_play_source``. Missing text fields are returned as ''.
        """
        # Scrape the basic information from the labelled rows.
        platform = location = None
        catList = []
        hostList = []
        # TODO: this label dispatch could be wrapped into a helper method.
        for field in response.doc('dl.dlTxt dd em.emTit').items():
            label = field.text()
            if label == u'类型:':
                catList.extend(
                    link.text() for link in field.siblings('a').items())
            elif label == u'播出平台:':
                platform = field.siblings().eq(0).text() or None
            elif label == u'国家/地区:':
                location = field.siblings().eq(0).text() or None
            elif label == u'主持人/嘉宾:':
                hostList.extend(
                    link.text() for link in field.siblings('a').items())

        # Get the original id from the page URL.
        orig_id = getId(response.url, r'.*zongyi/zy_(\d+)/$')

        # Collect the play-source api names: the first tab plus every link
        # in the "more sources" list.
        # NOTE(review): apiname is None when the attribute is absent; it is
        # forwarded unchanged, as before -- confirm makeAjaxParam copes.
        apis = [response.doc('#playNumTabFirst').attr.apiname]
        apis.extend(link.attr.apiname
                    for link in response.doc('.sourceMoreList a').items())

        for apiname in apis:
            save = {"api": apiname, "variety_id": orig_id}
            self.crawl(self.ajaxBaseUrl +
                       makeAjaxParam(api=apiname, id=orig_id),
                       callback=self.jsonYearList,
                       save=save)

        # The poster doubles as the fallback thumbnail. It has to be looked
        # up before the fallback; previously the name was never assigned and
        # an empty small_image raised NameError.
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image'] or poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "platform": platform or '',
            "location": location or '',
            "hosts": hostList,
            "small_image": small_image or '',
            "orig_id": orig_id,
            "rating": response.save['rating'],
            "is_play_source": 0
        }
コード例 #2
0
ファイル: variety.py プロジェクト: silentred/learning-path
    def detail_page(self, response):
        """Parse a variety-show detail page and queue its play-source requests.

        Reads the labelled metadata rows (genre, broadcast platform,
        country/region, hosts/guests), extracts the site id from the URL,
        and issues one ``self.crawl`` call per play-source api name found on
        the page, each handled by ``self.jsonYearList``.

        Args:
            response: the fetched page. ``response.save`` must provide the
                ``small_image`` and ``rating`` entries set by the caller.

        Returns:
            dict with the keys ``url``, ``meta_title``, ``title``,
            ``introduction``, ``poster_image``, ``categories``, ``platform``,
            ``location``, ``hosts``, ``small_image``, ``orig_id``, ``rating``
            and ``is_play_source``. Missing text fields are returned as ''.
        """
        # Scrape the basic information from the labelled rows.
        platform = location = None
        catList = []
        hostList = []
        # TODO: this label dispatch could be wrapped into a helper method.
        for field in response.doc('dl.dlTxt dd em.emTit').items():
            label = field.text()
            if label == u'类型:':
                catList.extend(
                    link.text() for link in field.siblings('a').items())
            elif label == u'播出平台:':
                platform = field.siblings().eq(0).text() or None
            elif label == u'国家/地区:':
                location = field.siblings().eq(0).text() or None
            elif label == u'主持人/嘉宾:':
                hostList.extend(
                    link.text() for link in field.siblings('a').items())

        # Get the original id from the page URL.
        orig_id = getId(response.url, r'.*zongyi/zy_(\d+)/$')

        # Collect the play-source api names: the first tab plus every link
        # in the "more sources" list.
        # NOTE(review): apiname is None when the attribute is absent; it is
        # forwarded unchanged, as before -- confirm makeAjaxParam copes.
        apis = [response.doc('#playNumTabFirst').attr.apiname]
        apis.extend(link.attr.apiname
                    for link in response.doc('.sourceMoreList a').items())

        for apiname in apis:
            save = {"api": apiname, "variety_id": orig_id}
            self.crawl(self.ajaxBaseUrl +
                       makeAjaxParam(api=apiname, id=orig_id),
                       callback=self.jsonYearList,
                       save=save)

        # The poster doubles as the fallback thumbnail. It has to be looked
        # up before the fallback; previously the name was never assigned and
        # an empty small_image raised NameError.
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image'] or poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "platform": platform or '',
            "location": location or '',
            "hosts": hostList,
            "small_image": small_image or '',
            "orig_id": orig_id,
            "rating": response.save['rating'],
            "is_play_source": 0
        }
コード例 #3
0
    def detail_page(self, response):
        """Parse a series detail page (URL of the form ``.../dm/<id>.html``).

        Reads the labelled metadata rows (genre, year, country/region,
        alias), builds a mapping of play source -> {episode label: URL},
        decides whether the series is still being updated, and extracts the
        site id from the URL.

        Args:
            response: the fetched page. ``response.save`` must provide the
                ``small_image`` entry set by the caller.

        Returns:
            dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``alias``, ``play_source``, ``closed``
            (0 or 1), ``upd_desc``, ``small_image`` and ``orig_id``.
            Missing text fields are returned as ''.
        """
        self.handleMetaRedirect(response)

        # Scrape the basic information from the labelled rows.
        year = location = alias = upd_desc = None
        catList = []
        # TODO: this label dispatch could be wrapped into a helper method.
        for field in response.doc('dl.dlTxt dd em.emTit').items():
            label = field.text()
            if label == u'类型:':
                catList.extend(
                    link.text() for link in field.siblings('a').items())
            elif label == u'年代:':
                year = field.siblings().eq(0).text() or None
            elif label == u'国家/地区:':
                location = field.siblings().eq(0).text() or None
            elif label == u'别名:':
                alias = field.siblings().eq(0).text() or None

        # Scrape the play-source addresses.
        playSources = {}
        for source in response.doc('.sourceList').items():
            source_id = source.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in source.find('.numList>a').items():
                href = episode.attr.href
                # Skip placeholder links whose href is a javascript: action.
                if href and not href.startswith('javascript'):
                    episodes[episode.text()] = delUrlParams(href)
            # The source name is the element id minus its last 4 characters.
            playSources[source_id[:-4]] = episodes

        # One case must be handled here: sohu_con and sohu_con_list are
        # really the same play source. Because there are too many episodes
        # the site splits them into two lists, so the two dicts are merged.
        # Iterate over a snapshot of the keys because entries are removed
        # inside the loop (a live view would raise on Python 3).
        for key in list(playSources):
            extra_key = key + '_con_'
            if key in playSources and extra_key in playSources:
                playSources[key].update(playSources.pop(extra_key))

        # Determine whether the series is finished, closed: 0 | 1.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node minus its first two characters; presumably
            # that skips a leading separator -- TODO confirm.
            upd_desc = wrap.contents()[-1][2:] or None
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Get the original id from the page URL.
        orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

        # The poster doubles as the fallback thumbnail. It has to be looked
        # up before the fallback; previously the name was never assigned and
        # an empty small_image raised NameError.
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image'] or poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "alias": alias or '',
            "play_source": playSources,
            "closed": closed,
            "upd_desc": upd_desc or '',
            "small_image": small_image or '',
            "orig_id": orig_id
        }
コード例 #4
0
ファイル: comic.py プロジェクト: silentred/learning-path
    def detail_page(self, response):
        """Parse a comic/animation detail page (``.../dm/<id>.html``).

        Reads the labelled metadata rows (genre, year, country/region,
        alias), builds a mapping of play source -> {episode label: URL},
        decides whether the series is still being updated, and extracts the
        site id from the URL.

        Args:
            response: the fetched page. ``response.save`` must provide the
                ``small_image`` entry set by the caller.

        Returns:
            dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``alias``, ``play_source``, ``closed``
            (0 or 1), ``upd_desc``, ``small_image`` and ``orig_id``.
            Missing text fields are returned as ''.
        """
        self.handleMetaRedirect(response)

        # Scrape the basic information from the labelled rows.
        year = location = alias = upd_desc = None
        catList = []
        # TODO: this label dispatch could be wrapped into a helper method.
        for field in response.doc('dl.dlTxt dd em.emTit').items():
            label = field.text()
            if label == u'类型:':
                catList.extend(
                    link.text() for link in field.siblings('a').items())
            elif label == u'年代:':
                year = field.siblings().eq(0).text() or None
            elif label == u'国家/地区:':
                location = field.siblings().eq(0).text() or None
            elif label == u'别名:':
                alias = field.siblings().eq(0).text() or None

        # Scrape the play-source addresses.
        playSources = {}
        for source in response.doc('.sourceList').items():
            source_id = source.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in source.find('.numList>a').items():
                href = episode.attr.href
                # Skip placeholder links whose href is a javascript: action.
                if href and not href.startswith('javascript'):
                    episodes[episode.text()] = delUrlParams(href)
            # The source name is the element id minus its last 4 characters.
            playSources[source_id[:-4]] = episodes

        # One case must be handled here: sohu_con and sohu_con_list are
        # really the same play source. Because there are too many episodes
        # the site splits them into two lists, so the two dicts are merged.
        # Iterate over a snapshot of the keys because entries are removed
        # inside the loop (a live view would raise on Python 3).
        for key in list(playSources):
            extra_key = key + '_con_'
            if key in playSources and extra_key in playSources:
                playSources[key].update(playSources.pop(extra_key))

        # Determine whether the series is finished, closed: 0 | 1.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node minus its first two characters; presumably
            # that skips a leading separator -- TODO confirm.
            upd_desc = wrap.contents()[-1][2:] or None
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Get the original id from the page URL.
        orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

        # The poster doubles as the fallback thumbnail. It has to be looked
        # up before the fallback; previously the name was never assigned and
        # an empty small_image raised NameError.
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image'] or poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "alias": alias or '',
            "play_source": playSources,
            "closed": closed,
            "upd_desc": upd_desc or '',
            "small_image": small_image or '',
            "orig_id": orig_id
        }