Beispiel #1
0
 def source_list_page(self, response):
     """Collect the entries of a play-source listing into ``response.save``.

     Every ``ul.ulPic li`` element found in the HTML fragment stored under
     ``response.json['html']`` becomes one dict (``img``, ``desc``, ``date``,
     ``guests``, ``url``) that is appended to ``save['source']``.

     :param response: object exposing ``save`` (mutated in place) and
         ``json`` (mapping that holds the ``'html'`` fragment).
     :return: the same ``save`` object, with ``source`` rebuilt and
         ``is_play_source`` set to 1.
     """
     save = response.save
     save['source'] = []
     save['is_play_source'] = 1
     # No-op branch; the lookup still raises KeyError when 'api' is missing.
     if save['api']:
         pass

     listing = pq(response.json['html'])
     for each in listing.find('ul.ulPic li').items():
         entry = pq(each)
         cover = entry.find('.pic img')
         guest_names = [
             pq(node).text() for node in entry.find('.sDes').children()
         ]
         # [1:-1] drops the first and the last character of the date text.
         air_date = entry.find('.pic .sExplanation em').text()[1:-1]
         play_link = entry.find('.aPlayBtn').attr.href
         save['source'].append({
             "img": cover.attr.src or '',
             "desc": cover.attr.alt or '',
             "date": air_date or '',
             "guests": guest_names or '',
             "url": delUrlParams(play_link),
         })
     return save
Beispiel #2
0
 def source_list_page(self, response):
     """Rebuild ``save['source']`` from the listing HTML carried by the response.

     The HTML fragment under ``response.json['html']`` is parsed and each
     ``ul.ulPic li`` element is turned into a dict with the keys ``img``,
     ``desc``, ``date``, ``guests`` and ``url``.

     :param response: object exposing ``save`` (mutated in place) and
         ``json`` (mapping that holds the ``'html'`` fragment).
     :return: ``response.save`` with ``source`` replaced by the new list and
         ``is_play_source`` set to 1.
     """
     save = response.save
     collected = save['source'] = []
     save['is_play_source'] = 1
     if save['api']:
         # Nothing happens here; only a missing 'api' key has an effect
         # (KeyError).
         pass

     items = pq(response.json['html']).find('ul.ulPic li').items()
     for each in items:
         row = pq(each)
         picture = row.find('.pic img')
         people = [pq(child).text() for child in row.find('.sDes').children()]
         # The date text is stored without its first and last character.
         shown_on = row.find('.pic .sExplanation em').text()[1:-1]
         record = {
             "img": picture.attr.src or '',
             "desc": picture.attr.alt or '',
             "date": shown_on or '',
             "guests": people or '',
             "url": delUrlParams(row.find('.aPlayBtn').attr.href),
         }
         collected.append(record)
     return save
Beispiel #3
0
    def detail_page(self, response):
        """Extract the detail record of one page and return it as a dict.

        Reads the labelled basic information (categories, year, location,
        alias), the episode links of every ``.sourceList`` element, the
        completion flag and the original id from ``response.doc`` and
        ``response.url``.

        :param response: fetched page exposing ``doc`` (PyQuery document),
            ``url`` and ``save['small_image']``.
        :return: dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``alias``, ``play_source``, ``closed``,
            ``upd_desc``, ``small_image`` and ``orig_id``.
        """
        self.handleMetaRedirect(response)

        # Basic information: every <em class="emTit"> is a label whose
        # siblings carry the value(s).
        year = location = alias = upd_desc = None
        catList = []
        for each in response.doc('dl.dlTxt dd em.emTit'):
            # TODO: this dispatch could be wrapped into a helper method.
            label = pq(each)
            label_text = label.text()
            if label_text == u'类型:':
                for cat in label.siblings('a'):
                    catList.append(pq(cat).text())
            elif label_text == u'年代:':
                year = label.siblings().eq(0).text() or None
            elif label_text == u'国家/地区:':
                location = label.siblings().eq(0).text() or None
            elif label_text == u'别名:':
                alias = label.siblings().eq(0).text() or None

        # Play-source addresses: the element id minus its last four
        # characters names the source; ids that are missing or too short
        # are skipped.
        playSources = {}
        for each in response.doc('.sourceList').items():
            source_id = each.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in each.find('.numList>a').items():
                href = episode.attr.href
                if href and href[:10] != 'javascript':
                    episodes[episode.text()] = delUrlParams(href)
            playSources[source_id[:-4]] = episodes

        # One source can be split over two lists when it has many episodes
        # (e.g. ids "sohu_con" and "sohu_con_list"); fold "<key>_con_" into
        # "<key>".  Iterate over a snapshot because entries are deleted.
        for key in list(playSources):
            extra_key = key + '_con_'
            if extra_key in playSources:
                merged = dict(playSources[key])
                merged.update(playSources[extra_key])
                playSources[key] = merged
                del playSources[extra_key]

        # Completion flag, closed: 0 | 1.  The page counts as still running
        # when the description holds an <i> element or mentions u'更新'.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node without its first two characters.
            upd_desc = wrap.contents()[-1][2:] or None
        # NOTE(review): text() is guarded with `or` because some pyquery
        # versions appear to return None for an empty selection - confirm.
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Original id taken from the URL.
        orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

        # Fall back to the poster when no small image was handed over.
        # (`poster_image` used to be read here without ever being assigned.)
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image']
        if not small_image:
            small_image = poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "alias": alias or '',
            "play_source": playSources,
            "closed": closed,
            "upd_desc": upd_desc or '',
            "small_image": small_image or '',
            "orig_id": orig_id
        }
Beispiel #4
0
    def detail_page(self, response):
        """Extract the detail record of one page and return it as a dict.

        Reads the labelled basic information (categories, year, location,
        alias), the episode links of every ``.sourceList`` element, the
        completion flag and the original id from ``response.doc`` and
        ``response.url``.

        :param response: fetched page exposing ``doc`` (PyQuery document),
            ``url`` and ``save['small_image']``.
        :return: dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``alias``, ``play_source``, ``closed``,
            ``upd_desc``, ``small_image`` and ``orig_id``.
        """
        self.handleMetaRedirect(response)

        # Basic information: every <em class="emTit"> is a label whose
        # siblings carry the value(s).
        year = location = alias = upd_desc = None
        catList = []
        for each in response.doc('dl.dlTxt dd em.emTit'):
            # TODO: this dispatch could be wrapped into a helper method.
            label = pq(each)
            label_text = label.text()
            if label_text == u'类型:':
                for cat in label.siblings('a'):
                    catList.append(pq(cat).text())
            elif label_text == u'年代:':
                year = label.siblings().eq(0).text() or None
            elif label_text == u'国家/地区:':
                location = label.siblings().eq(0).text() or None
            elif label_text == u'别名:':
                alias = label.siblings().eq(0).text() or None

        # Play-source addresses: the element id minus its last four
        # characters names the source; ids that are missing or too short
        # are skipped.
        playSources = {}
        for each in response.doc('.sourceList').items():
            source_id = each.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in each.find('.numList>a').items():
                href = episode.attr.href
                if href and href[:10] != 'javascript':
                    episodes[episode.text()] = delUrlParams(href)
            playSources[source_id[:-4]] = episodes

        # One source can be split over two lists when it has many episodes
        # (e.g. ids "sohu_con" and "sohu_con_list"); fold "<key>_con_" into
        # "<key>".  Iterate over a snapshot because entries are deleted.
        for key in list(playSources):
            extra_key = key + '_con_'
            if extra_key in playSources:
                merged = dict(playSources[key])
                merged.update(playSources[extra_key])
                playSources[key] = merged
                del playSources[extra_key]

        # Completion flag, closed: 0 | 1.  The page counts as still running
        # when the description holds an <i> element or mentions u'更新'.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node without its first two characters.
            upd_desc = wrap.contents()[-1][2:] or None
        # NOTE(review): text() is guarded with `or` because some pyquery
        # versions appear to return None for an empty selection - confirm.
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Original id taken from the URL.
        orig_id = getId(response.url, r'.*dm/(\d+)\.html$')

        # Fall back to the poster when no small image was handed over.
        # (`poster_image` used to be read here without ever being assigned.)
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image']
        if not small_image:
            small_image = poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "alias": alias or '',
            "play_source": playSources,
            "closed": closed,
            "upd_desc": upd_desc or '',
            "small_image": small_image or '',
            "orig_id": orig_id
        }
Beispiel #5
0
    def detail_page(self, response):
        """Extract the detail record of one page and return it as a dict.

        Reads the labelled basic information (cast, director, categories,
        location), the release year from the meta description, the original
        id from the URL, the episode links of every ``.sourceList`` element
        and the completion flag.  It also schedules a crawl of the last
        ``.pNumTab`` link with ``self.plot_list_page`` as callback.

        :param response: fetched page exposing ``doc`` (PyQuery document),
            ``url`` and ``save['small_image']``.
        :return: dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``play_source``, ``small_image``,
            ``casting``, ``director``, ``closed``, ``upd_desc``, ``is_plot``
            and ``orig_id``.  ``year`` is the matched digit string, or the
            integer 1980 when the description holds no year; ``orig_id`` is
            the matched digit string, or 0 when the URL does not match.
        """
        # Basic information: every <em class="emTit"> is a label whose
        # siblings carry the value(s).
        director = location = upd_desc = None
        castingList = []
        catList = []
        for each in response.doc('dl.dlTxt dd em.emTit'):
            # TODO: this dispatch could be wrapped into a helper method.
            label = pq(each)
            label_text = label.text()
            if label_text == u'主演:':
                for cast in label.siblings('a'):
                    cast_name = pq(cast).text()
                    # Skip the "all cast" link that sits among the names.
                    if cast_name != u'全部主演>':
                        castingList.append(cast_name)
            elif label_text == u'导演:':
                director = label.siblings().eq(0).text() or None
            elif label_text == u'类型:':
                for cat in label.siblings('a'):
                    catList.append(pq(cat).text())
            elif label_text == u'国家/地区:':
                location = label.siblings().eq(0).text() or None

        # No label above supplies the year, so it is parsed from the meta
        # description.  The match object (not `year`) decides whether the
        # 1980 default applies, and a missing description is read as empty.
        description = response.doc('meta[name=Description]').eq(0).attr.content
        matchObj = re.search(u'.*上映于(\\d+)年.*', description or u'')
        year = matchObj.group(1) if matchObj is not None else 1980

        # Original id from the URL; stays 0 when the URL has another shape.
        orig_id = 0
        id_match = re.search(r'.*/detail/(\d+)\.html$', response.url)
        if id_match is not None:
            orig_id = id_match.group(1)

        # Play-source addresses: the element id minus its last four
        # characters names the source; ids that are missing or too short
        # are skipped.
        playSources = {}
        for each in response.doc('.sourceList').items():
            source_id = each.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in each.find('.numList>a').items():
                href = episode.attr.href
                if (href and href[:10] != 'javascript'
                        and episode.text() != u'分集剧情'):
                    episodes[episode.text()] = delUrlParams(href)
            playSources[source_id[:-4]] = episodes

        # One source can be split over two lists when it has many episodes
        # (e.g. ids "sohu_con" and "sohu_con_list"); fold "<key>_con_" into
        # "<key>".  Iterate over a snapshot because entries are deleted.
        for key in list(playSources):
            extra_key = key + '_con_'
            if extra_key in playSources:
                merged = dict(playSources[key])
                merged.update(playSources[extra_key])
                playSources[key] = merged
                del playSources[extra_key]

        # Completion flag, closed: 0 | 1.  The page counts as still running
        # when the description holds an <i> element or mentions u'更新'.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node without its first two characters.
            upd_desc = wrap.contents()[-1][2:]
        # NOTE(review): text() is guarded with `or` because some pyquery
        # versions appear to return None for an empty selection - confirm.
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Experiment: crawl the plot list alongside returning this result.
        self.crawl(response.doc('.pNumTab>a:last-child').attr.href,
                   callback=self.plot_list_page)

        # Fall back to the poster when no small image was handed over.
        # (`poster_image` used to be read here without ever being assigned.)
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image']
        if not small_image:
            small_image = poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "play_source": playSources,
            "small_image": small_image or '',
            "casting": castingList,
            "director": director or '',
            "closed": closed,
            "upd_desc": upd_desc or '',
            "is_plot": 0,
            "orig_id": orig_id
        }
Beispiel #6
0
    def detail_page(self, response):
        """Extract the detail record of one page and return it as a dict.

        Reads the labelled basic information (cast, director, categories,
        location), the release year from the meta description, the original
        id from the URL, the episode links of every ``.sourceList`` element
        and the completion flag.  It also schedules a crawl of the last
        ``.pNumTab`` link with ``self.plot_list_page`` as callback.

        :param response: fetched page exposing ``doc`` (PyQuery document),
            ``url`` and ``save['small_image']``.
        :return: dict with the keys ``url``, ``meta_title``, ``title``,
            ``rating``, ``introduction``, ``poster_image``, ``categories``,
            ``year``, ``location``, ``play_source``, ``small_image``,
            ``casting``, ``director``, ``closed``, ``upd_desc``, ``is_plot``
            and ``orig_id``.  ``year`` is the matched digit string, or the
            integer 1980 when the description holds no year; ``orig_id`` is
            the matched digit string, or 0 when the URL does not match.
        """
        # Basic information: every <em class="emTit"> is a label whose
        # siblings carry the value(s).
        director = location = upd_desc = None
        castingList = []
        catList = []
        for each in response.doc('dl.dlTxt dd em.emTit'):
            # TODO: this dispatch could be wrapped into a helper method.
            label = pq(each)
            label_text = label.text()
            if label_text == u'主演:':
                for cast in label.siblings('a'):
                    cast_name = pq(cast).text()
                    # Skip the "all cast" link that sits among the names.
                    if cast_name != u'全部主演>':
                        castingList.append(cast_name)
            elif label_text == u'导演:':
                director = label.siblings().eq(0).text() or None
            elif label_text == u'类型:':
                for cat in label.siblings('a'):
                    catList.append(pq(cat).text())
            elif label_text == u'国家/地区:':
                location = label.siblings().eq(0).text() or None

        # No label above supplies the year, so it is parsed from the meta
        # description.  The match object (not `year`) decides whether the
        # 1980 default applies, and a missing description is read as empty.
        description = response.doc('meta[name=Description]').eq(0).attr.content
        matchObj = re.search(u'.*上映于(\\d+)年.*', description or u'')
        year = matchObj.group(1) if matchObj is not None else 1980

        # Original id from the URL; stays 0 when the URL has another shape.
        orig_id = 0
        id_match = re.search(r'.*/detail/(\d+)\.html$', response.url)
        if id_match is not None:
            orig_id = id_match.group(1)

        # Play-source addresses: the element id minus its last four
        # characters names the source; ids that are missing or too short
        # are skipped.
        playSources = {}
        for each in response.doc('.sourceList').items():
            source_id = each.attr.id
            if source_id is None or len(source_id) <= 4:
                continue
            episodes = {}
            for episode in each.find('.numList>a').items():
                href = episode.attr.href
                if (href and href[:10] != 'javascript'
                        and episode.text() != u'分集剧情'):
                    episodes[episode.text()] = delUrlParams(href)
            playSources[source_id[:-4]] = episodes

        # One source can be split over two lists when it has many episodes
        # (e.g. ids "sohu_con" and "sohu_con_list"); fold "<key>_con_" into
        # "<key>".  Iterate over a snapshot because entries are deleted.
        for key in list(playSources):
            extra_key = key + '_con_'
            if extra_key in playSources:
                merged = dict(playSources[key])
                merged.update(playSources[extra_key])
                playSources[key] = merged
                del playSources[extra_key]

        # Completion flag, closed: 0 | 1.  The page counts as still running
        # when the description holds an <i> element or mentions u'更新'.
        closed = 0
        wrap = response.doc('.pTxt .sDes')
        sep = wrap.find('i')
        if sep:
            # Last content node without its first two characters.
            upd_desc = wrap.contents()[-1][2:]
        # NOTE(review): text() is guarded with `or` because some pyquery
        # versions appear to return None for an empty selection - confirm.
        elif not re.match(u'.*更新.*', wrap.text() or u''):
            closed = 1

        # Experiment: crawl the plot list alongside returning this result.
        self.crawl(response.doc('.pNumTab>a:last-child').attr.href,
                   callback=self.plot_list_page)

        # Fall back to the poster when no small image was handed over.
        # (`poster_image` used to be read here without ever being assigned.)
        poster_image = response.doc('.posterCon .pic>img').attr.src
        small_image = response.save['small_image']
        if not small_image:
            small_image = poster_image

        return {
            "url": response.url,
            "meta_title": response.doc('title').text(),
            "title": response.doc('h1 a').text() or '',
            "rating": response.doc('.sScore em').text() or '',
            "introduction": response.doc('#pIntroId').text() or '',
            "poster_image": poster_image or '',
            "categories": catList,
            "year": year or '',
            "location": location or '',
            "play_source": playSources,
            "small_image": small_image or '',
            "casting": castingList,
            "director": director or '',
            "closed": closed,
            "upd_desc": upd_desc or '',
            "is_plot": 0,
            "orig_id": orig_id
        }