コード例 #1
0
ファイル: kat_search.py プロジェクト: zhjih7988/spider-1
    def parse_resource(self, response):
        """Collect the download links of one episode into a MediaVideoItem.

        The request meta must carry ``m_item`` (the media item built by the
        caller) and ``vnum`` (the episode number). A ``vtitle`` entry may be
        present in the meta as well, but it is not used here.

        For each link kind ('Torrent magnet link', 'Download torrent file')
        the first matching link and its title are taken. A link is kept only
        when its protocol is listed in ``self.protocol_map`` and is not
        'http'.

        Yields:
            A single MediaVideoItem pairing ``m_item`` with all collected
            video items, or nothing when no usable link was found.
        """
        request = response.request
        logging.log(logging.INFO, "parse_resource:%s", request.url)

        m_item = request.meta['m_item']
        vnum = request.meta['vnum']

        link_xpath = '//table[@class="data"]//tr/td[1]//a[@title="%s"]/@href'
        title_xpath = ('//table[@class="data"]//tr/td[1]//a[@title="%s"]'
                       '/../../div[@class="torrentname"]/div/a/text()')

        vitems = []
        for link_kind in ('Torrent magnet link', 'Download torrent file'):
            vurls = response.xpath(link_xpath % link_kind).extract()
            rtitles = response.xpath(title_xpath % link_kind).extract()
            # Skip the link kind when either the URL or its title is missing,
            # instead of failing with IndexError on an empty title list.
            if not vurls or not rtitles:
                continue

            vurl = vurls[0]
            if vurl.startswith('//'):
                # Protocol-relative link: make it absolute.
                vurl = 'https:' + vurl

            protocol = extract_protocol(vurl)
            if protocol not in self.protocol_map or protocol == 'http':
                continue

            v_item = VideoItem()
            v_item['url'] = vurl
            v_item['title'] = rtitles[0]
            v_item['vnum'] = vnum
            # Selector results are text strings; encode explicitly so hashing
            # does not depend on an implicit ASCII conversion.
            v_item['cont_id'] = md5(vurl.encode('utf-8')).hexdigest()
            v_item['protocol_id'] = self.protocol_map[protocol]
            if protocol == 'magnet':
                v_item['tor_ih'] = extract_info_ih(vurl)
            vitems.append(v_item)

        # Emit the media item once, after every link kind has been examined,
        # so the same media is not yielded repeatedly with a partial list.
        if vitems:
            mvitem = MediaVideoItem()
            mvitem["media"] = m_item
            mvitem['video'] = vitems

            yield mvitem
コード例 #2
0
    def parse_all_episode(self, response):
        """Parse a title page into episode requests and/or a movie item.

        The page title and the IMDb link are read first; the IMDb id returned
        by ``extract_imdb`` becomes part of every ``cont_id``.

        TV part: for every season heading (``h3``) that contains a number,
        each listed episode with a positive episode number, a name and an id
        in its ``onclick`` attribute produces a Request to
        ``self.url_getepisode % episode_id``, handled by
        ``self.parse_resource``. The request meta carries a per-season
        MediaItem (``m_item``), the episode name (``vtitle``) and the episode
        number (``vnum``).

        Movie part: for every tab ``div`` with an id, the first magnet link
        and the first torrent-file link are collected (non-'http' protocols
        from ``self.protocol_map`` only). If any link was collected, a single
        MediaVideoItem is yielded.

        Any exception raised while parsing is logged with its traceback and
        ends the generator; it is not propagated to the caller.
        """
        page_url = response.request.url
        logging.log(logging.INFO, "parse_all_episode:%s", page_url)

        try:
            # The [0] lookups raise IndexError when the page has no title or
            # no IMDb link; that is caught and logged by the handler below.
            mtitle = response.xpath(
                '//table[@class="doublecelltable"]//tr/td[1]/h1/text()'
            ).extract()[0]
            imdb_url = response.xpath(
                '//div[@class="torrentMediaInfo"]/div[@class="dataList"]/ul/li/strong[text()="IMDb link:"]/../a/@href'
            ).extract()[0]
            imdb = extract_imdb(imdb_url)

            # tv
            seasons = response.xpath(
                '//table[@class="doublecelltable"]//tr/td[1]/h3')
            for season in seasons:
                snums = season.xpath('./text()').re(r'\d+')
                if not snums:
                    continue
                snum = int(snums[0])
                resources = season.xpath(
                    './following-sibling::div[1]/div/div[@class="infoList versionsFolded"]/a[@class="infoListCut"]'
                )
                for r in resources:
                    vnums = r.xpath(
                        './span[@class="versionsEpNo"]/text()').re(r'\d+')
                    vtitles = r.xpath(
                        './span[@class="versionsEpName"]/text()').extract()
                    if not vnums or not vtitles:
                        continue
                    vnum = int(vnums[0])
                    if vnum <= 0:
                        continue
                    # The first number in the onclick handler is used as the
                    # episode id for the follow-up request.
                    episode_nums = r.xpath('./@onclick').re(r'\d+')
                    if not episode_nums:
                        continue

                    # A fresh MediaItem per request; seasons are told apart
                    # by the "<imdb>|<season>" content id.
                    m_item = MediaItem()
                    m_item['title'] = mtitle + '_Season' + str(snum)
                    m_item['url'] = page_url
                    m_item['site_id'] = self.site_id
                    m_item['cont_id'] = imdb + '|' + str(snum)

                    yield Request(url=self.url_getepisode % episode_nums[0],
                                  callback=self.parse_resource,
                                  meta={
                                      'm_item': m_item,
                                      'vtitle': vtitles[0],
                                      'vnum': vnum
                                  })

            # movie
            link_xpath = ('//div[@id="%s"]/table//tr[@id]/td[1]/div[1]'
                          '/a[@title="%s"]/@href')
            title_xpath = ('//div[@id="%s"]/table//tr[@id]/td[1]/div[1]'
                           '/a[@title="%s"]/../../div[@class="torrentname"]'
                           '/div/a/text()')
            vitems = []
            dids = response.xpath(
                '//div[@class="tabs tabSwitcher"]/div[@id]/@id').extract()
            for did in dids:
                for x in ('Torrent magnet link', 'Download torrent file'):
                    vurls = response.xpath(link_xpath % (did, x)).extract()
                    vtitles = response.xpath(title_xpath % (did, x)).extract()
                    if not vurls or not vtitles:
                        continue
                    vurl = vurls[0]
                    if vurl.startswith('//'):
                        # Protocol-relative link: make it absolute.
                        vurl = 'https:' + vurl
                    protocol = extract_protocol(vurl)
                    if protocol not in self.protocol_map or protocol == 'http':
                        continue
                    v_item = VideoItem()
                    v_item['url'] = vurl
                    v_item['title'] = vtitles[0]
                    # Selector results are text strings; encode explicitly so
                    # hashing does not depend on an implicit ASCII conversion.
                    v_item['cont_id'] = md5(vurl.encode('utf-8')).hexdigest()
                    v_item['protocol_id'] = self.protocol_map[protocol]
                    if protocol == 'magnet':
                        v_item['tor_ih'] = extract_info_ih(vurl)
                    vitems.append(v_item)

            if vitems:
                m_item = MediaItem()
                m_item['title'] = mtitle
                m_item['url'] = page_url
                m_item['site_id'] = self.site_id
                m_item['cont_id'] = imdb

                mvitem = MediaVideoItem()
                mvitem["media"] = m_item
                mvitem['video'] = vitems

                yield mvitem

        except Exception:
            # Best-effort parsing: record the failure and stop producing
            # results for this page instead of propagating the error.
            logging.log(logging.ERROR, traceback.format_exc())