コード例 #1
0
    def search(self, keyword, stype):
        """Search by detail-page URL or by free-text keyword.

        A ``keyword`` starting with ``http`` is treated as a detail-page URL:
        the page is downloaded, handed to ``parse_dital`` and every
        ``OrderedDict`` it produces is tagged and yielded.  Any other keyword
        is turned into a search URL, queued with ``add_urls``, and every
        queued page is parsed with ``parse_search_html``.

        Args:
            keyword: Detail-page URL or search term.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            The items produced by ``parse_dital`` (``OrderedDict`` only) or
            by ``parse_search_html``.
        """
        if keyword.startswith('http'):
            # The path segment before the numeric id selects the parser mode.
            # Check it before spending a request on an unsupported URL.
            matched = re.search(r'http://www\.data18\.com/(.*)/\d+', keyword)
            if not matched:
                self.add_log('搜索失败')
                return

            res = self.download_page_request(keyword)
            if not res:
                # Mirror the keyword branch: a failed download is logged
                # instead of crashing on ``res.text``.
                self.add_log('搜索失败')
                return

            meta = utils.gen_metadata_struck(stype)
            findeds = self.parse_dital(res.text,
                                       meta,
                                       search=True,
                                       types=matched.group(1))

            for finded in findeds:
                if isinstance(finded, OrderedDict):
                    finded['tag']['type'] = stype
                    finded['tag']['dital_url'] = keyword
                    yield finded
            return

        self.add_urls('http://www.data18.com/search/?k={}'.format(keyword))
        # parse_search_html may queue follow-up pages, so keep draining.
        while self.has_url():
            url = self.get_urls()
            if not url:
                continue
            res = self.download_page_request(url)
            if res:
                for each in self.parse_search_html(res, stype):
                    yield each
            else:
                self.add_log('搜索失败')
コード例 #2
0
    def parse_search_html(self, res, stype):
        """Parse one search-result page into metadata structures.

        Queues the "次へ" (next page) link with ``add_urls`` when the page
        has one, then yields one metadata structure for every ``#list > li``
        entry whose URL is accepted by ``get_url_type`` (value ``>= 0``).
        Iteration stops early once ``self.stoped`` is set.

        Args:
            res: Response object exposing ``text``; a falsy value ends the
                generator immediately.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            Metadata structures filled with title, rating, detail URL,
            video id, images, total hit count and a tip text.

        Raises:
            AttributeError: If an accepted entry URL has no ``cid=.../``
                part.
        """
        if not res:
            return
        next_page = re.search(
            r'<li><a href="(http://www.dmm.co.jp/.+?page=\d*)/">次へ</a>',
            res.text, re.IGNORECASE)
        if next_page:
            self.add_urls(next_page.group(1))

        doc = pq(res.text)
        total_node = doc.find('div.list-boxcaptside.list-boxpagenation > p')

        # The hit count is identical for every entry: parse it once, and
        # fall back to 0 when the node is missing or has no leading digits.
        total_count = 0
        if total_node:
            count_match = re.match(r'(\d+)', total_node.text())
            if count_match:
                total_count = int(count_match.group(1))

        for li in doc.find('#list > li').items():
            if self.stoped:
                break
            sell = li.find('p.sublink a')
            url = li.find('p.tmb a').attr('href')
            if self.get_url_type(url) < 0:
                continue

            thumb = li.find('p.tmb a img')
            title = thumb.attr('alt')
            result = utils.gen_metadata_struck(stype)
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = title
            result['级别'] = 'R18+'
            try:
                result['评级'] = self.format_rate_str(
                    li.find('div.value p.rate').text())
            except Exception:
                # Rating is optional; keep the entry when it cannot be read.
                pass

            result['tag']['type'] = stype
            result['tag']['dital_url'] = url
            result['tag']['video_id'] = re.search(r'cid=(.+)/',
                                                  url).group(1)

            # A missing thumbnail or a failed download must not abort the
            # whole page; the image fields are simply left untouched.
            thumb_src = thumb.attr('src')
            image_res = None
            if thumb_src:
                image_res = self.download_page_request(
                    self.get_full_src('http:' + thumb_src))
            if image_res:
                result['tag']['backdrop'] = utils.tim_img_bytes(
                    image_res.content)
                result['tag']['poster'] = utils.create_poster(
                    result['tag']['backdrop'])

            result['tag']['total'] = total_count
            result['tag']['tip'] = sell.text() if sell else ''

            yield result
コード例 #3
0
    def get_video_info(self, meta, stype=''):
        """Fetch detailed entries for one item via ``post_request``.

        The request goes to ``entry.cgi`` with API name
        ``SYNO.VideoStation2.<utils.get_library_API(stype)>``.  For
        ``tvshow_episode`` the ``list`` method is used with the show id and
        library id; every other type uses ``getinfo`` with the item id.

        Args:
            meta: Mapping with a ``tag`` sub-mapping that provides ``id``,
                ``library_id`` and, when ``stype`` is empty, ``type``.
            stype: Item type; falls back to ``meta['tag']['type']``.

        Yields:
            The value of ``utils.fill_cn_form_en`` for every returned entry.
            Nothing is yielded when ``meta``, the type or the response is
            missing.
        """
        if not meta:
            return
        # ``tag`` may be absent or None; treat that like an empty tag
        # instead of failing on ``None.get``.
        tag = meta.get('tag') or {}
        if not stype:
            stype = tag.get('type')
        if not stype:
            return

        sid = tag.get('id')
        slibrary_id = tag.get('library_id')

        if stype == 'tvshow_episode':
            meth = 'list'
            param = {
                'library_id':
                '{}'.format(slibrary_id),
                'tvshow_id':
                '{}'.format(sid),
                'limit':
                '500000',
                'additional':
                '["summary","collection","poster_mtime","watched_ratio","file"]',
            }
        else:
            meth = 'getinfo'
            param = {
                'id': '[{}]'.format(sid),
            }
            if stype == 'tvshow':
                param['additional'] = (
                    '["poster_mtime","summary","backdrop_mtime"]')
            elif stype in ('movie', 'home_video'):
                param['additional'] = (
                    '["summary","poster_mtime","backdrop_mtime","file","collection","watched_ratio","conversion_produced","actor","director","genre","writer","extra"]'
                )

        json_res = self.post_request(
            'entry.cgi',
            'SYNO.VideoStation2.{}'.format(utils.get_library_API(stype)), meth,
            param)
        if not json_res:
            return

        # A response without ``data`` or without the expected list yields
        # nothing rather than raising.
        data = json_res.get('data') or {}
        results = data.get(utils.get_dsm_json_head(stype)) or []
        for result in results:
            test_meta = utils.gen_metadata_struck(stype)
            if test_meta:
                yield utils.fill_cn_form_en(stype, test_meta, result)
コード例 #4
0
    def list_videos(self, meta, keyword='', only_nil=False):
        """List the videos of one library through ``post_request``.

        Sends a ``list`` request to ``entry.cgi`` for API
        ``SYNO.VideoStation2.<utils.get_library_API(type)>``, limited to 5000
        entries sorted by title in descending order.

        Args:
            meta: Library mapping providing ``type`` and ``id``.
            keyword: Optional search keyword added to the request.
            only_nil: When true, only entries whose
                ``tag['poster_mtime']`` is falsy are yielded.

        Yields:
            First the response's ``total`` value when it is truthy, then one
            filled metadata structure per entry with ``tag['poster']`` set
            from ``get_video_poster``.
        """
        if not meta:
            return
        library_id = meta.get('id')
        if library_id is None:
            return
        stype = meta.get('type')
        sAPI = utils.get_library_API(stype)
        if not sAPI:
            return
        heads = utils.get_dsm_json_head(stype)

        param = {
            'offset': '0',
            'limit': '5000',
            'sort_by': '"title"',
            'sort_direction': '"desc"',
            'library_id': '{}'.format(library_id),
            'additional': '["poster_mtime","backdrop_mtime","summary"]'
        }
        if keyword:
            param['keyword'] = '"{}"'.format(keyword)

        json_res = self.post_request('entry.cgi',
                                     'SYNO.VideoStation2.{}'.format(sAPI),
                                     'list', param)
        if not (json_res and json_res.get('success')):
            return

        payload = json_res.get('data') or {}
        total = payload.get('total')
        if total:
            yield total

        # A response without the expected list is treated as empty.
        for data in payload.get(heads) or []:
            test_meta = utils.gen_metadata_struck(stype)
            if not test_meta:
                continue
            result_data = utils.fill_cn_form_en(stype, test_meta, data)
            if not result_data:
                continue
            # Filter before fetching the poster so no request is spent on
            # entries that are discarded anyway.
            if only_nil and result_data.get('tag').get('poster_mtime'):
                continue

            additional = data.get('additional') or {}
            result_data['tag']['poster'] = self.get_video_poster(
                stype, data.get('id'), additional.get('poster_mtime'))
            yield result_data
コード例 #5
0
    def parse_search_html(self, res, stype):
        """Parse a search-result page, decoded as EUC-JP, into metadata.

        Queues the "次へ" (next page) link with ``add_urls`` when present,
        yields the total hit count as an ``int`` when the page header
        contains digits, then yields one metadata structure per entry of
        ``#main-content > div.list-area``.

        Args:
            res: Response object exposing ``encoding``, ``text`` and
                ``url``; a falsy value ends the generator immediately.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            Optionally an ``int`` total first, followed by metadata
            structures.

        Raises:
            AttributeError: If an entry's detail URL has no
                ``/<digits>-<digits>/index`` segment.
        """
        if not res:
            return
        res.encoding = 'euc-jp'
        main_url = res.url
        doc = PyQuery(res.text)

        next_page = doc('a.go-to-next')
        if next_page.text() == '次へ':
            self.add_urls(urljoin(main_url, next_page.attr('href')))

        # Explicit match check instead of a blanket try/except around a
        # yield; a page without a count simply emits no total.
        count_match = re.search(r'(\d+)',
                                doc('#main-content > h1 > small').text())
        if count_match:
            yield int(count_match.group(1))

        for div in doc('#main-content > div.list-area > div').items():
            result = utils.gen_metadata_struck(stype)
            title = div('span.movie-title > a').text()
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = title
            result['级别'] = 'R18+'

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = urljoin(main_url, div('a').attr('href'))
            tag['video_id'] = re.search(r'/(\d+-\d+)/index',
                                        tag['dital_url']).group(1)
            tag['tip'] = div('span.movie-actor > a > span').text()

            poster_url = div('a > img').attr('src')
            if poster_url:
                poster_res = self.download_page_request(poster_url)
                # A failed download must not abort the remaining entries.
                if poster_res:
                    tag['poster'] = poster_res.content
                    tag['xy'] = (40, 30)
            yield result
コード例 #6
0
    def parse_search_html(self, res, stype):
        """Parse a ``#sub_main`` search-result page into metadata.

        Queues the "次へ »" (next page) link with ``add_urls`` when present,
        yields the total hit count as an ``int`` when ``#sub_main > p``
        contains digits, then yields one metadata structure per
        ``div.movie_list`` entry.

        Args:
            res: Response object exposing ``text`` and ``url``; a falsy
                value ends the generator immediately.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            Optionally an ``int`` total first, followed by metadata
            structures.

        Raises:
            AttributeError: If an entry's detail URL has no
                ``/<digits>/index`` segment.
        """
        if not res:
            return
        main_url = res.url
        doc = pq(res.text)

        next_page = doc('#sub_main > div.listpage > ul > li.next > a')
        if next_page.text() == '次へ »':
            self.add_urls(urljoin(main_url, next_page.attr('href')))

        # Explicit match check instead of a blanket try/except around a
        # yield; a page without a count simply emits no total.
        count_match = re.search(r'(\d+)', doc('#sub_main > p').text())
        if count_match:
            yield int(count_match.group(1))

        for div in doc('#sub_main > div.movie_list').items():
            result = utils.gen_metadata_struck(stype)
            thumb = div('div.movielistphoto1 > a > img')
            title = thumb.attr('alt')
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = title
            result['级别'] = 'R18+'

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = urljoin(
                main_url,
                div('div.movielistphoto1 > a').attr('href'))
            tag['video_id'] = re.search(r'/(\d+)/index',
                                        tag['dital_url']).group(1)
            tag['tip'] = div('div.movielisttext01 > a').text()

            poster_url = thumb.attr('src')
            if poster_url:
                poster_res = self.download_page_request(poster_url)
                # A failed download must not abort the remaining entries.
                if poster_res:
                    tag['poster'] = poster_res.content
                    tag['xy'] = (40, 30)
            yield result
コード例 #7
0
    def parse_search_html(self, res, stype):
        """Parse a search-result page, decoded as UTF-8, into metadata.

        Queues the next-page link with ``add_urls`` when present, yields the
        total hit count as an ``int`` when ``#contents > div.message``
        contains digits, then yields one metadata structure for every item
        box that has a title.

        Args:
            res: Response object exposing ``encoding``, ``text`` and
                ``url``; a falsy value ends the generator immediately.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            Optionally an ``int`` total first, followed by metadata
            structures.

        Raises:
            AttributeError: If a titled entry's detail URL has no
                ``id=<digits>`` part.
        """
        if not res:
            return
        res.encoding = 'utf-8'
        main_url = res.url
        doc = PyQuery(res.text)

        next_page = doc('#pagenation li.next a')
        if next_page.text() == '› ›':
            self.add_urls(urljoin(main_url, next_page.attr('href')))

        # Explicit match check instead of a blanket try/except around a
        # yield; a page without a count simply emits no total.
        count_match = re.search(r'(\d+)',
                                doc('#contents > div.message').text())
        if count_match:
            yield int(count_match.group(1))

        boxes = doc('#contents > form > div.item_box.fixHeight > div')
        for div in boxes.items():
            result = utils.gen_metadata_struck(stype)
            title = div('p > a').text()
            if not title:
                continue
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = title
            result['级别'] = 'R18+'

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = urljoin(main_url, div('a').attr('href'))
            tag['video_id'] = re.search(r'id=(\d+)',
                                        tag['dital_url']).group(1)

            # Test the raw ``src`` first: urljoin() returns the base URL for
            # an empty value, which would fetch the page itself as a poster.
            poster_src = div('a > img').attr('src')
            if poster_src:
                poster_res = self.download_page_request(
                    urljoin(main_url, poster_src))
                # A failed download must not abort the remaining entries.
                if poster_res:
                    tag['poster'] = poster_res.content
            yield result
コード例 #8
0
    def parse_url_search(self, res, stype='movie'):
        """Build one metadata structure from a detail-page response.

        Scraping is best effort: any exception raised while filling the
        structure is swallowed and whatever was filled so far is returned.

        Args:
            res: Response object exposing ``text`` and ``url``.
            stype: Metadata type passed to ``utils.gen_metadata_struck``.

        Returns:
            The metadata structure, or ``None`` when ``res`` is falsy.
        """
        if not res:
            return None

        result = utils.gen_metadata_struck(stype)
        try:
            page = PyQuery(res.text)
            heading = page(
                '#main-content > div.main-content-movieinfo > div.video-detail > h1'
            ).text()

            # Only the title fields that exist for this type are filled.
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = heading

            result['级别'] = 'R18+'

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = res.url
            id_match = re.search(r'/(\d+-\d+)/index', tag['dital_url'])
            tag['video_id'] = id_match.group(1)
            tag['tip'] = page(
                '#main-content > div.main-content-movieinfo > div.movie-info > dl:nth-child(1) > dd > a > span'
            ).text()
            tag['xy'] = (40, 30)

            # The poster address is derived from the video id.
            poster_url = 'https://www.caribbeancom.com/moviepages/{}/images/l_l.jpg'.format(
                tag['video_id'])
            if poster_url:
                poster_res = self.download_page_request(poster_url)
                tag['poster'] = poster_res.content
        except Exception:
            pass

        return result
コード例 #9
0
    def parse_url_search(self, res, stype='movie'):
        """Build one metadata structure from a page's JSON-LD block.

        The first ``application/ld+json`` script in the page is decoded and
        used for the title, rating and image.  Scraping is best effort: any
        exception is swallowed and the partially filled structure is kept.
        The response URL is always stored under ``dital_url`` at the end.

        Args:
            res: Response object exposing ``text`` and ``url``.
            stype: Metadata type passed to ``utils.gen_metadata_struck``.

        Returns:
            The metadata structure, or ``None`` when ``res`` is falsy.
        """
        if not res:
            return None
        result = utils.gen_metadata_struck(stype)
        try:
            ld_script = re.search(
                r'<script type="application/ld\+json">(.*?)</script>',
                res.text, re.S)
            json_ld = json.loads(ld_script.group(1))

            # Only the title fields that exist for this type are filled.
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = json_ld.get('name')

            result['级别'] = 'R18+'
            rating = json_ld.get('aggregateRating').get('ratingValue')
            result['评级'] = self.format_rate_str(rating)

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = res.url
            tag['video_id'] = re.search(r'cid=(.+)/', res.url).group(1)

            image_res = self.download_page_request(
                self.get_full_src(json_ld.get('image')))
            tag['backdrop'] = utils.tim_img_bytes(image_res.content)
            tag['poster'] = utils.create_poster(tag['backdrop'])
            tag['total'] = 0
            tag['tip'] = ''
        except Exception:
            pass

        result['dital_url'] = res.url

        return result
コード例 #10
0
    def parse_url_search(self, res, stype='movie'):
        """Build one metadata structure from a detail page decoded as UTF-8.

        Nothing is filled when the page has no detail title.  Scraping is
        best effort: any exception is swallowed and the partially filled
        structure is returned.

        Args:
            res: Response object exposing ``encoding``, ``text`` and
                ``url``.
            stype: Metadata type passed to ``utils.gen_metadata_struck``.

        Returns:
            The metadata structure, or ``None`` when ``res`` is falsy.
        """
        if not res:
            return None
        res.encoding = 'utf-8'
        result = utils.gen_metadata_struck(stype)
        try:
            doc = PyQuery(res.text)
            title = doc('#contents > form > div.detailed_title').text()
            if title:
                for title_key in ('标题', '电视节目标题', '集标题'):
                    if title_key in result:
                        result[title_key] = title

                result['级别'] = 'R18+'

                tag = result['tag']
                tag['type'] = stype
                tag['dital_url'] = res.url
                tag['video_id'] = re.search(r'id=(\d+)',
                                            tag['dital_url']).group(1)
                tag['tip'] = doc(
                    '#contents > form > div.item_detail > ul > li:nth-child(2) > a'
                ).text()

                # Test the raw ``src`` first: urljoin() returns the base URL
                # for an empty value, which would make the page itself be
                # downloaded as the image.
                poster_src = doc(
                    '#contents > form > div.item_detail > div.item600 > img'
                ).attr('src')
                if poster_src:
                    poster_res = self.download_page_request(
                        urljoin(res.url, poster_src))
                    if poster_res:
                        tag['backdrop'] = utils.tim_img_bytes(
                            poster_res.content)
                        tag['poster'] = utils.create_poster(tag['backdrop'])
        except Exception:
            # Deliberate best effort: keep whatever was filled so far.
            pass

        return result
コード例 #11
0
    def parse_url_search(self, res, stype='movie'):
        """Build one metadata structure from a detail-page response.

        The title comes from the page's ``<title>`` element and the poster
        address from the ``var imgurl = '...'`` script assignment.  Scraping
        is best effort: any exception is swallowed and the partially filled
        structure is returned.

        Args:
            res: Response object exposing ``text`` and ``url``.
            stype: Metadata type passed to ``utils.gen_metadata_struck``.

        Returns:
            The metadata structure, or ``None`` when ``res`` is falsy.
        """
        if not res:
            return None

        result = utils.gen_metadata_struck(stype)
        try:
            page = pq(res.text)
            page_title = page('head > title').text()

            # Only the title fields that exist for this type are filled.
            for title_key in ('标题', '电视节目标题', '集标题'):
                if title_key in result:
                    result[title_key] = page_title

            result['级别'] = 'R18+'

            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = res.url
            id_match = re.search(r'/(\d+)/index', tag['dital_url'])
            tag['video_id'] = id_match.group(1)
            tag['tip'] = page(
                '#detail_box > table >tr:nth-child(1)  a').text()

            img_match = re.search(r"var imgurl = '(http://.*?jpg)';",
                                  res.text, re.S)
            poster_url = img_match.group(1)
            if poster_url:
                poster_res = self.download_page_request(poster_url)
                tag['poster'] = poster_res.content
        except Exception:
            pass

        return result
コード例 #12
0
            meta['tag']['backdrop'] = utils.tim_img_bytes(poster_data)
            meta['tag']['poster'] = utils.create_poster(
                meta['tag']['backdrop'])
        except AttributeError:
            pass
        yield meta

        # try:
        #     # 缩略图
        #     sample_url = doc(
        #         '#TabbedPanels1 > div > div.TabbedPanelsContent.TabbedPanelsContentVisible > a > img').attr('src')
        #     yield self.download_page_request(sample_url).content
        #
        # except Exception:
        #     pass

    def dital(self, url, meta):
        """Download ``url`` and parse it with ``parse_dital``.

        Args:
            url: Address of the detail page.
            meta: Metadata structure forwarded to ``parse_dital``.

        Returns:
            The result of ``parse_dital``, or ``None`` when the download
            produced a falsy response.
        """
        page = self.download_page_request(url)
        if not page:
            return None
        return self.parse_dital(page.text, meta)


if __name__ == '__main__':
    # Manual smoke run: fetch one known detail page and print every item
    # produced for it.  ``spider.search(sample_url, 'movie')`` can be
    # iterated the same way to exercise the search path.
    spider = Kin8tengokuSpider('kin8')
    sample_url = 'http://www.kin8tengoku.com/moviepages/0959/index.html'

    for item in spider.dital(sample_url, utils.gen_metadata_struck('movie')):
        print(item)
コード例 #13
0
    def parse_search_html(self, res, stype):
        """Parse one search-result page into metadata structures.

        When the page only contains a "Click here to continue..." link, that
        link is requested, logged, re-queued with ``add_urls(url, True)`` and
        nothing is yielded.  Otherwise entries are extracted with two regular
        expressions and one metadata structure is yielded per entry until
        ``self.stoped`` is set.

        Args:
            res: Response object exposing ``text``; a falsy value ends the
                generator immediately.
            stype: Metadata type passed to ``utils.gen_metadata_struck`` and
                stored under ``tag['type']``.

        Yields:
            Metadata structures with title, detail URL, video id, poster and
            a formatted date as ``tag['tip']``.

        Raises:
            AttributeError: If an entry URL contains no ``/<digits>`` part.
        """
        if not res:
            return

        relock = re.search(r'<a href="(.*?)">Click here to continue\.\.\.</a>',
                           res.text)
        if relock:
            retry_url = relock.group(1)
            self.download_page_request(retry_url)
            self.add_log('parse_search_html 重置:', retry_url)
            self.add_urls(retry_url, True)
            return

        pattern = re.compile(
            r'<div style="float: left;.*?(\d{4}-\d{2}-\d{2}).*?'
            r'<a href="(http://.*?)">.*?'
            r'<img src="(http://.*?)".*?style=".*?'
            r'title="(.*?)".*?</div>'
            r'|'
            r'<div class="bscene genmed".*?</b>(.*?\d{2}, \d{4}.*?)</p>'
            r'<p class="line1">.*?<a href="(http://.*?)">.*?'
            r'<img src="(http://.*?)".*?'
            r'title="(.*?)".*?'
            r'.*?</div>', re.S)
        pattern2 = re.compile(
            r'<div class="bscene genmed".*?</b>(.*?\d{2}.*?\d{4}.*?)</p>.*?'
            r'<p class="line1">.*?<a href="(http://.*?)">.*?'
            r'<img src="(http://.*?)".*?'
            r'title="(.*?)".*?'
            r'.*?</div>', re.S)

        # ``pattern`` has two alternatives with four groups each, so findall
        # returns 8-tuples.  Matches of the second alternative leave the
        # first four groups empty, which used to crash the id lookup below.
        # Keep only first-alternative matches (their URL group is never
        # empty); the "bscene" blocks are collected by ``pattern2``.
        metas = [found[:4] for found in pattern.findall(res.text) if found[1]]
        metas.extend(pattern2.findall(res.text))

        for meta in metas:
            if self.stoped:
                break
            result = utils.gen_metadata_struck(stype)
            try:
                title = meta[3].strip()
                for title_key in ('标题', '电视节目标题', '集标题'):
                    if title_key in result:
                        result[title_key] = title
            except Exception as e:
                self.add_log('parse_search_html 抓取标题错误:', e, level='error')

            result['tag']['type'] = stype
            result['tag']['dital_url'] = meta[1].strip()
            result['tag']['video_id'] = re.search(r'/(\d+)', meta[1]).group(1)

            # A failed thumbnail download must not abort the other entries.
            poster_res = self.download_page_request(meta[2])
            if poster_res:
                result['tag']['poster'] = utils.tim_img_bytes(
                    poster_res.content)
            result['tag']['total'] = 0

            # Reduce e.g. "<month name> 05, 2019" to its first three word
            # characters plus " 05, 2019" when the text has that shape.
            date_match = re.match(r'\s*(\w{3}).*?( \d{2}, \d{4})', meta[0],
                                  re.IGNORECASE)
            if date_match:
                result['tag']['tip'] = utils.format_date_str(
                    date_match.group(1) + date_match.group(2))
            else:
                result['tag']['tip'] = utils.format_date_str(meta[0].strip())
            yield result