def search(self, keyword, stype):
    """Search data18 for *keyword*, or scrape a detail page directly when
    *keyword* is already a data18 URL.  Generator of metadata dicts.

    keyword -- free-text search term, or a full http://www.data18.com/... URL
    stype   -- video type key handed to utils.gen_metadata_struck
    """
    if keyword.startswith('http'):
        # Direct URL: fetch the page and delegate to the detail parser.
        res = self.download_page_request(keyword)
        meta = utils.gen_metadata_struck(stype)
        # Section name from the URL path ("content", "movies", ...);
        # raises AttributeError if the URL does not match — TODO confirm
        # callers only pass data18 URLs here.
        search_type = re.search(r'http://www\.data18\.com/(.*)/\d+',
                                keyword).group(1)
        findeds = self.parse_dital(res.text, meta, search=True,
                                   types=search_type)
        for finded in findeds:
            # Only dict-like results get the type/url tags stamped in;
            # other yielded values pass through untouched.
            if isinstance(finded, OrderedDict):
                finded['tag']['type'] = stype
                finded['tag']['dital_url'] = keyword
            yield finded
    else:
        # Keyword search: seed the URL queue, then drain it; the page
        # parser may enqueue follow-up (pagination) URLs as it goes.
        self.add_urls('http://www.data18.com/search/?k={}'.format(keyword))
        while self.has_url():
            url = self.get_urls()
            if url:
                res = self.download_page_request(url)
                if res:
                    for each in self.parse_search_html(res, stype):
                        yield each
                else:
                    # Log "search failed" (download returned nothing).
                    self.add_log('搜索失败')
def parse_search_html(self, res, stype):
    """Parse a DMM search-result page; yields one metadata dict per hit.

    Side effect: queues the "next page" (次へ) link so the caller's URL
    loop keeps paginating.  A falsy *res* is ignored.
    """
    if not res:
        return
    # Queue the next result page, if the pager advertises one.
    next_page = re.search(
        r'<li><a href="(http://www.dmm.co.jp/.+?page=\d*)/">次へ</a>',
        res.text, re.IGNORECASE)
    if next_page:
        url = next_page.group(1)
        self.add_urls(url)
    # soup = BeautifulSoup(res.text, 'lxml')
    # total = soup.select_one('div.list-boxcaptside.list-boxpagenation > p')
    # li_nodes = soup.select("#list li")
    doc = pq(res.text)
    li_nodes = doc.find('#list > li').items()
    # Pagination caption, e.g. "1234タイトル中 ..." — leading digits are the
    # total hit count.
    total = doc.find('div.list-boxcaptside.list-boxpagenation > p')
    for li in li_nodes:
        # Allow the surrounding spider to abort mid-page.
        if self.stoped:
            break
        sell = li.find('p.sublink a')
        url = li.find('p.tmb a').attr('href')
        url_type = self.get_url_type(url)
        # Negative url_type means an unsupported/ignored link kind.
        if url_type >= 0:
            # Thumbnails are protocol-relative ("//..."), so prefix http:.
            src_url = 'http:' + li.find('p.tmb a img').attr('src')
            result = utils.gen_metadata_struck(stype)
            # The struct's title key differs per stype; fill whichever exists.
            if '标题' in result:
                result['标题'] = li.find('p.tmb a img').attr('alt')
            if '电视节目标题' in result:
                result['电视节目标题'] = li.find('p.tmb a img').attr('alt')
            if '集标题' in result:
                result['集标题'] = li.find('p.tmb a img').attr('alt')
            result['级别'] = 'R18+'
            # Rating is optional on the page; best effort only.
            try:
                result['评级'] = self.format_rate_str(
                    li.find('div.value p.rate').text())
            except Exception:
                pass
            result['tag']['type'] = stype
            result['tag']['dital_url'] = url
            # DMM content id embedded in the URL query path.
            result['tag']['video_id'] = re.search(r'cid=(.+)/', url).group(1)
            result['tag']['backdrop'] = utils.tim_img_bytes(
                self.download_page_request(
                    self.get_full_src(src_url)).content)
            result['tag']['poster'] = utils.create_poster(
                result['tag']['backdrop'])
            result['tag']['total'] = int(
                re.match(r'(\d+).*', total.text()).group(1)) if total else 0
            result['tag']['tip'] = sell.text() if sell else ''
            yield result
def get_video_info(self, meta, stype=''):
    """Fetch full metadata for one VideoStation library item.

    Falls back to meta['tag']['type'] when *stype* is empty.  Yields one
    converted (Chinese-keyed) metadata dict per record returned by the
    SYNO.VideoStation2 API.
    """
    if not meta:
        return
    stype = stype or meta.get('tag').get('type')
    if not stype:
        return
    item_id = meta.get('tag').get('id')
    lib_id = meta.get('tag').get('library_id')
    # Build the API parameters; episodes use a "list" call scoped to the
    # show and library, everything else a "getinfo" call by id.
    if stype == 'tvshow_episode':
        param = {
            'library_id': '{}'.format(lib_id),
            'tvshow_id': '{}'.format(item_id),
            'limit': '500000',
            'additional': '["summary","collection","poster_mtime","watched_ratio","file"]',
        }
        meth = 'list'
    else:
        param = {
            'id': '[{}]'.format(item_id),
        }
        if stype == 'tvshow':
            param['additional'] = '["poster_mtime","summary","backdrop_mtime"]'
        elif stype in ('movie', 'home_video'):
            param['additional'] = '["summary","poster_mtime","backdrop_mtime","file","collection","watched_ratio","conversion_produced","actor","director","genre","writer","extra"]'
        meth = 'getinfo'
    json_res = self.post_request(
        'entry.cgi',
        'SYNO.VideoStation2.{}'.format(utils.get_library_API(stype)),
        meth, param)
    if not json_res:
        return
    for record in json_res.get('data').get(utils.get_dsm_json_head(stype)):
        template = utils.gen_metadata_struck(stype)
        if template:
            yield utils.fill_cn_form_en(stype, template, record)
def list_videos(self, meta, keyword='', only_nil=False): if not meta: return stype = meta.get('type') sAPI = utils.get_library_API(stype) library_id = meta.get('id') heads = utils.get_dsm_json_head(stype) if library_id is None or not sAPI: return param = { 'offset': '0', 'limit': '5000', 'sort_by': '"title"', 'sort_direction': '"desc"', 'library_id': '{}'.format(library_id), 'additional': '["poster_mtime","backdrop_mtime","summary"]' } if keyword: param.update({'keyword': '"{}"'.format(keyword)}) json_res = self.post_request('entry.cgi', 'SYNO.VideoStation2.{}'.format(sAPI), 'list', param) if json_res and json_res.get('success'): total = json_res.get('data').get('total') if total: yield total datas = json_res.get('data').get(heads) for data in datas: test_meta = utils.gen_metadata_struck(stype) if test_meta: result_data = utils.fill_cn_form_en(stype, test_meta, data) if result_data: poster_mtime = data.get('additional').get( 'poster_mtime') poster = self.get_video_poster(stype, data.get('id'), poster_mtime) result_data['tag']['poster'] = poster if only_nil: if not result_data.get('tag').get('poster_mtime'): yield result_data else: yield result_data
def parse_search_html(self, res, stype):
    """Parse a Caribbeancom (EUC-JP) search-result page.

    Yields the total hit count (int) first when it can be read, then one
    metadata dict per result.  Queues the next page when the pager shows
    次へ.  A falsy *res* is ignored.
    """
    if not res:
        return
    res.encoding = 'euc-jp'
    html = res.text
    main_url = res.url
    doc = PyQuery(html)
    next_page = doc('a.go-to-next')
    if next_page.text() == '次へ':
        self.add_urls(urljoin(main_url, next_page.attr('href')))
    try:
        total = doc('#main-content > h1 > small').text()
        # Fix: raw string — the original '(\d+)' relied on an invalid
        # escape sequence (DeprecationWarning, SyntaxWarning on 3.12+).
        total = re.search(r'(\d+)', total).group(1)
        yield int(total)
    except Exception:
        pass
    divs = doc('#main-content > div.list-area > div').items()
    for div in divs:
        result = utils.gen_metadata_struck(stype)
        title = div('span.movie-title > a').text()
        # The struct's title key differs per stype; fill whichever exists.
        if '标题' in result:
            result['标题'] = title
        if '电视节目标题' in result:
            result['电视节目标题'] = title
        if '集标题' in result:
            result['集标题'] = title
        result['级别'] = 'R18+'
        result['tag']['type'] = stype
        result['tag']['dital_url'] = urljoin(main_url, div('a').attr('href'))
        result['tag']['video_id'] = re.search(
            r'/(\d+-\d+)/index', result['tag']['dital_url']).group(1)
        result['tag']['tip'] = div('span.movie-actor > a > span').text()
        poster_url = div('a > img').attr('src')
        if poster_url:
            result['tag']['poster'] = self.download_page_request(
                poster_url).content
        # Thumbnail crop offset used downstream when building posters.
        result['tag']['xy'] = (40, 30)
        yield result
def parse_search_html(self, res, stype):
    """Parse a #sub_main movie-list search-result page.

    Yields the total hit count (int) first when it can be read, then one
    metadata dict per result.  Queues the next page when the pager shows
    '次へ »'.  A falsy *res* is ignored.
    """
    if not res:
        return
    html = res.text
    main_url = res.url
    doc = pq(html)
    next_page = doc('#sub_main > div.listpage > ul > li.next > a')
    if next_page.text() == '次へ »':
        self.add_urls(urljoin(main_url, next_page.attr('href')))
    try:
        total = doc('#sub_main > p').text()
        # Fix: raw string — the original '(\d+)' relied on an invalid
        # escape sequence (DeprecationWarning, SyntaxWarning on 3.12+).
        total = re.search(r'(\d+)', total).group(1)
        yield int(total)
    except Exception:
        pass
    divs = doc('#sub_main > div.movie_list').items()
    for div in divs:
        result = utils.gen_metadata_struck(stype)
        title = div('div.movielistphoto1 > a > img').attr('alt')
        # The struct's title key differs per stype; fill whichever exists.
        if '标题' in result:
            result['标题'] = title
        if '电视节目标题' in result:
            result['电视节目标题'] = title
        if '集标题' in result:
            result['集标题'] = title
        result['级别'] = 'R18+'
        result['tag']['type'] = stype
        result['tag']['dital_url'] = urljoin(
            main_url, div('div.movielistphoto1 > a').attr('href'))
        result['tag']['video_id'] = re.search(
            r'/(\d+)/index', result['tag']['dital_url']).group(1)
        result['tag']['tip'] = div('div.movielisttext01 > a').text()
        poster_url = div('div.movielistphoto1 > a > img').attr('src')
        if poster_url:
            result['tag']['poster'] = self.download_page_request(
                poster_url).content
        # Thumbnail crop offset used downstream when building posters.
        result['tag']['xy'] = (40, 30)
        yield result
def parse_search_html(self, res, stype):
    """Parse a #contents item-grid search-result page (UTF-8).

    Yields the total hit count (int) first when it can be read, then one
    metadata dict per item that has a title link.  Queues the next page
    when the pager shows '› ›'.  A falsy *res* is ignored.
    """
    if not res:
        return
    res.encoding = 'utf-8'
    html = res.text
    main_url = res.url
    doc = PyQuery(html)
    next_page = doc('#pagenation li.next a')
    if next_page.text() == '› ›':
        self.add_urls(urljoin(main_url, next_page.attr('href')))
    try:
        total = doc('#contents > div.message').text()
        # Fix: raw string — the original '(\d+)' relied on an invalid
        # escape sequence (DeprecationWarning, SyntaxWarning on 3.12+).
        total = re.search(r'(\d+)', total).group(1)
        yield int(total)
    except Exception:
        pass
    divs = doc('#contents > form > div.item_box.fixHeight > div').items()
    for div in divs:
        result = utils.gen_metadata_struck(stype)
        title = div('p > a').text()
        # NOTE(review): grid cells without a title link are skipped
        # entirely — assumed from the original's guard; confirm intent.
        if title:
            # The struct's title key differs per stype; fill whichever
            # exists.
            if '标题' in result:
                result['标题'] = title
            if '电视节目标题' in result:
                result['电视节目标题'] = title
            if '集标题' in result:
                result['集标题'] = title
            result['级别'] = 'R18+'
            result['tag']['type'] = stype
            result['tag']['dital_url'] = urljoin(main_url,
                                                 div('a').attr('href'))
            result['tag']['video_id'] = re.search(
                r'id=(\d+)', result['tag']['dital_url']).group(1)
            # result['tag']['tip'] = div('span.movie-actor > a > span').text()
            poster_url = urljoin(main_url, div('a > img').attr('src'))
            if poster_url:
                result['tag']['poster'] = self.download_page_request(
                    poster_url).content
            yield result
def parse_url_search(self, res, stype='movie'):
    """Build one metadata dict from a Caribbeancom detail-page response.

    Returns the (possibly partially filled) metadata struct; parsing
    errors are swallowed, so callers always get a struct for a truthy
    *res* and None otherwise.
    """
    if not res:
        return
    result = utils.gen_metadata_struck(stype)
    try:
        page = PyQuery(res.text)
        title = page(
            '#main-content > div.main-content-movieinfo > div.video-detail > h1'
        ).text()
        # The struct's title key differs per stype; fill whichever exists.
        for key in ('标题', '电视节目标题', '集标题'):
            if key in result:
                result[key] = title
        result['级别'] = 'R18+'
        tag = result['tag']
        tag['type'] = stype
        tag['dital_url'] = res.url
        tag['video_id'] = re.search(r'/(\d+-\d+)/index',
                                    tag['dital_url']).group(1)
        tag['tip'] = page(
            '#main-content > div.main-content-movieinfo > div.movie-info > dl:nth-child(1) > dd > a > span'
        ).text()
        # Thumbnail crop offset used downstream when building posters.
        tag['xy'] = (40, 30)
        poster_url = 'https://www.caribbeancom.com/moviepages/{}/images/l_l.jpg'.format(
            tag['video_id'])
        if poster_url:
            tag['poster'] = self.download_page_request(poster_url).content
    except Exception:
        pass
    return result
def parse_url_search(self, res, stype='movie'):
    """Build one metadata dict from a DMM detail page via its embedded
    JSON-LD block.

    Parsing errors are swallowed; the top-level 'dital_url' is always
    set for a truthy *res* so callers can retry/inspect the page.
    """
    if not res:
        return
    result = utils.gen_metadata_struck(stype)
    try:
        raw = re.search(r'<script type="application/ld\+json">(.*?)</script>',
                        res.text, re.S).group(1)
        json_ld = json.loads(raw)
        name = json_ld.get('name')
        # The struct's title key differs per stype; fill whichever exists.
        for key in ('标题', '电视节目标题', '集标题'):
            if key in result:
                result[key] = name
        result['级别'] = 'R18+'
        result['评级'] = self.format_rate_str(
            json_ld.get('aggregateRating').get('ratingValue'))
        tag = result['tag']
        tag['type'] = stype
        tag['dital_url'] = res.url
        tag['video_id'] = re.search(r'cid=(.+)/', res.url).group(1)
        tag['backdrop'] = utils.tim_img_bytes(
            self.download_page_request(
                self.get_full_src(json_ld.get('image'))).content)
        tag['poster'] = utils.create_poster(tag['backdrop'])
        tag['total'] = 0
        tag['tip'] = ''
    except Exception:
        pass
    result['dital_url'] = res.url
    return result
def parse_url_search(self, res, stype='movie'):
    """Build one metadata dict from a UTF-8 item-form detail page.

    Items without a detailed_title are returned as an empty struct;
    parsing errors are swallowed.
    """
    if not res:
        return
    res.encoding = 'utf-8'
    result = utils.gen_metadata_struck(stype)
    try:
        page = PyQuery(res.text)
        title = page('#contents > form > div.detailed_title').text()
        if title:
            # The struct's title key differs per stype; fill whichever
            # exists.
            for key in ('标题', '电视节目标题', '集标题'):
                if key in result:
                    result[key] = title
            result['级别'] = 'R18+'
            tag = result['tag']
            tag['type'] = stype
            tag['dital_url'] = res.url
            tag['video_id'] = re.search(r'id=(\d+)',
                                        tag['dital_url']).group(1)
            tag['tip'] = page(
                '#contents > form > div.item_detail > ul > li:nth-child(2) > a'
            ).text()
            poster_url = urljoin(
                res.url,
                page('#contents > form > div.item_detail > div.item600 > img'
                     ).attr('src'))
            if poster_url:
                tag['backdrop'] = utils.tim_img_bytes(
                    self.download_page_request(poster_url).content)
                tag['poster'] = utils.create_poster(tag['backdrop'])
    except Exception:
        pass
    return result
def parse_url_search(self, res, stype='movie'):
    """Build one metadata dict from a detail page, taking the title from
    <head><title> and the poster URL from the inline 'var imgurl' script
    variable.  Parsing errors are swallowed.
    """
    if not res:
        return
    result = utils.gen_metadata_struck(stype)
    try:
        page = pq(res.text)
        title = page('head > title').text()
        # The struct's title key differs per stype; fill whichever exists.
        for key in ('标题', '电视节目标题', '集标题'):
            if key in result:
                result[key] = title
        result['级别'] = 'R18+'
        tag = result['tag']
        tag['type'] = stype
        tag['dital_url'] = res.url
        tag['video_id'] = re.search(r'/(\d+)/index',
                                    tag['dital_url']).group(1)
        tag['tip'] = page('#detail_box > table >tr:nth-child(1) a').text()
        poster_url = re.search(r"var imgurl = '(http://.*?jpg)';", res.text,
                               re.S).group(1)
        if poster_url:
            tag['poster'] = self.download_page_request(poster_url).content
    except Exception:
        pass
    return result
meta['tag']['backdrop'] = utils.tim_img_bytes(poster_data) meta['tag']['poster'] = utils.create_poster( meta['tag']['backdrop']) except AttributeError: pass yield meta # try: # # 缩略图 # sample_url = doc( # '#TabbedPanels1 > div > div.TabbedPanelsContent.TabbedPanelsContentVisible > a > img').attr('src') # yield self.download_page_request(sample_url).content # # except Exception: # pass def dital(self, url, meta): res = self.download_page_request(url) if res: return self.parse_dital(res.text, meta) if __name__ == '__main__': test = Kin8tengokuSpider('kin8') # for each in test.search('http://www.kin8tengoku.com/moviepages/0959/index.html','movie'): # print(each) for each in test.dital( 'http://www.kin8tengoku.com/moviepages/0959/index.html', utils.gen_metadata_struck('movie')): print(each)
def parse_search_html(self, res, stype):
    """Parse a search-result page scraped with hand-written regexes.

    Handles the site's "Click here to continue..." interstitial by
    re-fetching and re-queueing the target URL.  Yields one metadata
    dict per matched result block.  A falsy *res* is ignored.
    """
    if not res:
        return
    # Interstitial/anti-bot page: follow it once and retry via the queue.
    relock = re.search(r'<a href="(.*?)">Click here to continue\.\.\.</a>',
                       res.text)
    if relock:
        self.download_page_request(relock.group(1))
        self.add_log('parse_search_html 重置:', relock.group(1))
        self.add_urls(relock.group(1), True)
        return
    metas = []
    # NOTE(review): the '|' alternation makes findall return 8-group
    # tuples; matches of the second branch leave groups 0-3 empty, and
    # that branch is re-scanned by pattern2 below — confirm this overlap
    # is intended before touching the patterns.
    pattern = re.compile(
        r'<div style="float: left;.*?(\d{4}-\d{2}-\d{2}).*?'
        r'<a href="(http://.*?)">.*?'
        r'<img src="(http://.*?)".*?style=".*?'
        r'title="(.*?)".*?</div>'
        r'|'
        r'<div class="bscene genmed".*?</b>(.*?\d{2}, \d{4}.*?)</p>'
        r'<p class="line1">.*?<a href="(http://.*?)">.*?'
        r'<img src="(http://.*?)".*?'
        r'title="(.*?)".*?'
        r'.*?</div>', re.S)
    meta_movies = re.findall(pattern, res.text)
    if meta_movies:
        metas.extend(meta_movies)
    pattern2 = re.compile(
        r'<div class="bscene genmed".*?</b>(.*?\d{2}.*?\d{4}.*?)</p>.*?'
        r'<p class="line1">.*?<a href="(http://.*?)">.*?'
        r'<img src="(http://.*?)".*?'
        r'title="(.*?)".*?'
        r'.*?</div>', re.S)
    metas_contens = re.findall(pattern2, res.text)
    if metas_contens:
        metas.extend(metas_contens)
    # Tuple layout: meta[0]=date text, meta[1]=detail URL,
    # meta[2]=thumbnail URL, meta[3]=title.
    for meta in metas:
        if self.stoped:
            break
        result = utils.gen_metadata_struck(stype)
        try:
            # The struct's title key differs per stype; fill whichever
            # exists.
            if '标题' in result:
                result['标题'] = meta[3].strip()
            if '电视节目标题' in result:
                result['电视节目标题'] = meta[3].strip()
            if '集标题' in result:
                result['集标题'] = meta[3].strip()
        except Exception as e:
            self.add_log('parse_search_html 抓取标题错误:', e, level='error')
        result['tag']['type'] = stype
        result['tag']['dital_url'] = meta[1].strip()
        result['tag']['video_id'] = re.search(r'/(\d+)', meta[1]).group(1)
        result['tag']['poster'] = utils.tim_img_bytes(
            self.download_page_request(meta[2]).content)
        result['tag']['total'] = 0
        # Fix: renamed local from 'str' — it shadowed the builtin.
        date_match = re.match(r'\s*(\w{3}).*?( \d{2}, \d{4})', meta[0],
                              re.IGNORECASE)
        if date_match:
            result['tag']['tip'] = utils.format_date_str(
                date_match.group(1) + date_match.group(2))
        else:
            result['tag']['tip'] = utils.format_date_str(meta[0].strip())
        yield result