def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search/ajax_search')
    html = self._http_get(search_url, params={'q': title}, headers=XHR, cache_limit=1)
    js_result = scraper_utils.parse_json(html, search_url)
    match_year = ''
    for series in js_result.get('series', []):
        match_url = series.get('seo')
        match_title = series.get('label')
        if match_url and match_title and (not year or not match_year or year == match_year):
            result = {'url': scraper_utils.pathify_url('/' + match_url),
                      'title': scraper_utils.cleanse_title(match_title),
                      'year': match_year}
            results.append(result)
    return results
def __get_posts(self, html):
    sources = {}
    pattern = r'\$\.post\("([^"]+)"\s*,\s*\{(.*?)\}'
    match = re.search(pattern, html)
    if not match: return sources

    post_url, post_data = match.groups()
    data = self.__get_data(post_data)
    html = self._http_get(post_url, data=data, cache_limit=.5)
    js_result = scraper_utils.parse_json(html, post_url)
    for key in js_result:
        stream_url = js_result[key]
        host = scraper_utils.get_direct_hostname(self, stream_url)
        if host == 'gvideo':
            quality = scraper_utils.gv_get_quality(stream_url)
        else:
            quality = scraper_utils.height_get_quality(key)
        sources[stream_url] = quality
    return sources
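# A minimal standalone sketch of the $.post() extraction in __get_posts above;
# the sample HTML snippet is invented purely for illustration.
import re

sample_html = '<script>$.post("/ajax/get_sources", {id: "123", token: "abc"}, function(r) {});</script>'
match = re.search(r'\$\.post\("([^"]+)"\s*,\s*\{(.*?)\}', sample_html)
if match:
    post_url, post_data = match.groups()
    print(post_url)   # /ajax/get_sources
    print(post_data)  # id: "123", token: "abc"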
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, SEARCH_URL)
    referer = scraper_utils.urljoin(self.base_url, '/search/?q=%s')
    referer = referer % (urllib.quote_plus(title))
    headers = {'Referer': referer}
    headers.update(XHR)
    params = {'searchTerm': title, 'type': SEARCH_TYPES[video_type], 'limit': 500}
    html = self._http_get(search_url, params=params, headers=headers, auth=False, cache_limit=2)
    js_data = scraper_utils.parse_json(html, search_url)
    for result in js_data.get('results', []):
        match_year = str(result.get('year', ''))
        match_url = result.get('permalink', '')
        match_title = result.get('title', '')
        if not year or not match_year or year == match_year:
            results.append({'title': scraper_utils.cleanse_title(match_title),
                            'year': match_year,
                            'url': scraper_utils.pathify_url(match_url)})
    return results
def __get_json_links(self, html, sub):
    hosters = []
    js_data = scraper_utils.parse_json(html)
    for source in js_data.get('sources', []):
        stream_url = source.get('file')
        if stream_url is None: continue

        host = scraper_utils.get_direct_hostname(self, stream_url)
        if host == 'gvideo':
            quality = scraper_utils.gv_get_quality(stream_url)
        elif 'label' in source:
            quality = scraper_utils.height_get_quality(source['label'])
        else:
            quality = QUALITIES.HIGH
        hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality,
                  'views': None, 'rating': None, 'url': stream_url, 'direct': True,
                  'subs': sub}
        hosters.append(hoster)
    return hosters
def __search(self, video_type, title, year, season=''):
    results = []
    search_url = SEARCH_URL % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=1)
    js_data = scraper_utils.parse_json(html)
    norm_title = scraper_utils.normalize_title(title)
    for item in js_data.get('results', []):
        if '/watch/' not in item['url'].lower(): continue

        is_season = re.search(r'Season\s+(\d+)', item['titleNoFormatting'], re.IGNORECASE)
        if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
            match_title_year = re.sub(r'^Watch\s+', '', item['titleNoFormatting'])
            match_url = item['url']
            match_year = ''
            if video_type == VIDEO_TYPES.MOVIE:
                match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
            else:
                if season and int(is_season.group(1)) != int(season): continue

                match = re.search(r'(.*?)\s+\(\d{4}\)', match_title_year)
                if match:
                    match_title = match.group(1)
                else:
                    match_title = match_title_year

            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                result = {'title': scraper_utils.cleanse_title(match_title),
                          'year': match_year,
                          'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results
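# A standalone check of the title/year regexes used in __search above; the
# sample strings are invented for illustration.
import re

movie = re.sub(r'^Watch\s+', '', 'Watch The Thing (1982)')
match = re.search(r'(.*?)(?:\s+\(?(\d{4})\)?)', movie)
print(match.groups())  # ('The Thing', '1982')

show = 'Watch Fargo Season 2'
is_season = re.search(r'Season\s+(\d+)', show, re.IGNORECASE)
print(is_season.group(1))  # 2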
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH: return hosters

    url = scraper_utils.urljoin(self.base_url, source_url)
    html = self._http_get(url, cache_limit=.5)
    js_result = scraper_utils.parse_json(html, url)
    if 'error' in js_result:
        logger.log('Direct API error: "%s" @ %s' % (js_result['error'], url), log_utils.LOGWARNING)
        return hosters

    for result in js_result:
        if not scraper_utils.release_check(video, result['release'], require_title=False): continue
        if result['quality'] not in self.q_order: continue

        for key in result['links']:
            url = result['links'][key][0]
            if re.search(r'\.rar(\.|$)', url): continue

            hostname = urlparse.urlparse(url).hostname
            hoster = {'multi-part': False, 'class': self, 'views': None, 'url': url, 'rating': None,
                      'host': hostname, 'quality': QUALITY_MAP[result['quality']], 'direct': False,
                      'format': result['quality']}
            if 'x265' in result['release'] and result['quality'] != '1080P-X265':
                hoster['format'] += '-x265'  # tag the codec when the quality label doesn't already carry it
            hosters.append(hoster)
    return hosters
def __get_linked(self, html):
    sources = {}
    match = re.search('dizi=([^"]+)', html)
    if not match: return sources

    html = self._http_get(AJAX_URL, params={'dizi': match.group(1)}, headers=XHR, cache_limit=.5)
    js_result = scraper_utils.parse_json(html, AJAX_URL)
    for source in js_result.get('success', []):
        stream_url = source.get('src')
        if stream_url is None: continue

        if scraper_utils.get_direct_hostname(self, stream_url) == 'gvideo':
            quality = scraper_utils.gv_get_quality(stream_url)
        elif 'label' in source:
            quality = scraper_utils.height_get_quality(source['label'])
        else:
            quality = QUALITIES.HIGH
        sources[stream_url] = quality
    return sources
def __get_ht_links(self, html, page_url):
    sources = {}
    match = re.search(r'Htplugins_Make_Player\("([^"]+)', html)
    if not match: return sources

    data = {'data': match.group(1)}
    url = scraper_utils.urljoin(self.base_url, LINK_URL2)
    headers = {'Referer': page_url}
    html = self._http_get(url, data=data, headers=headers, cache_limit=.25)
    js_data = scraper_utils.parse_json(html, url)
    for link in js_data.get('l', []):
        if scraper_utils.get_direct_hostname(self, link) == 'gvideo':
            quality = scraper_utils.gv_get_quality(link)
        else:
            quality = QUALITIES.HIGH
        sources[link] = quality
    return sources
def __get_source_page(self, video_type, page_url):
    match = re.search(r'/movie/(.*?)-(\d+)\.html', page_url)
    if not match: return '', '', ''

    slug, movie_id = match.groups()
    vid_type = 'movie' if video_type == VIDEO_TYPES.MOVIE else 'series'
    qp_url = QP_URL.format(slug=slug, movie_id=movie_id, vid_type=vid_type)
    qp_url = scraper_utils.urljoin(self.base_url, qp_url)
    headers = {'Referer': scraper_utils.urljoin(self.base_url, page_url)}
    headers.update(XHR)
    html = self._http_get(qp_url, headers=headers, cache_limit=8)
    watching_url = dom_parser2.parse_dom(html, 'a', {'title': re.compile('View all episodes')}, req='href')
    if not watching_url: return '', '', ''

    watching_url = watching_url[0].attrs['href']
    page_html = self._http_get(watching_url, headers={'Referer': scraper_utils.urljoin(self.base_url, page_url)}, cache_limit=8)
    # request any hidden images; the site appears to expect these browser-like fetches
    for attrs, _content in dom_parser2.parse_dom(page_html, 'img', {'class': 'hidden'}, req='src'):
        self._http_get(attrs['src'], headers={'Referer': watching_url}, cache_limit=8)

    sl_url = SL_URL.format(movie_id=movie_id)
    sl_url = scraper_utils.urljoin(self.base_url, sl_url)
    html = self._http_get(sl_url, headers=headers, cache_limit=8)
    js_data = scraper_utils.parse_json(html, sl_url)
    try: html = js_data['html']
    except: html = ''
    return movie_id, watching_url, html
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH: return hosters

    page_url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=.5)
    match = re.search(r'var\s*video_id="([^"]+)', html)
    if not match: return hosters

    video_id = match.group(1)
    headers = {'Referer': page_url}
    headers.update(XHR)
    html = self._http_get(self.info_url, data={'v': video_id}, headers=headers, cache_limit=0)
    sources = scraper_utils.parse_json(html, self.info_url)
    for source in sources:
        match = re.search('url=(.*)', sources[source])
        if not match: continue

        stream_url = urllib.unquote(match.group(1))
        host = self._get_direct_hostname(stream_url)
        if host == 'gvideo':
            quality = scraper_utils.gv_get_quality(stream_url)
        else:
            quality = scraper_utils.height_get_quality(source)
        stream_url += scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua()})
        hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality,
                  'views': None, 'rating': None, 'url': stream_url, 'direct': True}
        hosters.append(hoster)
    return hosters
def get_sources(self, video):
    sources = []
    source_url = self.get_url(video)
    if not source_url or source_url == FORCE_NO_MATCH: return sources

    object_id = self.__extract_id(source_url)
    if object_id is None: return sources

    source_url = TITLE_URL.format(id=object_id)
    page_url = scraper_utils.urljoin(self.base_url, source_url)
    html = self._authed_http_get(page_url, cache_limit=.5)
    js_data = scraper_utils.parse_json(html, page_url)
    if video.video_type == VIDEO_TYPES.MOVIE:
        links = js_data.get('links', {})
    else:
        links = self.__episode_match(js_data, video)
    prefix = js_data.get('domain', {}).get('prefix')
    suffix = js_data.get('domain', {}).get('suffix')
    for key, path in links.get('links', {}).iteritems():
        for mirror in sorted(set(links.get('mirrors', []))):
            stream_url = TEMPLATE.format(prefix=prefix, mirror=mirror, suffix=suffix, path=path)
            host = scraper_utils.get_direct_hostname(self, stream_url)
            quality = Q_MAP.get(key, QUALITIES.HIGH)
            source = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self,
                      'quality': quality, 'views': None, 'rating': None, 'direct': True,
                      'version': '(Mirror %d)' % (mirror)}
            sources.append(source)
    return sources
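# A standalone sketch of the mirror expansion in get_sources above. The real
# TEMPLATE constant lives elsewhere in the module; the format string below is
# an assumption invented only to show how prefix/mirror/suffix/path combine
# into one stream URL per mirror.
TEMPLATE = 'https://{prefix}{mirror}.{suffix}/{path}'  # hypothetical value

links = {'links': {'1080p': 'movie/12345/1080.mp4'}, 'mirrors': [2, 1, 2]}
for key, path in links['links'].items():
    for mirror in sorted(set(links['mirrors'])):  # de-duplicate, then order
        print(TEMPLATE.format(prefix='cdn', mirror=mirror, suffix='example.com', path=path))
# https://cdn1.example.com/movie/12345/1080.mp4
# https://cdn2.example.com/movie/12345/1080.mp4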
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(self.base_url, '/search?query=')
    search_url += title.replace("'", "")
    html = self._http_get(search_url, cache_limit=.25)
    js_result = scraper_utils.parse_json(html, search_url)
    if 'error' in js_result:
        logger.log('Direct API error: "%s" @ %s' % (js_result['error'], search_url), log_utils.LOGWARNING)
        return results

    for match in js_result:
        url = search_url + '&quality=%s' % (match['quality'])
        result = {'url': scraper_utils.pathify_url(url),
                  'title': scraper_utils.cleanse_title(match['release']),
                  'quality': match['quality'], 'year': ''}
        results.append(result)
    return results
def search(self, video_type, title, year, season=''):  # @UnusedVariable
    results = []
    search_url = scraper_utils.urljoin(SEARCH_BASE, '/1/indexes/al_titles_index/query')
    params = {'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.22.1',
              'x-algolia-application-id': 'XXDAZCOUL3',
              'x-algolia-api-key': 'c5c1279f5ad09819ecf2af9d6b5ee06a'}
    data = {'params': urllib.urlencode({'query': title, 'facets': '*', 'hitsPerPage': 30})}
    headers = {'Origin': self.base_url}
    html = self._http_get(search_url, params=params, data=json.dumps(data), headers=headers, cache_limit=8)
    js_data = scraper_utils.parse_json(html, search_url)
    media_type = '/movies/' if video_type == VIDEO_TYPES.MOVIE else '/tv/'
    for item in js_data.get('hits', []):
        if 'permalink' in item and 'title' in item and media_type in item['permalink']:
            match_year = str(item.get('yr', ''))
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(item['title']),
                          'url': scraper_utils.pathify_url(item['permalink']),
                          'year': match_year}
                results.append(result)
    return results
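# A standalone sketch of the Algolia request body built in search above: the
# query parameters are URL-encoded first, then wrapped in a JSON envelope.
# The sample query string is invented for illustration.
import json
import urllib

params = urllib.urlencode([('query', 'The Thing'), ('facets', '*'), ('hitsPerPage', 30)])
print(json.dumps({'params': params}))
# {"params": "query=The+Thing&facets=%2A&hitsPerPage=30"}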
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if not source_url or source_url == FORCE_NO_MATCH: return hosters

    url = urlparse.urljoin(self.base_url, source_url)
    html = self._http_get(url, cache_limit=.5)
    match = re.search(r'href="([^"]+)"\s*class="player_btn_big"', html)
    if not match: return hosters

    url = match.group(1)
    html = self._http_get(url, cache_limit=.5)
    q_str = ''
    match = re.search('class="status">([^<]+)', html)
    if match: q_str = match.group(1)
    page_quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)

    views = None
    match = re.search(r'Views:</dt>\s*<dd>(\d+)', html, re.DOTALL)
    if match: views = match.group(1)

    for src in dom_parser.parse_dom(html, 'iframe', ret='SRC'):
        html = self._http_get(src, cache_limit=.5)
        for match in re.finditer(r'index.php.*?link\s*:\s*"([^"]+)', html):
            data = {'link': match.group(1)}
            headers = {'Referer': url}
            headers.update(XHR)  # copy into a fresh dict rather than mutating the shared XHR constant
            gk_url = urlparse.urljoin(src, GK_URL)
            gk_html = self._http_get(gk_url, data=data, headers=headers, cache_limit=.25)
            js_result = scraper_utils.parse_json(gk_html, gk_url)
            if 'link' not in js_result or 'func' in js_result: continue

            if isinstance(js_result['link'], list):
                sources = dict((link['link'], scraper_utils.height_get_quality(link['label'])) for link in js_result['link'])
            else:
                sources = {js_result['link']: page_quality}
            for source in sources:
                host = self._get_direct_hostname(source)
                # never report a quality better than the page-level quality
                if Q_ORDER[page_quality] < Q_ORDER[sources[source]]:
                    quality = page_quality
                else:
                    quality = sources[source]
                hoster = {'multi-part': False, 'url': source, 'class': self, 'quality': quality,
                          'host': host, 'rating': None, 'views': views, 'direct': True}
                hosters.append(hoster)
    return hosters
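# A standalone sketch of the two JSON shapes handled in get_sources above:
# 'link' is either a list of {link, label} dicts or a single URL string. The
# sample payloads are invented for illustration.
def normalize_links(js_result, page_quality):
    if isinstance(js_result['link'], list):
        return dict((link['link'], link['label']) for link in js_result['link'])
    return {js_result['link']: page_quality}

print(normalize_links({'link': [{'link': 'http://a/v.mp4', 'label': '720'}]}, 'HIGH'))  # {'http://a/v.mp4': '720'}
print(normalize_links({'link': 'http://b/v.mp4'}, 'HIGH'))  # {'http://b/v.mp4': 'HIGH'}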
def get_sources(self, video):
    source_url = self.get_url(video)
    sources = []
    if not source_url or source_url == FORCE_NO_MATCH: return sources

    page_url = scraper_utils.urljoin(self.base_url, source_url)
    html = self._http_get(page_url, cache_limit=8)
    for attrs, _content in dom_parser2.parse_dom(html, 'img', req=['data-id', 'data-name']):
        film_id, data_name = attrs['data-id'], attrs['data-name']
        server_url = scraper_utils.urljoin(self.base_url, SERVER_URL) % (film_id)
        headers = {'Referer': page_url}
        headers.update(XHR)
        html = self._http_get(server_url, data={'id': film_id, 'n': data_name}, headers=headers, cache_limit=.5)
        for attrs, _content in dom_parser2.parse_dom(html, 'a', req='data-id'):
            ep_url = scraper_utils.urljoin(self.base_url, EP_URL) % (attrs['data-id'])
            headers = {'Referer': page_url}
            headers.update(XHR)
            html = self._http_get(ep_url, data={'epid': attrs['data-id']}, headers=headers, cache_limit=.5)
            js_data = scraper_utils.parse_json(html, ep_url)
            try:
                links = [r.attrs['src'] for r in dom_parser2.parse_dom(js_data['link']['embed'], 'iframe', req='src')]
            except:
                try:
                    links = js_data['link']['l']
                except:
                    links = []
            try:
                heights = js_data['link']['q']
            except:
                heights = []
            # map(None, ...) zips the lists and pads the shorter one with None (Python 2)
            for stream_url, height in map(None, links, heights):
                if stream_url is None: continue  # guard against heights being longer than links

                match = re.search('movie_url=(.*)', stream_url)
                if match: stream_url = match.group(1)
                host = scraper_utils.get_direct_hostname(self, stream_url)
                if host == 'gvideo':
                    quality = scraper_utils.gv_get_quality(stream_url)
                    stream_url += scraper_utils.append_headers({'User-Agent': scraper_utils.get_ua(), 'Referer': page_url})
                    direct = True
                else:
                    host = urlparse.urlparse(stream_url).hostname
                    quality = scraper_utils.height_get_quality(height) if height else QUALITIES.HD720
                    direct = False
                source = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self,
                          'quality': quality, 'views': None, 'rating': None, 'direct': direct}
                sources.append(source)
    return sources
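# A standalone illustration of the Python 2 map(None, ...) idiom used above:
# it zips both lists and pads the shorter one with None, exactly like
# itertools.izip_longest.
from itertools import izip_longest

links = ['http://a/1.mp4', 'http://a/2.mp4', 'http://a/3.mp4']
heights = ['720', '480']
print(map(None, links, heights) == list(izip_longest(links, heights)))  # True
# [('http://a/1.mp4', '720'), ('http://a/2.mp4', '480'), ('http://a/3.mp4', None)]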
def __get_links(self, url, video):
    hosters = []
    search_url, params = self.__translate_search(url)
    html = self._http_get(search_url, params=params, cache_limit=.5)
    js_result = scraper_utils.parse_json(html, search_url)
    down_url = js_result.get('downURL')
    dl_farm = js_result.get('dlFarm')
    dl_port = js_result.get('dlPort')
    for item in js_result.get('data', []):
        post_hash, size, post_title, ext, duration = item['0'], item['4'], item['10'], item['11'], item['14']
        # a True entry in checks marks a reason to exclude this post
        checks = [False] * 6
        if not scraper_utils.release_check(video, post_title): checks[0] = True
        if 'alangs' in item and item['alangs'] and 'eng' not in item['alangs']: checks[1] = True
        if re.match(r'^\d+s', duration) or re.match('^[0-5]m', duration): checks[2] = True
        if 'passwd' in item and item['passwd']: checks[3] = True
        if 'virus' in item and item['virus']: checks[4] = True
        if 'type' in item and item['type'].upper() != 'VIDEO': checks[5] = True
        if any(checks):
            logger.log('EasyNews Post excluded: %s - |%s|' % (checks, item), log_utils.LOGDEBUG)
            continue

        stream_url = down_url + urllib.quote('/%s/%s/%s%s/%s%s' % (dl_farm, dl_port, post_hash, ext, post_title, ext))
        stream_url = stream_url + '|Authorization=%s' % (urllib.quote(self.auth))
        host = scraper_utils.get_direct_hostname(self, stream_url)
        quality = None
        if 'width' in item:
            try: width = int(item['width'])
            except: width = 0
            if width: quality = scraper_utils.width_get_quality(width)

        if quality is None:
            if video.video_type == VIDEO_TYPES.MOVIE:
                meta = scraper_utils.parse_movie_link(post_title)
            else:
                meta = scraper_utils.parse_episode_link(post_title)
            quality = scraper_utils.height_get_quality(meta['height'])

        if self.max_bytes:
            match = re.search(r'([\d.]+)\s+(.*)', size)
            if match:
                size_bytes = scraper_utils.to_bytes(*match.groups())
                if size_bytes > self.max_bytes:
                    logger.log('Result skipped, Too big: |%s| - %s (%s) > %s (%s GB)' % (post_title, size_bytes, size, self.max_bytes, self.max_gb))
                    continue

        hoster = {'multi-part': False, 'class': self, 'views': None, 'url': stream_url, 'rating': None,
                  'host': host, 'quality': quality, 'direct': True}
        if any(i in post_title.upper() for i in ('X265', 'HEVC')): hoster['format'] = 'x265'
        if size: hoster['size'] = size
        if post_title: hoster['extra'] = post_title
        hosters.append(hoster)
    return hosters
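# A standalone sketch of the size check in __get_links above.
# scraper_utils.to_bytes is defined elsewhere; to_bytes_sketch is a
# hypothetical stand-in, assumed to turn a ('1.4', 'GB') pair into a byte
# count for comparison against max_bytes.
import re

def to_bytes_sketch(num, unit):  # hypothetical stand-in for scraper_utils.to_bytes
    units = {'KB': 2 ** 10, 'MB': 2 ** 20, 'GB': 2 ** 30, 'TB': 2 ** 40}
    return int(float(num) * units.get(unit.upper(), 1))

match = re.search(r'([\d.]+)\s+(.*)', '1.4 GB')
print(to_bytes_sketch(*match.groups()))  # 1503238553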