# Standard-library imports used by the scraper methods below. The dom_parser,
# scraper_utils, kodi, and log_utils helpers, plus constants such as
# VIDEO_TYPES, QUALITIES, QUALITY_MAP, XHR, FORCE_NO_MATCH, SEARCH_URL,
# PLAYER_URL, and LINK_URL, are provided by the surrounding add-on framework.
import base64
import re
import time
import urllib
import urlparse


def __tv_search(self, title, year):
    results = []
    if title:
        norm_title = scraper_utils.normalize_title(title)
        url = '/series/letra/%s/' % (title[0])
        url = urlparse.urljoin(self.base_url, url)
        html = self._http_get(url, cache_limit=48)
        for item in dom_parser.parse_dom(html, 'li', {'class': '[^"]*bpM12[^"]*'}):
            title_frag = dom_parser.parse_dom(item, 'h2')
            year_frag = dom_parser.parse_dom(item, 'div', {'class': '[^"]*sectionDetail[^"]*'})
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            if title_frag and match_url:
                match_url = match_url[0]
                match = re.search('(.*?)<br>', title_frag[0])
                if match:
                    match_title = match.group(1)
                else:
                    match_title = title_frag[0]
                match_year = ''
                if year_frag:
                    match = re.search('(\d{4})', year_frag[0])
                    if match:
                        match_year = match.group(1)
                if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

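# --- Assumed helper interfaces (reference only, not part of the original) ----
# Signatures below are inferred from how the methods in this file call them,
# so treat them as assumptions rather than the framework's documented API:
#
#   dom_parser.parse_dom(html, tag, attrs=None, ret=None)
#       returns a list of inner-HTML strings for matching tags, or a list of
#       attribute values when `ret` names an attribute; attrs values may be
#       regular expressions.
#   scraper_utils.normalize_title(title)  -> canonical form for fuzzy matching
#   scraper_utils.cleanse_title(title)    -> cleaned-up display title
#   scraper_utils.pathify_url(url)        -> URL reduced to a site-relative path
#   scraper_utils.get_quality(video, host, base_quality),
#   scraper_utils.height_get_quality(height),
#   scraper_utils.gv_get_quality(stream_url)
#       each map a source to one of the QUALITIES constants.
# ------------------------------------------------------------------------------
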
def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*movie_langs_list[^"]*'})
        if fragment:
            for match in re.finditer('href="([^"]+)', fragment[0]):
                match = re.search('movie-player/(.*)', match.group(1))
                if match:
                    player_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                    html = self._http_get(player_url, cache_limit=.5)
                    match = re.search('<source\s+src="([^"]+)', html)
                    if match:
                        stream_url = match.group(1)
                        hoster = {'multi-part': False, 'url': stream_url, 'class': self, 'quality': self._gv_get_quality(stream_url),
                                  'host': self._get_direct_hostname(stream_url), 'rating': None, 'views': None, 'direct': True}
                        hosters.append(hoster)
                    fragment2 = dom_parser.parse_dom(html, 'ul', {'class': 'servers'})
                    if fragment2:
                        for match in re.finditer('href="([^"]+).*?<span>(.*?)</span>', fragment2[0]):
                            other_url, quality = match.groups()
                            match = re.search('movie-player/(.*)', other_url)
                            if match:
                                other_url = urlparse.urljoin(self.base_url, PLAYER_URL % (match.group(1)))
                                if other_url == player_url:
                                    continue
                                hoster = {'multi-part': False, 'url': other_url, 'class': self, 'quality': QUALITY_MAP.get(quality, QUALITIES.HD720),
                                          'host': self._get_direct_hostname(other_url), 'rating': None, 'views': None, 'direct': True}
                                hosters.append(hoster)
    return hosters

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/index.php?menu=search&query=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}
    fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
            match = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
            if match:
                match_title_year, url = match.groups()
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if match_title.startswith('Watch '):
                    match_title = match_title.replace('Watch ', '')
                if match_title.endswith(' Online'):
                    match_title = match_title.replace(' Online', '')
                if not year or not match_year or year == match_year:
                    result = {'title': match_title, 'url': scraper_utils.pathify_url(url), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/?query=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    info = dom_parser.parse_dom(html, 'div', {'class': 'movie-info'})
    for item in info:
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'movie-title'})
        match_year = dom_parser.parse_dom(item, 'span', {'class': 'movie-year'})
        if match_title:
            match_title = self.__strip_link(match_title[0])
        if match_year:
            match_year = self.__strip_link(match_year[0])
        else:
            match_year = ''
        match = re.search('href="([^"]+)', item)
        if match:
            url = match.group(1)
        else:
            continue
        if not year or not match_year or year == match_year:
            result = {'title': match_title, 'year': match_year, 'url': url.replace(self.base_url, '')}
            results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        fragment = dom_parser.parse_dom(html, 'tbody')
        if fragment:
            links = dom_parser.parse_dom(fragment[0], 'a', ret='href')
            domains = dom_parser.parse_dom(fragment[0], 'a')
            for link, host in zip(links, domains):
                host = re.sub('</?span[^>]*>', '', host)
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH),
                          'views': None, 'rating': None, 'url': link, 'direct': False}
                hosters.append(hoster)
    return hosters

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        q_str = ''
        match = re.search('class="calishow">([^<]+)', html)
        if match:
            q_str = match.group(1)
        else:
            match = re.search('<a[^>]*href="#embed\d*"[^>]+>([^<]+)', html)
            if match:
                q_str = match.group(1)
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'tab-content'})
        if fragment:
            for source in dom_parser.parse_dom(fragment[0], 'iframe', ret='src'):
                host = urlparse.urlparse(source).hostname
                quality = scraper_utils.blog_get_quality(video, q_str, host)
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': source, 'direct': False}
                hosters.append(hoster)
        fragment = dom_parser.parse_dom(html, 'div', {'id': 'olmt'})
        if fragment:
            hosters += self.__get_links(video, fragment[0])
        fragment = dom_parser.parse_dom(html, 'div', {'id': 'dlnmt'})
        if fragment:
            hosters += self.__get_links(video, fragment[0])
        # de-duplicate sources by stream URL
        hosters = dict((stream['url'], stream) for stream in hosters).values()
    return hosters

def __movie_search(self, title, year):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title = dom_parser.parse_dom(item, 'img', ret='alt')
        match_year = ''  # the result page exposes no year
        if match_url and match_title:
            match_url = match_url[0]
            match_title = match_title[0]
            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def __get_gk_links(self, html, page_url):
    sources = {}
    for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*server_line[^"]*'}):
        film_id = dom_parser.parse_dom(link, 'a', ret='data-film')
        name_id = dom_parser.parse_dom(link, 'a', ret='data-name')
        server_id = dom_parser.parse_dom(link, 'a', ret='data-server')
        if film_id and name_id and server_id:
            data = {'ipplugins': 1, 'ip_film': film_id[0], 'ip_server': server_id[0], 'ip_name': name_id[0]}
            headers = XHR
            headers['Referer'] = page_url
            url = urlparse.urljoin(self.base_url, LINK_URL)
            html = self._http_get(url, data=data, headers=headers, cache_limit=.25)
            js_data = scraper_utils.parse_json(html, url)
            if 's' in js_data:
                if isinstance(js_data['s'], basestring):
                    sources[js_data['s']] = QUALITIES.HIGH
                else:
                    for link in js_data['s']:
                        stream_url = link['file']
                        if self._get_direct_hostname(stream_url) == 'gvideo':
                            quality = scraper_utils.gv_get_quality(stream_url)
                        elif 'label' in link:
                            quality = scraper_utils.height_get_quality(link['label'])
                        else:
                            quality = QUALITIES.HIGH
                        sources[stream_url] = quality
    return sources

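# __get_gk_links() above returns a {stream_url: quality} map rather than the
# hoster dicts used by the get_sources() methods in this file. A minimal
# sketch of how a caller might fold that map into a hoster list; the helper
# name below is hypothetical, and the field layout is copied from the
# get_sources() methods above:
def __gk_sources_to_hosters(self, html, page_url):
    hosters = []
    for stream_url, quality in self.__get_gk_links(html, page_url).iteritems():
        host = self._get_direct_hostname(stream_url)
        hoster = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self,
                  'quality': quality, 'views': None, 'rating': None, 'direct': True}
        hosters.append(hoster)
    return hosters
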
def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search-movies/%s.html')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=0)
    results = []
    for thumb in dom_parser.parse_dom(html, 'div', {'class': 'thumb'}):
        match_title = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='title')
        url = dom_parser.parse_dom(thumb, 'a', {'class': 'clip-link'}, ret='href')
        if match_title and url:
            match_title, url = match_title[0], url[0]
            is_season = re.search('Season\s+(\d+)$', match_title, re.I)
            # note: the original tested the bare constant `VIDEO_TYPES.SEASON` (always truthy)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE:
                    match_year = dom_parser.parse_dom(thumb, 'div', {'class': '[^"]*status-year[^"]*'})
                    if match_year:
                        match_year = match_year[0]
                else:
                    if season and int(is_season.group(1)) != int(season):
                        continue
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/movie/search/')
    search_url += title
    html = self._http_get(search_url, cache_limit=1)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        match_url = re.search('href="([^"]+)', item, re.DOTALL)
        match_year = re.search('class="jt-info">(\d{4})<', item)
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
        if match_title and match_url and not is_episodes:
            match_title = match_title[0]
            match_title = re.sub('</?h2>', '', match_title)
            match_title = re.sub('\s+\d{4}$', '', match_title)
            url = urlparse.urljoin(match_url.group(1), 'watching.html')
            match_year = match_year.group(1) if match_year else ''
            if not year or not match_year or year == match_year:
                result = {'title': match_title, 'year': match_year, 'url': self._pathify_url(url)}
                results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        html = html.decode('utf-8', 'ignore')
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'list-wrap'})
        if fragment:
            for stream_url in dom_parser.parse_dom(fragment[0], 'iframe', ret='src'):
                host = urlparse.urlparse(stream_url).hostname
                hoster = {'multi-part': False, 'host': host, 'url': stream_url, 'class': self,
                          'rating': None, 'views': None, 'quality': QUALITIES.HIGH, 'direct': True}
                hosters.append(hoster)
    return hosters

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/arsiv?limit=&tur=&orderby=&ulke=&order=&yil=&dizi_adi=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'tv-series-single'}):
        try:
            url = re.search('href="([^"]+)', item).group(1)
        except:
            url = ''
        try:
            match_year = re.search('<span>\s*(\d{4})\s*</span>', item).group(1)
        except:
            match_year = ''
        try:
            match_title = dom_parser.parse_dom(item, 'a', {'class': 'title'})
            # the original dropped this assignment, leaving match_title a list
            match_title = re.search('([^>]+)$', match_title[0]).group(1)
        except:
            match_title = ''
        if url and match_title and (not year or not match_year or year == match_year):
            result = {'url': url.replace(self.base_url, ''), 'title': match_title, 'year': ''}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.__get_base_url(video_type), '/search/%s.html' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    fragment = dom_parser.parse_dom(html, 'ul', {'class': 'cfv'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'li'):
            is_season = dom_parser.parse_dom(item, 'div', {'class': 'status'})
            # note: the original tested the bare constant `VIDEO_TYPES.SEASON` (always truthy)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                match_url = dom_parser.parse_dom(item, 'a', ret='href')
                match_title = dom_parser.parse_dom(item, 'a', ret='title')
                if match_url and match_title:
                    match_title = match_title[0]
                    match_url = match_url[0]
                    match_year = ''
                    if video_type == VIDEO_TYPES.SEASON:
                        if season and not re.search('Season\s+%s$' % (season), match_title, re.I):
                            continue
                    else:
                        match = re.search('-(\d{4})\.html', match_url)
                        if match:
                            match_year = match.group(1)
                    if not year or not match_year or year == match_year:
                        result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                        results.append(result)
    return results

def __get_gk_links(self, html, page_url, video_type, episode):
    sources = {}
    phimid = dom_parser.parse_dom(html, 'input', {'name': 'phimid'}, ret='value')
    if phimid and video_type == VIDEO_TYPES.EPISODE:
        url = urlparse.urljoin(self.tv_base_url, '/ajax.php')
        data = {'ipos_server': 1, 'phimid': phimid[0], 'keyurl': episode}
        headers = XHR
        headers['Referer'] = page_url
        html = self._http_get(url, data=data, headers=headers, cache_limit=.5)
    for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*server_line[^"]*'}):
        film_id = dom_parser.parse_dom(link, 'a', ret='data-film')
        name_id = dom_parser.parse_dom(link, 'a', ret='data-name')
        server_id = dom_parser.parse_dom(link, 'a', ret='data-server')
        if film_id and name_id and server_id:
            data = {'ipplugins': 1, 'ip_film': film_id[0], 'ip_server': server_id[0], 'ip_name': name_id[0]}
            headers = XHR
            headers['Referer'] = page_url
            url = urlparse.urljoin(self.__get_base_url(video_type), LINK_URL)
            html = self._http_get(url, data=data, headers=headers, cache_limit=.25)
            js_data = scraper_utils.parse_json(html, url)
            if 's' in js_data:
                if isinstance(js_data['s'], basestring):
                    sources[js_data['s']] = QUALITIES.HIGH
                else:
                    for link in js_data['s']:
                        stream_url = link['file']
                        if self._get_direct_hostname(stream_url) == 'gvideo':
                            quality = scraper_utils.gv_get_quality(stream_url)
                        elif 'label' in link:
                            quality = scraper_utils.height_get_quality(link['label'])
                        else:
                            quality = QUALITIES.HIGH
                        sources[stream_url] = quality
    return sources

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL)
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, headers=XHR, cache_limit=1)
    for film in dom_parser.parse_dom(html, 'li', {'class': 'films-item'}):
        match_url = dom_parser.parse_dom(film, 'a', ret='href')
        match_title = dom_parser.parse_dom(film, 'div', {'class': 'films-item-title'})
        match_year = dom_parser.parse_dom(film, 'div', {'class': 'films-item-year'})
        if match_url and match_title:
            match_url = match_url[0]
            match_title = match_title[0]
            match_title = re.sub('</?span>', '', match_title)
            if match_year:
                match = re.search('(\d+)', match_year[0])
                if match:
                    match_year = match.group(1)
                else:
                    match_year = ''
            else:
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': match_url}
                results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*screen[^"]*'})
        if fragment:
            js_src = dom_parser.parse_dom(fragment[0], 'script', ret='src')
            if js_src:
                js_url = urlparse.urljoin(self.base_url, js_src[0])
                html = self._http_get(js_url, cache_limit=.5)
            else:
                html = fragment[0]
            for match in re.finditer('<source[^>]+src="([^"]+)', html):
                stream_url = match.group(1)
                host = self._get_direct_hostname(stream_url)
                if host == 'gvideo':
                    quality = scraper_utils.gv_get_quality(stream_url)
                else:
                    _, _, height, _ = scraper_utils.parse_movie_link(stream_url)
                    quality = scraper_utils.height_get_quality(height)
                stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                hosters.append(hoster)
    return hosters

def search(self, video_type, title, year, season=''):
    results = []
    if video_type == VIDEO_TYPES.MOVIE:
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus('%s' % (title))
        html = self._http_get(search_url, cache_limit=1)
        links = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'}, 'href')
        titles = dom_parser.parse_dom(html, 'a', {'class': 'clip-link'}, 'title')
        matches = zip(links, titles)
    else:
        html = self._http_get(self.base_url, cache_limit=8)
        matches = re.findall('<li\s+class="cat-item[^>]+>\s*<a\s+href="([^"]+)[^>]+>([^<]+)', html)
    norm_title = scraper_utils.normalize_title(title)
    for item in matches:
        url, match_title_year = item
        match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
        if match:
            match_title, match_year = match.groups()
        else:
            match_title = match_title_year
            match_year = ''
        if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
            result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
            results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        q_str = dom_parser.parse_dom(html, 'span', {'class': 'calidad\d*'})
        if q_str:
            if q_str[0].upper() == 'COMING SOON':
                return hosters
            try:
                quality = self._height_get_quality(q_str[0])
            except:
                quality = QUALITIES.HIGH
        else:
            quality = QUALITIES.HIGH
        fragment = dom_parser.parse_dom(html, 'div', {'id': 'player\d+'})
        if fragment:
            for match in re.finditer('<iframe[^>]+src="([^"]+)', fragment[0], re.I):
                stream_url = match.group(1)
                host = urlparse.urlparse(stream_url).hostname
                hoster = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'direct': False}
                hosters.append(hoster)
    return hosters

def get_sources(self, video):
    source_url = self.get_url(video)
    sources = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        for item in dom_parser.parse_dom(html, 'li', {'class': 'elemento'}):
            match = re.search('href="([^"]+)', item)
            if match:
                stream_url = match.group(1)
                q_str = dom_parser.parse_dom(item, 'span', {'class': 'd'})
                q_str = q_str[0].upper() if q_str else ''
                base_quality = QUALITY_MAP.get(q_str, QUALITIES.HIGH)
                host = urlparse.urlparse(stream_url).hostname
                quality = scraper_utils.get_quality(video, host, base_quality)
                source = {'multi-part': False, 'url': stream_url, 'host': host, 'class': self,
                          'quality': quality, 'views': None, 'rating': None, 'direct': False}
                sources.append(source)
    return sources

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
            stream_url = dom_parser.parse_dom(link, 'a', ret='data-actuallink')
            views = None
            watched = dom_parser.parse_dom(link, 'div', {'class': 'click-count'})
            if watched:
                match = re.search(' (\d+) ', watched[0])
                if match:
                    views = match.group(1)
            score = dom_parser.parse_dom(link, 'div', {'class': '\s*point\s*'})
            if score:
                score = int(score[0])
            rating = score * 10 if score else None
            if stream_url:
                stream_url = stream_url[0].strip()
                host = urlparse.urlparse(stream_url).hostname
                quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': views, 'rating': rating, 'url': stream_url, 'direct': False}
                hosters.append(hoster)
    return hosters

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=1)
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'player'})
        if fragment:
            iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
            if iframe_url:
                html = self._http_get(iframe_url[0], cache_limit=.5)
                # if captions exist, then they aren't hardcoded
                if re.search('kind\s*:\s*"captions"', html):
                    subs = False
                else:
                    subs = True
                match = re.search('sources\s*:\s*\[(.*?)\]', html)
                if match:
                    for match in re.finditer('"file"\s*:\s*"([^"]+)', match.group(1)):
                        stream_url = match.group(1)
                        if self._get_direct_hostname(stream_url) == 'gvideo':
                            quality = self._gv_get_quality(stream_url)
                            hoster = {'multi-part': False, 'host': self._get_direct_hostname(stream_url), 'class': self, 'quality': quality,
                                      'views': None, 'rating': None, 'url': stream_url, 'direct': True, 'subs': subs}
                            hosters.append(hoster)
    return hosters

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/arsiv?limit=&tur=&orderby=&ulke=&order=&yil=&dizi_adi=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'tv-series-single'}):
        try:
            url = re.search('href="([^"]+)', item).group(1)
        except:
            url = ''
        try:
            match_year = re.search('<span>\s*(\d{4})\s*</span>', item).group(1)
        except:
            match_year = ''
        try:
            match_title = dom_parser.parse_dom(item, 'a', {'class': 'title'})
            match_title = re.search('([^>]+)$', match_title[0]).group(1)
            match_title = match_title.strip()
        except:
            match_title = ''
        if url and match_title and (not year or not match_year or year == match_year):
            result = {'url': self._pathify_url(url), 'title': match_title, 'year': ''}
            results.append(result)
    return results

def search(self, video_type, title, year):
    results = []
    search_url = urlparse.urljoin(self.__get_base_url(video_type), '/?s=%s' % (urllib.quote_plus(title)))
    html = self._http_get(search_url, cache_limit=1)
    for movie in dom_parser.parse_dom(html, 'div', {'class': 'movie'}):
        match = re.search('href="([^"]+)', movie)
        if match:
            match_url = match.group(1)
            if re.search('season-\d+-episode-\d+', match_url):
                continue  # skip direct episode links
            match_title_year = dom_parser.parse_dom(movie, 'img', ret='alt')
            if match_title_year:
                match_title_year = match_title_year[0]
                match = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = dom_parser.parse_dom(movie, 'div', {'class': 'year'})
                    try:
                        match_year = match_year[0]
                    except:
                        match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': self._pathify_url(match_url), 'title': match_title, 'year': match_year}
                    results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        # pick the subtitle-free stream option ("Altyazısız" is Turkish for "without subtitles")
        match = re.search('''<option[^>]+value\s*=\s*["']([^"']+)[^>]*>(?:Altyaz.{1,3}s.{1,3}z)<''', html)
        if match:
            option_url = urlparse.urljoin(self.base_url, match.group(1))
            html = self._http_get(option_url, cache_limit=.25)
            fragment = dom_parser.parse_dom(html, 'span', {'class': 'object-wrapper'})
            if fragment:
                iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
                if iframe_url:
                    html = self._http_get(iframe_url[0], cache_limit=.25)
                    seen_urls = {}
                    for match in re.finditer('"?file"?\s*:\s*"([^"]+)"\s*,\s*"?label"?\s*:\s*"(\d+)p?[^"]*"', html):
                        stream_url, height = match.groups()
                        if stream_url not in seen_urls:
                            seen_urls[stream_url] = True
                            stream_url += '|User-Agent=%s' % (scraper_utils.get_ua())
                            host = self._get_direct_hostname(stream_url)
                            if host == 'gvideo':
                                quality = scraper_utils.gv_get_quality(stream_url)
                            else:
                                quality = scraper_utils.height_get_quality(height)
                            hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': None, 'rating': None, 'url': stream_url, 'direct': True}
                            hosters.append(hoster)
    return hosters

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL)
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, headers=XHR, cache_limit=1)
    for film in dom_parser.parse_dom(html, 'li', {'class': 'films-item'}):
        match_url = dom_parser.parse_dom(film, 'a', ret='href')
        match_title = dom_parser.parse_dom(film, 'div', {'class': 'films-item-title'})
        match_year = dom_parser.parse_dom(film, 'div', {'class': 'films-item-year'})
        if match_url and match_title:
            match_url = match_url[0]
            match_title = match_title[0]
            match_title = re.sub('</?span>', '', match_title)
            if match_year:
                match = re.search('(\d+)', match_year[0])
                if match:
                    match_year = match.group(1)
                else:
                    match_year = ''
            else:
                match_year = ''
            if not year or not match_year or year == match_year:
                result = {'title': match_title, 'year': match_year, 'url': match_url}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+)', item)
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
        if match and match_title:
            url = match.group(1)
            match_title = match_title[0]
            if re.search('\d+\s*x\s*\d+', match_title):
                continue  # exclude episodes
            match = re.search('(.*?)\s+\((\d{4})\)', match_title)
            if match:
                match_title, match_year = match.groups()
            else:
                match_year = ''
            if year_frag:
                match_year = year_frag[0]
            if not year or not match_year or year == match_year:
                result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(url)}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), int(time.time())))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    items = dom_parser.parse_dom(html, 'li')
    if len(items) >= 2:
        items = items[1:]
    for item in items:
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title_year = dom_parser.parse_dom(item, 'strong')
        if match_url and match_title_year:
            match_url = match_url[0]
            match_title_year = re.sub('</?strong>', '', match_title_year[0])
            is_season = re.search('S(?:eason\s+)?(\d+)$', match_title_year, re.I)
            # note: the original tested the bare constant `VIDEO_TYPES.SEASON` (always truthy)
            if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                if video_type == VIDEO_TYPES.MOVIE:
                    match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
                    if match:
                        match_title, match_year = match.groups()
                    else:
                        match_title = match_title_year
                        match_year = ''
                else:
                    log_utils.log(is_season.group(1))
                    if season and int(is_season.group(1)) != int(season):
                        continue
                    match_title = match_title_year
                    match_year = ''
                result = {'title': match_title, 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html')
    search_url = search_url % (urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=1)
    results = []
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'movie'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'li'):
            match_url = dom_parser.parse_dom(item, 'a', ret='href')
            match_title = dom_parser.parse_dom(item, 'span', {'class': 'text'})
            match_year = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if match_url and match_title:
                match_url = match_url[0]
                match_title = re.sub('</?strong>', '', match_title[0])
                is_season = re.search('Season\s+(\d+)$', match_title, re.I)
                if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                    if video_type == VIDEO_TYPES.MOVIE:
                        if match_year:
                            match_year = match_year[0]
                        else:
                            match_year = ''
                    else:
                        if season and int(is_season.group(1)) != int(season):
                            continue
                        match_year = ''
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                    results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    sxe = '(\.|_| )S%02dE%02d(\.|_| )' % (int(video.season), int(video.episode))
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    try:
        airdate_pattern = video.ep_airdate.strftime('(\.|_| )%Y(\.|_| )%m(\.|_| )%d(\.|_| )')
    except:
        airdate_pattern = ''
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for post in posts:
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                match = re.search('<a\s+href="([^"]+)[^>]+>(.*?)</a>', post)
                if match:
                    url, title = match.groups()
                    if not force_title:
                        if re.search(sxe, title) or (airdate_pattern and re.search(airdate_pattern, title)):
                            return scraper_utils.pathify_url(url)
                    else:
                        if title_fallback and norm_title:
                            match = re.search('</strong>(.*?)</p>', post)
                            if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                                return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(url, cache_limit=.5)
        fragment = dom_parser.parse_dom(html, 'div', {'class': 'repro'})
        if fragment:
            iframe_url = dom_parser.parse_dom(fragment[0], 'iframe', ret='src')
            if iframe_url:
                html = self._http_get(iframe_url[0], cache_limit=.5)
                fragment = dom_parser.parse_dom(html, 'div', {'id': 'botones'})
                if fragment:
                    for media_url in dom_parser.parse_dom(fragment[0], 'a', ret='href'):
                        if self.base_url in media_url or 'pelispedia.biz' in media_url:
                            headers = {'Referer': iframe_url[0]}
                            html = self._http_get(media_url, headers=headers, cache_limit=.5)
                            hosters += self.__get_page_links(html)
                            hosters += self.__get_pk_links(html)
                            hosters += self.__get_gk_links(html, url)
                        else:
                            host = urlparse.urlparse(media_url).hostname
                            hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': QUALITIES.HD720,
                                      'views': None, 'rating': None, 'url': media_url, 'direct': False}
                            hosters.append(hoster)
    return hosters

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search/?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'ml-item'}):
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'mli-info'})
        match_url = re.search('href="([^"]+)', item, re.DOTALL)
        year_frag = dom_parser.parse_dom(item, 'img', ret='alt')
        is_episodes = dom_parser.parse_dom(item, 'span', {'class': 'mli-eps'})
        if (video_type == VIDEO_TYPES.MOVIE and not is_episodes) or (video_type == VIDEO_TYPES.SEASON and is_episodes):
            if match_title and match_url:
                match_url = match_url.group(1)
                match_title = match_title[0]
                match_title = re.sub('</?h2>', '', match_title)
                match_title = re.sub('\s+\d{4}$', '', match_title)
                if video_type == VIDEO_TYPES.SEASON:
                    if season and not re.search('Season\s+%s$' % (season), match_title):
                        continue
                if not match_url.endswith('/'):
                    match_url += '/'
                match_url = urlparse.urljoin(match_url, 'watch/')
                match_year = ''
                if video_type == VIDEO_TYPES.MOVIE and year_frag:
                    match = re.search('\s*-\s*(\d{4})$', year_frag[0])
                    if match:
                        match_year = match.group(1)
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'year': match_year, 'url': scraper_utils.pathify_url(match_url)}
                    results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    sxe = '(\.|_| )S%02dE%02d(\.|_| )' % (int(video.season), int(video.episode))
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    try:
        airdate_pattern = video.ep_airdate.strftime('(\.|_| )%Y(\.|_| )%m(\.|_| )%d(\.|_| )')
    except:
        airdate_pattern = ''
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, cache_limit=1)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for heading, post in zip(headings, posts):
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                url, title = heading
                if not force_title:
                    if re.search(sxe, title) or (airdate_pattern and re.search(airdate_pattern, title)):
                        return scraper_utils.pathify_url(url)
                else:
                    if title_fallback and norm_title:
                        match = re.search('</strong>(.*?)</p>', post)
                        if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                            return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/index.php?menu=search&query=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    sections = {VIDEO_TYPES.MOVIE: 'movies', VIDEO_TYPES.TVSHOW: 'series'}
    fragment = dom_parser.parse_dom(html, 'div', {'id': sections[video_type]})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'figcaption'):
            match = re.search('title="([^"]+)[^>]+href="([^"]+)', item)
            if match:
                match_title_year, url = match.groups()
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if match_title.startswith('Watch '):
                    match_title = match_title.replace('Watch ', '')
                if match_title.endswith(' Online'):
                    match_title = match_title.replace(' Online', '')
                if not year or not match_year or year == match_year:
                    result = {'title': match_title, 'url': self._pathify_url(url), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search?q=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    norm_title = scraper_utils.normalize_title(title)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'video_item'}):
        match_url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title = dom_parser.parse_dom(item, 'img', ret='alt')
        match_year = ''  # the result page exposes no year
        if match_url and match_title:
            match_url = match_url[0]
            match_title = match_title[0]
            # the original compared the VIDEO_TYPES module to itself; video_type is intended
            if video_type == VIDEO_TYPES.TVSHOW and '/tv-series/' not in match_url:
                continue
            if norm_title in scraper_utils.normalize_title(match_title) and (not year or not match_year or year == match_year):
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/search.php?q=%s&limit=20&timestamp=%s' % (urllib.quote_plus(title), time.time()))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    items = dom_parser.parse_dom(html, 'li')
    if len(items) >= 2:
        items = items[1:]
    for item in items:
        url = dom_parser.parse_dom(item, 'a', ret='href')
        match_title_year = dom_parser.parse_dom(item, 'strong')
        if url and match_title_year:
            url = url[0]
            match_title_year = match_title_year[0].replace('<strong>', '').replace('</strong>', '')
            match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                match_year = ''
            result = {'title': match_title, 'year': match_year, 'url': url.replace(self.base_url, '')}
            results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/search/')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    for fragment in dom_parser.parse_dom(html, 'div', {'class': 'inner'}):
        name = dom_parser.parse_dom(fragment, 'div', {'class': 'name'})
        if name:
            match = re.search('href="([^"]+)[^>]+>(.*?)</a>', name[0])
            if match:
                match_url, match_title_year = match.groups()
                if 'tv-series' in match_url and video_type == VIDEO_TYPES.MOVIE:
                    continue
                match_title_year = re.sub('</?[^>]*>', '', match_title_year)
                match_title_year = re.sub('[Ww]atch\s+[Mm]ovie\s*', '', match_title_year)
                match_title_year = match_title_year.replace('&#8217;', "'")
                match = re.search('(.*?)\s+\((\d{4})[^)]*\)$', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not match_year:
                    year_span = dom_parser.parse_dom(fragment, 'span', {'class': 'year'})
                    if year_span:
                        year_text = dom_parser.parse_dom(year_span[0], 'a')
                        if year_text:
                            match_year = year_text[0].strip()
                if not year or not match_year or year == match_year:
                    result = {'title': scraper_utils.cleanse_title(match_title), 'url': scraper_utils.pathify_url(match_url), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    title_strip = [word.decode('utf-8') for word in TITLE_STRIP]
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match_url = re.search('href="([^"]+)', item)
        match_title = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
        if match_url and match_title:
            item_type = dom_parser.parse_dom(item, 'span', {'class': 'calidad2'})
            if item_type and item_type[0] in SEARCH_EXCLUDE:
                continue
            match_url = match_url.group(1)
            match_title = match_title[0]
            if 'SEZON' in match_title.upper():
                continue
            year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
            if year_frag:
                match_year = year_frag[0]
            else:
                match_year = ''
            match_title = ' '.join([word for word in match_title.split() if word.upper() not in title_strip])
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                results.append(result)
    return results

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        for link in dom_parser.parse_dom(html, 'div', {'class': '[^"]*ldr-item[^"]*'}):
            stream_url = dom_parser.parse_dom(link, 'a', ret='data-actuallink')
            views = None
            watched = dom_parser.parse_dom(link, 'div', {'class': 'click-count'})
            if watched:
                match = re.search(' (\d+) ', watched[0])
                if match:
                    views = match.group(1)
            score = dom_parser.parse_dom(link, 'div', {'class': '\s*point\s*'})
            if score:
                score = int(score[0])
            rating = score * 10 if score else None
            if stream_url:
                stream_url = stream_url[0]
                host = urlparse.urlparse(stream_url).hostname
                quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': views, 'rating': rating, 'url': stream_url, 'direct': False}
                hosters.append(hoster)
    return hosters

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        page_url = urlparse.urljoin(self.base_url, source_url)
        html = self._http_get(page_url, cache_limit=.25)
        for item in dom_parser.parse_dom(html, 'div', {'class': 'stream-table__row'}):
            stream_url = dom_parser.parse_dom(item, 'a', ret='href')
            match = re.search('Views:\s*(?:</span>)?\s*(\d+)', item, re.I)
            if match:
                views = match.group(1)
            else:
                views = None
            match = re.search('Size:\s*(?:</span>)?\s*(\d+)', item, re.I)
            if match:
                size = int(match.group(1)) * 1024 * 1024
            else:
                size = None
            if stream_url:
                stream_url = stream_url[0]
                match = re.search('/redirect/(.*)', stream_url)
                if match:
                    stream_url = base64.decodestring(urllib.unquote(match.group(1)))
                host = urlparse.urlparse(stream_url).hostname
                if host:
                    quality = scraper_utils.get_quality(video, host, QUALITIES.HIGH)
                    hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': quality, 'views': views, 'rating': None, 'url': stream_url, 'direct': False}
                    if size is not None:
                        hoster['size'] = scraper_utils.format_size(size, 'B')
                    hosters.append(hoster)
    return hosters

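# A quick round-trip check of the /redirect/ token decoding used above: the
# tokens are urlencoded base64. The example URL and the helper name below are
# made up for illustration; encodestring/decodestring are the Python 2
# base64 helpers that get_sources() relies on.
def __check_redirect_decoding():
    original = 'http://example.com/stream.mp4'
    token = urllib.quote(base64.encodestring(original))
    assert base64.decodestring(urllib.unquote(token)) == original
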
def _get_episode_url(self, show_url, video):
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=2)
    if html:
        force_title = scraper_utils.force_title(video)
        episodes = dom_parser.parse_dom(html, 'div', {'class': '\s*el-item\s*'})
        if not force_title:
            episode_pattern = 'href="([^"]*-[sS]%02d[eE]%02d(?!\d)[^"]*)' % (int(video.season), int(video.episode))
            match = re.search(episode_pattern, html)
            if match:
                return scraper_utils.pathify_url(match.group(1))
            if kodi.get_setting('airdate-fallback') == 'true' and video.ep_airdate:
                airdate_pattern = '%02d-%02d-%d' % (video.ep_airdate.day, video.ep_airdate.month, video.ep_airdate.year)
                for episode in episodes:
                    ep_url = dom_parser.parse_dom(episode, 'a', ret='href')
                    ep_airdate = dom_parser.parse_dom(episode, 'div', {'class': 'date'})
                    if ep_url and ep_airdate:
                        ep_airdate = ep_airdate[0].strip()
                        if airdate_pattern == ep_airdate:
                            return scraper_utils.pathify_url(ep_url[0])
        if (force_title or kodi.get_setting('title-fallback') == 'true') and video.ep_title:
            norm_title = scraper_utils.normalize_title(video.ep_title)
            for episode in episodes:
                ep_url = dom_parser.parse_dom(episode, 'a', ret='href')
                ep_title = dom_parser.parse_dom(episode, 'div', {'class': 'e-name'})
                if ep_url and ep_title and norm_title == scraper_utils.normalize_title(ep_title[0]):
                    return scraper_utils.pathify_url(ep_url[0])

def _get_episode_url(self, show_url, video):
    url = urlparse.urljoin(self.base_url, show_url)
    html = self._http_get(url, cache_limit=1)
    data_id = dom_parser.parse_dom(html, 'div', {'id': 'dizidetay'}, ret='data-id')
    data_dizi = dom_parser.parse_dom(html, 'div', {'id': 'dizidetay'}, ret='data-dizi')
    if data_id and data_dizi:
        queries = {'sekme': 'bolumler', 'id': data_id[0], 'dizi': data_dizi[0]}
        season_url = SEASON_URL + '?' + urllib.urlencode(queries)
        episode_pattern = '''href=['"]([^'"]*/%s-sezon-%s-[^'"]*bolum[^'"]*)''' % (video.season, video.episode)
        title_pattern = '''href=['"](?P<url>[^'"]+)[^>]*>(?P<title>[^<]+)'''
        airdate_pattern = '''href=['"]([^"']+)[^>]*>[^<]*</a>\s*</td>\s*<td class="right aligned">{p_day}\.{p_month}\.{year}'''
        headers = XHR
        headers['Content-Length'] = 0
        headers['Referer'] = url
        result = self._default_get_episode_url(season_url, video, episode_pattern, title_pattern, airdate_pattern, headers=headers, method='POST')
        if result and 'javascript:;' not in result:
            return result

def get_sources(self, video):
    source_url = self.get_url(video)
    hosters = []
    if source_url and source_url != FORCE_NO_MATCH:
        url = urlparse.urljoin(self.base_url, source_url)
        entry = ''
        while True:
            html = self._http_get(url, cache_limit=.5)
            if not html:
                url = urlparse.urljoin(BASE_URL2, source_url)
                html = self._http_get(url, cache_limit=.5)
            entry = dom_parser.parse_dom(html, 'div', {'class': 'entry'})
            if entry:
                entry = entry[0]
                match = re.search('Watch it here\s*:.*?href="([^"]+)', entry, re.I)
                if match:
                    url = match.group(1)
                else:
                    break
            else:
                entry = ''
                break
        for tab in dom_parser.parse_dom(entry, 'div', {'class': '''[^'"]*postTabs_divs[^'"]*'''}):
            match = re.search('<iframe[^>]*src="([^"]+)', tab, re.I | re.DOTALL)
            if match:
                link = match.group(1)
                host = urlparse.urlparse(link).hostname
                hoster = {'multi-part': False, 'host': host, 'class': self, 'quality': scraper_utils.get_quality(video, host, QUALITIES.HIGH),
                          'views': None, 'rating': None, 'url': link, 'direct': False}
                hosters.append(hoster)
    return hosters

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL)
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*items[^"]*'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'item'}):
            match_url = dom_parser.parse_dom(item, 'a', {'class': 'header'}, ret='href')
            match_title_year = dom_parser.parse_dom(item, 'a', {'class': 'header'})
            if match_url and match_title_year:
                match_url = match_url[0]
                match_title_year = match_title_year[0]
                r = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
                if r:
                    match_title, match_year = r.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def _get_episode_url(self, show_url, video):
    sxe = '.S%02dE%02d.' % (int(video.season), int(video.episode))
    force_title = scraper_utils.force_title(video)
    title_fallback = kodi.get_setting('title-fallback') == 'true'
    norm_title = scraper_utils.normalize_title(video.ep_title)
    try:
        ep_airdate = video.ep_airdate.strftime('.%Y.%m.%d.')
    except:
        ep_airdate = ''
    page_url = [show_url]
    too_old = False
    while page_url and not too_old:
        url = urlparse.urljoin(self.base_url, page_url[0])
        html = self._http_get(url, require_debrid=True, cache_limit=1)
        headings = re.findall('<h2>\s*<a\s+href="([^"]+)[^>]+>(.*?)</a>', html)
        posts = dom_parser.parse_dom(html, 'div', {'id': 'post-\d+'})
        for heading, post in zip(headings, posts):
            if self.__too_old(post):
                too_old = True
                break
            if CATEGORIES[VIDEO_TYPES.TVSHOW] in post and show_url in post:
                url, title = heading
                if not force_title:
                    if (sxe in title) or (ep_airdate and ep_airdate in title):
                        return scraper_utils.pathify_url(url)
                else:
                    if title_fallback and norm_title:
                        match = re.search('<strong>(.*?)</strong>', post)
                        if match and norm_title == scraper_utils.normalize_title(match.group(1)):
                            return scraper_utils.pathify_url(url)
        page_url = dom_parser.parse_dom(html, 'a', {'class': 'nextpostslink'}, ret='href')

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, SEARCH_URL)
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=8)
    fragment = dom_parser.parse_dom(html, 'div', {'class': '[^"]*items[^"]*'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'item'}):
            match_url = dom_parser.parse_dom(item, 'a', {'class': 'header'}, ret='href')
            match_title_year = dom_parser.parse_dom(item, 'a', {'class': 'header'})
            if match_url and match_title_year:
                match_url = match_url[0]
                match_title_year = match_title_year[0]
                r = re.search('(.*?)\s+\((\d{4})\)', match_title_year)
                if r:
                    match_title, match_year = r.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(match_url), 'title': match_title, 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus(title)
    html = self._http_get(search_url, cache_limit=1)
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+)', item)
        if match:
            url = match.group(1)
            match_title_year = dom_parser.parse_dom(item, 'span', {'class': 'tt'})
            if match_title_year:
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year[0])
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year[0]
                    match_year = ''
                year_frag = dom_parser.parse_dom(item, 'span', {'class': 'year'})
                if year_frag:
                    match_year = year_frag[0]
                if not year or not match_year or year == match_year:
                    result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    search_url = urlparse.urljoin(self.base_url, '/search/%s.html' % urllib.quote_plus(title))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    fragment = dom_parser.parse_dom(html, 'div', {'class': 'list-movie'})
    if fragment:
        for item in dom_parser.parse_dom(fragment[0], 'div', {'class': 'movie'}):
            match = re.search('class="movie-name".*?href="([^"]+)[^>]+>([^<]+)', item)
            if match:
                url, match_title = match.groups()
                is_season = re.search('\s+-\s+[Ss](\d+)$', match_title)
                # note: the original tested the bare constant `VIDEO_TYPES.SEASON` (always truthy)
                if (not is_season and video_type == VIDEO_TYPES.MOVIE) or (is_season and video_type == VIDEO_TYPES.SEASON):
                    match_year = ''
                    if video_type == VIDEO_TYPES.MOVIE:
                        for info_frag in dom_parser.parse_dom(item, 'p', {'class': 'info'}):
                            match = re.search('(\d{4})', info_frag)
                            if match:
                                match_year = match.group(1)
                                break
                        if not match_year:
                            match = re.search('(\d{4})$', url)
                            if match:
                                match_year = match.group(1)
                    else:
                        if season and int(is_season.group(1)) != int(season):
                            continue
                    if not year or not match_year or year == match_year:
                        result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': match_year}
                        results.append(result)
    return results

def search(self, video_type, title, year, season=''):
    results = []
    if video_type == VIDEO_TYPES.TVSHOW:
        url = urlparse.urljoin(self.base_url, '/series/all/')
        html = self._http_get(url, cache_limit=8)
        links = dom_parser.parse_dom(html, 'a', {'class': 'underilne'}, 'href')  # 'underilne' matches the markup this scraper targets
        titles = dom_parser.parse_dom(html, 'a', {'class': 'underilne'})
        items = zip(links, titles)
    else:
        url = urlparse.urljoin(self.base_url, '/search?=%s' % urllib.quote_plus(title))
        data = {'q': title, 'go': 'Search'}
        html = self._http_get(url, data=data, cache_limit=8)
        match = re.search('you can search again in (\d+) seconds', html, re.I)
        if match:
            # the site rate-limits searches; wait (capped at the scraper timeout) and retry
            wait = int(match.group(1))
            if wait > self.timeout:
                wait = self.timeout
            time.sleep(wait)
            html = self._http_get(url, data=data, cache_limit=0)
        pattern = 'class="movie_box.*?href="([^"]+).*?<h1>([^<]+)'
        items = re.findall(pattern, html, re.DOTALL)
    norm_title = scraper_utils.normalize_title(title)
    for item in items:
        url, match_title = item
        if norm_title in scraper_utils.normalize_title(match_title):
            result = {'url': scraper_utils.pathify_url(url), 'title': scraper_utils.cleanse_title(match_title), 'year': ''}
            results.append(result)
    return results

def search(self, video_type, title, year):
    results = []
    norm_title = self._normalize_title(title)
    if video_type == VIDEO_TYPES.TVSHOW:
        for server_url in TVSHOW_URLS:
            for row in self.__parse_directory(self._http_get(server_url, cache_limit=48)):
                match_year = ''
                if norm_title in self._normalize_title(row['title']) and (not year or not match_year or year == match_year):
                    result = {'url': urlparse.urljoin(server_url, row['link']), 'title': row['title'], 'year': match_year}
                    results.append(result)
    else:
        search_url = urlparse.urljoin(self.base_url, '/?s=')
        search_url += urllib.quote_plus(title)
        html = self._http_get(search_url, cache_limit=1)
        for article in dom_parser.parse_dom(html, 'article', {'class': 'entry-body'}):
            link = dom_parser.parse_dom(article, 'a', {'class': 'more-link'}, 'href')
            content = dom_parser.parse_dom(article, 'div', {'class': 'post-content'})
            match = re.search('</a>\s*([^<]+)', content[0]) if content else ''
            info = dom_parser.parse_dom(article, 'div', {'class': 'post-info'})
            is_movie = re.search('/category/movies/', info[0]) if info else False
            if match and link and is_movie:
                match_title_year = match.group(1)
                match = re.search('(.*?)\s+\(?(\d{4})\)?', match_title_year)
                if match:
                    match_title, match_year = match.groups()
                else:
                    match_title = match_title_year
                    match_year = ''
                if not year or not match_year or year == match_year:
                    result = {'url': self._pathify_url(link[0]), 'title': match_title, 'year': match_year}
                    results.append(result)
    return results

def search(self, video_type, title, year):
    search_url = urlparse.urljoin(self.base_url, '/?s=')
    search_url += urllib.quote_plus('%s %s' % (title, year))
    html = self._http_get(search_url, cache_limit=.25)
    results = []
    for item in dom_parser.parse_dom(html, 'div', {'class': 'item'}):
        match = re.search('href="([^"]+).*?alt="([^"]+)', item, re.DOTALL)
        if match:
            url, match_title_year = match.groups()
            match = re.search('(.*?)(?:\s+\(?(\d{4})\)?)', match_title_year)
            if match:
                match_title, match_year = match.groups()
            else:
                match_title = match_title_year
                year_fragment = dom_parser.parse_dom(item, 'span', {'class': 'year'})
                if year_fragment:
                    match_year = year_fragment[0]
                else:
                    match_year = ''
            if not year or not match_year or year == match_year:
                result = {'url': scraper_utils.pathify_url(url), 'title': match_title, 'year': match_year}
                results.append(result)
    return results