def query(self, series, season, year=None, country=None):
    """Query the provider for all completed subtitles of a season of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param year: year of the series, if any.
    :type year: int or None
    :param country: country code of the series, if any.
    :type country: str or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
        return []

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
    r.raise_for_status()

    if not r.content:
        # Provider returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise exception for that status code
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # re-parse series/year from the page header (strips a trailing 10-char suffix first)
    match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
    series = match.group('series')
    year = int(match.group('year')) if match.group('year') else None

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item from the fixed cell positions of the row
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title,
                                       year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, keyword, season=None, episode=None, year=None):
    """Query the provider with a free-text `keyword`, following result pagination.

    :param str keyword: series or movie title to search for.
    :param season: season number, when searching for an episode.
    :type season: int or None
    :param episode: episode number, when searching for an episode.
    :type episode: int or None
    :param year: year, when searching for a movie.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # build the search string: '<keyword> SxxEyy' for episodes, '<keyword> <year>' for movies
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        params += ' {:4d}'.format(year)

    logger.debug('Searching subtitles %r', params)
    subtitles = []
    search_link = self.server_url + text_type(self.search_url).format(params)

    # walk the paginated result pages until no 'Next' link is found
    while True:
        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

        # loop over subtitles cells
        for cell in soup.select('td.latest_name > a:nth-of-type(1)'):
            # read the item; the subtitle id is the second-to-last path segment of the href
            subtitle_id = int(cell['href'].rsplit('/', 2)[1])
            page_link = cell['href']
            language = Language.fromalpha2(cell.parent.find('img')['src'].split('/')[-1].split('.')[0])
            version = cell.text.strip() or None
            if version is None:
                version = ""

            subtitle = self.subtitle_class(language, page_link, version, self.download_url.format(subtitle_id))

            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        # look for a pagination anchor pointing to the next results page
        anchors = soup.select('td a')
        next_page_available = False
        for anchor in anchors:
            if 'Next' in anchor.text and 'search.php' in anchor['href']:
                search_link = self.server_url + anchor['href']
                next_page_available = True
                break

        if not next_page_available:
            break

    return subtitles
def query(self, series, season, year=None, country=None):
    """Query the provider for all completed subtitles of a season of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param year: year of the series, if any.
    :type year: int or None
    :param country: country code of the series, if any.
    :type country: str or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
        return []

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
    r.raise_for_status()

    if not r.content:
        # Provider returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise exception for that status code
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # re-parse series/year from the page header (strips a trailing 10-char suffix first)
    match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
    series = match.group('series')
    year = int(match.group('year')) if match.group('year') else None

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item from the fixed cell positions of the row
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title,
                                       year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def search_show_id(self, series, year=None): """Search the show id from the `series` and `year`. :param string series: series of the episode. :param year: year of the series, if any. :type year: int or None :return: the show id, if any. :rtype: int or None """ # make the search series_clean = self.clean_punctuation(series).lower() logger.info('Searching show id for %r', series_clean) r = self.session.post(self.server_url + 'search.php', data={'q': series_clean}, timeout=10) r.raise_for_status() # get the series out of the suggestions soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) show_id = None for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'): match = link_re.match(self.clean_punctuation(suggestion.text)) if not match: logger.error('Failed to match %s', suggestion.text) continue if self.clean_punctuation(match.group('series')).lower() == series_clean: if year is not None and int(match.group('first_year')) != year: logger.debug('Year does not match') continue show_id = int(suggestion['href'][8:-5]) logger.debug('Found show id %d', show_id) break return show_id
def _search_url_titles(self, title):
    """Search the URL titles by kind for the given `title`.

    :param str title: title to search for.
    :return: the URL titles by kind.
    :rtype: collections.defaultdict
    """
    # run the search request against the provider
    logger.info('Searching title name for %r', title)
    response = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10)
    response.raise_for_status()

    # a chain made purely of 302s means we were sent straight to a subtitles page
    if response.history and all(h.status_code == 302 for h in response.history):
        logger.debug('Redirected to the subtitles page')
        candidate_links = [response.url]
    else:
        # otherwise scrape the suggestion anchors from the results page
        page = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])
        candidate_links = [anchor.attrs['href'] for anchor in page.select('#processes div.generalWindowTop a')]
        logger.debug('Found %d suggestions', len(candidate_links))

    # group URL titles by kind; links look like .../<kind>/<url_title>/...
    url_titles = defaultdict(list)
    for candidate in candidate_links:
        segments = candidate.split('/')
        url_titles[segments[-3]].append(segments[-2])

    return url_titles
def _get_show_ids(self): """Get the ``dict`` of show ids per series by querying the `shows.php` page. :return: show id per series, lower case and without quotes. :rtype: dict """ # get the show page logger.info('Getting show ids') r = self.session.get(self.server_url + 'shows.php', timeout=20, cookies=self.cookies) r.raise_for_status() # LXML parser seems to fail when parsing Addic7ed.com HTML markup. # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails) # Assuming the site's markup is bad, and stripping it down to only contain what's needed. show_cells = re.findall(show_cells_re, r.content) if show_cells: soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser']) else: # If RegEx fails, fall back to original r.content and use 'html.parser' soup = ParserBeautifulSoup(r.content, ['html.parser']) # populate the show ids show_ids = {} for show in soup.select('td.vr > h3 > a[href^="/show/"]'): show_ids[sanitize(show.text)] = int(show['href'][6:]) logger.debug('Found %d show ids', len(show_ids)) return show_ids
def _search_movie(self, movie_id):
    """Fetch the subtitle rows for the movie with the given `movie_id`.

    :param movie_id: provider id of the movie.
    :return: list of dicts with keys ``id``, ``rls`` (release name) and ``sub_id``.
    :rtype: list
    """
    subs = []
    url = self.server_url + self.movie_info_url + movie_id
    r = self.session.get(url, timeout=10)
    r.raise_for_status()

    # a near-empty body is the provider's way of saying 'no subtitles'
    if len(r.content) < 10:
        logger.debug("Too short content-length in response: [{}]. Treating as No Subtitles Found ".format(str(r.content)))
        return []

    html = ParserBeautifulSoup(r.content, ["html.parser"])
    sub_rows = html.select("table#subtitlesList tbody > tr")

    for row in sub_rows:
        columns = row.find_all("td")
        sub = {"id": movie_id}
        for index, column in enumerate(columns):
            # first column: release name (first line only)
            if index == 0:
                sub["rls"] = column.get_text().strip().split("\n")[0]
            # sixth column: anchor carrying the subtitle id
            if index == 5:
                sub["sub_id"] = column.find("a", attrs={"data-subtitle-id": True})["data-subtitle-id"]

        # only keep rows where a subtitle id was actually found
        if 'sub_id' in sub:
            subs.append(sub)

    return subs
def _search_show_id(self, series, year=None): """Search the show id from the `series` and `year`. :param str series: series of the episode. :param year: year of the series, if any. :type year: int :return: the show id, if found. :rtype: int """ # addic7ed doesn't support search with quotes series = series.replace('\'', ' ') # build the params series_year = '%s %d' % (series, year) if year is not None else series params = {'search': series_year, 'Submit': 'Search'} # make the search logger.info('Searching show ids with %r', params) r = self.session.get(self.server_url + 'srch.php', params=params, timeout=10) r.raise_for_status() soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) # get the suggestion suggestion = soup.select('span.titulo > a[href^="/show/"]') if not suggestion: logger.warning('Show id not found: no suggestion') return None if not sanitize(suggestion[0].i.text.replace('\'', ' ')) == sanitize(series_year): logger.warning('Show id not found: suggestion does not match') return None show_id = int(suggestion[0]['href'][6:]) logger.debug('Found show id %d', show_id) return show_id
def _get_show_ids(self): """Get the ``dict`` of show ids per series by querying the `shows.php` page. :return: show id per series, lower case and without quotes. :rtype: dict """ # get the show page logger.info('Getting show ids') r = self.session.get(self.server_url + 'shows.php', timeout=10) r.raise_for_status() # LXML parser seems to fail when parsing Addic7ed.com HTML markup. # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails) # Assuming the site's markup is bad, and stripping it down to only contain what's needed. show_cells = re.findall(show_cells_re, r.content) if show_cells: soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser']) else: # If RegEx fails, fall back to original r.content and use 'html.parser' soup = ParserBeautifulSoup(r.content, ['html.parser']) # populate the show ids show_ids = {} for show in soup.select('td.version > h3 > a[href^="/show/"]'): show_ids[sanitize(show.text)] = int(show['href'][6:]) logger.debug('Found %d show ids', len(show_ids)) return show_ids
def query(self, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # search the show id
    show_id = self.search_show_id(series, year)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year})
        return []

    # get the episode ids (wrapped in self.retry to survive transient provider failures)
    episode_ids = self.retry(lambda: self.get_episode_ids(show_id, season))
    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.retry(lambda: self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10))
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitles rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item; the flag image path encodes the language code
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        # parent href is 'subtitle-<id>.html' -> extract <id>
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('p', title='release').text.strip() or None

        subtitle = PatchedTVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year,
                                              rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    return subtitles
def _get_suggestions(self, title):
    """Search the show or movie id from the `title` and `year`.

    :param str title: title of the show.
    :return: the show suggestions found.
    :rtype: list of dict
    """
    # run the search against the provider
    logger.info('Searching show ids with %r', title)
    response = self.session.get(self.server_url + self.search_url.format(title),
                                headers={'Referer': self.server_url}, timeout=10)
    response.raise_for_status()

    # an empty body means nothing matched
    if not response.content:
        logger.debug('No data returned from provider')
        return []

    page = ParserBeautifulSoup(response.content, ['html.parser'])

    # every <option> of the movie selector is one suggestion
    suggestions = []
    for option in page.select('select[name="Mov_sel"] > option[value]'):
        suggestions.append({'link': option.attrs['value'], 'title': option.text})
    logger.debug('Found suggestions: %r', suggestions)

    return suggestions
def _search_url_titles(self, title): """Search the URL titles by kind for the given `title`. :param str title: title to search for. :return: the URL titles by kind. :rtype: collections.defaultdict """ # make the search logger.info('Searching title name for %r', title) r = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10) r.raise_for_status() # check for redirections if r.history and all([h.status_code == 302 for h in r.history]): logger.debug('Redirected to the subtitles page') links = [r.url] else: # get the suggestions (if needed) soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) links = [link.attrs['href'] for link in soup.select('#processes div.generalWindowTop a')] logger.debug('Found %d suggestions', len(links)) url_titles = defaultdict(list) for link in links: parts = link.split('/') url_titles[parts[-3]].append(parts[-2]) return url_titles
def query(self, show_id, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of the show `show_id`.

    :param int show_id: provider id of the show.
    :param str series: series name.
    :param int season: season number.
    :param episode: episode number; a list is collapsed to its minimum.
    :type episode: int or list
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # Provider doesn't store multi episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitles rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item; the flag image path encodes the language code
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        # parent href is 'subtitle-<id>.html' -> extract <id>
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    # free the parse tree eagerly to limit memory usage
    soup.decompose()
    soup = None

    return subtitles
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `series.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict
    """
    # fetch the page that lists every show
    logger.info('Getting show ids')
    response = self.session.get(self.series_url, timeout=10)
    response.raise_for_status()
    # defensive double-check on top of raise_for_status (covers non-error, non-200 codes)
    if response.status_code != 200:
        logger.error('Error getting show ids')
        raise ProviderError('Error getting show ids')

    page = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    # map sanitized series name -> numeric show id ('/show/<id>' -> <id>)
    show_ids = {sanitize(anchor.get_text()): int(anchor['href'][6:])
                for anchor in page.select('td > a[href^="/show/"]')}
    logger.debug('Found %d show ids', len(show_ids))

    return show_ids
def search_show_id(self, series, year=None): """Search the show id from the `series` and `year`. :param string series: series of the episode. :param year: year of the series, if any. :type year: int or None :return: the show id, if any. :rtype: int or None """ # make the search logger.info('Searching show id for %r', series) r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10) r.raise_for_status() # get the series out of the suggestions soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) show_id = None for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'): match = link_re.match(suggestion.text) if not match: logger.error('Failed to match %s', suggestion.text) continue if sanitize(match.group('series')).lower() == series.lower(): if year is not None and int(match.group('first_year')) != year: logger.debug('Year does not match') continue show_id = int(suggestion['href'][8:-5]) logger.debug('Found show id %d', show_id) break soup.decompose() soup = None return show_id
def query(self, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the show id
    show_id = self.get_show_id(series, year)
    if show_id is None:
        logger.error('No show id found for %s (%r)', series, year)
        return []

    # get the episode url
    episode_url = self.get_episode_url(show_id, series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
        return []

    # get the page of the episode of the show
    r = self.session.get(episode_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get episode title ('Subt.+tulos de' tolerates the accented 'Subtítulos')
    title_pattern = re.compile('Subt.+tulos de {}(.+){}x{:02d} - (.+)'.format(series, season, episode).lower())
    title = title_pattern.search(soup.select('#cabecera-subtitulo')[0].get_text().strip().lower()).group(2)

    # loop over subtitle rows
    subtitles = []
    for sub in soup.find_all('div', attrs={'id': re.compile('version([0-9]+)')}):
        # read the release subtitle
        release = sanitize_release_group(release_pattern.search(sub.find('p', class_='title-sub')
                                                               .contents[2]).group(1))
        for html_language in sub.select('ul.sslist'):
            language = Language.fromtusubtitulo(html_language.find_next('b').get_text().strip())
            hearing_impaired = False

            # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
            # because if exists spanish and spanish latino subtitle for the same episode, the score will be
            # higher with spanish subtitle. Spanish subtitle takes priority.
            if language == Language('spa', 'MX'):
                language = Language('spa')
                hearing_impaired = True

            # ignore incomplete subtitles
            status = sanitize(html_language.find_next('li', class_=re.compile('li-estado')).get_text())
            if status != 'completado':
                logger.debug('Ignoring subtitle with status %s', status)
                continue

            # get the most updated version of the subtitle and if it doesn't exist get the original version
            html_status = html_language.select('a[href^="updated/"]')
            if len(html_status) == 0:
                html_status = html_language.select('a[href^="original/"]')

            subtitle_url = self.server_url + html_status[0]['href']
            subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season, episode,
                                           title, year, release, subtitle_url)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the episode url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.info(f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}")
        return []

    r = self.session.get(episode_url, headers={"Referer": self.server_url}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

    # get episode title
    title_pattern = re.compile("{}(.+){}x{:02d}- (.+)".format(series, season, episode).lower())
    title = title_pattern.search(soup.select("#episode_title")[0].get_text().strip().lower()).group(2)

    subtitles = []
    for sub in soup.find_all("div", attrs={"id": "progress_buttons_row"}):
        # read the language
        language = Language.fromsubtitulamos(sub.find_previous("div", class_="subtitle_language").get_text().strip())
        hearing_impaired = False

        # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
        # because if exists spanish and spanish latino subtitle for the same episode, the score will be
        # higher with spanish subtitle. Spanish subtitle takes priority.
        if language == Language("spa", "MX"):
            language = Language("spa")
            hearing_impaired = True

        # read the release subtitle
        release = sub.find_next("div", class_="version_name").get_text().strip()

        # ignore incomplete subtitles: a finished subtitle exposes a download anchor here
        status = sub.find_next("div", class_="subtitle_buttons").contents[1]
        if status.name != "a":
            logger.debug("Ignoring subtitle in [%s] not finished", language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status["href"][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season, episode, title,
                                        year, release, subtitle_url)
        logger.debug("Found subtitle %r", subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, languages=None, title=None, imdb_id=None, video=None):
    """Query titrari.ro for subtitles matching `title` / `imdb_id`.

    :param languages: languages to search for; the first one is attached to each result.
    :param title: title to search for.
    :type title: str or None
    :param imdb_id: IMDB id to search for.
    :type imdb_id: str or None
    :param video: video the subtitles are searched for, used for ordering the results.
    :return: the found subtitles, ordered.
    :rtype: list
    """
    subtitles = []

    params = self.getQueryParams(imdb_id, title)
    search_response = self.session.get(self.api_url, params=params, timeout=15)
    search_response.raise_for_status()

    if not search_response.content:
        logger.debug('[#### Provider: titrari.ro] No data returned from provider')
        return []

    soup = ParserBeautifulSoup(search_response.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('td[rowspan=\'5\']')
    for index, row in enumerate(rows):
        result_anchor_el = row.select_one('a')

        # Download link
        href = result_anchor_el.get('href')
        download_link = self.api_url + href

        fullTitle = row.parent.find("h1").find("a").text

        # initialize defaults so a failed parse below cannot leave these names
        # unbound (the original code raised NameError at subtitle_class(...))
        downloads = None
        comments = None
        year = None

        # Get title
        try:
            title = fullTitle.split("(")[0]
        except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
            logger.error("[#### Provider: titrari.ro] Error parsing title.")

        # Get downloads count
        try:
            downloads = int(row.parent.parent.select("span")[index].text[12:])
        except Exception:
            logger.error("[#### Provider: titrari.ro] Error parsing downloads.")

        # Get year
        try:
            year = int(fullTitle.split("(")[1].split(")")[0])
        except Exception:
            logger.error("[#### Provider: titrari.ro] Error parsing year.")

        # Get imdbId
        sub_imdb_id = self.getImdbIdFromSubtitle(row)

        # Get comments
        try:
            comments = row.parent.parent.find_all("td", class_=re.compile("comment"))[index * 2 + 1].text
        except Exception:
            logger.error("Error parsing comments.")

        subtitle = self.subtitle_class(next(iter(languages)), download_link, index, None, title, sub_imdb_id,
                                       year, downloads, comments)
        logger.debug('[#### Provider: titrari.ro] Found subtitle %r', str(subtitle))
        subtitles.append(subtitle)

    ordered_subs = self.order(subtitles, video)

    return ordered_subs
def query(self, title):
    """Query the provider for subtitles matching `title`.

    :param str title: title to search for.
    :return: the found subtitles.
    :rtype: list
    """
    subtitles = []

    data = {
        'ajax': '1',
        'sSearch': title,
    }

    r = self.session.post(self.search_url, data=data, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('tbody > tr')
    for row in rows:
        # title: only direct text nodes of the anchor, skipping nested tags
        title_anchor_el = row.select_one('.title > a')
        title_inner_text = [element for element in title_anchor_el if isinstance(element, NavigableString)]
        title = title_inner_text[0].strip()

        # year
        year = row.select_one('.year').text.strip('()')

        # download link
        href = title_anchor_el.get('href')
        download_link = self.server_url + href

        # imdb id: second-to-last path segment of the IMDB link
        imdb_td = row.select_one('td:nth-of-type(4)')
        imdb_link = imdb_td.select_one('a').get('href')
        imdb_id = imdb_link.split('/')[-2]

        # fps
        fps = row.select_one('.fps').text.strip()

        # additional notes
        notes = row.select_one('.notes').text.strip()

        # page link = download link (there is no seperate subtitle page link)
        page_link = download_link

        # create/add the subitle
        subtitle = self.subtitle_class(Language.fromalpha2('lv'), page_link, download_link, title, year, imdb_id,
                                       fps, notes)
        logger.debug('nekur: Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the episode url
    episode_url = self._search_url_titles(series, season, episode, year)
    if episode_url is None:
        logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
        return []

    r = self.session.get(episode_url, headers={'Referer': self.server_url}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get episode title
    title_pattern = re.compile('{}(.+){}x{:02d}- (.+)'.format(series, season, episode).lower())
    title = title_pattern.search(soup.select('#episode_title')[0].get_text().strip().lower()).group(2)

    subtitles = []
    for sub in soup.find_all('div', attrs={'id': 'progress_buttons_row'}):
        # read the language
        language = Language.fromsubtitulamos(sub.find_previous('div', class_='subtitle_language').get_text().strip())
        hearing_impaired = False

        # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
        # because if exists spanish and spanish latino subtitle for the same episode, the score will be
        # higher with spanish subtitle. Spanish subtitle takes priority.
        if language == Language('spa', 'MX'):
            language = Language('spa')
            hearing_impaired = True

        # read the release subtitle
        release = sub.find_next('div', class_='version_name').get_text().strip()

        # ignore incomplete subtitles: a finished subtitle exposes a download anchor here
        status = sub.find_next('div', class_='subtitle_buttons').contents[1]
        if status.name != 'a':
            logger.debug('Ignoring subtitle in [%s] not finished', language)
            continue

        # read the subtitle url
        subtitle_url = self.server_url + status['href'][1:]
        subtitle = SubtitulamosSubtitle(language, hearing_impaired, episode_url, series, season, episode, title,
                                        year, release, subtitle_url)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def query_movies(self, video, title):
    """Query the provider for movie subtitles matching `title`.

    Downloads every subtitle archive of every matching movie page and extracts
    the subtitles from them. Sets ``self.is_perfect_match`` as a side effect.

    :param video: video the subtitles are searched for.
    :param str title: movie title to search for.
    :return: the found subtitles.
    :rtype: list
    """
    subtitles = []

    r = self.session.get(self.search_url, params={'q': title}, timeout=30)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])

    # loop over movies name, keeping pages whose title contains the searched title
    movies_url = []
    self.is_perfect_match = False
    movies = soup.select('.film > h3 > a')
    for item in movies:
        # title
        if title.lower() in item.text.lower():
            movies_url.append(item.attrs['href'])
            self.is_perfect_match = True

    # note: removed an unused local ('series_subs_archives_url') that was assigned but never read
    for movies_page in movies_url:
        page_link = self.server_url + movies_page
        r = self.session.get(page_link, timeout=30)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])

        # each 'subList' anchor is one downloadable subtitle archive
        movies_subs_archives = soup.select('a.subList')
        for item in movies_subs_archives:
            download_link = self.server_url + 'films/' + item.attrs['href']
            res = self.session.get(download_link, timeout=30)
            res.raise_for_status()
            archive = self._get_archive(res.content)

            # extract the subtitle
            if archive:
                subtitles_from_archive = self._get_subtitle_from_archive(archive, video)
                for subtitle in subtitles_from_archive:
                    subtitle.page_link = page_link
                    subtitle.download_link = download_link
                    subtitles.append(subtitle)

    return subtitles
def query(self, series, season, episode, year=None):
    """Query the provider for the subtitles of one episode of `series`.

    :param str series: series name to search for.
    :param int season: season number.
    :param int episode: episode number.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the found subtitles.
    :rtype: list
    """
    # get the show id
    show_id = self.get_show_id(series, year)
    if show_id is None:
        logger.error("No show id found for %s (%r)", series, year)
        return []

    # get the episode url
    episode_url = self.get_episode_url(show_id, series, season, episode, year)
    if episode_url is None:
        logger.info(f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}")
        return []

    # get the page of the episode of the show
    r = self.session.get(episode_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

    # get episode title ('Subt.+tulos de' tolerates the accented 'Subtítulos')
    title_pattern = re.compile("Subt.+tulos de {}(.+){}x{:02d} - (.+)".format(series, season, episode).lower())
    title = title_pattern.search(soup.select("#cabecera-subtitulo")[0].get_text().strip().lower()).group(2)

    # loop over subtitle rows
    subtitles = []
    for sub in soup.find_all("div", attrs={"id": re.compile("version([0-9]+)")}):
        # read the release subtitle
        release = sanitize_release_group(release_pattern.search(sub.find("p", class_="title-sub").contents[2]).group(1))
        for html_language in sub.select("ul.sslist"):
            language = Language.fromtusubtitulo(html_language.find_next("b").get_text().strip())
            hearing_impaired = False

            # modify spanish latino subtitle language to only spanish and set hearing_impaired = True
            # because if exists spanish and spanish latino subtitle for the same episode, the score will be
            # higher with spanish subtitle. Spanish subtitle takes priority.
            if language == Language("spa", "MX"):
                language = Language("spa")
                hearing_impaired = True

            # ignore incomplete subtitles
            status = sanitize(html_language.find_next("li", class_=re.compile("li-estado")).get_text())
            if status != "completado":
                logger.debug("Ignoring subtitle with status %s", status)
                continue

            # get the most updated version of the subtitle and if it doesn't exist get the original version
            html_status = html_language.select('a[href^="updated/"]')
            if len(html_status) == 0:
                html_status = html_language.select('a[href^="original/"]')

            subtitle_url = self.server_url + html_status[0]["href"]
            subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season, episode,
                                           title, year, release, subtitle_url)
            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

    return subtitles
def query(self, title):
    """Query the provider for subtitles matching `title`.

    Fetches each result's detail page to collect title, year, IMDB id and the
    download link.

    :param str title: title to search for.
    :return: the found subtitles.
    :rtype: list
    """
    subtitles = []

    r = self.session.get(self.search_url, params={'q': title}, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # loop over subtitle cells (rows is materialized before `soup` is rebound below)
    rows = soup.select('.eBlock')
    for row in rows:
        result_anchor_el = row.select_one('.eTitle > a')

        # page link
        page_link = result_anchor_el.get('href')

        # fetch/parse additional info
        # NOTE(review): no raise_for_status() here — an HTTP error page would be parsed as-is; confirm intended
        r = self.session.get(page_link, timeout=10)
        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

        # title: the last of the ' / '-separated title variants
        movie_titles_string = soup.select_one('.main-header').text.strip()
        movie_titles_list = movie_titles_string.split(' / ')
        title = movie_titles_list[-1]

        # year
        year = soup.select_one('#film-page-year').text.strip()

        # imdb id: second-to-last path segment of the IMDB link
        imdb_link = soup.select_one('#actors-page > a').get('href')
        imdb_id = imdb_link.split('/')[-2]

        # download link
        href = soup.select_one('.hvr').get('href')
        download_link = self.server_url + href

        # create/add the subitle
        subtitle = self.subtitle_class(Language.fromalpha2('lv'), page_link, download_link, title, year, imdb_id)
        logger.debug('subtitri.id.lv: Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def get_episode_url(self, show_id, series, season, episode, year=None): """Get the url best matching show id for `series`, `season`, `episode` and `year`. :param int show_id: show id of the series :param str series: serie of the episode. :param int season: season of the episode. :param int episode: number of the episode. :param int year: year of the series. :return: the episode url, if found. :rtype: str """ # get the page of the season of the show logger.info('Getting the page of show id %d, season %d', show_id, season) series_sanitized = sanitize(series) episode_url = None r = self.session.get(self.subtitles_url, params={ 'show': show_id, 'season': season }, timeout=10) r.raise_for_status() soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser']) # loop over episodes rows for html_episode in soup.select('td > a[href*="/episodes/"]'): title = sanitize(html_episode.get_text()) # attempt series with year if sanitize('{} {} {}x{:02d}'.format(series_sanitized, year, season, episode)) in title: episode_url = 'https://' + html_episode['href'][2:] logger.debug( 'Subtitle found for %s, season: %d, episode: %d. URL: %s', series, season, episode, episode_url) break elif sanitize('{} {}x{:02d}'.format(series_sanitized, season, episode)) in title: episode_url = 'https://' + html_episode['href'][2:] logger.debug( 'Subtitle found for %s, season: %d, episode: %d. URL: %s', series, season, episode, episode_url) break return episode_url
def query(self, movie_id, title, year):
    """Query the provider for the subtitles of a movie.

    :param movie_id: provider id of the movie; when falsy, a title/year search is done instead.
    :param str title: title of the movie.
    :param year: year of the movie (used only to build the search URL).
    :return: the found subtitles.
    :rtype: list
    """
    # get the season list of the show
    logger.info('Getting the subtitle list of show id %s', movie_id)
    if movie_id:
        page_link = self.server_url + '/' + movie_id
    else:
        page_link = self.server_url + self.search_url.format(' '.join([title, str(year)]))

    r = self.session.get(page_link, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['html.parser'])

    # extract the year from the dates header, if present
    # NOTE(review): this clobbers the `year` parameter; confirm the page year is the intended value downstream
    year = None
    year_element = soup.select_one('td#dates_header > table div')
    matches = False
    if year_element:
        matches = year_re.match(str(year_element.contents[2]).strip())
    if matches:
        year = int(matches.group(1))

    # extract the show title from the dates header, if present
    title_tag = soup.select_one('td#dates_header > table u')
    show_title = str(title_tag.contents[0]).strip() if title_tag else None

    subtitles = []
    # loop over episode rows
    for subs_tag in soup.select('.movie-details'):
        # read common info
        version = subs_tag.find('span').text
        download_link = self.server_url + subs_tag.find('a')['href']
        uploader = subs_tag.select_one('.movie-info').find('p').find('a').text
        # the sprite css class encodes the language code ('<code>gif...')
        language_code = subs_tag.select_one('.sprite')['class'][1].split('gif')[0]
        language = Language.fromietf(language_code)

        subtitle = self.subtitle_class(language, page_link, show_title, year, version, download_link, uploader)

        logger.debug('Found subtitle {!r}'.format(subtitle))
        subtitles.append(subtitle)

    return subtitles
def query(self, movie_id, title, year):
    """Query the provider for the subtitles of a movie.

    :param str movie_id: provider id of the movie; when falsy a title/year
        search is performed instead.
    :param str title: movie title used for the search fallback.
    :param int year: release year appended to the search keywords.
    :return: the subtitles found on the page.
    :rtype: list
    """
    # get the season list of the show
    logger.info('Getting the subtitle list of show id %s', movie_id)
    if movie_id:
        page_link = self.server_url + '/' + movie_id
    else:
        keywords = ' '.join([title, str(year)])
        page_link = self.server_url + text_type(self.search_url).format(keywords)

    r = self.session.get(page_link, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['html.parser'])

    # extract the year from the dates header, when present
    year_num = None
    year_element = soup.select_one('td#dates_header > table div')
    matches = year_re.match(str(year_element.contents[2]).strip()) if year_element else None
    if matches:
        year_num = int(matches.group(1))

    # extract the show title from the dates header, when present
    title_element = soup.select_one('td#dates_header > table u')
    show_title = None
    if title_element:
        show_title = str(title_element.contents[0]).strip()

    subtitles = []
    # loop over episode rows
    for subs_tag in soup.select('table.table_border div[align="center"] > div'):
        # read common info
        version = subs_tag.find('b').text
        download_link = self.server_url + subs_tag.find('a')['href']
        language = Language.fromalpha2(
            subs_tag.find('img')['src'].split('/')[-1].split('.')[0])

        subtitle = self.subtitle_class(language, page_link, show_title,
                                       year_num, version, download_link)
        logger.debug('Found subtitle {!r}'.format(subtitle))
        subtitles.append(subtitle)

    return subtitles
def query(self, show_id, series, season, episode, title):
    """Query the provider for the subtitles of an episode.

    :param show_id: provider id of the show.
    :param str series: series name (unused here, kept for interface parity).
    :param int season: season number.
    :param int episode: episode number.
    :param str title: episode title (unused here, kept for interface parity).
    :return: the subtitles found on the episode page.
    :rtype: list
    """
    # get the season list of the show
    logger.info('Getting the subtitle list of show id %s', show_id)

    # an episode page requires show id, season and episode
    if not all((show_id, season, episode)):
        return []
    page_link = self.server_url + self.episode_link.format(
        show_id=show_id, season=season, episode=episode)

    r = self.session.get(page_link, timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # extract the year from the dates header
    year = None
    matches = year_re.match(
        str(soup.select_one('#dates_header_br > table div').contents[2]).strip())
    if matches:
        year = int(matches.group(1))

    show_title = str(
        soup.select_one('#dates_header_br > table div u').string).strip()

    subtitles = []
    # loop over episode rows
    for row in soup.select('table .seeDark,.seeMedium'):
        # read common info
        bold_cells = row.find_all('b')
        version = bold_cells[0].text
        uploader = bold_cells[1].text
        download_link = self.server_url + row.find('a')['href']
        language = Language.fromalpha2(
            row.find('img')['src'].split('/')[-1].split('.')[0])

        subtitle = self.subtitle_class(language, page_link, show_title, year,
                                       version, download_link, uploader)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def get_episode_ids(self, show_id, season):
    """Get episode ids from the show id and the season.

    :param int show_id: show id.
    :param int season: season of the episode.
    :return: episode ids per episode number.
    :rtype: dict
    :raises requests.HTTPError: if the season page request fails.
    """
    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id,
                season)
    r = self.session.get(self.server_url + 'tvshow-%d-%d.html' %
                         (show_id, season),
                         timeout=10)
    # fail fast on HTTP errors instead of parsing an error page;
    # every other request in this file is checked the same way
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over episode rows
    episode_ids = {}
    for row in soup.select('table#table5 tr'):
        # skip rows that do not have a link to the episode page
        if not row('a', href=episode_id_re):
            continue

        # extract data from the cells
        cells = row('td')
        # first cell is "SxE"; the part after the 'x' is the episode number
        episode = int(cells[0].text.split('x')[1])
        # episode id is embedded in the link target (strip prefix/suffix)
        episode_id = int(cells[1].a['href'][8:-5])
        episode_ids[episode] = episode_id

    if episode_ids:
        logger.debug('Found episode ids %r', episode_ids)
    else:
        logger.warning('No episode ids found')

    # free the parse tree eagerly; season pages can be large
    soup.decompose()
    soup = None

    return episode_ids
def query_series(self, video, title):
    """Search the provider for a series title and collect episode subtitles.

    Searches series whose name contains `title`, walks each matching series
    page, downloads the subtitle archives whose guessed season/episode match
    `video`, and extracts subtitles from them.

    :param video: the episode being searched for (uses `season`/`episode`).
    :param str title: series title to search for.
    :return: the subtitles extracted from matching archives.
    :rtype: list
    """
    subtitles = []

    r = self.session.get(self.search_url, params={'q': title}, timeout=30)
    r.raise_for_status()

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                               ['html.parser'])

    # loop over series name
    self.is_perfect_match = False
    series_url = []
    series = soup.select('.serie > h3 > a')
    for item in series:
        # title
        if title in item.text:
            series_url.append(item.attrs['href'])
            self.is_perfect_match = True

    for series_page in series_url:
        page_link = self.server_url + series_page

        r = self.session.get(page_link, timeout=30)
        r.raise_for_status()

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['html.parser'])

        series_subs_archives = soup.select('a.subList')

        for item in series_subs_archives:
            matching_archive = False
            # archive name without its extension; guess season/episode from it
            subtitles_archive_name = unquote(
                item.attrs['href'].split('/')[-1][:-4])
            guessed_subs = guessit(subtitles_archive_name,
                                   {'type': 'episode'})
            try:
                # episode number label is "S×E"; split on the multiply sign
                season, episode = item.select_one(
                    '.episodenum').text.split('×')
                guessed_subs.update({
                    'season': int(season),
                    'episode': int(episode)
                })
            except ValueError:
                # season-only archive, e.g. "S4": no episode component
                season = item.select_one('.episodenum').text[1:]
                episode = None
                guessed_subs.update({'season': int(season)})

            if guessed_subs['season'] == video.season:
                if 'episode' in guessed_subs:
                    if guessed_subs['episode'] == video.episode:
                        matching_archive = True
                else:
                    # season pack: matches every episode of the season
                    matching_archive = True

            if matching_archive:
                download_link = self.server_url + 'series/' + item.attrs[
                    'href']
                res = self.session.get(download_link, timeout=30)
                res.raise_for_status()

                archive = self._get_archive(res.content)
                # extract the subtitle
                if archive:
                    subtitles_from_archive = self._get_subtitle_from_archive(
                        archive, video)
                    for subtitle in subtitles_from_archive:
                        subtitle.page_link = page_link
                        subtitle.download_link = download_link
                        subtitles.append(subtitle)

    return subtitles
def query(self, language=None, title=None, imdb_id=None, video=None):
    """Query the provider and parse the result page into subtitles.

    :param language: language to assign to the found subtitles.
    :param str title: title to search for.
    :param str imdb_id: imdb id to search for.
    :param video: the video being searched; episode number is forwarded
        when it is an :class:`Episode`.
    :return: the subtitles, ordered via :meth:`order`.
    :rtype: list
    """
    subtitles = []

    params = self.getQueryParams(imdb_id, title, language)

    search_response = self.session.get(self.api_url, params=params,
                                       timeout=15)
    search_response.raise_for_status()

    if not search_response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(
        search_response.content.decode('utf-8', 'ignore'),
        ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('td[rowspan="5"]')
    for index, row in enumerate(rows):
        result_anchor_el = row.select_one('a')

        # Download link
        href = result_anchor_el.get('href')
        download_link = self.api_url + href

        fullTitle = row.parent.select('h1 a')[0].text

        # initialize optional fields so a parse failure below cannot
        # leave them unbound (NameError) when building the subtitle
        page_link = None
        uploader = None

        # Get title
        try:
            title = fullTitle.split("(")[0]
        except Exception:
            logger.error("Error parsing title")

        # Get downloads count
        downloads = 0
        try:
            downloads = int(row.parent.parent.select('span')[index].text[12:])
        except Exception:
            logger.error("Error parsing downloads")

        # Get year
        try:
            year = int(fullTitle.split("(")[1].split(")")[0])
        except Exception:
            year = None
            logger.error("Error parsing year")

        # Get imdbId
        sub_imdb_id = self.getImdbIdFromSubtitle(row)

        comments = ''
        try:
            comments = row.parent.parent.select('.comment')[1].text
        except Exception:
            logger.error("Error parsing comments")

        # Get page_link
        try:
            page_link = self.api_url + row.parent.select('h1 a')[0].get('href')
        except Exception:
            logger.error("Error parsing page_link")

        # Get uploader
        try:
            uploader = row.parent.select('td.row1.stanga a')[-1].text
        except Exception:
            logger.error("Error parsing uploader")

        episode_number = video.episode if isinstance(video, Episode) else None

        subtitle = self.subtitle_class(language, download_link, index,
                                       comments, title, sub_imdb_id,
                                       page_link, uploader, year, downloads,
                                       isinstance(video, Episode),
                                       episode_number)
        logger.debug('Found subtitle %r', str(subtitle))
        subtitles.append(subtitle)

    ordered_subs = self.order(subtitles)

    return ordered_subs
def query(self, languages, video):
    """Query the provider for the subtitles of `video`.

    Navigates search -> show page -> season page -> episode page, then
    collects English and Spanish releases that expose a download link.

    :param languages: requested languages (not filtered here; only English
        and Spanish are supported by the parser below).
    :param video: the episode to search for (uses series/season/episode).
    :return: the subtitles found.
    :rtype: list
    """
    subtitle_name = "%s %dx%02d" % (video.series, video.season,
                                    video.episode)
    logger.debug('Searching subtitles "%s"' % subtitle_name)

    response = self.session.get(self.server_url + '/search/query',
                                params={'q': video.series},
                                timeout=10)
    response.raise_for_status()
    result = response.json()

    subtitles = []
    for serie in result:
        # skip non-matching series
        if video.series.lower() != serie['show_name'].lower():
            continue

        # season page
        response = self.session.get(self.server_url +
                                    "/shows/%d" % serie['show_id'],
                                    timeout=10)
        response.raise_for_status()
        soup = ParserBeautifulSoup(response.text, ['lxml', 'html.parser'])

        season_found = False
        for season in soup.select('#season-choices a'):
            if season.text.strip() == str(video.season):
                season_found = True
                # only re-fetch when the wanted season is not the one
                # already selected on the current page
                if "selected" not in season.attrs['class']:
                    # go to the right season page
                    response = self.session.get(self.server_url +
                                                season['href'],
                                                timeout=10)
                    response.raise_for_status()
                    soup = ParserBeautifulSoup(response.text,
                                               ['lxml', 'html.parser'])
                break
        if not season_found:
            continue

        # episode page
        episode_found = False
        for episode in soup.select('#episode-choices a'):
            if episode.text.strip() == str(video.episode):
                episode_found = True
                # same pattern as seasons: follow the link only when the
                # wanted episode is not already selected
                if "selected" not in episode.attrs['class']:
                    # go to the right episode page
                    response = self.session.get(self.server_url +
                                                episode['href'],
                                                timeout=10)
                    response.raise_for_status()
                    soup = ParserBeautifulSoup(response.text,
                                               ['lxml', 'html.parser'])
                break
        if not episode_found:
            continue

        # url of whichever response we ended on (episode page)
        episode_url = response.url

        # subtitles
        for lang in soup.select("div.language-container"):
            lang_name = lang.select("div.language-name")[0].text
            if "English" in lang_name:
                language = "en"
            elif "Español" in lang_name:
                language = "es"
            else:
                continue  # not supported yet

            logger.debug('Found subtitles in "%s" language.', language)

            for release in lang.select("div.version-container"):
                if len(release.select('a[href*="/download"]')) != 1:
                    continue  # incomplete translation, download link is not available

                release_name = release.select(
                    'div.version-container p')[1].text
                release_url = self.server_url + release.select(
                    'a[href*="/download"]')[0]['href']

                subtitles.append(
                    SubtitulamosTVSubtitle(Language.fromietf(language),
                                           episode_url, release_url,
                                           release_name))

    return subtitles
def query(self, languages=None, title=None, imdb_id=None, video=None):
    """Query the provider and parse the result page into subtitles.

    :param languages: set of languages; the first one is assigned to every
        found subtitle.
    :param str title: title to search for.
    :param str imdb_id: imdb id to search for.
    :param video: the video being searched; episode number is forwarded
        when it is an :class:`Episode`.
    :return: the subtitles, ordered via :meth:`order`.
    :rtype: list
    """
    subtitles = []

    params = self.getQueryParams(imdb_id, title)

    search_response = self.session.post(self.api_url, data=params,
                                        timeout=15)
    search_response.raise_for_status()

    soup = ParserBeautifulSoup(
        search_response.content.decode('utf-8', 'ignore'),
        ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('div[id="round"]')

    if not rows:
        logger.debug('No data returned from provider')
        return []

    # release comments are outside of the parent for the sub details itself,
    # so we just map it to another list
    comment_rows = soup.findAll('div',
                                attrs={
                                    'class': None,
                                    'id': None,
                                    'align': None
                                })

    for index, row in enumerate(rows):
        result_anchor_el = row.select_one('.buton').select('a')

        # Download link
        href = result_anchor_el[0]['href']
        download_link = self.server_url + href

        fullTitle = row.select_one('#content-main a').text

        # initialize optional fields so a parse failure below cannot
        # leave them unbound (NameError) when building the subtitle
        uploader = None
        page_link = None

        # Get title
        try:
            title = fullTitle.split("(")[0]
        except Exception:
            logger.error("Error parsing title")

        # Get Uploader
        try:
            uploader = row.select('#content-main p')[4].text[10:]
        except Exception:
            logger.error("Error parsing uploader")

        # Get downloads count
        downloads = 0
        try:
            downloads = int(row.select_one('#content-right p').text[12:])
        except Exception:
            logger.error("Error parsing downloads")

        # Get year
        try:
            year = int(fullTitle.split("(")[1].split(")")[0])
        except Exception:
            year = None
            logger.error("Error parsing year")

        # Get imdbId
        sub_imdb_id = self.getImdbIdFromSubtitle(row)

        comments = ''
        try:
            comments = comment_rows[index].text
            logger.debug('Comments: {}'.format(comments))
        except Exception:
            logger.error("Error parsing comments")

        # Get Page Link
        try:
            page_link = row.select_one('#content-main a')['href']
        except Exception:
            logger.error("Error parsing page_link")

        episode_number = video.episode if isinstance(video, Episode) else None

        subtitle = self.subtitle_class(next(iter(languages)), download_link,
                                       index, comments, title, sub_imdb_id,
                                       uploader, page_link, year, downloads,
                                       isinstance(video, Episode),
                                       episode_number)
        logger.debug('Found subtitle %r', str(subtitle))
        subtitles.append(subtitle)

    ordered_subs = self.order(subtitles)

    return ordered_subs
def query(self, keyword, season=None, episode=None, year=None):
    """Search the provider and build subtitles from the detail pages.

    :param str keyword: series/movie name to search for.
    :param int season: season number, used together with `episode`.
    :param int episode: episode number.
    :param int year: release year, used when no season/episode is given.
    :return: the subtitles found.
    :rtype: list
    """
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season,
                                                        episode=episode)
    elif year:
        params += ' {:4d}'.format(year)

    logger.debug('Searching subtitles %r', params)
    subtitles = []
    search_link = self.server_url + text_type(
        self.search_url).format(params)

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser'])

    for entity in soup.select('div.item.prel.clearfix a:nth-of-type(2)'):
        entity_url = self.server_url + entity['href']
        logger.debug(entity_url)
        r = self.session.get(entity_url, timeout=30)
        r.raise_for_status()
        logger.debug('looking into ' + entity_url)

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser']).find(
                                       "div", class_="subs box clearfix")

        # loop over subtitles cells
        subs = soup.tbody.find_all("tr")
        for sub in subs:
            # do NOT .encode() these values: on Python 3 mixing bytes into
            # '%s' formatting embeds the b'...' repr and breaks the URL,
            # and 'English' in a list of bytes never matches
            page_link = '%s%s' % (self.server_url, sub.a.get('href'))
            version = sub.a.text or None
            if version is None:
                version = ""
            try:
                td = sub.find("td", class_="tac lang")
                r2 = td.find_all("img")
                langs = [x.get('title') for x in r2]
            except Exception:
                langs = '未知'

            # a subtitle with English but no Chinese variant is English-only
            if ('English' in langs) and not (('简体中文' in langs) or
                                             ('繁體中文' in langs)):
                language = Language('eng')
            else:
                language = Language('zho')

            # read the item
            subtitle = self.subtitle_class(
                language, page_link, version,
                page_link.replace("detail", "dld"))
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

    return subtitles
def query(self, show_id, series, season, episode, title):
    """Query the provider for the subtitles of an episode or a movie.

    :param show_id: provider id of the show/movie.
    :param str series: series name (episode mode only).
    :param int season: season number (episode mode only).
    :param int episode: episode number (episode mode only).
    :param str title: title, used to select movie mode when no episode info.
    :return: the subtitles found on the page.
    :rtype: list
    """
    # get the season list of the show
    logger.info('Getting the subtitle list of show id %s', show_id)

    is_episode = all((show_id, season, episode))
    if is_episode:
        page_link = self.server_url + self.episode_link.format(
            show_id=show_id, season=season, episode=episode)
    elif all((show_id, title)):
        page_link = self.server_url + self.movie_link.format(show_id)
    else:
        return []

    r = self.session.get(page_link, timeout=10)
    # a missing show/episode is not an error condition
    if r.status_code == 404:
        return []
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # the year is only displayed on movie pages
    year = int(soup.select_one('span.year').text) if not is_episode else None

    greek = Language.fromalpha2('el')
    subtitles = []
    # loop over episode rows
    for row in soup.select('div[id="subtitles"] tr[data-id]'):
        # read common info
        version = row.find('td', {'class': 'name'}).text
        download_link = row.find(
            'a', {'class': 'btn-success'})['href'].strip('\'')

        if is_episode:
            # read the episode info
            episode_numbers = soup.select_one(
                '#summary-wrapper > div.container.summary span.main-title-sxe'
            ).text
            season = episode = None
            matches = episode_re.match(episode_numbers.strip())
            if matches:
                season = int(matches.group(1))
                episode = int(matches.group(2))

            series = soup.select_one(
                '#summary-wrapper > div.summary h2 > a').string.strip()
            title = soup.select_one(
                '#summary-wrapper > div.container.summary span.main-title'
            ).text

            subtitle = self.subtitle_class(greek, page_link, series, season,
                                           episode, title, year, version,
                                           download_link)
        else:
            # read the movie info
            title = str(
                soup.select_one('#summary-wrapper > div.summary h1').
                contents[0]).strip()
            subtitle = self.subtitle_class(greek, page_link, None, None,
                                           None, title, year, version,
                                           download_link)

        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles