def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict

    # patch: add punctuation cleaning
    """
    # get the show page
    logger.info('Getting show ids')
    r = self.session.get(self.server_url + 'shows.php', timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # populate the show ids
    show_ids = {}
    for show in soup.select('td.version > h3 > a[href^="/show/"]'):
        show_clean = self.clean_punctuation(show.text.lower())
        show_id = int(show['href'][6:])
        show_ids[show_clean] = show_id
        match = series_year_re.match(show_clean)
        if match and match.group(2) and match.group(1) not in show_ids:
            # year found, also add it without year
            show_ids[match.group(1)] = show_id
    logger.debug('Found %d show ids', len(show_ids))

    return show_ids
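# Hedged sketch: `series_year_re` is used throughout this section but never
# defined in it. A plausible definition, consistent with both the positional
# access above (group(1) = series, group(2) = year) and the named access used
# further down (group('series'), group('year')):
import re

series_year_re = re.compile(r"^(?P<series>[ \w'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$")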
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict

    # patch: add punctuation cleaning
    """
    # get the show page
    logger.info('Getting show ids')
    r = self.retry(lambda: self.session.get(self.server_url + 'shows.php', timeout=10))
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # populate the show ids
    show_ids = {}
    for show in soup.select('td.version > h3 > a[href^="/show/"]'):
        show_clean = self.clean_punctuation(show.text.lower())
        try:
            show_id = int(show['href'][6:])
        except ValueError:
            continue

        show_ids[show_clean] = show_id
        match = series_year_re.match(show_clean)
        if match and match.group(2) and match.group(1) not in show_ids:
            # year found, also add it without year
            show_ids[match.group(1)] = show_id
    logger.debug('Found %d show ids', len(show_ids))

    return show_ids
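# Hedged sketch: `self.retry` is assumed above but not shown in this section.
# A minimal helper with the same call shape (a zero-argument callable in, its
# result out) might retry on transient connection errors; the attempt count
# and backoff below are illustrative assumptions, not confirmed behavior:
import time

import requests


def retry(self, callable_, amount=3, delay=2):
    for attempt in range(amount):
        try:
            return callable_()
        except requests.exceptions.ConnectionError:
            # last attempt failed too: propagate; otherwise back off and retry
            if attempt == amount - 1:
                raise
            time.sleep(delay * (attempt + 1))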
def _search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param str series: series of the episode.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the show id, if found.
    :rtype: int or None

    # patch: add punctuation cleaning
    """
    # build the params
    series_year = '%s (%d)' % (series, year) if year is not None else series
    params = {'search': series_year, 'Submit': 'Search'}

    # make the search
    logger.info('Searching show ids with %r', params)
    r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get the suggestion
    suggestion = soup.select('span.titulo > a[href^="/show/"]')
    if not suggestion:
        logger.warning('Show id not found: no suggestion')
        return None
    if not self.full_clean(suggestion[0].i.text.lower()) == self.full_clean(series_year.lower()):
        logger.warning('Show id not found: suggestion does not match')
        return None

    show_id = int(suggestion[0]['href'][6:])
    logger.debug('Found show id %d', show_id)

    return show_id
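# Hedged sketch: `clean_punctuation` and `full_clean` -- the "punctuation
# cleaning" the patch markers refer to -- are assumed by the two helpers
# above. Plausible minimal forms; parentheses are deliberately kept so that
# series_year_re can still find a trailing "(YYYY)" in the cleaned name:
import re


def clean_punctuation(self, s):
    # drop punctuation except parentheses (assumed behavior)
    return re.sub(r"[^a-z0-9() ]", "", s)


def full_clean(self, s):
    # punctuation cleaning plus whitespace collapsing (assumed behavior)
    return re.sub(r"\s+", " ", self.clean_punctuation(s)).strip()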
def query(self, show_id, series, season, year=None, country=None):
    # patch: fix logging

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'ajax_loadShow.php',
                         params={'show': show_id, 'season': season},
                         timeout=10,
                         headers={
                             "referer": "%sshow/%s" % (self.server_url, show_id),
                             "X-Requested-With": "XMLHttpRequest"
                         })
    r.raise_for_status()

    if r.status_code == 304:
        raise TooManyRequests()

    if not r.content:
        # the provider wrongfully returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise an exception for that status code
        logger.error('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title,
                                       year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles
def query(self, series, season, year=None, country=None):
    # patch: fix logging

    # get the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
        return []

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.retry(
        lambda: self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10))
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = PatchedAddic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title,
                                           year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
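# Hedged usage sketch: `provider` stands for an initialized (and, per the
# provider contract, logged-in) instance; the series and season are examples:
subtitles = provider.query('Game of Thrones', 3, year=2011)
for subtitle in subtitles:
    logger.debug('Candidate subtitle: %r', subtitle)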
def _search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param str series: series of the episode.
    :param year: year of the series, if any.
    :type year: int
    :return: the show id, if found.
    :rtype: int
    """
    # addic7ed doesn't support search with quotes
    series = series.replace('\'', ' ')

    # build the params
    series_year = '%s %d' % (series, year) if year is not None else series
    params = {'search': series_year, 'Submit': 'Search'}

    # make the search
    logger.info('Searching show ids with %r', params)
    r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
    r.raise_for_status()

    if r.status_code == 304:
        raise TooManyRequests()

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get the suggestion
    try:
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None

        if not sanitize(suggestion[0].i.text.replace('\'', ' '),
                        default_characters=self.sanitize_characters) == \
                sanitize(series_year, default_characters=self.sanitize_characters):
            logger.warning('Show id not found: suggestion does not match')
            return None

        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id
    finally:
        # decomposing the soup also releases the suggestion tags
        soup.decompose()
        soup = None
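# Hedged sketch: `sanitize(..., default_characters=...)` is assumed above.
# One plausible implementation: lower-case, replace a configurable character
# set with spaces, and collapse whitespace. The real character set lives in
# `self.sanitize_characters`, which this section does not show:
import re


def sanitize(string, default_characters=frozenset("-:().")):
    if string is None:
        return None
    for char in default_characters:
        string = string.replace(char, ' ')
    return re.sub(r'\s+', ' ', string).strip().lower()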
def get_movie_id(self, movie, year=None):
    """Get the best matching movie id for `movie`, `year`.

    :param str movie: movie.
    :param year: year of the movie, if any.
    :type year: int
    :return: the movie id, if found.
    :rtype: int
    """
    movie_id = None

    # get the movie id
    logger.info('Getting movie id')
    r = self.session.get(self.server_url + 'search.php?search=' + quote_plus(movie), timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # populate the movie id
    movies_table = soup.find('table', {'class': 'tabel'})
    if movies_table is None:
        soup.decompose()
        logger.debug('Addic7ed: no result table found for %s', movie)
        return None

    movies = movies_table.find_all('tr')
    for item in movies:
        link = item.find('a', href=True)
        if link and link['href'].startswith('movie/'):
            splitted_uri = link['href'].split('/')
            if len(splitted_uri) == 2:
                media_id = splitted_uri[1]
            else:
                continue

            media_title = link.text
            match = re.search(r'(.+)\s\((\d{4})\)$', media_title)
            if match:
                media_name = match.group(1)
                media_year = match.group(2)
                if sanitize(media_name.lower()) == sanitize(movie.lower()) and media_year == str(year):
                    movie_id = media_id

    soup.decompose()
    soup = None

    logger.debug(f'Found this movie id: {movie_id}')

    if not movie_id:
        logger.debug(f"Addic7ed: Cannot find this movie with guessed year {year}: {movie}")

    return movie_id
def query(self, series, season, year=None, country=None):
    # patch: fix logging

    # get the show id
    show_id = self.get_show_id(series, year, country)
    if show_id is None:
        logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
        return []

    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get the series name and year from the page header
    header = soup.select('#header font')
    if header:
        match = series_year_re.match(header[0].text.strip()[:-10])
        if match:
            series = match.group('series')
            year = int(match.group('year')) if match.group('year') else None

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('tr.epeven'):
        cells = row('td')

        # ignore incomplete subtitles
        status = cells[5].text
        if status != 'Completed':
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item
        language = Language.fromaddic7ed(cells[3].text)
        hearing_impaired = bool(cells[6].text)
        page_link = self.server_url + cells[2].a['href'][1:]
        season = int(cells[0].text)
        episode = int(cells[1].text)
        title = cells[2].text
        version = cells[4].text
        download_link = cells[9].a['href'][1:]

        subtitle = PatchedAddic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title,
                                           year, version, download_link)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    return subtitles
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict

    # patch: add punctuation cleaning
    """
    # get the show page
    logger.info('Getting show ids')
    region.set(self.last_show_ids_fetch_key, datetime.datetime.now())

    r = self.session.get(self.server_url, timeout=10)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['lxml', 'html.parser'])

    # populate the show ids
    show_ids = {}
    shows = soup.find(id='qsShow')
    for show in shows:
        if hasattr(show, 'attrs'):
            try:
                show_id = int(show.attrs['value'])
            except ValueError:
                continue

            if show_id != 0:
                show_clean = sanitize(show.text, default_characters=self.sanitize_characters)
                show_ids[show_clean] = show_id
                match = series_year_re.match(show_clean)
                if match and match.group(2) and match.group(1) not in show_ids:
                    # year found, also add it without year
                    show_ids[match.group(1)] = show_id

    soup.decompose()
    soup = None

    logger.debug('Found %d show ids', len(show_ids))

    if not show_ids:
        raise Exception("Addic7ed: No show IDs found!")

    return show_ids
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict

    # patch: add punctuation cleaning
    """
    # get the show page
    logger.info('Getting show ids')
    r = self.session.get(self.server_url + 'shows.php', timeout=10)
    r.raise_for_status()

    # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
    # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
    # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
    show_cells = re.findall(show_cells_re, r.content)
    if show_cells:
        soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
    else:
        # If RegEx fails, fall back to original r.content and use 'html.parser'
        soup = ParserBeautifulSoup(r.content, ['html.parser'])

    # populate the show ids
    show_ids = {}
    for show in soup.select('td > h3 > a[href^="/show/"]'):
        show_clean = sanitize(show.text, default_characters=self.sanitize_characters)
        try:
            show_id = int(show['href'][6:])
        except ValueError:
            continue

        show_ids[show_clean] = show_id
        match = series_year_re.match(show_clean)
        if match and match.group(2) and match.group(1) not in show_ids:
            # year found, also add it without year
            show_ids[match.group(1)] = show_id

    soup.decompose()
    soup = None

    logger.debug('Found %d show ids', len(show_ids))

    return show_ids
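# Hedged sketch: `show_cells_re` is assumed above. It is applied to the raw
# `r.content` bytes, so it must be a bytes pattern; one plausible form that
# isolates the cells later matched by 'td > h3 > a[href^="/show/"]':
import re

show_cells_re = re.compile(b'<td class="(?:version|vr)">.*?</td>', re.DOTALL)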
def _get_show_ids(self):
    """Get the ``dict`` of show ids per series by querying the `shows.php` page.

    :return: show id per series, lower case and without quotes.
    :rtype: dict

    # patch: add punctuation cleaning
    """
    # get the show page
    logger.info('Getting show ids')
    region.set(self.last_show_ids_fetch_key, datetime.datetime.now())

    r = self.session.get(self.server_url + 'shows.php', timeout=10)
    r.raise_for_status()

    # LXML parser seems to fail when parsing Addic7ed.com HTML markup.
    # Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
    # Assuming the site's markup is bad, and stripping it down to only contain what's needed.
    show_cells = re.findall(show_cells_re, r.content)
    if show_cells:
        soup = ParserBeautifulSoup(
            b''.join(show_cells).decode('utf-8', 'ignore'), ['lxml', 'html.parser'])
    else:
        # If RegEx fails, fall back to original r.text and use 'html.parser'
        soup = ParserBeautifulSoup(r.text, ['html.parser'])

    # populate the show ids
    show_ids = {}
    shows = soup.select('td > h3 > a[href^="/show/"]')
    for show in shows:
        show_clean = sanitize(show.text, default_characters=self.sanitize_characters)
        try:
            show_id = int(show['href'][6:])
        except ValueError:
            continue

        show_ids[show_clean] = show_id
        match = series_year_re.match(show_clean)
        if match and match.group(2) and match.group(1) not in show_ids:
            # year found, also add it without year
            show_ids[match.group(1)] = show_id

    soup.decompose()
    soup = None

    logger.debug('Found %d show ids', len(show_ids))

    if not show_ids:
        raise Exception("Addic7ed: No show IDs found!")

    return show_ids
def _search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param str series: series of the episode.
    :param year: year of the series, if any.
    :type year: int
    :return: the show id, if found.
    :rtype: int
    """
    # addic7ed doesn't support search with quotes
    series = series.replace('\'', ' ')

    # build the params
    series_year = '%s %d' % (series, year) if year is not None else series
    params = {'search': series_year, 'Submit': 'Search'}

    # make the search
    logger.info('Searching show ids with %r', params)

    # currently addic7ed searches via srch.php from the front page, then a re-search is needed which calls
    # search.php
    for endpoint in ("srch.php", "search.php",):
        headers = None
        if endpoint == "search.php":
            headers = {"referer": self.server_url + "srch.php"}

        r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers)
        r.raise_for_status()

        if r.text and "Sorry, your search" not in r.text:
            break

        time.sleep(4)

    if r.status_code == 304:
        raise TooManyRequests()

    soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])

    # get the suggestion
    try:
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None

        if not sanitize(suggestion[0].i.text.replace('\'', ' '),
                        default_characters=self.sanitize_characters) == \
                sanitize(series_year, default_characters=self.sanitize_characters):
            logger.warning('Show id not found: suggestion does not match')
            return None

        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id
    finally:
        soup.decompose()
        soup = None
def _search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param str series: series of the episode.
    :param year: year of the series, if any.
    :type year: int
    :return: the show id, if found.
    :rtype: int
    """
    # addic7ed doesn't support search with quotes
    series = series.replace('\'', ' ')

    # build the params
    series_year = '%s %d' % (series, year) if year is not None else series
    params = {'search': series_year, 'Submit': 'Search'}

    # make the search
    logger.info('Searching show ids with %r', params)

    # currently addic7ed searches via srch.php from the front page, then a re-search is needed which calls
    # search.php
    for endpoint in ("srch.php", "search.php",):
        headers = None
        if endpoint == "search.php":
            headers = {"referer": self.server_url + "srch.php"}

        r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers)
        r.raise_for_status()

        # r.content is bytes, so compare against a bytes literal
        if r.content and b"Sorry, your search" not in r.content:
            break

        time.sleep(4)

    if r.status_code == 304:
        raise TooManyRequests()

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # get the suggestion
    try:
        suggestion = soup.select('span.titulo > a[href^="/show/"]')
        if not suggestion:
            logger.warning('Show id not found: no suggestion')
            return None

        if not sanitize(suggestion[0].i.text.replace('\'', ' '),
                        default_characters=self.sanitize_characters) == \
                sanitize(series_year, default_characters=self.sanitize_characters):
            logger.warning('Show id not found: suggestion does not match')
            return None

        show_id = int(suggestion[0]['href'][6:])
        logger.debug('Found show id %d', show_id)

        return show_id
    finally:
        soup.decompose()
        soup = None
def query_movie(self, movie_id, title, year=None):
    # get the page of the movie
    logger.info('Getting the page of movie id %s', movie_id)
    r = self.session.get(self.server_url + 'movie/' + movie_id,
                         timeout=10,
                         headers={
                             "referer": self.server_url,
                             "X-Requested-With": "XMLHttpRequest"
                         })
    r.raise_for_status()

    if r.status_code == 304:
        raise TooManyRequests()

    if not r.text:
        # the provider wrongfully returns a status of 304 Not Modified with an empty content
        # raise_for_status won't raise an exception for that status code
        logger.error('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    tables = []
    subtitles = []
    for table in soup.find_all('table', {'align': 'center', 'border': '0', 'class': 'tabel95',
                                         'width': '100%'}):
        if table.find_all('td', {'class': 'NewsTitle'}):
            tables.append(table)

    for table in tables:
        row1 = table.contents[1]
        row2 = table.contents[4]
        row3 = table.contents[6]
        # other rows are useless

        # ignore incomplete subtitles
        status = row2.contents[6].text
        if "%" in status:
            logger.debug('Ignoring subtitle with status %s', status)
            continue

        # read the item
        try:
            language = Language.fromaddic7ed(row2.contents[4].text.strip('\n'))
        except babelfish.exceptions.LanguageReverseError as error:
            logger.debug("Language error: %s, Ignoring subtitle", error)
            continue

        hearing_impaired = bool(row3.contents[1].contents[1].attrs['src'].endswith('hi.jpg'))
        page_link = self.server_url + 'movie/' + movie_id

        # Seems like Addic7ed returns the first word in the language of the user (Version, Versión, etc)
        # As we can't match a regex, we will just strip the first word
        try:
            version = " ".join(str(row1.contents[1].contents[1]).split()[1:])
            version_matches = re.search(r"(.+),.+", version)
            version = version_matches.group(1) if version_matches else None
        except IndexError:
            version = None

        try:
            download_link = row2.contents[8].contents[3].attrs['href'][1:]
        except IndexError:
            download_link = row2.contents[8].contents[2].attrs['href'][1:]

        uploader = row1.contents[2].contents[8].text.strip()

        # set subtitle language to hi if it's hearing_impaired
        if hearing_impaired:
            language = Language.rebuild(language, hi=True)

        subtitle = self.subtitle_class(language, hearing_impaired, page_link, None, None, None, title, year,
                                       version, download_link, uploader)
        logger.debug('Found subtitle %r', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles
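# Hedged usage sketch tying the movie helpers together; `provider` stands for
# an initialized instance, and the title/year are illustrative:
movie_id = provider.get_movie_id('Inception', year=2010)
if movie_id is not None:
    subtitles = provider.query_movie(movie_id, 'Inception', year=2010)
    logger.debug('Found %d movie subtitles', len(subtitles))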