def search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param string series: series of the episode.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the show id, if any.
    :rtype: int or None

    """
    # make the search
    logger.info('Searching show id for %r', series)
    r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10)
    r.raise_for_status()

    # get the series out of the suggestions
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
    show_id = None
    for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
        match = link_re.match(suggestion.text)
        if not match:
            logger.error('Failed to match %s', suggestion.text)
            continue

        if sanitize(match.group('series')).lower() == series.lower():
            if year is not None and int(match.group('first_year')) != year:
                logger.debug('Year does not match')
                continue
            show_id = int(suggestion['href'][8:-5])
            logger.debug('Found show id %d', show_id)
            break

    soup.decompose()
    soup = None

    return show_id
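
# `link_re` is referenced above but defined elsewhere in the module. A minimal
# sketch of a pattern consistent with the groups search_show_id() consumes
# ('series' and 'first_year') could look like this; it is an assumption, the
# provider's actual pattern may differ:
#
#   link_re = re.compile(r'^(?P<series>.+?) \((?P<first_year>\d{4})-(?:\d{4})?\)$')
#
# For example, matching 'Breaking Bad (2008-2013)' would yield
# series='Breaking Bad' and first_year='2008'.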


def query(self, show_id, series, season, episode, year=None):
    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # the provider doesn't store multi-episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode,
                                       year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles
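
# The magic slice offsets above assume hrefs and image paths of a fixed shape
# (these example values are illustrative, not captured provider output):
#
#   '/subtitle-1234.html'[10:-5]  -> '1234'  (strips '/subtitle-' and '.html')
#   'images/flags/en.gif'[13:-4]  -> 'en'    (strips 'images/flags/' and '.gif')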


def query(self, series, season, episode, year=None):
    # search the show id
    show_id = self.search_show_id(series, year)
    if show_id is None:
        logger.info('No show id found for %r (%r)', series, {'year': year})
        return []

    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # the provider doesn't store multi-episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode,
                                       year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles


def get_episode_ids(self, show_id, season):
    """Get episode ids from the show id and the season.

    :param int show_id: show id.
    :param int season: season of the episode.
    :return: episode ids per episode number.
    :rtype: dict

    """
    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season), timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over episode rows
    episode_ids = {}
    for row in soup.select('table#table5 tr'):
        # skip rows that do not have a link to the episode page
        if not row('a', href=episode_id_re):
            continue

        # extract data from the cells
        cells = row('td')
        episode = int(cells[0].text.split('x')[1])
        episode_id = int(cells[1].a['href'][8:-5])
        episode_ids[episode] = episode_id

    if episode_ids:
        logger.debug('Found episode ids %r', episode_ids)
    else:
        logger.warning('No episode ids found')

    soup.decompose()
    soup = None

    return episode_ids
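
# `episode_id_re` is referenced above but defined elsewhere in the module.
# BeautifulSoup applies a compiled pattern to each candidate href via
# re.search(), so a pattern consistent with the 'episode-<id>.html' links that
# the [8:-5] slice expects could be (an assumption, not the provider's actual
# regex):
#
#   episode_id_re = re.compile(r'^episode-\d+\.html$')
#
# e.g. 'episode-31278.html'[8:-5] -> '31278'.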


def query(self, keyword, season=None, episode=None, year=None, video=None):
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        params += '&ARok={:4d}'.format(year)
    logger.debug('Searching subtitles %r', params)

    subtitles = []
    if season and episode:
        search_link = self.server_url + text_type(self.search_url_series).format(params)
    elif year:
        search_link = self.server_url + text_type(self.search_url_movies).format(params)
    else:
        # neither an episode nor a movie search, nothing to query
        logger.debug('Not enough information to search, skipping')
        return []

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser']).find("table", class_="main_table")

    # loop over subtitle cells
    if soup:
        subs = soup.find_all("tr", class_="row1")
        subs += soup.find_all("tr", class_="row2")
        for sub in subs:
            page_link = '%s%s' % (self.server_url, sub.a.get('href'))
            title = [x.text for x in sub.find_all('td')[0:1]]

            version = sub.find(class_="fixedTip")
            if version is None:
                version = ""
            else:
                version = version['title']

            try:
                langs = [x.text for x in sub.find_all('td')[6:7]]
            except Exception:
                langs = ['CZ']

            if 'CZ' in langs:
                language = Language('ces')
            elif 'SK' in langs:
                language = Language('slk')
            else:
                # unknown language, skip the row
                continue

            download_link = sub.find('a', class_='titulkydownloadajax')
            download_link = self.download_url + download_link.get('href')

            # read the item
            subtitle = self.subtitle_class(language, page_link, season, episode, version,
                                           download_link, year, title,
                                           asked_for_release_group=video.release_group,
                                           asked_for_episode=episode)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        soup.decompose()
        soup = None

    return subtitles
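
# Examples of the query strings built above (illustrative values):
#
#   query('The Wire', season=1, episode=5)  -> params == 'The Wire S01E05'
#   query('Inception', year=2010)           -> params == 'Inception&ARok=2010'
#
# Note that in the movie case the year is appended to the fulltext value
# itself rather than sent as a separate query parameter.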


def query(self, language, video_names, type, keyword=None, year=None, season=None, episode=None,
          imdb_id=None):
    ## Build the search URL
    params = {}

    # Keyword
    if keyword:
        params['Fulltext'] = keyword
    # Video type
    if type == 'episode':
        params['Serial'] = 'S'
    else:
        params['Serial'] = 'F'
    # Season / Episode
    if season:
        params['Sezona'] = season
    if episode:
        params['Epizoda'] = episode
    # IMDB ID
    if imdb_id:
        params['IMDB'] = imdb_id[2:]  # Remove the "tt" prefix from the IMDB id
    # Year
    if year:
        params['Rok'] = year
    # Language
    if language == Language('ces'):
        params['Jazyk'] = 'CZ'
    elif language == Language('slk'):
        params['Jazyk'] = 'SK'
    elif language is None:
        params['Jazyk'] = ''
    else:
        return []
    # Status
    if self.approved_only:
        logger.debug("Titulky.com: Searching only for approved subtitles")
        params['ASchvalene'] = '1'
    else:
        params['ASchvalene'] = ''

    search_url = self.build_search_url(params)

    ## Search results page parsing
    html_src = self.fetch_page(search_url)
    search_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    # A message containing "Žádný odpovídající záznam" ("No matching record")
    # means there are no results, so we return an empty list
    error_message = search_page_soup.select('.panel-body > strong')
    if len(error_message) > 0 and 'Žádný odpovídající záznam' in error_message[0].get_text(strip=True):
        logger.info("Titulky.com: No results found")
        return []

    # Get the table containing the search results
    table = search_page_soup.find('table', class_='table')
    if not table:
        logger.debug("Titulky.com: Could not find table")
        raise ParseResponseError("Could not find table. Did the HTML source change?")

    # Get the table body containing rows of subtitles
    table_body = table.find('tbody')
    if not table_body:
        logger.debug("Titulky.com: Could not find table body")
        raise ParseResponseError("Could not find table body. Did the HTML source change?")

    ## Loop over all subtitles on the first page and put them in a list
    subtitles = []
    rows = table_body.find_all('tr')

    if not self.multithreading:
        # Process the rows sequentially
        logger.info("Titulky.com: processing results in sequence")
        for i, row in enumerate(rows):
            sub_info = self.process_row(row, video_names, search_url)

            # If subtitle info was returned, everything went fine and
            # we can instantiate it and add it to the list
            if sub_info:
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, row: {i}")

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season,
                    episode, sub_info['year'], sub_info['releases'], sub_info['fps'],
                    sub_info['uploader'], sub_info['approved'], sub_info['details_link'],
                    sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # No subtitle info was returned, i.e. something unexpected happened
                # while fetching and processing the subtitle details page
                logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
    else:
        # Process the rows in parallel
        logger.info(f"Titulky.com: processing results in parallel, {self.max_threads} rows at a time.")

        threads = [None] * len(rows)
        threads_data = [None] * len(rows)

        # Process rows in parallel, self.max_threads at a time.
        cycles = math.ceil(len(rows) / self.max_threads)
        for i in range(cycles):
            # Batch number i
            starting_index = i * self.max_threads  # Inclusive
            ending_index = starting_index + self.max_threads  # Non-inclusive

            # Create threads for all rows in this batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    # Row number j
                    logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
                    # Create a thread for row j and start it
                    threads[j] = Thread(target=self.process_row,
                                        args=[rows[j], video_names, search_url],
                                        kwargs={'thread_id': j, 'threads_data': threads_data})
                    threads[j].start()

            # Wait for all threads created in this batch to finish before moving to the next batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    threads[j].join()

        # Process the resulting data from all threads
        for i in range(len(threads_data)):
            thread_data = threads_data[i]

            # The thread didn't return anything, even though a dict object was expected
            if not thread_data:
                raise ProviderError(f"No data returned from thread ID: {i}")

            # If an exception was raised in a thread, raise it again here
            if 'exception' in thread_data and thread_data['exception']:
                logger.debug(f"Titulky.com: An error occurred while processing a row in the thread ID {i}")
                raise thread_data['exception']

            # If the thread returned subtitle info, instantiate it and add it to the list
            if 'sub_info' in thread_data and thread_data['sub_info']:
                # Instantiate the subtitle object
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, thread ID: {i}")
                sub_info = thread_data['sub_info']

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season,
                    episode, sub_info['year'], sub_info['releases'], sub_info['fps'],
                    sub_info['uploader'], sub_info['approved'], sub_info['details_link'],
                    sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # The thread returned data but no subtitle info, i.e. something unexpected
                # happened while fetching and processing the subtitle details page
                logger.debug(f"Titulky.com: No subtitle info retrieved, thread ID: {i}")

    # Clean up
    search_page_soup.decompose()
    search_page_soup = None

    logger.debug(f"Titulky.com: Found subtitles: {subtitles}")

    return subtitles
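
# The batching above assumes process_row() fills threads_data[thread_id] when
# called with the thread_id/threads_data kwargs. A sketch of the assumed
# contract (not the actual implementation):
#
#   def process_row(self, row, video_names, ref_url, thread_id=None, threads_data=None):
#       try:
#           sub_info = ...  # parse the row, fetching the details page if needed
#           if threads_data is not None:
#               threads_data[thread_id] = {'sub_info': sub_info, 'exception': None}
#           return sub_info
#       except Exception as e:
#           if threads_data is not None:
#               threads_data[thread_id] = {'sub_info': None, 'exception': e}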


def parse_details(self, details_url, search_url):
    html_src = self.fetch_page(details_url, ref=search_url)
    details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    details_container = details_page_soup.find('div', class_='detail')
    if not details_container:
        # The subtitles could have been removed and we got redirected to a different page.
        # Better to treat this silently.
        logger.info("Titulky.com: Could not find details div container. Skipping.")
        return False

    ### IMDB ID
    imdb_id = None
    imdb_tag = details_container.find('a', attrs={'target': 'imdb'})

    if imdb_tag:
        imdb_url = imdb_tag.get('href')
        imdb_id = re.findall(r'tt(\d+)', imdb_url)[0]

    if not imdb_id:
        logger.debug("Titulky.com: No IMDB ID supplied on details page.")

    ### RELEASE
    release = None
    release_tag = details_container.find('div', class_='releas')

    if not release_tag:
        raise ParseResponseError("Could not find release tag. Did the HTML source change?")

    release = release_tag.get_text(strip=True)

    if not release:
        logger.debug("Titulky.com: No release information supplied on details page.")

    ### LANGUAGE
    language = None
    czech_flag = details_container.select('img[src*=\'flag-CZ\']')
    slovak_flag = details_container.select('img[src*=\'flag-SK\']')

    if czech_flag and not slovak_flag:
        language = Language('ces')
    elif slovak_flag and not czech_flag:
        language = Language('slk')

    if not language:
        logger.debug("Titulky.com: No language information supplied on details page.")

    ### UPLOADER
    uploader = None
    uploader_tag = details_container.find('div', class_='ulozil')

    if not uploader_tag:
        raise ParseResponseError("Could not find uploader tag. Did the HTML source change?")

    uploader_anchor_tag = uploader_tag.find('a')

    if not uploader_anchor_tag:
        raise ParseResponseError("Could not find uploader anchor tag. Did the HTML source change?")

    uploader = uploader_anchor_tag.get_text(strip=True)

    if not uploader:
        logger.debug("Titulky.com: No uploader name supplied on details page.")

    ### FPS
    fps = None
    fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')

    if not fps_icon_tag_selection or not hasattr(fps_icon_tag_selection[0], 'parent'):
        raise ParseResponseError("Could not find parent of the fps icon tag. Did the HTML source change?")

    fps_icon_tag = fps_icon_tag_selection[0]
    parent_text = fps_icon_tag.parent.get_text(strip=True)
    match = re.findall(r'(\d+,\d+) fps', parent_text)

    # If a match is found, change the decimal separator to a dot and convert to float
    fps = float(match[0].replace(',', '.')) if len(match) > 0 else None

    if not fps:
        logger.debug("Titulky.com: No fps supplied on details page.")

    ### YEAR
    year = None
    h1_tag = details_container.find('h1', id='titulky')

    if not h1_tag:
        raise ParseResponseError("Could not find h1 tag. Did the HTML source change?")

    # The h1 tag contains the name of the subtitle and a year
    h1_texts = [text for text in h1_tag.stripped_strings]
    year = int(h1_texts[1]) if len(h1_texts) > 1 else None

    if not year:
        logger.debug("Titulky.com: No year supplied on details page.")

    # Clean up
    details_page_soup.decompose()
    details_page_soup = None

    # Return the subtitle details
    return {
        'releases': [release],
        'language': language,
        'uploader': uploader,
        'fps': fps,
        'year': year,
        'imdb_id': imdb_id
    }
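
# The fps parsing above expects a comma as the decimal separator, e.g.
# (illustrative input, not captured provider output):
#
#   re.findall(r'(\d+,\d+) fps', 'WEB-DL, 23,976 fps')  -> ['23,976']
#   float('23,976'.replace(',', '.'))                   -> 23.976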