def _get_download_link(self, subtitle):
    """Fetch the subtitle's details page and return its absolute download URL.

    :param subtitle: subtitle whose ``page_link`` points at the details page.
    :return: absolute download URL (``self.server_url`` + relative href).
    :raise ParseResponseError: if the page cannot be parsed or no
        download link is found.
    """
    response = self.session.get(subtitle.page_link, timeout=10)
    self._check_response(response)
    try:
        # The site serves latin-1 encoded pages; ignore undecodable bytes.
        page_soup = ParserBeautifulSoup(
            response.content.decode('iso-8859-1', 'ignore'),
            ['lxml', 'html.parser'])
        links_soup = page_soup.find_all("a", {'class': 'detalle_link'})
        for link_soup in links_soup:
            # Only anchors pointing at the download action ('bajar') count.
            if link_soup['href'].startswith('bajar'):
                return self.server_url + link_soup['href']
    except Exception as e:
        # Chain the original exception so the real parse failure is
        # preserved in the traceback (the bare re-raise discarded it).
        raise ParseResponseError(
            'Error parsing download link: ' + str(e)) from e
    raise ParseResponseError('Download link not found')
def _get_archive(self, content):
    """Wrap downloaded bytes in an archive reader.

    :param content: raw bytes of the downloaded archive.
    :return: an opened :class:`rarfile.RarFile` or :class:`zipfile.ZipFile`.
    :raise ParseResponseError: when the payload is neither rar nor zip.
    """
    # open the archive from an in-memory stream
    stream = io.BytesIO(content)
    if rarfile.is_rarfile(stream):
        logger.debug('Identified rar archive')
        return rarfile.RarFile(stream)
    if zipfile.is_zipfile(stream):
        logger.debug('Identified zip archive')
        return zipfile.ZipFile(stream)
    raise ParseResponseError('Unsupported compressed format')
def _get_subtitle_from_archive(self, archive):
    """Return the bytes of the first subtitle file found in *archive*.

    Hidden files and files without a known subtitle extension are skipped.

    :raise ParseResponseError: when no subtitle file is present.
    """
    for member in archive.namelist():
        # skip hidden files (basename starts with a dot)
        basename = os.path.split(member)[-1]
        if basename.startswith('.'):
            continue
        # keep only files with a recognized subtitle extension
        if member.lower().endswith(SUBTITLE_EXTENSIONS):
            return archive.read(member)
    raise ParseResponseError(
        'Can not find the subtitle in the compressed file')
def _get_subtitle_from_archive(self, archive, subtitle):
    """Pick the best matching subtitle file in *archive* for *subtitle*.

    Each candidate file name is run through guessit and scored against
    ``subtitle.video``; the highest-scoring file's bytes are returned.

    :param archive: opened zip/rar archive.
    :param subtitle: subtitle whose ``video`` the candidates are scored against.
    :raise ParseResponseError: when no candidate scores above zero.
    """
    # some files have a non subtitle with .txt extension
    _tmp = list(SUBTITLE_EXTENSIONS)
    _tmp.remove('.txt')
    _subtitle_extensions = tuple(_tmp)
    _max_score = 0
    _max_name = None
    _scores = get_scores(subtitle.video)
    for name in archive.namelist():
        # discard hidden files
        if os.path.split(name)[-1].startswith('.'):
            continue
        # discard non-subtitle files
        if not name.lower().endswith(_subtitle_extensions):
            continue
        _guess = guessit(name)
        if isinstance(subtitle.video, Episode):
            logger.debug("guessing %s", name)
            # BUGFIX: use .get() — guessit may not detect a season/episode
            # in the file name at all, and indexing with [] raised KeyError
            # instead of simply skipping the non-matching file.
            logger.debug("subtitle S%sE%s video S%sE%s",
                         _guess.get('season'), _guess.get('episode'),
                         subtitle.video.season, subtitle.video.episode)
            if (subtitle.video.episode != _guess.get('episode')
                    or subtitle.video.season != _guess.get('season')):
                logger.debug('subtitle does not match video, skipping')
                continue
        matches = set()
        matches |= guess_matches(subtitle.video, _guess)
        logger.debug('srt matches: %s', matches)
        _score = sum(_scores.get(match, 0) for match in matches)
        if _score > _max_score:
            _max_name = name
            _max_score = _score
            logger.debug("new max: %s %s", name, _score)
    if _max_score > 0:
        logger.debug("returning from archive: %s scored %s",
                     _max_name, _max_score)
        return archive.read(_max_name)
    raise ParseResponseError(
        'Can not find the subtitle in the compressed file')
def _get_subtitle_from_archive(self, archive):
    """Return the bytes of the first subtitle file found in *archive*.

    ``.txt`` is excluded because some archives ship a plain-text info
    file with that extension.

    :raise ParseResponseError: when no subtitle file is present.
    """
    # some files have a non subtitle with .txt extension
    extensions = list(SUBTITLE_EXTENSIONS)
    extensions.remove('.txt')
    allowed_extensions = tuple(extensions)
    for member in archive.namelist():
        basename = os.path.split(member)[-1]
        # discard hidden files
        if basename.startswith('.'):
            continue
        # discard non-subtitle files
        if not member.lower().endswith(allowed_extensions):
            continue
        logger.debug("returning from archive: %s", member)
        return archive.read(member)
    raise ParseResponseError(
        'Can not find the subtitle in the compressed file')
def query(self, keyword, season=None, episode=None, year=None):
    """Search the provider for *keyword* and return all found subtitles.

    When *season* and *episode* are given, an ``SxxEyy`` tag is appended
    to the query; otherwise *year* is appended when given. Result pages
    hold up to 20 entries; fetching continues until a short page is seen.
    """
    query = keyword
    if season and episode:
        query += ' S{season:02d}E{episode:02d}'.format(season=season,
                                                       episode=episode)
    elif year:
        query += ' {:4d}'.format(year)

    params = {
        'buscar': query,  # search string
        'accion': 5,      # action search
        'oxdown': 1,      # order by downloads descending
        'pg': 1,          # page 1
    }

    logger.debug('Searching subtitles %r', query)
    subtitles = []
    language = self.language_list[0]
    search_link = self.server_url + 'index.php'
    while True:
        response = self.session.get(search_link, params=params, timeout=10)
        self._check_response(response)
        try:
            page_subtitles = self._parse_subtitles_page(response, language)
        except Exception as e:
            raise ParseResponseError('Error parsing subtitles list: ' +
                                     str(e))
        subtitles += page_subtitles
        # a short page means this was the last page of results
        if len(page_subtitles) < 20:
            break
        params['pg'] += 1  # search next page
        time.sleep(self.multi_result_throttle)
    return subtitles
def query(self,
          language,
          video_names,
          type,
          keyword=None,
          year=None,
          season=None,
          episode=None,
          imdb_id=None):
    """Search Titulky.com and return a list of subtitle instances.

    Builds a search URL from the given filters, parses the results table
    and processes each row either sequentially or in parallel batches of
    ``self.max_threads`` threads (per ``self.multithreading``).

    :param language: wanted ``Language`` — only 'ces'/'slk'/None are supported;
        any other language short-circuits to an empty result.
    :param video_names: names of the video, forwarded to ``process_row``.
    :param type: 'episode' for series, anything else searched as a film.
    :param keyword: optional full-text search string.
    :param year, season, episode, imdb_id: optional additional filters.
    :return: list of ``self.subtitle_class`` instances.
    :raise ParseResponseError: when an expected page element is missing.
    :raise ProviderError: when a worker thread produced no data.
    """
    ## Build the search URL
    params = {}

    # Keyword
    if keyword:
        params['Fulltext'] = keyword
    # Video type
    if type == 'episode':
        params['Serial'] = 'S'
    else:
        params['Serial'] = 'F'
    # Season / Episode
    if season:
        params['Sezona'] = season
    if episode:
        params['Epizoda'] = episode
    # IMDB ID
    if imdb_id:
        params['IMDB'] = imdb_id[2:]  # Remove the tt from the imdb id
    # Year
    if year:
        params['Rok'] = year
    # Language
    if language == Language('ces'):
        params['Jazyk'] = 'CZ'
    elif language == Language('slk'):
        params['Jazyk'] = 'SK'
    elif language == None:
        params['Jazyk'] = ''
    else:
        # Unsupported language: nothing to search for on this provider.
        return []
    # Status
    if self.approved_only:
        logger.debug(f"Titulky.com: Searching only for approved subtitles")
        params['ASchvalene'] = '1'
    else:
        params['ASchvalene'] = ''

    search_url = self.build_search_url(params)

    ## Search results page parsing
    html_src = self.fetch_page(search_url)
    search_page_soup = ParserBeautifulSoup(html_src,
                                           ['lxml', 'html.parser'])

    # If there is a message containing "Žádny odpovídající záznam", it means
    # that there are no results. If that's the case, return an empty list.
    error_message = search_page_soup.select('.panel-body > strong')
    if len(error_message) > 0 and 'Žádný odpovídající záznam' in \
            error_message[0].get_text(strip=True):
        logger.info("Titulky.com: No results found")
        return []

    # Get the table containing the search results
    table = search_page_soup.find('table', class_='table')
    if not table:
        logger.debug("Titulky.com: Could not find table")
        raise ParseResponseError(
            "Could not find table. Did the HTML source change?")

    # Get table body containing rows of subtitles
    table_body = table.find('tbody')
    if not table_body:
        logger.debug("Titulky.com: Could not find table body")
        raise ParseResponseError(
            "Could not find table body. Did the HTML source change?")

    ## Loop over all subtitles on the first page and put them in a list
    subtitles = []
    rows = table_body.find_all('tr')

    if not self.multithreading:
        # Process the rows sequentially
        logger.info("Titulky.com: processing results in sequence")
        for i, row in enumerate(rows):
            sub_info = self.process_row(row, video_names, search_url)

            # If subtitle info was returned, then everything was okay
            # and we can instationate it and add it to the list
            if sub_info:
                logger.debug(
                    f"Titulky.com: Sucessfully retrieved subtitle info, row: {i}"
                )

                # If we found the subtitle by IMDB ID, no need to get it from details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # No subtitle info was returned, i. e. something unexpected
                # happend during subtitle details page fetching and processing.
                logger.debug(
                    f"Titulky.com: No subtitle info retrieved, row: {i}")
    else:
        # Process the rows in paralell
        logger.info(
            f"Titulky.com: processing results in parelell, {self.max_threads} rows at a time."
        )

        # One slot per row; worker threads write their result into
        # threads_data[thread_id] (shared list, index-disjoint writes).
        threads = [None] * len(rows)
        threads_data = [None] * len(rows)

        # Process rows in parallel, self.max_threads at a time.
        cycles = math.ceil(len(rows) / self.max_threads)
        for i in range(cycles):
            # Batch number i
            starting_index = i * self.max_threads  # Inclusive
            ending_index = starting_index + self.max_threads  # Non-inclusive

            # Create threads for all rows in this batch
            for j in range(starting_index, ending_index):
                # Check if j-th row exists (last batch may be partial)
                if j < len(rows):
                    # Row number j
                    logger.debug(
                        f"Titulky.com: Creating thread {j} (batch: {i})")
                    # Create a thread for row j and start it
                    threads[j] = Thread(
                        target=self.process_row,
                        args=[rows[j], video_names, search_url],
                        kwargs={
                            'thread_id': j,
                            'threads_data': threads_data
                        })
                    threads[j].start()

            # Wait for all created threads to finish before moving to another batch of rows
            for j in range(starting_index, ending_index):
                # Check if j-th row exists
                if j < len(rows):
                    threads[j].join()

        # Process the resulting data from all threads
        for i in range(len(threads_data)):
            thread_data = threads_data[i]

            # If the thread returned didn't return anything, but expected a dict object
            if not thread_data:
                raise ProviderError(f"No data returned from thread ID: {i}")

            # If an exception was raised in a thread, raise it again here
            if 'exception' in thread_data and thread_data['exception']:
                logger.debug(
                    f"Titulky.com: An error occured while processing a row in the thread ID {i}"
                )
                raise thread_data['exception']

            # If the thread returned a subtitle info, great, instantiate it and add it to the list
            if 'sub_info' in thread_data and thread_data['sub_info']:
                # Instantiate the subtitle object
                logger.debug(
                    f"Titulky.com: Sucessfully retrieved subtitle info, thread ID: {i}"
                )
                sub_info = thread_data['sub_info']

                # If we found the subtitle by IMDB ID, no need to get it from details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # The thread returned data, but it didn't contain a subtitle info, i. e. something unexpected
                # happend during subtitle details page fetching and processing.
                logger.debug(
                    f"Titulky.com: No subtitle info retrieved, thread ID: {i}"
                )

    # Clean up: free the parsed DOM tree early
    search_page_soup.decompose()
    search_page_soup = None

    logger.debug(f"Titulky.com: Found subtitles: {subtitles}")

    return subtitles
def parse_details(self, details_url, search_url):
    """Fetch a Titulky.com subtitle details page and extract its metadata.

    :param details_url: URL of the subtitle details page.
    :param search_url: search results URL, sent as the Referer header.
    :return: dict with keys ``releases``, ``language``, ``uploader``,
        ``fps``, ``year`` and ``imdb_id``, or ``False`` when the details
        container is missing (e.g. the subtitle was removed and the
        request got redirected).
    :raise ParseResponseError: when an expected page element is absent.
    """
    html_src = self.fetch_page(details_url, ref=search_url)
    details_page_soup = ParserBeautifulSoup(html_src,
                                            ['lxml', 'html.parser'])

    details_container = details_page_soup.find('div', class_='detail')
    if not details_container:
        # The subtitles could be removed and got redirected to a different
        # page. Better treat this silently.
        logger.info(
            "Titulky.com: Could not find details div container. Skipping.")
        return False

    ### IMDB ID
    imdb_id = None
    imdb_tag = details_container.find('a', attrs={'target': 'imdb'})
    if imdb_tag:
        imdb_url = imdb_tag.get('href')
        # BUGFIX: guard the findall result — an href without a ttXXXXXXX
        # id used to raise IndexError on [0].
        imdb_matches = re.findall(r'tt(\d+)', imdb_url)
        imdb_id = imdb_matches[0] if imdb_matches else None

    if not imdb_id:
        logger.debug("Titulky.com: No IMDB ID supplied on details page.")

    ### RELEASE
    release = None
    release_tag = details_container.find('div', class_='releas')
    if not release_tag:
        raise ParseResponseError(
            "Could not find release tag. Did the HTML source change?")
    release = release_tag.get_text(strip=True)
    if not release:
        logger.debug(
            "Titulky.com: No release information supplied on details page."
        )

    ### LANGUAGE
    # Language is signalled by a flag image; ambiguous pages (both or
    # neither flag) yield None.
    language = None
    czech_flag = details_container.select('img[src*=\'flag-CZ\']')
    slovak_flag = details_container.select('img[src*=\'flag-SK\']')
    if czech_flag and not slovak_flag:
        language = Language('ces')
    elif slovak_flag and not czech_flag:
        language = Language('slk')
    if not language:
        logger.debug(
            "Titulky.com: No language information supplied on details page."
        )

    ### UPLOADER
    uploader = None
    uploader_tag = details_container.find('div', class_='ulozil')
    if not uploader_tag:
        raise ParseResponseError(
            "Could not find uploader tag. Did the HTML source change?")
    uploader_anchor_tag = uploader_tag.find('a')
    if not uploader_anchor_tag:
        raise ParseResponseError(
            "Could not find uploader anchor tag. Did the HTML source change?"
        )
    # BUGFIX: check .string (None for anchors with nested markup) instead
    # of the already-verified tag, avoiding AttributeError on .strip().
    uploader = uploader_anchor_tag.string.strip(
    ) if uploader_anchor_tag.string else None
    if not uploader:
        logger.debug(
            "Titulky.com: No uploader name supplied on details page.")

    ### FPS
    fps = None
    fps_icon_tag_selection = details_container.select(
        'img[src*=\'Movieroll\']')
    # BUGFIX: the original condition used 'and', which indexed an EMPTY
    # selection ([0] -> IndexError) precisely in the case the guard was
    # meant to catch; 'or' raises the intended ParseResponseError.
    if not fps_icon_tag_selection or not hasattr(
            fps_icon_tag_selection[0], 'parent'):
        raise ParseResponseError(
            "Could not find parent of the fps icon tag. Did the HTML source change?"
        )
    fps_icon_tag = fps_icon_tag_selection[0]
    parent_text = fps_icon_tag.parent.get_text(strip=True)
    match = re.findall(r'(\d+,\d+) fps', parent_text)
    # If the match is found, change the decimal separator to a dot and
    # convert to float
    fps = float(match[0].replace(',', '.')) if len(match) > 0 else None
    if not fps:
        logger.debug("Titulky.com: No fps supplied on details page.")

    ### YEAR
    year = None
    h1_tag = details_container.find('h1', id='titulky')
    if not h1_tag:
        raise ParseResponseError(
            "Could not find h1 tag. Did the HTML source change?")
    # The h1 tag contains the name of the subtitle and a year
    h1_texts = [text for text in h1_tag.stripped_strings]
    year = int(h1_texts[1]) if len(h1_texts) > 1 else None
    if not year:
        logger.debug("Titulky.com: No year supplied on details page.")

    # Clean up: free the parsed DOM tree early
    details_page_soup.decompose()
    details_page_soup = None

    # Return the subtitle details
    return {
        'releases': [release],
        'language': language,
        'uploader': uploader,
        'fps': fps,
        'year': year,
        'imdb_id': imdb_id
    }