def query(self, title, season=None, episode=None):
    url = '{}/subtitles/release'.format(self.server_url)
    params = {
        'q': '{0} S{1:02}E{2:02}'.format(title, season, episode),
        'r': 'true'
    }

    # get the list of subtitles
    logger.debug('Getting the list of subtitles')
    r = self.session.get(url, params=params, timeout=30)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['html5lib', 'html.parser'])

    # loop over results
    subtitles = {}
    subtitle_table = soup.find('table')
    subtitle_rows = subtitle_table('tr') if subtitle_table else []

    # continue only if at least one subtitle row was found (the first row is the header)
    if len(subtitle_rows) < 2:
        return subtitles.values()

    for row in subtitle_rows[1:]:
        cells = row('td')
        language = Language.fromsubscene(cells[0].find_all('span')[0].get_text(strip=True))
        hearing_impaired = list(cells[2].attrs.values())[0] == 41
        page_link = cells[0].find('a')['href']
        release = cells[0].find_all('span')[1].get_text(strip=True)

        # guess from name
        guess = guessit(release, {'type': 'episode'})
        if guess.get('season') != season and guess.get('episode') != episode:
            continue

        r = self.session.get(self.server_url + page_link, timeout=30)
        r.raise_for_status()
        soup2 = ParserBeautifulSoup(r.content, ['html5lib', 'html.parser'])

        sub_id = re.search(r'\?mac=(.*)', soup2.find('a', id='downloadButton')['href']).group(1)

        # add the release and increment the downloaded count if we already have the subtitle
        if sub_id in subtitles:
            logger.debug('Found additional release %r for subtitle %s', release, sub_id)
            bisect.insort_left(subtitles[sub_id].releases, release)  # deterministic order
            subtitles[sub_id].downloaded += 1
            continue

        # otherwise create it
        subtitle = SubsceneSubtitle(language, hearing_impaired, title, season, episode, title,
                                    sub_id, [release])
        logger.debug('Found subtitle %r', subtitle)
        subtitles[sub_id] = subtitle

    return subtitles.values()
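# Language.fromsubscene() in the query() above only works if a custom babelfish converter has
# been registered under the name "subscene" elsewhere in the codebase; that converter is not
# part of this excerpt. The sketch below shows the general shape such a converter might take;
# the class name, the (deliberately tiny) language table and the registration path are
# assumptions, not the provider's actual implementation.
from babelfish import LanguageReverseConverter
from babelfish.exceptions import LanguageConvertError, LanguageReverseError


class SubsceneConverter(LanguageReverseConverter):
    def __init__(self):
        # illustrative subset only; a real table would cover every language name the site lists
        self.to_subscene = {'eng': 'English', 'fra': 'French', 'ell': 'Greek'}
        self.from_subscene = {v: k for k, v in self.to_subscene.items()}
        self.codes = set(self.from_subscene)

    def convert(self, alpha3, country=None, script=None):
        if alpha3 in self.to_subscene:
            return self.to_subscene[alpha3]
        raise LanguageConvertError(alpha3, country, script)

    def reverse(self, code):
        if code in self.from_subscene:
            return (self.from_subscene[code],)
        raise LanguageReverseError(code)


# registration would then look roughly like this (module path is a placeholder):
# language_converters.register('subscene = myproviders.converters:SubsceneConverter')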
def _get_archive_dowload_link(session, sub_page_link):
    r = session.get(sub_page_link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])

    down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
    down_page_link = urljoin(sub_page_link, down_page_link)

    r = session.get(down_page_link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])

    download_link = bs_obj.find("a", {"rel": "nofollow"})
    download_link = download_link.attrs["href"]
    download_link = urljoin(sub_page_link, download_link)

    return download_link
def _parse_episode_page(self, link, year):
    r = self.session.get(link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])
    subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody")

    subs = []
    for sub in subs_body.find_all("tr"):
        a = sub.find("a")
        name = _extract_name(a.text)
        name = os.path.splitext(name)[0]  # remove ext because it can be an archive type

        language = Language("eng")
        for img in sub.find("td", class_="tac lang").find_all("img"):
            if ("hongkong" in img.attrs["src"]
                    or "china" in img.attrs["src"]
                    or "jollyroger" in img.attrs["src"]):
                language = Language("zho")
                break

        sub_page_link = urljoin(self.server_url, a.attrs["href"])
        backup_session = copy.deepcopy(self.session)
        backup_session.headers["Referer"] = link

        subs.append(
            self.subtitle_class(language, sub_page_link, name, backup_session, year))

    return subs
def initialize(self):
    self.session = Session()
    self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    # login
    if self.username and self.password:
        logger.info('Logging in')
        data = {
            '_method': 'POST',
            'data[User][username]': self.username,
            'data[User][password]': self.password
        }
        r = self.session.post(self.server_url + 'login',
                              data,
                              allow_redirects=False,
                              timeout=10)
        raise_for_status(r)

        soup = ParserBeautifulSoup(r.content, ['html.parser'])
        if soup.find('div', {'class': 'alert-error'},
                     string=re.compile(u'Usuário ou senha inválidos')):
            raise AuthenticationError(self.username)

        logger.debug('Logged in')
        self.logged_in = True
def _search_tvshow(self, id, season, episode):
    subs = []

    url = (self.server_url + self.episode_info_url +
           "moduleName=SubtitlesList&SeriesID={}&Season={}&Episode={}".format(
               id, season, episode))
    r = self.session.get(url, timeout=10)
    r.raise_for_status()

    if len(r.content) < 10:
        logger.debug(
            "Too short content-length in response: [{}]. Treating as No Subtitles Found".format(
                str(r.content)))
        return []

    sub_list = ParserBeautifulSoup(r.content, ["html.parser"])
    sub_rows = sub_list("tr")

    if (sub_list.find("tr") and sub_list.find("tr").find("td")
            and sub_list.find("tr").find("td").get_text() == self.no_subtitle_str):
        logger.debug("No Subtitles Found. URL " + url)
        return subs

    for row in sub_rows:
        columns = row.find_all("td")
        sub = {"id": id}

        for index, column in enumerate(columns):
            if index == 0:
                sub["rls"] = column.get_text().strip().split("\n")[0]
            if index == 5:
                sub["sub_id"] = column.find("input", attrs={"data-sub-id": True})["data-sub-id"]

        if 'sub_id' in sub:
            subs.append(sub)

    return subs
def download_subtitle(self, subtitle):
    if isinstance(subtitle, ZimukuSubtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('Unable to download subtitle. No data returned from provider')
            return

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])
        links = soup.find("div", {"class": "clearfix"}).find_all('a')
        # TODO: add settings for choice

        for down_link in links:
            url = self.server_url + down_link.get('href')
            r = self.session.get(url,
                                 headers={'Referer': subtitle.download_link},
                                 timeout=30)
            r.raise_for_status()

            if len(r.content) > 1024:
                break

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
def download_subtitle(self, subtitle):
    if isinstance(subtitle, TitulkySubtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('Unable to download subtitle. No data returned from provider')
            return
        elif 'Limit vyčerpán' in r.text:
            raise DownloadLimitExceeded

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])
        # links = soup.find("a", {"id": "downlink"}).find_all('a')
        link = soup.find(id="downlink")
        # TODO: add settings for choice
        url = self.dn_url + link.get('href')

        time.sleep(0.5)
        r = self.session.get(url,
                             headers={'Referer': subtitle.download_link},
                             timeout=30)
        r.raise_for_status()

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
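# Both download_subtitle() implementations above call a module-level helper named
# _get_subtitle_from_archive() that is not part of this excerpt. Below is a minimal sketch of
# what such a helper might look like, assuming it simply returns the first subtitle-like file
# in the rar/zip archive; the extension list and the "skip sample files" rule are assumptions,
# not the providers' actual logic.
import os


def _get_subtitle_from_archive(archive):
    """Return the raw bytes of the first subtitle-like file found in a rar/zip archive."""
    subtitle_extensions = ('.srt', '.sub', '.ssa', '.ass')  # assumed set of extensions

    for name in archive.namelist():
        # discard hidden files and non-subtitle extensions
        if os.path.split(name)[-1].startswith('.'):
            continue
        if not name.lower().endswith(subtitle_extensions):
            continue
        # discard obvious sample files
        if 'sample' in name.lower():
            continue
        return archive.read(name)

    return None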
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = re.findall(r"\d{4}", title_a.text) or None if season: title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] html = r.content.decode("utf-8", "ignore") # parse window location pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url" parts = re.findall(pattern, html) redirect_url = search_link while parts: parts.reverse() redirect_url = urljoin(self.server_url, "".join(parts)) r = self.session.get(redirect_url, timeout=30) html = r.content.decode("utf-8", "ignore") parts = re.findall(pattern, html) logger.debug("search url located: " + redirect_url) soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = year if season: # episode year in zimuku is the season's year not show's year actual_subs_year = re.findall(r"\d{4}", title_a.text) or None if actual_subs_year: subs_year = int(actual_subs_year[0]) - season + 1 title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
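# Both query() variants above compare the requested season number against a Chinese numeral
# extracted from the result title ("第...季") via a helper num_to_cn() that is not shown in
# this excerpt. The sketch below is an assumed minimal version that only handles values 1-99,
# which is enough for season numbers; the real helper may differ.
def num_to_cn(number):
    """Convert a numeric string like '12' to a Chinese numeral like '十二'."""
    digits = '零一二三四五六七八九'
    value = int(number)

    if value < 10:
        return digits[value]
    if value < 20:
        # 10-19: "十", "十一", ... "十九"
        return '十' + (digits[value % 10] if value % 10 else '')
    # 20-99: "二十", "二十一", ...
    result = digits[value // 10] + '十'
    if value % 10:
        result += digits[value % 10]
    return result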
def query(self, show_id, series, season, year=None, country=None):
    # get the season list of the show
    logger.info('Getting the season list of show id %d', show_id)
    r = self.session.get(self.server_url + self.series_url.format(show_id), timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    series = soup.find('name').text

    # loop over season rows
    seasons = soup.findAll('series_group')
    season_id = None

    for season_row in seasons:
        try:
            parsed_season = int(season_row['ssnnum'])
            if parsed_season == season:
                season_id = int(season_row['ssnid'])
                break
        except (ValueError, TypeError):
            continue

    if season_id is None:
        logger.debug('Season not found in provider')
        return []

    # get the subtitle list of the season
    logger.info('Getting the subtitle list of season %d', season)
    r = self.session.get(
        self.server_url + self.season_url.format(show_id=show_id, season=season_id),
        timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    subtitles = []
    # loop over episode rows
    for subtitle_group in soup.findAll('subg'):
        # read the episode info
        episode_info = subtitle_group.find('etitle')
        if episode_info is None:
            continue

        episodes = []
        episode_match = episode_re.match(episode_info['number'])
        if episode_match:
            episodes = [
                int(e)
                for e in [episode_match.group(1), episode_match.group(3)]
                if e
            ]

        subtitle_info = subtitle_group.find('sgt')
        if subtitle_info is None:
            continue

        season = int(subtitle_info['ssnnum'])
        episode_id = int(subtitle_info['epsid'])

        # filter out unreleased subtitles
        for subs_tag in subtitle_group.findAll('sr'):
            if subs_tag['published_on'] == '':
                continue

            page_link = self.server_url + self.page_link.format(
                show_id=show_id, season_id=season_id, season=season, episode=episode_id)
            title = episode_info['title']
            version = subs_tag.fmt.text + ' ' + subs_tag.team.text
            download_link = self.server_url + self.download_link.format(int(subs_tag['rlsid']))

            for episode in episodes:
                subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link,
                                               series, season, episode, year, title,
                                               version, download_link)
                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

    return subtitles
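# The query() above matches episode_info['number'] against a module-level episode_re that is
# not included in this excerpt. Since group(1) and group(3) are both read and treated as an
# episode range, a plausible (assumed) definition is a "N" or "N-M" pattern:
import re

episode_re = re.compile(r'^(\d+)(-(\d+))?')

# e.g. '5'   -> groups ('5', None, None) -> episodes [5]
#      '5-6' -> groups ('5', '-6', '6')  -> episodes [5, 6]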
def query(self,
          language,
          video_names,
          type,
          keyword=None,
          year=None,
          season=None,
          episode=None,
          imdb_id=None):
    ## Build the search URL
    params = {}

    # Keyword
    if keyword:
        params['Fulltext'] = keyword
    # Video type
    if type == 'episode':
        params['Serial'] = 'S'
    else:
        params['Serial'] = 'F'
    # Season / Episode
    if season:
        params['Sezona'] = season
    if episode:
        params['Epizoda'] = episode
    # IMDB ID
    if imdb_id:
        params['IMDB'] = imdb_id[2:]  # strip the leading "tt" from the IMDB id
    # Year
    if year:
        params['Rok'] = year
    # Language
    if language == Language('ces'):
        params['Jazyk'] = 'CZ'
    elif language == Language('slk'):
        params['Jazyk'] = 'SK'
    elif language is None:
        params['Jazyk'] = ''
    else:
        return []
    # Status
    if self.approved_only:
        logger.debug("Titulky.com: Searching only for approved subtitles")
        params['ASchvalene'] = '1'
    else:
        params['ASchvalene'] = ''

    search_url = self.build_search_url(params)

    ## Parse the search results page
    html_src = self.fetch_page(search_url)
    search_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    # A message containing "Žádný odpovídající záznam" means there are no results.
    # If that's the case, return an empty list.
    error_message = search_page_soup.select('.panel-body > strong')
    if len(error_message) > 0 and 'Žádný odpovídající záznam' in error_message[0].get_text(strip=True):
        logger.info("Titulky.com: No results found")
        return []

    # Get the table containing the search results
    table = search_page_soup.find('table', class_='table')
    if not table:
        logger.debug("Titulky.com: Could not find table")
        raise ParseResponseError("Could not find table. Did the HTML source change?")

    # Get the table body containing rows of subtitles
    table_body = table.find('tbody')
    if not table_body:
        logger.debug("Titulky.com: Could not find table body")
        raise ParseResponseError("Could not find table body. Did the HTML source change?")

    ## Loop over all subtitles on the first page and put them in a list
    subtitles = []
    rows = table_body.find_all('tr')

    if not self.multithreading:
        # Process the rows sequentially
        logger.info("Titulky.com: processing results in sequence")
        for i, row in enumerate(rows):
            sub_info = self.process_row(row, video_names, search_url)

            # If subtitle info was returned, everything went fine
            # and we can instantiate it and add it to the list
            if sub_info:
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, row: {i}")

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # No subtitle info was returned, i.e. something unexpected happened
                # while fetching and processing the subtitle details page.
                logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
    else:
        # Process the rows in parallel
        logger.info(
            f"Titulky.com: processing results in parallel, {self.max_threads} rows at a time.")

        threads = [None] * len(rows)
        threads_data = [None] * len(rows)

        # Process rows in parallel, self.max_threads at a time.
        cycles = math.ceil(len(rows) / self.max_threads)
        for i in range(cycles):
            # Batch number i
            starting_index = i * self.max_threads  # inclusive
            ending_index = starting_index + self.max_threads  # non-inclusive

            # Create threads for all rows in this batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    # Row number j
                    logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
                    # Create a thread for row j and start it
                    threads[j] = Thread(
                        target=self.process_row,
                        args=[rows[j], video_names, search_url],
                        kwargs={
                            'thread_id': j,
                            'threads_data': threads_data
                        })
                    threads[j].start()

            # Wait for all created threads to finish before moving to the next batch of rows
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    threads[j].join()

        # Process the resulting data from all threads
        for i in range(len(threads_data)):
            thread_data = threads_data[i]

            # If the thread didn't return anything, even though a dict object was expected
            if not thread_data:
                raise ProviderError(f"No data returned from thread ID: {i}")

            # If an exception was raised in a thread, raise it again here
            if 'exception' in thread_data and thread_data['exception']:
                logger.debug(
                    f"Titulky.com: An error occurred while processing a row in thread ID {i}")
                raise thread_data['exception']

            # If the thread returned subtitle info, instantiate it and add it to the list
            if 'sub_info' in thread_data and thread_data['sub_info']:
                # Instantiate the subtitle object
                logger.debug(
                    f"Titulky.com: Successfully retrieved subtitle info, thread ID: {i}")
                sub_info = thread_data['sub_info']

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # The thread returned data, but it did not contain subtitle info, i.e. something
                # unexpected happened while fetching and processing the details page.
                logger.debug(f"Titulky.com: No subtitle info retrieved, thread ID: {i}")

    # Clean up
    search_page_soup.decompose()
    search_page_soup = None

    logger.debug(f"Titulky.com: Found subtitles: {subtitles}")

    return subtitles
def parse_details(self, details_url, search_url):
    html_src = self.fetch_page(details_url, ref=search_url)
    details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    details_container = details_page_soup.find('div', class_='detail')
    if not details_container:
        # The subtitles could have been removed and the request redirected to a different page.
        # Better to treat this silently.
        logger.info("Titulky.com: Could not find details div container. Skipping.")
        return False

    ### IMDB ID
    imdb_id = None
    imdb_tag = details_container.find('a', attrs={'target': 'imdb'})

    if imdb_tag:
        imdb_url = imdb_tag.get('href')
        imdb_id = re.findall(r'tt(\d+)', imdb_url)[0]

    if not imdb_id:
        logger.debug("Titulky.com: No IMDB ID supplied on details page.")

    ### RELEASE
    release = None
    release_tag = details_container.find('div', class_='releas')

    if not release_tag:
        raise ParseResponseError("Could not find release tag. Did the HTML source change?")

    release = release_tag.get_text(strip=True)

    if not release:
        logger.debug("Titulky.com: No release information supplied on details page.")

    ### LANGUAGE
    language = None
    czech_flag = details_container.select('img[src*=\'flag-CZ\']')
    slovak_flag = details_container.select('img[src*=\'flag-SK\']')

    if czech_flag and not slovak_flag:
        language = Language('ces')
    elif slovak_flag and not czech_flag:
        language = Language('slk')

    if not language:
        logger.debug("Titulky.com: No language information supplied on details page.")

    ### UPLOADER
    uploader = None
    uploader_tag = details_container.find('div', class_='ulozil')

    if not uploader_tag:
        raise ParseResponseError("Could not find uploader tag. Did the HTML source change?")

    uploader_anchor_tag = uploader_tag.find('a')

    if not uploader_anchor_tag:
        raise ParseResponseError(
            "Could not find uploader anchor tag. Did the HTML source change?")

    uploader = uploader_anchor_tag.string.strip() if uploader_anchor_tag else None

    if not uploader:
        logger.debug("Titulky.com: No uploader name supplied on details page.")

    ### FPS
    fps = None
    fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')

    if not fps_icon_tag_selection or not hasattr(fps_icon_tag_selection[0], 'parent'):
        raise ParseResponseError(
            "Could not find parent of the fps icon tag. Did the HTML source change?")

    fps_icon_tag = fps_icon_tag_selection[0]
    parent_text = fps_icon_tag.parent.get_text(strip=True)
    match = re.findall(r'(\d+,\d+) fps', parent_text)

    # If a match is found, change the decimal separator to a dot and convert to float
    fps = float(match[0].replace(',', '.')) if len(match) > 0 else None

    if not fps:
        logger.debug("Titulky.com: No fps supplied on details page.")

    ### YEAR
    year = None
    h1_tag = details_container.find('h1', id='titulky')

    if not h1_tag:
        raise ParseResponseError("Could not find h1 tag. Did the HTML source change?")

    # The h1 tag contains the name of the subtitle and a year
    h1_texts = [text for text in h1_tag.stripped_strings]
    year = int(h1_texts[1]) if len(h1_texts) > 1 else None

    if not year:
        logger.debug("Titulky.com: No year supplied on details page.")

    # Clean up
    details_page_soup.decompose()
    details_page_soup = None

    # Return the subtitle details
    return {
        'releases': [release],
        'language': language,
        'uploader': uploader,
        'fps': fps,
        'year': year,
        'imdb_id': imdb_id
    }
def query(self, video, languages):
    _searchurl = self.searchurl

    subtitles = []

    if isinstance(video, Movie):
        querytext = video.imdb_id if video.imdb_id else video.title

    if isinstance(video, Episode):
        querytext = '{} S{:02d}E{:02d}'.format(video.series, video.season, video.episode)
        querytext = quote(querytext.lower())

    # language query filter
    if not isinstance(languages, (tuple, list, set)):
        languages = [languages]

    for language in languages:
        logger.debug("Legendasdivx.pt :: searching for %s subtitles.", language)
        language_id = language.opensubtitles
        if 'por' in language_id:
            lang_filter = '&form_cat=28'
        elif 'pob' in language_id:
            lang_filter = '&form_cat=29'
        else:
            lang_filter = ''

        querytext = querytext + lang_filter if lang_filter else querytext

        try:
            # sleep for 1 second before another request
            sleep(1)
            self.headers['Referer'] = self.site + '/index.php'
            self.session.headers.update(self.headers)
            res = self.session.get(_searchurl.format(query=querytext), allow_redirects=False)
            res.raise_for_status()

            if res.status_code == 200 and "A legenda não foi encontrada" in res.text:
                logger.warning('Legendasdivx.pt :: query %s returned no results!', querytext)

                # for series, if no results are found, try again with just the series and season (subtitle packs)
                if isinstance(video, Episode):
                    logger.debug(
                        "Legendasdivx.pt :: trying again with just series and season on query.")
                    querytext = re.sub(r"(e|E)(\d{2})", "", querytext)
                    # sleep for 1 second before another request
                    sleep(1)
                    res = self.session.get(_searchurl.format(query=querytext),
                                           allow_redirects=False)
                    res.raise_for_status()
                    if res.status_code == 200 and "A legenda não foi encontrada" in res.text:
                        logger.warning(
                            'Legendasdivx.pt :: query {0} returned no results for language {1} (for series and season only).'
                            .format(querytext, language_id))
                        continue

            if res.status_code == 302:
                # got redirected to the login page;
                # our session cookies are no longer valid, so clean them from the cache
                region.delete("legendasdivx_cookies2")
                logger.debug("Legendasdivx.pt :: Logging in again. Cookies have expired!")
                # login and try again
                self.login()
                # sleep for 1 second before another request
                sleep(1)
                res = self.session.get(_searchurl.format(query=querytext))
                res.raise_for_status()

        except HTTPError as e:
            if "bloqueado" in res.text.lower():
                logger.error("LegendasDivx.pt :: Your IP is blocked on this server.")
                raise IPAddressBlocked("LegendasDivx.pt :: Your IP is blocked on this server.")
            logger.error("Legendasdivx.pt :: HTTP Error %s", e)
            raise TooManyRequests("Legendasdivx.pt :: HTTP Error %s", e)
        except Exception as e:
            logger.error("LegendasDivx.pt :: Uncaught error: %r", e)
            raise ServiceUnavailable("LegendasDivx.pt :: Uncaught error: %r", e)

        bsoup = ParserBeautifulSoup(res.content, ['html.parser'])

        # search for more than 10 results (legendasdivx uses pagination)
        # don't throttle - maximum results = 6 * 10
        MAX_PAGES = 6

        # get the number of pages based on the results found
        page_header = bsoup.find("div", {"class": "pager_bar"})
        results_found = re.search(r'\((.*?) encontradas\)',
                                  page_header.text).group(1) if page_header else 0
        logger.debug("Legendasdivx.pt :: Found %s subtitles", str(results_found))
        num_pages = (int(results_found) // 10) + 1
        num_pages = min(MAX_PAGES, num_pages)

        # process the first page
        subtitles += self._process_page(video, bsoup)

        # more pages?
        if num_pages > 1:
            for num_page in range(2, num_pages + 1):
                sleep(1)  # wait another second before requesting the next page
                _search_next = self.searchurl.format(query=querytext) + "&page={0}".format(
                    str(num_page))
                logger.debug("Legendasdivx.pt :: Moving on to next page: %s", _search_next)
                # sleep for 1 second before another request
                sleep(1)
                res = self.session.get(_search_next)
                next_page = ParserBeautifulSoup(res.content, ['html.parser'])
                subs = self._process_page(video, next_page)
                subtitles.extend(subs)

    return subtitles
def query(self, video, languages, imdb_id, season=None, episode=None):
    logger.debug('Searching subtitles for %r', imdb_id)
    subtitles = []
    search_link = self.server_url + 'en/view/' + imdb_id

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    soup_page = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])

    if isinstance(video, Episode):
        try:
            episodes = soup_page.select(
                'div.col-lg-offset-2.col-md-8.text-center.top30.bottom10 > a')
            for item in episodes:
                season_episode = re.search(r'Season (\d+) Episode (\d+)', item.text)
                season_number = int(season_episode.group(1))
                episode_number = int(season_episode.group(2))
                if season_number == season and episode_number == episode:
                    episode_page = item.attrs['href']
                    r = self.session.get(episode_page, timeout=30)
                    soup_subs = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                                    ['html.parser'])
                    try:
                        secCode = soup_subs.find('input', {'id': 'secCode'}).get('value')
                    except Exception as e:
                        logging.debug(e)
                    else:
                        for subtitles_item in soup_subs.select('#elSub > tbody > tr'):
                            try:
                                subtitle_id = re.search(
                                    r'downloadMe\(\'(.*)\'\)',
                                    subtitles_item.contents[2].contents[2].contents[0]
                                    .attrs['onclick']).group(1)
                                page_link = self.server_url + 'dll/' + subtitle_id + '/0/' + secCode
                                language = Language.fromalpha2(
                                    subtitles_item.parent.find('img')['alt'])
                                version = subtitles_item.contents[2].contents[4].text.strip()
                                uploader = subtitles_item.contents[2].contents[5].contents[0] \
                                    .contents[1].text.strip()
                                referer = episode_page.encode('utf-8')

                                r = self.session.get(page_link,
                                                     headers={'Referer': referer},
                                                     timeout=30,
                                                     allow_redirects=False)
                                r.raise_for_status()
                                soup_dll = ParserBeautifulSoup(
                                    r.content.decode('utf-8', 'ignore'), ['html.parser'])
                                try:
                                    langcode = soup_dll.find(attrs={"name": 'langcode'}).get('value')
                                    uid = soup_dll.find(attrs={"name": 'uid'}).get('value')
                                    output = soup_dll.find(attrs={"name": 'output'}).get('value')
                                    dll = soup_dll.find(attrs={"name": 'dll'}).get('value')
                                except Exception as e:
                                    logging.debug(e)
                                else:
                                    download_req = self.session.post(
                                        page_link,
                                        data={'langcode': langcode, 'uid': uid,
                                              'output': output, 'dll': dll},
                                        headers={'Referer': page_link},
                                        timeout=10)
                            except Exception as e:
                                logging.debug(e)
                            else:
                                if language in languages:
                                    subtitle = self.subtitle_class(language, page_link, version,
                                                                   uploader, referer)
                                    if not download_req.content:
                                        logger.error(
                                            'Unable to download subtitle. No data returned from provider')
                                        continue
                                    subtitle.content = download_req.content
                                    logger.debug('Found subtitle %r', subtitle)
                                    subtitles.append(subtitle)
                else:
                    pass
        except Exception as e:
            logging.debug(e)
    elif isinstance(video, Movie):
        try:
            soup_subs = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])
            try:
                secCode = soup_subs.find('input', {'id': 'secCode'}).get('value')
            except Exception as e:
                logging.debug(e)
            else:
                for subtitles_item in soup_subs.select('#elSub > tbody > tr'):
                    try:
                        subtitle_id = re.search(
                            r'downloadMe\(\'(.*)\'\)',
                            subtitles_item.contents[2].contents[2].contents[0]
                            .attrs['onclick']).group(1)
                        page_link = self.server_url + 'dll/' + subtitle_id + '/0/' + secCode
                        language = Language.fromalpha2(subtitles_item.parent.find('img')['alt'])
                        version = subtitles_item.contents[2].contents[4].text.strip()
                        uploader = subtitles_item.contents[2].contents[5].contents[0] \
                            .contents[1].text.strip()
                        referer = page_link.encode('utf-8')

                        r = self.session.get(page_link,
                                             headers={'Referer': referer},
                                             timeout=30,
                                             allow_redirects=False)
                        r.raise_for_status()
                        soup_dll = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                                       ['html.parser'])
                        try:
                            langcode = soup_dll.find(attrs={"name": 'langcode'}).get('value')
                            uid = soup_dll.find(attrs={"name": 'uid'}).get('value')
                            output = soup_dll.find(attrs={"name": 'output'}).get('value')
                            dll = soup_dll.find(attrs={"name": 'dll'}).get('value')
                        except Exception as e:
                            logging.debug(e)
                        else:
                            download_req = self.session.post(
                                page_link,
                                data={'langcode': langcode, 'uid': uid,
                                      'output': output, 'dll': dll},
                                headers={'Referer': page_link},
                                timeout=10)
                    except Exception as e:
                        logging.debug(e)
                    else:
                        if language in languages:
                            subtitle = self.subtitle_class(language, page_link, version,
                                                           uploader, referer)
                            if not download_req.content:
                                logger.error(
                                    'Unable to download subtitle. No data returned from provider')
                                continue
                            subtitle.content = download_req.content
                            logger.debug('Found subtitle %r', subtitle)
                            subtitles.append(subtitle)
        except Exception as e:
            logging.debug(e)

    return subtitles
def get_archives(self, title_id, language_code):
    """Get the archive list from a given `title_id` and `language_code`.

    :param int title_id: title id.
    :param int language_code: language code.
    :return: the archives.
    :rtype: list of :class:`LegendasTVArchive`

    """
    logger.info('Getting archives for title %d and language %d', title_id, language_code)
    archives = []
    page = 0
    while True:
        # get the archive page
        url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
            language=language_code, page=page, title=title_id)
        r = self.session.get(url)
        r.raise_for_status()

        # parse the results
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
            # create the archive
            archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
                                        archive_soup.a.text,
                                        'pack' in archive_soup.parent['class'],
                                        'destaque' in archive_soup.parent['class'],
                                        self.server_url + archive_soup.a['href'][1:])

            # extract the text containing downloads, rating and timestamp
            data_text = archive_soup.find('p', class_='data').text

            # match downloads
            archive.downloads = int(downloads_re.search(data_text).group('downloads'))

            # match rating
            match = rating_re.search(data_text)
            if match:
                archive.rating = int(match.group('rating'))

            # match the timestamp and validate it
            time_data = {k: int(v) for k, v in timestamp_re.search(data_text).groupdict().items()}
            archive.timestamp = pytz.timezone('America/Sao_Paulo').localize(datetime(**time_data))
            if archive.timestamp > datetime.utcnow().replace(tzinfo=pytz.utc):
                raise ProviderError('Archive timestamp is in the future')

            # add the archive
            logger.info('Found archive for title %d and language %d at page %s: %s',
                        title_id, language_code, page, archive)
            archives.append(archive)

        # stop on the last page
        if soup.find('a', attrs={'class': 'load_more'}, string='carregar mais') is None:
            break

        # increment the page count
        page += 1

    logger.debug('Found %d archives', len(archives))

    return archives
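# get_archives() consumes three module-level regexes (downloads_re, rating_re, timestamp_re)
# that are not part of this excerpt. The definitions below are assumptions, written only to be
# consistent with how the named groups are used above (downloads, rating, and a dict that is
# unpacked into datetime(**time_data)); the site's exact wording may differ.
import re

downloads_re = re.compile(r'(?P<downloads>\d+) downloads')
rating_re = re.compile(r'nota (?P<rating>\d+)')
timestamp_re = re.compile(
    r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')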