def query(self, title, season=None, episode=None):
    url = '{}/subtitles/release'.format(self.server_url)
    params = {
        'q': '{0} S{1:02}E{2:02}'.format(title, season, episode),
        'r': 'true'
    }

    # get the list of subtitles
    logger.debug('Getting the list of subtitles')
    r = self.session.get(url, params=params, timeout=30)
    r.raise_for_status()
    soup = ParserBeautifulSoup(r.content, ['html5lib', 'html.parser'])

    # loop over results
    subtitles = {}
    subtitle_table = soup.find('table')
    subtitle_rows = subtitle_table('tr') if subtitle_table else []

    # continue only if at least one subtitle row was found (the first row is the header)
    if len(subtitle_rows) < 2:
        return subtitles.values()

    for row in subtitle_rows[1:]:
        cells = row('td')
        language = Language.fromsubscene(cells[0].find_all('span')[0].get_text(strip=True))
        hearing_impaired = list(cells[2].attrs.values())[0] == 41
        page_link = cells[0].find('a')['href']
        release = cells[0].find_all('span')[1].get_text(strip=True)

        # guess from name
        guess = guessit(release, {'type': 'episode'})
        if guess.get('season') != season and guess.get('episode') != episode:
            continue

        r = self.session.get(self.server_url + page_link, timeout=30)
        r.raise_for_status()
        soup2 = ParserBeautifulSoup(r.content, ['html5lib', 'html.parser'])

        sub_id = re.search(r'\?mac=(.*)', soup2.find('a', id='downloadButton')['href']).group(1)

        # add the release and increment the downloaded count if we already have the subtitle
        if sub_id in subtitles:
            logger.debug('Found additional release %r for subtitle %s', release, sub_id)
            bisect.insort_left(subtitles[sub_id].releases, release)  # deterministic order
            subtitles[sub_id].downloaded += 1
            continue

        # otherwise create it
        subtitle = SubsceneSubtitle(language, hearing_impaired, title, season, episode, title,
                                    sub_id, [release])
        logger.debug('Found subtitle %r', subtitle)
        subtitles[sub_id] = subtitle

    return subtitles.values()
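# Language.fromsubscene() in the query() above only works if a custom babelfish converter has
# been registered under the name "subscene" elsewhere in the codebase; that converter is not
# part of this excerpt. The sketch below shows the general shape such a converter might take;
# the class name, the (deliberately tiny) language table and the registration path are
# assumptions, not the provider's actual implementation.
from babelfish import LanguageReverseConverter
from babelfish.exceptions import LanguageConvertError, LanguageReverseError


class SubsceneConverter(LanguageReverseConverter):
    def __init__(self):
        # illustrative subset only; a real table would cover every language name the site lists
        self.to_subscene = {'eng': 'English', 'fra': 'French', 'ell': 'Greek'}
        self.from_subscene = {v: k for k, v in self.to_subscene.items()}
        self.codes = set(self.from_subscene)

    def convert(self, alpha3, country=None, script=None):
        if alpha3 in self.to_subscene:
            return self.to_subscene[alpha3]
        raise LanguageConvertError(alpha3, country, script)

    def reverse(self, code):
        if code in self.from_subscene:
            return (self.from_subscene[code],)
        raise LanguageReverseError(code)


# registration would then look roughly like this (module path is a placeholder):
# language_converters.register('subscene = myproviders.converters:SubsceneConverter')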
def _get_archive_dowload_link(session, sub_page_link):
    r = session.get(sub_page_link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])

    down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
    down_page_link = urljoin(sub_page_link, down_page_link)

    r = session.get(down_page_link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])

    download_link = bs_obj.find("a", {"rel": "nofollow"})
    download_link = download_link.attrs["href"]
    download_link = urljoin(sub_page_link, download_link)

    return download_link
def _parse_episode_page(self, link, year):
    r = self.session.get(link)
    bs_obj = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["html.parser"])
    subs_body = bs_obj.find("div", class_="subs box clearfix").find("tbody")

    subs = []
    for sub in subs_body.find_all("tr"):
        a = sub.find("a")
        name = _extract_name(a.text)
        name = os.path.splitext(name)[0]  # remove ext because it can be an archive type

        language = Language("eng")
        for img in sub.find("td", class_="tac lang").find_all("img"):
            if ("hongkong" in img.attrs["src"]
                    or "china" in img.attrs["src"]
                    or "jollyroger" in img.attrs["src"]):
                language = Language("zho")
                break

        sub_page_link = urljoin(self.server_url, a.attrs["href"])
        backup_session = copy.deepcopy(self.session)
        backup_session.headers["Referer"] = link

        subs.append(
            self.subtitle_class(language, sub_page_link, name, backup_session, year))

    return subs
def initialize(self):
    self.session = Session()
    self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__

    # login
    if self.username and self.password:
        logger.info('Logging in')
        data = {
            '_method': 'POST',
            'data[User][username]': self.username,
            'data[User][password]': self.password
        }
        r = self.session.post(self.server_url + 'login',
                              data,
                              allow_redirects=False,
                              timeout=10)
        raise_for_status(r)

        soup = ParserBeautifulSoup(r.content, ['html.parser'])
        if soup.find('div', {'class': 'alert-error'},
                     string=re.compile(u'Usuário ou senha inválidos')):
            raise AuthenticationError(self.username)

        logger.debug('Logged in')
        self.logged_in = True
def _search_tvshow(self, id, season, episode):
    subs = []

    url = (self.server_url + self.episode_info_url +
           "moduleName=SubtitlesList&SeriesID={}&Season={}&Episode={}".format(
               id, season, episode))
    r = self.session.get(url, timeout=10)
    r.raise_for_status()

    if len(r.content) < 10:
        logger.debug(
            "Too short content-length in response: [{}]. Treating as No Subtitles Found".format(
                str(r.content)))
        return []

    sub_list = ParserBeautifulSoup(r.content, ["html.parser"])
    sub_rows = sub_list("tr")

    if (sub_list.find("tr") and sub_list.find("tr").find("td")
            and sub_list.find("tr").find("td").get_text() == self.no_subtitle_str):
        logger.debug("No Subtitles Found. URL " + url)
        return subs

    for row in sub_rows:
        columns = row.find_all("td")
        sub = {"id": id}

        for index, column in enumerate(columns):
            if index == 0:
                sub["rls"] = column.get_text().strip().split("\n")[0]
            if index == 5:
                sub["sub_id"] = column.find("input", attrs={"data-sub-id": True})["data-sub-id"]

        if 'sub_id' in sub:
            subs.append(sub)

    return subs
def download_subtitle(self, subtitle):
    if isinstance(subtitle, ZimukuSubtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('Unable to download subtitle. No data returned from provider')
            return

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])
        links = soup.find("div", {"class": "clearfix"}).find_all('a')
        # TODO: add settings for choice

        for down_link in links:
            url = self.server_url + down_link.get('href')
            r = self.session.get(url,
                                 headers={'Referer': subtitle.download_link},
                                 timeout=30)
            r.raise_for_status()

            if len(r.content) > 1024:
                break

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
def download_subtitle(self, subtitle):
    if isinstance(subtitle, TitulkySubtitle):
        # download the subtitle
        logger.info('Downloading subtitle %r', subtitle)
        r = self.session.get(subtitle.download_link,
                             headers={'Referer': subtitle.page_link},
                             timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('Unable to download subtitle. No data returned from provider')
            return
        elif 'Limit vyčerpán' in r.text:
            raise DownloadLimitExceeded

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser'])
        # links = soup.find("a", {"id": "downlink"}).find_all('a')
        link = soup.find(id="downlink")
        # TODO: add settings for choice
        url = self.dn_url + link.get('href')

        time.sleep(0.5)
        r = self.session.get(url,
                             headers={'Referer': subtitle.download_link},
                             timeout=30)
        r.raise_for_status()

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug('Could not extract subtitle from %r', archive)
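# Both download_subtitle() implementations above call a module-level helper named
# _get_subtitle_from_archive() that is not part of this excerpt. Below is a minimal sketch of
# what such a helper might look like, assuming it simply returns the first subtitle-like file
# in the rar/zip archive; the extension list and the "skip sample files" rule are assumptions,
# not the providers' actual logic.
import os


def _get_subtitle_from_archive(archive):
    """Return the raw bytes of the first subtitle-like file found in a rar/zip archive."""
    subtitle_extensions = ('.srt', '.sub', '.ssa', '.ass')  # assumed set of extensions

    for name in archive.namelist():
        # discard hidden files and non-subtitle extensions
        if os.path.split(name)[-1].startswith('.'):
            continue
        if not name.lower().endswith(subtitle_extensions):
            continue
        # discard obvious sample files
        if 'sample' in name.lower():
            continue
        return archive.read(name)

    return None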
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = re.findall(r"\d{4}", title_a.text) or None if season: title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
def query(self, keyword, season=None, episode=None, year=None): params = keyword if season: params += ".S{season:02d}".format(season=season) elif year: params += " {:4d}".format(year) logger.debug("Searching subtitles %r", params) subtitles = [] search_link = self.server_url + text_type( self.search_url).format(params) r = self.session.get(search_link, timeout=30) r.raise_for_status() if not r.content: logger.debug("No data returned from provider") return [] html = r.content.decode("utf-8", "ignore") # parse window location pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url" parts = re.findall(pattern, html) redirect_url = search_link while parts: parts.reverse() redirect_url = urljoin(self.server_url, "".join(parts)) r = self.session.get(redirect_url, timeout=30) html = r.content.decode("utf-8", "ignore") parts = re.findall(pattern, html) logger.debug("search url located: " + redirect_url) soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]) # non-shooter result page if soup.find("div", {"class": "item"}): logger.debug("enter a non-shooter page") for item in soup.find_all("div", {"class": "item"}): title_a = item.find("p", class_="tt clearfix").find("a") subs_year = year if season: # episode year in zimuku is the season's year not show's year actual_subs_year = re.findall(r"\d{4}", title_a.text) or None if actual_subs_year: subs_year = int(actual_subs_year[0]) - season + 1 title = title_a.text season_cn1 = re.search("第(.*)季", title) if not season_cn1: season_cn1 = "一" else: season_cn1 = season_cn1.group(1).strip() season_cn2 = num_to_cn(str(season)) if season_cn1 != season_cn2: continue episode_link = self.server_url + title_a.attrs["href"] new_subs = self._parse_episode_page(episode_link, subs_year) subtitles += new_subs # NOTE: shooter result pages are ignored due to the existence of assrt provider return subtitles
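# Both query() variants above compare the requested season number against a Chinese numeral
# extracted from the result title ("第...季") via a helper num_to_cn() that is not shown in
# this excerpt. The sketch below is an assumed minimal version that only handles values 1-99,
# which is enough for season numbers; the real helper may differ.
def num_to_cn(number):
    """Convert a numeric string like '12' to a Chinese numeral like '十二'."""
    digits = '零一二三四五六七八九'
    value = int(number)

    if value < 10:
        return digits[value]
    if value < 20:
        # 10-19: "十", "十一", ... "十九"
        return '十' + (digits[value % 10] if value % 10 else '')
    # 20-99: "二十", "二十一", ...
    result = digits[value // 10] + '十'
    if value % 10:
        result += digits[value % 10]
    return result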
def query(self, show_id, series, season, year=None, country=None):
    # get the season list of the show
    logger.info('Getting the season list of show id %d', show_id)
    r = self.session.get(self.server_url + self.series_url.format(show_id), timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    series = soup.find('name').text

    # loop over season rows
    seasons = soup.findAll('series_group')
    season_id = None

    for season_row in seasons:
        try:
            parsed_season = int(season_row['ssnnum'])
            if parsed_season == season:
                season_id = int(season_row['ssnid'])
                break
        except (ValueError, TypeError):
            continue

    if season_id is None:
        logger.debug('Season not found in provider')
        return []

    # get the subtitle list of the season
    logger.info('Getting the subtitle list of season %d', season)
    r = self.session.get(
        self.server_url + self.season_url.format(show_id=show_id, season=season_id),
        timeout=10)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    subtitles = []
    # loop over episode rows
    for subtitle_group in soup.findAll('subg'):
        # read the episode info
        episode_info = subtitle_group.find('etitle')
        if episode_info is None:
            continue

        episodes = []
        episode_match = episode_re.match(episode_info['number'])
        if episode_match:
            episodes = [
                int(e)
                for e in [episode_match.group(1), episode_match.group(3)]
                if e
            ]

        subtitle_info = subtitle_group.find('sgt')
        if subtitle_info is None:
            continue

        season = int(subtitle_info['ssnnum'])
        episode_id = int(subtitle_info['epsid'])

        # filter out unreleased subtitles
        for subs_tag in subtitle_group.findAll('sr'):
            if subs_tag['published_on'] == '':
                continue

            page_link = self.server_url + self.page_link.format(
                show_id=show_id, season_id=season_id, season=season, episode=episode_id)
            title = episode_info['title']
            version = subs_tag.fmt.text + ' ' + subs_tag.team.text
            download_link = self.server_url + self.download_link.format(int(subs_tag['rlsid']))

            for episode in episodes:
                subtitle = self.subtitle_class(Language.fromalpha2('el'), page_link,
                                               series, season, episode, year, title,
                                               version, download_link)
                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

    return subtitles
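# The query() above matches episode_info['number'] against a module-level episode_re that is
# not included in this excerpt. Since group(1) and group(3) are both read and treated as an
# episode range, a plausible (assumed) definition is a "N" or "N-M" pattern:
import re

episode_re = re.compile(r'^(\d+)(-(\d+))?')

# e.g. '5'   -> groups ('5', None, None) -> episodes [5]
#      '5-6' -> groups ('5', '-6', '6')  -> episodes [5, 6]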
def query(self,
          language,
          video_names,
          type,
          keyword=None,
          year=None,
          season=None,
          episode=None,
          imdb_id=None):
    ## Build the search URL
    params = {}

    # Keyword
    if keyword:
        params['Fulltext'] = keyword
    # Video type
    if type == 'episode':
        params['Serial'] = 'S'
    else:
        params['Serial'] = 'F'
    # Season / Episode
    if season:
        params['Sezona'] = season
    if episode:
        params['Epizoda'] = episode
    # IMDB ID
    if imdb_id:
        params['IMDB'] = imdb_id[2:]  # strip the leading "tt" from the IMDB id
    # Year
    if year:
        params['Rok'] = year
    # Language
    if language == Language('ces'):
        params['Jazyk'] = 'CZ'
    elif language == Language('slk'):
        params['Jazyk'] = 'SK'
    elif language is None:
        params['Jazyk'] = ''
    else:
        return []
    # Status
    if self.approved_only:
        logger.debug("Titulky.com: Searching only for approved subtitles")
        params['ASchvalene'] = '1'
    else:
        params['ASchvalene'] = ''

    search_url = self.build_search_url(params)

    ## Parse the search results page
    html_src = self.fetch_page(search_url)
    search_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    # A message containing "Žádný odpovídající záznam" means there are no results.
    # If that's the case, return an empty list.
    error_message = search_page_soup.select('.panel-body > strong')
    if len(error_message) > 0 and 'Žádný odpovídající záznam' in error_message[0].get_text(strip=True):
        logger.info("Titulky.com: No results found")
        return []

    # Get the table containing the search results
    table = search_page_soup.find('table', class_='table')
    if not table:
        logger.debug("Titulky.com: Could not find table")
        raise ParseResponseError("Could not find table. Did the HTML source change?")

    # Get the table body containing rows of subtitles
    table_body = table.find('tbody')
    if not table_body:
        logger.debug("Titulky.com: Could not find table body")
        raise ParseResponseError("Could not find table body. Did the HTML source change?")

    ## Loop over all subtitles on the first page and put them in a list
    subtitles = []
    rows = table_body.find_all('tr')

    if not self.multithreading:
        # Process the rows sequentially
        logger.info("Titulky.com: processing results in sequence")
        for i, row in enumerate(rows):
            sub_info = self.process_row(row, video_names, search_url)

            # If subtitle info was returned, everything went fine
            # and we can instantiate it and add it to the list
            if sub_info:
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, row: {i}")

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # No subtitle info was returned, i.e. something unexpected happened
                # while fetching and processing the subtitle details page.
                logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
    else:
        # Process the rows in parallel
        logger.info(
            f"Titulky.com: processing results in parallel, {self.max_threads} rows at a time.")

        threads = [None] * len(rows)
        threads_data = [None] * len(rows)

        # Process rows in parallel, self.max_threads at a time.
        cycles = math.ceil(len(rows) / self.max_threads)
        for i in range(cycles):
            # Batch number i
            starting_index = i * self.max_threads  # inclusive
            ending_index = starting_index + self.max_threads  # non-inclusive

            # Create threads for all rows in this batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    # Row number j
                    logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
                    # Create a thread for row j and start it
                    threads[j] = Thread(
                        target=self.process_row,
                        args=[rows[j], video_names, search_url],
                        kwargs={
                            'thread_id': j,
                            'threads_data': threads_data
                        })
                    threads[j].start()

            # Wait for all created threads to finish before moving to the next batch of rows
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    threads[j].join()

        # Process the resulting data from all threads
        for i in range(len(threads_data)):
            thread_data = threads_data[i]

            # If the thread didn't return anything, even though a dict object was expected
            if not thread_data:
                raise ProviderError(f"No data returned from thread ID: {i}")

            # If an exception was raised in a thread, raise it again here
            if 'exception' in thread_data and thread_data['exception']:
                logger.debug(
                    f"Titulky.com: An error occurred while processing a row in thread ID {i}")
                raise thread_data['exception']

            # If the thread returned subtitle info, instantiate it and add it to the list
            if 'sub_info' in thread_data and thread_data['sub_info']:
                # Instantiate the subtitle object
                logger.debug(
                    f"Titulky.com: Successfully retrieved subtitle info, thread ID: {i}")
                sub_info = thread_data['sub_info']

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'],
                    sub_imdb_id,
                    sub_info['language'],
                    sub_info['names'],
                    season,
                    episode,
                    sub_info['year'],
                    sub_info['releases'],
                    sub_info['fps'],
                    sub_info['uploader'],
                    sub_info['approved'],
                    sub_info['details_link'],
                    sub_info['download_link'],
                    skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # The thread returned data, but it did not contain subtitle info, i.e. something
                # unexpected happened while fetching and processing the details page.
                logger.debug(f"Titulky.com: No subtitle info retrieved, thread ID: {i}")

    # Clean up
    search_page_soup.decompose()
    search_page_soup = None

    logger.debug(f"Titulky.com: Found subtitles: {subtitles}")

    return subtitles
def parse_details(self, details_url, search_url):
    html_src = self.fetch_page(details_url, ref=search_url)
    details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    details_container = details_page_soup.find('div', class_='detail')
    if not details_container:
        # The subtitles could have been removed and the request redirected to a different page.
        # Better to treat this silently.
        logger.info("Titulky.com: Could not find details div container. Skipping.")
        return False

    ### IMDB ID
    imdb_id = None
    imdb_tag = details_container.find('a', attrs={'target': 'imdb'})

    if imdb_tag:
        imdb_url = imdb_tag.get('href')
        imdb_id = re.findall(r'tt(\d+)', imdb_url)[0]

    if not imdb_id:
        logger.debug("Titulky.com: No IMDB ID supplied on details page.")

    ### RELEASE
    release = None
    release_tag = details_container.find('div', class_='releas')

    if not release_tag:
        raise ParseResponseError("Could not find release tag. Did the HTML source change?")

    release = release_tag.get_text(strip=True)

    if not release:
        logger.debug("Titulky.com: No release information supplied on details page.")

    ### LANGUAGE
    language = None
    czech_flag = details_container.select('img[src*=\'flag-CZ\']')
    slovak_flag = details_container.select('img[src*=\'flag-SK\']')

    if czech_flag and not slovak_flag:
        language = Language('ces')
    elif slovak_flag and not czech_flag:
        language = Language('slk')

    if not language:
        logger.debug("Titulky.com: No language information supplied on details page.")

    ### UPLOADER
    uploader = None
    uploader_tag = details_container.find('div', class_='ulozil')

    if not uploader_tag:
        raise ParseResponseError("Could not find uploader tag. Did the HTML source change?")

    uploader_anchor_tag = uploader_tag.find('a')

    if not uploader_anchor_tag:
        raise ParseResponseError(
            "Could not find uploader anchor tag. Did the HTML source change?")

    uploader = uploader_anchor_tag.string.strip() if uploader_anchor_tag else None

    if not uploader:
        logger.debug("Titulky.com: No uploader name supplied on details page.")

    ### FPS
    fps = None
    fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')

    if not fps_icon_tag_selection or not hasattr(fps_icon_tag_selection[0], 'parent'):
        raise ParseResponseError(
            "Could not find parent of the fps icon tag. Did the HTML source change?")

    fps_icon_tag = fps_icon_tag_selection[0]
    parent_text = fps_icon_tag.parent.get_text(strip=True)
    match = re.findall(r'(\d+,\d+) fps', parent_text)

    # If a match is found, change the decimal separator to a dot and convert to float
    fps = float(match[0].replace(',', '.')) if len(match) > 0 else None

    if not fps:
        logger.debug("Titulky.com: No fps supplied on details page.")

    ### YEAR
    year = None
    h1_tag = details_container.find('h1', id='titulky')

    if not h1_tag:
        raise ParseResponseError("Could not find h1 tag. Did the HTML source change?")

    # The h1 tag contains the name of the subtitle and a year
    h1_texts = [text for text in h1_tag.stripped_strings]
    year = int(h1_texts[1]) if len(h1_texts) > 1 else None

    if not year:
        logger.debug("Titulky.com: No year supplied on details page.")

    # Clean up
    details_page_soup.decompose()
    details_page_soup = None

    # Return the subtitle details
    return {
        'releases': [release],
        'language': language,
        'uploader': uploader,
        'fps': fps,
        'year': year,
        'imdb_id': imdb_id
    }
def query(self, video, languages):
    _searchurl = self.searchurl

    subtitles = []

    if isinstance(video, Movie):
        querytext = video.imdb_id if video.imdb_id else video.title

    if isinstance(video, Episode):
        querytext = '{} S{:02d}E{:02d}'.format(video.series, video.season, video.episode)
        querytext = quote(querytext.lower())

    # language query filter
    if not isinstance(languages, (tuple, list, set)):
        languages = [languages]

    for language in languages:
        logger.debug("Legendasdivx.pt :: searching for %s subtitles.", language)
        language_id = language.opensubtitles
        if 'por' in language_id:
            lang_filter = '&form_cat=28'
        elif 'pob' in language_id:
            lang_filter = '&form_cat=29'
        else:
            lang_filter = ''

        querytext = querytext + lang_filter if lang_filter else querytext

        try:
            # sleep for 1 second before another request
            sleep(1)
            self.headers['Referer'] = self.site + '/index.php'
            self.session.headers.update(self.headers)
            res = self.session.get(_searchurl.format(query=querytext), allow_redirects=False)
            res.raise_for_status()

            if res.status_code == 200 and "A legenda não foi encontrada" in res.text:
                logger.warning('Legendasdivx.pt :: query %s returned no results!', querytext)

                # for series, if no results are found, try again with just the series and season (subtitle packs)
                if isinstance(video, Episode):
                    logger.debug(
                        "Legendasdivx.pt :: trying again with just series and season on query.")
                    querytext = re.sub(r"(e|E)(\d{2})", "", querytext)
                    # sleep for 1 second before another request
                    sleep(1)
                    res = self.session.get(_searchurl.format(query=querytext),
                                           allow_redirects=False)
                    res.raise_for_status()
                    if res.status_code == 200 and "A legenda não foi encontrada" in res.text:
                        logger.warning(
                            'Legendasdivx.pt :: query {0} returned no results for language {1} (for series and season only).'
                            .format(querytext, language_id))
                        continue

            if res.status_code == 302:
                # got redirected to the login page;
                # our session cookies are no longer valid, so clean them from the cache
                region.delete("legendasdivx_cookies2")
                logger.debug("Legendasdivx.pt :: Logging in again. Cookies have expired!")
                # login and try again
                self.login()
                # sleep for 1 second before another request
                sleep(1)
                res = self.session.get(_searchurl.format(query=querytext))
                res.raise_for_status()

        except HTTPError as e:
            if "bloqueado" in res.text.lower():
                logger.error("LegendasDivx.pt :: Your IP is blocked on this server.")
                raise IPAddressBlocked("LegendasDivx.pt :: Your IP is blocked on this server.")
            logger.error("Legendasdivx.pt :: HTTP Error %s", e)
            raise TooManyRequests("Legendasdivx.pt :: HTTP Error %s", e)
        except Exception as e:
            logger.error("LegendasDivx.pt :: Uncaught error: %r", e)
            raise ServiceUnavailable("LegendasDivx.pt :: Uncaught error: %r", e)

        bsoup = ParserBeautifulSoup(res.content, ['html.parser'])

        # search for more than 10 results (legendasdivx uses pagination)
        # don't throttle - maximum results = 6 * 10
        MAX_PAGES = 6

        # get the number of pages based on the results found
        page_header = bsoup.find("div", {"class": "pager_bar"})
        results_found = re.search(r'\((.*?) encontradas\)',
                                  page_header.text).group(1) if page_header else 0
        logger.debug("Legendasdivx.pt :: Found %s subtitles", str(results_found))
        num_pages = (int(results_found) // 10) + 1
        num_pages = min(MAX_PAGES, num_pages)

        # process the first page
        subtitles += self._process_page(video, bsoup)

        # more pages?
        if num_pages > 1:
            for num_page in range(2, num_pages + 1):
                sleep(1)  # wait another second before requesting the next page
                _search_next = self.searchurl.format(query=querytext) + "&page={0}".format(
                    str(num_page))
                logger.debug("Legendasdivx.pt :: Moving on to next page: %s", _search_next)
                # sleep for 1 second before another request
                sleep(1)
                res = self.session.get(_search_next)
                next_page = ParserBeautifulSoup(res.content, ['html.parser'])
                subs = self._process_page(video, next_page)
                subtitles.extend(subs)

    return subtitles
def query(self, video, languages, imdb_id, season=None, episode=None):
    logger.debug('Searching subtitles for %r', imdb_id)
    subtitles = []
    search_link = self.server_url + 'en/view/' + imdb_id

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    soup_page = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])

    if isinstance(video, Episode):
        try:
            episodes = soup_page.select(
                'div.col-lg-offset-2.col-md-8.text-center.top30.bottom10 > a')
            for item in episodes:
                season_episode = re.search(r'Season (\d+) Episode (\d+)', item.text)
                season_number = int(season_episode.group(1))
                episode_number = int(season_episode.group(2))
                if season_number == season and episode_number == episode:
                    episode_page = item.attrs['href']
                    r = self.session.get(episode_page, timeout=30)
                    soup_subs = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                                    ['html.parser'])
                    try:
                        secCode = soup_subs.find('input', {'id': 'secCode'}).get('value')
                    except Exception as e:
                        logging.debug(e)
                    else:
                        for subtitles_item in soup_subs.select('#elSub > tbody > tr'):
                            try:
                                subtitle_id = re.search(
                                    r'downloadMe\(\'(.*)\'\)',
                                    subtitles_item.contents[2].contents[2].contents[0]
                                    .attrs['onclick']).group(1)
                                page_link = self.server_url + 'dll/' + subtitle_id + '/0/' + secCode
                                language = Language.fromalpha2(
                                    subtitles_item.parent.find('img')['alt'])
                                version = subtitles_item.contents[2].contents[4].text.strip()
                                uploader = subtitles_item.contents[2].contents[5].contents[0] \
                                    .contents[1].text.strip()
                                referer = episode_page.encode('utf-8')

                                r = self.session.get(page_link,
                                                     headers={'Referer': referer},
                                                     timeout=30,
                                                     allow_redirects=False)
                                r.raise_for_status()
                                soup_dll = ParserBeautifulSoup(
                                    r.content.decode('utf-8', 'ignore'), ['html.parser'])
                                try:
                                    langcode = soup_dll.find(attrs={"name": 'langcode'}).get('value')
                                    uid = soup_dll.find(attrs={"name": 'uid'}).get('value')
                                    output = soup_dll.find(attrs={"name": 'output'}).get('value')
                                    dll = soup_dll.find(attrs={"name": 'dll'}).get('value')
                                except Exception as e:
                                    logging.debug(e)
                                else:
                                    download_req = self.session.post(
                                        page_link,
                                        data={'langcode': langcode, 'uid': uid,
                                              'output': output, 'dll': dll},
                                        headers={'Referer': page_link},
                                        timeout=10)
                            except Exception as e:
                                logging.debug(e)
                            else:
                                if language in languages:
                                    subtitle = self.subtitle_class(language, page_link, version,
                                                                   uploader, referer)
                                    if not download_req.content:
                                        logger.error(
                                            'Unable to download subtitle. No data returned from provider')
                                        continue
                                    subtitle.content = download_req.content
                                    logger.debug('Found subtitle %r', subtitle)
                                    subtitles.append(subtitle)
                else:
                    pass
        except Exception as e:
            logging.debug(e)
    elif isinstance(video, Movie):
        try:
            soup_subs = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])
            try:
                secCode = soup_subs.find('input', {'id': 'secCode'}).get('value')
            except Exception as e:
                logging.debug(e)
            else:
                for subtitles_item in soup_subs.select('#elSub > tbody > tr'):
                    try:
                        subtitle_id = re.search(
                            r'downloadMe\(\'(.*)\'\)',
                            subtitles_item.contents[2].contents[2].contents[0]
                            .attrs['onclick']).group(1)
                        page_link = self.server_url + 'dll/' + subtitle_id + '/0/' + secCode
                        language = Language.fromalpha2(subtitles_item.parent.find('img')['alt'])
                        version = subtitles_item.contents[2].contents[4].text.strip()
                        uploader = subtitles_item.contents[2].contents[5].contents[0] \
                            .contents[1].text.strip()
                        referer = page_link.encode('utf-8')

                        r = self.session.get(page_link,
                                             headers={'Referer': referer},
                                             timeout=30,
                                             allow_redirects=False)
                        r.raise_for_status()
                        soup_dll = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                                       ['html.parser'])
                        try:
                            langcode = soup_dll.find(attrs={"name": 'langcode'}).get('value')
                            uid = soup_dll.find(attrs={"name": 'uid'}).get('value')
                            output = soup_dll.find(attrs={"name": 'output'}).get('value')
                            dll = soup_dll.find(attrs={"name": 'dll'}).get('value')
                        except Exception as e:
                            logging.debug(e)
                        else:
                            download_req = self.session.post(
                                page_link,
                                data={'langcode': langcode, 'uid': uid,
                                      'output': output, 'dll': dll},
                                headers={'Referer': page_link},
                                timeout=10)
                    except Exception as e:
                        logging.debug(e)
                    else:
                        if language in languages:
                            subtitle = self.subtitle_class(language, page_link, version,
                                                           uploader, referer)
                            if not download_req.content:
                                logger.error(
                                    'Unable to download subtitle. No data returned from provider')
                                continue
                            subtitle.content = download_req.content
                            logger.debug('Found subtitle %r', subtitle)
                            subtitles.append(subtitle)
        except Exception as e:
            logging.debug(e)

    return subtitles
def get_archives(self, title_id, language_code):
    """Get the archive list from a given `title_id` and `language_code`.

    :param int title_id: title id.
    :param int language_code: language code.
    :return: the archives.
    :rtype: list of :class:`LegendasTVArchive`

    """
    logger.info('Getting archives for title %d and language %d', title_id, language_code)
    archives = []
    page = 0
    while True:
        # get the archive page
        url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
            language=language_code, page=page, title=title_id)
        r = self.session.get(url)
        r.raise_for_status()

        # parse the results
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
        for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
            # create the archive
            archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
                                        archive_soup.a.text,
                                        'pack' in archive_soup.parent['class'],
                                        'destaque' in archive_soup.parent['class'],
                                        self.server_url + archive_soup.a['href'][1:])

            # extract the text containing downloads, rating and timestamp
            data_text = archive_soup.find('p', class_='data').text

            # match downloads
            archive.downloads = int(downloads_re.search(data_text).group('downloads'))

            # match rating
            match = rating_re.search(data_text)
            if match:
                archive.rating = int(match.group('rating'))

            # match the timestamp and validate it
            time_data = {k: int(v) for k, v in timestamp_re.search(data_text).groupdict().items()}
            archive.timestamp = pytz.timezone('America/Sao_Paulo').localize(datetime(**time_data))
            if archive.timestamp > datetime.utcnow().replace(tzinfo=pytz.utc):
                raise ProviderError('Archive timestamp is in the future')

            # add the archive
            logger.info('Found archive for title %d and language %d at page %s: %s',
                        title_id, language_code, page, archive)
            archives.append(archive)

        # stop on the last page
        if soup.find('a', attrs={'class': 'load_more'}, string='carregar mais') is None:
            break

        # increment the page count
        page += 1

    logger.debug('Found %d archives', len(archives))

    return archives
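# get_archives() consumes three module-level regexes (downloads_re, rating_re, timestamp_re)
# that are not part of this excerpt. The definitions below are assumptions, written only to be
# consistent with how the named groups are used above (downloads, rating, and a dict that is
# unpacked into datetime(**time_data)); the site's exact wording may differ.
import re

downloads_re = re.compile(r'(?P<downloads>\d+) downloads')
rating_re = re.compile(r'nota (?P<rating>\d+)')
timestamp_re = re.compile(
    r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')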