def search_show_id(self, series, year=None):
    """Search the show id from the `series` and `year`.

    :param string series: series of the episode.
    :param year: year of the series, if any.
    :type year: int or None
    :return: the show id, if any.
    :rtype: int or None

    """
    # make the search
    logger.info('Searching show id for %r', series)
    r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10)
    r.raise_for_status()

    # get the series out of the suggestions
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
    show_id = None
    for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
        match = link_re.match(suggestion.text)
        if not match:
            logger.error('Failed to match %s', suggestion.text)
            continue

        if sanitize(match.group('series')).lower() == series.lower():
            if year is not None and int(match.group('first_year')) != year:
                logger.debug('Year does not match')
                continue
            show_id = int(suggestion['href'][8:-5])
            logger.debug('Found show id %d', show_id)
            break

    soup.decompose()
    soup = None

    return show_id
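
# `link_re` is referenced above but defined elsewhere in the module. A minimal
# sketch of a pattern consistent with the groups search_show_id() consumes
# ('series' and 'first_year') could look like this; it is an assumption, the
# provider's actual pattern may differ:
#
#   link_re = re.compile(r'^(?P<series>.+?) \((?P<first_year>\d{4})-(?:\d{4})?\)$')
#
# For example, matching 'Breaking Bad (2008-2013)' would yield
# series='Breaking Bad' and first_year='2008'.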


def query(self, show_id, series, season, episode, year=None):
    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # the provider doesn't store multi-episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode,
                                       year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles
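
# The magic slice offsets above assume hrefs and image paths of a fixed shape
# (these example values are illustrative, not captured provider output):
#
#   '/subtitle-1234.html'[10:-5]  -> '1234'  (strips '/subtitle-' and '.html')
#   'images/flags/en.gif'[13:-4]  -> 'en'    (strips 'images/flags/' and '.gif')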


def query(self, series, season, episode, year=None):
    # search the show id
    show_id = self.search_show_id(series, year)
    if show_id is None:
        logger.info('No show id found for %r (%r)', series, {'year': year})
        return []

    # get the episode ids
    episode_ids = self.get_episode_ids(show_id, season)

    # the provider doesn't store multi-episode information
    episode = min(episode) if episode and isinstance(episode, list) else episode

    if episode not in episode_ids:
        logger.error('Episode %d not found', episode)
        return []

    # get the episode page
    logger.info('Getting the page for episode %d', episode_ids[episode])
    r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over subtitle rows
    subtitles = []
    for row in soup.select('.subtitlen'):
        # read the item
        language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
        subtitle_id = int(row.parent['href'][10:-5])
        page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
        rip = row.find('p', title='rip').text.strip() or None
        release = row.find('h5').text.strip() or None

        subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode,
                                       year, rip, release)
        logger.info('Found subtitle %s', subtitle)
        subtitles.append(subtitle)

    soup.decompose()
    soup = None

    return subtitles


def get_episode_ids(self, show_id, season):
    """Get episode ids from the show id and the season.

    :param int show_id: show id.
    :param int season: season of the episode.
    :return: episode ids per episode number.
    :rtype: dict

    """
    # get the page of the season of the show
    logger.info('Getting the page of show id %d, season %d', show_id, season)
    r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season), timeout=10)
    soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

    # loop over episode rows
    episode_ids = {}
    for row in soup.select('table#table5 tr'):
        # skip rows that do not have a link to the episode page
        if not row('a', href=episode_id_re):
            continue

        # extract data from the cells
        cells = row('td')
        episode = int(cells[0].text.split('x')[1])
        episode_id = int(cells[1].a['href'][8:-5])
        episode_ids[episode] = episode_id

    if episode_ids:
        logger.debug('Found episode ids %r', episode_ids)
    else:
        logger.warning('No episode ids found')

    soup.decompose()
    soup = None

    return episode_ids
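
# `episode_id_re` is referenced above but defined elsewhere in the module.
# BeautifulSoup applies a compiled pattern to each candidate href via
# re.search(), so a pattern consistent with the 'episode-<id>.html' links that
# the [8:-5] slice expects could be (an assumption, not the provider's actual
# regex):
#
#   episode_id_re = re.compile(r'^episode-\d+\.html$')
#
# e.g. 'episode-31278.html'[8:-5] -> '31278'.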


def query(self, keyword, season=None, episode=None, year=None, video=None):
    params = keyword
    if season and episode:
        params += ' S{season:02d}E{episode:02d}'.format(season=season, episode=episode)
    elif year:
        params += '&ARok={:4d}'.format(year)
    logger.debug('Searching subtitles %r', params)

    subtitles = []
    if season and episode:
        search_link = self.server_url + text_type(self.search_url_series).format(params)
    elif year:
        search_link = self.server_url + text_type(self.search_url_movies).format(params)
    else:
        # neither an episode nor a movie search, nothing to query
        logger.debug('Not enough information to search, skipping')
        return []

    r = self.session.get(search_link, timeout=30)
    r.raise_for_status()

    if not r.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                               ['lxml', 'html.parser']).find("table", class_="main_table")

    # loop over subtitle cells
    if soup:
        subs = soup.find_all("tr", class_="row1")
        subs += soup.find_all("tr", class_="row2")
        for sub in subs:
            page_link = '%s%s' % (self.server_url, sub.a.get('href'))
            title = [x.text for x in sub.find_all('td')[0:1]]

            version = sub.find(class_="fixedTip")
            if version is None:
                version = ""
            else:
                version = version['title']

            try:
                langs = [x.text for x in sub.find_all('td')[6:7]]
            except Exception:
                langs = ['CZ']

            if 'CZ' in langs:
                language = Language('ces')
            elif 'SK' in langs:
                language = Language('slk')
            else:
                # unknown language, skip the row
                continue

            download_link = sub.find('a', class_='titulkydownloadajax')
            download_link = self.download_url + download_link.get('href')

            # read the item
            subtitle = self.subtitle_class(language, page_link, season, episode, version,
                                           download_link, year, title,
                                           asked_for_release_group=video.release_group,
                                           asked_for_episode=episode)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        soup.decompose()
        soup = None

    return subtitles
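
# Examples of the query strings built above (illustrative values):
#
#   query('The Wire', season=1, episode=5)  -> params == 'The Wire S01E05'
#   query('Inception', year=2010)           -> params == 'Inception&ARok=2010'
#
# Note that in the movie case the year is appended to the fulltext value
# itself rather than sent as a separate query parameter.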


def query(self, language, video_names, type, keyword=None, year=None, season=None, episode=None,
          imdb_id=None):
    ## Build the search URL
    params = {}

    # Keyword
    if keyword:
        params['Fulltext'] = keyword
    # Video type
    if type == 'episode':
        params['Serial'] = 'S'
    else:
        params['Serial'] = 'F'
    # Season / Episode
    if season:
        params['Sezona'] = season
    if episode:
        params['Epizoda'] = episode
    # IMDB ID
    if imdb_id:
        params['IMDB'] = imdb_id[2:]  # Remove the "tt" prefix from the IMDB id
    # Year
    if year:
        params['Rok'] = year
    # Language
    if language == Language('ces'):
        params['Jazyk'] = 'CZ'
    elif language == Language('slk'):
        params['Jazyk'] = 'SK'
    elif language is None:
        params['Jazyk'] = ''
    else:
        return []
    # Status
    if self.approved_only:
        logger.debug("Titulky.com: Searching only for approved subtitles")
        params['ASchvalene'] = '1'
    else:
        params['ASchvalene'] = ''

    search_url = self.build_search_url(params)

    ## Search results page parsing
    html_src = self.fetch_page(search_url)
    search_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    # A message containing "Žádný odpovídající záznam" ("No matching record")
    # means there are no results, so we return an empty list
    error_message = search_page_soup.select('.panel-body > strong')
    if len(error_message) > 0 and 'Žádný odpovídající záznam' in error_message[0].get_text(strip=True):
        logger.info("Titulky.com: No results found")
        return []

    # Get the table containing the search results
    table = search_page_soup.find('table', class_='table')
    if not table:
        logger.debug("Titulky.com: Could not find table")
        raise ParseResponseError("Could not find table. Did the HTML source change?")

    # Get the table body containing rows of subtitles
    table_body = table.find('tbody')
    if not table_body:
        logger.debug("Titulky.com: Could not find table body")
        raise ParseResponseError("Could not find table body. Did the HTML source change?")

    ## Loop over all subtitles on the first page and put them in a list
    subtitles = []
    rows = table_body.find_all('tr')

    if not self.multithreading:
        # Process the rows sequentially
        logger.info("Titulky.com: processing results in sequence")
        for i, row in enumerate(rows):
            sub_info = self.process_row(row, video_names, search_url)

            # If subtitle info was returned, everything went fine and
            # we can instantiate it and add it to the list
            if sub_info:
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, row: {i}")

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season,
                    episode, sub_info['year'], sub_info['releases'], sub_info['fps'],
                    sub_info['uploader'], sub_info['approved'], sub_info['details_link'],
                    sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # No subtitle info was returned, i.e. something unexpected happened
                # while fetching and processing the subtitle details page
                logger.debug(f"Titulky.com: No subtitle info retrieved, row: {i}")
    else:
        # Process the rows in parallel
        logger.info(f"Titulky.com: processing results in parallel, {self.max_threads} rows at a time.")

        threads = [None] * len(rows)
        threads_data = [None] * len(rows)

        # Process rows in parallel, self.max_threads at a time.
        cycles = math.ceil(len(rows) / self.max_threads)
        for i in range(cycles):
            # Batch number i
            starting_index = i * self.max_threads  # Inclusive
            ending_index = starting_index + self.max_threads  # Non-inclusive

            # Create threads for all rows in this batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    # Row number j
                    logger.debug(f"Titulky.com: Creating thread {j} (batch: {i})")
                    # Create a thread for row j and start it
                    threads[j] = Thread(target=self.process_row,
                                        args=[rows[j], video_names, search_url],
                                        kwargs={'thread_id': j, 'threads_data': threads_data})
                    threads[j].start()

            # Wait for all threads created in this batch to finish before moving to the next batch
            for j in range(starting_index, ending_index):
                # Check if the j-th row exists
                if j < len(rows):
                    threads[j].join()

        # Process the resulting data from all threads
        for i in range(len(threads_data)):
            thread_data = threads_data[i]

            # The thread didn't return anything, even though a dict object was expected
            if not thread_data:
                raise ProviderError(f"No data returned from thread ID: {i}")

            # If an exception was raised in a thread, raise it again here
            if 'exception' in thread_data and thread_data['exception']:
                logger.debug(f"Titulky.com: An error occurred while processing a row in the thread ID {i}")
                raise thread_data['exception']

            # If the thread returned subtitle info, instantiate it and add it to the list
            if 'sub_info' in thread_data and thread_data['sub_info']:
                # Instantiate the subtitle object
                logger.debug(f"Titulky.com: Successfully retrieved subtitle info, thread ID: {i}")
                sub_info = thread_data['sub_info']

                # If we found the subtitle by IMDB ID, there is no need to get it from the details page
                sub_imdb_id = imdb_id or sub_info['imdb_id']

                subtitle_instance = self.subtitle_class(
                    sub_info['id'], sub_imdb_id, sub_info['language'], sub_info['names'], season,
                    episode, sub_info['year'], sub_info['releases'], sub_info['fps'],
                    sub_info['uploader'], sub_info['approved'], sub_info['details_link'],
                    sub_info['download_link'], skip_wrong_fps=self.skip_wrong_fps,
                    asked_for_episode=(type == 'episode'))
                subtitles.append(subtitle_instance)
            else:
                # The thread returned data but no subtitle info, i.e. something unexpected
                # happened while fetching and processing the subtitle details page
                logger.debug(f"Titulky.com: No subtitle info retrieved, thread ID: {i}")

    # Clean up
    search_page_soup.decompose()
    search_page_soup = None

    logger.debug(f"Titulky.com: Found subtitles: {subtitles}")

    return subtitles
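
# The batching above assumes process_row() fills threads_data[thread_id] when
# called with the thread_id/threads_data kwargs. A sketch of the assumed
# contract (not the actual implementation):
#
#   def process_row(self, row, video_names, ref_url, thread_id=None, threads_data=None):
#       try:
#           sub_info = ...  # parse the row, fetching the details page if needed
#           if threads_data is not None:
#               threads_data[thread_id] = {'sub_info': sub_info, 'exception': None}
#           return sub_info
#       except Exception as e:
#           if threads_data is not None:
#               threads_data[thread_id] = {'sub_info': None, 'exception': e}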


def parse_details(self, details_url, search_url):
    html_src = self.fetch_page(details_url, ref=search_url)
    details_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])

    details_container = details_page_soup.find('div', class_='detail')
    if not details_container:
        # The subtitles could have been removed and we got redirected to a different page.
        # Better to treat this silently.
        logger.info("Titulky.com: Could not find details div container. Skipping.")
        return False

    ### IMDB ID
    imdb_id = None
    imdb_tag = details_container.find('a', attrs={'target': 'imdb'})

    if imdb_tag:
        imdb_url = imdb_tag.get('href')
        imdb_id = re.findall(r'tt(\d+)', imdb_url)[0]

    if not imdb_id:
        logger.debug("Titulky.com: No IMDB ID supplied on details page.")

    ### RELEASE
    release = None
    release_tag = details_container.find('div', class_='releas')

    if not release_tag:
        raise ParseResponseError("Could not find release tag. Did the HTML source change?")

    release = release_tag.get_text(strip=True)

    if not release:
        logger.debug("Titulky.com: No release information supplied on details page.")

    ### LANGUAGE
    language = None
    czech_flag = details_container.select('img[src*=\'flag-CZ\']')
    slovak_flag = details_container.select('img[src*=\'flag-SK\']')

    if czech_flag and not slovak_flag:
        language = Language('ces')
    elif slovak_flag and not czech_flag:
        language = Language('slk')

    if not language:
        logger.debug("Titulky.com: No language information supplied on details page.")

    ### UPLOADER
    uploader = None
    uploader_tag = details_container.find('div', class_='ulozil')

    if not uploader_tag:
        raise ParseResponseError("Could not find uploader tag. Did the HTML source change?")

    uploader_anchor_tag = uploader_tag.find('a')

    if not uploader_anchor_tag:
        raise ParseResponseError("Could not find uploader anchor tag. Did the HTML source change?")

    uploader = uploader_anchor_tag.get_text(strip=True)

    if not uploader:
        logger.debug("Titulky.com: No uploader name supplied on details page.")

    ### FPS
    fps = None
    fps_icon_tag_selection = details_container.select('img[src*=\'Movieroll\']')

    if not fps_icon_tag_selection or not hasattr(fps_icon_tag_selection[0], 'parent'):
        raise ParseResponseError("Could not find parent of the fps icon tag. Did the HTML source change?")

    fps_icon_tag = fps_icon_tag_selection[0]
    parent_text = fps_icon_tag.parent.get_text(strip=True)
    match = re.findall(r'(\d+,\d+) fps', parent_text)

    # If a match is found, change the decimal separator to a dot and convert to float
    fps = float(match[0].replace(',', '.')) if len(match) > 0 else None

    if not fps:
        logger.debug("Titulky.com: No fps supplied on details page.")

    ### YEAR
    year = None
    h1_tag = details_container.find('h1', id='titulky')

    if not h1_tag:
        raise ParseResponseError("Could not find h1 tag. Did the HTML source change?")

    # The h1 tag contains the name of the subtitle and a year
    h1_texts = [text for text in h1_tag.stripped_strings]
    year = int(h1_texts[1]) if len(h1_texts) > 1 else None

    if not year:
        logger.debug("Titulky.com: No year supplied on details page.")

    # Clean up
    details_page_soup.decompose()
    details_page_soup = None

    # Return the subtitle details
    return {
        'releases': [release],
        'language': language,
        'uploader': uploader,
        'fps': fps,
        'year': year,
        'imdb_id': imdb_id
    }
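
# The fps parsing above expects a comma as the decimal separator, e.g.
# (illustrative input, not captured provider output):
#
#   re.findall(r'(\d+,\d+) fps', 'WEB-DL, 23,976 fps')  -> ['23,976']
#   float('23,976'.replace(',', '.'))                   -> 23.976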