Code example #1
    def _parse_subtitles_page(self, video, response, language):
        subtitles = []

        page_soup = ParserBeautifulSoup(
            response.content.decode('iso-8859-1', 'ignore'),
            ['lxml', 'html.parser'])
        title_soups = page_soup.find_all("div",
                                         {'id': 'menu_detalle_buscador'})
        body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})

        for subtitle in range(0, len(title_soups)):
            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]

            # title
            title = title_soup.find("a").text.replace("Subtitulos de ", "")
            page_link = title_soup.find("a")["href"]

            # description
            description = body_soup.find("div", {
                'id': 'buscador_detalle_sub'
            }).text
            description = description.replace(",", " ").lower()

            # uploader
            uploader = body_soup.find("a", {'class': 'link1'}).text

            subtitle = self.subtitle_class(language, video, page_link, title,
                                           description, uploader)

            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles
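
The index-based loop above pairs the two `find_all` result lists by position. A minimal sketch of the same pairing with `zip`, which reads more directly and stops at the shorter list if the page ever returns mismatched div counts (same variables assumed):

    # pair each title div with its matching body div by position
    for title_soup, body_soup in zip(title_soups, body_soups):
        title = title_soup.find("a").text.replace("Subtitulos de ", "")
        page_link = title_soup.find("a")["href"]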
Code example #2
    def _parse_subtitles_page(self, video, response, language):
        subtitles = []

        page_soup = ParserBeautifulSoup(
            response.content.decode("utf-8", "ignore"),
            ["lxml", "html.parser"])
        title_soups = page_soup.find_all("div",
                                         {"id": "menu_detalle_buscador"})
        body_soups = page_soup.find_all("div", {"id": "buscador_detalle"})
        episode = isinstance(video, Episode)

        for subtitle in range(0, len(title_soups)):
            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]
            # title
            title = _clean_title(title_soup.find("a").text)

            # Forced subtitles are not supported
            if title.lower().rstrip().endswith(("forzado", "forzados")):
                logger.debug("Skipping forced subtitles: %s", title)
                continue

            # Check movie title (if the video is a movie)
            if not episode and not _check_movie(video, title):
                continue

            # Data
            datos = body_soup.find("div", {
                "id": "buscador_detalle_sub_datos"
            }).text
            # Ignore multi-disc and non-srt subtitles
            if not any(item in datos for item in ("Cds:</b> 1", "SubRip")):
                continue

            spain = "/pais/7.gif" in datos
            language = Language.fromalpha2("es") if spain else Language(
                "spa", "MX")

            # description
            sub_details = body_soup.find("div", {
                "id": "buscador_detalle_sub"
            }).text
            description = sub_details.replace(",", " ")

            # uploader
            uploader = body_soup.find("a", {"class": "link1"}).text
            download_url = _get_download_url(body_soup)
            page_link = title_soup.find("a")["href"]

            subtitle = self.subtitle_class(language, video, page_link, title,
                                           description, uploader, download_url)

            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

        return subtitles
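
Example #2 depends on module-level helpers (`_clean_title`, `_check_movie`, `_get_download_url`) that are not shown here. Judging from the inline `replace` in example #1, `_clean_title` most likely strips the site's title prefix; the following is a hypothetical sketch only, not the project's actual helper:

    import re

    def _clean_title(title):
        # hypothetical: drop the "Subtitulos de " prefix and trim whitespace
        return re.sub(r"^Subt[ií]tulos de ", "", title).strip()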
Code example #3
    def _parse_subtitles_page(self, video, response, language):
        subtitles = []

        page_soup = ParserBeautifulSoup(
            response.content.decode("utf-8", "ignore"),
            ["lxml", "html.parser"])
        title_soups = page_soup.find_all("div",
                                         {"id": "menu_detalle_buscador"})
        body_soups = page_soup.find_all("div", {"id": "buscador_detalle"})

        for subtitle in range(0, len(title_soups)):
            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]
            # title
            title = self._clean_title(title_soup.find("a").text)
            # discard the subtitle if the title contains a year in parentheses
            # that doesn't match the year of the video object
            if re.search(r'\(\d{4}\)', title):
                if video.year and str(video.year) not in title:
                    continue

            # Data
            datos = body_soup.find("div", {
                "id": "buscador_detalle_sub_datos"
            }).text
            # Ignore multi-disc and non-srt subtitles
            if not any(item in datos for item in ("Cds:</b> 1", "SubRip")):
                continue

            spain = "/pais/7.gif" in datos
            language = Language.fromalpha2("es") if spain else Language(
                "spa", "MX")

            # description
            sub_details = body_soup.find("div", {
                "id": "buscador_detalle_sub"
            }).text
            description = sub_details.replace(",", " ").lower()

            # uploader
            uploader = body_soup.find("a", {"class": "link1"}).text
            page_link = title_soup.find("a")["href"]

            subtitle = self.subtitle_class(language, video, page_link, title,
                                           description, uploader)

            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

        return subtitles
Code example #4
File: hosszupuska.py  Project: rubicon/bazarr
    def query(self, series, season, episode, year=None, video=None):
        # Search for s01e03 instead of s1e3
        seasona = "%02d" % season
        episodea = "%02d" % episode
        series = fix_inconsistent_naming(series)
        seriesa = series.replace(' ', '+')

        # get the episode page
        logger.info('Getting the page for episode %s', episode)
        url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
            "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
        logger.info('Url %s', url)

        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])

        subtitles = []

        for num, temp in enumerate(soup.find_all("table")):
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(
                    temp) and "css/infooldal.png" in str(temp):
                logger.debug("Found valid table (%d index)", num)
                subtitles += self._loop_over_table(temp, season, episode,
                                                   video)

        return subtitles
Code example #5
File: tusubtitulo.py  Project: youdroid/SickChill
    def query(self, series, season, episode, year=None):
        # get the show id
        show_id = self.get_show_id(series, year)
        if show_id is None:
            logger.error('No show id found for %s (%r)', series, year)
            return []

        # get the episode url
        episode_url = self.get_episode_url(show_id, series, season, episode, year)
        if episode_url is None:
            logger.error('No episode url found for %s, season %d, episode %d', series, season, episode)
            return []

        # get the page of the episode of the show
        r = self.session.get(episode_url, timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get episode title
        title_pattern = re.compile('Subt.+tulos de {}(.+){}x{:02d} - (.+)'.format(series, season, episode).lower())
        title = title_pattern.search(soup.select('#cabecera-subtitulo')[0].get_text().strip().lower()).group(2)

        # loop over subtitle rows
        subtitles = []

        for sub in soup.find_all('div', attrs={'id': re.compile('version([0-9]+)')}):
            # read the release subtitle
            release = sanitize_release_group(release_pattern.search(sub.find('p', class_='title-sub')
                                                                    .contents[2]).group(1))

            for html_language in sub.select('ul.sslist'):
                language = Language.fromtusubtitulo(html_language.find_next('b').get_text().strip())
                hearing_impaired = False

                # Treat Latin American Spanish as plain Spanish with hearing_impaired = True:
                # if both a Spanish and a Latin American Spanish subtitle exist for the same
                # episode, the plain Spanish one scores higher and takes priority.
                if language == Language('spa', 'MX'):
                    language = Language('spa')
                    hearing_impaired = True

                # ignore incomplete subtitles
                status = sanitize(html_language.find_next('li', class_=re.compile('li-estado')).get_text())
                if status != 'completado':
                    logger.debug('Ignoring subtitle with status %s', status)
                    continue

                # get the most updated version of the subtitle and if it doesn't exist get the original version
                html_status = html_language.select('a[href^="updated/"]')
                if len(html_status) == 0:
                    html_status = html_language.select('a[href^="original/"]')

                subtitle_url = self.server_url + html_status[0]['href']
                subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season, episode, title,
                                               year, release, subtitle_url)
                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

        return subtitles
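
The `select('a[href^="updated/"]')` calls above rely on BeautifulSoup's CSS attribute-prefix selector to prefer the updated revision of a subtitle and fall back to the original one. A minimal standalone sketch of that pattern, using made-up HTML:

    from bs4 import BeautifulSoup

    html = '<ul><li><a href="original/1/2">orig</a></li><li><a href="updated/1/2">upd</a></li></ul>'
    soup = BeautifulSoup(html, "html.parser")
    # prefer the updated link; fall back to the original if none exists
    links = soup.select('a[href^="updated/"]') or soup.select('a[href^="original/"]')
    print(links[0]["href"])  # updated/1/2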
Code example #6
File: subtitulamos.py  Project: xottl/SickChill
    def query(self, series, season, episode, year=None):
        # get the episode url
        episode_url = self._search_url_titles(series, season, episode, year)
        if episode_url is None:
            logger.info(
                f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}"
            )
            return []

        r = self.session.get(episode_url,
                             headers={"Referer": self.server_url},
                             timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

        # get episode title
        title_pattern = re.compile("{}(.+){}x{:02d}- (.+)".format(
            series, season, episode).lower())
        title = title_pattern.search(
            soup.select("#episode_title")[0].get_text().strip().lower()).group(
                2)

        subtitles = []
        for sub in soup.find_all("div", attrs={"id": "progress_buttons_row"}):
            # read the language
            language = Language.fromsubtitulamos(
                sub.find_previous(
                    "div", class_="subtitle_language").get_text().strip())
            hearing_impaired = False

            # Treat Latin American Spanish as plain Spanish with hearing_impaired = True:
            # if both a Spanish and a Latin American Spanish subtitle exist for the same
            # episode, the plain Spanish one scores higher and takes priority.
            if language == Language("spa", "MX"):
                language = Language("spa")
                hearing_impaired = True

            # read the release subtitle
            release = sub.find_next("div",
                                    class_="version_name").get_text().strip()

            # ignore incomplete subtitles
            status = sub.find_next("div",
                                   class_="subtitle_buttons").contents[1]
            if status.name != "a":
                logger.debug("Ignoring subtitle in [%s] not finished",
                             language)
                continue

            # read the subtitle url
            subtitle_url = self.server_url + status["href"][1:]
            subtitle = SubtitulamosSubtitle(language, hearing_impaired,
                                            episode_url, series, season,
                                            episode, title, year, release,
                                            subtitle_url)
            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

        return subtitles
Code example #7
File: subtitulamos.py  Project: vdweegen/SickChill
    def query(self, series, season, episode, year=None):
        # get the episode url
        episode_url = self._search_url_titles(series, season, episode, year)
        if episode_url is None:
            logger.error('No episode url found for %s, season %d, episode %d',
                         series, season, episode)
            return []

        r = self.session.get(episode_url,
                             headers={'Referer': self.server_url},
                             timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])

        # get episode title
        title_pattern = re.compile('{}(.+){}x{:02d}- (.+)'.format(
            series, season, episode).lower())
        title = title_pattern.search(
            soup.select('#episode_title')[0].get_text().strip().lower()).group(
                2)

        subtitles = []
        for sub in soup.find_all('div', attrs={'id': 'progress_buttons_row'}):
            # read the language
            language = Language.fromsubtitulamos(
                sub.find_previous(
                    'div', class_='subtitle_language').get_text().strip())
            hearing_impaired = False

            # Treat Latin American Spanish as plain Spanish with hearing_impaired = True:
            # if both a Spanish and a Latin American Spanish subtitle exist for the same
            # episode, the plain Spanish one scores higher and takes priority.
            if language == Language('spa', 'MX'):
                language = Language('spa')
                hearing_impaired = True

            # read the release subtitle
            release = sub.find_next('div',
                                    class_='version_name').get_text().strip()

            # ignore incomplete subtitles
            status = sub.find_next('div',
                                   class_='subtitle_buttons').contents[1]
            if status.name != 'a':
                logger.debug('Ignoring subtitle in [%s] not finished',
                             language)
                continue

            # read the subtitle url
            subtitle_url = self.server_url + status['href'][1:]
            subtitle = SubtitulamosSubtitle(language, hearing_impaired,
                                            episode_url, series, season,
                                            episode, title, year, release,
                                            subtitle_url)
            logger.debug('Found subtitle %r', subtitle)
            subtitles.append(subtitle)

        return subtitles
Code example #8
    def query(self, series, season, episode, year=None):
        # get the show id
        show_id = self.get_show_id(series, year)
        if show_id is None:
            logger.error("No show id found for %s (%r)", series, year)
            return []

        # get the episode url
        episode_url = self.get_episode_url(show_id, series, season, episode, year)
        if episode_url is None:
            logger.info(f"[{self.provider_name}]: No episode url found for {series}, season {season}, episode {episode}")
            return []

        # get the page of the episode of the show
        r = self.session.get(episode_url, timeout=10)
        r.raise_for_status()
        soup = ParserBeautifulSoup(r.content, ["lxml", "html.parser"])

        # get episode title
        title_pattern = re.compile("Subt.+tulos de {}(.+){}x{:02d} - (.+)".format(series, season, episode).lower())
        title = title_pattern.search(soup.select("#cabecera-subtitulo")[0].get_text().strip().lower()).group(2)

        # loop over subtitle rows
        subtitles = []

        for sub in soup.find_all("div", attrs={"id": re.compile("version([0-9]+)")}):
            # read the release subtitle
            release = sanitize_release_group(release_pattern.search(sub.find("p", class_="title-sub").contents[2]).group(1))

            for html_language in sub.select("ul.sslist"):
                language = Language.fromtusubtitulo(html_language.find_next("b").get_text().strip())
                hearing_impaired = False

                # Treat Latin American Spanish as plain Spanish with hearing_impaired = True:
                # if both a Spanish and a Latin American Spanish subtitle exist for the same
                # episode, the plain Spanish one scores higher and takes priority.
                if language == Language("spa", "MX"):
                    language = Language("spa")
                    hearing_impaired = True

                # ignore incomplete subtitles
                status = sanitize(html_language.find_next("li", class_=re.compile("li-estado")).get_text())
                if status != "completado":
                    logger.debug("Ignoring subtitle with status %s", status)
                    continue

                # get the most updated version of the subtitle and if it doesn't exist get the original version
                html_status = html_language.select('a[href^="updated/"]')
                if len(html_status) == 0:
                    html_status = html_language.select('a[href^="original/"]')

                subtitle_url = self.server_url + html_status[0]["href"]
                subtitle = TuSubtituloSubtitle(language, hearing_impaired, episode_url, series, season, episode, title, year, release, subtitle_url)
                logger.debug("Found subtitle %r", subtitle)
                subtitles.append(subtitle)

        return subtitles
Code example #9
    def _get_download_link(self, subtitle):
        response = self.session.get(subtitle.page_link, timeout=20)
        self._check_response(response)
        try:
            page_soup = ParserBeautifulSoup(
                response.content.decode("utf-8", "ignore"),
                ["lxml", "html.parser"])
            links_soup = page_soup.find_all("a", {"class": "detalle_link"})
            for link_soup in links_soup:
                if link_soup["href"].startswith("bajar"):
                    return self.server_url + link_soup["href"]
            links_soup = page_soup.find_all("a", {"class": "link1"})
            for link_soup in links_soup:
                if "bajar.php" in link_soup["href"]:
                    return link_soup["href"]
        except Exception as e:
            raise APIThrottled(f"Error parsing download link: {e}")

        raise APIThrottled("Download link not found")
Code example #10
    def _get_download_link(self, subtitle):
        response = self.session.get(subtitle.page_link, timeout=20)
        self._check_response(response)
        try:
            page_soup = ParserBeautifulSoup(
                response.content.decode('iso-8859-1', 'ignore'),
                ['lxml', 'html.parser'])
            links_soup = page_soup.find_all("a", {'class': 'detalle_link'})
            for link_soup in links_soup:
                if link_soup['href'].startswith('bajar'):
                    return self.server_url + link_soup['href']
            links_soup = page_soup.find_all("a", {'class': 'link1'})
            for link_soup in links_soup:
                if "bajar.php" in link_soup['href']:
                    return link_soup['href']
        except Exception as e:
            raise APIThrottled('Error parsing download link: ' + str(e))

        raise APIThrottled('Download link not found')
Code example #11
    def _parse_subtitles_page(self, video, response, language):
        subtitles = []

        page_soup = ParserBeautifulSoup(
            response.content.decode("utf-8", "ignore"),
            ["lxml", "html.parser"])
        title_soups = page_soup.find_all("div",
                                         {"id": "menu_detalle_buscador"})
        body_soups = page_soup.find_all("div", {"id": "buscador_detalle"})

        for subtitle in range(0, len(title_soups)):
            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]

            # title
            title = title_soup.find("a").text.replace("Subtitulos de ", "")

            # filter by year
            if video.year and str(video.year) not in title:
                continue

            page_link = title_soup.find("a")["href"]

            # description
            description = body_soup.find("div", {
                "id": "buscador_detalle_sub"
            }).text
            description = description.replace(",", " ").lower()

            # uploader
            uploader = body_soup.find("a", {"class": "link1"}).text

            subtitle = self.subtitle_class(language, video, page_link, title,
                                           description, uploader)

            logger.debug("Found subtitle %r", subtitle)
            subtitles.append(subtitle)

        return subtitles
Code example #12
    def query(self, keyword, season=None, episode=None, year=None):
        params = keyword
        if season:
            params += ".S{season:02d}".format(season=season)
        elif year:
            params += " {:4d}".format(year)

        logger.debug("Searching subtitles %r", params)
        subtitles = []
        search_link = self.server_url + text_type(
            self.search_url).format(params)

        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug("No data returned from provider")
            return []

        soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"),
                                   ["lxml", "html.parser"])

        # non-shooter result page
        if soup.find("div", {"class": "item"}):
            logger.debug("enter a non-shooter page")
            for item in soup.find_all("div", {"class": "item"}):
                title_a = item.find("p", class_="tt clearfix").find("a")
                subs_year = re.findall(r"\d{4}", title_a.text) or None
                if season:
                    title = title_a.text
                    season_cn1 = re.search("第(.*)季", title)
                    if not season_cn1:
                        season_cn1 = "一"
                    else:
                        season_cn1 = season_cn1.group(1).strip()
                    season_cn2 = num_to_cn(str(season))
                    if season_cn1 != season_cn2:
                        continue
                episode_link = self.server_url + title_a.attrs["href"]
                new_subs = self._parse_episode_page(episode_link, subs_year)
                subtitles += new_subs

        # NOTE: shooter result pages are ignored due to the existence of assrt provider

        return subtitles
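
`num_to_cn` is a project helper that renders an Arabic season number as the Chinese numeral used in titles such as 第三季, so the two spellings can be compared directly. A hypothetical sketch covering small season numbers only (the real helper is not shown here):

    def num_to_cn(num):
        # hypothetical: enough for seasons 1-10, which the comparison above needs
        digits = "零一二三四五六七八九十"
        n = int(num)
        return digits[n] if n <= 10 else str(n)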
Code example #13
File: supersubtitles.py  Project: zandadoum/bazarr
    def find_imdb_id(self, sub_id):
        """

        """

        url = self.server_url + "index.php?tipus=adatlap&azon=a_" + sub_id
        # url = https://www.feliratok.info/index.php?tipus=adatlap&azon=a_1518600916
        logger.info('Get IMDB id from URL %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        links = soup.find_all("a")

        for value in links:
            if "imdb.com" in str(value):
                # <a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank"><img alt="iMDB" src="img/adatlap/imdb.png"/></a>
                imdb_id = re.findall(r'(?<=www\.imdb\.com/title/).*(?=/")', str(value))[0]
                return imdb_id

        return None
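
Applied to a trimmed version of the anchor shown in the comment, the lookbehind/lookahead pair isolates just the IMDb title id:

    import re

    value = '<a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank"></a>'
    print(re.findall(r'(?<=www\.imdb\.com/title/).*(?=/")', value))  # ['tt2357547']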
Code example #14
    def query(self, keyword, season=None, episode=None, year=None):
        query = keyword
        if season and episode:
            query += ' S{season:02d}E{episode:02d}'.format(season=season,
                                                           episode=episode)
        elif year:
            query += ' {:4d}'.format(year)

        params = {
            'buscar': query,  # search string
            'accion': 5,  # action search
            'oxdown': 1,  # order by downloads descending
            'pg': 1  # page 1
        }

        logger.debug('Searching subtitles %r', query)
        subtitles = []
        language = self.language_list[0]
        search_link = self.server_url + 'index.php'
        while True:
            r = self.session.get(search_link, params=params, timeout=10)
            r.raise_for_status()

            if not r.content:
                logger.debug('No data returned from provider')
                return []

            page_soup = ParserBeautifulSoup(
                r.content.decode('iso-8859-1', 'ignore'),
                ['lxml', 'html.parser'])
            title_soups = page_soup.find_all("div",
                                             {'id': 'menu_detalle_buscador'})
            body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})
            if len(title_soups) != len(body_soups):
                logger.debug('Error in provider data')
                return []
            for subtitle in range(0, len(title_soups)):
                title_soup, body_soup = title_soups[subtitle], body_soups[
                    subtitle]

                # title
                title = title_soup.find("a").text.replace("Subtitulo de ", "")
                page_link = title_soup.find("a")["href"].replace(
                    'http://', 'https://')

                # body
                description = body_soup.find("div", {
                    'id': 'buscador_detalle_sub'
                }).text
                download_link = body_soup.find(
                    "div", {
                        'id': 'buscador_detalle_sub_datos'
                    }).find("a", {'target': 'new'})["href"].replace(
                        'http://', 'https://')

                subtitle = self.subtitle_class(language, page_link,
                                               download_link, description,
                                               title)

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

            if len(title_soups) >= 20:
                params['pg'] += 1  # search next page
                time.sleep(self.multi_result_throttle)
            else:
                break

        return subtitles
Code example #15
File: supersubtitles.py  Project: zandadoum/bazarr
    def process_subs(self, series, video, url):

        subtitles = []

        logger.info('URL for subtitles %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        tables = soup.find_all("table")
        tables = tables[0].find_all("tr")
        i = 0
        series_imdb_id = None
        for table in tables:
            if "vilagit" in str(table) and i > 1:
                try:
                    sub_hun_name = table.findAll("div", {"class": "magyar"})[0]
                    if isinstance(video, Episode):
                        if "vad)" not in str(sub_hun_name):
                            # <div class="magyar">A pletykaf�szek (3. �vad)</div>
                            sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= -)', str(sub_hun_name))[0]
                        else:
                            # <div class="magyar">A holnap legend�i - 3x11</div>
                            sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?= \()', str(sub_hun_name))[0]
                    if isinstance(video, Movie):
                        sub_hun_name = re.findall(r'(?<=<div class="magyar">).*(?=</div)', str(sub_hun_name))[0]
                except IndexError:
                    sub_hun_name = ""

                asked_for_episode = None
                sub_season = None
                sub_episode = None
                sub_english = table.findAll("div", {"class": "eredeti"})
                if isinstance(video, Episode):
                    asked_for_episode = video.episode
                    if "Season" not in str(sub_english):
                        # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
                        sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?= -)', str(sub_english))[0]
                        sub_season = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[0]).strip())
                        sub_episode = int((re.findall(r"(?<=- ).*?(?= - )", str(sub_english))[0].split('x')[1]).strip())

                    else:
                        # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again (HDTV-AFG, HDTV-RMX, 720p-SVA, 720p-PSA </div>]
                        sub_english_name = \
                            re.findall(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))[0]
                        sub_season = int(re.findall(r"(?<=Season )\d+(?=\))", str(sub_english))[0])
                        sub_episode = int(video.episode)
                if isinstance(video, Movie):
                    sub_english_name = re.findall(r'(?<=<div class="eredeti">).*?(?=\()', str(sub_english))[0]

                sub_version = str(sub_english).split('(')[-1].split(')')[0]
                # <small>Angol</small>
                lang = table.findAll("small")[0]
                sub_language = self.get_language(re.findall(r"(?<=<small>).*(?=</small>)", str(lang))[0])

                # <a href="/index.php?action=letolt&amp;fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.English.C.orig.Addic7ed.com.srt&amp;felirat=1519162191">
                link = str(table.findAll("a")[-1]).replace("amp;", "")
                sub_downloadlink = self.server_url + re.findall(r'(?<=href="/).*(?=">)', link)[0]

                sub_id = re.findall(r"(?<=felirat\=).*(?=\"\>)", link)[0]
                sub_year = video.year
                sub_releases = [s.strip() for s in sub_version.split(',')]

                # For episodes we open the series page, so every subtitle shares the series
                # imdb_id; no need to look it up for each row
                if isinstance(video, Episode) and series_imdb_id is not None:
                    sub_imdb_id = series_imdb_id
                else:
                    sub_imdb_id = self.find_imdb_id(sub_id)
                    series_imdb_id = sub_imdb_id

                subtitle = SuperSubtitlesSubtitle(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(), sub_season,
                                                  sub_episode, sub_version, sub_releases, sub_year, sub_imdb_id,
                                                  asked_for_episode, asked_for_release_group=video.release_group)
                subtitles.append(subtitle)
            i = i + 1
        return subtitles
Code example #16
File: zimuku.py  Project: zx900930/bazarr
    def query(self, keyword, season=None, episode=None, year=None):
        params = keyword
        if season:
            params += ".S{season:02d}".format(season=season)
        elif year:
            params += " {:4d}".format(year)

        logger.debug("Searching subtitles %r", params)
        subtitles = []
        search_link = self.server_url + text_type(
            self.search_url).format(params)

        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug("No data returned from provider")
            return []

        html = r.content.decode("utf-8", "ignore")
        # parse window location
        pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url"
        parts = re.findall(pattern, html)
        redirect_url = search_link
        while parts:
            parts.reverse()
            redirect_url = urljoin(self.server_url, "".join(parts))
            r = self.session.get(redirect_url, timeout=30)
            html = r.content.decode("utf-8", "ignore")
            parts = re.findall(pattern, html)
        logger.debug("search url located: " + redirect_url)

        soup = ParserBeautifulSoup(r.content.decode("utf-8", "ignore"),
                                   ["lxml", "html.parser"])

        # non-shooter result page
        if soup.find("div", {"class": "item"}):
            logger.debug("enter a non-shooter page")
            for item in soup.find_all("div", {"class": "item"}):
                title_a = item.find("p", class_="tt clearfix").find("a")
                subs_year = year
                if season:
                    # episode year in zimuku is the season's year not show's year
                    actual_subs_year = re.findall(r"\d{4}",
                                                  title_a.text) or None
                    if actual_subs_year:
                        subs_year = int(actual_subs_year[0]) - season + 1
                    title = title_a.text
                    season_cn1 = re.search("第(.*)季", title)
                    if not season_cn1:
                        season_cn1 = "一"
                    else:
                        season_cn1 = season_cn1.group(1).strip()
                    season_cn2 = num_to_cn(str(season))
                    if season_cn1 != season_cn2:
                        continue
                episode_link = self.server_url + title_a.attrs["href"]
                new_subs = self._parse_episode_page(episode_link, subs_year)
                subtitles += new_subs

        # NOTE: shooter result pages are ignored due to the existence of assrt provider

        return subtitles
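
The zimuku search page hides the real result URL behind JavaScript of the form `url = '...' + url`, so the loop above collects the fragments in document order and joins them reversed (later assignments prepend). A small demonstration on made-up markup:

    import re
    from urllib.parse import urljoin

    html = "<script>url = '/3.html' + url; url = '/2' + url; url = '/1' + url;</script>"
    parts = re.findall(r"url\s*=\s*'([^']*)'\s*\+\s*url", html)  # ['/3.html', '/2', '/1']
    parts.reverse()
    print(urljoin("http://example.com", "".join(parts)))  # http://example.com/1/2/3.html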
Code example #17
    def query(self, keyword, season=None, episode=None, year=None, video=None):
        params = keyword
        if season and episode:
            params += ' S{season:02d}E{episode:02d}'.format(season=season,
                                                            episode=episode)
        elif year:
            params += '&ARok={:4d}'.format(year)

        logger.debug('Searching subtitles %r', params)
        subtitles = []
        if season and episode:
            search_link = self.server_url + text_type(
                self.search_url_series).format(params)
        elif year:
            search_link = self.server_url + text_type(
                self.search_url_movies).format(params)

        r = self.session.get(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug('No data returned from provider')
            return []

        soup = ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'),
                                   ['lxml', 'html.parser']).find(
                                       "table", class_="main_table")
        # loop over subtitles cells
        if soup:
            subs = soup.find_all("tr", class_="row1")
            subs += soup.find_all("tr", class_="row2")
            for sub in subs:
                page_link = '%s%s' % (self.server_url, sub.a.get('href'))
                title = sub.find_all('td')[0:1]
                title = [x.text for x in title]
                version = sub.find(class_="fixedTip")
                if version is None:
                    version = ""
                else:
                    version = version['title']
                try:
                    cells = sub.find_all('td')[6:7]
                    langs = [x.text.encode('utf-8') for x in cells]
                except Exception:
                    langs = [b'CZ']
                name = '%s (%s)' % (version, langs)

                if b'CZ' in langs:
                    language = Language('ces')
                elif b'SK' in langs:
                    language = Language('slk')
                else:
                    # skip rows whose language column is neither CZ nor SK
                    continue
                download_link = sub.find('a', class_='titulkydownloadajax')
                download_link = self.download_url + download_link.get('href')

                subtitle = self.subtitle_class(
                    language,
                    page_link,
                    season,
                    episode,
                    version,
                    download_link,
                    year,
                    title,
                    asked_for_release_group=video.release_group,
                    asked_for_episode=episode)

                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

            soup.decompose()
            soup = None

        return subtitles
Code example #18
    def query(self, series, season, episode, year=None, video=None):

        # Search for s01e03 instead of s1e3
        seasona = "%02d" % season
        episodea = "%02d" % episode
        series = fix_inconsistent_naming(series)
        seriesa = series.replace(' ', '+')

        # get the episode page
        logger.info('Getting the page for episode %s', episode)
        url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
            "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
        logger.info('Url %s', url)

        r = self.session.get(url, timeout=10).content

        i = 0
        soup = ParserBeautifulSoup(r, ['lxml'])

        table = soup.find_all("table")[9]

        subtitles = []
        # loop over subtitles rows
        for row in table.find_all("tr"):
            i = i + 1
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(row) and i > 5:
                datas = row.find_all("td")

                # Subliminal does not currently use these params, but they may come in handy later
                # hungarian_name = re.split(r's(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
                # Translator of the subtitle
                # sub_translator = datas[3].getText()
                # Posting date of the subtitle
                # sub_date = datas[4].getText()

                sub_year = sub_english_name = sub_version = None
                # Handle the case when '(' appears in the subtitle name
                if datas[1].getText().count('(') == 1:
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText())[3]
                if datas[1].getText().count('(') == 2:
                    sub_year = re.findall(r"(?<=\()(\d{4})(?=\))", datas[1].getText().strip())[0]
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText().split('(')[0])[0]

                if not sub_english_name:
                    continue

                sub_season = int((re.findall(r's(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                 .lstrip('0'))
                sub_episode = int((re.findall(r'e(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                  .lstrip('0'))

                if sub_season == season and sub_episode == episode:
                    sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                    sub_downloadlink = datas[6].find_all('a')[1]['href']
                    sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                    if datas[1].getText().count('(') == 1:
                        sub_version = datas[1].getText().split('(')[1].split(')')[0]
                    if datas[1].getText().count('(') == 2:
                        sub_version = datas[1].getText().split('(')[2].split(')')[0]

                    # One subtitle can be used for several releases
                    sub_releases = [s.strip() for s in sub_version.split(',')]
                    subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(),
                                                   sub_season, sub_episode, sub_version, sub_releases, sub_year,
                                                   asked_for_release_group=video.release_group,
                                                   asked_for_episode=episode)

                    logger.debug('Found subtitle: %r', subtitle)
                    subtitles.append(subtitle)

        return subtitles
Code example #19
File: hosszupuska.py  Project: pannal/Sub-Zero.bundle
    def query(self, series, season, episode, year=None, video=None):

        # Search for s01e03 instead of s1e3
        seasona = "%02d" % season
        episodea = "%02d" % episode
        series = fix_inconsistent_naming(series)
        seriesa = series.replace(' ', '+')

        # get the episode page
        logger.info('Getting the page for episode %s', episode)
        url = self.server_url + "sorozatok.php?cim=" + seriesa + "&evad="+str(seasona) + \
            "&resz="+str(episodea)+"&nyelvtipus=%25&x=24&y=8"
        logger.info('Url %s', url)

        r = self.session.get(url, timeout=10).content

        i = 0
        soup = ParserBeautifulSoup(r, ['lxml'])

        table = soup.find_all("table")[9]

        subtitles = []
        # loop over subtitles rows
        for row in table.find_all("tr"):
            i = i + 1
            if "this.style.backgroundImage='url(css/over2.jpg)" in str(row) and i > 5:
                datas = row.find_all("td")

                # Subliminal does not currently use these params, but they may come in handy later
                # hungarian_name = re.split(r's(\d{1,2})', datas[1].find_all('b')[0].getText())[0]
                # Translator of the subtitle
                # sub_translator = datas[3].getText()
                # Posting date of the subtitle
                # sub_date = datas[4].getText()

                sub_year = sub_english_name = sub_version = None
                # Handle the case when '(' appears in the subtitle name
                if datas[1].getText().count('(') == 1:
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText())[3]
                if datas[1].getText().count('(') == 2:
                    sub_year = re.findall(r"(?<=\()(\d{4})(?=\))", datas[1].getText().strip())[0]
                    sub_english_name = re.split(r's(\d{1,2})e(\d{1,2})', datas[1].getText().split('(')[0])[0]

                if not sub_english_name:
                    continue

                sub_season = int((re.findall(r's(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                 .lstrip('0'))
                sub_episode = int((re.findall(r'e(\d{1,2})', datas[1].find_all('b')[0].getText(), re.VERBOSE)[0])
                                  .lstrip('0'))

                if sub_season == season and sub_episode == episode:
                    sub_language = self.get_language(datas[2].find_all('img')[0]['src'].split('/')[1])
                    sub_downloadlink = datas[6].find_all('a')[1]['href']
                    sub_id = sub_downloadlink.split('=')[1].split('.')[0]

                    if datas[1].getText().count('(') == 1:
                        sub_version = datas[1].getText().split('(')[1].split(')')[0]
                    if datas[1].getText().count('(') == 2:
                        sub_version = datas[1].getText().split('(')[2].split(')')[0]

                    # One subtitle can be used for several releases
                    sub_releases = [s.strip() for s in sub_version.split(',')]
                    subtitle = self.subtitle_class(sub_language, sub_downloadlink, sub_id, sub_english_name.strip(),
                                                   sub_season, sub_episode, sub_version, sub_releases, sub_year,
                                                   asked_for_release_group=video.release_group,
                                                   asked_for_episode=episode)

                    logger.debug('Found subtitle: %r', subtitle)
                    subtitles.append(subtitle)

        return subtitles
Code example #20
File: supersubtitles.py  Project: zx900930/bazarr
    def process_subs(self, languages, video, url):

        subtitles = []

        logger.info('URL for subtitles %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        tables = soup.find_all("table")
        tables = tables[0].find_all("tr")
        i = 0
        series_imdb_id = None
        for table in tables:
            if "vilagit" in str(table) and i > 1:
                asked_for_episode = None
                sub_season = None
                sub_episode = None
                sub_english = table.findAll("div", {"class": "eredeti"})
                sub_english_name = None
                if isinstance(video, Episode):
                    asked_for_episode = video.episode
                    if "Season" not in str(sub_english):
                        # [<div class="eredeti">Gossip Girl (Season 3) (DVDRip-REWARD)</div>]
                        sub_english_name = re.search(
                            r'(?<=<div class="eredeti">).*?(?= -)',
                            str(sub_english))
                        sub_english_name = sub_english_name.group(
                        ) if sub_english_name else ''

                        sub_season = re.search(r"(?<=- ).*?(?= - )",
                                               str(sub_english))
                        sub_season = sub_season.group() if sub_season else ''
                        sub_season = int((sub_season.split('x')[0]).strip())

                        sub_episode = re.search(r"(?<=- ).*?(?= - )",
                                                str(sub_english))
                        sub_episode = sub_episode.group(
                        ) if sub_episode else ''
                        sub_episode = int((sub_episode.split('x')[1]).strip())

                    else:
                        # [<div class="eredeti">DC's Legends of Tomorrow - 3x11 - Here I Go Again (HDTV-AFG, HDTV-RMX,
                        # 720p-SVA, 720p-PSA </div>]
                        sub_english_name = \
                            re.search(r'(?<=<div class="eredeti">).*?(?=\(Season)', str(sub_english))
                        sub_english_name = sub_english_name.group(
                        ) if sub_english_name else ''
                        sub_season = re.search(r"(?<=Season )\d+(?=\))",
                                               str(sub_english))
                        sub_season = int(
                            sub_season.group()) if sub_season else None
                        sub_episode = int(video.episode)
                if isinstance(video, Movie):
                    sub_english_name = re.search(
                        r'(?<=<div class="eredeti">).*?(?=</div>)',
                        str(sub_english))
                    sub_english_name = sub_english_name.group(
                    ) if sub_english_name else ''
                    sub_english_name = sub_english_name.split(' (')[0]

                sub_version = 'n/a'
                if len(str(sub_english).split('(')) > 1:
                    sub_version = str(sub_english).split('(')[-1].split(')')[0]
                # <small>Angol</small>
                lang = table.find("small")
                sub_language = re.search(r"(?<=<small>).*(?=</small>)",
                                         str(lang))
                sub_language = sub_language.group() if sub_language else ''
                sub_language = self.get_language(sub_language)

                # <a href="/index.php?action=letolt&amp;fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.
                # English.C.orig.Addic7ed.com.srt&amp;felirat=1519162191">
                link = str(table.findAll("a")[-1]).replace("amp;", "")
                sub_downloadlink = re.search(r'(?<=href="/).*(?=">)', link)
                sub_downloadlink = sub_downloadlink.group(
                ) if sub_downloadlink else ''
                sub_downloadlink = self.server_url + sub_downloadlink

                sub_id = re.search(r"(?<=felirat=).*(?=\">)", link)
                sub_id = sub_id.group() if sub_id else ''
                sub_year = video.year
                sub_releases = [s.strip() for s in sub_version.split(',')]

                uploader = ''
                for item in table.contents[7].contents:
                    if isinstance(item, Tag):
                        uploader = item.text.lstrip('\r\n\t\t\t\t\t').rstrip(
                            '\r\n\t\t\t\t')
                    elif isinstance(item, NavigableString):
                        uploader = item.lstrip('\r\n\t\t\t\t\t').rstrip(
                            '\r\n\t\t\t\t')

                # For episodes we open the series page, so every subtitle shares the series
                # imdb_id; no need to look it up for each row
                if isinstance(video, Episode) and series_imdb_id is not None:
                    sub_imdb_id = series_imdb_id
                else:
                    sub_imdb_id = self.find_imdb_id(sub_id)
                    series_imdb_id = sub_imdb_id

                subtitle = SuperSubtitlesSubtitle(
                    sub_language,
                    sub_downloadlink,
                    sub_id,
                    sub_english_name.strip(),
                    sub_season,
                    sub_episode,
                    sub_version,
                    sub_releases,
                    sub_year,
                    sub_imdb_id,
                    uploader,
                    asked_for_episode,
                    asked_for_release_group=video.release_group)
                if subtitle.language in languages:
                    subtitles.append(subtitle)
            i = i + 1
        return subtitles
Code example #21
    def process_subs(self, languages, video, url):
        # movies only: return an empty list for episodes so callers always get a list
        if isinstance(video, Episode):
            return []

        subtitles = []

        logger.info('URL for subtitles %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        tables = soup.find_all("table")
        tables = tables[0].find_all("tr")
        i = 0

        for table in tables:
            if "vilagit" in str(table) and i > 1:
                asked_for_episode = None
                sub_season = None
                sub_episode = None
                sub_english = table.findAll("div", {"class": "eredeti"})
                sub_english_name = re.search(
                    r'(?<=<div class="eredeti">).*?(?=</div>)',
                    str(sub_english))
                sub_english_name = sub_english_name.group(
                ) if sub_english_name else ''
                sub_english_name = sub_english_name.split(' (')[0]

                sub_english_name = sub_english_name.replace('&amp;', '&')
                sub_version = 'n/a'
                if len(str(sub_english).split('(')) > 1:
                    sub_version = str(sub_english).split('(')[-1].split(')')[0]
                # <small>Angol</small>
                lang = table.find("small")
                sub_language = re.search(r"(?<=<small>).*(?=</small>)",
                                         str(lang))
                sub_language = sub_language.group() if sub_language else ''
                sub_language = self.get_language(sub_language)

                # <a href="/index.php?action=letolt&amp;fnev=DCs Legends of Tomorrow - 03x11 - Here I Go Again.SVA.
                # English.C.orig.Addic7ed.com.srt&amp;felirat=1519162191">
                link = str(table.findAll("a")[-1]).replace("amp;", "")
                sub_downloadlink = re.search(r'(?<=href="/).*(?=">)', link)
                sub_downloadlink = sub_downloadlink.group(
                ) if sub_downloadlink else ''
                sub_downloadlink = self.server_url + sub_downloadlink

                sub_id = re.search(r"(?<=felirat=).*(?=\">)", link)
                sub_id = sub_id.group() if sub_id else ''
                sub_year = video.year
                sub_releases = [s.strip() for s in sub_version.split(',')]

                uploader = ''
                for item in table.contents[7].contents:
                    if isinstance(item, Tag):
                        uploader = item.text.lstrip('\r\n\t\t\t\t\t').rstrip(
                            '\r\n\t\t\t\t')
                    elif isinstance(item, NavigableString):
                        uploader = item.lstrip('\r\n\t\t\t\t\t').rstrip(
                            '\r\n\t\t\t\t')

                sub_imdb_id = self.find_imdb_id(sub_id)

                subtitle = SuperSubtitlesSubtitle(
                    sub_language,
                    sub_downloadlink,
                    sub_id,
                    sub_english_name.strip(),
                    sub_season,
                    sub_episode,
                    sub_version,
                    sub_releases,
                    sub_year,
                    sub_imdb_id,
                    uploader,
                    asked_for_episode,
                    asked_for_release_group=video.release_group)
                if subtitle.language in languages:
                    subtitles.append(subtitle)
            i = i + 1
        return subtitles