def login(self):
    """Authenticate against the site and report success.

    Scrapes the login form from the login page (so hidden token inputs are
    preserved), posts the stored credentials, and treats the presence of a
    PHPSESSID cookie as proof of a valid session.

    :return: ``True`` when a session id was obtained, ``False`` otherwise.
    :rtype: bool
    """
    logger.info('Logging in')

    self.headers['Referer'] = self.site + '/index.php'
    self.session.headers.update(self.headers.items())
    response = self.session.get(self.loginpage)
    page = ParserBeautifulSoup(response.content, ['lxml'])

    # Carry over every form input (e.g. hidden session tokens) into the POST.
    fields = {tag.get('name'): tag.get('value') for tag in page.findAll('input')}
    fields['username'] = self.username
    fields['password'] = self.password
    fields['autologin'] = '******'
    fields['viewonline'] = 'on'

    self.headers['Referer'] = self.loginpage
    self.session.headers.update(self.headers.items())
    self.session.post(self.loginpage, fields)

    try:
        logger.debug('Got session id %s' % self.session.cookies.get_dict()['PHPSESSID'])
    except KeyError as e:
        # No PHPSESSID cookie -> the login did not take.
        logger.error(repr(e))
        logger.error("Didn't get session id, check your credentials")
        return False
    except Exception as e:
        logger.error(repr(e))
        logger.error('uncached error #legendasdivx #AA')
        return False
    return True
def _get_show_ids(self):
    """Fetch the provider's series listing and map sanitized titles to ids.

    :return: mapping of sanitized series name -> numeric show id, or an
        empty list when the provider returned no data.
    """
    # get the shows page
    logger.info('Getting show ids')
    response = self.session.get(self.server_url + self.all_series_url, timeout=10)
    response.raise_for_status()
    if not response.content:
        logger.debug('No data returned from provider')
        return []
    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    # populate the show ids; only the TV-series category is of interest
    show_ids = {}
    for category in soup.findAll('seriesl'):
        if category.attrs['category'] != u'Σειρές':
            continue
        for entry in category.findAll('series'):
            name = entry.text
            match = series_sanitize_re.match(name)
            if match:
                name = match.group(1)
            show_ids[sanitize(name)] = int(entry['srsid'])
        break

    logger.debug('Found %d show ids', len(show_ids))
    return show_ids
def login(self):
    """Log in to legendasdivx.pt and cache the session cookies.

    Scrapes the login form (to pick up the hidden 'sid' field), posts the
    stored credentials, verifies a PHPSESSID cookie was issued, and stores
    a filtered copy of the session cookies in the cache region.

    :raises AuthenticationError: when no session id is returned.
    :raises IPAddressBlocked: when the server reports the IP as blocked.
    :raises TooManyRequests: on any other HTTP error.
    :raises ServiceUnavailable: on any unexpected failure.
    """
    logger.debug('Legendasdivx.pt :: Logging in')
    try:
        # sleep for 1 second before another request
        sleep(1)
        res = self.session.get(self.loginpage)
        res.raise_for_status()
        bsoup = ParserBeautifulSoup(res.content, ['lxml'])

        _allinputs = bsoup.findAll('input')
        data = {}
        # necessary to set 'sid' for POST request
        for field in _allinputs:
            data[field.get('name')] = field.get('value')

        # sleep for 1 second before another request
        sleep(1)
        data['username'] = self.username
        data['password'] = self.password
        res = self.session.post(self.loginpage, data)
        res.raise_for_status()

        # make sure we're logged in
        logger.debug(
            'Legendasdivx.pt :: Logged in successfully: PHPSESSID: %s',
            self.session.cookies.get_dict()['PHPSESSID'])

        # keep only the cookies worth persisting
        cj = self.session.cookies.copy()
        store_cks = ("PHPSESSID", "phpbb3_2z8zs_sid", "phpbb3_2z8zs_k",
                     "phpbb3_2z8zs_u", "lang")
        for cn in iter(self.session.cookies.keys()):
            if cn not in store_cks:
                del cj[cn]

        # store session cookies on cache
        logger.debug(
            "Legendasdivx.pt :: Storing legendasdivx session cookies: %r", cj)
        region.set("legendasdivx_cookies2", cj)

    except KeyError:
        logger.error(
            "Legendasdivx.pt :: Couldn't get session ID, check your credentials"
        )
        raise AuthenticationError(
            "Legendasdivx.pt :: Couldn't get session ID, check your credentials"
        )
    except HTTPError as e:
        if "bloqueado" in res.text.lower():
            logger.error(
                "LegendasDivx.pt :: Your IP is blocked on this server.")
            raise IPAddressBlocked(
                "LegendasDivx.pt :: Your IP is blocked on this server.")
        logger.error("Legendasdivx.pt :: HTTP Error %s", e)
        # BUG FIX: exceptions do not lazy-format like logging calls do;
        # passing (fmt, arg) made the exception message a tuple. Build the
        # message string explicitly instead.
        raise TooManyRequests("Legendasdivx.pt :: HTTP Error %s" % e)
    except Exception as e:
        logger.error("LegendasDivx.pt :: Uncaught error: %r", e)
        # Same lazy-format fix as above.
        raise ServiceUnavailable("LegendasDivx.pt :: Uncaught error: %r" % e)
def query(self, video, language):
    """Search the provider for subtitles matching *video*'s release name.

    :param video: the video whose filename drives the search.
    :param language: requested language(s); a single Portuguese variant is
        translated into the site's category filter.
    :return: list of :class:`LegendasdivxSubtitle` candidates.
    """
    # Reuse an existing session when possible; otherwise (re-)login and
    # bail out of this query.
    try:
        logger.debug('Got session id %s' % self.session.cookies.get_dict()['PHPSESSID'])
    except Exception:
        self.login()
        return []

    # Map a single requested language onto the site's category filter.
    # form_cat=28 = br
    # form_cat=29 = pt
    language_ids = '0'
    if isinstance(language, (tuple, list, set)) and len(language) == 1:
        language_ids = ','.join(sorted(l.opensubtitles for l in language))
        language_ids = '&form_cat=28' if language_ids == 'por' else '&form_cat=29'

    # Build the search string from the release filename.
    stem, _ = os.path.splitext(os.path.basename(video.name))
    videoname = stem
    search_terms = stem.lower().replace(".", "+").replace("[", "").replace("]", "")
    if language_ids != '0':
        search_terms = search_terms + language_ids

    self.headers['Referer'] = self.site + '/index.php'
    self.session.headers.update(self.headers.items())
    res = self.session.get(self.searchurl.format(query=search_terms))

    if "A legenda não foi encontrada" in res.text:
        logger.warning('%s not found', search_terms)
        return []

    page = ParserBeautifulSoup(res.content, ['html.parser'])
    subtitles = []
    lang = Language.fromopensubtitles("pob")
    for box in page.findAll("div", {"class": "sub_box"}):
        hits = 0
        for header in box.findAll("th", {"class": "color2"}):
            if header.string == 'Hits:':
                hits = int(header.parent.find("td").string)
            if header.string == 'Idioma:':
                flag_src = header.parent.find("td").find("img").get('src')
                lang = Language.fromopensubtitles(
                    'pob' if 'brazil' in flag_src else 'por')

        description = box.find("td", {"class": "td_desc brd_up"})
        download = box.find("a", {"class": "sub_download"})
        try:
            # sometimes BSoup just doesn't get the link
            logger.debug(download.get('href'))
        except Exception:
            logger.warning('skipping subbox on %s' % self.searchurl.format(query=search_terms))
            continue

        exact_match = video.name.lower() in description.get_text().lower()
        data = {
            'link': self.site + '/modules.php' + download.get('href'),
            'exact_match': exact_match,
            'hits': hits,
            'videoname': videoname,
            'description': description.get_text(),
        }
        subtitles.append(LegendasdivxSubtitle(lang, video, data))
    return subtitles
def query(self, languages=None, title=None, imdb_id=None, video=None):
    """Query the provider and return an ordered list of subtitles.

    :param languages: set of requested languages; the first one is attached
        to every produced subtitle.
    :param title: movie/series title used to build the query parameters.
    :param imdb_id: IMDB id used to build the query parameters.
    :param video: the video being searched; an Episode contributes its
        episode number.
    :return: subtitles ordered via ``self.order()``.
    """
    subtitles = []

    params = self.getQueryParams(imdb_id, title)

    search_response = self.session.post(self.api_url, data=params, timeout=15)
    search_response.raise_for_status()

    soup = ParserBeautifulSoup(
        search_response.content.decode('utf-8', 'ignore'),
        ['lxml', 'html.parser'])

    # loop over subtitle cells
    rows = soup.select('div[id="round"]')
    if len(rows) == 0:
        logger.debug('No data returned from provider')
        return []

    # release comments are outside of the parent for the sub details itself,
    # so we just map it to another list
    comment_rows = soup.findAll('div', attrs={
        'class': None,
        'id': None,
        'align': None
    })

    for index, row in enumerate(rows):
        result_anchor_el = row.select_one('.buton').select('a')

        # Download link
        href = result_anchor_el[0]['href']
        download_link = self.server_url + href

        fullTitle = row.select_one('#content-main a').text

        # BUG FIX: reset per-row fields before parsing so a failed parse on
        # one row can neither raise NameError below nor leak a value from
        # the previous row. (All the bare "except:" clauses also became
        # "except Exception:" so Ctrl-C / SystemExit are not swallowed.)
        uploader = None
        page_link = None

        # Get title
        try:
            title = fullTitle.split("(")[0]
        except Exception:
            logger.error("Error parsing title")

        # Get Uploader
        try:
            uploader = row.select('#content-main p')[4].text[10:]
        except Exception:
            logger.error("Error parsing uploader")

        # Get downloads count
        downloads = 0
        try:
            downloads = int(row.select_one('#content-right p').text[12:])
        except Exception:
            logger.error("Error parsing downloads")

        # Get year
        try:
            year = int(fullTitle.split("(")[1].split(")")[0])
        except Exception:
            year = None
            logger.error("Error parsing year")

        # Get imdbId
        sub_imdb_id = self.getImdbIdFromSubtitle(row)

        comments = ''
        try:
            comments = comment_rows[index].text
            logger.debug('Comments: {}'.format(comments))
        except Exception:
            logger.error("Error parsing comments")

        # Get Page Link
        try:
            page_link = row.select_one('#content-main a')['href']
        except Exception:
            logger.error("Error parsing page_link")

        episode_number = video.episode if isinstance(video, Episode) else None

        subtitle = self.subtitle_class(next(iter(languages)), download_link,
                                       index, comments, title, sub_imdb_id,
                                       uploader, page_link, year, downloads,
                                       isinstance(video, Episode),
                                       episode_number)
        logger.debug('Found subtitle %r', str(subtitle))
        subtitles.append(subtitle)

    ordered_subs = self.order(subtitles)
    return ordered_subs
def query(self, show_id, series, season, year=None, country=None):
    """Return Greek subtitles for one season of the given show.

    Resolves the provider's internal season id for *season*, fetches that
    season's subtitle listing, and builds one subtitle per (release,
    episode) pair that has actually been published.
    """
    # get the season list of the show
    logger.info('Getting the season list of show id %d', show_id)
    response = self.session.get(self.server_url + self.series_url.format(show_id),
                                timeout=10)
    response.raise_for_status()
    if not response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])
    series = soup.find('name').text

    # resolve the provider's internal id for the requested season number
    season_id = None
    for season_row in soup.findAll('series_group'):
        try:
            if int(season_row['ssnnum']) == season:
                season_id = int(season_row['ssnid'])
                break
        except (ValueError, TypeError):
            continue

    if season_id is None:
        logger.debug('Season not found in provider')
        return []

    # get the subtitle list of the season
    logger.info('Getting the subtitle list of season %d', season)
    response = self.session.get(
        self.server_url + self.season_url.format(show_id=show_id, season=season_id),
        timeout=10)
    response.raise_for_status()
    if not response.content:
        logger.debug('No data returned from provider')
        return []

    soup = ParserBeautifulSoup(response.content, ['lxml', 'html.parser'])

    subtitles = []
    # loop over episode rows
    for group in soup.findAll('subg'):
        # read the episode info
        episode_info = group.find('etitle')
        if episode_info is None:
            continue

        # an entry may cover one episode or a range (e.g. double episodes)
        episodes = []
        number_match = episode_re.match(episode_info['number'])
        if number_match:
            episodes = [int(num)
                        for num in (number_match.group(1), number_match.group(3))
                        if num]

        details = group.find('sgt')
        if details is None:
            continue

        season = int(details['ssnnum'])
        episode_id = int(details['epsid'])

        # filter out unreleased subtitles
        for release in group.findAll('sr'):
            if release['published_on'] == '':
                continue

            page_link = self.server_url + self.page_link.format(
                show_id=show_id, season_id=season_id, season=season,
                episode=episode_id)
            title = episode_info['title']
            version = release.fmt.text + ' ' + release.team.text
            download_link = self.server_url + self.download_link.format(
                int(release['rlsid']))

            for episode in episodes:
                subtitle = self.subtitle_class(Language.fromalpha2('el'),
                                               page_link, series, season,
                                               episode, year, title, version,
                                               download_link)
                logger.debug('Found subtitle %r', subtitle)
                subtitles.append(subtitle)

    return subtitles