Example #1
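This from_url helper from the Batoto chapter scraper resolves a chapter URL to its parent series and returns the matching chapter object from that series' chapter list.
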
    def from_url(url):
        # Resolve the chapter hash from the URL and fetch the reader page.
        chapter_hash = re.search(BatotoChapter.url_re, url).group(1)
        r = BatotoChapter._reader_get(chapter_hash, 1)
        soup = BeautifulSoup(r.text, config.get().html_parser)
        try:
            series_url = soup.find('a', href=BatotoSeries.url_re)['href']
        except TypeError:
            raise exceptions.ScrapingError('Chapter has no parent series link')
        # Walk the parent series' chapter list and return the entry whose URL
        # matches the requested one, ignoring the scheme.
        series = BatotoSeries(series_url)
        for chapter in series.chapters:
            if chapter.url.lstrip('htps') == url.lstrip('htps'):
                return chapter
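
The URL comparison at the end is scheme-insensitive because str.lstrip removes leading characters from a set rather than stripping a literal prefix, so both 'http' and 'https' are reduced to '://...'. A standalone sketch of the effect (the URLs below are illustrative only):

    # lstrip('htps') strips leading characters from the set {h, t, p, s};
    # stripping stops at ':', which is not in the set.
    assert 'https://bato.to/reader#abc'.lstrip('htps') == '://bato.to/reader#abc'
    assert 'http://bato.to/reader#abc'.lstrip('htps') == '://bato.to/reader#abc'
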
Example #2
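This constructor from the Mangakakalot series scraper fetches the series page, parses it with BeautifulSoup, and treats a missing '.manga-info-text h1' title element as a scraping error, since the site does not answer unknown titles with a 404.
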
    def __init__(self, url, **kwargs):
        super().__init__(url, **kwargs)

        response = requests.get(url)
        self.soup = BeautifulSoup(response.content, config.get().html_parser)

        # mangakakalot does not return 404 if there is no such title
        try:
            self.cached_name = self.soup.select('.manga-info-text h1')[0].text
        except IndexError:
            raise exceptions.ScrapingError()

        self.chapters = self.get_chapters()
Example #3
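This from_url helper from the Mangadex chapter scraper mirrors Example #1: it locates the parent series link on the reader page, builds a MangadexSeries from it, and returns the chapter whose URL matches the requested one, ignoring the scheme.
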
    def from_url(url):
        r = MangadexChapter._reader_get(url, 1)
        soup = BeautifulSoup(r.text, config.get().html_parser)
        try:
            series_url = soup.find('a', href=MangadexSeries.url_re)['href']
        except TypeError:
            raise exceptions.ScrapingError('Chapter has no parent series link')
        series = MangadexSeries(urljoin('https://mangadex.com', series_url))
        for chapter in series.chapters:
            # Compare URLs without their scheme so the http and https forms
            # of the same chapter URL are treated as equal.
            parsed_chapter_url = ''.join(urlparse(chapter.url)[1:])
            parsed_url = ''.join(urlparse(url)[1:])
            if parsed_chapter_url == parsed_url:
                return chapter
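
The scheme-agnostic comparison above joins every urlparse component except the scheme (index 0). A standalone sketch of the same idea; the helper name and URLs are illustrative only:

    from urllib.parse import urlparse

    def urls_match(a, b):
        # Drop the scheme and compare netloc, path, params, query and fragment,
        # so the http and https forms of the same URL compare equal.
        return ''.join(urlparse(a)[1:]) == ''.join(urlparse(b)[1:])

    assert urls_match('http://mangadex.com/chapter/1',
                      'https://mangadex.com/chapter/1')
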
Example #4
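This method from the Foolslide-based series scraper pages through the site's comic list API until _process_comic_list reports a match for the requested series URL, then records the series' Foolslide id and name. The page counter presumably feeds into the api_hook_list URL.
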
    def get_comic_details(self):
        """Parses through the various series listed on Foolslide until a match
        with the specified series URL is found.
        """
        while True:
            response = requests.get(self.api_hook_list).json()
            if response.get('error', None) == 'Comics could not be found':
                raise exceptions.ScrapingError()
            result = self._process_comic_list(response)
            if result:
                break
            # No match on this page of the listing; advance to the next page.
            self._page += 1
        self.foolslide_id = result['id']
        self.name = result['name']
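
Note that the loop only exits through the break or the ScrapingError: the 'Comics could not be found' error string doubles as the end-of-listing sentinel, so a change to that message on the Foolslide side would leave the loop without a termination condition.
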
Example #5
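This get_chapters method from the Madokami series scraper walks the rows of the directory's file table, skips rows without a 'Read' link, extracts the chapter number and release groups from each file name with name_re and fallback_re, and builds a MadokamiChapter per remaining row.
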
    def get_chapters(self):
        try:
            rows = (self.soup.find('table',
                                   class_='mobile-files-table').find_all('tr'))
        except AttributeError:
            raise exceptions.ScrapingError()
        chapters = []
        for row in rows[1:]:
            # If the Read link cannot be found in the current row, the row is
            # assumed to be a non-manga file uploaded to the directory and will
            # thus be skipped.
            if not row.find('a', text='Read'):
                continue

            link = row.find('a')

            url = urljoin(self.url, link.get('href'))

            name = link.string
            name_parts = re.search(name_re, name)
            if not name_parts:
                name_parts = re.search(fallback_re, name)
            try:
                chapter = name_parts.group(1)
            except AttributeError:
                continue
            if name_parts.group(2):
                groups = name_parts.group(2).split('][')
            else:
                groups = []

            c = MadokamiChapter(name=self.name,
                                alias=self.alias,
                                chapter=chapter,
                                url=url,
                                groups=groups,
                                session=self.session)
            chapters.append(c)
        return chapters
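
name_re and fallback_re are module-level patterns defined elsewhere in the scraper; the code above only relies on group 1 holding the chapter number and an optional group 2 holding a ']['-separated list of release groups. A purely illustrative pattern with that shape, not the project's actual regex:

    import re

    # Hypothetical pattern for file names like 'Title - c042 [GroupA][GroupB].zip'.
    name_re = re.compile(r'c(\d+(?:\.\d+)?)(?:\s*\[(.+)\])?')

    m = re.search(name_re, 'Title - c042 [GroupA][GroupB].zip')
    print(m.group(1))              # '042'
    print(m.group(2).split(']['))  # ['GroupA', 'GroupB']
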
Example #6
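This _get_page helper from the Mangadex series scraper fetches the /api/manga/<id> endpoint, sleeps for a short random interval to space out calls, and retries up to three times with a 10-17 second backoff when the server answers 503, raising a ScrapingError once the retry budget is spent.
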
    def _get_page(self, url):
        manga_id = re.search(self.url_re, url)
        r = requests.get('https://mangadex.org/api/manga/' + manga_id.group(1),
                         headers=MangadexSeries.headers)

        # TODO FIXME replace with properly spaced api calls
        #            This is a bad workaround for
        #                '503 please stop spaming the site'
        #            errors when making requests to /api/ URLs quickly.
        #            It may still break when 4 calls are done at the same time
        sleep(randrange(0, 900) / 1000.0)
        if r.status_code == 503 and self.spam_failures < 3:
            # sleep 10-17 seconds to wait out the spam protection
            # and make it less likely for all threads to hit at the same time
            sleep(randrange(10000, 17000) / 1000.0)
            self.spam_failures = self.spam_failures + 1
            return self._get_page(url)
        elif self.spam_failures >= 3:
            print("Error: Mangadex server probably contacted too often\n")
            print(r.text)
            raise exceptions.ScrapingError("Mangadex spam error")

        self.spam_failures = 0
        self.json = json.loads(r.text)
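
The random delays act as crude jitter: the sub-second sleep after every call spreads out concurrent requests, and the 10-17 second wait after a 503 makes it unlikely that several threads retry at exactly the same moment. The spam_failures counter bounds the recursive retry at three attempts before the ScrapingError is raised.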