def from_url(url):
    chapter_hash = re.search(BatotoChapter.url_re, url).group(1)
    r = BatotoChapter._reader_get(chapter_hash, 1)
    soup = BeautifulSoup(r.text, config.get().html_parser)
    try:
        series_url = soup.find('a', href=BatotoSeries.url_re)['href']
    except TypeError:
        raise exceptions.ScrapingError('Chapter has no parent series link')
    series = BatotoSeries(series_url)
    for chapter in series.chapters:
        # Strip the scheme characters so that http and https variants of
        # the same URL compare equal.
        if chapter.url.lstrip('htps') == url.lstrip('htps'):
            return chapter
def __init__(self, url, **kwargs):
    super().__init__(url, **kwargs)
    response = requests.get(url)
    self.soup = BeautifulSoup(response.content, config.get().html_parser)
    # Mangakakalot does not return a 404 if there is no such title, so a
    # missing name element is treated as a scraping failure instead.
    try:
        self.cached_name = self.soup.select('.manga-info-text h1')[0].text
    except IndexError:
        raise exceptions.ScrapingError()
    self.chapters = self.get_chapters()
def from_url(url):
    r = MangadexChapter._reader_get(url, 1)
    soup = BeautifulSoup(r.text, config.get().html_parser)
    try:
        series_url = soup.find('a', href=MangadexSeries.url_re)['href']
    except TypeError:
        raise exceptions.ScrapingError('Chapter has no parent series link')
    series = MangadexSeries(urljoin('https://mangadex.com', series_url))
    for chapter in series.chapters:
        # Compare the URLs without their scheme so that http and https
        # variants of the same address still match.
        parsed_chapter_url = ''.join(urlparse(chapter.url)[1:])
        parsed_url = ''.join(urlparse(url)[1:])
        if parsed_chapter_url == parsed_url:
            return chapter
def get_comic_details(self):
    """Parses through the various series listed on Foolslide until a
    match with the specified series URL is found.
    """
    while True:
        response = requests.get(self.api_hook_list).json()
        if response.get('error', None) == 'Comics could not be found':
            raise exceptions.ScrapingError()
        result = self._process_comic_list(response)
        if result:
            break
        self._page += 1
    self.foolslide_id = result['id']
    self.name = result['name']
def get_chapters(self):
    try:
        rows = (self.soup.find('table', class_='mobile-files-table')
                    .find_all('tr'))
    except AttributeError:
        raise exceptions.ScrapingError()
    chapters = []
    for row in rows[1:]:
        # If the Read link cannot be found in the current row, the row is
        # assumed to be a non-manga file uploaded to the directory and will
        # thus be skipped.
        if not row.find('a', text='Read'):
            continue
        link = row.find('a')
        url = urljoin(self.url, link.get('href'))
        name = link.string
        name_parts = re.search(name_re, name)
        if not name_parts:
            name_parts = re.search(fallback_re, name)
        try:
            chapter = name_parts.group(1)
        except AttributeError:
            continue
        if name_parts.group(2):
            groups = name_parts.group(2).split('][')
        else:
            groups = []
        c = MadokamiChapter(name=self.name, alias=self.alias,
                            chapter=chapter, url=url, groups=groups,
                            session=self.session)
        chapters.append(c)
    return chapters
def _get_page(self, url):
    manga_id = re.search(self.url_re, url)
    r = requests.get('https://mangadex.org/api/manga/' + manga_id.group(1),
                     headers=MangadexSeries.headers)
    # TODO FIXME: replace with properly spaced API calls.
    # This is a bad workaround for '503 please stop spamming the site'
    # errors when making requests to /api/ URLs quickly.
    # It may still break when 4 calls are made at the same time.
    sleep(randrange(0, 900) / 1000.0)
    if r.status_code == 503 and self.spam_failures < 3:
        # Sleep 10-17 seconds to wait out the spam protection and make it
        # less likely for all threads to hit at the same time.
        sleep(randrange(10000, 17000) / 1000.0)
        self.spam_failures += 1
        return self._get_page(url)
    elif self.spam_failures >= 3:
        print("Error: Mangadex server probably contacted too often\n")
        print(r.text)
        raise exceptions.ScrapingError("Mangadex spam error")
    self.spam_failures = 0
    self.json = json.loads(r.text)
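# The TODO above asks for properly spaced API calls instead of random sleeps.
# A minimal sketch of one way to do that is below, assuming a module-level
# throttle is acceptable; throttled_get, _api_lock, _last_api_call and
# _MIN_INTERVAL are hypothetical names and not part of the existing scraper.
import threading
import time

import requests

_api_lock = threading.Lock()
_last_api_call = 0.0
_MIN_INTERVAL = 1.5  # assumed minimum spacing between /api/ requests, in seconds


def throttled_get(url, **kwargs):
    """Issue a GET request, waiting so that consecutive calls are spaced apart."""
    global _last_api_call
    with _api_lock:
        wait = _MIN_INTERVAL - (time.monotonic() - _last_api_call)
        if wait > 0:
            time.sleep(wait)
        _last_api_call = time.monotonic()
    return requests.get(url, **kwargs)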