def from_url(url):
    chapter_hash = re.search(BatotoChapter.url_re, url).group(1)
    r = BatotoChapter._reader_get(chapter_hash, 1)
    soup = BeautifulSoup(r.text, config.get().html_parser)
    try:
        series_url = soup.find('a', href=BatotoSeries.url_re)['href']
    except TypeError:
        raise exceptions.ScrapingError('Chapter has no parent series link')
    series = BatotoSeries(series_url)
    for chapter in series.chapters:
        # Strip the scheme characters so that http and https variants of
        # the same URL compare equal.
        if chapter.url.lstrip('htps') == url.lstrip('htps'):
            return chapter
def __init__(self, url, **kwargs):
    super().__init__(url, **kwargs)
    response = requests.get(url)
    self.soup = BeautifulSoup(response.content, config.get().html_parser)
    # Mangakakalot does not return a 404 if there is no such title, so a
    # missing name element is treated as a scraping failure instead.
    try:
        self.cached_name = self.soup.select('.manga-info-text h1')[0].text
    except IndexError:
        raise exceptions.ScrapingError()
    self.chapters = self.get_chapters()
def from_url(url):
    r = MangadexChapter._reader_get(url, 1)
    soup = BeautifulSoup(r.text, config.get().html_parser)
    try:
        series_url = soup.find('a', href=MangadexSeries.url_re)['href']
    except TypeError:
        raise exceptions.ScrapingError('Chapter has no parent series link')
    series = MangadexSeries(urljoin('https://mangadex.com', series_url))
    for chapter in series.chapters:
        # Compare the URLs without their scheme so that http and https
        # variants of the same address still match.
        parsed_chapter_url = ''.join(urlparse(chapter.url)[1:])
        parsed_url = ''.join(urlparse(url)[1:])
        if parsed_chapter_url == parsed_url:
            return chapter
def get_comic_details(self):
    """Parses through the various series listed on Foolslide until a
    match with the specified series URL is found.
    """
    while True:
        response = requests.get(self.api_hook_list).json()
        if response.get('error', None) == 'Comics could not be found':
            raise exceptions.ScrapingError()
        result = self._process_comic_list(response)
        if result:
            break
        self._page += 1
    self.foolslide_id = result['id']
    self.name = result['name']
def get_chapters(self):
    try:
        rows = (self.soup.find('table', class_='mobile-files-table')
                    .find_all('tr'))
    except AttributeError:
        raise exceptions.ScrapingError()
    chapters = []
    for row in rows[1:]:
        # If the Read link cannot be found in the current row, the row is
        # assumed to be a non-manga file uploaded to the directory and will
        # thus be skipped.
        if not row.find('a', text='Read'):
            continue
        link = row.find('a')
        url = urljoin(self.url, link.get('href'))
        name = link.string
        name_parts = re.search(name_re, name)
        if not name_parts:
            name_parts = re.search(fallback_re, name)
        try:
            chapter = name_parts.group(1)
        except AttributeError:
            continue
        if name_parts.group(2):
            groups = name_parts.group(2).split('][')
        else:
            groups = []
        c = MadokamiChapter(name=self.name, alias=self.alias,
                            chapter=chapter, url=url, groups=groups,
                            session=self.session)
        chapters.append(c)
    return chapters
def _get_page(self, url):
    manga_id = re.search(self.url_re, url)
    r = requests.get('https://mangadex.org/api/manga/' + manga_id.group(1),
                     headers=MangadexSeries.headers)
    # TODO FIXME: replace with properly spaced API calls.
    # This is a bad workaround for '503 please stop spamming the site'
    # errors when making requests to /api/ URLs quickly.
    # It may still break when 4 calls are made at the same time.
    sleep(randrange(0, 900) / 1000.0)
    if r.status_code == 503 and self.spam_failures < 3:
        # Sleep 10-17 seconds to wait out the spam protection and make it
        # less likely for all threads to hit at the same time.
        sleep(randrange(10000, 17000) / 1000.0)
        self.spam_failures += 1
        return self._get_page(url)
    elif self.spam_failures >= 3:
        print("Error: Mangadex server probably contacted too often\n")
        print(r.text)
        raise exceptions.ScrapingError("Mangadex spam error")
    self.spam_failures = 0
    self.json = json.loads(r.text)
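# The TODO above asks for properly spaced API calls instead of random sleeps.
# A minimal sketch of one way to do that is below, assuming a module-level
# throttle is acceptable; throttled_get, _api_lock, _last_api_call and
# _MIN_INTERVAL are hypothetical names and not part of the existing scraper.
import threading
import time

import requests

_api_lock = threading.Lock()
_last_api_call = 0.0
_MIN_INTERVAL = 1.5  # assumed minimum spacing between /api/ requests, in seconds


def throttled_get(url, **kwargs):
    """Issue a GET request, waiting so that consecutive calls are spaced apart."""
    global _last_api_call
    with _api_lock:
        wait = _MIN_INTERVAL - (time.monotonic() - _last_api_call)
        if wait > 0:
            time.sleep(wait)
        _last_api_call = time.monotonic()
    return requests.get(url, **kwargs)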