def download(self):
    if not self.available():
        raise exceptions.ScrapingError
    base_url = _make_api_request(
        "/at-home/server/" + self.json["id"]).json()["baseUrl"]
    pages = [
        base_url + "/data/" + self.json["attributes"]["hash"] + "/" + x
        for x in self.json["attributes"]["data"]
    ]
    if len(pages) <= 0:
        output.error("{}: chapter is hosted externally".format(self.alias))
        raise exceptions.ScrapingError("external")
    files = [None] * len(pages)
    futures = []
    with self.progress_bar(pages) as bar:
        for i, page in enumerate(pages):
            r = self.req_session.get(page, stream=True)
            if not r or r.status_code == 404:
                output.error("{}: failed request for page {}".format(
                    self.alias, i))
                raise exceptions.ScrapingError
            fut = download_pool.submit(self.page_download_task, i, r,
                                       page_url=page)
            fut.add_done_callback(
                partial(self.page_download_finish, bar, files))
            futures.append(fut)
        concurrent.futures.wait(futures)
        self.create_zip(files)
def _make_api_request(url, session=None, extra_headers={}):
    while True:
        if debug:
            output.warning("Mangadex API: requesting -> " + url)
        try:
            if session:
                r = session.get('https://api.mangadex.org/' + url.strip('/'),
                                headers={**MangadexV5Series.headers,
                                         **extra_headers})
            else:
                r = requests.get('https://api.mangadex.org/' + url.strip('/'),
                                 headers={**MangadexV5Series.headers,
                                          **extra_headers})
        except requests.exceptions.ConnectionError:
            output.error(
                "Mangadex API: request to endpoint failed: {}".format(url))
            raise exceptions.ScrapingError
        if r.status_code == 200:
            return r
        elif r.status_code == 429:
            retry_delay = int(r.headers["retry-after"])
            output.warning(
                "Mangadex API: wait {} seconds...".format(retry_delay))
            time.sleep(retry_delay)
        else:
            output.error("Mangadex API: got bad status code {}".format(
                r.status_code))
            return r
def edit(alias, setting, value):
    """Modify settings for a follow.

    The following settings can be edited: alias, directory.
    """
    series = db.Series.alias_lookup(alias, unfollowed=True)
    alias = series.alias
    if value.lower() == 'none' or value.lower() == '-':
        value = None
    if setting == 'alias':
        series.alias = value
    elif setting == 'directory':
        series.directory = value
    else:
        setting = click.style(setting, bold=True)
        output.error('Invalid setting {}'.format(setting))
        exit(1)
    if not value:
        value = click.style('none', dim=True)
    else:
        value = click.style(value, bold=True)
    try:
        db.session.commit()
    except exceptions.DatabaseIntegrityError:
        db.session.rollback()
        output.error('Illegal value {}'.format(value))
        exit(1)
    else:
        output.chapter('Changed {} for {} to {}'.format(setting, alias, value))
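# A usage sketch for the command above (hedged: this assumes the function is
# wired up as a click command named `edit` on the cu2 CLI, and the alias
# "example-series" is made up for illustration):
#
#   cu2 edit example-series alias opm               # rename the follow's alias
#   cu2 edit example-series directory ~/manga/opm   # set a custom download directory
#   cu2 edit opm directory none                     # "none" (or "-") clears the value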
def test_chapter_download_latest(self):
    latest_releases = self.get_five_latest_releases()
    for release in latest_releases:
        try:
            chapter = mangahere.MangahereChapter.from_url(release)
        except exceptions.ScrapingError as e:
            output.error('Scraping error for {} - {}'.format(release, e))
            raise exceptions.ScrapingError
        else:
            chapter.get(use_db=False)
def test_database():
    """Runs a database sanity test."""
    sanity_tester = sanity.DatabaseSanity(Base, engine)
    sanity_tester.test()
    if sanity_tester.errors:
        for error in sanity_tester.errors:
            err_target, err_msg = str(error).split(' ', 1)
            message = ' '.join([click.style(err_target, bold=True), err_msg])
            output.warning(message)
        output.error('Database has failed sanity check; '
                     'run `cu2 repair-db` to repair database')
        exit(1)
def alias_lookup(alias, unfollowed=False):
    """Returns a DB object for a series by alias name.

    Prints an error if an invalid alias is specified.
    """
    filters = {'alias': alias}
    if not unfollowed:
        filters['following'] = not unfollowed
    try:
        s = session.query(Series).filter_by(**filters).one()
    except NoResultFound:
        output.error('Could not find alias "{}"'.format(alias))
        exit(1)
    else:
        return s
def page_download_task(self, page_num, r, page_url=None):
    """Saves the response body of a single request, returning the file handle
    and the passed through number of the page to allow for non-sequential
    downloads in parallel.
    """
    ext = BaseChapter.guess_extension(r.headers.get('content-type'))
    f = NamedTemporaryFile(suffix=ext, delete=False)
    retries = 20
    while retries > 0:
        try:
            for chunk in r.iter_content(chunk_size=4096):
                if chunk:
                    f.write(chunk)
            retries = 0
        # basically ignores this exception that requests throws. my
        # understanding is that it is raised when you attempt to iter_content()
        # over the same content twice. don't understand how that situation
        # arises with the current code but it did somehow.
        # https://stackoverflow.com/questions/45379903/
        except requests.exceptions.StreamConsumedError:
            pass
        # when under heavy load, Mangadex will often kill the connection in
        # the middle of an image download. in the original architecture,
        # the requests are all opened in the scrapers in stream mode, then
        # the actual image payloads are downloaded in the asynchronous
        # callbacks. when this occurs we have no choice but to re-request
        # the image from the beginning (easier than playing around with range
        # headers). this means each thread may issue multiple new requests.
        # I have found the performance overhead to be mostly negligible.
        except requests.exceptions.ChunkedEncodingError:
            if not page_url:
                output.error(
                    "Connection killed on page {} but scraper does not "
                    "support retries".format(str(page_num)))
                raise exceptions.ScrapingError
            output.warning(
                "Connection killed on page {}, {} retries remaining".format(
                    str(page_num), str(retries)))
            retries = retries - 1
            if retries <= 0:
                output.error(
                    "Connection killed on page {}, no retries remaining - "
                    "aborting chapter".format(str(page_num)))
                raise exceptions.ScrapingError
            r = self.req_session.get(page_url, stream=True)
    f.flush()
    f.close()
    r.close()
    return (page_num, f)
def test_chapter_download_latest(self):
    latest_releases = self.get_five_latest_releases()
    for release in latest_releases:
        try:
            chapter = mangadex_v5.MangadexV5Chapter.from_url(release)
        except exceptions.ScrapingError as e:
            output.error('Scraping error for {} - {}'.format(release, e))
            raise exceptions.ScrapingError
        else:
            try:
                chapter.get(use_db=False)
            except exceptions.ScrapingError as e:
                if e.message == "external":
                    continue
                raise e
def _get_page(self, url):
    segments = url.rstrip('/').split('/')
    if len(segments) == 7:
        manga_id = MangadexV5Series._translate_manga_id(segments[-3])
    elif len(segments) == 6:
        manga_id = MangadexV5Series._translate_manga_id(segments[-2])
    elif segments[-1].isdigit():
        manga_id = MangadexV5Series._translate_manga_id(segments[-1])
    else:
        manga_id = segments[-1]
    r = _make_api_request('/manga/' + manga_id, session=self.req_session)
    # this bit is duplicated in _decode_json because at this point we don't
    # have enough data from the API to call self.alias
    try:
        self.json = json.loads(r.text)
    except json.decoder.JSONDecodeError:
        output.error("Mangadex API: failed to decode JSON response")
        raise exceptions.ScrapingError
def _decode_json(string):
    try:
        try:
            return json.loads(string)["data"]
        except json.decoder.JSONDecodeError:
            output.error(self.alias +
                         ": Mangadex API: failed to decode JSON response")
            raise exceptions.ScrapingError
    except KeyError:
        output.error(self.alias + ": Mangadex API: request returned status: " +
                     json.loads(string)["result"])
        raise exceptions.ScrapingError
    except NameError:
        # self is not in scope when this helper runs outside an instance
        # context, so report the status without the series alias
        output.error("Mangadex API: request returned status: " +
                     json.loads(string)["result"])
        raise exceptions.ScrapingError
def download(self):
    if not getattr(self, "cpage", None):
        self.cpage = self.req_session.get(self.url.replace("m.", "www."),
                                          headers=chrome_headers)
    if self.cpage.status_code == 404:
        raise exceptions.ScrapingError
    if not getattr(self, "soup", None):
        self.soup = BeautifulSoup(self.cpage.text, config.get().html_parser)
    pages = []
    (mid, cid) = (None, None)
    # index of script with ids may vary
    # it may also change as ads are added/removed from the site
    for f in range(0, len(self.soup.find_all("script"))):
        try:
            if len(self.soup.find_all("script")[f].contents):
                mid = re.search(
                    "var comicid = ([0-9]+)",
                    self.soup.find_all("script")[f].contents[0]).groups()[0]
                cid = re.search(
                    "var chapterid =([0-9]+)",
                    self.soup.find_all("script")[f].contents[0]).groups()[0]
        except AttributeError:
            pass
    if mid and cid:
        old_num_pages = -1
        while old_num_pages != len(pages):
            old_num_pages = len(pages)
            pages = self._request_pages(mid, cid, pages)
    else:
        # some titles (seems to be ones with low page counts like webtoons)
        # don't use progressively-loaded pages. for these, the image list
        # can be extracted directly off the main page
        for g in range(0, len(self.soup.find_all("script"))):
            try:
                pages = loads(
                    re.search(
                        "var newImgs = (.+);var newImginfos",
                        beautify(self.soup.find_all("script")[g].text)
                        .replace("\\", "").replace("'", "\"")).groups()[0])
            except AttributeError:
                pass
    if not len(pages):
        raise exceptions.ScrapingError
    for i, page in enumerate(pages):
        pages[i] = "https:" + page
    futures = []
    files = [None] * len(pages)
    with self.progress_bar(pages) as bar:
        for i, page in enumerate(pages):
            retries = 0
            while retries < 10:
                try:
                    r = self.req_session.get(page, stream=True)
                    break
                except requests.exceptions.ConnectionError:
                    retries += 1
            # end of chapter detection in the web ui is done by issuing
            # requests for nonexistent pages which return 404s (who comes up
            # with this)
            if r.status_code != 404:
                if r.status_code != 200:
                    r.close()
                    output.error("Page download got status code {}".format(
                        str(r.status_code)))
                    raise exceptions.ScrapingError
                fut = download_pool.submit(self.page_download_task, i, r)
                fut.add_done_callback(
                    partial(self.page_download_finish, bar, files))
                futures.append(fut)
            else:
                try:
                    del files[i]
                except IndexError:
                    raise exceptions.ScrapingError
        concurrent.futures.wait(futures)
        self.create_zip(files)
def config_command(mode, setting, value):
    """Get or set configuration options.

    Mode can be either "get" or "set", depending on whether you want to read
    or write configuration values. If mode is "get", you can specify a setting
    to read that particular setting or omit it to list out all the settings.
    If mode is "set", you must specify the setting to change and assign it a
    new value.
    """
    if mode == 'get':
        if setting:
            parameters = setting.split('.')
            value = config.get()
            for parameter in parameters:
                try:
                    value = getattr(value, parameter)
                except AttributeError:
                    output.error('Setting not found')
                    exit(1)
            output.configuration({setting: value})
        else:
            configuration = config.get().serialize()
            output.configuration(configuration)
    elif mode == 'set':
        if setting is None:
            output.error('You must specify a setting')
            exit(1)
        if value is None:
            output.error('You must specify a value')
            exit(1)
        parameters = setting.split('.')
        preference = config.get()
        for parameter in parameters[0:-1]:
            try:
                preference = getattr(preference, parameter)
            except AttributeError:
                output.error('Setting not found')
                exit(1)
        try:
            current_value = getattr(preference, parameters[-1])
        except AttributeError:
            output.error('Setting not found')
            exit(1)
        if current_value is not None:
            if isinstance(current_value, bool):
                if value.lower() == 'false' or value == 0:
                    value = False
                else:
                    value = True
            else:
                try:
                    value = type(current_value)(value)
                except ValueError:
                    output.error('Type mismatch: value should be {}'
                                 .format(type(current_value).__name__))
                    exit(1)
        setattr(preference, parameters[-1], value)
        config.get().write()
    else:
        output.error('Mode must be either get or set')
        exit(1)
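# A usage sketch for the command above (hedged: this assumes the function is
# exposed on the cu2 CLI as `config`; `html_parser` is the only setting name
# taken from elsewhere in this codebase, anything else would be illustrative):
#
#   cu2 config get                    # print the whole configuration
#   cu2 config get html_parser        # read one setting (dotted paths reach nested groups)
#   cu2 config set html_parser lxml   # the value is coerced to the setting's current type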