# Module-level imports assumed by the snippets in this section; each function
# lives in its own scraper module in the repo, and helpers such as print_info,
# open_url, zip_files, series_info, default_user_agent, chapter_series, and
# cmp_items are defined elsewhere in the codebase.
import logging
import os
import re
import urllib.error
import urllib.request
import zipfile
from functools import cmp_to_key

from bs4 import BeautifulSoup


def download_chapter(self, chapter, download_directory, download_name):
    files = []
    warnings = []
    logging.debug('Downloading chapter {}.'.format(chapter["url"]))
    page = BeautifulSoup(self.open_url(chapter["url"]), "html.parser")
    scripts = page.find_all("script")
    for script in scripts:
        # Dynasty-Scans embeds the page list as JSON inside a `var pages` script tag.
        if re.search(r'var pages', script.text):
            matches = re.findall(r'"image":"(.*?)"', script.text)
            image_count = len(matches)
            for image_name, match in enumerate(matches, start=1):
                print_info("Download: Page {0:04d} / {1:04d}".format(image_name, image_count))
                image_url = 'http://dynasty-scans.com/' + match
                file_extension = re.search(r'.*\.([A-Za-z]*)', image_url).group(1)
                req = urllib.request.Request(image_url, headers={
                    'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                    'Accept-encoding': 'gzip'})
                try:
                    response = urllib.request.urlopen(req)
                except urllib.error.HTTPError as e:
                    print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                    warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                        image_name, chapter["chapter"], self.series_info('title')))
                    continue
                filename = '{}/{:06d}.{}'.format(download_directory, image_name, file_extension)
                with open(filename, 'wb') as f:
                    f.write(response.read())
                files.append(filename)
            break
    filename = download_directory + '/' + download_name
    self.zip_files(files, filename)
    return warnings

def download_chapter(self, chapter, download_directory, download_name):
    files = []
    warnings = []
    logging.debug('Downloading chapter {}.'.format(chapter["url"]))
    # Strip non-ASCII characters from the URL before fetching.
    page = BeautifulSoup(self.open_url(chapter["url"].encode('ascii', 'ignore').decode('utf-8')), "html.parser")
    scripts = page.find("div", {"id": "containerRoot"}).find_all('script')
    for script in scripts:
        # Image URLs are pushed one by one onto a lstImages array in a script tag.
        if re.search(r'lstImages', script.text):
            matches = re.findall(r'lstImages\.push\(".*"\);', script.text)
            image_count = len(matches)
            for image_name, match in enumerate(matches, start=1):
                print_info("Download: Page {0:04d} / {1:04d}".format(image_name, image_count))
                image_url = re.search(r'lstImages\.push\("(.*)"\);', match).group(1)
                file_extension = re.search(r'.*\.([A-Za-z]*)', image_url).group(1)
                req = urllib.request.Request(image_url, headers={
                    'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                    'Accept-encoding': 'gzip'})
                try:
                    response = urllib.request.urlopen(req)
                except urllib.error.HTTPError as e:
                    print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                    warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                        image_name, chapter["chapter"], self.series_info('title')))
                    continue
                filename = '{}/{:06d}.{}'.format(download_directory, image_name, file_extension)
                with open(filename, 'wb') as f:
                    f.write(response.read())
                files.append(filename)
            break
    filename = download_directory + '/' + download_name
    self.zip_files(files, filename)
    return warnings

def __init__(self, url, server=None):
    self.url = url
    # Optional image-mirror override; Batoto serves images from img1-img4.
    if server is None:
        self.server = None
    elif server in ['img1', 'img2', 'img3', 'img4']:
        self.server = server
    else:
        print_info('Invalid server selection.')
        self.server = None
    if re.match(r'.*bato\.to/comic/.*', url):
        # Series URL: scrape the series page directly.
        self.page = BeautifulSoup(self.open_url(url), "html.parser")
        self.init_with_chapter = False
        logging.debug('Object initialized with series')
    elif re.match(r'.*bato\.to/read/.*', url):
        # Chapter URL: resolve the parent series page first.
        try:
            self.page = BeautifulSoup(self.open_url(self.chapter_series(url)), "html.parser")
        except IndexError:
            print_info('ERROR: Unable to scrape chapter \'{}\'. If this is a new release, please try again later (Batoto bug).'.format(self.url))
            self.page = None
        self.init_with_chapter = True
        logging.debug('Object initialized with chapter')
    else:
        self.page = None
        self.init_with_chapter = False
        logging.debug('Empty object initialized')
    logging.debug('Object created with ' + url)

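# Usage sketch for the constructor above (hypothetical URLs and class name;
# this __init__ belongs to the repo's Batoto scraper class, assumed here to
# be called Batoto):
#
#     series = Batoto('http://bato.to/comic/_/comics/some-series-r1234')
#     chapter = Batoto('http://bato.to/read/_/123456', server='img2')
#
# A server value outside img1-img4 is rejected and falls back to the default.
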
def download_chapter(self, chapter, download_directory, download_name):
    files = []
    warnings = []
    logging.debug('\n************************************************')
    logging.debug('Downloading chapter {}.'.format(chapter["url"]))
    # Strip non-ASCII characters from the URL before fetching.
    page = BeautifulSoup(self.open_url(chapter["url"].encode('ascii', 'ignore').decode('utf-8')), "html.parser")
    scripts = page.find_all('script')  # TODO
    # Last path component of the chapter URL, used to prefix the image files.
    chapter_name = chapter["url"].strip('/').split('/')[-1]
    image_name = 1
    for script in scripts:
        if re.search(r'(var slides_page_path = \[")(.+)("\];)', script.text):
            image_url = re.search(r'(var slides_page_path = \[")(.+)("\];)', script.text).group(2)
            need_short = 1
        elif re.search(r'(var slides_page_url_path = \[")(.+)("\];)', script.text):
            image_url = re.search(r'(var slides_page_url_path = \[")(.+)("\];)', script.text).group(2)
            need_short = 0
        else:
            continue
        image_urls = image_url.split('","')
        if need_short == 1:
            # slides_page_path entries need sorting into page order.
            image_urls = sorted(image_urls, key=cmp_to_key(cmp_items))
        for image_url in image_urls:
            if image_url == '':
                continue
            file_extension = re.search(r'.*\.([A-Za-z]*)', image_url).group(1)
            logging.debug('Downloading image ' + image_url)
            req = urllib.request.Request(image_url, headers={
                'User-agent': self.default_user_agent(),
                'Accept-encoding': 'gzip'})
            try:
                response = urllib.request.urlopen(req)
            except urllib.error.HTTPError as e:
                print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                    image_name, chapter["chapter"], self.series_info('title')))
                continue
            filename = '{}/{}-{:06d}.{}'.format(download_directory, chapter_name, image_name, file_extension)
            with open(filename, 'wb') as f:
                f.write(response.read())
            logging.debug('Saved image ' + filename)
            files.append(filename)
            image_name += 1
        break
    filename = download_directory + '/' + download_name
    self.zip_files(files, filename)
    logging.debug('Finished {} Chapter'.format(chapter_name))
    return warnings

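# `cmp_items`, used with cmp_to_key above, is defined elsewhere in the repo.
# A minimal sketch of a compatible old-style comparator, assuming the slide
# paths end in a numeric page index that should sort numerically rather than
# lexicographically (hypothetical stand-in, not the repo's implementation):
def cmp_items_sketch(a, b):
    def page_no(path):
        # Extract the trailing digits before the file extension, e.g. "0012.jpg" -> 12.
        match = re.search(r'(\d+)\.[A-Za-z]+$', path)
        return int(match.group(1)) if match else 0
    return page_no(a) - page_no(b)
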
def download_chapter(self, chapter, download_directory, download_name):
    files = []
    warnings = []
    logging.debug('Downloading chapter {}.'.format(chapter["url"]))
    page = BeautifulSoup(self.open_url(chapter["url"]), "html.parser")
    scripts = page.find_all("script")
    for script in scripts:
        if re.search(r'var pages', script.text):
            matches = re.findall(r'"image":"(.*?)"', script.text)
            image_count = len(matches)
            for image_name, match in enumerate(matches, start=1):
                print_info("Download: Page {0:04d} / {1:04d}".format(image_name, image_count))
                image_url = 'http://dynasty-scans.com/' + match
                file_extension = re.search(r'.*\.([A-Za-z]*)', image_url).group(1)
                req = urllib.request.Request(image_url, headers={
                    'User-agent': self.default_user_agent(),
                    'Accept-encoding': 'gzip'})
                try:
                    response = urllib.request.urlopen(req)
                except urllib.error.HTTPError as e:
                    print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                    warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                        image_name, chapter["chapter"], self.series_info('title')))
                    continue
                filename = '{}/{:06d}.{}'.format(download_directory, image_name, file_extension)
                with open(filename, 'wb') as f:
                    f.write(response.read())
                files.append(filename)
            break
    filename = download_directory + '/' + download_name
    self.zip_files(files, filename)
    return warnings

def zip_files(files, filename):
    # Bundle the downloaded pages into a zip archive, deleting each source
    # file once it has been added.
    with zipfile.ZipFile(filename, mode="w") as zipf:
        for f in files:
            zipf.write(f, os.path.basename(f))
            os.remove(f)
    print_info("Zip created: " + filename.replace(os.environ['HOME'], "~"))

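# Usage sketch (illustrative paths; note that zip_files removes the page
# files as it adds them to the archive):
#
#     zip_files(['/tmp/dl/000001.png', '/tmp/dl/000002.png'],
#               '/tmp/dl/Series c001.zip')
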
def download_chapter(self, chapter, download_directory, download_name):
    chapter_url = chapter["url"]
    logging.debug('Downloading chapter {}.'.format(chapter_url))
    # Parse the chapter page into its own variable so the chapter dict stays
    # available for the metadata lookups in the warning messages below.
    soup = BeautifulSoup(self.open_url(chapter_url), "html.parser")
    files = []
    warnings = []
    try:
        # Per-page mode: the reader exposes a page_select dropdown of page URLs.
        page_urls = soup.find("select", {"name": "page_select"}).find_all("option")
        pages = [page["value"] for page in page_urls]
        logging.debug('Per page mode')
        image_count = len(pages)
        for image_name, page_url in enumerate(pages, start=1):
            print_info("Download: Page {0:04d} / {1:04d}".format(image_name, image_count))
            page = BeautifulSoup(self.open_url(page_url), "html.parser")
            url = page.find("div", {"id": "full_image"}).find("img")["src"]
            if self.server is not None:
                # Rewrite the image URL onto the user-selected mirror server.
                url = 'http://{}.bato.to{}'.format(self.server, re.search(r'.*\.bato\.to(.*)', url).group(1))
            file_extension = re.search(r'.*\.([A-Za-z]*)', url).group(1)
            req = urllib.request.Request(url, headers={
                'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                'Accept-encoding': 'gzip'})
            try:
                logging.debug('Downloading img {}'.format(url))
                response = urllib.request.urlopen(req)
            except urllib.error.HTTPError as e:
                print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                    image_name, chapter["chapter"], self.series_info('title')))
                continue
            filename = '{}/{:06d}.{}'.format(download_directory, image_name, file_extension)
            with open(filename, 'wb') as f:
                f.write(response.read())
            files.append(filename)
    except AttributeError:
        # No page_select dropdown: the chapter is served as a single long strip.
        logging.debug('Long strip mode')
        page = BeautifulSoup(self.open_url(chapter_url), "html.parser")
        images = page.find_all('img', src=re.compile(r"img[0-9]*\.bato\.to/comics/.*/.*/.*/.*/read.*/"))
        image_count = len(images)
        for image_name, image in enumerate(images, start=1):
            print_info("Download: Page {0:04d} / {1:04d}".format(image_name, image_count))
            url = image['src']
            file_extension = re.search(r'.*\.([A-Za-z]*)', url).group(1)
            req = urllib.request.Request(url, headers={
                'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
                'Accept-encoding': 'gzip'})
            try:
                response = urllib.request.urlopen(req)
            except urllib.error.HTTPError as e:
                print_info('WARNING: Unable to download file ({}).'.format(str(e)))
                warnings.append('Download of page {}, chapter {:g}, series "{}" failed.'.format(
                    image_name, chapter["chapter"], self.series_info('title')))
                continue
            filename = '{}/{:06d}.{}'.format(download_directory, image_name, file_extension)
            with open(filename, 'wb') as f:
                f.write(response.read())
            files.append(filename)
    filename = download_directory + '/' + download_name
    self.zip_files(files, filename)
    return warnings
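
# Usage sketch for the download_chapter variants above (illustrative values;
# chapter dicts carrying "url" and "chapter" keys are built by each scraper's
# series parser elsewhere in the repo):
#
#     chapter = {"url": "http://bato.to/read/_/123456", "chapter": 1.0}
#     warnings = scraper.download_chapter(chapter, '/tmp/dl', 'Series c001.zip')
#     for warning in warnings:
#         print_info(warning)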