def Download_Nhentai():
    '''Download nhentai galleries listed in comics.json.

    Each entry under the "" key is (code, artist, range): the gallery
    code, the artist tag used in the destination folder name, and an
    inclusive 1-based page range to fetch. The final entry is skipped
    (``[:-1]``), matching the stored file format.
    '''
    import bs4, re, json  # ast was imported but never used
    from Webscraping import USER, WEBDRIVER
    from Webscraping.utils import save_image

    driver = WEBDRIVER()
    path = USER / r'Downloads\Images\Comics'
    comics_json = USER / r'Dropbox\Software\Webscraping\comics.json'

    # BUG FIX: the original called json.load(open(...)) and leaked the
    # file handle; read inside a context manager instead.
    with open(comics_json) as fp:
        jobs = json.load(fp)[""][:-1]

    for code, artist, range_ in jobs:
        comic = path / f'[{artist}] {range_[0]}-{range_[1]}'
        comic.mkdir(exist_ok=True)

        driver.get(f'https://nhentai.net/g/{code}')
        html = bs4.BeautifulSoup(driver.page_source(5), 'lxml')
        pages = html.findAll('a', class_='gallerythumb')

        # Slice to the requested pages (1-based, inclusive bounds).
        for page in pages[range_[0] - 1:range_[1]]:
            # BUG FIX: the original assigned driver.get(...)'s return
            # value over the loop variable `page`; the result is unused.
            driver.get(f'https://nhentai.net{page.get("href")}')
            image = bs4.BeautifulSoup(driver.page_source(5), 'lxml')
            src = image.find(src=re.compile('.+galleries.+')).get('src')
            name = comic / src.split('/')[-1]
            if name.exists():
                continue  # already downloaded
            save_image(name, src)

    driver.close()
def Download_Nhentai():
    '''Code, artist, range

    Fetch the galleries described line-by-line in comics.txt (one
    ``(code, artist, range)`` tuple literal per line, last line
    skipped), saving the requested page range of each gallery into
    its own ``[artist] start-end`` folder.
    '''
    import requests, bs4, re, ast
    from Webscraping import USER
    from Webscraping.utils import save_image

    path = USER / r'Downloads\Images\Comics'
    comic = USER / r'Dropbox\Software\comics.txt'

    # The job list is read once here, before `comic` is reused below
    # as the per-gallery destination directory.
    for arg in comic.read_text().splitlines()[:-1]:
        code, artist, range_ = ast.literal_eval(arg)
        comic = path / f'[{artist}] {range_[0]}-{range_[1]}'
        comic.mkdir(exist_ok=True)

        response = requests.get(f'https://nhentai.net/g/{code}')
        gallery = bs4.BeautifulSoup(response.content, 'lxml')
        thumbs = gallery.findAll('a', class_='gallerythumb')

        # 1-based inclusive page range.
        for thumb in thumbs[range_[0] - 1:range_[1]]:
            detail = requests.get(f'https://nhentai.net{thumb.get("href")}')
            soup = bs4.BeautifulSoup(detail.content, 'lxml')
            src = soup.find(src=re.compile('.+galleries.+')).get('src')
            target = comic / src.split('/')[-1]
            if not target.exists():
                save_image(target, src)
def extract_errors(path, dest):
    '''Retry each failed image URL recorded in *path*, saving into *dest*.

    path : Path — text file with one URL per line (created empty if missing)
    dest : Path — directory to save the downloaded files into
    '''
    if path.exists():
        for image in path.read_text().splitlines():
            image = image.strip()
            # BUG FIX: a blank line (e.g. trailing newline) made
            # `dest / ''` resolve to dest itself and could call
            # save_image with an empty URL — skip it.
            if not image:
                continue
            # Filename is the last URL segment, query string stripped.
            name = dest / image.split('/')[-1].split('?')[0]
            if not name.exists():
                save_image(name, image)
    else:
        path.touch()
def extract_files(source, dest=None, headless=True):
    '''Download every media URL found in *source*'s *.json exports into *dest*.

    source   : str or Path — directory containing *.json link exports
    dest     : str or Path or None — target directory (defaults to source)
    headless : bool — run the scraping browser without a window

    Failed downloads are collected and written to source/Errors.txt so
    a later run can retry them; processed json files are sent to trash.
    '''
    # BUG FIX: time, send2trash and WEBDRIVER were used below but never
    # imported, raising NameError at runtime.
    import re, time, bs4, send2trash
    from urllib.parse import urlparse
    from Webscraping import WEBDRIVER
    from Webscraping.utils import USER, save_image

    def get_url(driver, src):
        # Resolve a */watch* page to the direct mp4 url in its meta tags.
        driver.get(src)
        time.sleep(4)
        html = bs4.BeautifulSoup(driver.page_source(), 'lxml')
        url = html.find(content=re.compile('.+mp4'))
        return url.get('content')

    def extract_errors(path, dest):
        # Retry every url recorded by a previous run's error file.
        if path.exists():
            for image in path.read_text().split('\n'):
                image = image.strip()
                if not image:
                    continue  # blank line would resolve to dest itself
                name = dest / image.split('/')[-1].split('?')[0]
                if not name.exists():
                    save_image(name, image)
        else:
            path.touch()

    if isinstance(source, str):
        source = USER / source
    dest = source if dest is None else USER / dest

    driver = WEBDRIVER(headless=headless, profile=None)
    errors_txt = source / 'Errors.txt'
    extract_errors(errors_txt, dest)
    errors = []

    for file in source.glob('*json'):
        # NOTE(review): json_generator is expected at module scope — confirm.
        for url in json_generator(file):
            path = urlparse(url['url']).path[1:]

            if re.match('https://i.imgur.com/.+gif', url['url']):
                # BUG FIX: str.replace returns a new string; the original
                # discarded the result, leaving `path` unchanged.
                path = path.replace('gif', 'mp4')
            elif re.search('.+/watch.+', url['url']):
                try:
                    path = get_url(driver, url['url'])
                    url['url'] = path
                except Exception:  # was a bare except
                    errors.append(url['url'])
                    continue

            name = dest / path.split('/')[-1]
            if name.exists():
                continue

            try:
                # about:blank entries carry the real address in 'title'.
                image = (
                    f'https://{url["title"]}'
                    if url['url'] == 'about:blank' else
                    url['url']
                )
            except KeyError:
                continue

            if not save_image(name, image):
                errors.append(image)
            elif name.suffix == '.gif' and b'MPEG' in name.read_bytes():
                # Mis-labelled video: .gif file actually holds MPEG data.
                try:
                    name.rename(name.with_suffix('.mp4'))
                except Exception:  # was a bare except
                    name.unlink(missing_ok=True)  # was missing_ok=1

        send2trash.send2trash(str(file))

    # BUG FIX: the browser was never shut down, leaking the process.
    driver.close()

    if errors:
        errors_txt.write_text('\n'.join(errors))

    # Normalize leftover formats in dest: webp -> jpg copy,
    # gifv -> re-download imgur's mp4 rendition.
    for file in dest.glob('*webp'):
        name = file.with_suffix('.jpg')
        name.write_bytes(file.read_bytes())
        file.unlink()

    for file in dest.glob('*gifv'):
        name = file.with_suffix('.mp4')
        image = f'https://i.imgur.com/{name.name}'
        save_image(name, image)
        file.unlink()