Example #1
def Download_Nhentai():
    '''Code, artist, range'''
    
    import bs4, re, json
    from Webscraping import USER, WEBDRIVER
    from Webscraping.utils import save_image

    driver = WEBDRIVER()
    path = USER / r'Downloads\Images\Comics'
    comic = USER / r'Dropbox\Software\Webscraping\comics.json'
    
    # Each entry is (gallery code, artist, (start, end) page range);
    # the final entry in the list is skipped.
    for code, artist, range_ in json.loads(comic.read_text())[""][:-1]:
        
        comic = path / f'[{artist}] {range_[0]}-{range_[1]}'
        comic.mkdir(exist_ok=True)

        driver.get(f'https://nhentai.net/g/{code}')
        html = bs4.BeautifulSoup(driver.page_source(5), 'lxml')
        pages = html.find_all('a', class_='gallerythumb')
        
        for page in pages[range_[0] - 1:range_[1]]:

            # Visit each page and pull the full-size image URL from it.
            driver.get(f'https://nhentai.net{page.get("href")}')
            image = bs4.BeautifulSoup(driver.page_source(5), 'lxml')
            src = image.find(src=re.compile('.+galleries.+')).get('src')
            name = comic / src.split('/')[-1]
            if name.exists(): continue
            save_image(name, src)
            
    driver.close()
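
Example #1 never shows comics.json itself. Judging from the `json.loads(...)[""][:-1]` lookup and the three-way unpacking, the file appears to hold a single empty-string key mapping to a list of (code, artist, range) entries, with an expendable final entry. A minimal sketch under that assumption, with placeholder values:

import json
from pathlib import Path

# Hypothetical comics.json layout inferred from Example #1; the last
# entry is never processed, so it can serve as a trailing placeholder.
sample = {
    "": [
        ["123456", "some_artist", [1, 10]],
        ["000000", "placeholder", [0, 0]],
    ]
}
Path('comics.json').write_text(json.dumps(sample, indent=4))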
Example #2
def Download_Nhentai():
    '''Code, artist, range'''

    import requests, bs4, re, ast
    from Webscraping import USER
    from Webscraping.utils import save_image

    path = USER / r'Downloads\Images\Comics'
    comic = USER / r'Dropbox\Software\comics.txt'

    # Each line is a Python literal tuple: (code, artist, (start, end));
    # the final line of the file is skipped.
    for arg in comic.read_text().splitlines()[:-1]:

        code, artist, range_ = ast.literal_eval(arg)

        comic = path / f'[{artist}] {range_[0]}-{range_[1]}'
        comic.mkdir(exist_ok=True)

        page_source = requests.get(f'https://nhentai.net/g/{code}')
        html = bs4.BeautifulSoup(page_source.content, 'lxml')
        pages = html.find_all('a', class_='gallerythumb')

        for page in pages[range_[0] - 1:range_[1]]:

            page = requests.get(f'https://nhentai.net{page.get("href")}')
            image = bs4.BeautifulSoup(page.content, 'lxml')
            src = image.find(src=re.compile('.+galleries.+')).get('src')
            name = comic / src.split('/')[-1]
            if name.exists(): continue
            save_image(name, src)
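
Example #2 feeds every line of comics.txt through ast.literal_eval, so each line (except the last, which splitlines()[:-1] drops) must be a Python literal that unpacks into a code, an artist, and a two-element range. A compatible file might look like this, with placeholder values and a throwaway final line:

('123456', 'some_artist', (1, 10))
('654321', 'another_artist', (5, 20))
end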
Example #3
def extract_errors(path, dest):

    from Webscraping.utils import save_image

    # Retry every URL recorded in the error log, skipping blank
    # lines and files that were already downloaded.
    if path.exists():

        for image in path.read_text().split('\n'):

            image = image.strip()
            if not image: continue
            name = dest / image.split('/')[-1].split('?')[0]
            if not name.exists(): save_image(name, image)

    else: path.touch()
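
Every example here relies on a save_image helper from Webscraping.utils that is never shown. A minimal sketch of what it plausibly does, assuming it downloads a URL to a path and reports success (the real implementation may differ):

import requests

def save_image(name, url):
    # Hypothetical stand-in for Webscraping.utils.save_image:
    # fetch `url` and write the bytes to the Path `name`,
    # returning True on success so callers can log failures.
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        return False
    name.write_bytes(response.content)
    return True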
Example #4
def extract_files(source, dest=None, headless=True):
    
    import re, time, bs4, send2trash
    from urllib.parse import urlparse
    from Webscraping import WEBDRIVER
    # json_generator is used below but never imported in the original;
    # its location here is an assumption.
    from Webscraping.utils import USER, save_image, json_generator
        
    def get_url(driver, src):

        # Load the watch page and read the mp4 URL out of a
        # meta tag's content attribute.
        driver.get(src)
        time.sleep(4)
        html = bs4.BeautifulSoup(driver.page_source(), 'lxml')
        url = html.find(content=re.compile('.+mp4'))

        return url.get('content')

    def extract_errors(path, dest):

        # Retry previously failed downloads recorded in the error log,
        # skipping blank lines and files that already exist.
        if path.exists():

            for image in path.read_text().split('\n'):

                image = image.strip()
                if not image: continue
                name = dest / image.split('/')[-1].split('?')[0]
                if not name.exists(): save_image(name, image)

        else: path.touch()
    
    if isinstance(source, str): source = USER / source
    if dest is None: dest = source
    else: dest = USER / dest
    
    driver = WEBDRIVER(headless=headless, profile=None)
    errors_txt = source / 'Errors.txt'
    extract_errors(errors_txt, dest)
    errors = []
        
    for file in source.glob('*json'):

        for url in json_generator(file):
            
            path = urlparse(url['url']).path[1:]
            if re.match('https://i.imgur.com/.+gif', url['url']):
                # Imgur serves gifs as mp4; str.replace returns a new
                # string, so the result must be reassigned.
                path = path.replace('gif', 'mp4')
            elif re.search('.+/watch.+', url['url']):
                try:
                    path = get_url(driver, url['url'])
                    url['url'] = path
                except Exception:
                    errors.append(url['url'])
                    continue

            name = dest / path.split('/')[-1]
            if name.exists(): continue
            # An 'about:blank' url means the real address was stashed
            # in the title field instead.
            try: image = (
                    f'https://{url["title"]}'
                    if url['url'] == 'about:blank' else
                    url['url']
                    )
            except KeyError: continue
            
            # if name.suffix == '.gifv':
            #     name = name.with_suffix('.mp4')
            #     image = image.replace('gifv', 'mp4')
            # elif name.suffix == '.webp':
            #     name = name.with_suffix('.jpg')
            
            if not save_image(name, image): errors.append(image)
            elif name.suffix == '.gif' and b'MPEG' in name.read_bytes():
                # The file is really an mp4 with a .gif name; rename it.
                try: name.rename(name.with_suffix('.mp4'))
                except OSError: name.unlink(missing_ok=True)
        
        send2trash.send2trash(str(file))
    
    if errors: errors_txt.write_text('\n'.join(errors))

    for file in dest.glob('*webp'):

        # Relabel webp files with a jpg extension; the bytes are
        # copied unchanged rather than re-encoded.
        name = file.with_suffix('.jpg')
        name.write_bytes(file.read_bytes())
        file.unlink()
        
    for file in dest.glob('*gifv'):

        # Re-download each gifv as the corresponding imgur mp4,
        # then drop the original file.
        name = file.with_suffix('.mp4')
        image = f'https://i.imgur.com/{name.name}'
        save_image(name, image)
        file.unlink()
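
extract_files iterates over json_generator(file), a helper that is used but never defined in these examples. Judging from the url['url'] and url['title'] accesses, it yields one dict per saved link. A minimal sketch under that assumption:

import json

def json_generator(file):
    # Hypothetical reconstruction of the missing helper: assume the
    # file holds a JSON list of link records, each with a 'url' key
    # and an optional 'title' key, and yield them one at a time.
    for item in json.loads(file.read_text()):
        if isinstance(item, dict) and 'url' in item:
            yield item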