def deviantart_login(path: str) -> None:
    """Start gallery-dl's interactive DeviantArt OAuth flow, using the
    hydownloader environment at *path* (its cookies.txt and gallery-dl cache)."""
    log.init(path, True)
    db.init(path)
    oauth_args = [
        '--cookies', db.get_rootpath() + '/cookies.txt',
        '-o', 'cache.file=' + db.get_rootpath() + '/gallery-dl-cache.db',
        'oauth:deviantart',
    ]
    gallery_dl_utils.run_gallery_dl_with_custom_args(oauth_args)
def check_results_of_post_url(data: dict, sitename: str) -> bool:
    """
    Downloads a URL with gallery-dl, then checks if the downloaded filenames,
    file content and anchor entries match what was provided by the caller.

    :param data: dict with keys 'url' (post URL to download), 'filenames'
        (mapping of expected relative filenames to lists of regexes that must
        match the file's contents) and 'anchors' (anchor DB entries expected
        after the download).
    :param sitename: site name, used only for log/output file naming.
    :return: True if the download succeeded and every expectation was met.
    """
    url = data['url']
    filenames = data['filenames']
    anchors = data['anchors']
    log.info("hydownloader-test", f'Testing downloading of posts for site {sitename}')
    log_file = db.get_rootpath()+f"/logs/test-site-{sitename}-gallery-dl.txt"
    result_txt = gallery_dl_utils.run_gallery_dl(
        url=url,
        ignore_anchor=False,
        metadata_only=False,
        log_file=log_file,
        console_output_file=db.get_rootpath()+f"/test/test-site-{sitename}-gallery-dl-output.txt",
        unsupported_urls_file=db.get_rootpath()+f"/test/test-site-{sitename}-unsupported-urls-gallery-dl.txt",
        overwrite_existing=False,
        subscription_mode=False,
        test_mode=True
    )
    result = True
    # a non-empty result string is an error message from the download
    if result_txt:
        log.error("hydownloader-test", f"Error returned for {sitename} download: {result_txt}")
        result = False
    else:
        log.info("hydownloader-test", f"Return code for {sitename} download OK")
    # Check that every expected file exists and matches its content regexes.
    for fname, content_patterns in filenames.items():
        abs_fname = db.get_rootpath()+"/test/data/gallery-dl/"+fname
        if not os.path.isfile(abs_fname):
            log.error("hydownloader-test", f"Missing expected file: {fname}")
            result = False
        else:
            log.info("hydownloader-test", f"Found expected file: {fname}")
            for content in content_patterns:
                # gallery-dl writes metadata as UTF-8; be explicit so the check
                # does not depend on the platform's default encoding
                with open(abs_fname, encoding='utf-8') as f:
                    if re.search(content, f.read()):
                        log.info("hydownloader-test", "Expected file content found")
                    else:
                        log.error("hydownloader-test", f"Expected file content ({content}) NOT found")
                        result = False
    # Check that every expected anchor entry was written to the test anchor DB.
    conn = sqlite3.connect(db.get_rootpath()+"/test/anchor.db")
    try:
        conn.row_factory = sqlite3.Row
        c = conn.cursor()
        for anchor in anchors:
            try:
                c.execute('select entry from archive where entry = ?', (anchor,))
                if len(c.fetchall()):
                    log.info("hydownloader-test", f"Expected anchor {anchor} found in database")
                else:
                    log.error("hydownloader-test", f"Expected anchor {anchor} NOT found in database")
                    result = False
            except sqlite3.OperationalError as e:
                log.error("hydownloader-test", "Error while trying to query anchor database - download failed?", e)
                result = False
    finally:
        conn.close()  # previously leaked; close even on unexpected errors
    return result
def route_set_cookies() -> dict:
    """Merge the cookies supplied in the request JSON into the environment's
    cookies.txt jar.

    Each entry in the request's "cookies" list is a sequence of
    [name, value, domain, path, expires].  Returns {'status': True} on
    success, {'status': False} when no cookies.txt exists yet.
    """
    check_access()
    if not os.path.isfile(db.get_rootpath()+"/cookies.txt"):
        return {'status': False}
    jar = ck.MozillaCookieJar(db.get_rootpath()+"/cookies.txt")
    jar.load(ignore_discard=True, ignore_expires=True)
    for entry in bottle.request.json["cookies"]:
        name, value, domain, path, expires = entry[0], entry[1], entry[2], entry[3], entry[4]
        # Cookie() positional signature: version, name, value, port,
        # port_specified, domain, domain_specified, domain_initial_dot,
        # path, path_specified, secure, expires, discard, comment,
        # comment_url, rest
        jar.set_cookie(ck.Cookie(
            0, name, value, None, False,
            domain, True, domain.startswith('.'),
            path, True, False, expires,
            False, None, None, {},
        ))
    jar.save(ignore_discard=True, ignore_expires=True)
    return {'status': True}
def parse_log_files(all_files: bool = False, worker: Optional[str] = None):
    # Process queued gallery-dl log files, extracting per-download information.
    #
    # :param all_files: if True, every single-URL and subscription gallery-dl
    #     log found under logs/ is (re-)queued for parsing first.
    # :param worker: passed through to db.get_queued_log_file to select the
    #     queued files to process - exact semantics live in the db module,
    #     not visible here.
    if all_files:
        logs = glob.glob(db.get_rootpath() + "/logs/single-urls-*-gallery-dl-*.txt") + glob.glob(
            db.get_rootpath() + "/logs/subscription-*-gallery-dl-*.txt")
        for l in logs:
            db.add_log_file_to_parse_queue(l, 'reparse')
    # Drain the parse queue one log file at a time (walrus: stop on falsy).
    while logfname := db.get_queued_log_file(worker):
        subscription_id = None
        url_id = None
        # Recover the owning single-URL download id from the log file name.
        if m := re.match(r".*(?:\\|\/)single-urls-([0-9]+)-gallery-dl-.*\.txt", logfname):
            url_id = int(m.group(1))
        # NOTE(review): this block is truncated at the chunk boundary - the
        # subscription-id match and the actual log parsing are not visible here.
def run_gallery_dl(url: str, subscription_mode: bool, ignore_anchor: bool,
                   metadata_only: bool, log_file: str, console_output_file: str,
                   unsupported_urls_file: str, overwrite_existing: bool,
                   filter_: Optional[str] = None,
                   chapter_filter: Optional[str] = None,
                   abort_after: Optional[int] = None,
                   test_mode: bool = False,
                   old_log_file: Optional[str] = None,
                   old_unsupported_urls_file: Optional[str] = None,
                   max_file_count: Optional[int] = None) -> str:
    """
    Downloads a URL with gallery-dl using the current hydownloader environment.
    """
    # NOTE(review): an apparently newer revision of this function (with a
    # process_id parameter) also exists later in this file.
    run_args = [str(db.get_conf('gallery-dl.executable'))]
    run_args += ['--ignore-config']  # only the configs explicitly added below apply
    run_args += ['--verbose']
    if os.path.isfile(db.get_rootpath() + "/gallery-dl-config.json"):
        run_args += ["-c", db.get_rootpath() + "/gallery-dl-config.json"]
    if os.path.isfile(db.get_rootpath() + "/gallery-dl-user-config.json"):
        # user config is added second, so it can override the base config
        run_args += ["-c", db.get_rootpath() + "/gallery-dl-user-config.json"]
    run_args += ['--cookies', db.get_rootpath() + '/cookies.txt']
    if not test_mode:
        run_args += ['--dest', db.get_rootpath() + '/data/gallery-dl']
    else:
        run_args += ['--dest', db.get_rootpath() + '/test/data/gallery-dl']
    run_args += ['--write-metadata']
    if metadata_only:
        run_args += ['--no-download']
    if old_log_file:
        # carry the previous run's log over so its history is not lost
        append_file_contents(log_file, old_log_file)
    run_args += ['--write-log', log_file]
    if old_unsupported_urls_file:
        append_file_contents(unsupported_urls_file, old_unsupported_urls_file)
    run_args += ['--write-unsupported', unsupported_urls_file]
    if overwrite_existing:
        run_args += ['--no-skip']  # re-download files that already exist on disk
    if not ignore_anchor:
        if not test_mode:
            # an explicitly configured archive path wins over the default one
            if override := str(db.get_conf("gallery-dl.archive-override")):
                run_args += ["--download-archive", override]
            else:
                run_args += ["--download-archive", db.get_rootpath() + '/anchor.db']
        else:
            run_args += ["--download-archive", db.get_rootpath() + '/test/anchor.db']
    # NOTE(review): truncated at the chunk boundary - filter/abort/file-count
    # handling, the subprocess invocation and the return value are not
    # visible here.
def check_db_for_anchors(anchor_patterns: list[str]) -> bool:
    """Check whether any of the given anchor patterns matches an entry in the
    anchor database.  Patterns ending in "_%" are treated as prefix patterns,
    everything else as an exact entry; returns False for an empty list."""
    if not anchor_patterns:
        return False
    global _anchor_conn
    if not _anchor_conn:
        _anchor_conn = sqlite3.connect(db.get_rootpath() + "/anchor.db",
                                       check_same_thread=False,
                                       timeout=24 * 60 * 60)
        _anchor_conn.row_factory = sqlite3.Row
    clauses = []
    params = []
    for pat in anchor_patterns:
        if pat.endswith("_%"):
            # "stem_%" matches every entry starting with "stem_"; since '`' is
            # the character directly after '_' in ASCII, the half-open range
            # [stem, stem`) covers exactly those entries without using LIKE.
            stem = pat[:-2]
            clauses.append("entry >= ? and entry < ?")
            params.extend((stem, stem + "`"))
        else:
            clauses.append("entry = ?")
            params.append(pat)
    cursor = _anchor_conn.cursor()
    query = "select 1 from archive where " + " or ".join(clauses) + " limit 1"
    cursor.execute(query, params)
    return cursor.fetchone() is not None
def run_gallery_dl_with_custom_args(
        args: list[str],
        capture_output: bool = False) -> subprocess.CompletedProcess:
    """
    Run gallery-dl with the given arguments in the current hydownloader
    environment.  A few arguments beyond the caller-supplied ones are added
    (standard flags and the environment's configuration files) so gallery-dl
    follows the current hydownloader environment and conventions.
    """
    cmd = [str(db.get_conf('gallery-dl.executable')), '--ignore-config', '--verbose']
    # base config first, then the user config so the latter can override it
    for cfg_name in ("/gallery-dl-config.json", "/gallery-dl-user-config.json"):
        cfg_path = db.get_rootpath() + cfg_name
        if os.path.isfile(cfg_path):
            cmd += ["-c", cfg_path]
    cmd += args
    # text mode only matters when output is captured
    return subprocess.run(cmd, capture_output=capture_output,
                          text=capture_output, check=False)
def check_db_for_anchor(anchor_pattern: str) -> bool:
    """
    Checks whether the given SQL LIKE-pattern is present in the anchor database.

    :param anchor_pattern: SQL LIKE pattern matched against anchor entries.
    :return: True if at least one matching entry exists.
    """
    global _anchor_conn
    if not _anchor_conn:
        # Lazily open a shared connection; the very long timeout makes writers
        # wait instead of failing while the anchor DB is locked elsewhere.
        _anchor_conn = sqlite3.connect(db.get_rootpath() + "/anchor.db",
                                       check_same_thread=False,
                                       timeout=24 * 60 * 60)
        _anchor_conn.row_factory = sqlite3.Row
    c = _anchor_conn.cursor()
    # Only existence matters: "limit 1" + fetchone avoids materializing every
    # matching row (and matches the approach used in check_db_for_anchors).
    c.execute('select entry from archive where entry like ? limit 1',
              (anchor_pattern, ))
    return c.fetchone() is not None
def run_gallery_dl(url: str, subscription_mode: bool, ignore_anchor: bool,
                   metadata_only: bool, log_file: str, console_output_file: str,
                   unsupported_urls_file: str, overwrite_existing: bool,
                   filter_: Optional[str] = None,
                   chapter_filter: Optional[str] = None,
                   abort_after: Optional[int] = None,
                   test_mode: bool = False,
                   old_log_file: Optional[str] = None,
                   old_unsupported_urls_file: Optional[str] = None,
                   max_file_count: Optional[int] = None,
                   process_id: Optional[str] = None) -> str:
    """
    Downloads a URL with gallery-dl using the current hydownloader environment.
    """
    # NOTE(review): this appears to be a newer revision of the run_gallery_dl
    # seen earlier in this file: it adds process_id, configures logging via
    # "-o output.logfile" instead of --write-log, queues the log file for
    # parsing, and uses db.get_datapath() for the destination.
    global _process_map
    run_args = [str(db.get_conf('gallery-dl.executable'))]
    run_args += ['--ignore-config']  # only the configs explicitly added below apply
    run_args += ['--verbose']
    if os.path.isfile(db.get_rootpath() + "/gallery-dl-config.json"):
        run_args += ["-c", db.get_rootpath() + "/gallery-dl-config.json"]
    if os.path.isfile(db.get_rootpath() + "/gallery-dl-user-config.json"):
        # user config is added second, so it can override the base config
        run_args += ["-c", db.get_rootpath() + "/gallery-dl-user-config.json"]
    run_args += ['--cookies', db.get_rootpath() + '/cookies.txt']
    if not test_mode:
        run_args += ['--dest', db.get_datapath() + '/gallery-dl']
    else:
        run_args += ['--dest', db.get_rootpath() + '/test/data/gallery-dl']
    run_args += ['--write-metadata']
    if metadata_only:
        run_args += ['--no-download']
    if old_log_file:
        # carry the previous run's log over so its history is not lost
        append_file_contents(log_file, old_log_file)
    # json.dumps quotes and escapes the path so it survives gallery-dl's
    # option-value parsing
    run_args += ['-o', f'output.logfile.path={json.dumps(log_file)}']
    run_args += [
        '-o',
        'output.logfile.format="[{name}][{levelname}][{asctime}] {message}"'
    ]
    db.add_log_file_to_parse_queue(log_file, process_id if process_id else 'unknown')
    if old_unsupported_urls_file:
        append_file_contents(unsupported_urls_file, old_unsupported_urls_file)
    run_args += ['--write-unsupported', unsupported_urls_file]
    if overwrite_existing:
        run_args += ['--no-skip']  # re-download files that already exist on disk
    if not ignore_anchor:
        if not test_mode:
            # an explicitly configured archive path wins over the default one
            if override := str(db.get_conf("gallery-dl.archive-override")):
                run_args += ["--download-archive", override]
            else:
                run_args += ["--download-archive",
                             db.get_rootpath() + '/anchor.db']
        else:
            run_args += ["--download-archive",
                         db.get_rootpath() + '/test/anchor.db']
    # NOTE(review): truncated at the chunk boundary - filter/abort/file-count
    # handling, the subprocess invocation and the return value are not
    # visible here.
def clear_test_env() -> None:
    """Recreate an empty 'test' directory under the hydownloader root,
    discarding anything a previous test run left behind."""
    log.info('hydownloader-test', 'Clearing test environment...')
    test_dir = db.get_rootpath() + '/test'
    if os.path.exists(test_dir):
        shutil.rmtree(test_dir)
    os.makedirs(test_dir)
    log.info('hydownloader-test', 'Test environment cleared')
def _test_environment() -> bool:
    """Check that gallery-dl, ffmpeg and youtube-dl are available in suitable
    versions.  Returns True if the environment is usable."""
    ok = True
    log.info("hydownloader-test", "Querying gallery-dl version")
    version_str = gallery_dl_utils.run_gallery_dl_with_custom_args(
        ['--version'], capture_output=True).stdout.strip()
    try:
        if version_str.endswith("-dev"):
            version_str = version_str[:-4]
        major, minor, patch = tuple(map(int, version_str.split('.')))
        # The check rejects 1.17.3 and older (requires >= 1.17.4); the error
        # message used to claim "1.17.3 or newer", contradicting the check.
        if major != 1 or minor < 17 or minor == 17 and patch < 4:
            log.error(
                'hydownloader-test',
                f"Bad gallery-dl version: {version_str}, need 1.17.4 or newer")
            ok = False
        else:
            log.info('hydownloader-test',
                     f"Found gallery-dl version: {version_str}, this is OK")
    except ValueError as e:
        log.error('hydownloader-test',
                  "Could not recognize gallery-dl version", e)
        ok = False
    try:
        ff_result = subprocess.run(['ffmpeg', '-version'], capture_output=True,
                                   text=True, check=False).stdout.split('\n')[0]
        log.info('hydownloader-test', f"Found ffmpeg version: {ff_result}")
    except FileNotFoundError as e:
        log.error('hydownloader-test', "Could not find ffmpeg", e)
        ok = False
    try:
        yt_result = subprocess.run(['youtube-dl', '--version'],
                                   capture_output=True, text=True,
                                   check=False).stdout.strip()
        log.info('hydownloader-test', f"Found youtube-dl version: {yt_result}")
    except FileNotFoundError as e:
        log.error('hydownloader-test', "Could not find youtube-dl", e)
        ok = False
    return ok

def _test_sensitive_search(sensitive_url: str, result_pattern: str, log_file: str) -> bool:
    """Run a search for "sensitive" content and verify that it yields at least
    10 result URLs matching *result_pattern*.  Returns True on success."""
    result = gallery_dl_utils.run_gallery_dl_with_custom_args(
        [sensitive_url, '--get-urls', '-o', 'image-range="1-10"',
         '--write-log', log_file],
        capture_output=True)
    ok = True
    if result.returncode != 0:
        status_txt = gallery_dl_utils.check_return_code(result.returncode)
        log.error(
            "hydownloader-test",
            f'Error returned while trying to download "sensitive" content: return code {result.returncode}, {status_txt}'
        )
        ok = False
    sensitive_results_cnt = len(re.findall(result_pattern, result.stdout))
    if sensitive_results_cnt < 10:
        log.error(
            "hydownloader-test",
            f'Failed to find "sensitive" content, insufficient number of results: {sensitive_results_cnt}'
        )
        ok = False
    if ok:
        log.info("hydownloader-test",
                 'Search of "sensitive" content seems to be working OK')
    return ok

def test_internal(sites: str) -> bool:
    """
    Run self-tests for the given comma-separated list of site names (or
    'environment' for a tool/version check).  Each site test downloads known
    posts into a clean test environment and verifies the resulting filenames,
    file contents and anchor entries.

    :param sites: comma-separated site names.
    :return: True if every requested test passed; on the first failing site,
        returns False and keeps the test environment for inspection.
    """
    # Expected results per site: post URL, expected relative filenames mapped
    # to content regexes that must match, and expected anchor DB entries.
    post_url_data = {
        'gelbooru': {
            'url': "https://gelbooru.com/index.php?page=post&s=view&id=6002236",
            'filenames': {
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg.json": ["1girl ", "wings", '"rating": "q"', '"tags_general":'],
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg": []
            },
            'anchors': ["gelbooru6002236"]
        },
        'gelbooru_notes': {
            'url': "https://gelbooru.com/index.php?page=post&s=view&id=5997331",
            'filenames': {
                "gelbooru/gelbooru_5997331_7726d401af0e6bf5b58809f65d08334e.png.json": ['"y": 72', '"x": 35', '"width": 246', '"height": 553', '"body": "Look over this way when you talk~"']
            },
            'anchors': ["gelbooru5997331"]
        },
        'danbooru': {
            'url': "https://danbooru.donmai.us/posts/4455434",
            'filenames': {
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png.json": ["1girl ", "tail", '"rating": "q"'],
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png": []
            },
            'anchors': ["danbooru4455434"]
        },
        'pixiv': {
            'url': "https://www.pixiv.net/en/artworks/88865254",
            'filenames': {
                "pixiv/3316400 rogia/88865254_p7.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p6.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p5.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p4.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p3.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p2.jpg.json": ["Fate/GrandOrder", '"title": "メイドロリンチちゃん"', '"tags":', '"tags": ['],
                "pixiv/3316400 rogia/88865254_p1.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p0.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p7.jpg": [],
                "pixiv/3316400 rogia/88865254_p6.jpg": [],
                "pixiv/3316400 rogia/88865254_p5.jpg": [],
                "pixiv/3316400 rogia/88865254_p4.jpg": [],
                "pixiv/3316400 rogia/88865254_p3.jpg": [],
                "pixiv/3316400 rogia/88865254_p2.jpg": [],
                "pixiv/3316400 rogia/88865254_p1.jpg": [],
                "pixiv/3316400 rogia/88865254_p0.jpg": []
            },
            'anchors': ["pixiv88865254_p00", "pixiv88865254_p01", "pixiv88865254_p02", "pixiv88865254_p03", "pixiv88865254_p04", "pixiv88865254_p05", "pixiv88865254_p06", "pixiv88865254_p07"]
        },
        'pixiv_ugoira': {
            'url': "https://www.pixiv.net/en/artworks/88748768",
            'filenames': {
                "pixiv/9313418 thaimay704/88748768_p0.zip": [],
                "pixiv/9313418 thaimay704/88748768_p0.zip.json": [],
                "pixiv/9313418 thaimay704/88748768_p0.webm": []
            },
            'anchors': ["pixiv88748768"]
        },
        'lolibooru': {
            'url': 'https://lolibooru.moe/post/show/178123/1girl-barefoot-brown_eyes-brown_hair-cameltoe-cove',
            'filenames': {
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg.json": ["1girl ", " swimsuit", '"rating": "q",'],
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg": []
            },
            'anchors': ["lolibooru178123"]
        },
        '3dbooru': {
            'url': "http://behoimi.org/post/show/648363/apron-black_legwear-collar-cosplay-hairband-immora",
            'filenames': {
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg.json": ["cosplay ", " maid_uniform", '"rating": "s",'],
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg": []
            },
            'anchors': ["3dbooru648363"]
        },
        'nijie': {
            'url': "https://nijie.info/view.php?id=306993",
            'filenames': {
                "nijie/72870/306993_p0.jpg": [],
                "nijie/72870/306993_p1.jpg": [],
                "nijie/72870/306993_p0.jpg.json": [],
                "nijie/72870/306993_p1.jpg.json": ["\"オリジナル\"", "\"title\": \"朝7時50分の通学路\","]
            },
            'anchors': ["nijie306993_0", "nijie306993_1"]
        },
        'patreon': {
            'url': "https://www.patreon.com/posts/new-cg-set-on-48042243",
            'filenames': {
                "patreon/Osiimi Chan/48042243_NEW CG SET on Gumroad!! Ganyu's Hypnotic Rendezvou_01.png": []
            },
            'anchors': ["patreon48042243_1"]
        },
        'sankaku': {
            'url': "https://chan.sankakucomplex.com/post/show/707246",
            'filenames': {
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg": [],
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg.json": ['"kirisame_marisa"', '"3girls"']
            },
            'anchors': ["sankaku707246"]
        },
        'idolcomplex': {
            'url': "https://idol.sankakucomplex.com/post/show/701724",
            'filenames': {
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg": [],
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg.json": ['"rating": "q",', 'nikumikyo,']
            },
            'anchors': ["idolcomplex701724"]
        },
        'artstation': {
            'url': "https://www.artstation.com/artwork/W2LROD",
            'filenames': {
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg": [],
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg.json": ['"title": "Procession",']
            },
            'anchors': ["artstation24728858"]
        },
        'deviantart': {
            'url': "https://www.deviantart.com/squchan/art/Atelier-Ryza-820511154",
            'filenames': {
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg": [],
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg.json": ['"is_mature": true,']
            },
            'anchors': ["deviantart820511154"]
        },
        'twitter': {
            'url': "https://twitter.com/momosuzunene/status/1380033327680266244",
            'filenames': {
                "twitter/momosuzunene/1380033327680266244_1.jpg": [],
                "twitter/momosuzunene/1380033327680266244_1.jpg.json": ['"name": "momosuzunene",']
            },
            'anchors': ["twitter1380033327680266244_1"]
        },
        'webtoons': {
            'url': "https://www.webtoons.com/en/challenge/crawling-dreams/ep-1-nyarla-ghast/viewer?title_no=141539&episode_no=81",
            'anchors': ['webtoons141539_81_1', 'webtoons141539_81_2', 'webtoons141539_81_3', 'webtoons141539_81_4'],
            'filenames': {
                "webtoons/crawling-dreams/81-01.jpg": [],
                "webtoons/crawling-dreams/81-01.jpg.json": ['"comic": "crawling-dreams"']
            }
        },
        'baraag': {
            'url': "https://baraag.net/@pumpkinnsfw/106191173043385531",
            'anchors': ['baraag106191139078112401', 'baraag106191139927706653'],
            'filenames': {
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png": [],
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png.json": ['"sensitive": true']
            }
        },
        'hentaifoundry': {
            'url': "https://www.hentai-foundry.com/pictures/user/PalomaP/907277/Rapunzel-loves-creampie",
            'anchors': ["hentaifoundry907277"],
            'filenames': {
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg": [],
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg.json": ['"tags": [', '"creampie"']
            }
        },
        'yandere': {
            'url': "https://yande.re/post/show/619304",
            'anchors': ["yandere619304"],
            'filenames': {
                # The content regexes were previously attached to the binary
                # .jpg instead of the .json metadata file - clearly swapped,
                # since a tag regex cannot match image data.
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg.json': ['"tags_artist": "zuima"'],
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg': []
            }
        }
    }
    # Sites needing only a single post-URL check, mapped to the display name
    # used in the log message (replaces a long chain of identical elifs).
    simple_sites = {
        'lolibooru': 'lolibooru.moe',
        '3dbooru': '3dbooru',
        'patreon': 'patreon',
        'nijie': 'nijie.info',
        'sankaku': 'sankaku',
        'idolcomplex': 'idolcomplex',
        'artstation': 'artstation',
        'twitter': 'twitter',
        'deviantart': 'deviantart',
        'webtoons': 'webtoons',
        'baraag': 'baraag',
        'hentaifoundry': 'hentaifoundry',
        'yandere': 'yande.re',
    }
    site_set = {site.strip() for site in sites.split(',')}
    for site in site_set:
        clear_test_env()
        log_file = db.get_rootpath() + f"/logs/test-site-{site}-gallery-dl.txt"
        should_break = False
        if site == 'environment':
            should_break = not _test_environment()
        elif site == "gelbooru":
            log.info("hydownloader-test", "Testing gelbooru...")
            log.info("hydownloader-test", 'Testing search of "sensitive" content')
            should_break = not _test_sensitive_search(
                "https://gelbooru.com/index.php?page=post&s=list&tags=loli",
                "https://.*?gelbooru.com/images", log_file) or should_break
            should_break = not check_results_of_post_url(post_url_data['gelbooru'], site) or should_break
            log.info("hydownloader-test", 'Testing note extraction')
            should_break = not check_results_of_post_url(post_url_data['gelbooru_notes'], site) or should_break
        elif site == "danbooru":
            log.info("hydownloader-test", "Testing danbooru...")
            log.info("hydownloader-test", 'Testing search of "sensitive" content')
            should_break = not _test_sensitive_search(
                "https://danbooru.donmai.us/posts?tags=loli",
                "https://danbooru.donmai.us/data", log_file) or should_break
            should_break = not check_results_of_post_url(post_url_data['danbooru'], site) or should_break
        elif site == "pixiv":
            log.info("hydownloader-test", "Testing pixiv...")
            should_break = not check_results_of_post_url(post_url_data['pixiv'], site) or should_break
            log.info("hydownloader-test", 'Testing downloading of ugoira')
            should_break = not check_results_of_post_url(post_url_data['pixiv_ugoira'], site) or should_break
        elif site in simple_sites:
            log.info("hydownloader-test", f"Testing {simple_sites[site]}...")
            should_break = not check_results_of_post_url(post_url_data[site], site) or should_break
        else:
            log.error("hydownloader-test", f"Site name not recognized: {site}, no testing done")
            return False
        if should_break:
            log.error(
                "hydownloader-test",
                f"Stopping early due to errors while testing {site}, test environment kept for inspection"
            )
            return False
    clear_test_env()
    return True
def process_additional_data(subscription_id: Optional[int] = None, url_id: Optional[int] = None) -> tuple[int, int]:
    """
    This function scans log files outputted by gallery-dl and tries to
    recognize filenames in the output.  Based on which subscription or URL
    those files belong to, it queries the database for the associated
    additional_data values (from the subscriptions or single_url_queue
    tables), then inserts these filename + data entries into the
    additional_data database table (even if there is no additional_date for
    the given files).  This way it is possible to keep track which files were
    found by which URL downloads/subscriptions, and correctly associate
    additional data with them (even if the files were not actually downloaded
    by the URL or sub because some earlier download already got them).

    If both the subscription and url ID arguments are None, then it scans all
    files in the temp directory, otherwise exactly one of those must not be
    None and then it only scans for the file belonging to that URL or
    subscription.

    When parsing gallery-dl output, it is much better to have false positives
    (recognize some output lines as filenames which are not) than to miss any
    actual filenames, since invalid filename entries in the additional_data
    table are not a big deal.
    """
    def is_filepath(candidate: str) -> bool:
        # A line counts as a filepath iff it names something that exists on disk.
        candidate = candidate.strip()
        # return ("/" in candidate or "\\" in candidate) and not candidate.startswith("[") and not "gallery-dl:" in candidate
        return os.path.exists(candidate)
    skipped_count = 0  # files gallery-dl skipped (already downloaded earlier)
    new_count = 0  # files newly downloaded by this run
    if subscription_id is not None and os.path.isfile(
            db.get_rootpath() +
            f"/temp/subscription-{subscription_id}-gallery-dl-output.txt"):
        # utf-8-sig tolerates a BOM at the start of the output file
        f = open(db.get_rootpath() +
                 f"/temp/subscription-{subscription_id}-gallery-dl-output.txt",
                 'r', encoding='utf-8-sig')
        for line in f:
            line = line.strip()
            if not is_filepath(line):
                log.debug("hydownloader", f"Does not look like a filepath: {line}")
                continue
            # gallery-dl prefixes already-seen (skipped) files with "# "
            if line.startswith("# "):
                log.debug("hydownloader", f"Looks like a skipped filepath: {line}")
                line = line[1:]
                line = line.strip()
                skipped_count += 1
            else:
                log.debug("hydownloader", f"Looks like a new filepath: {line}")
                new_count += 1
            db.associate_additional_data(filename=line, subscription_id=subscription_id, no_commit=True)
        db.sync()
        f.close()
        os.remove(db.get_rootpath() +
                  f"/temp/subscription-{subscription_id}-gallery-dl-output.txt")
    elif url_id is not None and os.path.isfile(
            db.get_rootpath() +
            f"/temp/single-url-{url_id}-gallery-dl-output.txt"):
        # Same logic as above, but for a single-URL download's output file.
        f = open(db.get_rootpath() +
                 f"/temp/single-url-{url_id}-gallery-dl-output.txt",
                 'r', encoding='utf-8-sig')
        for line in f:
            line = line.strip()
            if not is_filepath(line):
                log.debug("hydownloader", f"Does not look like a filepath: {line}")
                continue
            if line.startswith("# "):
                log.debug("hydownloader", f"Looks like a skipped filepath: {line}")
                line = line[1:]
                line = line.strip()
                skipped_count += 1
            else:
                log.debug("hydownloader", f"Looks like a new filepath: {line}")
                new_count += 1
            db.associate_additional_data(filename=line, url_id=url_id, no_commit=True)
        db.sync()
        f.close()
        os.remove(db.get_rootpath() +
                  f"/temp/single-url-{url_id}-gallery-dl-output.txt")
    else:
        # No specific target: recursively process any leftover output files.
        log.info("hydownloader",
                 "Checking for any leftover temporary gallery-dl output files...")
        filenames = os.listdir(db.get_rootpath() + "/temp")
        for filename in filenames:
            if match := re.match("single-url-([0-9]+)-gallery-dl-output.txt", filename.strip()):
                # NOTE(review): "(unknown)" below looks like a lost f-string
                # placeholder (presumably the filename) - confirm against the
                # original file.
                log.info("hydownloader", f"Processing leftover file (unknown)...")
                process_additional_data(url_id=int(match.group(1)))
            elif match := re.match(
                    "subscription-([0-9]+)-gallery-dl-output.txt",
                    filename.strip()):
                log.info("hydownloader", f"Processing leftover file (unknown)...")
                process_additional_data(subscription_id=int(match.group(1)))
    # NOTE(review): the declared return type is tuple[int, int] and callers
    # unpack (new, skipped), but the final return statement is not visible in
    # this chunk - it appears to be truncated here.
# NOTE(review): orphaned fragment - this chunk begins mid-expression and
# duplicates the body of parse_log_files (seen earlier in this view),
# continuing with the subscription-id match and the per-line URL harvesting
# (request lines and "Starting DownloadJob" lines are collected and passed to
# db.add_known_urls). Kept byte-identical pending comparison with the full file.
db.get_rootpath() + "/logs/subscription-*-gallery-dl-*.txt") for l in logs: db.add_log_file_to_parse_queue(l, 'reparse') while logfname := db.get_queued_log_file(worker): subscription_id = None url_id = None if m := re.match(r".*(?:\\|\/)single-urls-([0-9]+)-gallery-dl-.*\.txt", logfname): url_id = int(m.group(1)) if m := re.match( r".*(?:\\|\/)subscription-([0-9]+)-gallery-dl-.*\.txt", logfname): subscription_id = int(m.group(1)) try: with open(db.get_rootpath() + "/" + logfname, 'r', encoding='utf-8-sig') as logf: log.info("hydownloader", f"Parsing log file: {logfname}") urls = [] for line in logf: if m := re.match( r'(?:\[.+\])* (http.*?)(?::[0-9]+)? "[A-Z]+ (\/.*?) HTTP.*', line.strip()): urls.append(m.group(1) + m.group(2)) if m := re.match(r".*Starting DownloadJob for '(.*)'$", line.strip()): urls.append(m.group(1)) db.add_known_urls(urls, subscription_id=subscription_id, url_id=url_id)
def subscription_worker() -> None:
    # Worker thread: repeatedly checks for due subscriptions and downloads
    # them with gallery-dl, honoring the shared pause/end flags. All flag
    # reads/writes happen under _worker_lock.
    global _sub_worker_ended_flag
    try:
        log.info("hydownloader", "Starting subscription worker thread...")
        with _worker_lock:
            _sub_worker_ended_flag = False
        while True:
            time.sleep(2)  # poll interval
            with _worker_lock:
                if _end_threads_flag:
                    break
            subs_due = db.get_due_subscriptions()
            if not subs_due:
                with _worker_lock:
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                    else:
                        set_subscription_worker_status("nothing to do: checked for due subscriptions, found none")
            sub = subs_due[0] if subs_due else None
            # Inner loop: process due subscriptions one at a time until none
            # remain or the worker is paused/stopped.
            while sub:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                        break
                initial_check = sub['last_check'] is None
                url = urls.subscription_data_to_url(sub['downloader'], sub['keywords'])
                check_started_time = time.time()
                status_msg = f"checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']})"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                if initial_check:
                    log.info(f"subscription-{sub['id']}", "This is the first check for this subscription")
                # Run gallery-dl; "-latest" log files are merged into "-old"
                # history files by run_gallery_dl via the old_* arguments.
                result = gallery_dl_utils.run_gallery_dl(
                    url=url,
                    ignore_anchor=False,
                    metadata_only=False,
                    log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/subscription-{sub['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=False,
                    filter_=sub['filter'],
                    chapter_filter=None,
                    subscription_mode=True,
                    abort_after=sub['abort_after'],
                    max_file_count = sub['max_files_initial'] if initial_check else sub['max_files_regular']
                )
                # A non-empty result string is an error message.
                if result:
                    log.warning(f"subscription-{sub['id']}", "Error: "+result)
                else:
                    sub['last_successful_check'] = check_started_time
                # NOTE(review): indentation reconstructed - last_check is taken
                # to be updated on both success and failure (only
                # last_successful_check is success-only); confirm against the
                # original file.
                sub['last_check'] = check_started_time
                new_files, skipped_files = process_additional_data(subscription_id = sub['id'])
                check_ended_time = time.time()
                db.add_subscription_check(sub['id'], new_files=new_files, already_seen_files=skipped_files, time_started=check_started_time, time_finished=check_ended_time, status=result)
                db.add_or_update_subscriptions([sub])
                status_msg = f"finished checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']}), new files: {new_files}, skipped: {skipped_files}"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                subs_due = db.get_due_subscriptions()
                sub = subs_due[0] if subs_due else None
                with _worker_lock:
                    if _end_threads_flag:
                        break
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping subscription worker thread")
            # NOTE(review): indentation reconstructed - the ended flag is taken
            # to be set unconditionally once the loop exits.
            _sub_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in subscription worker thread", e)
        shutdown()
def route_serve_file(filename: str):
    """Serve *filename* from the hydownloader root directory, or abort with
    HTTP 404 when no such file exists."""
    check_access()
    if not os.path.isfile(db.get_rootpath() + '/' + filename):
        bottle.abort(404)  # raises, so the line below is only reached for existing files
    return bottle.static_file(filename, root=db.get_rootpath())
def url_queue_worker() -> None:
    """Background thread body: download queued single URLs through gallery-dl.

    Polls the database every 2 seconds for queued URLs and processes them one
    at a time. Coordinates with the rest of the daemon through module-level
    flags, each read/written only while holding ``_worker_lock``:

    * ``_end_threads_flag`` - when set, the thread logs and exits.
    * ``_url_worker_paused_flag`` - when set, downloading is suspended and the
      status is reported as "paused".
    * ``_url_worker_ended_flag`` - set by this thread on shutdown so the daemon
      can tell the worker has finished.

    Any uncaught exception is logged as fatal and triggers a daemon shutdown.
    """
    global _url_worker_ended_flag
    try:
        log.info("hydownloader", "Starting single URL queue worker thread...")
        with _worker_lock: _url_worker_ended_flag = False
        while True:
            time.sleep(2)
            # check for shutdown request before doing any work
            with _worker_lock:
                if _end_threads_flag: break
            urls_to_dl = db.get_urls_to_download()
            if not urls_to_dl:
                with _worker_lock:
                    if _url_worker_paused_flag: set_url_worker_status("paused")
                    else: set_url_worker_status("nothing to do: checked for queued URLs, found none")
            urlinfo = urls_to_dl[0] if urls_to_dl else None
            # drain the queue, one URL per iteration
            while urlinfo:
                # re-check shutdown/pause between URLs; pause only takes
                # effect at URL boundaries, not mid-download
                with _worker_lock:
                    if _end_threads_flag: break
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                        break
                check_time = time.time()
                status_msg = f"downloading URL: {urlinfo['url']}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                # run gallery-dl with per-URL options taken from the queue
                # entry; per-URL log/output files live under the root,
                # "latest" files are rotated to "old"
                result = gallery_dl_utils.run_gallery_dl(
                    url=urlinfo['url'],
                    ignore_anchor=urlinfo['ignore_anchor'],
                    metadata_only=urlinfo['metadata_only'],
                    log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/single-url-{urlinfo['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=urlinfo['overwrite_existing'],
                    filter_=urlinfo['filter'],
                    chapter_filter=None,
                    subscription_mode=False,
                    max_file_count = urlinfo['max_files']
                )
                if result:
                    # non-empty result string is an error message from gallery-dl
                    log.warning("single url downloader", f"Error while downloading {urlinfo['url']}: {result}")
                    urlinfo['status'] = 1
                    urlinfo['status_text'] = result
                else:
                    urlinfo['status'] = 0
                    urlinfo['status_text'] = 'ok'
                urlinfo['time_processed'] = check_time
                new_files, skipped_files = process_additional_data(url_id = urlinfo['id'])
                urlinfo['new_files'] = new_files
                urlinfo['already_seen_files'] = skipped_files
                # write the updated status/counters back to the queue entry
                db.add_or_update_urls([urlinfo])
                status_msg = f"finished checking URL: {urlinfo['url']}, new files: {new_files}, skipped: {skipped_files}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                # more URLs may have been queued while this one downloaded
                urls_to_dl = db.get_urls_to_download()
                urlinfo = urls_to_dl[0] if urls_to_dl else None
                with _worker_lock:
                    if _end_threads_flag: break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping single URL queue worker thread")
                _url_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in URL worker thread", e)
        shutdown()
def run_job(path: str, job: str, config: Optional[str], verbose: bool, do_it: bool, no_stop_on_missing_metadata: bool) -> None:
    """Run a named import job: scan the data directory and push files into Hydrus.

    Walks every file under the data path, matches it against the job's filter
    groups (user-defined Python expressions evaluated with ``eval``), generates
    tags and URLs from user-defined expressions, then sends the file and its
    metadata to the Hydrus client API.

    Args:
        path: hydownloader root path (used to initialize log and db).
        job: name of the job to look up in the configuration file.
        config: optional override path for the job configuration file
            (defaults to ``<root>/hydownloader-import-jobs.json``).
        verbose: print per-file progress and generated tags/URLs to stderr.
        do_it: actually hash/upload to Hydrus; when False this is a dry run.
        no_stop_on_missing_metadata: if False, exit when a file has no
            .json metadata companion.

    NOTE(review): filter and tag/URL expressions from the job config are run
    through ``eval`` with access to this function's locals (``path``,
    ``json_data``, ``extra_tags``, ``sub_ids``, ``url_ids``, ...) - the config
    file is trusted input by design; never point this at untrusted configs.
    Exits the process via sys.exit(1) on configuration/expression errors.
    """
    log.init(path, True)
    db.init(path)
    config_path = db.get_rootpath() + '/hydownloader-import-jobs.json'
    data_path = db.get_datapath()
    if config: config_path = config
    if not os.path.isfile(config_path):
        log.fatal("hydownloader-importer", f"Configuration file not found: {config_path}")
    jobs = json.load(open(config_path, 'r', encoding='utf-8-sig'))
    if not job in jobs:
        log.fatal("hydownloader-importer", f"Job not found in configuration file: {job}")
    jd = jobs[job]
    # job-level switches: re-send metadata/files even if Hydrus already has the file
    force_add_metadata = jd.get('forceAddMetadata', True)
    force_add_files = jd.get('forceAddFiles', False)
    client = hydrus.Client(jd['apiKey'], jd['apiURL'])
    log.info("hydownloader-importer", f"Starting import job: {job}")
    # iterate over all files in the data directory
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            # json files hold metadata, don't import them to Hydrus
            if fname.endswith('.json'): continue
            # set up some variables
            # some will be used later in the code, some are meant to be used in user-defined expressions
            abspath = root + "/" + fname
            path = os.path.relpath(abspath, start=data_path)
            split_path = os.path.split(path)
            fname_noext, fname_ext = os.path.splitext(fname)
            if fname_ext.startswith('.'): fname_ext = fname_ext[1:]
            # find the path of the associated json metadata file, check if it exists
            # for pixiv ugoira, the same metadata file belongs both to the .webm and the .zip,
            # so this needs special handling
            json_path = abspath + '.json'
            if not os.path.isfile(json_path) and abspath.endswith('.webm'):
                json_path = abspath[:-4] + "zip.json"
            json_exists = True
            if not os.path.isfile(json_path):
                json_exists = False
                printerr(f"Warning: no metadata file found for {path}")
                if not no_stop_on_missing_metadata:
                    sys.exit(1)
            generated_urls = set()
            generated_tags: set[tuple[str, str]] = set()
            matched = False # will be true if at least 1 filter group matched the file
            json_data = None # this will hold the associated json metadata (if available)
            if verbose: printerr(f"Processing file: {path}...")
            # iterate over all filter groups, do they match this file?
            for group in jd['groups']:
                # evaluate filter, load json metadata if the filter matches and we haven't loaded it yet
                should_process = False
                try:
                    # user-defined expression; sees this scope's locals (path, fname, ...)
                    should_process = eval(group['filter'])
                except:
                    printerr(f"Failed to evaluate filter: {group['filter']}")
                    sys.exit(1)
                # metadata is loaded lazily, at most once per file
                if not json_data and json_exists:
                    try:
                        json_data = json.load(open(json_path, encoding='utf-8-sig'))
                    except json.decoder.JSONDecodeError:
                        printerr(f"Failed to parse JSON: {json_path}")
                        sys.exit(1)
                if not should_process:
                    continue
                matched = True
                # get the data for this file from the additional_data db table and process it
                # set up some variables that user-defined expressions will be able to use
                additional_data_dicts = db.get_additional_data_for_file(path)
                # ugoira special case again: the .webm shares db data with the original .zip
                if not additional_data_dicts and path.endswith('.webm'):
                    additional_data_dicts = db.get_additional_data_for_file(path[:-4] + "zip")
                extra_tags: defaultdict[str, list[str]] = defaultdict(list)
                min_time_added = -1
                max_time_added = -1
                for d in additional_data_dicts:
                    parse_additional_data(extra_tags, d['data'])
                    if min_time_added == -1 or min_time_added > d['time_added']: min_time_added = d['time_added']
                    if max_time_added == -1 or max_time_added < d['time_added']: max_time_added = d['time_added']
                sub_ids = []
                url_ids = []
                for d in additional_data_dicts:
                    if d['subscription_id']: sub_ids.append(str(d['subscription_id']))
                    if d['url_id']: url_ids.append(str(d['url_id']))
                # execute user-defined tag and url generator expressions
                has_error = False
                for dtype, d in [('tag', x) for x in group.get('tags', [])] + [('url', x) for x in group.get('urls', [])]:
                    skip_on_error = d.get("skipOnError", False)
                    allow_empty = d.get("allowEmpty", False)
                    rule_name = d.get("name")
                    generated_results = []
                    # if the expression is a single string
                    if isinstance(d["values"], str):
                        try:
                            eval_res = eval(d["values"])
                            # check result type: must be string or iterable of strings
                            if isinstance(eval_res, str):
                                generated_results = [eval_res]
                            else:
                                for eval_res_str in eval_res:
                                    if not isinstance(eval_res_str, str):
                                        printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {d['values']}")
                                        sys.exit(1)
                                    else:
                                        generated_results.append(eval_res_str)
                        except Exception as e:
                            if verbose:
                                printerr(f"Failed to evaluate expression: {d['values']}")
                                print(e)
                            has_error = True
                    else: # multiple expressions (array of strings)
                        for eval_expr in d["values"]:
                            try:
                                eval_res = eval(eval_expr)
                                # check result type: must be string or iterable of strings
                                if isinstance(eval_res, str):
                                    generated_results = [eval_res]
                                else:
                                    for eval_res_str in eval_res:
                                        if not isinstance(eval_res_str, str):
                                            printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {eval_expr}")
                                            sys.exit(1)
                                        else:
                                            generated_results.append(eval_res_str)
                            except Exception as e:
                                if verbose:
                                    printerr(f"Failed to evaluate expression: {eval_expr}")
                                    printerr(e)
                                has_error = True
                    # check for empty results or failed evaluation, as necessary
                    if not generated_results and not allow_empty:
                        printerr(f"Error: the rule named {rule_name} yielded no results but this is not allowed")
                        sys.exit(1)
                    if has_error:
                        printerr(f"Warning: an expression failed to evaluate in the rule named {rule_name}")
                        if not skip_on_error:
                            sys.exit(1)
                    # save results of the currently evaluated expressions
                    if dtype == 'url':
                        generated_urls.update(generated_results)
                    else:
                        # tag rules fan out to every configured tag repository
                        for repo in d["tagRepos"]:
                            generated_tags.update((repo, tag) for tag in generated_results)
            if matched:
                printerr(f"File matched: {path}...")
                # zero-byte files indicate a broken download, stop the job
                if not os.path.getsize(abspath):
                    print(f"Found truncated file: {abspath}")
                    sys.exit(1)
                if verbose:
                    printerr("Generated URLs:")
                    for url in generated_urls:
                        printerr(url)
                    printerr("Generated tags:")
                    for repo, tag in sorted(list(generated_tags), key=lambda x: x[0]):
                        printerr(f"{repo} <- {tag}")
                if verbose: printerr('Hashing...')
                # calculate hash, check if Hydrus already knows the file
                already_added = False
                if do_it:
                    hasher = hashlib.sha256()
                    # hash in 1 MiB chunks to keep memory bounded on large files
                    with open(abspath, 'rb') as hashedfile:
                        buf = hashedfile.read(65536 * 16)
                        while len(buf) > 0:
                            hasher.update(buf)
                            buf = hashedfile.read(65536 * 16)
                    hexdigest = hasher.hexdigest()
                    if client.file_metadata(hashes=[hexdigest], only_identifiers=True):
                        printerr("File is already in Hydrus")
                        already_added = True
                # send file, tags, metadata to Hydrus as needed
                if not already_added or force_add_files:
                    if verbose: printerr("Sending file to Hydrus...")
                    if do_it: client.add_file(abspath)
                if not already_added or force_add_metadata:
                    if verbose: printerr("Associating URLs...")
                    if do_it: client.associate_url(hashes=[hexdigest], add=generated_urls)
                    if verbose: printerr("Adding tags...")
                    # regroup (repo, tag) pairs into repo -> [tags] for the client API
                    tag_dict = defaultdict(list)
                    for repo, tag in generated_tags:
                        tag_dict[repo].append(tag)
                    if do_it: client.add_tags(hashes=[hexdigest], service_to_tags=tag_dict)
            else:
                if verbose: printerr(f"Skipping due to no matching filter: {path}")
    log.info("hydownloader-importer", f"Finished import job: {job}")
    db.shutdown()
else: run_args += [ "--download-archive", db.get_rootpath() + '/test/anchor.db' ] if filter_: run_args += ['--filter', filter_] if chapter_filter: run_args += ['--chapter-filter', chapter_filter] if subscription_mode and abort_after: run_args += ['-A', f'{abort_after}'] if max_file_count: run_args += ['-o', f'image-range="1-{max_file_count}"'] run_args += [ '-o', 'cache.file=' + db.get_rootpath() + '/gallery-dl-cache.db' ] run_args += [url] console_out = open(console_output_file, 'a') console_out.write('\n') result = subprocess.run(run_args, text=True, stdout=console_out, stderr=console_out, check=False) console_out.close() return check_return_code(result.returncode)