def delete_subscriptions(sub_ids: list[int]) -> bool:
    """Remove the subscriptions with the given IDs from the database."""
    check_init()
    cursor = get_conn().cursor()
    for sub_id in sub_ids:
        cursor.execute('delete from subscriptions where id = ?', (sub_id,))
    get_conn().commit()
    id_list = ', '.join(map(str, sub_ids))
    log.info("hydownloader", f"Deleted subscriptions with IDs: {id_list}")
    return True
def delete_urls(url_ids: list[int]) -> bool:
    """Remove the queued single-URL entries with the given IDs."""
    check_init()
    cursor = get_conn().cursor()
    for url_id in url_ids:
        cursor.execute('delete from single_url_queue where id = ?', (url_id,))
    get_conn().commit()
    id_list = ', '.join(map(str, url_ids))
    log.info("hydownloader", f"Deleted URLs with IDs: {id_list}")
    return True
def print_url_entries(entries: list[dict]) -> None:
    """Log a human-readable report line for each URL queue entry."""
    for entry in entries:
        report_line = (
            f"URL: {entry['url']}, "
            f"status: {entry['status_text']} (code: {entry['status']}), "
            f"time added: {format_date(entry['time_added'])}, "
            f"time processed: {format_date(entry['time_processed'])}, "
            f"paused: {entry['paused']}"
        )
        log.info('hydownloader-report', report_line)
def check_results_of_post_url(data: dict, sitename: str) -> bool:
    """
    Downloads a URL with gallery-dl, then checks if the downloaded filenames,
    file content and anchor entries match what was provided by the caller.

    :param data: dict with keys 'url' (post URL), 'filenames' (mapping of
                 expected relative filename -> list of regexes that must match
                 the file content) and 'anchors' (expected anchor DB entries).
    :param sitename: site identifier, used for log/output file naming.
    :return: True if everything matched, False on any mismatch or error.
    """
    url = data['url']
    filenames = data['filenames']
    anchors = data['anchors']
    log.info("hydownloader-test", f'Testing downloading of posts for site {sitename}')
    log_file = db.get_rootpath()+f"/logs/test-site-{sitename}-gallery-dl.txt"
    result_txt = gallery_dl_utils.run_gallery_dl(
        url=url,
        ignore_anchor=False,
        metadata_only=False,
        log_file=log_file,
        console_output_file=db.get_rootpath()+f"/test/test-site-{sitename}-gallery-dl-output.txt",
        unsupported_urls_file=db.get_rootpath()+f"/test/test-site-{sitename}-unsupported-urls-gallery-dl.txt",
        overwrite_existing=False,
        subscription_mode=False,
        test_mode=True
    )
    result = True
    if result_txt:
        log.error("hydownloader-test", f"Error returned for {sitename} download: {result_txt}")
        result = False
    else:
        log.info("hydownloader-test", f"Return code for {sitename} download OK")
    # Verify that every expected file exists and matches its content regexes.
    for fname in filenames:
        abs_fname = db.get_rootpath()+"/test/data/gallery-dl/"+fname
        if not os.path.isfile(abs_fname):
            log.error("hydownloader-test", f"Missing expected file: {fname}")
            result = False
        else:
            log.info("hydownloader-test", f"Found expected file: {fname}")
            for content in filenames[fname]:
                # Fix: read with explicit utf-8 (metadata JSON may contain
                # non-ASCII tags; default locale encoding broke on some systems).
                with open(abs_fname, encoding='utf-8') as f:
                    if re.search(content, f.read()):
                        log.info("hydownloader-test", "Expected file content found")
                    else:
                        log.error("hydownloader-test", f"Expected file content ({content}) NOT found")
                        result = False
    # Verify that the expected anchor entries were written to the anchor DB.
    conn = sqlite3.connect(db.get_rootpath()+"/test/anchor.db")
    conn.row_factory = sqlite3.Row
    try:
        c = conn.cursor()
        for anchor in anchors:
            try:
                c.execute('select entry from archive where entry = ?', (anchor,))
                if len(c.fetchall()):
                    log.info("hydownloader-test", f"Expected anchor {anchor} found in database")
                else:
                    log.error("hydownloader-test", f"Expected anchor {anchor} NOT found in database")
                    result = False
            except sqlite3.OperationalError as e:
                # Missing 'archive' table usually means the download never ran.
                log.error("hydownloader-test", "Error while trying to query anchor database - download failed?", e)
                result = False
    finally:
        # Fix: the anchor DB connection was previously never closed (leak).
        conn.close()
    return result
def print_sub_entries(entries: list[dict]) -> None:
    """Log a human-readable report line for each subscription entry."""
    for entry in entries:
        report_line = (
            f"Downloader: {entry['downloader']}, "
            f"keywords: {entry['keywords']}, "
            f"last check: {format_date(entry['last_check'])}, "
            f"last successful check: {format_date(entry['last_successful_check'])}, "
            f"check interval: {entry['check_interval']}, "
            f"paused: {entry['paused']}"
        )
        log.info('hydownloader-report', report_line)
def add_or_update_subscription_checks(sub_data: list[dict]) -> bool:
    """Insert or update subscription check entries; commits once at the end."""
    check_init()
    for entry in sub_data:
        is_new = "rowid" not in entry
        if is_new:
            entry["time_created"] = time.time()
        upsert_dict("subscription_checks", entry, no_commit = True)
        if is_new:
            log.info("hydownloader", f"Added subscription check entry: rowid {entry['rowid']}")
        else:
            log.info("hydownloader", f"Updated subscription check entry with rowid {entry['rowid']}")
    get_conn().commit()
    return True
def add_or_update_urls(url_data: list[dict]) -> bool:
    """Insert new URL queue entries or update existing ones (identified by 'id')."""
    for entry in url_data:
        is_new = "id" not in entry
        # A brand-new entry without a URL is useless, skip it.
        if is_new and "url" not in entry:
            continue
        if is_new:
            entry["time_added"] = time.time()
        if 'url' in entry:
            entry['url'] = uri_normalizer.normalizes(entry['url'])
        upsert_dict("single_url_queue", entry)
        if is_new:
            log.info("hydownloader", f"Added URL: {entry['url']}")
        else:
            log.info("hydownloader", f"Updated URL with ID {entry['id']}")
    return True
def api_worker(path: str, debug: bool) -> None:
    """Run the bottle API server, using SSL when configured and server.pem exists."""
    global _srv
    use_ssl = db.get_conf('daemon.ssl')
    pem_file = path+"/server.pem"
    if use_ssl and os.path.isfile(pem_file):
        log.info("hydownloader", "Starting daemon (with SSL)...")
        _srv = SSLWSGIRefServer(pem_file, host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
    else:
        if use_ssl:
            log.warning("hydownloader", "SSL enabled in config, but no server.pem file found in the db folder, continuing without SSL...")
        log.info("hydownloader", "Starting daemon...")
        _srv = SSLWSGIRefServer("", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
    bottle.run(server=_srv, debug=debug)
def shutdown() -> None:
    # Perform a clean shutdown: stop worker threads, close the database, then
    # terminate the process. Idempotent: a second call (e.g. a signal handler
    # racing the main thread) returns immediately.
    global _shutdown_started
    if _shutdown_started:
        return
    _shutdown_started = True
    end_threads()
    db.shutdown()
    try:
        log.info("hydownloader", "hydownloader shut down")
    except RuntimeError:
        # The logging machinery might already be torn down at this point.
        pass
    sys.stderr.close()
    # Hard exit: skips atexit handlers and any remaining non-daemon threads.
    os._exit(0)
def add_or_update_subscriptions(sub_data: list[dict]) -> bool:
    """Insert new subscriptions or update existing ones (identified by 'id')."""
    for entry in sub_data:
        is_new = "id" not in entry
        # New entries must have at least keywords and a downloader.
        if is_new and "keywords" not in entry:
            continue
        if is_new and "downloader" not in entry:
            continue
        if is_new and "additional_data" not in entry:
            entry["additional_data"] = ""
        if is_new:
            entry["time_created"] = time.time()
        upsert_dict("subscriptions", entry)
        if is_new:
            log.info("hydownloader", f"Added subscription: {entry['keywords']} for downloader {entry['downloader']}")
        else:
            log.info("hydownloader", f"Updated subscription with ID {entry['id']}")
    return True
def shutdown() -> None:
    # Perform a clean shutdown of the daemon: close per-thread DB connections,
    # stop downloader threads and the API server, close the database, then
    # terminate the process. Idempotent after the _shutdown_started check.
    global _shutdown_started
    db.close_thread_connections()
    if _shutdown_started:
        return
    _shutdown_started = True
    end_downloader_threads()
    if _srv:
        _srv.stop()
    db.shutdown()
    try:
        log.info("hydownloader", "hydownloader shut down")
    except RuntimeError:
        # The logging machinery might already be torn down at this point.
        pass
    sys.stderr.close()
    # Hard exit: skips atexit handlers and any remaining non-daemon threads.
    os._exit(0)
def mass_add_urls(path: str, file_: str, additional_data: Optional[str], metadata_only: bool, overwrite_existing: bool, filter_: Optional[str], ignore_anchor: bool, max_files: Optional[int]) -> None:
    """
    Add every non-empty line of the given file as a URL to the download queue.

    All URLs get the same download options (the remaining arguments).

    :param path: hydownloader database folder.
    :param file_: text file with one URL per line; blank lines are skipped.
    """
    log.init(path, True)
    db.init(path)
    # Fix: use a context manager so the input file handle is closed even on
    # error (the original iterated a bare open() and leaked the handle).
    with open(file_, 'r') as url_file:
        for line in url_file:
            line = line.strip()
            if not line:
                continue
            db.add_or_update_urls([{
                'url': line,
                'time_added': time.time(),
                'additional_data': additional_data,
                'metadata_only': metadata_only,
                'overwrite_existing': overwrite_existing,
                'filter': filter_,
                'ignore_anchor': ignore_anchor,
                'max_files': max_files
            }])
            log.info("hydownloader-tools", f"Added URL: {line}")
def mass_add_subscriptions(path: str, file_: str, downloader: str, additional_data: Optional[str], paused: bool, filter_: Optional[str], abort_after: int, max_files_initial: Optional[int], max_files_regular: Optional[int]) -> None:
    """
    Add a subscription for each non-empty line (keyword string) of the given file.

    All subscriptions share the same downloader and options (the remaining
    arguments).

    :param path: hydownloader database folder.
    :param file_: text file with one keyword string per line; blank lines skipped.
    """
    log.init(path, True)
    db.init(path)
    # Fix: use a context manager so the input file handle is closed even on
    # error (the original iterated a bare open() and leaked the handle).
    with open(file_, 'r') as keyword_file:
        for line in keyword_file:
            line = line.strip()
            if not line:
                continue
            db.add_or_update_subscriptions([{
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'max_files_initial': max_files_initial,
                'max_files_regular': max_files_regular,
                'abort_after': abort_after,
                'paused': paused
            }])
            log.info("hydownloader-tools", f"Added subscription {line} with downloader {downloader}")
def init(path : str) -> None:
    """
    Initialize the hydownloader database folder at the given path.

    Creates the folder structure and default configuration files on first run,
    opens the main and shared databases (creating their schemas if needed),
    loads the configuration and validates the database version.

    :param path: the database folder; created if it does not exist.
    """
    global _inited, _path, _config
    _path = path
    if not os.path.isdir(path):
        log.info("hydownloader", f"Initializing new database folder at {path}")
        os.makedirs(path)
    if not os.path.isdir(path + "/logs"):
        os.makedirs(path + "/logs")
    # Bug fix: this condition used to re-check "/logs", so "/data" was only
    # created when the logs folder also happened to be missing.
    if not os.path.isdir(path + "/data"):
        os.makedirs(path + "/data")
    if not os.path.isdir(path + "/temp"):
        os.makedirs(path + "/temp")
    needs_db_init = not os.path.isfile(path+"/hydownloader.db")
    # Write default configuration files if they do not exist yet. Context
    # managers ensure the handles are closed even if a write fails.
    if not os.path.isfile(path+"/gallery-dl-config.json"):
        with open(path+"/gallery-dl-config.json", 'w', encoding='utf-8') as gdl_cfg:
            gdl_cfg.write(C.DEFAULT_GALLERY_DL_CONFIG)
    if not os.path.isfile(path+"/gallery-dl-user-config.json"):
        with open(path+"/gallery-dl-user-config.json", 'w', encoding='utf-8') as gdl_cfg:
            gdl_cfg.write(C.DEFAULT_GALLERY_DL_USER_CONFIG)
    if not os.path.isfile(path+"/hydownloader-config.json"):
        with open(path+"/hydownloader-config.json", 'w', encoding='utf-8') as hydl_cfg:
            hydl_cfg.write(json.dumps(C.DEFAULT_CONFIG, indent=4))
    if not os.path.isfile(path+"/hydownloader-import-jobs.json"):
        with open(path+"/hydownloader-import-jobs.json", 'w', encoding='utf-8') as hydl_cfg:
            hydl_cfg.write(json.dumps(C.DEFAULT_IMPORT_JOBS, indent=4))
    if not os.path.isfile(path+"/cookies.txt"):
        open(path+"/cookies.txt", "w", encoding="utf-8").close()
    get_conn()
    if needs_db_init:
        create_db()
    # Fix: close the config file handle (json.load(open(...)) leaked it).
    with open(path+"/hydownloader-config.json", "r", encoding="utf-8-sig") as config_file:
        _config = json.load(config_file)
    need_shared_db_init = not os.path.isfile(_shared_db_path())
    get_shared_conn()
    if need_shared_db_init:
        create_shared_db()
    check_db_version()
    _inited = True
def mass_add_subscriptions(path: str, file_: str, downloader: str, additional_data: Optional[str], paused: bool, filter_: Optional[str], abort_after: int, max_files_initial: Optional[int], max_files_regular: Optional[int], check_interval: int, random_check_interval: int, encode_keywords: bool) -> None:
    """
    Add a subscription for each non-empty line (keyword string) of the given file.

    All subscriptions share the same downloader and options. Optional limits
    (max files, abort_after) are only stored when explicitly given so the
    database defaults apply otherwise.

    :param path: hydownloader database folder.
    :param file_: text file with one keyword string per line; blank lines skipped.
    :param encode_keywords: URL-encode the keyword lines before adding them.
    """
    log.init(path, True)
    db.init(path)
    # Fix: use a context manager so the input file handle is closed even on
    # error (the original iterated a bare open() and leaked the handle).
    with open(file_, 'r', encoding='utf-8-sig') as keyword_file:
        for line in keyword_file:
            line = line.strip()
            if encode_keywords:
                # Convert human-readable tag lists into URL-encoded keywords.
                line = line.replace(' ', '+')
                line = urllib.parse.quote(line, safe='/+').lower()
            if not line:
                continue
            new_sub = {
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'paused': paused,
                # Randomize intervals to spread subscription checks out in time.
                'check_interval': check_interval + random.randint(0, random_check_interval)
            }
            if max_files_initial is not None:
                new_sub['max_files_initial'] = max_files_initial
            if max_files_regular is not None:
                new_sub['max_files_regular'] = max_files_regular
            if abort_after is not None:
                new_sub['abort_after'] = abort_after
            db.add_or_update_subscriptions([new_sub])
            log.info("hydownloader-tools", f"Added subscription {line} with downloader {downloader}")
def start(path : str, debug : bool) -> None:
    """Initialize logging and the DB, launch worker threads, then run the API server."""
    log.init(path, debug)
    db.init(path)
    process_additional_data()
    # Background workers: one for subscriptions, one for the single URL queue.
    for worker, thread_name in ((subscription_worker, 'Subscription worker'),
                                (url_queue_worker, 'Single URL queue worker')):
        threading.Thread(target=worker, name=thread_name, daemon=True).start()
    use_ssl = db.get_conf('daemon.ssl')
    pem_file = path+"/server.pem"
    if use_ssl and os.path.isfile(pem_file):
        log.info("hydownloader", "Starting daemon (with SSL)...")
        srv = SSLWSGIRefServer(pem_file, host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
    else:
        if use_ssl:
            log.warning("hydownloader", "SSL enabled in config, but no server.pem file found in the db folder, continuing without SSL...")
        log.info("hydownloader", "Starting daemon...")
        srv = SSLWSGIRefServer("", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
    bottle.run(server=srv, debug=debug)
def init(path : str) -> None:
    """
    Initialize the hydownloader database folder at the given path.

    Creates the folder structure and default configuration files on first run,
    opens the database (creating its schema if needed), validates the database
    version and loads the configuration.

    :param path: the database folder; created if it does not exist.
    """
    global _conn, _inited, _path, _config
    _path = path
    if not os.path.isdir(path):
        log.info("hydownloader", f"Initializing new database folder at {path}")
        os.makedirs(path)
    if not os.path.isdir(path + "/logs"):
        os.makedirs(path + "/logs")
    # Bug fix: this condition used to re-check "/logs", so "/data" was only
    # created when the logs folder also happened to be missing.
    if not os.path.isdir(path + "/data"):
        os.makedirs(path + "/data")
    if not os.path.isdir(path + "/temp"):
        os.makedirs(path + "/temp")
    needs_db_init = not os.path.isfile(path+"/hydownloader.db")
    # Write default configuration files if they do not exist yet. Context
    # managers ensure the handles are closed even if a write fails.
    if not os.path.isfile(path+"/gallery-dl-config.json"):
        with open(path+"/gallery-dl-config.json", 'w') as gdl_cfg:
            gdl_cfg.write(C.DEFAULT_GALLERY_DL_CONFIG)
    if not os.path.isfile(path+"/gallery-dl-user-config.json"):
        with open(path+"/gallery-dl-user-config.json", 'w') as gdl_cfg:
            gdl_cfg.write(C.DEFAULT_GALLERY_DL_USER_CONFIG)
    if not os.path.isfile(path+"/hydownloader-config.json"):
        with open(path+"/hydownloader-config.json", 'w') as hydl_cfg:
            hydl_cfg.write(json.dumps(C.DEFAULT_CONFIG, indent=4))
    if not os.path.isfile(path+"/cookies.txt"):
        open(path+"/cookies.txt", "w").close()
    # Long timeout: concurrent workers may hold the write lock for a while.
    _conn = sqlite3.connect(path+"/hydownloader.db", check_same_thread=False, timeout=24*60*60)
    # Return rows as dicts keyed by column name.
    _conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))
    if needs_db_init:
        create_db()
    check_db_version()
    # Fix: close the config file handle (json.load(open(...)) leaked it).
    with open(path+"/hydownloader-config.json", "r") as config_file:
        _config = json.load(config_file)
    _inited = True
def update_anchor(path: str, hydrus_master_db: str, sites: str, unrecognized_urls_file: Optional[str], recognized_urls_file: Optional[str]) -> None:
    """
    This function goes through all URLs in a Hydrus database, and tries to match them to known site-specific URL patterns
    to generate anchor database entries that gallery-dl can recognize.
    For some sites, the anchor format differs from the gallery-dl default, these are set in gallery-dl-config.json.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_master_db):
        log.fatal("hydownloader-anchor-exporter", "The given client.master.db file does not exist!")
    hydrus_db = sqlite3.connect(hydrus_master_db)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        # Same schema gallery-dl uses for its download archive.
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    # Anchors already present in the DB, so they are not inserted twice.
    known_anchors = {row[0] for row in ac.fetchall()}
    log.info("hydownloader-anchor-exporter", "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()  # matched a site keyword but no known anchor pattern
    recognized_urls = set()
    # site name -> (substrings that must appear in the URL, substrings that must NOT appear)
    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], [])
    }
    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()
    # Validate the requested site names up front; log.fatal aborts on error.
    for site in siteset:
        if not site in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter', f'Unsupported site: {site}')
    def process_url(url):
        # Count the primary anchor pattern for each recognized URL; the count
        # is later used to generate per-file anchor suffixes for some sites.
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)
    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs")
        for site in siteset:
            # Keyword filter: URL must contain an accept substring and no
            # reject substring before it is matched against anchor patterns.
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']:
                        url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")
    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing recognized URLs")
    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}")
        final_anchors = [anchor]
        # Some sites use per-file anchor suffixes in gallery-dl; generate one
        # suffixed entry per counted occurrence of the base anchor.
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info("hydownloader-anchor-exporter", f"Done inserting new anchors, added {new_anchor_rows} entries in total")
    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
def clear_test_env() -> None:
    """Delete and recreate the test directory under the db root."""
    log.info('hydownloader-test', 'Clearing test environment...')
    test_dir = db.get_rootpath() + '/test'
    if os.path.exists(test_dir):
        shutil.rmtree(test_dir)
    os.makedirs(test_dir)
    log.info('hydownloader-test', 'Test environment cleared')
def test_internal(sites: str) -> bool:
    # Expected results for single-post downloads, keyed by test name. Each
    # entry lists the post URL, the files the download must produce (mapping
    # relative filename -> regexes that must match the file content) and the
    # anchor entries that must appear in the anchor database.
    post_url_data = {
        'gelbooru': {
            'url': "https://gelbooru.com/index.php?page=post&s=view&id=6002236",
            'filenames': {
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg.json": ["1girl ", "wings", '"rating": "q"', '"tags_general":'],
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg": []
            },
            'anchors': ["gelbooru6002236"]
        },
        'gelbooru_notes': {
            'url': "https://gelbooru.com/index.php?page=post&s=view&id=5997331",
            'filenames': {
                "gelbooru/gelbooru_5997331_7726d401af0e6bf5b58809f65d08334e.png.json": ['"y": 72', '"x": 35', '"width": 246', '"height": 553', '"body": "Look over this way when you talk~"']
            },
            'anchors': ["gelbooru5997331"]
        },
        'danbooru': {
            'url': "https://danbooru.donmai.us/posts/4455434",
            'filenames': {
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png.json": ["1girl ", "tail", '"rating": "q"'],
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png": []
            },
            'anchors': ["danbooru4455434"]
        },
        'pixiv': {
            'url': "https://www.pixiv.net/en/artworks/88865254",
            'filenames': {
                "pixiv/3316400 rogia/88865254_p7.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p6.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p5.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p4.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p3.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p2.jpg.json": ["Fate/GrandOrder", '"title": "メイドロリンチちゃん"', '"tags":', '"tags": ['],
                "pixiv/3316400 rogia/88865254_p1.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p0.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p7.jpg": [],
                "pixiv/3316400 rogia/88865254_p6.jpg": [],
                "pixiv/3316400 rogia/88865254_p5.jpg": [],
                "pixiv/3316400 rogia/88865254_p4.jpg": [],
                "pixiv/3316400 rogia/88865254_p3.jpg": [],
                "pixiv/3316400 rogia/88865254_p2.jpg": [],
                "pixiv/3316400 rogia/88865254_p1.jpg": [],
                "pixiv/3316400 rogia/88865254_p0.jpg": []
            },
            'anchors': ["pixiv88865254_p00", "pixiv88865254_p01", "pixiv88865254_p02", "pixiv88865254_p03", "pixiv88865254_p04", "pixiv88865254_p05", "pixiv88865254_p06", "pixiv88865254_p07"]
        },
        'pixiv_ugoira': {
            'url': "https://www.pixiv.net/en/artworks/88748768",
            'filenames': {
                "pixiv/9313418 thaimay704/88748768_p0.zip": [],
                "pixiv/9313418 thaimay704/88748768_p0.zip.json": [],
                "pixiv/9313418 thaimay704/88748768_p0.webm": []
            },
            'anchors': ["pixiv88748768"]
        },
        'lolibooru': {
            'url': 'https://lolibooru.moe/post/show/178123/1girl-barefoot-brown_eyes-brown_hair-cameltoe-cove',
            'filenames': {
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg.json": ["1girl ", " swimsuit", '"rating": "q",'],
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg": []
            },
            'anchors': ["lolibooru178123"]
        },
        '3dbooru': {
            'url': "http://behoimi.org/post/show/648363/apron-black_legwear-collar-cosplay-hairband-immora",
            'filenames': {
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg.json": ["cosplay ", " maid_uniform", '"rating": "s",'],
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg": []
            },
            'anchors': ["3dbooru648363"]
        },
        'nijie': {
            'url': "https://nijie.info/view.php?id=306993",
            'filenames': {
                "nijie/72870/306993_p0.jpg": [],
                "nijie/72870/306993_p1.jpg": [],
                "nijie/72870/306993_p0.jpg.json": [],
                "nijie/72870/306993_p1.jpg.json": ["\"オリジナル\"", "\"title\": \"朝7時50分の通学路\","]
            },
            'anchors': ["nijie306993_0", "nijie306993_1"]
        },
        'patreon': {
            'url': "https://www.patreon.com/posts/new-cg-set-on-48042243",
            'filenames': {
                # NOTE(review): this filename was line-wrapped in the original
                # source; reconstructed as a single string with one space —
                # verify against an actual download.
                "patreon/Osiimi Chan/48042243_NEW CG SET on Gumroad!! Ganyu's Hypnotic Rendezvou_01.png": []
            },
            'anchors': ["patreon48042243_1"]
        },
        'sankaku': {
            'url': "https://chan.sankakucomplex.com/post/show/707246",
            'filenames': {
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg": [],
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg.json": ['"kirisame_marisa"', '"3girls"']
            },
            'anchors': ["sankaku707246"]
        },
        'idolcomplex': {
            'url': "https://idol.sankakucomplex.com/post/show/701724",
            'filenames': {
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg": [],
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg.json": ['"rating": "q",', 'nikumikyo,']
            },
            'anchors': ["idolcomplex701724"]
        },
        'artstation': {
            'url': "https://www.artstation.com/artwork/W2LROD",
            'filenames': {
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg": [],
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg.json": ['"title": "Procession",']
            },
            'anchors': ["artstation24728858"]
        },
        'deviantart': {
            'url': "https://www.deviantart.com/squchan/art/Atelier-Ryza-820511154",
            'filenames': {
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg": [],
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg.json": ['"is_mature": true,']
            },
            'anchors': ["deviantart820511154"]
        },
        'twitter': {
            'url': "https://twitter.com/momosuzunene/status/1380033327680266244",
            'filenames': {
                "twitter/momosuzunene/1380033327680266244_1.jpg": [],
                "twitter/momosuzunene/1380033327680266244_1.jpg.json": ['"name": "momosuzunene",']
            },
            'anchors': ["twitter1380033327680266244_1"]
        },
        'webtoons': {
            'url': "https://www.webtoons.com/en/challenge/crawling-dreams/ep-1-nyarla-ghast/viewer?title_no=141539&episode_no=81",
            'anchors': ['webtoons141539_81_1', 'webtoons141539_81_2', 'webtoons141539_81_3', 'webtoons141539_81_4'],
            'filenames': {
                "webtoons/crawling-dreams/81-01.jpg": [],
                "webtoons/crawling-dreams/81-01.jpg.json": ['"comic": "crawling-dreams"']
            }
        },
        'baraag': {
            'url': "https://baraag.net/@pumpkinnsfw/106191173043385531",
            'anchors': ['baraag106191139078112401', 'baraag106191139927706653'],
            'filenames': {
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png": [],
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png.json": ['"sensitive": true']
            }
        },
        'hentaifoundry': {
            'url': "https://www.hentai-foundry.com/pictures/user/PalomaP/907277/Rapunzel-loves-creampie",
            'anchors': ["hentaifoundry907277"],
            'filenames': {
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg": [],
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg.json": ['"tags": [', '"creampie"']
            }
        },
        'yandere': {
            'url': "https://yande.re/post/show/619304",
            'anchors': ["yandere619304"],
            'filenames': {
                # NOTE(review): the content regexes here are attached to the
                # .jpg instead of the .json (unlike every other site entry) —
                # confirm whether these two lists were accidentally swapped.
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg.json': [],
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg': ['"tags_artist": "zuima"']
            }
        }
    }
    site_set = {site.strip() for site in sites.split(',')}
    for site in site_set:
        clear_test_env()
        log_file = db.get_rootpath() + f"/logs/test-site-{site}-gallery-dl.txt"
        should_break = False
        if site == 'environment':
            # Special pseudo-site: check versions of the external tools
            # (gallery-dl, ffmpeg, youtube-dl) instead of downloading.
            log.info("hydownloader-test", "Querying gallery-dl version")
            version_str = gallery_dl_utils.run_gallery_dl_with_custom_args(['--version'], capture_output=True).stdout.strip()
            try:
                if version_str.endswith("-dev"):
                    version_str = version_str[:-4]
                major, minor, patch = tuple(map(int, version_str.split('.')))
                # NOTE(review): the message says "1.17.3 or newer" but the
                # condition below rejects 1.17.3 (requires patch >= 4 in the
                # 1.17 series) — confirm which is intended.
                if major != 1 or minor < 17 or minor == 17 and patch < 4:
                    log.error('hydownloader-test', f"Bad gallery-dl version: {version_str}, need 1.17.3 or newer")
                    should_break = True
                else:
                    log.info('hydownloader-test', f"Found gallery-dl version: {version_str}, this is OK")
            except ValueError as e:
                log.error('hydownloader-test', "Could not recognize gallery-dl version", e)
                should_break = True
            try:
                ff_result = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True, check=False).stdout.split('\n')[0]
                log.info('hydownloader-test', f"Found ffmpeg version: {ff_result}")
            except FileNotFoundError as e:
                log.error('hydownloader-test', "Could not find ffmpeg", e)
                should_break = True
            try:
                yt_result = subprocess.run(['youtube-dl', '--version'], capture_output=True, text=True, check=False).stdout.strip()
                log.info('hydownloader-test', f"Found youtube-dl version: {yt_result}")
            except FileNotFoundError as e:
                log.error('hydownloader-test', "Could not find youtube-dl", e)
                should_break = True
        elif site == "gelbooru":
            log.info("hydownloader-test", "Testing gelbooru...")
            # Check that "sensitive" content is accessible (a failure here
            # usually indicates missing or invalid login configuration).
            log.info("hydownloader-test", 'Testing search of "sensitive" content')
            sensitive_url = "https://gelbooru.com/index.php?page=post&s=list&tags=loli"
            result = gallery_dl_utils.run_gallery_dl_with_custom_args([sensitive_url, '--get-urls', '-o', 'image-range="1-10"', '--write-log', log_file], capture_output=True)
            sensitive_ok = True
            if result.returncode != 0:
                status_txt = gallery_dl_utils.check_return_code(result.returncode)
                log.error("hydownloader-test", f'Error returned while trying to download "sensitive" content: return code {result.returncode}, {status_txt}')
                sensitive_ok = False
                should_break = True
            sensitive_results_cnt = len(re.findall("https://.*?gelbooru.com/images", result.stdout))
            if sensitive_results_cnt < 10:
                log.error("hydownloader-test", f'Failed to find "sensitive" content, insufficient number of results: {sensitive_results_cnt}')
                sensitive_ok = False
                should_break = True
            if sensitive_ok:
                log.info("hydownloader-test", 'Search of "sensitive" content seems to be working OK')
            should_break = not check_results_of_post_url(post_url_data['gelbooru'], site) or should_break
            log.info("hydownloader-test", 'Testing note extraction')
            should_break = not check_results_of_post_url(post_url_data['gelbooru_notes'], site) or should_break
        elif site == "danbooru":
            log.info("hydownloader-test", "Testing danbooru...")
            log.info("hydownloader-test", 'Testing search of "sensitive" content')
            sensitive_url = "https://danbooru.donmai.us/posts?tags=loli"
            result = gallery_dl_utils.run_gallery_dl_with_custom_args([sensitive_url, '--get-urls', '-o', 'image-range="1-10"', '--write-log', log_file], capture_output=True)
            sensitive_ok = True
            if result.returncode != 0:
                status_txt = gallery_dl_utils.check_return_code(result.returncode)
                log.error("hydownloader-test", f'Error returned while trying to download "sensitive" content: return code {result.returncode}, {status_txt}')
                sensitive_ok = False
                should_break = True
            sensitive_results_cnt = len(re.findall("https://danbooru.donmai.us/data", result.stdout))
            if sensitive_results_cnt < 10:
                log.error("hydownloader-test", f'Failed to find "sensitive" content, insufficient number of results: {sensitive_results_cnt}')
                sensitive_ok = False
                should_break = True
            if sensitive_ok:
                log.info("hydownloader-test", 'Search of "sensitive" content seems to be working OK')
            should_break = not check_results_of_post_url(post_url_data['danbooru'], site) or should_break
        elif site == "pixiv":
            log.info("hydownloader-test", "Testing pixiv...")
            should_break = not check_results_of_post_url(post_url_data['pixiv'], site) or should_break
            log.info("hydownloader-test", 'Testing downloading of ugoira')
            should_break = not check_results_of_post_url(post_url_data['pixiv_ugoira'], site) or should_break
        elif site == "lolibooru":
            log.info("hydownloader-test", "Testing lolibooru.moe...")
            should_break = not check_results_of_post_url(post_url_data['lolibooru'], site) or should_break
        elif site == "3dbooru":
            log.info("hydownloader-test", "Testing 3dbooru...")
            should_break = not check_results_of_post_url(post_url_data['3dbooru'], site) or should_break
        elif site == "patreon":
            log.info("hydownloader-test", "Testing patreon...")
            should_break = not check_results_of_post_url(post_url_data['patreon'], site) or should_break
        elif site == "nijie":
            log.info("hydownloader-test", "Testing nijie.info...")
            should_break = not check_results_of_post_url(post_url_data['nijie'], site) or should_break
        elif site == "sankaku":
            log.info("hydownloader-test", "Testing sankaku...")
            should_break = not check_results_of_post_url(post_url_data['sankaku'], site) or should_break
        elif site == "idolcomplex":
            log.info("hydownloader-test", "Testing idolcomplex...")
            should_break = not check_results_of_post_url(post_url_data['idolcomplex'], site) or should_break
        elif site == "artstation":
            log.info("hydownloader-test", "Testing artstation...")
            should_break = not check_results_of_post_url(post_url_data['artstation'], site) or should_break
        elif site == "twitter":
            log.info("hydownloader-test", "Testing twitter...")
            should_break = not check_results_of_post_url(post_url_data['twitter'], site) or should_break
        elif site == "deviantart":
            log.info("hydownloader-test", "Testing deviantart...")
            should_break = not check_results_of_post_url(post_url_data['deviantart'], site) or should_break
        elif site == "webtoons":
            log.info("hydownloader-test", "Testing webtoons...")
            should_break = not check_results_of_post_url(post_url_data['webtoons'], site) or should_break
        elif site == "baraag":
            log.info("hydownloader-test", "Testing baraag...")
            should_break = not check_results_of_post_url(post_url_data['baraag'], site) or should_break
        elif site == "hentaifoundry":
            log.info("hydownloader-test", "Testing hentaifoundry...")
            should_break = not check_results_of_post_url(post_url_data['hentaifoundry'], site) or should_break
        elif site == "yandere":
            log.info("hydownloader-test", "Testing yande.re...")
            should_break = not check_results_of_post_url(post_url_data['yandere'], site) or should_break
        else:
            log.error("hydownloader-test", f"Site name not recognized: {site}, no testing done")
            return False
        if should_break:
            # Keep the test environment around so the failure can be inspected.
            log.error("hydownloader-test", f"Stopping early due to errors while testing {site}, test environment kept for inspection")
            return False
        clear_test_env()
    return True
def update_anchor(path: str, hydrus_db_folder: str, sites: str, unrecognized_urls_file: Optional[str], recognized_urls_file: Optional[str], fill_known_urls: bool, keep_old_hydrus_url_data: bool) -> None:
    """
    This function goes through all URLs in a Hydrus database, and tries to match them to known site-specific
    URL patterns to generate anchor database entries that gallery-dl can recognize.
    For some sites, the anchor format differs from the gallery-dl default, these are set in gallery-dl-config.json.
    If enabled, also fills up the known_urls table in the hydownloader DB with all URLs known by Hydrus.

    path: hydownloader data directory (logs and anchor.db live here).
    hydrus_db_folder: folder containing client.master.db (and client.db if fill_known_urls is set).
    sites: comma-separated site names, or "all" for every supported site.
    unrecognized_urls_file / recognized_urls_file: optional output files for URL triage.
    fill_known_urls: also copy all Hydrus-known URLs into the hydownloader known_urls table.
    keep_old_hydrus_url_data: skip deleting previously imported Hydrus URL data first.
    """
    log.init(path, True)
    db.init(path)

    # Hydrus URL data is read from client.master.db; open read-only so a running Hydrus is not disturbed.
    if not os.path.isfile(hydrus_db_folder + "/client.master.db"):
        log.fatal("hydownloader-anchor-exporter", "The client.master.db database was not found at the given location!")
    hydrus_db = sqlite3.connect("file:" + hydrus_db_folder + "/client.master.db?mode=ro", uri=True)
    hydrus_db.row_factory = sqlite3.Row
    # Create the anchor DB (gallery-dl archive format: a single-column "archive" table) if it does not exist yet.
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    # Load already-present anchors so existing entries are not inserted twice.
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}

    log.info("hydownloader-anchor-exporter", "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()  # URLs matching a site keyword but no known anchor pattern
    recognized_urls = set()  # URLs successfully converted to an anchor pattern

    # Optionally gather current/deleted file status for each URL ID from client.db
    # so known_urls entries can record whether the file is current, deleted, or both.
    current_url_ids = set()
    deleted_url_ids = set()
    if fill_known_urls:
        if not os.path.isfile(hydrus_db_folder + "/client.db"):
            log.fatal("hydownloader-anchor-exporter", "The client.db database was not found at the given location!")
        client_db = sqlite3.connect("file:" + hydrus_db_folder + "/client.db?mode=ro", uri=True)
        client_db.row_factory = sqlite3.Row
        cc = client_db.cursor()
        log.info("hydownloader-anchor-exporter", "Querying Hydrus database for current URL IDs...")
        cc.execute('select * from current_files natural inner join url_map')
        for row in cc.fetchall():
            current_url_ids.add(row['url_id'])
        log.info("hydownloader-anchor-exporter", "Querying Hydrus database for deleted URL IDs...")
        cc.execute('select * from deleted_files natural inner join url_map')
        for row in cc.fetchall():
            deleted_url_ids.add(row['url_id'])
        client_db.close()
        if keep_old_hydrus_url_data:
            log.info("hydownloader-anchor-exporter", "Old Hydrus URL data will NOT be deleted from the shared hydownloader database")
        else:
            log.info("hydownloader-anchor-exporter", "Deleting old Hydrus URL data from shared hydownloader database...")
            db.delete_all_hydrus_known_urls()

    # Per-site URL filters: (accept substrings, reject substrings).
    # A URL belongs to a site if it contains any accept keyword and no reject keyword
    # (e.g. "idol." rejects idolcomplex URLs from the plain sankaku matcher).
    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], []),
        'hentaifoundry': (["hentai-foundry"], []),
        'yandere': (["yande.re"], [])
    }
    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    # Counts how many times each base anchor occurs; used below for multi-file post suffixes.
    anchors: Counter[str] = collections.Counter()
    for site in siteset:
        if not site in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter', f'Unsupported site: {site}')

    def process_url(url):
        # Convert a URL into its anchor pattern(s); track unmatched URLs for review.
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs", file=sys.stderr)
        if fill_known_urls:
            # Status codes: 1 = known only, 2 = current, 3 = deleted, 4 = both current and deleted.
            known_url_status = 1
            is_current = row['url_id'] in current_url_ids
            is_deleted = row['url_id'] in deleted_url_ids
            if is_current and is_deleted:
                known_url_status = 4
            elif is_deleted:
                known_url_status = 3
            elif is_current:
                known_url_status = 2
            db.add_hydrus_known_url(row['url'], known_url_status)
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']:
                        url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}", file=sys.stderr)
        final_anchors = [anchor]
        # Some sites use per-file anchor suffixes for multi-file posts; generate
        # additional suffixed entries based on how often the base anchor was seen.
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info("hydownloader-anchor-exporter", f"Done inserting new anchors, added {new_anchor_rows} entries in total")

    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
    db.shutdown()
def _scan_gallery_dl_output(output_path: str, subscription_id: Optional[int] = None, url_id: Optional[int] = None) -> tuple[int, int]:
    # Parse one gallery-dl console output file: recognize filepaths, record them in the
    # additional_data table for the given subscription/URL, delete the file, and
    # return a (new_files, skipped_files) count pair.
    def is_filepath(candidate: str) -> bool:
        candidate = candidate.strip()
        # Skipped downloads are printed by gallery-dl with a leading "# " marker;
        # strip it before checking, otherwise skipped files are never recognized.
        if candidate.startswith("# "):
            candidate = candidate[1:].strip()
        # return ("/" in candidate or "\\" in candidate) and not candidate.startswith("[") and not "gallery-dl:" in candidate
        return os.path.exists(candidate)
    skipped_count = 0
    new_count = 0
    with open(output_path, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not is_filepath(line):
                log.debug("hydownloader", f"Does not look like a filepath: {line}")
                continue
            if line.startswith("# "):
                log.debug("hydownloader", f"Looks like a skipped filepath: {line}")
                line = line[1:].strip()
                skipped_count += 1
            else:
                log.debug("hydownloader", f"Looks like a new filepath: {line}")
                new_count += 1
            db.associate_additional_data(filename=line, subscription_id=subscription_id, url_id=url_id, no_commit=True)
    db.sync()
    os.remove(output_path)
    return new_count, skipped_count

def process_additional_data(subscription_id: Optional[int] = None, url_id: Optional[int] = None) -> tuple[int, int]:
    """
    This function scans log files outputted by gallery-dl and tries to recognize filenames in the output.
    Based on which subscription or URL those files belong to, it queries the database for the associated
    additional_data values (from the subscriptions or single_url_queue tables), then inserts these
    filename + data entries into the additional_data database table (even if there is no additional_data
    for the given files). This way it is possible to keep track which files were found by which URL
    downloads/subscriptions, and correctly associate additional data with them (even if the files were not
    actually downloaded by the URL or sub because some earlier download already got them).
    If both the subscription and url ID arguments are None, then it scans all files in the temp directory,
    otherwise exactly one of those must not be None and then it only scans for the file belonging to that
    URL or subscription.
    When parsing gallery-dl output, it is much better to have false positives (recognize some output lines
    as filenames which are not) than to miss any actual filenames, since invalid filename entries in the
    additional_data table are not a big deal.

    Returns a (new_files, skipped_files) tuple (summed over all processed files when scanning leftovers).
    """
    skipped_count = 0
    new_count = 0
    sub_output = db.get_rootpath() + f"/temp/subscription-{subscription_id}-gallery-dl-output.txt"
    url_output = db.get_rootpath() + f"/temp/single-url-{url_id}-gallery-dl-output.txt"
    if subscription_id is not None and os.path.isfile(sub_output):
        new_count, skipped_count = _scan_gallery_dl_output(sub_output, subscription_id=subscription_id)
    elif url_id is not None and os.path.isfile(url_output):
        new_count, skipped_count = _scan_gallery_dl_output(url_output, url_id=url_id)
    else:
        # No specific target (or its output file is already gone): process any leftover
        # output files in the temp directory, accumulating their counts.
        log.info("hydownloader", "Checking for any leftover temporary gallery-dl output files...")
        filenames = os.listdir(db.get_rootpath() + "/temp")
        for filename in filenames:
            if match := re.match("single-url-([0-9]+)-gallery-dl-output.txt", filename.strip()):
                log.info("hydownloader", f"Processing leftover file {filename}...")
                n, s = process_additional_data(url_id=int(match.group(1)))
                new_count += n
                skipped_count += s
            elif match := re.match("subscription-([0-9]+)-gallery-dl-output.txt", filename.strip()):
                log.info("hydownloader", f"Processing leftover file {filename}...")
                n, s = process_additional_data(subscription_id=int(match.group(1)))
                new_count += n
                skipped_count += s
    return new_count, skipped_count
db.add_log_file_to_parse_queue(l, 'reparse') while logfname := db.get_queued_log_file(worker): subscription_id = None url_id = None if m := re.match(r".*(?:\\|\/)single-urls-([0-9]+)-gallery-dl-.*\.txt", logfname): url_id = int(m.group(1)) if m := re.match( r".*(?:\\|\/)subscription-([0-9]+)-gallery-dl-.*\.txt", logfname): subscription_id = int(m.group(1)) try: with open(db.get_rootpath() + "/" + logfname, 'r', encoding='utf-8-sig') as logf: log.info("hydownloader", f"Parsing log file: {logfname}") urls = [] for line in logf: if m := re.match( r'(?:\[.+\])* (http.*?)(?::[0-9]+)? "[A-Z]+ (\/.*?) HTTP.*', line.strip()): urls.append(m.group(1) + m.group(2)) if m := re.match(r".*Starting DownloadJob for '(.*)'$", line.strip()): urls.append(m.group(1)) db.add_known_urls(urls, subscription_id=subscription_id, url_id=url_id) db.remove_log_file_from_parse_queue(db.get_rootpath() + "/" + logfname) log.info(
def subscription_worker() -> None:
    """
    Daemon thread body: repeatedly polls the database for due subscriptions and
    checks them one by one with gallery-dl until asked to stop or pause.

    Coordination with the rest of the daemon happens through module-level flags
    guarded by _worker_lock: _end_threads_flag requests shutdown,
    _sub_worker_paused_flag requests pausing, and _sub_worker_ended_flag is set
    by this thread when it has fully stopped.
    """
    global _sub_worker_ended_flag
    try:
        log.info("hydownloader", "Starting subscription worker thread...")
        with _worker_lock:
            _sub_worker_ended_flag = False
        while True:
            # Poll every 2 seconds; check the shutdown flag under the lock each round.
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            subs_due = db.get_due_subscriptions()
            if not subs_due:
                with _worker_lock:
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                    else:
                        set_subscription_worker_status("nothing to do: checked for due subscriptions, found none")
            sub = subs_due[0] if subs_due else None
            # Drain the due queue one subscription at a time, re-querying after each check.
            while sub:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                        break
                # A subscription with no recorded last check gets first-check treatment
                # (different max file count below).
                initial_check = sub['last_check'] is None
                url = urls.subscription_data_to_url(sub['downloader'], sub['keywords'])
                check_started_time = time.time()
                status_msg = f"checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']})"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                if initial_check:
                    log.info(f"subscription-{sub['id']}", "This is the first check for this subscription")
                # Run gallery-dl; per-subscription log/output files are kept under logs/ and temp/.
                # run_gallery_dl returns an error string on failure, falsy on success.
                result = gallery_dl_utils.run_gallery_dl(
                    url=url,
                    ignore_anchor=False,
                    metadata_only=False,
                    log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/subscription-{sub['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=False,
                    filter_=sub['filter'],
                    chapter_filter=None,
                    subscription_mode=True,
                    abort_after=sub['abort_after'],
                    max_file_count = sub['max_files_initial'] if initial_check else sub['max_files_regular']
                )
                if result:
                    log.warning(f"subscription-{sub['id']}", "Error: "+result)
                else:
                    # Only a clean run updates last_successful_check; last_check is always updated.
                    sub['last_successful_check'] = check_started_time
                sub['last_check'] = check_started_time
                # Pick up the filenames gallery-dl reported and associate them with this subscription.
                new_files, skipped_files = process_additional_data(subscription_id = sub['id'])
                check_ended_time = time.time()
                db.add_subscription_check(sub['id'], new_files=new_files, already_seen_files=skipped_files, time_started=check_started_time, time_finished=check_ended_time, status=result)
                db.add_or_update_subscriptions([sub])
                status_msg = f"finished checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']}), new files: {new_files}, skipped: {skipped_files}"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                subs_due = db.get_due_subscriptions()
                sub = subs_due[0] if subs_due else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping subscription worker thread")
                _sub_worker_ended_flag = True
    except Exception as e:
        # Any uncaught error takes down the whole daemon rather than leaving a dead worker.
        log.fatal("hydownloader", "Uncaught exception in subscription worker thread", e)
        shutdown()
def url_queue_worker() -> None:
    """
    Daemon thread body: repeatedly polls the database for queued single URLs and
    downloads them one by one with gallery-dl until asked to stop or pause.

    Mirrors subscription_worker; coordination happens through module-level flags
    guarded by _worker_lock (_end_threads_flag, _url_worker_paused_flag,
    _url_worker_ended_flag).
    """
    global _url_worker_ended_flag
    try:
        log.info("hydownloader", "Starting single URL queue worker thread...")
        with _worker_lock:
            _url_worker_ended_flag = False
        while True:
            # Poll every 2 seconds; check the shutdown flag under the lock each round.
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            urls_to_dl = db.get_urls_to_download()
            if not urls_to_dl:
                with _worker_lock:
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                    else:
                        set_url_worker_status("nothing to do: checked for queued URLs, found none")
            urlinfo = urls_to_dl[0] if urls_to_dl else None
            # Drain the queue one URL at a time, re-querying after each download.
            while urlinfo:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                        break
                check_time = time.time()
                status_msg = f"downloading URL: {urlinfo['url']}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                # Run gallery-dl with the per-URL options stored in the queue row.
                # run_gallery_dl returns an error string on failure, falsy on success.
                result = gallery_dl_utils.run_gallery_dl(
                    url=urlinfo['url'],
                    ignore_anchor=urlinfo['ignore_anchor'],
                    metadata_only=urlinfo['metadata_only'],
                    log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/single-url-{urlinfo['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=urlinfo['overwrite_existing'],
                    filter_=urlinfo['filter'],
                    chapter_filter=None,
                    subscription_mode=False,
                    max_file_count = urlinfo['max_files']
                )
                if result:
                    log.warning("single url downloader", f"Error while downloading {urlinfo['url']}: {result}")
                    # status > 0 marks an errored download; status_text keeps the error message.
                    urlinfo['status'] = 1
                    urlinfo['status_text'] = result
                else:
                    urlinfo['status'] = 0
                    urlinfo['status_text'] = 'ok'
                urlinfo['time_processed'] = check_time
                # Pick up the filenames gallery-dl reported and associate them with this URL.
                new_files, skipped_files = process_additional_data(url_id = urlinfo['id'])
                urlinfo['new_files'] = new_files
                urlinfo['already_seen_files'] = skipped_files
                db.add_or_update_urls([urlinfo])
                status_msg = f"finished checking URL: {urlinfo['url']}, new files: {new_files}, skipped: {skipped_files}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                urls_to_dl = db.get_urls_to_download()
                urlinfo = urls_to_dl[0] if urls_to_dl else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping single URL queue worker thread")
                _url_worker_ended_flag = True
    except Exception as e:
        # Any uncaught error takes down the whole daemon rather than leaving a dead worker.
        log.fatal("hydownloader", "Uncaught exception in URL worker thread", e)
        shutdown()
def run_job(path: str, job: str, config: Optional[str], verbose: bool, do_it: bool, no_stop_on_missing_metadata: bool) -> None:
    """
    Run an import job: walk the hydownloader data directory, match files against the
    job's filter groups, generate tags and URLs from user-defined expressions, and
    send the files plus metadata to a Hydrus client via its API.

    path: hydownloader data directory.
    job: name of the job inside the configuration file.
    config: optional override path for the job configuration file.
    verbose: print per-file progress and generated tags/URLs to stderr.
    do_it: actually hash/upload to Hydrus; when False this is a dry run.
    no_stop_on_missing_metadata: continue instead of exiting when a file has no JSON metadata.

    NOTE(security): filter and tag/url generator expressions from the configuration
    file are executed with eval(). The config is trusted local input by design, but
    it can run arbitrary code — never point this at an untrusted configuration file.
    """
    log.init(path, True)
    db.init(path)
    config_path = db.get_rootpath() + '/hydownloader-import-jobs.json'
    data_path = db.get_datapath()
    if config:
        config_path = config
    if not os.path.isfile(config_path):
        log.fatal("hydownloader-importer", f"Configuration file not found: {config_path}")
    jobs = json.load(open(config_path, 'r', encoding='utf-8-sig'))
    if not job in jobs:
        log.fatal("hydownloader-importer", f"Job not found in configuration file: {job}")
    jd = jobs[job]
    force_add_metadata = jd.get('forceAddMetadata', True)
    force_add_files = jd.get('forceAddFiles', False)
    client = hydrus.Client(jd['apiKey'], jd['apiURL'])
    log.info("hydownloader-importer", f"Starting import job: {job}")
    # iterate over all files in the data directory
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            # json files hold metadata, don't import them to Hydrus
            if fname.endswith('.json'):
                continue
            # set up some variables
            # some will be used later in the code, some are meant to be used in user-defined expressions
            abspath = root + "/" + fname
            path = os.path.relpath(abspath, start=data_path)  # NOTE: shadows the 'path' argument from here on
            split_path = os.path.split(path)
            fname_noext, fname_ext = os.path.splitext(fname)
            if fname_ext.startswith('.'):
                fname_ext = fname_ext[1:]
            # find the path of the associated json metadata file, check if it exists
            # for pixiv ugoira, the same metadata file belongs both to the .webm and the .zip,
            # so this needs special handling
            json_path = abspath + '.json'
            if not os.path.isfile(json_path) and abspath.endswith('.webm'):
                json_path = abspath[:-4] + "zip.json"
            json_exists = True
            if not os.path.isfile(json_path):
                json_exists = False
                printerr(f"Warning: no metadata file found for {path}")
                if not no_stop_on_missing_metadata:
                    sys.exit(1)
            generated_urls = set()
            generated_tags: set[tuple[str, str]] = set()
            matched = False  # will be true if at least 1 filter group matched the file
            json_data = None  # this will hold the associated json metadata (if available)
            if verbose:
                printerr(f"Processing file: {path}...")
            # iterate over all filter groups, do they match this file?
            for group in jd['groups']:
                # evaluate filter, load json metadata if the filter matches and we haven't loaded it yet
                should_process = False
                try:
                    should_process = eval(group['filter'])
                except:
                    printerr(f"Failed to evaluate filter: {group['filter']}")
                    sys.exit(1)
                if not json_data and json_exists:
                    try:
                        json_data = json.load(open(json_path, encoding='utf-8-sig'))
                    except json.decoder.JSONDecodeError:
                        printerr(f"Failed to parse JSON: {json_path}")
                        sys.exit(1)
                if not should_process:
                    continue
                matched = True
                # get the data for this file from the additional_data db table and process it
                # set up some variables that user-defined expressions will be able to use
                additional_data_dicts = db.get_additional_data_for_file(path)
                if not additional_data_dicts and path.endswith('.webm'):
                    # ugoira .webm: the additional data was recorded for the original .zip
                    additional_data_dicts = db.get_additional_data_for_file(path[:-4] + "zip")
                extra_tags: defaultdict[str, list[str]] = defaultdict(list)
                min_time_added = -1
                max_time_added = -1
                for d in additional_data_dicts:
                    parse_additional_data(extra_tags, d['data'])
                    if min_time_added == -1 or min_time_added > d['time_added']:
                        min_time_added = d['time_added']
                    if max_time_added == -1 or max_time_added < d['time_added']:
                        max_time_added = d['time_added']
                sub_ids = []
                url_ids = []
                for d in additional_data_dicts:
                    if d['subscription_id']:
                        sub_ids.append(str(d['subscription_id']))
                    if d['url_id']:
                        url_ids.append(str(d['url_id']))
                # execute user-defined tag and url generator expressions
                has_error = False
                for dtype, d in [('tag', x) for x in group.get('tags', [])] + [('url', x) for x in group.get('urls', [])]:
                    skip_on_error = d.get("skipOnError", False)
                    allow_empty = d.get("allowEmpty", False)
                    rule_name = d.get("name")
                    generated_results = []
                    # if the expression is a single string
                    if isinstance(d["values"], str):
                        try:
                            eval_res = eval(d["values"])
                            # check result type: must be string or iterable of strings
                            if isinstance(eval_res, str):
                                generated_results = [eval_res]
                            else:
                                for eval_res_str in eval_res:
                                    if not isinstance(eval_res_str, str):
                                        printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {d['values']}")
                                        sys.exit(1)
                                    else:
                                        generated_results.append(eval_res_str)
                        except Exception as e:
                            if verbose:
                                printerr(f"Failed to evaluate expression: {d['values']}")
                                print(e)
                            has_error = True
                    else:  # multiple expressions (array of strings)
                        for eval_expr in d["values"]:
                            try:
                                eval_res = eval(eval_expr)
                                # check result type: must be string or iterable of strings
                                if isinstance(eval_res, str):
                                    generated_results = [eval_res]
                                else:
                                    for eval_res_str in eval_res:
                                        if not isinstance(eval_res_str, str):
                                            printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {eval_expr}")
                                            sys.exit(1)
                                        else:
                                            generated_results.append(eval_res_str)
                            except Exception as e:
                                if verbose:
                                    printerr(f"Failed to evaluate expression: {eval_expr}")
                                    printerr(e)
                                has_error = True
                    # check for empty results or failed evaluation, as necessary
                    if not generated_results and not allow_empty:
                        printerr(f"Error: the rule named {rule_name} yielded no results but this is not allowed")
                        sys.exit(1)
                    if has_error:
                        printerr(f"Warning: an expression failed to evaluate in the rule named {rule_name}")
                        if not skip_on_error:
                            sys.exit(1)
                    # save results of the currently evaluated expressions
                    if dtype == 'url':
                        generated_urls.update(generated_results)
                    else:
                        for repo in d["tagRepos"]:
                            generated_tags.update((repo, tag) for tag in generated_results)
            if matched:
                printerr(f"File matched: {path}...")
                if not os.path.getsize(abspath):
                    print(f"Found truncated file: {abspath}")
                    sys.exit(1)
                if verbose:
                    printerr("Generated URLs:")
                    for url in generated_urls:
                        printerr(url)
                    printerr("Generated tags:")
                    for repo, tag in sorted(list(generated_tags), key=lambda x: x[0]):
                        printerr(f"{repo} <- {tag}")
                if verbose:
                    printerr('Hashing...')
                # calculate hash, check if Hydrus already knows the file
                already_added = False
                if do_it:
                    hasher = hashlib.sha256()
                    with open(abspath, 'rb') as hashedfile:
                        # hash in 1 MiB chunks to avoid loading large files into memory at once
                        buf = hashedfile.read(65536 * 16)
                        while len(buf) > 0:
                            hasher.update(buf)
                            buf = hashedfile.read(65536 * 16)
                    hexdigest = hasher.hexdigest()
                    if client.file_metadata(hashes=[hexdigest], only_identifiers=True):
                        printerr("File is already in Hydrus")
                        already_added = True
                # send file, tags, metadata to Hydrus as needed
                if not already_added or force_add_files:
                    if verbose:
                        printerr("Sending file to Hydrus...")
                    if do_it:
                        client.add_file(abspath)
                if not already_added or force_add_metadata:
                    if verbose:
                        printerr("Associating URLs...")
                    if do_it:
                        client.associate_url(hashes=[hexdigest], add=generated_urls)
                    if verbose:
                        printerr("Adding tags...")
                    tag_dict = defaultdict(list)
                    for repo, tag in generated_tags:
                        tag_dict[repo].append(tag)
                    if do_it:
                        client.add_tags(hashes=[hexdigest], service_to_tags=tag_dict)
            else:
                if verbose:
                    printerr(f"Skipping due to no matching filter: {path}")
    log.info("hydownloader-importer", f"Finished import job: {job}")
    db.shutdown()
def report(verbose: bool, urls: bool = True) -> None:
    """
    Generate and log a status report over the hydownloader database: totals,
    paused/errored/stale subscriptions and single URLs, and (with verbose)
    the individual offending entries.

    verbose: also list the individual entries for each problem category.
    urls: include single-URL statistics (set False for a subscriptions-only report).
    """
    check_init()
    c = get_conn().cursor()
    def format_date(timestamp: Optional[Union[float, int, str]]) -> str:
        # Accepts a unix timestamp, an already-formatted string, or None ("never").
        if isinstance(timestamp, str):
            return timestamp
        if timestamp is None:
            return 'never'
        return datetime.datetime.fromtimestamp(float(timestamp)).isoformat()
    log.info('hydownloader-report', 'Generating report...')
    # Gather statistics. Only time.time() is interpolated into the SQL below (no
    # external input), so the f-string queries are not an injection risk.
    urls_paused = len(c.execute('select * from single_url_queue where paused = 1').fetchall())
    subs_paused = len(c.execute('select * from subscriptions where paused = 1').fetchall())
    urls_errored_entries = c.execute('select * from single_url_queue where status > 0').fetchall()
    urls_errored = len(urls_errored_entries)
    # A sub whose last check differs from its last successful check had its latest check fail.
    subs_errored_entries = c.execute('select * from subscriptions where last_check is not null and last_successful_check <> last_check').fetchall()
    subs_errored = len(subs_errored_entries)
    urls_no_files_entries = c.execute('select * from single_url_queue where status = 0 and (new_files is null or already_seen_files is null or new_files + already_seen_files = 0)').fetchall()
    urls_no_files = len(urls_no_files_entries)
    subs_no_files_entries = c.execute((
        'select * from subscriptions where last_check is not null and id in '
        '(select subscription_id from subscription_checks group by subscription_id having sum(new_files) + sum(already_seen_files) <= 0)'
    )).fetchall()
    subs_no_files = len(subs_no_files_entries)
    # Queued for more than a day (86400 seconds) without being processed.
    urls_waiting_long_entries = c.execute(f'select * from single_url_queue where time_processed is null and time_added + 86400 <= {time.time()}').fetchall()
    urls_waiting_long = len(urls_waiting_long_entries)
    subs_waiting_long_entries = c.execute((
        f'select * from subscriptions where (last_check is not null and last_check + check_interval <= {time.time()})'
        f'or (last_check is null and time_created + check_interval <= {time.time()})'
    )).fetchall()
    subs_waiting_long = len(subs_waiting_long_entries)
    # Checked at least once, but no files at all in the past 30 days (or no checks in that window).
    subs_no_recent_files_entries = c.execute((
        'select * from subscriptions where last_check is not null and id in '
        f'(select subscription_id from subscription_checks where time_started + 30 * 86400 >= {time.time()} group by subscription_id having sum(new_files) + sum(already_seen_files) <= 0)'
        f'or id not in (select subscription_id from subscription_checks group by subscription_id having max(time_started) + 30 * 86400 < {time.time()})'
    )).fetchall()
    subs_no_recent_files = len(subs_no_recent_files_entries)
    subs_queued = len(get_due_subscriptions())
    urls_queued = len(get_urls_to_download())
    all_subs = len(c.execute('select * from subscriptions').fetchall())
    all_urls = len(c.execute('select * from single_url_queue').fetchall())
    all_sub_checks = len(c.execute('select * from subscription_checks').fetchall())
    all_file_results = len(c.execute('select * from additional_data').fetchall())
    last_time_url_processed_results = c.execute('select max(time_processed) t from single_url_queue').fetchall()
    last_time_url_processed = format_date(last_time_url_processed_results[0]['t'] if last_time_url_processed_results else 'never')
    last_time_sub_checked_results = c.execute('select max(time_finished) t from subscription_checks').fetchall()
    last_time_sub_checked = format_date(last_time_sub_checked_results[0]['t'] if last_time_sub_checked_results else 'never')
    def print_url_entries(entries: list[dict]) -> None:
        # Log one line per single-URL queue entry (used for verbose listings).
        for url in entries:
            log.info('hydownloader-report', (
                f"URL: {url['url']}, "
                f"status: {url['status_text']} (code: {url['status']}), "
                f"time added: {format_date(url['time_added'])}, "
                f"time processed: {format_date(url['time_processed'])}, "
                f"paused: {url['paused']}"
            ))
    def print_sub_entries(entries: list[dict]) -> None:
        # Log one line per subscription entry (keywords, downloader, last_check,
        # last_successful_check, check_interval, paused).
        for sub in entries:
            log.info('hydownloader-report', (
                f"Downloader: {sub['downloader']}, "
                f"keywords: {sub['keywords']}, "
                f"last check: {format_date(sub['last_check'])}, "
                f"last successful check: {format_date(sub['last_successful_check'])}, "
                f"check interval: {sub['check_interval']}, "
                f"paused: {sub['paused']}"
            ))
    # Emit the report; URL-related lines are suppressed when urls is False.
    log.info('hydownloader-report', f'Subscriptions: {all_subs}')
    if urls: log.info('hydownloader-report', f'Single URLs: {all_urls}')
    log.info('hydownloader-report', f'Subscription checks: {all_sub_checks}')
    log.info('hydownloader-report', f'All file results (including duplicates and skipped): {all_file_results}')
    log.info('hydownloader-report', f'Last time a subscription was checked: {last_time_sub_checked}')
    if urls: log.info('hydownloader-report', f'Last time a URL was downloaded: {last_time_url_processed}')
    log.info('hydownloader-report', f'Subscriptions due for a check: {subs_queued}')
    if urls: log.info('hydownloader-report', f'URLs waiting to be downloaded: {urls_queued}')
    log.info('hydownloader-report', f'Paused subscriptions: {subs_paused}')
    if urls: log.info('hydownloader-report', f'Paused URLs: {urls_paused}')
    if urls: log.info('hydownloader-report', f'Errored URLs: {urls_errored}')
    if verbose and urls_errored and urls:
        log.info('hydownloader-report', 'These are the following:')
        print_url_entries(urls_errored_entries)
    log.info('hydownloader-report', f'Errored subscriptions: {subs_errored}')
    if verbose and subs_errored:
        log.info('hydownloader-report', 'These are the following:')
        print_sub_entries(subs_errored_entries)
    if urls: log.info('hydownloader-report', f'URLs that did not error but produced no files: {urls_no_files}')
    if verbose and urls_no_files and urls:
        log.info('hydownloader-report', 'These are the following:')
        print_url_entries(urls_no_files_entries)
    log.info('hydownloader-report', f'Subscriptions that did not error but produced no files: {subs_no_files}')
    if verbose and subs_no_files:
        log.info('hydownloader-report', 'These are the following:')
        print_sub_entries(subs_no_files_entries)
    if urls: log.info('hydownloader-report', f'URLs waiting to be downloaded for more than a day: {urls_waiting_long}')
    if verbose and urls_waiting_long and urls:
        log.info('hydownloader-report', 'These are the following:')
        print_url_entries(urls_waiting_long_entries)
    log.info('hydownloader-report', f'Subscriptions due for a check longer than their check interval: {subs_waiting_long}')
    if verbose and subs_waiting_long:
        log.info('hydownloader-report', 'These are the following:')
        print_sub_entries(subs_waiting_long_entries)
    log.info('hydownloader-report', f'Subscriptions that were checked at least once but did not produce any files in the past 30 days: {subs_no_recent_files}')
    if verbose and subs_no_recent_files:
        log.info('hydownloader-report', 'These are the following:')
        print_sub_entries(subs_no_recent_files_entries)
    log.info('hydownloader-report', 'Report finished')