def deviantart_login(path: str) -> None:
    log.init(path, True)
    db.init(path)
    args = ['--cookies', db.get_rootpath() + '/cookies.txt']
    args += ['-o', 'cache.file=' + db.get_rootpath() + '/gallery-dl-cache.db']
    args += ['oauth:deviantart']
    gallery_dl_utils.run_gallery_dl_with_custom_args(args)

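# Illustrative sketch only (run_gallery_dl_with_custom_args lives in hydownloader's
# gallery_dl_utils module and is not shown in this excerpt): the args built above are
# presumably forwarded to a gallery-dl subprocess, i.e. roughly equivalent to running
#   gallery-dl --cookies <dbroot>/cookies.txt -o cache.file=<dbroot>/gallery-dl-cache.db oauth:deviantart
# which starts gallery-dl's interactive OAuth flow for DeviantArt.
def _run_gallery_dl_sketch(args: list[str]) -> int:
    # Hypothetical stand-in, shown only to make the data flow concrete.
    import subprocess
    return subprocess.run(['gallery-dl'] + args, check=False).returncode
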
def mass_add_urls(path: str, file_: str, additional_data: Optional[str], metadata_only: bool, overwrite_existing: bool, filter_: Optional[str], ignore_anchor: bool, max_files: Optional[int]) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r'):
        line = line.strip()
        if line:
            db.add_or_update_urls([{
                'url': line,
                'time_added': time.time(),
                'additional_data': additional_data,
                'metadata_only': metadata_only,
                'overwrite_existing': overwrite_existing,
                'filter': filter_,
                'ignore_anchor': ignore_anchor,
                'max_files': max_files
            }])
            log.info("hydownloader-tools", f"Added URL: {line}")

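# Example input file for mass_add_urls (one URL per line; blank lines are skipped).
# The URLs below are only illustrative; each non-empty line becomes one entry in the
# hydownloader URL queue via db.add_or_update_urls.
#
#   https://danbooru.donmai.us/posts/123456
#   https://www.pixiv.net/en/artworks/98765432
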
def mass_add_subscriptions(path: str, file_: str, downloader: str, additional_data: Optional[str], paused: bool, filter_: Optional[str], abort_after: int, max_files_initial: Optional[int], max_files_regular: Optional[int]) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r'):
        line = line.strip()
        if line:
            db.add_or_update_subscriptions([{
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'max_files_initial': max_files_initial,
                'max_files_regular': max_files_regular,
                'abort_after': abort_after,
                'paused': paused
            }])
            log.info("hydownloader-tools", f"Added subscription {line} with downloader {downloader}")

def mass_add_subscriptions(path: str, file_: str, downloader: str, additional_data: Optional[str], paused: bool, filter_: Optional[str], abort_after: int, max_files_initial: Optional[int], max_files_regular: Optional[int], check_interval: int, random_check_interval: int, encode_keywords: bool) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r', encoding='utf-8-sig'):
        line = line.strip()
        if encode_keywords:
            line = line.replace(' ', '+')
            line = urllib.parse.quote(line, safe='/+').lower()
        if line:
            new_sub = {
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'paused': paused,
                'check_interval': check_interval + random.randint(0, random_check_interval)
            }
            if max_files_initial is not None:
                new_sub['max_files_initial'] = max_files_initial
            if max_files_regular is not None:
                new_sub['max_files_regular'] = max_files_regular
            if abort_after is not None:
                new_sub['abort_after'] = abort_after
            db.add_or_update_subscriptions([new_sub])
            log.info("hydownloader-tools", f"Added subscription {line} with downloader {downloader}")

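# Illustration of the encode_keywords transformation above (assuming standard
# urllib.parse behavior): spaces become '+', the rest is percent-encoded
# (keeping '/' and '+' as-is) and lowercased.
#
# >>> urllib.parse.quote("blue sky rating:safe".replace(' ', '+'), safe='/+').lower()
# 'blue+sky+rating%3asafe'
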
def start(path: str, debug: bool, no_sub_worker: bool, no_url_worker: bool) -> None:
    log.init(path, debug)
    db.init(path)
    output_postprocessors.process_additional_data()
    output_postprocessors.parse_log_files()
    if not no_sub_worker:
        subs_thread = threading.Thread(target=subscription_worker, name='Subscription worker', daemon=True)
        subs_thread.start()
    if not no_url_worker:
        url_thread = threading.Thread(target=url_queue_worker, name='Single URL queue worker', daemon=True)
        url_thread.start()
    api_thread = threading.Thread(target=api_worker, args=(path, debug))
    api_thread.start()
    while not _shutdown_started and not _shutdown_requested_by_api_thread:
        time.sleep(1)
    shutdown()

def start(path: str, debug: bool) -> None:
    log.init(path, debug)
    db.init(path)
    process_additional_data()
    subs_thread = threading.Thread(target=subscription_worker, name='Subscription worker', daemon=True)
    subs_thread.start()
    url_thread = threading.Thread(target=url_queue_worker, name='Single URL queue worker', daemon=True)
    url_thread.start()
    if db.get_conf('daemon.ssl') and os.path.isfile(path + "/server.pem"):
        log.info("hydownloader", "Starting daemon (with SSL)...")
        srv = SSLWSGIRefServer(path + "/server.pem", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=srv, debug=debug)
    else:
        if db.get_conf('daemon.ssl'):
            log.warning("hydownloader", "SSL enabled in config, but no server.pem file found in the db folder, continuing without SSL...")
        log.info("hydownloader", "Starting daemon...")
        srv = SSLWSGIRefServer("", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=srv, debug=debug)

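# SSLWSGIRefServer is referenced above but not defined in this excerpt. A minimal sketch of
# what such a bottle server adapter could look like (an assumption, not the actual
# hydownloader implementation): wrap the wsgiref server socket in TLS when a PEM file is given.
import ssl
from wsgiref.simple_server import make_server

class SSLWSGIRefServerSketch(bottle.ServerAdapter):
    def __init__(self, certfile: str = "", host: str = '127.0.0.1', port: int = 8080) -> None:
        super().__init__(host=host, port=port)
        self.certfile = certfile

    def run(self, handler) -> None:
        srv = make_server(self.host, self.port, handler)
        if self.certfile:
            ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
            ctx.load_cert_chain(self.certfile)  # the PEM is assumed to contain both cert and key
            srv.socket = ctx.wrap_socket(srv.socket, server_side=True)
        srv.serve_forever()
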
def update_anchor(path: str, hydrus_master_db: str, sites: str, unrecognized_urls_file: Optional[str], recognized_urls_file: Optional[str]) -> None:
    """
    This function goes through all URLs in a Hydrus database and tries to match them against known
    site-specific URL patterns to generate anchor database entries that gallery-dl can recognize.
    For some sites the anchor format differs from the gallery-dl default; these are set in gallery-dl-config.json.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_master_db):
        log.fatal("hydownloader-anchor-exporter", "The given client.master.db file does not exist!")
    hydrus_db = sqlite3.connect(hydrus_master_db)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}

    log.info("hydownloader-anchor-exporter", "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], [])
    }
    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()
    for site in siteset:
        if site not in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter', f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs")
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']:
                        url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}")
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a,))
            new_anchor_rows += 1
    log.info("hydownloader-anchor-exporter", f"Done inserting new anchors, added {new_anchor_rows} entries in total")
    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()

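# urls.anchor_patterns_from_url is provided by hydownloader and not shown here. As a purely
# illustrative sketch (the real patterns and anchor formats may differ; see gallery-dl-config.json),
# it maps known URL shapes to gallery-dl archive ("anchor") IDs, e.g. via regexes like these
# hypothetical ones:
import re

_EXAMPLE_ANCHOR_PATTERNS = [
    (re.compile(r"danbooru\.donmai\.us/posts/(\d+)"), "danbooru{}"),
    (re.compile(r"gelbooru\.com/.*[?&]id=(\d+)"), "gelbooru{}"),
    (re.compile(r"pixiv\.net/(?:en/)?artworks/(\d+)"), "pixiv{}"),
]

def _example_anchor_patterns_from_url(url: str) -> list[str]:
    # Return every anchor candidate whose pattern matches the URL; the caller above
    # counts only the first match per URL.
    return [fmt.format(m.group(1)) for regex, fmt in _EXAMPLE_ANCHOR_PATTERNS
            if (m := regex.search(url))]
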
def reparse_all_logfiles(path: str) -> None:
    log.init(path, True)
    db.init(path)
    output_postprocessors.parse_log_files(True)

def init_db(path: str) -> None:
    log.init(path, True)
    db.init(path)

def report(path: str, verbose: bool, no_urls: bool) -> None:
    log.init(path, True)
    db.init(path)
    db.report(verbose, not no_urls)

def test(path: str, sites: str) -> None:
    log.init(path, True)
    db.init(path)
    if not test_internal(sites):
        sys.exit(1)

def update_anchor(path: str, hydrus_db_folder: str, sites: str, unrecognized_urls_file: Optional[str], recognized_urls_file: Optional[str], fill_known_urls: bool, keep_old_hydrus_url_data: bool) -> None:
    """
    This function goes through all URLs in a Hydrus database and tries to match them against known
    site-specific URL patterns to generate anchor database entries that gallery-dl can recognize.
    For some sites the anchor format differs from the gallery-dl default; these are set in gallery-dl-config.json.
    If enabled, it also fills the known_urls table in the hydownloader DB with all URLs known to Hydrus.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_db_folder + "/client.master.db"):
        log.fatal("hydownloader-anchor-exporter", "The client.master.db database was not found at the given location!")
    hydrus_db = sqlite3.connect("file:" + hydrus_db_folder + "/client.master.db?mode=ro", uri=True)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}

    log.info("hydownloader-anchor-exporter", "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()

    current_url_ids = set()
    deleted_url_ids = set()
    if fill_known_urls:
        if not os.path.isfile(hydrus_db_folder + "/client.db"):
            log.fatal("hydownloader-anchor-exporter", "The client.db database was not found at the given location!")
        client_db = sqlite3.connect("file:" + hydrus_db_folder + "/client.db?mode=ro", uri=True)
        client_db.row_factory = sqlite3.Row
        cc = client_db.cursor()
        log.info("hydownloader-anchor-exporter", "Querying Hydrus database for current URL IDs...")
        cc.execute('select * from current_files natural inner join url_map')
        for row in cc.fetchall():
            current_url_ids.add(row['url_id'])
        log.info("hydownloader-anchor-exporter", "Querying Hydrus database for deleted URL IDs...")
        cc.execute('select * from deleted_files natural inner join url_map')
        for row in cc.fetchall():
            deleted_url_ids.add(row['url_id'])
        client_db.close()
        if keep_old_hydrus_url_data:
            log.info("hydownloader-anchor-exporter", "Old Hydrus URL data will NOT be deleted from the shared hydownloader database")
        else:
            log.info("hydownloader-anchor-exporter", "Deleting old Hydrus URL data from shared hydownloader database...")
            db.delete_all_hydrus_known_urls()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], []),
        'hentaifoundry': (["hentai-foundry"], []),
        'yandere': (["yande.re"], [])
    }
    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()
    for site in siteset:
        if site not in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter', f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs", file=sys.stderr)
        if fill_known_urls:
            known_url_status = 1
            is_current = row['url_id'] in current_url_ids
            is_deleted = row['url_id'] in deleted_url_ids
            if is_current and is_deleted:
                known_url_status = 4
            elif is_deleted:
                known_url_status = 3
            elif is_current:
                known_url_status = 2
            db.add_hydrus_known_url(row['url'], known_url_status)
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']:
                        url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter", "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}", file=sys.stderr)
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a,))
            new_anchor_rows += 1
    log.info("hydownloader-anchor-exporter", f"Done inserting new anchors, added {new_anchor_rows} entries in total")
    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
    db.shutdown()

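# A small helper for sanity-checking the result of the export above. It relies only on the
# schema created in update_anchor (an 'archive' table with a single 'entry' column); the
# helper itself is not part of hydownloader.
def _peek_anchor_db(db_folder: str, limit: int = 5) -> None:
    conn = sqlite3.connect(db_folder + "/anchor.db")
    cur = conn.cursor()
    cur.execute('select count(*) from archive')
    print("total anchor entries:", cur.fetchone()[0])
    cur.execute('select entry from archive limit ?', (limit,))
    for (entry,) in cur.fetchall():
        print(entry)
    conn.close()
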
def run_job(path: str, job: str, config: Optional[str], verbose: bool, do_it: bool, no_stop_on_missing_metadata: bool) -> None:
    log.init(path, True)
    db.init(path)
    config_path = db.get_rootpath() + '/hydownloader-import-jobs.json'
    data_path = db.get_datapath()
    if config:
        config_path = config
    if not os.path.isfile(config_path):
        log.fatal("hydownloader-importer", f"Configuration file not found: {config_path}")
    jobs = json.load(open(config_path, 'r', encoding='utf-8-sig'))
    if job not in jobs:
        log.fatal("hydownloader-importer", f"Job not found in configuration file: {job}")
    jd = jobs[job]
    force_add_metadata = jd.get('forceAddMetadata', True)
    force_add_files = jd.get('forceAddFiles', False)
    client = hydrus.Client(jd['apiKey'], jd['apiURL'])
    log.info("hydownloader-importer", f"Starting import job: {job}")
    # iterate over all files in the data directory
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            # json files hold metadata, don't import them to Hydrus
            if fname.endswith('.json'):
                continue
            # set up some variables
            # some will be used later in the code, some are meant to be used in user-defined expressions
            abspath = root + "/" + fname
            path = os.path.relpath(abspath, start=data_path)
            split_path = os.path.split(path)
            fname_noext, fname_ext = os.path.splitext(fname)
            if fname_ext.startswith('.'):
                fname_ext = fname_ext[1:]
            # find the path of the associated json metadata file, check if it exists
            # for pixiv ugoira, the same metadata file belongs both to the .webm and the .zip,
            # so this needs special handling
            json_path = abspath + '.json'
            if not os.path.isfile(json_path) and abspath.endswith('.webm'):
                json_path = abspath[:-4] + "zip.json"
            json_exists = True
            if not os.path.isfile(json_path):
                json_exists = False
                printerr(f"Warning: no metadata file found for {path}")
                if not no_stop_on_missing_metadata:
                    sys.exit(1)
            generated_urls = set()
            generated_tags: set[tuple[str, str]] = set()
            matched = False  # will be true if at least 1 filter group matched the file
            json_data = None  # this will hold the associated json metadata (if available)
            if verbose:
                printerr(f"Processing file: {path}...")
            # iterate over all filter groups, do they match this file?
            for group in jd['groups']:
                # evaluate filter, load json metadata if the filter matches and we haven't loaded it yet
                should_process = False
                try:
                    should_process = eval(group['filter'])
                except:
                    printerr(f"Failed to evaluate filter: {group['filter']}")
                    sys.exit(1)
                if not json_data and json_exists:
                    try:
                        json_data = json.load(open(json_path, encoding='utf-8-sig'))
                    except json.decoder.JSONDecodeError:
                        printerr(f"Failed to parse JSON: {json_path}")
                        sys.exit(1)
                if not should_process:
                    continue
                matched = True
                # get the data for this file from the additional_data db table and process it
                # set up some variables that user-defined expressions will be able to use
                additional_data_dicts = db.get_additional_data_for_file(path)
                if not additional_data_dicts and path.endswith('.webm'):
                    additional_data_dicts = db.get_additional_data_for_file(path[:-4] + "zip")
                extra_tags: defaultdict[str, list[str]] = defaultdict(list)
                min_time_added = -1
                max_time_added = -1
                for d in additional_data_dicts:
                    parse_additional_data(extra_tags, d['data'])
                    if min_time_added == -1 or min_time_added > d['time_added']:
                        min_time_added = d['time_added']
                    if max_time_added == -1 or max_time_added < d['time_added']:
                        max_time_added = d['time_added']
                sub_ids = []
                url_ids = []
                for d in additional_data_dicts:
                    if d['subscription_id']:
                        sub_ids.append(str(d['subscription_id']))
                    if d['url_id']:
                        url_ids.append(str(d['url_id']))
                # execute user-defined tag and url generator expressions
                has_error = False
                for dtype, d in [('tag', x) for x in group.get('tags', [])] + [('url', x) for x in group.get('urls', [])]:
                    skip_on_error = d.get("skipOnError", False)
                    allow_empty = d.get("allowEmpty", False)
                    rule_name = d.get("name")
                    generated_results = []
                    if isinstance(d["values"], str):
                        # the expression is a single string
                        try:
                            eval_res = eval(d["values"])
                            # check result type: must be string or iterable of strings
                            if isinstance(eval_res, str):
                                generated_results = [eval_res]
                            else:
                                for eval_res_str in eval_res:
                                    if not isinstance(eval_res_str, str):
                                        printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {d['values']}")
                                        sys.exit(1)
                                    else:
                                        generated_results.append(eval_res_str)
                        except Exception as e:
                            if verbose:
                                printerr(f"Failed to evaluate expression: {d['values']}")
                                print(e)
                            has_error = True
                    else:
                        # multiple expressions (array of strings)
                        for eval_expr in d["values"]:
                            try:
                                eval_res = eval(eval_expr)
                                # check result type: must be string or iterable of strings
                                if isinstance(eval_res, str):
                                    generated_results = [eval_res]
                                else:
                                    for eval_res_str in eval_res:
                                        if not isinstance(eval_res_str, str):
                                            printerr(f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {eval_expr}")
                                            sys.exit(1)
                                        else:
                                            generated_results.append(eval_res_str)
                            except Exception as e:
                                if verbose:
                                    printerr(f"Failed to evaluate expression: {eval_expr}")
                                    printerr(e)
                                has_error = True
                    # check for empty results or failed evaluation, as necessary
                    if not generated_results and not allow_empty:
                        printerr(f"Error: the rule named {rule_name} yielded no results but this is not allowed")
                        sys.exit(1)
                    if has_error:
                        printerr(f"Warning: an expression failed to evaluate in the rule named {rule_name}")
                        if not skip_on_error:
                            sys.exit(1)
                    # save results of the currently evaluated expressions
                    if dtype == 'url':
                        generated_urls.update(generated_results)
                    else:
                        for repo in d["tagRepos"]:
                            generated_tags.update((repo, tag) for tag in generated_results)
            if matched:
                printerr(f"File matched: {path}...")
                if not os.path.getsize(abspath):
                    print(f"Found truncated file: {abspath}")
                    sys.exit(1)
                if verbose:
                    printerr("Generated URLs:")
                    for url in generated_urls:
                        printerr(url)
                    printerr("Generated tags:")
                    for repo, tag in sorted(list(generated_tags), key=lambda x: x[0]):
                        printerr(f"{repo} <- {tag}")
                if verbose:
                    printerr('Hashing...')
                # calculate hash, check if Hydrus already knows the file
                already_added = False
                if do_it:
                    hasher = hashlib.sha256()
                    with open(abspath, 'rb') as hashedfile:
                        buf = hashedfile.read(65536 * 16)
                        while len(buf) > 0:
                            hasher.update(buf)
                            buf = hashedfile.read(65536 * 16)
                    hexdigest = hasher.hexdigest()
                    if client.file_metadata(hashes=[hexdigest], only_identifiers=True):
                        printerr("File is already in Hydrus")
                        already_added = True
                # send file, tags, metadata to Hydrus as needed
                if not already_added or force_add_files:
                    if verbose:
                        printerr("Sending file to Hydrus...")
                    if do_it:
                        client.add_file(abspath)
                if not already_added or force_add_metadata:
                    if verbose:
                        printerr("Associating URLs...")
                    if do_it:
                        client.associate_url(hashes=[hexdigest], add=generated_urls)
                    if verbose:
                        printerr("Adding tags...")
                    tag_dict = defaultdict(list)
                    for repo, tag in generated_tags:
                        tag_dict[repo].append(tag)
                    if do_it:
                        client.add_tags(hashes=[hexdigest], service_to_tags=tag_dict)
            else:
                if verbose:
                    printerr(f"Skipping due to no matching filter: {path}")
    log.info("hydownloader-importer", f"Finished import job: {job}")
    db.shutdown()

def report(path: str, verbose: bool) -> None:
    log.init(path, True)
    db.init(path)
    db.report(verbose)