# stdlib and third-party imports used by the functions below; hydownloader-internal
# helpers (uri_normalizer, db, log, urls, gallery_dl_utils, known_url_replacements,
# check_db_for_anchors, upsert_dict, get_conn, check_init, check_access) come from
# the package's own modules
import json
import re
import time
import urllib.parse

import bottle
from gallery_dl import extractor

def anchor_patterns_from_url(url: str) -> list[str]:
    """
    This function scans a URL (usually taken from a Hydrus database) and generates
    gallery-dl anchors (in the format that hydownloader uses).
    If a single post URL can yield multiple anchors (e.g. it can produce multiple
    files, like pixiv), then some of them might end with _% where % functions as a
    wildcard matching any number of characters. This can be used to match all of
    the anchors belonging to the given post URL.
    The basic, non-wildcard anchor should always be returned as the first entry
    in the result list. Check the pixiv patterns for an example.

    hydownloader anchor pattern examples for supported sites:
        pixiv: pixiv88847570, pixiv88536044_p00, ..., pixiv88536044_p117
        gelbooru: gelbooru5994487
        danbooru: danbooru4442363
        lolibooru.moe: lolibooru178123
        3dbooru: 3dbooru52352
        artstation: artstation9322141 (difficult, extracted from URL components)
        sankaku: sankaku24860317
        idolcomplex: idolcomplex752647
        twitter: twitter1375563339296768001_1
        deviantart: deviantart873044835
        patreon: patreon48042243_1
        nijie: nijie306993_0, nijie306993_1
        tumblr: tumblr188243485974
        fantia: {post_id}_{file_id}
        fanbox: {id}_{num} (num starts at 1)

    See also gallery-dl-config.json.
    """
    u = uri_normalizer.normalizes(url)
    if m := re.match(r"https?://gelbooru\.com/index\.php\?(page=post&)?(s=view&)?id=(?P<id>[0-9]+)(&.*)?", u):
        return [f"gelbooru{m.group('id')}"]
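# The docstring above describes pixiv anchors that may end in a "_%" wildcard.
# A hedged sketch of what such a branch could look like (the URL regex and the
# exact wildcard suffix are illustrative assumptions, not hydownloader's actual
# pixiv handling):
def _pixiv_anchor_sketch(u: str) -> list[str]:
    if m := re.match(r"https?://(www\.)?pixiv\.net/(en/)?artworks/(?P<id>[0-9]+)", u):
        # basic anchor first, then the wildcard variant that matches all
        # per-page anchors like pixiv88536044_p00 ... pixiv88536044_p117
        return [f"pixiv{m.group('id')}", f"pixiv{m.group('id')}_%"]
    return []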
def downloader_for_url(url: str) -> str:
    """
    Returns the name of the downloader that gallery-dl would use for the given URL.
    Returns an empty string if gallery-dl does not recognize the URL.
    """
    u = uri_normalizer.normalizes(url)
    if match := extractor.find(u):
        return match.category
    return ""
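# Usage sketch: gallery-dl's extractor.find() returns an extractor instance
# (or None) whose .category names the site, so (illustrative values):
#   downloader_for_url("https://gelbooru.com/index.php?page=post&s=view&id=5994487")
#     -> "gelbooru"
#   downloader_for_url("https://example.com/not-a-gallery") -> ""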
def check_anchor_for_url(url: str) -> bool:
    """
    Checks whether the file(s) represented by the given URL are present in the
    anchor database.
    """
    u = uri_normalizer.normalizes(url)
    patterns = urls.anchor_patterns_from_url(u)
    return check_db_for_anchors(patterns)
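# Hedged sketch of what check_db_for_anchors might do (an assumption, not the
# actual implementation): gallery-dl keeps anchors in an "archive" table with a
# single "entry" column, and the "_%"-suffixed patterns map naturally onto SQL
# LIKE. get_anchor_conn() is a hypothetical accessor for that database.
def _check_db_for_anchors_sketch(patterns: list[str]) -> bool:
    c = get_anchor_conn().cursor()  # hypothetical connection accessor
    for pattern in patterns:
        if pattern.endswith("_%"):
            # LIKE treats % as "any sequence of characters", matching the
            # wildcard convention described in anchor_patterns_from_url
            c.execute("select 1 from archive where entry like ? limit 1", (pattern,))
        else:
            c.execute("select 1 from archive where entry = ? limit 1", (pattern,))
        if c.fetchone():
            return True
    return False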
def route_url_history_info() -> str:
    check_access()
    result = []
    for url in bottle.request.json['urls']:
        url_info = {'url': url, 'normalized_url': uri_normalizer.normalizes(url)}
        url_info['queue_info'] = db.check_single_queue_for_url(url)
        url_info['anchor_info'] = gallery_dl_utils.check_anchor_for_url(url)
        result.append(url_info)
    return json.dumps(result)
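# Request/response shape for the route above (follows directly from the code;
# the endpoint path is bound by the bottle route decorator, not shown here):
#   request body:  {"urls": ["https://gelbooru.com/index.php?page=post&s=view&id=5994487"]}
#   response body: [{"url": "...", "normalized_url": "...",
#                    "queue_info": [...], "anchor_info": true}]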
def urls_for_known_url_lookup(url: str) -> set[str]:
    """
    Takes a raw URL and generates variants that are suitable for lookup in the
    known_urls database table (to find equivalent versions of the input URL
    that were already downloaded).
    """
    result = {url, uri_normalizer.normalizes(url)}

    # new URL variants are generated using the replacement patterns defined above
    # repeat the process until there are no new URLs generated
    while True:
        new_urls = set()
        for u in result:
            for (repl_from, repl_to) in known_url_replacements:
                replaced = re.sub(repl_from, repl_to, u)
                if replaced not in result:
                    new_urls.add(replaced)
        result.update(new_urls)
        if not new_urls:
            break

    # alphabetize query params
    new_urls = set()
    for u in result:
        spliturl = urllib.parse.urlsplit(u)
        sortedquery = urllib.parse.urlencode(
            sorted(urllib.parse.parse_qsl(spliturl.query, keep_blank_values=True)))
        finalurl = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path, sortedquery, spliturl.fragment))
        new_urls.add(finalurl)

        # variants with utm_* tracking parameters removed
        sortedquery_no_utm = urllib.parse.urlencode(
            list(filter(lambda x: not x[0].startswith("utm_"),
                        sorted(urllib.parse.parse_qsl(spliturl.query, keep_blank_values=True)))))
        finalurl_no_utm_sorted = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path, sortedquery_no_utm, spliturl.fragment))
        query_no_utm = urllib.parse.urlencode(
            list(filter(lambda x: not x[0].startswith("utm_"),
                        urllib.parse.parse_qsl(spliturl.query, keep_blank_values=True))))
        finalurl_no_utm = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path, query_no_utm, spliturl.fragment))
        new_urls.add(finalurl_no_utm)
        new_urls.add(finalurl_no_utm_sorted)
    result.update(new_urls)
    return result
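# Worked example (illustrative input): for
#   https://example.com/post?b=2&utm_source=feed&a=1
# the result set contains, besides any replacement-pattern variants:
#   https://example.com/post?b=2&utm_source=feed&a=1   (raw/normalized URL)
#   https://example.com/post?a=1&b=2&utm_source=feed   (query params alphabetized)
#   https://example.com/post?b=2&a=1                   (utm_* removed, original order)
#   https://example.com/post?a=1&b=2                   (utm_* removed, alphabetized)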
def subscription_data_from_url(url: str) -> list[tuple[str, str]]:
    """
    This function tries to recognize gallery URLs and generate a hydownloader
    downloader name and some keywords from them to be used as a subscription.
    In Hydrus terms, this is the reverse of what a GUG (gallery URL generator) does.
    """
    u = uri_normalizer.normalizes(url)
    if m := re.match(r"https?://gelbooru\.com/index\.php\?page=post&s=list&tags=(?P<keywords>[^&]+)(&.*)?", u):
        return [('gelbooru', m.group('keywords').lower())]
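# Usage sketch (illustrative tag): a gelbooru tag search URL yields the
# downloader name plus the lowercased keywords, ready to be stored as a
# subscription:
#   subscription_data_from_url("https://gelbooru.com/index.php?page=post&s=list&tags=blue_sky")
#     -> [('gelbooru', 'blue_sky')]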
def route_check_urls() -> str:
    check_access()
    result: list[dict] = []
    for url in bottle.request.json['urls']:
        url_info = {'url': url, 'normalized_url': uri_normalizer.normalizes(url)}
        url_info['downloader'] = gallery_dl_utils.downloader_for_url(url)
        sub_data = urls.subscription_data_from_url(url)
        existing_subs = []
        for sub in sub_data:
            existing_subs += db.get_subscriptions_by_downloader_data(sub[0], sub[1])
        url_info['existing_subscriptions'] = existing_subs
        result.append(url_info)
    return json.dumps(result)
def add_or_update_urls(url_data: list[dict]) -> bool:
    for item in url_data:
        add = "id" not in item
        if add and "url" not in item:
            continue
        if add:
            item["time_added"] = time.time()
        if 'url' in item:
            item['url'] = uri_normalizer.normalizes(item['url'])
        upsert_dict("single_url_queue", item)
        if add:
            log.info("hydownloader", f"Added URL: {item['url']}")
        else:
            log.info("hydownloader", f"Updated URL with ID {item['id']}")
    return True
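# Usage sketch: items without an "id" are treated as new queue entries and must
# carry a "url"; items with an "id" update an existing row. The "paused" field
# below is an illustrative assumption, not a documented column:
#   add_or_update_urls([{"url": "https://gelbooru.com/index.php?page=post&s=view&id=1"}])
#   add_or_update_urls([{"id": 42, "paused": 1}])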
def check_single_queue_for_url(url: str) -> list[dict]:
    check_init()
    c = get_conn().cursor()
    url = uri_normalizer.normalizes(url)
    c.execute('select * from single_url_queue where url = ?', (url,))
    return c.fetchall()
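# The list[dict] return type assumes the connection's row_factory maps rows to
# dicts. A minimal sketch of such a factory (an assumption; the real setup
# lives wherever get_conn() creates the connection):
def _dict_row_factory(cursor, row):
    # pair each column name from cursor.description with its value in the row
    return {col[0]: value for col, value in zip(cursor.description, row)}
# e.g. conn.row_factory = _dict_row_factory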