Example 1
def anchor_patterns_from_url(url: str) -> list[str]:
    """
    Scans a URL (usually taken from a Hydrus database) and generates
    gallery-dl anchors in the format that hydownloader uses.
    If multiple anchors can be generated from a single post URL (e.g. when
    the post can produce multiple files, as on pixiv), then some of them may
    end with _%, where % acts as a wildcard matching any number of characters.
    Such patterns can be used to match all of the anchors belonging to the given post URL.
    The basic, non-wildcard anchor should always be the first entry in the result list.
    See the pixiv patterns for an example.

    hydownloader anchor pattern examples for supported sites:
    pixiv: pixiv88847570, pixiv88536044_p00, ..., pixiv88536044_p117
    gelbooru: gelbooru5994487
    danbooru: danbooru4442363
    lolibooru.moe: lolibooru178123
    3dbooru: 3dbooru52352
    artstation: artstation9322141 (difficult, extracted from URL components)
    sankaku: sankaku24860317
    idolcomplex: idolcomplex752647
    twitter: twitter1375563339296768001_1
    deviantart: deviantart873044835
    patreon: patreon48042243_1
    nijie: nijie306993_0, nijie306993_1
    tumblr: tumblr188243485974
    fantia: {post_id}_{file_id}
    fanbox: {id}_{num} (num starts at 1)
    See also gallery-dl-config.json.
    """
    u = uri_normalizer.normalizes(url)

    if m := re.match(
            r"https?://gelbooru\.com/index\.php\?(page=post&)?(s=view&)?id=(?P<id>[0-9]+)(&.*)?",
            u):
        return [f"gelbooru{m.group('id')}"]
Example 2
# assumes: from gallery_dl import extractor  (plus uri_normalizer, as in the other excerpts)
def downloader_for_url(url: str) -> str:
    """
    Returns the name of the downloader that gallery-dl would use for the given URL.
    Returns an empty string if gallery-dl does not recognize the URL.
    """
    u = uri_normalizer.normalizes(url)
    if match := extractor.find(u):
        return match.category
    return ""  # the docstring promises an empty string for unrecognized URLs
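Usage sketch (URLs illustrative; 'gelbooru' is gallery-dl's category name for gelbooru.com):

>>> downloader_for_url("https://gelbooru.com/index.php?page=post&s=view&id=5994487")
'gelbooru'
>>> downloader_for_url("https://example.com/plain-page")
''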
Example 3
def check_anchor_for_url(url: str) -> bool:
    """
    Checks whether the given file(s) represented by this URL are present
    in the anchor database.
    """
    u = uri_normalizer.normalizes(url)
    patterns = urls.anchor_patterns_from_url(u)
    return check_db_for_anchors(patterns)
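check_db_for_anchors is not shown in these excerpts. Because the _% suffix in wildcard anchors matches SQL LIKE semantics, a minimal sketch could query gallery-dl's anchor database directly (assuming the archive(entry) table layout gallery-dl uses for its download archive; the real implementation may differ):

import sqlite3

def check_db_for_anchors(patterns: list[str]) -> bool:
    # Sketch only; hydownloader's actual anchor lookup may differ.
    conn = sqlite3.connect("anchor.db")  # hypothetical database path
    c = conn.cursor()
    for pattern in patterns:
        # % in the pattern matches any run of characters; LIKE also treats _
        # as a single-character wildcard, a harmless side effect for anchors
        # like pixiv88536044_p%.
        c.execute("select 1 from archive where entry like ? limit 1", (pattern,))
        if c.fetchone():
            return True
    return False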
Example 4
def route_url_history_info() -> str:
    check_access()
    result = []
    for url in bottle.request.json['urls']:
        url_info = {'url': url, 'normalized_url': uri_normalizer.normalizes(url)}
        url_info['queue_info'] = db.check_single_queue_for_url(url)
        url_info['anchor_info'] = gallery_dl_utils.check_anchor_for_url(url)
        result.append(url_info)
    return json.dumps(result)
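A sketch of how this route might be called from a client (the port and endpoint path are assumptions derived from the handler name, not taken from hydownloader's documentation):

import requests  # hypothetical client-side example

resp = requests.post("http://localhost:53211/url_history_info",  # path assumed
                     json={"urls": ["https://gelbooru.com/index.php?page=post&s=view&id=5994487"]})
# Each entry in the returned JSON list mirrors the dict built above:
# {"url": ..., "normalized_url": ..., "queue_info": [...], "anchor_info": true/false}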
Example 5
def urls_for_known_url_lookup(url: str) -> set[str]:
    """
    Takes a raw URL and generates variants that are suitable for
    lookup in the known_urls database table (to find equivalent versions of the input URL that were already downloaded).
    """
    result = {url, uri_normalizer.normalizes(url)}

    # new URL variants are generated using the replacement patterns defined above
    # repeat the process until there are no new URLs generated
    while True:
        new_urls = set()
        for u in result:
            for (repl_from, repl_to) in known_url_replacements:
                replaced = re.sub(repl_from, repl_to, u)
                if replaced not in result:
                    new_urls.add(replaced)
        result.update(new_urls)
        if not new_urls: break

    # alphabetize query params
    new_urls = set()
    for u in result:
        spliturl = urllib.parse.urlsplit(u)
        sortedquery = urllib.parse.urlencode(
            sorted(
                urllib.parse.parse_qsl(spliturl.query,
                                       keep_blank_values=True)))
        finalurl = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path, sortedquery,
             spliturl.fragment))
        new_urls.add(finalurl)
        # variants with the utm_* tracking parameters removed
        sortedquery_no_utm = urllib.parse.urlencode(
            list(
                filter(
                    lambda x: not x[0].startswith("utm_"),
                    sorted(
                        urllib.parse.parse_qsl(spliturl.query,
                                               keep_blank_values=True)))))
        finalurl_no_utm_sorted = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path,
             sortedquery_no_utm, spliturl.fragment))
        query_no_utm = urllib.parse.urlencode(
            list(
                filter(
                    lambda x: not x[0].startswith("utm_"),
                    urllib.parse.parse_qsl(spliturl.query,
                                           keep_blank_values=True))))
        finalurl_no_utm = urllib.parse.urlunsplit(
            (spliturl.scheme, spliturl.netloc, spliturl.path, query_no_utm,
             spliturl.fragment))
        new_urls.add(finalurl_no_utm)
        new_urls.add(finalurl_no_utm_sorted)
    result.update(new_urls)

    return result
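A worked example of the query-string handling (the URL is invented; this assumes no known_url_replacements patterns fire and that the normalizer leaves the URL unchanged):

>>> sorted(urls_for_known_url_lookup("https://example.com/post?b=2&utm_source=feed&a=1"))
['https://example.com/post?a=1&b=2',                  # utm_* removed, params sorted
 'https://example.com/post?a=1&b=2&utm_source=feed',  # params sorted
 'https://example.com/post?b=2&a=1',                  # utm_* removed, original order
 'https://example.com/post?b=2&utm_source=feed&a=1']  # the input itself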
Example 6
def subscription_data_from_url(url: str) -> list[tuple[str, str]]:
    """
    This function tries to recognize gallery URLs and extract a hydownloader
    downloader name plus search keywords from them, to be used as a subscription.
    In Hydrus terms, this is the reverse of what a GUG (gallery URL generator) does.
    """
    u = uri_normalizer.normalizes(url)

    if m := re.match(
            r"https?://gelbooru\.com/index\.php\?page=post&s=list&tags=(?P<keywords>[^&]+)(&.*)?",
            u):
        # return a list of pairs: route_check_urls (Example 7) iterates the result
        return [('gelbooru', m.group('keywords').lower())]

    # ... patterns for the other supported sites are omitted from this excerpt ...
    # Assumed fallback for URLs not recognized as gallery URLs:
    return []
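Usage sketch (the tag value is illustrative):

>>> subscription_data_from_url("https://gelbooru.com/index.php?page=post&s=list&tags=landscape")
[('gelbooru', 'landscape')]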
Example 7
def route_check_urls() -> str:
    check_access()
    result : list[dict] = []
    for url in bottle.request.json['urls']:
        url_info = {'url': url, 'normalized_url': uri_normalizer.normalizes(url)}
        url_info['downloader'] = gallery_dl_utils.downloader_for_url(url)
        sub_data = urls.subscription_data_from_url(url)
        existing_subs = []
        for sub in sub_data:
            existing_subs += db.get_subscriptions_by_downloader_data(sub[0], sub[1])
        url_info['existing_subscriptions'] = existing_subs
        result.append(url_info)  # without this, the response would always be empty
    return json.dumps(result)
Example 8
def add_or_update_urls(url_data: list[dict]) -> bool:
    for item in url_data:
        add = "id" not in item
        if add and not "url" in item: continue
        if add: item["time_added"] = time.time()
        if 'url' in item: item['url'] = uri_normalizer.normalizes(item['url'])
        upsert_dict("single_url_queue", item)
        if add:
            log.info("hydownloader", f"Added URL: {item['url']}")
        else:
            log.info("hydownloader", f"Updated URL with ID {item['id']}")
    return True
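upsert_dict is defined elsewhere in hydownloader. A minimal sketch of the insert-or-update behavior the call site implies, reusing get_conn() from the other excerpts (the real implementation may differ):

def upsert_dict(table: str, d: dict) -> None:
    # Sketch only; assumes table and column names come from trusted internal code.
    c = get_conn().cursor()
    if "id" in d:  # update an existing row by primary key
        assignments = ", ".join(f"{k} = ?" for k in d if k != "id")
        c.execute(f"update {table} set {assignments} where id = ?",
                  [v for k, v in d.items() if k != "id"] + [d["id"]])
    else:  # insert a new row and let sqlite assign the id
        columns = ", ".join(d.keys())
        placeholders = ", ".join("?" for _ in d)
        c.execute(f"insert into {table} ({columns}) values ({placeholders})",
                  list(d.values()))
    get_conn().commit()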
Example 9
def check_single_queue_for_url(url: str) -> list[dict]:
    check_init()
    c = get_conn().cursor()
    url = uri_normalizer.normalizes(url)
    c.execute('select * from single_url_queue where url = ?', (url,))
    return c.fetchall()
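The list[dict] return type implies that get_conn() installs a dict-producing row factory on the sqlite connection; a standard sketch of that setup (hydownloader's actual connection code may differ):

import sqlite3

def dict_factory(cursor: sqlite3.Cursor, row: tuple) -> dict:
    # Map each column name from the cursor description to its value.
    return {desc[0]: value for desc, value in zip(cursor.description, row)}

conn = sqlite3.connect("hydownloader.db")  # hypothetical path
conn.row_factory = dict_factory  # cursor.fetchall() now yields dicts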