Example 1
def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))

    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == (
        'https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org',
                                                         '/')
    assert get_host_and_path('https://example.org') == ('https://example.org',
                                                        '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org',
                                                    'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org',
                                                        'https://example.org')
    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True
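The is_known_link assertions above imply that membership is checked after stripping the protocol and any trailing slash. A minimal standalone sketch of that behaviour, reconstructed from the assertions only (not the library's actual implementation):

def is_known_link_sketch(link, known_links):
    '''Illustration: compare URLs with protocol and trailing slash removed.'''
    def normalize(url):
        return url.split('://', 1)[-1].rstrip('/')
    known = {normalize(k) for k in known_links}
    return normalize(link) in known

known_links = {'https://test.org'}
assert is_known_link_sketch('https://test.org/1', known_links) is False
assert is_known_link_sketch('http://test.org/', known_links) is True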
Example 2
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain and make an exception for main platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(
                        newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname,
                                   newdomain)
                else:
                    if re.search(r'\.xml$|\.xml[.?#]', link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state
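The sitemap/link decision above hinges on the regular expression r'\.xml$|\.xml[.?#]'. A quick standalone check of how it classifies a few hypothetical URLs:

import re

SITEMAP_RE = re.compile(r'\.xml$|\.xml[.?#]')

# hypothetical URLs, for illustration only
for candidate in ('https://example.org/sitemap.xml',
                  'https://example.org/sitemap.xml?page=2',
                  'https://example.org/post.html'):
    state = 'sitemap' if SITEMAP_RE.search(candidate) else 'link'
    print(candidate, '->', state)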
Example 3
def sitemap_search(url, target_lang=None):
    'Look for sitemaps for the given URL and gather links.'
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url, baseurl):
            tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurl, domainname, baseurl, target_lang)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    while sitemapurls:
        tmp_sitemapurls, tmp_linklist = process_sitemap(sitemapurls.pop(), domainname, baseurl, target_lang)
        sitemapurls.extend(tmp_sitemapurls)
        linklist.extend(tmp_linklist)
    LOGGER.debug('%s links found for %s', len(linklist), domainname)
    return linklist
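The while loop above is a plain worklist traversal: pop a sitemap URL, process it, and push back any nested sitemap URLs it references. A self-contained sketch of the same pattern with a stub processor (hypothetical data, not the module's process_sitemap):

def process_sitemap_stub(sitemapurl):
    '''Stand-in returning (nested sitemap URLs, page links) for two fake sitemaps.'''
    nested = {'https://example.org/sitemap.xml': ['https://example.org/sitemap-posts.xml']}
    links = {'https://example.org/sitemap-posts.xml': ['https://example.org/post-1']}
    return nested.get(sitemapurl, []), links.get(sitemapurl, [])

sitemapurls, linklist = process_sitemap_stub('https://example.org/sitemap.xml')
while sitemapurls:
    tmp_sitemapurls, tmp_linklist = process_sitemap_stub(sitemapurls.pop())
    sitemapurls.extend(tmp_sitemapurls)
    linklist.extend(tmp_linklist)
print(linklist)  # ['https://example.org/post-1']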
Example 4
def draw_backoff_url(domain_dict, backoff_dict, sleeptime, i):
    '''Select a random URL from the domains pool and apply backoff rule'''
    host = random.choice(list(domain_dict))
    domain = extract_domain(host)
    # safeguard
    if domain in backoff_dict and \
        (datetime.now() - backoff_dict[domain]).total_seconds() < sleeptime:
        i += 1
        if i >= len(domain_dict)*3:
            LOGGER.debug('spacing request for domain name %s', domain)
            sleep(sleeptime)
            i = 0
    # draw URL
    url = host + domain_dict[host].pop()
    # clean registries
    if not domain_dict[host]:
        del domain_dict[host]
        try:
            del backoff_dict[domain]
        except KeyError:
            pass
    # register backoff
    else:
        backoff_dict[domain] = datetime.now()
    return url, domain_dict, backoff_dict, i
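From the way draw_backoff_url indexes its arguments, domain_dict appears to map a host URL to a list of paths still to fetch, while backoff_dict maps a domain name to the time of the last request. A minimal sketch of those shapes and of the elapsed-time check, using hypothetical data:

from datetime import datetime

# hypothetical registries, mirroring the shapes used above
domain_dict = {'https://example.org': ['/page1', '/page2']}
backoff_dict = {'example.org': datetime.now()}
sleeptime = 5

elapsed = (datetime.now() - backoff_dict['example.org']).total_seconds()
if elapsed < sleeptime:
    print('too early, spacing out requests to example.org')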
Example 5
def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Homepage or feed URL as string.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as list (sorted list of unique links).

    """
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    downloaded = fetch_url(url)
    if downloaded is None:
        LOGGER.warning('Could not download web page: %s', url)
        return []
    # assume it's a web page
    feed_links = []
    for feed in determine_feed(downloaded, baseurl, url):
        sleep(SLEEP_TIME)
        feed_string = fetch_url(feed)
        feed_links.extend(
            extract_links(feed_string, domainname, baseurl, url, target_lang))
    feed_links = sorted(list(set(feed_links)))
    LOGGER.debug('%s feed links found for %s', len(feed_links), domainname)
    return feed_links
Example 6
def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # initialize file counter if necessary (before the input list is consumed)
    if len(input_urls) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    # build domain-aware processing list
    domain_dict = dict()
    while input_urls:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    if len(domain_dict) <= 5:
        backoff_dict = dict()
        single_threaded_processing(domain_dict, backoff_dict, args, sleeptime,
                                   counter)
    else:
        multi_threaded_processing(domain_dict, args, sleeptime, counter)
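The domain-aware list built above is simply a grouping of the input URLs by domain. A compact standalone sketch of the same grouping, with urllib.parse standing in for extract_domain (the real function returns a registered domain, not just the netloc):

from collections import defaultdict
from urllib.parse import urlparse

input_urls = ['https://example.org/1', 'https://example.org/2', 'https://other.org/1']
domain_dict = defaultdict(list)
for url in input_urls:
    domain_dict[urlparse(url).netloc].append(url)
print(dict(domain_dict))
# {'example.org': ['https://example.org/1', 'https://example.org/2'], 'other.org': ['https://other.org/1']}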
Example 7
def test_urlutils():
    '''Test URL manipulation tools'''
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    assert get_base_url('https://example.org/path') == 'https://example.org'
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
Example 8
def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
             (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    urlfilter = None
    downloaded = fetch_url(url)
    if downloaded is not None:
        # assume it's a feed
        feed_links = extract_links(downloaded, domainname, baseurl, url,
                                   target_lang)
        if len(feed_links) == 0:
            # assume it's a web page
            for feed in determine_feed(downloaded, baseurl, url):
                feed_string = fetch_url(feed)
                feed_links.extend(
                    extract_links(feed_string, domainname, baseurl, url,
                                  target_lang))
            # filter triggered, prepare it
            if len(url) > len(baseurl) + 2:
                urlfilter = url
        # return links found
        if len(feed_links) > 0:
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links),
                         domainname)
            return feed_links
    else:
        LOGGER.warning('Could not download web page: %s', url)
    # try alternative: Google News
    if target_lang is not None:
        url = 'https://news.google.com/rss/search?q=site:' + baseurl + '&hl=' + target_lang + '&scoring=n&num=100'
        downloaded = fetch_url(url)
        if downloaded is not None:
            feed_links = extract_links(downloaded, domainname, baseurl, url,
                                       target_lang)
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links),
                         domainname)
            return feed_links
        LOGGER.warning('Could not download web page: %s', url)
    return []
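The Google News fallback builds an RSS search URL restricted to the site and the target language. For hypothetical inputs, the constructed query looks like this:

# hypothetical values, for illustration only
baseurl, target_lang = 'https://example.org', 'en'
feed_url = ('https://news.google.com/rss/search?q=site:' + baseurl +
            '&hl=' + target_lang + '&scoring=n&num=100')
print(feed_url)
# https://news.google.com/rss/search?q=site:https://example.org&hl=en&scoring=n&num=100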
Example 9
def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
             (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    urlfilter = None
    # determine sitemap URL
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = baseurl + '/sitemap.xml'
        # filter triggered, prepare it
        if len(url) > len(baseurl) + 2:
            urlfilter = url
    sitemapurls, linklist = download_and_process_sitemap(
        sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        linklist = filter_urls(linklist, urlfilter)
        LOGGER.debug('%s sitemap links found for %s', len(linklist),
                     domainname)
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(baseurl)
        # try additional URLs just in case
        if sitemapurls == []:
            sitemapurls = [''.join([baseurl, '/', g]) for g in GUESSES]
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurls, linklist = download_and_process_sitemap(
            sitemapurls.pop(), domainname, baseurl, target_lang, sitemapurls,
            linklist)
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = filter_urls(linklist, urlfilter)
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist
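The URL filter is only prepared when the input URL is noticeably longer than the base URL (len(url) > len(baseurl) + 2), i.e. when the caller passed a subpage rather than a homepage. A small illustration with hypothetical inputs:

# hypothetical inputs, for illustration only
for url, baseurl in (('https://example.org', 'https://example.org'),
                     ('https://example.org/category/news', 'https://example.org')):
    urlfilter = url if len(url) > len(baseurl) + 2 else None
    print(url, '->', urlfilter)
# https://example.org -> None
# https://example.org/category/news -> https://example.org/category/news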
Example 10
def multi_threaded_processing(domain_dict, args, sleeptime, counter):
    '''Implement a multi-threaded processing algorithm'''
    i = 0
    backoff_dict = dict()
    download_threads = args.parallel or DOWNLOAD_THREADS
    while domain_dict:
        # the remaining list is too small, process it differently
        if len({x for v in domain_dict.values() for x in v}) < download_threads:
            single_threaded_processing(domain_dict, backoff_dict, args,
                                       sleeptime, counter)
            return
        # populate buffer
        bufferlist, bufferdomains = list(), set()
        while len(bufferlist) < download_threads:
            domain = random.choice(list(domain_dict))
            if domain not in backoff_dict or \
            (datetime.now() - backoff_dict[domain]).total_seconds() > sleeptime:
                bufferlist.append(domain_dict[domain].pop())
                bufferdomains.add(domain)
                backoff_dict[domain] = datetime.now()
            # safeguard
            else:
                i += 1
                if i > len(domain_dict) * 3:
                    LOGGER.debug('spacing request for domain name %s', domain)
                    sleep(sleeptime)
                    i = 0
        # start several threads
        with ThreadPoolExecutor(max_workers=download_threads) as executor:
            future_to_url = {
                executor.submit(fetch_url, url): url
                for url in bufferlist
            }
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                # register in backoff dictionary to ensure time between requests
                domain = extract_domain(url)
                backoff_dict[domain] = datetime.now()
                # handle result
                counter = process_result(future.result(), args, url, counter)
        # clean registries
        for domain in bufferdomains:
            if not domain_dict[domain]:
                del domain_dict[domain]
                del backoff_dict[domain]
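The concurrency step relies on the standard concurrent.futures pattern: submit one fetch per buffered URL and handle results as they complete. A self-contained sketch with a stub fetch function (the real code uses the module's own fetch_url and process_result):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_stub(url):
    '''Stand-in for fetch_url(), for illustration only.'''
    return 'content of ' + url

bufferlist = ['https://example.org/a', 'https://example.org/b']
with ThreadPoolExecutor(max_workers=2) as executor:
    future_to_url = {executor.submit(fetch_stub, url): url for url in bufferlist}
    for future in as_completed(future_to_url):
        url = future_to_url[future]
        print(url, '->', future.result())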
Example 11
def find_feed_urls(url, target_lang=None):
    '''Try to find feed URLs'''
    url = url.rstrip('/')
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    downloaded = fetch_url(url)
    if downloaded is None:
        LOGGER.warning('Could not download web page: %s', url)
        return []
    # assume it's a web page
    feed_links = []
    for feed in determine_feed(downloaded, baseurl, url):
        sleep(SLEEP_TIME)
        feed_string = fetch_url(feed)
        feed_links.extend(
            extract_links(feed_string, domainname, baseurl, url, target_lang))
    return sorted(list(set(feed_links)))
Example 12
def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Homepage or sitemap URL as string.
        target_lang: Define a language to filter URLs based on heuristics
            (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as list (sorted list of unique links).

    """
    domainname, hostmatch = extract_domain(url), HOSTINFO.match(url)
    if domainname is None or hostmatch is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    baseurl = hostmatch.group(0)
    if url.endswith('.xml') or url.endswith('.gz') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = download_and_process_sitemap(
        sitemapurl, domainname, baseurl, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(url, baseurl)
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurls, linklist = download_and_process_sitemap(
            sitemapurls.pop(), domainname, baseurl, target_lang, sitemapurls,
            linklist)
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = sorted(list(set(linklist)))
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist
Example 13
def url_processing_pipeline(args, input_urls, sleeptime):
    '''Aggregated functions to show a list and download and process an input list'''
    input_urls = url_processing_checks(args.blacklist, input_urls)
    # print list without further processing
    if args.list:
        for url in input_urls:
            write_result(url, args)  # print('\n'.join(input_urls))
        return None
    # initialize file counter if necessary (before the input list is consumed)
    if len(input_urls) > MAX_FILES_PER_DIRECTORY:
        counter = 0
    else:
        counter = None
    # build domain-aware processing list
    domain_dict = dict()
    while input_urls:
        url = input_urls.pop()
        domain_name = extract_domain(url)
        if domain_name not in domain_dict:
            domain_dict[domain_name] = []
        domain_dict[domain_name].append(url)
    if len(domain_dict) <= 5:
        errors, counter = single_threaded_processing(domain_dict, dict(), args,
                                                     sleeptime, counter)
    else:
        errors, counter = multi_threaded_processing(domain_dict, args,
                                                    sleeptime, counter)
    LOGGER.debug('%s URLs could not be found', len(errors))
    # option to retry
    if args.archived is True:
        domain_dict = dict()
        domain_dict['archive.org'] = [
            'https://web.archive.org/web/20/' + e for e in errors
        ]
        archived_errors, _ = single_threaded_processing(
            domain_dict, dict(), args, sleeptime, counter)
        LOGGER.debug('%s archived URLs out of %s could not be found',
                     len(archived_errors), len(errors))
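The retry option simply prefixes every failed URL with the Wayback Machine endpoint. For a hypothetical failed URL:

# hypothetical failed URL, for illustration only
errors = ['https://example.org/missing']
retry_urls = ['https://web.archive.org/web/20/' + e for e in errors]
print(retry_urls[0])  # https://web.archive.org/web/20/https://example.org/missing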
Example 14
def sitemap_search(url, target_lang=None):
    'Look for sitemaps for the given URL and gather links.'
    domain = extract_domain(url)
    if url.endswith('.xml') or url.endswith('sitemap'):
        sitemapurl = url
    else:
        sitemapurl = url.rstrip('/') + '/sitemap.xml'
    sitemapurls, linklist = process_sitemap(sitemapurl, domain, target_lang)
    if sitemapurls == [] and len(linklist) > 0:
        return linklist
    if sitemapurls == [] and linklist == []:
        for sitemapurl in find_robots_sitemaps(url):
            tmp_sitemapurls, tmp_linklist = process_sitemap(
                sitemapurl, domain, target_lang)
            sitemapurls.extend(tmp_sitemapurls)
            linklist.extend(tmp_linklist)
    while sitemapurls:
        tmp_sitemapurls, tmp_linklist = process_sitemap(
            sitemapurls.pop(), domain, target_lang)
        sitemapurls.extend(tmp_sitemapurls)
        linklist.extend(tmp_linklist)
    LOGGER.debug('%s links found for %s', len(linklist), domain)
    return linklist
Example 15
def handle_link(link, sitemapurl, domainname, baseurl, target_lang=None):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname,
                               newdomain)
            else:
                if re.search(r'\.xml$|\.xml[.?#]', link):
                    state = 'sitemap'
                else:
                    state = 'link'
    return link, state
Example 16
def get_hostinfo(url):
    """Extract domain and host info (protocol + host/domain) from a URL."""
    domainname = extract_domain(url)
    parsed_url = urlparse(url)
    host = parsed_url._replace(path='', params='', query='', fragment='')
    return domainname, host.geturl()
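Given the outputs asserted in the tests above, the host reconstruction can be exercised directly with urllib.parse. A minimal standalone sketch, with a simplistic netloc-based stand-in for extract_domain (the real function resolves the registered domain):

from urllib.parse import urlparse

def get_hostinfo_sketch(url):
    '''Illustration of the same host reconstruction with a naive domain extraction.'''
    parsed_url = urlparse(url)
    domainname = parsed_url.netloc.removeprefix('www.')  # naive stand-in for extract_domain()
    host = parsed_url._replace(path='', params='', query='', fragment='')
    return domainname, host.geturl()

assert get_hostinfo_sketch('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
assert get_hostinfo_sketch('https://example.org/path') == ('example.org', 'https://example.org')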