Example #1
def test_urlutils():
    '''Test URL manipulation tools'''
    # domain extraction
    assert extract_domain('h') is None
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    # url parsing
    result = _parse('https://httpbin.org/')
    assert isinstance(result, ParseResult)
    newresult = _parse(result)
    assert isinstance(newresult, ParseResult)
    with pytest.raises(TypeError):
        result = _parse(float(1.23))

    assert get_base_url('https://example.org/path') == 'https://example.org'
    with pytest.raises(ValueError):
        assert get_host_and_path('123') is None
    assert get_host_and_path('https://example.org/path') == (
        'https://example.org', '/path')
    assert get_host_and_path('https://example.org/') == ('https://example.org',
                                                         '/')
    assert get_host_and_path('https://example.org') == ('https://example.org',
                                                        '/')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org',
                                                    'https://httpbin.org')
    assert get_hostinfo('https://example.org/path') == ('example.org',
                                                        'https://example.org')
    # keeping track of known URLs
    known_links = {'https://test.org'}
    assert is_known_link('https://test.org/1', known_links) is False
    assert is_known_link('https://test.org', known_links) is True
    assert is_known_link('http://test.org', known_links) is True
    assert is_known_link('http://test.org/', known_links) is True
    assert is_known_link('https://test.org/', known_links) is True
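The same helpers can be exercised interactively. A minimal sketch, assuming the functions are exposed by the courlan package (the import path is an assumption; the expected values mirror the assertions above):

# minimal sketch; import path assumed (courlan), expected values taken from the test above
from courlan import extract_domain, get_base_url, get_host_and_path, get_hostinfo

url = 'https://example.org/path'
print(extract_domain('https://httpbin.org/'))  # 'httpbin.org'
print(get_base_url(url))                       # 'https://example.org'
print(get_host_and_path(url))                  # ('https://example.org', '/path')
print(get_hostinfo(url))                       # ('example.org', 'https://example.org')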
Example #2
def test_urlutils():
    '''Test URL manipulation tools'''
    assert extract_domain('https://httpbin.org/') == 'httpbin.org'
    assert get_base_url('https://example.org/path') == 'https://example.org'
    assert get_host_and_path('https://example.org/path') == ('https://example.org', '/path')
    assert get_hostinfo('https://example.org/path') == ('example.org', 'https://example.org')
    assert get_hostinfo('https://httpbin.org/') == ('httpbin.org', 'https://httpbin.org')
Example #3
def refresh_detection(htmlstring, homepage):
    "Check if there could be a redirection by meta-refresh tag."
    if '"refresh"' in htmlstring or '"REFRESH"' in htmlstring:
        try:
            html_tree = load_html(htmlstring)
            # test meta-refresh redirection
            # https://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
            attr = html_tree.xpath(
                '//meta[@http-equiv="refresh"]/@content|//meta[@http-equiv="REFRESH"]/@content'
            )[0]
            _, text = attr.split(';')
            text = text.strip().lower()
            if text.startswith('url='):  # attribute already lower-cased above
                url2 = text[4:]
                if not url2.startswith('http'):
                    # relative URL: resolve it against the current page
                    _, base_url = get_hostinfo(homepage)
                    url2 = fix_relative_urls(base_url, url2)
                # second fetch
                newhtmlstring = fetch_url(url2)
                if newhtmlstring is None:
                    logging.warning('failed redirect: %s', url2)
                    return None, None
                htmlstring, homepage = newhtmlstring, url2
                logging.info('successful redirect: %s', url2)
        except (IndexError, etree.ParserError, etree.XMLSyntaxError,
                etree.XPathEvalError) as err:
            logging.info('no redirect found: %s %s', homepage, err)
    return htmlstring, homepage
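The decisive step is the XPath lookup on the refresh meta tag followed by splitting its content attribute. A self-contained sketch of just that parsing step, using lxml directly on a made-up sample page (no second fetch is performed):

# parsing step only; the sample HTML string is hypothetical
from lxml import html

sample = '<html><head><meta http-equiv="refresh" content="0; url=/new-location"></head></html>'
tree = html.fromstring(sample)
attr = tree.xpath('//meta[@http-equiv="refresh"]/@content')[0]
_, text = attr.split(';')
text = text.strip().lower()
if text.startswith('url='):
    target = text[4:]  # '/new-location', still relative at this point
    print(target)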
Example #4
def init_crawl(homepage,
               todo,
               known_links,
               language=None,
               shortform=False,
               rules=None):
    """Start crawl by initializing variables and potentially examining the starting page."""
    _, base_url = get_hostinfo(homepage)
    known_links = known_links or set()
    i = 0
    # fetch and parse robots.txt file if necessary
    if rules is None:
        rules = urllib.robotparser.RobotFileParser()
        rules.set_url(base_url + '/robots.txt')
        # exceptions happening here
        try:
            rules.read()
        except Exception as exc:
            LOGGER.error('cannot read robots.txt: %s', exc)
            rules = None
    # initialize crawl by visiting homepage if necessary
    if todo is None:
        todo = deque([homepage])
        todo, known_links, i, _ = crawl_page(i,
                                             base_url,
                                             todo,
                                             known_links,
                                             lang=language,
                                             shortform=shortform,
                                             rules=rules,
                                             initial=True)
    return todo, known_links, base_url, i, rules
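The robots.txt handling relies on the standard library only. A quick sketch of the same pattern, checking whether a given path may be crawled (the host is an arbitrary example):

# standard-library robots.txt check, mirroring the rules setup above
import urllib.robotparser

base_url = 'https://example.org'  # arbitrary example host
rules = urllib.robotparser.RobotFileParser()
rules.set_url(base_url + '/robots.txt')
try:
    rules.read()
except Exception as exc:
    print('cannot read robots.txt:', exc)
    rules = None

if rules is not None:
    print(rules.can_fetch('*', base_url + '/some/page'))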
Example #5
def find_feed_urls(url, target_lang=None):
    """Try to find feed URLs.

    Args:
        url: Webpage or feed URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
             (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, baseurl = get_hostinfo(url)
    if domainname is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    urlfilter = None
    downloaded = fetch_url(url)
    if downloaded is not None:
        # assume it's a feed
        feed_links = extract_links(downloaded, domainname, baseurl, url,
                                   target_lang)
        if len(feed_links) == 0:
            # assume it's a web page
            for feed in determine_feed(downloaded, baseurl, url):
                feed_string = fetch_url(feed)
                feed_links.extend(
                    extract_links(feed_string, domainname, baseurl, url,
                                  target_lang))
            # filter triggered, prepare it
            if len(url) > len(baseurl) + 2:
                urlfilter = url
        # return links found
        if len(feed_links) > 0:
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s feed links found for %s', len(feed_links),
                         domainname)
            return feed_links
        LOGGER.debug('No usable feed links found: %s', url)
    else:
        LOGGER.warning('Could not download web page: %s', url)
        if url.strip('/') != baseurl:
            return try_homepage(baseurl, target_lang)
    # try alternative: Google News
    if target_lang is not None:
        downloaded = fetch_url('https://news.google.com/rss/search?q=site:' +
                               baseurl + '&hl=' + target_lang +
                               '&scoring=n&num=100')
        if downloaded is not None:
            feed_links = extract_links(downloaded, domainname, baseurl, url,
                                       target_lang)
            feed_links = filter_urls(feed_links, urlfilter)
            LOGGER.debug('%s Google news links found for %s', len(feed_links),
                         domainname)
            return feed_links
    return []
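A short usage sketch, assuming the function is importable as trafilatura.feeds.find_feed_urls (the import path is an assumption) and that the target site actually advertises feeds:

# usage sketch; import path and example URL are assumptions
from trafilatura.feeds import find_feed_urls

links = find_feed_urls('https://example.org/', target_lang='en')
print(len(links), 'feed links found')  # may be 0 for a site without feeds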
Example #6
def sitemap_search(url, target_lang=None):
    """Look for sitemaps for the given URL and gather links.

    Args:
        url: Webpage or sitemap URL as string.
             Triggers URL-based filter if the webpage isn't a homepage.
        target_lang: Define a language to filter URLs based on heuristics
             (two-letter string, ISO 639-1 format).

    Returns:
        The extracted links as a list (sorted list of unique links).

    """
    domainname, baseurl = get_hostinfo(url)
    if domainname is None:
        LOGGER.warning('Invalid URL: %s', url)
        return []
    urlfilter = None
    sitemaps_seen = set()
    # determine sitemap URL
    if url.endswith(('.xml', '.gz', 'sitemap')):
        sitemapurl = url
    else:
        sitemapurl = baseurl + '/sitemap.xml'
        # filter triggered, prepare it
        if len(url) > len(baseurl) + 2:
            urlfilter = url
    sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang)
    sitemaps_seen.add(sitemapurl)
    if sitemapurls == [] and len(linklist) > 0:
        linklist = filter_urls(linklist, urlfilter)
        LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
        return linklist
    # try sitemaps in robots.txt file if nothing has been found
    if sitemapurls == [] and linklist == []:
        sitemapurls = find_robots_sitemaps(baseurl)
        # try additional URLs just in case
        if sitemapurls == []:
            sitemapurls = [''.join([baseurl, '/', g]) for g in GUESSES]
    # iterate through nested sitemaps and results
    i = 1
    while sitemapurls:
        sitemapurl = sitemapurls.pop()
        sitemapurls, linklist = download_and_process_sitemap(sitemapurl, domainname, baseurl, target_lang, sitemapurls, linklist)
        # sanity check: keep track of visited sitemaps and exclude them
        sitemaps_seen.add(sitemapurl)
        sitemapurls = [s for s in sitemapurls if s not in sitemaps_seen]
        # counter and safeguard
        i += 1
        if i > MAX_SITEMAPS_SEEN:
            break
    linklist = filter_urls(linklist, urlfilter)
    LOGGER.debug('%s sitemap links found for %s', len(linklist), domainname)
    return linklist
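A matching usage sketch for the sitemap path, assuming the function is importable as trafilatura.sitemaps.sitemap_search (the import path is an assumption):

# usage sketch; import path and example URL are assumptions
from trafilatura.sitemaps import sitemap_search

links = sitemap_search('https://example.org/', target_lang='en')
print(len(links), 'sitemap links found')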
Example #7
def probe_alternative_homepage(homepage):
    "Check if the homepage is redirected and return appropriate values."
    response = fetch_url(homepage, decode=False)
    if response is None or response == '':
        return None, None, None
    # get redirected URL here?
    if response.geturl() != homepage:
        logging.info('followed redirect: %s', response.geturl())
        homepage = response.geturl()
    # decode response
    htmlstring = decode_response(response.data)
    # is there a meta-refresh on the page?
    htmlstring, homepage = refresh_detection(htmlstring, homepage)
    logging.info('fetching homepage OK: %s', homepage)
    _, base_url = get_hostinfo(homepage)
    return htmlstring, homepage, base_url
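Together with Example #3, this is the entry point for resolving a possibly redirected homepage. A minimal call sketch, assuming the function is importable from trafilatura.spider (the import path is an assumption):

# call sketch; import path and example URL are assumptions
from trafilatura.spider import probe_alternative_homepage

htmlstring, homepage, base_url = probe_alternative_homepage('https://example.org/')
if htmlstring is not None:
    print('resolved homepage:', homepage, '| base URL:', base_url)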