Example #1
def test_fix_relative():
    assert fix_relative_urls('https://example.org', 'page.html') == 'https://example.org/page.html'
    assert fix_relative_urls('http://example.org', '//example.org/page.html') == 'http://example.org/page.html'
    assert fix_relative_urls('https://example.org', './page.html') == 'https://example.org/page.html'
    assert fix_relative_urls('https://example.org', '/page.html') == 'https://example.org/page.html'
    # fixing partial URLs
    assert fix_relative_urls('https://example.org', 'https://example.org/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '//example.org/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('http://example.org', '//example.org/test.html') == 'http://example.org/test.html'
    assert fix_relative_urls('https://example.org', 'test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '../../test.html') == 'https://example.org/test.html'
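For reference, the cases exercised above follow standard relative-reference resolution; a minimal, self-contained sketch using only the standard library (not part of the original test suite, and not a substitute for fix_relative_urls):

from urllib.parse import urljoin

# urljoin resolves the same sample cases the test covers
assert urljoin('https://example.org', 'page.html') == 'https://example.org/page.html'
assert urljoin('http://example.org', '//example.org/page.html') == 'http://example.org/page.html'
assert urljoin('https://example.org', './page.html') == 'https://example.org/page.html'
assert urljoin('https://example.org', '../../test.html') == 'https://example.org/test.html'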
Example #2
def refresh_detection(htmlstring, homepage):
    "Check if there could be a redirection via a meta-refresh tag."
    if '"refresh"' in htmlstring or '"REFRESH"' in htmlstring:
        try:
            html_tree = load_html(htmlstring)
            # test meta-refresh redirection
            # https://stackoverflow.com/questions/2318446/how-to-follow-meta-refreshes-in-python
            attr = html_tree.xpath(
                '//meta[@http-equiv="refresh"]/@content|//meta[@http-equiv="REFRESH"]/@content'
            )[0]
            _, text = attr.split(';', 1)
            text = text.strip().lower()
            if text.startswith('url='):
                url2 = text[4:]
                if not url2.startswith('http'):
                    # relative redirect target: resolve it against the page it came from
                    _, base_url = get_hostinfo(homepage)
                    url2 = fix_relative_urls(base_url, url2)
                # second fetch
                newhtmlstring = fetch_url(url2)
                if newhtmlstring is None:
                    logging.warning('failed redirect: %s', url2)
                    return None, None
                # redirect successfully followed: continue with the new page
                htmlstring, homepage = newhtmlstring, url2
                logging.info('successful redirect: %s', url2)
        except (IndexError, etree.ParserError, etree.XMLSyntaxError,
                etree.XPathEvalError) as err:
            logging.info('no redirect found: %s %s', homepage, err)
    return htmlstring, homepage
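A minimal, self-contained sketch of the meta-refresh extraction step above, using lxml directly instead of the project's load_html helper (hypothetical page content; the actual function also re-fetches the target URL):

from lxml import html

page = '<html><head><meta http-equiv="refresh" content="0; url=/new-page"></head></html>'
tree = html.fromstring(page)
# same XPath as in refresh_detection
content = tree.xpath('//meta[@http-equiv="refresh"]/@content')[0]
_, text = content.split(';', 1)
text = text.strip().lower()
if text.startswith('url='):
    print(text[4:])  # -> /new-page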
Example #3
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain, but make an exception for whitelisted platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname, newdomain)
                else:
                    if DETECT_SITEMAP_LINK.search(link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state
Example #4
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract feed URLs from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls),
                 len(output_urls))
    return output_urls
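The core discovery pattern in determine_feed can be illustrated on its own; a minimal sketch with lxml and a hypothetical page, using a simplified substring check in place of the project's FEED_TYPES lookup (the real function additionally resolves, cleans and filters the candidates):

from lxml import html

page = ('<html><head>'
        '<link rel="alternate" type="application/rss+xml" href="/feed.xml">'
        '<link rel="alternate" hreflang="de" href="/de/">'
        '</head></html>')
tree = html.fromstring(page)
feed_links = [
    el.get('href') for el in tree.xpath('//link[@rel="alternate"]')
    # keep only candidates whose type attribute looks like a feed
    if el.get('href') and ('rss' in (el.get('type') or '') or 'atom' in (el.get('type') or ''))
]
print(feed_links)  # -> ['/feed.xml']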
Example #5
def handle_link_list(linklist, domainname, baseurl, target_lang=None):
    '''Examine links to determine if they are valid and
       lead to a web page'''
    output_links = []
    # sort and uniq
    for item in sorted(set(linklist)):
        # fix and check
        link = fix_relative_urls(baseurl, item)
        # control output for validity
        checked = check_url(link, language=target_lang)
        if checked is not None:
            output_links.append(checked[0])
            if checked[1] != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname,
                               checked[1])
        # Feedburner/Google feeds
        elif 'feedburner' in item or 'feedproxy' in item:
            output_links.append(item)
    return output_links
Example #6
def extract_robots_sitemaps(robotstxt, baseurl):
    'Read a robots.txt file and find sitemap links.'
    # sanity check on length (cause: redirections)
    if robotstxt is None or len(robotstxt) > 10000:
        return []
    sitemapurls = []
    # source: https://github.com/python/cpython/blob/3.8/Lib/urllib/robotparser.py
    for line in robotstxt.splitlines():
        # remove optional comment and strip line
        i = line.find('#')
        if i >= 0:
            line = line[:i]
        line = line.strip()
        if not line:
            continue
        line = line.split(':', 1)
        if len(line) == 2:
            line[0] = line[0].strip().lower()
            if line[0] == "sitemap":
                # urllib.parse.unquote(line[1].strip())
                candidate = fix_relative_urls(baseurl, line[1].strip())
                sitemapurls.append(candidate)
    LOGGER.debug('%s sitemaps found in robots.txt', len(sitemapurls))
    return sitemapurls
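The same robots.txt parsing loop can be tried in isolation; a minimal sketch with the standard library's urljoin standing in for fix_relative_urls (hypothetical robots.txt content):

from urllib.parse import urljoin

robotstxt = ("User-agent: *\n"
             "Disallow: /private/  # comment\n"
             "Sitemap: /sitemap.xml\n"
             "Sitemap: https://example.org/news.xml\n")
sitemapurls = []
for line in robotstxt.splitlines():
    line = line.split('#', 1)[0].strip()  # drop optional comment
    if not line:
        continue
    parts = line.split(':', 1)
    if len(parts) == 2 and parts[0].strip().lower() == 'sitemap':
        sitemapurls.append(urljoin('https://example.org', parts[1].strip()))
print(sitemapurls)
# -> ['https://example.org/sitemap.xml', 'https://example.org/news.xml']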