Code Example #1
File: sitemaps.py Project: lukehassel/trafilatura
def handle_link(link, sitemapurl, domainname, baseurl, target_lang):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain is not None:
                # don't take links from another domain and make an exception for main platforms
                if newdomain != domainname and not WHITELISTED_PLATFORMS.search(
                        newdomain):
                    LOGGER.warning('Diverging domain names: %s %s', domainname,
                                   newdomain)
                else:
                    if re.search(r'\.xml$|\.xml[.?#]', link):
                        state = 'sitemap'
                    else:
                        state = 'link'
            else:
                LOGGER.error("Couldn't extract domain: %s", link)
    return link, state
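A minimal usage sketch for the snippet above; the import path and the example URLs are assumptions for illustration, not taken from the project:
# assumed import path; in trafilatura the function sits in the sitemaps module
from trafilatura.sitemaps import handle_link

link, state = handle_link(
    'https://www.example.org/sitemap-posts.xml',  # link found inside a sitemap
    'https://www.example.org/sitemap.xml',        # sitemap currently being parsed
    'example.org',                                # domain the link must stay on
    'https://www.example.org',                    # base URL for fixing relative links
    'de',                                         # optional target language
)
# state ends up as 'sitemap' for XML links, 'link' for web pages, or '0' if rejected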
Code Example #2
# functions under test; all part of courlan's public API
from courlan import check_url, clean_url, normalize_url, validate_url

def test_examples():
    '''test README examples'''
    assert check_url('https://github.com/adbar/courlan') == ('https://github.com/adbar/courlan', 'github.com')
    assert check_url('https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org', strict=True) == ('https://httpbin.org/redirect-to', 'httpbin.org')
    assert clean_url('HTTPS://WWW.DWDS.DE:80/') == 'https://www.dwds.de'
    assert validate_url('http://1234') == (False, None)
    assert validate_url('http://www.example.org/')[0] is True
    assert normalize_url('http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment', strict=True) == 'http://test.net/foo.html?page=2&post=abc'
Code Example #3
# assumed imports: both helpers come from courlan (scrub_url's exact import path may vary by version)
from courlan import clean_url, scrub_url

def test_scrub():
    # clean: scrub + normalize
    assert clean_url(5) is None
    assert clean_url('ø\xaa') == 'øª'
    # scrub
    assert scrub_url('  https://www.dwds.de') == 'https://www.dwds.de'
    assert scrub_url(
        '<![CDATA[https://www.dwds.de]]>') == 'https://www.dwds.de'
    assert scrub_url('https://www.dwds.de/test?param=test&amp;other=test'
                     ) == 'https://www.dwds.de/test?param=test&other=test'
    assert scrub_url('https://www.dwds.de/garbledhttps://www.dwds.de/'
                     ) == 'https://www.dwds.de/garbled'
    assert scrub_url(
        'https://g__https://www.dwds.de/') == 'https://www.dwds.de'
    # exception for archive URLs
    assert scrub_url(
        'https://web.archive.org/web/20131021165347/https://www.imdb.com/'
    ) == 'https://web.archive.org/web/20131021165347/https://www.imdb.com'
    # social sharing
    assert scrub_url(
        'https://twitter.com/share?&text=Le%20sabre%20de%20bambou%20%232&via=NouvellesJapon&url=https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    ) == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    assert scrub_url(
        'https://www.facebook.com/sharer.php?u=https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    ) == 'https://nouvellesdujapon.com/le-sabre-de-bambou-2'
    # end of URL
    assert scrub_url('https://www.test.com/&') == 'https://www.test.com'
    # white space
    assert scrub_url('\x19https://www.test.com/\x06') == 'https://www.test.com'
    # markup
    assert scrub_url('https://www.test.com/</a>') == 'https://www.test.com'
    # garbled URLs e.g. due to quotes
    assert scrub_url('https://www.test.com/"' +
                     '<p></p>' * 100) == 'https://www.test.com'
    assert scrub_url('https://www.test.com/"' * 50) != 'https://www.test.com'
    # simply too long, left untouched
    my_url = 'https://www.test.com/' + 'abcdefg' * 100
    assert scrub_url(my_url) == my_url
Code Example #4
File: feeds.py Project: adbar/trafilatura
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page.
       Adapted from http://www.aaronsw.com/2002/feedfinder/'''
    # parse the page to look for feeds
    tree = load_html(htmlstring)
    # safeguard
    if tree is None:
        LOGGER.debug('Invalid HTML/Feed page: %s', baseurl)
        return []
    feed_urls = []
    for linkelem in tree.xpath('//link[@rel="alternate"]'):
        # discard elements without links
        if 'href' not in linkelem.attrib:
            continue
        # most common case
        if 'type' in linkelem.attrib and linkelem.get('type') in FEED_TYPES:
            feed_urls.append(linkelem.get('href'))
        # websites like geo.de
        elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get('href'):
            feed_urls.append(linkelem.get('href'))
    # backup
    if not feed_urls:
        for linkelem in tree.xpath('//a[@href]'):
            if linkelem.get('href')[-4:].lower() in ('.rss', '.rdf', '.xml'):
                feed_urls.append(linkelem.get('href'))
            elif linkelem.get('href')[-5:].lower() == '.atom':
                feed_urls.append(linkelem.get('href'))
            elif 'atom' in linkelem.get('href') or 'rss' in linkelem.get(
                    'href'):
                feed_urls.append(linkelem.get('href'))
    # refine
    output_urls = []
    for link in sorted(set(feed_urls)):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if BLACKLIST.search(link):
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls),
                 len(output_urls))
    return output_urls
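A minimal usage sketch for determine_feed; the import path and the homepage markup are assumptions for illustration:
# assumed import path; in trafilatura the function sits in the feeds module
from trafilatura.feeds import determine_feed

homepage = ('<html><head>'
            '<link rel="alternate" type="application/rss+xml" href="/feed.xml"/>'
            '</head><body></body></html>')
feed_candidates = determine_feed(homepage, 'https://www.example.org', 'https://www.example.org')
# expected to contain 'https://www.example.org/feed.xml' once the relative link is resolved and validated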
Code Example #5
def handle_link(link, sitemapurl, domainname, baseurl, target_lang=None):
    '''Examine a link and determine if it's valid and if it leads to
       a sitemap or a web page.'''
    state = '0'
    # safety net: recursivity
    if link == sitemapurl:
        return link, state
    # fix and check
    link = fix_relative_urls(baseurl, link)
    # clean and normalize
    link = clean_url(link, target_lang)
    if link is not None:
        if lang_filter(link, target_lang) is True:
            newdomain = extract_domain(link)
            if newdomain != domainname:
                LOGGER.warning('Diverging domain names: %s %s', domainname,
                               newdomain)
            else:
                if re.search(r'\.xml$|\.xml[.?#]', link):
                    state = 'sitemap'
                else:
                    state = 'link'
    return link, state
Code Example #6
File: feeds.py Project: EiffelFly/trafilatura
def determine_feed(htmlstring, baseurl, reference):
    '''Try to extract the feed URL from the home page'''
    feed_urls = []
    # try to find RSS URL
    for feed_url in re.findall(
            r'<link[^<>]+?type="application/rss\+xml"[^<>]+?href="(.+?)"',
            htmlstring):
        feed_urls.append(feed_url)
    for feed_url in re.findall(
            r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/rss\+xml"',
            htmlstring):
        feed_urls.append(feed_url)
    # try to find Atom URL
    if len(feed_urls) == 0:
        for feed_url in re.findall(
                r'<link[^<>]+?type="application/atom\+xml"[^<>]+?href="(.+?)"',
                htmlstring):
            feed_urls.append(feed_url)
        for feed_url in re.findall(
                r'<link[^<>]+?href="(.+?)"[^<>]+?type="application/atom\+xml"',
                htmlstring):
            feed_urls.append(feed_url)
    # refine
    output_urls = []
    for link in sorted(list(set(feed_urls))):
        link = fix_relative_urls(baseurl, link)
        link = clean_url(link)
        if link == reference or validate_url(link)[0] is False:
            continue
        if 'comments' in link:
            continue
        output_urls.append(link)
    # log result
    LOGGER.debug('Feed URLs found: %s of which %s valid', len(feed_urls),
                 len(output_urls))
    return output_urls