コード例 #1
0
def test_examples():
    '''Run the usage examples shown in the README.'''
    # check_url: an acceptable URL is expected back together with its domain
    repo_url = 'https://github.com/adbar/courlan'
    assert check_url(repo_url) == (repo_url, 'github.com')
    # check_url in strict mode: the query part is expected to be dropped
    redirect_url = 'https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org'
    expected_redirect = ('https://httpbin.org/redirect-to', 'httpbin.org')
    assert check_url(redirect_url, strict=True) == expected_redirect
    # clean_url: case, port and trailing slash are expected to be normalized
    cleaned = clean_url('HTTPS://WWW.DWDS.DE:80/')
    assert cleaned == 'https://www.dwds.de'
    # validate_url: first element of the result is the validity flag
    assert validate_url('http://1234') == (False, None)
    validation = validate_url('http://www.example.org/')
    assert validation[0] is True
    # normalize_url in strict mode: tracking parameter and fragment removed,
    # remaining query parameters sorted
    messy_url = 'http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment'
    normalized = normalize_url(messy_url, strict=True)
    assert normalized == 'http://test.net/foo.html?page=2&post=abc'
コード例 #2
0
def handle_link_list(linklist, domainname, baseurl, target_lang=None):
    '''Examine links to determine if they are valid and
       lead to a web page.

       Args:
           linklist: iterable of candidate links (duplicates are allowed).
           domainname: domain the links are expected to belong to; only
               used to log a warning when a checked link diverges from it.
           baseurl: passed to fix_relative_urls() together with each link.
           target_lang: forwarded to check_url() as its language argument.

       Returns:
           A list with the first element of every non-None check_url()
           result, in sorted order of the unique input links.'''
    output_links = []
    # deduplicate first, then sort so that the processing order is stable
    for item in sorted(set(linklist)):
        # fix and check
        link = fix_relative_urls(baseurl, item)
        # control output for validity
        checked = check_url(link, language=target_lang)
        if checked is None:
            continue
        output_links.append(checked[0])
        # a diverging domain is only reported, the link is kept regardless
        if checked[1] != domainname:
            LOGGER.warning('Diverging domain names: %s %s', domainname, checked[1])
    return output_links
コード例 #3
0
ファイル: sitemaps.py プロジェクト: phongtnit/trafilatura
def handle_link(link, domainname, sitemapurl, target_lang=None):
    '''Classify a link found in a sitemap.

       Returns a (link, state) pair where state is 'sitemap' for links
       matching the XML pattern, 'link' for links accepted by check_url(),
       and '0' otherwise (including when the link equals the sitemap URL).'''
    # safety net against recursion: the sitemap pointing to itself
    if link == sitemapurl:
        return link, '0'
    # fix relative links before any further check
    link = fix_relative_urls(domainname, link)
    # .xml at the end, or .xml followed by '.', '?' or '#': another sitemap
    if re.search(r'\.xml$|\.xml[.?#]', link):
        return link, 'sitemap'
    # anything else has to pass the URL check to count as a web page
    checked = check_url(link, language=target_lang)
    if checked is None:
        return link, '0'
    return checked[0], 'link'
コード例 #4
0
def test_urlcheck():
    '''Exercise check_url() on malformed input, strict mode, redirects,
       type/spam filters, language filters and index/impressum pages.'''
    # malformed or unsupported input is expected to be rejected
    for candidate in ('AAA', '1234', 'http://ab', 'ftps://example.org/',
                      'http://t.g/test'):
        assert check_url(candidate) is None

    # strict mode: the query string is expected to be removed
    stripped = check_url('https://www.dwds.de/test?param=test&other=test', strict=True)
    assert stripped == ('https://www.dwds.de/test', 'dwds.de')

    # the same URL with a fragment is rejected in strict mode only
    fragment_url = 'http://example.com/index.html#term'
    assert check_url(fragment_url, strict=True) is None
    assert check_url(fragment_url, strict=False)[0] == fragment_url

    for candidate in ('http://example.com/test.js', 'http://twitter.com/'):
        assert check_url(candidate) is None

    # checks with redirection enabled
    status_ok = 'https://www.httpbin.org/status/200'
    assert check_url(status_ok, with_redirects=True) == (status_ok, 'httpbin.org')
    # disabled check, kept for reference:
    # assert check_url('https://www.httpbin.org/status/302', with_redirects=True) == ('https://www.httpbin.org/status/302', 'httpbin.org')
    for candidate in ('https://www.httpbin.org/status/404', 'https://www.ht.or'):
        assert check_url(candidate, with_redirects=True) is None

    # only checked when TLD_EXTRACTION is None
    if TLD_EXTRACTION is None:
        assert check_url('http://www.example') is None
        assert check_url('http://example.invalid/', False) is None

    # recheck type and spam filters
    for candidate in ('http://example.org/code/oembed/', 'http://cams.com/'):
        assert check_url(candidate) is None
    assert check_url('https://denkiterm.wordpress.com/impressum/', strict=True) is None
    shop_url = 'http://www.fischfutter-index.de/improvit-trocken-frostfutter-fur-fast-alle-fische/'
    assert check_url(shop_url, strict=True) is not None

    # language and internationalization: language given as query parameter
    query_lang_url = 'http://example.com/test.html?lang=en'
    assert check_url(query_lang_url, language='de') is None
    assert check_url(query_lang_url, language=None) is not None
    assert check_url(query_lang_url, language='en') is not None
    # language given as path segment
    assert check_url('http://example.com/de/test.html', language='de') is not None
    path_lang_url = 'http://example.com/en/test.html'
    assert check_url(path_lang_url, language='de') is None
    assert check_url(path_lang_url, language=None) is not None
    assert check_url(path_lang_url, language='en') is not None
    # language combined with a region code, lower and upper case
    regional_cases = (
        ('https://www.myswitzerland.com/de-ch/erlebnisse/veranstaltungen/wild-im-sternen/', 'de'),
        ('https://www.myswitzerland.com/en-id/accommodations/other-types-of-accommodations/on-the-farm/farm-experiences-search/', 'en'),
        ('https://www.myswitzerland.com/EN-ID/accommodations/other-types-of-accommodations/on-the-farm/farm-experiences-search/', 'en'),
    )
    for candidate, lang_code in regional_cases:
        assert check_url(candidate, language=lang_code) is not None

    # impressum and index pages are rejected in strict mode ...
    for candidate in ('http://www.example.org/index',
                      'http://www.example.org/index.html',
                      'http://concordia-hagen.de/impressum.html',
                      'http://concordia-hagen.de/de/impressum'):
        assert check_url(candidate, strict=True) is None
    # ... while these URLs containing "index" are expected to pass
    for candidate in ('http://parkkralle.de/detail/index/sArticle/2704',
                      'https://www.katholisch-in-duisdorf.de/kontakt/links/index.html'):
        assert check_url(candidate, strict=True) is not None
コード例 #5
0
def test_path_filter():
    '''Check strict-mode handling of an index.php URL with and without
       a query string.'''
    bare_index = 'http://www.case-modder.de/index.php'
    index_with_query = bare_index + '?sec=artikel&id=68&page=1'
    # with query parameters the URL is expected to be kept
    assert check_url(index_with_query, strict=True) is not None
    # the bare index page is expected to be filtered out
    assert check_url(bare_index, strict=True) is None