def test_examples():
    '''Run the usage examples given in the README.'''
    # check_url() is expected to yield a (url, domain) pair
    repo_url = 'https://github.com/adbar/courlan'
    assert check_url(repo_url) == (repo_url, 'github.com')
    # in strict mode the expected URL carries no query string
    redirect_result = check_url(
        'https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org',
        strict=True,
    )
    assert redirect_result == ('https://httpbin.org/redirect-to', 'httpbin.org')

    # cleaning: expected output is lower-cased, without port or trailing slash
    cleaned = clean_url('HTTPS://WWW.DWDS.DE:80/')
    assert cleaned == 'https://www.dwds.de'

    # validation: the first element of the returned tuple is the verdict
    assert validate_url('http://1234') == (False, None)
    verdict = validate_url('http://www.example.org/')[0]
    assert verdict is True

    # normalization in strict mode: expected output has sorted parameters,
    # no utm_source parameter and no fragment
    normalized = normalize_url(
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment',
        strict=True,
    )
    assert normalized == 'http://test.net/foo.html?page=2&post=abc'
def handle_link_list(linklist, domainname, baseurl, target_lang=None):
    '''Filter a collection of links, keeping those accepted by check_url().

    The links are deduplicated and processed in sorted order. Each one is
    passed through fix_relative_urls() together with baseurl and then
    through check_url() with target_lang as language.

    Returns the list of URLs (first element of each check_url() result)
    that were not rejected. A warning is logged whenever the domain
    reported by check_url() differs from domainname; the URL is kept anyway.
    '''
    validated = []
    # a set removes duplicates, sorting makes the processing order stable
    for candidate in sorted(set(linklist)):
        result = check_url(
            fix_relative_urls(baseurl, candidate),
            language=target_lang,
        )
        # rejected by the checker: nothing to collect
        if result is None:
            continue
        validated.append(result[0])
        # only report diverging domains, do not discard the link
        if result[1] != domainname:
            LOGGER.warning('Diverging domain names: %s %s', domainname, result[1])
    return validated
def handle_link(link, domainname, sitemapurl, target_lang=None):
    '''Classify a single link found in a sitemap.

    Returns a (link, state) tuple where state is one of:
      - '0': the link equals sitemapurl, or check_url() rejected it
      - 'sitemap': the link ends in .xml or contains .xml followed
        by '.', '?' or '#'
      - 'link': check_url() accepted it; the returned link is then the
        first element of the check_url() result
    '''
    # safety net against recursion: the link is returned untouched
    if link == sitemapurl:
        return link, '0'

    resolved = fix_relative_urls(domainname, link)

    # XML targets are reported as sitemaps without further checks
    if re.search(r'\.xml$|\.xml[.?#]', resolved):
        return resolved, 'sitemap'

    verdict = check_url(resolved, language=target_lang)
    if verdict is None:
        return resolved, '0'
    return verdict[0], 'link'
def test_urlcheck():
    '''Run check_url() over rejected, accepted, redirected and language-tagged URLs.'''
    # inputs expected to be rejected outright
    for bad_input in ('AAA', '1234', 'http://ab', 'ftps://example.org/', 'http://t.g/test'):
        assert check_url(bad_input) is None

    # strict mode: the expected URL has no query string,
    # and the URL with a fragment is expected to be rejected
    dwds_result = check_url('https://www.dwds.de/test?param=test&other=test', strict=True)
    assert dwds_result == ('https://www.dwds.de/test', 'dwds.de')
    fragment_url = 'http://example.com/index.html#term'
    assert check_url(fragment_url, strict=True) is None
    assert check_url(fragment_url, strict=False)[0] == fragment_url

    for filtered in ('http://example.com/test.js', 'http://twitter.com/'):
        assert check_url(filtered) is None

    # checks performed with with_redirects=True
    status_ok = 'https://www.httpbin.org/status/200'
    assert check_url(status_ok, with_redirects=True) == (status_ok, 'httpbin.org')
    # disabled in the original test:
    # assert check_url('https://www.httpbin.org/status/302', with_redirects=True) == ('https://www.httpbin.org/status/302', 'httpbin.org')
    for failing in ('https://www.httpbin.org/status/404', 'https://www.ht.or'):
        assert check_url(failing, with_redirects=True) is None

    # this expectation only applies when TLD_EXTRACTION is unset
    if TLD_EXTRACTION is None:
        assert check_url('http://www.example') is None
    assert check_url('http://example.invalid/', False) is None

    # recheck type and spam filters
    for filtered in ('http://example.org/code/oembed/', 'http://cams.com/'):
        assert check_url(filtered) is None
    assert check_url('https://denkiterm.wordpress.com/impressum/', strict=True) is None
    fish_result = check_url(
        'http://www.fischfutter-index.de/improvit-trocken-frostfutter-fur-fast-alle-fische/',
        strict=True,
    )
    assert fish_result is not None

    # language and internationalization: (url, language, expected to be kept)
    language_cases = (
        ('http://example.com/test.html?lang=en', 'de', False),
        ('http://example.com/test.html?lang=en', None, True),
        ('http://example.com/test.html?lang=en', 'en', True),
        ('http://example.com/de/test.html', 'de', True),
        ('http://example.com/en/test.html', 'de', False),
        ('http://example.com/en/test.html', None, True),
        ('http://example.com/en/test.html', 'en', True),
        ('https://www.myswitzerland.com/de-ch/erlebnisse/veranstaltungen/wild-im-sternen/', 'de', True),
        ('https://www.myswitzerland.com/en-id/accommodations/other-types-of-accommodations/on-the-farm/farm-experiences-search/', 'en', True),
        ('https://www.myswitzerland.com/EN-ID/accommodations/other-types-of-accommodations/on-the-farm/farm-experiences-search/', 'en', True),
    )
    for url, lang, kept in language_cases:
        assert (check_url(url, language=lang) is not None) is kept

    # impressum and index pages in strict mode: (url, expected to be kept)
    strict_cases = (
        ('http://www.example.org/index', False),
        ('http://www.example.org/index.html', False),
        ('http://concordia-hagen.de/impressum.html', False),
        ('http://concordia-hagen.de/de/impressum', False),
        ('http://parkkralle.de/detail/index/sArticle/2704', True),
        ('https://www.katholisch-in-duisdorf.de/kontakt/links/index.html', True),
    )
    for url, kept in strict_cases:
        assert (check_url(url, strict=True) is not None) is kept
def test_path_filter():
    '''Check the strict-mode handling of index.php with and without a query.'''
    # with query parameters the URL is expected to be kept
    with_query = check_url(
        'http://www.case-modder.de/index.php?sec=artikel&id=68&page=1',
        strict=True,
    )
    assert with_query is not None
    # the bare index page is expected to be rejected
    bare_index = check_url('http://www.case-modder.de/index.php', strict=True)
    assert bare_index is None