def test_normalization():
    '''Check basic URL normalization on a table of (input, expected) pairs.

    The cases cover an upper-case scheme and host, two kinds of fragments
    (plain and text-fragment directive) and an explicit port 80 on http.
    '''
    expectations = (
        ('HTTPS://WWW.DWDS.DE/', 'https://www.dwds.de/'),
        ('http://test.net/foo.html#bar', 'http://test.net/foo.html'),
        ('http://test.net/foo.html#:~:text=night-,vision',
         'http://test.net/foo.html'),
        ('http://www.example.org:80/test.html',
         'http://www.example.org/test.html'),
    )
    for raw_url, expected in expectations:
        assert normalize_url(raw_url) == expected
def test_examples():
    '''Run the usage examples shown in the README.'''
    # check_url yields a (url, domain) pair
    repo_url = 'https://github.com/adbar/courlan'
    assert check_url(repo_url) == (repo_url, 'github.com')

    # in strict mode the redirection parameter is expected to be dropped
    redirect_url = 'https://httpbin.org/redirect-to?url=http%3A%2F%2Fexample.org'
    redirect_result = check_url(redirect_url, strict=True)
    assert redirect_result == ('https://httpbin.org/redirect-to', 'httpbin.org')

    cleaned = clean_url('HTTPS://WWW.DWDS.DE:80/')
    assert cleaned == 'https://www.dwds.de'

    # validate_url yields a (is_valid, parsed) pair
    assert validate_url('http://1234') == (False, None)
    is_valid = validate_url('http://www.example.org/')[0]
    assert is_valid is True

    tracked_url = (
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2#fragment'
    )
    normalized = normalize_url(tracked_url, strict=True)
    assert normalized == 'http://test.net/foo.html?page=2&post=abc'
def extract_url(tree, default_url=None):
    '''Extract the URL of a document from its head links.

    Lookup order:
      1. ``<link rel="canonical">`` with an acceptable ``href``;
      2. otherwise ``<link rel="alternate" hreflang="x-default">`` (if several
         qualify, the last one found is kept);
      3. otherwise ``default_url``.

    If the chosen URL starts with ``/``, the scheme and host are borrowed from
    the first ``og:*`` / ``twitter:*`` meta element whose content starts with
    an http(s) URL. The result is finally validated and normalized.

    Args:
        tree: parsed HTML tree exposing ``find`` and ``iterfind``
            (presumably an lxml element -- confirm against callers).
        default_url: fallback URL used when no suitable link is found.

    Returns:
        The normalized URL, or None if no URL was found or if validation
        failed.
    '''
    # https://www.tutorialrepublic.com/html-reference/html-base-tag.php
    # default url as fallback
    url = default_url
    # try canonical link first
    canonical = tree.find('.//head//link[@rel="canonical"]')
    canonical_href = None
    if canonical is not None:
        canonical_href = canonical.attrib.get('href')
    if canonical_href is not None and URL_COMP_CHECK.match(canonical_href):
        url = canonical_href
    # try default language link
    else:
        for element in tree.iterfind('.//head//link[@rel="alternate"]'):
            if element.attrib.get('hreflang') != 'x-default':
                continue
            # a link without href is skipped instead of raising KeyError
            href = element.attrib.get('href')
            if href is not None and URL_COMP_CHECK.match(href):
                LOGGER.debug(
                    html.tostring(element, pretty_print=False,
                                  encoding='unicode').strip())
                url = href
    # add domain name if it's missing
    if url is not None and url.startswith('/'):
        for element in tree.iterfind('.//head//meta[@content]'):
            if 'name' in element.attrib:
                attrtype = element.attrib['name']
            elif 'property' in element.attrib:
                attrtype = element.attrib['property']
            else:
                continue
            if attrtype.startswith(('og:', 'twitter:')):
                domain_match = re.match(r'https?://[^/]+',
                                        element.attrib['content'])
                if domain_match:
                    # prepend scheme and host to the relative URL
                    url = domain_match.group(0) + url
                    break
    # sanity check: don't return invalid URLs
    if url is not None:
        validation_result, parsed_url = validate_url(url)
        if validation_result is False:
            url = None
        else:
            url = normalize_url(parsed_url)
    return url
def test_qelems():
    '''Check how normalize_url handles query elements.

    Covers: tracking parameters kept by default and removed in strict mode,
    alphabetical reordering of query parameters, and the ValueError expected
    when a language parameter in the query conflicts with the requested
    ``language``.
    '''
    # default mode keeps the tracking parameter, strict mode removes it
    assert normalize_url('http://test.net/foo.html?utm_source=twitter'
                         ) == 'http://test.net/foo.html?utm_source=twitter'
    assert normalize_url('http://test.net/foo.html?utm_source=twitter',
                         strict=True) == 'http://test.net/foo.html'
    # parameters come back sorted by name
    assert normalize_url(
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2'
    ) == 'http://test.net/foo.html?page=2&post=abc&utm_source=twitter'
    assert normalize_url(
        'http://test.net/foo.html?utm_source=twitter&post=abc&page=2',
        strict=True) == 'http://test.net/foo.html?page=2&post=abc'
    assert normalize_url(
        'http://test.net/foo.html?page=2&itemid=10&lang=en'
    ) == 'http://test.net/foo.html?itemid=10&lang=en&page=2'
    # One call per pytest.raises block: with both calls in a single block the
    # second one was never executed, because the first already raised.
    with pytest.raises(ValueError):
        normalize_url('http://test.net/foo.html?page=2&lang=en',
                      language='de')
    with pytest.raises(ValueError):
        normalize_url(
            'http://www.evolanguage.de/index.php?page=deutschkurse_fuer_aerzte&language=ES',
            language='de')