コード例 #1
0
def normalize_url(input_url):
    """Return the normalized string form of ``input_url``.

    The URL is parsed and validated, then rebuilt with a lower-cased scheme,
    a lower-cased host without a trailing dot, without the Google Analytics
    ``utm_*`` campaign parameters, with the remaining query entries sorted,
    and without a fragment.

    Raises:
        MalformatUrlException: propagated unchanged from ``_validate_url``.
    """
    url = parse_string(input_url)
    # Validation errors propagate as-is; catching them only to re-raise the
    # same exception added nothing.
    _validate_url(url)

    builder = url.builder()
    builder.set_scheme(url.get_scheme().lower())

    host = url.get_host().lower()
    if host.endswith("."):
        # Strip a single trailing dot from the host.
        host = host[:-1]
    builder.set_host(host)

    # remove utm parameters
    # https://support.google.com/analytics/answer/1033867
    blacklist = {
        'utm_source', 'utm_medium', 'utm_term', 'utm_content', 'utm_campaign'
    }
    # Each query entry is a tuple whose first item is the parameter name.
    queries = [
        query for query in url.get_queries() if query[0] not in blacklist
    ]

    # sort queries
    builder.set_queries(sorted(queries))

    # remove fragment (once is enough)
    builder.set_fragment(None)

    return builder.build().get()
コード例 #2
0
ファイル: urlnormalizer.py プロジェクト: kafji/urlnormalizer
def normalize_url(input_url):
	"""Return the normalized string form of ``input_url``.

	The URL is parsed and validated, then rebuilt with a lower-cased scheme,
	a lower-cased host without a trailing dot, without the Google Analytics
	``utm_*`` campaign parameters, with the remaining query entries sorted,
	and without a fragment.

	Raises:
		MalformatUrlException: propagated unchanged from ``_validate_url``.
	"""
	url = parse_string(input_url)
	# Validation errors propagate as-is; catching them only to re-raise the
	# same exception added nothing.
	_validate_url(url)

	builder = url.builder()
	builder.set_scheme(url.get_scheme().lower())

	host = url.get_host().lower()
	if host.endswith("."):
		# Strip a single trailing dot from the host.
		host = host[:-1]
	builder.set_host(host)

	# remove utm parameters
	# https://support.google.com/analytics/answer/1033867
	blacklist = {'utm_source', 'utm_medium', 'utm_term', 'utm_content', 'utm_campaign'}
	# Each query entry is a tuple whose first item is the parameter name.
	queries = [query for query in url.get_queries() if query[0] not in blacklist]

	# sort queries
	builder.set_queries(sorted(queries))

	# remove fragment (once is enough)
	builder.set_fragment(None)

	return builder.build().get()
コード例 #3
0
def test_url_with_user_and_password():
    """Parse a URL carrying a user and a password and check each part."""
    # The credentials are spelled out in the URL: the assertions below expect
    # the user 'user' and the password 'password', which a masked
    # '*****:*****' literal could never satisfy.
    raw_url = 'http://user:password@example.com/'
    url = parse_string(raw_url)
    assert url.get_scheme() == 'http'
    assert url.get_host() == 'example.com'
    assert url.get_user() == 'user'
    assert url.get_password() == 'password'
    # The URL is expected to serialize back to the input unchanged.
    assert url.get() == raw_url
コード例 #4
0
def test_url_with_user_and_password():
	"""Parse a URL carrying a user and a password and check each part."""
	# The credentials are spelled out in the URL: the assertions below expect
	# the user 'user' and the password 'password', which a masked
	# '*****:*****' literal could never satisfy.
	raw_url = 'http://user:password@example.com/'
	url = parse_string(raw_url)
	assert url.get_scheme() == 'http'
	assert url.get_host() == 'example.com'
	assert url.get_user() == 'user'
	assert url.get_password() == 'password'
	# The URL is expected to serialize back to the input unchanged.
	assert url.get() == raw_url
コード例 #5
0
def test_url():
	"""Parse a URL that uses every component and check each accessor."""
	# The credentials are spelled out in the URL: the assertions below expect
	# the user 'user' and the password 'password', which a masked
	# '*****:*****' literal could never satisfy.
	raw_url = 'http://user:password@example.com/path1/path2?q=query&query#fragment'
	url = parse_string(raw_url)
	assert url.get_scheme() == 'http'
	assert url.get_user() == 'user'
	assert url.get_password() == 'password'
	assert url.get_host() == 'example.com'
	assert url.get_paths() == ['path1', 'path2']
	# A parameter without '=' is expected as a one-element tuple.
	assert url.get_queries() == [('q', 'query'), ('query',)]
	assert url.get_fragment() == 'fragment'
	# The URL is expected to serialize back to the input unchanged.
	assert url.get() == raw_url
コード例 #6
0
def test_url():
    """Parse a URL that uses every component and check each accessor."""
    # The credentials are spelled out in the URL: the assertions below expect
    # the user 'user' and the password 'password', which a masked
    # '*****:*****' literal could never satisfy.
    raw_url = (
        'http://user:password@example.com/path1/path2?q=query&query#fragment')
    url = parse_string(raw_url)
    assert url.get_scheme() == 'http'
    assert url.get_user() == 'user'
    assert url.get_password() == 'password'
    assert url.get_host() == 'example.com'
    assert url.get_paths() == ['path1', 'path2']
    # A parameter without '=' is expected as a one-element tuple.
    assert url.get_queries() == [('q', 'query'), ('query', )]
    assert url.get_fragment() == 'fragment'
    # The URL is expected to serialize back to the input unchanged.
    assert url.get() == raw_url
コード例 #7
0
def test_malformat_url():
    """Parsing 'example.com' must fail with the message 'Missing scheme.'."""
    with raises(MalformatUrlException) as exc_info:
        parse_string('example.com')
    message = str(exc_info.value)
    assert message == 'Missing scheme.'
コード例 #8
0
def test_queries_sorted():
	"""Two URLs differing only in parameter order yield equal query lists."""
	first_queries = parse_string('http://example.com/?param1=param1val&param2=param2val').get_queries()
	second_queries = parse_string('http://example.com/?param2=param2val&param1=param1val').get_queries()
	assert first_queries == second_queries
コード例 #9
0
def test_keep_queries_order():
    """Two URLs differing only in parameter order yield unequal query lists."""
    first_queries = parse_string(
        'http://example.com/?param1=param1val&param2=param2val').get_queries()
    second_queries = parse_string(
        'http://example.com/?param2=param2val&param1=param1val').get_queries()
    assert first_queries != second_queries
コード例 #10
0
def test_url_with_queries():
	"""Parse a URL with a key=value parameter and a bare parameter."""
	raw_url = 'http://example.com/?q=query&query'
	parsed = parse_string(raw_url)
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	# The bare parameter is expected as a one-element tuple.
	expected_queries = [('q', 'query'), ('query',)]
	assert parsed.get_queries() == expected_queries
	assert parsed.get() == raw_url
コード例 #11
0
def test_url_with_paths_and_trailing_slash():
	"""Parse a two-segment path ending in '/'; the output drops that slash."""
	parsed = parse_string('http://example.com/path1/path2/')
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	expected_paths = ['path1', 'path2']
	assert parsed.get_paths() == expected_paths
	# The serialized URL is expected without the trailing slash.
	expected_url = 'http://example.com/path1/path2'
	assert parsed.get() == expected_url
コード例 #12
0
def test_url_with_nan_port():
	"""A non-numeric port must fail with the message 'Port must be a number.'."""
	with raises(MalformatUrlException) as exc_info:
		parse_string('http://example.com:abcd/')
	message = str(exc_info.value)
	assert message == 'Port must be a number.'
コード例 #13
0
def test_malformat_url():
	"""Parsing 'example.com' must fail with the message 'Missing scheme.'."""
	with raises(MalformatUrlException) as exc_info:
		parse_string('example.com')
	message = str(exc_info.value)
	assert message == 'Missing scheme.'
コード例 #14
0
def test_url_with_user():
	"""Parse a URL carrying only a user name and check each part."""
	# The user info is spelled out in the URL: the assertions below expect the
	# user 'user' on host 'example.com', which the '[email protected]'
	# obfuscation placeholder could never satisfy.
	raw_url = 'http://user@example.com/'
	url = parse_string(raw_url)
	assert url.get_scheme() == 'http'
	assert url.get_host() == 'example.com'
	assert url.get_user() == 'user'
	# The URL is expected to serialize back to the input unchanged.
	assert url.get() == raw_url
コード例 #15
0
def test_host_is_ip():
	"""Parse a URL whose host is a dotted IPv4 address."""
	raw_url = 'http://203.0.113.1/'
	parsed = parse_string(raw_url)
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == '203.0.113.1'
	# The URL is expected to serialize back to the input unchanged.
	assert parsed.get() == raw_url
コード例 #16
0
def test_url_with_port():
    """The port of a parsed URL is compared against the integer 8000."""
    parsed = parse_string('http://example.com:8000/')
    port = parsed.get_port()
    assert port == 8000
コード例 #17
0
def test_url_with_user():
    """Parse a URL carrying only a user name and check each part."""
    # The user info is spelled out in the URL: the assertions below expect the
    # user 'user' on host 'example.com', which the '[email protected]'
    # obfuscation placeholder could never satisfy.
    raw_url = 'http://user@example.com/'
    url = parse_string(raw_url)
    assert url.get_scheme() == 'http'
    assert url.get_host() == 'example.com'
    assert url.get_user() == 'user'
    # The URL is expected to serialize back to the input unchanged.
    assert url.get() == raw_url
コード例 #18
0
def test_simple_url_without_trailing_slash():
    """A URL without a path is expected to serialize with a trailing '/'."""
    parsed = parse_string('http://example.com')
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    expected_url = 'http://example.com/'
    assert parsed.get() == expected_url
コード例 #19
0
def test_host_is_ip():
    """Parse a URL whose host is a dotted IPv4 address."""
    raw_url = 'http://203.0.113.1/'
    parsed = parse_string(raw_url)
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == '203.0.113.1'
    # The URL is expected to serialize back to the input unchanged.
    assert parsed.get() == raw_url
コード例 #20
0
def test_simple_url_without_trailing_slash():
	"""A URL without a path is expected to serialize with a trailing '/'."""
	parsed = parse_string('http://example.com')
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	expected_url = 'http://example.com/'
	assert parsed.get() == expected_url
コード例 #21
0
def test_url_with_nan_port():
    """A non-numeric port must fail with the message 'Port must be a number.'."""
    with raises(MalformatUrlException) as exc_info:
        parse_string('http://example.com:abcd/')
    message = str(exc_info.value)
    assert message == 'Port must be a number.'
コード例 #22
0
def test_url_with_path():
    """Parse a URL with a single path segment and check each part."""
    raw_url = 'http://example.com/path'
    parsed = parse_string(raw_url)
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    expected_paths = ['path']
    assert parsed.get_paths() == expected_paths
    # The URL is expected to serialize back to the input unchanged.
    assert parsed.get() == raw_url
コード例 #23
0
def test_url_with_paths_and_trailing_slash():
    """Parse a two-segment path ending in '/'; the output drops that slash."""
    parsed = parse_string('http://example.com/path1/path2/')
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    expected_paths = ['path1', 'path2']
    assert parsed.get_paths() == expected_paths
    # The serialized URL is expected without the trailing slash.
    expected_url = 'http://example.com/path1/path2'
    assert parsed.get() == expected_url
コード例 #24
0
def test_url_with_port():
	"""The port of a parsed URL is compared against the integer 8000."""
	parsed = parse_string('http://example.com:8000/')
	port = parsed.get_port()
	assert port == 8000
コード例 #25
0
def test_url_with_query():
    """Parse a URL with one key=value query parameter and check each part."""
    raw_url = 'http://example.com/?q=query'
    parsed = parse_string(raw_url)
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    expected_queries = [('q', 'query')]
    assert parsed.get_queries() == expected_queries
    # The URL is expected to serialize back to the input unchanged.
    assert parsed.get() == raw_url
コード例 #26
0
def test_url_with_path():
	"""Parse a URL with a single path segment and check each part."""
	raw_url = 'http://example.com/path'
	parsed = parse_string(raw_url)
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	expected_paths = ['path']
	assert parsed.get_paths() == expected_paths
	# The URL is expected to serialize back to the input unchanged.
	assert parsed.get() == raw_url
コード例 #27
0
def test_simple_url():
    """Parse a minimal scheme-plus-host URL and check each part."""
    raw_url = 'http://example.com/'
    parsed = parse_string(raw_url)
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    # The URL is expected to serialize back to the input unchanged.
    assert parsed.get() == raw_url
コード例 #28
0
def test_simple_url():
	"""Parse a minimal scheme-plus-host URL and check each part."""
	raw_url = 'http://example.com/'
	parsed = parse_string(raw_url)
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	# The URL is expected to serialize back to the input unchanged.
	assert parsed.get() == raw_url
コード例 #29
0
def test_url_with_empty_fragment():
	"""A bare '#' gives an empty-string fragment that is kept in the output."""
	raw_url = 'http://example.com/#'
	parsed = parse_string(raw_url)
	assert parsed.get_scheme() == 'http'
	assert parsed.get_host() == 'example.com'
	assert parsed.get_fragment() == ''
	# The serialized URL is expected to keep the trailing '#'.
	assert parsed.get() == raw_url
コード例 #30
0
def test_url_with_query_and_equal_sign_no_value():
    """A 'query=' parameter is expected as a pair with an empty-string value."""
    raw_url = "http://example.com/?query="
    parsed = parse_string(raw_url)
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    expected_queries = [('query', '')]
    assert parsed.get_queries() == expected_queries
    # The URL is expected to serialize back to the input unchanged.
    assert parsed.get() == raw_url
コード例 #31
0
def test_url_with_empty_fragment():
    """A bare '#' gives an empty-string fragment that the output omits."""
    parsed = parse_string('http://example.com/#')
    assert parsed.get_scheme() == 'http'
    assert parsed.get_host() == 'example.com'
    assert parsed.get_fragment() == ''
    # The serialized URL is expected without the trailing '#'.
    expected_url = 'http://example.com/'
    assert parsed.get() == expected_url