def test_parse_url():
    """analyze_url splits a URL into pieces and fills url-meta attributes."""
    cases = [
        ('http://www.test.com/', ('',), [('depth', 1)]),
        ('http://www.test.com/?', ('', ''), [('depth', 2)]),
        (
            'http://www.test.com/abc/def?k=v#xxx',
            ('abc', 'def', 'v', 'xxx'),
            [('depth', 4), ('has_fragment', True)],
        ),
    ]
    for url, expected_pieces, meta_checks in cases:
        url_meta, pieces = analyze_url(url)
        assert pieces == expected_pieces
        for attr, expected in meta_checks:
            assert getattr(url_meta, attr) == expected
    # A URL without any path component is rejected as irregular.
    with pytest.raises(IrregularURLException):
        analyze_url('http://www.g.com')
def test_unpack_pack():
    """Packing the result of analyze_url yields the escaped pattern path."""
    expected_by_url = {
        'http://www.g.com/': '/',
        'http://www.g.com/abc': '/abc',
        'http://www.g.com/abc?a=1#c': '/abc[\\?]a=1#c',
        'http://www.g.com/abc???a=1#c': '/abc[\\?][\\?]{2}a=1#c',
        'http://www.g.com/abc?=1#c': '/abc[\\?]=1#c',
        'http://www.g.com/abc?a=1#': '/abc[\\?]a=1#',
        'http://www.g.com/abc?a=1&b=2#': '/abc[\\?]a=1&b=2#',
    }
    for url, expected in expected_by_url.items():
        packed = pack(*analyze_url(url))
        assert packed == expected
def test_parse_url_pattern_string():
    """Pattern strings parsed from a pattern match the URL's pieces one-to-one."""
    cases = [
        ('/AaBb/123456.shtml', '/[A-Za-z]+/[0-9]{6}[\\.]shtml'),
        ('/abc/123/index.html', '/abc/123/index[\\.]html'),
        ('/12345678/index.asp?id=123', '/[0-9]{8}/[a-z]+[\\.]asp[\\?]id=[0-9]+'),
        ('/newsShow.asp?dataID=1', '/newsShow[\\.]asp[\\?]dataID=[0-9]+'),
    ]
    for path, pattern in cases:
        full_url = 'http://example.com' + path
        meta_from_url, pieces = analyze_url(full_url)
        meta_from_pattern, pattern_strings = analyze_url_pattern_string(pattern)
        # Both parses must agree on the URL's structural metadata.
        assert meta_from_url == meta_from_pattern
        for pattern_string, piece in zip(pattern_strings, pieces):
            assert Pattern(pattern_string).match(piece)
def test_parse_url_pattern():
    """Round trip: analyze -> pack -> re-analyze preserves meta and piece count."""
    urls = [
        'http://www.g.com/',
        'http://www.g.com/abc',
        'http://www.g.com/abc?a=1#c',
        'http://www.g.com/abc???a=1#c',
        'http://www.g.com/abc?=1#c',
        'http://www.g.com/abc?a=1#',
        'http://www.g.com/abc?a=1&b=2#',
    ]
    for url in urls:
        original_meta, original_pieces = analyze_url(url)
        packed = pack(original_meta, original_pieces)
        reparsed_meta, reparsed_pieces = analyze_url_pattern_string(packed)
        assert original_meta == reparsed_meta
        assert len(original_pieces) == len(reparsed_pieces)
def test_digest():
    """Structurally equivalent URLs collapse to one identical fuzzy digest."""
    parser = PieceParser()
    url_groups = [
        ('/abc/', '/abcdef/'),
        ('/abc/index.html?k1=v1&k2=v2', '/abc/html.htm?k1=c01&k2=2m'),
        ('/abc/index.html?k1=v1#abc', '/abc/html.htm?k1=c01#def'),
    ]
    for group in url_groups:
        seen = set()
        for path in group:
            url_meta, pieces = analyze_url('http://example.com' + path)
            parsed = [parser.parse(piece) for piece in pieces]
            expected = digest(url_meta, [item.fuzzy_rule for item in parsed])
            # fuzzy_digest must agree with digesting the fuzzy rules directly.
            assert fuzzy_digest(url_meta, parsed) == expected
            seen.add(expected)
        # Every URL in the group shares one digest.
        assert len(seen) == 1
def test_analyze_url():
    """analyze_url distinguishes empty vs missing query/fragment components."""
    cases = [
        ('http://www.g.com/test',
         {'path': '/test', 'query': None, 'fragment': None}),
        ('http://www.g.com/test?', {'query': '', 'fragment': None}),
        ('http://www.g.com/test?#', {'query': '', 'fragment': ''}),
        ('http://www.g.com/test?#abc', {'query': '', 'fragment': 'abc'}),
        ('http://www.g.com/test#abc', {'query': None, 'fragment': 'abc'}),
        ('http://www.g.com/test?a#', {'query': 'a', 'fragment': ''}),
        ('http://www.g.com/test?a##', {'query': 'a', 'fragment': '#'}),
        ('http://www.g.com/test#?', {'query': None, 'fragment': '?'}),
    ]
    for url, expectations in cases:
        result = analyze_url(url)
        for attr, expected in expectations.items():
            assert getattr(result, attr) == expected