Example #1
0
def test_parse_url():
    """analyze_url splits a URL into pieces and a meta object with attributes."""
    cases = [
        ('http://www.test.com/', ('', ), [('depth', 1)]),
        ('http://www.test.com/?', ('', ''), [('depth', 2)]),
        ('http://www.test.com/abc/def?k=v#xxx', ('abc', 'def', 'v', 'xxx'),
         [('depth', 4), ('has_fragment', True)]),
    ]
    for url, expected_pieces, meta_checks in cases:
        meta, pieces = analyze_url(url)
        assert pieces == expected_pieces
        for attr, expected in meta_checks:
            assert getattr(meta, attr) == expected
    # A URL with no path component is rejected.
    with pytest.raises(IrregularURLException):
        analyze_url('http://www.g.com')
Example #2
0
def test_unpack_pack():
    """Packing the result of analyze_url yields the expected pattern string."""
    cases = (
        ('http://www.g.com/', '/'),
        ('http://www.g.com/abc', '/abc'),
        ('http://www.g.com/abc?a=1#c', '/abc[\\?]a=1#c'),
        ('http://www.g.com/abc???a=1#c', '/abc[\\?][\\?]{2}a=1#c'),
        ('http://www.g.com/abc?=1#c', '/abc[\\?]=1#c'),
        ('http://www.g.com/abc?a=1#', '/abc[\\?]a=1#'),
        ('http://www.g.com/abc?a=1&b=2#', '/abc[\\?]a=1&b=2#'),
    )
    for url, want in cases:
        meta, pieces = analyze_url(url)
        assert pack(meta, pieces) == want
Example #3
0
def test_parse_url_pattern_string():
    """A pattern string parses to the same meta as its URL, and each piece
    pattern matches the corresponding URL piece."""
    cases = [
        ('/AaBb/123456.shtml', '/[A-Za-z]+/[0-9]{6}[\\.]shtml'),
        ('/abc/123/index.html', '/abc/123/index[\\.]html'),
        ('/12345678/index.asp?id=123',
         '/[0-9]{8}/[a-z]+[\\.]asp[\\?]id=[0-9]+'),
        ('/newsShow.asp?dataID=1', '/newsShow[\\.]asp[\\?]dataID=[0-9]+'),
    ]

    prefix = 'http://example.com'
    for path, pattern_str in cases:
        meta_from_url, url_pieces = analyze_url(prefix + path)
        meta_from_pattern, piece_patterns = \
            analyze_url_pattern_string(pattern_str)
        assert meta_from_url == meta_from_pattern
        for piece_pattern, piece in zip(piece_patterns, url_pieces):
            assert Pattern(piece_pattern).match(piece)
Example #4
0
def test_parse_url_pattern():
    """Round trip: analyze -> pack -> parse preserves meta and piece count."""
    urls = (
        'http://www.g.com/',
        'http://www.g.com/abc',
        'http://www.g.com/abc?a=1#c',
        'http://www.g.com/abc???a=1#c',
        'http://www.g.com/abc?=1#c',
        'http://www.g.com/abc?a=1#',
        'http://www.g.com/abc?a=1&b=2#',
    )
    for url in urls:
        original_meta, original_pieces = analyze_url(url)
        packed = pack(original_meta, original_pieces)
        reparsed_meta, reparsed_pieces = analyze_url_pattern_string(packed)
        assert original_meta == reparsed_meta
        assert len(original_pieces) == len(reparsed_pieces)
Example #5
0
def test_digest():
    """URLs in the same group share one fuzzy digest, and fuzzy_digest agrees
    with digest computed from the pieces' fuzzy rules."""
    parser = PieceParser()
    groups = [
        ('/abc/', '/abcdef/'),
        ('/abc/index.html?k1=v1&k2=v2', '/abc/html.htm?k1=c01&k2=2m'),
        ('/abc/index.html?k1=v1#abc', '/abc/html.htm?k1=c01#def'),
    ]

    for group in groups:
        seen = set()
        for path in group:
            meta, pieces = analyze_url('http://example.com' + path)
            parsed = [parser.parse(piece) for piece in pieces]
            expected = digest(meta, [item.fuzzy_rule for item in parsed])
            assert fuzzy_digest(meta, parsed) == expected
            seen.add(expected)
        # Every URL in the group collapses to a single digest.
        assert len(seen) == 1
Example #6
0
def test_analyze_url():
    """analyze_url exposes path/query/fragment attributes on its result."""
    cases = [
        [
            'http://www.g.com/test', ('path', '/test'), ('query', None),
            ('fragment', None)
        ],
        ['http://www.g.com/test?', ('query', ''), ('fragment', None)],
        ['http://www.g.com/test?#', ('query', ''), ('fragment', '')],
        ['http://www.g.com/test?#abc', ('query', ''), ('fragment', 'abc')],
        ['http://www.g.com/test#abc', ('query', None), ('fragment', 'abc')],
        ['http://www.g.com/test?a#', ('query', 'a'), ('fragment', '')],
        ['http://www.g.com/test?a##', ('query', 'a'), ('fragment', '#')],
        ['http://www.g.com/test#?', ('query', None), ('fragment', '?')],
    ]
    for url, *expectations in cases:
        result = analyze_url(url)
        for attr, want in expectations:
            assert getattr(result, attr) == want