Ejemplo n.º 1
0
def test_split_head_body():
    '''
    Whitebox test of the heuristics in this function
    '''
    big = 'x' * 100000

    # No markers at all: everything is body.
    head, body = parse.split_head_body(big)
    assert (head, len(body)) == ('', 100000)

    # A lone opening <head> yields an empty head; the prefix and tag stay in the body.
    head, body = parse.split_head_body('x' + '<HeAd>' + big)
    assert (head, len(body)) == ('', 100007)

    # A closing </head> ends the head section.
    head, body = parse.split_head_body('x' + '</HeAd>' + big)
    assert (head, len(body)) == ('x', 100000)

    # An opening <body> also ends the head section.
    head, body = parse.split_head_body('x' + '<BoDy>' + big)
    assert (head, len(body)) == ('x', 100000)

    # The opening <head> tag is kept inside the returned head.
    head, body = parse.split_head_body('x' + '<heAd><boDy>' + big)
    assert (head, len(body)) == ('x<heAd>', 100000)

    head, body = parse.split_head_body('x' + '<hEad></heAd>' + big)
    assert (head, len(body)) == ('x<hEad>', 100000)

    # With both </head> and <body> present, the split happens at </head>,
    # so the <bOdy> tag remains in the body text.
    head, body = parse.split_head_body('x' + '<heaD></Head><bOdy>' + big)
    assert (head, len(body)) == ('x<heaD>', 100006)
Ejemplo n.º 2
0
def test_split_head_body():
    '''
    Whitebox test of the heuristics in this function
    '''
    big = 'x' * 100000
    # (prefix, expected head, expected body length) — each prefix is
    # prepended to 100000 filler characters.
    cases = (
        ('', '', 100000),
        ('x' + '<HeAd>', '', 100007),
        ('x' + '</HeAd>', 'x', 100000),
        ('x' + '<BoDy>', 'x', 100000),
        ('x' + '<heAd><boDy>', 'x<heAd>', 100000),
        ('x' + '<hEad></heAd>', 'x<hEad>', 100000),
        ('x' + '<heaD></Head><bOdy>', 'x<heaD>', 100006),
    )
    for prefix, expected_head, expected_body_len in cases:
        head, body = parse.split_head_body(prefix + big)
        assert head == expected_head
        assert len(body) == expected_body_len
Ejemplo n.º 3
0
def test_individual_parsers():
    '''Check the regex- and soup-based link extractors against the fixtures.'''
    # Whole-document regex pass: every url is reported as a link, none as embeds.
    found_links, found_embeds = parse.find_html_links_re(test_html)
    assert (len(found_links), len(found_embeds)) == (6, 0)
    for url in ('foo2.htm', 'foo3.html ', 'foo.gif', 'torture"\n<url>'):
        assert url in found_links

    # Body-only regex pass distinguishes links from embeds.
    head, body = parse.split_head_body(test_html)
    found_links, found_embeds = parse.find_body_links_re(body)
    assert (len(found_links), len(found_embeds)) == (4, 1)
    for url in ('foo2.htm', 'foo3.html ', 'torture"\n<url>'):
        assert url in found_links
    assert 'foo.gif' in found_embeds

    # Soup-based head pass finds only the <link> embed.
    head_soup = BeautifulSoup(head, 'lxml')
    found_links, found_embeds = parse.find_head_links_soup(head_soup)
    assert (len(found_links), len(found_embeds)) == (0, 1)
    assert 'link.html' in found_embeds

    # Combined head + body soup passes agree with the regex totals.
    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    found_links, found_embeds = parse.find_head_links_soup(head_soup)
    body_links, body_embeds = parse.find_body_links_soup(body_soup)
    found_links.update(body_links)
    found_embeds.update(body_embeds)
    assert (len(found_links), len(found_embeds)) == (4, 2)
    for url in ('foo2.htm', 'foo3.html ', 'torture"\n<url>'):
        assert url in found_links
    assert 'link.html' in found_embeds
    assert 'foo.gif' in found_embeds

    # Harder fixture: iframe counts as a link, stylesheet as an embed.
    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    body_links, body_embeds = parse.find_body_links_soup(body_soup)
    assert (len(body_links), len(body_embeds)) == (1, 1)
    assert 'iframe.html' in body_links
    assert 'stylesheet.blah' in body_embeds
Ejemplo n.º 4
0
def test_individual_parsers():
    '''Sanity-check each link extractor individually against test_html.'''
    # The whole-document regex extractor reports everything as a link.
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    for expected in ('foo2.htm', 'foo3.html ', 'foo.gif', 'torture"\n<url>'):
        assert expected in links

    # The body-only regex extractor separates embeds from links.
    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    for expected in ('foo2.htm', 'foo3.html ', 'torture"\n<url>'):
        assert expected in links
    assert 'foo.gif' in embeds

    # The soup-based head extractor finds only the <link> embed.
    links, embeds = parse.find_head_links_soup(BeautifulSoup(head, 'lxml'))
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embeds

    # Merging head and body soup results matches the regex totals.
    links, embeds = parse.find_head_links_soup(BeautifulSoup(head, 'lxml'))
    lbody, ebody = parse.find_body_links_soup(BeautifulSoup(body, 'lxml'))
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 4
    assert len(embeds) == 2
    for expected in ('foo2.htm', 'foo3.html ', 'torture"\n<url>'):
        assert expected in links
    assert 'link.html' in embeds
    assert 'foo.gif' in embeds
Ejemplo n.º 5
0
def parse_all(name, string):
    '''
    Run every parser over one document, timing each stage with
    stats.record_burn, then report any disagreement in the link counts.

    name -- label used in the timing records and in the printed report
    string -- the raw html document text
    '''
    all_links = []

    # warmup: run everything once un-timed so one-time setup costs
    # (regex compilation, parser initialization) don't skew the timings below

    head, body = parse.split_head_body(string)

    links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    links, embeds = parse.find_body_links_re(body)

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    links, embeds = parse.find_body_links_soup(body_soup)

    # measurement

    with stats.record_burn('split_head_body', url=name):
        head, body = parse.split_head_body(string)

    with stats.record_burn('find_html_links_re', url=name):
        links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    all_links.append(links.union(embeds))

    with stats.record_burn('head_soup', url=name):
        head_soup = BeautifulSoup(head, 'lxml')
    with stats.record_burn('find_head_links_soup', url=name):
        head_links, head_embeds = parse.find_head_links_soup(head_soup)

    body = '<html>' + body  # because a closing tag at the start of body screws up lxml
    with stats.record_burn('find_body_links_re', url=name):
        links, embeds = parse.find_body_links_re(body)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    with stats.record_burn('body_soup', url=name):
        body_soup = BeautifulSoup(body, 'lxml')
    with stats.record_burn('find_body_links_soup', url=name):
        links, embeds = parse.find_body_links_soup(body_soup)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    # evaluation

    # max() with a generator replaces functools.reduce(max, [len(x) ...]):
    # same result, idiomatic, and no intermediate list
    biggest = max(len(x) for x in all_links)
    # keep last-match-wins semantics of the original scan
    for i, v in enumerate(all_links):
        if len(v) == biggest:
            biggest_index = i
            biggest_links = v

    # parallel to the order results were appended to all_links above
    names = 'find_html_links_re', 'find_body_links_re', 'find_body_links_soup'

    for i, v in enumerate(all_links):
        if len(v) != biggest:
            print('{} had different link counts of {} and {}'.format(name, biggest, len(v)))
            extra1 = v.difference(biggest_links)
            extra2 = biggest_links.difference(v)
            if extra1:
                print('  extra in {}: {!r}'.format(names[i], extra1))
            else:
                print('  count was {} for {}'.format(len(v), names[i]))
            if extra2:
                print('  extra in {}: {!r}'.format(names[biggest_index], extra2))
            else:
                print('  count was {} for {}'.format(len(biggest_links), names[biggest_index]))
Ejemplo n.º 6
0
def test_individual_parsers():
    '''
    Exercise the regex- and soup-based link extractors, including the
    anchor-aware variant, against the test fixtures.
    '''
    # Whole-document regex pass: everything is reported as a link.
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    linkset = set(parse.collapse_links(links))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'foo.gif' in linkset
    assert 'torture"\n<url>' in linkset

    # Body-only regex pass separates links from embeds.
    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'foo.gif' in embedset

    # Anchor-aware regex pass returns dicts with href + anchor text.
    links, embeds = parse.find_body_links_anchors_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    # dict comprehension instead of dict([(k, v) for ...]) -- same mapping, clearer idiom
    linkdict = {link['href']: link['anchor'] for link in links}
    assert linkdict['foo2.htm'] == 'Anchor 2'
    assert linkdict['foo3.html '] == 'Anchor 3'
    assert linkdict['torture"\n<url>'] == 'torture\nanchor'
    assert 'foo.gif' in embeds[0]['src']

    # Soup-based head pass finds only the <link> embed.
    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embedset

    # Combined head + body soup passes match the regex totals.
    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links += lbody
    embeds += ebody
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'link.html' in embedset
    assert 'foo.gif' in embedset

    # Harder fixture: the iframe is a link, the stylesheet an embed.
    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    lbody, ebody = parse.find_body_links_soup(body_soup)
    assert len(lbody) == 1
    assert len(ebody) == 1
    assert 'iframe.html' == lbody[0]['src']
    assert 'stylesheet.blah' == ebody[0]['href']
Ejemplo n.º 7
0
</head></head>
<body>
<a href="a-url">an-anchor</a>
</body>
</html>
'''

# Demo script: parse the html document (defined in the literal above) with
# BeautifulSoup and with parse.split_head_body, printing what each one sees.
all_soup = BeautifulSoup(html, 'lxml')
print('all soup:', repr(all_soup))

# try to get the adblock key out
# NOTE(review): .get() on the BeautifulSoup object itself fails — presumably the
# data-adblockkey attribute sits on the <html> tag, so we must .find('html')
# first; confirm against the html literal above.
print('got data-adblockkey of', all_soup.get('data-adblockkey'))  # fails
html_soup = all_soup.find('html')
print('got data-adblockkey of', html_soup.get('data-adblockkey'))  # works

# Split the same document with the project's head/body heuristic and
# soup each half separately to compare against the whole-document parse.
head, body = parse.split_head_body(html)
print('head', head)
head_soup = BeautifulSoup(head, 'lxml')
print('head soup:', repr(head_soup))

print()

print('body:', body)
body_soup = BeautifulSoup(body, 'lxml')
print('body soup:', repr(body_soup))

body = '''
<body>
<a href="a-url">an-anchor</a>
</body>
</html>