Code example #1
def test_lxml():
    body_soup = BeautifulSoup(html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0

    fixedup_html = '<html></head>' + html
    body_soup = BeautifulSoup(fixedup_html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0
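
The snippets on this page omit their imports and test fixtures. A minimal sketch of the setup they appear to assume is below; the import paths and the fixture contents are assumptions (the real test_parse.py defines multi-line HTML torture tests that are not shown in these examples):

from bs4 import BeautifulSoup

import cocrawler.parse as parse
import cocrawler.stats as stats  # used only by the parse_all() benchmark in example #5

# Placeholder fixtures; the real contents are not shown in these examples.
html = '<a href="foo.html">one link, no embeds</a>'  # assumed shape
test_html = '...'         # torture-test HTML with several links and one embed
test_html_harder = '...'  # HTML exercising iframe and stylesheet extraction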
Code example #2
File: test_parse.py Project: cocrawler/cocrawler
def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'foo.gif' in links
    assert 'torture"\n<url>' in links

    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'foo.gif' in embeds
    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'link.html' in embeds
    assert 'foo.gif' in embeds

    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    lbody, ebody = parse.find_body_links_soup(body_soup)
    assert len(lbody) == 1
    assert len(ebody) == 1
    assert 'iframe.html' in lbody
    assert 'stylesheet.blah' in ebody
Code example #3
File: test_parse.py Project: pombredanne/cocrawler
def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'foo.gif' in links
    assert 'torture"\n<url>' in links

    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'foo.gif' in embeds
    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'link.html' in embeds
    assert 'foo.gif' in embeds
Code example #4
File: test_parse.py Project: caowenbin08/cocrawler
def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 5
    assert len(embeds) == 0
    assert 'foo3.html' in links
    assert 'foo.gif' in links

    head, body = parse.split_head_body_re(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 3
    assert len(embeds) == 1
    assert 'foo3.html' in links
    assert 'foo.gif' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 3
    assert len(embeds) == 2
    assert 'foo3.html ' in links  # this space will disappear in urls.URL()
    assert 'foo.gif' in embeds
Code example #5
def parse_all(name, string):
    all_links = []

    # warmup

    head, body = parse.split_head_body(string)

    links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    links, embeds = parse.find_body_links_re(body)

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    links, embeds = parse.find_body_links_soup(body_soup)

    # measurement

    with stats.record_burn('split_head_body', url=name):
        head, body = parse.split_head_body(string)

    with stats.record_burn('find_html_links_re', url=name):
        links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    all_links.append(links.union(embeds))

    with stats.record_burn('head_soup', url=name):
        head_soup = BeautifulSoup(head, 'lxml')
    with stats.record_burn('find_head_links_soup', url=name):
        head_links, head_embeds = parse.find_head_links_soup(head_soup)

    body = '<html>' + body  # because a closing tag at the start of body screws up lxml
    with stats.record_burn('find_body_links_re', url=name):
        links, embeds = parse.find_body_links_re(body)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    with stats.record_burn('body_soup', url=name):
        body_soup = BeautifulSoup(body, 'lxml')
    with stats.record_burn('find_body_links_soup', url=name):
        links, embeds = parse.find_body_links_soup(body_soup)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    # evaluation

    biggest = max(len(x) for x in all_links)
    # remember which parser produced the largest link set
    for i, v in enumerate(all_links):
        if len(v) == biggest:
            biggest_index = i
            biggest_links = v

    names = 'find_html_links_re', 'find_body_links_re', 'find_body_links_soup'

    for i, v in enumerate(all_links):
        if len(v) != biggest:
            print('{} had different link counts of {} and {}'.format(name, biggest, len(v)))
            extra1 = v.difference(biggest_links)
            extra2 = biggest_links.difference(v)
            if extra1:
                print('  extra in {}: {!r}'.format(names[i], extra1))
            else:
                print('  count was {} for {}'.format(len(v), names[i]))
            if extra2:
                print('  extra in {}: {!r}'.format(names[biggest_index], extra2))
            else:
                print('  count was {} for {}'.format(len(biggest_links), names[biggest_index]))
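
A hypothetical driver for the parse_all() benchmark above; reading the page from disk and the stats.report() call are assumptions, not part of the original snippet:

# Hypothetical usage: time the parsers on one fetched page, then dump stats.
with open('page.html', encoding='utf8') as f:
    parse_all('page.html', f.read())
stats.report()  # assumed: cocrawler.stats exposes a reporting helper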
Code example #6
def test_lxml_close():
    defective_html = '</head>' + html
    body_soup = BeautifulSoup(defective_html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0
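
This is the same lxml quirk that parse_all() in example #5 works around by prepending an opening tag. A minimal sketch of that repair applied to this fixture:

# Prepending '<html>' keeps lxml from mishandling a document that starts
# with a stray closing tag (compare the fixups in examples #1 and #5).
repaired_html = '<html>' + defective_html
body_soup = BeautifulSoup(repaired_html, 'lxml')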
Code example #7
def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    linkset = set(parse.collapse_links(links))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'foo.gif' in linkset
    assert 'torture"\n<url>' in linkset

    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'foo.gif' in embedset

    links, embeds = parse.find_body_links_anchors_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    linkdict = {link['href']: link['anchor'] for link in links}
    # expected (href, anchor) pairs: ('foo1.html', 'Anchor 1'), ('foo2.htm', 'Anchor 2'), ('foo3.html ', 'Anchor 3'), ('torture"\n<url>', 'torture\nanchor')
    assert linkdict['foo2.htm'] == 'Anchor 2'
    assert linkdict['foo3.html '] == 'Anchor 3'
    assert linkdict['torture"\n<url>'] == 'torture\nanchor'
    assert 'foo.gif' in embeds[0]['src']

    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embedset

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links += lbody
    embeds += ebody
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'link.html' in embedset
    assert 'foo.gif' in embedset

    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    lbody, ebody = parse.find_body_links_soup(body_soup)
    assert len(lbody) == 1
    assert len(ebody) == 1
    assert 'iframe.html' == lbody[0]['src']
    assert 'stylesheet.blah' == ebody[0]['href']
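
Unlike examples #2 through #4, this version of the parsers evidently returns lists of dicts (keyed by 'href' or 'src', plus 'anchor') rather than sets of URL strings, so results are merged with += and flattened through parse.collapse_links(). A sketch of that shape, inferred from the asserts above rather than from cocrawler's documentation:

# Inferred return shape -- an assumption based on the asserts above.
links = [{'href': 'foo2.htm', 'anchor': 'Anchor 2'}]
urls = set(parse.collapse_links(links))  # -> {'foo2.htm'}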