def test_lxml():
    body_soup = BeautifulSoup(html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0

    fixedup_html = '<html></head>' + html
    body_soup = BeautifulSoup(fixedup_html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0

def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'foo.gif' in links
    assert 'torture"\n<url>' in links

    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'foo.gif' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in links
    assert 'foo3.html ' in links
    assert 'torture"\n<url>' in links
    assert 'link.html' in embeds
    assert 'foo.gif' in embeds

    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    lbody, ebody = parse.find_body_links_soup(body_soup)
    assert len(lbody) == 1
    assert len(ebody) == 1
    assert 'iframe.html' in lbody
    assert 'stylesheet.blah' in ebody

def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 5
    assert len(embeds) == 0
    assert 'foo3.html' in links
    assert 'foo.gif' in links

    head, body = parse.split_head_body_re(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 3
    assert len(embeds) == 1
    assert 'foo3.html' in links
    assert 'foo.gif' in embeds

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links.update(lbody)
    embeds.update(ebody)
    assert len(links) == 3
    assert len(embeds) == 2
    assert 'foo3.html ' in links  # this space will disappear in urls.URL()
    assert 'foo.gif' in embeds

def parse_all(name, string):
    all_links = []

    # warmup
    head, body = parse.split_head_body(string)
    links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    links, embeds = parse.find_body_links_re(body)
    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    links, embeds = parse.find_body_links_soup(body_soup)

    # measurement
    with stats.record_burn('split_head_body', url=name):
        head, body = parse.split_head_body(string)

    with stats.record_burn('find_html_links_re', url=name):
        links, embeds = parse.find_html_links_re(string)  # embeds is empty here by design
    all_links.append(links.union(embeds))

    with stats.record_burn('head_soup', url=name):
        head_soup = BeautifulSoup(head, 'lxml')
    with stats.record_burn('find_head_links_soup', url=name):
        head_links, head_embeds = parse.find_head_links_soup(head_soup)

    body = '<html>' + body  # because a closing tag at the start of body screws up lxml

    with stats.record_burn('find_body_links_re', url=name):
        links, embeds = parse.find_body_links_re(body)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    with stats.record_burn('body_soup', url=name):
        body_soup = BeautifulSoup(body, 'lxml')
    with stats.record_burn('find_body_links_soup', url=name):
        links, embeds = parse.find_body_links_soup(body_soup)
    all_links.append(links.union(embeds).union(head_links).union(head_embeds))

    # evaluation
    biggest = functools.reduce(max, [len(x) for x in all_links])
    for i, v in enumerate(all_links):
        if len(v) == biggest:
            biggest_index = i
            biggest_links = v

    names = 'find_html_links_re', 'find_body_links_re', 'find_body_links_soup'
    for i, v in enumerate(all_links):
        if len(v) != biggest:
            print('{} had different link counts of {} and {}'.format(name, biggest, len(v)))
            extra1 = v.difference(biggest_links)
            extra2 = biggest_links.difference(v)
            if extra1:
                print(' extra in {}: {!r}'.format(names[i], extra1))
            else:
                print(' count was {} for {}'.format(len(v), names[i]))
            if extra2:
                print(' extra in {}: {!r}'.format(names[biggest_index], extra2))
            else:
                print(' count was {} for {}'.format(len(biggest_links), names[biggest_index]))

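# A minimal sketch of how parse_all() might be driven from the command line.
# This driver is not part of the original module: the filename arguments, the
# utf8/replace decoding, and stats.report() are all assumptions made here for
# illustration.
if __name__ == '__main__':
    import sys

    for fname in sys.argv[1:]:
        with open(fname, 'r', encoding='utf8', errors='replace') as f:
            parse_all(fname, f.read())
    stats.report()  # assumed reporting helper in the stats module
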
def test_lxml_close():
    defective_html = '</head>' + html
    body_soup = BeautifulSoup(defective_html, 'lxml')
    links, embeds = parse.find_body_links_soup(body_soup)
    assert len(links) == 1
    assert len(embeds) == 0

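# Note: test_lxml and test_lxml_close both parse a module-level `html` fixture
# defined elsewhere in this file; the assertions imply it contains exactly one
# link and no embeds. The '<html></head>' and bare '</head>' prefixes exercise
# lxml's recovery from stray closing tags ahead of the body.
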
def test_individual_parsers():
    links, embeds = parse.find_html_links_re(test_html)
    assert len(links) == 6
    assert len(embeds) == 0
    linkset = set(parse.collapse_links(links))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'foo.gif' in linkset
    assert 'torture"\n<url>' in linkset

    head, body = parse.split_head_body(test_html)
    links, embeds = parse.find_body_links_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'foo.gif' in embedset

    links, embeds = parse.find_body_links_anchors_re(body)
    assert len(links) == 4
    assert len(embeds) == 1
    # expected pairs: {('foo1.html', 'Anchor 1'), ('foo2.htm', 'Anchor 2'),
    #  ('foo3.html ', 'Anchor 3'), ('torture"\n<url>', 'torture\nanchor')}
    linkdict = {l['href']: l['anchor'] for l in links}
    assert linkdict['foo2.htm'] == 'Anchor 2'
    assert linkdict['foo3.html '] == 'Anchor 3'
    assert linkdict['torture"\n<url>'] == 'torture\nanchor'
    assert 'foo.gif' in embeds[0]['src']

    head_soup = BeautifulSoup(head, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 0
    assert len(embeds) == 1
    assert 'link.html' in embedset

    head_soup = BeautifulSoup(head, 'lxml')
    body_soup = BeautifulSoup(body, 'lxml')
    links, embeds = parse.find_head_links_soup(head_soup)
    lbody, ebody = parse.find_body_links_soup(body_soup)
    links += lbody
    embeds += ebody
    linkset = set(parse.collapse_links(links))
    embedset = set(parse.collapse_links(embeds))
    assert len(links) == 4
    assert len(embeds) == 2
    assert 'foo2.htm' in linkset
    assert 'foo3.html ' in linkset
    assert 'torture"\n<url>' in linkset
    assert 'link.html' in embedset
    assert 'foo.gif' in embedset

    head, body = parse.split_head_body(test_html_harder)
    body_soup = BeautifulSoup(body, 'lxml')
    lbody, ebody = parse.find_body_links_soup(body_soup)
    assert len(lbody) == 1
    assert len(ebody) == 1
    assert 'iframe.html' == lbody[0]['src']
    assert 'stylesheet.blah' == ebody[0]['href']