def test_fixlinks_fragment():
    """Links with a fragment part to included pages should not be marked as 'nopo'."""
    chosen_pages = {'foo'}
    html = '<a href="/wiki/foo#bar"></a>'
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') is None


# parametrize values are illustrative assumptions: any page name absent
# from chosen_pages exercises the same branch
@pytest.mark.parametrize('name', ['foo', 'foo#bar'])
def test_fixlinks_nopo(name):
    """Wiki links to pages that are not included are marked with the 'nopo' class."""
    chosen_pages = {'eggs', 'spam'}
    html = '<a href="/wiki/{}"></a>'.format(name)
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') == ['nopo']


# parametrize values are illustrative assumptions: titles that need
# filename conversion and URL quoting
@pytest.mark.parametrize('name', ['foo bar', 'año nuevo'])
def test_fixlinks_no_nopo(name):
    """Wiki links to included pages are not marked with the 'nopo' class."""
    fname = to3dirs.to_filename(name)
    chosen_pages = {fname}
    url = urllib.parse.quote(name)
    html = '<a href="/wiki/{}"></a>'.format(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') is None


def test_no_size_querystring_when_size_undefined():
    """No size querystring should be appended when the image has no width/height."""
    soup = bs4.BeautifulSoup(features="html.parser")
    url = ("//upload.wikimedia.org/wikipedia/commons/"
           "thumb/4/40/P_ps.png/35px-P_ps.png")
    tag = soup.new_tag("img", src=url)
    ImageParser.replace(tag)
    assert tag.attrs['src'].endswith(".png")


def test_included_pages_links():
    """Choosing a page should reduce the number of 'nopo' links accordingly."""
    original_html = load_fixture('article_with_inlinemath.html')

    html, _ = ImageParser.parse_html(original_html, chosen_pages=set())
    soup1 = bs4.BeautifulSoup(html, "lxml")

    html, _ = ImageParser.parse_html(original_html, chosen_pages={"Wikcionario"})
    soup2 = bs4.BeautifulSoup(html, "lxml")

    # including "Wikcionario" turns exactly one link from 'nopo' to normal
    no_chosen_pages_count = len(soup1.find_all("a", "nopo"))
    assert no_chosen_pages_count - 1 == len(soup2.find_all("a", "nopo"))


def test_append_size_querystring():
    """Width/height attributes should be moved into a size querystring."""
    soup = bs4.BeautifulSoup(features="html.parser")
    url = ("//upload.wikimedia.org/wikipedia/commons/"
           "thumb/4/40/P_ps.png/35px-P_ps.png")
    tag = soup.new_tag("img", src=url, width='100px', height='50px')
    ImageParser.replace(tag)
    assert tag.attrs.get("width") is None
    assert tag.attrs.get("height") is None
    assert tag.attrs['src'].endswith("?s=100px-50px")


def test_parse_html():
    """General parsing of a full article fixture."""
    html = load_fixture('article_with_inlinemath.html')
    base_soup = bs4.BeautifulSoup(html, features="html.parser")
    html, _ = ImageParser.parse_html(html, chosen_pages=set())
    soup = bs4.BeautifulSoup(html, features="html.parser")

    assert len(soup.find_all("img")) == 7
    assert len(soup.find_all("a")) == 221
    assert len(soup.find_all("a", "external")) == 8

    # no link starting with //
    assert not any(
        [tag.attrs['href'].startswith("//") for tag in soup.find_all("a")])

    assert "data-file-width" not in html
    assert "data-file-height" not in html

    # check that the "image links" are removed
    assert len(base_soup.find_all("a", "image")) != 0
    assert len(soup.find_all("a", "image")) == 0

    # check that the only image removed is "Special:CentralAutoLogin"
    assert len(soup.find_all("img")) == len(base_soup.find_all("img")) - 1
    assert any(
        ["AutoLogin" in tag.attrs["src"] for tag in base_soup.find_all("img")])
    assert not any(
        ["AutoLogin" in tag.attrs["src"] for tag in soup.find_all("img")])


def test_parse_html_remove_selflinks():
    """Links without href (self links) should be removed, keeping their text."""
    link_without_href = '<a class="mw-selflink selflink">Argentina</a>'
    html, _ = ImageParser.parse_html(link_without_href, chosen_pages=set())

    # check that links without href are removed, but their content remains
    soup = bs4.BeautifulSoup(html, "lxml")
    assert len(soup.find_all("a", href=None)) == 0
    assert 'Argentina' in html


def _check(self, url, should_web, should_dsk):
    """Do the proper checking."""
    tag = self.soup.new_tag("img", src=url)
    dsk, web = ImageParser.replace(tag)
    self.assertEqual(web, should_web)
    self.assertEqual(dsk, should_dsk)
    self.assertEqual(tag.attrs["src"], '/images/' + should_dsk)
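

# A minimal sketch of the unittest scaffolding `_check` assumes: only
# `self.soup` is implied by the method body above; the class name and the
# setUp structure are illustrative assumptions, not the suite's actual code.
import unittest


class ReplaceCheckExample(unittest.TestCase):
    """Hypothetical TestCase showing where self.soup would come from."""

    def setUp(self):
        # empty soup, used only as a factory for new <img> tags
        self.soup = bs4.BeautifulSoup(features="html.parser")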