Code example #1
File: test_extract.py Project: chazuttu/CDPedia
def test_fixlinks_fragment():
    """Links with fragment part to included pages should not be marked as 'nopo'."""
    chosen_pages = {'foo'}
    html = '<a href="/wiki/foo#bar></a>'
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') is None
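These excerpts omit the imports and helpers defined at the top of test_extract.py. A minimal setup sketch for running them, assuming the CDPedia project layout (the import paths for ImageParser and to3dirs, and the body of load_fixture, are assumptions, not taken from the excerpts):

# Import sketch -- module paths are assumed, not confirmed by the excerpts.
import urllib.parse

import bs4
import pytest

from src.armado import to3dirs                 # assumed location within CDPedia
from src.images.extract import ImageParser     # assumed location within CDPedia


def load_fixture(name):
    """Hypothetical helper: return the contents of an HTML test fixture."""
    with open('tests/fixtures/' + name, 'rt', encoding='utf-8') as fh:
        return fh.read()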
Code example #2
File: test_extract.py Project: chazuttu/CDPedia
def test_fixlinks_nopo(name):
    """Test that wiki links to not included pages are marked with the 'nopo' class."""
    chosen_pages = {'eggs', 'spam'}
    html = '<a href="/wiki/{}"></a>'.format(name)
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') == ['nopo']
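The name argument here (and in the next example) is supplied by a pytest parametrize decorator that the excerpt drops. A minimal sketch of the presumed setup, with hypothetical page names:

# Hypothetical parametrization -- the excerpt omits the real decorator and values.
# Any page name missing from chosen_pages ({'eggs', 'spam'}) should be marked 'nopo'.
@pytest.mark.parametrize('name', ['foo', 'some_missing_page'])
def test_fixlinks_nopo(name):
    ...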
Code example #3
File: test_extract.py Project: chazuttu/CDPedia
def test_fixlinks_no_nopo(name):
    """Test that wiki links to included pages are not marked with the 'nopo' class."""
    fname = to3dirs.to_filename(name)
    chosen_pages = {fname}
    url = urllib.parse.quote(name)
    html = '<a href="/wiki/{}"></a>'.format(url)
    soup = bs4.BeautifulSoup(html, 'lxml')
    a_tag = soup.find('a')
    ImageParser.fixlinks(a_tag, chosen_pages)
    assert a_tag.attrs.get('class') is None
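The test relies on the href carrying the percent-encoded page title while chosen_pages stores the on-disk filename form produced by to3dirs. A small illustration with a hypothetical title (the exact to_filename result depends on to3dirs and is not shown in the excerpt):

# Illustration only -- the page title is hypothetical.
name = 'Año nuevo'
urllib.parse.quote(name)     # 'A%C3%B1o%20nuevo', the form that goes into the href
to3dirs.to_filename(name)    # filename form stored in chosen_pages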
Code example #4
File: test_extract.py Project: chazuttu/CDPedia
def test_no_size_querystring_when_size_undefined():
    soup = bs4.BeautifulSoup(features="html.parser")
    url = ("//upload.wikimedia.org/wikipedia/commons/"
           "thumb/4/40/P_ps.png/35px-P_ps.png")

    tag = soup.new_tag("img", src=url)

    ImageParser.replace(tag)

    assert tag.attrs['src'].endswith(".png")
Code example #5
File: test_extract.py Project: chazuttu/CDPedia
def test_included_pages_links():
    original_html = load_fixture('article_with_inlinemath.html')

    html, _ = ImageParser.parse_html(original_html, chosen_pages=set())
    soup1 = bs4.BeautifulSoup(html, "lxml")

    html, _ = ImageParser.parse_html(original_html,
                                     chosen_pages={"Wikcionario"})
    soup2 = bs4.BeautifulSoup(html, "lxml")

    no_chosen_pages_count = len(soup1.find_all("a", "nopo"))
    assert no_chosen_pages_count - 1 == len(soup2.find_all("a", "nopo"))
Code example #6
File: test_extract.py Project: chazuttu/CDPedia
def test_append_size_querystring():
    soup = bs4.BeautifulSoup(features="html.parser")
    url = ("//upload.wikimedia.org/wikipedia/commons/"
           "thumb/4/40/P_ps.png/35px-P_ps.png")

    tag = soup.new_tag("img", src=url, width='100px', height='50px')

    ImageParser.replace(tag)

    assert tag.attrs.get("width") is None
    assert tag.attrs.get("height") is None
    assert tag.attrs['src'].endswith("?s=100px-50px")
Code example #7
File: test_extract.py Project: chazuttu/CDPedia
def test_parse_html():
    html = load_fixture('article_with_inlinemath.html')
    base_soup = bs4.BeautifulSoup(html, features="html.parser")

    html, _ = ImageParser.parse_html(html, chosen_pages=set())

    soup = bs4.BeautifulSoup(html, features="html.parser")

    assert len(soup.find_all("img")) == 7
    assert len(soup.find_all("a")) == 221
    assert len(soup.find_all("a", "external")) == 8

    # no remaining link should start with "//"
    assert not any(
        [tag.attrs['href'].startswith("//") for tag in soup.find_all("a")])

    assert "data-file-width" not in html
    assert "data-file-height" not in html

    # check that the "image links" are removed
    assert len(base_soup.find_all("a", "image")) != 0
    assert len(soup.find_all("a", "image")) == 0

    # check that the only image removed is "Special:CentralAutoLogin"
    assert len(soup.find_all("img")) == len(base_soup.find_all("img")) - 1
    assert any(
        ["AutoLogin" in tag.attrs["src"] for tag in base_soup.find_all("img")])
    assert not any(
        ["AutoLogin" in tag.attrs["src"] for tag in soup.find_all("img")])
Code example #8
File: test_extract.py Project: chazuttu/CDPedia
def test_parse_html_remove_selflinks():
    link_without_href = '<a class="mw-selflink selflink">Argentina</a>'

    html, _ = ImageParser.parse_html(link_without_href, chosen_pages=set())

    # check that links without href are removed
    soup = bs4.BeautifulSoup(html, "lxml")
    assert len(soup.find_all("a", href=None)) == 0
    assert 'Argentina' in html
Code example #9
File: test_extract.py Project: chazuttu/CDPedia
    def _check(self, url, should_web, should_dsk):
        """Do proper checking."""

        tag = self.soup.new_tag("img", src=url)

        dsk, web = ImageParser.replace(tag)

        self.assertEqual(web, should_web)
        self.assertEqual(dsk, should_dsk)
        self.assertEqual(tag.attrs["src"], '/images/' + should_dsk)
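This helper is a method of a unittest-style test case; the excerpt omits the surrounding class. A minimal sketch of the wrapper it presumably lives in (the class name and setUp details are assumptions):

import unittest

import bs4


class ReplaceTestCase(unittest.TestCase):
    """Hypothetical wrapper; the real class is not shown in the excerpt."""

    def setUp(self):
        # fresh soup used by _check to build <img> tags
        self.soup = bs4.BeautifulSoup(features="html.parser")

    # The _check helper shown above belongs here, followed by test methods that
    # call self._check(url, should_web, should_dsk) with concrete expected values.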