Esempio n. 1
0
def test_ensure_correct_paragraph_wrapping():
    """Bare text sitting directly inside a <div> must not gain a <p> wrapper."""
    markup = """
        <div>
            Some example text here.
        </div>"""
    expected = "<div>Some example text here.</div>"
    check_exact_html_output(markup, expected)
Esempio n. 2
0
def test_prune_div_with_one_populated_one_empty_span():
    """A <div> with one populated and one empty <span> reduces to the bare text."""
    markup = """
        <div>
            <span>dfs</span>
            <span></span>
        </div>
    """
    expected = "<div>dfs</div>"
    check_exact_html_output(markup, expected)
Esempio n. 3
0
def test_iframe_containing_tags():
    """Iframes are currently blacklisted outright, nested tags and text included."""
    markup = """
        <div>
            <iframe><span>text</span></iframe>
        </div>
        """
    expected = "<div></div>"
    check_exact_html_output(markup, expected)
Esempio n. 4
0
def test_paragraph_splitting_with_unclosed_tags():
    """A paragraph holding an unclosed tag still splits correctly on a double <br>."""
    markup = """
        <p>
            <meta charset="utf-8">First paragraph.
            <br><br>
            Second paragraph.
        </p>"""
    expected = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 5
0
def test_span_removal_and_conversion():
    """The <span> inside the <p> is stripped; the sibling <span> becomes its own <p>."""
    markup = """
        <div>
            <p>Some <span>example</span> text here.</p>
            <span>More text in a span.</span>
        </div>"""
    expected = (
        "<div><p>Some example text here.</p><p>More text in a span.</p></div>")
    check_exact_html_output(markup, expected)
Esempio n. 6
0
def test_ensure_correct_outer_div_wrapping():
    """Input that is already a <div> must not be wrapped in a second <div>."""
    markup = """
        <div>
            <p>
                Some example text here.
            </p>
        </div>"""
    expected = "<div><p>Some example text here.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 7
0
def test_nested_superscript_with_linebreaks():
    """Nested <sup> tags interleaved with <br> breaks parse into caret-marked text."""
    markup = """
        <p>Some text<br>
        with linebreaks <sup><br>
        <sup>around a footnote</sup></sup>.
        </p>"""
    expected = (
        "<div><p>Some text with linebreaks ^ ^around a footnote.</p></div>")
    check_exact_html_output(markup, expected)
Esempio n. 8
0
def test_consecutive_links():
    """Whitespace separating consecutive <a> links survives once the tags are stripped."""
    markup = """
        <blockquote>
            <p>First paragraph: <a href="https://example.com">first link</a> <a href="https://example.com">second link</a></p>
            <p>Second paragraph: <a href="https://example.com">third link</a></p>
        </blockquote>"""
    expected = "<div><blockquote><p>First paragraph: first link second link</p><p>Second paragraph: third link</p></blockquote></div>"
    check_exact_html_output(markup, expected)
Esempio n. 9
0
def test_single_br_with_semantic_space():
    """A lone <br> is dropped, but the space following it must be kept.

    Without that space the words on either side of the tag would be glued
    together in the cleaned output.
    """
    markup = """
        <div>
            <p>This tag<br> will be removed but the space after it is important.</p>
        </div>
    """
    expected = "<div><p>This tag will be removed but the space after it is important.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 10
0
def test_ensure_correct_punctuation_joining(terminal_punctuation):
    """No ' ' is inserted between link text and a punctuation mark right after it.

    ``terminal_punctuation`` is substituted into both the input markup and
    the expected output, immediately after the link.
    """
    markup_template = """
        <div>
            <p>
                Some text <a href="example.com">like this</a>{0} with punctuation.
            </p>
        </div>"""
    expected_template = (
        "<div><p>Some text like this{0} with punctuation.</p></div>")
    check_exact_html_output(
        markup_template.format(terminal_punctuation),
        expected_template.format(terminal_punctuation))
Esempio n. 11
0
def test_consecutive_links_with_spaces():
    """Surplus whitespace inside <a> links is removed, even for consecutive links."""
    markup = """
        <blockquote>
            <p>First paragraph: <a href="https://example.com">first link </a> <a href="https://example.com"> second link</a></p>
            <p>Second paragraph: <a href="https://example.com">third link </a></p>
            <p>Third paragraph: <a href="https://example.com">first link </a><a href="https://example.com">second link</a></p>
        </blockquote>"""
    expected = "<div><blockquote><p>First paragraph: first link second link</p><p>Second paragraph: third link</p><p>Third paragraph: first link second link</p></blockquote></div>"
    check_exact_html_output(markup, expected)
Esempio n. 12
0
def test_ensure_correct_bracket_quote_joining(matched_pair):
    """No ' ' is inserted between link text and the matched marks enclosing it.

    The two items of ``matched_pair`` are substituted directly before and
    after the link, in both the input markup and the expected output.
    """
    markup_template = """
        <div>
            <p>
                Some text {0}<a href="example.com">like this</a>{1} with punctuation.
            </p>
        </div>"""
    expected_template = (
        "<div><p>Some text {0}like this{1} with punctuation.</p></div>")
    check_exact_html_output(
        markup_template.format(*matched_pair),
        expected_template.format(*matched_pair))
Esempio n. 13
0
def test_empty_element_removal():
    """Elements with no content are dropped, whether block-level or inline."""
    markup = """
        <div>
            <p>Text</p>
            <p></p>
            <span>Paragraphs</span>
        </div>
        Bare <span></span> t<a></a>ext
        <div></div>
    """
    expected = (
        "<div><div><p>Text</p><p>Paragraphs</p></div><p>Bare text</p></div>")
    check_exact_html_output(markup, expected)
Esempio n. 14
0
def test_consolidating_string_between_tags():
    """Text from a <span> and the bare text after it merge into one paragraph.

    A double <br> (inside the <p> and again in the bare text) starts a new
    paragraph, while the <span> content and the bare text that follows it
    end up together in a single <p>.
    """
    markup = """
        <div>
            <p>Some <br><br>example text here.</p>
            <span>More text in a span.</span>
            Part of the same paragraph. <br>
            <br> A new paragraph.
        </div>"""
    expected = "<div><p>Some</p><p>example text here.</p><p>More text in a span. Part of the same paragraph.</p><p>A new paragraph.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 15
0
def test_non_printing_control_characters():
    """Non-printing characters should be removed.

    The middle paragraph of the input contains only a <span>; that paragraph
    is absent from the expected output.
    """
    # NOTE(review): no control character is visible inside the <span> below.
    # Presumably the literal is meant to carry an invisible one -- confirm it
    # is still present in the file, as editors and copy-paste can drop it.
    # NOTE(review): the expected value is written as indented, multi-line
    # markup, so check_exact_html_output presumably normalizes whitespace
    # before comparing -- TODO confirm against the helper.
    check_exact_html_output(
        """
        <div>
            <p>First paragraph.</p>
            <p><span></span></p>
            <p>Last paragraph.</p>
        </div>
    """, """
        <div>
            <p>First paragraph.</p>
            <p>Last paragraph.</p>
        </div>
    """)
Esempio n. 16
0
def test_html_space_separated_double_br_inside_and_outside_div():
    """A whitespace-separated double <br/> starts a new <p> in every context.

    Covered contexts: inside an existing <p>, in bare text inside a <div>,
    and in bare text outside any <div>.
    """
    markup = """
        <div>
            <p>Some <br/>
            <br/>example text here.</p>
        </div>
        <div>
        Text in a div. <br/>
        <br/> A new div.
        </div>
        Bare text. <br/>
        <br/> A new paragraph.
        """
    expected = "<div><div><p>Some</p><p>example text here.</p></div><div><p>Text in a div.</p><p>A new div.</p></div><p>Bare text.</p><p>A new paragraph.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 17
0
def test_nested_table_inside_paragraph():
    """A block element (illegally) nested in a <p> is split out as a sibling."""
    markup = """
        <p>
            First paragraph.
            <br/><br/>
            <table>
                <tbody>
                    <tr>
                        <td>Table text.</td>
                    </tr>
                </tbody>
            </table>
            Second paragraph.
        </p>"""
    expected = "<div><p>First paragraph.</p><table><tbody><tr><td>Table text.</td></tr></tbody></table><p>Second paragraph.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 18
0
def test_nested_span_inside_paragraph():
    """Text of a <span> nested inside a <p> stays in that paragraph."""
    # The input was previously followed by a stray empty literal ("") that was
    # implicitly concatenated onto it; it added nothing and read like a third
    # argument, so it has been dropped. The string value is unchanged.
    markup = "<p>Some text <span>in a span</span> that should stay together.</p>"
    expected = "<div><p>Some text in a span that should stay together.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 19
0
def test_prune_div_with_one_whitespace_paragraph():
    """A <div> whose only child is a whitespace-only <p> is left empty."""
    markup = """<div>
            <p>        </p>
        </div>
        """
    expected = "<div></div>"
    check_exact_html_output(markup, expected)
Esempio n. 20
0
def test_html_text_with_semantic_br():
    """A single <br> sometimes acts as a word separator, so it becomes a space."""
    markup = """<a href="http://example.com">link</a><br />caption"""
    expected = "<div><p>link caption</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 21
0
def test_iframe_with_source():
    """Iframes are blacklisted for now, even when they carry a src link.

    Extracting the link may become desirable in future.
    """
    markup = """<div><iframe src="https://www.youtube.com/embed/BgB5E91lD6s" width="640" height="355" frameborder="0" allowfullscreen="allowfullscreen"></iframe></div>"""
    expected = "<div></div>"
    check_exact_html_output(markup, expected)
Esempio n. 22
0
def test_comments_inside_tags():
    """HTML comments placed inside a tag are stripped from the output."""
    markup = "<p>Some <!-- --> text <!-- with a comment --> here<!--or here-->.<!----></p>"
    expected = "<div><p>Some text here.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 23
0
def test_tags_inside_words():
    """A word with a tag in its middle stays one word after the tag is stripped."""
    markup = """a<a href="http://example.com">i</a>sle"""
    expected = "<div><p>aisle</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 24
0
def test_nested_superscript():
    """Each level of nested <sup> is parsed into its own caret marker."""
    markup = (
        "<p>Some text with <sup>nested <sup>superscripts</sup></sup> here.</p>")
    expected = "<div><p>Some text with ^nested ^superscripts here.</p></div>"
    check_exact_html_output(markup, expected)
Esempio n. 25
0
def test_nested_linebreaks_inside_superscript():
    """A <br/> inside a <sup> is parsed as a space joining the surrounding words."""
    markup = (
        "<p>Some text <sup>with<br/>superscripts</sup> that should be joined.</p>")
    expected = (
        "<div><p>Some text ^with superscripts that should be joined.</p></div>")
    check_exact_html_output(markup, expected)