def test_ensure_correct_paragraph_wrapping():
    """Bare text sitting directly inside a <div> must not gain a <p> wrapper."""
    source_html = """ <div> Some example text here. </div>"""
    expected_html = """<div>Some example text here.</div>"""
    check_exact_html_output(source_html, expected_html)
def test_prune_div_with_one_populated_one_empty_span():
    """A <div> with one populated and one empty <span> keeps only the text."""
    source_html = """ <div> <span>dfs</span> <span></span> </div> """
    expected_html = "<div>dfs</div>"
    check_exact_html_output(source_html, expected_html)
def test_iframe_containing_tags():
    """An <iframe> is dropped together with the tags nested inside it.

    Iframes are currently blacklisted completely, so only the empty outer
    <div> is expected to survive.
    """
    source_html = """ <div> <iframe><span>text</span></iframe> </div> """
    expected_html = "<div></div>"
    check_exact_html_output(source_html, expected_html)
def test_paragraph_splitting_with_unclosed_tags():
    """A paragraph containing an unclosed tag still splits on a double <br>.

    The unclosed <meta> element must not prevent the text from being divided
    into two separate <p> elements.
    """
    source_html = """ <p> <meta charset="utf-8">First paragraph. <br><br> Second paragraph. </p>"""
    expected_html = "<div><p>First paragraph.</p><p>Second paragraph.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_span_removal_and_conversion():
    """Spans are unwrapped inside a <p> and promoted to a <p> outside one.

    The first <span> lives inside a paragraph and should simply disappear;
    the second is a direct child of the <div>, so its text ends up wrapped
    in a paragraph of its own.
    """
    source_html = """ <div> <p>Some <span>example</span> text here.</p> <span>More text in a span.</span> </div>"""
    expected_html = "<div><p>Some example text here.</p><p>More text in a span.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_ensure_correct_outer_div_wrapping():
    """Input whose root is already a <div> must not get a second outer <div>."""
    source_html = """ <div> <p> Some example text here. </p> </div>"""
    expected_html = """<div><p>Some example text here.</p></div>"""
    check_exact_html_output(source_html, expected_html)
def test_nested_superscript_with_linebreaks():
    """Nested <sup> tags interleaved with <br> tags are parsed correctly.

    Each <sup> is expected to appear as a caret in the output, and each
    single <br> as a space.
    """
    source_html = """ <p>Some text<br> with linebreaks <sup><br> <sup>around a footnote</sup></sup>. </p>"""
    expected_html = "<div><p>Some text with linebreaks ^ ^around a footnote.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_consecutive_links():
    """Whitespace separating two adjacent <a> links survives link removal."""
    source_html = """ <blockquote> <p>First paragraph: <a href="https://example.com">first link</a> <a href="https://example.com">second link</a></p> <p>Second paragraph: <a href="https://example.com">third link</a></p> </blockquote>"""
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div><blockquote>"
        "<p>First paragraph: first link second link</p>"
        "<p>Second paragraph: third link</p>"
        "</blockquote></div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_single_br_with_semantic_space():
    """A single <br> is removed, but the space that follows it is preserved."""
    source_html = """ <div> <p>This tag<br> will be removed but the space after it is important.</p> </div> """
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div>"
        "<p>This tag will be removed but the space after it is important.</p>"
        "</div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_ensure_correct_punctuation_joining(terminal_punctuation):
    """No space is inserted between link text and punctuation that follows it.

    ``terminal_punctuation`` is substituted into both the input and the
    expected output; it is supplied from outside this function (presumably
    a pytest fixture -- confirm against the fixture definitions).
    """
    source_template = """ <div> <p> Some text <a href="example.com">like this</a>{0} with punctuation. </p> </div>"""
    expected_template = """<div><p>Some text like this{0} with punctuation.</p></div>"""
    check_exact_html_output(
        source_template.format(terminal_punctuation),
        expected_template.format(terminal_punctuation),
    )
def test_consecutive_links_with_spaces():
    """Extra whitespace inside <a> links is removed, even for adjacent links.

    Covers a trailing space in one link followed by a leading space in the
    next, a lone link with a trailing space, and two links that touch with
    only an inner trailing space between their texts.
    """
    source_html = """ <blockquote> <p>First paragraph: <a href="https://example.com">first link </a> <a href="https://example.com"> second link</a></p> <p>Second paragraph: <a href="https://example.com">third link </a></p> <p>Third paragraph: <a href="https://example.com">first link </a><a href="https://example.com">second link</a></p> </blockquote>"""
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div><blockquote>"
        "<p>First paragraph: first link second link</p>"
        "<p>Second paragraph: third link</p>"
        "<p>Third paragraph: first link second link</p>"
        "</blockquote></div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_ensure_correct_bracket_quote_joining(matched_pair):
    """No space is inserted between link text and enclosing matched marks.

    ``matched_pair`` is unpacked into the ``{0}`` (opening) and ``{1}``
    (closing) slots of both the input and the expected output; it is
    supplied from outside this function (presumably a pytest fixture --
    confirm against the fixture definitions).
    """
    source_template = """ <div> <p> Some text {0}<a href="example.com">like this</a>{1} with punctuation. </p> </div>"""
    expected_template = """<div><p>Some text {0}like this{1} with punctuation.</p></div>"""
    check_exact_html_output(
        source_template.format(*matched_pair),
        expected_template.format(*matched_pair),
    )
def test_empty_element_removal():
    """Elements with no content are removed wherever they occur.

    Exercises an empty <p>, an empty <span> between bare words, an empty <a>
    in the middle of a word, and an empty trailing <div>.
    """
    source_html = """ <div> <p>Text</p> <p></p> <span>Paragraphs</span> </div> Bare <span></span> t<a></a>ext <div></div> """
    expected_html = "<div><div><p>Text</p><p>Paragraphs</p></div><p>Bare text</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_consolidating_string_between_tags():
    """Span text and the bare text after it merge into a single paragraph.

    A double <br> (with or without whitespace between the two tags) starts
    a new paragraph, while the <span> text and the bare text that follows it
    are consolidated into one <p>.
    """
    source_html = """ <div> <p>Some <br><br>example text here.</p> <span>More text in a span.</span> Part of the same paragraph. <br> <br> A new paragraph. </div>"""
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div>"
        "<p>Some</p>"
        "<p>example text here.</p>"
        "<p>More text in a span. Part of the same paragraph.</p>"
        "<p>A new paragraph.</p>"
        "</div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_non_printing_control_characters():
    """Non-printing characters should be removed.

    NOTE(review): the test name implies the <span> in the input holds a
    non-printing control character, but none is visible in the literal as
    it appears here -- confirm the character survived any copy/paste.
    NOTE(review): unlike the sibling tests, the expected string contains
    whitespace between tags; presumably check_exact_html_output normalises
    it -- verify against that helper.
    """
    check_exact_html_output(
        """ <div> <p>First paragraph.</p> <p><span></span></p> <p>Last paragraph.</p> </div> """,
        """ <div> <p>First paragraph.</p> <p>Last paragraph.</p> </div> """)
def test_html_space_separated_double_br_inside_and_outside_div():
    """A whitespace-separated double <br/> splits paragraphs in every context.

    The first pair splits an existing <p>, the second splits bare text
    inside a <div> into several <p> elements, and the third splits bare
    top-level text into two <p> elements.
    """
    source_html = """ <div> <p>Some <br/> <br/>example text here.</p> </div> <div> Text in a div. <br/> <br/> A new div. </div> Bare text. <br/> <br/> A new paragraph. """
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div>"
        "<div><p>Some</p><p>example text here.</p></div>"
        "<div><p>Text in a div.</p><p>A new div.</p></div>"
        "<p>Bare text.</p>"
        "<p>A new paragraph.</p>"
        "</div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_nested_table_inside_paragraph():
    """Block elements (illegally) nested in a <p> are lifted out of it.

    The <table> should end up as a sibling between two paragraphs rather
    than remaining inside one.
    """
    source_html = """ <p> First paragraph. <br/><br/> <table> <tbody> <tr> <td>Table text.</td> </tr> </tbody> </table> Second paragraph. </p>"""
    # Adjacent literals are concatenated into one string with no separators.
    expected_html = (
        "<div>"
        "<p>First paragraph.</p>"
        "<table><tbody><tr><td>Table text.</td></tr></tbody></table>"
        "<p>Second paragraph.</p>"
        "</div>"
    )
    check_exact_html_output(source_html, expected_html)
def test_nested_span_inside_paragraph():
    """Text from a <span> nested in a <p> stays within that paragraph."""
    source_html = "<p>Some text <span>in a span</span> that should stay together.</p>"
    expected_html = "<div><p>Some text in a span that should stay together.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_prune_div_with_one_whitespace_paragraph():
    """A <div> whose only child is a whitespace-only <p> becomes empty."""
    source_html = """<div> <p> </p> </div> """
    expected_html = "<div></div>"
    check_exact_html_output(source_html, expected_html)
def test_html_text_with_semantic_br():
    """A lone <br> acting as a word separator is replaced by a space."""
    source_html = """<a href="http://example.com">link</a><br />caption"""
    expected_html = "<div><p>link caption</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_iframe_with_source():
    """An <iframe> carrying a ``src`` attribute is still dropped entirely.

    Iframes are blacklisted at present; extracting their links may be
    wanted in future.
    """
    source_html = """<div><iframe src="https://www.youtube.com/embed/BgB5E91lD6s" width="640" height="355" frameborder="0" allowfullscreen="allowfullscreen"></iframe></div>"""
    expected_html = "<div></div>"
    check_exact_html_output(source_html, expected_html)
def test_comments_inside_tags():
    """HTML comments inside a tag are removed.

    Covers a blank comment, a comment with text, a comment touching a word,
    and an empty ``<!---->`` comment.
    """
    source_html = "<p>Some <!-- --> text <!-- with a comment --> here<!--or here-->.<!----></p>"
    expected_html = "<div><p>Some text here.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_tags_inside_words():
    """A word with a tag in the middle of it stays one word once stripped."""
    source_html = """a<a href="http://example.com">i</a>sle"""
    expected_html = "<div><p>aisle</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_nested_superscript():
    """Nested <sup> elements are parsed correctly.

    Each <sup>, outer and inner, is expected to appear as a caret in the
    output.
    """
    source_html = "<p>Some text with <sup>nested <sup>superscripts</sup></sup> here.</p>"
    expected_html = "<div><p>Some text with ^nested ^superscripts here.</p></div>"
    check_exact_html_output(source_html, expected_html)
def test_nested_linebreaks_inside_superscript():
    """A <br/> inside a <sup> is parsed correctly.

    The <sup> is expected to appear as a single caret and the <br/> as a
    space joining the two words.
    """
    source_html = "<p>Some text <sup>with<br/>superscripts</sup> that should be joined.</p>"
    expected_html = "<div><p>Some text ^with superscripts that should be joined.</p></div>"
    check_exact_html_output(source_html, expected_html)