Esempio n. 1
0
def test_dont_transform_div_with_div():
    """A <div> holding another <div> is kept; the inner leaf becomes <p>."""
    markup = (
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    tree = document_fromstring(markup)

    # Serialize the transformed tree first, then build the expected text,
    # so both sides of the comparison are plain named values.
    actual = tounicode(leaf_div_elements_into_paragraphs(tree))
    expected = to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )
    assert actual == expected
Esempio n. 2
0
def test_misused_divs_transform():
    """Check that leaf <div> nodes are rewritten as <p> nodes.

    The content of each element must be unchanged; only the tag differs.

    """
    # Each case is (input markup, expected markup after the transform).
    cases = (
        (
            "<html><body><div>simple</div></body></html>",
            "<html><body><p>simple</p></body></html>",
        ),
        (
            '<html><body><div>simple<a href="">link</a>'
            '</div></body></html>',
            '<html><body><p>simple<a href="">link</a></p></body></html>',
        ),
    )
    for source_html, expected_html in cases:
        parsed = document_fromstring(source_html)
        rendered = tounicode(leaf_div_elements_into_paragraphs(parsed))
        assert rendered == to_unicode(expected_html)
Esempio n. 3
0
def test_misused_divs_transform():
    """Leaf <div> elements must come back as <p> elements.

    Text and inline children are expected to be carried over untouched.

    """
    # Case 1: a <div> containing only text.
    plain_doc = document_fromstring(
        "<html><body><div>simple</div></body></html>"
    )
    plain_actual = tounicode(leaf_div_elements_into_paragraphs(plain_doc))
    plain_expected = to_unicode("<html><body><p>simple</p></body></html>")
    assert plain_actual == plain_expected

    # Case 2: a <div> containing text followed by an inline <a> element.
    linked_doc = document_fromstring(
        '<html><body><div>simple<a href="">link</a>'
        '</div></body></html>'
    )
    linked_actual = tounicode(leaf_div_elements_into_paragraphs(linked_doc))
    linked_expected = to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>'
    )
    assert linked_actual == linked_expected
Esempio n. 4
0
 def dom(self):
     """Return the cleaned lxml tree (DOM) of the given html.

     The tree is taken from the original document, cleaned in place and
     passed through ``leaf_div_elements_into_paragraphs``. ``None`` is
     returned when any of those steps raises ``ValueError``.
     """
     try:
         tree = self._original_document.dom
         # html_cleaner works in place; its return value is not used.
         html_cleaner(tree)
         converted = leaf_div_elements_into_paragraphs(tree)
     except ValueError:
         converted = None
     return converted
Esempio n. 5
0
 def dom(self):
     """Cleaned lxml tree (Document Object Model) of the given html.

     Returns the result of ``leaf_div_elements_into_paragraphs`` applied
     to the cleaned tree, or ``None`` if a ``ValueError`` is raised while
     fetching, cleaning or converting it.
     """
     try:
         document = self._original_document.dom
         # The cleaner mutates the tree in place rather than returning it.
         html_cleaner(document)
         paragraphs_tree = leaf_div_elements_into_paragraphs(document)
     except ValueError:
         return None
     else:
         return paragraphs_tree
Esempio n. 6
0
def test_dont_transform_div_with_div():
    """Only the nested leaf <div> is turned into <p>; its parent stays."""
    nested_doc = document_fromstring(
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )

    # Run the transformation, then serialize the result for comparison.
    transformed = leaf_div_elements_into_paragraphs(nested_doc)
    serialized = tounicode(transformed)

    wanted = to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )
    assert serialized == wanted