Example #1
0
def test_dont_transform_div_with_div():
    """Check that a div wrapping another div keeps its own tag.

    Only the innermost (leaf) <div> is expected to come back as a <p>; the
    outer <div> and the text around the child stay as they were.

    """
    source_markup = ("<html><body><div>text<div>child</div>"
                     "aftertext</div></body></html>")
    expected_markup = ("<html><body><div>text<p>child</p>"
                       "aftertext</div></body></html>")

    converted = leaf_div_elements_into_paragraphs(
        document_fromstring(source_markup))
    assert tounicode(converted) == to_unicode(expected_markup)
Example #2
0
def test_misused_divs_transform():
    """Leaf <div> nodes should be rewritten as <p> nodes.

    The element content is expected to be carried over unchanged; only the
    tag switches from div to p.

    """
    # Each case pairs the markup fed in with the markup expected back out.
    cases = (
        ("<html><body><div>simple</div></body></html>",
         "<html><body><p>simple</p></body></html>"),
        ('<html><body><div>simple<a href="">link</a>'
         '</div></body></html>',
         '<html><body><p>simple<a href="">link</a></p></body></html>'),
    )
    for markup, expected in cases:
        parsed = document_fromstring(markup)
        rewritten = leaf_div_elements_into_paragraphs(parsed)
        assert tounicode(rewritten) == to_unicode(expected)
Example #3
0
def test_misused_divs_transform():
    """Confirm that a div with no div children is turned into a p.

    The replacement element must hold the same content as the original,
    whether that is plain text or text followed by an inline link.

    """
    # A div holding nothing but text.
    plain_doc = document_fromstring(
        "<html><body><div>simple</div></body></html>")
    plain_result = tounicode(leaf_div_elements_into_paragraphs(plain_doc))
    plain_expected = to_unicode("<html><body><p>simple</p></body></html>")
    assert plain_result == plain_expected

    # A div holding text plus an anchor; the anchor must be preserved.
    linked_doc = document_fromstring(
        '<html><body><div>simple<a href="">link</a>'
        '</div></body></html>')
    linked_result = tounicode(leaf_div_elements_into_paragraphs(linked_doc))
    linked_expected = to_unicode(
        '<html><body><p>simple<a href="">link</a></p></body></html>')
    assert linked_result == linked_expected
Example #4
0
def test_dont_transform_div_with_div():
    """A parent <div> stays a <div>; only its child <div> becomes a <p>."""
    nested_html = (
        "<html><body><div>text<div>child</div>"
        "aftertext</div></body></html>"
    )
    tree = document_fromstring(nested_html)

    actual = tounicode(leaf_div_elements_into_paragraphs(tree))
    wanted = to_unicode(
        "<html><body><div>text<p>child</p>"
        "aftertext</div></body></html>"
    )
    assert actual == wanted
def test_readin_with_base_url():
    """Reading a document with a url should yield absolute links.

    The snippet is loaded with a blog page url; of the three links found,
    two are expected to start with the blog host and one to point elsewhere.

    """
    blog_prefix = 'http://blog.mitechie.com'
    doc = OriginalDocument(
        load_snippet('document_absolute_url.html'),
        url="http://blog.mitechie.com/test.html",
    )

    rendered = to_unicode(doc)
    assert rendered.startswith('<html>')

    # Collect the links on the page and check each one against the base
    # url we told the document to use.
    links = doc.links
    assert len(links) == 3

    # Two links should start with our blog url and one with something else
    # (the amazon link in the snippet).
    link_counts = defaultdict(int)
    for anchor in links:
        href = anchor.get('href')
        bucket = 'blog' if href.startswith(blog_prefix) else 'other'
        link_counts[bucket] += 1

    assert link_counts['blog'] == 2
    assert link_counts['other'] == 1
    def test_readin_with_base_url(self):
        """Reading a document with a url should yield absolute links.

        Of the three links in the snippet, two are expected to start with
        the blog host passed in and one to point somewhere else.

        """
        blog_prefix = 'http://blog.mitechie.com'
        snippet = load_snippet('document_absolute_url.html')
        doc = OriginalDocument(
            snippet, url="http://blog.mitechie.com/test.html")
        rendered = to_unicode(doc)
        self.assertTrue(rendered.startswith('<html>'))

        # Collect the links on the page and check each one against the
        # base url we told the document to use.
        links = doc.links
        self.assertEqual(len(links), 3)

        # Two links should start with our blog url and one with something
        # else (the amazon link in the snippet).
        link_counts = defaultdict(int)
        for anchor in links:
            href = anchor.get('href')
            bucket = 'blog' if href.startswith(blog_prefix) else 'other'
            link_counts[bucket] += 1

        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)
def test_readin_min_document():
    """A minimal html snippet loads and exposes its markup and title."""
    min_doc = OriginalDocument(load_snippet('document_min.html'))

    # The serialized document must begin at the root <html> element.
    rendered = to_unicode(min_doc)
    assert rendered.startswith('<html>')

    expected_title = 'Min Document Title'
    assert min_doc.title == expected_title
 def test_readin_min_document(self):
     """A minimal html snippet loads and exposes its markup and title."""
     snippet = load_snippet('document_min.html')
     min_doc = OriginalDocument(snippet)
     # The serialized document must begin at the root <html> element.
     rendered = to_unicode(min_doc)
     self.assertTrue(rendered.startswith('<html>'))
     self.assertEqual(min_doc.title, 'Min Document Title')