Python examples of justext.core.preprocessor

Example #1

0

Show file

File: external.py Project: zanachka/trafilatura

def custom_justext(tree, stoplist):
    """Run a customized jusText pipeline on ``tree`` and return the paragraphs.

    The tree is passed through ``preprocessor``, split into paragraphs by
    ``ParagraphMaker.make_paragraphs``, classified against ``stoplist`` and
    then revised in place; the resulting paragraph collection is returned.
    """
    # Alternative cleaning step noted alongside this call: tree_cleaning(tree, True)
    cleaned_tree = preprocessor(tree)
    segments = ParagraphMaker.make_paragraphs(cleaned_tree)
    # Thresholds are passed positionally, exactly as the classifier expects them.
    classify_paragraphs(segments, stoplist, 50, 200, 0.1, 0.2, 0.2, True)
    revise_paragraph_classification(segments, 200)
    return segments

Example #2

0

Show file

File: test_dom_utils.py Project: JacobWSU/jusText

    def test_remove_comments(self):
        """Check that ``preprocessor`` strips HTML comments from a parsed tree."""

        def serialize(node):
            # Render the tree back to text so it can be compared with a literal.
            return html.tostring(node).decode("utf8")

        tree = html.fromstring(
            "<html><!-- comment --><body>"
            "<h1>Header</h1>"
            "<!-- comment --> text"
            "<p>footer"
            "</body></html>"
        )

        # Sanity check: parsing alone keeps both comments in place.
        tools.assert_equal(
            "<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>",
            serialize(tree),
        )

        tree = preprocessor(tree)

        # After preprocessing the comments are expected to be gone.
        tools.assert_equal(
            "<html><body><h1>Header</h1> text<p>footer</p></body></html>",
            serialize(tree),
        )

Example #3

0

Show file

    def test_preprocess_simple_unicode_string(self):
        """Check that preprocessing a simple document drops only its ``<head>``."""
        # The body markup is expected to survive preprocessing untouched.
        body_markup = (
            '<h1>Header</h1>'
            '<p>pre<span>text</span>post<em>emph</em>popost</p>'
            '<p>footer <em>like</em> a boss</p>'
        )
        source = '<html><head><title>Title</title></head><body>' + body_markup + '</body></html>'
        wanted = '<html><body>' + body_markup + '</body></html>'

        tree = preprocessor(html_to_dom(source))
        actual = html.tostring(tree).decode("utf8")
        tools.assert_equal(wanted, actual)

Example #4

0

Show file

    def test_remove_comments(self):
        """Check that comments present after parsing are removed by ``preprocessor``."""
        markup = ''.join((
            '<html><!-- comment --><body>',
            '<h1>Header</h1>',
            '<!-- comment --> text',
            '<p>footer',
            '</body></html>',
        ))
        tree = html.fromstring(markup)

        # Parsing closes the dangling <p> but leaves the comments alone.
        with_comments = '<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>'
        tools.assert_equal(with_comments, html.tostring(tree).decode("utf8"))

        cleaned = preprocessor(tree)

        # Only the comments should have disappeared.
        without_comments = '<html><body><h1>Header</h1> text<p>footer</p></body></html>'
        tools.assert_equal(without_comments, html.tostring(cleaned).decode("utf8"))

Example #5

0

Show file

def test_preprocess_simple_bytes_string():
    """Check preprocessing of a document supplied as ``bytes``.

    The expected output has neither the ``<head>`` element nor the trailing
    comment, while the whitespace around that comment is preserved.
    """
    raw_fragments = [
        b'<html><head><title>Title</title></head><body>',
        b'<h1>Header</h1>',
        b'<p>pre<span>text</span>post<em>emph</em>popost</p>',
        b'<p>footer <em>like</em> a boss</p>',
        b'  <!-- abcdefgh -->\n',
        b'</body></html>',
    ]
    wanted_fragments = [
        '<html><body>',
        '<h1>Header</h1>',
        '<p>pre<span>text</span>post<em>emph</em>popost</p>',
        '<p>footer <em>like</em> a boss</p>',
        '  \n',
        '</body></html>',
    ]

    tree = preprocessor(html_to_dom(b''.join(raw_fragments)))
    actual = html.tostring(tree).decode("utf8")
    wanted = ''.join(wanted_fragments)
    assert wanted == actual

Example #6

0

Show file

File: test_dom_utils.py Project: anukat2015/jusText

def test_remove_comments():
    """Check that ``preprocessor`` removes HTML comments from a parsed tree."""

    def _as_text(node):
        # Serialize the tree so it can be compared against a literal string.
        return html.tostring(node).decode("utf8")

    source = (
        '<html><!-- comment --><body>'
        '<h1>Header</h1>'
        '<!-- comment --> text'
        '<p>footer'
        '</body></html>'
    )
    tree = html.fromstring(source)

    # Baseline: the freshly parsed tree still carries both comments.
    before = '<html><!-- comment --><body><h1>Header</h1><!-- comment --> text<p>footer</p></body></html>'
    assert before == _as_text(tree)

    tree = preprocessor(tree)

    # Preprocessing is expected to drop the comments and nothing else.
    after = '<html><body><h1>Header</h1> text<p>footer</p></body></html>'
    assert after == _as_text(tree)

Example #7

0

Show file

    def test_remove_head_tag(self):
        """Check that ``preprocessor`` removes the ``<head>`` element."""
        # Body markup shared by the input document and the expected output.
        body_markup = (
            '<h1>Header</h1>'
            '<p><span>text</span></p>'
            '<p>footer <em>like</em> a boss</p>'
        )
        source = '<html><head><title>Title</title></head><body>' + body_markup + '</body></html>'

        # Parsing and serializing must round-trip the document unchanged.
        tree = html.fromstring(source)
        round_tripped = html.tostring(tree).decode("utf8")
        tools.assert_equal(source, round_tripped)

        # After preprocessing only the <head> element should be missing.
        tree = preprocessor(tree)
        actual = html.tostring(tree).decode("utf8")
        wanted = '<html><body>' + body_markup + '</body></html>'
        tools.assert_equal(wanted, actual)

Example #8

0

Show file

File: test_dom_utils.py Project: JacobWSU/jusText

    def test_preprocess_simple_unicode_string(self):
        """Check that a simple text document loses only its ``<head>`` element."""
        source_parts = [
            "<html><head><title>Title</title></head><body>",
            "<h1>Header</h1>",
            "<p>pre<span>text</span>post<em>emph</em>popost</p>",
            "<p>footer <em>like</em> a boss</p>",
            "</body></html>",
        ]
        wanted_parts = [
            "<html><body>",
            "<h1>Header</h1>",
            "<p>pre<span>text</span>post<em>emph</em>popost</p>",
            "<p>footer <em>like</em> a boss</p>",
            "</body></html>",
        ]

        tree = preprocessor(html_to_dom("".join(source_parts)))
        actual = html.tostring(tree).decode("utf8")
        tools.assert_equal("".join(wanted_parts), actual)

Example #9

0

Show file

File: test_dom_utils.py Project: anukat2015/jusText

def test_preprocess_simple_unicode_string():
    """Check that preprocessing a simple text document drops only ``<head>``."""
    # Content that is expected to pass through preprocessing unchanged.
    kept_markup = (
        '<h1>Header</h1>'
        '<p>pre<span>text</span>post<em>emph</em>popost</p>'
        '<p>footer <em>like</em> a boss</p>'
        '</body></html>'
    )
    source = '<html><head><title>Title</title></head><body>' + kept_markup

    actual = html.tostring(preprocessor(html_to_dom(source))).decode("utf8")
    wanted = '<html><body>' + kept_markup
    assert wanted == actual

Example #10

0

Show file

    def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
        """Check preprocessing of an XHTML document that carries an XML declaration.

        The expected output keeps the root element with its attributes and the
        empty body, and has no declaration, doctype or ``<head>`` element.
        """
        prologue = (
            '<?xml version="1.0" encoding="windows-1250"?>'
            '<!DOCTYPE html>'
        )
        root_open = '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">'
        head = (
            '<head>'
            '<title>Hello World</title>'
            '<meta http-equiv="imagetoolbar" content="no" />'
            '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />'
            '</head>'
        )
        body_and_close = (
            '<body id="index">'
            '</body>'
            '</html>'
        )
        source = prologue + root_open + head + body_and_close
        wanted = root_open + body_and_close

        tree = preprocessor(html_to_dom(source))
        actual = html.tostring(tree).decode("utf8")
        tools.assert_equal(wanted, actual)

Example #11

0

Show file

File: test_dom_utils.py Project: JacobWSU/jusText

    def test_remove_head_tag(self):
        """Check that the ``<head>`` element is removed by ``preprocessor``."""

        def serialize(node):
            # Render the tree back to text for comparison with a literal.
            return html.tostring(node).decode("utf8")

        source = (
            "<html><head><title>Title</title></head><body>"
            "<h1>Header</h1>"
            "<p><span>text</span></p>"
            "<p>footer <em>like</em> a boss</p>"
            "</body></html>"
        )

        # Parsing followed by serialization must reproduce the input exactly.
        tree = html.fromstring(source)
        tools.assert_equal(source, serialize(tree))

        # Once preprocessed, the document should no longer contain <head>.
        cleaned_text = serialize(preprocessor(tree))
        wanted = (
            "<html><body>"
            "<h1>Header</h1>"
            "<p><span>text</span></p>"
            "<p>footer <em>like</em> a boss</p>"
            "</body></html>"
        )
        tools.assert_equal(wanted, cleaned_text)

Example #12

0

Show file

File: test_dom_utils.py Project: anukat2015/jusText

def test_remove_head_tag():
    """Check that ``preprocessor`` strips the ``<head>`` element from the tree."""
    source = ''.join([
        '<html><head><title>Title</title></head><body>',
        '<h1>Header</h1>',
        '<p><span>text</span></p>',
        '<p>footer <em>like</em> a boss</p>',
        '</body></html>',
    ])

    # The parsed tree must serialize back to exactly the input markup.
    tree = html.fromstring(source)
    round_tripped = html.tostring(tree).decode("utf8")
    assert source == round_tripped

    # Preprocessing is expected to remove <head> and leave the body intact.
    cleaned = preprocessor(tree)
    actual = html.tostring(cleaned).decode("utf8")
    wanted = ''.join([
        '<html><body>',
        '<h1>Header</h1>',
        '<p><span>text</span></p>',
        '<p>footer <em>like</em> a boss</p>',
        '</body></html>',
    ])
    assert wanted == actual

Example #13

0

Show file

File: test_dom_utils.py Project: JacobWSU/jusText

    def test_preprocess_simple_unicode_xhtml_string_with_declaration(self):
        """Check preprocessing of an XHTML document with an XML declaration.

        The expected result consists of the root element (attributes intact)
        wrapping the empty body; declaration, doctype and ``<head>`` are absent.
        """
        source_parts = [
            '<?xml version="1.0" encoding="windows-1250"?>',
            "<!DOCTYPE html>",
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
            "<head>",
            "<title>Hello World</title>",
            '<meta http-equiv="imagetoolbar" content="no" />',
            '<meta http-equiv="Content-Type" content="text/html; charset=windows-1250" />',
            "</head>",
            '<body id="index">',
            "</body>",
            "</html>",
        ]
        wanted_parts = [
            '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="sk" lang="sk">',
            '<body id="index">',
            "</body>",
            "</html>",
        ]

        tree = preprocessor(html_to_dom("".join(source_parts)))
        actual = html.tostring(tree).decode("utf8")
        tools.assert_equal("".join(wanted_parts), actual)