Exemple #1
0
def test_HtmlTool_clean_html():
    """Exercise ``HtmlTool.clean_html`` on a tiny document and on ``html_nasty``.

    The cleaned HTML is printed for manual inspection. The test always ends
    as an expected failure; see the comment at the ``pytest.xfail`` call.

    NOTE(review): ``html_nasty`` and ``pytest`` are not defined inside this
    function; they are assumed to be available at module level.
    """
    from clair.textprocessing import HtmlTool

    # Smoke check only: the result for the minimal document is not asserted.
    html_nice = HtmlTool.clean_html("""<head></head>
                                      <body>Foo</body>""")
    print(html_nice)

    html_nice = HtmlTool.clean_html(html_nasty)
    print(html_nice)

    # pytest.xfail() raises immediately, so the assertion below is never
    # executed. It is kept to record the behavior that is actually wanted:
    # the entity-encoded umlaut should survive the cleanup.
    pytest.xfail("LXML does strange things.") #IGNORE:E1101
    assert "Blitzger&auml;t" in html_nice
Exemple #2
0
def test_HtmlTool_to_nice_text():
    """Test the conversion of HTML to text with ``HtmlTool.to_nice_text``.

    Converts ``html_nasty`` and checks that the words containing umlauts
    appear in the result and that the result consists of exactly 8 lines.
    Input and output are printed for manual inspection.

    NOTE(review): ``html_nasty`` is not defined inside this function; it is
    assumed to be available at module level.
    """
    from clair.textprocessing import HtmlTool

    text_nice = HtmlTool.to_nice_text(html_nasty)
    print(html_nasty)
    print(text_nice)

    # Non-ASCII characters must come through as real characters.
    assert u"Blitzgerät" in text_nice
    assert u"Rückseite" in text_nice
    # The converted text is expected to have exactly 8 lines.
    assert len(text_nice.split("\n")) == 8
Exemple #3
0
def test_HtmlTool_remove_html():
    """Test the HTML to pure text conversion of ``HtmlTool.remove_html``.

    Covers tag removal with entity decoding, removal of an embedded style
    sheet, non-string input (``None`` and ``nan``), and a string that
    contains only entities and surrounding spaces.

    NOTE(review): ``nan`` is not defined inside this function; it is assumed
    to be available at module level.
    """
    from clair.textprocessing import HtmlTool

    # Tags are dropped, runs of whitespace collapse to one space, and the
    # ``&gt;`` entity is decoded.
    text = HtmlTool.remove_html(
                    "This is <b>bold</b> text.   <p>Paragraph.</p> 3 &gt; 2")
    print(text)
    assert text == "This is bold text. Paragraph. 3 > 2"

    # The contents of a <style> element must not leak into the text.
    text = HtmlTool.remove_html(
                    """Integrated style sheet. 
                       <style type="text/css"> p {color:blue;} </style>
                       Text after style sheet.""")
    print(text)
    assert text == "Integrated style sheet. Text after style sheet."

    # Missing values are converted to an empty string.
    assert HtmlTool.remove_html(None) == ""
    assert HtmlTool.remove_html(nan) == ""

    # Leading and trailing spaces are kept when the input has no tags.
    text = HtmlTool.remove_html(" 1 &lt; 2 &gt; 0.5 ")
    print(text)
    assert text == " 1 < 2 > 0.5 "