def test_CRLF(): """CR is not converted to ' ' """ symbol = ' ' extracted = quotations.extract_from_html('<html>\r\n</html>') assert_false(symbol in extracted) eq_('<html></html>', RE_WHITESPACE.sub('', extracted)) msg_body = """My reply <blockquote> <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: </div> <div> Test </div> </blockquote>""" msg_body = msg_body.replace('\n', '\r\n') extracted = quotations.extract_from_html(msg_body) assert_false(symbol in extracted) # Keep new lines otherwise "My reply" becomes one word - "Myreply" eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
def test_no_blockquote(): msg_body = """ <html> <body> Reply <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: </div> <div> Test </div> </body> </html> """ reply = """ <html> <head></head> <body> Reply </body></html>""" eq_(RE_WHITESPACE.sub('', reply), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_compact(): msg_body = 'Reply' \ '<div class="gmail_quote">' \ '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote:' \ '<div>Test</div>' \ '</div>' \ '</div>' eq_("<html><head></head><body>Reply</body></html>", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_too_large_html(): msg_body = 'Reply' \ '<div class="gmail_quote">' \ '<div class="gmail_quote">On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote:' \ '<div>Test</div>' \ '</div>' \ '</div>' eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_gmail_quote_blockquote(): msg_body = """Message <blockquote class="gmail_quote"> <div class="gmail_default"> My name is William Shakespeare. <br/> </div> </blockquote>""" eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_quotation_splitter_outside_blockquote(): msg_body = """Reply <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: </div> <blockquote> <div> Test </div> </blockquote> """ eq_("<html><head></head><body>Reply</body></html>", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_from_block(): msg_body = """<div> message<br> <div> <hr> From: <a href="mailto:[email protected]">[email protected]</a><br> Date: Fri, 23 Mar 2012 12:35:31 -0600<br> To: <a href="mailto:[email protected]">[email protected]</a><br> Subject: You Have New Mail From Mary!<br><br> text </div></div> """ eq_('<html><head></head><body><div>message<br></div></body></html>', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_unicode_in_reply(): msg_body = u"""Reply \xa0 \xa0 Text<br> <div> <br> </div> <blockquote> Quote </blockquote>""".encode("utf-8") eq_( "<html><head></head><body>Reply  Text<br><div><br></div>" "</body></html>", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def extract_reply_and_check(filename): import sys kwargs = {} if sys.version_info > (3, 0): kwargs["encoding"] = "utf8" f = open(filename, **kwargs) msg_body = f.read() reply = quotations.extract_from_html(msg_body) plain_reply = u.html_to_text(reply) plain_reply = plain_reply.decode('utf8') eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"), RE_WHITESPACE.sub('', plain_reply))
def test_readable_html_empty(): msg_body = """ <blockquote> Reply <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: </div> <div> Test </div> </blockquote>""" eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_reply_shares_div_with_from_block(): msg_body = ''' <body> <div> Blah<br><br> <hr>Date: Tue, 22 May 2012 18:29:16 -0600<br> To: [email protected]<br> From: [email protected]<br> Subject: You Have New Mail From x!<br><br> </div> </body>''' eq_('<html><head></head><body><div>Blah<br><br></div></body></html>', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_regular_blockquote(): msg_body = """Reply <blockquote>Regular</blockquote> <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: </div> <blockquote> <div> <blockquote>Nested</blockquote> </div> </blockquote> """ eq_( "<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>", RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_from_block_and_quotations_in_separate_divs(): msg_body = ''' Reply <div> <hr/> <div> <font> <b>From: [email protected]</b> <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b> </font> </div> <div> Quoted message </div> </div> ''' eq_('<html><head></head><body>Reply<div><hr></div></body></html>', RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_validate_output_html(): msg_body = """Reply <div> On 11-Apr-2011, at 6:54 PM, Bob <[email protected]> wrote: <blockquote> <div> Test </div> </blockquote> </div> <div/> """ out = quotations.extract_from_html(msg_body) ok_('<html>' in out and '</html>' in out, 'Invalid HTML - <html>/</html> tag not present') ok_('<div/>' not in out, 'Invalid HTML output - <div/> element is not valid')
def test_blockquote_disclaimer(): msg_body = """ <html> <body> <div> <div> message </div> <blockquote> Quote </blockquote> </div> <div> disclaimer </div> </body> </html> """ stripped_html = """ <html> <head></head> <body> <div> <div> message </div> </div> <div> disclaimer </div> </body> </html> """ eq_(RE_WHITESPACE.sub('', stripped_html), RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
def test_empty_body(): eq_('', quotations.extract_from_html(''))
def test_OLK_SRC_BODY_SECTION_stripped(): eq_( '<html><head></head><body><div>Reply</div></body></html>', RE_WHITESPACE.sub('', quotations.extract_from_html(OLK_SRC_BODY_SECTION)))
def test_bad_html(): bad_html = "<html></html>" eq_(bad_html, quotations.extract_from_html(bad_html))
def test_reply_separated_by_hr(): eq_( '<html><head></head><body><div>Hi<div>there</div></div></body></html>', RE_WHITESPACE.sub('', quotations.extract_from_html(REPLY_SEPARATED_BY_HR)))
def test_gmail_forwarded_msg(): msg_body = """<div dir="ltr"><br><div class="gmail_quote">---------- Forwarded message ----------<br>From: <b class="gmail_sendername">Bob</b> <span dir="ltr"><<a href="mailto:[email protected]">[email protected]</a>></span><br>Date: Fri, Feb 11, 2010 at 5:59 PM<br>Subject: Bob WFH today<br>To: Mary <<a href="mailto:[email protected]">[email protected]</a>><br><br><br><div dir="ltr">eom</div> </div><br></div>""" extracted = quotations.extract_from_html(msg_body) eq_(RE_WHITESPACE.sub('', msg_body), RE_WHITESPACE.sub('', extracted))