def test_html_to_text():
    """Check u.html_to_text against a handful of small HTML snippets.

    Covered inputs: paragraphs with a list, non-ASCII text, leading
    <br/> tags, <style> blocks and HTML comments.  The result is compared
    as bytes (or decoded from utf8 for the non-ASCII case).
    """
    # Paragraphs, a line break and an unordered list.
    list_markup = """<body> <p>Hello world!</p> <br> <ul> <li>One!</li> <li>Two</li> </ul> <p> Haha </p> </body>"""
    eq_(b"Hello world! \n\n * One! \n * Two \nHaha",
        u.html_to_text(list_markup))

    # Non-ASCII content is expected back once the result is utf8-decoded.
    eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))

    # Leading <br/> tags are not expected to leave anything in the output.
    br_markup = '<body><br/><br/>Hi</body>'
    eq_(b'Hi', u.html_to_text(br_markup))

    # Contents of <style> elements are not expected in the output.
    style_markup = """Hi <style type="text/css"> div, p, li { font: 13px 'Lucida Grande', Arial, sans-serif; } </style> <style type="text/css"> h1 { font: 13px 'Lucida Grande', Arial, sans-serif; } </style>"""
    eq_(b'Hi', u.html_to_text(style_markup))

    # HTML comments are not expected in the output.
    comment_markup = """<div> <!-- COMMENT 1 --> <span>TEXT 1</span> <p>TEXT 2 <!-- COMMENT 2 --></p> </div>"""
    eq_(b'TEXT 1 \nTEXT 2', u.html_to_text(comment_markup))
def extract_reply_and_check(filename):
    """Extract the reply from the HTML file *filename* and check its text.

    The file content is passed through quotations.extract_from_html and
    u.html_to_text; the result must equal "Hi. I am fine.\\n\\nThanks,\\nAlex"
    once everything matched by RE_WHITESPACE is removed from both sides.
    """
    # Use a context manager so the fixture file is closed even when the
    # read fails.
    with open(filename) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)

    # Other tests of html_to_text compare its result against bytes; a bytes
    # value cannot be fed to the str substitution below on Python 3, so
    # decode it first.
    if isinstance(plain_reply, bytes):
        plain_reply = plain_reply.decode('utf8')

    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))
def extract_reply_and_check(filename):
    """Extract the reply from the HTML file *filename* and check its text.

    The file content is passed through quotations.extract_from_html and
    u.html_to_text, decoded from utf8, and must equal
    "Hi. I am fine.\\n\\nThanks,\\nAlex" once everything matched by
    RE_WHITESPACE is removed from both sides.
    """
    import sys

    # The encoding keyword is only passed on Python 3, where the file is
    # read as utf8 text.
    kwargs = {}
    if sys.version_info > (3, 0):
        kwargs["encoding"] = "utf8"

    # Use a context manager so the fixture file is closed even when the
    # read fails.
    with open(filename, **kwargs) as f:
        msg_body = f.read()

    reply = quotations.extract_from_html(msg_body)
    plain_reply = u.html_to_text(reply)
    plain_reply = plain_reply.decode('utf8')

    eq_(RE_WHITESPACE.sub('', "Hi. I am fine.\n\nThanks,\nAlex"),
        RE_WHITESPACE.sub('', plain_reply))
def test_html_to_text():
    """Check u.html_to_text on a flat <div> and on a nested <div><span>."""
    # The other html_to_text tests compare the result against bytes, so the
    # expected value is a bytes literal; on Python 2 b"Hello" == "Hello"
    # still holds, so this stays backward-compatible.
    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))

    # NOTE(review): expecting None for this input presumably depends on
    # setup that is not visible here (e.g. a patched limit) -- confirm.
    eq_(None, u.html_to_text("<div><span>Hi</span></div>"))
def test_bad_html_to_text():
    """u.html_to_text is expected to return None for bare <br>-joined text."""
    # NOTE(review): why this markup counts as "bad" is not visible from
    # here; presumably the conversion gives up on it -- confirm.
    eq_(None, u.html_to_text("one<br>two<br>three"))
def extract_from_html(msg_body):
    """
    Return the html of msg_body with the quoted part of the message removed.

    Tag based cutters ('gmail_quote', zimbra, 'blockquote', Microsoft,
    by id, "from" block) are tried in order and the chain stops at the
    first one that reports a cut.  Afterwards the plain text algorithm is
    used to find a splitter or leftover quotation:

    * checkpoint text is added to the html tags,
    * the html is converted to text,
    * quotations are extracted from the text,
    * checkpoints found on the deleted lines mark the tags to delete
      from a copy of the tree taken before checkpoints were added.

    A body that is empty after stripping is returned unchanged.  A body
    that produces more than MAX_LINES_COUNT text lines is returned with
    its line breaks removed and no further processing.
    """
    if msg_body.strip() == '':
        return msg_body

    # Line breaks are dropped before the body is parsed.
    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    utf8_parser = html.HTMLParser(encoding="utf-8")
    html_tree = html.document_fromstring(msg_body, parser=utf8_parser)

    # `or` short-circuits: later cutters only run while earlier ones
    # returned a false value.
    cut_quotations = (
        html_quotations.cut_gmail_quote(html_tree)
        or html_quotations.cut_zimbra_quote(html_tree)
        or html_quotations.cut_blockquote(html_tree)
        or html_quotations.cut_microsoft_quote(html_tree)
        or html_quotations.cut_by_id(html_tree)
        or html_quotations.cut_from_block(html_tree)
    )

    # Copy taken before add_checkpoint is applied to html_tree; this copy
    # is what gets serialized for the caller.
    clean_tree = deepcopy(html_tree)

    checkpoint_total = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * checkpoint_total

    text_with_checkpoints = html_to_text(html.tostring(html_tree))
    text_with_checkpoints = preprocess(
        text_with_checkpoints, '\n', content_type='text/html')
    lines = text_with_checkpoints.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # For every line, the numbers of the checkpoints found on it; the
    # first and last four characters of each match are dropped to leave
    # only the number.
    line_checkpoints = []
    for text_line in lines:
        matches = re.findall(html_quotations.CHECKPOINT_PATTERN, text_line)
        line_checkpoints.append([int(match[4:-4]) for match in matches])

    # Checkpoint text is removed before the plain text algorithm runs.
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', text_line)
             for text_line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        # Nothing was removed by the text pass: return the tag-cut tree if
        # a cutter fired, otherwise the body itself.
        if cut_quotations:
            return html.tostring(clean_tree)
        return msg_body

    # Collect checkpoints from deleted lines
    for line_no in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[line_no]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        clean_tree, 0, quotation_checkpoints)
    return html.tostring(clean_tree)
def extract_from_html(msg_body):
    """
    Strip the quoted message from an html message body and return html.

    First the tag based cutters are applied ('gmail_quote', zimbra,
    'blockquote', Microsoft quotations, by id, "from" block); evaluation
    stops at the first cutter that returns a true value.  Then the plain
    text algorithm cuts out a splitter or leftover quotation.

    The text pass works by adding checkpoint text to the html tags,
    converting the html to text, extracting quotations from the text,
    checking which checkpoints were on deleted lines and deleting the
    corresponding tags from a copy of the tree made before checkpoints
    were added.

    Returns msg_body unchanged when it is blank.  Returns msg_body with
    line breaks removed when the text has more than MAX_LINES_COUNT
    lines, or when neither pass removed anything.
    """
    if msg_body.strip() == '':
        return msg_body

    msg_body = msg_body.replace('\r\n', '').replace('\n', '')
    html_tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding="utf-8"))

    # Short-circuit chain: the order of the cutters is significant.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_zimbra_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))

    # Keep a copy made before add_checkpoint is applied to the tree.
    tree_without_checkpoints = deepcopy(html_tree)

    n_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * n_checkpoints

    serialized = html.tostring(html_tree)
    as_text = html_to_text(serialized)
    as_text = preprocess(as_text, '\n', content_type='text/html')
    lines = as_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line; slicing [4:-4] leaves only the
    # checkpoint number of each match.
    line_checkpoints = [
        [int(hit[4:-4])
         for hit in re.findall(html_quotations.CHECKPOINT_PATTERN, row)]
        for row in lines
    ]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', row)
             for row in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        # The text pass removed nothing; fall back to the tag-cut result
        # if there is one.
        return (html.tostring(tree_without_checkpoints)
                if cut_quotations else msg_body)

    # Collect checkpoints from deleted lines
    for deleted_index in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[deleted_index]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(tree_without_checkpoints, 0,
                                          quotation_checkpoints)
    return html.tostring(tree_without_checkpoints)
def test_html_to_text():
    """Check u.html_to_text on a flat <div> and on a nested <div><span>."""
    # (expected result, input markup), checked in this order.
    # NOTE(review): the None expectation for the nested markup presumably
    # depends on setup that is not visible here -- confirm.
    cases = (
        (b"Hello", "<div>Hello</div>"),
        (None, "<div><span>Hi</span></div>"),
    )
    for expected, markup in cases:
        eq_(expected, u.html_to_text(markup))