def test_html_fromstring_too_big(fromstring):
    """Check ``u.html_fromstring`` returns None and ``fromstring`` is unused.

    NOTE(review): ``fromstring`` is presumably a mock injected by a patch
    decorator that is not visible here, and the "too big" condition is
    presumably forced by another patch -- confirm against the decorators.
    """
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
    assert_false(fromstring.called)
def test_html_fromstring_exception():
    """Check that ``u.html_fromstring`` yields None for the sample markup.

    NOTE(review): judging by the test name, the underlying parser is
    presumably patched to raise by a decorator that is not visible here --
    confirm against the decorators.
    """
    outcome = u.html_fromstring("<html></html>")
    eq_(None, outcome)
def _extract_from_html(msg_body): """ Extract not quoted message from provided html message body using tags and plain text algorithm. Cut out the 'blockquote', 'gmail_quote' tags. Cut Microsoft quotations. Then use plain text algorithm to cut out splitter or leftover quotation. This works by adding checkpoint text to all html tags, then converting html to text, then extracting quotations from text, then checking deleted checkpoints, then deleting necessary tags. """ if msg_body.strip() == b'': return msg_body msg_body = msg_body.replace(b'\r\n', b'\n') html_tree = html_fromstring(msg_body) if html_tree is None: return msg_body cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or html_quotations.cut_zimbra_quote(html_tree) or html_quotations.cut_blockquote(html_tree) or html_quotations.cut_microsoft_quote(html_tree) or html_quotations.cut_by_id(html_tree) or html_quotations.cut_from_block(html_tree)) html_tree_copy = deepcopy(html_tree) number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0) quotation_checkpoints = [False] * number_of_checkpoints plain_text = html_tree_to_text(html_tree) plain_text = preprocess(plain_text, '\n', content_type='text/html') lines = plain_text.splitlines() # Don't process too long messages if len(lines) > MAX_LINES_COUNT: return msg_body # Collect checkpoints on each line line_checkpoints = [ [ int(i[4:-4]) # Only checkpoint number for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line) ] for line in lines ] # Remove checkpoints lines = [ re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines ] # Use plain text quotation extracting algorithm markers = remove_initial_spaces_and_mark_message_lines(lines) return_flags = [] process_marked_lines(lines, markers, return_flags) lines_were_deleted, first_deleted, last_deleted = return_flags if not lines_were_deleted and not cut_quotations: return msg_body if lines_were_deleted: #collect checkpoints from deleted lines for i in range(first_deleted, 
last_deleted): for checkpoint in line_checkpoints[i]: quotation_checkpoints[checkpoint] = True # Remove tags with quotation checkpoints html_quotations.delete_quotation_tags(html_tree_copy, 0, quotation_checkpoints) if _readable_text_empty(html_tree_copy): return msg_body return _html_tostring(html_tree_copy)