def extract_signature(msg_body):
    r"""
    Analyze a message for the presence of a signature block (by common
    patterns) and split the message into text and signature.

    Returns a tuple of two elements: the message text without the
    signature block, and the signature itself (``None`` when no
    signature was found).

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)

    Any exception raised while extracting is logged and the original
    ``msg_body`` is returned unchanged together with ``None``.
    """
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()
        phone_signature = None

        # strip off phone signature
        phone_match = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_match:
            # The match offsets refer to ``msg_body`` while the slice is
            # applied to its stripped copy, so shift the offset by the
            # amount of leading whitespace that strip() removed.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut = max(phone_match.start() - leading, 0)
            stripped_body = stripped_body[:cut]
            phone_signature = phone_match.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature_match = RE_SIGNATURE.search(candidate)
        signature = signature_match.group() if signature_match else None

        # An empty match is treated like "no signature": slicing with
        # [:-0] below would otherwise discard the whole body.
        if not signature:
            return (stripped_body.strip(), phone_signature)

        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(), signature.strip())
    except Exception:
        # Deliberate best effort: never fail the caller, hand the
        # message back untouched instead.
        log.exception('ERROR extracting signature')
        return (msg_body, None)
def extract_from_plain(msg_body):
    """Return the non-quoted part of a plain-text message.

    The message is preprocessed, split into lines, each line is marked
    and the marked lines are filtered; the surviving lines are joined
    back with the message's own line delimiter and postprocessed.

    Messages with more than ``MAX_LINES_COUNT`` lines are not processed
    and the original ``msg_body`` is returned as is.
    """
    line_sep = get_delimiter(msg_body)
    prepared = preprocess(msg_body, line_sep)
    body_lines = prepared.splitlines()

    # don't process too long messages
    if len(body_lines) > MAX_LINES_COUNT:
        return msg_body

    line_markers = mark_message_lines(body_lines)
    kept_lines = process_marked_lines(body_lines, line_markers)

    # concatenate lines, change links back, strip and return
    return postprocess(line_sep.join(kept_lines))
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.

    Returns ``msg_body`` unchanged when it is blank, when the converted
    text has more than ``MAX_LINES_COUNT`` lines, or when neither the
    tag-based cutters nor the plain text algorithm removed anything.
    Otherwise returns the serialized, trimmed html tree.
    """
    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8"))

    # ``or`` short-circuits: later cutters only run while the earlier
    # ones returned a falsy value.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree))

    # Keep a copy taken before checkpoints are added; this copy is the
    # tree that gets trimmed and serialized for the caller.
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    # One flag per checkpoint; True means "belongs to a quotation".
    # (List multiplication instead of xrange keeps this runnable on
    # both Python 2 and Python 3.)
    quotation_checkpoints = [False] * number_of_checkpoints
    # NOTE(review): on Python 3 lxml's tostring() presumably returns
    # bytes by default, which would break the str.replace() calls
    # below -- confirm before relying on this under Python 3.
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ]
        for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
        for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    # Called for the flags it writes into ``return_flags``; the returned
    # lines are not needed here.
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        # collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(html_tree_copy, 0,
                                          quotation_checkpoints)

    return html.tostring(html_tree_copy)
def extract_from_html(msg_body):
    """
    Return the html of ``msg_body`` with quoted material removed.

    Two passes are combined:

    1. Tag-based cutting: 'gmail_quote', 'blockquote', Microsoft
       quotations and the id / block based cutters.
    2. The plain text algorithm, to cut out a splitter or leftover
       quotation. Checkpoint text is added to all html tags, the html
       is converted to text, quotations are extracted from the text,
       and the tags whose checkpoints were on deleted lines are
       removed from the html.

    ``msg_body`` itself is returned when it is blank, when the text has
    more than ``MAX_LINES_COUNT`` lines, or when neither pass removed
    anything.
    """
    if msg_body.strip() == '':
        return msg_body

    utf8_parser = html.HTMLParser(encoding="utf-8")
    tree = html.document_fromstring(msg_body, parser=utf8_parser)

    # Try the tag-based cutters in order, stopping at the first hit.
    tag_cutters = (
        html_quotations.cut_gmail_quote,
        html_quotations.cut_blockquote,
        html_quotations.cut_microsoft_quote,
        html_quotations.cut_by_id,
        html_quotations.cut_from_block,
    )
    tags_were_cut = any(cutter(tree) for cutter in tag_cutters)

    # Snapshot without checkpoints: this is what gets returned.
    clean_tree = deepcopy(tree)

    checkpoint_total = html_quotations.add_checkpoint(tree, 0)
    in_quotation = [False] * checkpoint_total
    marked_html = html.tostring(tree)

    converter = html2text.HTML2Text()
    converter.body_width = 0  # generate plain text without wrap

    # html2text adds star symbols of its own: mask the genuine ones,
    # drop whatever the conversion created, then unmask.
    star_mask = '3423oorkg432'
    text = converter.handle(marked_html.replace('*', star_mask))
    text = text.replace('*', '').replace(star_mask, '*')

    text = preprocess(text, get_delimiter(text), content_type='text/html')
    text_lines = text.splitlines()

    # Don't process too long messages
    if len(text_lines) > MAX_LINES_COUNT:
        return msg_body

    checkpoint_re = html_quotations.CHECKPOINT_PATTERN

    # Checkpoint numbers found on each line (marker text sliced off).
    checkpoints_by_line = [
        [int(token[4:-4]) for token in re.findall(checkpoint_re, row)]
        for row in text_lines
    ]

    # The quotation algorithm must see the lines without checkpoints.
    text_lines = [re.sub(checkpoint_re, '', row) for row in text_lines]

    # Plain text quotation extraction; only its flags are used here.
    flags = []
    process_marked_lines(text_lines, mark_message_lines(text_lines), flags)
    anything_deleted, start, stop = flags

    if not anything_deleted:
        if tags_were_cut:
            return html.tostring(clean_tree)
        return msg_body

    # Flag every checkpoint that sat on a deleted line.
    for line_no in range(start, stop):
        for number in checkpoints_by_line[line_no]:
            in_quotation[number] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(clean_tree, 0, in_quotation)

    return html.tostring(clean_tree)
def test_get_delimiter():
    """Check the delimiter ``utils.get_delimiter`` reports for sample texts.

    Covers a text with a CRLF line break, a text with an LF line break
    and a text without any line break (expected to yield LF).
    """
    expectations = (
        ('\r\n', 'abc\r\n123'),
        ('\n', 'abc\n123'),
        ('\n', 'abc'),
    )
    for expected, text in expectations:
        eq_(expected, utils.get_delimiter(text))