def extract(body, sender):
    """Split a message body into text and signature using the classifiers.

    Returns a ``(text, signature)`` tuple. When no signature is detected,
    when removing it would leave no text, or when any step raises, the
    result is ``(body, None)``; ``body`` is the stripped body if the failure
    happened after stripping, otherwise the body exactly as passed in.
    """
    try:
        line_sep = get_delimiter(body)
        body = body.strip()

        if not has_signature(body, sender):
            return (body, None)

        body_lines = body.splitlines()
        line_markers = _mark_lines(body_lines, sender)
        text_lines, signature_lines = _process_marked_lines(
            body_lines, line_markers)

        if signature_lines:
            message_text = line_sep.join(text_lines)
            # Only report a signature when some real text is left over.
            if message_text.strip():
                return (message_text, line_sep.join(signature_lines))
    except Exception:
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)
def extract_signature(msg_body):
    r"""Split a message into its text and its signature block.

    Analyzes the message for the presence of a signature block (by common
    patterns) and returns a tuple with two elements: the message text
    without the signature block, and the signature itself (``None`` when
    nothing was found). A "phone" signature matched by RE_PHONE_SIGNATURE
    is cut off first and appended to the returned signature.

    If anything raises, the error is logged and ``(msg_body, None)`` is
    returned.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    """
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()
        phone_signature = None

        # strip off phone signature
        phone_match = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_match:
            # The match offsets index into msg_body, but the cut is made on
            # stripped_body: compensate for the leading whitespace that
            # strip() removed, otherwise the cut lands too far to the right.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut_at = max(phone_match.start() - leading, 0)
            stripped_body = stripped_body[:cut_at]
            phone_signature = phone_match.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature_match = RE_SIGNATURE.search(candidate)
        signature = signature_match.group() if signature_match else ''
        if not signature:
            # An empty match is treated like no match: slicing with [:-0]
            # below would otherwise throw away the whole body.
            return (stripped_body.strip(), phone_signature)

        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(), signature.strip())
    except Exception:
        log.exception('ERROR extracting signature')
        return (msg_body, None)
def extract_signature(msg_body):
    r"""Split a message into its text and its signature block.

    Analyzes the message for the presence of a signature block (by common
    patterns) and returns a tuple with two elements: the message text
    without the signature block, and the signature itself (``None`` when
    nothing was found). A "phone" signature matched by RE_PHONE_SIGNATURE
    is cut off first and appended to the returned signature.

    If anything raises, the error is logged and ``(msg_body, None)`` is
    returned.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    """
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()
        phone_signature = None

        # strip off phone signature
        phone_match = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_match:
            # The match offsets index into msg_body, but the cut is made on
            # stripped_body: compensate for the leading whitespace that
            # strip() removed, otherwise the cut lands too far to the right.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut_at = max(phone_match.start() - leading, 0)
            stripped_body = stripped_body[:cut_at]
            phone_signature = phone_match.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature_match = RE_SIGNATURE.search(candidate)
        signature = signature_match.group() if signature_match else ''
        if not signature:
            # An empty match is treated like no match: slicing with [:-0]
            # below would otherwise throw away the whole body.
            return (stripped_body.strip(), phone_signature)

        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(), signature.strip())
    except Exception:
        # The exception object itself is not needed: log.exception records it.
        log.exception('ERROR extracting signature')
        return (msg_body, None)
def extract_from_html_by_plaintext(html_tree, placeholder):
    """Cut quotations out of ``html_tree`` using the plain text algorithm.

    A copy of the tree gets checkpoint markers added, is rendered to plain
    text and run through the plain text quotation detection. Checkpoints
    found on the lines that detection deleted are then handed, together
    with ``placeholder``, to html_quotations.delete_quotation_tags() for
    the original ``html_tree``.

    Returns False when the text has more than MAX_LINES_COUNT lines or when
    no lines were deleted; returns True otherwise.
    """
    star_mask = '3423oorkg432'
    checkpoint_re = html_quotations.CHECKPOINT_PATTERN

    tree_with_checkpoints = deepcopy(html_tree)
    checkpoint_total = html_quotations.add_checkpoint(tree_with_checkpoints, 0)
    quotation_checkpoints = [False] * checkpoint_total
    marked_html = html.tostring(tree_with_checkpoints)

    # html2text adds unnecessary star symbols, so the genuine ones are
    # masked before the conversion, the generated ones are dropped after
    # it, and finally the genuine ones are restored.
    marked_html = marked_html.replace('*', star_mask)
    plain_text = textify_html(marked_html)
    plain_text = plain_text.replace('*', '')
    plain_text = plain_text.replace(star_mask, '*')

    line_sep = get_delimiter(plain_text)
    plain_text = preprocess(plain_text, line_sep, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return False

    # Collect the checkpoint numbers seen on each line (the slice drops the
    # four characters surrounding the number on either side of a match).
    line_checkpoints = []
    for line in lines:
        found = re.findall(checkpoint_re, line)
        line_checkpoints.append([int(token[4:-4]) for token in found])

    # Remove checkpoints
    lines = [re.sub(checkpoint_re, '', line) for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        return False

    # Collect checkpoints from deleted lines
    for line_no in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[line_no]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree, quotation_checkpoints, placeholder)

    return True
def extract_from_plain(msg_body):
    """Return the non-quoted part of a plain text message.

    The body is preprocessed, only its first MAX_LINES_COUNT lines are
    marked and filtered by process_marked_lines(), and the surviving lines
    are joined with the delimiter detected in the input and post-processed.
    """
    line_sep = get_delimiter(msg_body)
    prepared = preprocess(msg_body, line_sep)

    # Long messages are not processed in full: only the leading lines are.
    candidate_lines = prepared.splitlines()[:MAX_LINES_COUNT]
    line_markers = mark_message_lines(candidate_lines)
    kept_lines = process_marked_lines(candidate_lines, line_markers)

    # Concatenate lines, change links back, strip and return.
    return postprocess(line_sep.join(kept_lines))
def extract_from_plain(msg_body):
    """Extracts a non quoted message from provided plain text.

    The body is preprocessed, only its first MAX_LINES_COUNT lines are
    marked and filtered by process_marked_lines(), and the surviving lines
    are joined with the delimiter detected in the input and post-processed.
    """
    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)

    # don't process too long messages
    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

    # concatenate lines, change links back, strip and return
    msg_body = delimiter.join(lines)
    msg_body = postprocess(msg_body)
    return msg_body
def split_emails(msg_body):
    """Partition a plain text email chain into sections.

    :param msg_body: plain text email chain
    :return: whatever ``partition()`` produces for the message lines and
        the indices of the lines marked ``'s'`` (its exact shape is not
        visible from this function -- TODO confirm against ``partition``)
    """
    line_sep = get_delimiter(msg_body)
    prepared = quotations.preprocess(msg_body, line_sep)

    lines = prepared.splitlines()
    markers = mark_message_lines(lines)

    # Indices of every 's' marker, i.e. the lines denoting a quoted section.
    section_starts = [
        index for index, marker in enumerate(markers) if marker == 's'
    ]

    return partition(lines, section_starts)
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    Returns a ``(text, changed)`` tuple. ``changed`` is True only when
    get_delimiter() reports ``'\r\n'`` for ``s``, in which case every
    occurrence of that delimiter is replaced with ``'\n'``.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    line_sep = get_delimiter(s)
    if line_sep != '\r\n':
        return s, False
    return s.replace(line_sep, '\n'), True
def split_emails(msg):
    """Mark the lines of a (possibly multi-email) message thread.

    The message is preprocessed, its first MAX_LINES_COUNT lines are marked
    by mark_message_lines() (split lines, content lines and empty lines),
    and split-line markers that fall inside header blocks are corrected by
    _correct_splitlines_in_headers().

    Returns the corrected markers.
    """
    line_sep = get_delimiter(msg)
    prepared = preprocess(msg, line_sep)

    # Too long messages are not processed in full.
    leading_lines = prepared.splitlines()[:MAX_LINES_COUNT]
    raw_markers = mark_message_lines(leading_lines)

    # We don't want split lines inside header blocks.
    return _correct_splitlines_in_headers(raw_markers, leading_lines)
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.

    Returns ``msg_body`` itself when it is blank, when its text rendering
    has more than MAX_LINES_COUNT lines, or when nothing was cut at all.
    Otherwise returns the serialized tree, decoded from UTF-8, on every
    path (the tag-only path used to return the undecoded serialization).
    """
    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )

    # Tag-based cutting; `or` stops at the first cutter that reports success.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree))

    # Snapshot taken after tag-based cutting and before checkpoints are
    # added: this is the tree that is eventually serialized and returned.
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.decode('utf-8').replace(
        '*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        if cut_quotations:
            # Decode here as well, so this path returns text like the
            # final return below instead of the raw serialization.
            return html.tostring(html_tree_copy).decode('utf-8')
        return msg_body

    # Collect checkpoints from deleted lines
    for i in range(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[i]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy).decode('utf-8')
def test_get_delimiter():
    """get_delimiter() yields CRLF or LF, and LF when there is no line break."""
    expectations = (
        ('abc\r\n123', '\r\n'),
        ('abc\n123', '\n'),
        ('abc', '\n'),
    )
    for text, expected in expectations:
        eq_(expected, u.get_delimiter(text))
def test_get_delimiter():
    """get_delimiter() yields CRLF or LF, and LF when there is no line break."""
    expectations = (
        ('abc\r\n123', '\r\n'),
        ('abc\n123', '\n'),
        ('abc', '\n'),
    )
    for text, expected in expectations:
        eq_(expected, utils.get_delimiter(text))
def extract_from_html(msg_body):
    """
    Return the non-quoted part of an html message body, combining the
    tag-based cutters with the plain text algorithm.

    First the 'gmail_quote', 'blockquote' and Microsoft quotations (plus
    the by-id and from-block cutters) are tried on the parsed tree.

    Then the plain text algorithm cuts out a splitter or a leftover
    quotation: checkpoint text is added to the html tags, the html is
    converted to text, quotations are extracted from that text, the
    checkpoints on the deleted lines are collected, and the matching tags
    are deleted.

    ``msg_body`` is returned as is when it is blank, when its text rendering
    has more than MAX_LINES_COUNT lines, or when nothing was cut at all.
    """
    if msg_body.strip() == '':
        return msg_body

    star_mask = '3423oorkg432'
    checkpoint_re = html_quotations.CHECKPOINT_PATTERN

    html_tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding="utf-8"))

    # `or` stops at the first cutter that reports success.
    cut_quotations = (
        html_quotations.cut_gmail_quote(html_tree)
        or html_quotations.cut_blockquote(html_tree)
        or html_quotations.cut_microsoft_quote(html_tree)
        or html_quotations.cut_by_id(html_tree)
        or html_quotations.cut_from_block(html_tree)
    )

    # Snapshot taken before checkpoints are added; it is what gets returned.
    clean_tree = deepcopy(html_tree)

    checkpoint_total = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * checkpoint_total
    marked_html = html.tostring(html_tree)

    converter = html2text.HTML2Text()
    converter.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols, so the genuine ones are
    # masked before the conversion, the generated ones are dropped after
    # it, and finally the genuine ones are restored.
    marked_html = marked_html.replace('*', star_mask)
    plain_text = converter.handle(marked_html)
    plain_text = plain_text.replace('*', '')
    plain_text = plain_text.replace(star_mask, '*')

    line_sep = get_delimiter(plain_text)
    plain_text = preprocess(plain_text, line_sep, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect the checkpoint numbers found on each line
    line_checkpoints = []
    for line in lines:
        found = re.findall(checkpoint_re, line)
        line_checkpoints.append([int(token[4:-4]) for token in found])

    # Remove checkpoints
    lines = [re.sub(checkpoint_re, '', line) for line in lines]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        if cut_quotations:
            return html.tostring(clean_tree)
        return msg_body

    # Collect checkpoints from deleted lines
    for line_no in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[line_no]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(clean_tree, 0, quotation_checkpoints)

    return html.tostring(clean_tree)
def test_get_delimiter():
    """get_delimiter() yields CRLF or LF, and LF when there is no line break."""
    expectations = (
        ("abc\r\n123", "\r\n"),
        ("abc\n123", "\n"),
        ("abc", "\n"),
    )
    for text, expected in expectations:
        eq_(expected, u.get_delimiter(text))
def preprocess(emails, folder, csv_results):
    """
    v1: used to preprocess the marked file (xx_body), to generate the
        original one (xx_origin), the signature part (_sig), and the
        details information (xx_detail).
    v2: used to preprocess the marked emails (xx_body), to generate a csv
        file that contains all the ACTUAL information of the emails.

    For every name in ``emails`` the file ``folder + name`` is parsed with
    parse_msg_sender(); emails without a sender or a message are reported
    on stdout and skipped. Among the last SIGNATURE_MAX_LINES lines, those
    starting with SIGNATURE_ANNOTATION have the annotation removed and are
    passed through find_details(). One row per email is written to
    ``csv_results``; ``has_sig`` is 1 when at least one annotated line was
    seen and -1 otherwise.
    """
    fields = [
        'filename', 'sender', 'origin', 'marked', 'has_sig', 'sig', 'name',
        'title', 'company', 'address', 'number', 'work_number', 'fax',
        'email', 'url', 'slogan', 'quote'
    ]

    with open(csv_results, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()

        for email in emails:
            filename = folder + email
            sender, msg = parse_msg_sender(filename, sender_known=True)
            if not sender or not msg:
                # Parenthesized single argument: prints the same text under
                # Python 2 and is valid syntax under Python 3.
                print('Empty: ' + filename)
                continue

            delim = get_delimiter(msg)
            lines = msg.split(delim)
            sig = []
            sig_details = {}  # was named `dict`, which shadowed the builtin
            label = -1

            # Walk the tail of the message backwards, last line first.
            # NOTE(review): the original formatting was lost; it is assumed
            # that everything up to `lines[-i] = line` belongs to the `if`
            # body -- confirm against the original file.
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                line = lines[-i]
                if line[:len(SIGNATURE_ANNOTATION)] == SIGNATURE_ANNOTATION:
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    sig_details, line = find_details(sig_details, line)
                    sig.append(line)
                    lines[-i] = line

            # The unused xx_origin / xx_detail / _sig file names that were
            # still computed here (v1 leftovers) are no longer built.
            writer.writerow({
                'filename': email,
                'sender': sender,
                'origin': delim.join(lines),
                'marked': msg,
                'has_sig': label,
                # Lines were collected bottom-up; restore reading order.
                'sig': delim.join(sig[::-1]),
                'name': sig_details.get('name'),
                'title': sig_details.get('title'),
                'company': sig_details.get('company'),
                'address': sig_details.get('address'),
                'number': sig_details.get('num'),
                'work_number': sig_details.get('work_num'),
                'fax': sig_details.get('fax'),
                'email': sig_details.get('email'),
                'url': sig_details.get('url'),
                'slogan': sig_details.get('slogan'),
                'quote': sig_details.get('quote')
            })