Example #1
0
def extract(body, sender):
    """Split a message body into its text and its signature.

    The body is stripped of surrounding whitespace first.  When
    ``has_signature`` reports a signature for this body/sender pair, the
    lines are marked with ``_mark_lines`` and divided into text and
    signature by ``_process_marked_lines``.

    Returns a ``(text, signature)`` tuple.  ``signature`` is None when no
    signature was found, when removing it would leave only whitespace, or
    when any step raised (the error is logged, not propagated).
    """
    try:
        # Detect the line separator on the untouched body, before stripping.
        line_sep = get_delimiter(body)
        body = body.strip()

        if not has_signature(body, sender):
            return (body, None)

        body_lines = body.splitlines()
        text_lines, signature_lines = _process_marked_lines(
            body_lines, _mark_lines(body_lines, sender))

        if signature_lines:
            text = line_sep.join(text_lines)
            # Never hand back an empty message: fall through and keep the
            # whole body when nothing but the signature would remain.
            if text.strip():
                return (text, line_sep.join(signature_lines))
    except Exception:
        log.exception('ERROR when extracting signature with classifiers')

    return (body, None)
Example #2
0
def extract(body, sender):
    """Separate the signature from the rest of a message body.

    Returns a tuple ``(stripped body, signature)``.  The second element is
    None whenever no usable signature could be separated: none detected by
    ``has_signature``, nothing but whitespace left once it is removed, or
    an exception raised along the way (which is logged and swallowed).
    """
    try:
        # The separator is sniffed from the original body, then the body is
        # trimmed; the trimmed body is also what the fallback returns.
        newline = get_delimiter(body)
        body = body.strip()

        if has_signature(body, sender):
            rows = body.splitlines()
            marks = _mark_lines(rows, sender)
            kept_rows, signature_rows = _process_marked_lines(rows, marks)

            if signature_rows:
                remaining = newline.join(kept_rows)
                if remaining.strip():
                    return remaining, newline.join(signature_rows)
    except Exception:
        log.exception('ERROR when extracting signature with classifiers')

    return body, None
Example #3
0
def extract_signature(msg_body):
    r"""
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    The signature element is None when neither ``RE_PHONE_SIGNATURE`` nor
    ``RE_SIGNATURE`` matches.  If anything raises, the error is logged and
    ``(msg_body, None)`` is returned with the body untouched.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    """
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()

        # strip off phone signature
        phone_signature = RE_PHONE_SIGNATURE.search(msg_body)
        if phone_signature:
            # The match offset is relative to msg_body, while the slice is
            # applied to the stripped body: compensate for the leading
            # whitespace that strip() removed, otherwise the cut lands too
            # late and leaves the start of the phone signature in the body.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut = max(phone_signature.start() - leading, 0)
            stripped_body = stripped_body[:cut]
            phone_signature = phone_signature.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)

        signature = signature.group()
        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(),
                signature.strip())
    except Exception:
        log.exception('ERROR extracting signature')
        return (msg_body, None)
Example #4
0
def extract_signature(msg_body):
    r'''
    Analyzes message for a presence of signature block (by common patterns)
    and returns tuple with two elements: message text without signature block
    and the signature itself.

    The signature element is None when neither ``RE_PHONE_SIGNATURE`` nor
    ``RE_SIGNATURE`` matches.  If anything raises, the error is logged and
    ``(msg_body, None)`` is returned with the body untouched.

    >>> extract_signature('Hey man! How r u?\n\n--\nRegards,\nRoman')
    ('Hey man! How r u?', '--\nRegards,\nRoman')

    >>> extract_signature('Hey man!')
    ('Hey man!', None)
    '''
    try:
        # identify line delimiter first
        delimiter = get_delimiter(msg_body)

        # make an assumption
        stripped_body = msg_body.strip()

        # strip off phone signature
        phone_match = RE_PHONE_SIGNATURE.search(msg_body)
        phone_signature = None
        if phone_match:
            # The regex ran against msg_body, so its offset must be shifted
            # by the amount of leading whitespace strip() dropped before it
            # can be used to slice the stripped body.
            leading = len(msg_body) - len(msg_body.lstrip())
            cut = max(phone_match.start() - leading, 0)
            stripped_body = stripped_body[:cut]
            phone_signature = phone_match.group()

        # decide on signature candidate
        lines = stripped_body.splitlines()
        candidate = get_signature_candidate(lines)
        candidate = delimiter.join(candidate)

        # try to extract signature
        signature = RE_SIGNATURE.search(candidate)
        if not signature:
            return (stripped_body.strip(), phone_signature)

        signature = signature.group()
        # when we splitlines() and then join them
        # we can lose a new line at the end
        # we did it when identifying a candidate
        # so we had to do it for stripped_body now
        stripped_body = delimiter.join(lines)
        stripped_body = stripped_body[:-len(signature)]

        if phone_signature:
            signature = delimiter.join([signature, phone_signature])

        return (stripped_body.strip(),
                signature.strip())
    except Exception:
        log.exception('ERROR extracting signature')
        return (msg_body, None)
Example #5
0
def extract_from_html_by_plaintext(html_tree, placeholder):
    """Remove quotations from ``html_tree`` using the plain text algorithm.

    A deep copy of the tree gets checkpoints added and is converted to
    plain text; the plain text quotation algorithm then decides which
    lines to drop, and the checkpoints found on those lines select the
    tags to delete from the original ``html_tree``.

    Returns True after ``delete_quotation_tags`` has been applied to
    ``html_tree``.  Returns False, leaving ``html_tree`` untouched, when
    the text has more than MAX_LINES_COUNT lines or when the plain text
    algorithm deleted nothing.
    """
    annotated_tree = deepcopy(html_tree)

    checkpoint_count = html_quotations.add_checkpoint(annotated_tree, 0)
    in_quotation = [False] * checkpoint_count
    annotated_html = html.tostring(annotated_tree)

    # html2text adds unnecessary star symbols.  Mask the genuine ones,
    # drop the ones created by the conversion, then restore the originals.
    star_mask = '3423oorkg432'
    annotated_html = annotated_html.replace('*', star_mask)
    plain_text = textify_html(annotated_html)
    plain_text = plain_text.replace('*', '').replace(star_mask, '*')

    delimiter = get_delimiter(plain_text)
    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    raw_lines = plain_text.splitlines()

    # Don't process too long messages
    if len(raw_lines) > MAX_LINES_COUNT:
        return False

    # For every line: remember the checkpoint numbers it carries
    # (the [4:-4] slice keeps only the number), then strip the checkpoints.
    line_checkpoints = []
    for raw_line in raw_lines:
        tokens = re.findall(html_quotations.CHECKPOINT_PATTERN, raw_line)
        line_checkpoints.append([int(token[4:-4]) for token in tokens])

    lines = []
    for raw_line in raw_lines:
        lines.append(re.sub(html_quotations.CHECKPOINT_PATTERN, '', raw_line))

    # Use plain text quotation extracting algorithm; it reports what it
    # deleted through the return_flags list.
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags
    if not lines_were_deleted:
        return False

    # Collect checkpoints from deleted lines
    for line_no in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[line_no]:
            in_quotation[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(html_tree, in_quotation, placeholder)
    return True
Example #6
0
def extract_from_plain(msg_body):
    """Return the non-quoted part of the provided plain text message.

    The text is preprocessed, its first MAX_LINES_COUNT lines are marked
    and filtered by the quotation algorithm, and the surviving lines are
    joined with the message's own delimiter and postprocessed.
    """
    delimiter = get_delimiter(msg_body)
    prepared = preprocess(msg_body, delimiter)

    # Too long messages are truncated to MAX_LINES_COUNT lines.
    head = prepared.splitlines()[:MAX_LINES_COUNT]
    kept = process_marked_lines(head, mark_message_lines(head))

    # Concatenate lines, then let postprocess undo the preprocessing.
    return postprocess(delimiter.join(kept))
Example #7
0
def extract_from_plain(msg_body):
    """Extracts a non quoted message from provided plain text.

    The text is preprocessed, truncated to MAX_LINES_COUNT lines, marked
    with ``mark_message_lines`` and filtered by ``process_marked_lines``;
    the remaining lines are joined with the delimiter detected in the
    original text and passed through ``postprocess``.
    """
    delimiter = get_delimiter(msg_body)
    msg_body = preprocess(msg_body, delimiter)
    # don't process too long messages
    lines = msg_body.splitlines()[:MAX_LINES_COUNT]
    markers = mark_message_lines(lines)
    lines = process_marked_lines(lines, markers)

    # concatenate lines, change links back, strip and return
    msg_body = delimiter.join(lines)
    msg_body = postprocess(msg_body)
    return msg_body
Example #8
0
def split_emails(msg_body):
    """
    Split a plain text email chain into sections.

    :param msg_body: plain text email chain
    :return: the result of ``partition`` applied to the preprocessed
        message lines and the indices of the lines marked ``'s'`` by
        ``mark_message_lines``
    """
    delimiter = get_delimiter(msg_body)
    lines = quotations.preprocess(msg_body, delimiter).splitlines()
    markers = mark_message_lines(lines)

    # Indices of every line whose marker is 's'; these are the positions
    # at which the chain is partitioned.
    transitions = []
    for index, marker in enumerate(markers):
        if marker == 's':
            transitions.append(index)

    return partition(lines, transitions)
Example #9
0
def _CRLF_to_LF(s):
    r"""Replace CRLF with LF.

    The replacement is applied only when ``get_delimiter`` reports
    ``'\r\n'`` as the delimiter of ``s``; otherwise ``s`` is returned
    as is.

    Returns a ``(text, changed)`` tuple where ``changed`` tells whether
    the replacement was applied.

    >>> s, changed = _CRLF_to_LF('a\r\nb')
    >>> s
    'a\nb'
    >>> changed
    True

    >>> s, changed = _CRLF_to_LF('a\nb')
    >>> s
    'a\nb'
    >>> changed
    False
    """
    delimiter = get_delimiter(s)
    if delimiter == '\r\n':
        return s.replace(delimiter, '\n'), True
    return s, False
Example #10
0
def split_emails(msg):
    """
    Mark the lines of a message, which may be a conversation thread made of
    several emails, and return the markers.

    The message is preprocessed and cut to its first MAX_LINES_COUNT lines
    before ``mark_message_lines`` marks them.  The markers are then passed
    through ``_correct_splitlines_in_headers`` so that split markers inside
    header blocks are corrected.

    Return the corrected markers
    """
    delimiter = get_delimiter(msg)

    # don't process too long messages
    lines = preprocess(msg, delimiter).splitlines()[:MAX_LINES_COUNT]

    # we don't want splitlines in header blocks
    return _correct_splitlines_in_headers(mark_message_lines(lines), lines)
Example #11
0
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.

    Returns ``msg_body`` unchanged when it is blank, when the converted
    text has more than MAX_LINES_COUNT lines, or when nothing was cut at
    all; otherwise returns the serialized tree decoded as UTF-8 text.
    """

    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body,
        parser=html.HTMLParser(encoding="utf-8")
    )

    # The first cutter that removes something wins; the rest are skipped.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree)
                      )

    # Copy taken after the tag-based cuts but before checkpoints are
    # added: this is the tree that is eventually serialized.
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.decode('utf-8').replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [int(i[4:-4])  # Only checkpoint number
         for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)]
        for line in lines]

    # Remove checkpoints
    lines = [re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
             for line in lines]

    # Use plain text quotation extracting algorithm; it reports what it
    # deleted through the return_flags list.
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        if cut_quotations:
            # Decode like the final return does, so every branch hands
            # back text rather than the raw bytes from html.tostring.
            return html.tostring(html_tree_copy).decode('utf-8')
        return msg_body

    # collect checkpoints from deleted lines
    for i in range(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[i]:
            quotation_checkpoints[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(
        html_tree_copy, 0, quotation_checkpoints
    )

    return html.tostring(html_tree_copy).decode('utf-8')
Example #12
0
def test_get_delimiter():
    # (input text, expected delimiter); text without any line break
    # is expected to yield '\n'.
    cases = (
        ('abc\r\n123', '\r\n'),
        ('abc\n123', '\n'),
        ('abc', '\n'),
    )
    for text, expected in cases:
        eq_(expected, u.get_delimiter(text))
Example #13
0
def test_get_delimiter():
    # CRLF text reports CRLF; LF text and text without line breaks
    # both report LF.
    crlf_text, lf_text, single_line = 'abc\r\n123', 'abc\n123', 'abc'

    eq_('\r\n', utils.get_delimiter(crlf_text))
    eq_('\n', utils.get_delimiter(lf_text))
    eq_('\n', utils.get_delimiter(single_line))
Example #14
0
def extract_from_html(msg_body):
    """
    Return the html message body with the quoted content removed,
    combining tag based cuts with the plain text algorithm.

    Tag based step: cut 'gmail_quote' and 'blockquote' tags, Microsoft
    quotations, and quotations found by id or by "from" block.

    Plain text step, for splitters and leftover quotations:
    add checkpoint text to all html tags,
    convert the html to text,
    extract quotations from the text,
    see which checkpoints sat on deleted lines,
    delete the matching tags.

    ``msg_body`` is returned unchanged when it is blank, when the text has
    more than MAX_LINES_COUNT lines, or when neither step cut anything.
    """
    if msg_body.strip() == '':
        return msg_body

    utf8_parser = html.HTMLParser(encoding="utf-8")
    html_tree = html.document_fromstring(msg_body, parser=utf8_parser)

    # Short-circuits: the first cutter that reports a cut ends the chain.
    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))

    # Snapshot taken after the tag based cuts and before any checkpoint
    # is added; this is the tree that gets serialized in the end.
    clean_tree = deepcopy(html_tree)

    checkpoint_count = html_quotations.add_checkpoint(html_tree, 0)
    is_quoted = [False] * checkpoint_count
    marked_html = html.tostring(html_tree)

    converter = html2text.HTML2Text()
    converter.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols: mask the genuine ones,
    # drop the ones the conversion created, then unmask.
    star_mask = '3423oorkg432'
    marked_html = marked_html.replace('*', star_mask)
    plain_text = converter.handle(marked_html)
    plain_text = plain_text.replace('*', '').replace(star_mask, '*')

    delimiter = get_delimiter(plain_text)
    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Checkpoint numbers carried by each line (the slice keeps only the
    # number), followed by the same lines with the checkpoints removed.
    checkpoint_re = html_quotations.CHECKPOINT_PATTERN
    line_checkpoints = [
        [int(token[4:-4]) for token in re.findall(checkpoint_re, line)]
        for line in lines]
    lines = [re.sub(checkpoint_re, '', line) for line in lines]

    # Use plain text quotation extracting algorithm; it reports what it
    # deleted through the return_flags list.
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted:
        if cut_quotations:
            return html.tostring(clean_tree)
        return msg_body

    # Collect checkpoints from deleted lines
    for line_no in xrange(first_deleted, last_deleted):
        for checkpoint in line_checkpoints[line_no]:
            is_quoted[checkpoint] = True

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(clean_tree, 0, is_quoted)

    return html.tostring(clean_tree)
Example #15
0
def test_get_delimiter():
    # Maps each sample text to the delimiter get_delimiter should report.
    expectations = [
        ("abc\r\n123", "\r\n"),
        ("abc\n123", "\n"),
        ("abc", "\n"),
    ]
    for sample, delimiter in expectations:
        eq_(delimiter, u.get_delimiter(sample))
Example #16
0
def preprocess(emails, folder, csv_results):
    """
    v1:
    used to preprocess the marked file(xx_body), to generate the original one(xx_origin),
    the signature part(_sig), and the details information(xx_detail).

    v2:
    used to preprocess the marked emails(xx_body), to generate a csv file contains all the ACTUAL information of the emails.

    ``emails`` is an iterable of file names, each appended to ``folder`` to
    form the path that is parsed.  One row per non-empty email is written
    to the file at ``csv_results``; emails for which no sender or no
    message is parsed are reported on stdout and skipped.
    """
    with open(csv_results, 'w') as csvfile:
        fields = [
            'filename', 'sender', 'origin', 'marked', 'has_sig', 'sig', 'name',
            'title', 'company', 'address', 'number', 'work_number', 'fax',
            'email', 'url', 'slogan', 'quote'
        ]
        # predict_fields = ['p_has_sig','p_sig','p_name','p_title','p_company','p_address','p_number','p_work_number','p_fax','p_email','p_url','p_slogan','p_quote']
        # fields.extend(predict_fields)
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for email in emails:
            filename = folder + email
            sender, msg = parse_msg_sender(filename, sender_known=True)
            if not sender or not msg:
                # Call form prints the same text on Python 2 and also
                # parses on Python 3.
                print('Empty: ' + filename)
                continue
            delim = get_delimiter(msg)
            lines = msg.split(delim)

            sig = []
            found = {}  # details gathered by find_details for this email
            label = -1  # stays -1 unless an annotated signature line is seen
            # Walk the last SIGNATURE_MAX_LINES lines from the bottom up.
            for i in range(1, min(SIGNATURE_MAX_LINES, len(lines)) + 1):
                line = lines[-i]
                if line.startswith(SIGNATURE_ANNOTATION):
                    label = 1
                    line = line[len(SIGNATURE_ANNOTATION):]
                    found, line = find_details(found, line)
                    sig.append(line)
                    # Store the cleaned line back so 'origin' is
                    # annotation-free.
                    lines[-i] = line
            # NOTE(review): these three results are never used below;
            # presumably leftovers from v1 -- confirm build_filename has no
            # side effects before removing the calls.
            origin = build_filename(filename, ORIGIN_SUFFIX)
            details = build_filename(filename, DETAILS_SUFFIX)
            signature = build_filename(filename, SIG_SUFFIX)

            writer.writerow({
                'filename': email,
                'sender': sender,
                'origin': delim.join(lines),
                'marked': msg,
                'has_sig': label,
                # sig was collected bottom-up; reverse to reading order.
                'sig': delim.join(sig[::-1]),
                'name': found.get('name'),
                'title': found.get('title'),
                'company': found.get('company'),
                'address': found.get('address'),
                'number': found.get('num'),
                'work_number': found.get('work_num'),
                'fax': found.get('fax'),
                'email': found.get('email'),
                'url': found.get('url'),
                'slogan': found.get('slogan'),
                'quote': found.get('quote')
            })