Beispiel #1
0
def test_signature_words():
    """Closing phrases ("Thanks!", "Regards," ...) mark the signature start."""
    # Each entry pairs a message with the expected (body, signature) result.
    cases = [
        ('Hey!\n\nThanks!\nRoman',
         ('Hey!', 'Thanks!\nRoman')),
        ('Hey!\n--\nBest regards,\n\nRoman',
         ('Hey!', '--\nBest regards,\n\nRoman')),
        ('Hey!\n--\n--\nRegards,\nRoman',
         ('Hey!', '--\n--\nRegards,\nRoman')),
        ('Hey!\n\nSincerely,\nRoman',
         ('Hey!', 'Sincerely,\nRoman')),
        ('Hey!\n\nTake care,\nRoman',
         ('Hey!', 'Take care,\nRoman')),
    ]
    for message, expected in cases:
        eq_(expected, bruteforce.extract_signature(message))
Beispiel #2
0
def test_blackberry_signature():
    """BlackBerry "sent from" footers are returned as the signature."""
    body = 'Heeyyoooo.'
    footer = (
        'Sent wirelessly from my BlackBerry device on the Bell network.\n'
        'Envoyé sans fil par mon terminal mobile BlackBerry sur le réseau de Bell.'
    )
    eq_((body, footer), bruteforce.extract_signature(body + '\n' + footer))

    footer = 'Enviado desde mi oficina móvil BlackBerry® de Telcel'
    eq_(('Blah', footer), bruteforce.extract_signature('Blah\n' + footer))
Beispiel #3
0
def test_signature_separated_by_long_dashes():
    """A line of one or two em dashes is accepted as a signature separator."""
    for dashes in ('—', '——'):
        signature = dashes + '\nBob Smith'
        message = 'Wow. Awesome!\n' + signature
        eq_(('Wow. Awesome!', signature),
            bruteforce.extract_signature(message))
Beispiel #4
0
def test_blackberry_signature():
    """BlackBerry footers are detected for plain and unicode messages."""
    body = 'Heeyyoooo.'
    footer = (
        'Sent wirelessly from my BlackBerry device on the Bell network.\n'
        'Envoyé sans fil par mon terminal mobile BlackBerry sur le réseau de Bell.'
    )
    eq_((body, footer), bruteforce.extract_signature(body + '\n' + footer))

    # Second case feeds an explicit unicode literal.
    message = u'Blah\nEnviado desde mi oficina móvil BlackBerry® de Telcel'
    expected = ('Blah', u'Enviado desde mi oficina móvil BlackBerry® de Telcel')
    eq_(expected, bruteforce.extract_signature(message))
Beispiel #5
0
def test_blank_lines_inside_signature():
    """Blank lines inside the signature stay part of the signature."""
    signature = '-Lev.\n\nSent from my HTC smartphone!'
    result = bruteforce.extract_signature('Blah.\n\n' + signature)
    eq_(('Blah.', signature), result)

    signature = '--\n\nJohn Doe'
    result = bruteforce.extract_signature('Blah\n' + signature)
    eq_(('Blah', signature), result)
Beispiel #6
0
def test_blank_lines_inside_signature():
    """A signature may contain empty lines without being truncated."""
    # (message, expected body, expected signature)
    cases = (
        ('Blah.\n\n-Lev.\n\nSent from my HTC smartphone!',
         'Blah.', '-Lev.\n\nSent from my HTC smartphone!'),
        ('Blah\n--\n\nJohn Doe', 'Blah', '--\n\nJohn Doe'),
    )
    for message, body, signature in cases:
        eq_((body, signature), bruteforce.extract_signature(message))
Beispiel #7
0
def extract_salutation(mail: mailparser.MailParser) -> str:
    """Return the signature part of the email's message text.

    The brute-force ``extract_signature`` is tried first; when it reports no
    signature (``None``), ``talon.signature.extract`` is used instead with
    ``mail.from_[0][1]`` as the sender.

    NOTE(review): the function name says "salutation", but the value returned
    is element ``[1]`` of both extractors' results, i.e. the signature.
    """
    # Compute the message text once; both extractors work on the same input.
    message = extract_message(mail)
    result = extract_signature(message)[1]
    if result is None:
        result = talon.signature.extract(message, mail.from_[0][1])[1]
    return result
def email_pre_process(text, nlp, filters):
    """Strip the signature from ``text`` and run it through the cleaning steps.

    Steps, in order: ``extract_signature`` (signature discarded),
    ``remove_regex`` with ``filters['regex']``, ``remove_text`` with
    ``filters['text']``, ``spacy_pipeline`` with ``nlp``. The result is
    returned with surrounding whitespace stripped.
    """
    body, _signature = extract_signature(text)
    without_patterns = remove_regex(body, filters['regex'])
    without_phrases = remove_text(without_patterns, filters['text'])
    processed = spacy_pipeline(without_phrases, nlp)
    return processed.strip()
Beispiel #9
0
def test_blank_lines_inside_signature():
    """Empty lines between signature lines do not end the signature."""
    message = "Blah.\n\n-Lev.\n\nSent from my HTC smartphone!"
    expected = ("Blah.", "-Lev.\n\nSent from my HTC smartphone!")
    eq_(expected, bruteforce.extract_signature(message))

    message = "Blah\n--\n\nJohn Doe"
    expected = ("Blah", "--\n\nJohn Doe")
    eq_(expected, bruteforce.extract_signature(message))
Beispiel #10
0
def extract_signatures_rb(emails):
    """Split every email into body and signature with talon's brute-force extractor.

    Returns a ``(bodies, signatures)`` pair of parallel lists. Each signature
    is passed through ``str()``, so a missing signature (``None``) becomes the
    string ``'None'``.
    """
    from talon.signature.bruteforce import extract_signature

    bodies = []
    signatures = []
    for email in emails:
        body, signature = extract_signature(email)
        bodies.append(body)
        signatures.append(str(signature))
    return bodies, signatures
Beispiel #11
0
def strip_sig_footer(bodytext):
    """Return ``bodytext`` with footers and the trailing signature removed.

    Footers are removed first via ``strip_footers(bodytext, False)``; the
    result is then split by talon's brute-force ``extract_signature`` and
    only the content part is returned.
    """
    footer_free = strip_footers(bodytext, False)
    body, _signature = extract_signature(footer_free)
    return body
Beispiel #12
0
def test_line_starts_with_signature_word():
    """A body line beginning with "Thanks" is not taken as the signature start."""
    body = 'Hey man!\nThanks for your attention.'
    signature = '--\nThanks!\nRoman'
    eq_((body, signature),
        bruteforce.extract_signature(body + '\n' + signature))
Beispiel #13
0
def test_line_starts_with_signature_word():
    """Only the dashed block is the signature, not the earlier "Thanks for ..." line."""
    message = 'Hey man!\nThanks for your attention.\n--\nThanks!\nRoman'
    expected = ('Hey man!\nThanks for your attention.', '--\nThanks!\nRoman')
    actual = bruteforce.extract_signature(message)
    eq_(expected, actual)
Beispiel #14
0
def test_line_starts_with_signature_word():
    """A sentence starting with a signature word stays in the body."""
    lines = ["Hey man!", "Thanks for your attention.", "--", "Thanks!", "Roman"]
    # First two lines are the body, the remaining three the signature.
    expected = ("\n".join(lines[:2]), "\n".join(lines[2:]))
    eq_(expected, bruteforce.extract_signature("\n".join(lines)))
Beispiel #15
0
def test_signature_separated_by_dashes():
    """Dash-led lines ("---", "--", "-name", "- name") introduce the signature."""
    # Each entry pairs a message with the expected (body, signature) result.
    cases = [
        ("Hey man! How r u?\n---\nRoman", ("Hey man! How r u?", "---\nRoman")),
        ("Hey!\n-roman", ("Hey!", "-roman")),
        ("Hey!\n\n- roman", ("Hey!", "- roman")),
        ("Wow. Awesome!\n--\nBob Smith", ("Wow. Awesome!", "--\nBob Smith")),
    ]
    for message, expected in cases:
        eq_(expected, bruteforce.extract_signature(message))
Beispiel #16
0
def test_signature_max_lines_ignores_empty_lines():
    """A signature with several blank lines in it is still extracted whole."""
    body = 'Thanks,\nBlah'
    signature = 'regards\n\n\nJohn Doe'
    eq_((body, signature),
        bruteforce.extract_signature(body + '\n\n' + signature))
Beispiel #17
0
def test_signature_cant_start_from_first_line():
    """A leading "Thanks," line stays in the body; the signature starts at "regards"."""
    body = 'Thanks,\n\nBlah'
    signature = 'regards\n\nJohn Doe'
    eq_((body, signature),
        bruteforce.extract_signature(body + '\n\n' + signature))
Beispiel #18
0
def test_line_starts_with_dashes():
    """Body lines starting with "-->" are not mistaken for a signature separator."""
    body = 'Hey man!\nLook at this:\n\n--> one\n--> two'
    signature = '--\nRoman'
    eq_((body, signature),
        bruteforce.extract_signature(body + '\n' + signature))
Beispiel #19
0
def test_signature_max_lines_ignores_empty_lines():
    """Blank lines inside the signature do not stop it from being detected."""
    message = 'Thanks,\nBlah\n\nregards\n\n\nJohn Doe'
    expected = ('Thanks,\nBlah', 'regards\n\n\nJohn Doe')
    actual = bruteforce.extract_signature(message)
    eq_(expected, actual)
Beispiel #20
0
def test_signature_cant_start_from_first_line():
    """The first line is never the signature start, even if it reads "Thanks,"."""
    message = 'Thanks,\n\nBlah\n\nregards\n\nJohn Doe'
    expected = ('Thanks,\n\nBlah', 'regards\n\nJohn Doe')
    actual = bruteforce.extract_signature(message)
    eq_(expected, actual)
Beispiel #21
0
def test_line_starts_with_dashes():
    """Arrow-style "--> item" lines stay in the body; the bare "--" starts the signature."""
    message = 'Hey man!\nLook at this:\n\n--> one\n--> two\n--\nRoman'
    expected = ('Hey man!\nLook at this:\n\n--> one\n--> two', '--\nRoman')
    actual = bruteforce.extract_signature(message)
    eq_(expected, actual)
Beispiel #22
0
def test_signature_words():
    """Sign-off words such as "Thanks!" and "Regards," begin the signature."""
    # Each entry pairs a message with the expected (body, signature) result.
    cases = [
        ("Hey!\n\nThanks!\nRoman", ("Hey!", "Thanks!\nRoman")),
        ("Hey!\n--\nBest regards,\n\nRoman",
         ("Hey!", "--\nBest regards,\n\nRoman")),
        ("Hey!\n--\n--\nRegards,\nRoman", ("Hey!", "--\n--\nRegards,\nRoman")),
    ]
    for message, expected in cases:
        eq_(expected, bruteforce.extract_signature(message))
Beispiel #23
0
def test_signature_separated_by_dashes():
    """Lines that open with one or more dashes separate body and signature."""
    signature = '---\nRoman'
    eq_(('Hey man! How r u?', signature),
        bruteforce.extract_signature('Hey man! How r u?\n' + signature))

    signature = '-roman'
    eq_(('Hey!', signature),
        bruteforce.extract_signature('Hey!\n' + signature))

    signature = '- roman'
    eq_(('Hey!', signature),
        bruteforce.extract_signature('Hey!\n\n' + signature))

    signature = '--\nBob Smith'
    eq_(('Wow. Awesome!', signature),
        bruteforce.extract_signature('Wow. Awesome!\n' + signature))
Beispiel #24
0
def test_line_starts_with_dashes():
    """Only the bare "--" line starts the signature, not the "--> ..." lines."""
    lines = ["Hey man!", "Look at this:", "", "--> one", "--> two", "--", "Roman"]
    # The last two lines ("--", "Roman") form the expected signature.
    expected = ("\n".join(lines[:-2]), "\n".join(lines[-2:]))
    eq_(expected, bruteforce.extract_signature("\n".join(lines)))
def extract_mail(input):
    """Return the mail text without its signature, unless the "signature" has a verb.

    ``=``, ``*`` and ``_`` are first replaced by ``-`` before the text is
    handed to ``extract_signature``.

    Returns:
        The normalised full text when a signature was found and
        ``detect_verb`` flags it (so nothing is cut off); otherwise the text
        part returned by ``extract_signature``.
    """
    normalized = input.replace("=", "-").replace("*", "-").replace("_", "-")
    text, signature = extract_signature(normalized)

    # The original compared ``type(signature) is "string"``, which is never
    # true, so the signature branches below could not run.
    if isinstance(signature, str):
        print("Detected sig")
        print(signature)
        # Comparison kept as ``== True`` because detect_verb's return type is
        # not visible here; plain truthiness could change which values count.
        if detect_verb(signature) == True:  # noqa: E712
            print("Signature has V")
            return normalized
    return text
Beispiel #26
0
def test_signature_separated_by_dashes():
    """Signatures introduced by "---", "--", "-name" or "- name" are split off."""
    # (message, expected body, expected signature)
    cases = (
        ('Hey man! How r u?\n---\nRoman', 'Hey man! How r u?', '---\nRoman'),
        ('Hey!\n-roman', 'Hey!', '-roman'),
        ('Hey!\n\n- roman', 'Hey!', '- roman'),
        ('Wow. Awesome!\n--\nBob Smith', 'Wow. Awesome!', '--\nBob Smith'),
    )
    for message, body, signature in cases:
        eq_((body, signature), bruteforce.extract_signature(message))
Beispiel #27
0
def test_signature_words():
    """Messages ending in a sign-off phrase are split at that phrase or its dashes."""
    signature = 'Thanks!\nRoman'
    eq_(('Hey!', signature),
        bruteforce.extract_signature('Hey!\n\n' + signature))

    signature = '--\nBest regards,\n\nRoman'
    eq_(('Hey!', signature),
        bruteforce.extract_signature('Hey!\n' + signature))

    signature = '--\n--\nRegards,\nRoman'
    eq_(('Hey!', signature),
        bruteforce.extract_signature('Hey!\n' + signature))
Beispiel #28
0
def remove_signature(message):
    """Return ``message`` with its signature removed.

    Trailing whitespace is stripped from every line first. If any line is
    exactly ``--`` the brute-force ``extract_signature`` is used; otherwise
    ``sig.extract`` is called with a placeholder sender address.
    """
    # The original wrapped this in ``try/except: pass``. Had the except ever
    # fired, ``msg`` would have been unbound and the next line would raise
    # NameError, so the guard is dropped.
    lines = [line.rstrip() for line in message.split('\n')]
    message = '\n'.join(lines)

    if '--' in lines:
        text, _signature = extract_signature(message)
    else:
        text, _signature = sig.extract(message, sender='*****@*****.**')

    return text
Beispiel #29
0
def preprocess(emails):
    """
    Clean every entry of ``emails`` in place (nothing is returned):
        1. Drop the signature found by ``extract_signature`` (the original
           note says only English emails are supported).
        2. Strip each line, discard empty ones and join the rest with a
           single space, which removes all newline characters.
    """
    for index in range(len(emails)):
        body, _ = extract_signature(emails[index])
        trimmed = (line.strip() for line in body.split('\n'))
        emails[index] = ' '.join(piece for piece in trimmed if piece != '')
Beispiel #30
0
def test_signature_line_too_long_ignores_urls():
    """A long URL line inside the signature does not prevent extraction."""
    body = 'Thanks,\n\nthis is a test'
    signature = '\n'.join([
        '--',
        'Testy McTesterson',
        'CEO, Test, Inc.',
        '100 Test St, Ste 100 | Austin, TX 78701',
        '<https://maps.google.com/?q=100+Test+St,+Ste+100+%7C+Austin,+TX+78701&entry=gmail&source=g>',
    ])
    # The message ends with a newline that is absent from the expected signature.
    message = body + '\n\n' + signature + '\n'
    eq_((body, signature), bruteforce.extract_signature(message))
    def clean_body(self, mail_body):
        """Split a mail body into own text, forwarded chain and signature.

        The body is cut at whichever marker appears earliest; the text after
        that marker (the marker itself excluded) becomes the chain. The text
        before it is passed to ``extract_signature``.

        Returns:
            dict with keys ``'body'``, ``'chain'`` (``None`` when no marker
            was found) and ``'signature'``.
        """
        markers = ("-----Original Message-----", "To:", "From")
        shortest_head = sys.maxsize
        for marker in markers:
            pieces = mail_body.split(marker, 1)
            head_length = len(pieces[0])
            # "<=" rather than "<": on equal lengths the later marker wins.
            if head_length <= shortest_head:
                shortest_head, best_split = head_length, pieces

        # A one-element split means the chosen marker did not occur at all.
        chain = best_split[1] if len(best_split) != 1 else None
        # Talon's extractor separates the remaining text from its signature.
        body, signature = extract_signature(best_split[0])
        return {'body': body, 'chain': chain, 'signature': signature}
Beispiel #32
0
def parse_reply(filename):
    """
    Parse a raw reply file and write its derived text files next to it.

    Args:
        filename: str, full path of .raw.html file

    Output files replace the '.raw.html' suffix with:
        .reply.body.txt, .reply.title_body.txt,
        .reply.body_no_signature.txt, .reply.title_body_no_signature.txt,
        .reply.body_tags.txt (JSON dump of the parsed tags)
    """
    with open(filename, 'r') as source:
        raw = source.read()

    title = parse_reply_title(raw)
    bodyhtml, bodytext = parse_reply_body(raw)
    # talon bruteforce technique to separate the signature from the content
    content, _sig = extract_signature(bodytext)

    # (suffix, chunks written in order) for each plain-text output file
    text_outputs = (
        ('.reply.body.txt', (bodytext,)),
        ('.reply.title_body.txt', (title, bodytext)),
        ('.reply.body_no_signature.txt', (content,)),
        ('.reply.title_body_no_signature.txt', (title, content)),
    )
    for suffix, chunks in text_outputs:
        with open(filename.replace('.raw.html', suffix), 'w') as out:
            for chunk in chunks:
                out.write(chunk)

    # Tags are parsed only after all text files have been written.
    tag_data = parse_reply_tags(bodyhtml)
    with open(filename.replace('.raw.html', '.reply.body_tags.txt'), 'w') as out:
        out.write(json.dumps(tag_data))
Beispiel #33
0
    def __init__(self, email_string):
        """
        Parse a raw email string and populate the instance attributes.

        Sets ``str``, ``raw``, ``recipients``, ``sender``, ``subject``,
        ``id``, ``date``, ``content_encoding``, ``full_body``, ``body`` and
        ``signature``.

        Args:
            email_string: the raw email text handed to ``mime.from_string``.

        NOTE(review): ``full_body`` is assigned only for single-part messages
        or multipart messages with a 'text/plain' part; otherwise the
        attribute stays unset and the signature step below would raise
        AttributeError - confirm whether such mails can reach this code.
        """
        self.str = email_string
        self.raw = mime.from_string(self.str)

        # Recipients: a missing 'To' header gives an empty list; a value with
        # a comma is parsed as a list, anything else as a single address.
        to = self.raw.headers['To']
        if to is None:
            self.recipients = []
        else:
            to = to.lower()
            self.recipients = address.parse_list(to) if ',' in to else [address.parse(to)]

        # It's possible a recipient is None if it is something like
        # 'Undisclosed recipients:;'
        self.recipients = [r for r in self.recipients if r is not None]
        # NOTE(review): unlike 'To', a missing 'From' header is not guarded;
        # ``.lower()`` would fail on None.
        self.sender = address.parse(self.raw.headers['From'].lower())

        self.subject = self.raw.subject
        self.id = self.raw.message_id
        self.date = parse(self.raw.headers['Date'])
        self.content_encoding = self.raw.content_encoding[0]

        # Extract plaintext body: the whole body for single-part mail, or the
        # first 'text/plain' part of a multipart mail.
        if self.raw.content_type.is_singlepart():
            self.full_body = self.raw.body
        elif self.raw.content_type.is_multipart():
            for p in self.raw.parts:
                if p.content_type == 'text/plain':
                    self.full_body = p.body
                    break

        # Try to get signature (brute-force extractor first)
        self.body, self.signature = extract_signature(self.full_body)

        # Try ML approach if necessary, i.e. when no signature was found above
        if self.signature is None:
            self.body, self.signature = signature.extract(self.full_body, sender=self.sender)

        # Get replies only, not the quotes
        self.body = quotations.extract_from(self.body, 'text/plain')
Beispiel #34
0
def remove_signature(message):
    """Take the message as a string and return it without its signature.

    Method 1 (a line equal to ``--`` exists): brute-force
    ``extract_signature``. Method 2 (otherwise): the ML-based
    ``sig.extract``; the original comment states that the sender argument is
    required but any fake email id can be used.
    """
    # Strip trailing whitespace line by line. The original wrapped this in
    # ``try/except: pass``; had the except ever fired, ``msg`` would have been
    # unbound and the following line would raise NameError.
    lines = [line.rstrip() for line in message.split('\n')]
    message = '\n'.join(lines)

    if '--' in lines:
        text, _signature = extract_signature(message)
    else:
        text, _signature = sig.extract(message, sender='*****@*****.**')

    return text
def parse_email_quotes():
    """
    Run through each file in archive and add 'clean_body' and 'signature' to
    each email's information.

    Only files ending in ".email.json" are processed, and a file is rewritten
    only if it lacks either key. The running file index is printed every
    1000 directory entries as a progress indicator.
    """
    talon.init()
    archive_dir = "archive/"
    for filenum, filename in enumerate(os.listdir(archive_dir)):
        if filenum % 1000 == 0:
            # Single-argument call form prints the same on Python 2 and 3.
            print(filenum)
        if not filename.endswith(".email.json"):
            continue

        full_filename = os.path.join(archive_dir, filename)
        with open(full_filename, "r") as fh:
            email_data = load(fh)

        if "clean_body" in email_data and "signature" in email_data:
            continue

        reply_body = naive_quote_removal(email_data['body'])
        email_data['clean_body'], email_data['signature'] = extract_signature(reply_body)

        # Serialise before opening for writing, so a failing dumps() cannot
        # leave a truncated file behind.
        serialized = dumps(email_data)
        with open(full_filename, "w") as fh:
            fh.write(serialized)
Beispiel #36
0
def get_cleaned_email(parsed_email):
    """Return the cleaned (title, body) of ``parsed_email``.

    The first plain-text part is preferred and has its signature stripped;
    the first HTML part is used only when there is no plain text. Returns
    ``(None, None)`` when neither part exists.
    """
    plain = parsed_email.text_plain[0] if parsed_email.text_plain else None
    html = parsed_email.text_html[0] if parsed_email.text_html else None
    if not (plain or html):
        return None, None

    if not plain:
        # extract_signature seems to not support html code as input, so the
        # HTML path skips signature removal.
        return clean_email_html(html)

    title, body = clean_email_text(plain)
    body, signature = extract_signature(body)
    print(f"striped out signature in the email: {signature}")
    # TODO optionally: if signature is None (possibly because it was not
    # recognized), additionally apply talon's signature.extract(body,
    # sender=...) to the body.
    return (title, body)
def parse_email_quotes():
    """
    Run through each file in archive and add 'clean_body' and 'signature' to
    each email's information.

    Files not ending in ".email.json" are skipped, as are files that already
    contain both keys. Progress (the directory index) is printed every 1000
    entries.
    """
    talon.init()
    archive_dir = "archive/"
    for filenum, filename in enumerate(os.listdir(archive_dir)):
        if filenum % 1000 == 0:
            # Parenthesised single argument prints the same on Python 2 and 3.
            print(filenum)
        if not filename.endswith(".email.json"):
            continue

        full_filename = os.path.join(archive_dir, filename)
        # Context managers close the handle even if parsing raises.
        with open(full_filename, "r") as fh:
            email_data = load(fh)

        if "clean_body" in email_data and "signature" in email_data:
            continue

        reply_body = naive_quote_removal(email_data['body'])
        clean_body, signature = extract_signature(reply_body)
        email_data['clean_body'] = clean_body
        email_data['signature'] = signature

        # Build the output first; opening with "w" truncates the file, so a
        # failure in dumps() afterwards would lose the stored data.
        serialized = dumps(email_data)
        with open(full_filename, "w") as fh:
            fh.write(serialized)
Beispiel #38
0
def process_sign(file):
    """Tokenise the signature-free text of the email stored in ``file``.

    Lines containing any entry of ``exclude_headers`` are dropped. The
    sender is the last space-separated token of the third line.
    ``extract_signature_ml`` is tried first, and the brute-force
    ``extract_signature`` is the fallback when no signature is found.

    Returns:
        Lower-cased ``nltk.wordpunct_tokenize`` tokens of the message text
        with the marker ``'[SIGN]'`` appended.
    """
    # Only reading needs the file; close it before the heavier processing.
    with open(file, 'r') as fp:
        lines = fp.readlines()

    # NOTE(review): the token keeps the line's trailing newline, as in the
    # original code.
    sender = lines[2].split(' ')[-1]

    stripped = [
        line for line in lines
        if not any(header in line for header in exclude_headers)
    ]
    email_stripped = ''.join(stripped)

    msg, signature = extract_signature_ml(email_stripped, sender)
    if signature is None:
        msg, signature = extract_signature(email_stripped)

    return [token.lower() for token in nltk.wordpunct_tokenize(msg + '[SIGN]')]
Beispiel #39
0
# Scoring script excerpt: compares talon's signature extraction with the
# signature data returned by csv_munge, one message at a time.
# NOTE(review): signature_lines_total and signature_lines_talon are updated
# below but not initialised in this excerpt; presumably they are set up
# earlier in the full script.

# Counters for authored-content lines; nothing in this excerpt updates them.
ac_lines_talon = 0.0
ac_lines_correct = 0.0

for message_id in dataset:

    # Reference signature / authored content for this message
    # (presumably hand-labelled data - confirm against csv_munge).
    csv_signature, csv_authored_content = csv_munge(message_id)

    # print message_id
    # print csv_signature
    # print csv_authored_content
    # print "-------"

    text = dataset[message_id]

    # find talon signatures: element [1] of the result is the signature,
    # split into lines here; a falsy signature gives an empty list
    results = extract_signature(text)
    if results[1]:
        talon_signature = results[1].split('\n')
    else:
        talon_signature = []

    #find talon authored content
    talon_authored_content = quotations.extract_from_plain(text).split('\n')

    # do a comparative scoring of results found
    # (only when at least one side reports a signature)
    if len(talon_signature) > 0 or len(csv_signature) > 0:
        required = set(csv_signature)
        signature_lines_total += len(csv_signature)
        # count the non-empty signature lines talon produced
        for line in talon_signature:
            if len(line) > 0:
                signature_lines_talon += 1.0
Beispiel #40
0
def extract_signatures_rb(emails):
    """Return the brute-force-extracted signature of every email, as strings.

    Each signature goes through ``str()``, so an email without a detected
    signature (``None``) yields the string ``'None'``.
    """
    items = [extract_signature(email) for email in emails]
    # The bodies are not needed here; only the signature half is kept.
    return [str(signature) for _, signature in items]
Beispiel #41
0
# Scoring script excerpt (truncated: the final ``if`` has no body in this
# excerpt). It compares talon's signature extraction with the signature data
# returned by csv_munge, one message at a time.
# NOTE(review): signature_lines_total is updated below but not initialised
# in this excerpt; presumably it is set up earlier in the full script.

# Counters for authored-content lines; nothing in this excerpt updates them.
ac_lines_talon = 0.0
ac_lines_correct = 0.0

for message_id in dataset:
	
	# Reference signature / authored content for this message
	# (presumably hand-labelled data - confirm against csv_munge).
	csv_signature, csv_authored_content = csv_munge(message_id)

	# print message_id
	# print csv_signature
	# print csv_authored_content
	# print "-------"

	text = dataset[message_id]

	# find talon signatures: element [1] of the result is the signature,
	# split into lines here; a falsy signature gives an empty list
	results = extract_signature(text)
	if results[1]:
		talon_signature = results[1].split('\n')
	else:
		talon_signature = []

	#find talon authored content
	talon_authored_content = quotations.extract_from_plain(text).split('\n')


	# do a comparative scoring of results found
	# (only when at least one side reports a signature)
	if len(talon_signature) > 0 or len(csv_signature) > 0:
		required = set(csv_signature)
		signature_lines_total += len(csv_signature)
		for line in talon_signature:
			if len(line) > 0:
Beispiel #42
0
def test_crash_in_extract_signature():
    """An internal error must yield ``(msg_body, None)`` instead of raising.

    Without a forced failure this input has a detectable signature
    ("-roman"), so the assertion can only hold while extraction is made to
    crash. ``get_delimiter`` is patched to raise for that purpose.
    """
    # Function-scope import so the test does not rely on file-level imports.
    from unittest import mock

    msg_body = '''Hey!
-roman'''
    failing = mock.Mock(side_effect=Exception("oops"))
    with mock.patch.object(bruteforce, 'get_delimiter', failing):
        eq_((msg_body, None), bruteforce.extract_signature(msg_body))
Beispiel #43
0
def test_no_signature():
    """A single-line message is returned unchanged with ``None`` as signature."""
    message = 'Hey man!'
    expected = (message, None)
    eq_(expected, bruteforce.extract_signature(message))
Beispiel #44
0
def test_signature_only():
    """A message consisting only of a signature block is left as the body."""
    message = '--\nRoman'
    expected = (message, None)
    eq_(expected, bruteforce.extract_signature(message))
Beispiel #45
0
def test_iphone_signature():
    """The "Sent from my iPhone!" footer is recognised as the signature."""
    footer = 'Sent from my iPhone!'
    message = 'Hey!\n\n' + footer
    eq_(('Hey!', footer), bruteforce.extract_signature(message))
Beispiel #46
0
def test_empty_body():
    """An empty message gives an empty body and no signature."""
    expected = ('', None)
    actual = bruteforce.extract_signature('')
    eq_(expected, actual)
Beispiel #47
0
def test_mailbox_for_iphone_signature():
    """The "Sent from Mailbox for iPhone" footer is recognised as the signature."""
    footer = "Sent from Mailbox for iPhone"
    message = "Blah\n" + footer
    eq_(("Blah", footer), bruteforce.extract_signature(message))
Beispiel #48
0
__author__ = 'a_medelyan'
import talon
from talon import quotations
from talon.signature.bruteforce import extract_signature

# Demo script: run talon's quotation and brute-force signature extraction on
# one sample email and print both results.
# NOTE(review): the print statements below use Python 2 syntax.
talon.init()

# Sample message: a short reply, a contact block, then a ">"-quoted original.
text = "The price is still 91.87.\n\nKeoni Almeida\nCalifornia Independent System Operator\nphone: 916/608-7053\npager:  916/814-7352\nalpha page:  [email protected]\ne-mail:  <mailto:[email protected]>\n\n\n\n> -----Original Message-----\n> From:\tCRCommunications\n> Sent:\tFriday, June 22, 2001 11:34 AM\n> To:\tISO Market Participants\n> Subject:\tCAISO Notice: Update to June 20 Market Notice\n>\n>  <<MARKET NOTICE 010622_.doc>>\n>\n> Market Participants:\n> Please read the attached explanation of Footnote 14 in the California ISO\n> June 20, 2001, Market Notice.\n>\n> CR Communications\n> Client Relations Communications\n\n - MARKET NOTICE 010622_.doc"

# Reply text as returned by the plain-text quotation extractor.
reply = quotations.extract_from_plain(text)
# Element [1] of the brute-force result is the signature.
signature = extract_signature(text)[1]

print "Reply: ", reply

print "Signature: ", signature