"""parse email and return email_data named tuple
    subject parameter only applies to _ref_num parsing"""

    return EmailData(_ref_num(email, subject), _phone(email),
                     _first_name(email), _last_name(email), _address(email),
                     _email(email))


# +
# Lazily enumerate every .msg file in the test folder.
emails = Path(r"emails_test/").glob("*.msg")

# Collect one EmailData record per message file; each message is opened as a
# context manager so it is closed as soon as its fields have been parsed.
emails_data = []
for email_path in emails:
    with extract_msg.Message(email_path) as msg:
        emails_data.append(email_regex(msg.body, msg.subject))

# -


def strip_string(feature: str) -> str:
    """Return ``feature`` without leading or trailing whitespace.

    Carriage returns and newlines are whitespace, so a single ``strip()``
    removes them together with spaces and tabs at both ends.
    """
    cleaned = feature.strip()
    return cleaned


# +
import csv
import extract_msg, sys, re

# Ask for the message name (without extension) and read its header fields.
inputmail = input("Name of the msg file: ")

mail = inputmail + r'.msg'
msg = extract_msg.Message(mail)
msg_sender = msg.sender
msg_date = msg.date
msg_subj = msg.subject
msg_message = msg.body

# The report is written next to the script as Results_<name>.msg.txt.
result = r'Results_' + mail + r'.txt'

# Raw string: the pattern contains backslash escapes (\( \) \,) that are
# invalid escape sequences in a normal string literal. The value is unchanged.
regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# The context manager guarantees the report is flushed and closed even if a
# write fails part-way through.
with open(result, 'w') as f:
    f.write('Results from .msg uploaded: \n \n')
    f.write('Sender: {}\n'.format(msg_sender))
    f.write('Sent On: {}\n'.format(msg_date))
    f.write('Subject: {}\n'.format(msg_subj))

    f.write('\nLinks detected >>>>>>>>>>>>>>>>>>>>>>>>> \n')
    # A message without a body yields no links instead of a TypeError.
    match = re.findall(regex, msg_message or '')

    for m in match:
        f.write('<{}\n'.format(m))

print('Done, check your directory for the results')
# Dump the main fields of the message to stdout.
# print() with a single parenthesised argument is valid on both Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
# NOTE(review): the attribute names (SentOn, To, CC, BCC, Attachments.Item)
# look like an Outlook COM mail item rather than an extract_msg.Message;
# confirm where `msg` and `outlook` are created.
print(msg.SentOn)
print(msg.To)
print(msg.CC)
print(msg.BCC)
print(msg.Subject)
print(msg.Body)

count_attachments = msg.Attachments.Count
if count_attachments > 0:
    for item in range(count_attachments):
        # Item() is called with item + 1, i.e. indices 1..Count.
        print(msg.Attachments.Item(item + 1).Filename)

# Drop the references to the application and message objects.
del outlook, msg

#  need to run
#  pip install extract-msg
#  pip install imapclient

import extract_msg

f = r'MS_Outlook_file.msg'  # Replace with yours

# Open the message as a context manager so it is closed once the fields have
# been copied out, instead of staying open for the rest of the script.
with extract_msg.Message(f) as msg:
    msg_sender = msg.sender
    msg_date = msg.date
    msg_subj = msg.subject
    msg_message = msg.body

print(f'Sender: {msg_sender}')
print(f'Sent On: {msg_date}')
print(f'Subject: {msg_subj}')
print(f'Body: {msg_message}')
Example #4
0
 def parse_message(self, file_path: str) -> OutlookMessage:
     """Open the Outlook ``.msg`` file at ``file_path``.

     Returns the ``extract_msg.Message`` object built from that path.
     """
     message = extract_msg.Message(file_path)
     return message
Example #5
0
                # An entry with a non-empty 'file_out' is still being converted.
                if locked_dict[x]['file_out'] != '':
                    # Give up once the entry is older than 900 seconds.
                    if dt_local.diff(locked_dict[x]['ts']).in_seconds() > 900:
                        # Keep the old entry before resetting the slot: the log
                        # line below needs its 'old_file' and 'pronom' values,
                        # which the fresh {'file_out': ''} dict does not have
                        # (reading them after the reset raised KeyError).
                        timed_out = locked_dict[x]
                        locked_dict[x] = {'file_out': ''}
                        logging(timed_out['old_file'] + '\t(NOT CONVERTED TIMED OUT ' + timed_out['pronom'] + ')')
                        # NOTE(review): `pronom` comes from the enclosing scope,
                        # not from the timed-out entry; confirm that is intended.
                        results['timedout'].append(pronom)
                        results['stats']['timedout'] += 1
# Wait one minute before checking the results.
time.sleep(60)

# Print every command whose second element is a path that does not exist.
for cmd in cmds_list:
    if os.path.exists(cmd[1]):
        continue
    print(cmd)

# Stop every container held in the mapping.
for name in containers:
    containers[name].stop()
###
# Convert each Outlook message to a plain-text file.
# Each `email` entry is indexed as: [0] source .msg path, [1] output text
# path, [2] first argument for conversion_stats, [3] a path whose directory
# receives the original file when conversion fails.
for email in email_list:
    try:
        msg = extract_msg.Message(email[0])
        output_text = '==============================================================================\n'
        output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n'
        output_text += f'Avsender:\t{msg.sender}\n'
        output_text += 'Mottaker(e):\t' + ', '.join([f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n'
        output_text += 'Vedlegg:\t' + ', '.join([f'<{x.longFilename}>' for x in msg.attachments]) + '\n'
        output_text += f'Emne:\t{msg.subject}\n'
        output_text += f'==============================================================================\n{msg.body}'
        with open(email[1], 'w') as outlook_out:
            outlook_out.write(output_text)
        conversion_stats(email[2], 'converted', email[0], email[1])
        logging(email[1] + '\t(converted)')
        pbar.set_postfix(file=pathlib.Path(email[0]).name + ' (converted)', refresh=False)
        pbar.update(1)
    except Exception:
        # Best effort: any conversion failure falls back to copying the
        # original message. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt and SystemExit still stop the run.
        copy(email[0], os.path.dirname(email[3]))
Example #6
0
## Convert with LibreOffice
if pronom_type[args['input']['pronom']]['convert'] == 'libreoffice':
    # Pass an argument list and no shell: the paths are interpolated from
    # input data, and an unquoted shell string breaks on spaces and lets shell
    # metacharacters in a file name run as commands.
    subprocess.run(
        [
            'libreoffice',
            '--headless',
            '--convert-to',
            'pdf',
            '--outdir',
            f"{output_dir}/{os.path.dirname(args['input']['filename'])}",
            f"{input_dir}/{args['input']['filename']}",
        ],
        stdout=subprocess.DEVNULL,
        timeout=180)
    args['output'][
        'filename'] = f"{os.path.dirname(args['input']['filename'])}/{pathlib.Path(args['input']['filename']).stem}.pdf"
## Convert Outlook messages to plain text
if pronom_type[args['input']['pronom']]['convert'] == 'email':
    # Make sure the output directory exists before writing into it.
    pathlib.Path(
        f"{output_dir}/{os.path.dirname(args['input']['filename'])}").mkdir(
            parents=True, exist_ok=True)
    msg = extract_msg.Message(f"{input_dir}/{args['input']['filename']}")
    args['output'][
        'filename'] = f"{os.path.dirname(args['input']['filename'])}/{pathlib.Path(args['input']['filename']).stem}.txt"
    output_text = '==============================================================================\n'
    output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n'
    output_text += f'Avsender:\t{msg.sender}\n'
    output_text += 'Mottaker(e):\t' + ', '.join(
        [f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n'
    output_text += 'Vedlegg:\t' + ', '.join(
        [f'<{x.longFilename}>' for x in msg.attachments]) + '\n'
    output_text += f'Emne:\t{msg.subject}\n'
    output_text += f'==============================================================================\n{msg.body}'
    # Write the text itself as UTF-8. str(text.encode('utf8')) would store
    # the bytes repr ("b'...'" with escaped newlines) instead of the message.
    with open(f"{output_dir}/{args['output']['filename']}",
              'w',
              encoding='utf8') as outlook_out:
        outlook_out.write(output_text)
Example #7
0
 def _open(self, file_object):
     """Wrap ``file_object`` in an ``extract_msg.Message`` and keep it on ``self._archive``."""
     archive = extract_msg.Message(file_object)
     self._archive = archive
def open_message(filepath):
    """Return an ``extract_msg.Message`` opened from ``filepath``."""
    message = extract_msg.Message(filepath)
    return message
Example #9
0
            conversion_stats(pronom_name, 'unconverted', args_output['gammel']['filename'])
#Libre-office konvertering
        if pronom_type[pronom_name]['convert'] == 'libreoffice':
            try:
                subprocess.run(['libreoffice --headless --convert-to pdf --outdir ' + shlex.quote(os.path.dirname(args_output['gammel']['filename'])) + ' ' + shlex.quote(file_name)], shell=True, stdout=subprocess.DEVNULL, timeout=360)
                args_output['ny']['filename'] = os.path.dirname(args_output['gammel']['filename']) + '/' + pathlib.Path(args_output['gammel']['filename']).stem + '.pdf'
                conversion_stats(pronom_name, 'converted', args_output['ny']['filename'])
            except subprocess.TimeoutExpired:
                results['timedout'].append(os.path.relpath(args_output['gammel']['filename'], document_dir))
                conversion_stats(pronom_name, 'unconverted', args_output['gammel']['filename'])
                logging(f"Timed out {args_output['gammel']['filename']}\t ({args_output['gammel']['pronom']})")
                args_output['ny'] = {}
# Epost-konvertator
        if pronom_type[pronom_name]['convert'] == 'email':
            try:
                msg = extract_msg.Message(file_name)
                output_text = '==============================================================================\n'
                output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n'
                output_text += f'Avsender:\t{msg.sender}\n'
                output_text += f'Mottaker(e):\t' + ', '.join([f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n'
                output_text += f'Vedlegg:\t' + ', '.join([f'<{x.longFilename}>' for x in msg.attachments]) + '\n'
                output_text += f'Emne:\t{msg.subject}\n'
                output_text += f'==============================================================================\n{msg.body}'
                args_output['ny']['filename'] = os.path.dirname(args_output['gammel']['filename']) + '/' + pathlib.Path(args_output['gammel']['filename']).stem + '.txt'
                with open(args_output['ny']['filename'], 'w') as outlook_out:
                    outlook_out.write(output_text)
                conversion_stats(pronom_name, 'converted', args_output['ny']['filename'])
            except:
                conversion_stats(pronom_name, 'unconverted', args_output['gammel']['filename'])
    if args_output['ny'] != {}:
        if os.path.isfile(args_output['ny']['filename']) is True:
Example #10
0
def _clean_body(text):
    """Reduce ``text`` to ASCII letters, commas and periods separated by single spaces."""
    # Everything else (digits, punctuation, newlines, carriage returns)
    # becomes a space. The range is A-Z, not A-z: A-z also matches the
    # characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.
    text = re.sub(r'[^a-zA-Z,.]', ' ', text)
    # Collapse the runs of spaces left behind by the substitution.
    return re.sub(r' +', ' ', text)


def cleanForTrain():
    """Build ``training.csv`` from the extracted ``.msg`` training files.

    Runs the ``mkdir``, ``unzip`` and ``rmzip`` shell commands, then walks
    ``train-uploads/extracted-data/<folder>/*.msg`` under the current working
    directory and writes one CSV row per message. The folder name is used as
    the row's ``Label``. ``rmmsg`` is run after all rows have been written.

    The sender is taken from the first ``From: ...`` line found in the body
    and is left empty when the body has no such line. Quoted header lines
    (From/To/Cc/Sent) and ``Subject:`` markers are removed from the body
    before it is stored.
    """
    i = 1
    os.system(mkdir)
    os.system(unzip)
    os.system(rmzip)

    base_dir = os.getcwd()
    data_root = os.path.join(base_dir, 'train-uploads', 'extracted-data')
    # NOTE(review): the CSV goes under 'test-uploads' while the messages are
    # read from 'train-uploads'; confirm this is intended.
    csv_path = os.path.join(base_dir, 'test-uploads', 'model-input',
                            'training.csv')

    # newline='' is what the csv module expects (it emits its own line
    # endings); utf-8 lets bodies be written as text rather than as bytes.
    with open(csv_path, 'w', encoding='utf-8', newline='') as file:
        fieldnames = [
            'Filename', 'uid', 'Subject', 'Date', 'Sender', 'Body',
            'Body_Unformatted', 'Label'
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        for folder in os.listdir(data_root):
            folder_path = os.path.join(data_root, folder)
            for f in os.listdir(folder_path):
                if not f.endswith('.msg'):
                    continue
                msg = extract_msg.Message(os.path.join(folder_path, f))
                # A message without a body is treated as empty text.
                body = msg.body or ''
                msg_date = msg.date
                msg_subj = msg.subject

                # The sender is parsed from the body text, not from msg.sender.
                sender_matches = re.findall('From *: (.+)\n', body)
                msg_sender = sender_matches[0] if sender_matches else ''

                # Drop quoted header lines, then bare "Subject:" markers.
                msg_message = body
                for header in ('From', 'To', 'Cc', 'Sent'):
                    msg_message = re.sub(header + ' *: (.*)\n', '',
                                         msg_message)
                msg_message = re.sub('Subject *:', '', msg_message)
                msg_uformatted = msg_message
                msg_message = _clean_body(msg_message)

                # Write str values: bytes would be stored as their repr
                # ("b'...'") by the csv writer.
                writer.writerow({
                    'Filename': f,
                    'uid': str(i),
                    'Subject': msg_subj,
                    'Date': msg_date,
                    'Sender': msg_sender,
                    'Body': msg_message,
                    'Body_Unformatted': msg_uformatted,
                    'Label': folder,
                })
                i += 1
        os.system(rmmsg)
Example #11
0
# End of code from proofpoint


# Script entry point: read one .msg file given on the command line, find the
# URLs in its body and unwrap "safelinks" / "proofpoint" rewritten links.
if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('input_file', help='Message file to extract links from.')
    args = parser.parse_args()

    msg_path = Path(args.input_file)
    # Results go to Active_Links.txt in the current working directory.
    output_file = Path(Path.cwd(), 'Active_Links.txt')

    # Path.is_file(msg_path) is the unbound form of msg_path.is_file().
    if not Path.is_file(msg_path):
        print(f'{msg_path} is not a file.')
        quit()
    
    # NOTE(review): the message is never closed explicitly.
    msg = extract_msg.Message(msg_path)
    body = msg.body

    urldefence_decoder = URLDefenseDecoder()
    # Collects links already seen; it is checked with `not in` below.
    links = list()
    # NOTE(review): non-raw string containing backslash escapes (\( \) \,);
    # a raw string would avoid invalid-escape warnings.
    regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    match = re.findall(regex, body)

    with open(output_file, 'w') as f:
        for m in match:
            # Remove any '>' characters from both ends of the match.
            m = m.strip('>')
            # Links containing 'safelinks' carry the real target in their
            # 'url' query parameter.
            if m.find('safelinks') != -1:
                m = urllib.parse.parse_qs(urllib.parse.urlparse(m).query)['url'][0]
            # Links containing 'proofpoint' are decoded by URLDefenseDecoder.
            if m.find('proofpoint') != -1:
                m = urldefence_decoder.decode(m)
            # Only links not seen before pass this check.
            if m not in links:
Example #12
0
def msg_html_body_to_string(location, file_name):
    """Return the HTML body of a ``.msg`` file.

    The file path is ``location`` immediately followed by ``file_name``; no
    separator is inserted between them.
    """
    msg_path = location + file_name
    message = extract_msg.Message(msg_path)
    return message.htmlBody