"""parse email and return email_data named tuple subject parameter only applies to _ref_num parsing""" return EmailData(_ref_num(email, subject), _phone(email), _first_name(email), _last_name(email), _address(email), _email(email)) # + # extract emails paths and create generator emails = Path(r"emails_test/").glob("*.msg") # add one EmailData container with features per email emails_data = [] for email_path in emails: with extract_msg.Message(email_path) as msg: msg_body = msg.body msg_subject = msg.subject emails_data.append(email_regex(msg_body, msg_subject)) # - def strip_string(feature: str) -> str: """Cleans whitespaces and return characters from email features""" return feature.strip("\r").strip("\n").strip() # + import csv
import re
import sys

import extract_msg

# Script: read an Outlook .msg file, then write its sender, date, subject and
# every http(s) link found in its body to "Results_<name>.msg.txt".

# The user types the file name without its extension; ".msg" is appended here.
inputmail = input("Name of the msg file: ")
mail = inputmail + r'.msg'

# Read every field needed while the message is open, so the underlying file
# handle is released before the report is written.
with extract_msg.Message(mail) as msg:
    msg_sender = msg.sender
    msg_date = msg.date
    msg_subj = msg.subject
    msg_message = msg.body

result = r'Results_' + mail + r'.txt'

# Raw string: the pattern contains backslash escapes (\( \) ) that are meant
# for the regex engine, not for the Python string parser.
regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# Guard against a message whose body is not available as text; in that case
# there is simply nothing to scan.
match = re.findall(regex, msg_message or '')

# The context manager guarantees the report is flushed and closed even if a
# write fails part-way through.
with open(result, 'w') as f:
    f.write('Results from .msg uploaded: \n \n')
    f.write('Sender: {}\n'.format(msg_sender))
    f.write('Sent On: {}\n'.format(msg_date))
    f.write('Subject: {}\n'.format(msg_subj))
    f.write('\nLinks detected >>>>>>>>>>>>>>>>>>>>>>>>> \n')
    for m in match:
        f.write('<{}\n'.format(m))

print('Done, check your directory for the results')
# Dump the main fields of the Outlook message held in ``msg``.
# The single-argument print(...) form emits the same text on Python 2 and
# Python 3, and matches the print(...) calls used further down.
print(msg.SentOn)
print(msg.To)
print(msg.CC)
print(msg.BCC)
print(msg.Subject)
print(msg.Body)

# List attachment file names.  Attachments.Item() is called with positions
# 1..Count; the loop simply does not run when there are no attachments.
count_attachments = msg.Attachments.Count
for item in range(1, count_attachments + 1):
    print(msg.Attachments.Item(item).Filename)

del outlook, msg

# need to run
# pip install extract-msg
# pip install imapclient

import extract_msg

f = r'MS_Outlook_file.msg'  # Replace with yours

# Read the fields first, then release the file handle before printing.
msg = extract_msg.Message(f)
try:
    msg_sender = msg.sender
    msg_date = msg.date
    msg_subj = msg.subject
    msg_message = msg.body
finally:
    msg.close()

print('Sender: {}'.format(msg_sender))
print('Sent On: {}'.format(msg_date))
print('Subject: {}'.format(msg_subj))
print('Body: {}'.format(msg_message))
def parse_message(self, file_path: str) -> OutlookMessage:
    """
    Load an Outlook ``.msg`` file.

    :param file_path: path of the ``.msg`` file to open.
    :return: the ``extract_msg.Message`` object built from ``file_path``.
    """
    message = extract_msg.Message(file_path)
    return message
if locked_dict[x]['file_out'] != '': if dt_local.diff(locked_dict[x]['ts']).in_seconds() > 900: locked_dict[x] = {'file_out': ''} logging(locked_dict[x]['old_file'] + '\t(NOT CONVERTED TIMED OUT ' + locked_dict[x]['pronom'] + ')') results['timedout'].append(pronom) results['stats']['timedout'] += 1 time.sleep(60) for cmd in cmds_list: if os.path.exists(cmd[1]) is False: print(cmd) for container in containers: containers[container].stop() ### for email in email_list: try: msg = extract_msg.Message(email[0]) output_text = '==============================================================================\n' output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n' output_text += f'Avsender:\t{msg.sender}\n' output_text += f'Mottaker(e):\t' + ', '.join([f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n' output_text += f'Vedlegg:\t' + ', '.join([f'<{x.longFilename}>' for x in msg.attachments]) + '\n' output_text += f'Emne:\t{msg.subject}\n' output_text += f'==============================================================================\n{msg.body}' with open(email[1], 'w') as outlook_out: outlook_out.write(output_text) conversion_stats(email[2], 'converted', email[0], email[1]) logging(email[1] + '\t(converted)') pbar.set_postfix(file=pathlib.Path(email[0]).name + ' (converted)', refresh=False) pbar.update(1) except: copy(email[0], os.path.dirname(email[3]))
## Convert with LibreOffice
if pronom_type[args['input']['pronom']]['convert'] == 'libreoffice':
    # Pass an argument vector instead of an interpolated shell string, so file
    # names containing spaces or shell metacharacters are handed to
    # LibreOffice verbatim.
    subprocess.run(
        [
            'libreoffice',
            '--headless',
            '--convert-to',
            'pdf',
            '--outdir',
            f"{output_dir}/{os.path.dirname(args['input']['filename'])}",
            f"{input_dir}/{args['input']['filename']}",
        ],
        stdout=subprocess.DEVNULL,
        timeout=180,
    )
    # The converted file keeps the input's sub-directory and stem, with a
    # .pdf extension.
    args['output']['filename'] = (
        f"{os.path.dirname(args['input']['filename'])}/"
        f"{pathlib.Path(args['input']['filename']).stem}.pdf"
    )

## Convert Outlook e-mail to a plain-text report
if pronom_type[args['input']['pronom']]['convert'] == 'email':
    # Make sure the target sub-directory exists before writing into it.
    pathlib.Path(
        f"{output_dir}/{os.path.dirname(args['input']['filename'])}"
    ).mkdir(parents=True, exist_ok=True)

    # Build the whole report while the message is open; the ``with`` block
    # releases the underlying file afterwards.
    with extract_msg.Message(f"{input_dir}/{args['input']['filename']}") as msg:
        args['output']['filename'] = (
            f"{os.path.dirname(args['input']['filename'])}/"
            f"{pathlib.Path(args['input']['filename']).stem}.txt"
        )
        output_text = '==============================================================================\n'
        output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n'
        output_text += f'Avsender:\t{msg.sender}\n'
        output_text += f'Mottaker(e):\t' + ', '.join(
            [f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n'
        output_text += f'Vedlegg:\t' + ', '.join(
            [f'<{x.longFilename}>' for x in msg.attachments]) + '\n'
        output_text += f'Emne:\t{msg.subject}\n'
        output_text += f'==============================================================================\n{msg.body}'

    # Write real text encoded as UTF-8.  Wrapping the encoded bytes in str()
    # would store the literal "b'...'" representation (with escaped newlines)
    # instead of the report itself.
    with open(f"{output_dir}/{args['output']['filename']}", 'w',
              encoding='utf8') as outlook_out:
        outlook_out.write(output_text)
def _open(self, file_object):
    """Open ``file_object`` as an Outlook message and keep it on ``self._archive``."""
    archive = extract_msg.Message(file_object)
    self._archive = archive
def open_message(filepath):
    """Return the ``extract_msg.Message`` built from the file at ``filepath``."""
    message = extract_msg.Message(filepath)
    return message
conversion_stats(pronom_name, 'unconverted', args_output['gammel']['filename']) #Libre-office konvertering if pronom_type[pronom_name]['convert'] == 'libreoffice': try: subprocess.run(['libreoffice --headless --convert-to pdf --outdir ' + shlex.quote(os.path.dirname(args_output['gammel']['filename'])) + ' ' + shlex.quote(file_name)], shell=True, stdout=subprocess.DEVNULL, timeout=360) args_output['ny']['filename'] = os.path.dirname(args_output['gammel']['filename']) + '/' + pathlib.Path(args_output['gammel']['filename']).stem + '.pdf' conversion_stats(pronom_name, 'converted', args_output['ny']['filename']) except subprocess.TimeoutExpired: results['timedout'].append(os.path.relpath(args_output['gammel']['filename'], document_dir)) conversion_stats(pronom_name, 'unconverted', args_output['gammel']['filename']) logging(f"Timed out {args_output['gammel']['filename']}\t ({args_output['gammel']['pronom']})") args_output['ny'] = {} # Epost-konvertator if pronom_type[pronom_name]['convert'] == 'email': try: msg = extract_msg.Message(file_name) output_text = '==============================================================================\n' output_text += f'Sendt:\t\t{msg.date} ({pendulum.parse(msg.date, strict=False).isoformat()})\n' output_text += f'Avsender:\t{msg.sender}\n' output_text += f'Mottaker(e):\t' + ', '.join([f'{x.name} <{x.email}>' for x in msg.recipients]) + '\n' output_text += f'Vedlegg:\t' + ', '.join([f'<{x.longFilename}>' for x in msg.attachments]) + '\n' output_text += f'Emne:\t{msg.subject}\n' output_text += f'==============================================================================\n{msg.body}' args_output['ny']['filename'] = os.path.dirname(args_output['gammel']['filename']) + '/' + pathlib.Path(args_output['gammel']['filename']).stem + '.txt' with open(args_output['ny']['filename'], 'w') as outlook_out: outlook_out.write(output_text) conversion_stats(pronom_name, 'converted', args_output['ny']['filename']) except: conversion_stats(pronom_name, 
'unconverted', args_output['gammel']['filename']) if args_output['ny'] != {}: if os.path.isfile(args_output['ny']['filename']) is True:
def _extract_training_fields(body):
    """Split a raw message body into ``(sender, unformatted, cleaned)``.

    * ``sender`` is the text after the first ``From:`` line in the body, or
      ``''`` when the body has no such line.
    * ``unformatted`` is the body with the ``From``/``To``/``Cc``/``Sent``
      lines and every ``Subject:`` label removed.
    * ``cleaned`` is ``unformatted`` with every character other than ASCII
      letters, ``,`` and ``.`` turned into a space and runs of spaces
      collapsed to one.
    """
    senders = re.findall(r'From *: (.+)\n', body)
    # A body without a "From:" line would otherwise raise IndexError.
    sender = senders[0] if senders else ''

    text = body
    for header_pattern in (
            r'From *: (.*)\n',
            r'To *: (.*)\n',
            r'Cc *: (.*)\n',
            r'Sent *: (.*)\n',
            r'Subject *:',
    ):
        text = re.sub(header_pattern, '', text)
    unformatted = text

    # "A-Z", not "A-z": the latter also spans the punctuation between 'Z' and
    # 'a' ([ \ ] ^ _ `), which would then survive the clean-up.
    cleaned = re.sub(r'[^a-zA-Z,.]', ' ', text)
    # Line breaks were already turned into spaces by the filter above, so only
    # the collapsing of repeated spaces is left to do.
    cleaned = re.sub(r' +', ' ', cleaned)
    return sender, unformatted, cleaned


def cleanForTrain():
    """Build ``training.csv`` from the extracted ``.msg`` training files.

    Runs the module's ``mkdir``, ``unzip`` and ``rmzip`` commands through
    ``os.system``, then walks ``train-uploads/extracted-data/<folder>/*.msg``
    under the current working directory and writes one CSV row per message.
    The name of the containing folder is used as the row's ``Label``.
    Finally runs the ``rmmsg`` command.
    """
    # mkdir / unzip / rmzip / rmmsg are not defined in this function; they are
    # taken from the enclosing module.
    os.system(mkdir)
    os.system(unzip)
    os.system(rmzip)

    base_dir = os.getcwd()
    extracted_root = os.path.join(base_dir, 'train-uploads', 'extracted-data')
    # NOTE(review): input is read from 'train-uploads' but the CSV goes to
    # 'test-uploads' -- kept as in the original; confirm this is intended.
    csv_path = os.path.join(base_dir, 'test-uploads', 'model-input',
                            'training.csv')
    fieldnames = [
        'Filename', 'uid', 'Subject', 'Date', 'Sender', 'Body',
        'Body_Unformatted', 'Label'
    ]

    with open(csv_path, 'wt') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        i = 1  # running row id, written to the 'uid' column
        for folder in os.listdir(extracted_root):
            folder_path = os.path.join(extracted_root, folder)
            for f in os.listdir(folder_path):
                if not f.endswith('.msg'):
                    continue

                # Read what is needed, then let the ``with`` block release
                # the message file before moving on to the next one.
                with extract_msg.Message(os.path.join(folder_path, f)) as msg:
                    msg_date = msg.date
                    msg_subj = msg.subject
                    # Fall back to an empty body so the regex helpers always
                    # receive a string.
                    msg_body = msg.body or ''

                msg_sender, msg_uformatted, msg_message = \
                    _extract_training_fields(msg_body)

                # NOTE(review): the body columns are written as UTF-8 encoded
                # bytes, exactly as before; on Python 3 the csv module stores
                # their b'...' representation -- confirm what the consumer
                # of this file expects.
                writer.writerow({
                    'Filename': f,
                    'uid': str(i),
                    'Subject': msg_subj,
                    'Date': msg_date,
                    'Sender': msg_sender,
                    'Body': msg_message.encode('utf-8'),
                    'Body_Unformatted': msg_uformatted.encode('utf-8'),
                    'Label': folder
                })
                i += 1

    os.system(rmmsg)
# End of code from proofpoint if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('input_file', help='Message file to extract links from.') args = parser.parse_args() msg_path = Path(args.input_file) output_file = Path(Path.cwd(), 'Active_Links.txt') if not Path.is_file(msg_path): print(f'{msg_path} is not a file.') quit() msg = extract_msg.Message(msg_path) body = msg.body urldefence_decoder = URLDefenseDecoder() links = list() regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' match = re.findall(regex, body) with open(output_file, 'w') as f: for m in match: m = m.strip('>') if m.find('safelinks') != -1: m = urllib.parse.parse_qs(urllib.parse.urlparse(m).query)['url'][0] if m.find('proofpoint') != -1: m = urldefence_decoder.decode(m) if m not in links:
def msg_html_body_to_string(location, file_name):
    '''
    .MSG file -> Python String

    Return the HTML body of the e-mail stored in an Outlook .msg file.

    location  -- leading part of the path; it is concatenated with
                 file_name as-is (no path separator is inserted).
    file_name -- trailing part of the path, i.e. the .msg file name.

    Returns the value of the message's ``htmlBody`` attribute.
    '''
    # Open the message in a ``with`` block so the underlying file is closed
    # once the body has been read, instead of staying open indefinitely.
    with extract_msg.Message(location + file_name) as msg:
        return msg.htmlBody