def validate(msg):
    """
    Build a report describing how far ``msg`` gets through MSG validation.

    :param msg: the input to inspect. It is passed to ``olefile.isOleFile``
        and, when that check succeeds, to the ``Message`` constructor.
    :return: a dict with an ``'input'`` section (class name, whether the
        input has a length, and that length or ``None``) and an
        ``'olefile'`` section. A ``'message'`` section is added only when
        the OLE check passes; its ``'initializes'`` flag stays ``False``
        if constructing ``Message`` raised.
    """
    # Evaluate once so 'has_len' and 'len' are guaranteed to agree.
    input_has_len = has_len(msg)
    validation_dict = {
        'input': {
            'class': get_full_class_name(msg),  # Get the full name of the class
            'has_len': input_has_len,  # Does the input have a __len__ attribute?
            'len': len(msg) if input_has_len else None,  # If input has __len__, put the value here
        },
        'olefile': {
            'valid': olefile.isOleFile(msg),
        },
    }
    if validation_dict['olefile']['valid']:
        validation_dict['message'] = {
            'initializes': False,
        }
        try:
            msg_instance = Message(msg)
        except NotImplementedError:
            # Should we have a special procedure for handling it if we get "not implemented"?
            pass
        except Exception:
            # Best effort: any ordinary failure just leaves 'initializes' as
            # False. KeyboardInterrupt and SystemExit are deliberately not
            # swallowed here.
            pass
        else:
            validation_dict['message']['initializes'] = True
            validation_dict['message']['msg'] = validate_msg(msg_instance)
    return validation_dict
def _extract_msg_objects(self, msg_obj: MsgObj):
    """
    Collect the pieces of a msg object that are needed to build an eml.

    The header stream ``__substg1.0_007D`` is parsed into an email message,
    every available body flavour (plain text, HTML, RTF) is recorded in a
    dict, and the attachments are passed through unchanged. When the RTF
    body encapsulates HTML or plain text and the msg has no native body of
    that kind, the de-encapsulated content is used instead and
    ``self.encapsulated_body`` is set to the matching MIME type.

    :param msg_obj: the msg object to read from.
    :return: a ``(message, body, attachments)`` tuple.
    """
    header_text = msg_obj._getStringStream('__substg1.0_007D')
    message = email.message_from_string(header_text, policy=policy.default)

    body = {}

    if msg_obj.body is not None:
        body['text'] = dict(
            obj=msg_obj.body,
            subtype='plain',
            charset="utf-8",
            cte="base64",
        )

    if msg_obj.htmlBody is not None:
        try:
            # Both the property lookup and the codec translation are covered
            # by the KeyError fallback.
            html_charset = codepage2codec(msg_obj.mainProperties['3FDE0003'].value)
        except KeyError:
            html_charset = msg_obj.stringEncoding
        body['html'] = dict(
            obj=msg_obj.htmlBody.decode(),
            subtype='html',
            charset=html_charset,
            cte="base64",
        )

    if msg_obj.rtfBody is not None:
        body['rtf'] = dict(
            obj=msg_obj.rtfBody.decode(),
            subtype='rtf',
            charset='ascii',
            cte="base64",
        )
        try:
            decapsulator = DeEncapsulator(msg_obj.rtfBody)
            decapsulator.deencapsulate()
            kind = decapsulator.content_type
            if kind == "html" and msg_obj.htmlBody is None:
                self.encapsulated_body = 'text/html'
                body['html'] = dict(
                    obj=decapsulator.html,
                    subtype='html',
                    charset=decapsulator.text_codec,
                    cte="base64",
                )
            elif kind == "text" and msg_obj.body is None:
                self.encapsulated_body = 'text/plain'
                body['text'] = dict(
                    obj=decapsulator.plain_text,
                    subtype='plain',
                    charset=decapsulator.text_codec,
                )
        except NotEncapsulatedRtf:
            logger.debug("RTF body in Msg object is not encapsualted.")
        except MalformedEncapsulatedRtf:
            logger.info(
                "RTF body in Msg object contains encapsulated content, but it is malformed and can't be converted."
            )

    return message, body, msg_obj.attachments
def _get_parts_outlook(self, data):
    """
    Yield the unpacked parts of an Outlook message.

    Emits, in order: the header parts, the plain-text and HTML bodies of the
    top-level message, and then one result per attachment. Attachments of
    type ``'msg'`` contribute their own bodies under a numbered ``MSGn``
    name; attachments whose data is not a buffer are skipped with a warning.
    """
    def as_bytes(content):
        # Text is encoded with this unit's codec; bytes pass through as-is.
        if isinstance(content, bytes):
            return content
        return content.encode(self.codec)

    def body_parts(label, message):
        # Attributes are read lazily, one at a time, as the caller iterates.
        for suffix, attribute in (('TXT', 'body'), ('HTM', 'htmlBody')):
            content = getattr(message, attribute)
            if content:
                yield UnpackResult(F'{label}.{suffix}', as_bytes(content))

    embedded_total = 0

    with Message(bytes(data)) as msg:
        yield from self._get_headparts(msg.header.items())
        yield from body_parts('BODY', msg)
        for attachment in msg.attachments:
            if attachment.type == 'msg':
                embedded_total += 1
                yield from body_parts(F'MSG{embedded_total:d}', attachment.data)
            elif isbuffer(attachment.data):
                yield UnpackResult(
                    attachment.longFilename or attachment.shortFilename,
                    attachment.data,
                )
            else:
                self.log_warn(
                    F'unknown attachment of type {attachment.type}, please report this!'
                )
def main():
    """
    Command-line entry point.

    Parses ``sys.argv`` and then does one of three things:

    * ``args.dev``: hands control to ``extract_msg.dev.main``.
    * ``args.validate``: validates every given file, pretty-prints the
      results and writes them to a timestamped JSON file in the current
      directory.
    * otherwise: opens each file as a ``Message`` and either prints its body
      (``args.dump_stdout``) or saves it into the output directory.

    A failure on one file is printed with its traceback and does not stop
    the remaining files from being processed.
    """
    args = utils.get_command_args(sys.argv[1:])
    level = logging.INFO if args.verbose else logging.WARNING
    # Store this just in case the paths that have been given are relative.
    # os.getcwdu only exists on Python 2; Python 3's os.getcwd already
    # returns text, so fall back to it instead of raising AttributeError.
    currentdir = getattr(os, 'getcwdu', os.getcwd)()
    if args.out_path:
        if not os.path.exists(args.out_path):
            os.makedirs(args.out_path)
        out = args.out_path
    else:
        out = currentdir

    if args.dev:
        import extract_msg.dev
        extract_msg.dev.main(args, sys.argv[1:])
    elif args.validate:
        import json
        import pprint
        import time

        from extract_msg import validation

        val_results = {x[0]: validation.validate(x[0]) for x in args.msgs}
        filename = 'validation {}.json'.format(int(time.time()))
        print('Validation Results:')
        pprint.pprint(val_results)
        print('These results have been saved to {}'.format(filename))
        with open(filename, 'w') as fil:
            fil.write(json.dumps(val_results))
        utils.get_input('Press enter to exit...')
    else:
        if not args.dump_stdout:
            utils.setup_logging(args.config_path, level, args.log, args.file_logging)
        for x in args.msgs:
            try:
                with Message(x[0]) as msg:
                    # Right here we should still be in the path in currentdir
                    if args.dump_stdout:
                        print(msg.body)
                    else:
                        os.chdir(out)
                        msg.save(toJson=args.json,
                                 useFileName=args.use_filename,
                                 ContentId=args.cid)
            except Exception:
                print("Error with file '" + x[0] + "': " +
                      traceback.format_exc())
            # Go back so the next (possibly relative) path resolves correctly.
            os.chdir(currentdir)
def openMsg(path, prefix='', attachmentClass=None, filename=None, delayAttachments=False, strict=True):
    """
    Open an MSG file and return an instance of the class matching its type.

    The file is first opened as a plain MSGFile to read its class type.
    Class types starting with ``IPM.Contact`` produce a Contact and those
    starting with ``IPM.Note`` produce a Message.

    :param path: path to the msg file in the system, or the raw msg file.
    :param prefix: used for extracting embedded msg files inside the main
        one. Do not set manually unless you know what you are doing.
    :param attachmentClass: optional, the class the Message object will use
        for attachments. Defaults to Attachment. You probably should not
        change this value unless you know what you are doing.
    :param filename: optional, the filename to be used by default when
        saving.
    :param delayAttachments: optional, delays the initialization of
        attachments until the user attempts to retrieve them. Allows MSG
        files with bad attachments to be initialized so the other data can
        be retrieved. Only forwarded when a Message is created.
    :param strict: when ``True``, an unrecognized class type raises
        UnrecognizedMSGTypeError. Otherwise the error is logged and the
        basic MSGFile instance is returned.
    """
    from extract_msg.attachment import Attachment
    from extract_msg.contact import Contact
    from extract_msg.message import Message
    from extract_msg.msg import MSGFile

    if attachmentClass is None:
        attachmentClass = Attachment

    msg = MSGFile(path, prefix, attachmentClass, filename)
    class_type = msg.classType

    if class_type.startswith('IPM.Contact'):
        return Contact(path, prefix, attachmentClass, filename)
    if class_type.startswith('IPM.Note'):
        return Message(path, prefix, attachmentClass, filename, delayAttachments)

    # Neither known prefix matched: raise or log, depending on strictness.
    complaint = 'Could not recognize msg class type "{}". It is recommended you report this to the developers.'.format(class_type)
    if strict:
        raise UnrecognizedMSGTypeError(complaint)
    logger.error(complaint)
    return msg
def _get_parts_outlook(self, data):
    """
    Return the parts of an Outlook message as a list of EmailPart objects.

    The non-empty plain-text and HTML bodies come first, each without a
    name, followed by one part per attachment named after its long filename
    (or its short filename when the long one is empty).
    """
    def as_bytes(content):
        # Text is encoded with this unit's codec; bytes pass through as-is.
        if isinstance(content, bytes):
            return content
        return content.encode(self.codec)

    with Message(bytes(data)) as msg:
        parts = []
        for attribute in ('body', 'htmlBody'):
            content = getattr(msg, attribute)
            if content:
                parts.append(EmailPart(None, as_bytes(content)))
        parts.extend(
            EmailPart(item.longFilename or item.shortFilename, item.data)
            for item in msg.attachments
        )
        return parts
def main(args, argv):
    """
    Run developer mode over every file named on the command line.

    Please only run this from the command line. Attempting to use this
    otherwise is likely to fail.

    For each file, the normal save path is attempted and any exception is
    logged; then the developer ``Message`` class is run on the same file.
    Finally the location of the log file is printed.

    :param args: is the class instance returned by
        `extract_msg.utils.get_command_args`.
    :param argv: is the list of arguments that were the input to the
        aforementioned function.
    """
    setup_dev_logger(args.config_path, args.log)
    # Store this just in case the paths that have been given are relative.
    # os.getcwdu only exists on Python 2; Python 3's os.getcwd already
    # returns text, so fall back to it instead of raising AttributeError.
    currentdir = getattr(os, 'getcwdu', os.getcwd)()
    if args.out_path:
        if not os.path.exists(args.out_path):
            os.makedirs(args.out_path)
        out = args.out_path
    else:
        out = currentdir
    logger.log(5, 'ARGV: {}'.format(argv))
    for x in args.msgs:
        logger.log(5, '---- RUNNING DEVELOPER MODE ON FILE {} ----'.format(x[0]))
        logger.log(5, 'EXCEPTION CHECK:')
        try:
            with Message(x[0]) as msg:
                # Right here we should still be in the path in currentdir
                os.chdir(out)
                msg.save(toJson=args.json,
                         useFileName=args.use_filename,
                         ContentId=args.cid)
        except Exception as e:
            logger.exception(e)
        else:
            logger.log(5, 'No exceptions raised.')
        logger.log(5, 'DEVELOPER CLASS OUTPUT:')
        # Return to the starting directory so a relative path still resolves.
        os.chdir(currentdir)
        dev_classes.Message(x[0])
        logger.log(5, '---- END OF DEVELOPER LOG ----')
    # Report the file of the last root handler that writes to one, if any.
    logpath = None
    for handler in logging.root.handlers:
        try:
            logpath = handler.baseFilename
        except AttributeError:
            pass
    print('Logging complete. Log has been saved to {}'.format(logpath))
import pprint import time from extract_msg import validation val_results = {x[0]: validation.validate(x[0]) for x in args.msgs} filename = 'validation {}.json'.format(int(time.time())) print('Validation Results:') pprint.pprint(val_results) print('These results have been saved to {}'.format(filename)) with open(filename, 'w') as fil: fil.write(json.dumps(val_results)) utils.get_input('Press enter to exit...') else: utils.setup_logging(args.config_path, level, args.log, args.file_logging) for x in args.msgs: try: with Message(x[0]) as msg: # Right here we should still be in the path in currentdir os.chdir(out) msg.save(toJson=args.json, useFileName=args.use_filename, ContentId=args.cid, html=args.html, rtf=args.html) except Exception as e: print("Error with file '" + x[0] + "': " + traceback.format_exc()) os.chdir(currentdir)