def message_process_msg(
    self,
    model,
    message,
    custom_values=None,
    save_original=False,
    strip_attachments=False,
    thread_id=None,
):
    """Convert an Outlook .msg message to RFC 2822 and pass it to message_drop.

    :param model: forwarded unchanged to ``message_drop``
    :param message: base64-encoded content of the .msg file
    :param custom_values: forwarded unchanged to ``message_drop``
    :param save_original: forwarded unchanged to ``message_drop``
    :param strip_attachments: forwarded unchanged to ``message_drop``
    :param thread_id: forwarded unchanged to ``message_drop``
    :return: whatever ``message_drop`` returns for the converted message
    :raises exceptions.UserError: when ``Message`` is falsy (presumably
        because the msg-extractor library could not be imported)
    """
    if not Message:
        raise exceptions.UserError(
            _('Install the msg-extractor library to handle .msg files')
        )
    message_msg = Message(b64decode(message))
    # A message without recipients would make ``.to`` None; guard the split
    # so such a file is still converted instead of raising AttributeError.
    # Blank entries and surrounding whitespace are dropped as well.
    recipients = [
        address.strip()
        for address in (message_msg.to or '').split(',')
        if address.strip()
    ]
    message_email = self.env['ir.mail_server'].build_email(
        message_msg.sender,
        recipients,
        message_msg.subject,
        # prefer html bodies to text
        message_msg._getStream('__substg1.0_10130102') or message_msg.body,
        email_cc=message_msg.cc,
        headers={'date': message_msg.date},
        message_id=message_msg.message_id,
        attachments=[
            (attachment.longFilename, attachment.data)
            for attachment in message_msg.attachments
        ],
    )
    return self.message_drop(
        model,
        message_email.as_string(),
        custom_values=custom_values,
        save_original=save_original,
        strip_attachments=strip_attachments,
        thread_id=thread_id,
    )
def analyze_file(self, path):
    """Parse the .msg file at *path* and report its details.

    Adds an 'Email details' subsection (header fields, UTF-7 decoded body
    and one entry per attachment with name, SHA-256 and magic description)
    through ``self.add_result_subsection`` and returns ``self.results``.
    Attachments whose data is itself a ``Message`` are not listed.
    """
    def _as_text(value):
        # None becomes an empty string; everything else goes through str().
        return str(value) if value is not None else ''

    parsed = Message(path)

    attachment_entries = []
    for item in parsed.attachments:
        payload = item.data
        if type(payload) is Message:
            # Nested message: there are no raw bytes to hash or identify.
            continue
        digest = hashlib.sha256()
        digest.update(payload)
        description = magic.Magic(uncompress=True).from_buffer(payload)
        attachment_entries.append({
            'name': item.longFilename,
            'sha256': digest.hexdigest(),
            'mimeinfo': description
        })

    details = {}
    details['header'] = _as_text(parsed.header)
    details['from'] = _as_text(parsed.sender)
    details['to'] = _as_text(parsed.to)
    details['cc'] = _as_text(parsed.cc)
    details['subject'] = _as_text(parsed.subject)
    details['date'] = _as_text(parsed.date)
    details['body'] = decode_utf7(parsed.body)
    details['attachments'] = attachment_entries

    self.add_result_subsection('Email details', details)
    return self.results
def msgtext(self):
    """
    Extract texts and some information from .msg format email files.

    Returns a single string of the form
    ``From: ..., To: ..., DateTime: ..., Subject: ..., Message: ...`` with
    every straight apostrophe replaced by ``‘``. A field that is None is
    rendered as its bare label. If the file at ``self.path`` cannot be
    opened or parsed, an empty string is returned.
    """
    try:
        mail = Message(self.path)
    except Exception:
        # wrong decode or cannot parse...
        return ''

    # The date label used to read 'DataTime: ' when the date was missing;
    # a single label table keeps both cases consistent.
    fields = (
        ('From: ', mail.sender),
        ('To: ', mail.to),
        ('DateTime: ', mail.date),
        ('Subject: ', mail.subject),
        ('Message: ', mail.body),
    )
    # str() leaves strings untouched and keeps a non-string value (for
    # instance a datetime date) from breaking the concatenation.
    text = ', '.join(
        label + ('' if value is None else str(value))
        for label, value in fields
    )
    return text.replace("'", "‘")
def test_message_import(self):
    """Test Outlook MSG file import & date parsing.

    Checks that the fixture message exposes a date, then copies the fixture
    into MEDIA_ROOT, attaches it to the first Record and verifies that
    saving the record populates ``order_date``.
    """
    path = Path(settings.BASE_DIR, 'prs2', 'referral', 'fixtures', 'test_email.msg')
    msg = Message(path)
    self.assertTrue(msg.date)
    record = Record.objects.all()[0]
    # Record order_date is empty.
    self.assertFalse(record.order_date)
    upload_path = settings.MEDIA_ROOT + '/test.msg'
    # Context managers close both handles even if the copy fails; the
    # fixture handle used to be left open.
    with open(path, 'rb') as fixture, open(upload_path, 'wb') as upload:
        upload.write(fixture.read())
    record.uploaded_file = upload_path
    record.save()
    # Record order_date is no longer empty.
    self.assertTrue(record.order_date)
def analyze_file(self, path):
    """Parse the .msg file at *path*, report it and extract its attachments.

    Adds an "Email details" subsection (header fields, UTF-7 decoded body
    and one entry per attachment with name, SHA-256 and magic description)
    through ``self.add_result_subsection``. Every attachment whose data is
    not itself a ``Message`` is also written into a fresh temporary
    directory.

    Returns a tuple ``(self.results, observables)`` where ``observables``
    is the list of paths of the files written.
    """
    def xstr(s):
        return "" if s is None else str(s)

    m = Message(path)
    a = []
    observables = []
    outdir = tempfile.mkdtemp()
    for index, attachment in enumerate(m.attachments):
        data = attachment.data
        if type(data) is Message:
            # Nested message: there are no raw bytes to hash or write out.
            continue
        name = attachment.longFilename
        minfo = magic.Magic(uncompress=True).from_buffer(data)
        a.append({
            "name": name,
            "sha256": hashlib.sha256(data).hexdigest(),
            "mimeinfo": minfo,
        })
        # The file name comes from the e-mail itself and is untrusted:
        # keep only its last path component so that "../" sequences or an
        # absolute path cannot place the file outside of outdir.
        safe_name = os.path.basename(str(name or "").replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            # Missing or unusable name: fall back to a positional one.
            safe_name = "attachment_{}".format(index)
        target = os.path.join(outdir, safe_name)
        with open(target, "wb") as f:
            f.write(data)
        observables.append(target)
    email = {
        "header": xstr(m.header),
        "from": xstr(m.sender),
        "to": xstr(m.to),
        "cc": xstr(m.cc),
        "subject": xstr(m.subject),
        "date": xstr(m.date),
        "body": decode_utf7(m.body),
        "attachments": a,
    }
    self.add_result_subsection("Email details", email)
    return self.results, observables
def typesense_index_record(rec, client=None):
    """Index a single record in Typesense.

    Builds a document from the record's basic fields and, for PDF, MSG and
    DOCX uploads, adds the extracted text as ``file_content``. The document
    is then upserted into the ``records`` collection.

    :param rec: record to index; must expose pk, created, referral, name,
        description, filename, extension and (for PDF/MSG/DOCX)
        uploaded_file.path
    :param client: Typesense client to use; when falsy a new one is
        obtained from ``typesense_client()``
    """
    if not client:
        client = typesense_client()
    rec_document = {
        'id': str(rec.pk),
        'created': rec.created.timestamp(),
        'referral_id': rec.referral.pk,
        'name': rec.name,
        'description': rec.description if rec.description else '',
        'file_name': rec.filename,
        'file_type': rec.extension,
    }
    extension = rec.extension
    if extension == 'PDF':
        # PDF document content.
        try:
            # PDF text extraction can be a little error-prone.
            # In the event of an exception here, we'll just accept it and pass.
            with open(rec.uploaded_file.path, 'rb') as pdf_file:
                content = high_level.extract_text(pdf_file)
            rec_document['file_content'] = content.replace('\n', ' ').strip()
        except Exception:
            # Deliberately best-effort, but no longer a bare except, so
            # KeyboardInterrupt and SystemExit still propagate.
            pass
    elif extension == 'MSG':
        # MSG document content.
        message = Message(rec.uploaded_file.path)
        # Subject and body may be missing; treat them as empty text rather
        # than crashing on None or indexing the literal string "None".
        subject = message.subject or ''
        body = (message.body or '').replace('\r\n', ' ')
        rec_document['file_content'] = '{} {}'.format(subject, body).strip()
    elif extension == 'DOCX':
        # DOCX document content.
        content = docx2txt.process(rec.uploaded_file.path)
        rec_document['file_content'] = content.replace('\n', ' ').strip()
    client.collections['records'].documents.upsert(rec_document)
def analyze(self, data, parsed):
    '''
    Analyze the MSG file referenced by data["Location"]["File"]: fill
    data["MSG"] with headers and content, collect attachment streams and
    headers for word extraction, then mark parsed.type as "msg"
    (need to implement from extract_msg.dev_classes import Message)
    '''
    data["MSG"] = deepcopy(self.datastruct)
    msg_file = Message(data["Location"]["File"])

    header_items = self.get_headers(data["MSG"]["General"], msg_file)
    self.get_content(data["MSG"], msg_file)

    # Attachment streams are only collected when the check succeeds.
    stream_items = []
    if self.check_attachment_and_make_dir(data, msg_file):
        stream_items = self.get_attachment(data, msg_file)

    # No parts are gathered here; the list stays empty.
    part_items = []
    combined = stream_items + part_items + header_items

    if combined:
        # have to be bytes < will check this later on
        get_words_multi_filesarray(data, combined)
    else:
        get_words(data, data["Location"]["File"])
    parsed.type = "msg"
def analyze(self, data, parsed):
    '''
    Analyze the MSG file referenced by data["Location"]["File"]: fill
    data["MSG"] with headers and content, feed attachment streams and
    headers to the word extraction, then mark parsed.type as "msg"
    '''
    data["MSG"] = deepcopy(self.datastruct)
    message = Message(data["Location"]["File"])
    headers = self.get_headers(data["MSG"]["General"], message)
    self.get_content(data["MSG"], message)
    # Streams are only extracted when the attachment check passes.
    streams = (
        self.get_attachment(data, message)
        if self.check_attachment_and_make_dir(data, message)
        else []
    )
    parts = []  # nothing is collected as parts in this method
    mixed = streams + parts + headers
    if not mixed:
        get_words(data, data["Location"]["File"])
    else:
        # have to be bytes < will check this later on
        get_words_multi_filesarray(data, mixed)
    parsed.type = "msg"
# Command-line driver fragment: every argument is first checked against the
# known option flags and is then expanded as a glob pattern of .msg files.
# NOTE(review): writeRaw, toJson and debug are only assigned True here; they
# are presumably initialised before this fragment — confirm.
useFileName = False
useContentId = False
for rawFilename in sys.argv[1:]:
    # Flags take effect from their position onwards: files named earlier on
    # the command line have already been processed with the previous values.
    if rawFilename == '--raw':
        writeRaw = True
    if rawFilename == '--json':
        toJson = True
    if rawFilename == '--use-file-name':
        useFileName = True
    if rawFilename == '--use-content-id':
        useContentId = True
    if rawFilename == '--debug':
        debug = True
    # Flag arguments go through glob as well and are processed like any
    # other match if a file of that name exists.
    for filename in glob.glob(rawFilename):
        msg = Message(filename)
        try:
            if writeRaw:
                msg.saveRaw()
            else:
                msg.save(toJson, useFileName, False, useContentId)
        except Exception as e:
            # msg.debug()
            # Report the failure with its traceback and continue with the
            # next file.
            print("Error with file '" + filename + "': " + traceback.format_exc())