def parse_email(email_as_string, include_headers, maintain_rfc, attach_message_primary):
    """
    Convert a raw email into the values needed to index it.

    :param email_as_string: The raw email text to parse
    :type email_as_string: basestring
    :param include_headers: If true, headers that are not in MAIN_HEADERS are listed after the main ones
    :type include_headers: bool
    :param maintain_rfc: If true, the indexed text is whatever maintain_rfc_parse() returns for the
        message and the header/body flattening is skipped
    :type maintain_rfc: bool
    :param attach_message_primary: If true, the parsed message is replaced by the result of
        change_primary_message() before any further processing
    :type attach_message_primary: bool
    :return: A list of [Date header as epoch seconds (float), Message-ID header, text to index]
    :rtype: list
    """
    message = email.message_from_string(email_as_string)
    if attach_message_primary:
        message = change_primary_message(message)

    if not maintain_rfc:
        # Second argument is headersonly=True: only the header section is parsed here.
        header_source = Parser().parsestr(message.as_string(), True)
        lines = [
            "%s: %s" % (name, getheader(value))
            for name, value in header_source.items()
            if name in MAIN_HEADERS
        ]
        if include_headers:
            lines += [
                "%s: %s" % (name, getheader(value))
                for name, value in header_source.items()
                if name not in MAIN_HEADERS
            ]

        if not message.is_multipart():
            lines.append(recode_mail(message))
        else:
            numbered = 0
            for part in message.walk():
                kind = part.get_content_type()
                if kind in ('multipart/alternative', 'multipart/mixed'):
                    # These container parts are usually empty; record them
                    # without spending a part number on them.
                    lines.append("Multipart envelope header: %s" % str(part.get_payload(decode=True)))
                    continue

                numbered += 1
                disposition = part.get('Content-Disposition')
                filename = part.get_filename()
                extension = str(os.path.splitext(filename or '')[1]).lower()
                is_archive = extension in ZIP_EXTENSIONS
                indexable = (
                    extension in TEXT_FILE_EXTENSIONS
                    or kind in SUPPORTED_CONTENT_TYPES
                    or part.get_content_maintype() == 'text'
                    or is_archive
                )

                lines.append("#START_OF_MULTIPART_%d" % numbered)
                if not indexable:
                    lines.append(
                        "#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s" % (
                            filename, kind, disposition))
                elif not filename:
                    # Unnamed part: emit its text without attachment markers.
                    lines.append(recode_mail(part))
                else:
                    lines.append("#BEGIN_ATTACHMENT: %s" % str(filename))
                    if is_archive:
                        lines.append("\n".join(zip.parse_zip(part, EMAIL_PART)))
                    else:
                        lines.append(recode_mail(part))
                    lines.append("#END_ATTACHMENT: %s" % str(filename))
                lines.append("#END_OF_MULTIPART_%d" % numbered)

        index_mail = "\n".join(lines)
    else:
        index_mail = maintain_rfc_parse(message)

    timestamp = float(mktime_tz(parsedate_tz(message['Date'])))
    return [timestamp, message['Message-ID'], index_mail]
def process_raw_email(raw, include_headers):
    """
    Flatten a plain-text email into a preamble, header lines and body lines for indexing.

    :param raw: The raw email text to process
    :type raw: basestring
    :param include_headers: If true, headers that are not in MAIN_HEADERS are listed after the main ones
    :type include_headers: bool
    :return: A list of [Date header as epoch seconds (float), Message-ID header, text to index]
    :rtype: list
    """
    message = email.message_from_string(raw)
    # Second argument is headersonly=True: only the header section is parsed here.
    header_source = Parser().parsestr(raw, True)

    entries = [
        "%s: %s" % (name, getheader(value))
        for name, value in header_source.items()
        if name in MAIN_HEADERS
    ]
    if include_headers:
        entries += [
            "%s: %s" % (name, getheader(value))
            for name, value in header_source.items()
            if name not in MAIN_HEADERS
        ]

    if not message.is_multipart():
        entries.append(recode_mail(message))
    else:
        numbered = 0
        for part in message.walk():
            kind = part.get_content_type()
            if kind in ('multipart/alternative', 'multipart/mixed'):
                # These container parts are usually empty; record them
                # without spending a part number on them.
                entries.append("Multipart envelope header: %s" % str(part.get_payload(decode=True)))
                continue

            numbered += 1
            disposition = part.get('Content-Disposition')
            filename = part.get_filename()
            extension = str(os.path.splitext(filename or '')[1]).lower()
            indexable = (
                extension in SUPPORTED_FILE_EXTENSIONS
                or kind in SUPPORTED_CONTENT_TYPES
                or part.get_content_maintype() == 'text'
            )

            entries.append("#START_OF_MULTIPART_%d" % numbered)
            if not indexable:
                entries.append(
                    "#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s" % (
                        filename, kind, disposition))
            elif not filename:
                # Unnamed part: emit its text without attachment markers.
                entries.append(recode_mail(part))
            else:
                entries.append("#BEGIN_ATTACHMENT: %s" % str(filename))
                if extension == '.docx':
                    entries.append(read_docx(part.get_payload(decode=True)))
                else:
                    entries.append(recode_mail(part))
                entries.append("#END_ATTACHMENT: %s" % str(filename))
            entries.append("#END_OF_MULTIPART_%d" % numbered)

    indexed_lines = [MESSAGE_PREAMBLE] + entries
    timestamp = float(mktime_tz(parsedate_tz(message['Date'])))
    return [timestamp, message['Message-ID'], "\n".join(indexed_lines)]
def process_raw_email(raw, include_headers):
    """
    This function takes an email in plain text form and preformats it with limited headers.

    :param raw: The raw email text to be processed
    :type raw: basestring
    :param include_headers: If true, every header other than Date, Message-ID, From, To and Subject
        is placed at the start of the body
    :type include_headers: bool
    :return: Returns a list with [Date header, Message-ID header, mail_message]
    :rtype: list
    """
    message = email.message_from_string(raw)

    extra_headers = ''
    if include_headers:
        # Second argument is headersonly=True: only the header section is parsed here.
        mailheaders = Parser().parsestr(raw, True)
        extra_headers = '\n'.join(
            "%s: %s" % (k, getheader(v))
            for k, v in mailheaders.items()
            if k not in ('Date', 'Message-ID', 'From', 'To', 'Subject')
        )

    if message.is_multipart():
        # Easier to change to a flag in inputs.conf
        index_attachments_flag = INDEX_ATTACHMENT_DEFAULT
        chunks = [extra_headers]
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = part.get('Content-Disposition')
            filename = part.get_filename()
            # Microsoft sometimes sends the wrong content type (e.g. csv as
            # application/octet-stream), so the file extension is checked as well.
            extension = str(os.path.splitext(filename or '')[1]).lower()
            supported = (
                extension in SUPPORTED_FILE_EXTENSIONS
                or content_type in SUPPORTED_CONTENT_TYPES
                or part.get_content_maintype() == 'text'
            )
            if not supported:
                # TODO: let the user choose which file extensions are supported
                # (e.g. to index images for steganalysis).
                chunks.append("\n#UNSUPPORTED_ATTACHMENT: %s, %s\n" % (filename, content_type))
                continue

            is_attachment = bool(content_disposition) and "attachment" in content_disposition
            if is_attachment and index_attachments_flag:
                payload = part.get_payload(decode=True)
                chunks.append("\n#BEGIN_ATTACHMENT: %s\n" % filename)
                if extension == '.docx':
                    chunks.append(read_docx(payload))
                else:
                    # NOTE: a stray `unicode(payload, str(charset), ...)` statement used to
                    # follow here; `charset` was never defined in this function and the
                    # result was discarded, so it has been removed.
                    chunks.append("\n%s" % payload)
                chunks.append("\n#END_ATTACHMENT: %s\n" % filename)
            else:
                chunks.append("\n%s" % recode_mail(part))
        body = ''.join(chunks)
    else:
        text = recode_mail(message)
        # Keep the requested extra headers for single-part mails too; previously
        # they were overwritten by the message text.
        body = "%s\n%s" % (extra_headers, text) if extra_headers else text

    mail_for_index = "VGhpcyBpcyBhIG1haWwgc2VwYXJhdG9yIGluIGJhc2U2NCBmb3Igb3VyIFNwbHVuayBpbmRleGluZwo=\n" \
                     "Date: %s\n" \
                     "Message-ID: %s\n" \
                     "From: %s\n" \
                     "Subject: %s\n" \
                     "To: %s\n" \
                     "Body: %s\n" % (message['Date'], message['Message-ID'],
                                     message['From'], getheader(message['Subject']),
                                     message['To'], body)
    return [message['Date'], message['Message-ID'], mail_for_index]
def parse_email(email_as_string, include_headers, maintain_rfc, attach_message_primary):
    """
    Convert a raw email into the values needed to index it.

    :param email_as_string: The raw email text to parse
    :type email_as_string: basestring
    :param include_headers: If true, headers that are not in MAIN_HEADERS are listed after the main ones
    :type include_headers: bool
    :param maintain_rfc: If true, the indexed text is whatever maintain_rfc_parse() returns for the
        message and the header/body flattening is skipped
    :type maintain_rfc: bool
    :param attach_message_primary: If true, the parsed message is replaced by the result of
        change_primary_message() before any further processing
    :type attach_message_primary: bool
    :return: A list of [Date header as epoch seconds (float), Message-ID header, text to index]
    :rtype: list
    """
    message = email.message_from_string(email_as_string)
    if attach_message_primary:
        message = change_primary_message(message)

    if not maintain_rfc:
        # Second argument is headersonly=True: only the header section is parsed here.
        header_source = Parser().parsestr(message.as_string(), True)
        lines = [
            "%s: %s" % (name, getheader(value))
            for name, value in header_source.items()
            if name in MAIN_HEADERS
        ]
        if include_headers:
            lines += [
                "%s: %s" % (name, getheader(value))
                for name, value in header_source.items()
                if name not in MAIN_HEADERS
            ]

        if not message.is_multipart():
            lines.append(recode_mail(message))
        else:
            numbered = 0
            for part in message.walk():
                kind = part.get_content_type()
                if kind in ('multipart/alternative', 'multipart/mixed'):
                    # These container parts are usually empty; record them
                    # without spending a part number on them.
                    lines.append("Multipart envelope header: %s" % str(part.get_payload(decode=True)))
                    continue

                numbered += 1
                disposition = part.get('Content-Disposition')
                filename = part.get_filename()
                extension = str(os.path.splitext(filename or '')[1]).lower()
                is_archive = extension in ZIP_EXTENSIONS
                indexable = (
                    extension in TEXT_FILE_EXTENSIONS
                    or kind in SUPPORTED_CONTENT_TYPES
                    or part.get_content_maintype() == 'text'
                    or is_archive
                )

                lines.append("#START_OF_MULTIPART_%d" % numbered)
                if not indexable:
                    lines.append(
                        "#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s" % (
                            filename, kind, disposition))
                elif not filename:
                    # Unnamed part: emit its text without attachment markers.
                    lines.append(recode_mail(part))
                else:
                    lines.append("#BEGIN_ATTACHMENT: %s" % str(filename))
                    if is_archive:
                        lines.append("\n".join(zip.parse_zip(part, EMAIL_PART)))
                    else:
                        lines.append(recode_mail(part))
                    lines.append("#END_ATTACHMENT: %s" % str(filename))
                lines.append("#END_OF_MULTIPART_%d" % numbered)

        index_mail = "\n".join(lines)
    else:
        index_mail = maintain_rfc_parse(message)

    timestamp = float(mktime_tz(parsedate_tz(message['Date'])))
    return [timestamp, message['Message-ID'], index_mail]