Beispiel #1
0
def parse_email(email_as_string, include_headers, maintain_rfc,
                attach_message_primary):
    """
    Parse a raw email and build the text that should be indexed for it.

    :param email_as_string: The raw email text to be parsed
    :type email_as_string: basestring
    :param include_headers: When true, headers that are not in MAIN_HEADERS are
      emitted after the main ones.
    :type include_headers: bool
    :param maintain_rfc: When true, the indexed text is whatever
      maintain_rfc_parse returns for the message; nothing is assembled here.
    :type maintain_rfc: bool
    :param attach_message_primary: When true, the parsed message is first
      replaced by the result of change_primary_message.
    :type attach_message_primary: bool
    :return: [timestamp derived from the Date header (float), Message-ID
      header value, text to index]
    :rtype: list
    """
    message = email.message_from_string(email_as_string)
    if attach_message_primary:
        message = change_primary_message(message)

    if maintain_rfc:
        index_mail = maintain_rfc_parse(message)
    else:
        # Header-only parse of the (possibly swapped) message.
        header_items = Parser().parsestr(message.as_string(), True).items()

        # Main headers always come first, in the order they appear.
        lines = [
            "%s: %s" % (name, getheader(value))
            for name, value in header_items
            if name in MAIN_HEADERS
        ]
        if include_headers:
            lines += [
                "%s: %s" % (name, getheader(value))
                for name, value in header_items
                if name not in MAIN_HEADERS
            ]

        if not message.is_multipart():
            lines.append(recode_mail(message))
        else:
            # Only parts that are not skipped as containers get a number.
            part_number = 0
            for part in message.walk():
                content_type = part.get_content_type()
                if content_type in ('multipart/alternative',
                                    'multipart/mixed'):
                    # The multipart/alternative part is usually empty.
                    lines.append("Multipart envelope header: %s" %
                                 str(part.get_payload(decode=True)))
                    continue

                part_number += 1
                disposition = part.get('Content-Disposition')
                filename = part.get_filename()
                extension = str(os.path.splitext(filename or '')[1]).lower()
                is_zip = extension in ZIP_EXTENSIONS
                supported = (extension in TEXT_FILE_EXTENSIONS
                             or content_type in SUPPORTED_CONTENT_TYPES
                             or part.get_content_maintype() == 'text'
                             or is_zip)

                lines.append("#START_OF_MULTIPART_%d" % part_number)
                if not supported:
                    lines.append(
                        "#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s"
                        % (filename, content_type, disposition))
                elif not filename:
                    # Supported part without a file name: emit it inline.
                    lines.append(recode_mail(part))
                else:
                    lines.append("#BEGIN_ATTACHMENT: %s" % str(filename))
                    if is_zip:
                        lines.append("\n".join(
                            zip.parse_zip(part, EMAIL_PART)))
                    else:
                        lines.append(recode_mail(part))
                    lines.append("#END_ATTACHMENT: %s" % str(filename))
                lines.append("#END_OF_MULTIPART_%d" % part_number)

        # The output starts directly with the header lines (no preamble line).
        index_mail = "\n".join(lines)

    message_time = float(mktime_tz(parsedate_tz(message['Date'])))
    return [message_time, message['Message-ID'], index_mail]
Beispiel #2
0
def process_raw_email(raw, include_headers):
    """
    Take an email in plain text form and preformat it for indexing.

    The result text starts with MESSAGE_PREAMBLE, followed by the headers
    listed in MAIN_HEADERS, optionally the remaining headers, and finally
    the body (one marked section per part for multipart messages).

    :param raw: The raw email text to be processed
    :type raw: basestring
    :param include_headers: When true, headers that are not in MAIN_HEADERS
      are emitted after the main ones.
    :type include_headers: bool
    :return: [timestamp derived from the Date header (float), Message-ID
      header value, formatted text]
    :rtype: list
    """
    message = email.message_from_string(raw)
    header_items = Parser().parsestr(raw, True).items()

    main_lines = [
        "%s: %s" % (name, getheader(value))
        for name, value in header_items
        if name in MAIN_HEADERS
    ]
    extra_lines = []
    if include_headers:
        extra_lines = [
            "%s: %s" % (name, getheader(value))
            for name, value in header_items
            if name not in MAIN_HEADERS
        ]

    content = []
    if not message.is_multipart():
        content.append(recode_mail(message))
    else:
        # Only parts that are not skipped as containers get a number.
        part_number = 0
        for part in message.walk():
            content_type = part.get_content_type()
            if content_type in ('multipart/alternative', 'multipart/mixed'):
                # The multipart/alternative part is usually empty.
                content.append("Multipart envelope header: %s" %
                               str(part.get_payload(decode=True)))
                continue

            part_number += 1
            disposition = part.get('Content-Disposition')
            filename = part.get_filename()
            extension = str(os.path.splitext(filename or '')[1]).lower()
            supported = (extension in SUPPORTED_FILE_EXTENSIONS
                         or content_type in SUPPORTED_CONTENT_TYPES
                         or part.get_content_maintype() == 'text')

            content.append("#START_OF_MULTIPART_%d" % part_number)
            if not supported:
                content.append(
                    "#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s"
                    % (filename, content_type, disposition))
            elif not filename:
                # Supported part without a file name: emit it inline.
                content.append(recode_mail(part))
            else:
                content.append("#BEGIN_ATTACHMENT: %s" % str(filename))
                if extension == '.docx':
                    content.append(read_docx(part.get_payload(decode=True)))
                else:
                    content.append(recode_mail(part))
                content.append("#END_ATTACHMENT: %s" % str(filename))
            content.append("#END_OF_MULTIPART_%d" % part_number)

    mail_for_index = [MESSAGE_PREAMBLE] + main_lines + extra_lines + content
    message_time = float(mktime_tz(parsedate_tz(message['Date'])))
    return [message_time, message['Message-ID'], "\n".join(mail_for_index)]
Beispiel #3
0
def process_raw_email(raw, include_headers):
    """
    Take an email in plain text form and preformat it with limited headers.

    The result text consists of a fixed base64 separator line, the Date,
    Message-ID, From, Subject and To headers, and a "Body:" section. The body
    optionally starts with all remaining headers, followed by the message
    content (one chunk per part for multipart messages).

    :param raw: The raw email text to be processed
    :type raw: basestring
    :param include_headers: When true, every header other than Date,
      Message-ID, From, To and Subject is placed at the start of the body.
    :type include_headers: bool
    :return: [raw Date header value, Message-ID header value, formatted text]
    :rtype: list
    """
    message = email.message_from_string(raw)

    # Collected body fragments; joined once at the end.
    chunks = []
    if include_headers:
        mailheaders = Parser().parsestr(raw, True)
        chunks.append('\n'.join([
            "%s: %s" % (k, getheader(v)) for k, v in mailheaders.items()
            if k not in ('Date', 'Message-ID', 'From', 'To', 'Subject')
        ]))

    if message.is_multipart():
        # Easier to change to a flag in inputs.conf
        index_attachments_flag = INDEX_ATTACHMENT_DEFAULT
        for part in message.walk():
            content_type = part.get_content_type()
            content_disposition = part.get('Content-Disposition')
            filename = part.get_filename()
            extension = str(os.path.splitext(filename or '')[1]).lower()

            # Microsoft sometimes sends the wrong content type (e.g. csv as
            # application/octet-stream), so the file extension is accepted
            # as an alternative to the content type.
            file_is_supported_attachment = extension in SUPPORTED_FILE_EXTENSIONS
            content_type_supported = (
                content_type in SUPPORTED_CONTENT_TYPES
                or part.get_content_maintype() == 'text')

            if not (content_type_supported or file_is_supported_attachment):
                chunks.append("\n#UNSUPPORTED_ATTACHMENT: %s, %s\n" %
                              (filename, content_type))
                continue

            is_indexed_attachment = (
                bool(content_disposition)
                and "attachment" in content_disposition
                and index_attachments_flag)
            if not is_indexed_attachment:
                chunks.append("\n%s" % recode_mail(part))
                continue

            chunks.append("\n#BEGIN_ATTACHMENT: %s\n" % filename)
            if extension == '.docx':
                chunks.append(read_docx(part.get_payload(decode=True)))
            else:
                # The previous code appended the raw payload and then ran a
                # discarded conversion that referenced an undefined name
                # `charset`, raising NameError for every such attachment.
                # Use recode_mail like the other branches of this function.
                chunks.append("\n%s" % recode_mail(part))
            chunks.append("\n#END_ATTACHMENT: %s\n" % filename)
        body = "".join(chunks)
    else:
        recoded = recode_mail(message)
        header_text = "".join(chunks)
        # Keep the extra headers requested via include_headers instead of
        # overwriting them with the message content.
        if header_text:
            body = "%s\n%s" % (header_text, recoded)
        else:
            body = recoded

    mail_for_index = "VGhpcyBpcyBhIG1haWwgc2VwYXJhdG9yIGluIGJhc2U2NCBmb3Igb3VyIFNwbHVuayBpbmRleGluZwo=\n" \
                     "Date: %s\n" \
                     "Message-ID: %s\n" \
                     "From: %s\n" \
                     "Subject: %s\n" \
                     "To: %s\n" \
                     "Body: %s\n" % (message['Date'], message['Message-ID'],
                                     message['From'], getheader(message['Subject']), message['To'], body)
    return [message['Date'], message['Message-ID'], mail_for_index]
def parse_email(email_as_string, include_headers, maintain_rfc, attach_message_primary):
    """
    Parse an email and return its timestamp, Message-ID and text for indexing.

    :param email_as_string: The raw email text to be parsed
    :type email_as_string: basestring
    :param include_headers: When true, headers outside MAIN_HEADERS follow the main headers.
    :type include_headers: bool
    :param maintain_rfc: When true, the text comes from maintain_rfc_parse and no
      header/body assembly is done in this function.
    :type maintain_rfc: bool
    :param attach_message_primary: When true, change_primary_message is applied to the
      parsed message before anything else.
    :type attach_message_primary: bool
    :return: [timestamp derived from the Date header (float), Message-ID header value,
      text to index]
    :rtype: list
    """

    def format_headers(pairs, want_main):
        # Render "Name: value" lines for either the main or the remaining headers.
        return ["%s: %s" % (key, getheader(val))
                for key, val in pairs
                if (key in MAIN_HEADERS) == want_main]

    message = email.message_from_string(email_as_string)
    if attach_message_primary:
        message = change_primary_message(message)

    if maintain_rfc:
        index_mail = maintain_rfc_parse(message)
    else:
        header_pairs = Parser().parsestr(message.as_string(), True).items()
        output = format_headers(header_pairs, True)
        if include_headers:
            output.extend(format_headers(header_pairs, False))

        if message.is_multipart():
            counter = 1
            for part in message.walk():
                ctype = part.get_content_type()
                if ctype == 'multipart/alternative' or ctype == 'multipart/mixed':
                    # The multipart/alternative part is usually empty.
                    output.append("Multipart envelope header: %s" % str(part.get_payload(decode=True)))
                    continue

                disposition = part.get('Content-Disposition')
                attachment_name = part.get_filename()
                ext = str(os.path.splitext(attachment_name or '')[1]).lower()
                zipped = ext in ZIP_EXTENSIONS

                output.append("#START_OF_MULTIPART_%d" % counter)
                if not (ext in TEXT_FILE_EXTENSIONS or ctype in SUPPORTED_CONTENT_TYPES
                        or part.get_content_maintype() == 'text' or zipped):
                    output.append("#UNSUPPORTED_ATTACHMENT: file_name = %s - type = %s ; disposition=%s" % (
                        attachment_name, ctype, disposition))
                elif attachment_name:
                    output.append("#BEGIN_ATTACHMENT: %s" % str(attachment_name))
                    if zipped:
                        rendered = "\n".join(zip.parse_zip(part, EMAIL_PART))
                    else:
                        rendered = recode_mail(part)
                    output.append(rendered)
                    output.append("#END_ATTACHMENT: %s" % str(attachment_name))
                else:
                    # Supported part without a file name: emit it inline.
                    output.append(recode_mail(part))
                output.append("#END_OF_MULTIPART_%d" % counter)
                counter += 1
        else:
            output.append(recode_mail(message))

        # The output starts directly with the header lines (no preamble line).
        index_mail = "\n".join(output)

    message_time = float(mktime_tz(parsedate_tz(message['Date'])))
    return [message_time, message['Message-ID'], index_mail]