Ejemplo n.º 1
0
def get_raw_body_text(
    msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect the textual body parts of an e-mail message.

    Multipart containers are descended into. A leaf part is reported when it is
    either a ``text/*`` part without a Content-Disposition header, or when its
    file name ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples of the form
            ``(content-transfer-encoding, decoded_body, header_items)`` where
            ``header_items`` is the result of ``msg.items()`` for that part.
    """
    raw_body = [
    ]  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
        )
        filename = ''

    is_body_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_body_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Undo the transfer encoding once and reuse the result for every attempt.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        # A part that never had a payload assigned yields None; treat it as empty
        # instead of failing on ``None.decode``.
        body_bytes = b'' if payload is None else payload
        try:
            raw_body_str = body_bytes.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name; fall back to ASCII.
            logger.debug(
                'An exception occured while decoding the payload!',
                exc_info=True)
            raw_body_str = body_bytes.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always put the caller's policy back, even if the retry fails too.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, headers))

    return raw_body
Ejemplo n.º 2
0
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively retrieve all e-mail body parts and return them as a list.

    For a multipart message every sub-part is processed recursively. For a leaf
    part, the body is collected when the part is ``text/*`` without a
    Content-Disposition header, or when its file name ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form of
            ``(encoding, raw_body_string, message field headers)``.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
    else:
        # Treat text document attachments as belonging to the body of the mail.
        # Attachments with a file-extension of .htm/.html are implicitly treated
        # as text as well in order not to escape later checks (e.g. URL scan).
        try:
            filename = msg.get_filename('').lower()
        except (binascii.Error, AssertionError):
            logger.exception(
                'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
            filename = ''

        if ('content-disposition' not in msg and msg.get_content_maintype() == 'text') \
                or filename.endswith(('.html', '.htm')):
            encoding = msg.get('content-transfer-encoding', '').lower()
            charset = msg.get_content_charset()

            # Decode the transfer encoding a single time; both the primary and
            # the fallback charset decoding work on the same bytes.
            payload = msg.get_payload(decode=True)

            if charset is None:
                raw_body_str = eml_parser.decode.decode_string(payload, None)
            else:
                if payload is None:
                    # No payload was ever set on this part; there is nothing to decode.
                    payload = b''
                try:
                    raw_body_str = payload.decode(charset, 'ignore')
                except Exception:
                    # Best effort, e.g. for an unknown charset name.
                    logger.debug('An exception occured while decoding the payload!', exc_info=True)
                    raw_body_str = payload.decode('ascii', 'ignore')

            # In case we hit bug 27257, try to downgrade the used policy
            try:
                header_items = msg.items()
            except AttributeError:
                former_policy = msg.policy
                msg.policy = email.policy.compat32
                try:
                    header_items = msg.items()
                finally:
                    # Restore the original policy even when the retry raises.
                    msg.policy = former_policy

            raw_body.append((encoding, raw_body_str, header_items))

    return raw_body
Ejemplo n.º 3
0
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it carries a Content-Disposition
    header whose disposition is not ``inline``, or when its main content type
    is not ``text``. Any other part yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating the
            attachment file-name (``part-NNN``) in case none is found in the
            headers. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under the ``raw`` key. Default = False.

    Returns:
        dict: Either empty, or a single entry keyed by a freshly generated UUID
            string whose value holds ``filename``, ``size``, ``hash``,
            ``content_header`` and, when available, ``extension``,
            ``mime_type``, ``mime_type_short`` and ``raw``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy.
    # The header items are fetched once here and reused below, so the
    # workaround also covers the content_header collection.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always restore the caller's policy, even if the retry fails.
            msg.policy = former_policy

    header_names = {k.lower() for k, _ in header_items}

    is_attachment = ('content-disposition' in header_names and msg.get_content_disposition() != 'inline') \
        or msg.get_content_maintype() != 'text'
    if not is_attachment:
        return attachment

    # It's an attachment-type: pull out the data and calculate the size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!'
            )

        data = bytes(payload[0])
    else:
        # get_payload(decode=True) returns None when there is nothing to
        # decode (no payload set, or a multipart container); treat as empty.
        data = msg.get_payload(decode=True) or b''
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {
        'filename': filename,
        'size': file_size,
    }  # type: typing.Dict[str, typing.Any]

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning(
                'Error determining attachment mime-type - "{}"'.format(
                    file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers are kept in order.
    ch = {}  # type: typing.Dict[str, typing.List[str]]
    for k, v in header_items:
        ch.setdefault(k.lower(), []).append(str(v))

    entry['content_header'] = ch
    attachment[file_id] = entry

    return attachment
Ejemplo n.º 4
0
def traverse_multipart(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Recursively traverse all multi-part elements of an e-mail and describe its attachments.

    A leaf part is treated as an attachment when it carries a
    Content-Disposition header or when its main content type is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Start value of the counter which is used for
            generating attachment file-names (``part-NNN``) in case there are
            none found in the header. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under the ``raw`` key. Default = False.

    Returns:
        dict: One entry per attachment, keyed by a freshly generated UUID
            string. Each value holds ``filename``, ``size``, ``hash``,
            ``content_header`` and, when available, ``extension``,
            ``mime_type``, ``mime_type_short`` and ``raw``.
    """
    attachments = {}  # type: typing.Dict[str, typing.Any]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            # An int argument cannot be advanced by the callee, so the running
            # number is derived from the attachments collected so far. This
            # keeps generated "part-NNN" names distinct across sibling parts.
            attachments.update(
                traverse_multipart(part, counter + len(attachments),
                                   include_attachment_data))  # type: ignore
        return attachments

    # In case we hit bug 27257, try to downgrade the used policy.
    # The header items are fetched once here and reused below, so the
    # workaround also covers the content_header collection.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always restore the caller's policy, even if the retry fails.
            msg.policy = former_policy

    header_names = {k.lower() for k, _ in header_items}

    if not ('content-disposition' in header_names or msg.get_content_maintype() != 'text'):
        return attachments

    # It's an attachment-type: pull out the data and calculate the size in bytes.
    # get_payload(decode=True) returns None when no payload was set; treat as empty.
    data = msg.get_payload(decode=True) or b''
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {
        'filename': filename,
        'size': file_size,
    }  # type: typing.Dict[str, typing.Any]

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        entry['mime_type'] = magic_none.buffer(data)
        entry['mime_type_short'] = magic_mime.buffer(data)

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers are kept in order.
    ch = {}  # type: typing.Dict[str, typing.List[str]]
    for k, v in header_items:
        ch.setdefault(k.lower(), []).append(str(v))

    entry['content_header'] = ch
    attachments[file_id] = entry

    return attachments
Ejemplo n.º 5
0
def prepare_multipart_part_attachment(msg: email.message.Message, counter: int = 0,
                                      include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it carries a Content-Disposition
    header or when its main content type is not ``text``. Any other part
    yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating the
            attachment file-name (``part-NNN``) in case none is found in the
            headers. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under the ``raw`` key. Default = False.

    Returns:
        dict: Either empty, or a single entry keyed by a freshly generated UUID
            string whose value holds ``filename``, ``size``, ``hash``,
            ``content_header`` and, when available, ``extension``,
            ``mime_type``, ``mime_type_short`` and ``raw``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy.
    # The header items are fetched once here and reused below, so the
    # workaround also covers the content_header collection.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always restore the caller's policy, even if the retry fails.
            msg.policy = former_policy

    header_names = {k.lower() for k, _ in header_items}

    if not ('content-disposition' in header_names or msg.get_content_maintype() != 'text'):
        return attachment

    # It's an attachment-type: pull out the data and calculate the size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')

        data = bytes(payload[0])
    else:
        # get_payload(decode=True) returns None when there is nothing to
        # decode (no payload set, or a multipart container); treat as empty.
        data = msg.get_payload(decode=True) or b''
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers are kept in order.
    ch = {}  # type: typing.Dict[str, typing.List[str]]
    for k, v in header_items:
        ch.setdefault(k.lower(), []).append(str(v))

    entry['content_header'] = ch
    attachment[file_id] = entry

    return attachment