def get_raw_body_text(
        msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect all textual body parts of an e-mail message.

    Multipart messages are walked in payload order. A leaf part is treated as
    body text when it either carries no Content-Disposition header and has the
    main content type ``text``, or when its file name ends in ``.htm`` or
    ``.html`` (so such attachments do not escape later checks, e.g. URL scan).

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form of
        ``(encoding, raw_body_string, message field headers)``, where
        ``encoding`` is the lower-cased Content-Transfer-Encoding value (or an
        empty string) and the headers are the result of ``msg.items()``.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. '
            'Collected data will not be complete.')
        filename = ''

    is_plain_body = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_plain_body or filename.endswith(('.html', '.htm'))):
        return raw_body

    # The header value is not guaranteed to be a plain str (it may be a
    # Header object), so stringify it before lower-casing.
    encoding = str(msg.get('content-transfer-encoding', '')).lower()
    charset = msg.get_content_charset()

    # Decode the transfer encoding exactly once and reuse the bytes below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        if payload is None:
            # A part without any payload has nothing to decode.
            payload = b''
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # The charset comes straight from the (untrusted) mail and may be
            # unknown or not a text encoding at all; fall back to ASCII.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy.
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always put the caller's policy back, even if the retry fails.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, headers))
    return raw_body
def get_raw_body_text(
        msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect all textual body parts of an e-mail message.

    Multipart messages are walked in payload order. A leaf part is treated as
    body text when it either carries no Content-Disposition header and has the
    main content type ``text``, or when its file name ends in ``.htm`` or
    ``.html`` (so such attachments do not escape later checks, e.g. URL scan).

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form of
        ``(encoding, raw_body_string, message field headers)``, where
        ``encoding`` is the lower-cased Content-Transfer-Encoding value (or an
        empty string) and the headers are the result of ``msg.items()``.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. '
            'Collected data will not be complete.')
        filename = ''

    is_plain_body = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_plain_body or filename.endswith(('.html', '.htm'))):
        return raw_body

    # The header value is not guaranteed to be a plain str (it may be a
    # Header object), so stringify it before lower-casing.
    encoding = str(msg.get('content-transfer-encoding', '')).lower()
    charset = msg.get_content_charset()

    # Decode the transfer encoding exactly once and reuse the bytes below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        if payload is None:
            # A part without any payload has nothing to decode.
            payload = b''
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # The charset comes straight from the (untrusted) mail and may be
            # unknown or not a text encoding at all; fall back to ASCII.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy.
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always put the caller's policy back, even if the retry fails.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, headers))
    return raw_body
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is handled as an attachment when it has a Content-Disposition
    header whose disposition is not ``inline``, or when its main content type
    is not ``text``. Any other part yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating
            attachment file names in case there is none found in the header.
            Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            raw attachment data is included under the ``raw`` key.
            Default = False.

    Returns:
        dict: Either an empty dict, or a dict with a single generated id
        mapping to the attachment details: file name, data size, file
        extension (when present), hash check-sums, detected mime-type (when
        available), optionally the raw data, and the original part headers
        under ``content_header``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are read once here and reused below, so the later header dump cannot
    # trip over the same bug again after the policy has been restored.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always put the caller's policy back, even if the retry fails.
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    is_attachment = ('content-disposition' in header_names
                     and msg.get_content_disposition() != 'inline') \
        or msg.get_content_maintype() != 'text'
    if not is_attachment:
        return attachment

    # It's an attachment-type: pull out the data and its size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. '
                'This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    details = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]
    attachment[file_id] = details

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # Strip the leading dot.
        details['extension'] = extension[1:]

    details['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            details['mime_type'] = mime_type
            details['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        details['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; a header may repeat.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    details['content_header'] = content_header

    return attachment
def traverse_multipart(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Recursively traverse all multi-part elements and return the attachments found.

    A leaf part is handled as an attachment when it has a Content-Disposition
    header or when its main content type is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating
            attachment file names in case there is none found in the header.
            Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            raw attachment data is included under the ``raw`` key.
            Default = False.

    Returns:
        dict: A dict mapping a generated id to the attachment details: file
        name, data size, file extension (when present), hash check-sums,
        detected mime-type (when available), optionally the raw data, and the
        original part headers under ``content_header``.
    """
    attachments = {}  # type: typing.Dict[str, typing.Any]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            # Offset the counter by the attachments collected so far; the
            # plain int is not shared with the recursive call, so without
            # this every unnamed attachment would get the same name.
            attachments.update(
                traverse_multipart(part, counter + len(attachments), include_attachment_data))  # type: ignore
        return attachments

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are read once here and reused below, so the later header dump cannot
    # trip over the same bug again after the policy has been restored.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always put the caller's policy back, even if the retry fails.
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    if not ('content-disposition' in header_names or not msg.get_content_maintype() == 'text'):
        return attachments

    # It's an attachment-type: pull out the data and its size in bytes.
    data = msg.get_payload(decode=True)
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    details = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]
    attachments[file_id] = details

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # Strip the leading dot.
        details['extension'] = extension[1:]

    details['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        details['mime_type'] = magic_none.buffer(data)
        details['mime_type_short'] = magic_mime.buffer(data)

    if include_attachment_data:
        details['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; a header may repeat.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    details['content_header'] = content_header

    return attachments
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is handled as an attachment when it has a Content-Disposition
    header or when its main content type is not ``text``. Any other part
    yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): A counter which is used for generating
            attachment file names in case there is none found in the header.
            Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            raw attachment data is included under the ``raw`` key.
            Default = False.

    Returns:
        dict: Either an empty dict, or a dict with a single generated id
        mapping to the attachment details: file name, data size, file
        extension (when present), hash check-sums, detected mime-type (when
        available), optionally the raw data, and the original part headers
        under ``content_header``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are read once here and reused below, so the later header dump cannot
    # trip over the same bug again after the policy has been restored.
    try:
        header_items = list(msg.items())
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = list(msg.items())
        finally:
            # Always put the caller's policy back, even if the retry fails.
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    if not ('content-disposition' in header_names or not msg.get_content_maintype() == 'text'):
        return attachment

    # It's an attachment-type: pull out the data and its size in bytes.
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. '
                'This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    details = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]
    attachment[file_id] = details

    # os.path.splitext always returns the extension as second element;
    # in case there is no extension it is an empty string.
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # Strip the leading dot.
        details['extension'] = extension[1:]

    details['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            details['mime_type'] = mime_type
            details['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        details['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; a header may repeat.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    details['content_header'] = content_header

    return attachment