def get_raw_body_text(
        msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect all e-mail body parts and return them as a list.

    Multipart messages are descended into part by part. A leaf part is
    included when it is a ``text/*`` part without a content-disposition
    header, or when its filename ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form
        ``(content_transfer_encoding, raw_body_string, header_items)`` where
        ``header_items`` is the result of ``msg.items()`` for that part.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
        filename = ''

    is_body_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_body_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Decode the transfer encoding once; every branch below works on the same bytes.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Deliberately broad: an unknown or malformed charset name must not
            # abort parsing, so fall back to a lossy ASCII decode instead.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy. The original
    # policy is restored in a ``finally`` so that the message is never left
    # downgraded, even when the retry fails as well.
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))
    return raw_body
def get_first_text_block(email_message_instance: email.message.Message) -> 'list':
    # http://python-3.ru/page/imap-email-python
    """Read the message and extract the content of its first text block.

    For a ``text/*`` message the message's own payload is used. For a
    ``multipart/*`` message the first direct sub-part whose main type is
    ``text`` is used. The (undecoded) payload string is split on ``', '``.

    Returns:
        The list of pieces of the selected payload, or ``None`` when the
        message is neither text nor multipart, or no text sub-part exists.
    """
    main_type = email_message_instance.get_content_maintype()

    if main_type == 'text':
        return email_message_instance.get_payload().split(', ')

    if main_type != 'multipart':
        return None

    for sub_part in email_message_instance.get_payload():
        if sub_part.get_content_maintype() == 'text':
            return sub_part.get_payload().split(', ')
    return None
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect all e-mail body parts and return them as a list.

    Multipart messages are descended into part by part. A leaf part is
    included when it is a ``text/*`` part without a content-disposition
    header, or when its filename ends in ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples in the form
        ``(content_transfer_encoding, raw_body_string, header_items)`` where
        ``header_items`` is the result of ``msg.items()`` for that part.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
        filename = ''

    is_body_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_body_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Decode the transfer encoding once; every branch below works on the same bytes.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Deliberately broad: an unknown or malformed charset name must not
            # abort parsing, so fall back to a lossy ASCII decode instead.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy. The original
    # policy is restored in a ``finally`` so that the message is never left
    # downgraded, even when the retry fails as well.
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, header_items))
    return raw_body
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Walk an e-mail message tree and gather the parsed attachments of all its parts.

    A non-multipart message is handed straight to
    ``prepare_multipart_part_attachment``. A multipart message is descended
    into recursively; if it is a ``message/rfc822`` container it is, in
    addition, recorded as an attachment itself.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Passed through to
            ``prepare_multipart_part_attachment``, where it is used to
            generate a file-name when none is found in the header. The same
            value is handed to every sub-part. Default = 0.
        include_attachment_data (bool, optional): If true, the raw attachment
            data is included in the result. Default = False.

    Returns:
        dict: The merged dicts returned by ``prepare_multipart_part_attachment``
        for every visited part.
    """
    # Leaf part: nothing to descend into.
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    collected = {}

    # This is an e-mail message attachment, add it to the attachment list apart from parsing it
    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        embedded = prepare_multipart_part_attachment(msg, counter, include_attachment_data)  # type: ignore
        collected.update(embedded)

    for child in msg.get_payload():  # type: ignore
        child_attachments = traverse_multipart(child, counter, include_attachment_data)  # type: ignore
        collected.update(child_attachments)

    return collected
def traverse_multipart(self, msg: email.message.Message, counter: int = 0) -> typing.Dict[str, typing.Any]:
    """Walk an e-mail message tree and gather the parsed attachments of all its parts.

    A non-multipart message is handed straight to
    ``self.prepare_multipart_part_attachment``. A multipart message is
    descended into recursively; if it is a ``message/rfc822`` container it
    is, in addition, recorded as an attachment itself.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Passed through to
            ``self.prepare_multipart_part_attachment``; the same value is
            handed to every sub-part. Default = 0.

    Returns:
        dict: The merged dicts returned by
        ``self.prepare_multipart_part_attachment`` for every visited part.
    """
    # Leaf part: nothing to descend into.
    if not msg.is_multipart():
        return self.prepare_multipart_part_attachment(msg, counter)

    collected = {}

    # This is an e-mail message attachment, add it to the attachment list apart from parsing it
    if 'content-type' in msg and msg.get_content_type() == 'message/rfc822':
        collected.update(self.prepare_multipart_part_attachment(msg, counter))

    for child in msg.get_payload():
        child_attachments = self.traverse_multipart(child, counter)
        collected.update(child_attachments)

    return collected
def get_email_body_as_html(message: email.message.Message) -> str:
    """Return the payload of the second sub-part of ``message``.

    The function takes the sub-part at index 1 and returns its payload
    as-is (no transfer-decoding is requested).
    """
    # Not necessarily the safest assumption: the HTML body is taken to be
    # whatever sits at index 1 of the message payload.
    second_part = message.get_payload(1)
    return second_part.get_payload()
def traverse_multipart(msg: email.message.Message, counter: int = 0, include_attachment_data: bool = False) -> \
        typing.Dict[str, typing.Any]:
    """Walk an e-mail message tree and gather the parsed attachments of all its parts.

    A non-multipart message is handed straight to
    ``prepare_multipart_part_attachment``. A multipart message is descended
    into recursively; if it is a ``message/rfc822`` container it is, in
    addition, recorded as an attachment itself.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Passed through to
            ``prepare_multipart_part_attachment``, where it is used to
            generate a file-name when none is found in the header. The same
            value is handed to every sub-part. Default = 0.
        include_attachment_data (bool, optional): If true, the raw attachment
            data is included in the result. Default = False.

    Returns:
        dict: The merged dicts returned by ``prepare_multipart_part_attachment``
        for every visited part.
    """
    # Leaf part: nothing to descend into.
    if not msg.is_multipart():
        return prepare_multipart_part_attachment(msg, counter, include_attachment_data)

    found = {}

    # This is an e-mail message attachment, add it to the attachment list apart from parsing it
    is_embedded_mail = 'content-type' in msg and msg.get_content_type() == 'message/rfc822'
    if is_embedded_mail:
        found.update(prepare_multipart_part_attachment(msg, counter, include_attachment_data))  # type: ignore

    for sub_message in msg.get_payload():  # type: ignore
        found.update(traverse_multipart(sub_message, counter, include_attachment_data))  # type: ignore

    return found
def decode_body(message: email.message.Message):
    """Decode the body text of ``message``.

    For a non-multipart message the transfer-decoded payload is decoded to
    text and stripped. For a multipart message the direct sub-parts are
    scanned in order, skipping every part that carries a filename
    (attachments); the first ``text/plain`` part is returned as a stripped
    string, the first ``text/html`` part is returned wrapped in ``html(...)``.

    When a part declares no charset, UTF-8 is used (a superset of the
    us-ascii default) instead of passing ``None`` to ``bytes.decode``, which
    would raise ``TypeError``.

    Returns:
        The decoded body, or ``None`` when a multipart message has no
        suitable text part.
    """
    fallback_charset = 'utf-8'

    if not message.is_multipart():
        charset = message.get_content_charset() or fallback_charset
        return message.get_payload(decode=True).decode(charset).strip()

    # The message comes in multiple portions (i.e. there are attachments).
    for part in message.get_payload():
        # Throw away attachments
        if part.get_filename():
            continue

        charset = part.get_content_charset() or fallback_charset
        content_type = part.get_content_type()

        # Return an html object or plaintext. We can return on the first hit
        # because the other parts are usually only extra attachments, which
        # are skipped above.
        if content_type == 'text/plain':
            return part.get_payload(decode=True).decode(charset).strip()
        if content_type == 'text/html':
            return html(part.get_payload(decode=True).decode(charset))

    return None
def analysisArchive(msg: email.message.Message) -> ProjectArchiveInfo:
    """Build a ``ProjectArchiveInfo`` from the first archive part of ``msg``.

    The payload parts of ``msg`` are scanned in order for the first one whose
    content type equals ``Const.ARCHIVE_TYPE``. Its filename and
    transfer-decoded payload are combined with the values ``get_by_msg``
    yields for ``Const.PROJECT_NAME_HEADER`` and
    ``Const.PROJECT_VERSION_HEADER`` on the enclosing message.

    Returns:
        The populated ``ProjectArchiveInfo``, or ``None`` when no part has
        the archive content type.
    """
    archive_part = next(
        (candidate for candidate in msg.get_payload()
         if candidate.get_content_type() == Const.ARCHIVE_TYPE),
        None,
    )
    if archive_part is None:
        return None

    archive_name = archive_part.get_filename()
    archive_bytes = archive_part.get_payload(decode=True)
    project_name = get_by_msg(msg, Const.PROJECT_NAME_HEADER)
    project_version = get_by_msg(msg, Const.PROJECT_VERSION_HEADER)
    return ProjectArchiveInfo(
        fileName=archive_name,
        data=archive_bytes,
        projectName=project_name,
        version=project_version,
    )
def prepare_multipart_part_attachment(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it has a content-disposition
    header other than ``inline``, or when its main content type is not
    ``text``. Any other part yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate the file-name
            (``part-NNN``) when none is found in the header. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under ``'raw'``. Default = False.

    Returns:
        dict: Empty for non-attachment parts. Otherwise a single entry keyed
        by a generated UUID string, holding ``filename``, ``size``, ``hash``
        and ``content_header`` plus, when available, ``extension``,
        ``mime_type``, ``mime_type_short`` and ``raw``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are fetched exactly once here and reused below, so the later
    # content-header collection cannot trip over the same bug again, and the
    # original policy is always restored.
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    is_attachment = ('content-disposition' in header_names and msg.get_content_disposition() != 'inline') \
        or msg.get_content_maintype() != 'text'
    if not is_attachment:
        return attachment

    # if it's an attachment-type, pull out the filename
    # and calculate the size in bytes
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)  # type: bytes  # type is always bytes here
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers keep
    # all their values in order of appearance.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    entry['content_header'] = content_header

    attachment[file_id] = entry
    return attachment
def traverse_multipart(
        msg: email.message.Message,
        counter: int = 0,
        include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Recursively traverse all e-mail message multi-part elements and return the attachments as a dict.

    Multipart messages are descended into part by part. A leaf part is
    treated as an attachment when it has a content-disposition header or when
    its main content type is not ``text``.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate the file-name
            (``part-NNN``) when none is found in the header. The same value
            is handed to every sub-part. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under ``'raw'``. Default = False.

    Returns:
        dict: One entry per attachment, keyed by a generated UUID string,
        holding ``filename``, ``size``, ``hash`` and ``content_header`` plus,
        when available, ``extension``, ``mime_type``, ``mime_type_short``
        and ``raw``.
    """
    attachments = {}  # type: typing.Dict[str, typing.Any]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            attachments.update(traverse_multipart(part, counter, include_attachment_data))  # type: ignore
        return attachments

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are fetched exactly once here and reused below, so the later
    # content-header collection cannot trip over the same bug again, and the
    # original policy is always restored.
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    # Plain text parts without a content-disposition are body text, not attachments.
    if 'content-disposition' not in header_names and msg.get_content_maintype() == 'text':
        return attachments

    # if it's an attachment-type, pull out the filename
    # and calculate the size in bytes
    data = msg.get_payload(decode=True)  # type: bytes  # type is always bytes here
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        entry['mime_type'] = magic_none.buffer(data)
        entry['mime_type_short'] = magic_mime.buffer(data)

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers keep
    # all their values in order of appearance.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    entry['content_header'] = content_header

    attachments[file_id] = entry
    return attachments
def prepare_multipart_part_attachment(msg: email.message.Message, counter: int = 0,
                                      include_attachment_data: bool = False) -> typing.Dict[str, typing.Any]:
    """Extract meta-information from a multipart-part.

    A part is treated as an attachment when it has a content-disposition
    header or when its main content type is not ``text``. Any other part
    yields an empty dict.

    Args:
        msg (email.message.Message): An e-mail message object.
        counter (int, optional): Used to generate the file-name
            (``part-NNN``) when none is found in the header. Default = 0.
        include_attachment_data (bool, optional): If true, the base64 encoded
            attachment data is included under ``'raw'``. Default = False.

    Returns:
        dict: Empty for non-attachment parts. Otherwise a single entry keyed
        by a generated UUID string, holding ``filename``, ``size``, ``hash``
        and ``content_header`` plus, when available, ``extension``,
        ``mime_type``, ``mime_type_short`` and ``raw``.
    """
    attachment = {}  # type: typing.Dict[str, typing.Any]

    # In case we hit bug 27257, try to downgrade the used policy. The headers
    # are fetched exactly once here and reused below, so the later
    # content-header collection cannot trip over the same bug again, and the
    # original policy is always restored.
    try:
        header_items = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            header_items = msg.items()
        finally:
            msg.policy = former_policy

    header_names = {name.lower() for name, _ in header_items}

    # Plain text parts without a content-disposition are body text, not attachments.
    if 'content-disposition' not in header_names and msg.get_content_maintype() == 'text':
        return attachment

    # if it's an attachment-type, pull out the filename
    # and calculate the size in bytes
    if msg.get_content_type() == 'message/rfc822':
        payload = msg.get_payload()
        if len(payload) > 1:
            logger.warning(
                'More than one payload for "message/rfc822" part detected. This is not supported, please report!')
        data = bytes(payload[0])
    else:
        data = msg.get_payload(decode=True)  # type: bytes  # type is always bytes here
    file_size = len(data)

    filename = msg.get_filename('')
    if filename == '':
        filename = 'part-{0:03d}'.format(counter)
    else:
        filename = eml_parser.decode.decode_field(filename)

    file_id = str(uuid.uuid1())
    entry = {'filename': filename, 'size': file_size}  # type: typing.Dict[str, typing.Any]

    # os.path always returns the extension as second element
    # in case there is no extension it returns an empty string
    extension = os.path.splitext(filename)[1].lower()
    if extension:
        # strip leading dot
        entry['extension'] = extension[1:]

    entry['hash'] = get_file_hash(data)

    if not (magic_mime is None or magic_none is None):
        mime_type = magic_none.buffer(data)
        mime_type_short = magic_mime.buffer(data)

        if not (mime_type is None or mime_type_short is None):
            entry['mime_type'] = mime_type
            entry['mime_type_short'] = mime_type_short
        else:
            logger.warning('Error determining attachment mime-type - "{}"'.format(file_id))

    if include_attachment_data:
        entry['raw'] = base64.b64encode(data)

    # Group header values by lower-cased header name; repeated headers keep
    # all their values in order of appearance.
    content_header = {}  # type: typing.Dict[str, typing.List[str]]
    for name, value in header_items:
        content_header.setdefault(name.lower(), []).append(str(value))
    entry['content_header'] = content_header

    attachment[file_id] = entry
    return attachment
def extract_message_content(mime_msg: email.message.Message) -> str:
    """Return the decoded text of the first ``text/*`` part of ``mime_msg``.

    The direct sub-parts of a multipart message are scanned in order; a
    non-multipart message is treated as its own single part. The
    transfer-decoded payload is decoded with the charset the part declares,
    defaulting to UTF-8 when none is declared. If the declared charset is
    unknown or does not fit the bytes, the payload is decoded as UTF-8 with
    undecodable bytes replaced rather than raising.

    Returns:
        The decoded text, or ``''`` when there is no text part.
    """
    # A non-multipart message has a string payload, not a list of parts, so
    # it is wrapped to be scanned like a single part.
    parts = mime_msg.get_payload() if mime_msg.is_multipart() else [mime_msg]

    for part in parts:
        if part.get_content_maintype() != 'text':
            continue

        raw = part.get_payload(decode=True) or b''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset)
        except (LookupError, ValueError):
            # LookupError: unknown codec name. ValueError covers
            # UnicodeDecodeError and malformed codec names.
            return raw.decode('utf-8', 'replace')

    return ''