def get_raw_body_text(
        msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect every text body part of an e-mail message.

    Multipart messages are walked in order and the results of all sub-parts
    are concatenated. A leaf part is collected when it is a ``text/*`` part
    without a ``Content-Disposition`` header, or when its file name ends in
    ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples of the form
        ``(encoding, raw_body_string, message field headers)``. ``encoding``
        is the lower-cased ``Content-Transfer-Encoding`` value (``''`` when
        the header is absent) and the headers are the result of
        ``msg.items()``.
    """
    raw_body = [
    ]  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
        )
        filename = ''

    is_inline_text = ('content-disposition' not in msg
                      and msg.get_content_maintype() == 'text')
    if not (is_inline_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Undo the transfer encoding once and reuse the bytes for every
    # decoding attempt below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name; keep whatever
            # survives an ASCII decode instead of dropping the part.
            logger.debug('An exception occured while decoding the payload!',
                         exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always hand the message back with its original policy, even
            # when the retry fails as well.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, headers))
    return raw_body
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """Recursively collect every text body part of an e-mail message.

    Multipart messages are walked in order and the results of all sub-parts
    are concatenated. A leaf part is collected when it is a ``text/*`` part
    without a ``Content-Disposition`` header, or when its file name ends in
    ``.htm``/``.html``.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: A list of tuples of the form
        ``(encoding, raw_body_string, message field headers)``. ``encoding``
        is the lower-cased ``Content-Transfer-Encoding`` value (``''`` when
        the header is absent) and the headers are the result of
        ``msg.items()``.
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any,typing.Any]]

    if msg.is_multipart():
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
        return raw_body

    # Treat text document attachments as belonging to the body of the mail.
    # Attachments with a file-extension of .htm/.html are implicitly treated
    # as text as well in order not to escape later checks (e.g. URL scan).
    try:
        filename = msg.get_filename('').lower()
    except (binascii.Error, AssertionError):
        logger.exception(
            'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
        filename = ''

    is_inline_text = 'content-disposition' not in msg and msg.get_content_maintype() == 'text'
    if not (is_inline_text or filename.endswith(('.html', '.htm'))):
        return raw_body

    encoding = msg.get('content-transfer-encoding', '').lower()
    charset = msg.get_content_charset()

    # Undo the transfer encoding once and reuse the bytes for every
    # decoding attempt below.
    payload = msg.get_payload(decode=True)

    if charset is None:
        raw_body_str = eml_parser.decode.decode_string(payload, None)
    else:
        try:
            raw_body_str = payload.decode(charset, 'ignore')
        except Exception:
            # Best effort: e.g. an unknown charset name; keep whatever
            # survives an ASCII decode instead of dropping the part.
            logger.debug('An exception occured while decoding the payload!', exc_info=True)
            raw_body_str = payload.decode('ascii', 'ignore')

    # In case we hit bug 27257, try to downgrade the used policy
    try:
        headers = msg.items()
    except AttributeError:
        former_policy = msg.policy
        msg.policy = email.policy.compat32
        try:
            headers = msg.items()
        finally:
            # Always hand the message back with its original policy, even
            # when the retry fails as well.
            msg.policy = former_policy

    raw_body.append((encoding, raw_body_str, headers))
    return raw_body
def _decode_part_text(part: email.message.Message) -> str:
    """Return the transfer-decoded payload of *part* as text.

    When the part declares no charset, UTF-8 is used: it is a superset of
    US-ASCII, the MIME default, so well-formed messages decode unchanged.
    """
    charset = part.get_content_charset() or 'utf-8'
    return part.get_payload(decode=True).decode(charset)


def decode_body(message: email.message.Message):
    """Extract the body text of an e-mail message.

    For a non-multipart message the decoded, whitespace-stripped payload is
    returned. For a multipart message the direct sub-parts are scanned in
    order, parts that carry a file name are skipped as attachments, and the
    first ``text/plain`` or ``text/html`` part decides the result.

    Args:
        message (email.message.Message): The message to read the body from.

    Returns:
        The stripped text for a plain-text body, the result of ``html(...)``
        for an HTML body, or ``None`` when a multipart message has no
        matching direct sub-part.

    Raises:
        LookupError: If the declared charset is unknown to Python.
        UnicodeDecodeError: If the payload is not valid in its charset.
    """
    if not message.is_multipart():
        return _decode_part_text(message).strip()

    # If the message comes in multiple portions (i.e. there are attachments)
    for part in message.get_payload():
        # Throw away attachments
        if part.get_filename():
            continue

        # Return an html object or plaintext. Returning on the first match
        # is fine because, even though the message is multipart, the only
        # usual suspects for the other parts are attachments, which are
        # skipped above.
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            return _decode_part_text(part).strip()
        if content_type == 'text/html':
            return html(_decode_part_text(part))

    # NOTE(review): nested multipart containers are not descended into, so a
    # message whose text lives only inside one yields None — confirm that
    # this is intended.
    return None