def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False, pconf: typing.Optional[dict] = None) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The parsed e-mail message to analyse.
      include_raw_body (bool, optional): If True, includes the raw body in the resulting
                       dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full attachment
                               data in the resulting dictionary. Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters,
                              e.g. whitelist IPs, whitelist e-mail addresses, etc.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
            key-value pairs.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # If no pconf was specified, default to empty dict
    pconf = pconf or {}

    # If no whitelisting is required, initialise the empty lists
    if 'whiteip' not in pconf:
        pconf['whiteip'] = []
    if 'whitefor' not in pconf:
        pconf['whitefor'] = []

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problems, report them
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to empty as a workaround.
        # logger.exception('We hit bug 27257!')
        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')
    # parse and decode "cc"; drop the key entirely when empty
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')
    # parse and decode "delivered-to"; drop the key entirely when empty
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode "date"; fall back to the epoch when absent or unparsable
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:  # TypeError was listed separately before, but Exception already covers it
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent...
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # Counter for potential duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Return dict of list
            #   date string
            #   from list
            #   for list
            #   by list
            #   with string
            #   warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required, collect the IP of the gateway that has injected the mail.
            # Iterate all parsed items and find IPs.
            # It is parsed from the MOST recent to the OLDEST (from IN > OUT),
            # matching the external IP from the most "OUT" found.
            # Warning: it may be spoofed!
            # A warning is added if multiple identical items are found.
            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry in by_item:
                            # Save the last found (most external) source
                            headers_struc['received_src'] = parsed_routing.get('from')

                            # Increment watched-by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[byhostentry] > 1:  # Header found twice...
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append(['Duplicate SMTP by entrypoint'])
                                else:
                                    parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received" headers; skip private and whitelisted ones
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(ip)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    logger.debug('Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domains; anything that parses as an IP was already
            # collected above, so only keep entries that are NOT valid IPs
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(m)  # type: ignore  # type of findall is list[str], so this is correct
                except ValueError:
                    # iff the IP parse fails, we add the entry as a domain
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses not already captured as "for" entries
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:  # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate "for" e-mails into one uniq'ed array for rapid lookup
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(headers_struc['received_email']))
    headers_struc['received_domain'] = list(set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']

    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(set(headers_struc['received_foremail']))

    if not headers_struc['received_domain']:
        del headers_struc['received_domain']

    if not headers_struc['received_ip']:
        del headers_struc['received_ip']

    ####################
    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}
    multipart = True  # Is it a multipart email ?
    if len(raw_body) == 1:
        multipart = False  # No, only "one" part

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on a 500K+ body we get time and memory issues...
        # if more than 4K, cheat: cut around the markers we search for ("://", "@", ".")
        # in order to reduce regex complexity.
        # NOTE: `findall` here is the module-level position-scanning helper, not re.findall.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[scn_pt - 11:scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 chars + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[scn_pt - 4:scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri, email and observed domain — or their hashes if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))
            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))
            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))
            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))
        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content-..." headers;
        # all other headers are in "header", but we need to convert header tuples into a dict:
        # "a","toto"           a: [toto,titi]
        # "a","titi"   --->    c: [truc]
        # "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We replace . with : to avoid issues in mongo
            k = k.lower().replace('.', ':')  # Lots of lowers, precompute :) .
            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:  # if not multipart, store only content-xx related headers with the part
                if k.startswith('content'):  # otherwise we would get all headers
                    k = k.lower().replace('.', ':')
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the "LAST" one, as thunderbird does.
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    # "a","toto"           a: [toto,titi]
    # "a","titi"   --->    c: [truc]
    # "c","truc"
    #
    for k in set(msg.keys()):
        # We replace . with : to avoid issues in mongo
        k = k.lower()  # Lots of lower, precompute...
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            # The field will be set to empty as a workaround.
            logger.exception('We hit bug 27257!')
            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(msg, k)
            # BUG FIX: the workaround values used to be merged into `header`
            # right here AND again by the unconditional merge below, which
            # duplicated every header value on this code path. The merge below
            # now handles both the normal and the workaround case exactly once.

        if decoded_values:
            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    try:
        report_struc['attachment'] = traverse_multipart(msg, 0, include_attachment_data)
    except (binascii.Error, AssertionError):
        # we hit this exception if the payload contains invalid data
        logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
        report_struc['attachment'] = None

    # Dirty hack... transform hash into list.. needs to be done in the function.
    # Mandatory to search efficiently in mongodb
    # See Bug 11 of eml_parser
    if not report_struc['attachment']:
        del report_struc['attachment']
    else:
        newattach = []
        for attachment in report_struc['attachment']:
            newattach.append(report_struc['attachment'][attachment])
        report_struc['attachment'] = newattach

    newbody = []
    for body in bodys_struc:
        newbody.append(bodys_struc[body])
    report_struc['body'] = newbody
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
def checkContainsArchive(msg: email.message.Message) -> bool:
    """Check whether the message is an Azazo transfer envelope.

    A message qualifies when both the project-name and the project-version
    marker headers are present and non-empty.

    Args:
        msg: The e-mail message to inspect.

    Returns:
        bool: True when both marker headers are set; False otherwise,
        including when header access raises (the traceback is printed and
        the error swallowed, preserving the original best-effort behaviour).
        BUG FIX: the previous version implicitly returned None on the
        negative and exception paths instead of False.
    """
    try:
        return bool(msg.get(Const.PROJECT_NAME_HEADER) and msg.get(Const.PROJECT_VERSION_HEADER))
    except Exception:
        traceback.print_exc()
        return False
def extract_files_from_email(message: email.message.Message) -> ParseMessageResult:
    """
    Parses an email Message and returns a ParseMessageResult:

    1. Is there a subject and from in the email? If no, return.
    2. Does the subject contain the word "unsubscribe"? If yes, return.
    3. Is the subject 8 letters long? If yes, return (registration code).
    4. Otherwise:
        - Walk through the message and grab all parts that match
          "application/pdf;" or "application/epub".
        - Return a ParseMessageResult with those files
          (SUCCESS when at least one file was found, FAILURE otherwise).
    """
    subject: str = message.get("Subject")
    sent_from: str = message.get("From")
    # NOTE(review): assert is stripped under `python -O`; kept as-is since
    # callers may rely on AssertionError being raised here.
    assert subject and sent_from

    if "unsubscribe" in subject.lower():
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.UNSUBSCRIBE,
            extracted_files=[],
        )

    # FIXME: need a more robust check here
    if len(subject) == 8:
        return ParseMessageResult(
            sent_from=sent_from,
            subject=subject,
            status=MessageStatus.REGISTER,
            extracted_files=[],
        )

    # Now we're done parsing the subject, check for attachments.
    # Content-type fragment -> fallback filename used when the part
    # carries no filename of its own.
    type_fallbacks = (
        ("application/pdf;", "Remailable_Attachment.pdf"),
        ("application/epub", "Remailable_Attachment.epub"),
    )
    files: List[FileTuple] = []
    for part in message.walk():
        # BUG FIX: part["Content-Type"] is None when the header is absent,
        # which used to raise TypeError on the `in` test below.
        content_type = part["Content-Type"] or ""
        for fragment, fallback_name in type_fallbacks:
            if fragment in content_type:
                filename = part.get_filename() or fallback_name
                filebytes = base64.b64decode(part.get_payload())
                files.append((filename, filebytes))
                break  # first matching type wins, as in the original if/elif

    return ParseMessageResult(
        sent_from=sent_from,
        subject=subject,
        status=MessageStatus.SUCCESS if files else MessageStatus.FAILURE,
        extracted_files=files,
    )
def get_raw_body_text(
        msg: email.message.Message
) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """This method recursively retrieves all e-mail body parts and returns them as a list.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: Returns a list of tuples which are in the form of
            "(content-transfer-encoding, raw_body_string, message field headers)"
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        # Recurse into every sub-part and flatten the results into one list.
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
    else:
        # Treat text document attachments as belonging to the body of the mail.
        # Attachments with a file-extension of .htm/.html are implicitely treated
        # as text as well in order not to escape later checks (e.g. URL scan).
        try:
            filename = msg.get_filename('').lower()
        except (binascii.Error, AssertionError):
            # Malformed content-disposition data can raise while decoding.
            logger.exception(
                'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.'
            )
            filename = ''

        # Only inline text parts (no content-disposition header) or parts whose
        # filename marks them as HTML are considered part of the body.
        if ('content-disposition' not in msg and msg.get_content_maintype() == 'text') \
                or (filename.endswith('.html') or filename.endswith('.htm')):
            encoding = msg.get('content-transfer-encoding', '').lower()

            charset = msg.get_content_charset()
            if charset is None:
                # No declared charset: delegate decoding/charset-guessing to the helper.
                # NOTE(review): get_payload(decode=True) may return None for some
                # payloads — presumably decode_string handles that; verify.
                raw_body_str = msg.get_payload(decode=True)
                raw_body_str = eml_parser.decode.decode_string(
                    raw_body_str, None)
            else:
                try:
                    raw_body_str = msg.get_payload(decode=True).decode(
                        charset, 'ignore')
                except Exception:
                    logger.debug(
                        'An exception occured while decoding the payload!',
                        exc_info=True)
                    # Fall back to a lossy ASCII decode.
                    raw_body_str = msg.get_payload(decode=True).decode(
                        'ascii', 'ignore')

            # In case we hit bug 27257, try to downgrade the used policy
            try:
                raw_body.append((encoding, raw_body_str, msg.items()))
            except AttributeError:
                former_policy = msg.policy
                msg.policy = email.policy.compat32
                raw_body.append((encoding, raw_body_str, msg.items()))
                msg.policy = former_policy

    return raw_body
def hash_headers(mail: email.message.Message, header_to_hash: str, bh: str) -> SHA256.SHA256Hash:
    """Build and SHA-256 hash the DKIM "signed headers" block of a message.

    Args:
        mail: The parsed e-mail message.
        header_to_hash: Colon-separated list of header names to sign, as in a
            DKIM h= tag, e.g.
            'from:from:reply-to:subject:subject:date:date:message-id:message-id:to:to:cc:content-type:content-type:content-transfer-encoding:content-transfer-encoding'.
            Duplicate names are only taken once here.
        bh: Base64-encoded body hash of the e-mail body. NOTE(review):
            currently unused by this function; kept for interface compatibility.

    Returns:
        A SHA256 hash object over the canonicalised header block — e.g.
        "from:Christian Schneider <*****@*****.**>\\r\\n..." — including the
        DKIM-Signature header with its b= value blanked.
    """
    header_to_hash_list = header_to_hash.split(":")

    # Take each header name exactly once, in first-appearance order.
    # BUG FIX: the previous version removed entries from the list *while*
    # iterating over it, which silently skipped elements (e.g. 'cc' in the
    # example h= tag above).
    headers = ""
    seen = set()
    for header in header_to_hash_list:
        if header in seen:
            continue
        seen.add(header)
        if mail[header]:
            headers += header.lower() + ":" + mail[header].strip() + "\r\n"

    # Unfold and whitespace-normalise the DKIM-Signature header.
    # NOTE(review): raises TypeError when no DKIM-Signature header is present —
    # confirm callers only pass DKIM-signed mail.
    dkim_header = mail.get("DKIM-Signature")
    dkim_header = re.sub(r'(\n|\r)', "", dkim_header)
    dkim_header = re.sub(r'\s+', " ", dkim_header)
    headers += "dkim-signature:{}\r\n".format(dkim_header)
    headers = re.sub(r'b=[\w0-9\s/+=]+', "b=", headers)  # replace b=... with b=

    # BUG FIX: removed a hard-coded
    # `assert hheader.hexdigest() == "5188ff42..."` debug leftover that made
    # this function fail for every message other than one specific test mail.
    return SHA256.new(headers.encode())
def get_by_msg(msg: email.message.Message, attr: str, decode=False) -> Union[bytes, str]:
    """Get a header value from *msg*, optionally RFC 2047-decoding it.

    Args:
        msg: The e-mail message.
        attr: Name of the header to fetch.
        decode: When True, decode an encoded-word header value
            (e.g. '=?utf-8?b?...?=') into text.

    Returns:
        The header value, decoded when ``decode`` is set, or None when the
        header is absent.
    """
    value = msg.get(attr)
    if decode and value is not None:
        # BUG FIX: email.header.decode_header() returns a *list* of
        # (fragment, charset) tuples; the old code unpacked the list itself
        # ("get, charset = decode_header(get)"), which raised ValueError for
        # any normally-encoded header. Use the first fragment, matching the
        # original intent.
        fragment, charset = email.header.decode_header(value)[0]
        if charset:
            fragment = fragment.decode(charset)
        value = fragment
    return value
def get_raw_body_text(msg: email.message.Message) -> typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]:
    """This method recursively retrieves all e-mail body parts and returns them as a list.

    Args:
        msg (email.message.Message): The actual e-mail message or sub-message.

    Returns:
        list: Returns a list of tuples which are in the form of
            "(content-transfer-encoding, raw_body_string, message field headers)"
    """
    raw_body = []  # type: typing.List[typing.Tuple[typing.Any, typing.Any, typing.Any]]

    if msg.is_multipart():
        # Recurse into every sub-part and flatten the results into one list.
        for part in msg.get_payload():  # type: ignore
            raw_body.extend(get_raw_body_text(part))  # type: ignore
    else:
        # Treat text document attachments as belonging to the body of the mail.
        # Attachments with a file-extension of .htm/.html are implicitely treated
        # as text as well in order not to escape later checks (e.g. URL scan).
        try:
            filename = msg.get_filename('').lower()
        except (binascii.Error, AssertionError):
            # Malformed content-disposition data can raise while decoding.
            logger.exception(
                'Exception occured while trying to parse the content-disposition header. Collected data will not be complete.')
            filename = ''

        # Only inline text parts (no content-disposition header) or parts whose
        # filename marks them as HTML are considered part of the body.
        if ('content-disposition' not in msg and msg.get_content_maintype() == 'text') or (
                filename.endswith('.html') or filename.endswith('.htm')):
            encoding = msg.get('content-transfer-encoding', '').lower()

            charset = msg.get_content_charset()
            if charset is None:
                # No declared charset: delegate decoding/charset-guessing to the helper.
                # NOTE(review): get_payload(decode=True) may return None for some
                # payloads — presumably decode_string handles that; verify.
                raw_body_str = msg.get_payload(decode=True)
                raw_body_str = eml_parser.decode.decode_string(raw_body_str, None)
            else:
                try:
                    raw_body_str = msg.get_payload(decode=True).decode(charset, 'ignore')
                except Exception:
                    logger.debug('An exception occured while decoding the payload!', exc_info=True)
                    # Fall back to a lossy ASCII decode.
                    raw_body_str = msg.get_payload(decode=True).decode('ascii', 'ignore')

            # In case we hit bug 27257, try to downgrade the used policy
            try:
                raw_body.append((encoding, raw_body_str, msg.items()))
            except AttributeError:
                former_policy = msg.policy
                msg.policy = email.policy.compat32
                raw_body.append((encoding, raw_body_str, msg.items()))
                msg.policy = former_policy

    return raw_body
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    Walks the message parts and returns (filename, filebytes) for the first
    PDF attachment found. When no PDF is present, an 8-character subject is
    treated as a registration code; otherwise ValueError is raised.

    TODO: This is the thing to change to accommodate more than one PDF per msg.
    """
    filename = None
    filebytes = None
    for part in message.walk():
        # BUG FIX: part["Content-Type"] is None when the header is missing,
        # which used to raise TypeError on the `in` test below.
        if "application/pdf;" in (part["Content-Type"] or ""):
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            # FIXME(review): returning True contradicts the declared
            # Tuple[str, bytes] return type; kept as-is for caller compatibility.
            return True
        else:
            raise ValueError("No PDF in this message.")
    return (filename, filebytes)
def from_email(cls, m: email.message.Message):
    """Build a Message record from a parsed e-mail, defaulting absent headers.

    Simple headers fall back to '', addresses are parsed with the stdlib
    email.utils helpers, and the date is normalised via _parsedate.
    NOTE(review): `cls` is unused — the original constructs `Message`
    directly rather than `cls(...)`; preserved as-is.
    """
    header = m.get  # every plain string field defaults to ''
    sender = email.utils.parseaddr(header("from", ""))
    recipient = email.utils.parseaddr(header("to", ""))
    cc_addresses = email.utils.getaddresses(m.get_all("cc", []))
    return Message(
        message_id=header("message-id", ""),
        in_reply_to=header("in-reply-to", ""),
        date=_parsedate(m),
        email_from=sender,
        email_to=recipient,
        carbon_copy=cc_addresses,
        subject=header("subject", ""),
        reply_to=header("reply-to", ""),
    )
def extract_pdf(message: email.message.Message) -> Tuple[str, bytes]:
    """
    Get a PDF from the email.

    Handles unsubscribe requests first, then walks the message for the first
    PDF attachment. When none is found, an 8-character subject is treated as
    a registration code; otherwise the sender is notified of the failure.

    Returns (filename, filebytes) on success and (False, False) on every
    non-attachment path — kept as-is for caller compatibility although it
    contradicts the declared return type.

    TODO: This is the thing to change to accommodate more than one PDF per msg.
    """
    # Handle unsubscribes:
    # BUG FIX: message.get("Subject") is None when the header is absent,
    # which used to crash on .lower().
    subject = message.get("Subject") or ""
    if "unsubscribe" in subject.lower():
        plog(f"Permanently removing user {message.get('From')}.")
        delete_user(message.get("From"))
        return (False, False)

    filename = None
    filebytes = None
    for part in message.walk():
        # BUG FIX: part["Content-Type"] is None when the header is missing,
        # which used to raise TypeError on the `in` test below.
        if "application/pdf;" in (part["Content-Type"] or ""):
            filename = part.get_filename() or "Remailable_Attachment.pdf"
            filebytes = base64.b64decode(part.get_payload())
            break
    else:
        # Let's try getting the subjectline and body and see if there's a code
        # for us to gobble up in there :)
        code = message.get("Subject")
        if code and len(code) == 8:
            register_user(message.get("From"), code)
            plog(f"Registered a new user {message.get('From')}.")
            send_email_if_enabled(
                message.get("From"),
                subject="Your email address is now verified!",
                message=
                "Your verification succeeded, and you can now email documents to your reMarkable tablet. Try responding to this email with a PDF attachment!",
            )
            return (False, False)
        else:
            send_email_if_enabled(
                message.get("From"),
                subject="A problem with your document :(",
                message=
                "Unfortunately, a problem occurred while processing your email. Remailable only supports PDF attachments for now. If you're still encountering issues, please get in touch with Jordan at [email protected] or on Twitter at @j6m8.",
            )
            plog(
                f"ERROR: Encountered no PDF in message from {message.get('From')}"
            )
            return (False, False)
    return (filename, filebytes)
def __init__(self, message: bytes, uid=None):
    """Parse a raw e-mail and populate this wrapper's fields.

    Args:
        message: Raw RFC 822 message bytes — it is immediately parsed with
            email.message_from_bytes, so the previous annotation of
            email.message.Message was incorrect.
        uid: Optional mailbox UID for this message.
    """
    self.msgraw = message  # keep the untouched raw bytes around
    message = email.message_from_bytes(message)
    # print(email.header.decode_header(message['subject']))
    self.uid = uid
    # Address-list headers: strip line breaks, turn tabs into spaces,
    # then split on ', ' into a list of addresses.
    self.recipients = message.get('to', '').replace('\n', '').replace(
        '\r', '').replace('\t', ' ').split(', ')
    self.sender = message.get('from', '')
    # soft_decode / resolve_attachments / decode_body are helpers defined on
    # the enclosing class (not visible here).
    self.subject = self.soft_decode(message.get('subject', ''))
    self.date = message.get('date', '')
    self.cc = message.get('cc', '').replace('\n', '').replace('\r', '').replace(
        '\t', ' ').split(', ')
    self.bcc = message.get('bcc', '').replace('\n', '').replace('\r', '').replace(
        '\t', ' ').split(', ')
    self.server_uid = message.get('message-id', '').strip()
    self.attachments = self.resolve_attachments(message)
    self.body = self.decode_body(message) or ''
    # 'html' when decode_body produced a BeautifulSoup tree, otherwise plain text.
    self.type = 'html' if isinstance(self.body, BeautifulSoup) else 'text'
def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False, pconf: typing.Optional[dict] = None, parse_attachments: bool = True) -> dict: """Parse an e-mail and return a dictionary containing the various parts of the e-mail broken down into key-value pairs. Args: msg (str): Raw EML e-mail string. include_raw_body (bool, optional): If True, includes the raw body in the resulting dictionary. Defaults to False. include_attachment_data (bool, optional): If True, includes the full attachment data in the resulting dictionary. Defaults to False. pconf (dict, optional): A dict with various optional configuration parameters, e.g. whitelist IPs, whitelist e-mail addresses, etc. parse_attachments (bool, optional): Set this to false if you want to disable the parsing of attachments. Please note that HTML attachments as well as other text data marked to be in-lined, will always be parsed. Returns: dict: A dictionary with the content of the EML parsed and broken down into key-value pairs. 
""" header = {} # type: typing.Dict[str, typing.Any] report_struc = {} # type: typing.Dict[str, typing.Any] # Final structure headers_struc = { } # type: typing.Dict[str, typing.Any] # header_structure bodys_struc = {} # type: typing.Dict[str, typing.Any] # body structure # If no pconf was specified, default to empty dict pconf = pconf or {} # If no whitelisting is required, set to emtpy list if 'whiteip' not in pconf: pconf['whiteip'] = [] # If no whitelisting is required, set to emtpy list if 'whitefor' not in pconf: pconf['whitefor'] = [] # parse and decode subject subject = msg.get('subject', '') headers_struc['subject'] = eml_parser.decode.decode_field(subject) # If parsing had problems, report it if msg.defects: headers_struc['defect'] = [] for exception in msg.defects: headers_struc['defect'].append(str(exception)) # parse and decode "from" # @TODO verify if this hack is necessary for other e-mail fields as well try: msg_header_field = str(msg.get('from', '')).lower() except (IndexError, AttributeError): # We have hit current open issue #27257 # https://bugs.python.org/issue27257 # The field will be set to emtpy as a workaround. 
# logger.exception('We hit bug 27257!') _from = eml_parser.decode.workaround_bug_27257(msg, 'from') msg.__delitem__('from') if _from: msg.add_header('from', _from[0]) __from = _from[0].lower() else: msg.add_header('from', '') __from = '' msg_header_field = __from if msg_header_field != '': m = eml_parser.regex.email_regex.search(msg_header_field) if m: headers_struc['from'] = m.group(1) else: from_ = email.utils.parseaddr(msg.get('from', '').lower()) headers_struc['from'] = from_[1] # parse and decode "to" headers_struc['to'] = headeremail2list(msg, 'to') # parse and decode "cc" headers_struc['cc'] = headeremail2list(msg, 'cc') if not headers_struc['cc']: headers_struc.pop('cc') # parse and decode delivered-to headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to') if not headers_struc['delivered_to']: headers_struc.pop('delivered_to') # parse and decode Date # If date field is present if 'date' in msg: try: headers_struc['date'] = eml_parser.decode.robust_string2date( msg.get('date')) except (TypeError, Exception): logger.warning('Error parsing date.') headers_struc['date'] = dateutil.parser.parse( '1970-01-01T00:00:00+0000') msg.replace_header('date', headers_struc['date']) else: # If date field is absent... headers_struc['date'] = dateutil.parser.parse( '1970-01-01T00:00:00+0000') # mail receiver path / parse any domain, e-mail # @TODO parse case where domain is specified but in parentheses only an IP headers_struc['received'] = [] headers_struc['received_email'] = [] headers_struc['received_domain'] = [] headers_struc['received_ip'] = [] try: found_smtpin = collections.Counter( ) # type: collections.Counter # Array for storing potential duplicate "HOP" for received_line in msg.get_all('received', []): line = str(received_line).lower() received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE) # Parse and split routing headers. 
# Return dict of list # date string # from list # for list # by list # with string # warning list parsed_routing = eml_parser.routing.parserouting( received_line_flat) # If required collect the IP of the gateway that have injected the mail. # Iterate all parsed item and find IP # It is parsed from the MOST recent to the OLDEST (from IN > Out) # We match external IP from the most "OUT" Found. # Warning .. It may be spoofed !! # It add a warning if multiple identical items are found. if pconf.get('byhostentry'): for by_item in parsed_routing.get('by', []): # type: ignore for byhostentry_ in pconf['byhostentry']: byhostentry = byhostentry_.lower() # print ("%s %s" % (byhostentry, by_item)) if byhostentry in by_item: # Save the last Found.. ( most external ) headers_struc['received_src'] = parsed_routing.get( 'from') # Increment watched by detection counter, and warn if needed found_smtpin[byhostentry] += 1 if found_smtpin[ byhostentry] > 1: # Twice found the header... if parsed_routing.get('warning'): parsed_routing['warning'].append( ['Duplicate SMTP by entrypoint']) else: parsed_routing['warning'] = [ 'Duplicate SMTP by entrypoint' ] headers_struc['received'].append(parsed_routing) # Parse IPs in "received headers" ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \ eml_parser.regex.ipv4_regex.findall(received_line_flat) for ip in ips_in_received_line: try: ip_obj = ipaddress.ip_address( ip ) # type: ignore # type of findall is list[str], so this is correct except ValueError: logger.debug( 'Invalid IP in received line - "{}"'.format(ip)) else: if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']): headers_struc['received_ip'].append(str(ip_obj)) # search for domain for m in eml_parser.regex.recv_dom_regex.findall( received_line_flat): try: ip_obj = ipaddress.ip_address( m ) # type: ignore # type of findall is list[str], so this is correct except ValueError: # we find IPs using the previous IP crawler, hence we ignore them # 
here. # iff the regex fails, we add the entry headers_struc['received_domain'].append(m) # search for e-mail addresses for mail_candidate in eml_parser.regex.email_regex.findall( received_line_flat): if mail_candidate not in parsed_routing.get('for', []): headers_struc['received_email'] += [mail_candidate] except TypeError: # Ready to parse email without received headers. logger.exception('Exception occured while parsing received lines.') # Concatenate for emails into one array | uniq # for rapid "find" headers_struc['received_foremail'] = [] if 'received' in headers_struc: for _parsed_routing in headers_struc['received']: for itemfor in _parsed_routing.get('for', []): if itemfor not in pconf['whitefor']: headers_struc['received_foremail'].append(itemfor) # Uniq data found headers_struc['received_email'] = list(set( headers_struc['received_email'])) headers_struc['received_domain'] = list( set(headers_struc['received_domain'])) headers_struc['received_ip'] = list(set(headers_struc['received_ip'])) # Clean up if empty if not headers_struc['received_email']: del headers_struc['received_email'] if 'received_foremail' in headers_struc: if not headers_struc['received_foremail']: del headers_struc['received_foremail'] else: headers_struc['received_foremail'] = list( set(headers_struc['received_foremail'])) if not headers_struc['received_domain']: del headers_struc['received_domain'] if not headers_struc['received_ip']: del headers_struc['received_ip'] #################### # Parse text body raw_body = get_raw_body_text(msg) if include_raw_body: bodys_struc['raw_body'] = raw_body bodys = {} # Is it a multipart email ? 
    # get_raw_body_text returns one entry per text part; a single entry
    # means the message was not multipart.
    if len(raw_body) == 1:
        multipart = False
    else:
        multipart = True

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        # NOTE(review): first tuple element looks like an encoding/subtype tag
        # that is intentionally ignored here - confirm against get_raw_body_text.
        _, body, body_multhead = body_tup
        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on 500K+ body we got time and memory issues...
        # if more than 4K.. lets cheat, we will cut around the thing we search "://, @, ."
        # in order to reduce regex complexity.
        # NOTE(review): `findall` below is a project helper (substring offset
        # scanner), NOT re.findall - confirm its definition elsewhere in the file.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                # Skip private ranges and whitelisted addresses.
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            # Large body: scan only small windows around anchor substrings.
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[scn_pt - 11:scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 Char + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[scn_pt - 4:scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri,email and observed domain or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))
            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))
            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))
            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))
        else:
            # Without the raw body only SHA-256 hashes of the observables are
            # reported (privacy-preserving, still searchable).
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers
        # all other headers are in "header"
        # but we need to convert header tuples in dict..
# "a","toto" a: [toto,titi] # "a","titi" ---> c: [truc] # "c","truc" ch = {} # type: typing.Dict[str, typing.List] for k, v in body_multhead: # make sure we are working with strings only v = str(v) # We are using replace . to : for avoiding issue in mongo k = k.lower().replace('.', ':') # Lot of lowers, pre-compute :) . # print v if multipart: if k in ch: ch[k].append(v) else: ch[k] = [v] else: # if not multipart, store only content-xx related header with part if k.startswith( 'content'): # otherwise, we got all header headers if k in ch: ch[k].append(v) else: ch[k] = [v] bodie['content_header'] = ch # Store content headers dict if include_raw_body: bodie['content'] = body # Sometimes bad people play with multiple header instances. # We "display" the "LAST" one .. as does thunderbird val = ch.get('content-type') if val: header_val = val[-1] bodie['content_type'] = header_val.split(';', 1)[0].strip() # Hash the body bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest() uid = str(uuid.uuid1()) bodys[uid] = bodie bodys_struc = bodys # Get all other bulk raw headers # "a","toto" a: [toto,titi] # "a","titi" ---> c: [truc] # "c","truc" # for k in set(msg.keys()): # We are using replace . to : for avoiding issue in mongo k = k.lower() # Lot of lower, precompute... decoded_values = [] try: for value in msg.get_all(k, []): if value: decoded_values.append(value) except (IndexError, AttributeError): # We have hit current open issue #27257 # https://bugs.python.org/issue27257 # The field will be set to emtpy as a workaround. 
logger.exception('We hit bug 27257!') decoded_values = eml_parser.decode.workaround_bug_27257_field_value( msg, k) if k in header: header[k] += decoded_values else: header[k] = decoded_values if decoded_values: if k in header: header[k] += decoded_values else: header[k] = decoded_values headers_struc['header'] = header # parse attachments if parse_attachments: try: report_struc['attachment'] = traverse_multipart( msg, 0, include_attachment_data) except (binascii.Error, AssertionError): # we hit this exception if the payload contains invalid data logger.exception( 'Exception occured while parsing attachment data. Collected data will not be complete!' ) report_struc['attachment'] = None # Dirty hack... transform hash into list.. need to be done in the function. # Mandatory to search efficiently in mongodb # See Bug 11 of eml_parser if not report_struc['attachment']: del report_struc['attachment'] else: newattach = [] for attachment in report_struc['attachment']: newattach.append(report_struc['attachment'][attachment]) report_struc['attachment'] = newattach newbody = [] for body in bodys_struc: newbody.append(bodys_struc[body]) report_struc['body'] = newbody # End of dirty hack # Get all other bulk headers report_struc['header'] = headers_struc return report_struc
def _parsedate(m: email.message.Message) -> datetime.datetime: parsed = email.utils.parsedate_to_datetime(m.get("date", "")) return parsed.astimezone(datetime.timezone.utc)