def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The e-mail message object to parse.
      include_raw_body (bool, optional): If True, includes the raw body in the
        resulting dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full
        attachment data in the resulting dictionary. Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters,
        e.g. whitelist IPs, whitelist e-mail addresses, etc.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
        key-value pairs.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # If no pconf was specified, default to empty dict
    pconf = pconf or {}

    # If no IP whitelisting is required, initiate the empty list
    if 'whiteip' not in pconf:
        pconf['whiteip'] = []
    # If no "for" whitelisting is required, initiate the empty list
    if 'whitefor' not in pconf:
        pconf['whitefor'] = []

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problems, report them
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to empty as a workaround.
        # logger.exception('We hit bug 27257!')
        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')

    # parse and decode "cc"
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:  # TypeError was listed separately but is an Exception subclass anyway
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent, fall back to the epoch
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # Counter for detecting duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Returns dict of list:
            #   date string, from list, for list, by list, with string, warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required, collect the IP of the gateway that has injected the mail.
            # Iterated from the MOST recent to the OLDEST (from IN > OUT);
            # we match the external IP from the most "OUT" found.
            # Warning: it may be spoofed!
            # A warning is added if multiple identical items are found.
            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry in by_item:
                            # Save the last found (most external)
                            headers_struc['received_src'] = parsed_routing.get('from')
                            # Increment watched-by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[byhostentry] > 1:  # Header found twice...
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append(['Duplicate SMTP by entrypoint'])
                                else:
                                    parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received" headers
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(ip)  # type: ignore  # findall returns list[str], so this is correct
                except ValueError:
                    logger.debug('Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domains
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(m)  # type: ignore  # findall returns list[str], so this is correct
                except ValueError:
                    # IPs were already collected by the previous IP crawler, hence
                    # we only keep entries the IP parser rejects: those are domains.
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:
        # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate "for" emails into one array | uniq, for rapid "find"
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(headers_struc['received_email']))
    headers_struc['received_domain'] = list(set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']
    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(set(headers_struc['received_foremail']))
    if not headers_struc['received_domain']:
        del headers_struc['received_domain']
    if not headers_struc['received_ip']:
        del headers_struc['received_ip']

    ####################
    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}
    multipart = True  # Is it a multipart email?
    if len(raw_body) == 1:
        multipart = False  # No, only "one" part

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup

        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on a 500K+ body we get time and memory issues...
        # If more than 4K, cheat: cut around the anchors we search for ("://", "@", ".")
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[scn_pt - 11:scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 chars + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[scn_pt - 4:scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri, email and observed domain, or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))
            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))
            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))
            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))
        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers;
        # all other headers are in "header",
        # but we need to convert header tuples into a dict:
        #   "a","toto"           a: [toto,titi]
        #   "a","titi"   --->    c: [truc]
        #   "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are replacing "." with ":" to avoid issues in mongo
            k = k.lower().replace('.', ':')  # Lots of lowers, precompute :)

            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:
                # if not multipart, store only content-xx related headers with the part
                if k.startswith('content'):  # otherwise, we'd get all the headers
                    # NOTE: `k` was re-normalized here a second time in the original;
                    # the operation is idempotent, so the redundant call was dropped.
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the LAST one, as does Thunderbird.
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    #   "a","toto"           a: [toto,titi]
    #   "a","titi"   --->    c: [truc]
    #   "c","truc"
    for k in set(msg.keys()):
        # We are lowering the key to avoid case issues in mongo
        k = k.lower()  # Lots of lowers, precompute...
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            logger.exception('We hit bug 27257!')
            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(msg, k)
            # BUGFIX: the values were previously merged into `header` both here
            # AND in the block below, duplicating every value on this code path.
            # The merge below is now the only one.

        if decoded_values:
            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    try:
        report_struc['attachment'] = traverse_multipart(msg, 0, include_attachment_data)
    except (binascii.Error, AssertionError):
        # we hit this exception if the payload contains invalid data
        logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
        report_struc['attachment'] = None

    # Dirty hack... transform hash into list... needs to be done in the function.
    # Mandatory to search efficiently in mongodb.
    # See Bug 11 of eml_parser.
    if not report_struc['attachment']:
        del report_struc['attachment']
    else:
        newattach = []
        for attachment in report_struc['attachment']:
            newattach.append(report_struc['attachment'][attachment])
        report_struc['attachment'] = newattach

    newbody = []
    for body in bodys_struc:
        newbody.append(bodys_struc[body])
    report_struc['body'] = newbody
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc
def headeremail2list(mail: email.message.Message, header: str) -> typing.List[str]:
    """Parse a given header field containing e-mail addresses into a list of
    e-mail address strings.

    Args:
        mail (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: A list of strings representing e-mail addresses.
    """
    try:
        addresses = email.utils.getaddresses(mail.get_all(header, []))
    except (IndexError, AttributeError):
        # Work around https://bugs.python.org/issue27257 by extracting the raw
        # values through the compat32 policy before address-parsing them.
        addresses = email.utils.getaddresses(eml_parser.decode.workaround_bug_27257(mail, header))

    collected = []
    for _realname, address in addresses:
        if address == '':
            continue
        if eml_parser.regex.parsing_email_force_tld:
            # Only keep addresses whose domain part carries a TLD.
            if eml_parser.regex.email_force_tld_regex.match(address):
                collected.append(address.lower())
        else:
            collected.append(address.lower())

    return collected
def workaround_bug_27257_field_value(msg: email.message.Message, header: str) -> typing.List[str]:
    """Work around Python bug 27257 by extracting a header's values as best as
    possible using the compat32 policy.

    The message's policy is temporarily switched to compat32 and restored
    afterwards, even if the extraction raises (the original code left the
    message on compat32 in that case).

    Args:
        msg (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: An extracted list of non-empty string values.
    """
    if msg.policy == email.policy.compat32:  # type: ignore
        # Already on compat32 -> nothing to restore later.
        new_policy = None
    else:
        new_policy = msg.policy  # type: ignore

    msg.policy = email.policy.compat32  # type: ignore

    try:
        return_value = []
        for value in msg.get_all(header, []):
            if value != '':
                return_value.append(value)
    finally:
        # BUGFIX: restore the original policy even on error; previously an
        # exception would leave the message stuck on compat32.
        if new_policy is not None:
            msg.policy = new_policy  # type: ignore

    return return_value
def headeremail2list(mail: email.message.Message, header: str) -> typing.List[str]:
    """Parse a given header field containing e-mail addresses into a list of
    e-mail address strings.

    NOTE(review): this definition duplicates an earlier ``headeremail2list``
    in this file; in Python the later definition wins.

    Args:
        mail (email.message.Message): An e-mail message object.
        header (str): The header field to decode.

    Returns:
        list: A list of strings representing e-mail addresses.
    """
    try:
        pairs = email.utils.getaddresses(mail.get_all(header, []))
    except (IndexError, AttributeError):
        # Fall back to the bug-27257 workaround (https://bugs.python.org/issue27257).
        pairs = email.utils.getaddresses(eml_parser.decode.workaround_bug_27257(mail, header))

    result = []
    for pair in pairs:
        addr = pair[1]
        if addr == '':
            continue
        if not eml_parser.regex.parsing_email_force_tld:
            result.append(addr.lower())
        elif eml_parser.regex.email_force_tld_regex.match(addr):
            # TLD enforcement enabled: keep only addresses matching the regex.
            result.append(addr.lower())

    return result
def from_email(cls, m: email.message.Message):
    """Build a ``Message`` record from a parsed e-mail message object.

    Missing headers default to the empty string (or empty list for "cc").

    Args:
        m (email.message.Message): The source e-mail message.

    Returns:
        Message: A populated message record.
    """
    msg_date = _parsedate(m)
    sender = email.utils.parseaddr(m.get("from", ""))
    recipient = email.utils.parseaddr(m.get("to", ""))
    cc_addresses = email.utils.getaddresses(m.get_all("cc", []))

    return Message(
        message_id=m.get("message-id", ""),
        in_reply_to=m.get("in-reply-to", ""),
        date=msg_date,
        email_from=sender,
        email_to=recipient,
        carbon_copy=cc_addresses,
        subject=m.get("subject", ""),
        reply_to=m.get("reply-to", ""),
    )
def parse_email(msg: email.message.Message, include_raw_body: bool = False, include_attachment_data: bool = False,
                pconf: typing.Optional[dict] = None, parse_attachments: bool = True) -> dict:
    """Parse an e-mail and return a dictionary containing the various parts of
    the e-mail broken down into key-value pairs.

    Args:
      msg (email.message.Message): The e-mail message object to parse.
      include_raw_body (bool, optional): If True, includes the raw body in the
        resulting dictionary. Defaults to False.
      include_attachment_data (bool, optional): If True, includes the full
        attachment data in the resulting dictionary. Defaults to False.
      pconf (dict, optional): A dict with various optional configuration parameters,
        e.g. whitelist IPs, whitelist e-mail addresses, etc.
      parse_attachments (bool, optional): Set this to False if you want to disable the
        parsing of attachments. Please note that HTML attachments as well as other
        text data marked to be in-lined will always be parsed.

    Returns:
      dict: A dictionary with the content of the EML parsed and broken down into
        key-value pairs.
    """
    header = {}  # type: typing.Dict[str, typing.Any]
    report_struc = {}  # type: typing.Dict[str, typing.Any]  # Final structure
    headers_struc = {}  # type: typing.Dict[str, typing.Any]  # header structure
    bodys_struc = {}  # type: typing.Dict[str, typing.Any]  # body structure

    # If no pconf was specified, default to empty dict
    pconf = pconf or {}

    # If no whitelisting is required, set to empty list
    if 'whiteip' not in pconf:
        pconf['whiteip'] = []
    if 'whitefor' not in pconf:
        pconf['whitefor'] = []

    # parse and decode subject
    subject = msg.get('subject', '')
    headers_struc['subject'] = eml_parser.decode.decode_field(subject)

    # If parsing had problems, report them
    if msg.defects:
        headers_struc['defect'] = []
        for exception in msg.defects:
            headers_struc['defect'].append(str(exception))

    # parse and decode "from"
    # @TODO verify if this hack is necessary for other e-mail fields as well
    try:
        msg_header_field = str(msg.get('from', '')).lower()
    except (IndexError, AttributeError):
        # We have hit current open issue #27257
        # https://bugs.python.org/issue27257
        # The field will be set to empty as a workaround.
        # logger.exception('We hit bug 27257!')
        _from = eml_parser.decode.workaround_bug_27257(msg, 'from')
        msg.__delitem__('from')

        if _from:
            msg.add_header('from', _from[0])
            __from = _from[0].lower()
        else:
            msg.add_header('from', '')
            __from = ''

        msg_header_field = __from

    if msg_header_field != '':
        m = eml_parser.regex.email_regex.search(msg_header_field)
        if m:
            headers_struc['from'] = m.group(1)
        else:
            from_ = email.utils.parseaddr(msg.get('from', '').lower())
            headers_struc['from'] = from_[1]

    # parse and decode "to"
    headers_struc['to'] = headeremail2list(msg, 'to')

    # parse and decode "cc"
    headers_struc['cc'] = headeremail2list(msg, 'cc')
    if not headers_struc['cc']:
        headers_struc.pop('cc')

    # parse and decode delivered-to
    headers_struc['delivered_to'] = headeremail2list(msg, 'delivered-to')
    if not headers_struc['delivered_to']:
        headers_struc.pop('delivered_to')

    # parse and decode Date
    if 'date' in msg:
        try:
            headers_struc['date'] = eml_parser.decode.robust_string2date(msg.get('date'))
        except Exception:  # TypeError was listed separately but is an Exception subclass anyway
            logger.warning('Error parsing date.')
            headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')
            msg.replace_header('date', headers_struc['date'])
    else:
        # If date field is absent, fall back to the epoch
        headers_struc['date'] = dateutil.parser.parse('1970-01-01T00:00:00+0000')

    # mail receiver path / parse any domain, e-mail
    # @TODO parse case where domain is specified but in parentheses only an IP
    headers_struc['received'] = []
    headers_struc['received_email'] = []
    headers_struc['received_domain'] = []
    headers_struc['received_ip'] = []
    try:
        found_smtpin = collections.Counter()  # type: collections.Counter  # Counter for detecting duplicate "HOP"

        for received_line in msg.get_all('received', []):
            line = str(received_line).lower()

            received_line_flat = re.sub(r'(\r|\n|\s|\t)+', ' ', line, flags=re.UNICODE)

            # Parse and split routing headers.
            # Returns dict of list:
            #   date string, from list, for list, by list, with string, warning list
            parsed_routing = eml_parser.routing.parserouting(received_line_flat)

            # If required, collect the IP of the gateway that has injected the mail.
            # Iterated from the MOST recent to the OLDEST (from IN > OUT);
            # we match the external IP from the most "OUT" found.
            # Warning: it may be spoofed!
            # A warning is added if multiple identical items are found.
            if pconf.get('byhostentry'):
                for by_item in parsed_routing.get('by', []):  # type: ignore
                    for byhostentry_ in pconf['byhostentry']:
                        byhostentry = byhostentry_.lower()
                        if byhostentry in by_item:
                            # Save the last found (most external)
                            headers_struc['received_src'] = parsed_routing.get('from')
                            # Increment watched-by detection counter, and warn if needed
                            found_smtpin[byhostentry] += 1
                            if found_smtpin[byhostentry] > 1:  # Header found twice...
                                if parsed_routing.get('warning'):
                                    parsed_routing['warning'].append(['Duplicate SMTP by entrypoint'])
                                else:
                                    parsed_routing['warning'] = ['Duplicate SMTP by entrypoint']

            headers_struc['received'].append(parsed_routing)

            # Parse IPs in "received" headers
            ips_in_received_line = eml_parser.regex.ipv6_regex.findall(received_line_flat) + \
                eml_parser.regex.ipv4_regex.findall(received_line_flat)
            for ip in ips_in_received_line:
                try:
                    ip_obj = ipaddress.ip_address(ip)  # type: ignore  # findall returns list[str], so this is correct
                except ValueError:
                    logger.debug('Invalid IP in received line - "{}"'.format(ip))
                else:
                    if not (ip_obj.is_private or str(ip_obj) in pconf['whiteip']):
                        headers_struc['received_ip'].append(str(ip_obj))

            # search for domains
            for m in eml_parser.regex.recv_dom_regex.findall(received_line_flat):
                try:
                    ip_obj = ipaddress.ip_address(m)  # type: ignore  # findall returns list[str], so this is correct
                except ValueError:
                    # IPs were already collected by the previous IP crawler, hence
                    # we only keep entries the IP parser rejects: those are domains.
                    headers_struc['received_domain'].append(m)

            # search for e-mail addresses
            for mail_candidate in eml_parser.regex.email_regex.findall(received_line_flat):
                if mail_candidate not in parsed_routing.get('for', []):
                    headers_struc['received_email'] += [mail_candidate]

    except TypeError:
        # Ready to parse email without received headers.
        logger.exception('Exception occured while parsing received lines.')

    # Concatenate "for" emails into one array | uniq, for rapid "find"
    headers_struc['received_foremail'] = []
    if 'received' in headers_struc:
        for _parsed_routing in headers_struc['received']:
            for itemfor in _parsed_routing.get('for', []):
                if itemfor not in pconf['whitefor']:
                    headers_struc['received_foremail'].append(itemfor)

    # Uniq data found
    headers_struc['received_email'] = list(set(headers_struc['received_email']))
    headers_struc['received_domain'] = list(set(headers_struc['received_domain']))
    headers_struc['received_ip'] = list(set(headers_struc['received_ip']))

    # Clean up if empty
    if not headers_struc['received_email']:
        del headers_struc['received_email']
    if 'received_foremail' in headers_struc:
        if not headers_struc['received_foremail']:
            del headers_struc['received_foremail']
        else:
            headers_struc['received_foremail'] = list(set(headers_struc['received_foremail']))
    if not headers_struc['received_domain']:
        del headers_struc['received_domain']
    if not headers_struc['received_ip']:
        del headers_struc['received_ip']

    ####################
    # Parse text body
    raw_body = get_raw_body_text(msg)

    if include_raw_body:
        bodys_struc['raw_body'] = raw_body

    bodys = {}

    # Is it a multipart email?
    if len(raw_body) == 1:
        multipart = False
    else:
        multipart = True

    for body_tup in raw_body:
        bodie = {}  # type: typing.Dict[str, typing.Any]
        _, body, body_multhead = body_tup

        # Parse any URLs and mail found in the body
        list_observed_urls = []  # type: typing.List[str]
        list_observed_email = []  # type: typing.List[str]
        list_observed_dom = []  # type: typing.List[str]
        list_observed_ip = []  # type: typing.List[str]

        # If we start directly a findall on a 500K+ body we get time and memory issues...
        # If more than 4K, cheat: cut around the anchors we search for ("://", "@", ".")
        # in order to reduce regex complexity.
        if len(body) < 4096:
            list_observed_urls = get_uri_ondata(body)
            for match in eml_parser.regex.email_regex.findall(body):
                list_observed_email.append(match.lower())
            for match in eml_parser.regex.dom_regex.findall(body):
                list_observed_dom.append(match.lower())
            for match in eml_parser.regex.ipv4_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match not in pconf['whiteip']:
                        list_observed_ip.append(match)
            for match in eml_parser.regex.ipv6_regex.findall(body):
                if not eml_parser.regex.priv_ip_regex.match(match):
                    if match.lower() not in pconf['whiteip']:
                        list_observed_ip.append(match.lower())
        else:
            for scn_pt in findall('://', body):
                list_observed_urls = get_uri_ondata(body[scn_pt - 16:scn_pt + 4096]) + list_observed_urls

            for scn_pt in findall('@', body):
                # RFC 3696, 5322, 5321 for email size limitations
                for match in eml_parser.regex.email_regex.findall(body[scn_pt - 64:scn_pt + 255]):
                    list_observed_email.append(match.lower())

            for scn_pt in findall('.', body):
                # The maximum length of a fqdn, not a hostname, is 1004 characters RFC1035
                # The maximum length of a hostname is 253 characters. Imputed from RFC952, RFC1123 and RFC1035.
                for match in eml_parser.regex.dom_regex.findall(body[scn_pt - 253:scn_pt + 1004]):
                    list_observed_dom.append(match.lower())

                # Find IPv4 addresses
                for match in eml_parser.regex.ipv4_regex.findall(body[scn_pt - 11:scn_pt + 3]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match not in pconf['whiteip']:
                            list_observed_ip.append(match)

            for scn_pt in findall(':', body):
                # The maximum length of IPv6 is 32 chars + 7 ":"
                for match in eml_parser.regex.ipv6_regex.findall(body[scn_pt - 4:scn_pt + 35]):
                    if not eml_parser.regex.priv_ip_regex.match(match):
                        if match.lower() not in pconf['whiteip']:
                            list_observed_ip.append(match.lower())

        # Report uri, email and observed domain, or hash if no raw body
        if include_raw_body:
            if list_observed_urls:
                bodie['uri'] = list(set(list_observed_urls))
            if list_observed_email:
                bodie['email'] = list(set(list_observed_email))
            if list_observed_dom:
                bodie['domain'] = list(set(list_observed_dom))
            if list_observed_ip:
                bodie['ip'] = list(set(list_observed_ip))
        else:
            if list_observed_urls:
                bodie['uri_hash'] = []
                for uri in list(set(list_observed_urls)):
                    bodie['uri_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_email:
                bodie['email_hash'] = []
                for emel in list(set(list_observed_email)):
                    # Email already lowered
                    bodie['email_hash'].append(wrap_hash_sha256(emel))
            if list_observed_dom:
                bodie['domain_hash'] = []
                for uri in list(set(list_observed_dom)):
                    bodie['domain_hash'].append(wrap_hash_sha256(uri.lower()))
            if list_observed_ip:
                bodie['ip_hash'] = []
                for fip in list(set(list_observed_ip)):
                    # IP (v6) already lowered
                    bodie['ip_hash'].append(wrap_hash_sha256(fip))

        # For mail without multipart we will only get the "content....something" headers;
        # all other headers are in "header",
        # but we need to convert header tuples into a dict:
        #   "a","toto"           a: [toto,titi]
        #   "a","titi"   --->    c: [truc]
        #   "c","truc"
        ch = {}  # type: typing.Dict[str, typing.List]
        for k, v in body_multhead:
            # make sure we are working with strings only
            v = str(v)

            # We are replacing "." with ":" to avoid issues in mongo
            k = k.lower().replace('.', ':')  # Lots of lowers, pre-compute :)

            if multipart:
                if k in ch:
                    ch[k].append(v)
                else:
                    ch[k] = [v]
            else:
                # if not multipart, store only content-xx related headers with the part
                if k.startswith('content'):  # otherwise, we'd get all the headers
                    if k in ch:
                        ch[k].append(v)
                    else:
                        ch[k] = [v]
        bodie['content_header'] = ch  # Store content headers dict

        if include_raw_body:
            bodie['content'] = body

        # Sometimes bad people play with multiple header instances.
        # We "display" the LAST one, as does Thunderbird.
        val = ch.get('content-type')
        if val:
            header_val = val[-1]
            bodie['content_type'] = header_val.split(';', 1)[0].strip()

        # Hash the body
        bodie['hash'] = hashlib.sha256(body.encode('utf-8')).hexdigest()

        uid = str(uuid.uuid1())
        bodys[uid] = bodie

    bodys_struc = bodys

    # Get all other bulk raw headers
    #   "a","toto"           a: [toto,titi]
    #   "a","titi"   --->    c: [truc]
    #   "c","truc"
    for k in set(msg.keys()):
        # We are lowering the key to avoid case issues in mongo
        k = k.lower()  # Lots of lowers, precompute...
        decoded_values = []

        try:
            for value in msg.get_all(k, []):
                if value:
                    decoded_values.append(value)
        except (IndexError, AttributeError):
            # We have hit current open issue #27257
            # https://bugs.python.org/issue27257
            logger.exception('We hit bug 27257!')
            decoded_values = eml_parser.decode.workaround_bug_27257_field_value(msg, k)
            # BUGFIX: the values were previously merged into `header` both here
            # AND in the block below, duplicating every value on this code path.
            # The merge below is now the only one.

        if decoded_values:
            if k in header:
                header[k] += decoded_values
            else:
                header[k] = decoded_values

    headers_struc['header'] = header

    # parse attachments
    if parse_attachments:
        try:
            report_struc['attachment'] = traverse_multipart(msg, 0, include_attachment_data)
        except (binascii.Error, AssertionError):
            # we hit this exception if the payload contains invalid data
            logger.exception('Exception occured while parsing attachment data. Collected data will not be complete!')
            report_struc['attachment'] = None

        # Dirty hack... transform hash into list... needs to be done in the function.
        # Mandatory to search efficiently in mongodb.
        # See Bug 11 of eml_parser.
        # NOTE: this cleanup must stay inside the parse_attachments guard, since
        # 'attachment' is only set when attachments are parsed.
        if not report_struc['attachment']:
            del report_struc['attachment']
        else:
            newattach = []
            for attachment in report_struc['attachment']:
                newattach.append(report_struc['attachment'][attachment])
            report_struc['attachment'] = newattach

    newbody = []
    for body in bodys_struc:
        newbody.append(bodys_struc[body])
    report_struc['body'] = newbody
    # End of dirty hack

    # Get all other bulk headers
    report_struc['header'] = headers_struc

    return report_struc