def make_person_schema(mailFile, outputDir, person_db):
    """Record the people referenced by one mail in ``person_db``.

    The sender (``From``) is stored as a person of its own.  The recipient
    identity is built from ``Delivered-To``, gains any extra address found in
    ``X-Original-To`` and ``Resent-From``, is stored as well, and is finally
    linked to everybody listed in the ``To`` and ``Cc`` headers.

    Args:
        mailFile: Binary file object handed to ``BytesParser().parse``.
        outputDir: Not used by this function.
        person_db: Database object passed through to ``update_db`` and
            ``link_people``.
    """
    message = BytesParser().parse(mailFile)

    # The author of the mail goes straight into the database.
    sender_name, sender_addr = get_info_from_mail_field(message['from'])
    update_db(person_db, Person(sender_name, sender_addr))

    # Our own identity, plus every alias the delivery headers reveal.
    own_name, own_addr = get_info_from_mail_field(message['Delivered-To'])
    me = Person(own_name, own_addr)
    for alias_header in ('X-Original-To', 'Resent-From'):
        _, alias_addr = get_info_from_mail_field(message[alias_header])
        if alias_addr:
            me.addEmail(alias_addr)
    update_db(person_db, me)

    # Relate ourselves to the direct and the carbon-copy recipients.
    for recipient_header in ('to', 'cc'):
        link_people(person_db, me, message.get_all(recipient_header, []))
def test_prepend_headerfields_as_header_objs():
    """Header values that are not plain ``str`` objects survive prepending."""
    raw_message = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=compat32).parsebytes(raw_message)

    # Under compat32 the non-ASCII subject is not handed back as a plain str.
    first_subject = msg.get_all("Subject")[0]
    assert not isinstance(first_subject, str)

    new_fields = [("To", "foo"), ("From", "bar")]
    result = pgp.prepend_header_fields(msg, new_fields)

    expected_items = [
        ('To', 'foo'),
        ('From', 'bar'),
        ('Subject', '=?unknown-8bit?b?ZsO2w7Y=?='),
    ]
    assert result.items() == expected_items
def test_prepend_headerfields_encoded():
    """Non-ASCII header text given as raw bytes survives prepending."""
    raw_message = 'Subject: föö'.encode('utf-8')
    msg = BytesParser(policy=default_policy).parsebytes(raw_message)

    # With this policy the subject compares equal to the decoded text.
    first_subject = msg.get_all("Subject")[0]
    assert first_subject == "föö"

    new_fields = [("To", "foo"), ("From", "bar")]
    result = pgp.prepend_header_fields(msg, new_fields)

    expected_items = [
        ('To', 'foo'),
        ('From', 'bar'),
        ('Subject', 'föö'),
    ]
    assert result.items() == expected_items
def __call__(self, content):
    '''Parse an email message given as bytes or as a string.

    /content/    Standard encoded email message content (bytes or str).

    Returns the parsed message as a dict with the keys subject, date,
    received, body, html, from, to, cc, bcc and attachments.  "body" and
    "html" hold the concatenated text of every text/plain and text/html
    part respectively, or None when the message has no such part.
    '''
    if isinstance(content, bytes):
        msgobj = BytesParser().parsebytes(content)
    else:
        msgobj = StrParser().parse(StringIO(content))

    subject = parse_header('Subject', msgobj)
    date = parse_header('Date', msgobj)

    received = []
    for hop in (msgobj.get_all('Received') or []):
        # split() yields [prefix, key, value, key, value, ...]; this assumes
        # self.re_received has exactly one capturing group -- TODO confirm.
        pieces = self.re_received.split(hop)
        fields = dict(zip(pieces[1::2], [value.strip() for value in pieces[2::2]]))
        stamp = fields.get(';')
        if stamp:
            fields['time'] = parse_time(stamp)
        received.append(fields)

    fromaddr = parse_addr(msgobj, 'From')
    if date:
        date = date.replace(',', '')
    logger.debug('Parsing message: Date={0}, Subject={1}'.format(date, subject))

    #-------- Parsing attachments:
    attachments = []
    body = None
    html = None
    for part in msgobj.walk():
        attachment = parse_attachment(part)
        if attachment:
            attachments.append(attachment)
            continue

        content_type = part.get_content_type()
        if content_type not in ('text/plain', 'text/html'):
            logger.debug('Ignored: Content_type "{0}" in message "{1}" from {2}, Date={3}'.format(content_type, subject, fromaddr, date))
            continue

        raw = part.get_payload(decode=True) or b''
        charset = part.get_content_charset() or 'ascii'
        try:
            text = str(raw, charset, 'replace')
        except LookupError:
            # The declared charset is unknown to Python; keep what is
            # decodable as ASCII rather than failing the whole message.
            text = str(raw, 'ascii', 'replace')

        # Keep the decoded text as-is: calling str() on re-encoded bytes
        # would store the literal repr "b'...'" instead of the content.
        if content_type == 'text/plain':
            body = (body or '') + text
        else:
            html = (html or '') + text

    return {
        'subject': subject,
        'date': date,
        'received': received,
        # 'received': sorted(received, key=lambda k: k['time']),
        'body': body,
        'html': html,
        'from': fromaddr,
        'to': parse_addr(msgobj, 'To'),
        'cc': parse_addr(msgobj, 'CC'),
        'bcc': parse_addr(msgobj, 'BCC'),
        'attachments': attachments
    }
def email_analysis(filename, exclude_private_ip):
    """Extract indicators from an e-mail file and print them as JSON.

    The message is parsed with ``policy.default``.  Sender headers, relay
    hops taken from the ``Received`` headers, URLs found in the text parts,
    the registered domains of those URLs and the attachment file names are
    collected into ``{"data": [entry]}`` and printed with
    ``json.dumps(..., indent=4)``.  Nothing is printed when the parsed
    message is falsy.

    Args:
        filename: Path of the e-mail file; it is opened in binary mode.
        exclude_private_ip: When true, relay addresses that ``ipaddress``
            reports as private are left out of ``relay_ip``.

    Returns:
        None.
    """
    address_re = re.compile(
        r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}", re.IGNORECASE)
    hop_re = re.compile(
        r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
        re.DOTALL | re.X)
    ipv4_re = re.compile(r"[0-9]+(?:\.[0-9]+){3}", re.DOTALL | re.X)
    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
    ipv6_re = re.compile(
        r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
        re.DOTALL | re.X)
    # Address literals are written "[IPv6:...]"; without removing the tag the
    # IPv6 pattern would absorb its trailing "6:" into the address.
    ipv6_tag_re = re.compile(r"IPv6:", re.IGNORECASE)

    def reportable(candidates):
        """Return the first candidate if it is a valid, wanted IP, else None."""
        if not candidates:
            return None
        first = candidates[0]
        try:
            parsed = ipaddress.ip_address(first)
        except ValueError:
            # The patterns are loose (e.g. "999.1.1.1"); skip false positives.
            return None
        if exclude_private_ip and parsed.is_private:
            return None
        return first

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg:
        return

    # Identify each url or attachment reported in the eMail body
    urls = []
    attachment_names = []
    extractor = URLExtract()
    for part in msg.walk():
        if part.get_content_type() in ("text/plain", "text/html"):
            urls.extend(extractor.find_urls(part.get_content()))
        elif part.get_filename():
            attachment_names.append(part.get_filename())

    # Remove duplicates while keeping first-seen order, so that the output
    # is the same on every run.
    urls = list(dict.fromkeys(urls))

    # Identify each domain reported in the eMail body
    domains = []
    for url in urls:
        registered = tldcache(url).registered_domain
        if registered and registered not in domains:
            domains.append(registered)

    # A sender obfuscation technique puts a decoy address in the display
    # name; only the last address of the From header is the real one.  The
    # header may also hold no parsable address at all.
    sender_addresses = address_re.findall(msg["From"]) if msg["From"] else []
    mail_from = sender_addresses[-1] if sender_addresses else ""

    # Identify each relay, oldest hop first.
    relay_full = []
    relay_ip = []
    for line in reversed(msg.get_all("Received") or []):
        for hop in hop_re.findall(line):
            hop_from = hop[0]
            candidate_lists = (
                ipv4_re.findall(hop_from),
                ipv6_re.findall(ipv6_tag_re.sub("", hop_from)),
            )
            for candidates in candidate_lists:
                address = reportable(candidates)
                if address is not None:
                    relay_ip.append(address)
            if hop_from:
                relay_full.append(hop_from)

    def numbered(items):
        """Wrap a non-empty list as ``[{0: a, 1: b, ...}]``, else ``[]``."""
        return [dict(enumerate(items))] if items else []

    entry = {
        "Filename": os.path.basename(filename),
        "From": mail_from,
        "Sender": msg["Sender"] or "",
        "Subject": msg["Subject"] or "",
        "X-Originating-IP": msg["X-Originating-IP"] or "",
        "attachments": numbered(attachment_names),
        "relay_full": numbered(relay_full),
        "relay_ip": numbered(relay_ip),
        "urls": numbered(urls),
        # Domains are reported whenever URLs were found, even if none of
        # them yielded a registered domain.
        "domains": [dict(enumerate(domains))] if urls else [],
    }

    print(json.dumps({"data": [entry]}, indent=4))
def email_analysis(filename, exclude_private_ip, check_spf):
    """Extract indicators from an e-mail file and print them as JSON.

    The message is parsed with ``policy.default``.  Header addresses, relay
    hops from the ``Received`` headers, URLs and their registered domains,
    attachment hashes and (optionally) an SPF verdict are stored in a flat
    dict that is printed with ``json.dumps(..., indent=4)``.  Fields that
    could not be determined stay ``None``.  Nothing is printed when the
    parsed message is falsy.

    Args:
        filename: Path of the e-mail file; it is opened in binary mode.
        exclude_private_ip: When true, ``relay_ip`` lists only the relay
            addresses that ``ipaddress`` does not report as private.
        check_spf: When true, ``spf`` is set to whether ``spf.check2``
            answered ``"pass"`` for any non-private relay address.

    Returns:
        None.
    """
    address_re = re.compile(
        r"[A-Za-z0-9.!#$%&'*+\/=?^_`{|}~\-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,6}",
        re.IGNORECASE)
    hop_re = re.compile(
        r"from\s+(.*?)\s+by(.*?)(?:(?:with|via)(.*?)(?:id|$)|id|$)",
        re.DOTALL | re.X)
    ipv4_re = re.compile(r"[0-9]+(?:\.[0-9]+){3}", re.DOTALL | re.X)
    # https://gist.github.com/dfee/6ed3a4b05cfe7a6faf40a2102408d5d8
    ipv6_re = re.compile(
        r"(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}:[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:::(?:ffff(?::0{1,4}){0,1}:){0,1}[^\s:](?:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9]).){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))|(?:fe80:(?::(?:(?:[0-9a-fA-F]){1,4})){0,4}%[0-9a-zA-Z]{1,})|(?::(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,7}|:))|(?:(?:(?:[0-9a-fA-F]){1,4}):(?:(?::(?:(?:[0-9a-fA-F]){1,4})){1,6}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,2}(?::(?:(?:[0-9a-fA-F]){1,4})){1,5})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,3}(?::(?:(?:[0-9a-fA-F]){1,4})){1,4})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,4}(?::(?:(?:[0-9a-fA-F]){1,4})){1,3})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,5}(?::(?:(?:[0-9a-fA-F]){1,4})){1,2})|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,6}:(?:(?:[0-9a-fA-F]){1,4}))|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){1,7}:)|(?:(?:(?:(?:[0-9a-fA-F]){1,4}):){7,7}(?:(?:[0-9a-fA-F]){1,4}))",
        re.DOTALL | re.X)
    # Address literals are written "[IPv6:...]"; without removing the tag the
    # IPv6 pattern would absorb its trailing "6:" into the address.
    ipv6_tag_re = re.compile(r"IPv6:", re.IGNORECASE)
    # https://gist.github.com/dperini/729294
    url_re = re.compile(
        r"(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?",
        re.UNICODE | re.IGNORECASE | re.MULTILINE)

    def parse_ip(text):
        """Return the parsed address, or None when *text* is not a valid IP."""
        try:
            return ipaddress.ip_address(text)
        except ValueError:
            # The patterns are loose (e.g. "999.1.1.1"); skip false positives.
            return None

    def numbered_unique(items):
        """Drop duplicates (first-seen order) and number what is left."""
        return dict(enumerate(dict.fromkeys(items)))

    resultmeioc = {
        "filename": os.path.basename(filename),
        "from": None,
        "sender": None,
        "x-sender": None,
        "to": None,
        "cc": None,
        "bcc": None,
        "envelope-to": None,
        "delivered-to": None,
        "subject": None,
        "x-originating-ip": None,
        "relay_full": None,
        "relay_ip": None,
        "spf": None,
        "urls": None,
        "domains": None,
        "attachments": None
    }

    with open(filename, "rb") as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    if not msg:
        return

    #
    # Header analysis
    #

    # A sender obfuscation technique puts a decoy address in the display
    # name; only the last address found in the header is the real one.
    for header, key in (("From", "from"),
                        ("Sender", "sender"),
                        ("X-Sender", "x-sender")):
        value = msg[header]
        if value:
            found = address_re.findall(value)
            if found:
                resultmeioc[key] = found[-1]

    # Recipient headers: every distinct address, as a numbered dictionary.
    for header, key in (("To", "to"),
                        ("Cc", "cc"),
                        ("Envelope-to", "envelope-to")):
        value = msg[header]
        if value:
            found = address_re.findall(value)
            if found:
                resultmeioc[key] = numbered_unique(found)

    # Headers that are reported verbatim.
    for header, key in (("Bcc", "bcc"),
                        ("Delivered-To", "delivered-to"),
                        ("Subject", "subject")):
        value = msg[header]
        if value:
            resultmeioc[key] = value

    origin_ip = msg["X-Originating-IP"]
    if origin_ip:
        # Usually the IP is in square brackets; remove them if present.
        resultmeioc["x-originating-ip"] = origin_ip.replace("[", "").replace("]", "")

    # Identify each relay, oldest hop first.
    hop_list = []
    hop_ips = []
    hop_ips_public = []
    for line in reversed(msg.get_all("Received") or []):
        for hop in hop_re.findall(line):
            hop_from = hop[0]
            candidates = ipv4_re.findall(hop_from)
            # "6::" is a known false positive of the IPv6 pattern.
            candidates += [
                match
                for match in ipv6_re.findall(ipv6_tag_re.sub("", hop_from))
                if match != "6::"
            ]
            for candidate in candidates:
                parsed = parse_ip(candidate)
                if parsed is None:
                    continue
                hop_ips.append(candidate)
                if not parsed.is_private:
                    hop_ips_public.append(candidate)
            if hop_from:
                hop_list.append(hop_from)

    if hop_list:
        resultmeioc["relay_full"] = dict(enumerate(hop_list))

    if hop_ips:
        selected = hop_ips_public if exclude_private_ip else hop_ips
        resultmeioc["relay_ip"] = dict(enumerate(selected))

    #
    # Body analysis
    #
    urls = []
    attachments = []
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == "text/plain":
            urls.extend(url_re.findall(part.get_content()))
        elif content_type == "text/html":
            # Best effort: a body with incorrect or unencoded HTML must not
            # abort the whole analysis.
            try:
                soup = BeautifulSoup(part.get_content(), "html.parser")
                hrefs = [tag.get("href") for tag in soup.find_all("a", href=True)]
            except Exception:
                hrefs = []
            urls.extend(hrefs)

        attachment_name = part.get_filename()
        if attachment_name:
            payload = part.get_payload(decode=True)
            if payload:
                attachments.append({
                    "filename": attachment_name,
                    "MD5": hashlib.md5(payload).hexdigest(),
                    "SHA1": hashlib.sha1(payload).hexdigest(),
                    "SHA256": hashlib.sha256(payload).hexdigest()
                })

    # Remove duplicates while keeping first-seen order, so that the output
    # is the same on every run.
    urls = list(dict.fromkeys(urls))

    # Identify each domain reported in the eMail body
    domains = []
    for url in urls:
        registered = tldcache(url).registered_domain
        if registered and registered not in domains:
            domains.append(registered)

    if urls:
        resultmeioc["urls"] = dict(enumerate(urls))
        resultmeioc["domains"] = dict(enumerate(domains))

    if attachments:
        resultmeioc["attachments"] = attachments

    #
    # Verify the SPF record if requested
    #
    if check_spf:
        spf_passed = False
        sender = resultmeioc["from"]
        if sender:
            sender_domain = sender.split("@")[1]
            for ip in hop_ips_public:
                try:
                    verdict = spf.check2(ip, sender, sender_domain)[0]
                except Exception:
                    # Best effort: a failed lookup for one relay must not
                    # prevent the remaining relays from being checked.
                    continue
                if verdict == "pass":
                    spf_passed = True
                    break
        resultmeioc["spf"] = spf_passed

    print(json.dumps(resultmeioc, indent=4))
def get_eml_subject(eml_file):
    """Return the first ``Subject`` header of an ``.eml`` file.

    Args:
        eml_file: Path of the message file; it is opened in binary mode and
            parsed with ``policy.default``.

    Returns:
        The value of the first ``Subject`` header, or ``None`` when the
        message has no ``Subject`` header.
    """
    with open(eml_file, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)
    # get_all() returns None for a missing header, so guard before indexing.
    subjects = msg.get_all('Subject') or []
    return subjects[0] if subjects else None