def main():
    """Command-line entry point: print every URL found in a file.

    The file named by the required ``-f`` / ``--file-path`` option is read
    as raw bytes, handed to ``RegexHelpers.find_urls()``, and each result is
    printed on its own line.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-f', '--file-path',
        dest='file_path',
        action='store',
        default=None,
        required=True,
        help="Path of file to use for URL extraction.")
    options = arg_parser.parse_args()

    # Binary mode: the content is passed on undecoded.
    with open(options.file_path, "rb") as handle:
        raw_bytes = handle.read()
        for found_url in RegexHelpers.find_urls(raw_bytes):
            print(found_url)
def _parse_attachment(self, message_part, charset):
    """Describe a message part as an attachment, if it is one.

    A part counts as an attachment when the text "attachment" appears in
    any of its header names or header values.

    Args:
        message_part: The e-mail message part to inspect.
        charset: Character set used to decode a "text/html" attachment.
            May be None or an unknown/incorrect charset name, in which case
            UTF-8 (with replacement of undecodable bytes) is used.

    Returns:
        None when the part is not an attachment. Otherwise a dict with the
        keys "data", "content_type", "size", "md5", "sha256", "name",
        "create_date", "mod_date" and "read_date", plus "strings_urls" for
        base64-encoded attachments.
    """
    part_items = message_part.items()

    # Guard clause: look for "attachment" in every header name and value.
    is_attachment = any(
        "attachment" in field for header in part_items for field in header)
    if not is_attachment:
        return None

    file_data = message_part.get_payload()
    attachment_dict = {}

    if message_part.get("Content-Transfer-Encoding", None) == "base64":
        file_data_b64 = file_data.replace("\n", "")

        # For some reason, sometimes the attachments don't have the proper
        # padding. Add a couple "==" on the end for good measure. This
        # doesn't seem to harm correctly encoded attachments.
        file_data_decoded = base64.b64decode(file_data_b64 + "==")

        # Try and get strings out of the attachment.
        strings_list = RegexHelpers.find_strings(file_data_decoded)
        strings = " ".join(strings_list)

        # Look for any URLs that were in the strings.
        attachment_dict["strings_urls"] = RegexHelpers.find_urls(strings)
    elif message_part.get_content_type() == "text/html":
        html_bytes = message_part.get_payload(decode=True)
        try:
            # charset may be None when the part does not declare one.
            html_text = html_bytes.decode(charset or "utf-8")
        except (LookupError, UnicodeDecodeError):
            # Unknown charset name, or bytes that do not match the declared
            # charset: keep going rather than abort parsing the e-mail.
            html_text = html_bytes.decode("utf-8", errors="replace")
        file_data_decoded = html_text.encode('utf-8')
    else:
        file_data_decoded = file_data

    # hashlib only accepts bytes-like data. In the fallback branch above the
    # payload is used as returned by get_payload() (not bytes for a textual
    # part), in which case the hash is recorded as an empty string.
    for hash_key, hasher_factory in (("md5", hashlib.md5),
                                     ("sha256", hashlib.sha256)):
        try:
            attachment_dict[hash_key] = hasher_factory(
                file_data_decoded).hexdigest()
        except TypeError:
            attachment_dict[hash_key] = ""

    attachment_dict["data"] = file_data_decoded
    attachment_dict["content_type"] = message_part.get_content_type()
    attachment_dict["size"] = len(file_data_decoded)
    attachment_dict["name"] = ""
    attachment_dict["create_date"] = None
    attachment_dict["mod_date"] = None
    attachment_dict["read_date"] = None

    # Find the attachment name. Normally this follows a specific format
    # and is called 'filename=' but some messages use just 'name=' and put
    # it in a different header, so search every header name and value for
    # either spelling. When several match, the last one wins.
    attachment_name_pattern = re.compile(r'(file)?name="(.*?)"')
    for header in part_items:
        for field in header:
            name_match = attachment_name_pattern.search(field)
            if name_match:
                attachment_dict["name"] = RegexHelpers.decode_utf_b64_string(
                    name_match.group(2))

    return attachment_dict
def __init__(self, smtp_path=None, smtp_text=None, attached_email=True, check_whitelist=True):
    """Parse an SMTP stream / raw e-mail and collect Indicators from it.

    Args:
        smtp_path: Path to an "smtp.stream" file, or in theory an
            "smtp.email" type file with the SMTP commands removed. Used in
            preference to smtp_text when the path exists.
        smtp_text: The same content as a string. Used when no smtp_path is
            given, or when the given path does not exist.
        attached_email: When true, a "message/rfc822" part found inside the
            e-mail replaces the outer e-mail as the message that is parsed.
        check_whitelist: When true, the collected indicators are run
            through Indicator.run_whitelist().

    Raises:
        ValueError: Neither smtp_path nor smtp_text was given.
        FileNotFoundError: smtp_path does not exist and no smtp_text was
            given to fall back on.
    """

    def add_indicator(value, indicator_type, tags, relate_to_sender=True,
                      benign=False, extra_tag=None, related=()):
        # Create one Indicator, decorate it and store it in self.iocs.
        # Call order: tags, benign flag, From-address relationship, extra
        # tag, then one relationship per truthy entry of `related`.
        # A ValueError from any of these calls means the indicator is
        # skipped on purpose (best effort), so it is swallowed here.
        try:
            ind = Indicator.Indicator(value, indicator_type)
            ind.add_tags(tags)
            if benign:
                ind.make_benign()
            if relate_to_sender and self.from_address:
                ind.add_relationships(self.from_address)
            if extra_tag:
                ind.add_tags(extra_tag)
            for other in related:
                if other:
                    ind.add_relationships(other)
            self.iocs.append(ind)
        except ValueError:
            pass

    def first_header(name):
        # First value of the named header, or None when it is absent.
        try:
            return self.get_header(name)[0]
        except IndexError:
            return None

    def first_received_index(lines):
        # Index of the first line starting with "Received:", or None.
        return next(
            (index for index, line in enumerate(lines)
             if line.startswith("Received:")),
            None)

    # Check that we got at least an SMTP path or text:
    if not smtp_path and not smtp_text:
        raise ValueError(
            "You must specify either an SMTP path or the SMTP text.")

    # In case we received both, default to use the smtp_path over the
    # smtp_text. The text is also the fallback when the path is missing.
    if smtp_path and os.path.exists(smtp_path):
        # Read the SMTP file. This works with the "smtp.stream" file or in
        # theory an "smtp.email" type file with the SMTP commands removed.
        self.path = smtp_path
        self.name = os.path.basename(smtp_path)
        with open(self.path) as s:
            smtp_stream = s.read().splitlines()
    elif smtp_text:
        smtp_stream = smtp_text.splitlines()
        # path and name are here just for completeness for anything
        # external that might rely on them.
        self.path = ""
        self.name = hashlib.md5(smtp_text.encode('utf-8')).hexdigest()
    else:
        raise FileNotFoundError(
            "The SMTP path does not exist: {}".format(smtp_path))

    # A place to store the IOCs.
    self.iocs = []

    # Where did this alert come from? This could be anything, such as
    # a URL to an ACE alert or whatever other reference you want.
    self.reference = ""

    # Find the envelope from/to addresses. This will only work if given an
    # "smtp.stream" file, since otherwise the SMTP commands will not exist.
    self.envelope_from = ""
    self.envelope_to = ""
    envelope_address_pattern = re.compile(r'.*<(.*)>.*')
    for line in smtp_stream:
        if line.startswith("MAIL FROM:"):
            match = envelope_address_pattern.match(line)
            self.envelope_from = match.group(1) if match else ""
            if match:
                add_indicator(self.envelope_from, "Email - Address",
                              ["phish", "envelope_from_address"],
                              relate_to_sender=False)
        elif line.startswith("RCPT TO:"):
            match = envelope_address_pattern.match(line)
            self.envelope_to = match.group(1) if match else ""
            if match:
                # The indicator is for the recipient address itself.
                add_indicator(self.envelope_to, "Email - Address",
                              ["phish", "envelope_to_address"],
                              relate_to_sender=False)

    # Just in case we are dealing with an "smtp.stream" file that still has
    # the SMTP commands above the actual e-mail, we need to strip those out.
    # This removes all lines prior to the first Received: header so that
    # email.parser can properly parse the e-mail. If there is no Received:
    # line at all, the text is handed to the parser untouched.
    start = first_received_index(smtp_stream)
    if start is not None:
        smtp_stream = smtp_stream[start:]

    # Join the lines into a single string and create the e-mail object.
    email_text = "\n".join(smtp_stream)
    self._email_obj = email.message_from_string(email_text)

    # Exchange journaling sends us the e-mail embedded as an attachment
    # within another e-mail. If requested, parse that embedded e-mail
    # instead of the outer one.
    if attached_email:
        # walk() keeps iterating the outer e-mail even after _email_obj is
        # replaced, so the last message/rfc822 part found wins.
        for part in self._email_obj.walk():
            if part.get_content_type() != "message/rfc822":
                continue
            part_lines = str(part).splitlines()
            # Make sure the part starts with the Received: headers. A part
            # without any Received: line is skipped.
            part_start = first_received_index(part_lines)
            if part_start is None:
                continue
            part_text = "\n".join(part_lines[part_start:])
            self._email_obj = email.message_from_string(part_text)

    # Parse the e-mail object for its content.
    parsed_email = self._parse_content()

    # Now that we have the e-mail object, parse out the interesting parts.
    self.urls = set()
    self.headers = self._get_all_headers_string()

    # Make Indicators for the received headers (SMTP relays). We consider
    # SMTP relay indicators benign... Don't want to alert every time we see
    # the relay sending an e-mail, but it's nice to know for correlation.
    self.received = self.get_header("received")
    for hop in self.received:
        for ip in RegexHelpers.find_ip_addresses(hop):
            add_indicator(ip, "Address - ipv4-addr",
                          ["phish", "smtp_relay"],
                          relate_to_sender=False, benign=True)
        for domain in RegexHelpers.find_domains(hop):
            if isinstance(domain, tuple):
                add_indicator(domain[0], "URI - Domain Name",
                              ["phish", "smtp_relay"],
                              relate_to_sender=False, benign=True)

    # Get the e-mail's plaintext body, HTML body, and the visible text
    # from the HTML.
    self.body = parsed_email["body"]
    self.html = parsed_email["html"]
    if self.html:
        soup = BeautifulSoup(self.html, "html.parser")
        self.visible_html = "".join(soup.findAll(text=True))
    else:
        self.visible_html = ""

    # Get any e-mail attachments.
    self.attachments = parsed_email["attachments"]
    self.attachments_string = ", ".join(
        [attach["name"] for attach in self.attachments])
    self.md5_string = ", ".join(
        [attach["md5"] for attach in self.attachments])

    # Make an Indicator for the from address. Every indicator created
    # after this point is related to it (when it is non-empty).
    try:
        self.from_address = self._get_address_list("from")[0][1]
    except IndexError:
        self.from_address = ""
    else:
        add_indicator(self.from_address, "Email - Address",
                      ["phish", "from_address"], relate_to_sender=False)

    # Make an Indicator for the reply-to address.
    try:
        self.replyto = self._get_address_list("reply-to")[0][1]
    except IndexError:
        self.replyto = ""
    else:
        add_indicator(self.replyto, "Email - Address",
                      ["phish", "replyto_address"])

    # Make Indicators for the subject, both as received and decoded.
    # Folded (multi-line) subjects are joined into a single line.
    raw_subject = first_header("subject")
    if raw_subject is None:
        self.subject = ""
        self.decoded_subject = ""
    else:
        self.subject = "".join(raw_subject.splitlines())
        add_indicator(self.subject, "Email - Subject",
                      ["phish", "subject"])

        self.decoded_subject = "".join(
            str(make_header(decode_header(raw_subject))).splitlines())
        add_indicator(self.decoded_subject, "Email - Subject",
                      ["phish", "decoded_subject"])

    # Make an Indicator for each to address.
    self.to_list = [x[1] for x in self._get_address_list("to")]
    self.to_string = ", ".join(self.to_list).replace("\t", " ")
    for address in self.to_list:
        add_indicator(address, "Email - Address", ["phish", "to_address"])

    # Make an Indicator for each CC address.
    self.cc_list = [x[1] for x in self._get_address_list("cc")]
    self.cc_string = ", ".join(self.cc_list).replace("\t", " ")
    for address in self.cc_list:
        add_indicator(address, "Email - Address", ["phish", "cc_address"])

    # Make an Indicator for each BCC address.
    self.bcc_list = [x[1] for x in self._get_address_list("bcc")]
    self.bcc_string = ", ".join(self.bcc_list).replace("\t", " ")
    for address in self.bcc_list:
        add_indicator(address, "Email - Address", ["phish", "bcc_address"])

    # Make an Indicator for the message ID.
    message_id = first_header("message-id")
    self.message_id = "" if message_id is None else message_id
    if message_id is not None:
        add_indicator(self.message_id, "Email Message ID",
                      ["phish", "message_id"])

    # Make an Indicator for the x-mailer.
    x_mailer = first_header("x-mailer")
    self.x_mailer = "" if x_mailer is None else x_mailer
    if x_mailer is not None:
        add_indicator(self.x_mailer, "Email - Xmailer",
                      ["phish", "x-mailer"])

    # Make an Indicator for the x-original-sender.
    x_original_sender = first_header("x-original-sender")
    self.x_original_sender = (
        "" if x_original_sender is None else x_original_sender)
    if x_original_sender is not None:
        add_indicator(self.x_original_sender, "Email - Address",
                      ["phish", "x-original-sender"])

    # Make an Indicator for the x-originating-ip. Sometimes this field is
    # in the form: [1.1.1.1] so only the IP address itself is kept. The
    # attribute stays "" when the header is missing or holds no IP.
    self.x_originating_ip = ""
    x_originating_ip = first_header("x-originating-ip")
    if x_originating_ip is not None:
        found_ips = RegexHelpers.find_ip_addresses(x_originating_ip)
        if found_ips:
            self.x_originating_ip = found_ips[0]
            add_indicator(self.x_originating_ip, "Address - ipv4-addr",
                          ["phish", "x-originating-ip"])

    # Make an Indicator for the x-sender-ip, handled like the
    # X-Originating-IP so that only the IP address is kept.
    self.x_sender_ip = ""
    x_sender_ip = first_header("x-sender-ip")
    if x_sender_ip is not None:
        found_ips = RegexHelpers.find_ip_addresses(x_sender_ip)
        if found_ips:
            self.x_sender_ip = found_ips[0]
            add_indicator(self.x_sender_ip, "Address - ipv4-addr",
                          ["phish", "x-sender-ip"])

    # Make Indicators for any URLs in the plaintext body, the HTML body,
    # and the visible text of the HTML body.
    # Indicator.generate_url_indicators() catches its own exceptions.
    text_urls = RegexHelpers.find_urls(self.body)
    html_urls = RegexHelpers.find_urls(self.html)
    visible_html_urls = RegexHelpers.find_urls(self.visible_html)
    for found_urls, body_tag in ((text_urls, "plaintext_body"),
                                 (html_urls, "html_body"),
                                 (visible_html_urls, "visible_html_body")):
        for ind in Indicator.generate_url_indicators(found_urls):
            ind.add_tags(["phish", body_tag])
            if self.from_address:
                ind.add_relationships(self.from_address)
            self.iocs.append(ind)

    # Make Indicators for different attachment attributes.
    all_urls = text_urls + html_urls + visible_html_urls
    for attachment in self.attachments:
        attachment_name = attachment["name"]
        attachment_md5 = attachment["md5"]
        attachment_sha256 = attachment["sha256"]

        # Make Indicators for any strings URLs.
        if "strings_urls" in attachment:
            strings_urls = attachment["strings_urls"]
            for ind in Indicator.generate_url_indicators(strings_urls):
                ind.add_tags(["phish", "strings_url", attachment_name])
                if self.from_address:
                    ind.add_relationships(
                        [self.from_address, attachment_name])
                self.iocs.append(ind)
            all_urls += strings_urls

        # Make an Indicator for the filename, related to both hashes.
        if attachment_name:
            add_indicator(attachment_name, "Windows - FileName",
                          ["phish", "attachment"],
                          related=(attachment_md5, attachment_sha256))

        # Make an Indicator for the MD5 hash, tagged with the filename and
        # related to the filename and the other hash.
        if attachment_md5:
            add_indicator(attachment_md5, "Hash - MD5",
                          ["phish", "attachment"],
                          extra_tag=attachment_name,
                          related=(attachment_name, attachment_sha256))

        # Make an Indicator for the SHA256 hash, same idea as the MD5.
        if attachment_sha256:
            add_indicator(attachment_sha256, "Hash - SHA256",
                          ["phish", "attachment"],
                          extra_tag=attachment_name,
                          related=(attachment_name, attachment_md5))

    # Parse the URLs and prevent "duplicate" URLs
    # like http://blah.com/ and http://blah.com
    for url in all_urls:
        # Strip off the ending slash if it's there.
        if url.endswith("/"):
            url = url[:-1]
        self.urls.add(url)

    self.received_time = self._get_received_time()

    # Run the IOCs through the whitelists if requested.
    if check_whitelist:
        self.iocs = Indicator.run_whitelist(self.iocs)

    # Finally merge the IOCs so we don't have any duplicates.
    self.iocs = Indicator.merge_duplicate_indicators(self.iocs)