def parse_process_tree_urls(self):
    self.logger.debug('Looking for URLs in process tree')
    urls = []
    for process in self.processes:
        # Append to the running list so URLs from every process are kept
        # instead of being overwritten on each loop iteration.
        urls += find_urls(process['command'])
        urls += find_urls(process['decoded_command'])
    return urls
def test_find_urls_binary():
    with open(f"{files_dir}/hello.bin", "rb") as f:
        blob = f.read()

    expected_urls = {"http://domain.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_csv_lookalike():
    with open(f"{files_dir}/csv_lookalike.txt", "rb") as f:
        blob = f.read()

    expected_urls = {"https://example.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_html():
    with open(f"{files_dir}/test.html", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://domain.com",
        "http://domain.com/action",
        "http://domain.com/background",
        "http://domain.com/css",
        "http://domain.com/href",
        "http://domain.com/meta",
        "http://domain.com/src",
        "http://domain.com/xmlns",
        "http://domain3.com",
        "http://faß.de/re.php",
        "http://domain2.com/image-small.png",
        "http://domain2.com/image-medium.png",
        "http://domain2.com/image-large.png",
        "http://domain4.com/index.php#thing=http://domain5.com",
        "http://domain5.com",
        "https://domain6.com",
        "https://domain.com/?a=1234",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_looks_like_html():
    with open(f"{files_dir}/looks_like_html.xml", "rb") as f:
        blob = f.read()

    expected_urls = {"https://example.com"}

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_csv():
    with open(f"{files_dir}/test.csv", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://domain.com",
        "http://domain2.com",
        "http://domain3.com"
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_domain_as_url():
    with open(f"{files_dir}/domain_as_url.txt", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://somefakesite.com",
        "https://somefakesite.com/index.html",
        "https://somefakesite2.com"
    }

    assert urlfinderlib.find_urls(blob, domain_as_url=True) == expected_urls


def test_find_urls_pdf():
    with open(f"{files_dir}/test.pdfparser", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://en.wikipedia.org/wiki/MIT_License",
        "http://domain.com",
        "http://domain.com/(test/123"
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_ical():
    with open(f"{files_dir}/test.ical", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://thisisjustatest.com",
        "https://thisisjustatest2.com",
        "https://thisisjustatest3.com",
        "https://thisisjustatest4.com",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_base_url_malformed():
    with open(f"{files_dir}/base_url_malformed.html", "rb") as f:
        blob = f.read()

    expected_urls = {
        "https://t.co",
        "https://t.co/asdf1234?amp=1",
        "https://t.co/images/stickman.gif",
        "https://www.w3schools.com/images/stickman.gif",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_xml():
    with open(f"{files_dir}/sharedStrings.xml", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://schemas.openxmlformats.org/spreadsheetml/2006/main",
        "https://domain.com/test",
        "http://domain2.com",
        "http://www.w3.org/XML/1998/namespace",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls


def test_find_urls_text_xml():
    with open(f"{files_dir}/text.xml", "rb") as f:
        blob = f.read()

    expected_urls = {
        "http://schemas.microsoft.com/office/drawing/2010/main",
        "http://schemas.openxmlformats.org/drawingml/2006/chart",
        "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
        "http://schemas.openxmlformats.org/drawingml/2006/main",
        "http://schemas.openxmlformats.org/markup-compatibility/2006",
    }

    assert urlfinderlib.find_urls(blob) == expected_urls
def setup(self, alert_uuids=[], manual_indicators=[], force=False):
    """ Parse everything in the event directory to build the event.json """

    if alert_uuids:
        # Make sure the event directory exists.
        if not os.path.exists(self.path):
            #os.makedirs(self.path, mode=0o770)
            os.makedirs(self.path)
            self.logger.debug('Created event directory: {}'.format(self.path))

        # Make sure the collect directory exists.
        collect_path = os.path.join(self.path, 'collect')
        if not os.path.exists(collect_path):
            #os.makedirs(collect_path, mode=0o770)
            os.makedirs(collect_path)
            self.logger.debug('Created collect directory: {}'.format(collect_path))

        # Figure out which alerts are new to the event.
        existing_alert_paths = [f['path'] for f in self.json['files'] if f['category'] == 'ace_alert']
        new_alert_uuids = []
        for alert_uuid in alert_uuids:
            if not any(alert_uuid in existing_path for existing_path in existing_alert_paths):
                new_alert_uuids.append(alert_uuid)

        # Download any new alerts into the event directory.
        # NOTE: THIS IS A TEMPORARY HACK UNTIL ACE CAN HANDLE THE API REDIRECTS!
        for alert_uuid in new_alert_uuids:
            alert_path = os.path.join(self.path, alert_uuid)
            if not os.path.exists(alert_path):
                for ace_server in config['ace']['extra_ace_servers']:
                    try:
                        ace_api.set_default_remote_host(ace_server)
                        ace_api.download(alert_uuid, alert_path)

                        # Reset the API server to the default one.
                        ace_api.set_default_remote_host(config['ace']['ace_api_server'])
                        break
                    except Exception as e:
                        if 'BAD REQUEST for url' in str(e):
                            try:
                                shutil.rmtree(alert_path)
                            except:
                                self.logger.exception('There was an error trying to delete alert from event: {}'.format(alert_path))
                        else:
                            self.logger.exception('Unable to download ACE alert!')

                self.changed = True

        # Figure out if any alerts were removed from the event.
        for a in existing_alert_paths:
            uuid = a.split('/')[-2]
            if not any(uuid in alert_uuid for alert_uuid in alert_uuids):
                self.logger.warning('Alert has been removed from the ACE event: {}'.format(uuid))
                try:
                    shutil.rmtree(os.path.dirname(a))
                except:
                    self.logger.exception('There was an error trying to delete alert from event: {}'.format(os.path.dirname(a)))

    # Identify the files in the event directory.
    self.logger.debug('Starting to walk event directory.')
    files = self.walk_event_directory()
    self.logger.debug('Finished walking event directory.')

    # Cross reference the critical files with what is stored in the JSON
    # to see if there have been any changes. Changes to critical files are
    # what require the event to be reprocessed.
    if force or self.has_critical_file_changed(files):
        self.logger.info('A critical file has changed or we are forcing an update. Reprocessing the event.')

        # Get rid of the old event campaign and tags.
        self.json['campaign'] = {}
        self.json['tags'] = []

        # Flag the event as changed.
        self.changed = True

        # Save the current version of the files to the JSON.
        self.json['files'] = files

        # Fill in the missing non-critical file MD5 hashes. We don't hash every
        # file as we originally walk through the event directory to save time.
        # We only need to hash the critical files when walking the event directory
        # so that we can make sure that we do not have any duplicates and so we
        # can also tell if any of them have changed. This makes things much faster
        # if nothing has changed with the event.
        for f in self.json['files']:
            if not f['critical']:
                f['md5'] = self.calculate_md5(f['path'])

        # Get the latest whitelist from SIP.
        whitelist = EventWhitelist(sip=self.sip)

        # Parse the ACE alerts.
        ace_alerts = self.parse_ace_alerts()
        self.json['ace_alerts'] = [ace_alert.json for ace_alert in ace_alerts]

        # Gather up the unique screenshots found in the ACE alerts. Right now these are just from Cloudphish.
        screenshot_dict = {}
        for ace_alert in ace_alerts:
            for screenshot in ace_alert.screenshots:
                # Skip this screenshot if it is an HTML e-mail body. The HTML e-mail screenshots
                # are handled within the EmailParser class.
                if 'text_html' in screenshot:
                    continue

                # Find the MD5 hash of this screenshot.
                for f in self.json['files']:
                    if f['path'] == screenshot:
                        screenshot_dict[f['md5']] = f['path']

        # Symlink to the screenshots so we can ensure they have unique file names.
        unique_screenshots = []
        for md5 in screenshot_dict:
            new_name = 'ace_screenshot_{}.png'.format(md5)
            new_path = os.path.join(self.path, new_name)
            unique_screenshots.append(new_path)
            try:
                os.symlink(screenshot_dict[md5], new_path)
                self.logger.debug('Symlinked to ACE screenshot: {}'.format(screenshot_dict[md5]))
            except FileExistsError:
                pass

        # Save the screenshots to the event JSON.
        self.json['ace_screenshots'] = sorted(unique_screenshots)

        # Parse the emails and make their indicators.
        emails = self.parse_emails(whitelist)
        self.json['emails'] = [email.json for email in emails if email.received_time and email.message_id]

        # Symlink to the screenshots in the emails so we can ensure they have unique file names.
        for email in self.json['emails']:
            # Locate the MD5 of this email.
            for f in self.json['files']:
                if f['path'] == email['path']:
                    # Symlink (rename) each screenshot in this email.
                    new_screenshot_paths = []
                    for screenshot in email['screenshots']:
                        new_name = 'email_screenshot_{}_{}'.format(f['md5'], os.path.basename(screenshot))
                        new_path = os.path.join(os.path.dirname(email['path']), new_name)
                        new_screenshot_paths.append(new_path)
                        try:
                            os.symlink(screenshot, new_path)
                            self.logger.debug('Symlinked to email screenshot: {}'.format(screenshot))
                        except FileExistsError:
                            pass

                    # Replace the screenshot paths in the JSON with the new paths.
                    email['screenshots'] = sorted(new_screenshot_paths)

        # Parse the sandbox reports and make their indicators.
        sandbox_reports = self.parse_sandbox_reports(whitelist)
        self.json['sandbox'] = [sandbox_report.json for sandbox_report in sandbox_reports]

        # The EmailParser objects do not know where the attachments are located within the event directory.
        # This is a handy piece of information for various processes later.
        for email in self.json['emails']:
            for attachment in email['attachments']:
                attachment['event_path'] = ''

                # Try to locate the file with the same MD5.
                for f in self.json['files']:
                    if f['md5'] == attachment['md5']:
                        # Inject the event directory path into the e-mail JSON.
                        attachment['event_path'] = f['path']

        # The sandbox reports do not know where the sample is located within the event directory. That
        # is a handy piece of information for various processes later.
        # Also fix the filename in the sandbox reports. VxStream likes to name it after the SHA256 hash
        # and does not appear to include the actual filename anywhere in its JSON report.
        for report in self.json['sandbox']:
            report['event_path'] = ''

            # Try to locate the file with the same MD5.
            for f in self.json['files']:
                if f['md5'] == report['md5']:
                    # Overwrite the filename in the sandbox report.
                    report['filename'] = os.path.basename(f['path'])

                    # Inject the event directory path into the sandbox report.
                    report['event_path'] = f['path']

                # Loop over any dropped files in this sandbox report to inject the event path.
                for dropped_file in report['dropped_files']:
                    if f['md5'] == dropped_file['md5']:
                        # Inject the event directory path into the dropped file JSON.
                        dropped_file['event_path'] = f['path']

        # Gather up the indicators.
        all_indicators = []
        self.json['indicators'] = []

        # Loop over all of the HTML files in the event and pull out the URLs.
        self.logger.debug('Gathering URLs from HTML files in the event.')
        for html_file in [f['path'] for f in self.json['files'] if f['category'] == 'html']:
            # Store the unique URLs we find.
            unique_urls = set()

            # Open and read the contents of the file.
            with open(html_file, 'rb') as f:
                urls = find_urls(f.read())

                # Add the unique URLs to the list.
                for url in urls:
                    unique_urls.add(url)

            # Create indicators for the URLs.
            indicators = make_url_indicators(unique_urls)
            for indicator in indicators:
                indicator.path = html_file
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the ACE alerts.
        self.logger.debug('Checking ACE alert indicators against whitelist.')
        for ace_alert in ace_alerts:
            for indicator in ace_alert.indicators:
                indicator.path = ace_alert.path
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the emails.
        self.logger.debug('Checking email indicators against whitelist.')
        for email in emails:
            for indicator in email.indicators:
                indicator.path = email.path
                indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                all_indicators.append(indicator)

        # Gather up the indicators from the sandbox reports.
        self.logger.debug('Checking sandbox indicators against whitelist.')
        for sandbox_report in sandbox_reports:
            # Try to find the path to the actual sandboxed sample instead of the JSON report.
            matching_samples = [f for f in self.json['files'] if f['md5'] == sandbox_report.md5]
            if matching_samples:
                for indicator in sandbox_report.indicators:
                    indicator.path = matching_samples[0]['path']
                    indicator.whitelisted = whitelist.is_indicator_whitelisted(indicator)
                    all_indicators.append(indicator)
            else:
                self.logger.warning('Could not find matching sample for indicators: "{}" "{}"'.format(sandbox_report.filename, sandbox_report.md5))

        # Merge all of the indicators.
        merged_indicators = merge_indicators(all_indicators)

        # Add the merged indicators to the event JSON.
        for merged_ind in merged_indicators:
            self.json['indicators'].append(merged_ind.json)

        # Gather up any manual indicators we were given (from the refresh wiki function).
        # These are not Indicator objects, so we do not add the .json form to the list.
        for indicator in manual_indicators:
            self.logger.debug('Adding manual indicator to JSON: {} - {}'.format(indicator['type'], indicator['value']))
            indicator['path'] = ''

            # We want to allow the Manual Indicators section to bypass the whitelist.
            indicator['whitelisted'] = False
            self.json['indicators'].append(indicator)

        """
        # Loop over any CSS URLs we found to try and find even more URLs.
        for css_url in [i['value'] for i in self.json['indicators'] if i['type'] == 'URI - URL' and '.css' in i['value']]:
            # Download the CSS content and find any URLs inside it.
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'}
                css = requests.get(css_url, headers=headers).content
                css_urls = find_urls(css, base_url=css_url)
            except:
                self.logger.exception('Unable to download the CSS: {}'.format(css_url))
        """

        # Add some items to the event tags.
        if emails:
            self.json['tags'].append('phish')

        self.json['tags'] = sorted(list(set(self.json['tags'])))

        # Create the event package.
        if config['package']['enabled']:
            self.package_event()
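# --- Hedged aside, not part of the original source ---------------------------
# setup() above fills in non-critical file hashes with self.calculate_md5(),
# which is not shown in this excerpt. A typical chunked implementation might
# look like the sketch below; the standalone name, signature, and chunk size
# are assumptions for illustration only.
import hashlib


def calculate_md5(path, chunk_size=65536):
    """Hash a file in chunks so large event files are not read into memory at once."""
    md5_hasher = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5_hasher.update(chunk)
    return md5_hasher.hexdigest()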
def dedup_reports(report_list, whitelist):
    """ Merge a list of BaseSandboxParser subclass objects to make a single generic report. """

    logger = logging.getLogger()
    logger.debug('Deduping sandbox report list')

    # Create the new generic report.
    dedup_report = BaseSandboxParser()

    for report in report_list:
        dedup_report.sandbox_urls += report.sandbox_urls

        if report.filename and not report.filename == 'sample':
            dedup_report.filename = report.filename

        if report.original_filename:
            dedup_report.original_filename = report.original_filename
            dedup_report.indicators.append(Indicator('Windows - FileName', dedup_report.original_filename, tags=['sandboxed_sample']))

        if report.md5:
            dedup_report.md5 = report.md5
            dedup_report.indicators.append(Indicator('Hash - MD5', dedup_report.md5, tags=['sandboxed_sample']))

        if report.sha1:
            dedup_report.sha1 = report.sha1
            dedup_report.indicators.append(Indicator('Hash - SHA1', dedup_report.sha1, tags=['sandboxed_sample']))

        if report.sha256:
            dedup_report.sha256 = report.sha256
            dedup_report.indicators.append(Indicator('Hash - SHA256', dedup_report.sha256, tags=['sandboxed_sample']))

        if report.sha512:
            dedup_report.sha512 = report.sha512

        if report.ssdeep:
            dedup_report.ssdeep = report.ssdeep
            dedup_report.indicators.append(Indicator('Hash - SSDEEP', dedup_report.ssdeep, tags=['sandboxed_sample']))

        dedup_report.malware_family += report.malware_family

        # Dedup the contacted hosts.
        for host in report.contacted_hosts:
            if host not in dedup_report.contacted_hosts:
                dedup_report.contacted_hosts.append(host)
                tags = ['contacted_host']
                if host['protocol'] and host['port']:
                    tags.append('{} {}'.format(host['protocol'], host['port']))
                elif host['protocol']:
                    tags.append(host['protocol'])

                # For now we consider ALL contacted hosts to be benign, so no need to check the whitelist. <- XXX UPDATE WHY ?!
                #dedup_report.indicators.append(Indicator('Address - ipv4-addr', host['ipv4'], status='Informational', tags=tags))
                dedup_report.indicators.append(Indicator('Address - ipv4-addr', host['ipv4'], tags=tags))

        # Suricata
        for suricata_alert in report.suricata_alerts:
            if suricata_alert not in dedup_report.suricata_alerts:
                dedup_report.suricata_alerts.append(suricata_alert)

        # Dedup modified files
        for file in report.modified_files:
            if file not in dedup_report.modified_files:
                dedup_report.modified_files.append(file)

        # Dedup the dropped files.
        for file in report.dropped_files:
            # Dropped files are harder than the other items to properly whitelist, so we will
            # initially restrict them to certain file names or file types that we care about.
            if any(name in file['filename'].lower() for name in dedup_report.good_dropped_file_names) or any(t in file['type'] for t in dedup_report.good_dropped_file_types):
                if file not in dedup_report.dropped_files:
                    dedup_report.dropped_files.append(file)

                    # If any part of the dropped file is whitelisted, make sure we mark all parts as whitelisted.
                    if whitelist.is_dropped_file_whitelisted(file):
                        status = 'Whitelisted'
                        file['status'] = 'Whitelisted'
                    else:
                        status = 'New'

                    dedup_report.indicators.append(Indicator('Windows - FileName', file['filename'], status=status, tags=['dropped_file']))
                    dedup_report.indicators.append(Indicator('Hash - MD5', file['md5'], status=status, tags=['dropped_file'], relationships=[file['sha1'], file['sha256']]))
                    dedup_report.indicators.append(Indicator('Hash - SHA1', file['sha1'], status=status, tags=['dropped_file'], relationships=[file['md5'], file['sha256']]))
                    dedup_report.indicators.append(Indicator('Hash - SHA256', file['sha256'], status=status, tags=['dropped_file'], relationships=[file['md5'], file['sha1']]))

        # Dedup the HTTP requests.
        for request in report.http_requests:
            if request not in dedup_report.http_requests:
                dedup_report.http_requests.append(request)
                dedup_report.indicators += make_url_indicators([request['url']], tags=['http_request', request['method']])

        # Dedup the DNS requests.
        for request in report.dns_requests:
            if request not in dedup_report.dns_requests:
                dedup_report.dns_requests.append(request)

                # If any part of the DNS request is whitelisted, make sure we mark all parts as whitelisted.
                if whitelist.is_dns_request_whitelisted(request):
                    status = 'Whitelisted'
                else:
                    status = 'New'

                # For now we consider ALL request IP addresses to be benign, so no need to check the whitelist.
                dedup_report.indicators.append(Indicator('URI - Domain Name', request['request'], tags=['dns_request']))
                try:
                    ipaddress.ip_address(request['answer'])
                    dedup_report.indicators.append(Indicator('Address - ipv4-addr', request['answer'], tags=['dns_response'], status='Informational', relationships=[request['request']]))
                except:
                    pass

        # Dedup the memory strings.
        dedup_report.memory_strings += report.memory_strings
        dedup_report.memory_strings = sorted(list(set(dedup_report.memory_strings)))

        # Dedup the memory URLs.
        dedup_report.memory_urls += report.memory_urls
        dedup_report.memory_urls = list(set(dedup_report.memory_urls))
        dedup_report.memory_urls = [u for u in dedup_report.memory_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.memory_urls, tags=['url_in_memory'])

        # Dedup the strings URLs.
        dedup_report.strings_urls += report.strings_urls
        dedup_report.strings_urls = list(set(dedup_report.strings_urls))
        dedup_report.strings_urls = [u for u in dedup_report.strings_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.strings_urls, tags=['url_in_strings'])

        # Dedup the mutexes.
        dedup_report.mutexes += report.mutexes
        dedup_report.mutexes = list(set(dedup_report.mutexes))

        # Dedup the resolved APIs.
        dedup_report.resolved_apis += report.resolved_apis
        dedup_report.resolved_apis = list(set(dedup_report.resolved_apis))

        # Dedup the created services.
        dedup_report.created_services += report.created_services
        dedup_report.created_services = list(set(dedup_report.created_services))

        # Dedup the started services.
        dedup_report.started_services += report.started_services
        dedup_report.started_services = list(set(dedup_report.started_services))

        # Add the process tree as-is.
        dedup_report.process_trees.append(report.process_tree)

        # Try to decode base64 chunks in the process tree.
        process_tree_decoded = report.process_tree
        for chunk in report.process_tree.split():
            try:
                decoded_chunk = base64.b64decode(chunk).decode('utf-8')
                if '\x00' in decoded_chunk:
                    decoded_chunk = base64.b64decode(chunk).decode('utf-16')
                process_tree_decoded = process_tree_decoded.replace(chunk, decoded_chunk)
            except:
                pass
        dedup_report.process_trees_decoded.append(process_tree_decoded)

        # Remove ` backtick and other basic Powershell obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'powershell' in decoded_process_tree.lower():
                new_trees.append(decoded_process_tree.replace('`', ''))
        dedup_report.process_trees_decoded += new_trees

        # Remove Powershell string formatter obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            formatter_pattern = re.compile(r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))', re.IGNORECASE)
            results = formatter_pattern.findall(decoded_process_tree)
            if results:
                for result in results:
                    """
                    ('("{0}{1}"-f\'JDxA\',\'QDc\')', '{0}{1}', '{1}', '1', "'JDxA','QDc'", "'QDc'")
                    """
                    full_match = result[0]
                    order = result[1][1:-1]  # 0}{1
                    items = result[4]  # "'JDxA','QDc'"

                    order_list = order.split('}{')
                    order_ints = [int(x) for x in order_list]

                    items_list = [i.replace('\'', '').replace('"', '') for i in items.split(',')]

                    if len(order_ints) == len(items_list):
                        deobfuscated_string = ''
                        for i in order_ints:
                            deobfuscated_string += items_list[i]
                        decoded_process_tree = decoded_process_tree.replace(full_match, deobfuscated_string)

                new_trees.append(decoded_process_tree)
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .split() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'split' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(r'\.[\'\"]*split[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)', re.IGNORECASE)
                    try:
                        split_char = str(split_char_pattern.search(decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None

                    if split_char:
                        new_process_tree_decoded = ' '.join(decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace("'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception('Could not find process tree split() character.')
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .invoke() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'invoke' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(r'\.[\'\"]*invoke[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)', re.IGNORECASE)
                    try:
                        split_char = str(split_char_pattern.search(decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None

                    if split_char:
                        new_process_tree_decoded = ' '.join(decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace("'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace('. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception('Could not find process tree invoke() character.')
        dedup_report.process_trees_decoded += new_trees

        # Dedup the process tree URLs. Start by just adding the URLs from each report.
        dedup_report.process_tree_urls += report.process_tree_urls

        # Find the URLs in each decoded process tree.
        for decoded_tree in dedup_report.process_trees_decoded:
            urls = find_urls(decoded_tree)

            # Remove any URL that has these URLs as substrings, since it's probably a bogus
            # URL from the original, non-decoded process tree.
            for u in report.process_tree_urls:
                if any(decoded_url in u for decoded_url in urls):
                    try:
                        dedup_report.process_tree_urls.remove(u)
                        logger.debug('Removing bad process tree URL: {}'.format(u))
                    except:
                        pass

            dedup_report.process_tree_urls += urls

        dedup_report.process_tree_urls = list(set(dedup_report.process_tree_urls))
        dedup_report.process_tree_urls = [u for u in dedup_report.process_tree_urls if RegexHelpers.is_url(u)]
        dedup_report.indicators += make_url_indicators(dedup_report.process_tree_urls, tags=['url_in_process_tree'])

        # Add the screenshot URLs as-is.
        if report.screenshot_path:
            dedup_report.screenshot_paths.append(report.screenshot_path)

    return dedup_report
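# --- Hedged aside, not part of the original source ---------------------------
# Standalone illustration of the PowerShell format-operator deobfuscation step
# inside dedup_reports() above. It reuses the same regex and reordering logic;
# the function name is made up, and the sample command matches the example
# shown in the inline docstring.
import re


def deobfuscate_format_operator(command):
    formatter_pattern = re.compile(r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))', re.IGNORECASE)
    for result in formatter_pattern.findall(command):
        full_match = result[0]                                      # ("{0}{1}"-f'JDxA','QDc')
        order_ints = [int(x) for x in result[1][1:-1].split('}{')]  # [0, 1]
        items_list = [i.replace('\'', '').replace('"', '') for i in result[4].split(',')]
        if len(order_ints) == len(items_list):
            command = command.replace(full_match, ''.join(items_list[i] for i in order_ints))
    return command


assert deobfuscate_format_operator('powershell ("{0}{1}"-f\'JDxA\',\'QDc\')') == 'powershell JDxAQDc'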
def _parse_attachment(self, message_part, charset):
    part_items = message_part.items()
    for tup in part_items:
        for value in tup:
            if 'attachment' in value:
                file_data = message_part.get_payload()

                attachment_dict = {}
                if message_part.get('Content-Transfer-Encoding', None) == 'base64':
                    file_data_b64 = file_data.replace('\n', '')
                    # For some reason, sometimes the attachments don't have the proper
                    # padding. Add a couple "==" on the end for good measure. This doesn't
                    # seem to harm correctly encoded attachments.
                    file_data_decoded = base64.b64decode(file_data_b64 + '==')

                    # Try and get strings out of the attachment.
                    strings_list = RegexHelpers.find_strings(file_data_decoded)
                    strings = ' '.join(strings_list)

                    # Look for any URLs that were in the strings.
                    strings_urls = find_urls(strings)
                    attachment_dict['strings_urls'] = strings_urls
                elif message_part.get_content_type() == 'text/html':
                    file_data_decoded = message_part.get_payload(decode=True).decode(charset).encode('utf-8')
                else:
                    file_data_decoded = file_data

                try:
                    md5_hasher = hashlib.md5()
                    md5_hasher.update(file_data_decoded)
                    md5_hash = md5_hasher.hexdigest()
                except TypeError:
                    md5_hash = ''

                try:
                    sha256_hasher = hashlib.sha256()
                    sha256_hasher.update(file_data_decoded)
                    sha256_hash = sha256_hasher.hexdigest()
                except TypeError:
                    sha256_hash = ''

                attachment_dict['content_type'] = message_part.get_content_type()
                attachment_dict['size'] = len(file_data_decoded)
                attachment_dict['md5'] = md5_hash
                attachment_dict['sha256'] = sha256_hash
                attachment_dict['name'] = ''
                attachment_dict['create_date'] = ''
                attachment_dict['mod_date'] = ''
                attachment_dict['read_date'] = ''

                # Find the attachment name. Normally this follows a specific format
                # and is called 'filename=', but recently some have shown up in
                # different locations and are just called 'name='... Hence the old
                # code was replaced with a regex that accounts for either name in
                # any location in the message part.
                attachment_name_pattern = re.compile(r'(file)?name="?([^"]+)"?')
                for tup in part_items:
                    for item in tup:
                        item_lines = item.splitlines()
                        for item_line in item_lines:
                            attachment_name = attachment_name_pattern.search(item_line)
                            if attachment_name:
                                attachment_dict['name'] = RegexHelpers.decode_utf_b64_string(attachment_name.groups()[1])
                                if attachment_dict['name'].endswith(';'):
                                    attachment_dict['name'] = attachment_dict['name'][:-1]

                # Make the attachment indicators.
                self.indicators.append(Indicator('Windows - FileName', attachment_dict['name'], tags=['attachment']))
                self.indicators.append(Indicator('Hash - MD5', attachment_dict['md5'], tags=['attachment']))
                self.indicators.append(Indicator('Hash - SHA256', attachment_dict['sha256'], tags=['attachment']))

                return attachment_dict

    return None
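# --- Hedged aside, not part of the original source ---------------------------
# _parse_attachment() above appends "==" before base64-decoding because some
# attachments arrive with their padding stripped. In its default non-strict
# mode, base64.b64decode() ignores surplus padding but raises on missing
# padding, which is why the blanket "==" repairs broken data without harming
# correctly encoded data. The sample payload below is made up.
import base64

encoded = base64.b64encode(b'hello').decode()   # 'aGVsbG8=' (correctly padded)
stripped = encoded.rstrip('=')                  # 'aGVsbG8'  (padding lost in transit)

assert base64.b64decode(stripped + '==') == b'hello'   # repaired
assert base64.b64decode(encoded + '==') == b'hello'    # surplus padding ignored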
def parse_strings_urls(self):
    self.logger.debug('Looking for URLs in strings')
    return find_urls(self.parse_strings())
def test_find_urls_in_text_like_html():
    blob = b"""<meta http-equiv="refresh" content="0; URL=https://blah.com/one/two">"""

    assert urlfinderlib.find_urls(blob) == {"https://blah.com/one/two"}


def test_find_urls_ooxml():
    with open(f"{files_dir}/test.ooxml", "rb") as f:
        blob = f.read()

    assert urlfinderlib.find_urls(blob) == set()


def test_find_urls_rfc822():
    with open(f"{files_dir}/email.rfc822", "rb") as f:
        blob = f.read()

    assert urlfinderlib.find_urls(blob) == set()


def test_find_urls_text():
    assert urlfinderlib.find_urls("test") == set()
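# --- Hedged aside, not part of the original source ---------------------------
# The tests above reference `files_dir` and `urlfinderlib` without showing
# their setup. A typical arrangement would resemble the sketch below; the
# "files" directory name is an assumption for illustration only.
import os

import urlfinderlib

files_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'files')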
def __init__(self, smtp_path, whitelist):
    # Initiate logging.
    self.logger = logging.getLogger()

    # Save the whitelist.
    self.whitelist = whitelist

    # Items we parse out of the email.
    self.ace_url = ''
    self.attachments = []
    self.body = ''
    self.cc_addresses = []
    self.envelope_from = ''
    self.envelope_to = ''
    self.from_address = ''
    self.headers = ''
    self.html = ''
    self.indicators = []
    self.message_id = ''
    self.original_recipient = ''
    self.path = smtp_path
    self.received = ''
    self.received_time = ''
    self.remediated = False
    self.reply_to = ''
    self.return_path = ''
    self.screenshots = []
    self.subject = ''
    self.subject_decoded = ''
    self.to_addresses = []
    self.urls = []
    self.x_auth_id = ''
    self.x_mailer = ''
    self.x_original_sender = ''
    self.x_originating_ip = ''
    self.x_sender = ''
    self.x_sender_id = ''
    self.x_sender_ip = ''

    # Build the URL to the ACE alert.
    ace_uuid_pattern = re.compile(r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})')
    match = ace_uuid_pattern.search(self.path)
    if match:
        self.ace_url = '{}{}'.format(config['ace']['ace_alert_url'], match.group(1))

    with open(self.path, encoding='utf-8', errors='ignore') as s:
        smtp_stream = s.read().splitlines()

    # Locate any screenshots for this email.
    email_dir = os.path.dirname(self.path)
    files = os.listdir(email_dir)
    for f in files:
        if 'text_html' in f and f.endswith('.png') and not f.startswith('email_screenshot'):
            self.logger.debug('Found email screenshot: {}'.format(os.path.join(email_dir, f)))
            self.screenshots.append(os.path.join(email_dir, f))

    # Find the envelope from/to addresses. This will only work if given an
    # "smtp.stream" file, since otherwise the SMTP commands will not exist.
    envelope_address_pattern = re.compile(r'.*<(.*)>.*')
    for line in smtp_stream:
        if line.startswith('MAIL FROM:'):
            try:
                self.envelope_from = envelope_address_pattern.match(line).group(1)
            except:
                self.logger.exception('Unable to parse envelope from.')
        if line.startswith('RCPT TO:'):
            try:
                self.envelope_to = envelope_address_pattern.match(line).group(1)
            except:
                self.logger.exception('Unable to parse envelope to.')

    # Just in case we are dealing with an "smtp.stream" file that still has
    # the SMTP commands above the actual e-mail, we need to strip those out.
    # This will remove all lines prior to the Received: headers so that the
    # email.parser can properly parse out the e-mail. If we were given an
    # "smtp.email" type of file with the SMTP commands already removed, this
    # should not affect anything. This is legacy code at this point.
    try:
        while not smtp_stream[0].startswith('Received:'):
            smtp_stream.pop(0)
    except IndexError:
        smtp_stream = []

    # Join the header lines into a single string.
    self.email_text = '\n'.join(smtp_stream)

    # Create the e-mail object.
    email_obj = email.message_from_string(self.email_text)

    # We want to try and parse an embedded/attached e-mail if there is one.
    # Walk the full e-mail's parts.
    for part in email_obj.walk():
        # Check whether the part looks like a valid e-mail.
        if part.get_content_type() == 'message/rfc822':
            # Split the part lines into a list.
            part_text = str(part).splitlines()
            if any('Received:' in line for line in part_text):
                # Make sure our part starts with the Received: headers.
                while not part_text[0].startswith('Received:'):
                    part_text.pop(0)
                part_text = '\n'.join(part_text)

                # Make the new e-mail object.
                email_obj = email.message_from_string(part_text)

    # Parse the e-mail object for its content.
    parsed_email = self._parse_content(email_obj)

    # Now that we have the e-mail object, parse out some of the interesting parts.
    self.headers = self._get_all_headers_string(email_obj)
    self.received = self.get_header(email_obj, 'received')

    # Get the e-mail's plaintext body, HTML body, and the visible text from the HTML.
    self.body = parsed_email['body']
    self.html = parsed_email['html']

    # Get any e-mail attachments.
    self.attachments = parsed_email['attachments']

    # From address
    try:
        self.from_address = self._get_address_list(email_obj, 'from')[0][1]
        self.indicators.append(Indicator('Email - Address', self.from_address, tags=['from_address']))
    except:
        pass

    # From domain
    try:
        self.indicators.append(Indicator('URI - Domain Name', self.from_address.split('@')[1], tags=['from_domain']))
    except:
        pass

    # Reply-To address
    try:
        self.reply_to = self._get_address_list(email_obj, 'reply-to')[0][1]
        self.indicators.append(Indicator('Email - Address', self.reply_to, tags=['reply_to']))
    except:
        pass

    # X-Sender address
    try:
        self.x_sender = self._get_address_list(email_obj, 'X-Sender')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_sender, tags=['x_sender']))
    except:
        pass

    # X-Sender-Id address
    try:
        self.x_sender_id = self._get_address_list(email_obj, 'X-Sender-Id')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_sender_id, tags=['x_sender_id']))
    except:
        pass

    # X-Auth-Id address
    try:
        self.x_auth_id = self._get_address_list(email_obj, 'X-Auth-ID')[0][1]
        self.indicators.append(Indicator('Email - Address', self.x_auth_id, tags=['x_auth_id']))
    except:
        pass

    # Return-Path address (note: the header name is hyphenated)
    try:
        self.return_path = self._get_address_list(email_obj, 'return-path')[0][1]
        self.indicators.append(Indicator('Email - Address', self.return_path, tags=['return_path']))
    except:
        pass

    # X-MS-Exchange-Organization-OriginalEnvelopeRecipients address
    try:
        self.original_recipient = self._get_address_list(email_obj, 'X-MS-Exchange-Organization-OriginalEnvelopeRecipients')[0][1].lower()
        self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
    except:
        pass

    # If the original_recipient was not found, check if this is a POTENTIAL PHISH e-mail and use the from address.
    if not self.original_recipient and 'Subject: [POTENTIAL PHISH]' in self.email_text:
        try:
            temp_email_obj = email.message_from_string(self.email_text)
            self.original_recipient = self._get_address_list(temp_email_obj, 'from')[0][1]
            self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
        except:
            self.logger.exception('Error parsing original recipient from POTENTIAL PHISH e-mail.')

    # Subject
    try:
        self.subject = ''.join(self.get_header(email_obj, 'subject')[0].splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject))
    except:
        pass

    # Decoded subject
    try:
        self.subject_decoded = ''.join(str(make_header(decode_header(self.get_header(email_obj, 'subject')[0]))).splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject_decoded))
    except:
        pass

    # To addresses
    self.to_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'to')]

    # CC addresses
    self.cc_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'cc')]

    # Message-Id
    try:
        self.message_id = self.get_header(email_obj, 'message-id')[0]
        self.indicators.append(Indicator('Email Message ID', self.message_id, status='Informational'))
    except:
        pass

    # X-Mailer
    try:
        self.x_mailer = self.get_header(email_obj, 'x-mailer')[0]
        self.indicators.append(Indicator('Email - Xmailer', self.x_mailer, status='Informational'))
    except:
        pass

    # X-Original-Sender address
    try:
        self.x_original_sender = self.get_header(email_obj, 'x-original-sender')[0]
        self.indicators.append(Indicator('Email - Address', self.x_original_sender, tags=['x_original_sender']))
    except:
        pass

    # X-Originating-Ip
    try:
        x_originating_ip = self.get_header(email_obj, 'x-originating-ip')[0]
        # Sometimes this field is in the form: [1.1.1.1]
        # Make sure we remove any non-IP characters.
        ip = RegexHelpers.find_ip_addresses(x_originating_ip)
        if ip:
            self.x_originating_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_originating_ip, tags=['x_originating_ip']))
    except:
        pass

    # X-Sender-Ip
    try:
        x_sender_ip = self.get_header(email_obj, 'x-sender-ip')[0]
        # Make sure like the X-Originating-IP that we only
        # get the IP address and no other characters.
        ip = RegexHelpers.find_ip_addresses(x_sender_ip)
        if ip:
            self.x_sender_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_sender_ip, tags=['x_sender_ip']))
    except:
        pass

    self.received_time = self._get_received_time(email_obj)
    if not self.received_time:
        self.received_time = self._get_date_time()

    # Find any URLs in the plaintext body.
    text_urls = find_urls(self.body)

    # Find any URLs in the HTML body.
    html_urls = find_urls(self.html)

    # Get any strings URLs.
    strings_urls = []
    """
    for file in self.attachments:
        try:
            strings_urls += file['strings_urls']
        except:
            pass
    """

    # Try and remove any URLs that look like partial versions of other URLs.
    all_urls = text_urls + html_urls + strings_urls
    unique_urls = set()
    for u in all_urls:
        if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
            unique_urls.add(u)

    # Get rid of any invalid URLs.
    self.urls = [u for u in unique_urls if is_valid(u)]

    # Make indicators for the URLs.
    self.indicators += make_url_indicators(self.urls)

    # Get rid of any invalid indicators.
    self.indicators = [i for i in self.indicators if i.value]

    # Add any extra tags to each indicator.
    for i in self.indicators:
        i.tags.append('phish')
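# --- Hedged aside, not part of the original source ---------------------------
# Standalone illustration of the "partial URL" filtering done near the end of
# __init__ above: any URL that is a strict prefix of another collected URL is
# treated as a truncated duplicate and dropped. The sample URLs are made up.
all_urls = [
    'http://example.com/path',
    'http://example.com/path/page.html',    # the longer form is kept
    'http://other.example.net/index.php',
]

unique_urls = set()
for u in all_urls:
    if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
        unique_urls.add(u)

assert unique_urls == {'http://example.com/path/page.html', 'http://other.example.net/index.php'}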