def parse_memory_urls(self):
    """ Return a sorted list of the valid URLs found in the report's memory streams.

    The stream data is read with self.parse() from the
    analysis/hybridanalysis/ipdomainstreams/stream path of self.report.
    Each entry may be a plain string or a dict carrying the URL under 'db'.
    """

    self.logger.debug('Parsing memory URLs')

    streams = self.parse(self.report, 'analysis', 'hybridanalysis', 'ipdomainstreams', 'stream')
    if not streams:
        return []

    # A lone stream comes back as a dict instead of a list of entries.
    if isinstance(streams, dict):
        streams = [streams]

    found = set()
    for entry in streams:
        if isinstance(entry, str):
            candidate = entry
        elif isinstance(entry, dict) and 'db' in entry:
            candidate = entry['db']
        else:
            # Neither a string nor a dict with a 'db' key: nothing to collect.
            continue

        if is_valid(candidate):
            found.add(candidate)

    ordered = list(found)
    ordered.sort()
    return ordered
def is_email_address_whitelisted(self, address, value_in_indicator=True, indicator_in_value=False):
    """ Returns True if the email address is whitelisted.

    Addresses that do not look like an e-mail address, or whose domain is
    not valid, are added to the whitelisted cache and reported as whitelisted.
    """

    # Cached verdicts short-circuit every other check.
    if self._is_cached_whitelisted(address):
        return True
    if self._is_cached_nonwhitelisted(address):
        return False

    # Syntax check. Any failure here (including a non-string address) is
    # treated the same way as an invalid address.
    address_regex = re.compile(
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}')
    try:
        looks_like_email = address_regex.match(address)
        if not looks_like_email:
            self._add_whitelisted_cache(address)
            self.logger.debug('Invalid e-mail address: {}'.format(address))
            return True
    except:
        self._add_whitelisted_cache(address)
        return True

    # Domain check on whatever follows the first '@'.
    try:
        domain_part = address.split('@')[1]
        domain_ok = is_valid(domain_part)
        if not domain_ok:
            self._add_whitelisted_cache(address)
            self.logger.debug(
                'Invalid e-mail address domain: {}'.format(address))
            return True
    except:
        self._add_whitelisted_cache(address)
        return True

    # Indicator types that are consulted for e-mail addresses.
    email_indicator_types = [
        'Email - Address',
        'WHOIS Registrant Email Address',
        'Email Address From',
        'Email Address Sender'
    ]
    return self._is_whitelisted(address,
                                email_indicator_types,
                                value_in_indicator=value_in_indicator,
                                indicator_in_value=indicator_in_value)
def is_domain_whitelisted(self, domain, value_in_indicator=False, indicator_in_value=True):
    """ Returns True if the domain has an invalid TLD or is whitelisted.

    Invalid domains are added to the whitelisted cache before returning.
    """

    # A cached verdict, if there is one, wins outright.
    if self._is_cached_whitelisted(domain):
        return True
    if self._is_cached_nonwhitelisted(domain):
        return False

    # A valid domain goes to the whitelist lookup itself.
    if is_valid(domain):
        return self._is_whitelisted(domain,
                                    ['URI - Domain Name'],
                                    value_in_indicator=value_in_indicator,
                                    indicator_in_value=indicator_in_value)

    # Anything else is treated as whitelisted and remembered as such.
    self._add_whitelisted_cache(domain)
    self.logger.debug('Invalid domain: {}'.format(domain))
    return True
def dedup_reports(report_list, whitelist):
    """ Merge a list of BaseSandboxParser subclass objects to make a single generic report.

    Every report in report_list is folded, in order, into a new
    BaseSandboxParser instance. While merging, Indicator objects are appended
    to the merged report's indicators list for hashes, contacted hosts,
    dropped files, HTTP/DNS requests and the URLs found in memory, strings
    and the (decoded) process trees.

    :param report_list: Iterable of parsed sandbox reports to merge.
    :param whitelist: Object providing is_dropped_file_whitelisted() and
        is_dns_request_whitelisted().
    :return: The merged BaseSandboxParser instance.

    NOTE(review): the nesting below was reconstructed from a copy of the
    source whose line breaks were lost - confirm against version control.
    """

    logger = logging.getLogger()
    logger.debug('Deduping sandbox report list')

    # Create the new generic report.
    dedup_report = BaseSandboxParser()

    for report in report_list:
        dedup_report.sandbox_urls += report.sandbox_urls

        # Scalar fields: the last report with a truthy value wins. The
        # placeholder filename 'sample' never overwrites a real one.
        if report.filename and not report.filename == 'sample':
            dedup_report.filename = report.filename

        if report.md5:
            dedup_report.md5 = report.md5
            dedup_report.indicators.append(
                Indicator('Hash - MD5',
                          dedup_report.md5,
                          tags=['sandboxed_sample']))

        if report.sha1:
            dedup_report.sha1 = report.sha1
            dedup_report.indicators.append(
                Indicator('Hash - SHA1',
                          dedup_report.sha1,
                          tags=['sandboxed_sample']))

        if report.sha256:
            dedup_report.sha256 = report.sha256
            dedup_report.indicators.append(
                Indicator('Hash - SHA256',
                          dedup_report.sha256,
                          tags=['sandboxed_sample']))

        # No indicator is created for the SHA512 hash.
        if report.sha512:
            dedup_report.sha512 = report.sha512

        if report.ssdeep:
            dedup_report.ssdeep = report.ssdeep
            dedup_report.indicators.append(
                Indicator('Hash - SSDEEP',
                          dedup_report.ssdeep,
                          tags=['sandboxed_sample']))

        dedup_report.malware_family += report.malware_family

        # Dedup the contacted hosts.
        for host in report.contacted_hosts:
            if not host in dedup_report.contacted_hosts:
                dedup_report.contacted_hosts.append(host)
                # Tag with "<protocol> <port>" when both are known, or just
                # the protocol when the port is missing.
                tags = ['contacted_host']
                if host['protocol'] and host['port']:
                    tags.append('{} {}'.format(host['protocol'], host['port']))
                elif host['protocol']:
                    tags.append(host['protocol'])

                # For now we consider ALL contacted hosts to be benign, so no need to check the whitelist.
                dedup_report.indicators.append(
                    Indicator('Address - ipv4-addr',
                              host['ipv4'],
                              status='Informational',
                              tags=tags))

        # Dedup the dropped files.
        for file in report.dropped_files:
            # Dropped files are harder than the other items to properly whitelist, so we will
            # initially restrict them to certain file names or file types that we care about.
            if any(name in file['filename'] for name in dedup_report.good_dropped_file_names) or any(
                    t in file['type'] for t in dedup_report.good_dropped_file_types):
                if not file in dedup_report.dropped_files:
                    dedup_report.dropped_files.append(file)

                    # If any part of the dropped file is whitelisted, make sure we mark all parts as whitelisted.
                    if whitelist.is_dropped_file_whitelisted(file):
                        status = 'Whitelisted'
                        file['status'] = 'Whitelisted'
                    else:
                        status = 'New'

                    # The three hash indicators reference each other through
                    # their relationships lists.
                    dedup_report.indicators.append(
                        Indicator('Windows - FileName',
                                  file['filename'],
                                  status=status,
                                  tags=['dropped_file']))
                    dedup_report.indicators.append(
                        Indicator('Hash - MD5',
                                  file['md5'],
                                  status=status,
                                  tags=['dropped_file'],
                                  relationships=[file['sha1'], file['sha256']]))
                    dedup_report.indicators.append(
                        Indicator('Hash - SHA1',
                                  file['sha1'],
                                  status=status,
                                  tags=['dropped_file'],
                                  relationships=[file['md5'], file['sha256']]))
                    dedup_report.indicators.append(
                        Indicator('Hash - SHA256',
                                  file['sha256'],
                                  status=status,
                                  tags=['dropped_file'],
                                  relationships=[file['md5'], file['sha1']]))

        # Dedup the HTTP requests.
        for request in report.http_requests:
            if not request in dedup_report.http_requests:
                dedup_report.http_requests.append(request)
                dedup_report.indicators += make_url_indicators(
                    [request['url']], tags=['http_request', request['method']])

        # Dedup the DNS requests.
        for request in report.dns_requests:
            if not request in dedup_report.dns_requests:
                dedup_report.dns_requests.append(request)

                # If any part of the DNS request is whitelisted, make sure we mark all parts as whitelisted.
                # NOTE(review): status is computed here but is not passed to
                # either Indicator created below - confirm whether intended.
                if whitelist.is_dns_request_whitelisted(request):
                    status = 'Whitelisted'
                else:
                    status = 'New'

                # For now we consider ALL request IP addresses to be benign, so no need to check the whitelist.
                dedup_report.indicators.append(
                    Indicator('URI - Domain Name',
                              request['request'],
                              tags=['dns_request']))
                # Only answers that parse as an IP address get an indicator;
                # anything else is skipped silently.
                try:
                    ipaddress.ip_address(request['answer'])
                    dedup_report.indicators.append(
                        Indicator('Address - ipv4-addr',
                                  request['answer'],
                                  tags=['dns_response'],
                                  status='Informational',
                                  relationships=[request['request']]))
                except:
                    pass

        # Dedup the memory strings.
        dedup_report.memory_strings += report.memory_strings
        dedup_report.memory_strings = sorted(
            list(set(dedup_report.memory_strings)))

        # Dedup the memory URLs.
        # NOTE(review): indicators are generated from the full accumulated
        # list on every report, so URLs from earlier reports are processed
        # again - confirm downstream handling of duplicates.
        dedup_report.memory_urls += report.memory_urls
        dedup_report.memory_urls = list(set(dedup_report.memory_urls))
        dedup_report.memory_urls = [
            u for u in dedup_report.memory_urls if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.memory_urls, tags=['url_in_memory'])

        # Dedup the strings URLs.
        dedup_report.strings_urls += report.strings_urls
        dedup_report.strings_urls = list(set(dedup_report.strings_urls))
        dedup_report.strings_urls = [
            u for u in dedup_report.strings_urls if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.strings_urls, tags=['url_in_strings'])

        # Dedup the mutexes.
        dedup_report.mutexes += report.mutexes
        dedup_report.mutexes = list(set(dedup_report.mutexes))

        # Dedup the resolved APIs.
        dedup_report.resolved_apis += report.resolved_apis
        dedup_report.resolved_apis = list(set(dedup_report.resolved_apis))

        # Dedup the created services.
        dedup_report.created_services += report.created_services
        dedup_report.created_services = list(set(
            dedup_report.created_services))

        # Dedup the started services.
        dedup_report.started_services += report.started_services
        dedup_report.started_services = list(set(
            dedup_report.started_services))

        # Add the process tree as-is.
        dedup_report.process_trees.append(report.process_tree)

        # Try to decode base64 chunks in the process tree.
        # Each whitespace-separated chunk is tried on its own; chunks that do
        # not decode are left untouched. A NUL in the UTF-8 result triggers a
        # second attempt as UTF-16.
        process_tree_decoded = report.process_tree
        for chunk in report.process_tree.split():
            try:
                decoded_chunk = base64.b64decode(chunk).decode('utf-8')
                if '\x00' in decoded_chunk:
                    decoded_chunk = base64.b64decode(chunk).decode('utf-16')
                process_tree_decoded = process_tree_decoded.replace(
                    chunk, decoded_chunk)
            except:
                pass
        dedup_report.process_trees_decoded.append(process_tree_decoded)

        # Each deobfuscation pass below scans every decoded tree collected so
        # far (including those from earlier reports and earlier passes) and
        # appends its results as additional trees.

        # Remove ` backtick and other basic Powershell obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'powershell' in decoded_process_tree.lower():
                new_trees.append(decoded_process_tree.replace('`', ''))
        dedup_report.process_trees_decoded += new_trees

        # Remove Powershell string formatter obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            formatter_pattern = re.compile(
                r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))',
                re.IGNORECASE)
            results = formatter_pattern.findall(decoded_process_tree)
            if results:
                for result in results:
                    """ ('("{0}{1}"-f\'JDxA\',\'QDc\')', '{0}{1}', '{1}', '1', "'JDxA','QDc'", "'QDc'") """
                    full_match = result[0]
                    order = result[1][1:-1]  # 0}{1
                    items = result[4]  # "'JDxA','QDc'"
                    order_list = order.split('}{')
                    order_ints = [int(x) for x in order_list]
                    items_list = [
                        i.replace('\'', '').replace('"', '')
                        for i in items.split(',')
                    ]
                    # Only substitute when every placeholder has an item.
                    if len(order_ints) == len(items_list):
                        deobfuscated_string = ''
                        for i in order_ints:
                            deobfuscated_string += items_list[i]
                        decoded_process_tree = decoded_process_tree.replace(
                            full_match, deobfuscated_string)
                new_trees.append(decoded_process_tree)
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .split() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'split' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(
                        r'\.[\'\"]*split[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)',
                        re.IGNORECASE)
                    # search() returns None when there is no match, which
                    # surfaces as AttributeError on .group().
                    try:
                        split_char = str(
                            split_char_pattern.search(
                                decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None
                    if split_char:
                        # Split on the recovered separator, then strip string
                        # concatenation and quote characters.
                        new_process_tree_decoded = ' '.join(
                            decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            "'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception(
                        'Could not find process tree split() character.')
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .invoke() obfuscation (used by Emotet and others)
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            if 'invoke' in decoded_process_tree.lower():
                try:
                    split_char_pattern = re.compile(
                        r'\.[\'\"]*invoke[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)',
                        re.IGNORECASE)
                    # Same None-on-no-match handling as the split() pass.
                    try:
                        split_char = str(
                            split_char_pattern.search(
                                decoded_process_tree).group(1))
                    except AttributeError:
                        split_char = None
                    if split_char:
                        new_process_tree_decoded = ' '.join(
                            decoded_process_tree.split(split_char))
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            "'+'", '')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '"+"', '')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '\'', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '\"', ' ')
                        new_process_tree_decoded = new_process_tree_decoded.replace(
                            '. ', ' ')
                        new_trees.append(new_process_tree_decoded)
                except:
                    logger.exception(
                        'Could not find process tree invoke() character.')
        dedup_report.process_trees_decoded += new_trees

        # Dedup the process tree URLs. Start by just adding the URLs from each report.
        dedup_report.process_tree_urls += report.process_tree_urls

        # Find the URLs in each decoded process tree.
        for decoded_tree in dedup_report.process_trees_decoded:
            urls = find_urls(decoded_tree)
            # Remove any URL that has these URLs as substrings, since it's probably a bogus
            # URL from the original, non-decoded process tree.
            for u in report.process_tree_urls:
                if any(decoded_url in u for decoded_url in urls):
                    # remove() raises if u was already removed on an earlier
                    # pass; that case is ignored.
                    try:
                        dedup_report.process_tree_urls.remove(u)
                        logger.debug(
                            'Removing bad process tree URL: {}'.format(u))
                    except:
                        pass
            dedup_report.process_tree_urls += urls

        dedup_report.process_tree_urls = list(
            set(dedup_report.process_tree_urls))
        dedup_report.process_tree_urls = [
            u for u in dedup_report.process_tree_urls if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.process_tree_urls, tags=['url_in_process_tree'])

        # Add the screenshot URLs as-is.
        if report.screenshot_path:
            dedup_report.screenshot_paths.append(report.screenshot_path)

    return dedup_report
def is_url_whitelisted(self, u, value_in_indicator=False, indicator_in_value=False):
    """ Returns True if the URL is invalid or is whitelisted.

    Checks are made in this order, stopping at the first verdict: the
    whitelist caches, URL validity (an invalid URL counts as whitelisted),
    the host (as an IP address or a domain), the path, the query string and
    finally the entire URL. Every positive verdict except a cache hit is
    stored with self._add_whitelisted_cache().

    :param u: The URL to check.
    :param value_in_indicator: Passed to self._is_whitelisted() for the full-URL check.
    :param indicator_in_value: Passed to self._is_whitelisted() for the full-URL check.
    :return: True or False from the cache, True when any part is whitelisted,
        otherwise whatever self._is_whitelisted() returns for the full URL.
    """

    # First check if the URL was already cached.
    if self._is_cached_whitelisted(u):
        return True
    if self._is_cached_nonwhitelisted(u):
        return False

    # Check if the URL is valid.
    if not is_valid(u):
        self._add_whitelisted_cache(u)
        self.logger.debug('Invalid URL: {}'.format(u))
        return True

    # Split the URL and check each part against the whitelist.
    split_url = urlsplit(u)

    # Reduce the netloc to the bare host so it can be compared against the
    # whitelists: drop any "user[:password]@" prefix first, then any ":port"
    # suffix. Doing it in this order also covers "user@host:port" and
    # "user:password@host:port", which previously raised IndexError or left
    # the port attached to the host.
    netloc = split_url.netloc.rpartition('@')[2].split(':')[0]

    # Check the netloc. Only a ValueError from ip_address() means "this is a
    # domain name"; errors raised by the whitelist checks are not swallowed.
    try:
        ipaddress.ip_address(netloc)
    except ValueError:
        host_is_ip = False
    else:
        host_is_ip = True

    if host_is_ip:
        if self.is_ip_whitelisted(netloc):
            self._add_whitelisted_cache(u)
            self.logger.debug('URL whitelisted because of IP: {}'.format(u))
            return True
    elif self.is_domain_whitelisted(netloc):
        self._add_whitelisted_cache(u)
        self.logger.debug('URL whitelisted because of domain: {}'.format(u))
        return True

    # Check the URI path if it exists.
    if split_url.path and split_url.path != '/':
        if self.is_uri_path_whitelisted(split_url.path):
            self._add_whitelisted_cache(u)
            self.logger.debug('URL whitelisted because of path: {}'.format(u))
            return True

    # Check the URI query if it exists. The query is checked with the same
    # helper as the path.
    if split_url.query:
        if self.is_uri_path_whitelisted(split_url.query):
            self._add_whitelisted_cache(u)
            self.logger.debug('URL whitelisted because of query: {}'.format(u))
            return True

    # Finally check the entire URL.
    return self._is_whitelisted(u, ['URI - URL'],
                                value_in_indicator=value_in_indicator,
                                indicator_in_value=indicator_in_value)
def __init__(self, smtp_path, whitelist):
    """ Parse the e-mail stored at smtp_path and build indicators from it.

    The file is read as text, any SMTP commands ahead of the first
    "Received:" header are dropped, and the remainder is parsed with the
    email package. If the message carries an embedded message/rfc822 part
    that has its own "Received:" headers, that embedded e-mail is parsed
    instead. Header values, bodies, attachments and URLs are stored on the
    instance, and Indicator objects are collected in self.indicators, each
    tagged with 'phish'.

    :param smtp_path: Path to the e-mail file. Screenshots are looked up in
        the same directory.
    :param whitelist: Whitelist object; stored as self.whitelist.
    """

    # Initiate logging.
    self.logger = logging.getLogger()

    # Save the whitelist.
    self.whitelist = whitelist

    # Items we parse out of the email.
    self.ace_url = ''
    self.attachments = []
    self.body = ''
    self.cc_addresses = []
    self.envelope_from = ''
    self.envelope_to = ''
    self.from_address = ''
    self.headers = ''
    self.html = ''
    self.indicators = []
    self.message_id = ''
    self.original_recipient = ''
    self.path = smtp_path
    self.received = ''
    self.received_time = ''
    self.remediated = False
    self.reply_to = ''
    self.return_path = ''
    self.screenshots = []
    self.subject = ''
    self.subject_decoded = ''
    self.to_addresses = []
    self.urls = []
    self.x_auth_id = ''
    self.x_mailer = ''
    self.x_original_sender = ''
    self.x_originating_ip = ''
    self.x_sender = ''
    self.x_sender_id = ''
    self.x_sender_ip = ''

    # Build the URL to the ACE alert from the UUID embedded in the path.
    ace_uuid_pattern = re.compile(r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})')
    match = ace_uuid_pattern.search(self.path)
    if match:
        self.ace_url = '{}{}'.format(config['ace']['ace_alert_url'], match.group(1))

    with open(self.path, encoding='utf-8', errors='ignore') as s:
        smtp_stream = s.read().splitlines()

    # Locate any screenshots for this email. A bare file name has an empty
    # dirname, which os.listdir() rejects, so fall back to the current
    # directory for the listing only.
    email_dir = os.path.dirname(self.path)
    for f in os.listdir(email_dir or '.'):
        if 'text_html' in f and f.endswith('.png') and not f.startswith('email_screenshot'):
            self.logger.debug('Found email screenshot: {}'.format(os.path.join(email_dir, f)))
            self.screenshots.append(os.path.join(email_dir, f))

    # Find the envelope from/to addresses. This will only work if given an
    # "smtp.stream" file, since otherwise the SMTP commands will not exist.
    envelope_address_pattern = re.compile(r'.*<(.*)>.*')
    for line in smtp_stream:
        if line.startswith('MAIL FROM:'):
            try:
                self.envelope_from = envelope_address_pattern.match(line).group(1)
            except Exception:
                self.logger.exception('Unable to parse envelope from.')

        if line.startswith('RCPT TO:'):
            try:
                self.envelope_to = envelope_address_pattern.match(line).group(1)
            except Exception:
                self.logger.exception('Unable to parse envelope to.')

    # Just in case we are dealing with an "smtp.stream" file that still has
    # the SMTP commands above the actual e-mail, we need to strip those out.
    # This removes all lines prior to the Received: headers so that the
    # email parser can properly parse the e-mail. A file with no Received:
    # line at all ends up empty. This is legacy code at this point.
    first_received = next(
        (i for i, line in enumerate(smtp_stream) if line.startswith('Received:')), None)
    smtp_stream = [] if first_received is None else smtp_stream[first_received:]

    # Join the header lines into a single string.
    self.email_text = '\n'.join(smtp_stream)

    # Create the e-mail object.
    email_obj = email.message_from_string(self.email_text)

    # We want to try and parse an embedded/attached e-mail if there is one.
    # Walk the full e-mail's parts.
    for part in email_obj.walk():
        # Continue if the part looks like a valid e-mail.
        if part.get_content_type() == 'message/rfc822':
            part_lines = str(part).splitlines()

            # Make sure our part starts with the Received: headers. Parts in
            # which no line *starts* with "Received:" are skipped; a part
            # that merely contains the text (e.g. "X-Received:") used to
            # raise IndexError here.
            part_start = next(
                (i for i, line in enumerate(part_lines) if line.startswith('Received:')), None)
            if part_start is not None:
                # Make the new e-mail object.
                email_obj = email.message_from_string('\n'.join(part_lines[part_start:]))

    # Parse the e-mail object for its content.
    parsed_email = self._parse_content(email_obj)

    # Now that we have the e-mail object, parse out some of the interesting parts.
    self.headers = self._get_all_headers_string(email_obj)
    self.received = self.get_header(email_obj, 'received')

    # Get the e-mail's plaintext body and HTML body.
    self.body = parsed_email['body']
    self.html = parsed_email['html']

    # Get any e-mail attachments.
    self.attachments = parsed_email['attachments']

    # Every header below is optional: a missing or malformed header leaves
    # the attribute at its default and produces no indicator.

    # From address
    try:
        self.from_address = self._get_address_list(email_obj, 'from')[0][1]
        self.indicators.append(Indicator('Email - Address', self.from_address, tags=['from_address']))
    except Exception:
        pass

    # From domain
    try:
        self.indicators.append(Indicator('URI - Domain Name', self.from_address.split('@')[1], tags=['from_domain']))
    except Exception:
        pass

    # Reply-To, X-Sender, X-Sender-Id, X-Auth-ID and Return-Path addresses.
    # Each entry is (attribute name, header name); the attribute name is
    # also used as the indicator tag.
    # NOTE(review): 'return_path' is passed as the header name unchanged
    # from the original code; the usual header is "Return-Path" - confirm
    # how _get_address_list() resolves it.
    for attr_name, header_name in (
            ('reply_to', 'reply-to'),
            ('x_sender', 'X-Sender'),
            ('x_sender_id', 'X-Sender-Id'),
            ('x_auth_id', 'X-Auth-ID'),
            ('return_path', 'return_path'),
    ):
        try:
            address = self._get_address_list(email_obj, header_name)[0][1]
            setattr(self, attr_name, address)
            self.indicators.append(Indicator('Email - Address', address, tags=[attr_name]))
        except Exception:
            pass

    # X-MS-Exchange-Organization-OriginalEnvelopeRecipients address
    try:
        self.original_recipient = self._get_address_list(email_obj, 'X-MS-Exchange-Organization-OriginalEnvelopeRecipients')[0][1].lower()
        self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
    except Exception:
        pass

    # If the original_recipient was not found, check if this is a POTENTIAL PHISH e-mail and use the from address.
    # The outer e-mail text is re-parsed here because email_obj may now
    # point at an embedded message.
    if not self.original_recipient and 'Subject: [POTENTIAL PHISH]' in self.email_text:
        try:
            temp_email_obj = email.message_from_string(self.email_text)
            self.original_recipient = self._get_address_list(temp_email_obj, 'from')[0][1]
            self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
        except Exception:
            self.logger.exception('Error parsing original recipient from POTENTIAL PHISH e-mail.')

    # Subject (folded header lines are joined back together).
    try:
        self.subject = ''.join(self.get_header(email_obj, 'subject')[0].splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject))
    except Exception:
        pass

    # Decoded subject
    try:
        self.subject_decoded = ''.join(str(make_header(decode_header(self.get_header(email_obj, 'subject')[0]))).splitlines())
        self.indicators.append(Indicator('Email - Subject', self.subject_decoded))
    except Exception:
        pass

    # To addresses
    self.to_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'to')]

    # CC addresses
    self.cc_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'cc')]

    # Message-Id
    try:
        self.message_id = self.get_header(email_obj, 'message-id')[0]
        self.indicators.append(Indicator('Email Message ID', self.message_id, status='Informational'))
    except Exception:
        pass

    # X-Mailer
    try:
        self.x_mailer = self.get_header(email_obj, 'x-mailer')[0]
        self.indicators.append(Indicator('Email - Xmailer', self.x_mailer, status='Informational'))
    except Exception:
        pass

    # X-Original-Sender address
    try:
        self.x_original_sender = self.get_header(email_obj, 'x-original-sender')[0]
        self.indicators.append(Indicator('Email - Address', self.x_original_sender, tags=['x_original_sender']))
    except Exception:
        pass

    # X-Originating-Ip
    try:
        x_originating_ip = self.get_header(email_obj, 'x-originating-ip')[0]
        # Sometimes this field is in the form: [1.1.1.1]
        # Make sure we remove any non-IP characters.
        ip = RegexHelpers.find_ip_addresses(x_originating_ip)
        if ip:
            self.x_originating_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_originating_ip, tags=['x_originating_ip']))
    except Exception:
        pass

    # X-Sender-Ip
    try:
        x_sender_ip = self.get_header(email_obj, 'x-sender-ip')[0]
        # Make sure like the X-Originating-IP that we only
        # get the IP address and no other characters.
        ip = RegexHelpers.find_ip_addresses(x_sender_ip)
        if ip:
            self.x_sender_ip = ip[0]
            self.indicators.append(Indicator('Address - ipv4-addr', self.x_sender_ip, tags=['x_sender_ip']))
    except Exception:
        pass

    # Prefer the time from the Received: headers; fall back to the Date.
    self.received_time = self._get_received_time(email_obj)
    if not self.received_time:
        self.received_time = self._get_date_time()

    # Find any URLs in the plaintext body.
    text_urls = find_urls(self.body)

    # Find any URLs in the HTML body.
    html_urls = find_urls(self.html)

    # URLs from attachment strings are currently not collected.
    strings_urls = []

    # Try and remove any URLs that look like partial versions of other URLs.
    all_urls = text_urls + html_urls + strings_urls
    unique_urls = set()
    for u in all_urls:
        if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
            unique_urls.add(u)

    # Get rid of any invalid URLs.
    self.urls = [u for u in unique_urls if is_valid(u)]

    # Make indicators for the URLs.
    self.indicators += make_url_indicators(self.urls)

    # Get rid of any invalid indicators.
    self.indicators = [i for i in self.indicators if i.value]

    # Add any extra tags to each indicator.
    for i in self.indicators:
        i.tags.append('phish')
def make_url_indicators(urls, tags=None):
    """ Make indicators from a list of URLs.

    For every valid URL, and for the same URL with its query string removed,
    indicators are created for the host (IP address or domain name), the
    first-level domain, the full URL, the path (raw and percent-decoded when
    that differs) and the query string.

    :param urls: A list of URLs, or a single URL string.
    :param tags: Optional list of tags applied to every indicator. Defaults
        to no tags. The list is copied, never stored or modified.
    :return: List of unique Indicator objects that have a non-empty value.
    """

    # Work on a private copy. The previous mutable default ([]) and the
    # caller's list were handed to Indicator objects by reference, so a later
    # in-place change to one indicator's tags could leak into other
    # indicators and into subsequent calls.
    tags = [] if tags is None else list(tags)

    if isinstance(urls, str):
        urls = [urls]

    credentials_pattern = re.compile(r'(.*?:.*?@)')

    indicators = []
    for url in set(urls):
        if not is_valid(url):
            continue

        parsed_url = urlsplit(url)
        url_without_query = parsed_url.scheme + '://' + parsed_url.netloc + parsed_url.path

        for u in {url, url_without_query}:
            # Whitelist-based status selection is currently disabled, so
            # every component is created as 'New'.
            status = 'New'

            parsed_url = urlsplit(u)

            # A ':' in the netloc indicates a port number, which has to be
            # removed in order to create indicators for the host.
            netloc = parsed_url.netloc.split(':')[0]

            # Look for the edge case of the URL having a username:password
            # notation: re-parse the URL without the credentials, then strip
            # the port again (it used to be left attached to the host).
            if ':' in parsed_url.netloc and '@' in parsed_url.netloc:
                credentials = credentials_pattern.findall(parsed_url.netloc)
                if credentials:
                    try:
                        stripped_url = urlsplit(u.replace(credentials[0], ''))
                    except ValueError:
                        # Best effort: keep the values computed above.
                        pass
                    else:
                        parsed_url = stripped_url
                        netloc = parsed_url.netloc.split(':')[0]

            # Domain/IP
            try:
                ipaddress.ip_address(netloc)
                indicators.append(
                    Indicator('Address - ipv4-addr',
                              netloc,
                              status=status,
                              tags=tags + ['ip_in_url'],
                              relationships=[u]))
            except ValueError:
                indicators.append(
                    Indicator('URI - Domain Name',
                              netloc,
                              status=status,
                              tags=tags + ['domain_in_url'],
                              relationships=[u]))

            # First-level domain
            fld = get_fld('http://{}'.format(netloc), fail_silently=True)
            if fld:
                indicators.append(
                    Indicator('URI - Domain Name',
                              fld,
                              status=status,
                              tags=list(tags),
                              relationships=[u]))

            # Full URL
            indicators.append(Indicator('URI - URL', u, tags=list(tags)))

            # Path
            indicators.append(
                Indicator('URI - Path',
                          parsed_url.path,
                          status=status,
                          tags=list(tags),
                          relationships=[u, parsed_url.netloc]))

            # Percent-decoded path, only when it differs from the raw one.
            try:
                decoded_path = urllib.parse.unquote(parsed_url.path)
                if not decoded_path == parsed_url.path:
                    indicators.append(
                        Indicator('URI - Path',
                                  decoded_path,
                                  status=status,
                                  tags=list(tags),
                                  relationships=[u, parsed_url.netloc]))
            except Exception:
                pass

            # Query
            indicators.append(
                Indicator('URI - Path',
                          parsed_url.query,
                          status=status,
                          tags=list(tags),
                          relationships=[u, parsed_url.netloc]))

    # Drop duplicates and indicators with an empty value (e.g. empty
    # path or query).
    good_indicators = [i for i in set(indicators) if i.value]

    return good_indicators