def load_blacklist(self):
    logging.debug("loading blacklist from {}".format(self.blacklist_path))
    blacklisted_fqdn = []
    blacklisted_cidr = []
    try:
        with open(self.blacklist_path, 'r') as fp:
            for line in fp:
                line = line.strip()
                # skip comments
                if line.startswith('#'):
                    continue
                # skip blank lines
                if line == '':
                    continue

                if is_ipv4(line):
                    blacklisted_cidr.append(IPv4Network(add_netmask(line)))
                else:
                    blacklisted_fqdn.append(line)

        self.blacklisted_cidr = blacklisted_cidr
        self.blacklisted_fqdn = blacklisted_fqdn

        logging.debug("loaded {} cidr {} fqdn blacklisted items".format(
                      len(self.blacklisted_cidr), len(self.blacklisted_fqdn)))

    except Exception as e:
        logging.error("unable to load blacklist {}: {}".format(self.blacklist_path, e))
        report_exception()
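# Illustrative blacklist file for load_blacklist() above (hypothetical content, not
# shipped with the module): one entry per line, '#' starts a comment and blank lines
# are skipped; lines that parse as IPv4 addresses are stored as CIDR networks via
# add_netmask(), everything else is treated as an FQDN.
#
#   # known-bad infrastructure
#   203.0.113.45
#   bad-tracker.example.com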
def filter(self, url):
    """Returns a FilterResult describing whether the given URL should be filtered (not crawled).
       Check the reason property for the reason the URL is filtered."""

    result = FilterResult()
    result.filtered = False
    result.reason = REASON_UNKNOWN
    result.parsed_url = process_url(url)

    if not result.parsed_url:
        logging.debug("unable to process url {}".format(url))
        result.reason = REASON_ERROR
        return result

    logging.debug("analyzing scheme {} netloc {} hostname {} path {} params {} query {} fragment {}".format(
                  result.parsed_url.scheme,
                  result.parsed_url.netloc,
                  result.parsed_url.hostname,
                  result.parsed_url.path,
                  result.parsed_url.params,
                  result.parsed_url.query,
                  result.parsed_url.fragment))

    if self.is_whitelisted(result.parsed_url.hostname):
        result.reason = REASON_WHITELISTED
        result.filtered = False
        return result

    if self.is_blacklisted(result.parsed_url.hostname):
        result.reason = REASON_BLACKLISTED
        result.filtered = True
        return result

    # if the URL is just to an IP address then we crawl that no matter what
    if is_ipv4(result.parsed_url.hostname):
        result.reason = REASON_DIRECT_IPV4
        result.filtered = False
        return result

    if result.parsed_url.path:
        if self.matches_path_regex(result.parsed_url.path):
            result.reason = REASON_WHITELISTED
            result.filtered = False
            return result

    if self.is_in_intel_db(result.parsed_url):
        result.reason = REASON_CRITS
        result.filtered = False
        return result

    if not self.is_uncommon_network(result.parsed_url.hostname):
        result.reason = REASON_COMMON_NETWORK
        result.filtered = True
        return result

    result.filtered = False
    result.reason = REASON_OK
    return result
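# Illustrative usage sketch (not part of the module): how a caller might drive
# CrawlphishURLFilter.filter() above. The helper name and example URL are
# hypothetical; CrawlphishURLFilter and the load_* methods are the ones referenced
# elsewhere in this module.
def _example_filter_usage(url='http://example.com/invoice.doc'):
    url_filter = CrawlphishURLFilter()
    url_filter.load_whitelist()
    url_filter.load_blacklist()
    url_filter.load_path_regexes()

    result = url_filter.filter(url)
    if result.filtered:
        logging.info("not crawling {}: {}".format(url, result.reason))
    return result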
def is_whitelisted(self, value):
    if is_ipv4(value):
        for cidr in self.whitelisted_cidr:
            if IPv4Address(value) in cidr:
                logging.debug("{} matches whitelisted cidr {}".format(value, cidr))
                return True
        return False

    for dst in self.whitelisted_fqdn:
        if is_subdomain(value, dst):
            logging.debug("{} matches whitelisted fqdn {}".format(value, dst))
            return True

    return False
def is_blacklisted(self, value):
    if is_ipv4(value):
        for cidr in self.blacklisted_cidr:
            try:
                if IPv4Address(value) in cidr:
                    logging.debug("{} matches blacklisted cidr {}".format(value, cidr))
                    return True
            except Exception as e:
                logging.error("failed to compare {} to {}: {}".format(value, cidr, e))
                report_exception()
        return False

    for dst in self.blacklisted_fqdn:
        if is_subdomain(value, dst):
            logging.debug("{} matches blacklisted fqdn {}".format(value, dst))
            return True

    return False
def is_in_cache_db(self, value, cache_path):
    """Returns True if the URL is in the CRITS indicator cache.
       value is the result of calling process_url on a URL."""
    assert isinstance(value, ParseResult)

    with sqlite3.connect('file:{}?mode=ro'.format(cache_path), uri=True) as db:
        db_cursor = db.cursor()
        row = None

        # check ipv4
        if is_ipv4(value.hostname):
            db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = ?",
                              (CRITS_IPV4, value.hostname))
            row = db_cursor.fetchone()
            if row:
                logging.debug("{} matched ipv4 indicator {}".format(value.hostname, row[0]))
                return True
        else:
            # check fqdn
            for partial_fqdn in iterate_fqdn_parts(value.hostname):
                #logging.debug("checking crits for {}".format(partial_fqdn))
                db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = ?",
                                  (CRITS_FQDN, partial_fqdn.lower()))
                row = db_cursor.fetchone()
                if row:
                    logging.debug("{} matched fqdn indicator {}".format(partial_fqdn, row[0]))
                    return True

        # check full url
        db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                          (CRITS_URL, value.geturl()))
        row = db_cursor.fetchone()
        if row:
            logging.debug("{} matched url indicator {}".format(value.geturl(), row[0]))
            return True

        # check url path
        path = urlunparse(('', '', value.path, value.params, value.query, value.fragment))
        if path:
            db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                              (CRITS_URL_PATH, path))
            row = db_cursor.fetchone()
            if row:
                logging.debug("{} matched url_path indicator {}".format(value.path, row[0]))
                return True

        # check url file name
        if value.path:
            if not value.path.endswith('/'):
                file_name = value.path.split('/')[-1]
                db_cursor.execute("SELECT id FROM indicators WHERE type = ? AND value = LOWER(?)",
                                  (CRITS_FILE_NAME, file_name))
                row = db_cursor.fetchone()
                if row:
                    logging.debug("{} matched file_name indicator {}".format(file_name, row[0]))
                    return True

    return False
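# Rough sketch of the indicator cache schema assumed by the queries in
# is_in_cache_db() above (the real table is created elsewhere; only the id, type and
# value columns are implied by this code):
#
#   CREATE TABLE indicators (
#       id    INTEGER PRIMARY KEY,
#       type  TEXT,  -- one of CRITS_IPV4, CRITS_FQDN, CRITS_URL, CRITS_URL_PATH, CRITS_FILE_NAME
#       value TEXT
#   );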
def execute_analysis(self, url):
    if not self._initialized:
        # used to decide what URLs to actually crawl
        self.url_filter = CrawlphishURLFilter()
        # a whitelist of sites we'll always crawl
        self.watch_file(self.url_filter.whitelist_path, self.url_filter.load_whitelist)
        self.watch_file(self.url_filter.blacklist_path, self.url_filter.load_blacklist)
        self.watch_file(self.url_filter.regex_path, self.url_filter.load_path_regexes)
        self._initialized = True

    analysis = self.create_analysis(url)
    # are we able to download it?
    analysis.downloaded = False
    # if not, why?
    #analysis.error_reason = None

    # is this URL crawlable?
    filter_result = self.url_filter.filter(url.value)
    analysis.filtered_status = filter_result.filtered
    analysis.filtered_status_reason = filter_result.reason

    if analysis.filtered_status:
        logging.debug("{} is not crawlable: {}".format(url.value, analysis.filtered_status_reason))
        return False

    parsed_url = filter_result.parsed_url
    if parsed_url is None:
        logging.debug("unable to parse url {}".format(url.value))
        return False

    formatted_url = urlunparse(parsed_url)

    # update brocess if we're configured to do so
    if self.update_brocess and parsed_url.hostname and not is_ipv4(parsed_url.hostname):
        logging.debug("updating brocess with crawlphish request for {}".format(parsed_url.hostname))
        add_httplog(parsed_url.hostname)

    # what proxies are we going to use to attempt to download the url?
    # these are attempted in the order specified in the configuration setting
    proxy_configs = []
    for name in self.proxies.split(','):
        if name == 'GLOBAL':
            proxy_configs.append((name, saq.PROXIES))
        else:
            proxy_configs.append((name, saq.OTHER_PROXIES[name]))

    proxy_result = None

    for index, proxy_config in enumerate(proxy_configs):
        proxy_name, proxy_config = proxy_config
        proxy_result = CloudphishProxyResult()
        proxy_result.proxy_name = proxy_name
        analysis.proxies.append(proxy_name)
        analysis.proxy_results[proxy_name] = proxy_result

        session = requests.Session()
        session.proxies = proxy_config

        try:
            logging.info("requesting url {} via {}".format(formatted_url, proxy_name))
            response = session.request('GET', formatted_url,
                                       headers=self.headers,
                                       timeout=self.timeout,
                                       allow_redirects=True,
                                       verify=False,
                                       stream=True)

            proxy_result.status_code = response.status_code
            proxy_result.status_code_reason = response.reason
            logging.info("url request result {} ({}) for {}".format(
                         response.status_code, response.reason, formatted_url))

            for header in response.headers.keys():
                proxy_result.headers[header] = response.headers[header]

            for part in response.history:
                proxy_result.history.append(part.url)

            # did we get an error code?
            if math.floor(response.status_code / 100) in [4, 5]:
                proxy_result.error_reason = '({}) {}'.format(
                    proxy_result.status_code, proxy_result.status_code_reason)
                continue

            # all is well -- break out and download the content
            break

        except requests.Timeout as e:
            proxy_result.error_reason = "request timed out"
            continue
        except Exception as e:
            proxy_result.error_reason = str(e)
            #report_exception()
            continue

        # we should never get here
        logging.error("executed invalid branch?")
        break

    # did we successfully start a download?
    if proxy_result.error_reason is not None:
        logging.info("unable to download {}: {}".format(formatted_url, proxy_result.error_reason))
        return True

    path_components = [x for x in parsed_url.path.split('/') if x.strip()]

    # need to figure out what to call it
    file_name = None
    # content-disposition header is the official way
    if 'content-disposition' in response.headers:
        file_name = response.headers['content-disposition']
        # we could potentially see this here: attachment; filename="blah..."
        # or the rfc 5987 form: attachment; filename*=utf-8''blah...
        content_file_match = re.search(r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
                                       response.headers['content-disposition'])
        if content_file_match:
            file_name = content_file_match.group('real_filename')

            # handle rfc5987 which allows utf-8 encoding and url-encoding
            if file_name.lower().startswith("utf-8''"):
                file_name = file_name[7:]
                file_name = urllib.parse.unquote(file_name)

    # otherwise we use the last element of the path
    if not file_name and parsed_url.path and not parsed_url.path.endswith('/'):
        file_name = path_components[-1]

    # default if we can't figure it out
    if not file_name:
        file_name = 'unknown.crawlphish'

    # truncate if too long
    if len(file_name) > self.max_file_name_length:
        file_name = file_name[len(file_name) - self.max_file_name_length:]

    # replace invalid filesystem characters
    file_name = secure_filename(file_name)

    # make the crawlphish dir
    dest_dir = os.path.join(self.root.storage_dir, 'crawlphish')
    try:
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
    except Exception as e:
        logging.error("unable to create directory {}: {}".format(dest_dir, e))

    file_path = os.path.join(dest_dir, file_name)

    # prevent file path collision
    if os.path.isfile(file_path):
        duplicate_count = 1
        file_path = os.path.join(dest_dir, "{}_{}".format(duplicate_count, file_name))
        while os.path.isfile(file_path):
            duplicate_count = duplicate_count + 1
            file_path = os.path.join(dest_dir, "{}_{}".format(duplicate_count, file_name))

    # download the results up to the limit
    try:
        bytes_downloaded = 0
        with open(file_path, 'wb') as fp:
            for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                bytes_downloaded += len(chunk)
                fp.write(chunk)

                if bytes_downloaded >= self.max_download_size:
                    logging.debug("exceeded max download size for {}".format(url))
                    response.close()

        logging.debug("downloaded {} bytes for {}".format(bytes_downloaded, file_path))

    except Exception as e:
        analysis.downloaded = False
        proxy_result.error_reason = "data transfer interrupted: {}".format(e)
        logging.debug("url {} transfer failed: {}".format(url, e))
        return True

    # record all the details of the transaction
    analysis.downloaded = True
    analysis.file_name = file_name
    analysis.requested_url = formatted_url
    analysis.final_url = response.url

    # if the final url is different than the original url, record that url as an observable
    final_url = None
    if analysis.final_url and analysis.final_url != url.value:
        final_url = analysis.add_observable(F_URL, analysis.final_url, o_time=url.time)
        if final_url:
            final_url.add_tag('redirection_target')
            final_url.add_relationship(R_REDIRECTED_FROM, url)

    #if len(response.history) > 1:
        #url.add_tag('redirection')

    # and add the file for processing
    download = analysis.add_observable(F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
    if download:
        download.add_relationship(R_DOWNLOADED_FROM, final_url if final_url else url)
        # only extract if non-error http response
        if response.status_code >= 200 and response.status_code <= 299:
            download.add_directive(DIRECTIVE_EXTRACT_URLS)

    return True
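# Illustrative walkthrough of the file naming logic above (hypothetical inputs, not
# from the module): a header of
#   Content-Disposition: attachment; filename="quarterly report.doc"
# yields "quarterly report.doc" from the regex; the rfc 5987 form
#   Content-Disposition: attachment; filename*=utf-8''quarterly%20report.doc
# yields the same name after the utf-8'' prefix is stripped and the value is
# url-unquoted. With no usable header, a path such as /docs/invoice.doc falls back
# to "invoice.doc", and a bare "/" path ends up as "unknown.crawlphish"; in every
# case secure_filename() then sanitizes the result.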
def execute_analysis(self, url):
    from saq.modules.cloudphish import CloudphishAnalysis
    from saq.cloudphish import SCAN_RESULT_ERROR, SCAN_RESULT_PASS

    cloudphish_analysis = self.wait_for_analysis(url, CloudphishAnalysis)
    if not cloudphish_analysis:
        return False

    # is this a URL to an IP address to a single file in the root directory?
    # example: http://220.218.70.160/sec.hta
    try:
        parsed_url = urlparse(url.value)
    except Exception as e:
        logging.debug("unable to parse url {}: {}".format(url.value, e))
        return False

    # define what is considered suspicious to find in the root dir
    def _susp_file(path):
        for ext in ['doc', 'docx', 'docm', 'xls', 'xlsx', 'xlsm', 'ppt', 'pptx', 'pptm',
                    'pdf', 'js', 'vbs', 'jse', 'exe', 'swf', 'jar', 'lnk', 'ps1', 'rtf',
                    'chm', 'bat', 'scr', 'hta', 'cab', 'pif', 'au3', 'a3x', 'eps', 'xla',
                    'pps', 'dot', 'dotm', 'pub', 'wsf', 'cmd', 'ps', 'vbe', 'wsc']:
            if path.lower().endswith('.{}'.format(ext)):
                return True
        return False

    analysis = self.create_analysis(url)

    if parsed_url.hostname and parsed_url.path:
        if is_ipv4(parsed_url.hostname) and SINGLE_FILE_REGEX.match(parsed_url.path):
            # ignore a link to a URL in the local network (common for companies to do locally)
            if not any([parsed_url.hostname in cidr for cidr in saq.MANAGED_NETWORKS]):
                # and then the file extension must end in something suspicious
                if _susp_file(parsed_url.path):
                    analysis.details = True
                    url.add_detection_point("URL to ipv4 to suspicious file in root directory")
                    url.add_directive(DIRECTIVE_FORCE_DOWNLOAD)

    # is the URL to an actual internet host?
    if parsed_url.hostname and '.' in parsed_url.hostname:
        # did this URL come from a stream file from an office document?
        stream_file = search_down(url, lambda x: isinstance(x, Observable)
                                                 and x.type == F_FILE
                                                 and '.officeparser/stream' in x.value)
        if stream_file:
            # what did cloudphish think of this url?
            if cloudphish_analysis.analysis_result not in [SCAN_RESULT_ERROR, SCAN_RESULT_PASS]:
                #analysis.details = True
                #url.add_detection_point("uncommon URL in ole stream file")
                url.add_directive(DIRECTIVE_FORCE_DOWNLOAD)

    return True