# Standard-library and third-party imports used below. Project-internal names
# (saq, CrawlphishURLFilter, CrawlphishAnalysisV2, CloudphishProxyResult,
# AnalysisModule, add_httplog, is_ipv4, secure_filename and the F_*/R_*/DIRECTIVE_*
# constants) are assumed to be imported from the surrounding package.
import io
import logging
import math
import os
import re
import urllib.parse
from urllib.parse import urlunparse

import requests


def initialize_url_filter():
    global url_filter

    # initialize the crawlphish url filter
    url_filter = CrawlphishURLFilter()

    # TODO schedule tasks to reload lists
    url_filter.load()

    logging.debug("url filter loaded")
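# Usage sketch (assumption, not part of the original module): initialize_url_filter()
# populates the module-level url_filter global once at startup, and a caller such as
# the hypothetical check_url() below can then reuse the shared filter for each URL.
def check_url(value):
    # return (filtered, reason) for a single URL using the shared filter
    result = url_filter.filter(value)
    return result.filtered, result.reason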
def execute_analysis(self, url):
    if not self._initialized:
        # used to decide what URLs to actually crawl
        self.url_filter = CrawlphishURLFilter()
        # a whitelist of sites we'll always crawl
        self.watch_file(self.url_filter.whitelist_path, self.url_filter.load_whitelist)
        self.watch_file(self.url_filter.blacklist_path, self.url_filter.load_blacklist)
        self.watch_file(self.url_filter.regex_path, self.url_filter.load_path_regexes)
        self._initialized = True

    analysis = self.create_analysis(url)
    # are we able to download it?
    analysis.downloaded = False
    # if not, why?
    #analysis.error_reason = None

    # is this URL crawlable?
    filter_result = self.url_filter.filter(url.value)
    analysis.filtered_status = filter_result.filtered
    analysis.filtered_status_reason = filter_result.reason

    if analysis.filtered_status:
        logging.debug("{} is not crawlable: {}".format(url.value, analysis.filtered_status_reason))
        return False

    parsed_url = filter_result.parsed_url
    if parsed_url is None:
        logging.debug("unable to parse url {}".format(url.value))
        return False

    formatted_url = urlunparse(parsed_url)

    # update brocess if we're configured to do so
    if self.update_brocess and parsed_url.hostname and not is_ipv4(parsed_url.hostname):
        logging.debug("updating brocess with crawlphish request for {}".format(parsed_url.hostname))
        add_httplog(parsed_url.hostname)

    # what proxies are we going to use to attempt to download the url?
    # these are attempted in the order specified in the configuration setting
    proxy_configs = []
    for name in self.proxies.split(','):
        if name == 'GLOBAL':
            proxy_configs.append((name, saq.PROXIES))
        else:
            proxy_configs.append((name, saq.OTHER_PROXIES[name]))

    proxy_result = None

    for index, proxy_config in enumerate(proxy_configs):
        proxy_name, proxy_config = proxy_config

        proxy_result = CloudphishProxyResult()
        proxy_result.proxy_name = proxy_name
        analysis.proxies.append(proxy_name)
        analysis.proxy_results[proxy_name] = proxy_result

        session = requests.Session()
        session.proxies = proxy_config

        try:
            logging.info("requesting url {} via {}".format(formatted_url, proxy_name))
            response = session.request('GET', formatted_url,
                                       headers=self.headers,
                                       timeout=self.timeout,
                                       allow_redirects=True,
                                       verify=False,
                                       stream=True)

            proxy_result.status_code = response.status_code
            proxy_result.status_code_reason = response.reason
            logging.info("url request result {} ({}) for {}".format(
                response.status_code, response.reason, formatted_url))

            for header in response.headers.keys():
                proxy_result.headers[header] = response.headers[header]

            for part in response.history:
                proxy_result.history.append(part.url)

            # did we get an error code?
            if math.floor(response.status_code / 100) in [4, 5]:
                proxy_result.error_reason = '({}) {}'.format(proxy_result.status_code,
                                                             proxy_result.status_code_reason)
                continue

            # all is well -- break out and download the content
            break

        except requests.Timeout as e:
            proxy_result.error_reason = "request timed out"
            continue
        except Exception as e:
            proxy_result.error_reason = str(e)
            #report_exception()
            continue

        # we should never get here
        logging.error("executed invalid branch?")
        break

    # did we successfully start a download?
    if proxy_result.error_reason is not None:
        logging.info("unable to download {}: {}".format(formatted_url, proxy_result.error_reason))
        return True

    path_components = [x for x in parsed_url.path.split('/') if x.strip()]

    # need to figure out what to call it
    file_name = None
    # content-disposition header is the official way
    if 'content-disposition' in response.headers:
        file_name = response.headers['content-disposition']
        # we could potentially see something like this here: attachment; filename="blah..."
        content_file_match = re.search(r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?',
                                       response.headers['content-disposition'])
        if content_file_match:
            file_name = content_file_match.group('real_filename')

            # handle rfc5987 which allows utf-8 encoding and url-encoding
            if file_name.lower().startswith("utf-8''"):
                file_name = file_name[7:]
                file_name = urllib.parse.unquote(file_name)

    # otherwise we use the last element of the path
    if not file_name and parsed_url.path and not parsed_url.path.endswith('/'):
        file_name = path_components[-1]

    # default if we can't figure it out
    if not file_name:
        file_name = 'unknown.crawlphish'

    # truncate if too long
    if len(file_name) > self.max_file_name_length:
        file_name = file_name[len(file_name) - self.max_file_name_length:]

    # replace invalid filesystem characters
    file_name = secure_filename(file_name)

    # make the crawlphish dir
    dest_dir = os.path.join(self.root.storage_dir, 'crawlphish')
    try:
        if not os.path.isdir(dest_dir):
            os.makedirs(dest_dir)
    except Exception as e:
        logging.error("unable to create directory {}: {}".format(dest_dir, e))

    file_path = os.path.join(dest_dir, file_name)

    # prevent file path collision
    if os.path.isfile(file_path):
        duplicate_count = 1
        file_path = os.path.join(dest_dir, "{}_{}".format(duplicate_count, file_name))
        while os.path.isfile(file_path):
            duplicate_count = duplicate_count + 1
            file_path = os.path.join(dest_dir, "{}_{}".format(duplicate_count, file_name))

    # download the results up to the limit
    try:
        bytes_downloaded = 0
        with open(file_path, 'wb') as fp:
            for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                bytes_downloaded += len(chunk)
                fp.write(chunk)

                if bytes_downloaded >= self.max_download_size:
                    logging.debug("exceeded max download size for {}".format(url))
                    response.close()

        logging.debug("downloaded {} bytes for {}".format(bytes_downloaded, file_path))

    except Exception as e:
        analysis.downloaded = False
        proxy_result.error_reason = "data transfer interrupted: {}".format(e)
        logging.debug("url {} transfer failed: {}".format(url, e))
        return True

    # record all the details of the transaction
    analysis.downloaded = True
    analysis.file_name = file_name
    analysis.requested_url = formatted_url
    analysis.final_url = response.url

    # if the final url is different than the original url, record that url as an observable
    final_url = None
    if analysis.final_url and analysis.final_url != url.value:
        final_url = analysis.add_observable(F_URL, analysis.final_url, o_time=url.time)
        if final_url:
            final_url.add_tag('redirection_target')
            final_url.add_relationship(R_REDIRECTED_FROM, url)

    #if len(response.history) > 1:
    #    url.add_tag('redirection')

    # and add the file for processing
    download = analysis.add_observable(F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
    if download:
        download.add_relationship(R_DOWNLOADED_FROM, final_url if final_url else url)
        # only extract if non-error http response
        if response.status_code >= 200 and response.status_code <= 299:
            download.add_directive(DIRECTIVE_EXTRACT_URLS)

    return True
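# Illustrative sketch (assumption, not part of the original module): shows how the
# content-disposition parsing above behaves once the pattern matches both the plain
# filename= form and the rfc5987 filename*= form. The sample header values are made
# up for demonstration only.
def _demo_content_disposition_parsing():
    samples = [
        'attachment; filename="report.pdf"',
        "attachment; filename*=utf-8''na%C3%AFve%20name.txt",
    ]

    names = []
    for value in samples:
        match = re.search(r'attachment; filename\*?="?(?P<real_filename>[^"]+)"?', value)
        if match:
            file_name = match.group('real_filename')
            # rfc5987 form: strip the utf-8'' prefix and url-decode the remainder
            if file_name.lower().startswith("utf-8''"):
                file_name = urllib.parse.unquote(file_name[7:])
            names.append(file_name)

    return names  # ['report.pdf', 'naïve name.txt']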
class CrawlphishAnalyzer(AnalysisModule):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.headers = {'User-Agent': self.config['user-agent']}
        self._initialized = False

    def verify_environment(self):
        self.verify_config_exists('whitelist_path')
        self.verify_path_exists(self.config['whitelist_path'])
        self.verify_config_exists('regex_path')
        self.verify_path_exists(self.config['regex_path'])
        self.verify_config_exists('blacklist_path')
        self.verify_path_exists(self.config['blacklist_path'])
        self.verify_config_exists('uncommon_network_threshold')
        self.verify_config_exists('user-agent')
        self.verify_config_exists('timeout')
        self.verify_config_exists('max_download_size')
        self.verify_config_exists('cooldown_period')
        self.verify_config_exists('update_brocess')
        self.verify_config_exists('proxies')

        for name in self.config['proxies'].split(','):
            if name == 'GLOBAL':
                continue

            if 'proxy_{}'.format(name) not in saq.CONFIG:
                logging.critical("invalid proxy name {} in crawlphish config".format(name))

    @property
    def whitelist_path(self):
        return self.url_filter.whitelist_path

    @property
    def regex_path(self):
        return self.url_filter.regex_path

    @property
    def blacklist_path(self):
        return self.url_filter.blacklist_path

    @property
    def uncommon_network_threshold(self):
        """How many connections decide that a given fqdn or ip address is an "uncommon network"?"""
        return self.config.getint('uncommon_network_threshold')

    @property
    def user_agent(self):
        return self.config['user-agent']

    @property
    def timeout(self):
        """How long to wait for an HTTP request to time out (in seconds)."""
        return self.config.getint('timeout')

    @property
    def max_download_size(self):
        """Maximum download size in bytes (configured in MB)."""
        return self.config.getint('max_download_size') * 1024 * 1024

    @property
    def update_brocess(self):
        """Are we updating brocess when we make a request?"""
        return self.config.getboolean('update_brocess')

    @property
    def proxies(self):
        """The list of proxies we'll use to download URLs, attempted in order."""
        return self.config['proxies']

    @property
    def generated_analysis_type(self):
        return CrawlphishAnalysisV2

    @property
    def valid_observable_types(self):
        return F_URL

    @property
    def required_directives(self):
        return [DIRECTIVE_CRAWL]

    def execute_analysis(self, url):
        if not self._initialized:
            # used to decide what URLs to actually crawl
            self.url_filter = CrawlphishURLFilter()
            # a whitelist of sites we'll always crawl
            self.watch_file(self.url_filter.whitelist_path, self.url_filter.load_whitelist)
            self.watch_file(self.url_filter.blacklist_path, self.url_filter.load_blacklist)
            self.watch_file(self.url_filter.regex_path, self.url_filter.load_path_regexes)
            self._initialized = True

        analysis = self.create_analysis(url)
        # are we able to download it?
        analysis.downloaded = False
        # if not, why?
        #analysis.error_reason = None

        # is this URL crawlable?
        filter_result = self.url_filter.filter(url.value)
        analysis.filtered_status = filter_result.filtered
        analysis.filtered_status_reason = filter_result.reason

        if analysis.filtered_status:
            logging.debug("{} is not crawlable: {}".format(url.value, analysis.filtered_status_reason))
            return False

        parsed_url = filter_result.parsed_url
        if parsed_url is None:
            logging.debug("unable to parse url {}".format(url.value))
            return False

        formatted_url = urlunparse(parsed_url)

        # update brocess if we're configured to do so
        if self.update_brocess and parsed_url.hostname and not is_ipv4(parsed_url.hostname):
            logging.debug("updating brocess with crawlphish request for {}".format(parsed_url.hostname))
            add_httplog(parsed_url.hostname)

        # what proxies are we going to use to attempt to download the url?
        # these are attempted in the order specified in the configuration setting
        proxy_configs = []
        for name in self.proxies.split(','):
            if name == 'GLOBAL':
                proxy_configs.append((name, saq.PROXIES))
            else:
                proxy_configs.append((name, saq.OTHER_PROXIES[name]))

        proxy_result = None

        for index, proxy_config in enumerate(proxy_configs):
            proxy_name, proxy_config = proxy_config

            proxy_result = CloudphishProxyResult()
            proxy_result.proxy_name = proxy_name
            analysis.proxies.append(proxy_name)
            analysis.proxy_results[proxy_name] = proxy_result

            session = requests.Session()
            session.proxies = proxy_config

            try:
                logging.info("requesting url {} via {}".format(formatted_url, proxy_name))
                response = session.request('GET', formatted_url,
                                           headers=self.headers,
                                           timeout=self.timeout,
                                           allow_redirects=True,
                                           verify=False,
                                           stream=True)

                proxy_result.status_code = response.status_code
                proxy_result.status_code_reason = response.reason
                logging.info("url request result {} ({}) for {}".format(
                    response.status_code, response.reason, formatted_url))

                for header in response.headers.keys():
                    proxy_result.headers[header] = response.headers[header]

                for part in response.history:
                    proxy_result.history.append(part.url)

                # did we get an error code?
                if math.floor(response.status_code / 100) in [4, 5]:
                    proxy_result.error_reason = '({}) {}'.format(proxy_result.status_code,
                                                                 proxy_result.status_code_reason)
                    continue

                # all is well -- break out and download the content
                break

            except requests.Timeout as e:
                proxy_result.error_reason = "request timed out"
                continue
            except Exception as e:
                proxy_result.error_reason = str(e)
                #report_exception()
                continue

            # we should never get here
            logging.error("executed invalid branch?")
            break

        # did we successfully start a download?
        if proxy_result.error_reason is not None:
            logging.info("unable to download {}: {}".format(formatted_url, proxy_result.error_reason))
            return True

        # for each url we download we use a file name inside a directory with the following format:
        #   parsed_url.hostname_N
        # where N is an integer that starts at 0 and increments each time
        # we store the next number to use in the state for this module
        if not self.state:
            self.state = {}
            self.state['next_unknown'] = 0  # next integer to use for unknown.crawlphish files
            self.state['host_counts'] = {}  # key = parsed_url.hostname, value = next integer to use

        path_components = [x for x in parsed_url.path.split('/') if x.strip()]

        # need to figure out what to call it
        file_name = None
        # content-disposition header is the official way
        if 'content-disposition' in response.headers:
            file_name = response.headers['content-disposition']
            # we could potentially see something like this here: attachment; filename="blah..."
            content_file_match = re.search('attachment; filename="?(?P<real_filename>[^"]+)"?',
                                           response.headers['content-disposition'])
            if content_file_match:
                file_name = content_file_match.group('real_filename')
                # replace any / or .. with _ and collapse repeated _
                file_name = re.sub(r'_+', '_', re.sub(r'\.\.', '_', re.sub(r'/', '_', file_name)))

        # otherwise we use the last element of the path
        if not file_name and parsed_url.path and not parsed_url.path.endswith('/'):
            file_name = path_components[-1]

        # default if we can't figure it out
        if not file_name:
            file_name = 'unknown_{}.crawlphish'.format(self.state['next_unknown'])
            self.state['next_unknown'] += 1

        hostname = parsed_url.hostname
        if not hostname:
            hostname = 'unknown_host'

        if hostname not in self.state['host_counts']:
            self.state['host_counts'][hostname] = 0

        dest_dir = os.path.join(self.root.storage_dir, 'crawlphish',
                                '{}_{}'.format(hostname, self.state['host_counts'][hostname]))
        self.state['host_counts'][hostname] += 1

        try:
            if not os.path.isdir(dest_dir):
                os.makedirs(dest_dir)
        except Exception as e:
            logging.error("unable to create directory {}: {}".format(dest_dir, e))

        file_path = os.path.join(dest_dir, file_name)

        # download the results up to the limit
        try:
            bytes_downloaded = 0
            with open(file_path, 'wb') as fp:
                for chunk in response.iter_content(io.DEFAULT_BUFFER_SIZE):
                    bytes_downloaded += len(chunk)
                    fp.write(chunk)

                    if bytes_downloaded >= self.max_download_size:
                        logging.debug("exceeded max download size for {}".format(url))
                        response.close()

            logging.debug("downloaded {} bytes for {}".format(bytes_downloaded, file_path))

        except Exception as e:
            analysis.downloaded = False
            proxy_result.error_reason = "data transfer interrupted: {}".format(e)
            logging.debug("url {} transfer failed: {}".format(url, e))
            return True

        # record all the details of the transaction
        analysis.downloaded = True
        analysis.file_name = file_name
        analysis.requested_url = formatted_url
        analysis.final_url = response.url

        # if the final url is different than the original url, record that url as an observable
        final_url = None
        if analysis.final_url and analysis.final_url != url.value:
            final_url = analysis.add_observable(F_URL, analysis.final_url, o_time=url.time)
            if final_url:
                final_url.add_tag('redirection_target')
                final_url.add_relationship(R_REDIRECTED_FROM, url)

        #if len(response.history) > 1:
        #    url.add_tag('redirection')

        # and add the file for processing
        download = analysis.add_observable(F_FILE, os.path.relpath(file_path, start=self.root.storage_dir))
        if download:
            download.add_relationship(R_DOWNLOADED_FROM, final_url if final_url else url)
            # only extract if non-error http response
            if response.status_code >= 200 and response.status_code <= 299:
                download.add_directive(DIRECTIVE_EXTRACT_URLS)

        return True
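# Illustrative sketch (assumption, not part of the original module): the kind of
# configuration section verify_environment() above expects. The section name and
# values below are hypothetical placeholders, not the project's real defaults;
# only the key names mirror what the analyzer checks.
def _demo_crawlphish_config():
    import configparser

    config = configparser.ConfigParser()
    config.read_string("""
[analysis_module_crawlphish]
whitelist_path = etc/crawlphish.whitelist
blacklist_path = etc/crawlphish.blacklist
regex_path = etc/crawlphish.path_regexes
uncommon_network_threshold = 10
user-agent = Mozilla/5.0 (compatible; crawlphish)
timeout = 30
max_download_size = 10
cooldown_period = 60
update_brocess = yes
proxies = GLOBAL
""")

    section = config['analysis_module_crawlphish']
    # max_download_size is configured in MB; the analyzer converts it to bytes
    return section.getint('max_download_size') * 1024 * 1024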