def __init__(self, db_name, db_engine, flush_domain_owners):
    """
    Open the global database connection, cache frequently used
    counts, and preload domain-owner data into memory.
    """
    # resolve the driver class for the requested engine, then connect
    if db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        driver_class = SQLiteDriver
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        driver_class = PostgreSQLDriver
    else:
        print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
        quit()
    self.sql_driver = driver_class(db_name)

    # these totals get reused frequently, cache them once here to
    # minimize db calls
    self.total_pages = self.sql_driver.get_complex_page_count()
    self.total_crawls = self.sql_driver.get_crawl_count()

    # utilities needs the same database info
    self.utilities = Utilities(db_name, db_engine)

    # mapping of domains to their owner records
    self.domain_owners = self.utilities.get_domain_owner_dict()

    # optionally refresh the stored domain-owner data
    if flush_domain_owners:
        self.patch_domain_owners()

    # load crawl-to-3p-domain info to memory for faster processing,
    # make sure you have enough RAM!
    self.get_crawl_id_to_3p_domain_info()
def __init__(self):
    """
    Set up our server configuration here.

    Note we store config details in server_config.json because
    __init__ is run each time a worker processes a request; this
    means we can modify our config on the fly without having to
    restart the server.
    """
    # connect to server config db to get client_config
    self.server_sql_driver = PostgreSQLDriver('server_config')

    # the important parts of the config currently are the whitelist
    # of allowed ips and the mapping of clients to their databases
    self.whitelisted_ips = []
    self.client_id_to_db = {}

    for client in self.server_sql_driver.get_client_configs():
        # only live clients are considered
        if not client['live']:
            continue

        # only register clients whose mapped database actually exists
        if self.server_sql_driver.check_db_exist(client['mapped_db']):
            self.whitelisted_ips.append(client['client_ip'])
            self.client_id_to_db[client['client_id']] = client['mapped_db']
        else:
            print(
                f"Database {client['mapped_db']} for client {client['client_id']} does not exist"
            )
def __init__(self, db_name, db_engine):
    """
    Store global settings, set up helper objects, and open the
    database connection for the given engine.
    """
    # basic state and helpers
    self.db_name = db_name
    self.utilities = Utilities()
    self.url_parser = ParseURL()
    self.debug = False

    # resolve and connect the database driver
    if db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        driver_class = SQLiteDriver
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        driver_class = PostgreSQLDriver
    else:
        print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
        quit()
    self.sql_driver = driver_class(self.db_name)

    # global config lives in the database
    self.config = self.sql_driver.get_config()
def __init__(self, db_name=None, db_engine=None, client_id=None):
    """
    This class can be called to run store_results_from_queue which
        connects to the server_config database to fetch results, in
        which case a global db_name isn't needed, so we have
        db_name=None to account for that.  However, if we *do* have
        a db_name we set up a global config.
    """
    # keep constructor args as instance state
    self.db_name = db_name
    self.db_engine = db_engine
    self.client_id = client_id
    self.debug = True
    self.utilities = Utilities()

    # no db_name means no global config to load
    if not db_name:
        return

    # set up a short-lived database connection to read the config
    if self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
        quit()

    self.config = sql_driver.get_config()

    # browser_config is the subset of config entries whose key
    # mentions 'client'
    self.browser_config = {
        key: value
        for key, value in self.config.items() if 'client' in key
    }

    sql_driver.close()
def __init__(self, db_name=None, db_engine=None):
    """
    Optionally open a database connection: with a db_name we connect
    to that database, with only a db_engine we connect without a
    name, and with neither we skip the connection entirely.
    """
    # only set up a connection when we were given db params
    if db_name or db_engine:
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name) if db_name else SQLiteDriver()
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name) if db_name else PostgreSQLDriver()
        else:
            print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

    # url parsing helper is always available
    self.url_parser = ParseURL()
def build_policy_task_queue(self, flush_policy_task_queue=True, timeseries_interval=10080):
    """
    Takes a given list of pages and puts them into a queue
        to be scanned either by the same machine building the queue,
        or remote machines.

    Args:
        flush_policy_task_queue: when True, drop any existing
            'get_policy' tasks before queueing new ones.
        timeseries_interval: retained for interface compatibility;
            currently unused in this method.
    """
    # set up new db connection
    if self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        # bug fix: this previously referenced the bare name `db_engine`,
        # which is undefined in this scope and raised a NameError
        # instead of reporting the bad engine
        print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
        quit()

    # get rid of whatever is in there already
    if flush_policy_task_queue:
        sql_driver.flush_task_queue(task='get_policy')

    # get set of all policies we have already scanned; a set gives
    # O(1) membership tests in the loop below
    scanned_policies = set()
    for policy_url, in sql_driver.get_scanned_policy_urls():
        scanned_policies.add(policy_url)

    # run the query and add to list
    for policy_url, in sql_driver.get_policies_to_collect():
        # if page has an anchor, we drop everything after
        # (truthiness guard avoids an IndexError on an empty string)
        if policy_url and policy_url[-1] == '#':
            policy_url = policy_url[:-1]
        elif '#' in policy_url:
            policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

        # skip invalid links
        if not self.utilities.is_url_valid(policy_url):
            continue

        # already did it, skip
        if policy_url in scanned_policies:
            continue

        sql_driver.add_task_to_queue(policy_url, 'get_policy')

    # fyi
    print('\t%s pages in task_queue for get_policy' %
          sql_driver.get_task_queue_length(task='get_policy'))

    # we no longer need this db connection
    sql_driver.close()
class Reporter:
    """
    Manages the production of a number of CSV reports.
    """

    def __init__(self, db_name, db_engine, num_tlds, num_results,
                 tracker_threshold=None, flush_domain_owners=True,
                 start_date=False, end_date=False):
        """
        This performs a few start-up tasks:
            - sets up some useful global variables
            - makes sure we have a directory to store the reports
            - flushes the existing domain_owner mappings (this can be disabled)
            - if we want to do per-tld reports, figures out the most common
            - if we want to filter against a given tracker threshold,
                sets it up here (see documentation below for tracker threshold)
        """

        # set various global vars
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # creates a new directory if it doesn't exist already
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        # them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                if tld:
                    print('\t\t |- %s' % tld)
        else:
            # a single None entry means "no tld filter" in the report loops
            self.top_tlds = [None]
    # __init__

    #####################
    # REPORT GENERATORS #
    #####################

    def generate_db_summary_report(self, print_to_cli=True):
        """
        outputs and stores report of basic data about how many records in db, etc.
        """
        print('\t================')
        print('\t General Summary')
        print('\t================')

        # get the relevant db summary data
        db_summary = self.analyzer.get_db_summary()

        # print to cli
        if print_to_cli:
            print("\t\tTotal Crawls:\t\t\t%s" % db_summary['total_crawls_ok'])
            print("\t\tTotal Pages:\t\t\t%s" % db_summary['total_pages_ok'])
            print("\t\tTotal Tasks Fail:\t\t%s" % db_summary['total_tasks_fail'])
            print("\t\tTotal Tasks Attempted:\t\t%s" % db_summary['total_tasks_attempted'])
            print("\t\t%% Pages OK:\t\t\t%.2f%%" % db_summary['percent_tasks_ok'])
            print("\t\tTotal Errors:\t\t\t%s" % db_summary['total_errors'])
            print("\t\tTotal Cookies:\t\t\t%s" % db_summary['total_cookies'])
            print("\t\tTotal 3P Cookies:\t\t%s" % db_summary['total_3p_cookies'])
            print("\t\tTotal Dom Storage:\t\t%s" % db_summary['total_dom_storage'])
            print("\t\tTotal Websockets:\t\t%s" % db_summary['total_websockets'])
            print("\t\tTotal Websocket Events:\t\t%s" % db_summary['total_websocket_events'])
            print("\t\tTotal Requests:\t\t\t%s" % db_summary['total_requests'])
            print("\t\tTotal Responses:\t\t%s" % db_summary['total_responses'])
            print('\t\t%% Requests Received:\t\t%.2f%%' % db_summary['percent_requests_received'])
            print("\t\t3P Requests:\t\t\t%s" % db_summary['total_3p_requests'])
            print("\t\t3P Responses:\t\t\t%s" % db_summary['total_3p_responses'])
            print('\t\t%% 3P Requests Received:\t\t%.2f%%' % db_summary['percent_3p_requests_received'])
            print('\t\t' + '-' * 40)

        # write results to csv
        csv_rows = []
        csv_rows.append(('total_crawls_ok', db_summary['total_crawls_ok']))
        csv_rows.append(('total_pages_ok', db_summary['total_pages_ok']))
        csv_rows.append(('total_tasks_fail', db_summary['total_tasks_fail']))
        csv_rows.append(('total_tasks_attempted', db_summary['total_tasks_attempted']))
        csv_rows.append(('percent_pages_ok', db_summary['percent_tasks_ok']))
        csv_rows.append(('total_errors', db_summary['total_errors']))
        csv_rows.append(('total_cookies', db_summary['total_cookies']))
        csv_rows.append(('total_3p_cookies', db_summary['total_3p_cookies']))
        csv_rows.append(('total_dom_storage', db_summary['total_dom_storage']))
        csv_rows.append(('total_websockets', db_summary['total_websockets']))
        csv_rows.append(('total_websocket_events', db_summary['total_websocket_events']))
        csv_rows.append(('total_requests', db_summary['total_requests']))
        csv_rows.append(('total_responses', db_summary['total_responses']))
        csv_rows.append(('percent_requests_received', db_summary['percent_requests_received']))
        csv_rows.append(('total_3p_requests', db_summary['total_3p_requests']))
        csv_rows.append(('total_3p_responses', db_summary['total_3p_responses']))
        csv_rows.append(('percent_3p_requests_received', db_summary['percent_3p_requests_received']))
        self.utilities.write_csv(self.report_path, 'db_summary.csv', csv_rows)
    # generate_db_summary_report

    def generate_stats_report(self):
        """
        High level stats
        """
        print('\t=============================')
        print('\t Processing High-Level Stats ')
        print('\t=============================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            if tld_filter:
                stats = self.analyzer.get_high_level_stats(tld_filter)
            else:
                stats = self.analyzer.get_high_level_stats()

            # NOTE(review): filter_depth is computed but never used below —
            # kept as-is pending confirmation it isn't needed elsewhere
            if self.tracker_threshold:
                filter_depth = self.tracker_threshold
            else:
                filter_depth = 'no_filter_used'

            csv_rows.append(('n_pages', stats['total_pages']))
            csv_rows.append(('n_crawls', stats['total_crawls']))
            csv_rows.append(('%_pages_ssl', stats['percent_pages_ssl']))
            csv_rows.append(('n_requests_received', stats['total_requests_received']))
            csv_rows.append(('%_requests_received_ssl', stats['percent_requests_ssl']))
            csv_rows.append(('n_1p_requests_received', stats['total_requests_received_1p']))
            csv_rows.append(('%_1p_requests_received_ssl', stats['percent_1p_requests_ssl']))
            csv_rows.append(('n_3p_requests_received', stats['total_requests_received_3p']))
            csv_rows.append(('%_3p_requests_received_ssl', stats['percent_3p_requests_ssl']))
            csv_rows.append(('average_page_load_time', stats['average_page_load_time']))
            csv_rows.append(('%_w/3p_request', stats['percent_w_3p_request']))
            csv_rows.append(('%_w/3p_cookie', stats['percent_w_3p_cookie']))
            csv_rows.append(('%_w/3p_script', stats['percent_w_3p_script']))
            csv_rows.append(('mean_3p_domains', stats['3p_domains_mean']))
            csv_rows.append(('median_3p_domains', stats['3p_domains_median']))
            csv_rows.append(('mode_3p_domains', stats['3p_domains_mode']))
            csv_rows.append(('mean_3p_cookies', stats['3p_cookies_mean']))
            csv_rows.append(('median_3p_cookies', stats['3p_cookies_median']))
            csv_rows.append(('mode_3p_cookies', stats['3p_cookies_mode']))

            if tld_filter:
                self.utilities.write_csv(self.report_path, tld_filter + '-stats.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, 'stats.csv', csv_rows)
    # generate_stats_report

    def generate_aggregated_tracking_attribution_report(self):
        """
        generates ranked list of which entities collect data
            from the greatest number of crawls
            ('aggregated_tracking_attribution.csv')

        - entities which have subsidiaries are ranked according
            to the crawls their subsidiaries get data from as well
        - however, parent entities only get one hit on
            a crawl which has multiple subsidiaries present
        - for example, if a crawl has 'google analytics' and 'doubleclick'
            that is only one hit for 'google'
        """
        print('\t======================================')
        print('\t Processing Aggregated Tracking Report ')
        print('\t======================================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            # write out data to csv
            for item in self.analyzer.get_aggregated_tracking_attribution(tld_filter):
                csv_rows.append((
                    item['percent_crawls'],
                    item['owner_name'],
                    item['owner_country'],
                    self.utilities.get_domain_owner_lineage_combined_string(item['owner_id'])
                ))

            # we want to first sort by owner name and then by percentage
            # to account for cases where two owners have the same percentage value
            csv_rows.sort(key=lambda x: x[1].lower())
            csv_rows.sort(key=lambda x: x[0], reverse=True)

            # insert header row after sort
            csv_rows.insert(0, ('percentage_crawls_tracked', 'owner', 'owner_country', 'owner_lineage'))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(self.report_path, tld_filter + '-aggregated_tracking_attribution.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, 'aggregated_tracking_attribution.csv', csv_rows)
    # generate_aggregated_tracking_attribution_report

    def generate_aggregated_3p_ssl_use_report(self):
        """
        this report tells us the percentage of requests made to a given
            third-party are encrypted
        """
        print('\t=========================================')
        print('\t Processing Aggregated 3P SSL Use Report ')
        print('\t=========================================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            for item in self.analyzer.get_aggregated_3p_ssl_use(tld_filter):
                csv_rows.append((
                    item['ssl_use'],
                    item['owner_name'],
                    item['owner_country'],
                    self.utilities.get_domain_owner_lineage_combined_string(item['owner_id'])
                ))

            # we want to first sort by owner name and then by percentage
            # to account for cases where two owners have the same percentage value
            csv_rows.sort(key=lambda x: x[1].lower())
            csv_rows.sort(key=lambda x: x[0], reverse=True)

            # insert header row after sort
            csv_rows.insert(0, ('percent_requests_encrypted', 'owner', 'owner_country', 'owner_lineage'))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(self.report_path, tld_filter + '-3p_ssl_use.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_ssl_use.csv', csv_rows)
    # generate_aggregated_3p_ssl_use_report

    def generate_3p_domain_report(self):
        """
        This report tells us the most commonly occuring third-party domains.
        """
        print('\t==============================')
        print('\t Processing 3P Domains Report ')
        print('\t==============================')

        for tld_filter in self.top_tlds:
            csv_rows = []
            csv_rows.append(('percent_total', 'domain', 'owner', 'owner_country', 'owner_lineage'))

            # get_3p_domain_percentages returns a list, we slice it to get only desired num_results
            for item in self.analyzer.get_3p_domain_percentages(tld_filter)[:self.num_results]:
                # figure out the lineage string if we know who owns the domain
                if item['owner_id'] != None:
                    lineage_string = self.utilities.get_domain_owner_lineage_combined_string(item['owner_id'])
                else:
                    lineage_string = None

                csv_rows.append((
                    item['percent_crawls'],
                    item['domain'],
                    item['owner_name'],
                    item['owner_country'],
                    lineage_string
                ))

            if tld_filter:
                self.utilities.write_csv(self.report_path, tld_filter + '-3p_domains.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_domains.csv', csv_rows)
    # generate_3p_domain_report

    def generate_3p_request_report(self, request_type=None):
        """
        this queries the db to get all requests, domains, or domain owners
        next they are counted to find the most common
        and formatted to csv rows and returned
        """
        if request_type == 'script':
            print('\t=============================')
            print('\t Processing 3P Script Report ')
            print('\t=============================')
        else:
            print('\t==============================')
            print('\t Processing 3P Request Report ')
            print('\t==============================')

        for tld_filter in self.top_tlds:
            csv_rows = []
            csv_rows.append(('percent_total', 'request', 'type', 'domain', 'owner', 'owner_country', 'owner_lineage'))

            # get_3p_domain_percentages returns a list, we slice it to get only desired num_results
            for item in self.analyzer.get_3p_request_percentages(tld_filter, request_type)[:self.num_results]:
                # figure out the lineage string if we know who owns the domain
                if item['request_owner_id'] != None:
                    lineage_string = self.utilities.get_domain_owner_lineage_combined_string(item['request_owner_id'])
                else:
                    lineage_string = None

                csv_rows.append((
                    item['percent_crawls'],
                    item['request_url'],
                    item['request_type'],
                    item['request_domain'],
                    item['request_owner_name'],
                    item['request_owner_country'],
                    lineage_string
                ))

            if tld_filter:
                if request_type:
                    self.utilities.write_csv(self.report_path, tld_filter + '-3p_' + request_type + '.csv', csv_rows)
                else:
                    self.utilities.write_csv(self.report_path, tld_filter + '-3p_request.csv', csv_rows)
            else:
                if request_type:
                    self.utilities.write_csv(self.report_path, '3p_' + request_type + '.csv', csv_rows)
                else:
                    self.utilities.write_csv(self.report_path, '3p_request.csv', csv_rows)
    # generate_3p_request_report

    def generate_data_transfer_report(self):
        """
        These reports tell us how much data was transferred
        across several dimensions
        """
        print('\t==================================')
        print('\t Processing Data Transfer Reports ')
        print('\t==================================')

        for tld_filter in self.top_tlds:
            # set up filter and file names
            if tld_filter:
                summary_file_name = tld_filter + '-data_xfer_summary.csv'
                domain_file_name = tld_filter + '-data_xfer_by_domain.csv'
                aggregated_file_name = tld_filter + '-data_xfer_aggregated.csv'
            else:
                summary_file_name = 'data_xfer_summary.csv'
                domain_file_name = 'data_xfer_by_domain.csv'
                aggregated_file_name = 'data_xfer_aggregated.csv'

            # get the data from db, tuple of (response_domain, size, is_3p (boolean), domain_owner_id)
            response_sizes = self.sql_driver.get_response_sizes()

            # initialize vars
            first_party_data = 0
            third_party_data = 0
            total_data = 0

            # need Counter object, allows sorting later
            domain_data = collections.Counter()
            owner_data = collections.Counter()

            # process each row
            for item in response_sizes:
                response_domain = item[0]
                response_size = item[1]
                response_is_3p = item[2]
                domain_owner_id = item[3]

                # this is the measure of all data downloaded
                total_data += response_size

                # measures for third and first party data
                if response_is_3p:
                    third_party_data += response_size
                else:
                    first_party_data += response_size

                # data by domain, increment if already in there, otherwise new entry
                if response_domain in domain_data:
                    domain_data[response_domain] += response_size
                else:
                    domain_data[response_domain] = response_size

                # only if we know the owner, increment
                if domain_owner_id:
                    for lineage_id in self.utilities.get_domain_owner_lineage_ids(domain_owner_id):
                        if lineage_id in owner_data:
                            owner_data[lineage_id] += response_size
                        else:
                            owner_data[lineage_id] = response_size

            # avoid divide-by-zero
            if total_data == 0:
                print('\t\tTotal data is zero, no report')
                return

            # output data to csv
            summary_data_csv = []
            summary_data_csv.append(('party', 'percent_total', 'data_transfered_bytes'))
            summary_data_csv.append(('all', '100', total_data))
            summary_data_csv.append((
                'First',
                round((first_party_data / total_data) * 100, self.num_decimals),
                first_party_data
            ))
            summary_data_csv.append((
                'Third',
                round((third_party_data / total_data) * 100, self.num_decimals),
                third_party_data
            ))
            self.utilities.write_csv(self.report_path, summary_file_name, summary_data_csv)

            # sort and output ranked data
            domain_data = domain_data.most_common()
            domain_data.sort()
            domain_data.sort(reverse=True, key=lambda item: item[1])

            # for csv data
            domain_data_csv = []
            domain_data_csv.append(('percent_total', 'domain', 'data_transfered_bytes'))

            # if num_results is None we get everything, otherwise stops at limit
            for item in domain_data[:self.num_results]:
                domain_data_csv.append((
                    round((item[1] / total_data) * 100, self.num_decimals),
                    item[0],
                    item[1]
                ))
            self.utilities.write_csv(self.report_path, domain_file_name, domain_data_csv)

            owner_data = self.utilities.get_most_common_sorted(owner_data)
            owner_data_csv = []
            owner_data_csv.append(('percent_total', 'owner', 'owner_country', 'owner_lineage', 'data_transfered_bytes'))

            # get results for all known owners
            for item in owner_data:
                owner_data_csv.append((
                    round((item[1] / total_data) * 100, self.num_decimals),
                    self.domain_owners[item[0]]['owner_name'],
                    self.domain_owners[item[0]]['country'],
                    self.utilities.get_domain_owner_lineage_combined_string(item[0]),
                    item[1]
                ))
            self.utilities.write_csv(self.report_path, aggregated_file_name, owner_data_csv)
    # generate_data_transfer_report

    def generate_use_report(self):
        """
        This function handles the process of generating a csv report which
            details what percentage of pages use third-party content for
            specific uses, the number of requests made for a given type
            of use on a per-page basis, and the percentage of such requests
            which correspond to a third-party cookie.
        """
        print('\t==========================')
        print('\t Processing 3P Use Report ')
        print('\t==========================')

        for tld_filter in self.top_tlds:
            use_data = self.analyzer.get_3p_use_data(tld_filter)
            all_uses = use_data['all_uses']
            percentage_by_use = use_data['percentage_by_use']
            average_use_occurance_per_page = use_data['average_use_occurance_per_crawl']
            percentage_use_w_cookie = use_data['percentage_use_w_cookie']
            percentage_use_ssl = use_data['percentage_use_ssl']

            csv_rows = []
            csv_rows.append((
                'use_category',
                'percent_crawls_w_use',
                'ave_occurances_per_page',
                'percentage_of_use_w_cookie',
                'percentage_of_use_ssl'
            ))
            for use in sorted(all_uses):
                if percentage_by_use[use] != None:
                    csv_rows.append((
                        use,
                        percentage_by_use[use],
                        average_use_occurance_per_page[use],
                        percentage_use_w_cookie[use],
                        percentage_use_ssl[use]
                    ))
                else:
                    csv_rows.append((use, None, None, None, None))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(self.report_path, tld_filter + '-3p_uses.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_uses.csv', csv_rows)
    # generate_use_report

    def generate_per_page_network_report(self):
        """
        this report generates data necessary for graph/network analysis by
            outputting a list of page domains and the requests/owners they
            connect to on a per-page basis
        """
        print('\t====================================')
        print('\t Processing Per-Page Network Report ')
        print('\t====================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append((
            'page_start_url', 'page_final_url', 'page_accessed',
            '3p_request_domain', '3p_domain_owner', '3p_domain_owner_country'
        ))

        # process all records
        for item in self.analyzer.get_page_to_3p_network():
            csv_rows.append((
                item['page_start_url'],
                item['page_final_url'],
                item['page_accessed'],
                item['request_domain'],
                item['request_owner_name'],
                item['request_owner_country']
            ))

        self.utilities.write_csv(self.report_path, 'per_page_network_report.csv', csv_rows)
    # generate_per_page_network_report

    def generate_per_site_network_report(self):
        """
        this report generates data necessary for graph/network analysis by
            outputting a list of page domains and the requests/owners they
            connect to aggregated on a per-site basis (eg combining all pages)
        """
        print('\t================================')
        print('\t Processing Site Network Report ')
        print('\t================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('page_domain', '3p_request_domain', '3p_domain_owner', '3p_domain_owner_country'))

        for item in self.analyzer.get_site_to_3p_network():
            csv_rows.append((
                item['page_domain'],
                item['request_domain'],
                item['request_owner_name'],
                item['request_owner_country']
            ))

        self.utilities.write_csv(self.report_path, 'per_site_network_report.csv', csv_rows)
    # generate_per_site_network_report

    def generate_all_pages_request_dump(self):
        """
        Full dump of all requests loaded by all pages across all load times.
        Default is 3p only, can be overridden.
        """
        print('\t===================================')
        print('\t Processing All Pages request Dump ')
        print('\t===================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('accessed', 'start_url', 'final_url', 'request_url', 'request_domain', 'domain_owner'))

        # process all records
        for item in self.analyzer.get_all_pages_requests():
            csv_rows.append((
                item['accessed'],
                item['start_url'],
                item['final_url'],
                item['request_url'],
                item['request_domain'],
                item['request_domain_owner']
            ))

        self.utilities.write_csv(self.report_path, 'all_pages_request_dump.csv', csv_rows)
    # generate_all_pages_request_dump

    def generate_all_pages_cookie_dump(self):
        """
        Full dump of all cookies loaded by all pages across all load times.
        Default is 1p and 3p, can be overridden to 3p only.
        """
        print('\t==================================')
        print('\t Processing All Pages Cookie Dump ')
        print('\t==================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('accessed', 'start_url', 'final_url', 'cookie_domain', 'cookie_owner', 'cookie_name', 'cookie_value'))

        # process all records
        for item in self.analyzer.get_all_pages_cookies():
            csv_rows.append((
                item['accessed'],
                item['start_url'],
                item['final_url'],
                item['cookie_domain'],
                item['cookie_owner'],
                item['cookie_name'],
                item['cookie_value']
            ))

        self.utilities.write_csv(self.report_path, 'all_pages_cookie_dump.csv', csv_rows)
    # generate_all_pages_cookie_dump

    def generate_site_host_report(self):
        """
        First, we update the domain table with the owners of the
            various ip addresses which gives us a mapping of pages
            to hosts.  Second, we generate a network report for
            site domains to hosts.
        """
        print('\t=====================')
        print('\t Updating Site Hosts ')
        print('\t=====================')

        self.analyzer.update_site_hosts()

        print('\t==============================')
        print('\t Generating Site Host Network ')
        print('\t==============================')

        site_host_data = self.analyzer.get_site_host_network()

        if len(site_host_data) == 0:
            print('\t\tNo site host data, skipping report.')
            return

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('page_domain', 'host_name'))

        for item in site_host_data:
            csv_rows.append((item['site_domain'], item['host_name']))

        self.utilities.write_csv(self.report_path, 'site_hosts-network.csv', csv_rows)

        print('\t============================================')
        print('\t Generating Aggregate Host Ownership Report ')
        print('\t============================================')

        # NOTE(review): here a plain list is handed to
        # get_most_common_sorted whereas elsewhere a Counter is used —
        # presumably the helper accepts both; verify in Utilities
        owner_occurances = []
        for owner, in self.sql_driver.get_ip_owners():
            owner_occurances.append(owner)

        csv_rows = [('owner', 'percent_sites_w_owner')]
        for item in self.utilities.get_most_common_sorted(owner_occurances):
            csv_rows.append((item[0], 100 * (item[1] / len(owner_occurances))))

        self.utilities.write_csv(self.report_path, 'site_hosts-aggregated.csv', csv_rows)
    # generate_site_host_report

    ##############
    # POLICYXRAY #
    ##############

    def initialize_policy_reports(self):
        """
        Run various pre-production steps.
        """
        print('\t====================================')
        print('\t Updating 3p Domain Disclosure Data ')
        print('\t====================================')

        #self.analyzer.update_request_disclosure()
        self.analyzer.update_crawl_disclosure()
        print('\t\t...done!')

        print('\t======================================')
        print('\t Getting Policy Types List and Counts ')
        print('\t======================================')

        # pre-populate with 'None' which gives all policies
        self.policy_types = [{
            'type': None,
            'count': self.analyzer.get_policy_count()
        }]

        for policy_type, in self.sql_driver.get_available_policy_types():
            self.policy_types.append({
                'type': policy_type,
                'count': self.analyzer.get_policy_count(policy_type=policy_type)
            })
        print('\t\t...done!')
    # initialize_policy_reports

    def generate_policy_summary_report(self):
        """
        Conducts prelminary analysis steps, determines what
            types of policies we have, and then initiates the
            pertinent reports.
        """
        print('\t==================================')
        print('\t Generating Policy Summary Report ')
        print('\t==================================')

        # header row
        csv_rows = [('Type', 'N', 'Word Count', 'FK Grade', 'FRE', '% 3P Disclosed')]

        # get results for each policy_type
        for policy_type in self.policy_types:
            # makes reports clearer than 'None'
            if policy_type['type'] == None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type, end='', flush=True)

            # fetch results
            readability_scores = self.analyzer.get_readability_scores(policy_type=policy_type['type'])

            csv_rows.append((
                this_policy_type,
                policy_type['count'],
                self.analyzer.get_average_policy_word_count(policy_type=policy_type['type']),
                readability_scores['ave_fkg'],
                readability_scores['ave_fre'],
                self.analyzer.get_percent_crawl_3p_domains_disclosed(policy_type=policy_type['type'])
            ))

            print('done!')

        self.utilities.write_csv(self.report_path, 'policy-summary.csv', csv_rows)
    # generate_policy_summary_report

    def generate_policy_owner_disclosure_reports(self):
        """
        Determines what types of policies we have, and then
            initiates the pertinent reports.
        """
        print('\t======================================')
        print('\t Generating Company Disclosure Report ')
        print('\t======================================')

        # bug fix: a dead header row ('Type', 'N', '%% 3P Disclosed') was
        # previously assigned here and immediately overwritten below

        print('\t\tProcessing ...', end='', flush=True)

        company_results = self.analyzer.get_disclosure_by_request_owner()

        csv_rows = [('Domain Owner', 'Total Occurances', 'Total Disclosures', 'Percent Disclosed')]
        for item in company_results:
            csv_rows.append((
                item,
                company_results[item][0],
                company_results[item][1],
                round(company_results[item][2], 2)
            ))
        print('done!')

        self.utilities.write_csv(self.report_path, 'policy-owner_disclosure.csv', csv_rows)
    # generate_policy_owner_disclosure_reports

    def generate_policy_gdpr_report(self):
        """
        Determine percentage of all policy types
            that contain gdpr article 9 terms.
        """
        print('\t==============================')
        print('\t Generating GDPR Term Report ')
        print('\t==============================')

        term_list = [
            'racial or ethnic origin', 'political opinions',
            'religious or philosophical beliefs', 'trade union membership',
            'genetic data', 'biometric data', 'data concerning health',
            'sex life', 'sexual orientation'
        ]

        self.generate_terms_report('policy-gdpr_terms.csv', term_list)
    # generate_policy_gdpr_report

    def generate_policy_pacification_report(self):
        """
        Determine percentage of all policy types
            that contain pacification terms.
        """
        print('\t=====================================')
        print('\t Generating Pacification Term Report ')
        print('\t=====================================')

        term_list = [
            'we value', 'we respect', 'important to us', 'help you',
            'we care', 'committed to protecting', 'cares about',
            'transparency'
        ]

        self.generate_terms_report('policy-pacification_terms.csv', term_list)
    # generate_policy_pacification_report

    def generate_policy_pii_report(self):
        """
        Determine percentage of all policy types
            that contain pii terms.
        """
        print('\t============================')
        print('\t Generating PII Term Report ')
        print('\t============================')

        term_list = [
            'ip address', 'internet protocol address', 'browser type',
            'operating system'
        ]

        self.generate_terms_report('policy-pii_terms.csv', term_list)
    # generate_policy_pii_report

    def generate_terms_report(self, report_name, term_list):
        """
        Generic function to generate reports on how often
            terms appear in policies.
        """
        # set up header row
        csv_rows = []
        header_row = ('Type', 'any term')
        for term in term_list:
            header_row = header_row + (term, )
        csv_rows.append(header_row)

        # get results for each policy_type
        for policy_type in self.policy_types:
            # makes reports clearer than 'None'
            if policy_type['type'] == None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type, end='', flush=True)

            this_csv_row = (this_policy_type, )

            # first column is the percentage of policies matching ANY term
            this_csv_row = this_csv_row + (self.analyzer.get_terms_percentage(
                term_list,
                policy_type=policy_type['type'],
                policy_type_count=policy_type['count']), )

            # then one column per individual term
            for term in term_list:
                this_csv_row = this_csv_row + (self.analyzer.get_terms_percentage(
                    [term],
                    policy_type=policy_type['type'],
                    policy_type_count=policy_type['count']), )

            csv_rows.append(this_csv_row)
            print('done!')

        self.utilities.write_csv(self.report_path, report_name, csv_rows)
    # generate_terms_report
def build_crawl_task_queue(self, params):
    """
    Enter crawl tasks to the database after performing checks to
    verify urls are valid.

    Args:
        params: dict with 'crawl_file_name' (JSON file of url lists in
                ../crawl_lists/) and 'flush_crawl_task_queue' (bool)
    """
    # these vars are specific to this function
    crawl_file_name = params['crawl_file_name']
    flush_crawl_task_queue = params['flush_crawl_task_queue']

    # only need this sql_driver to build the task list
    sql_driver = PostgreSQLDriver(self.db_name)

    # open list of pages
    # BUG FIX: was a bare `except:` which also swallowed
    # SystemExit/KeyboardInterrupt; OSError covers open() failures and
    # ValueError covers json.JSONDecodeError.
    try:
        crawl_list = json.load(
            open(os.path.dirname(os.path.abspath(__file__)) +
                 '/../crawl_lists/' + crawl_file_name,
                 'r',
                 encoding='utf-8'))
    except (OSError, ValueError):
        print(
            f'Could not open {crawl_file_name}, is it correctly formatted and present in the ./crawl_lists directory? Exiting.'
        )
        sql_driver.close()
        exit()

    # get rid of whatever is in there already
    if flush_crawl_task_queue:
        sql_driver.flush_task_queue(task='get_crawl')

    for count, url_list in enumerate(crawl_list):
        # first make sure the urls are valid, if we encounter a
        # non-valid url we trash the entire list
        url_list_valid = True

        # we keep our fixed urls here
        idna_url_list = []

        # look at each url
        for url in url_list:
            if self.utilities.is_url_valid(url) == False:
                print(
                    f'{url} is not valid from {url_list}, not entering crawl to queue'
                )
                url_list_valid = False
                break

            # perform idna fix
            idna_url_list.append(self.utilities.idna_encode_url(url))

        # we need to put the continue here for the outer loop
        if url_list_valid == False:
            continue

        # if we are allowing time series we see if page has been scanned
        # in the specified interval, otherwise if we are *not* allowing a
        # time series we skip anything already in the db
        # NOTE(review): `url` below is the last url examined in the inner
        # loop, used only as a short label for the CLI message.
        if self.config['timeseries_enabled']:
            if sql_driver.crawl_exists(json.dumps(idna_url_list),
                                       timeseries_interval=self.
                                       config['timeseries_interval']):
                print(f'\t{count} | {url[:30]}... Scanned too recently.')
                continue
        else:
            if sql_driver.crawl_exists(json.dumps(idna_url_list)):
                print(f'\t{count} | {url[:30]}... Exists in DB, skipping.')
                continue

        # we have a valid list, queue it up!
        # (a redundant `if url_list_valid:` guard was removed here — it
        # is always True after the `continue` above)
        sql_driver.add_task_to_queue(json.dumps(idna_url_list), 'get_crawl')
        print(f'\t{count} | {str(idna_url_list)[:30]}... Adding to queue.')

    # done
    sql_driver.close()
# build_crawl_task_queue
def build_scan_task_queue(self, params):
    """
    Takes a given list of pages and puts them into a queue to be
    scanned either by the same machine building the queue, or remote
    machines.

    Args:
        params: dict with 'pages_file_name' (file in ../page_lists/),
                'flush_scan_task_queue' (bool), and 'task' (queue name)
    """
    # these vars are specific to this function
    pages_file_name = params['pages_file_name']
    flush_scan_task_queue = params['flush_scan_task_queue']
    task = params['task']

    # set up sql connection used to determine if items are already in the db
    if self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        # BUG FIX: was `% db_engine` — no such local exists in this
        # method, so this error path itself raised a NameError.
        print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
        quit()

    # open list of pages
    # BUG FIX: narrowed bare `except:` to OSError (open() failures).
    try:
        url_list = open(os.path.dirname(os.path.abspath(__file__)) +
                        '/../page_lists/' + pages_file_name,
                        'r',
                        encoding='utf-8')
    except OSError:
        print(
            'File "%s" does not exist, file must be in ./page_lists directory. Exiting.'
            % pages_file_name)
        sql_driver.close()
        exit()

    # get list of pages already scanned
    already_scanned = []
    print('\tFetching list of pages already scanned...')
    if self.config['timeseries_enabled']:
        for url, in sql_driver.get_all_pages_exist(
                timeseries_interval=self.config['timeseries_interval']):
            already_scanned.append(url)
    else:
        for url, in sql_driver.get_all_pages_exist():
            already_scanned.append(url)
    print(f'\t => {len(already_scanned)} pages already scanned')

    # get rid of whatever is in there already
    if flush_scan_task_queue:
        sql_driver.flush_task_queue(task=task)

    # simple counter used solely for updates to CLI
    count = 0

    print('\t---------------------')
    print('\t Building Page Queue ')
    print('\t---------------------')
    for url in url_list:
        # skip lines that are comments (first char '#')
        # NOTE(review): assumes no empty lines; an empty string would
        # raise IndexError on url[0] — confirm page_list format.
        if "#" in url[0]:
            continue

        count += 1

        # make sure url is valid
        if self.utilities.is_url_valid(url) == False:
            print(f'\t\t{count} | {url} is invalid')
            continue

        # perform idna fix
        url = self.utilities.idna_encode_url(url)

        # if we are allowing time series we see if page has been scanned
        # in the specified interval, otherwise if we are *not* allowing a
        # time series we skip anything already in the db
        if url in already_scanned and self.config['timeseries_enabled']:
            print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
            continue
        elif url in already_scanned:
            print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
            continue

        # add to the queue, duplicates will be ignored
        sql_driver.add_task_to_queue(url, task)
        print(f'\t\t{count} | {url[:30]}... Adding to queue.')

    # close the db connection
    sql_driver.close()
# build_scan_task_queue
def store_result(self, params):
    """
    Handles storing task_result and removing jobs from the task_queue.

    Args:
        params: dict with 'target', 'task', 'task_result', 'client_id',
                optional 'client_ip', and optional 'db_name' (server mode)

    Returns:
        dict with 'success' (bool) and, on failure, 'result' (message)
    """
    # unpack params
    target = params['target']
    task = params['task']
    task_result = params['task_result']
    client_id = params['client_id']

    # client_ip is optional
    if 'client_ip' in params:
        client_ip = params['client_ip']
    else:
        client_ip = None

    # if db_name is specified we are running in server mode and we
    # connect to the db which corresponds to the result being
    # processed. otherwise, we use the global db_name as we are
    # running in non-server mode.
    # BUG FIX: both invalid-engine branches referenced a non-existent
    # local `db_engine` (NameError); they now use self.db_engine.
    if 'db_name' in params:
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(params['db_name'])
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(params['db_name'])
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()
        output_store = OutputStore(params['db_name'], self.db_engine)
    else:
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()
        output_store = OutputStore(self.db_name, self.db_engine)

    if task == 'get_policy':
        # renamed from `store_result` to avoid shadowing this method's name
        store_status = output_store.store_policy(task_result,
                                                 client_id,
                                                 client_ip=client_ip)

        # we never retry policies
        sql_driver.remove_task_from_queue(target, task)

        if store_status['success']:
            result = {'success': True}
        else:
            # log error
            sql_driver.log_error({
                'client_id': client_id,
                'task': task,
                'target': target,
                'msg': 'output_store fail on ' + store_status['result']
            })
            result = {'success': False, 'result': store_status['result']}
    # elif task == 'get_crawl' or task == 'get_random_crawl':
    else:
        all_crawls_ok = True

        # We want to be able to re-run random crawls, and to do so we make
        # sure the crawl_id will match
        if task == 'get_crawl' or task == 'get_scan':
            crawl_id = target
        elif task == 'get_random_crawl':
            crawl_id = []
            # loop var renamed from `result` — it shadowed the return value
            for page_result in task_result:
                crawl_id.append(page_result['start_url'])
            crawl_id = json.dumps(crawl_id)

        # tweak to account for differences between scans/crawls
        if task == 'get_scan':
            task_result = [task_result]

        # keep track of domains
        all_3p_cookie_domains = set()
        all_3p_dom_storage_domains = set()
        all_3p_request_domains = set()
        all_3p_response_domains = set()
        all_3p_websocket_domains = set()

        # When we store a crawl we add optional fields in the page table
        # that allow us to connect the page loads into a single crawl.
        # the crawl_id is a hash of the target (which is a json string
        # derived from the url_list), and the crawl_timestamp which is the
        # first accessed time from the crawl.
        for crawl_sequence, page_load in enumerate(task_result):
            store_status = output_store.store_scan({
                'browser_output': page_load,
                'client_id': client_id,
                'crawl_id': crawl_id,
                'crawl_timestamp': task_result[0]['accessed'],
                'crawl_sequence': crawl_sequence,
                'client_ip': client_ip
            })

            if store_status['success'] != True:
                all_crawls_ok = False
            else:
                # we are successful, create entries in page_lookup table
                # NOTE(review): 'cookies' is fed page_3p_dom_storage_domains,
                # not a cookie-domain set — looks like a copy-paste bug, but
                # preserved here; confirm whether store_scan returns a
                # 'page_3p_cookie_domains' key.
                page_lookup_table = self.build_lookup_table(
                    'page', store_status['page_id'], {
                        'requests': store_status['page_3p_request_domains'],
                        'responses': store_status['page_3p_response_domains'],
                        'websockets': store_status['page_3p_websocket_domains'],
                        'dom_storage': store_status['page_3p_dom_storage_domains'],
                        'cookies': store_status['page_3p_dom_storage_domains']
                    })

                for lookup_item in page_lookup_table:
                    sql_driver.add_page_id_domain_lookup_item(
                        page_lookup_table[lookup_item])

                # we are also making a lookup table for the crawl, keep
                # joining the sets as we go along
                all_3p_request_domains.update(
                    store_status['page_3p_request_domains'])
                all_3p_response_domains.update(
                    store_status['page_3p_response_domains'])
                all_3p_websocket_domains.update(
                    store_status['page_3p_websocket_domains'])
                all_3p_dom_storage_domains.update(
                    store_status['page_3p_dom_storage_domains'])
                all_3p_cookie_domains.update(
                    store_status['page_3p_dom_storage_domains'])

        if all_crawls_ok:
            sql_driver.remove_task_from_queue(target, task)
            result = {'success': True}

            # build crawl lookup table
            crawl_lookup_table = self.build_lookup_table(
                'crawl', crawl_id, {
                    'requests': all_3p_request_domains,
                    'responses': all_3p_response_domains,
                    'websockets': all_3p_websocket_domains,
                    'dom_storage': all_3p_dom_storage_domains,
                    'cookies': all_3p_cookie_domains
                })

            # patch lookup table
            for lookup_item in crawl_lookup_table:
                sql_driver.add_crawl_id_domain_lookup_item(
                    crawl_lookup_table[lookup_item])
        else:
            # leave task available for retry
            sql_driver.unlock_task_in_queue(target, task)

            # log error
            sql_driver.log_error({
                'client_id': client_id,
                'task': task,
                'target': target,
                'msg': 'output_store fail to store all scans for crawl_id_target ' + target
            })
            result = {
                'success': False,
                'result': 'unable to store all crawl loads'
            }

    # tidy up
    output_store.close()
    sql_driver.close()

    # done
    return result
# store_result
def get_client_task(self, client_ip, client_id):
    """
    We determine what the client should be doing when it sends us a
    'READY' message.  If we find a task in our queue we send it back,
    otherwise we send 'WAIT' and the client will contact us again.

    Args:
        client_ip: unused here; kept for interface compatibility
        client_id: key into self.client_id_to_db

    Returns:
        dict with 'task' and, for real tasks, 'target' and 'client_config'
    """
    # connect to appropriate db for this client, if none found
    # return wait command
    if client_id in self.client_id_to_db:
        sql_driver = PostgreSQLDriver(self.client_id_to_db[client_id])
    else:
        print(
            'client_id not in client_id_to_db list, returning wait command'
        )
        return {'task': 'wait'}

    # BUG FIX: the original placed sql_driver.close() *after* the return
    # statements, so it never ran and the connection leaked on every
    # request; try/finally guarantees the connection is closed.
    try:
        # get config for this db
        config = sql_driver.get_config()

        # get client config
        client_config = {}
        for item in config:
            if 'client' in item:
                client_config[item] = config[item]

        # if we have items in task_queue we send them back, otherwise
        # we send a wait command
        if sql_driver.get_task_queue_length(
                max_attempts=config['max_attempts'],
                unlocked_only=True) != 0:
            # if this fails we wait
            # (narrowed from bare `except:` so Ctrl-C still works)
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=config['max_attempts'],
                    client_id=client_id)
            except Exception:
                print('β Returning command to wait.')
                return {'task': 'wait'}

            if task == 'get_scan':
                print(f'π Returning command to scan {target}')
                return {
                    'task': 'get_scan',
                    'target': target,
                    'client_config': client_config
                }
            elif task == 'get_crawl':
                print(f'π Returning command to crawl {target[:30]}...')
                return {
                    'task': 'get_crawl',
                    'target': json.loads(target),
                    'client_config': client_config
                }
            elif task == 'get_policy':
                print(f'π Returning command to get_policy {target}')
                return {
                    'task': 'get_policy',
                    'target': target,
                    'client_config': client_config
                }
            elif task == 'get_random_crawl':
                print(f'π Returning command to get_random_crawl {target}')
                return {
                    'task': 'get_random_crawl',
                    'target': target,
                    'client_config': client_config
                }
            # unknown task type falls through and returns None,
            # matching the original behavior
        else:
            print('β Returning command to wait.')
            return {'task': 'wait'}
    finally:
        sql_driver.close()
# get_client_task
class Analyzer:
    """
    webXray stores data in a relational db, but that isn't human-readable
    so what this class does is analyze the data and exports it to csv
    files that can be opened in other programs (e.g. excel)

    Most of the reports may also be run on the top tlds (off by default),
    so you will be able to see if there are variations between tlds
    ('org' and 'com' usually differ quite a bit)

    See the readme for details on all of the available reports.
    """

    def __init__(self,
                 db_engine,
                 db_name,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_owner_db=True):
        """
        This performs a few start-up tasks:
            - sets up some useful global variables
            - makes sure we have a directory to store the reports
            - flushes the existing domain_owner mappings (this can be
              disabled via flush_owner_db)
            - if we want to do per-tld reports, figures out the most common
            - if we want to filter against a given tracker threshold,
              sets it up here (see documentation below for tracker
              threshold)
        """

        # set various global vars
        self.db_engine = db_engine
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.top_tlds = []
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.start_time = datetime.now()

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            self.sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(self.db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            exit()

        # this is reused often, do it once to save time
        # NOTE: despite the name this holds the *count* (an int),
        # not a callable
        self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        self.setup_report_dir()

        print('\t============================')
        print('\t Patching Domain Owner Data ')
        print('\t============================')

        if flush_owner_db:
            # update the domains to their owners in the db, can be
            # overridden by changing flush_owner_db to false
            self.patch_domain_owners()
        else:
            print('\t\t\tSkipping')

        # this is used in various places to get owner information
        self.domain_owners = self.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        # them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for (tld, pages) in self.top_tlds:
                if tld:
                    print('\t\t |- %s (%s)' % (tld, pages))
        else:
            # otherwise we push in a single empty entry
            self.top_tlds.append((None, self.get_pages_ok_count))

        # SPECIAL FEATURE FOR EXPERTS: tracker domain filter
        #
        # you can set a threshold of the number of sites a given 3p domain
        # is connected to - domains connecting to many sites may correlate
        # those visits so we call these 'tracker domains'
        #
        # the 'tracker_threshold' variable set above controls the
        # filtering level
        #
        # on large set of sites (e.g. >10k) this works well but on small
        # samples (e.g. <500) it doesn't work as well as known tracker
        # domains may only appear on a single site
        #
        # this is off by default and unless you understand what you are
        # doing don't use this...but because you are reading the source
        # code for an otherwise undocumented feature you are probably
        # competent to use it ;-)
        #
        # longer-term we may want to train off a bigger corpus to find
        # tracker domains and have them prepackaged
        #
        # use at your own risk!
        if tracker_threshold:
            print('\t===================================================')
            print('\t Getting tracker domains with threshold level of %s'
                  % self.tracker_threshold)
            print('\t===================================================')
            print('\t\tProcessing...', end='', flush=True)
            self.tracker_domains = self.get_tracker_domains(
                self.tracker_threshold)
            print('done!')
        else:
            # set to None so various downstream operations get skipped
            self.tracker_domains = None
    # __init__

    #################
    #   UTILITIES   #
    #################

    def setup_report_dir(self):
        """
        create directory for where the reports go if it does not exist
        """
        if os.path.exists('./reports') == False:
            print('\t\tMaking global reports directory at ./reports.')
            os.makedirs('./reports')

        # set global report_path
        self.report_path = './reports/' + self.db_name

        # set up subdir for this analysis
        if os.path.exists(self.report_path) == False:
            print('\t\tMaking subdirectory for reports at %s'
                  % self.report_path)
            os.makedirs(self.report_path)

        print('\t\tStoring output in %s' % self.report_path)
    # setup_report_dir

    def write_csv(self, file_name, csv_rows):
        """
        basic utility function to write list of csv rows to a file
        """
        full_file_path = self.report_path + '/' + file_name
        with open(full_file_path, 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            for row in csv_rows:
                csv_writer.writerow(row)
        print('\t\tOutput written to %s' % full_file_path)
    # write_csv

    def get_most_common_sorted(self, list_in):
        """
        takes a list, finds the most common items and then resorts alpha
        (b/c python's Counter will arbitrarily order items with same
        count), then sorts again for most-common

        assumes list_in contains alphanumeric tuples
        """
        most_common_sorted = collections.Counter(list_in).most_common()
        # alpha sort first so equal counts come out in stable alpha order
        most_common_sorted.sort()
        # then sort by count, descending; python's sort is stable so the
        # alpha order is preserved within each count
        most_common_sorted.sort(reverse=True, key=lambda item: item[1])
        return most_common_sorted
    # get_most_common_sorted

    def print_runtime(self):
        """
        just for CLI info
        """
        print('~=' * 40)
        print('Finished!')
        print('Time to process: %s' % str(datetime.now() - self.start_time))
        print('-' * 80)
    # print_runtime

    def patch_domain_owners(self):
        """
        in order to analyze what entities receive user data, we need to
        update the database with domain ownership records we have stored
        previously
        """

        # we first clear out what is the db in case the new data has
        # changed, on big dbs takes a while
        print('\t\tFlushing extant domain owner data...', end='', flush=True)
        self.sql_driver.reset_domain_owners()
        print('done!')

        # next we pull the owner/domain pairings from the json file in
        # the resources dir and add to the db
        print('\t\tPatching with new domain owner data...', end='', flush=True)
        domain_owner_data = json.load(
            open(
                os.path.dirname(os.path.abspath(__file__)) +
                '/resources/domain_owners/domain_owners.json', 'r'))
        for item in domain_owner_data:
            # aliases are flattened to a single '<<a>><<b>>' string for
            # storage; get_domain_owner_dict() re-splits them
            aliases = ''
            for alias in item['aliases']:
                aliases += '<<' + alias + '>>'
            self.sql_driver.add_domain_owner(item['id'], item['parent_id'],
                                             item['owner_name'], aliases,
                                             item['homepage_url'],
                                             item['privacy_policy_url'],
                                             item['notes'], item['country'])
            for domain in item['domains']:
                self.sql_driver.update_domain_owner(item['id'], domain)
        print('done!')
    # patch_domain_owners

    def get_domain_owner_dict(self):
        """
        read out everything in the domain_owner table into a dictionary
        so we can easily use it as a global lookup table

        this is purposefully independent of self.patch_domain_owners
        and does not assume the above has been run, however will return
        an empty dictionary if the db has not been patched yet

        reason for the above is that if user does not wish to update with
        the current json file historical data will remain consistent
        """
        domain_owners = {}
        domain_owner_raw_data = self.sql_driver.get_all_domain_owner_data()
        if domain_owner_raw_data:
            for item in domain_owner_raw_data:
                # aliases are stored in the db as a string that needs to
                # be turned into a list
                aliases = []
                for alias in re.split('<<(.+?)>>', item[3]):
                    if alias != '':
                        aliases.append(alias)
                # add everything to the dict
                domain_owners[item[0]] = {
                    'parent_id': item[1],
                    'owner_name': item[2],
                    'aliases': aliases,
                    'homepage_url': item[4],
                    'privacy_policy_url': item[5],
                    'notes': item[6],
                    'country': item[7],
                }
        return domain_owners
    # get_domain_owner_dict
def get_domain_owner_lineage_ids(self, id):
    """
    For a given domain owner id, return the list which corresponds to
    its ownership lineage: the id itself followed by each successive
    parent id, root last.
    """
    if self.domain_owners[id]['parent_id'] == None:
        return [id]
    else:
        return [id] + self.get_domain_owner_lineage_ids(
            self.domain_owners[id]['parent_id'])
# get_domain_owner_lineage_ids

def get_domain_owner_lineage_strings(self, owner_id, get_aliases=False):
    """
    Given an owner_id this function returns a list of
    (owner_id, owner_name) tuples which is the full lineage of
    ownership; optionally also returns aliases
    (e.g. 'Doubleclick' and 'Double Click').
    """
    lineage_strings = []
    # loop var renamed from `owner_id` — it shadowed the parameter
    for lineage_id in self.get_domain_owner_lineage_ids(owner_id):
        lineage_strings.append(
            (lineage_id, self.domain_owners[lineage_id]['owner_name']))
        if get_aliases:
            for alias in self.domain_owners[lineage_id]['aliases']:
                lineage_strings.append((lineage_id, alias))
    return lineage_strings
# get_domain_owner_lineage_strings

def get_domain_owner_lineage_combined_string(self, owner_id):
    """
    Given an owner_id this function returns a single string which is
    the full lineage of ownership, e.g. 'Child > Parent'.
    """
    lineage_string = ''
    for item in self.get_domain_owner_lineage_strings(owner_id):
        lineage_string += item[1] + ' > '
    # BUG FIX: the separator ' > ' is three characters, but the original
    # trimmed only two ([:-2]), leaving a trailing space on every result.
    return lineage_string[:-3]
# get_domain_owner_lineage_combined_string

def get_domain_owner_child_ids(self, id):
    """
    For a given owner id, get all of its children/subsidiaries
    (recursively).

    NOTE(review): the list is extended while being iterated, so for
    ownership chains three or more levels deep, deeper descendants are
    appended more than once; behavior preserved — confirm whether
    callers rely on it before de-duplicating.
    """
    # first get all the children ids if they exist
    child_ids = []
    for item in self.domain_owners:
        if self.domain_owners[item]['parent_id'] == id:
            child_ids.append(item)

    # if we have children, call recursively
    if len(child_ids) > 0:
        for child_id in child_ids:
            child_ids.extend(self.get_domain_owner_child_ids(child_id))

    # return an empty list if no children
    return child_ids
# get_domain_owner_child_ids

def get_top_tlds(self, limit):
    """
    Finds the most common tlds from all the pages.

    Returns a list of (tld, count) tuples of at most `limit` entries,
    with an extra (None, <total pages ok>) entry prepended to stand in
    for "all tlds".
    """
    tlds = []
    for row in self.sql_driver.get_all_tlds():
        tlds.append(row[0])

    top_tlds = collections.Counter(tlds).most_common()

    # cut the list to the limit
    top_tlds = top_tlds[0:limit]

    # push in entry for all tlds
    # (self.get_pages_ok_count is an int set up in __init__)
    top_tlds.insert(0, (None, self.get_pages_ok_count))

    return top_tlds
# get_top_tlds

def get_tracker_domains(self, threshold):
    """
    NOTE: first determines all pairings of page domains and element
    domains; note this is then unique on SITES, not on PAGES, e.g. if
    you have several pages from the same site these links only count
    once.

    Returns a list of element domains which are linked to by at least
    `threshold` distinct sites.
    """
    all_domains = []
    for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs(
    ):
        all_domains.append(page_domain_element_domain[1])

    # count up all the pairs, convert to items() so can process as tuples
    domain_counts = collections.Counter(all_domains).items()

    # put the return values here
    tracker_domains = []

    # check against threshold
    for domain_count in domain_counts:
        if domain_count[1] >= threshold:
            tracker_domains.append(domain_count[0])

    # EDGE CASE
    # likely due to a large threshold we have no tracker domains,
    # so we throw warning and log error
    if len(tracker_domains) == 0:
        # ('Analaysis' typo preserved — it is a runtime string that may
        # be matched elsewhere)
        self.sql_driver.log_error(
            'Analaysis Warning',
            'Tracker Threshold of %s resulted in no tracking domains.' %
            threshold)
        print('\t\t-----------WARNING-----------')
        print(
            '\t\tTracker Threshold of %s resulted in no tracking domains.'
            % threshold)
        print('\t\t-----------------------------')

    return tracker_domains
# get_tracker_domains
#####################
#  REPORT HELPERS   #
#####################

def get_3p_domain_stats(self, num_pages, tld_filter=None):
    """
    Determines basic stats for the number of 3p domains contacted
    per-page; note this is distinct domain+pubsuffix, not fqdns
    (e.g. 'sub.example.com' and 'sub2.example.com' only count as
    'example.com').

    If tracker_domains have been set the stats will reflect only
    third-parties which have crossed the threshold
    (see get_tracker_domains()).

    Returns:
        (mean, median, mode) of per-page domain counts; mode is None
        when statistics.mode raises.
    """
    # map each page id to the list of element domains seen on it; the
    # query has no third-party filter, so every page yields at least one
    # row (its first-party domain)
    domains_by_page = {}
    for page_id, page_domain, element_domain in \
            self.sql_driver.get_page_id_page_domain_element_domain(tld_filter):
        # when a tracker-domain list is active, only count listed domains
        if self.tracker_domains and element_domain not in self.tracker_domains:
            continue
        domains_by_page.setdefault(page_id, []).append(element_domain)

    # per-page counts; pages with no qualifying domains never appeared
    # above, so pad the tally with zeros up to num_pages
    per_page_counts = [len(domains) for domains in domains_by_page.values()]
    per_page_counts.extend([0] * (num_pages - len(per_page_counts)))

    # mean and median should always be ok, but mode can throw an error
    mean = statistics.mean(per_page_counts)
    median = statistics.median(per_page_counts)
    try:
        mode = statistics.mode(per_page_counts)
    except:
        mode = None

    return (mean, median, mode)
# get_3p_domain_stats

def get_3p_cookie_stats(self, num_pages, tld_filter=None):
    """
    Determines basic stats for the number of 3p cookies set per-page;
    note that a single 3p may set more than one cookie.

    If tracker_domains have been set the stats will reflect only
    third-parties which have crossed the threshold
    (see get_tracker_domains()).

    Returns:
        (mean, median, mode) of per-page cookie counts; mode is None
        when statistics.mode raises.
    """
    # map each page id to the list of 3p cookie ids observed on it
    cookies_by_page = {}
    for page_id, cookie_id, cookie_domain in \
            self.sql_driver.get_page_id_3p_cookie_id_3p_cookie_domain(tld_filter):
        # when a tracker-domain list is active, skip cookies set by
        # domains which are not trackers
        if self.tracker_domains and cookie_domain not in self.tracker_domains:
            continue
        cookies_by_page.setdefault(page_id, []).append(cookie_id)

    # per-page counts, padded with zeros for cookie-free pages
    per_page_counts = [len(ids) for ids in cookies_by_page.values()]
    per_page_counts.extend([0] * (num_pages - len(per_page_counts)))

    # mean and median should always be ok, but mode can throw an error
    mean = statistics.mean(per_page_counts)
    median = statistics.median(per_page_counts)
    try:
        mode = statistics.mode(per_page_counts)
    except:
        mode = None

    return (mean, median, mode)
# get_3p_cookie_stats
#####################
# REPORT GENERATORS #
#####################

def generate_db_summary_report(self):
    """
    outputs and stores report of basic data about how many records
    in db, etc.

    Writes 'db_summary.csv' via self.write_csv and echoes the same
    figures to the CLI.

    NOTE(review): divides by total_pages_attempted — a db with zero
    attempted pages would raise ZeroDivisionError; confirm callers only
    run reports on populated dbs.
    """
    print('\t================')
    print('\t General Summary')
    print('\t================')

    csv_rows = []

    # pages that loaded successfully
    total_pages_ok = self.sql_driver.get_pages_ok_count()
    print("\t\tTotal Pages OK:\t\t\t%s" % total_pages_ok)
    csv_rows.append(('Total Pages OK', total_pages_ok))

    # pages that failed to load
    total_pages_noload = self.sql_driver.get_pages_noload_count()
    total_pages_attempted = total_pages_ok + total_pages_noload
    print("\t\tTotal Pages FAIL:\t\t%s" % total_pages_noload)
    csv_rows.append(('Total Pages FAIL', total_pages_noload))

    print("\t\tTotal Pages Attempted:\t\t%s" % total_pages_attempted)
    csv_rows.append(('Total Pages Attempted', total_pages_attempted))

    percent_pages_OK = (total_pages_ok / total_pages_attempted) * 100
    print("\t\t%% Pages OK:\t\t\t%.2f%%"
          % round(percent_pages_OK, self.num_decimals))
    csv_rows.append(
        ('% Pages OK', round(percent_pages_OK, self.num_decimals)))

    print('\t\t---')

    total_errors = self.sql_driver.get_total_errors_count()
    print("\t\tTotal Errors:\t\t\t%s" % total_errors)
    csv_rows.append(('Total Errors', total_errors))

    print('\t\t---')

    total_3p_cookies = self.sql_driver.get_total_cookie_count(is_3p=True)
    print("\t\tTotal 3P Cookies:\t\t%s" % total_3p_cookies)
    csv_rows.append(('Total Cookies', total_3p_cookies))

    print('\t\t---')

    # see if we have both 1p/3p requests, if so show stats for all
    total_1p_elements = self.sql_driver.get_total_request_count(
        party='first')
    if total_1p_elements > 0:
        total_elements = self.sql_driver.get_total_request_count()
        print("\t\tTotal Elements Requested:\t%s" % total_elements)
        csv_rows.append(('Total Elements Requested', total_elements))

        total_elements_received = self.sql_driver.get_total_request_count(
            received=True)
        print("\t\tTotal Elements Received:\t%s" % total_elements_received)
        csv_rows.append(
            ('Total Elements Received', total_elements_received))

        percent_element_received = (total_elements_received /
                                    total_elements) * 100
        print('\t\tTotal %% Elements Received:\t%.2f%%'
              % percent_element_received)
        csv_rows.append(('Total % Elements Received',
                         round(percent_element_received,
                               self.num_decimals)))
        print('\t\t---')

    # only 3p request/receive info - we always do this
    total_3p_elements = self.sql_driver.get_total_request_count(
        party='third')
    print("\t\t3P Elements Requested:\t\t%s" % total_3p_elements)
    csv_rows.append(('3P Elements Requested', total_3p_elements))

    # avoid divide-by-zero if no 3p elements
    if total_3p_elements > 0:
        total_3p_elements_received = self.sql_driver.get_total_request_count(
            received=True, party='third')
        print("\t\t3P Elements Received:\t\t%s"
              % total_3p_elements_received)
        csv_rows.append(
            ('3P Elements Received', total_3p_elements_received))

        percent_3p_element_received = (total_3p_elements_received /
                                       total_3p_elements) * 100
        print('\t\t3P %% Elements Received:\t\t%.2f%%'
              % percent_3p_element_received)
        csv_rows.append(('3P % Elements Received',
                         round(percent_3p_element_received,
                               self.num_decimals)))

    print('\t\t' + '-' * 40)
    self.write_csv('db_summary.csv', csv_rows)
# generate_db_summary_report

def generate_stats_report(self):
    """
    high level stats

    Writes one csv per entry in self.top_tlds: 'stats.csv' for the
    aggregate (tld None) entry and '<tld>-stats.csv' for each top tld.

    NOTE(review): several ratios divide by values that can be zero
    (total_pages, total_elements_received, len(all_load_times)) — a
    sparse db would raise ZeroDivisionError here; confirm intended.
    """
    print('\t=============================')
    print('\t Processing High-Level Stats ')
    print('\t=============================')

    for tld in self.top_tlds:
        csv_rows = []

        # tld is a (name, count) tuple; name None means "all pages"
        if tld[0]:
            tld_filter = tld[0]
            file_name = tld[0] + '-stats.csv'
        else:
            tld_filter = None
            file_name = 'stats.csv'

        # page info
        total_pages = self.sql_driver.get_complex_page_count(tld_filter)
        # self.get_pages_ok_count is an int set up in __init__
        total_pages_percent = (total_pages / self.get_pages_ok_count) * 100
        total_pages_elements = self.sql_driver.get_complex_page_count(
            tld_filter, 'elements', self.tracker_domains)
        percent_with_elements = (total_pages_elements / total_pages) * 100
        total_pages_cookies = self.sql_driver.get_complex_page_count(
            tld_filter, 'cookies', self.tracker_domains)
        percent_with_cookies = (total_pages_cookies / total_pages) * 100
        total_pages_js = self.sql_driver.get_complex_page_count(
            tld_filter, 'javascript', self.tracker_domains)
        percent_with_js = (total_pages_js / total_pages) * 100
        total_pages_ssl = self.sql_driver.get_pages_ok_count(is_ssl=True)
        percent_pages_ssl = (total_pages_ssl / total_pages) * 100

        # elements info
        total_elements_received = self.sql_driver.get_total_request_count(
            received=True)
        total_elements_received_ssl = self.sql_driver.get_total_request_count(
            received=True, is_ssl=True)

        total_elements_received_1p = self.sql_driver.get_total_request_count(
            received=True, party='first')
        total_elements_received_1p_ssl = self.sql_driver.get_total_request_count(
            received=True, party='first', is_ssl=True)

        total_elements_received_3p = self.sql_driver.get_total_request_count(
            received=True, party='third')
        total_elements_received_3p_ssl = self.sql_driver.get_total_request_count(
            received=True, party='third', is_ssl=True)

        # average page load time in ms
        all_load_times = self.sql_driver.get_pages_load_times()
        all_load_times_sum = 0
        for load_time in all_load_times:
            all_load_times_sum += load_time
        average_page_load_time = all_load_times_sum / len(all_load_times)

        if self.tracker_threshold:
            filter_depth = self.tracker_threshold
        else:
            filter_depth = 'No Filter Used'

        # (mean, median, mode) tuples from the report helpers
        domain_stats = self.get_3p_domain_stats(total_pages, tld_filter)
        domain_mean = domain_stats[0]
        domain_median = domain_stats[1]
        domain_mode = domain_stats[2]

        cookie_stats = self.get_3p_cookie_stats(total_pages, tld_filter)
        cookie_mean = cookie_stats[0]
        cookie_median = cookie_stats[1]
        cookie_mode = cookie_stats[2]

        csv_rows.append(('N Pages Loaded', total_pages))
        csv_rows.append(('% of all Pages', total_pages_percent))
        csv_rows.append(
            ('% Pages SSL', round(percent_pages_ssl, self.num_decimals)))
        csv_rows.append(('N Elements Received', total_elements_received))
        csv_rows.append(
            ('% Elements Received SSL',
             round((total_elements_received_ssl /
                    total_elements_received) * 100, self.num_decimals)))
        csv_rows.append(
            ('N 1P Elements Received', total_elements_received_1p))
        csv_rows.append(('% 1P Elements Received SSL',
                         round((total_elements_received_1p_ssl /
                                total_elements_received_1p) * 100,
                               self.num_decimals)))
        csv_rows.append(
            ('N 3P Elements Received', total_elements_received_3p))
        csv_rows.append(('% 3P Elements Received SSL',
                         round((total_elements_received_3p_ssl /
                                total_elements_received_3p) * 100,
                               self.num_decimals)))
        csv_rows.append(('Average Page Load Time (ms)',
                         round(average_page_load_time, self.num_decimals)))
        csv_rows.append(('% w/3p Element',
                         round(percent_with_elements, self.num_decimals)))
        csv_rows.append(
            ('% w/3p Cookie',
             round(percent_with_cookies, self.num_decimals)))
        csv_rows.append(
            ('% w/3p Javascript',
             round(percent_with_js, self.num_decimals)))
        csv_rows.append(
            ('Mean 3p Domains', round(domain_mean, self.num_decimals)))
        csv_rows.append(('Median 3p Domains', domain_median))
        csv_rows.append(('Mode 3p Domains', domain_mode))
        csv_rows.append(
            ('Mean 3p Cookies', round(cookie_mean, self.num_decimals)))
        csv_rows.append(('Median 3p Cookies', cookie_median))
        csv_rows.append(('Mode 3p Cookies', cookie_mode))
        csv_rows.append(('Filter Depth Used', filter_depth))

        self.write_csv(file_name, csv_rows)
# generate_stats_report

def generate_aggregated_tracking_attribution_report(self):
    """
    generates ranked list of which entities collect data
    from the greatest number of pages ('data_flow_ownership.csv')

    - entities which have subsidiaries are ranked according
      to the pages their subsidiaries get data from as well
    - however, parent entities only get one hit on
      a page which has multiple subsidiaries present
    - for example, if a page has 'google analytics' and
      'doubleclick' that is only one hit for 'google'

    also able to filter by tld
    """
    # (method body continues below)
print('\t======================================') print('\t Processing Aggregated Tracking Report ') print('\t======================================') for tld in self.top_tlds: csv_rows = [] csv_rows.append(('Percentage Pages Tracked', 'Owner', 'Owner Country', 'Owner Lineage')) # will need this value to determine percentages later on total_pages = self.sql_driver.get_complex_page_count( tld_filter=tld[0]) # list will have entry for each hit on a given entity all_owner_occurances = [] # each page id is a key which corresponds to a list of # ids for entities which own the 3p element domains page_to_element_owners = {} # this query may produce a large volume of results! results = self.sql_driver.get_all_page_id_3p_domain_owner_ids( tld_filter=tld[0]) # for each result we either create a new list, or extend the existing one # with the ids of the owners of the 3p elements for item in results: page_id = item[0] element_owner_id = item[1] if page_id not in page_to_element_owners: page_to_element_owners[page_id] = [element_owner_id] else: page_to_element_owners[page_id] = page_to_element_owners[ page_id] + [element_owner_id] # now that we have ids for each page, we can look up the lineage # to create the aggregate measure of how often entities appear for item in page_to_element_owners: # this is a set so items which appear more than once only get counted once # reset this for each page page_domain_owners = set() # we are operating on a list of ids which correspond to the owners of domains which get the data for page_3p_owner_id in page_to_element_owners[item]: # for each domain owner we also count all of its parents by getting the lineage for lineage_id in self.get_domain_owner_lineage_ids( page_3p_owner_id): page_domain_owners.add( (lineage_id, self.domain_owners[lineage_id]['owner_name'])) # we have finished processing for this page so we add the owner ids to the full list for owner_id in page_domain_owners: all_owner_occurances.append(owner_id) # write out data to csv 
for item in self.get_most_common_sorted(all_owner_occurances): # we want to specify the parent name for each item, or if there is no parent, identify as such parent_id = self.domain_owners[item[0][0]]['parent_id'] if parent_id: parent_name = self.domain_owners[parent_id]['owner_name'] else: parent_name = '' csv_rows.append((round( (item[1] / total_pages) * 100, 2), item[0][1], self.domain_owners[item[0][0]]['country'], self.get_domain_owner_lineage_combined_string( item[0][0]))) # set file name prefix when doing tld-bounded report if tld[0]: file_name = tld[0] + '-aggregated_tracking_attribution.csv' else: file_name = 'aggregated_tracking_attribution.csv' # done! self.write_csv(file_name, csv_rows) # generate_aggregated_tracking_attribution_report def generate_aggregated_3p_ssl_use_report(self): """ this report tells us the percentage of requests made to a given third-party are encrypted """ print('\t=========================================') print('\t Processing Aggregated 3P SSL Use Report ') print('\t=========================================') csv_rows = [] domain_owners_ssl_use_dict = {} for item in self.sql_driver.get_3p_element_domain_owner_id_ssl_use(): child_domain_owner_id = item[0] is_ssl = item[1] for domain_owner_id in self.get_domain_owner_lineage_ids( child_domain_owner_id): if domain_owner_id not in domain_owners_ssl_use_dict: domain_owners_ssl_use_dict[domain_owner_id] = [is_ssl] else: domain_owners_ssl_use_dict[ domain_owner_id] = domain_owners_ssl_use_dict[ domain_owner_id] + [is_ssl] for domain_owner_id in domain_owners_ssl_use_dict: csv_rows.append( (round( 100 * (sum(domain_owners_ssl_use_dict[domain_owner_id]) / len(domain_owners_ssl_use_dict[domain_owner_id])), self.num_decimals), self.domain_owners[domain_owner_id]['owner_name'], self.domain_owners[domain_owner_id]['country'], self.get_domain_owner_lineage_combined_string(domain_owner_id) )) # sort results by owner, note is upper then lower case # would cause code bloat to do otherwise, but 
worth considering csv_rows.sort(key=itemgetter(1)) # now sort by percentage of encrypted requests descending csv_rows.sort(key=itemgetter(0), reverse=True) # insert header row after sort csv_rows[0] = ('Percent Requests Encrypted', 'Owner', 'Owner Country', 'Owner Lineage') # done! self.write_csv('3p_ssl_use.csv', csv_rows) # generate_aggregated_3p_ssl_use_report def generate_per_page_data_flow_report(self): """ generates a csv which has information on data flows for each page note this file may be very large and is disabled by default """ print('\t======================================') print('\t Processing Per-Page Data Flow Report ') print('\t======================================') file_name = 'per_page_data_flow.csv' csv_rows = [] csv_rows.append(('Final URL', '3P Domain', 'Owner', 'Owner Country', 'Owner Lineage')) for item in self.sql_driver.get_all_pages_3p_domains_and_owners(): # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[3] != None: csv_rows.append( (item[1], item[2], self.domain_owners[item[3]]['owner_name'], self.domain_owners[item[3]]['country'], self.get_domain_owner_lineage_combined_string(item[3]))) else: csv_rows.append((item[1], item[2], 'Unknown', '', '')) self.write_csv(file_name, csv_rows) # generate_per_page_data_flow_report def generate_3p_domain_report(self): """ this queries the db to get all elements, domains, and domain owners next they are counted to find the most common and formatted to csv rows and returned """ print('\t==============================') print('\t Processing 3P Domains Report ') print('\t==============================') for tld in self.top_tlds: csv_rows = [] csv_rows.append(('Percent Total', 'Domain', 'Owner', 'Owner Country', 'Owner Lineage')) if tld[0]: tld_filter = tld[0] file_name = tld[0] + '-3p_domains.csv' else: tld_filter = None file_name = '3p_domains.csv' total_pages = tld[1] all_3p_domains = [] for item in self.sql_driver.get_3p_domain_owners(tld_filter): 
all_3p_domains.append((item[1], item[2])) # if num_results is None we get everything, otherwise stops at limit for item in self.get_most_common_sorted( all_3p_domains)[:self.num_results]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[0][1] != None: owner_name = self.domain_owners[item[0][1]]['owner_name'] owner_country = self.domain_owners[item[0][1]]['country'] owner_lineage = self.get_domain_owner_lineage_combined_string( item[0][1]) else: owner_name = 'Unknown' owner_country = '' owner_lineage = '' csv_rows.append((round((item[1] / total_pages) * 100, self.num_decimals), item[0][0], owner_name, owner_country, owner_lineage)) self.write_csv(file_name, csv_rows) # generate_3p_domain_report def generate_3p_element_report(self, element_type=None): """ this queries the db to get all elements, domains, or domain owners next they are counted to find the most common and formatted to csv rows and returned """ if element_type == 'javascript': print('\t=================================') print('\t Processing 3P Javascript Report ') print('\t=================================') elif element_type == 'image': print('\t=============================') print('\t Processing 3P Images Report ') print('\t=============================') else: print('\t==============================') print('\t Processing 3P Element Report ') print('\t==============================') for tld in self.top_tlds: total_pages = tld[1] csv_rows = [] csv_rows.append( ('Percent Total', 'Element', 'Extension', 'Type', 'Domain', 'Owner', 'Owner Country', 'Owner Lineage')) if tld[0]: tld_filter = tld[0] if element_type: file_name = tld[0] + '-3p_' + element_type + '.csv' else: file_name = tld[0] + '-3p_element.csv' else: tld_filter = None if element_type: file_name = '3p_' + element_type + '.csv' else: file_name = '3p_element.csv' all_3p_elements = [] for item in self.sql_driver.get_3p_elements( tld_filter, element_type): # we need to drop off the first element returned 
here # perhaps tho it should happen insql? all_3p_elements.append( (item[1], item[2], item[3], item[4], item[5])) # if num_results is None we get everything, otherwise stops at limit for item in self.get_most_common_sorted( all_3p_elements)[:self.num_results]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[0][4] != None: owner_name = self.domain_owners[item[0][4]]['owner_name'] owner_country = self.domain_owners[item[0][4]]['country'] owner_lineage = self.get_domain_owner_lineage_combined_string( item[0][4]) else: owner_name = 'Unknown' owner_country = '' owner_lineage = '' csv_rows.append( (round( (item[1] / total_pages) * 100, self.num_decimals), item[0][0], item[0][1], item[0][2], item[0][3], owner_name, owner_country, owner_lineage)) self.write_csv(file_name, csv_rows) # generate_3p_element_report def generate_data_transfer_report(self): """ this report tells us how much data was transferred across several dimensions """ print('\t==================================') print('\t Processing Data Transfer Reports ') print('\t==================================') for tld in self.top_tlds: # set up filter and file names if tld[0]: tld_filter = tld[0] summary_file_name = tld[0] + '-data_xfer_summary.csv' domain_file_name = tld[0] + '-data_xfer_by_domain.csv' aggregated_file_name = tld[0] + '-data_xfer_aggregated.csv' else: tld_filter = None summary_file_name = 'data_xfer_summary.csv' domain_file_name = 'data_xfer_by_domain.csv' aggregated_file_name = 'data_xfer_aggregated.csv' # get the data from db, tuple of (element_domain, size, is_3p (boolean), domain_owner_id) element_sizes = self.sql_driver.get_element_sizes( tld_filter=tld_filter) # initialize vars first_party_data = 0 third_party_data = 0 total_data = 0 # need Counter object, allows sorting later domain_data = collections.Counter() owner_data = collections.Counter() # process each row for item in element_sizes: element_domain = item[0] element_size = item[1] element_is_3p 
= item[2] domain_owner_id = item[3] # this is the measure of all data downloaded total_data += element_size # measures for third and first party data if element_is_3p: third_party_data += element_size else: first_party_data += element_size # data by domain, increment if already in there, otherwise new entry if element_domain in domain_data: domain_data[element_domain] += element_size else: domain_data[element_domain] = element_size # only if we know the owner, increment if domain_owner_id: for lineage_id in self.get_domain_owner_lineage_ids( domain_owner_id): if lineage_id in owner_data: owner_data[lineage_id] += element_size else: owner_data[lineage_id] = element_size # output data to csv summary_data_csv = [] summary_data_csv.append( ('Party', 'Percent Total', 'Data Transfered (bytes)')) summary_data_csv.append(('All', '100', total_data)) summary_data_csv.append( ('First', round((first_party_data / total_data) * 100, self.num_decimals), first_party_data)) summary_data_csv.append( ('Third', round((third_party_data / total_data) * 100, self.num_decimals), third_party_data)) self.write_csv(summary_file_name, summary_data_csv) # sort and output ranked data domain_data = domain_data.most_common() domain_data.sort() domain_data.sort(reverse=True, key=lambda item: item[1]) # for csv data domain_data_csv = [] domain_data_csv.append( ('Percent Total', 'Domain', 'Data Transfered (bytes)')) # if num_results is None we get everything, otherwise stops at limit for item in domain_data[:self.num_results]: domain_data_csv.append( (round((item[1] / total_data) * 100, self.num_decimals), item[0], item[1])) self.write_csv(domain_file_name, domain_data_csv) owner_data = self.get_most_common_sorted(owner_data) owner_data_csv = [] owner_data_csv.append(('Percent Total', 'Owner', 'Owner Country', 'Owner Lineage', 'Data Transfered (bytes)')) # get results for all known owners for item in owner_data: owner_data_csv.append( (round((item[1] / total_data) * 100, self.num_decimals), 
self.domain_owners[item[0]]['owner_name'], self.domain_owners[item[0]]['country'], self.get_domain_owner_lineage_combined_string(item[0]), item[1])) self.write_csv(aggregated_file_name, owner_data_csv) # generate_data_transfer_report def generate_network_report(self): """ this report generates data necessary for graph/network analysis by outputting a list of page domains and the elements/owners they connect to """ print('\t=========================') print('\t Processing Network Ties ') print('\t=========================') # put output here csv_rows = [] # header row for csv csv_rows.append(('Page Domain', '3P Element Domain', '3P Domain Owner', '3P Domain Owner Country')) # sql_driver.get_network_ties returns a set of tuples in the format # (page domain, element domain, element domain owner id) # we just go through this data to produce the report for item in self.sql_driver.get_3p_network_ties(): # if a page has no elements, edge[1] will be 'None' so we skip it # an alternate approach would be to include as orphan nodes if item[1]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[2] != None: csv_rows.append((item[0], item[1], self.domain_owners[item[2]]['owner_name'], self.domain_owners[item[2]]['country'])) else: csv_rows.append((item[0], item[1], 'Unknown', '')) self.write_csv('network.csv', csv_rows)
def __init__(self, db_engine, db_name, num_tlds, num_results, tracker_threshold=None, flush_owner_db=True): """ This performs a few start-up tasks: - sets up some useful global variables - makes sure we have a directory to store the reports - flushes the existing domain_owner mappings (this can be disabled) - if we want to do per-tld reports, figures out the most common - if we want to filter against a given tracker threshold, sets it up here (see documentation below for tracker threshold) """ # set various global vars self.db_engine = db_engine self.db_name = db_name self.num_tlds = num_tlds self.top_tlds = [] self.num_results = num_results self.tracker_threshold = tracker_threshold self.start_time = datetime.now() # number of decimal places to round to in reports self.num_decimals = 2 # set up global db connection if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver self.sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver(self.db_name) elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALID DB ENGINE FOR %s, QUITTING!' 
% db_engine) exit() # this is reused often, do it once to save time self.get_pages_ok_count = self.sql_driver.get_pages_ok_count() print('\t=============================') print('\t Checking Output Directories ') print('\t=============================') self.setup_report_dir() print('\t============================') print('\t Patching Domain Owner Data ') print('\t============================') if flush_owner_db: # update the domains to their owners in the db, can be overridden # by changing flush_owner_db to false self.patch_domain_owners() else: print('\t\t\tSkipping') # this is used in various places to get owner information self.domain_owners = self.get_domain_owner_dict() # if we want to get sub-reports for the most frequent tlds we find # them here if self.num_tlds: print('\t=====================') print('\t Getting top %s tlds' % self.num_tlds) print('\t=====================') print('\t\tProcessing...', end='', flush=True) self.top_tlds = self.get_top_tlds(self.num_tlds) print('done!') print('\t\tThe top tlds are:') for (tld, pages) in self.top_tlds: if tld: print('\t\t |- %s (%s)' % (tld, pages)) else: # othewise we push in a single empty entry self.top_tlds.append((None, self.get_pages_ok_count)) # SPECIAL FEATURE FOR EXPERTS: tracker domain filter # # you can set a threshold of the number of sites a given 3p domain # is connected to - domains connecting to many sites may correlate those visits # so we call these 'tracker domains' # # the 'tracker_threshold' variable set above controls the filtering level # # on large set of sites (e.g. >10k) this works well but on small samples # (e.g. 
<500) it doesn't work as well as known tracker domains may only # appear on a single site # # this is off by default and unless you understand what you are doing # don't use this...but because you are reading the source code for an otherwise # undocumented feature you are probably competent to use it ;-) # # longer-term we may want to train off a bigger corpus to find tracker domains and # have them prepackaged # # use at your own risk! if tracker_threshold: print('\t===================================================') print('\t Getting tracker domains with threshold level of %s' % self.tracker_threshold) print('\t===================================================') print('\t\tProcessing...', end='', flush=True) self.tracker_domains = self.get_tracker_domains( self.tracker_threshold) print('done!') else: # set to None so various downstream operations get skipped self.tracker_domains = None
class Utilities: def __init__(self, db_name=None, db_engine=None): # if we have db params set up global db connection, otherwise we don't bother if db_name: if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver(db_name) elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver(db_name) else: print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() elif db_engine: if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver() elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver() else: print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() self.url_parser = ParseURL() # __init__ def check_dependencies(self): import sys if sys.version_info[0] < 3 or sys.version_info[1] < 4: print( '******************************************************************************' ) print( ' Python 3.4 or above is required for webXray; please check your installation. ' ) print( '******************************************************************************' ) quit() try: from websocket import create_connection except: print('*******************************************************') print(' The websocket-client library is needed for webXray. ') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() try: from textstat.textstat import textstat except: print('*******************************************************') print(' The textstat library is needed for webXray. ') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() try: import lxml.html except: print('*******************************************************') print(' The lxml library is needed for webXray. 
') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() # check_dependencies def get_default_config(self, config_type): # the following are two pre-configured options for # haystack and forensic scans, can be tweaked as desired if config_type == 'haystack': return { 'client_browser_type': 'chrome', 'client_prewait': 10, 'client_no_event_wait': 20, 'client_max_wait': 60, 'client_get_bodies': False, 'client_get_bodies_b64': False, 'client_get_screen_shot': False, 'client_get_text': False, 'client_crawl_depth': 3, 'client_crawl_retries': 5, 'client_page_load_strategy': 'none', 'client_reject_redirects': False, 'client_min_internal_links': 5, 'max_attempts': 5, 'store_1p': True, 'store_base64': False, 'store_files': True, 'store_screen_shot': False, 'store_source': False, 'store_page_text': False, 'store_links': True, 'store_dom_storage': True, 'store_responses': True, 'store_request_xtra_headers': True, 'store_response_xtra_headers': True, 'store_requests': True, 'store_websockets': True, 'store_websocket_events': True, 'store_event_source_msgs': True, 'store_cookies': True, 'store_security_details': True, 'timeseries_enabled': True, 'timeseries_interval': 0 } elif config_type == 'forensic': return { 'client_browser_type': 'chrome', 'client_prewait': 10, 'client_no_event_wait': 20, 'client_max_wait': 60, 'client_get_bodies': True, 'client_get_bodies_b64': True, 'client_get_screen_shot': True, 'client_get_text': True, 'client_crawl_depth': 3, 'client_crawl_retries': 5, 'client_page_load_strategy': 'none', 'client_reject_redirects': True, 'client_min_internal_links': 5, 'max_attempts': 5, 'store_1p': True, 'store_base64': True, 'store_files': True, 'store_screen_shot': True, 'store_source': True, 'store_page_text': True, 'store_links': True, 'store_dom_storage': True, 'store_responses': True, 'store_request_xtra_headers': True, 'store_response_xtra_headers': True, 'store_requests': True, 
'store_websockets': True, 'store_websocket_events': True, 'store_event_source_msgs': True, 'store_cookies': True, 'store_security_details': True, 'timeseries_enabled': True, 'timeseries_interval': 0 } elif config_type == 'custom': print('Create a custom config in Utilities.py') quit() else: print('Invalid config option, see Utilities.py') quit() # get_default_config def select_wbxr_db(self): """ databases are stored with a prefix (default 'wbxr_'), this function helps select a database in interactive mode """ # you can optionally specify a different prefix here by setting "db_prefix = '[PREFIX]'" wbxr_dbs = self.sql_driver.get_wbxr_dbs_list() wbxr_dbs.sort() if len(wbxr_dbs) == 0: print( '''\t\tThere are no databases to analyze, please try [C]ollecting data or import an existing wbxr-formatted database manually.''') interaction() return for index, db_name in enumerate(wbxr_dbs): print('\t\t[%s] %s' % (index, db_name)) max_index = len(wbxr_dbs) - 1 # interaction step: loop until we get acceptable input while True: selected_db_index = input("\n\tPlease select database by number: ") if selected_db_index.isdigit(): selected_db_index = int(selected_db_index) if selected_db_index >= 0 and selected_db_index <= max_index: break else: print( '\t\t You entered an invalid string, please select a number in the range 0-%s.' % max_index) continue else: print( '\t\t You entered an invalid string, please select a number in the range 0-%s.' % max_index) continue db_name = wbxr_dbs[selected_db_index] return db_name # select_wbxr_db def stream_rate(self, type='scan', return_json=False, client_id=None): """ This function is a generator which determines the rate at which pages are being add to the db allowing us to evaluate our rate of progress. 
""" # initialize dictionary to store rate data client_rate_data = {} # this diction will hold all the rates for each client so we can # easily figure out the average rate all_rates = {} # None store the aggregate data for all clients client_rate_data[None] = {} all_rates[None] = [] # add entries for each client for client_id, in self.sql_driver.get_client_list(): client_rate_data[client_id] = {} all_rates[client_id] = [] # for client_id in ['wbxr0','wbxr1','wbxr2','wbxr3','wbxr4','wbxr5']: # client_rate_data[client_id] = {} # all_rates[client_id] = [] crawl_depth = self.sql_driver.get_config()['client_crawl_depth'] # set time window we want to look at to see how many # pages have been recently added # set the time gap between updates, leaving it too short # means lots of db calls if type == 'scan' or type == 'policy': wait_seconds = 10 interval_seconds = 600 elif type == 'task': wait_seconds = 30 interval_seconds = 30 # keep track of how long we've been doing this elapsed_seconds = 0 # for tasks if type == 'task': old_task_count = self.sql_driver.get_pending_task_count() # this runs forever, no terminating condition while True: # simple increment, note we we /60 before we return # for minutes conversion elapsed_seconds += wait_seconds remaining_tasks = self.sql_driver.get_task_queue_length() total_count = 0 for client_id, count in self.sql_driver.get_recent_page_count_by_client_id( interval_seconds): total_count += count # to get rate/hour we take the number of pages we've added per # second *3600 current_rate = (count / interval_seconds) * 3600 # this list is all the rates we've seen all_rates[client_id] = all_rates[client_id] + [current_rate] # nice built-in to get the average rate average_rate = statistics.mean(all_rates[client_id]) # figure out how much longer to go, gracefully handle # a rate of zero if average_rate != 0: remaining_hours = remaining_tasks / average_rate else: remaining_hours = 0 # dictionary of the data to return client_rate_data[client_id] = 
{ 'elapsed_minutes': round(elapsed_seconds / 60, 2), 'current_rate': round(current_rate, 2), 'average_rate': round(average_rate, 2), 'remaining_tasks': remaining_tasks, 'remaining_hours': round(remaining_hours, 2) * crawl_depth } # for overall measure total_current_rate = (total_count / interval_seconds) * 3600 all_rates[None] += [total_current_rate] total_average_rate = statistics.mean(all_rates[None]) # figure out how much longer to go, gracefully handle # a rate of zero if total_average_rate != 0: remaining_hours = round( (remaining_tasks / total_average_rate) * crawl_depth, 2) else: remaining_hours = 0 # round down for days if remaining_hours > 24: remaining_time = f'{round(remaining_hours/24,2)} days' else: remaining_time = f'{remaining_hours} hours' client_rate_data[None] = { 'elapsed_minutes': round(elapsed_seconds / 60, 2), 'current_rate': round(total_current_rate, 2), 'average_rate': round(total_average_rate, 2), 'remaining_tasks': remaining_tasks, 'remaining_hours': remaining_time } # if we are called by the flask admin_console it is # easiest to do json formatting here, otherwise # we don't. if return_json: yield f"data:{json.dumps(client_rate_data)}\n\n" else: yield client_rate_data # wait until we send a new update time.sleep(wait_seconds) # stream_rate def setup_report_dir(self, db_name): """ Create directory for where the reports go if it does not exist, returns the path. 
""" if os.path.exists('./reports') == False: print('\t\tMaking global reports directory at ./reports.') os.makedirs('./reports') # set global report_path report_path = './reports/' + db_name # set up subdir for this analysis if os.path.exists(report_path) == False: print('\t\tMaking subdirectory for reports at %s' % report_path) os.makedirs(report_path) print('\t\tStoring output in %s' % report_path) return report_path # setup_report_dir def write_csv(self, report_path, file_name, csv_rows, num_decimals=2): """ basic utility function to write list of csv rows to a file """ full_file_path = report_path + '/' + file_name with open(full_file_path, 'w', newline='', encoding='utf-8') as csvfile: csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) for row in csv_rows: rounded_row = [] for item in row: # round floats and decimals if isinstance(item, float) or isinstance( item, decimal.Decimal): rounded_row.append(round(item, num_decimals)) else: rounded_row.append(item) csv_writer.writerow(rounded_row) print('\t\tOutput written to %s' % full_file_path) # write_csv def print_runtime(self, action_name, start_time): """ Just for CLI info """ print('-' * 40) print('\t%s finished in %s' % (action_name, str(datetime.now() - start_time))) print('-' * 40) # print_runtime def get_absolute_url_from_page_link(self, page_url, link_url): """ Given a page_url and a link_url from that page we determine the absolute url of the link from the page_url. 
""" # ex nihilo nihil fit if link_url == None: return None if len(link_url) == 0: return None # we use the info from the original url for converting # relative links to absolute parsed_page_url = urlparse(page_url) # this is an absolute url already, nothing further to do to if re.match('^https?://', link_url): return (link_url) # link with no scheme, paste it in elif re.match('^//', link_url): return (parsed_page_url.scheme + ':' + link_url) # relative link, fix it up else: if link_url[0] != '/': return (parsed_page_url.scheme + '://' + parsed_page_url.netloc + '/' + link_url) else: return (parsed_page_url.scheme + '://' + parsed_page_url.netloc + link_url) # this only happens if something breaks return None # get_absolute_url_from_link def get_most_common_sorted(self, list_in): """ takes a list, finds the most common items and then resorts alpha (b/c python's Counter will arbitrarily order items with same count), then sorts again for most-common assumes list_in contains alphanumeric tuples """ most_common_sorted = collections.Counter(list_in).most_common() most_common_sorted.sort() most_common_sorted.sort(reverse=True, key=lambda item: item[1]) return most_common_sorted # get_most_common_sorted ######################### # POLICY EXTRACTION # ######################### def get_policy_link_terms(self): """ Returns a list of terms used to indicate a link may be a policy, note languages are all mixed together. """ policy_link_terms = [] # go through json file and merge terms together for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): for term in lang_term_set['policy_link_terms']: policy_link_terms.append(term) return policy_link_terms # get_policy_link_terms def get_policy_verification_terms(self): """ Returns a dictionary of terms used to verify several types of policies, note languages are all mixed together. 
""" policy_verification_terms = {} policy_verification_terms['privacy_policy'] = [] policy_verification_terms['terms_of_service'] = [] policy_verification_terms['cookie_policy'] = [] policy_verification_terms['ad_choices'] = [] policy_verification_terms['gdpr_statement'] = [] policy_verification_terms['ccpa_statement'] = [] # go through json file and merge terms together for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): for term in lang_term_set['privacy_policy_verification_terms']: policy_verification_terms[ 'privacy_policy'] = policy_verification_terms[ 'privacy_policy'] + [term] for term in lang_term_set['terms_of_service_verification_terms']: policy_verification_terms[ 'terms_of_service'] = policy_verification_terms[ 'terms_of_service'] + [term] for term in lang_term_set['cookie_policy_verification_terms']: policy_verification_terms[ 'cookie_policy'] = policy_verification_terms[ 'cookie_policy'] + [term] for term in lang_term_set['ad_choices_verification_terms']: policy_verification_terms[ 'ad_choices'] = policy_verification_terms['ad_choices'] + [ term ] for term in lang_term_set['gdpr_statement_verification_terms']: policy_verification_terms[ 'gdpr_statement'] = policy_verification_terms[ 'gdpr_statement'] + [term] for term in lang_term_set['ccpa_statement_verification_terms']: policy_verification_terms[ 'ccpa_statement'] = policy_verification_terms[ 'ccpa_statement'] + [term] return policy_verification_terms # get_policy_verification_terms def get_lang_to_privacy_policy_term_dict(self): """ Returns a dict of privacy policy terms keyed by language code. 
""" lang_to_terms = {} for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): lang_to_terms[ lang_term_set['lang']] = lang_term_set['policy_terms'] return lang_to_terms # get_lang_to_priv_term_dict ######################### # DOMAIN OWNERSHIP # ######################### def get_domain_owner_dict(self): """ read out everything in the domain_owner table into a dictionary so we can easily use it as a global lookup table this is purposefully independent of self.patch_domain_owners and does not assume the above has been run, however will return and empty dictionary if the db has not been patched yet reasons for above is that if user does not wish to update with the current json file historical data will remain consistent """ # domain_owners is both returned as well as made available to other class functions self.domain_owners = {} domain_owner_raw_data = self.sql_driver.get_all_domain_owner_data() if domain_owner_raw_data: for item in domain_owner_raw_data: # add everything to the dict self.domain_owners[item[0]] = { 'parent_id': item[1], 'owner_name': item[2], 'aliases': json.loads(item[3]), 'homepage_url': item[4], 'site_privacy_policy_urls': json.loads(item[5]), 'service_privacy_policy_urls': json.loads(item[6]), 'gdpr_statement_urls': json.loads(item[7]), 'terms_of_use_urls': json.loads(item[8]), 'platforms': json.loads(item[9]), 'uses': json.loads(item[10]), 'notes': item[11], 'country': item[12] } return self.domain_owners # get_domain_owner_dict def get_domain_owner_lineage_ids(self, id): """ for a given domain owner id, return the list which corresponds to its ownership lineage """ if self.domain_owners[id]['parent_id'] == None: return [id] else: return [id] + self.get_domain_owner_lineage_ids( self.domain_owners[id]['parent_id']) # get_domain_owner_lineage_ids def get_domain_owner_lineage_strings(self, owner_id, get_aliases=False): """ given an owner_id this function 
returns a list which is the full lineage of ownership optionally will also return aliases (e.g. 'Doubleclick' and 'Double Click') """ lineage_strings = [] for owner_id in self.get_domain_owner_lineage_ids(owner_id): lineage_strings.append( (owner_id, self.domain_owners[owner_id]['owner_name'])) if get_aliases: for alias in self.domain_owners[owner_id]['aliases']: lineage_strings.append((owner_id, alias)) return lineage_strings # get_domain_owner_lineage_strings def get_domain_owner_lineage_combined_string(self, owner_id): """ given an owner_id this function returns a single string which is the full lineage of ownership """ lineage_string = '' for item in self.get_domain_owner_lineage_strings(owner_id): lineage_string += item[1] + ' > ' return lineage_string[:-3] # get_domain_owner_lineage_combined_string def get_domain_owner_child_ids(self, id): """ for a given owner id, get all of its children/subsidiaries """ # first get all the children ids if they exist child_ids = [] for item in self.domain_owners: if self.domain_owners[item]['parent_id'] == id: child_ids.append(item) # if we have children, call recursively if len(child_ids) > 0: for child_id in child_ids: child_ids.extend(self.get_domain_owner_child_ids(child_id)) # return an empty list if no children return child_ids # get_domain_owner_child_ids def is_url_valid(self, url): """ Performs checks to verify if the url can actually be scanned. 
""" # only do http links if not (re.match('^https?://.+', url)): return False # if we can't get the url_path it is invalid try: url_path = urlsplit(url.strip().lower()).path except: return False # if we can't do idna conversion it is invalid try: idna_fixed_netloc = urlsplit( url.strip()).netloc.encode('idna').decode('utf-8') except: return False # these are common file types we want to avoid illegal_extensions = [ 'apk', 'dmg', 'doc', 'docx', 'exe', 'ics', 'iso', 'pdf', 'ppt', 'pptx', 'rtf', 'txt', 'xls', 'xlsx' ] # if we can't parse the extension it doesn't exist and is # therefore ok by our standards try: url_extension = re.search('\.([0-9A-Za-z]+)$', url_path).group(1) if url_extension in illegal_extensions: return False except: return True # it's good return True # is_url_valid def idna_encode_url(self, url, no_fragment=False): """ Non-ascii domains will crash some browsers, so we need to convert them to idna/ascii/utf-8. This requires splitting apart the url, converting the domain to idna, and pasting it all back together """ split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') if no_fragment: return urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, '')) else: return urlunsplit( (split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment)) # idna_encode_url def is_url_internal(self, origin_url, target_url): """ Given two urls (origin, target) determines if the target is internal to the origin based on subsuffix+1 domain. 
""" origin_domain = self.url_parser.get_parsed_domain_info(origin_url) target_domain = self.url_parser.get_parsed_domain_info(target_url) # we return None to signify we couldn't parse the urls if not origin_domain['success'] or not target_domain['success']: return None else: origin_domain = origin_domain['result']['domain'] target_domain = target_domain['result']['domain'] if origin_domain != target_domain: return False else: return True
def store_result(self, data): """ We've gotten data from a client, attempt to store it. """ # unpack params client_id = data['client_id'] client_ip = data['client_ip'] success = data['success'] task = data['task'] task_result = data['task_result'] # we only load the json string if it is # not a crawl if task != 'get_crawl': target = json.loads(data['target']) else: target = data['target'] # get db connection from config mapped_db = self.client_id_to_db[client_id] # create db connection if client_id in self.client_id_to_db: sql_driver = PostgreSQLDriver(mapped_db) else: return 'FAIL: client_id not in client_id_to_db list' # get config for this db config = sql_driver.get_config() # if we're not expecting this result we ignore it if not sql_driver.is_task_in_queue({'task': task, 'target': target}): return 'FAIL: task not in queue, ignoring' # if browser failed we increment attempts and log the error if success == False: print(f'π Error for {target}: %s' % {task_result}) # for times we don't want to retry, such as a rejected # redirect or network resolution failure, this could be expanded fail_cases = [ 'reached fail limit', 'rejecting redirect', 'did not find enough internal links' ] if task_result in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result: sql_driver.set_task_as_failed(target, task) else: sql_driver.unlock_task_in_queue(target, task) sql_driver.log_error({ 'client_id': client_id, 'target': target, 'task': task, 'msg': task_result }) sql_driver.close() del sql_driver return 'FAIL' # we only need to put the result in the queue, allows # us to respond to clients faster and keep the results # compressed self.server_sql_driver.add_result_to_queue({ 'client_id': client_id, 'client_ip': client_ip, 'mapped_db': mapped_db, 'target': target, 'task': task, 'task_result': task_result }) # close out db connection and send back our response sql_driver.close() del sql_driver return 'OK'
def process_tasks_from_queue(self, process_num): """ Selects the next page from the task_queue and passes to process_url. If load is unsucessful places page back into queue and updates attempts. Returns once when there are no pages in the queue under max_attempts. """ print('\t[p.%s]\tπββοΈ Starting process' % process_num) # need a local connection for each queue manager if self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() # keep getting tasks from queue until none are left at max attempt level while sql_driver.get_task_queue_length( max_attempts=self.config['max_attempts'], unlocked_only=True) != 0: # it is possible for two processes to both pass the above conditional # and then try to get a task from the queue at the same time. # however, the second process that attempts to get a task will # get an empty result (and crash), so we have a try/except block here # to handle that condition gracefully try: target, task = sql_driver.get_task_from_queue( max_attempts=self.config['max_attempts'], client_id=self.client_id) except: break print('\t[p.%s]\tπ Initializing: %s for target %s' % (process_num, task, target[:50])) # import and set up specified browser driver # note we set up a new browser each time to # get a fresh profile if self.browser_config['client_browser_type'] == 'chrome': browser_driver = ChromeDriver(self.browser_config, port_offset=process_num) else: print( f"π₯΄ INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!" 
) return # does the webxray scan or policy capture if task == 'get_scan': task_result = browser_driver.get_scan(target) elif task == 'get_crawl': task_result = browser_driver.get_crawl(json.loads(target)) elif task == 'get_policy': task_result = browser_driver.get_scan(target, get_text_only=True) elif task == 'get_random_crawl': task_result = browser_driver.get_random_crawl(target) # kill browser del browser_driver # browser has failed to get result, unlock and continue if task_result['success'] == False: print('\t[p.%s]\tπ Error: %s %s' % (process_num, target[:50], task_result['result'])) # for times we don't want to retry, such as a rejected # redirect or network resolution failure, this could be expanded fail_cases = [ 'reached fail limit', 'rejecting redirect', 'did not find enough internal links' ] if task_result[ 'result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result[ 'result']: sql_driver.set_task_as_failed(target, task) else: sql_driver.unlock_task_in_queue(target, task) # keep track of error regardless of fail/unlock sql_driver.log_error({ 'client_id': 'localhost', 'target': target, 'task': task, 'msg': task_result['result'] }) continue # debug if self.debug: print( '\t[p.%s]\tπ₯ Got browser result on task %s, going to store: %s' % (process_num, task, target[:50])) # store_result also handles task queue mangement store_result = self.store_result({ 'target': target, 'task': task, 'task_result': task_result['result'], 'client_id': self.client_id }) if store_result['success'] == True: print(f'\t[p.{process_num}]\tπ Success: {target[:50]}') else: print( f'\t[p.{process_num}]\tπ Error: {target[:50]} {store_result["result"]}' ) # tidy up sql_driver.close() del sql_driver print('\t[p.%s]\tβ Completed process' % process_num) return
def run(self, task='process_tasks_from_queue', pool_size=None): """ this function manages the parallel processing of the url list using the python Pool class the function first reads the list of urls out of the page_lists directory, cleans it for known issues (eg common binary files), and issues with idna encoding (tricky!) then the page list is mapped to the process_url function and executed in parallell pool_size is defined in the run_webxray.py file, see details there when running in slave mode the list is skipping and we got straight to scanning """ if task == 'process_tasks_from_queue': # set up sql connection to get queue_length if self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() queue_length = sql_driver.get_task_queue_length() sql_driver.close() del sql_driver print('\t----------------------------------') print('\t%s addresses will now be webXray\'d' % queue_length) print('\t\t...you can go take a walk. ;-)') print('\t----------------------------------') # for macOS (darwin) we must specify start method as 'forkserver' # this is essentially voodoo to ward off evil spirits which # appear when large pool sizes are used on macOS # get_start_method must be set to 'allow_none', otherwise upon # checking the method it gets set (!) 
- and if we then get/set again # we get an error if sys.platform == 'darwin' and multiprocessing.get_start_method( allow_none=True) != 'forkserver': multiprocessing.set_start_method('forkserver') myPool = multiprocessing.Pool(pool_size) # map requires we pass an argument to the function # (even though we don't need to), so we create # a list equal to pool_size which will # spawn the desired number of processes process_num = [] if pool_size == None: pool_size = multiprocessing.cpu_count() for i in range(0, pool_size): process_num.append(i) if task == 'process_tasks_from_queue': myPool.map(self.process_tasks_from_queue, process_num) elif task == 'store_results_from_queue': myPool.map(self.store_results_from_queue, process_num)
def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False): """ this is the primary function of this class, it takes the url of the given page and the request and cookie data generated by the browser data is cleaned up with some minor analysis (eg file types) and stored for later in-depth analysis. there is an option to store first party requests as well as third, turned on by default to save disk space turn off store_1p there is also an option to get file hashes, this introduces serious overhead and is turned off by default """ # open up a sql connection if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALED DB ENGINE FOR %s, QUITTING!' 
% db_engine) exit() # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( url) # if we can't get page domain info we fail gracefully if origin_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error(url, 'Could not parse TLD for %s' % url) return False origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0] origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1] origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2] origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3] origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld) # figure out the privacy policy url and text, starts null priv_policy_url = None priv_policy_url_text = None # read in our list of privacy link terms from the json file in webxray/resources/policyxray privacy_policy_term_list = self.utilities.get_privacy_policy_term_list( ) # we reverse links return from browser to check footer links first as that is where policy links tend to be all_links = browser_output['all_links'] all_links.reverse() # if we have links search for privacy policy if len(all_links) > 0: # links are tuple for link_text, link_url in all_links: # makes sure we have text, skip links without if link_text: # need lower for string matching link_text = link_text.lower().strip() # not a link we can use if 'javascript' in link_text: continue # see if the link_text is in our term list if link_text in privacy_policy_term_list: # if the link_url is relative this will convert to absolute priv_policy_url = self.utilities.get_absolute_url_from_page_link( url, link_url) priv_policy_url_text = link_text break # if the final page is https (often after a redirect), 
mark it appropriately if browser_output['final_url'][:5] == 'https': page_is_ssl = True else: page_is_ssl = False if store_source: # handles issue where postgres will crash on inserting null character source = browser_output['source'].replace('\x00', ' ') else: source = None # add page page_id = sql_driver.add_page( browser_output['browser_type'], browser_output['browser_version'], browser_output['browser_wait'], browser_output['title'], browser_output['meta_desc'], url, browser_output['final_url'], priv_policy_url, priv_policy_url_text, page_is_ssl, source, browser_output['load_time'], page_domain_id) # store cookies for cookie in browser_output['cookies']: # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party # note: # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http:// cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( 'http://' + cookie['domain']) # something went wrong, log and fail gracefully if cookie_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error( url, 'Error parsing cookie with domain: ' + cookie['domain']) continue # otherwise, everything went fine cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0] cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1] cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2] cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3] cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4] # mark third-party cookies if origin_domain != cookie_domain: is_3p_cookie = True else: is_3p_cookie = False # this is a first party cookie, see if we want to store it if is_3p_cookie is False and store_1p is False: continue # sql_driver.add_domain both stores the new domain and returns its id cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld) # name and domain are required, so if they fail we just continue try: name = cookie['name'] 
except: continue try: domain = cookie_domain except: continue # these are optional, fill with null values if fail try: secure = cookie['secure'] except: secure = None try: path = cookie['path'] except: path = None try: httponly = cookie['httponly'] except: httponly = None try: expiry = cookie['expiry'] except: expiry = None try: value = cookie['value'] except: value = None # all done with this cookie sql_driver.add_cookie(page_id, name, secure, path, domain, httponly, expiry, value, is_3p_cookie, cookie_domain_id) # process requests now for request in browser_output['processed_requests']: # if the request starts with the following we can't parse anyway, so skip if re.match('^(data|about|chrome|blob).+', request): continue # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( request) # problem with this request, log and fail gracefully if element_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error( url, 'Error parsing element request: ' + request) continue element_ip = element_ip_fqdn_domain_pubsuffix_tld[0] element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1] element_domain = element_ip_fqdn_domain_pubsuffix_tld[2] element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3] element_tld = element_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld) # mark third-party elements based on domain if origin_domain != element_domain: is_3p_element = True else: is_3p_element = False # if we are not storing 1p elements continue if is_3p_element is False and store_1p is False: continue if request[:5] == 'https': element_is_ssl = True else: element_is_ssl = False try: received = browser_output['processed_requests'][request][ 'received'] except: 
received = None # get domain of referer and determine if page leaked by referer try: referer = browser_output['processed_requests'][request][ 'referer'] except: referer = None if referer and len(referer) != 0: referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( referer) if referer_ip_fqdn_domain_pubsuffix_tld: if referer_ip_fqdn_domain_pubsuffix_tld[ 2] == origin_domain: page_domain_in_referer = True else: page_domain_in_referer = False else: page_domain_in_referer = None sql_driver.log_error( url, 'Error parsing referer header: ' + referer) else: page_domain_in_referer = None try: start_time_offset = browser_output['processed_requests'][ request]['start_time_offset'] except: start_time_offset = None try: load_time = browser_output['processed_requests'][request][ 'load_time'] except: load_time = None try: status = browser_output['processed_requests'][request][ 'status'] except: status = None try: status_text = browser_output['processed_requests'][request][ 'status_text'] except: status_text = None try: content_type = browser_output['processed_requests'][request][ 'content_type'] except: content_type = None try: body_size = browser_output['processed_requests'][request][ 'body_size'] except: body_size = None try: request_headers = str(browser_output['processed_requests'] [request]['request_headers']) except: request_headers = None try: response_headers = str(browser_output['processed_requests'] [request]['response_headers']) except: response_headers = None # consider anything before the "?" to be the element_url try: element_url = re.search('^(.+?)\?.+$', request).group(1) except: element_url = request # consider anything after the "?" 
to be the args try: element_args = re.search('^.+(\?.+)$', request).group(1) # start url args except: element_args = None # attempt to parse off the extension try: element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower() except: element_extension = None # lists of common extensions, can be expanded image_extensions = [ 'png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf' ] script_extensions = ['js', 'javascript'] data_extensions = ['json', 'jsonp', 'xml'] font_extentions = ['woff', 'ttf', 'otf'] static_extentions = ['html', 'htm', 'shtml'] dynamic_extentions = [ 'php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi' ] # figure out what type of element it is if element_extension in image_extensions: element_type = 'image' elif element_extension in script_extensions: element_type = 'javascript' elif element_extension in data_extensions: element_type = 'data_structured' elif element_extension == 'css': element_type = 'style_sheet' elif element_extension in font_extentions: element_type = 'font' elif element_extension in static_extentions: element_type = 'page_static' elif element_extension == dynamic_extentions: element_type = 'page_dynamic' elif element_extension == 'swf' or element_extension == 'fla': element_type = 'Shockwave Flash' else: element_type = None # file hashing has non-trivial overhead and off by default # # what this does is uses the same ua/referer as the actual request # so we are just replaying the last one to get similar response # note that we aren't sending the same cookies so that could be an issue # otherwise it is equivalent to a page refresh in theory # option to hash only 3p elements observed here if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False): replay_element_request = urllib.request.Request( request, headers={ 'User-Agent': browser_output['processed_requests'][request] ['user_agent'], 'Referer': referer, 'Accept': '*/*' }) try: 
file_md5 = hashlib.md5( urllib.request.urlopen(replay_element_request, timeout=10).read()).hexdigest() except: file_md5 = None else: file_md5 = None # store request sql_driver.add_element( page_id, request, element_url, is_3p_element, element_is_ssl, received, referer, page_domain_in_referer, start_time_offset, load_time, status, status_text, content_type, body_size, request_headers, response_headers, file_md5, element_extension, element_type, element_args, element_domain_id) # close db connection sql_driver.close() return True
class OutputStore: """ This class receives data from the browser, processes it, and stores it in the db """ def __init__(self, db_name, db_engine): self.db_name = db_name self.utilities = Utilities() self.url_parser = ParseURL() self.debug = False if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver(self.db_name) elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() self.config = self.sql_driver.get_config() # __init__ def close(self): """ Just to make sure we close the db connection. """ self.sql_driver.close() # close def store_scan(self, params): """ This function pre-processes data from the browser, inserts it into database, and handles linking various entries across tables. """ # unpack params browser_output = params['browser_output'] client_id = params['client_id'] crawl_id = params['crawl_id'] crawl_timestamp = params['crawl_timestamp'] crawl_sequence = params['crawl_sequence'] # client_ip is optional if 'client_ip' in params: client_ip = params['client_ip'] else: client_ip = None if self.debug: print('going to store scan %s' % browser_output['start_url']) # keep track of domains page_3p_cookie_domains = set() page_3p_dom_storage_domains = set() page_3p_request_domains = set() page_3p_response_domains = set() page_3p_websocket_domains = set() # convert from timestamp to datetime object that will go to the db accessed = datetime.fromtimestamp(browser_output['accessed']) # first make sure we don't have it already if self.sql_driver.page_exists(browser_output['start_url'],accessed): return {'success': False, 'result': 'exists in db already'} # if we have no responses the page didn't load at all and we skip # unless we are using basic driver and then it's ok if len(browser_output['responses']) == 0 and browser_output['browser_type'] != 'basic': return {'success': 
False, 'result': 'no responses received'} # ignore any malformed unicode characters page_source = browser_output['page_source'].encode('utf-8', 'ignore').decode() # store source if self.config['store_source']: if self.debug: print('going to store source %s' % browser_output['start_url']) page_source_md5 = self.store_file(page_source, False, 'page_source') else: page_source_md5 = None # store readability_html if self.config['store_page_text'] and browser_output['page_text']: if self.debug: print('going to store readability_html') # ignore any malformed unicode characters readability_html = browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip() readability_source_md5 = self.store_file(readability_html, False, 'readability_html') # store_page_text handles some addition operations if self.debug: print('going to store page_text') page_text_id = self.store_page_text(readability_html,readability_source_md5) else: page_text_id = None # process info on the start_url domain if self.debug: print('going to parse start/final_url %s' % browser_output['start_url']) start_url = browser_output['start_url'] start_url_domain_info = self.url_parser.get_parsed_domain_info(start_url) if start_url_domain_info['success'] == False: err_msg = 'unable to parse start_url_domain_info info for %s with error %s' % (browser_output['start_url'], start_url_domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) return {'success': False, 'result': 'could not parse start_url'} else: # needed for comparisons later on start_url_domain = start_url_domain_info['result']['domain'] # add start_url domain and get id start_url_domain_id = self.sql_driver.add_domain(start_url_domain_info['result']) # process info on the final_url domain # note: we use the final_url domain as the benchmark for determine 1p/3p final_url = browser_output['final_url'] final_url_domain_info = 
self.url_parser.get_parsed_domain_info(final_url) if final_url_domain_info['success'] == False: err_msg = 'unable to parse final_url_domain_info info for %s with error %s' % (browser_output['final_url'], final_url_domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) return {'success': False, 'result': 'could not parse final_url'} else: final_url_domain = final_url_domain_info['result']['domain'] # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id final_url_domain_id = self.sql_driver.add_domain(final_url_domain_info['result']) # check if the page has redirected to a new domain if start_url_domain != final_url_domain: page_domain_redirect = True else: page_domain_redirect = False # this is semi-redundant but ensures that any config changes made while # a result is queued are followed if self.config['client_reject_redirects'] and page_domain_redirect: return {'success': False, 'result': 'rejecting redirect'} # if the final page is https (often after a redirect), mark it appropriately if browser_output['final_url'][:5] == 'https': page_is_ssl = True else: page_is_ssl = False # (optionally) process and store links, this allows us to go back later and do deeper scans # as well as do more with policies # links starts as empty list links = [] # keep track of link counts as helpful for filtering pages link_count_internal = 0 link_count_external = 0 if self.config['store_links']: if self.debug: print('going to process links %s' % browser_output['start_url']) # we use the list of policy_link_terms to flag that a link *might* # be for a policy, we check if it actually is policy in PolicyCollector.py policy_link_terms = self.utilities.get_policy_link_terms() # process links, duplicates get ignored by db for link in browser_output['all_links']: # skip if href not valid if not 
self.utilities.is_url_valid(link['href']): continue # unpack values and catch any unicode errors link_text = link['text'].encode('utf-8', 'ignore').decode() link_url = link['href'].encode('utf-8', 'ignore').decode() # get rid of trailing # and / if link_url.strip()[-1:] == '#': link_url = link_url.strip()[:-1] if link_url.strip()[-1:] == '/': link_url = link_url.strip()[:-1] # sometimes the text will be a dict (very rarely) # so we convert to string link_text = str(link_text).strip() # clean up white space and remove line breaks link_text = re.sub('\n|\r|\t|\s+',' ',link_text.strip()) link_url = re.sub('\n|\r|\t|\s+',' ',link_url.strip()) # catch nulls link_text = link_text.replace('\x00','NULL_REPLACED_FOR_PSQL') link_url = link_url.replace('\x00','NULL_REPLACED_FOR_PSQL') # update counts if link['internal']: link_count_internal += 1 else: link_count_external += 1 # flag links that could be policies, default False link_is_policy = False # determine if a policy term appears in the link for policy_term in policy_link_terms: if policy_term in link_text.lower(): link_is_policy = True break link_domain_info = self.url_parser.get_parsed_domain_info(link_url) if link_domain_info['success'] == False: # don't bother with storing errors link_domain_id = None else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id link_domain_id = self.sql_driver.add_domain(link_domain_info['result']) links.append({ 'url' : link_url, 'text' : link_text, 'is_internal' : link['internal'], 'is_policy' : link_is_policy, 'domain_id' : link_domain_id }) # if we got the screen shot we get the hash and store it to the file table screen_shot_md5 = None if browser_output['screen_shot'] and self.config['store_screen_shot']: if self.debug: print('going to store screen shot %s' % browser_output['start_url']) # store file to get md5 screen_shot_md5 = self.store_file(browser_output['screen_shot'],True,'screen_shot') # if 
we have timestamp it is also an 'accessed' field from # a page load so we convert that as well if crawl_timestamp: crawl_timestamp = datetime.fromtimestamp(crawl_timestamp) # ignore any malformed unicode characters if browser_output['title']: browser_output['title'] = browser_output['title'].encode('utf-8', 'ignore').decode() if browser_output['meta_desc']: browser_output['meta_desc'] = browser_output['meta_desc'].encode('utf-8', 'ignore').decode() if browser_output['lang']: browser_output['lang'] = browser_output['lang'].encode('utf-8', 'ignore').decode() # now we know link counts we can store the page if self.debug: print('going to store page %s' % browser_output['start_url']) page_id = self.sql_driver.add_page({ 'accessed' : accessed, 'browser_type' : browser_output['browser_type'], 'browser_version' : browser_output['browser_version'], 'browser_prewait' : browser_output['prewait'], 'browser_no_event_wait' : browser_output['no_event_wait'], 'browser_max_wait' : browser_output['max_wait'], 'page_load_strategy' : browser_output['page_load_strategy'], 'title' : browser_output['title'], 'meta_desc' : browser_output['meta_desc'], 'lang' : browser_output['lang'], 'start_url' : browser_output['start_url'], 'final_url' : browser_output['final_url'], 'is_ssl' : page_is_ssl, 'page_domain_redirect' : page_domain_redirect, 'link_count_internal' : link_count_internal, 'link_count_external' : link_count_external, 'load_time' : browser_output['load_time'], 'start_url_domain_id' : start_url_domain_id, 'final_url_domain_id' : final_url_domain_id, 'client_id' : client_id, 'client_timezone' : browser_output['client_timezone'], 'client_ip' : client_ip, 'page_text_id' : page_text_id, 'screen_shot_md5' : screen_shot_md5, 'page_source_md5' : page_source_md5, 'crawl_id' : crawl_id, 'crawl_timestamp' : crawl_timestamp, 'crawl_sequence' : crawl_sequence }) # STORE LINKS if self.config['store_links']: if self.debug: print('going to store links %s' % browser_output['start_url']) for link 
in links: link_id = self.sql_driver.add_link(link) if link_id: self.sql_driver.join_link_to_page(page_id,link_id) # PROCESS DOM_STORAGE if self.config['store_dom_storage']: if self.debug: print('going to process dom storage %s' % browser_output['start_url']) for dom_storage in browser_output['dom_storage']: # parse domain from the security_origin, which is equivalent to a url domain_info = self.url_parser.get_parsed_domain_info(dom_storage['security_origin']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (dom_storage['security_origin'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id dom_storage['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party storage if final_url_domain != domain_info['result']['domain']: dom_storage['is_3p'] = True else: dom_storage['is_3p'] = False # key to page dom_storage['page_id'] = page_id # replace null b/c postgres will die otherwise dom_storage['key'] = dom_storage['key'].replace('\x00','NULL_REPLACED_FOR_PSQL') dom_storage['value'] = dom_storage['value'].replace('\x00','NULL_REPLACED_FOR_PSQL') # there types of illegal utf-8 characters that psql doesn't like, eg trying to store # '\uded5' gives this error when storing in psql: # 'UnicodeEncodeError: 'utf-8' codec can't encode character '\uded5' in position 0: surrogates not allowed' # # to overcome the above, we use python's backslashreplace to keep the original data in # a way that won't cause our queries to die # see https://docs.python.org/3/library/codecs.html#error-handlers dom_storage['key'] = dom_storage['key'].encode('utf-8','backslashreplace') dom_storage['value'] = 
dom_storage['value'].encode('utf-8','backslashreplace') # now that we've encoded with backslashes we decode to get the semi-original data dom_storage['key'] = dom_storage['key'].decode('utf-8') dom_storage['value'] = dom_storage['value'].decode('utf-8') # all done with this item self.sql_driver.add_dom_storage(dom_storage) # update domains if dom_storage['is_3p']: page_3p_dom_storage_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # PROCESS LOAD FINISH if self.debug: print('going to process load finish data %s' % browser_output['start_url']) load_finish_data = {} for load_finish_event in browser_output['load_finish_events']: load_finish_data[load_finish_event['request_id']] = load_finish_event['encoded_data_length'] # RESPONSE EXTRA HEADERS if self.debug: print('going to process response extra header data %s' % browser_output['start_url']) http_cookies = [] internal_id_to_resp_ex_headers = {} for response_extra_header in browser_output['response_extra_headers']: response_extra_header['page_id'] = page_id response_extra_header['cookies_set'] = None # to check for domain leakage in headers we make a big string keyed to the internal id if response_extra_header['request_id'] not in internal_id_to_resp_ex_headers: internal_id_to_resp_ex_headers[response_extra_header['request_id']] = str(response_extra_header['headers']) else: internal_id_to_resp_ex_headers[response_extra_header['request_id']] += str(response_extra_header['headers']) for item in response_extra_header['headers']: if item.lower() == 'set-cookie': response_extra_header['cookies_set'] = response_extra_header['headers'][item] # when we add cookies later on we mark those that came from response headers, # note we try/pass on this in case we can't parse for cookie in response_extra_header['cookies_set'].split('\n'): if 'domain' in cookie.lower(): try: name = re.match('^(.+?)=',cookie)[0][:-1] domain = re.match('^.+domain=(.+?)(;|$)',cookie.lower())[1] if domain[0] == '.': 
domain = domain[1:] http_cookies.append((domain,name)) except: pass if self.config['store_response_xtra_headers']: self.sql_driver.add_response_extra_header(response_extra_header) # PROCESS RESPONSES response_received_req_ids = [] if self.debug: print('going to process response data %s' % browser_output['start_url']) for response in browser_output['responses']: # defaut values that may get over-written response['file_md5'] = None response['is_data'] = False response['is_3p'] = None response['is_ssl'] = None response['page_domain_in_headers'] = False # first handle non-http urls and optionally store content if re.match('^(data|about|chrome|blob|javascript).+', response['url']): if 'base64' in response['url'].lower() or 'image' in response['type'].lower(): is_base64 = True else: is_base64 = False # store_file follows the config as far as actually storing the file goes # and will either return the md5 or None # make sure we're following our configuration if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): response['file_md5'] = self.store_file(response['url'],is_base64,response['type']) else: response['file_md5'] = None response['url'] = None response['is_data'] = True response['domain_id'] = None else: # parse, store, and get id of domain; if fails skip domain_info = self.url_parser.get_parsed_domain_info(response['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (response['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: response_domain = domain_info['result']['domain'] response['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # now add ip if response['remote_ip_address']: self.sql_driver.add_domain_ip_addr(response['domain_id'],response['remote_ip_address']) # mark third-party responses based on final_url domain if 
response_domain != final_url_domain: response['is_3p'] = True else: response['is_3p'] = False # determine if encrypted if response['url'][:5] == 'https' or response['url'][:3] == 'wss': response['is_ssl'] = True else: response['is_ssl'] = False # keep track of the request ids of each reponse to mark as received response_received_req_ids.append(response['request_id']) # we do no more processing at this point if not self.config['store_responses']: continue # lower case the type, simplifies db queries response['type'] = response['type'].lower() # store the security details if they exist if response['security_details'] and self.config['store_security_details']: response['security_details_id'] = self.sql_driver.add_security_details(response['security_details']) else: response['security_details_id'] = None # store the size of the request if response['request_id'] in load_finish_data: response['final_data_length'] = load_finish_data[response['request_id']] else: response['final_data_length'] = None # parse off args/etc # consider anything before the "?" to be the element_url try: response['base_url'] = re.search('^(.+?)\?.+$', response['url']).group(1) except: response['base_url'] = response['url'] # attempt to parse off the extension try: response['extension'] = re.search('\.([0-9A-Za-z]+)$', response['base_url']).group(1).lower() except: response['extension'] = None # First see if this request_id is present in response_bodies, and if # the entry is not None, then we store it to the db if config says to. 
if response['request_id'] in browser_output['response_bodies']: if browser_output['response_bodies'][response['request_id']]: # make sure we're following our configuration is_base64 = browser_output['response_bodies'][response['request_id']]['is_base64'] if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): response['file_md5'] = self.store_file( browser_output['response_bodies'][response['request_id']]['body'], is_base64, response['type'] ) else: response['file_md5'] = None # link to page response['page_id'] = page_id # parse data headers, accounts for upper/lower case variations (eg 'set-cookie', 'Set-Cookie') response['content_type'] = None response['cookies_set'] = None for item in response['response_headers']: if item.lower() == 'content-type': response['content_type'] = response['response_headers'][item] if item.lower() == 'set-cookie': response['cookies_set'] = response['response_headers'][item] # if we have request_headers look for cookies sent response['cookies_sent'] = None if response['request_headers']: for item in response['request_headers']: if item.lower() == 'cookie': response['cookies_sent'] = response['request_headers'][item] # parse referer header response['referer'] = None for item in response['response_headers']: if item.lower() == 'referer': response['referer'] = response['response_headers'][item] # check if domain leaked in referer if response['request_id'] in internal_id_to_resp_ex_headers: if final_url_domain in internal_id_to_resp_ex_headers[response['request_id']]: response['page_domain_in_headers'] = True # convert from timestamp to datetime object that will go to the db response['timestamp'] = datetime.fromtimestamp(response['timestamp']) # store self.sql_driver.add_response(response) # update domains if response['is_3p']: page_3p_response_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # REQUEST EXTRA HEADERS if self.debug: print('going to process request extra 
headers data %s' % browser_output['start_url']) internal_id_to_req_ex_headers = {} for request_extra_header in browser_output['request_extra_headers']: request_extra_header['page_id'] = page_id request_extra_header['cookies_sent'] = None # to check for domain leakage in headers we make a big string keyed to the internal id if request_extra_header['request_id'] not in internal_id_to_req_ex_headers: internal_id_to_req_ex_headers[request_extra_header['request_id']] = str(request_extra_header['headers']) else: internal_id_to_req_ex_headers[request_extra_header['request_id']] += str(request_extra_header['headers']) for item in request_extra_header['headers']: if item.lower() == 'cookie': request_extra_header['cookies_sent'] = request_extra_header['headers'][item] if self.config['store_request_xtra_headers']: self.sql_driver.add_request_extra_header(request_extra_header) # PROCESS REQUESTS if self.config['store_requests']: if self.debug: print('going to process request data %s' % browser_output['start_url']) for request in browser_output['requests']: # defaut values that may get over-written request['file_md5'] = None request['is_data'] = False request['is_3p'] = None request['is_ssl'] = None request['page_domain_in_headers'] = False # first handle non-http urls and optionally store content if re.match('^(data|about|chrome|blob|javascript).+', request['url']): if 'base64' in request['url'].lower() or 'image' in request['url'].lower(): is_base64 = True else: is_base64 = False # store_file follows the config as far as actually storing the file goes # and will either return the md5 or None # make sure we're following our configuration if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): request['file_md5'] = self.store_file(request['url'],is_base64,request['type']) else: request['file_md5'] = None request['url'] = None request['is_data'] = True request['domain_id'] = None else: # parse, store, and get id of domain; if fails skip domain_info 
= self.url_parser.get_parsed_domain_info(request['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (request['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: request_domain = domain_info['result']['domain'] request['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark third-party requests based on final_url domain if request_domain != final_url_domain: request['is_3p'] = True else: request['is_3p'] = False # determine if encrypted if request['url'][:5] == 'https' or request['url'][:3] == 'wss': request['is_ssl'] = True else: request['is_ssl'] = False # replace null b/c postgres will die otherwise if request['post_data']: request['post_data'] = request['post_data'].replace('\x00','NULL_REPLACED_FOR_PSQL') # consider anything after the "?" to be the GET data try: get_string = re.search('^.+\?(.+)$', request['url']).group(1) get_string = get_string.replace('\x00','NULL_REPLACED_FOR_PSQL') get_data = {} for key_val in get_string.split('&'): get_data[key_val.split('=')[0]] = key_val.split('=')[1] request['get_data'] = json.dumps(get_data) except: request['get_data'] = None # mark if response received if request['request_id'] in response_received_req_ids: request['response_received'] = True else: request['response_received'] = None # mark if the loading finished if request['request_id'] in load_finish_data: request['load_finished'] = True else: request['load_finished'] = None # lower case the type, simplifies db queries if request['type']: request['type'] = request['type'].lower() # parse off args/etc # consider anything before the "?" 
to be the element_url try: request['base_url'] = re.search('^(.+?)\?.+$', request['url']).group(1) except: request['base_url'] = request['url'] # attempt to parse off the extension try: request['extension'] = re.search('\.([0-9A-Za-z]+)$', request['base_url']).group(1).lower() except: request['extension'] = None # link to page request['page_id'] = page_id # parse referer header request['referer'] = None for item in request['headers']: if item.lower() == 'referer': request['referer'] = request['headers'][item] # check if domain leaked in headers if request['request_id'] in internal_id_to_req_ex_headers: if final_url_domain in internal_id_to_req_ex_headers[request['request_id']]: request['page_domain_in_headers'] = True # convert from timestamp to datetime object that will go to the db request['timestamp'] = datetime.fromtimestamp(request['timestamp']) # all done self.sql_driver.add_request(request) # update domains if request['is_3p']: page_3p_request_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # PROCESS WEBSOCKETS if self.config['store_websockets']: if self.debug: print('going to process websocket data %s' % browser_output['start_url']) ws_id_map = {} for websocket in browser_output['websockets']: domain_info = self.url_parser.get_parsed_domain_info(websocket['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (websocket['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id websocket['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party connection if final_url_domain != domain_info['result']['domain']: websocket['is_3p'] = True else: websocket['is_3p'] = False 
websocket['page_id'] = page_id this_websocket_id = self.sql_driver.add_websocket(websocket) # update domains if websocket['is_3p']: page_3p_websocket_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) if websocket['request_id'] not in ws_id_map: ws_id_map[websocket['request_id']] = this_websocket_id else: print('ERROR WS_REQ_ID ALREADY IN MAP') # PROCESS WEBSOCKET EVENTS if self.config['store_websockets'] and self.config['store_websocket_events']: for websocket_event in browser_output['websocket_events']: websocket_event['page_id'] = page_id if websocket_event['request_id'] in ws_id_map: websocket_event['websocket_id'] = ws_id_map[websocket_event['request_id']] else: websocket_event['websocket_id'] = None # convert from timestamp to datetime object that will go to the db websocket_event['timestamp'] = datetime.fromtimestamp(websocket_event['timestamp']) self.sql_driver.add_websocket_event(websocket_event) # PROCESS EVENT SOURCE MSGS if self.config['store_event_source_msgs']: if self.debug: print('going to process event source data %s' % browser_output['start_url']) for event_source_msg in browser_output['event_source_msgs']: event_source_msg['page_id'] = page_id # convert from timestamp to datetime object that will go to the db event_source_msg['timestamp'] = datetime.fromtimestamp(event_source_msg['timestamp']) self.sql_driver.add_event_source_msg(event_source_msg) # PROCESS COOKIES if self.config['store_cookies']: if self.debug: print('going to process cookies %s' % browser_output['start_url']) for cookie in browser_output['cookies']: # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party # note: # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http:// # parse domain from the security_origin, which is equivalent to a url domain_info = self.url_parser.get_parsed_domain_info('http://'+cookie['domain']) if domain_info['success'] 
== False: err_msg = 'unable to parse domain info for %s with error %s' % (cookie['domain'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id cookie['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party cookie if final_url_domain != domain_info['result']['domain']: cookie['is_3p'] = True else: cookie['is_3p'] = False # key to page cookie['page_id'] = page_id # fix var names cookie['http_only'] = cookie['httpOnly'] # attempt to convert cookie expiry from timestamp to datetime object, note we # need try/except as python datetime object cannot have year > 9999 and some # cookies do that cookie['expires_timestamp'] = None if cookie['expires']: try: cookie['expires_timestamp'] = datetime.fromtimestamp(cookie['expires']) except: pass # this is optional, do fall-back if 'sameSite' in cookie: cookie['same_site'] = cookie['sameSite'] else: cookie['same_site'] = None # see if this cookie was set via http response if cookie['domain'][0] == '.': cookie_tuple = (cookie['domain'][1:],cookie['name']) else: cookie_tuple = (cookie['domain'],cookie['name']) if cookie_tuple in http_cookies: cookie['is_set_by_response'] = True else: cookie['is_set_by_response'] = False # all done with this cookie self.sql_driver.add_cookie(cookie) # update domains if cookie['is_3p']: page_3p_cookie_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) if self.debug: print('done storing scan %s' % browser_output['start_url']) return { 'success' : True, 'page_id' : page_id, 'page_3p_request_domains' : page_3p_request_domains, 'page_3p_response_domains' : page_3p_response_domains, 'page_3p_websocket_domains' : page_3p_websocket_domains, 'page_3p_dom_storage_domains' : 
page_3p_dom_storage_domains,
			'page_3p_cookie_domains'		: page_3p_cookie_domains
		}
	# store_scan

	def store_file(self,body,is_base64,type):
		"""
		Hash the file body with md5, store it to the file table via
		self.sql_driver.add_file, and return the md5 string.

		body      : file contents as a str (NOTE: hash is computed on the
		            original data; nulls are stripped only for storage)
		is_base64 : flag indicating the body is base64-encoded content
		type      : content type label (lower-cased before storage);
		            NOTE(review): parameter name shadows the 'type' builtin

		Returns the md5 hex digest, or None when config disallows storage.
		"""
		# in theory we shouldn't get here if it is base64, so this is a fail-safe check
		if not self.config['store_base64']:
			if is_base64 or type.lower()=='image':
				return None

		# note hash is on original data, which we modify to remove \x00 before we store
		file_md5 = hashlib.md5(body.encode()).hexdigest()

		# store to db, note query will be ignored on conflict
		# but since we calculate the md5 as above that is fine
		self.sql_driver.add_file({
			'md5'		: file_md5,
			'body'		: body.replace('\x00','NULL_REPLACED_FOR_PSQL'),
			'type'		: type.lower(),
			'is_base64'	: is_base64
		})
		return file_md5
	# store_file

	def store_policy(self, browser_output, client_id, client_ip=None):
		"""
		We attempt to figure out if the text provided is a policy,
		if so we store it to the database.

		browser_output : dict produced by a browser driver; this method reads
		                 readability_html, page_text, page_source, title,
		                 meta_desc, lang, start_url, final_url, browser_type,
		                 browser_version, and prewait keys
		client_id      : identifier of the client that did the scan
		client_ip      : optional ip of the client

		Returns a dict with 'success' True/False and, on failure, a 'result'
		string describing why the text was rejected.
		"""

		# keep values in a dict here
		policy = {}

		# attempt to get_policy was a success, extract data from
		# dict, since postgres cannot handle '\x00' we convert to
		# string for several fields and use .replace('\x00',' ') to
		# clean the input
		policy['client_id']			= client_id
		policy['client_ip']			= client_ip
		policy['browser_type']		= browser_output['browser_type']
		policy['browser_version']	= browser_output['browser_version']
		policy['browser_prewait']	= browser_output['prewait']
		policy['start_url']			= browser_output['start_url']
		policy['final_url']			= browser_output['final_url']
		policy['title']				= browser_output['title']
		policy['meta_desc']			= browser_output['meta_desc']
		policy['lang']				= browser_output['lang']

		# fields below are filled in only if we decide this is a policy
		policy['fk_score']			= None
		policy['fre_score']			= None
		policy['word_count']		= None
		policy['type']				= None
		policy['match_term']		= None
		policy['match_text']		= None
		policy['match_text_type']	= None
		policy['confidence']		= None
		policy['page_text_id']		= None
		policy['page_source_md5']	= None

		# if readability failed we bail
		if not browser_output['readability_html'] or not browser_output['page_text']:
			self.sql_driver.close()
			return { 'success'	: False, 'result'	: 'No readability result' }

		# ignore any malformed unicode characters
		readability_html 	= browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip()
		page_text 			= browser_output['page_text'].encode('utf-8', 'ignore').decode().strip()
		page_source 		= browser_output['page_source'].encode('utf-8', 'ignore').decode()

		# bail on empty text
		if len(page_text) == 0:
			self.sql_driver.close()
			return { 'success'	: False, 'result'	: 'Empty page text' }

		# load the source into lxml so we can do additional processing,
		# if we fail we bail
		# NOTE(review): unlike the other early returns in this method, this
		# failure path does not call self.sql_driver.close() — possible
		# connection leak, confirm whether that is intentional
		try:
			lxml_doc = lxml.html.fromstring(readability_html)
		except:
			return ({ 'success': False, 'result': 'Could not parse readability_html with lxml' })

		# if the text is less than 500 words we ignore it
		if len(page_text.split(' ')) < 500:
			self.sql_driver.close()
			return { 'success'	: False, 'result'	: 'Page text < 500 words' }

		# once we have the text we figure out if it is
		# a policy, start false, override on match
		is_policy = False

		# first look for matches on page title
		# we give this confidence of 100 as it is
		# definitely a match
		if policy['title']:
			policy_type_result = self.determine_policy_type_from_text(policy['title'])
			if policy_type_result['success'] == True:
				is_policy 					= True
				policy['type'] 				= policy_type_result['result']['policy_type']
				policy['match_term']		= policy_type_result['result']['match_term']
				policy['match_text']		= policy_type_result['result']['match_text']
				policy['match_text_type']	= 'title'
				policy['confidence']		= 100

		# deep checks may generate false positives so
		# they have confidence of 0 until they can
		# be verified, note we may do this here
		# or later on
		deep_checks = True
		if deep_checks:
			policy['confidence'] = 0

			# convert the url path to a sentence by replacing
			# common delimiters with spaces and attempt matches
			# NOTE(review): the char class '[-|_|/|\.]' also matches a
			# literal '|' — harmless here but likely unintended
			if self.debug: print('going to do checks on url path')
			if not is_policy:
				url_path_string = re.sub('[-|_|/|\.]',' ',urlsplit(policy['start_url']).path)
				if len(url_path_string) > 0:
					policy_type_result = self.determine_policy_type_from_text(url_path_string)
					if policy_type_result['success'] == True:
						is_policy 					= True
						policy['type'] 				= policy_type_result['result']['policy_type']
						policy['match_term']		= policy_type_result['result']['match_term']
						policy['match_text']		= policy_type_result['result']['match_text']
						policy['match_text_type']	= 'url_path'

			if self.debug: print('going to do checks on meta desc')
			if not is_policy and policy['meta_desc']:
				policy_type_result = self.determine_policy_type_from_text(policy['meta_desc'])
				if policy_type_result['success'] == True:
					is_policy 					= True
					policy['type'] 				= policy_type_result['result']['policy_type']
					policy['match_term']		= policy_type_result['result']['match_term']
					policy['match_text']		= policy_type_result['result']['match_text']
					policy['match_text_type']	= 'meta_desc'

			# iterate over all types of heading tags to extract text
			# and check for policy matches. note we go in order of
			# importance (eg h1->h7->span,etc)
			if self.debug: print('going to do checks on heading tags')
			if not is_policy:
				for tag_type in ['h1','h2','h3','h4','h5','h6','h7','span','strong','em']:
					if is_policy: break
					tags = lxml_doc.cssselect(tag_type)
					if len(tags) > 0:
						for tag in tags:
							tag_text = tag.text_content()
							# if it is > 15 words it is likely not a heading
							if len(tag_text.split(' ')) > 15: break
							policy_type_result = self.determine_policy_type_from_text(tag_text)
							if policy_type_result['success'] == True:
								is_policy 					= True
								policy['type'] 				= policy_type_result['result']['policy_type']
								policy['match_term']		= policy_type_result['result']['match_term']
								policy['match_text']		= policy_type_result['result']['match_text']
								policy['match_text_type']	= tag_type

		# if it is a policy we do additional processing
		# before storing in db, otherwise we fail
		# gracefully
		if is_policy:
			if self.debug: print('going to store readability_html')
			readability_source_md5 = self.store_file(readability_html, False, 'readability_html')

			if self.debug: print('going to store page_text')
			# store_page_text handles some addition operations
			# NOTE(review): this debug print is duplicated immediately above
			if self.debug: print('going to store page_text')
			policy['page_text_id'] = self.store_page_text(readability_html, readability_source_md5)

			if self.debug: print(f"page_text_id is {policy['page_text_id']}")

			if self.debug: print('going to store page_source')
			policy['page_source_md5'] = self.store_file(page_source, False, 'page_source')

			if self.debug: print('going to do reading ease scores')
			# get readability scores, scores below zero are
			# invalid so we null them
			policy['fre_score'] = textstat.flesch_reading_ease(page_text)
			if policy['fre_score'] <= 0: policy['fre_score'] = None
			policy['fk_score']  = textstat.flesch_kincaid_grade(page_text)
			if policy['fk_score'] <= 0: policy['fk_score'] = None

			if self.debug: print('going to store policy')
			# add to db and get id for this policy
			policy_id = self.sql_driver.add_policy(policy)

			if self.debug: print('going to link policy to pages')
			# attach policy to all links with this url, not we can filter
			# do only do internal links
			for page_id, crawl_id in self.sql_driver.get_page_ids_from_link_url(policy['start_url'],internal_links_only=True):
				self.sql_driver.attach_policy_to_page(policy_id,page_id)
				self.sql_driver.attach_policy_to_crawl(policy_id,crawl_id)

			if self.debug: print(f'\tΓ°ΕΈβοΏ½ Success: {policy["start_url"]}')
			self.sql_driver.close()
			return {'success': True}
		else:
			if self.debug: print(f'\tΓ°ΕΈβΕ½ Fail: {policy["start_url"]}')
			self.sql_driver.close()
			return { 'success': False, 'result': 'Not policy' }
	# store_policy

	def determine_policy_type_from_text(self, text):
		"""
		Determine if a given text fragment indicates
		a given type of policy.

		text : arbitrary text fragment (title, heading, url path, etc)

		Returns dict: {'success': True, 'result': {'policy_type', 'match_term',
		'match_text'}} on the first term match, otherwise {'success': False}.
		Match order across policy types is randomized on each call.
		"""

		# clear whitespace
		text = re.sub('\s+',' ',text)

		# retrieve values from policy_terms.json
		policy_verification_terms = self.utilities.get_policy_verification_terms()

		policy_type_keys = []
		for key in policy_verification_terms:
			policy_type_keys.append(key)

		# randomize the order we do our checks
		random.shuffle(policy_type_keys)

		# look for matches against verification terms
		for policy_type in policy_type_keys:
			for term in policy_verification_terms[policy_type]:
				if term in text.lower():
					return({
						'success'	: True,
						'result'	:{
							'policy_type'	: policy_type,
							'match_term'	: term,
							'match_text'	: text
						}
					})

		# no match
		return ({'success': False})
	# determine_policy_type_from_text

	def store_page_text(self,readability_html,readability_source_md5):
		"""
		Strip markup from readability html, normalize the text, store it via
		self.sql_driver.add_page_text, and return the resulting row id.

		readability_html       : html string produced by readability
		readability_source_md5 : md5 of the stored readability html file
		"""
		# the actual 'page_text' output from readability doesn't properly seperate words
		# that use markup as a space. eg '<h3>this</h3><p>that</p>' becomes 'thisthat'
		# whereas 'this that' is what a user would see in the browser
		# to overcome the above issue we have to manually strip out html and do some
		# cleaning of our own.
		# NOTE(review): '<!--.+-->' and '<svg.+</svg>' are greedy and can eat
		# everything between two comments/svgs on the same line; '[\n|\r]'
		# also matches a literal '|' — confirm these are acceptable
		page_text = re.sub('<!--.+-->',' ', readability_html)
		page_text = re.sub('<svg.+</svg>',' ', page_text)
		page_text = re.sub('<.+?>', ' ', page_text)
		page_text = re.sub('[\n|\r]', ' ', page_text)
		page_text = re.sub('\s+', ' ', page_text)
		page_text = unicodedata.normalize('NFKD',html.unescape(page_text.strip()))

		# postgres can't handle nulls
		page_text = page_text.replace('\x00','NULL_REPLACED_FOR_PSQL')

		# return the id
		# NOTE(review): the second .replace('\x00',' ') below is a no-op since
		# nulls were already replaced above
		return self.sql_driver.add_page_text({
			'text'						: page_text.replace('\x00',' '),
			'word_count'				: len(page_text.split()),
			'readability_source_md5' 	: readability_source_md5
		})
class Analyzer:
	"""
	This class performs analysis of our data.
	"""

	def __init__(self,db_name,db_engine, flush_domain_owners):
		"""
		Wire up the db connection, cache expensive global counts, and
		preload the domain-owner and crawl-to-3p-domain lookups that
		nearly every report method relies on.
		"""

		# set up global db connection
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		# these get reused frequently, minimize db calls by doing it up here
		self.total_pages = self.sql_driver.get_complex_page_count()
		self.total_crawls = self.sql_driver.get_crawl_count()

		# pass utilities the database info
		self.utilities = Utilities(db_name,db_engine)

		# initialize the domain owner dict
		self.domain_owners = self.utilities.get_domain_owner_dict()

		# update domain owners
		if flush_domain_owners:
			self.patch_domain_owners()

		# load to memory for faster processing, make sure you
		# have enough RAM!
		self.get_crawl_id_to_3p_domain_info()
	# __init__

	def get_crawl_id_to_3p_domain_info(self):
		"""
		Many operations needed to access a mapping of crawl_ids to the
		domain name and domain_owner_ids of all types of data (requests,
		responses, cookies, and websockets).  To save db calls we set up
		a massive dictionary once to be reused later.
		"""
		print('\tFetching crawl 3p domain lookup info...', end='', flush=True)

		# this is a class global
		self.crawl_id_to_3p_domain_info = {}

		for crawl_id,domain,domain_owner_id in self.sql_driver.get_crawl_id_3p_domain_info():
			# first domain for a crawl starts a fresh list, later
			# 	domains are appended via list concatenation
			if crawl_id not in self.crawl_id_to_3p_domain_info:
				self.crawl_id_to_3p_domain_info[crawl_id] = [{'domain':domain,'owner_id':domain_owner_id}]
			else:
				self.crawl_id_to_3p_domain_info[crawl_id] = self.crawl_id_to_3p_domain_info[crawl_id] + [{'domain':domain,'owner_id':domain_owner_id}]

		print('done!')
	# get_crawl_id_to_3p_domain_info

	def patch_domain_owners(self):
		"""
		in order to analyze what entities receive user data, we need to
		update the database with domain ownership records we have stored
		previously
		"""

		# we first clear out what is the db in case the new data has changed,
		# 	on big dbs takes a while
		print('\tFlushing extant domain owner data...', end='', flush=True)
		self.sql_driver.reset_domain_owners()
		print('done!')

		# next we pull the owner/domain pairings from the json file in
		# 	the resources dir and add to the db
		# NOTE(review): the open() handle here is never closed; a 'with'
		# 	block would be safer — left as-is pending a code change.
		print('\tPatching with new domain owner data...', end='', flush=True)
		domain_owner_data = json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8'))
		for item in domain_owner_data:
			# skipping for now, but perhaps find a way to enter this in db?
			if 'revision_date' in item: continue

			# convert lists to strings for db storage
			item['aliases'] = json.dumps(item['aliases'])
			item['site_privacy_policy_urls'] = json.dumps(item['site_privacy_policy_urls'])
			item['service_privacy_policy_urls'] = json.dumps(item['service_privacy_policy_urls'])
			item['gdpr_statement_urls'] = json.dumps(item['gdpr_statement_urls'])
			item['terms_of_use_urls'] = json.dumps(item['terms_of_use_urls'])
			item['platforms'] = json.dumps(item['platforms'])
			item['uses'] = json.dumps(item['uses'])

			self.sql_driver.add_domain_owner(item)

			for domain in item['domains']:
				self.sql_driver.update_domain_owner(item['id'], domain)

		# update the domain owner dict
		self.domain_owners = self.utilities.get_domain_owner_dict()
		print('done!')
	# patch_domain_owners

	def get_top_tlds(self, limit):
		"""
		finds the most common tlds from all the pages
		type is default to tld, but pubsuffix also works

		returns list of tlds (first entry is None, meaning "all pages")
		"""

		# first we put all the tlds for each page into a list
		tlds = []
		for row in self.sql_driver.get_all_tlds():
			tlds.append(row[0])

		# use this to hold the top tlds
		# it starts with "None" as that means we process all the pages
		top_tlds = [None]

		# set up a global var which has the counts for each tld
		# NOTE(review): later methods (get_3p_domain_percentages,
		# 	get_3p_request_percentages, get_3p_use_data) read
		# 	self.crawl_counts_by_tld, which is never assigned anywhere in
		# 	this class — using a tld_filter there likely raises
		# 	AttributeError.  TODO: confirm and reconcile attribute names.
		self.page_counts_by_tld = {}

		# cut the list to the limit to return only top tlds
		for tld,count in collections.Counter(tlds).most_common()[0:limit]:
			top_tlds.append(tld)
			self.page_counts_by_tld[tld] = count

		return top_tlds
	# get_top_tlds

	def get_per_crawl_3p_domain_counts(self, tld_filter = None):
		"""
		determines basic stats for the number of 3p domains contacted per-crawl

		note this is distinct domain+pubsuffix, not fqdns (e.g.
'sub.example.com' and sub2.example.com' only count as 'example.com')
		"""

		# NOTE(review): tld_filter is accepted but never used in this body,
		# 	so callers passing a filter get unfiltered counts — TODO confirm.

		# now we determine the number of domains each page is connected to
		# 	by looking at len of list of 3p domains
		per_crawl_3p_request_counts = []
		for crawl_id,count in self.sql_driver.get_crawl_3p_domain_counts():
			per_crawl_3p_request_counts.append(count)

		# crawls that have no 3p requests are not yet in our counts
		# so for all uncounted pages we add in zeros
		uncounted_crawls = self.total_crawls - len(per_crawl_3p_request_counts)
		for i in range(0,uncounted_crawls):
			per_crawl_3p_request_counts.append(0)

		return per_crawl_3p_request_counts
	# get_per_crawl_3p_domain_counts

	def get_3p_domain_distribution(self, tld_filter=None):
		"""
		Determines the number of pages which have a given number of 3p domains.
		"""
		# NOTE(review): tld_filter is not passed through to
		# 	get_per_crawl_3p_domain_counts — TODO confirm intent.
		per_crawl_3p_request_counts = self.get_per_crawl_3p_domain_counts()

		# count how many crawls share each distinct domain count
		domain_count_to_page_count = collections.Counter(per_crawl_3p_request_counts)
		domain_count_to_page_distribution = {}

		# track the largest count seen so the distribution can be
		# 	zero-filled up to the max below
		max_value = 0
		for domain_count in domain_count_to_page_count:
			domain_count_to_page_distribution[domain_count] = domain_count_to_page_count[domain_count]
			if domain_count > max_value:
				max_value = domain_count

		# produce a dense distribution, filling gaps with zero page counts
		full_dist = []
		for domain_count in range(max_value+1):
			if domain_count in domain_count_to_page_distribution:
				full_dist.append({
					'domain_count': domain_count,
					'page_count': domain_count_to_page_distribution[domain_count]
				})
			else:
				full_dist.append({
					'domain_count': domain_count,
					'page_count': 0
				})

		return full_dist
	# get_3p_domain_distribution

	def get_3p_cookie_distribution(self, tld_filter=None):
		"""
		Determines the number of pages which have a given number of cookies.
		"""
		per_page_3p_cookie_counts = self.get_per_crawl_3p_cookie_counts(tld_filter)

		# count how many crawls share each distinct cookie count
		cookie_count_to_page_count = collections.Counter(per_page_3p_cookie_counts)
		cookie_count_to_page_distribution = {}
		max_value = 0
		for cookie_count in cookie_count_to_page_count:
			cookie_count_to_page_distribution[cookie_count] = cookie_count_to_page_count[cookie_count]
			if cookie_count > max_value:
				max_value = cookie_count

		# zero-fill so every count from 0..max appears in the output
		full_dist = []
		for cookie_count in range(max_value+1):
			if cookie_count in cookie_count_to_page_distribution:
				full_dist.append({
					'cookie_count': cookie_count,
					'page_count': cookie_count_to_page_distribution[cookie_count]
				})
			else:
				full_dist.append({
					'cookie_count': cookie_count,
					'page_count': 0
				})

		return full_dist
	# get_3p_cookie_distribution

	def get_3p_domain_stats(self, tld_filter=None):
		"""
		Returns high-level 3p domain stats: mean/median/mode of the
		per-crawl 3p domain counts (mode may be None).
		"""

		# this is the data we will be getting stats for
		per_crawl_3p_request_counts = self.get_per_crawl_3p_domain_counts(tld_filter)

		# mean and median should always be ok
		mean = statistics.mean(per_crawl_3p_request_counts)
		median = statistics.median(per_crawl_3p_request_counts)

		# but mode can throw an error, so catch here
		# NOTE(review): the bare except also hides unrelated bugs; catching
		# 	statistics.StatisticsError would be narrower.
		try:
			mode = statistics.mode(per_crawl_3p_request_counts)
		except:
			mode = None

		return({
			'mean': mean,
			'median': median,
			'mode': mode
		})
	# get_3p_domain_stats

	def get_per_crawl_3p_cookie_counts(self, tld_filter = None):
		"""
		determines basic stats for the number of 3p cookies contacted per-crawl

		note that a single 3p may set more than one cookie
		"""
		# each page id corresponds to a list of cookie ids
		crawl_id_to_unique_cookies = {}

		# run query to get all page id, 3p cookie id, 3p cookie domain entries
		for crawl_id,cookie_name,cookie_domain in self.sql_driver.get_crawl_id_3p_cookie_id_3p_cookie_domain(tld_filter):
			# if the page id is not yet seen enter the current cookie id as a fresh list
			# otherwise, we add to the existing list — membership test keeps
			# 	only unique (name,domain) pairs per crawl
			if crawl_id not in crawl_id_to_unique_cookies:
				crawl_id_to_unique_cookies[crawl_id] = [(cookie_name,cookie_domain)]
			else:
				if (cookie_name,cookie_domain) not in crawl_id_to_unique_cookies[crawl_id]:
					crawl_id_to_unique_cookies[crawl_id] = crawl_id_to_unique_cookies[crawl_id] + [(cookie_name,cookie_domain)]

		# determine the number of 3p cookies each crawl has by looking at len of list of cookies
		per_crawl_3p_cookie_counts = []
		for crawl_id in crawl_id_to_unique_cookies:
			per_crawl_3p_cookie_counts.append(len(crawl_id_to_unique_cookies[crawl_id]))

		# crawls that have no 3p cookies are not yet in our counts
		# so for all uncounted crawls we add in zeros
		uncounted_crawls = self.total_crawls - len(per_crawl_3p_cookie_counts)
		for i in range(0,uncounted_crawls):
			per_crawl_3p_cookie_counts.append(0)

		return per_crawl_3p_cookie_counts
	# get_per_crawl_3p_cookie_counts

	def get_3p_cookie_stats(self,tld_filter=None):
		"""
		Returns high-level cookie stats: mean/median/mode of the
		per-crawl 3p cookie counts (mode may be None).
		"""

		# this is the data we will be getting stats for
		per_page_3p_cookie_counts = self.get_per_crawl_3p_cookie_counts(tld_filter)

		# mean and median should always be ok
		mean = statistics.mean(per_page_3p_cookie_counts)
		median = statistics.median(per_page_3p_cookie_counts)

		# but mode can throw an error, so catch here
		try:
			mode = statistics.mode(per_page_3p_cookie_counts)
		except:
			mode = None

		return({
			'mean': mean,
			'median': median,
			'mode': mode
		})
	# get_3p_cookie_stats

	def get_db_summary(self):
		"""
		Get basic data about what is in our database.
""" # some of these take longer than others total_tasks_fail = self.sql_driver.get_pending_task_count() total_tasks_attempted = self.total_crawls + total_tasks_fail percent_tasks_ok = (self.total_crawls/total_tasks_attempted)*100 total_errors = self.sql_driver.get_total_errors_count() total_cookies = self.sql_driver.get_total_cookie_count() total_3p_cookies = self.sql_driver.get_total_cookie_count(is_3p = True) total_dom_storage = self.sql_driver.get_dom_storage_count() total_websockets = self.sql_driver.get_websocket_count() total_websocket_events = self.sql_driver.get_websocket_event_count() total_requests = self.sql_driver.get_total_request_count() total_responses = self.sql_driver.get_total_response_count() total_requests_received = self.sql_driver.get_total_request_count(received = True) percent_requests_received = (total_requests_received/total_requests)*100 total_3p_requests = self.sql_driver.get_total_request_count(party='third') total_3p_responses = self.sql_driver.get_total_response_count(is_3p = True) # avoid divide-by-zero if total_3p_requests > 0: total_3p_requests_received = self.sql_driver.get_total_request_count(received = True, party='third') percent_3p_requests_received = (total_3p_requests_received/total_3p_requests)*100 else: percent_3p_requests_received = 0 # ship it back return({ 'total_crawls_ok' : self.total_crawls, 'total_pages_ok' : self.total_pages, 'total_tasks_fail' : total_tasks_fail, 'total_tasks_attempted' : total_tasks_attempted, 'percent_tasks_ok' : percent_tasks_ok, 'total_errors' : total_errors, 'total_cookies' : total_cookies, 'total_3p_cookies' : total_3p_cookies, 'total_dom_storage' : total_dom_storage, 'total_websockets' : total_websockets, 'total_websocket_events' : total_websocket_events, 'total_requests' : total_requests, 'total_responses' : total_responses, 'percent_requests_received' : percent_requests_received, 'total_3p_requests' : total_3p_requests, 'total_3p_responses' : total_3p_responses, 
'percent_3p_requests_received' : percent_3p_requests_received, }) # get_db_summary def get_high_level_stats(self, tld_filter=None): """ Get high level stats about what we found. """ crawls_w_3p_req = self.sql_driver.get_crawl_w_3p_req_count() percent_w_3p_request = (crawls_w_3p_req/self.total_crawls)*100 total_crawls_cookies = self.sql_driver.get_crawl_w_3p_cookie_count() percent_w_3p_cookie = (total_crawls_cookies/self.total_crawls)*100 crawls_w_3p_script = self.sql_driver.get_crawl_w_3p_script_count() percent_w_3p_script = (crawls_w_3p_script/self.total_crawls)*100 total_pages_ssl = self.sql_driver.get_ssl_page_count() percent_pages_ssl = (total_pages_ssl/self.total_pages)*100 # request info total_requests_received = self.sql_driver.get_total_request_count(received = True) total_requests_received_ssl = self.sql_driver.get_total_request_count(received = True, is_ssl = True) total_requests_received_1p = self.sql_driver.get_total_request_count(received = True, party='first') total_requests_received_1p_ssl = self.sql_driver.get_total_request_count(received = True, party='first', is_ssl = True) total_requests_received_3p = self.sql_driver.get_total_request_count(received = True, party='third') total_requests_received_3p_ssl = self.sql_driver.get_total_request_count(received = True, party='third', is_ssl = True) # ssl if total_requests_received > 0: percent_requests_ssl = (total_requests_received_ssl/total_requests_received)*100 percent_1p_requests_ssl = (total_requests_received_1p_ssl/total_requests_received_1p)*100 else: percent_requests_ssl = 0 percent_1p_requests_ssl = 0 if total_requests_received_3p: percent_3p_requests_ssl = (total_requests_received_3p_ssl/total_requests_received_3p)*100 else: percent_3p_requests_ssl = 0 # load time is seconds average_page_load_time = self.sql_driver.get_page_ave_load_time() # domains and cookies domain_stats = self.get_3p_domain_stats(tld_filter) cookie_stats = self.get_3p_cookie_stats(tld_filter) return ({ 'total_crawls' : 
self.total_crawls, 'total_pages' : self.total_pages, 'percent_pages_ssl' : percent_pages_ssl, 'total_requests_received' : total_requests_received, 'percent_requests_ssl' : percent_requests_ssl, 'total_requests_received_1p' : total_requests_received_1p, 'percent_1p_requests_ssl' : percent_1p_requests_ssl, 'total_requests_received_3p' : total_requests_received_3p, 'percent_3p_requests_ssl' : percent_3p_requests_ssl, 'average_page_load_time' : average_page_load_time, 'percent_w_3p_request' : percent_w_3p_request, 'percent_w_3p_cookie' : percent_w_3p_cookie, 'percent_w_3p_script' : percent_w_3p_script, '3p_domains_mean' : domain_stats['mean'], '3p_domains_median' : domain_stats['median'], '3p_domains_mode' : domain_stats['mode'], '3p_cookies_mean' : cookie_stats['mean'], '3p_cookies_median' : cookie_stats['median'], '3p_cookies_mode' : cookie_stats['mode'], }) # get_high_level_stats def get_aggregated_tracking_attribution(self, tld_filter=None): """ generates ranked list of which entities collect data from the greatest number of crawls - entities which have subsidiaries are ranked according to the crawls their subsidiaries get data from as well - however, parent entities only get one hit on a crawl which has multiple subsidiaries present - for example, if a crawl has 'google analytics' and 'doubleclick' that is only one hit for 'google' """ # list will have entries for each hit on a given entity all_owner_occurances = [] # each crawl_id is a key which corresponds to a list of # ids for entities which own the 3p domains crawl_to_3p_owners = {} # iterate through the entire set of 3p domains for each # crawl for crawl_id in self.crawl_id_to_3p_domain_info: # this is a set so items which appear more than once only get counted once # reset this for each crawl crawl_domain_owners = set() for item in self.crawl_id_to_3p_domain_info[crawl_id]: if item['owner_id']: for lineage_id in self.utilities.get_domain_owner_lineage_ids(item['owner_id']): 
						crawl_domain_owners.add(lineage_id)

			# we have finished processing for this crawl so we add the owner ids to the full list
			for owner_id in crawl_domain_owners:
				all_owner_occurances.append(owner_id)

		# return a list of dicts
		ranked_aggregated_tracking_attribution = []
		for owner_id, total_crawl_occurances in collections.Counter(all_owner_occurances).most_common():
			ranked_aggregated_tracking_attribution.append({
				'owner_id': owner_id,
				'owner_name': self.domain_owners[owner_id]['owner_name'],
				'owner_country': self.domain_owners[owner_id]['country'],
				'percent_crawls': (total_crawl_occurances/self.total_crawls)*100,
			})

		return ranked_aggregated_tracking_attribution

		# NOTE(review): retained alternative (db-driven) implementation,
		# 	unreachable after the return above; kept for reference.
		# # get the crawl count for each domain + its children
		# domain_owner_to_crawl_count = {}
		# for domain_owner_id in self.domain_owners:
		# 	# this it the owner + children
		# 	domain_owner_id_list = [domain_owner_id]+self.utilities.get_domain_owner_child_ids(domain_owner_id)
		# 	domain_owner_to_crawl_count[domain_owner_id] = self.sql_driver.get_crawl_count_by_domain_owners(domain_owner_id_list)

		# # now figure out the ranking
		# domain_owners_ranked_high_low = []
		# for domain_owner_id, count in sorted(domain_owner_to_crawl_count.items(), key=lambda item: item[1],reverse=True):
		# 	domain_owners_ranked_high_low.append(domain_owner_id)

		# # return a list of dicts
		# ranked_aggregated_tracking_attribution = []
		# for domain_owner_id in domain_owners_ranked_high_low:
		# 	ranked_aggregated_tracking_attribution.append({
		# 		'owner_id': domain_owner_id,
		# 		'owner_name': self.domain_owners[domain_owner_id]['owner_name'],
		# 		'owner_country': self.domain_owners[domain_owner_id]['country'],
		# 		'percent_crawls': (domain_owner_to_crawl_count[domain_owner_id]/self.total_crawls)*100,
		# 	})
		# return ranked_aggregated_tracking_attribution
	# get_aggregated_tracking_attribution

	def get_aggregated_3p_ssl_use(self, tld_filter=None):
		"""
		For each request where we know the owner we determine if it is
		SSL, then we figure out the aggregated (owner+children) SSL
		usage percentage
		"""

		# do processing here
		owner_id_ssl_use = {}

		# we iterate over every received request
		# this is a potentially large query b/c we must look at each request on the page
		# since a single domain owner may have more than one requests and
		# 	these may or may not be with ssl
		# NOTE(review): the inner loop variable shadows domain_owner_id
		# 	from the query tuple — works, but confusing to read.
		for domain,domain_owner_id,is_ssl in self.sql_driver.get_3p_request_domain_owner_id_ssl_use(tld_filter):
			for domain_owner_id in self.utilities.get_domain_owner_lineage_ids(domain_owner_id):
				if domain_owner_id not in owner_id_ssl_use:
					owner_id_ssl_use[domain_owner_id] = [is_ssl]
				else:
					owner_id_ssl_use[domain_owner_id] = owner_id_ssl_use[domain_owner_id] + [is_ssl]

		# output list of dicts
		aggregated_3p_ssl_use = []
		for owner_id in owner_id_ssl_use:
			aggregated_3p_ssl_use.append({
				'owner_id' : owner_id,
				'owner_name' : self.domain_owners[owner_id]['owner_name'],
				'owner_country' : self.domain_owners[owner_id]['country'],
				'ssl_use' : 100*(sum(owner_id_ssl_use[owner_id])/len(owner_id_ssl_use[owner_id]))
			})

		return aggregated_3p_ssl_use
	# get_aggregated_3p_ssl_use

	def get_site_to_3p_network(self, domain_owner_is_known=False):
		"""
		sql_driver.get_network_ties returns a set of tuples in the format
		(page domain, request domain, request domain owner id)
		we just go through this data to produce the report

		NOTE(review): domain_owner_is_known is accepted but never used
		in this body — TODO confirm.
		"""

		network = []

		for page_domain,request_domain,request_owner_id in self.sql_driver.get_3p_network_ties():
			# if we know the owner get name and country, otherwise None
			if request_owner_id != None:
				request_owner_name = self.domain_owners[request_owner_id]['owner_name']
				request_owner_country = self.domain_owners[request_owner_id]['country']
			else:
				request_owner_name = None
				request_owner_country = None

			network.append({
				'page_domain' : page_domain,
				'request_domain' : request_domain,
				'request_owner_id' : request_owner_id,
				'request_owner_name' : request_owner_name,
				'request_owner_country' : request_owner_country
			})
		return network
	# get_site_to_3p_network

	def get_page_to_3p_network(self):
		"""
		Returns the network of all pages between third-party domains.
		Additionally returns information on page redirects and owners.
		"""
		network = []

		for page_start_url,page_final_url,page_accessed,request_domain,request_owner_id in self.sql_driver.get_all_pages_3p_domains_and_owners():
			# if we know the owner get name and country, otherwise None
			if request_owner_id != None:
				request_owner_name = self.domain_owners[request_owner_id]['owner_name']
				request_owner_country = self.domain_owners[request_owner_id]['country']
			else:
				request_owner_name = None
				request_owner_country = None

			network.append({
				'page_start_url' : page_start_url,
				'page_final_url' : page_final_url,
				'page_accessed' : page_accessed,
				'request_domain' : request_domain,
				'request_owner_id' : request_owner_id,
				'request_owner_name' : request_owner_name,
				'request_owner_country' : request_owner_country
			})
		return network
	# get_page_to_3p_network

	def get_3p_domain_percentages(self,tld_filter=None):
		"""
		Determines what percentage of crawls a given third-party domain
		is found on and owner information.
""" # total crawls for this tld, used to calculate percentages if tld_filter: total_crawls = self.crawl_counts_by_tld[tld_filter] else: total_crawls = self.total_crawls all_3p_domains = [] for crawl_id in self.crawl_id_to_3p_domain_info: for item in self.crawl_id_to_3p_domain_info[crawl_id]: all_3p_domains.append((item['domain'],item['owner_id'])) domain_percentages = [] for item, domain_crawl_count in self.utilities.get_most_common_sorted(all_3p_domains): domain = item[0] owner_id = item[1] # if we know the owner get name and country, otherwise None if owner_id != None: owner_name = self.domain_owners[owner_id]['owner_name'] owner_country = self.domain_owners[owner_id]['country'] owner_uses = self.domain_owners[owner_id]['uses'] owner_platforms = self.domain_owners[owner_id]['platforms'] else: owner_name = None owner_country = None owner_uses = None owner_platforms = None domain_percentages.append({ 'percent_crawls': 100*(domain_crawl_count/total_crawls), 'domain' : domain, 'owner_id' : owner_id, 'owner_name' : owner_name, 'owner_country' : owner_country, 'owner_uses' : owner_uses, 'owner_platforms': owner_platforms }) return domain_percentages # get_3p_domain_percentages def get_3p_request_percentages(self,tld_filter=None,request_type=None): """ Determine what percentage of pages a given request is found on. This is based on the "request_url" which is the url for a given request stripped of arguments. ex: "https://example.com/track.js?abc=123" would become "https://example.com/track.js" Additionally returns relevant owner information. 
""" all_3p_requests = [] # total crawls for this tld, used to calculate percentages if tld_filter: total_crawls = self.crawl_counts_by_tld[tld_filter] else: total_crawls = self.total_crawls for page_id,request_url,request_type,request_domain,request_domain_owner in self.sql_driver.get_3p_requests(tld_filter, request_type): all_3p_requests.append((request_url,request_type,request_domain,request_domain_owner)) request_percentages =[] for item, request_crawl_count in self.utilities.get_most_common_sorted(all_3p_requests): # if we know the owner get name and country, otherwise None request_owner_id = item[3] if request_owner_id != None: request_owner_name = self.domain_owners[request_owner_id]['owner_name'] request_owner_country = self.domain_owners[request_owner_id]['country'] else: request_owner_name = None request_owner_country = None request_percentages.append({ 'percent_crawls' : 100*(request_crawl_count/total_crawls), 'request_url' : item[0], 'request_type' : item[1], 'request_domain' : item[2], 'request_owner_id' : request_owner_id, 'request_owner_name' : request_owner_name, 'request_owner_country' : request_owner_country }) return request_percentages # get_3p_domain_percentages def get_3p_use_data(self,tld_filter=None): """" For some domains we know what they are used for on a first-party basis (eg marketing). This function examines the data we have collected in order to determine what percentage of crawls include a request to a third-party domain with a given use, how many such requests are made on a per-use basis per-crawl, and finally, what percentage of requests per-crawl set a third-party cookie. Data is returned as a dict, the first field of which is a set of all the uses we know of. """ # we first need to create a dict whereby each domain # corresponds to a list of known uses # domains with no known uses are not in the list # # IMPORTANT NOTE: # some domains may have several uses! 
		domain_to_use_map = {}

		# a list of all known uses
		all_uses = set()

		for domain,owner_id in self.sql_driver.get_domain_owner_ids():
			if len(self.domain_owners[owner_id]['uses']) > 0:
				domain_to_use_map[domain] = self.domain_owners[owner_id]['uses']
				for use in self.domain_owners[owner_id]['uses']:
					all_uses.add(use)

		# for each crawl, create a list of the set of domains
		# 	which set a cookie
		#
		# note that due to currently unresolved chrome issues we sometimes
		# 	can get cookies which don't have a corresponding 3p request
		# 	this approach handles that gracefully
		crawl_cookie_domains = {}
		for crawl_id, cookie_domain in self.sql_driver.get_crawl_id_3p_cookie_domain_pairs():
			if crawl_id not in crawl_cookie_domains:
				crawl_cookie_domains[crawl_id] = [cookie_domain]
			else:
				crawl_cookie_domains[crawl_id] = crawl_cookie_domains[crawl_id] + [cookie_domain]

		# next, for each crawl we want a list of uses for domains and if
		# 	that domain corresponds to a cookie being set
		# NOTE: the same use may occur many times, this is desired
		# 	as it gives us our counts later on
		crawl_3p_uses = {}

		# for crawl_id, request_domain in self.sql_driver.get_crawl_id_3p_request_domain_pairs(tld_filter):
		for crawl_id in self.crawl_id_to_3p_domain_info:
			for item in self.crawl_id_to_3p_domain_info[crawl_id]:
				domain = item['domain']
				# if this 3p domain has a known use we add it to a list of uses keyed to crawl id
				if domain in domain_to_use_map:
					# check if the domain of this request has a cookie for this crawl
					if crawl_id in crawl_cookie_domains and domain in crawl_cookie_domains[crawl_id]:
						sets_cookie = True
					else:
						sets_cookie = False

					# add in a tuple of (use,sets_cookie) to a list for this crawl_id
					for use in domain_to_use_map[domain]:
						if crawl_id not in crawl_3p_uses:
							crawl_3p_uses[crawl_id] = [(use,sets_cookie)]
						else:
							crawl_3p_uses[crawl_id] = crawl_3p_uses[crawl_id] + [(use,sets_cookie)]

		# determine how often requests for a given use are encrypted with ssl
		# - note that on the same crawl multiple requests for a single use
		# 	may be made and each request may or may not be ssl
		use_ssl = {}
		use_total = {}
		total_classified = 0
		for domain,domain_owner_id,is_ssl in self.sql_driver.get_3p_request_domain_owner_id_ssl_use(tld_filter):
			# only analyze domains we know the use for
			if domain in domain_to_use_map:
				total_classified += 1

				# each domain may have several uses, add for all
				for use in domain_to_use_map[domain]:
					# increment count of ssl usage
					if is_ssl:
						if use not in use_ssl:
							use_ssl[use] = 1
						else:
							use_ssl[use] = use_ssl[use] + 1

					# keep track of total occurances of this use
					if use not in use_total:
						use_total[use] = 1
					else:
						use_total[use] = use_total[use] + 1

		# for each use we will produce summary counts, we
		# 	initialize everyting to zero here
		total_crawls_w_use = {}
		total_use_occurances = {}
		total_use_occurances_w_cookie = {}

		for use in all_uses:
			total_crawls_w_use[use] = 0
			total_use_occurances[use] = 0
			total_use_occurances_w_cookie[use] = 0

		# process each crawl and update the relevant counts
		for crawl_id in crawl_3p_uses:
			# we only want to count use once per-crawl, so
			# 	create a set and add to it as we go along
			this_crawl_use_set = set()

			# update the use occurance counters
			for use, has_cookie in crawl_3p_uses[crawl_id]:
				this_crawl_use_set.add(use)
				total_use_occurances[use] = total_use_occurances[use] + 1
				if has_cookie == True:
					total_use_occurances_w_cookie[use] = total_use_occurances_w_cookie[use] + 1

			# each use in the set adds one to the total crawl count
			for use in this_crawl_use_set:
				total_crawls_w_use[use] = total_crawls_w_use[use] + 1

		# the last step is to calculate the relevant percentages and averages

		# used to get percentage by use
		# NOTE(review): same crawl_counts_by_tld concern as the
		# 	percentage methods above — TODO confirm.
		if tld_filter:
			total_crawls = self.crawl_counts_by_tld[tld_filter]
		else:
			total_crawls = self.total_crawls

		percentage_by_use = {}
		average_use_occurance_per_crawl = {}
		percentage_use_w_cookie = {}
		percentage_use_ssl = {}

		for use in all_uses:
			percentage_by_use[use] = 0
			average_use_occurance_per_crawl[use] = 0
			percentage_use_w_cookie[use] = 0

		for use in total_crawls_w_use:
			if total_crawls_w_use[use] > 0:
				percentage_by_use[use] = 100*(total_crawls_w_use[use]/total_crawls)
				average_use_occurance_per_crawl[use] = total_use_occurances[use]/total_crawls_w_use[use]
				percentage_use_w_cookie[use] = 100*(total_use_occurances_w_cookie[use]/total_use_occurances[use])
			else:
				percentage_by_use[use] = None
				average_use_occurance_per_crawl[use] = None
				percentage_use_w_cookie[use] = None

			# conditional to account for cases where no instance of a given use is ssl
			if use in use_ssl:
				percentage_use_ssl[use] = 100*(use_ssl[use]/use_total[use])
			else:
				percentage_use_ssl[use] = 0

		# send back everyting as a keyed dict
		return({
			'all_uses' : all_uses,
			'percentage_by_use' : percentage_by_use,
			'average_use_occurance_per_crawl' : average_use_occurance_per_crawl,
			'percentage_use_w_cookie' : percentage_use_w_cookie,
			'percentage_use_ssl' : percentage_use_ssl
		})
	# get_3p_use_data

	def get_all_pages_requests(self):
		"""
		For all pages get all of the requests associated with
		each page load.  Default is only_3p, but this can
		be overridden to get 1p as well.
		"""
		records = []
		for result in self.sql_driver.get_all_pages_requests():
			# owner lineage lookup can fail for unknown domains; treat as None
			# NOTE(review): bare except also hides unrelated errors.
			try:
				domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[4])
			except:
				domain_owner = None

			records.append({
				'accessed' : result[0].isoformat(),
				'start_url' : result[1],
				'final_url' : result[2],
				'request_domain' : result[3],
				'request_domain_owner' : domain_owner,
				'request_url' : result[5],
			})
		return records
	# get_all_pages_requests

	def get_all_pages_cookies(self):
		"""
		For all pages get all of the cookies associated with
		each page load.  Default is 1p and 3p, but this can
		be overridden to get 3p only.
""" records = [] for result in self.sql_driver.get_all_pages_cookies(): try: cookie_owner = self.utilities.get_domain_owner_lineage_combined_string(result[4]) except: cookie_owner = None records.append({ 'accessed' : result[0].isoformat(), 'start_url' : result[1], 'final_url' : result[2], 'cookie_domain' : result[3], 'cookie_owner' : cookie_owner, 'cookie_name' : result[5], 'cookie_value' : result[6], }) return records # get_all_pages_cookies def get_single_page_request_dump(self,page_start_url): """ For a given page (defined as unique start_url) get all of the requests associated with every page load. Default is only_3p, but this can be overridden to get 1p as well. """ records = [] for result in self.sql_driver.get_single_page_requests(page_start_url): try: domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[6]) except: domain_owner = None records.append({ 'page_accessed' : result[0].isoformat(), 'start_url' : result[1], 'final_url' : result[2], 'request_url' : result[4], 'request_domain' : result[5], 'request_domain_owner' : domain_owner }) return records # get_single_page_request_dump def get_single_page_cookie_dump(self,page_start_url): """ For a given page (defined as unique start_url) get all of the cookies associated with every page load. Default is only_3p, but this can be overridden to get 1p as well. 
""" records = [] for result in self.sql_driver.get_single_page_cookies(page_start_url): try: domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[6]) except: domain_owner = None records.append({ #'page_accessed' : result[0].isoformat(), 'page_accessed' : 'blah', 'start_url' : result[1], 'final_url' : result[2], 'is_ssl' : result[3], 'cookie_domain' : result[4], 'cookie_name' : result[5], 'cookie_value' : result[6], 'cookie_domain_owner' : domain_owner }) return records # get_single_page_cookie_dump def update_site_hosts(self): """ For each FDQN corresponding to a page we find the owner of the associated ip_addr. """ # required, non-standard try: from ipwhois import IPWhois except: print('!!! UNABLE TO UPDATE SITE HOSTS, IPWHOIS NOT INSTALLED !!!') page_ips_w_no_owner = self.sql_driver.get_page_ips_w_no_owner() total_to_update = len(page_ips_w_no_owner) progress = 0 for ip, in page_ips_w_no_owner: progress += 1 print('\t\t %s of %s done' % (progress,total_to_update)) try: obj = IPWhois(ip) result = obj.lookup_whois() owner = result['nets'][0]['description'] except: print('fail on %s' % ip) pass # fall back if owner == None: owner = result['asn_description'] if owner: # combine amazon # if 'Amazon' in owner: # owner = 'amazon' # fix strings owner = owner.replace('.','') owner = owner.replace('"','') owner = owner.replace("'","") owner = owner.replace('\n', ' ') owner = owner.replace('\r', ' ') owner = owner.replace(' ','_') owner = owner.replace(',','_') owner = owner.lower() self.sql_driver.update_ip_owner(ip,owner) # update_site_hosts def get_site_host_network(self): """ Return all records where we known the owner of the ip_addr corresponding to a given page's fqdn. 
""" records = [] for site_domain,host_name in self.sql_driver.get_site_hosts(): records.append({ 'site_domain' : site_domain, 'host_name' : host_name }) return records #get_site_hosts ############## # POLICYXRAY # ############## def get_policy_count(self,policy_type=None): """ For a given type of policy tells us how many we have, if policy_type is None we get total count. """ return self.sql_driver.get_total_policy_count(policy_type) # get_policy_count def get_average_policy_word_count(self, policy_type=None): """ Returns average policy word count, filtered by policy_type. """ return self.sql_driver.get_average_policy_word_count(policy_type=policy_type) # get_average_policy_word_count def update_readability_scores(self): """ This function performs two English-language readability tests: Flesch-Kinkaid grade-level and Flesch Reading Ease for any policies we haven't already done. The python textstat module handle the actual calculations. Note these scores are meaningless for non-English language policies. """ # non-standard lib which must be installed from textstat.textstat import textstat for policy_id, text in self.sql_driver.get_id_and_policy_text(readability_null = True): fre_score = textstat.flesch_reading_ease(text) fk_score = textstat.flesch_kincaid_grade(text) self.sql_driver.update_readability_scores(policy_id, fre_score, fk_score) # update_readability_scores def get_readability_scores(self, policy_type=None): """ Returns average policy word count, filtered by policy_type. """ ave_fre = self.sql_driver.get_ave_fre(policy_type=policy_type) ave_fkg = self.sql_driver.get_ave_fkg(policy_type=policy_type) return({ 'ave_fre': ave_fre, 'ave_fkg': ave_fkg }) # get_readability_scores def update_crawl_disclosure(self): """ Leaving code here in case useful later, but it doesn't make sense in cases where crawls are from different sites so it's staying dormant for now. 
""" # set up dictionaries so we can pull in the policy_id and policy_text for each page crawl_id_to_policy_id_text = {} for crawl_id, policy_id, policy_text in self.sql_driver.get_crawl_id_policy_id_policy_text(): crawl_id_to_policy_id_text[crawl_id] = (policy_id, policy_text) # pull in all sets of page_id/request_owner_id we haven't analyzed yet for crawl_id, domain_owner_id in self.sql_driver.get_all_crawl_id_3p_request_owner_ids(): # only process in cases we have an associated policy if crawl_id in crawl_id_to_policy_id_text: policy_id = crawl_id_to_policy_id_text[crawl_id][0] policy_text = crawl_id_to_policy_id_text[crawl_id][1] # default values disclosed = False disclosed_owner_id = None # each owner may have several parent owners and aliases, we check for all of these in the policy for this_owner_id, this_owner_name in self.utilities.get_domain_owner_lineage_strings(domain_owner_id,get_aliases=True): if this_owner_name in policy_text: disclosed = True disclosed_owner_id = this_owner_id # done for this record, update disclosure table self.sql_driver.update_crawl_3p_domain_disclosure(crawl_id, domain_owner_id) return # update_crawl_disclosure def update_request_disclosure(self): """ For any page where we have a policy we extract all third-party request domains where we have determined the owner. Next, we check if the name of the owner, any of it's parent companies, is in a given policy. Note we also check based on "aliases" which are spelling variations on a given owner name (eg 'doubleclick' and 'double click'). Once we've done the checks we update the policy_request_disclosure table. 
""" # set up dictionaries so we can pull in the policy_id and policy_text for each page page_id_to_policy_id_text = {} for page_id, policy_id, policy_text in self.sql_driver.get_page_id_policy_id_policy_text(): page_id_to_policy_id_text[page_id] = (policy_id, policy_text) # pull in all sets of page_id/request_owner_id we haven't analyzed yet for page_id, request_owner_id in self.sql_driver.get_all_page_id_3p_request_owner_ids(not_in_disclosure_table=True): # only process in cases we have an associated policy if page_id in page_id_to_policy_id_text: policy_id = page_id_to_policy_id_text[page_id][0] policy_text = page_id_to_policy_id_text[page_id][1] # default values disclosed = False disclosed_owner_id = None # each owner may have several parent owners and aliases, we check for all of these in the policy for this_owner_id, this_owner_name in self.utilities.get_domain_owner_lineage_strings(request_owner_id,get_aliases=True): if this_owner_name in policy_text: disclosed = True disclosed_owner_id = this_owner_id # done for this record, update disclosure table self.sql_driver.update_request_disclosure( page_id, policy_id, request_owner_id, disclosed, disclosed_owner_id ) return # update_request_disclosure def get_percent_crawl_3p_domains_disclosed(self, policy_type=None): """ Determine the global percentage of 3p requests which are disclosed in policies. """ total_identified = self.sql_driver.get_total_crawl_3p_count() total_disclosed = self.sql_driver.get_total_crawl_3p_disclosure_count() if total_identified == 0: return 0 else: return(100*(total_disclosed/total_identified)) # get_percent_3p_requests_disclosed def get_percent_3p_requests_disclosed(self, policy_type=None): """ Determine the global percentage of 3p requests which are disclosed in privacy policies. NOTE A PAGE CAN HAVE SEVERAL POLICIES WITH DISCLOSURE OCCURING IN SOME BUT NOT ALL, WE SHOULD ACCOUNT FOR THIS! 
""" total_identified = self.sql_driver.get_total_request_disclosure_count(policy_type=policy_type) total_disclosed = self.sql_driver.get_total_request_disclosure_count(policy_type=policy_type,disclosed=True) if total_identified == 0: return 0 else: return(100*(total_disclosed/total_identified)) # get_percent_3p_requests_disclosed def get_disclosure_by_request_owner(self): """ For each domain owner we query the policy_disclosure_table to find out if it or its subsidiaries have been disclosed. This gives a very granular view on disclosure on a per-service basis in some cases. Note that this is distinct on the page id to avoid over-counting for subsidiaries. Returns a dict which is keyed to the owner name. """ results = {} for owner_id in self.domain_owners: child_owner_ids = self.utilities.get_domain_owner_child_ids(owner_id) if len(child_owner_ids) > 0: total = self.sql_driver.get_domain_owner_disclosure_count(owner_id, child_owner_ids=child_owner_ids) total_disclosed = self.sql_driver.get_domain_owner_disclosure_count(owner_id, child_owner_ids=child_owner_ids, disclosed=True) else: total = self.sql_driver.get_domain_owner_disclosure_count(owner_id) total_disclosed = self.sql_driver.get_domain_owner_disclosure_count(owner_id, disclosed=True) if total != 0: results[self.domain_owners[owner_id]['owner_name']] = (total,total_disclosed,(total_disclosed/total)*100) # return the dict which can be processed to a csv in the calling class return results # get_disclosure_by_request_owner def get_terms_percentage(self,substrings,policy_type=None,policy_type_count=None): total_count = self.sql_driver.get_total_policy_count(policy_type=None) if policy_type: matches_count = self.sql_driver.get_policy_substrings_count(substrings,policy_type=policy_type) else: matches_count = self.sql_driver.get_policy_substrings_count(substrings) return (matches_count/policy_type_count)*100 # get_terms_percentage def stream_rate(self): wait_time = 10 elapsed = 0 query = 'SELECT COUNT(*) FROM 
task_queue' old_count = sql_driver.fetch_query(query)[0][0] all_rates = [] while True: time.sleep(wait_time) elapsed += wait_time new_count = sql_driver.fetch_query(query)[0][0] all_rates.append((old_count-new_count)*60) old_count = new_count json_data = json.dumps({ 'time': elapsed/60, 'rate': statistics.mean(all_rates) # 'rate': new_count }) yield f"data:{json_data}\n\n"
	def __init__(self, db_engine, db_name, num_tlds, num_results, tracker_threshold = None, flush_owner_db = True):
		"""
		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- flushes the existing domain_owner mappings (this can be disabled)
			- if we want to do per-tld reports, figures out the most common
			- if we want to filter against a given tracker threshold, sets
			  it up here (see documentation below for tracker threshold)

		NOTE(review): this method appears to be a stray duplicate of
			Analyzer.__init__ defined below - confirm and remove if unused.
		"""
		# set various global vars
		self.db_engine = db_engine
		self.db_name = db_name
		self.num_tlds = num_tlds
		self.top_tlds = []
		self.num_results = num_results
		self.tracker_threshold = tracker_threshold
		self.start_time = datetime.now()

		# number of decimal places to round to in reports
		self.num_decimals = 2

		# set up global db connection; drivers are imported lazily so only
		#	the selected backend's module is required
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			self.sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(self.db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			exit()

		# this is reused often, do it once to save time
		self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

		print('\t=============================')
		print('\t Checking Output Directories ')
		print('\t=============================')

		self.setup_report_dir()

		print('\t============================')
		print('\t Patching Domain Owner Data ')
		print('\t============================')

		if flush_owner_db:
			# update the domains to their owners in the db, can be overridden
			#	by changing flush_owner_db to false
			self.patch_domain_owners()
		else:
			print('\t\t\tSkipping')

		# this is used in various places to get owner information
		self.domain_owners = self.get_domain_owner_dict()

		# if we want to get sub-reports for the most frequent tlds we find
		#	them here
		if self.num_tlds:
			print('\t=====================')
			print('\t Getting top %s tlds' % self.num_tlds)
			print('\t=====================')
			print('\t\tProcessing...', end='', flush=True)
			self.top_tlds = self.get_top_tlds(self.num_tlds)
			print('done!')
			print('\t\tThe top tlds are:')
			for (tld, pages) in self.top_tlds:
				if tld: print('\t\t |- %s (%s)' % (tld,pages))
		else:
			# otherwise we push in a single empty entry
			self.top_tlds.append((None,self.get_pages_ok_count))

		# SPECIAL FEATURE FOR EXPERTS: tracker domain filter
		#
		# you can set a threshold of the number of sites a given 3p domain
		#	is connected to - domains connecting to many sites may correlate
		#	those visits so we call these 'tracker domains'
		#
		# the 'tracker_threshold' variable set above controls the filtering level
		#
		# on large set of sites (e.g. >10k) this works well but on small samples
		#	(e.g. <500) it doesn't work as well as known tracker domains may only
		#	appear on a single site
		#
		# this is off by default and unless you understand what you are doing
		#	don't use this...but because you are reading the source code for an
		#	otherwise undocumented feature you are probably competent to use it ;-)
		#
		# longer-term we may want to train off a bigger corpus to find tracker
		#	domains and have them prepackaged
		#
		# use at your own risk!
		if tracker_threshold:
			print('\t===================================================')
			print('\t Getting tracker domains with threshold level of %s' % self.tracker_threshold)
			print('\t===================================================')
			print('\t\tProcessing...', end='', flush=True)
			self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
			print('done!')
		else:
			# set to None so various downstream operations get skipped
			self.tracker_domains = None
class Analyzer:
	"""
	webXray stores data in a relational db, but that isn't human-readable
	so what this class does is analyze the data and exports it to csv files
	that can be opened in other programs (e.g. excel, r, gephi)

	Most of the reports may also be run on the top tlds (off by default), so
	you will be able to see if there are variations between tlds ('org' and
	'com' usually differ quite a bit)

	See the readme for details on all of the available reports.
	"""

	def __init__(self, db_engine, db_name, num_tlds, num_results, tracker_threshold = None, flush_owner_db = True):
		"""
		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- flushes the existing domain_owner mappings (this can be disabled)
			- if we want to do per-tld reports, figures out the most common
			- if we want to filter against a given tracker threshold, sets
			  it up here (see documentation below for tracker threshold)
		"""
		# set various global vars
		self.db_engine = db_engine
		self.db_name = db_name
		self.num_tlds = num_tlds
		self.top_tlds = []
		self.num_results = num_results
		self.tracker_threshold = tracker_threshold
		self.start_time = datetime.now()

		# number of decimal places to round to in reports
		self.num_decimals = 2

		# set up global db connection; drivers are imported lazily so only
		#	the selected backend's module is required
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			self.sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(self.db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			exit()

		# this is reused often, do it once to save time
		self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

		print('\t=============================')
		print('\t Checking Output Directories ')
		print('\t=============================')

		self.setup_report_dir()

		print('\t============================')
		print('\t Patching Domain Owner Data ')
		print('\t============================')

		if flush_owner_db:
			# update the domains to their owners in the db, can be overridden
			#	by changing flush_owner_db to false
			self.patch_domain_owners()
		else:
			print('\t\t\tSkipping')

		# this is used in various places to get owner information
		self.domain_owners = self.get_domain_owner_dict()

		# if we want to get sub-reports for the most frequent tlds we find
		#	them here
		if self.num_tlds:
			print('\t=====================')
			print('\t Getting top %s tlds' % self.num_tlds)
			print('\t=====================')
			print('\t\tProcessing...', end='', flush=True)
			self.top_tlds = self.get_top_tlds(self.num_tlds)
			print('done!')
			print('\t\tThe top tlds are:')
			for (tld, pages) in self.top_tlds:
				if tld: print('\t\t |- %s (%s)' % (tld,pages))
		else:
			# otherwise we push in a single empty entry
			self.top_tlds.append((None,self.get_pages_ok_count))

		# SPECIAL FEATURE FOR EXPERTS: tracker domain filter
		#
		# you can set a threshold of the number of sites a given 3p domain
		#	is connected to - domains connecting to many sites may correlate
		#	those visits so we call these 'tracker domains'
		#
		# the 'tracker_threshold' variable set above controls the filtering level
		#
		# on large set of sites (e.g. >10k) this works well but on small samples
		#	(e.g. <500) it doesn't work as well as known tracker domains may only
		#	appear on a single site
		#
		# this is off by default and unless you understand what you are doing
		#	don't use this...but because you are reading the source code for an
		#	otherwise undocumented feature you are probably competent to use it ;-)
		#
		# longer-term we may want to train off a bigger corpus to find tracker
		#	domains and have them prepackaged
		#
		# use at your own risk!
		if tracker_threshold:
			print('\t===================================================')
			print('\t Getting tracker domains with threshold level of %s' % self.tracker_threshold)
			print('\t===================================================')
			print('\t\tProcessing...', end='', flush=True)
			self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
			print('done!')
		else:
			# set to None so various downstream operations get skipped
			self.tracker_domains = None
	# __init__

	#################
	#	UTILITIES	#
	#################

	def setup_report_dir(self):
		"""
		create directory for where the reports go if it does not exist
		"""
		if os.path.exists('./reports') == False:
			print('\t\tMaking global reports directory at ./reports.')
			os.makedirs('./reports')

		# set global report_path
		self.report_path = './reports/'+self.db_name

		# set up subdir for this analysis
		if os.path.exists(self.report_path) == False:
			print('\t\tMaking subdirectory for reports at %s' % self.report_path)
			os.makedirs(self.report_path)

		print('\t\tStoring output in %s' % self.report_path)
	# setup_report_dir

	def write_csv(self, file_name, csv_rows):
		"""
		basic utility function to write list of csv rows to a file
		"""
		full_file_path = self.report_path+'/'+file_name
		# QUOTE_ALL so owner names and urls containing commas stay intact
		with open(full_file_path, 'w', newline='', encoding='utf-8') as csvfile:
			csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
			for row in csv_rows:
				csv_writer.writerow(row)
		print('\t\tOutput written to %s' % full_file_path)
	# write_csv

	def get_most_common_sorted(self,list_in):
		"""
		takes a list, 
		finds the most common items and then resorts alpha (b/c python's
			Counter will arbitrarily order items with same count), then
			sorts again for most-common

		assumes list_in contains alphanumeric tuples
		"""
		most_common_sorted = collections.Counter(list_in).most_common()
		# first sort alpha to break count ties deterministically...
		most_common_sorted.sort()
		# ...then re-sort by count descending (stable sort preserves the alpha order)
		most_common_sorted.sort(reverse=True, key=lambda item:item[1])
		return most_common_sorted
	# get_most_common_sorted

	def print_runtime(self):
		"""
		just for CLI info
		"""
		print('~='*40)
		print('Finished!')
		print('Time to process: %s' % str(datetime.now()-self.start_time))
		print('-'*80)
	# print_runtime

	def patch_domain_owners(self):
		"""
		in order to analyze what entities receive user data, we need to update
			the database with domain ownership records we have stored previously
		"""
		# we first clear out what is the db in case the new data has changed,
		#	on big dbs takes a while
		print('\t\tFlushing extant domain owner data...', end='', flush=True)
		self.sql_driver.reset_domain_owners()
		print('done!')

		# next we pull the owner/domain pairings from the json file in
		#	the resources dir and add to the db
		print('\t\tPatching with new domain owner data...', end='', flush=True)
		domain_owner_data = json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8'))
		for item in domain_owner_data:
			# aliases are stored in the db as one string with '<<' '>>' delimiters
			aliases = ''
			for alias in item['aliases']:
				aliases += '<<' + alias + '>>'
			self.sql_driver.add_domain_owner(
				item['id'],
				item['parent_id'],
				item['owner_name'],
				aliases,
				item['homepage_url'],
				item['privacy_policy_url'],
				item['notes'],
				item['country']
			)
			for domain in item['domains']:
				self.sql_driver.update_domain_owner(item['id'], domain)
		print('done!')
	# patch_domain_owners

	def get_domain_owner_dict(self):
		"""
		read out everything in the domain_owner table into a dictionary
			so we can easily use it as a global lookup table

		this is purposefully independent of self.patch_domain_owners
			and does not assume the above has been run, however will
			return an empty dictionary if the db has not been patched yet

		reasons for above is that if user does not wish to update with the
			current json file historical data will remain consistent
		"""
		domain_owners = {}
		domain_owner_raw_data = self.sql_driver.get_all_domain_owner_data()
		if domain_owner_raw_data:
			for item in domain_owner_raw_data:
				# aliases are stored in the db as a string that needs to be
				#	turned into a list
				aliases = []
				for alias in re.split('<<(.+?)>>',item[3]):
					if alias != '': aliases.append(alias)
				# add everything to the dict, keyed by owner id
				domain_owners[item[0]] = {
					'parent_id': item[1],
					'owner_name': item[2],
					'aliases': aliases,
					'homepage_url': item[4],
					'privacy_policy_url': item[5],
					'notes': item[6],
					'country': item[7],
				}
		return domain_owners
	# get_domain_owner_dict

	def get_domain_owner_lineage_ids(self, id):
		"""
		for a given domain owner id, return the list which corresponds to its
			ownership lineage
		"""
		# recurse up the parent_id chain; root owners have parent_id of None
		if self.domain_owners[id]['parent_id'] == None:
			return [id]
		else:
			return [id] + self.get_domain_owner_lineage_ids(self.domain_owners[id]['parent_id'])
	# get_domain_owner_lineage_ids

	def get_domain_owner_lineage_strings(self,owner_id,get_aliases=False):
		"""
		given an owner_id this function returns a list
			which is the full lineage of ownership

		optionally will also return aliases (e.g. 'Doubleclick' and 'Double Click')
		"""
		lineage_strings = []
		for owner_id in self.get_domain_owner_lineage_ids(owner_id):
			lineage_strings.append((owner_id,self.domain_owners[owner_id]['owner_name']))
			if get_aliases:
				for alias in self.domain_owners[owner_id]['aliases']:
					lineage_strings.append((owner_id,alias))
		return lineage_strings
	# get_domain_owner_lineage_strings

	def get_domain_owner_lineage_combined_string(self,owner_id):
		"""
		given an owner_id this function returns a single string which
			is the full lineage of ownership
		"""
		lineage_string = ''
		for item in self.get_domain_owner_lineage_strings(owner_id):
			lineage_string += item[1] + ' > '
		# NOTE(review): each entry appends ' > ' (3 chars) so [:-2] leaves a
		#	trailing space - confirm whether [:-3] was intended before changing
		return lineage_string[:-2]
	# get_domain_owner_lineage_combined_string

	def get_domain_owner_child_ids(self,id):
		"""
		for a given owner id, get all of its children/subsidiaries
		"""
		# first get all the children ids if they exist
		child_ids = []
		for item in self.domain_owners:
			if self.domain_owners[item]['parent_id'] == id:
				child_ids.append(item)

		# if we have children, call recursively
		if len(child_ids) > 0:
			for child_id in child_ids:
				child_ids.extend(self.get_domain_owner_child_ids(child_id))

		# return an empty list if no children
		return child_ids
	# get_domain_owner_child_ids

	def get_top_tlds(self, limit):
		"""
		finds the most common tlds from all the pages
			type is default to tld, but pubsuffix also works

		returns list of tlds
		"""
		tlds = []

		for row in self.sql_driver.get_all_tlds():
			tlds.append(row[0])

		top_tlds = collections.Counter(tlds).most_common()

		# cut the list to the limit
		top_tlds = top_tlds[0:limit]

		# push in entry for all tlds
		top_tlds.insert(0, (None,self.get_pages_ok_count))

		return top_tlds
	# get_top_tlds

	def get_tracker_domains(self, threshold):
		"""
		NOTE: first determines all pairings of page domains and element domains
			note this is then unique on SITES, not on PAGES
			e.g. 
			if you have several pages from the same site
			these links only count once

		returns a list of domains which link at least the threshold number of sites
		"""
		all_domains = []
		for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs():
			all_domains.append(page_domain_element_domain[1])

		# count up all the pairs, convert to items() so can process as tuples
		domain_counts = collections.Counter(all_domains).items()

		# put the return values here
		tracker_domains = []

		# check against threshold
		for domain_count in domain_counts:
			if domain_count[1] >= threshold:
				tracker_domains.append(domain_count[0])

		# EDGE CASE
		# 	likely due to a large threshold we have no tracker domains,
		#	so we throw warning and log error
		# NOTE(review): 'Analaysis' typo is a stored log string, kept as-is
		if len(tracker_domains) == 0:
			self.sql_driver.log_error('Analaysis Warning', 'Tracker Threshold of %s resulted in no tracking domains.' % threshold)
			print('\t\t-----------WARNING-----------')
			print('\t\tTracker Threshold of %s resulted in no tracking domains.' % threshold)
			print('\t\t-----------------------------')

		return tracker_domains
	# get_tracker_domains

	#####################
	# 	REPORT HELPERS	#
	#####################

	def get_3p_domain_stats(self, num_pages, tld_filter = None):
		"""
		determines basic stats for the number of 3p domains contacted per-page

		note this is distinct domain+pubsuffix, not fqdns (e.g. 'sub.example.com'
			and sub2.example.com' only count as 'example.com')

		if tracker_domains have been set the stats will reflect only
			third-parties which have crossed the threshold (see
			get_tracker_domains())
		"""
		# each page id corresponds to a list of domains belonging to page elements
		page_id_to_domains_dict = {}

		# run query to get all page id, page domain, and element domain entries
		# there is no third-party filter so each page will have at least one
		#	entry for first-party domain
		for row in self.sql_driver.get_page_id_3p_element_domain_pairs(tld_filter):
			page_id = row[0]
			element_domain = row[1]

			# if the page id is not yet seen enter the current element as a fresh list
			#	otherwise, we add to the existing list
			# in both cases, if there is a tracker_domain list we only add
			#	domains that are in the list
			if page_id not in page_id_to_domains_dict:
				if self.tracker_domains:
					if element_domain in self.tracker_domains:
						page_id_to_domains_dict[page_id] = [element_domain]
				else:
					page_id_to_domains_dict[page_id] = [element_domain]
			else:
				if self.tracker_domains:
					if element_domain in self.tracker_domains:
						page_id_to_domains_dict[page_id] = page_id_to_domains_dict[page_id] + [element_domain]
				else:
					page_id_to_domains_dict[page_id] = page_id_to_domains_dict[page_id] + [element_domain]

		# now we determine the number of domains each page is connected to by
		#	looking at len of list of 3p domains
		per_page_3p_element_counts = []
		for page_id in page_id_to_domains_dict:
			per_page_3p_element_counts.append(len(page_id_to_domains_dict[page_id]))

		# pages that have no 3p elements are not yet in our counts
		# 	so for all uncounted pages we add in zeros
		uncounted_pages = num_pages - len(per_page_3p_element_counts)
		while uncounted_pages > 0:
			uncounted_pages -= 1
			per_page_3p_element_counts.append(0)

		# mean and median should always be ok
		mean = statistics.mean(per_page_3p_element_counts)
		median = statistics.median(per_page_3p_element_counts)

		# but mode can throw an error, so catch here
		try:
			mode = statistics.mode(per_page_3p_element_counts)
		except:
			mode = None

		return(mean, median, mode)
	# get_3p_domain_stats

	def get_3p_cookie_stats(self, num_pages, tld_filter = None):
		"""
		determines basic stats for the number of 3p cookies contacted per-page
			note that a single 3p may set more than one cookie

		if tracker_domains have been set the stats will reflect only
			third-parties which have crossed the threshold (see
			get_tracker_domains())
		"""
		# each page id corresponds to a list of cookie ids
		page_id_to_cookie_id_dict = {}

		# run query to get all page id, 3p cookie id, 3p cookie domain entries
		for row in self.sql_driver.get_page_id_3p_cookie_id_3p_cookie_domain(tld_filter):
			page_id = row[0]
			cookie_id = row[1]
			cookie_domain = row[2]

			# if the page id is not yet seen enter the current cookie id as a fresh list
			#	otherwise, we add to the existing list
			# in both cases, if there is a tracker_domain list we do not count cookies
			#	set by domains which are not trackers
			if page_id not in page_id_to_cookie_id_dict:
				if self.tracker_domains:
					if cookie_domain in self.tracker_domains:
						page_id_to_cookie_id_dict[page_id] = [cookie_id]
				else:
					page_id_to_cookie_id_dict[page_id] = [cookie_id]
			else:
				if self.tracker_domains:
					if cookie_domain in self.tracker_domains:
						page_id_to_cookie_id_dict[page_id] = page_id_to_cookie_id_dict[page_id] + [cookie_id]
				else:
					page_id_to_cookie_id_dict[page_id] = page_id_to_cookie_id_dict[page_id] + [cookie_id]

		# determine the number of 3p cookies each page has by looking at len of
		#	list of cookie ids
		per_page_3p_cookie_counts = []
		for page_id in page_id_to_cookie_id_dict:
			per_page_3p_cookie_counts.append(len(page_id_to_cookie_id_dict[page_id]))

		# pages that have no 3p cookies are not yet in our counts
		# 	so for all uncounted pages we add in zeros
		uncounted_pages = num_pages - len(per_page_3p_cookie_counts)
		while uncounted_pages > 0:
			uncounted_pages -= 1
			per_page_3p_cookie_counts.append(0)

		# mean and median should always be ok
		mean = 
statistics.mean(per_page_3p_cookie_counts) median = statistics.median(per_page_3p_cookie_counts) # but mode can throw an error, so catch here try: mode = statistics.mode(per_page_3p_cookie_counts) except: mode = None return(mean, median, mode) # get_3p_cookie_stats ##################### # REPORT GENERATORS # ##################### def generate_db_summary_report(self): """ outputs and stores report of basic data about how many records in db, etc. """ print('\t================') print('\t General Summary') print('\t================') csv_rows = [] total_pages_ok = self.sql_driver.get_pages_ok_count() print("\t\tTotal Pages OK:\t\t\t%s" % total_pages_ok) csv_rows.append(('Total Pages OK',total_pages_ok)) total_pages_noload = self.sql_driver.get_pages_noload_count() total_pages_attempted = total_pages_ok + total_pages_noload print("\t\tTotal Pages FAIL:\t\t%s" % total_pages_noload) csv_rows.append(('Total Pages FAIL', total_pages_noload)) print("\t\tTotal Pages Attempted:\t\t%s" % total_pages_attempted) csv_rows.append(('Total Pages Attempted',total_pages_attempted)) percent_pages_OK = (total_pages_ok/total_pages_attempted)*100 print("\t\t%% Pages OK:\t\t\t%.2f%%" % round(percent_pages_OK,self.num_decimals)) csv_rows.append(('% Pages OK', round(percent_pages_OK,self.num_decimals))) print('\t\t---') total_errors = self.sql_driver.get_total_errors_count() print("\t\tTotal Errors:\t\t\t%s" % total_errors) csv_rows.append(('Total Errors', total_errors)) print('\t\t---') total_3p_cookies = self.sql_driver.get_total_cookie_count(is_3p = True) print("\t\tTotal 3P Cookies:\t\t%s" % total_3p_cookies) csv_rows.append(('Total Cookies', total_3p_cookies)) print('\t\t---') # see if we have both 1p/3p requests, if so show stats for all total_1p_elements = self.sql_driver.get_total_request_count(party='first') if total_1p_elements > 0: total_elements = self.sql_driver.get_total_request_count() print("\t\tTotal Elements Requested:\t%s" % total_elements) csv_rows.append(('Total Elements 
Requested', total_elements)) total_elements_received = self.sql_driver.get_total_request_count(received = True) print("\t\tTotal Elements Received:\t%s" % total_elements_received) csv_rows.append(('Total Elements Received', total_elements_received)) percent_element_received = (total_elements_received/total_elements)*100 print('\t\tTotal %% Elements Received:\t%.2f%%' % percent_element_received) csv_rows.append(('Total % Elements Received', round(percent_element_received,self.num_decimals))) print('\t\t---') # only 3p request/receive info - we always do this total_3p_elements = self.sql_driver.get_total_request_count(party='third') print("\t\t3P Elements Requested:\t\t%s" % total_3p_elements) csv_rows.append(('3P Elements Requested', total_3p_elements)) # avoid divide-by-zero if no 3p elements if total_3p_elements > 0: total_3p_elements_received = self.sql_driver.get_total_request_count(received = True, party='third') print("\t\t3P Elements Received:\t\t%s" % total_3p_elements_received) csv_rows.append(('3P Elements Received', total_3p_elements_received)) percent_3p_element_received = (total_3p_elements_received/total_3p_elements)*100 print('\t\t3P %% Elements Received:\t\t%.2f%%' % percent_3p_element_received) csv_rows.append(('3P % Elements Received', round(percent_3p_element_received,self.num_decimals))) print('\t\t'+'-'*40) self.write_csv('db_summary.csv', csv_rows) # generate_db_summary_report def generate_stats_report(self): """ High level stats """ print('\t=============================') print('\t Processing High-Level Stats ') print('\t=============================') for tld in self.top_tlds: csv_rows = [] if tld[0]: tld_filter = tld[0] file_name = tld[0]+'-stats.csv' else: tld_filter = None file_name = 'stats.csv' # page info total_pages = self.sql_driver.get_complex_page_count(tld_filter) total_pages_percent = (total_pages/self.get_pages_ok_count)*100 total_pages_elements = self.sql_driver.get_complex_page_count(tld_filter, 'elements', 
self.tracker_domains) percent_with_elements = (total_pages_elements/total_pages)*100 total_pages_cookies = self.sql_driver.get_complex_page_count(tld_filter, 'cookies', self.tracker_domains) percent_with_cookies = (total_pages_cookies/total_pages)*100 total_pages_js = self.sql_driver.get_complex_page_count(tld_filter, 'javascript', self.tracker_domains) percent_with_js = (total_pages_js/total_pages)*100 total_pages_ssl = self.sql_driver.get_pages_ok_count(is_ssl = True) percent_pages_ssl = (total_pages_ssl/total_pages)*100 # elements info total_elements_received = self.sql_driver.get_total_request_count(received = True) total_elements_received_ssl = self.sql_driver.get_total_request_count(received = True, is_ssl = True) total_elements_received_1p = self.sql_driver.get_total_request_count(received = True, party='first') total_elements_received_1p_ssl = self.sql_driver.get_total_request_count(received = True, party='first', is_ssl = True) total_elements_received_3p = self.sql_driver.get_total_request_count(received = True, party='third') total_elements_received_3p_ssl = self.sql_driver.get_total_request_count(received = True, party='third', is_ssl = True) all_load_times = self.sql_driver.get_pages_load_times() all_load_times_sum = 0 for load_time in all_load_times: all_load_times_sum += load_time average_page_load_time = all_load_times_sum/len(all_load_times) if self.tracker_threshold: filter_depth = self.tracker_threshold else: filter_depth = 'No Filter Used' domain_stats = self.get_3p_domain_stats(total_pages, tld_filter) domain_mean = domain_stats[0] domain_median = domain_stats[1] domain_mode = domain_stats[2] cookie_stats = self.get_3p_cookie_stats(total_pages, tld_filter) cookie_mean = cookie_stats[0] cookie_median = cookie_stats[1] cookie_mode = cookie_stats[2] csv_rows.append(('N Pages Loaded', total_pages)) csv_rows.append(('% of all Pages',total_pages_percent)) csv_rows.append(('% Pages SSL', round(percent_pages_ssl, self.num_decimals))) csv_rows.append(('N 
Elements Received', total_elements_received)) csv_rows.append(('% Elements Received SSL', round((total_elements_received_ssl/total_elements_received)*100,self.num_decimals))) csv_rows.append(('N 1P Elements Received', total_elements_received_1p)) csv_rows.append(('% 1P Elements Received SSL', round((total_elements_received_1p_ssl/total_elements_received_1p)*100,self.num_decimals))) csv_rows.append(('N 3P Elements Received', total_elements_received_3p)) csv_rows.append(('% 3P Elements Received SSL', round((total_elements_received_3p_ssl/total_elements_received_3p)*100,self.num_decimals))) csv_rows.append(('Average Page Load Time (ms)', round(average_page_load_time,self.num_decimals))) csv_rows.append(('% w/3p Element',round(percent_with_elements,self.num_decimals))) csv_rows.append(('% w/3p Cookie',round(percent_with_cookies,self.num_decimals))) csv_rows.append(('% w/3p Javascript',round(percent_with_js,self.num_decimals))) csv_rows.append(('Mean 3p Domains',round(domain_mean,self.num_decimals))) csv_rows.append(('Median 3p Domains',domain_median)) csv_rows.append(('Mode 3p Domains',domain_mode)) csv_rows.append(('Mean 3p Cookies',round(cookie_mean,self.num_decimals))) csv_rows.append(('Median 3p Cookies',cookie_median)) csv_rows.append(('Mode 3p Cookies',cookie_mode)) csv_rows.append(('Filter Depth Used',filter_depth)) self.write_csv(file_name,csv_rows) # generate_stats_report def generate_aggregated_tracking_attribution_report(self): """ generates ranked list of which entities collect data from the greatest number of pages ('data_flow_ownership.csv') - entities which have subsidiaries are ranked according to the pages their subsidiaries get data from as well - however, parent entities only get one hit on a page which has multiple subsidiaries present - for example, if a page has 'google analytics' and 'doubleclick' that is only one hit for 'google' also able to filter by tld """ print('\t======================================') print('\t Processing Aggregated 
Tracking Report ') print('\t======================================') for tld in self.top_tlds: csv_rows = [] csv_rows.append(('Percentage Pages Tracked','Owner','Owner Country','Owner Lineage')) # will need this value to determine percentages later on total_pages = self.sql_driver.get_complex_page_count(tld_filter=tld[0]) # list will have entry for each hit on a given entity all_owner_occurances = [] # each page id is a key which corresponds to a list of # ids for entities which own the 3p element domains page_to_element_owners = {} # this query may produce a large volume of results! results = self.sql_driver.get_all_page_id_3p_domain_owner_ids(tld_filter=tld[0]) # for each result we either create a new list, or extend the existing one # with the ids of the owners of the 3p elements for item in results: page_id = item[0] element_owner_id = item[1] if page_id not in page_to_element_owners: page_to_element_owners[page_id] = [element_owner_id] else: page_to_element_owners[page_id] = page_to_element_owners[page_id] + [element_owner_id] # now that we have ids for each page, we can look up the lineage # to create the aggregate measure of how often entities appear for item in page_to_element_owners: # this is a set so items which appear more than once only get counted once # reset this for each page page_domain_owners = set() # we are operating on a list of ids which correspond to the owners of domains which get the data for page_3p_owner_id in page_to_element_owners[item]: # for each domain owner we also count all of its parents by getting the lineage for lineage_id in self.get_domain_owner_lineage_ids(page_3p_owner_id): page_domain_owners.add((lineage_id, self.domain_owners[lineage_id]['owner_name'])) # we have finished processing for this page so we add the owner ids to the full list for owner_id in page_domain_owners: all_owner_occurances.append(owner_id) # write out data to csv for item in self.get_most_common_sorted(all_owner_occurances): # we want to specify the 
parent name for each item, or if there is no parent, identify as such parent_id = self.domain_owners[item[0][0]]['parent_id'] if parent_id: parent_name = self.domain_owners[parent_id]['owner_name'] else: parent_name = '' csv_rows.append(( round((item[1]/total_pages)*100,2), item[0][1], self.domain_owners[item[0][0]]['country'], self.get_domain_owner_lineage_combined_string(item[0][0]) ) ) # set file name prefix when doing tld-bounded report if tld[0]: file_name = tld[0]+'-aggregated_tracking_attribution.csv' else: file_name = 'aggregated_tracking_attribution.csv' # done! self.write_csv(file_name,csv_rows) # generate_aggregated_tracking_attribution_report def generate_aggregated_3p_ssl_use_report(self): """ this report tells us the percentage of requests made to a given third-party are encrypted """ print('\t=========================================') print('\t Processing Aggregated 3P SSL Use Report ') print('\t=========================================') csv_rows = [] domain_owners_ssl_use_dict = {} for item in self.sql_driver.get_3p_element_domain_owner_id_ssl_use(): child_domain_owner_id = item[0] is_ssl = item[1] for domain_owner_id in self.get_domain_owner_lineage_ids(child_domain_owner_id): if domain_owner_id not in domain_owners_ssl_use_dict: domain_owners_ssl_use_dict[domain_owner_id] = [is_ssl] else: domain_owners_ssl_use_dict[domain_owner_id] = domain_owners_ssl_use_dict[domain_owner_id] + [is_ssl] for domain_owner_id in domain_owners_ssl_use_dict: csv_rows.append(( round(100*(sum(domain_owners_ssl_use_dict[domain_owner_id])/len(domain_owners_ssl_use_dict[domain_owner_id])),self.num_decimals), self.domain_owners[domain_owner_id]['owner_name'], self.domain_owners[domain_owner_id]['country'], self.get_domain_owner_lineage_combined_string(domain_owner_id) )) # sort results by owner, note is upper then lower case # would cause code bloat to do otherwise, but worth considering csv_rows.sort(key=itemgetter(1)) # now sort by percentage of encrypted requests 
descending csv_rows.sort(key=itemgetter(0),reverse=True) # insert header row after sort csv_rows[0] = ('Percent Requests Encrypted','Owner','Owner Country','Owner Lineage') # done! self.write_csv('3p_ssl_use.csv',csv_rows) # generate_aggregated_3p_ssl_use_report def generate_per_page_data_flow_report(self): """ generates a csv which has information on data flows for each page note this file may be very large and is disabled by default """ print('\t======================================') print('\t Processing Per-Page Data Flow Report ') print('\t======================================') file_name = 'per_page_data_flow.csv' csv_rows = [] csv_rows.append(('Final URL','3P Domain','Owner','Owner Country','Owner Lineage')) for item in self.sql_driver.get_all_pages_3p_domains_and_owners(): # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[3] != None: csv_rows.append(( item[1], item[2], self.domain_owners[item[3]]['owner_name'], self.domain_owners[item[3]]['country'], self.get_domain_owner_lineage_combined_string(item[3]) )) else: csv_rows.append((item[1],item[2],'Unknown','','')) self.write_csv(file_name,csv_rows) # generate_per_page_data_flow_report def generate_3p_domain_report(self): """ this queries the db to get all elements, domains, and domain owners next they are counted to find the most common and formatted to csv rows and returned """ print('\t==============================') print('\t Processing 3P Domains Report ') print('\t==============================') for tld in self.top_tlds: csv_rows = [] csv_rows.append(('Percent Total','Domain','Owner','Owner Country', 'Owner Lineage')) if tld[0]: tld_filter = tld[0] file_name = tld[0]+'-3p_domains.csv' else: tld_filter = None file_name = '3p_domains.csv' total_pages = tld[1] all_3p_domains = [] for item in self.sql_driver.get_3p_domain_owners(tld_filter): all_3p_domains.append((item[1],item[2])) # if num_results is None we get everything, otherwise stops at limit for item in 
self.get_most_common_sorted(all_3p_domains)[:self.num_results]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[0][1] != None: owner_name = self.domain_owners[item[0][1]]['owner_name'] owner_country = self.domain_owners[item[0][1]]['country'] owner_lineage = self.get_domain_owner_lineage_combined_string(item[0][1]) else: owner_name = 'Unknown' owner_country = '' owner_lineage = '' csv_rows.append(( round((item[1]/total_pages)*100,self.num_decimals), item[0][0], owner_name, owner_country, owner_lineage )) self.write_csv(file_name,csv_rows) # generate_3p_domain_report def generate_3p_element_report(self,element_type=None): """ this queries the db to get all elements, domains, or domain owners next they are counted to find the most common and formatted to csv rows and returned """ if element_type == 'javascript': print('\t=================================') print('\t Processing 3P Javascript Report ') print('\t=================================') elif element_type == 'image': print('\t=============================') print('\t Processing 3P Images Report ') print('\t=============================') else: print('\t==============================') print('\t Processing 3P Element Report ') print('\t==============================') for tld in self.top_tlds: total_pages = tld[1] csv_rows = [] csv_rows.append(('Percent Total','Element','Extension','Type','Domain','Owner','Owner Country','Owner Lineage')) if tld[0]: tld_filter = tld[0] if element_type: file_name = tld[0]+'-3p_'+element_type+'.csv' else: file_name = tld[0]+'-3p_element.csv' else: tld_filter = None if element_type: file_name = '3p_'+element_type+'.csv' else: file_name = '3p_element.csv' all_3p_elements = [] for item in self.sql_driver.get_3p_elements(tld_filter, element_type): # we need to drop off the first element returned here # perhaps tho it should happen insql? 
all_3p_elements.append((item[1],item[2],item[3],item[4],item[5])) # if num_results is None we get everything, otherwise stops at limit for item in self.get_most_common_sorted(all_3p_elements)[:self.num_results]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[0][4] != None: owner_name = self.domain_owners[item[0][4]]['owner_name'] owner_country = self.domain_owners[item[0][4]]['country'] owner_lineage = self.get_domain_owner_lineage_combined_string(item[0][4]) else: owner_name = 'Unknown' owner_country = '' owner_lineage = '' csv_rows.append(( round((item[1]/total_pages)*100,self.num_decimals), item[0][0], item[0][1], item[0][2], item[0][3], owner_name, owner_country, owner_lineage )) self.write_csv(file_name,csv_rows) # generate_3p_element_report def generate_data_transfer_report(self): """ these reports tell us how much data was transferred across several dimensions """ print('\t==================================') print('\t Processing Data Transfer Reports ') print('\t==================================') for tld in self.top_tlds: # set up filter and file names if tld[0]: tld_filter = tld[0] summary_file_name = tld[0]+'-data_xfer_summary.csv' domain_file_name = tld[0]+'-data_xfer_by_domain.csv' aggregated_file_name = tld[0]+'-data_xfer_aggregated.csv' else: tld_filter = None summary_file_name = 'data_xfer_summary.csv' domain_file_name = 'data_xfer_by_domain.csv' aggregated_file_name = 'data_xfer_aggregated.csv' # get the data from db, tuple of (element_domain, size, is_3p (boolean), domain_owner_id) element_sizes = self.sql_driver.get_element_sizes(tld_filter=tld_filter) # initialize vars first_party_data = 0 third_party_data = 0 total_data = 0 # need Counter object, allows sorting later domain_data = collections.Counter() owner_data = collections.Counter() # process each row for item in element_sizes: element_domain = item[0] element_size = item[1] element_is_3p = item[2] domain_owner_id = item[3] # this is the measure of 
all data downloaded total_data += element_size # measures for third and first party data if element_is_3p: third_party_data += element_size else: first_party_data += element_size # data by domain, increment if already in there, otherwise new entry if element_domain in domain_data: domain_data[element_domain] += element_size else: domain_data[element_domain] = element_size # only if we know the owner, increment if domain_owner_id: for lineage_id in self.get_domain_owner_lineage_ids(domain_owner_id): if lineage_id in owner_data: owner_data[lineage_id] += element_size else: owner_data[lineage_id] = element_size # output data to csv summary_data_csv = [] summary_data_csv.append(('Party','Percent Total','Data Transfered (bytes)')) summary_data_csv.append(('All','100',total_data)) summary_data_csv.append(( 'First', round((first_party_data/total_data)*100, self.num_decimals), first_party_data)) summary_data_csv.append(( 'Third', round((third_party_data/total_data)*100, self.num_decimals), third_party_data)) self.write_csv(summary_file_name, summary_data_csv) # sort and output ranked data domain_data = domain_data.most_common() domain_data.sort() domain_data.sort(reverse=True, key=lambda item:item[1]) # for csv data domain_data_csv = [] domain_data_csv.append(('Percent Total','Domain','Data Transfered (bytes)')) # if num_results is None we get everything, otherwise stops at limit for item in domain_data[:self.num_results]: domain_data_csv.append(( round((item[1]/total_data)*100,self.num_decimals), item[0], item[1])) self.write_csv(domain_file_name, domain_data_csv) owner_data = self.get_most_common_sorted(owner_data) owner_data_csv = [] owner_data_csv.append(('Percent Total','Owner','Owner Country','Owner Lineage','Data Transfered (bytes)')) # get results for all known owners for item in owner_data: owner_data_csv.append(( round((item[1]/total_data)*100,self.num_decimals), self.domain_owners[item[0]]['owner_name'], self.domain_owners[item[0]]['country'], 
self.get_domain_owner_lineage_combined_string(item[0]), item[1] )) self.write_csv(aggregated_file_name, owner_data_csv) # generate_data_transfer_report def get_3p_use_data(self,tld_filter=None): """" For some domains we know what they are used for on a first-party basis (eg marketing). This function examines the data we have collected in order to determine what percentage of pages include a request to a third-party domain with a given use, how many such requests are made on a per-use basis per-page, and finally, what percentage of requests per-page set a third-party cookie. Data is returned as a dict, the first field of which is a set of all the uses we know of. """ # we first need to create a dict whereby each domain # corresponds to a list of known uses # domains with no known uses are not in the list # # IMPORTANT NOTE: # some domains may have several uses! domain_to_use_map = {} # a list of all known uses all_uses = set() # we read this from our normal domain_owners file infile = open('./webxray/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8') domain_data = json.load(infile) infile.close() # process all entries from the domain_owners file for item in domain_data: for domain in item['domains']: # if we have uses, enter them with domain if len(item['uses']) > 0: domain_to_use_map[domain] = item['uses'] # make sure we have the uses in all_uses for use in item['uses']: all_uses.add(use) # now that our domain to use mapping is done we have to # process the actual data! 
# for each page, create a list of the set of domains # which set a cookie # # note that due to currently unresolved chrome issues we sometimes # can get cookies which don't have a corresponding 3p request # this approach handles that gracefully page_cookie_domains = {} for page_id, cookie_domain in self.sql_driver.get_page_id_3p_cookie_domain_pairs(tld_filter): if page_id not in page_cookie_domains: page_cookie_domains[page_id] = [cookie_domain] else: page_cookie_domains[page_id] = page_cookie_domains[page_id] + [cookie_domain] # next, for each page we want a list of uses for domains and if # that domain corresponds to a cookie being set # NOTE: the same use may occur many times, this is desired # as it gives us our counts later on page_3p_uses = {} for page_id, element_domain in self.sql_driver.get_page_id_3p_element_domain_pairs(tld_filter): # if this 3p domain has a known use we add it to a list of uses keyed to page id if element_domain in domain_to_use_map: # check if the domain of this element has a cookie for this page if page_id in page_cookie_domains and element_domain in page_cookie_domains[page_id]: sets_cookie = True else: sets_cookie = False # add in a tuple of (use,sets_cookie) to a list for this page_id for use in domain_to_use_map[element_domain]: if page_id not in page_3p_uses: page_3p_uses[page_id] = [(use,sets_cookie)] else: page_3p_uses[page_id] = page_3p_uses[page_id] + [(use,sets_cookie)] # determine how often requests for a give use are encrypted with ssl # - note that on the same page multiple requests for a single use may be made # and each request may or may not be ssl use_ssl = {} use_total = {} total_classified = 0 for domain,is_ssl in self.sql_driver.get_3p_element_domain_ssl_use(): # only analyze domains we know the use for if domain in domain_to_use_map: total_classified += 1 # each domain may have several uses, add for all for use in domain_to_use_map[domain]: # increment count of ssl usage if is_ssl: if use not in use_ssl: 
use_ssl[use] = 1 else: use_ssl[use] = use_ssl[use] + 1 # keep track of total occurances of this use if use not in use_total: use_total[use] = 1 else: use_total[use] = use_total[use] + 1 # for each use we will produce summary counts, we # initialize everyting to zero here total_pages_w_use = {} total_use_occurances = {} total_use_occurances_w_cookie = {} for use in all_uses: total_pages_w_use[use] = 0 total_use_occurances[use] = 0 total_use_occurances_w_cookie[use] = 0 # process each page and update the relevant counts for page_id in page_3p_uses: # we only want to count use once per-page, so # create a set and add to it as we go along this_page_use_set = set() # upate the use occurance counters for use, has_cookie in page_3p_uses[page_id]: this_page_use_set.add(use) total_use_occurances[use] = total_use_occurances[use] + 1 if has_cookie == True: total_use_occurances_w_cookie[use] = total_use_occurances_w_cookie[use] + 1 # each use in the set adds one to the total page count for use in this_page_use_set: total_pages_w_use[use] = total_pages_w_use[use] + 1 # the last step is to calculate the relevant percentages and averages # used to get percentage by use total_pages = self.sql_driver.get_complex_page_count(tld_filter) percentage_by_use = {} average_use_occurance_per_page = {} percentage_use_w_cookie = {} percentage_use_ssl = {} for use in all_uses: percentage_by_use[use] = 0 average_use_occurance_per_page[use] = 0 percentage_use_w_cookie[use] = 0 for use in total_pages_w_use: if total_pages_w_use[use] > 0: percentage_by_use[use] = 100*(total_pages_w_use[use]/total_pages) average_use_occurance_per_page[use] = total_use_occurances[use]/total_pages_w_use[use] percentage_use_w_cookie[use] = 100*(total_use_occurances_w_cookie[use]/total_use_occurances[use]) else: percentage_by_use[use] = None average_use_occurance_per_page[use] = None percentage_use_w_cookie[use] = None # conditional to account for cases where no instance of a given use is ssl if use in use_ssl: 
percentage_use_ssl[use] = 100*(use_ssl[use]/use_total[use]) else: percentage_use_ssl[use] = 0 # send back everyting as a keyed dict return({ 'all_uses' : all_uses, 'percentage_by_use' : percentage_by_use, 'average_use_occurance_per_page' : average_use_occurance_per_page, 'percentage_use_w_cookie' : percentage_use_w_cookie, 'percentage_use_ssl' : percentage_use_ssl }) # get_3p_use_data def generate_use_report(self): """ This function handles the process of generating a csv report which details what percentage of pages use third-party content for specific uses, the number of requests made for a given type of use on a per-page basis, and the percentage of such requests which correspond to a third-party cookie. """ print('\t==========================') print('\t Processing 3P Use Report ') print('\t==========================') use_data = self.get_3p_use_data() all_uses = use_data['all_uses'] percentage_by_use = use_data['percentage_by_use'] average_use_occurance_per_page = use_data['average_use_occurance_per_page'] percentage_use_w_cookie = use_data['percentage_use_w_cookie'] percentage_use_ssl = use_data['percentage_use_ssl'] csv_rows = [] csv_rows.append(('use category','percent pages with use','ave occurances per page with use','percentage of use with cookie', 'percentage of use ssl')) for use in sorted(all_uses): if percentage_by_use[use] != None: csv_rows.append(( use, round(percentage_by_use[use],self.num_decimals), round(average_use_occurance_per_page[use],self.num_decimals), round(percentage_use_w_cookie[use],self.num_decimals), round(percentage_use_ssl[use],self.num_decimals) )) else: csv_rows.append((use,None,None,None,None)) self.write_csv('3p_uses.csv', csv_rows) # generate_use_report def generate_network_report(self): """ this report generates data necessary for graph/network analysis by outputting a list of page domains and the elements/owners they connect to """ print('\t=========================') print('\t Processing Network Ties ') 
print('\t=========================') # put output here csv_rows = [] # header row for csv csv_rows.append(('Page Domain','3P Element Domain','3P Domain Owner','3P Domain Owner Country')) # sql_driver.get_network_ties returns a set of tuples in the format # (page domain, element domain, element domain owner id) # we just go through this data to produce the report for item in self.sql_driver.get_3p_network_ties(): # if a page has no elements, edge[1] will be 'None' so we skip it # an alternate approach would be to include as orphan nodes if item[1]: # this condition has to specify != None, b/c otherwise it will skip values of 0 if item[2] != None: csv_rows.append((item[0],item[1],self.domain_owners[item[2]]['owner_name'],self.domain_owners[item[2]]['country'])) else: csv_rows.append((item[0],item[1],'Unknown','')) self.write_csv('network.csv', csv_rows)
def store_results_from_queue(self, process_num):
    """
    Drain the server-side result queue: pull each pending result,
    hand it to store_result(), then delete it from the queue.
    Sleeps and re-polls whenever the queue is empty; never returns.
    """
    # dedicated connection to the central server config db
    from webxray.PostgreSQLDriver import PostgreSQLDriver
    result_db = PostgreSQLDriver('server_config')

    # seconds to pause between polls of an empty queue
    wait_time = 5

    # poll forever
    while True:
        queued = result_db.get_result_from_queue()

        # nothing pending: nap, then poll again
        if not queued:
            print(
                f'\t[p.{process_num}]\tπ΄ Going to sleep for {wait_time} seconds to wait for more tasks.'
            )
            time.sleep(wait_time)
            continue

        # unpack the queued record; the task_result payload arrives
        # base64-wrapped around a bz2-compressed json string
        target = queued['target']
        task_result = json.loads(
            bz2.decompress(base64.urlsafe_b64decode(
                queued['task_result'])).decode('utf-8'))

        if self.debug:
            print(
                f'\t[p.{process_num}]\tπ₯ Going to store result for {str(target)[:30]}'
            )

        # store_result also handles task queue management
        store_result = self.store_result({
            'target': target,
            'task': queued['task'],
            'task_result': task_result,
            'client_id': queued['client_id'],
            'client_ip': queued['client_ip'],
            'db_name': queued['mapped_db']
        })

        # this result is fully processed, drop it from the queue
        result_db.remove_result_from_queue(queued['result_id'])

        # FYI
        if store_result['success'] == True:
            print('\t[p.%s]\tπ Success: %s' % (process_num, target[:50]))
        else:
            print('\t[p.%s]\tπ Error: %s %s' % (process_num, target[:50], store_result['result']))

    # technically we never get here...
    result_db.close()
    return
def __init__(self, db_name, db_engine, num_tlds, num_results, tracker_threshold=None, flush_domain_owners=True, start_date=False, end_date=False):
    """
    Start-up tasks for report generation:

    - stash frequently used values on self
    - wire up the Utilities/Analyzer helpers and the global db connection
    - make sure the report output directory exists
    - load the domain owner mapping used throughout the reports
    - when per-tld sub-reports are requested, find the most common tlds
    - record the tracker threshold, if any (see documentation below for
      what the tracker threshold does)
    """
    # frequently reused values
    self.db_name           = db_name
    self.num_tlds          = num_tlds
    self.num_results       = num_results
    self.tracker_threshold = tracker_threshold

    # helper objects: general utilities and the analysis engine
    self.utilities = Utilities(db_name, db_engine)
    self.analyzer  = Analyzer(db_name, db_engine)

    # how many decimal places report values are rounded to
    self.num_decimals = 2

    # global db connection; an unsupported engine is fatal
    if db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        self.sql_driver = SQLiteDriver(db_name)
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        self.sql_driver = PostgreSQLDriver(db_name)
    else:
        print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
        quit()

    print('\t=============================')
    print('\t Checking Output Directories ')
    print('\t=============================')

    # creates a new directory if it doesn't exist already
    self.report_path = self.utilities.setup_report_dir(self.db_name)

    # owner information used in various places
    self.domain_owners = self.utilities.get_domain_owner_dict()

    # optionally compute sub-reports for the most frequent tlds
    if not self.num_tlds:
        self.top_tlds = [None]
    else:
        print('\t=====================')
        print('\t Getting top %s tlds' % self.num_tlds)
        print('\t=====================')
        print('\t\tProcessing...', end='', flush=True)
        self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
        print('done!')
        print('\t\tThe top tlds are:')
        for tld in self.top_tlds:
            if tld:
                print('\t\t |- %s' % tld)
# db_engine can be 'mysql', 'postgres', or 'sqlite'
# sqlite requires no configuration, but mysql and postgres
# need user/pw set up in the relevant driver in the
# ./webxray directory
db_engine = 'sqlite'

# set up database connection
if db_engine == 'mysql':
    from webxray.MySQLDriver import MySQLDriver
    sql_driver = MySQLDriver()
elif db_engine == 'sqlite':
    from webxray.SQLiteDriver import SQLiteDriver
    sql_driver = SQLiteDriver()
elif db_engine == 'postgres':
    from webxray.PostgreSQLDriver import PostgreSQLDriver
    sql_driver = PostgreSQLDriver()
else:
    # FIX: message previously read 'INVALED'; corrected to match the
    # 'INVALID DB ENGINE FOR %s, QUITTING!' wording used by every other
    # engine check in this codebase
    print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
    quit()

####################
# HELPER FUNCTIONS #
####################

def select_wbxr_db():
    """
    databases are stored with a prefix (default 'wbxr_'),
    this function helps select a database in interactive mode
    """
    # you can optionally specify a different prefix here by setting "db_prefix = '[PREFIX]'"
    wbxr_dbs = sql_driver.get_wbxr_dbs_list()
def run(self, pool_size):
	"""
	Manage the parallel processing of the url list using the python
	Pool class.

	The function first reads the list of urls out of the page_lists
	directory, cleans it for known issues (eg common binary files) and
	issues with idna encoding (tricky!), then the page list is mapped
	to the process_url function and executed in parallel.

	Args:
		pool_size: number of worker processes; defined in the
			run_webxray.py file, see details there

	Fixes over the prior version: the page-list file handle is now
	closed, an unknown db_engine fails loudly instead of raising
	NameError later, idna encoding failures skip the url instead of
	crashing the whole run, and the Pool is closed/joined.
	"""
	# the list of urls MUST be in the page_lists directory!
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
		exit()

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# previously an unrecognized engine fell through and crashed
		# later with an unbound sql_driver; fail loudly and early instead
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# this list gets mapped to the Pool, very important!
	urls_to_process = set()

	# simple counter used solely for updates to CLI
	count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for url in url_list:
		# skip lines that are comments (only the first char is tested)
		if "#" in url[0]: continue

		count += 1

		# only do lines starting with https?://
		if not (re.match('^https?://.+', url)):
			print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
			continue

		# non-ascii domains will crash phantomjs, so we need to convert
		# them to idna/ascii/utf-8; this requires splitting apart the
		# url, converting the domain to idna, and pasting it all back together
		try:
			split_url = urlsplit(url.strip())
			idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
			url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))
		except UnicodeError:
			# some netlocs cannot be idna-encoded; skip the single url
			# rather than crash the entire run
			print("\t\t%s | %-50s Could not be idna encoded, Skipping." % (count, url[:50]))
			continue

		# if it is a m$ office or other doc, skip
		if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
			continue

		# skip if in db already unless we are doing a timeseries
		if self.allow_timeseries == False:
			if sql_driver.page_exists(url):
				print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
				continue

		# only add if not in list already
		if url not in urls_to_process:
			print("\t\t%s | %-50s Adding." % (count, url[:50]))
			urls_to_process.add(url)
		else:
			print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

	# close the page-list file (was previously leaked) and the db connection
	url_list.close()
	sql_driver.close()

	print('\t----------------------------------')
	print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
	print('\t\tBrowser(s) are %s' % self.browser_types)
	print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	# for macOS (darwin) we must specify start method as 'forkserver'
	# this is essentially voodoo to ward off evil spirits which
	# appear when large pool sizes are used on macOS
	# get_start_method must be set to 'allow_none', otherwise upon
	# checking the method it gets set (!) - and if we then get/set again
	# we get an error
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	myPool = multiprocessing.Pool(pool_size)
	myPool.map(self.process_url, urls_to_process)

	# release the worker processes once the map completes
	myPool.close()
	myPool.join()

	# FYI
	self.print_runtime()
def process_url(self, url):
	"""
	Take a specified url, load it in the browser (currently phantomjs)
	and get json-formatted output with relevant request data, etc.

	The output_store class then puts this data in the db for later
	analysis.

	Fixes over the prior version: the sql connection is now closed on
	the all-success path (it previously leaked), the bare except is
	narrowed to Exception so ctrl-c still works, `is None` replaces
	`== None`, and an unknown db_engine fails loudly.
	"""
	# set up sql connection used to log errors and do timeseries checks
	if self.db_engine == 'mysql':
		from webxray.MySQLDriver import MySQLDriver
		sql_driver = MySQLDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	elif self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# previously fell through leaving sql_driver unbound
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# output store does the heavy lifting of analyzing browser output and storing to db
	output_store = OutputStore(self.db_engine, self.db_name)

	# support for loading same page with multiple browsers - purposefully undocumented
	for browser_type in self.browser_types:
		# import and set up specified browser driver
		# note we need to set up a new browser each time to
		# get a fresh profile
		if browser_type == 'phantomjs':
			browser_driver = PhantomDriver()
		elif browser_type == 'chrome':
			browser_driver = ChromeDriver(ua=self.chrome_ua)

		# support for timeseries collections - purposefully undocumented
		if self.allow_timeseries:
			page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
			if page_last_accessed_browser_type:
				time_diff = datetime.now()-page_last_accessed_browser_type[0]
				if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
					print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
					continue

		# attempt to load the page, fail gracefully; narrowed from a
		# bare except so KeyboardInterrupt/SystemExit still propagate
		try:
			browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
		except Exception:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# if there was a problem browser_output will be None
		if browser_output is None:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# attempt to store the output
		if output_store.store(url, browser_output):
			print('\t\t%-50s Success with %s' % (url[:50],browser_type))
		else:
			print('\t\t%-50s Fail with %s' % (url[:50],browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

	# close the connection on the success path too (previously leaked)
	sql_driver.close()
	return
class Server:
	"""
	The server runs as a flask application which is served by NGINX.
	The server loads its configuration data whenever it is called and
	thus changes made to wbxr_server_config are immediately made active.

	The server manages several primary tasks:
		- filtering incoming requests based on whitelisted client_ips
		  (stored in server_config db)
		- responding to requests for scanning tasks from remote scan nodes
		- either immediately processing and storing, or queuing, results
		  from scans

	TODO Items:
		- currently we rely on ip whitelisting, but we could move to an
		  authentication scheme for clients with unstable ip addrs
	"""

	def __init__(self):
		"""
		Set up our server configuration here. Note we store config
		details in server_config.json because __init__ is run
		each time a worker processes a request this means we can
		modify our config on the fly without having to restart
		the server
		"""
		# connect to server config db to get client_config
		self.server_sql_driver = PostgreSQLDriver('server_config')

		# important parts of config currently are to generate our
		# whitelist of allowed ips and to map our clients to their
		# respective databases
		self.whitelisted_ips = []
		self.client_id_to_db = {}
		for client in self.server_sql_driver.get_client_configs():
			if client['live']:
				if self.server_sql_driver.check_db_exist(client['mapped_db']):
					self.whitelisted_ips.append(client['client_ip'])
					self.client_id_to_db[client['client_id']] = client['mapped_db']
				else:
					print(f"Database {client['mapped_db']} for client {client['client_id']} does not exist")
	# __init__

	def get_client_task(self, client_ip, client_id):
		"""
		We determine what the client should be doing when it sends us
		a 'READY' message. If we find a task in our queue we send it
		back, otherwise we send 'wait' and the client will contact us
		again.

		Returns a dict with at least a 'task' key.

		Fixes over the prior version: the connection close lived after
		unconditional returns (dead code -> connection leak) and an
		empty queue fell off the end returning None (serialized as
		JSON null); both are corrected here.
		"""
		# connect to appropriate db for this client, if none found
		# return wait command
		if client_id not in self.client_id_to_db:
			print('client_id not in client_id_to_db list, returning wait command')
			return {'task': 'wait'}

		sql_driver = PostgreSQLDriver(self.client_id_to_db[client_id])
		try:
			# get config for this db
			config = sql_driver.get_config()

			# get client config
			client_config = {}
			for item in config:
				if 'client' in item:
					client_config[item] = config[item]

			# if we have no items in task_queue we send a wait command
			if sql_driver.get_task_queue_length(max_attempts=config['max_attempts'], unlocked_only=True) == 0:
				print('β Returning command to wait.')
				return {'task': 'wait'}

			# if this fails we wait
			try:
				target, task = sql_driver.get_task_from_queue(max_attempts=config['max_attempts'], client_id=client_id)
			except:
				print('β Returning command to wait.')
				return {'task': 'wait'}

			if task == 'get_scan':
				print(f'π Returning command to scan {target}')
				return {'task': 'get_scan', 'target': target, 'client_config': client_config}
			elif task == 'get_crawl':
				print(f'π Returning command to crawl {target[:30]}...')
				return {'task': 'get_crawl', 'target': json.loads(target), 'client_config': client_config}
			elif task == 'get_policy':
				print(f'π Returning command to get_policy {target}')
				return {'task': 'get_policy', 'target': target, 'client_config': client_config}
			elif task == 'get_random_crawl':
				print(f'π Returning command to get_random_crawl {target}')
				return {'task': 'get_random_crawl', 'target': target, 'client_config': client_config}
			else:
				print('β Returning command to wait.')
				return {'task': 'wait'}
		finally:
			# guarantee the connection is released on every path
			sql_driver.close()
			del sql_driver
	# get_client_task

	def store_result(self, data):
		"""
		We've gotten data from a client, attempt to store it.

		Returns 'OK' on success, otherwise a 'FAIL...' string.

		Fixes over the prior version: the client_id membership check
		now happens *before* indexing client_id_to_db (previously an
		unknown client raised KeyError), the connection is closed on
		the not-in-queue path (previously leaked), the broken
		f-string/%-format mix (which printed a set repr) is repaired,
		and the unused config lookup is removed.
		"""
		# unpack params
		client_id = data['client_id']
		client_ip = data['client_ip']
		success = data['success']
		task = data['task']
		task_result = data['task_result']

		# we only load the json string if it is not a crawl
		if task != 'get_crawl':
			target = json.loads(data['target'])
		else:
			target = data['target']

		# reject unknown clients before touching the mapping
		if client_id not in self.client_id_to_db:
			return 'FAIL: client_id not in client_id_to_db list'

		# get db connection from config
		mapped_db = self.client_id_to_db[client_id]
		sql_driver = PostgreSQLDriver(mapped_db)

		# if we're not expecting this result we ignore it
		if not sql_driver.is_task_in_queue({'task': task, 'target': target}):
			sql_driver.close()
			del sql_driver
			return 'FAIL: task not in queue, ignoring'

		# if browser failed we increment attempts and log the error
		if success == False:
			print(f'π Error for {target}: {task_result}')

			# for times we don't want to retry, such as a rejected
			# redirect or network resolution failure, this could be expanded
			fail_cases = ['reached fail limit', 'rejecting redirect', 'did not find enough internal links']

			if task_result in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result:
				sql_driver.set_task_as_failed(target, task)
			else:
				sql_driver.unlock_task_in_queue(target, task)

			sql_driver.log_error({'client_id': client_id, 'target': target, 'task': task, 'msg': task_result})
			sql_driver.close()
			del sql_driver
			return 'FAIL'

		# we only need to put the result in the queue, allows us to
		# respond to clients faster and keep the results compressed
		self.server_sql_driver.add_result_to_queue({'client_id': client_id, 'client_ip': client_ip, 'mapped_db': mapped_db, 'target': target, 'task': task, 'task_result': task_result})

		# close out db connection and send back our response
		sql_driver.close()
		del sql_driver
		return 'OK'
	# store_result

	def process_request(self, request):
		"""
		Process requests from clients here, note we only process POST
		data and ignore GET data.

		IP whitelist checking is performed here, if we implement
		additional security checks that can be done here as well.

		Returns None for non-whitelisted ips, otherwise a bytes
		response (or the string 'FAIL' if the form had no known keys).
		"""
		# if we're running behind nginx/gunicorn we get the ip from the
		# headers otherwise only flask is running and we get ip from
		# request.remote_addr
		if 'X-Real-IP' in request.headers:
			client_ip = request.headers['X-Real-IP']
		else:
			client_ip = request.remote_addr

		# we whitelist the ips we accept commands from, ignore anything
		# not in the list
		if client_ip not in self.whitelisted_ips:
			# print('ip (%s) not whitelisted!' % client_ip)
			return

		# read the post data from the form
		form = request.form

		# default response is failed, gets overwritten on success
		response = 'FAIL'

		# client is sending us data, store it and send back result (OK/FAIL)
		if 'task_result' in form.keys():
			print(f'π¦ got data from {client_ip}')
			# store result can either process it or queue it based on config
			msg = self.store_result({'client_id': form['client_id'], 'client_ip': client_ip, 'success': json.loads(form['success']), 'target': form['target'], 'task': form['task'], 'task_result': form['task_result']})
			# tell the client what happened
			response = bytes(msg, 'utf8')

		# client is ready, send a command back
		if 'ready' in form.keys():
			print(f'πββοΈ got request for command from {client_ip}')
			command_set = json.dumps(self.get_client_task(client_ip, form['client_id']))
			response = bytes(command_set, 'utf8')

		# all done
		return (response)
	# process_request