# standard python libs
import os
import json
import statistics
import collections
from datetime import datetime

# custom db access class; this import path assumes the webxray package layout
# and may need adjusting for your project
from webxray.MySQLDriver import MySQLDriver


class Reporter:
    def __init__(self, db_name, num_tlds, num_results, tracker_threshold=0):
        self.db_name = db_name
        self.sql_driver = MySQLDriver(self.db_name)
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.startTime = datetime.now()
        self.pages_ok_count = self.sql_driver.pages_ok_count()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        self.setup_report_dir()

        print('\t===========================')
        print('\t Patching DB with Org Data ')
        print('\t===========================')

        # update the domains to their owners
        self.patch_org_data()
        print('\t\tSuccess!')

        # if num_tlds is 0 we don't care about sub-analysis of the tlds (which
        # takes time), so we just push in the wildcard with the total page count;
        # note that this is the *default* behavior
        if self.num_tlds == 0:
            self.top_tlds = []
            self.top_tlds.append(('*', self.pages_ok_count))
        # otherwise, we *do* care about the tlds, so get them
        else:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...')
            self.top_tlds = self.get_top_tlds(self.num_tlds)
            print('\t\tSuccess!')
            print('\t\tThe top tlds are:')
            for (tld, pages) in self.top_tlds:
                print('\t\t |- %s (%s)' % (tld, pages))

        # SPECIAL SAUCE, FOR EXPERTS: tracker domains!
        #
        # the idea is that you set a threshold for the number of sites a given
        # domain must be connected to - domains connecting to many sites may
        # correlate those visits via referer strings etc., so we call these
        # 'tracker domains'
        #
        # on a really large set of sites (e.g. 1M+) this works well, but on small
        # samples (e.g. 500) it doesn't work well at all, as known tracker domains
        # may only appear on a single site
        #
        # this is off by default, and unless you understand what you are doing...
        # DON'T USE THIS!
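        #
        # to make the threshold concrete (numbers here are hypothetical): on a
        # crawl of 500 sites with tracker_threshold=10, a third-party domain
        # linked by 400 distinct sites would be flagged as a tracker domain,
        # while one linked by only 2 sites would not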
        #
        # longer-term we may want to train off a bigger corpus to find tracker
        # domains and have them prepackaged
        #
        if tracker_threshold:
            print('\t=========================')
            print('\t Getting tracker domains ')
            print('\t=========================')
            print('\t\tProcessing...')
            self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
            print('\t\tSuccess!')
        else:
            # set to False so various downstream operations get skipped
            self.tracker_domains = False
    # end __init__

    #########################
    #   HELPERS, GENERAL    #
    #########################

    def setup_report_dir(self):
        # create the directory the reports go in if it does not already exist
        if not os.path.exists('./reports'):
            print('\t\tMaking global reports directory at ./reports.')
            os.makedirs('./reports')

        # set the global report_path, trimming off the wbxr_ prefix of the db name
        self.report_path = './reports/' + self.db_name[5:]

        # set up the subdir for this database analysis
        if not os.path.exists(self.report_path):
            print('\t\tMaking subdirectory for reports at %s' % self.report_path)
            os.makedirs(self.report_path)

        # just a notice
        print('\t\tStoring output in %s' % self.report_path)
    # setup_report_dir

    # we reuse this a lot
    def write_csv(self, file_name, csv_rows):
        full_file_path = self.report_path + '/' + file_name
        with open(full_file_path, 'w') as file_out:
            for row in csv_rows:
                file_out.write(row)
        print('\t\t' + '*' * 40)
        print('\t\tOutput written to %s' % full_file_path)
    # write_csv

    # just fyi
    def print_runtime(self):
        print('~=' * 40)
        print('Finished!')
        print('Time to process: ' + str(datetime.now() - self.startTime) + '\n')
        print('-' * 80)
    # end print_runtime

    # X-(
    def fatal(self, msg):
        print('FATAL ERROR: %s' % msg)
        print('EXITING.')
        exit()
    # fatal

    #############################
    #  HELPERS, DATABASE/INIT   #
    #############################

    def patch_org_data(self):
        # in order to analyze what entities receive user data, we need to update
        # the database with the domain ownership records we have stored previously
        #
        # we first clear out what is in there in case the new data has changed;
        # perhaps make this optional, as on big dbs it takes a while
        #
        print('\t\tFlushing extant org data...')
        self.sql_driver.reset_domains_orgs()

        # next we pull the org/domain pairings from the json file in the
        # resources dir and add them to the db
        print('\t\tPatching with new org data...')
        with open(os.path.join(os.path.dirname(__file__), './resources/org_domains/org_domains.json'), 'r') as raw_data:
            json_data = json.load(raw_data)

        # the default id for orgs is 1, so we advance from there
        org_id = 1
        for item in json_data:
            org_id += 1
            self.sql_driver.add_org(org_id, item['organization'], item['notes'], item['country'])
            for domain in item['domains']:
                self.sql_driver.update_domain_org(org_id, domain)
    # end patch_org_data

    def get_top_tlds(self, limit, type='tld'):
        # finds the most common tlds from all the pages;
        # type defaults to 'tld', but 'pubsuffix' also works
        #
        # we have to do some extra sorting because most_common() orders ties
        # arbitrarily, which breaks diff tests
        #
        # returns a list of (tld, count) tuples
        tlds = []

        for row in self.sql_driver.get_all_tlds(type):
            tlds.append(row[0])

        top_tlds = collections.Counter(tlds).most_common()

        # sort alphabetical
        top_tlds.sort()

        # sub-sort on number of occurrences
        top_tlds.sort(reverse=True, key=lambda item: item[1])

        # cut the array down to the limit
        top_tlds = top_tlds[0:limit]

        # push in the wildcard entry with the total page count
        top_tlds.insert(0, ('*', self.pages_ok_count))

        return top_tlds
    # end get_top_tlds
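    # for reference, a call such as get_top_tlds(3) yields a list shaped like
    # [('*', 5000), ('com', 3200), ('org', 450), ('net', 310)] - the counts here
    # are made up, but note the wildcard entry is always prepended, so the list
    # holds limit+1 tuples and downstream loops treat '*' as "all pages"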
    def get_tracker_domains(self, threshold):
        # NOTE: this first determines all pairings of page domains and element
        # domains; note this is then unique on SITES, not on PAGES,
        # e.g. if you have several pages from the same site these links only
        # count once
        #
        # RETURNS: list of domains which link at least the threshold number of sites
        all_domains = []
        for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs():
            all_domains.append(page_domain_element_domain[1])

        # count up all the pairs, convert to items() so we can process them as tuples
        domain_counts = collections.Counter(all_domains).items()

        # put the return values here
        tracker_domains = []

        # check against the threshold
        for domain_count in domain_counts:
            if domain_count[1] >= threshold:
                tracker_domains.append(domain_count[0])

        # EDGE CASE
        # likely due to a large threshold we have no tracker domains,
        # so we print a warning and log the error
        if len(tracker_domains) == 0:
            self.sql_driver.log_error('Analysis Warning', 'Tracker Threshold of %s resulted in no tracking domains.' % threshold)
            print('\t\t-----------WARNING-----------')
            print('\t\tTracker Threshold of %s resulted in no tracking domains.' % threshold)
            print('\t\t-----------------------------')

        return tracker_domains
    # get_tracker_domains
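    # design note: tracker_domains is returned as a plain list, and downstream
    # code (e.g. get_page_3p_stats) performs many 'in self.tracker_domains'
    # membership tests; on very large crawls converting the result to a set
    # would make those checks O(1) - left as a list here to preserve the
    # original behavior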
    #########################
    #   REPORTS, GENERAL    #
    #########################

    def header(self):
        # just outputs really basic data about how many records are in the db, etc.
        #
        print('\t=================')
        print('\t General Summary')
        print('\t=================')

        output_for_csv = []

        # write the csv header
        output_for_csv.append('"Item","Value"\n')

        total_pages_ok = self.sql_driver.pages_ok_count()
        print('\t\tTotal Pages OK:\t\t\t%s' % total_pages_ok)
        output_for_csv.append('"Total Pages OK","%s"\n' % total_pages_ok)

        total_pages_noload = self.sql_driver.pages_noload_count()
        total_pages_attempted = total_pages_ok + total_pages_noload
        print('\t\tTotal Pages FAIL:\t\t%s' % total_pages_noload)
        output_for_csv.append('"Total Pages FAIL","%s"\n' % total_pages_noload)

        print('\t\tTotal Pages Attempted:\t\t%s' % total_pages_attempted)
        output_for_csv.append('"Total Pages Attempted","%s"\n' % total_pages_attempted)

        percent_pages_OK = int((total_pages_ok / total_pages_attempted) * 100)
        print('\t\t%% Pages OK:\t\t\t%s%%' % percent_pages_OK)
        output_for_csv.append('"%% Pages OK","%s"\n' % percent_pages_OK)

        total_errors = self.sql_driver.total_errors_count()
        print('\t\tTotal Errors:\t\t\t%s' % total_errors)
        output_for_csv.append('"Total Errors","%s"\n' % total_errors)

        total_cookies = self.sql_driver.total_cookie_count()
        print('\t\tTotal Cookies:\t\t\t%s' % total_cookies)
        output_for_csv.append('"Total Cookies","%s"\n' % total_cookies)

        total_pages_with_cookies = self.sql_driver.pages_w_cookie_count()
        print('\t\tPages with Cookies:\t\t%s' % total_pages_with_cookies)
        output_for_csv.append('"Pages with Cookies","%s"\n' % total_pages_with_cookies)

        percent_with_cookies = (total_pages_with_cookies / total_pages_ok) * 100
        print('\t\t%% Pages with Cookies:\t\t%s%%' % int(percent_with_cookies))
        output_for_csv.append('"%% Pages with Cookies","%s"\n' % int(percent_with_cookies))

        total_elements = self.sql_driver.total_element_count()
        print('\t\tTotal Elements Requested:\t%s' % total_elements)
        output_for_csv.append('"Total Elements Requested","%s"\n' % total_elements)

        total_elements_received = self.sql_driver.total_element_count(received=True)
        print('\t\tTotal Elements Received:\t%s' % total_elements_received)
        output_for_csv.append('"Total Elements Received","%s"\n' % total_elements_received)

        percent_element_received = int((total_elements_received / total_elements) * 100)
        print('\t\t%% Elements Received:\t\t%s%%' % percent_element_received)
        output_for_csv.append('"%% Elements Received","%s"\n' % percent_element_received)

        total_pages_with_elements = self.sql_driver.pages_w_element_count()
        print('\t\tPages with Elements:\t\t%s' % total_pages_with_elements)
        output_for_csv.append('"Pages with Elements","%s"\n' % total_pages_with_elements)

        percent_with_elements = (total_pages_with_elements / total_pages_ok) * 100
        print('\t\t%% Pages with Elements:\t\t%s%%' % int(percent_with_elements))
        output_for_csv.append('"%% Pages With Elements","%s"\n' % int(percent_with_elements))

        self.write_csv('db_summary.csv', output_for_csv)
        print('\t' + '-' * 80 + '\n')
    # header

    def get_network_ties(self):
        # this report generates data necessary for graph/network analysis by
        # outputting a list of page domains and the elements/orgs they connect to
        print('\t=============================')
        print('\t Processing Network Ties ')
        print('\t=============================')

        # put output here
        output_for_csv = []

        # header row for the csv
        output_for_csv.append('"page_domain","3p_org","3p_domain"\n')

        # sql_driver.get_network_ties returns a set of tuples in the format
        # (page domain, page org, element domain, element org);
        # we just go through this data to produce the report
        # note: the report currently omits the page org, but that can be altered easily
        for edge in self.sql_driver.get_network_ties():
            # if a page has no elements, edge[2] will be None, so we skip it;
            # an alternate approach would be to include such pages as orphan nodes
            if edge[2]:
                output_for_csv.append('"%s","%s","%s"\n' % (edge[0], edge[3], edge[2]))

        # done!
        self.write_csv('network.csv', output_for_csv)
        print('\t' + '-' * 80 + '\n')
    # get_network_ties

    def get_summary_by_tld(self):
        print('\t=============================')
        print('\t Processing Summaries by TLD ')
        print('\t=============================')

        output_for_csv = []
        output_for_csv.append('"TLD","N","% TOTAL","N W/3PE","% W/3PE","N W/COOKIE","% W/COOKIE","N W/JS","% W/JS","3P DOMAIN MEAN","3P DOMAIN MEDIAN","3P DOMAIN MODE","TRACKER FILTER DEPTH"\n')

        # now do the per-tld numbers
        for tld in self.top_tlds:
            print('\t\tGetting summary for %s' % tld[0])
            if tld[0] != '*':
                tld_filter = tld[0]
            else:
                tld_filter = ''

            total_pages = self.sql_driver.get_complex_page_count(tld_filter)
            total_pages_percent = (total_pages / self.pages_ok_count) * 100

            total_pages_elements = self.sql_driver.get_complex_page_count(tld_filter, 'elements', self.tracker_domains)
            percent_with_elements = (total_pages_elements / total_pages) * 100

            total_pages_cookies = self.sql_driver.get_complex_page_count(tld_filter, 'cookies', self.tracker_domains)
            percent_with_cookies = (total_pages_cookies / total_pages) * 100

            total_pages_js = self.sql_driver.get_complex_page_count(tld_filter, 'javascript', self.tracker_domains)
            percent_with_js = (total_pages_js / total_pages) * 100

            if self.tracker_threshold:
                filter_depth = self.tracker_threshold
            else:
                filter_depth = 'No Tracker Filter Used'

            stats = self.get_page_3p_stats(tld[0])
            mean = stats[0]
            median = stats[1]
            mode = stats[2]

            output_for_csv.append('"%s","%s","%.2f","%s","%.2f","%s","%.2f","%s","%.2f","%.2f","%s","%s","%s"\n' % (
                tld[0],
                total_pages,
                total_pages_percent,
                total_pages_elements,
                percent_with_elements,
                total_pages_cookies,
                percent_with_cookies,
                total_pages_js,
                percent_with_js,
                mean,
                median,
                mode,
                filter_depth))

        self.write_csv('summary_by_tld.csv', output_for_csv)
    # end get_summary_by_tld
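    # a sample summary_by_tld.csv row (all values illustrative only) looks like:
    # "com","3200","64.00","2880","90.00","2560","80.00","2240","70.00","9.45","8","6","No Tracker Filter Used"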
    #####################
    #   REPORTS, MAIN   #
    #####################

    def get_reports_by_tld(self, type='', sub_type=''):
        print('\t=============================')
        print('\tProcessing Top %s %s %s' % (self.num_results, type, sub_type))
        print('\t=============================')

        # keep output here
        csv_output = []

        # write out the header row for the csv
        if type == 'elements':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country","Element","Extension","Type","Domain","Total Pages","Raw Count","Percent Total"\n')
        elif type == 'domains':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Domain","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
        elif type == 'orgs':
            csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
        else:
            self.fatal('Wrong type specified in get_reports_by_tld, must be "elements", "domains", or "orgs".')

        tld_count = 0
        for tld in self.top_tlds:
            current_tld = tld[0]
            total_pages = tld[1]

            print('\t\tcurrently on: ' + current_tld)

            # filter on the current page tld
            if tld[0] != '*':
                tld_filter = tld[0]
                tld_count += 1
            else:
                tld_filter = ''

            # get results with the specified filter
            results_rows = self.get_results_rows(total_pages, type, sub_type, tld_filter)

            current_row = 0
            # loop over the results
            for result_row in results_rows:
                current_row += 1
                if type == 'elements':
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0],  # org_name
                        result_row[0][1],  # country
                        result_row[0][2],  # element
                        result_row[0][3],  # extension
                        result_row[0][4],  # type
                        result_row[0][5],  # domain
                        total_pages,
                        result_row[1],     # raw_count
                        (result_row[1] / total_pages) * 100)
                elif type == 'domains':
                    total_item = result_row[1]
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0],  # domain_name
                        result_row[0][1],  # org_name
                        result_row[0][2],  # org_country
                        total_pages,
                        total_item,
                        (total_item / total_pages) * 100)
                elif type == 'orgs':
                    total_item = result_row[1]
                    csv_row = '"%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
                        current_tld,
                        tld_count,
                        current_row,
                        result_row[0][0],  # org_name
                        result_row[0][1],  # org_country
                        total_pages,
                        total_item,
                        (total_item / total_pages) * 100)

                csv_output.append(csv_row)

                if current_row >= self.num_results:
                    break

        # name the output csv
        file_name = type + '-by-tld'

        # this really only applies to elements at present
        if sub_type:
            file_name += '-' + sub_type
        file_name += '.csv'

        # store the report
        self.write_csv(file_name, csv_output)
    # end get_reports_by_tld
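    # file naming examples: get_reports_by_tld('orgs') writes orgs-by-tld.csv,
    # while get_reports_by_tld('elements', 'javascript') would write
    # elements-by-tld-javascript.csv (the sub_type value shown is hypothetical
    # and must match what sql_driver.get_elements accepts)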
    def get_results_rows(self, total_pages, type, sub_type='', tld_filter=''):
        # this queries the db to get all elements, domains, or orgs;
        # next they are counted to find the most common,
        # then formatted into csv rows and returned

        # query the db
        if type == 'elements':
            # rows are (page.start_uri, org.name, element.element_uri, element.extension, element.type, element_domain.domain)
            query_results = self.sql_driver.get_elements(tld_filter, sub_type)
        elif type == 'domains':
            # rows are (page.start_uri, element_domain.domain, org.name)
            query_results = self.sql_driver.get_domains(tld_filter)
        elif type == 'orgs':
            # rows are (page.start_uri, org.name)
            query_results = self.sql_driver.get_orgs(tld_filter)
        else:
            self.fatal('Type must be elements, domains, or orgs.')

        # count up the unique elements, domains, or orgs we are looking for
        results_counter = collections.Counter()
        for row in query_results:
            # remove the first item in the tuple as it is the page uri, now
            # irrelevant; add the rest to the results counter as this is what
            # we care about now
            results_counter[row[1:]] += 1

        # python's most_common() arbitrarily re-orders items with the same value,
        # making debugging a nightmare, so we have to double sort here

        # convert to a list we can sort
        results_counter = results_counter.most_common()

        # sort alphabetical
        results_counter.sort()

        # sub-sort on number of occurrences
        results_counter.sort(reverse=True, key=lambda item: item[1])

        return results_counter
    # end get_results_rows

    def get_page_3p_stats(self, tld=''):
        # this function calls get_page_uri_element_domain_pairs to get a list of
        # tuples s.t. each tuple is a unique (page address, domain of an element)
        # pairing; this list of tuples is then iterated so that we count how many
        # domains each page is linked to
        #
        # IMPORTANT: get_page_uri_element_domain_pairs is *already* sorted by
        # page, otherwise the loop below would not work

        if tld == '*':
            tld = ''

        # init vars
        this_page_element_count = 0
        all_page_element_counts = []
        last_page = ''

        # run the query, process the rows
        for row in self.sql_driver.get_page_uri_element_domain_pairs(tld):
            # page has no trackers, count is zero
            if not row[1]:
                all_page_element_counts.append(0)
            # page has trackers, add to the count
            else:
                # this is the same page, increment the count
                if row[0] == last_page:
                    if self.tracker_domains:
                        if row[1] in self.tracker_domains:
                            this_page_element_count += 1
                    else:
                        this_page_element_count += 1
                # this is a new page: store our running count, reset the count
                # to 1 or 0, and update last_page
                else:
                    if last_page != '':
                        all_page_element_counts.append(this_page_element_count)
                    last_page = row[0]

                    if self.tracker_domains:
                        if row[1] in self.tracker_domains:
                            this_page_element_count = 1
                        else:
                            this_page_element_count = 0
                    else:
                        this_page_element_count = 1

        # if we have an outstanding value, enter the last count into the list
        if this_page_element_count != 0:
            all_page_element_counts.append(this_page_element_count)

        # mean and median should always be ok
        mean = statistics.mean(all_page_element_counts)
        median = statistics.median(all_page_element_counts)

        # but mode throws an error when there is no unique mode, so catch that here
        try:
            mode = statistics.mode(all_page_element_counts)
        except statistics.StatisticsError:
            mode = 'NULL'

        return (mean, median, mode)
    # get_page_3p_stats
# end class Reporter
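
# a minimal usage sketch, not part of the original class: it assumes a
# collection database named 'wbxr_example_db' (hypothetical) exists and that
# MySQLDriver can connect to it; num_tlds=0 keeps the default wildcard-only
# analysis, and num_results=100 caps each per-tld report at 100 rows
if __name__ == '__main__':
    reporter = Reporter('wbxr_example_db', num_tlds=0, num_results=100)

    # general db summary, network graph data, and per-tld summary
    reporter.header()
    reporter.get_network_ties()
    reporter.get_summary_by_tld()

    # the three main report types; sub_type is omitted here
    for report_type in ['elements', 'domains', 'orgs']:
        reporter.get_reports_by_tld(report_type)

    reporter.print_runtime()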