def analyze(db_name):
    """
    Run the analysis suite against ``db_name``; generated reports are
    stored in ./reports.

    May also be called in stand-alone with 'run_webxray.py -a [DB_NAME]'.
    """
    from webxray.Analyzer import Analyzer

    # how many tlds you want to produce sub-reports for
    tld_count = None

    # only get the top X results per report, set to None to get everything
    result_limit = 100

    # NOTE(review): db_engine is not defined inside this function, so it is
    # presumably a module-level setting -- confirm it is set before this is
    # called, and that (engine, name, ...) matches Analyzer's signature
    reporter = Analyzer(db_engine, db_name, tld_count, result_limit)

    # the full suite of reports as (method name, positional args), run in
    # the order listed; comment out the entries you don't need
    report_suite = (
        ('generate_db_summary_report', ()),
        ('generate_stats_report', ()),
        ('generate_aggregated_tracking_attribution_report', ()),
        ('generate_use_report', ()),
        ('generate_3p_domain_report', ()),
        ('generate_3p_element_report', ()),
        ('generate_3p_element_report', ('javascript',)),
        ('generate_3p_element_report', ('image',)),
        ('generate_data_transfer_report', ()),
        ('generate_aggregated_3p_ssl_use_report', ()),
        # the following reports may produce very large files, you have
        # been warned
        # ('generate_per_page_data_flow_report', ()),
        ('generate_network_report', ()),
    )
    for method_name, method_args in report_suite:
        getattr(reporter, method_name)(*method_args)

    reporter.print_runtime()
def __init__(self, db_name, db_engine, num_tlds, num_results,
             tracker_threshold=None, flush_domain_owners=True,
             start_date=False, end_date=False):
    """
    This performs a few start-up tasks:
        - stores the report settings on the instance
        - creates the Utilities and Analyzer helpers for this database
        - opens the database driver that matches db_engine
        - makes sure we have a directory to store the reports
        - loads the domain owner mappings
        - if we want to do per-tld reports, figures out the most common tlds

    Args:
        db_name: name of the database; handed to the helpers, the database
            driver and the report directory setup.
        db_engine: either 'sqlite' or 'postgres'.
        num_tlds: how many of the top tlds to look up for sub-reports; any
            falsy value skips the lookup and leaves self.top_tlds as [None].
        num_results: stored as self.num_results, not otherwise used here.
        tracker_threshold: stored as self.tracker_threshold, not otherwise
            used here.
        flush_domain_owners: accepted but not read by this method.
        start_date: accepted but not read by this method.
        end_date: accepted but not read by this method.

    Raises:
        SystemExit: with exit status 1 when db_engine is neither 'sqlite'
            nor 'postgres'.
    """

    # set various instance-wide vars
    self.db_name = db_name
    self.num_tlds = num_tlds
    self.num_results = num_results
    self.tracker_threshold = tracker_threshold

    # pass utilities the database info
    self.utilities = Utilities(db_name, db_engine)

    # set up the analyzer we will be using throughout
    self.analyzer = Analyzer(db_name, db_engine)

    # number of decimal places to round to in reports
    self.num_decimals = 2

    # set up the instance-wide db connection; drivers are imported lazily
    # so only the one for the selected engine has to be importable
    if db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        self.sql_driver = SQLiteDriver(db_name)
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        self.sql_driver = PostgreSQLDriver(db_name)
    else:
        print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
        # quit() is an interactive helper installed by the site module and
        # exits with status 0; raise SystemExit directly so this also works
        # without site and reports the failure to the calling process
        raise SystemExit(1)

    print('\t=============================')
    print('\t Checking Output Directories ')
    print('\t=============================')

    # path the reports get written to, as provided by the utilities helper
    self.report_path = self.utilities.setup_report_dir(self.db_name)

    # this is used in various places to get owner information
    self.domain_owners = self.utilities.get_domain_owner_dict()

    # if we want to get sub-reports for the most frequent tlds we find
    # them here
    if self.num_tlds:
        print('\t=====================')
        print('\t Getting top %s tlds' % self.num_tlds)
        print('\t=====================')
        print('\t\tProcessing...', end='', flush=True)
        self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
        print('done!')
        print('\t\tThe top tlds are:')
        for tld in self.top_tlds:
            # falsy entries are kept in the list but not printed
            if tld:
                print('\t\t |- %s' % tld)
    else:
        # a single None entry stands in when no per-tld lookup was requested
        self.top_tlds = [None]