Exemple #1
0
def analyze(db_name):
    """
    Run the report suite for the database named db_name.

    Reports are stored in ./reports. This may also be invoked stand-alone
    with 'run_webxray.py -a [DB_NAME]'.

    An Analyzer is built from db_engine (not defined in this function, so it
    is looked up outside it), db_name and the two tuning values set below.
    Each report generator is then called in the listed order, followed by
    print_runtime().
    """

    from webxray.Analyzer import Analyzer

    # cap each report at the top X results, None means no cap
    num_results = 100

    # number of tlds that should get their own sub-reports
    num_tlds = None

    report_runner = Analyzer(db_engine, db_name, num_tlds, num_results)

    # the full suite as (method name, positional args) pairs, executed top to
    # bottom; comment out the entries you don't need
    report_suite = (
        ('generate_db_summary_report', ()),
        ('generate_stats_report', ()),
        ('generate_aggregated_tracking_attribution_report', ()),
        ('generate_use_report', ()),
        ('generate_3p_domain_report', ()),
        ('generate_3p_element_report', ()),
        ('generate_3p_element_report', ('javascript',)),
        ('generate_3p_element_report', ('image',)),
        ('generate_data_transfer_report', ()),
        ('generate_aggregated_3p_ssl_use_report', ()),
        # the following reports may produce very large files, you have been warned
        # ('generate_per_page_data_flow_report', ()),
        ('generate_network_report', ()),
    )

    for method_name, positional_args in report_suite:
        # look the method up only when its turn comes, as a direct call would
        getattr(report_runner, method_name)(*positional_args)

    report_runner.print_runtime()
Exemple #2
0
    def __init__(self,
                 db_name,
                 db_engine,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_domain_owners=True,
                 start_date=False,
                 end_date=False):
        """
        Perform start-up tasks for report generation.

        In the code visible here this:
            - stores db_name, num_tlds, num_results and tracker_threshold
              on the instance
            - creates the Utilities and Analyzer helpers from db_name and
              db_engine
            - opens a SQL driver for db_engine ('sqlite' or 'postgres');
              any other value prints an error and quits
            - asks Utilities for the report directory and the domain owner
              dictionary
            - if num_tlds is truthy, fetches and prints the top tlds;
              otherwise sets top_tlds to [None]

        NOTE(review): the earlier docstring also listed flushing the existing
        domain_owner mappings (can be disabled) and setting up filtering
        against a tracker threshold. Neither step, nor any use of
        flush_domain_owners, start_date or end_date, appears in the lines
        visible here - confirm against the rest of the method.
        """

        # keep the basic settings on the instance
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up the db connection; the driver module is imported only for
        # the engine that was actually requested
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            # unknown engine: report it and stop via the quit() builtin
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # report directory for this database, as returned by Utilities
        # (the original comment says it is created if it doesn't exist already)
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        # them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            # no newline and an explicit flush, so 'done!' lands on this line
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                # falsy entries are not printed
                if tld: print('\t\t |- %s' % tld)
        else:
            # no per-tld sub-reports requested: a single None entry
            self.top_tlds = [None]