Example #1
def analyze(db_name):
    """
    perform analysis, generate reports and store them in ./reports
    may also be called in stand-alone with 'run_webxray.py -a [DB_NAME]'
    """

    from webxray.Analyzer import Analyzer
    
    # set how many tlds you want to produce sub-reports for
    num_tlds = None

    # set reports to only get the top X results, set to None to get everything
    num_results = 100

    # set up a new analyzer; note that db_engine ('sqlite' or 'postgres') is
    # not defined in this function, run_webxray.py sets it at module level
    analyzer = Analyzer(db_engine, db_name, num_tlds, num_results)

    # this is the full suite of reports, comment out those you don't need
    analyzer.generate_db_summary_report()
    analyzer.generate_stats_report()
    analyzer.generate_aggregated_tracking_attribution_report()
    analyzer.generate_use_report()
    analyzer.generate_3p_domain_report()
    analyzer.generate_3p_element_report()
    analyzer.generate_3p_element_report('javascript')
    analyzer.generate_3p_element_report('image')
    analyzer.generate_data_transfer_report()
    analyzer.generate_aggregated_3p_ssl_use_report()
    
    # the following reports may produce very large files, you have been warned
    # analyzer.generate_per_page_data_flow_report()
    analyzer.generate_network_report()
    analyzer.print_runtime()
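
A minimal sketch of how this helper might be invoked, assuming db_engine is
defined at module level as run_webxray.py does (the database name here is
hypothetical):

db_engine = 'sqlite'  # or 'postgres'; normally set globally by run_webxray.py
analyze('wbxr_example_crawl')  # hypothetical results database name
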
Example #2
# imports needed by the class body below; collections provides Counter,
# while Utilities and Analyzer come from the webxray package
import collections

from webxray.Analyzer import Analyzer
from webxray.Utilities import Utilities

class Reporter:
    """
	Manages the production of a number of CSV reports.
	"""
    def __init__(self,
                 db_name,
                 db_engine,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_domain_owners=True,
                 start_date=False,
                 end_date=False):
        """
		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- flushes the existing domain_owner mappings (this can be disabled)
			- if we want to do per-tld reports, figures out the most common
			- if we want to filter against a given tracker threshold, sets it 
				up here (see documentation below for tracker threshold)
		"""

        # set various global vars
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # creates a new directory if it doesn't exist already
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        #	them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                if tld: print('\t\t |- %s' % tld)
        else:
            self.top_tlds = [None]

    # __init__

    #####################
    # REPORT GENERATORS #
    #####################

    def generate_db_summary_report(self, print_to_cli=True):
        """
		outputs and stores report of basic data about how many records in db, etc.
		"""
        print('\t================')
        print('\t General Summary')
        print('\t================')

        # get the relevant db summary data
        db_summary = self.analyzer.get_db_summary()

        # print to cli
        if print_to_cli:
            print("\t\tTotal Crawls:\t\t\t%s" % db_summary['total_crawls_ok'])
            print("\t\tTotal Pages:\t\t\t%s" % db_summary['total_pages_ok'])
            print("\t\tTotal Tasks Fail:\t\t%s" %
                  db_summary['total_tasks_fail'])
            print("\t\tTotal Tasks Attempted:\t\t%s" %
                  db_summary['total_tasks_attempted'])
            print("\t\t%% Pages OK:\t\t\t%.2f%%" %
                  db_summary['percent_tasks_ok'])
            print("\t\tTotal Errors:\t\t\t%s" % db_summary['total_errors'])
            print("\t\tTotal Cookies:\t\t\t%s" % db_summary['total_cookies'])
            print("\t\tTotal 3P Cookies:\t\t%s" %
                  db_summary['total_3p_cookies'])
            print("\t\tTotal Dom Storage:\t\t%s" %
                  db_summary['total_dom_storage'])
            print("\t\tTotal Websockets:\t\t%s" %
                  db_summary['total_websockets'])
            print("\t\tTotal Websocket Events:\t\t%s" %
                  db_summary['total_websocket_events'])
            print("\t\tTotal Requests:\t\t\t%s" % db_summary['total_requests'])
            print("\t\tTotal Responses:\t\t%s" % db_summary['total_responses'])
            print('\t\t%% Requests Received:\t\t%.2f%%' %
                  db_summary['percent_requests_received'])
            print("\t\t3P Requests:\t\t\t%s" % db_summary['total_3p_requests'])
            print("\t\t3P Responses:\t\t\t%s" %
                  db_summary['total_3p_responses'])
            print('\t\t%% 3P Requests Received:\t\t%.2f%%' %
                  db_summary['percent_3p_requests_received'])
            print('\t\t' + '-' * 40)

        # write results to csv
        csv_rows = []
        csv_rows.append(('total_crawls_ok', db_summary['total_crawls_ok']))
        csv_rows.append(('total_pages_ok', db_summary['total_pages_ok']))
        csv_rows.append(('total_tasks_fail', db_summary['total_tasks_fail']))
        csv_rows.append(
            ('total_tasks_attempted', db_summary['total_tasks_attempted']))
        csv_rows.append(('percent_pages_ok', db_summary['percent_tasks_ok']))
        csv_rows.append(('total_errors', db_summary['total_errors']))
        csv_rows.append(('total_cookies', db_summary['total_cookies']))
        csv_rows.append(('total_3p_cookies', db_summary['total_3p_cookies']))
        csv_rows.append(('total_dom_storage', db_summary['total_dom_storage']))
        csv_rows.append(('total_websockets', db_summary['total_websockets']))
        csv_rows.append(
            ('total_websocket_events', db_summary['total_websocket_events']))
        csv_rows.append(('total_requests', db_summary['total_requests']))
        csv_rows.append(('total_responses', db_summary['total_responses']))
        csv_rows.append(('percent_requests_received',
                         db_summary['percent_requests_received']))
        csv_rows.append(('total_3p_requests', db_summary['total_3p_requests']))
        csv_rows.append(
            ('total_3p_responses', db_summary['total_3p_responses']))
        csv_rows.append(('percent_3p_requests_received',
                         db_summary['percent_3p_requests_received']))

        self.utilities.write_csv(self.report_path, 'db_summary.csv', csv_rows)

    # generate_db_summary_report

    def generate_stats_report(self):
        """
		High level stats
		"""
        print('\t=============================')
        print('\t Processing High-Level Stats ')
        print('\t=============================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            if tld_filter:
                stats = self.analyzer.get_high_level_stats(tld_filter)
            else:
                stats = self.analyzer.get_high_level_stats()

            if self.tracker_threshold:
                filter_depth = self.tracker_threshold
            else:
                filter_depth = 'no_filter_used'

            csv_rows.append(('n_pages', stats['total_pages']))
            csv_rows.append(('n_crawls', stats['total_crawls']))
            csv_rows.append(('%_pages_ssl', stats['percent_pages_ssl']))
            csv_rows.append(
                ('n_requests_received', stats['total_requests_received']))
            csv_rows.append(
                ('%_requests_received_ssl', stats['percent_requests_ssl']))
            csv_rows.append(('n_1p_requests_received',
                             stats['total_requests_received_1p']))
            csv_rows.append(('%_1p_requests_received_ssl',
                             stats['percent_1p_requests_ssl']))
            csv_rows.append(('n_3p_requests_received',
                             stats['total_requests_received_3p']))
            csv_rows.append(('%_3p_requests_received_ssl',
                             stats['percent_3p_requests_ssl']))
            csv_rows.append(
                ('average_page_load_time', stats['average_page_load_time']))
            csv_rows.append(('%_w/3p_request', stats['percent_w_3p_request']))
            csv_rows.append(('%_w/3p_cookie', stats['percent_w_3p_cookie']))
            csv_rows.append(('%_w/3p_script', stats['percent_w_3p_script']))
            csv_rows.append(('mean_3p_domains', stats['3p_domains_mean']))
            csv_rows.append(('median_3p_domains', stats['3p_domains_median']))
            csv_rows.append(('mode_3p_domains', stats['3p_domains_mode']))
            csv_rows.append(('mean_3p_cookies', stats['3p_cookies_mean']))
            csv_rows.append(('median_3p_cookies', stats['3p_cookies_median']))
            csv_rows.append(('mode_3p_cookies', stats['3p_cookies_mode']))

            if tld_filter:
                self.utilities.write_csv(self.report_path,
                                         tld_filter + '-stats.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, 'stats.csv',
                                         csv_rows)

    # generate_stats_report

    def generate_aggregated_tracking_attribution_report(self):
        """
		generates ranked list of which entities collect data 
			from the greatest number of crawls ('aggregated_tracking_attribution.csv')

		- entities which have subsidiaries are ranked according 
			to the crawls their subsidiaries get data from as well
		- however, parent entities only get one hit on 
			a crawl which has multiple subsidiaries present
		- for example, if a crawl has 'google analytics' and 'doubleclick' 
			that is only one hit for 'google'
		"""
        print('\t======================================')
        print('\t Processing Aggregated Tracking Report ')
        print('\t======================================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            # write out data to csv
            for item in self.analyzer.get_aggregated_tracking_attribution(
                    tld_filter):
                csv_rows.append(
                    (item['percent_crawls'], item['owner_name'],
                     item['owner_country'],
                     self.utilities.get_domain_owner_lineage_combined_string(
                         item['owner_id'])))

            # Python's sort is stable: sorting by owner name first, then by
            #	percentage, gives percentage-descending order with ties
            #	broken alphabetically by owner
            csv_rows.sort(key=lambda x: x[1].lower())
            csv_rows.sort(key=lambda x: x[0], reverse=True)

            # insert header row after sort
            csv_rows.insert(0, ('percentage_crawls_tracked', 'owner',
                                'owner_country', 'owner_lineage'))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(
                    self.report_path,
                    tld_filter + '-aggregated_tracking_attribution.csv',
                    csv_rows)
            else:
                self.utilities.write_csv(
                    self.report_path, 'aggregated_tracking_attribution.csv',
                    csv_rows)

    # generate_aggregated_tracking_attribution_report

    def generate_aggregated_3p_ssl_use_report(self):
        """
		this report tells us what percentage of requests made to a given
			third party are encrypted
		"""

        print('\t=========================================')
        print('\t Processing Aggregated 3P SSL Use Report ')
        print('\t=========================================')

        for tld_filter in self.top_tlds:
            csv_rows = []
            for item in self.analyzer.get_aggregated_3p_ssl_use(tld_filter):
                csv_rows.append(
                    (item['ssl_use'], item['owner_name'],
                     item['owner_country'],
                     self.utilities.get_domain_owner_lineage_combined_string(
                         item['owner_id'])))

            # Python's sort is stable: sorting by owner name first, then by
            #	percentage, gives percentage-descending order with ties
            #	broken alphabetically by owner
            csv_rows.sort(key=lambda x: x[1].lower())
            csv_rows.sort(key=lambda x: x[0], reverse=True)

            # insert header row after sort
            csv_rows.insert(0, ('percent_requests_encrypted', 'owner',
                                'owner_country', 'owner_lineage'))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(self.report_path,
                                         tld_filter + '-3p_ssl_use.csv',
                                         csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_ssl_use.csv',
                                         csv_rows)

    # generate_aggregated_3p_ssl_use_report

    def generate_3p_domain_report(self):
        """
		This report tells us the most commonly occurring third-party domains.
		"""
        print('\t==============================')
        print('\t Processing 3P Domains Report ')
        print('\t==============================')

        for tld_filter in self.top_tlds:
            csv_rows = []
            csv_rows.append(('percent_total', 'domain', 'owner',
                             'owner_country', 'owner_lineage'))

            # get_3p_domain_percentages returns a list, we slice it to get only desired num_results
            for item in self.analyzer.get_3p_domain_percentages(
                    tld_filter)[:self.num_results]:

                # figure out the lineage string if we know who owns the domain
                if item['owner_id'] is not None:
                    lineage_string = self.utilities.get_domain_owner_lineage_combined_string(
                        item['owner_id'])
                else:
                    lineage_string = None

                csv_rows.append((item['percent_crawls'], item['domain'],
                                 item['owner_name'], item['owner_country'],
                                 lineage_string))

            if tld_filter:
                self.utilities.write_csv(self.report_path,
                                         tld_filter + '-3p_domains.csv',
                                         csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_domains.csv',
                                         csv_rows)

    # generate_3p_domain_report

    def generate_3p_request_report(self, request_type=None):
        """
		queries the db to get all requests, domains, or domain owners;
			these are then counted to find the most common, formatted
			as csv rows, and written out
		"""
        if request_type == 'script':
            print('\t=============================')
            print('\t Processing 3P Script Report ')
            print('\t=============================')
        else:
            print('\t==============================')
            print('\t Processing 3P Request Report ')
            print('\t==============================')

        for tld_filter in self.top_tlds:
            csv_rows = []
            csv_rows.append(('percent_total', 'request', 'type', 'domain',
                             'owner', 'owner_country', 'owner_lineage'))

            # get_3p_request_percentages returns a list, we slice it to get only desired num_results
            for item in self.analyzer.get_3p_request_percentages(
                    tld_filter, request_type)[:self.num_results]:

                # figure out the lineage string if we know who owns the domain
                if item['request_owner_id'] is not None:
                    lineage_string = self.utilities.get_domain_owner_lineage_combined_string(
                        item['request_owner_id'])
                else:
                    lineage_string = None

                csv_rows.append(
                    (item['percent_crawls'], item['request_url'],
                     item['request_type'], item['request_domain'],
                     item['request_owner_name'], item['request_owner_country'],
                     lineage_string))

            if tld_filter:
                if request_type:
                    self.utilities.write_csv(
                        self.report_path,
                        tld_filter + '-3p_' + request_type + '.csv', csv_rows)
                else:
                    self.utilities.write_csv(self.report_path,
                                             tld_filter + '-3p_request.csv',
                                             csv_rows)
            else:
                if request_type:
                    self.utilities.write_csv(self.report_path,
                                             '3p_' + request_type + '.csv',
                                             csv_rows)
                else:
                    self.utilities.write_csv(self.report_path,
                                             '3p_request.csv', csv_rows)

    # generate_3p_request_report

    def generate_data_transfer_report(self):
        """
		These reports tell us how much data was transferred across several dimensions
		"""

        print('\t==================================')
        print('\t Processing Data Transfer Reports ')
        print('\t==================================')

        for tld_filter in self.top_tlds:
            # set up filter and file names
            if tld_filter:
                summary_file_name = tld_filter + '-data_xfer_summary.csv'
                domain_file_name = tld_filter + '-data_xfer_by_domain.csv'
                aggregated_file_name = tld_filter + '-data_xfer_aggregated.csv'
            else:
                summary_file_name = 'data_xfer_summary.csv'
                domain_file_name = 'data_xfer_by_domain.csv'
                aggregated_file_name = 'data_xfer_aggregated.csv'

            # get the data from db, tuple of (response_domain, size, is_3p (boolean), domain_owner_id)
            response_sizes = self.sql_driver.get_response_sizes()

            # initialize vars
            first_party_data = 0
            third_party_data = 0
            total_data = 0

            # need Counter object, allows sorting later
            domain_data = collections.Counter()
            owner_data = collections.Counter()

            # process each row
            for item in response_sizes:

                response_domain = item[0]
                response_size = item[1]
                response_is_3p = item[2]
                domain_owner_id = item[3]

                # this is the measure of all data downloaded
                total_data += response_size

                # measures for third and first party data
                if response_is_3p:
                    third_party_data += response_size
                else:
                    first_party_data += response_size

                # data by domain; Counter values default to zero, so we
                #	can add directly
                domain_data[response_domain] += response_size

                # only if we know the owner, increment across its lineage
                if domain_owner_id:
                    for lineage_id in self.utilities.get_domain_owner_lineage_ids(
                            domain_owner_id):
                        owner_data[lineage_id] += response_size

            # avoid divide-by-zero
            if total_data == 0:
                print('\t\tTotal data is zero, no report')
                return

            # output data to csv
            summary_data_csv = []
            summary_data_csv.append(
                ('party', 'percent_total', 'data_transfered_bytes'))
            summary_data_csv.append(('all', '100', total_data))
            summary_data_csv.append(
                ('First',
                 round((first_party_data / total_data) * 100,
                       self.num_decimals), first_party_data))
            summary_data_csv.append(
                ('Third',
                 round((third_party_data / total_data) * 100,
                       self.num_decimals), third_party_data))

            self.utilities.write_csv(self.report_path, summary_file_name,
                                     summary_data_csv)

            # sort and output ranked data
            domain_data = domain_data.most_common()
            domain_data.sort()
            domain_data.sort(reverse=True, key=lambda item: item[1])

            # for csv data
            domain_data_csv = []
            domain_data_csv.append(
                ('percent_total', 'domain', 'data_transfered_bytes'))

            # if num_results is None we get everything, otherwise stops at limit
            for item in domain_data[:self.num_results]:
                domain_data_csv.append(
                    (round((item[1] / total_data) * 100,
                           self.num_decimals), item[0], item[1]))
            self.utilities.write_csv(self.report_path, domain_file_name,
                                     domain_data_csv)

            owner_data = self.utilities.get_most_common_sorted(owner_data)
            owner_data_csv = []
            owner_data_csv.append(('percent_total', 'owner', 'owner_country',
                                   'owner_lineage', 'data_transfered_bytes'))
            # get results for all known owners
            for item in owner_data:
                owner_data_csv.append(
                    (round((item[1] / total_data) * 100, self.num_decimals),
                     self.domain_owners[item[0]]['owner_name'],
                     self.domain_owners[item[0]]['country'],
                     self.utilities.get_domain_owner_lineage_combined_string(
                         item[0]), item[1]))
            self.utilities.write_csv(self.report_path, aggregated_file_name,
                                     owner_data_csv)

    # generate_data_transfer_report

    def generate_use_report(self):
        """
		This function handles the process of generating a csv report which details
			what percentage of pages use third-party content for specific uses,
			the number of requests made for a given type of use on a per-page basis,
			and the percentage of such requests which correspond to a third-party
			cookie.
		"""

        print('\t==========================')
        print('\t Processing 3P Use Report ')
        print('\t==========================')

        for tld_filter in self.top_tlds:
            use_data = self.analyzer.get_3p_use_data(tld_filter)
            all_uses = use_data['all_uses']
            percentage_by_use = use_data['percentage_by_use']
            average_use_occurance_per_page = use_data[
                'average_use_occurance_per_crawl']
            percentage_use_w_cookie = use_data['percentage_use_w_cookie']
            percentage_use_ssl = use_data['percentage_use_ssl']

            csv_rows = []
            csv_rows.append(
                ('use_category', 'percent_crawls_w_use',
                 'ave_occurances_per_page', 'percentage_of_use_w_cookie',
                 'percentage_of_use_ssl'))
            for use in sorted(all_uses):
                if percentage_by_use[use] is not None:
                    csv_rows.append((use, percentage_by_use[use],
                                     average_use_occurance_per_page[use],
                                     percentage_use_w_cookie[use],
                                     percentage_use_ssl[use]))
                else:
                    csv_rows.append((use, None, None, None, None))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(self.report_path,
                                         tld_filter + '-3p_uses.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, '3p_uses.csv',
                                         csv_rows)

    # generate_use_report

    def generate_per_page_network_report(self):
        """
		this report generates data necessary for graph/network analysis by
			outputting a list of page domains and the requests/owners they connect to
			on a per-page basis
		"""

        print('\t====================================')
        print('\t Processing Per-Page Network Report ')
        print('\t====================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('page_start_url', 'page_final_url', 'page_accessed',
                         '3p_request_domain', '3p_domain_owner',
                         '3p_domain_owner_country'))

        # process all records
        for item in self.analyzer.get_page_to_3p_network():
            csv_rows.append(
                (item['page_start_url'], item['page_final_url'],
                 item['page_accessed'], item['request_domain'],
                 item['request_owner_name'], item['request_owner_country']))

        self.utilities.write_csv(self.report_path,
                                 'per_page_network_report.csv', csv_rows)

    # generate_per_page_network_report

    def generate_per_site_network_report(self):
        """
		this report generates data necessary for graph/network analysis by
			outputting a list of page domains and the requests/owners they connect to
			aggregated on a per-site basis (eg combining all pages)
		"""

        print('\t================================')
        print('\t Processing Site Network Report ')
        print('\t================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('page_domain', '3p_request_domain', '3p_domain_owner',
                         '3p_domain_owner_country'))

        for item in self.analyzer.get_site_to_3p_network():
            csv_rows.append(
                (item['page_domain'], item['request_domain'],
                 item['request_owner_name'], item['request_owner_country']))

        self.utilities.write_csv(self.report_path,
                                 'per_site_network_report.csv', csv_rows)

    # generate_per_site_network_report

    def generate_all_pages_request_dump(self):
        """
		Full dump of all requests loaded by all pages across all load times.
			Default is 3p only, can be overridden.
		"""

        print('\t===================================')
        print('\t Processing All Pages Request Dump ')
        print('\t===================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('accessed', 'start_url', 'final_url', 'request_url',
                         'request_domain', 'domain_owner'))

        # process all records
        for item in self.analyzer.get_all_pages_requests():
            csv_rows.append(
                (item['accessed'], item['start_url'], item['final_url'],
                 item['request_url'], item['request_domain'],
                 item['request_domain_owner']))

        self.utilities.write_csv(self.report_path,
                                 'all_pages_request_dump.csv', csv_rows)

    # generate_all_pages_request_dump

    def generate_all_pages_cookie_dump(self):
        """
		Full dump of all cookies loaded by all pages across all load times.
			Default is 1p and 3p, can be overridden to 3p only.
		"""

        print('\t==================================')
        print('\t Processing All Pages Cookie Dump ')
        print('\t==================================')

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('accessed', 'start_url', 'final_url', 'cookie_domain',
                         'cookie_owner', 'cookie_name', 'cookie_value'))

        # process all records
        for item in self.analyzer.get_all_pages_cookies():
            csv_rows.append(
                (item['accessed'], item['start_url'], item['final_url'],
                 item['cookie_domain'], item['cookie_owner'],
                 item['cookie_name'], item['cookie_value']))

        self.utilities.write_csv(self.report_path, 'all_pages_cookie_dump.csv',
                                 csv_rows)

    # generate_all_pages_cookie_dump

    def generate_site_host_report(self):
        """
		First, we update the domain table with the owners
			of the various ip addresses which gives us
			a mapping of pages to hosts.

		Second, we generate a network report for
			site domains to hosts.

		"""
        print('\t=====================')
        print('\t Updating Site Hosts ')
        print('\t=====================')

        self.analyzer.update_site_hosts()

        print('\t==============================')
        print('\t Generating Site Host Network ')
        print('\t==============================')

        site_host_data = self.analyzer.get_site_host_network()

        if len(site_host_data) == 0:
            print('\t\tNo site host data, skipping report.')
            return

        # put output here
        csv_rows = []

        # header row for csv
        csv_rows.append(('page_domain', 'host_name'))

        for item in site_host_data:
            csv_rows.append((item['site_domain'], item['host_name']))

        self.utilities.write_csv(self.report_path, 'site_hosts-network.csv',
                                 csv_rows)

        print('\t============================================')
        print('\t Generating Aggregate Host Ownership Report ')
        print('\t============================================')

        owner_occurances = []
        # rows come back as single-element tuples, hence the trailing comma
        for owner, in self.sql_driver.get_ip_owners():
            owner_occurances.append(owner)

        csv_rows = [('owner', 'percent_sites_w_owner')]
        for item in self.utilities.get_most_common_sorted(owner_occurances):
            csv_rows.append((item[0], 100 * (item[1] / len(owner_occurances))))

        self.utilities.write_csv(self.report_path, 'site_hosts-aggregated.csv',
                                 csv_rows)

    # generate_site_host_report

    ##############
    # POLICYXRAY #
    ##############

    def initialize_policy_reports(self):
        """
		Run various pre-production steps.
		"""

        print('\t====================================')
        print('\t Updating 3p Domain Disclosure Data ')
        print('\t====================================')

        #self.analyzer.update_request_disclosure()
        self.analyzer.update_crawl_disclosure()

        print('\t\t...done!')

        print('\t======================================')
        print('\t Getting Policy Types List and Counts ')
        print('\t======================================')

        # pre-populate with 'None' which gives all policies
        self.policy_types = [{
            'type': None,
            'count': self.analyzer.get_policy_count()
        }]

        for policy_type, in self.sql_driver.get_available_policy_types():
            self.policy_types.append({
                'type': policy_type,
                'count': self.analyzer.get_policy_count(policy_type=policy_type)
            })

        print('\t\t...done!')

    # initialize_policy_reports

    def generate_policy_summary_report(self):
        """
		Conducts preliminary analysis steps, determines what types of
			policies we have, and then initiates the pertinent reports.
		"""
        print('\t==================================')
        print('\t Generating Policy Summary Report ')
        print('\t==================================')

        # header row
        csv_rows = [('Type', 'N', 'Word Count', 'FK Grade', 'FRE',
                     '% 3P Disclosed')]

        # get results for each policy_type
        for policy_type in self.policy_types:
            # makes reports clearer than 'None'
            if policy_type['type'] is None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type,
                  end='',
                  flush=True)

            # fetch results
            readability_scores = self.analyzer.get_readability_scores(
                policy_type=policy_type['type'])

            csv_rows.append(
                (this_policy_type, policy_type['count'],
                 self.analyzer.get_average_policy_word_count(
                     policy_type=policy_type['type']),
                 readability_scores['ave_fkg'], readability_scores['ave_fre'],
                 self.analyzer.get_percent_crawl_3p_domains_disclosed(
                     policy_type=policy_type['type'])))
            print('done!')

        self.utilities.write_csv(self.report_path, 'policy-summary.csv',
                                 csv_rows)

    # generate_policy_summary_report

    def generate_policy_owner_disclosure_reports(self):
        """
		Generates a report of how often requests to each domain owner
			are disclosed in the corresponding policies.
		"""

        print('\t======================================')
        print('\t Generating Company Disclosure Report ')
        print('\t======================================')

        print('\t\tProcessing ...', end='', flush=True)

        company_results = self.analyzer.get_disclosure_by_request_owner()

        # header row
        csv_rows = [('Domain Owner', 'Total Occurrences', 'Total Disclosures',
                     'Percent Disclosed')]
        for item in company_results:
            csv_rows.append(
                (item, company_results[item][0], company_results[item][1],
                 round(company_results[item][2], 2)))

        print('done!')
        self.utilities.write_csv(self.report_path,
                                 'policy-owner_disclosure.csv', csv_rows)

    # generate_policy_owner_disclosure_reports

    def generate_policy_gdpr_report(self):
        """
		Determine percentage of all policy types
			that contain gdpr article 9 terms.
		"""

        print('\t==============================')
        print('\t Generating GDPR Term Report ')
        print('\t==============================')

        term_list = [
            'racial or ethnic origin', 'political opinions',
            'religious or philosophical beliefs', 'trade union membership',
            'genetic data', 'biometric data', 'data concerning health',
            'sex life', 'sexual orientation'
        ]

        self.generate_terms_report('policy-gdpr_terms.csv', term_list)

    # generate_policy_gdpr_report

    def generate_policy_pacification_report(self):
        """
		Determine percentage of all policy types
			that contain pacification terms.
		"""

        print('\t=====================================')
        print('\t Generating Pacification Term Report ')
        print('\t=====================================')

        term_list = [
            'we value', 'we respect', 'important to us', 'help you', 'we care',
            'committed to protecting', 'cares about', 'transparency'
        ]

        self.generate_terms_report('policy-pacification_terms.csv', term_list)

    # generate_policy_pacification_report

    def generate_policy_pii_report(self):
        """
		Determine percentage of all policy types
			that contain PII terms.
		"""

        print('\t============================')
        print('\t Generating PII Term Report ')
        print('\t============================')

        term_list = [
            'ip address', 'internet protocol address', 'browser type',
            'operating system'
        ]

        self.generate_terms_report('policy-pii_terms.csv', term_list)

    # generate_policy_pii_report

    def generate_terms_report(self, report_name, term_list):
        """
		Generic function to generate reports on how
			often terms appear in policies.
		"""

        # set up header row
        csv_rows = []
        header_row = ('Type', 'any term')
        for term in term_list:
            header_row = header_row + (term, )

        csv_rows.append(header_row)

        # get results for each policy_type
        for policy_type in self.policy_types:
            # makes reports clearer than 'None'
            if policy_type['type'] is None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type,
                  end='',
                  flush=True)

            this_csv_row = (this_policy_type, )
            this_csv_row = this_csv_row + (self.analyzer.get_terms_percentage(
                term_list,
                policy_type=policy_type['type'],
                policy_type_count=policy_type['count']), )
            for term in term_list:
                this_csv_row = this_csv_row + (
                    self.analyzer.get_terms_percentage(
                        [term],
                        policy_type=policy_type['type'],
                        policy_type_count=policy_type['count']), )
            csv_rows.append(this_csv_row)
            print('done!')

        self.utilities.write_csv(self.report_path, report_name, csv_rows)
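
A minimal sketch of how Reporter might be driven directly, assuming the module
path webxray.Reporter and a results database already populated by a crawl (the
database name is hypothetical):

from webxray.Reporter import Reporter

# unfiltered reports for a sqlite results database, top 100 results each
reporter = Reporter('wbxr_example_crawl', 'sqlite', num_tlds=None,
                    num_results=100)
reporter.generate_db_summary_report()
reporter.generate_stats_report()
reporter.generate_3p_domain_report()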