Example #1
class Utilities:
    def __init__(self, db_name=None, db_engine=None):
        # if we have db params, set up the global db connection; otherwise we don't bother
        if db_name:
            if db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                self.sql_driver = SQLiteDriver(db_name)
            elif db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                self.sql_driver = PostgreSQLDriver(db_name)
            else:
                print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' %
                      db_engine)
                quit()
        elif db_engine:
            if db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                self.sql_driver = SQLiteDriver()
            elif db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                self.sql_driver = PostgreSQLDriver()
            else:
                print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' %
                      db_engine)
                quit()

        self.url_parser = ParseURL()

    # __init__

    def check_dependencies(self):

        import sys
        if sys.version_info < (3, 4):
            print(
                '******************************************************************************'
            )
            print(
                ' Python 3.4 or above is required for webXray; please check your installation. '
            )
            print(
                '******************************************************************************'
            )
            quit()

        try:
            from websocket import create_connection
        except ImportError:
            print('*******************************************************')
            print(' The websocket-client library is needed for webXray.   ')
            print(' Please try running "pip3 install -r requirements.txt" ')
            print('*******************************************************')
            quit()

        try:
            from textstat.textstat import textstat
        except ImportError:
            print('*******************************************************')
            print(' The textstat library is needed for webXray.           ')
            print(' Please try running "pip3 install -r requirements.txt" ')
            print('*******************************************************')
            quit()

        try:
            import lxml.html
        except ImportError:
            print('*******************************************************')
            print(' The lxml library is needed for webXray.               ')
            print(' Please try running "pip3 install -r requirements.txt" ')
            print('*******************************************************')
            quit()

    # check_dependencies

    def get_default_config(self, config_type):
        # the following are two pre-configured options for
        #	haystack and forensic scans, can be tweaked as desired
        if config_type == 'haystack':
            return {
                'client_browser_type': 'chrome',
                'client_prewait': 10,
                'client_no_event_wait': 20,
                'client_max_wait': 60,
                'client_get_bodies': False,
                'client_get_bodies_b64': False,
                'client_get_screen_shot': False,
                'client_get_text': False,
                'client_crawl_depth': 3,
                'client_crawl_retries': 5,
                'client_page_load_strategy': 'none',
                'client_reject_redirects': False,
                'client_min_internal_links': 5,
                'max_attempts': 5,
                'store_1p': True,
                'store_base64': False,
                'store_files': True,
                'store_screen_shot': False,
                'store_source': False,
                'store_page_text': False,
                'store_links': True,
                'store_dom_storage': True,
                'store_responses': True,
                'store_request_xtra_headers': True,
                'store_response_xtra_headers': True,
                'store_requests': True,
                'store_websockets': True,
                'store_websocket_events': True,
                'store_event_source_msgs': True,
                'store_cookies': True,
                'store_security_details': True,
                'timeseries_enabled': True,
                'timeseries_interval': 0
            }
        elif config_type == 'forensic':
            return {
                'client_browser_type': 'chrome',
                'client_prewait': 10,
                'client_no_event_wait': 20,
                'client_max_wait': 60,
                'client_get_bodies': True,
                'client_get_bodies_b64': True,
                'client_get_screen_shot': True,
                'client_get_text': True,
                'client_crawl_depth': 3,
                'client_crawl_retries': 5,
                'client_page_load_strategy': 'none',
                'client_reject_redirects': True,
                'client_min_internal_links': 5,
                'max_attempts': 5,
                'store_1p': True,
                'store_base64': True,
                'store_files': True,
                'store_screen_shot': True,
                'store_source': True,
                'store_page_text': True,
                'store_links': True,
                'store_dom_storage': True,
                'store_responses': True,
                'store_request_xtra_headers': True,
                'store_response_xtra_headers': True,
                'store_requests': True,
                'store_websockets': True,
                'store_websocket_events': True,
                'store_event_source_msgs': True,
                'store_cookies': True,
                'store_security_details': True,
                'timeseries_enabled': True,
                'timeseries_interval': 0
            }
        elif config_type == 'custom':
            print('Create a custom config in Utilities.py')
            quit()
        else:
            print('Invalid config option, see Utilities.py')
            quit()

    # get_default_config
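    # Illustrative sketch (hypothetical, not in the original source): a
    #   'custom' profile can be built by copying a default and overriding
    #   fields, e.g.:
    #
    #       config = Utilities().get_default_config('haystack')
    #       config['client_get_screen_shot'] = True
    #       config['store_screen_shot']      = True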

    def select_wbxr_db(self):
        """
		databases are stored with a prefix (default 'wbxr_'), this function helps select a database in interactive mode
		"""

        # you can optionally specify a different prefix here by setting "db_prefix = '[PREFIX]'"
        wbxr_dbs = self.sql_driver.get_wbxr_dbs_list()
        wbxr_dbs.sort()

        if len(wbxr_dbs) == 0:
            print(
                '''\t\tThere are no databases to analyze, please try [C]ollecting data or 
					import an existing wbxr-formatted database manually.''')
            interaction()
            return

        for index, db_name in enumerate(wbxr_dbs):
            print('\t\t[%s] %s' % (index, db_name))

        max_index = len(wbxr_dbs) - 1

        # interaction step: loop until we get acceptable input
        while True:
            selected_db_index = input("\n\tPlease select database by number: ")
            if selected_db_index.isdigit():
                selected_db_index = int(selected_db_index)
                if selected_db_index >= 0 and selected_db_index <= max_index:
                    break
                else:
                    print(
                        '\t\t You entered an invalid string, please select a number in the range 0-%s.'
                        % max_index)
                    continue
            else:
                print(
                    '\t\t You entered an invalid string, please select a number in the range 0-%s.'
                    % max_index)
                continue

        db_name = wbxr_dbs[selected_db_index]
        return db_name

    # select_wbxr_db

    def stream_rate(self, type='scan', return_json=False, client_id=None):
        """
		This function is a generator which determines the rate
			at which pages are being added to the db
			allowing us to evaluate our rate of progress.
		"""

        # initialize dictionary to store rate data
        client_rate_data = {}

        # this dictionary will hold all the rates for each client so we can
        #	easily figure out the average rate
        all_rates = {}

        # the None key stores the aggregate data for all clients
        client_rate_data[None] = {}
        all_rates[None] = []

        # add entries for each client
        for client_id, in self.sql_driver.get_client_list():
            client_rate_data[client_id] = {}
            all_rates[client_id] = []

        # for client_id in ['wbxr0','wbxr1','wbxr2','wbxr3','wbxr4','wbxr5']:
        # 	client_rate_data[client_id] = {}
        # 	all_rates[client_id] = []

        crawl_depth = self.sql_driver.get_config()['client_crawl_depth']

        # set time window we want to look at to see how many
        #	pages have been recently added

        # set the time gap between updates, leaving it too short
        #	means lots of db calls
        if type == 'scan' or type == 'policy':
            wait_seconds = 10
            interval_seconds = 600
        elif type == 'task':
            wait_seconds = 30
            interval_seconds = 30

        # keep track of how long we've been doing this
        elapsed_seconds = 0

        # for tasks
        if type == 'task':
            old_task_count = self.sql_driver.get_pending_task_count()

        # this runs forever, no terminating condition
        while True:
            # simple increment, note we divide by 60 before we return
            #	for minutes conversion
            elapsed_seconds += wait_seconds

            remaining_tasks = self.sql_driver.get_task_queue_length()

            total_count = 0

            for client_id, count in self.sql_driver.get_recent_page_count_by_client_id(
                    interval_seconds):
                total_count += count

                # to get rate/hour we take the number of pages we've added per
                #	second *3600
                current_rate = (count / interval_seconds) * 3600

                # this list is all the rates we've seen
                all_rates[client_id] = all_rates[client_id] + [current_rate]

                # nice built-in to get the average rate
                average_rate = statistics.mean(all_rates[client_id])

                # figure out how much longer to go, gracefully handle
                #	a rate of zero
                if average_rate != 0:
                    remaining_hours = remaining_tasks / average_rate
                else:
                    remaining_hours = 0

                # dictionary of the data to return
                client_rate_data[client_id] = {
                    'elapsed_minutes': round(elapsed_seconds / 60, 2),
                    'current_rate': round(current_rate, 2),
                    'average_rate': round(average_rate, 2),
                    'remaining_tasks': remaining_tasks,
                    'remaining_hours': round(remaining_hours * crawl_depth, 2)
                }

            # for overall measure
            total_current_rate = (total_count / interval_seconds) * 3600
            all_rates[None] += [total_current_rate]
            total_average_rate = statistics.mean(all_rates[None])

            # figure out how much longer to go, gracefully handle
            #	a rate of zero
            if total_average_rate != 0:
                remaining_hours = round(
                    (remaining_tasks / total_average_rate) * crawl_depth, 2)
            else:
                remaining_hours = 0

            # convert to days if we are over 24 hours
            if remaining_hours > 24:
                remaining_time = f'{round(remaining_hours/24,2)} days'
            else:
                remaining_time = f'{remaining_hours} hours'

            client_rate_data[None] = {
                'elapsed_minutes': round(elapsed_seconds / 60, 2),
                'current_rate': round(total_current_rate, 2),
                'average_rate': round(total_average_rate, 2),
                'remaining_tasks': remaining_tasks,
                'remaining_hours': remaining_time
            }

            # if we are called by the flask admin_console it is
            #	easiest to do json formatting here, otherwise
            #	we don't.
            if return_json:
                yield f"data:{json.dumps(client_rate_data)}\n\n"
            else:
                yield client_rate_data

            # wait until we send a new update
            time.sleep(wait_seconds)

    # stream_rate
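    # Illustrative sketch (not in the original source; assumes 'utilities' was
    #   constructed with db_name/db_engine so sql_driver exists): since
    #   stream_rate() is a generator, a caller can consume one update at a
    #   time; the aggregate entry is keyed by None, e.g.:
    #
    #       for rate_data in utilities.stream_rate(type='scan'):
    #           print(rate_data[None]['average_rate'], 'pages/hour overall')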

    def setup_report_dir(self, db_name):
        """
		Create directory for where the reports go if it does not exist,
			returns the path.
		"""
        if os.path.exists('./reports') == False:
            print('\t\tMaking global reports directory at ./reports.')
            os.makedirs('./reports')

        # set global report_path
        report_path = './reports/' + db_name

        # set up subdir for this analysis
        if os.path.exists(report_path) == False:
            print('\t\tMaking subdirectory for reports at %s' % report_path)
            os.makedirs(report_path)

        print('\t\tStoring output in %s' % report_path)
        return report_path

    # setup_report_dir

    def write_csv(self, report_path, file_name, csv_rows, num_decimals=2):
        """
		basic utility function to write list of csv rows to a file
		"""
        full_file_path = report_path + '/' + file_name
        with open(full_file_path, 'w', newline='',
                  encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile,
                                    delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_ALL)
            for row in csv_rows:
                rounded_row = []
                for item in row:
                    # round floats and decimals
                    if isinstance(item, float) or isinstance(
                            item, decimal.Decimal):
                        rounded_row.append(round(item, num_decimals))
                    else:
                        rounded_row.append(item)

                csv_writer.writerow(rounded_row)
        print('\t\tOutput written to %s' % full_file_path)

    # write_csv
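    # Illustrative sketch (not in the original source; 'wbxr_demo' is a
    #   hypothetical database name):
    #
    #       report_path = utilities.setup_report_dir('wbxr_demo')
    #       utilities.write_csv(report_path, 'domains.csv',
    #                           [('domain', 'percent'), ('example.com', 12.3456)])
    #
    #   With num_decimals=2 the float above is written out as 12.35.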

    def print_runtime(self, action_name, start_time):
        """
		Just for CLI info
		"""
        print('-' * 40)
        print('\t%s finished in %s' %
              (action_name, str(datetime.now() - start_time)))
        print('-' * 40)

    # print_runtime

    def get_absolute_url_from_page_link(self, page_url, link_url):
        """
		Given a page_url and a link_url from that page we determine
			the absolute url of the link from the page_url.
		"""

        # ex nihilo nihil fit
        if link_url == None: return None
        if len(link_url) == 0: return None

        # we use the info from the original url for converting
        #	relative links to absolute
        parsed_page_url = urlparse(page_url)

        # this is an absolute url already, nothing further to do to
        if re.match('^https?://', link_url):
            return (link_url)
        # link with no scheme, paste it in
        elif re.match('^//', link_url):
            return (parsed_page_url.scheme + ':' + link_url)
        # relative link, fix it up
        else:
            if link_url[0] != '/':
                return (parsed_page_url.scheme + '://' +
                        parsed_page_url.netloc + '/' + link_url)
            else:
                return (parsed_page_url.scheme + '://' +
                        parsed_page_url.netloc + link_url)

        # this only happens if something breaks
        return None

    # get_absolute_url_from_link

    def get_most_common_sorted(self, list_in):
        """
		takes a list, finds the most common items
		and then resorts alpha (b/c python's Counter will arbitrarily 
		order items with same count), then sorts again for most-common

		assumes list_in contains alphanumeric tuples
		"""
        most_common_sorted = collections.Counter(list_in).most_common()
        most_common_sorted.sort()
        most_common_sorted.sort(reverse=True, key=lambda item: item[1])
        return most_common_sorted

    # get_most_common_sorted

    #########################
    #	POLICY EXTRACTION	#
    #########################

    def get_policy_link_terms(self):
        """
		Returns a list of terms used to indicate a link may be a policy, 
			note languages are all mixed together.
		"""
        policy_link_terms = []
        # go through json file and merge terms together
        for lang_term_set in json.load(
                open(os.path.dirname(os.path.abspath(__file__)) +
                     '/resources/policyxray/policy_terms.json',
                     'r',
                     encoding='utf-8')):
            for term in lang_term_set['policy_link_terms']:
                policy_link_terms.append(term)
        return policy_link_terms

    # get_policy_link_terms

    def get_policy_verification_terms(self):
        """
		Returns a dictionary of terms used to verify several types of
			policies, note languages are all mixed together.
		"""
        policy_verification_terms = {}
        policy_verification_terms['privacy_policy'] = []
        policy_verification_terms['terms_of_service'] = []
        policy_verification_terms['cookie_policy'] = []
        policy_verification_terms['ad_choices'] = []
        policy_verification_terms['gdpr_statement'] = []
        policy_verification_terms['ccpa_statement'] = []

        # go through json file and merge terms together
        for lang_term_set in json.load(
                open(os.path.dirname(os.path.abspath(__file__)) +
                     '/resources/policyxray/policy_terms.json',
                     'r',
                     encoding='utf-8')):
            # the json keys follow the pattern '<type>_verification_terms'
            for term_type in policy_verification_terms:
                for term in lang_term_set[term_type + '_verification_terms']:
                    policy_verification_terms[term_type].append(term)

        return policy_verification_terms

    # get_policy_verification_terms

    def get_lang_to_privacy_policy_term_dict(self):
        """
		Returns a dict of privacy policy terms keyed by language code.
		"""
        lang_to_terms = {}
        for lang_term_set in json.load(
                open(os.path.dirname(os.path.abspath(__file__)) +
                     '/resources/policyxray/policy_terms.json',
                     'r',
                     encoding='utf-8')):
            lang_to_terms[
                lang_term_set['lang']] = lang_term_set['policy_terms']
        return lang_to_terms

    # get_lang_to_priv_term_dict

    #########################
    #	DOMAIN OWNERSHIP	#
    #########################

    def get_domain_owner_dict(self):
        """
		read out everything in the domain_owner table into a dictionary
			so we can easily use it as a global lookup table
		
		this is purposefully independent of self.patch_domain_owners
			and does not assume the above has been run; however, it will return
			an empty dictionary if the db has not been patched yet

		the reason for the above is that if the user does not wish to update with the
			current json file, historical data will remain consistent
		"""

        # domain_owners is both returned as well as made available to other class functions
        self.domain_owners = {}
        domain_owner_raw_data = self.sql_driver.get_all_domain_owner_data()
        if domain_owner_raw_data:
            for item in domain_owner_raw_data:
                # add everything to the dict
                self.domain_owners[item[0]] = {
                    'parent_id': item[1],
                    'owner_name': item[2],
                    'aliases': json.loads(item[3]),
                    'homepage_url': item[4],
                    'site_privacy_policy_urls': json.loads(item[5]),
                    'service_privacy_policy_urls': json.loads(item[6]),
                    'gdpr_statement_urls': json.loads(item[7]),
                    'terms_of_use_urls': json.loads(item[8]),
                    'platforms': json.loads(item[9]),
                    'uses': json.loads(item[10]),
                    'notes': item[11],
                    'country': item[12]
                }
        return self.domain_owners

    # get_domain_owner_dict

    def get_domain_owner_lineage_ids(self, id):
        """
		for a given domain owner id, return the list which corresponds to its ownership lineage
		"""
        if self.domain_owners[id]['parent_id'] == None:
            return [id]
        else:
            return [id] + self.get_domain_owner_lineage_ids(
                self.domain_owners[id]['parent_id'])

    # get_domain_owner_lineage_ids

    def get_domain_owner_lineage_strings(self, owner_id, get_aliases=False):
        """
		given an owner_id this function returns a list
			which is the full lineage of ownership

		optionally will also return aliases (e.g. 'Doubleclick' and 'Double Click')
		"""
        lineage_strings = []
        for owner_id in self.get_domain_owner_lineage_ids(owner_id):
            lineage_strings.append(
                (owner_id, self.domain_owners[owner_id]['owner_name']))
            if get_aliases:
                for alias in self.domain_owners[owner_id]['aliases']:
                    lineage_strings.append((owner_id, alias))
        return lineage_strings

    # get_domain_owner_lineage_strings

    def get_domain_owner_lineage_combined_string(self, owner_id):
        """
		given an owner_id this function returns a single string
			which is the full lineage of ownership
		"""
        lineage_string = ''
        for item in self.get_domain_owner_lineage_strings(owner_id):
            lineage_string += item[1] + ' > '
        return lineage_string[:-3]

    # get_domain_owner_lineage_combined_string
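    # Illustrative example (hypothetical data, not in the original source): if
    #   the domain_owner table chained DoubleClick -> Google -> Alphabet, then
    #   get_domain_owner_lineage_combined_string(doubleclick_id) would return
    #   'DoubleClick > Google > Alphabet'.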

    def get_domain_owner_child_ids(self, id):
        """
		for a given owner id, get all of its children/subsidiaries
		"""

        # first get all the children ids if they exist
        child_ids = []
        for item in self.domain_owners:
            if self.domain_owners[item]['parent_id'] == id:
                child_ids.append(item)

        # if we have children, call recursively
        if len(child_ids) > 0:
            for child_id in child_ids:
                child_ids.extend(self.get_domain_owner_child_ids(child_id))

        # return an empty list if no children
        return child_ids

    # get_domain_owner_child_ids

    def is_url_valid(self, url):
        """
		Performs checks to verify if the url can actually be
			scanned.
		"""

        # only do http links
        if not (re.match('^https?://.+', url)): return False

        # if we can't get the url_path it is invalid
        try:
            url_path = urlsplit(url.strip().lower()).path
        except:
            return False

        # if we can't do idna conversion it is invalid
        try:
            idna_fixed_netloc = urlsplit(
                url.strip()).netloc.encode('idna').decode('utf-8')
        except:
            return False

        # these are common file types we want to avoid
        illegal_extensions = [
            'apk', 'dmg', 'doc', 'docx', 'exe', 'ics', 'iso', 'pdf', 'ppt',
            'pptx', 'rtf', 'txt', 'xls', 'xlsx'
        ]

        # if we can't parse the extension it doesn't exist and is
        #	therefore ok by our standards
        try:
            url_extension = re.search(r'\.([0-9A-Za-z]+)$', url_path).group(1)
            if url_extension in illegal_extensions: return False
        except:
            return True

        # it's good
        return True

    # is_url_valid

    def idna_encode_url(self, url, no_fragment=False):
        """
		Non-ascii domains will crash some browsers, so we need to convert them to 
			idna/ascii/utf-8. This requires splitting apart the url, converting the 
			domain to idna, and pasting it all back together
		"""
        split_url = urlsplit(url.strip())
        idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
        if no_fragment:
            return urlunsplit((split_url.scheme, idna_fixed_netloc,
                               split_url.path, split_url.query, ''))
        else:
            return urlunsplit(
                (split_url.scheme, idna_fixed_netloc, split_url.path,
                 split_url.query, split_url.fragment))

    # idna_encode_url
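    # Illustrative example (not in the original source): a non-ascii url such
    #   as 'https://bücher.example/katalog' would be returned as
    #   'https://xn--bcher-kva.example/katalog'.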

    def is_url_internal(self, origin_url, target_url):
        """
		Given two urls (origin, target) determines if 
			the target is internal to the origin based on
			the public suffix+1 domain.
		"""

        origin_domain = self.url_parser.get_parsed_domain_info(origin_url)
        target_domain = self.url_parser.get_parsed_domain_info(target_url)

        # we return None to signify we couldn't parse the urls
        if not origin_domain['success'] or not target_domain['success']:
            return None
        else:
            origin_domain = origin_domain['result']['domain']
            target_domain = target_domain['result']['domain']

        if origin_domain != target_domain:
            return False
        else:
            return True
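
The helper methods above that do not touch the database can be exercised on
their own. The following is a minimal usage sketch, not part of the original
listing; it assumes the webxray package (and its ParseURL module) is importable
and constructs Utilities without any database arguments.

from webxray.Utilities import Utilities

util = Utilities()  # no db_name/db_engine: only the URL parser is set up

# resolve a relative link against the page it appeared on
print(util.get_absolute_url_from_page_link('https://example.org/news/', '/about'))
# -> https://example.org/about

# skip file types webXray does not scan
print(util.is_url_valid('https://example.org/report.pdf'))   # False
print(util.is_url_valid('https://example.org/index.html'))   # True

# count (domain, owner_id) tuples and sort by frequency
print(util.get_most_common_sorted([('a.com', 1), ('b.com', 2), ('a.com', 1)]))
# -> [(('a.com', 1), 2), (('b.com', 2), 1)]
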
Example #2
class SingleScan:
	"""
	Loads and analyzes a single page, prints output to the cli.
	Very simple and does not require a db to be configured.
	"""

	def __init__(self):
		self.url_parser		= ParseURL()
		self.domain_owners 	= {}
		self.id_to_owner	= {}
		self.id_to_parent	= {}

		# set up the domain ownership dictionary
		for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')):
			if item['id'] == '-': continue

			self.id_to_owner[item['id']] 	= item['name']
			self.id_to_parent[item['id']] 	= item['parent_id']
			for domain in item['domains']:
				self.domain_owners[domain] = item['id']
	# end init

	def get_lineage(self, id):
		"""
		Find the upward chain of ownership for a given domain.
		"""
		if self.id_to_parent[id] == None:
			return [id]
		else:
			return [id] + self.get_lineage(self.id_to_parent[id])
	# end get_lineage

	def execute(self, url, config):
		"""
		Main function, loads page and analyzes results.
		"""

		print('\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
		print('\tSingle Site Test On: %s' % url)
		print('\t - Browser type is %s' % config['client_browser_type'])
		print('\t - Browser max wait time is %s seconds' % config['client_max_wait'])

		# make sure it is an http(s) address
		if not re.match('^https?://', url): 
			print('\tNot a valid url, aborting')
			return None

		# import and set up specified browser driver
		if config['client_browser_type'] == 'chrome':
			from webxray.ChromeDriver	import ChromeDriver
			browser_driver 	= ChromeDriver(config)
		else:
			print('INVALID BROWSER TYPE FOR %s, QUITTING!' % config['client_browser_type'])
			exit()

		# attempt to get the page
		browser_output = browser_driver.get_scan(url)

		# if there was a problem we print the error
		if browser_output['success'] == False:
			print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result']))
			return
		else:
			browser_output = browser_output['result']

		# get the ip, fqdn, domain, pubsuffix, and tld from the URL
		# we need the domain to figure out if cookies/elements are third-party
		origin_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

		# if we can't get page domain info we bail out
		if origin_ip_fqdn_domain_pubsuffix_tld is None:
			print('could not parse origin domain')
			return None

		origin_ip 			= origin_ip_fqdn_domain_pubsuffix_tld[0]
		origin_fqdn 		= origin_ip_fqdn_domain_pubsuffix_tld[1]
		origin_domain 		= origin_ip_fqdn_domain_pubsuffix_tld[2]
		origin_pubsuffix 	= origin_ip_fqdn_domain_pubsuffix_tld[3]
		origin_tld 			= origin_ip_fqdn_domain_pubsuffix_tld[4]

		print('\n\t------------------{ URL }------------------')
		print('\t %s' % url)
		print('\n\t------------------{ Final URL }------------------')
		print('\t %s' % browser_output['final_url'])
		print('\n\t------------------{ Title }------------------')
		print('\t %s' % browser_output['title'])
		print('\n\t------------------{ Description }------------------')
		print('\t %s' % browser_output['meta_desc'])
		print('\n\t------------------{ Domain }------------------')
		print('\t %s' % origin_domain)
		print('\n\t------------------{ Seconds to Complete Download }------------------')
		print('\t%s' % (browser_output['load_time']))
		print('\n\t------------------{ Cookies }------------------')
		# put relevant fields from cookies into list we can sort
		cookie_list = []
		for cookie in browser_output['cookies']:
			cookie_list.append(cookie['domain']+' -> '+cookie['name']+' -> '+cookie['value'])

		cookie_list.sort()
		for count,cookie in enumerate(cookie_list):
			print(f'\t[{count}] {cookie}')
			
		print('\n\t------------------{ Local Storage }------------------')
		for item in browser_output['dom_storage']:
			print('\t%s (is local: %s): %s' % (item['security_origin'],item['is_local_storage'],item['key']))

		print('\n\t------------------{ Domains Requested }------------------')
		request_domains = set()

		for request in browser_output['requests']:
			# if the request starts with 'data'/etc we can't parse tld anyway, so skip
			if re.match('^(data|about|chrome).+', request['url']):
				continue

			# parse domain from the request url
			domain_info = self.url_parser.get_parsed_domain_info(request['url'])
			if domain_info['success'] == False:
				print('\tUnable to parse domain info for %s with error %s' % (request['url'], domain_info['result']))
				continue

			# if origin_domain != domain_info['result']['domain']:
			request_domains.add(domain_info['result']['domain'])
		
		count = 0
		for domain in sorted(request_domains):
			count += 1
			if domain in self.domain_owners:
				lineage = ''
				for item in self.get_lineage(self.domain_owners[domain]):
					lineage += self.id_to_owner[item]+' > '
				print('\t%s) %s [%s]' % (count, domain, lineage[:-3]))
			else:
				print('\t%s) %s [Unknown Owner]' % (count, domain))
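
A minimal sketch of how this class could be driven, not part of the original
listing; it assumes the webxray package is importable, Chrome is installed in
the expected location, and the 'forensic' profile from Example #1 is suitable.

from webxray.Utilities import Utilities
from webxray.SingleScan import SingleScan

config = Utilities().get_default_config('forensic')
SingleScan().execute('https://example.org', config)
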
Example #3
class ChromeDriver:
	def __init__(self, config, port_offset=1, chrome_path=None, headless=True):
		self.debug = False

		# unpack config
		if self.debug: print(config)
		self.prewait				= config['client_prewait']
		self.no_event_wait 			= config['client_no_event_wait']
		self.max_wait 				= config['client_max_wait']
		self.return_page_text 		= config['client_get_text']
		self.return_bodies 			= config['client_get_bodies']
		self.return_bodies_base64 	= config['client_get_bodies_b64']
		self.return_screen_shot 	= config['client_get_screen_shot']
		self.reject_redirects		= config['client_reject_redirects']
		self.crawl_depth 			= config['client_crawl_depth']
		self.crawl_retries 			= config['client_crawl_retries']
		self.page_load_strategy		= config['client_page_load_strategy']
		self.min_internal_links		= config['client_min_internal_links']
		self.headless 				= headless

		# custom library in /webxray
		self.url_parser = ParseURL()

		# prevents get_scan from closing browser
		#	when we are doing a crawl
		self.is_crawl = False

		# gets overwritten once, so we don't have to keep
		#	figuring it out when doing crawls
		self.browser_type		= None
		self.browser_version 	= None
		self.user_agent			= None

		# we can override the path here
		if chrome_path:
			chrome_cmd = chrome_path
		else:
			# if path is not specified we use the common
			#	paths for each os
			if platform.system() == 'Darwin':
				chrome_cmd = '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome '
			elif platform.system() == 'Linux':
				chrome_cmd = '/usr/bin/google-chrome '
			elif platform.system() == 'Windows':
				chrome_cmd = 'start chrome '
			else:
				print('Unable to determine Operating System and therefore cannot guess correct Chrome path, see ChromeDriver.py for details.')
				exit()

		# use port offset to avoid collisions between processes
		port = 9222+port_offset

		# each process will use its own debugging port or we use default 9222
		chrome_cmd += '--remote-debugging-port=%s' % port

		# sets up blank profile
		chrome_cmd += ' --guest'

		# not sure this really does anything
		chrome_cmd += ' --disable-gpu'

		# disable sandbox so it works inside docker
		chrome_cmd += ' --no-sandbox'

		# set up headless
		if self.headless: chrome_cmd += ' --headless'

		# if we're in production (not debugging) send the subprocess output to /dev/null; None leaves output as normal
		if not self.debug:
			devnull = open(os.devnull, 'w')
		else:
			devnull = None

		# run the command as a subprocess
		if self.debug: print(f'going to run command: "{chrome_cmd}"')
		subprocess.Popen(chrome_cmd,shell=True,stdin=None,stdout=devnull,stderr=devnull,close_fds=True)

		# allow browser to launch
		time.sleep(5)

		# the debugger address has a 'json' path where we can find the websocket
		#	address which is how we send devtools commands, thus we extract the value
		#	"webSocketDebuggerUrl" from the first json object
		try:
			debuggerAddress_json = json.loads(urllib.request.urlopen('http://localhost:%s/json' % port).read().decode())
			if self.debug: print(debuggerAddress_json)
			webSocketDebuggerUrl = debuggerAddress_json[0]['webSocketDebuggerUrl']
			self.launched = True
		except Exception as e:
			self.launched = False
			return

		# third, once we have the websocket address we open a connection
		#	and we are (finally) able to communicate with chrome via devtools!
		# note this connection must be closed!
		self.devtools_connection = create_connection(webSocketDebuggerUrl)

		# important, makes sure we don't get stuck
		#	waiting for messages to arrive
		self.devtools_connection.settimeout(3)

		# this is incremented globally
		self.current_ws_command_id = 0

		# prevent downloading files, the /dev/null is redundant
		if self.debug: print('going to disable downloading')
		response = self.get_single_ws_response('Page.setDownloadBehavior','"behavior":"deny","downloadPath":"/dev/null"')
		if response['success'] == False:
			# note: __init__ cannot return a value, so flag the failure instead
			self.exit()
			self.launched = False
			return
		else:
			response = response['result']
		if self.debug: print(f'{response}')

		# done
		return
	# __init__

	def get_single_ws_response(self,method,params=''):
		"""
		Attempt to send ws_command and return response, note this only works
			if you don't have the queue being flooded with network events,
			handles crashes gracefully.
		"""
		self.current_ws_command_id += 1
		try:
			self.devtools_connection.send('{"id":%s,"method":"%s","params":{%s}}' % (self.current_ws_command_id,method,params))
			return ({
				'success'	: True,
				'result'	: json.loads(self.devtools_connection.recv())
			})
		except:
			return ({
				'success'	: False,
				'result'	: 'Crashed on get_single_ws_response.'
			})
	# get_single_ws_response
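	# Illustrative (not in the original source): a call such as
	#	get_single_ws_response('Browser.getVersion') sends
	#	{"id":<current_ws_command_id>,"method":"Browser.getVersion","params":{}}
	#	over the devtools websocket and returns {'success': True, 'result': ...}
	#	where 'result' is the decoded JSON reply from Chrome.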

	def send_ws_command(self,method,params='',override_id=None):
		"""
		Attempt to send ws_command, handle crashes gracefully.
		"""
		self.current_ws_command_id += 1
		try:
			self.devtools_connection.send('{"id":%s,"method":"%s","params":{%s}}' % (self.current_ws_command_id,method,params))
			return ({
				'success'	: True,
				'result'	: self.current_ws_command_id
			})
		except:
			return ({
				'success'	: False,
				'result'	: 'Crashed on send_ws_command.'
			})
	# send_ws_command

	def get_next_ws_response(self):
		"""
		Either get the next ws response or return None on
			timeout or crash.
		"""
		try:
			return json.loads(self.devtools_connection.recv())
		except:
			return None
	# get_next_ws_response

	def exit(self):
		"""
		Tidy things up before exiting.

		"""
		if self.launched:
			self.send_ws_command('Browser.close')
			self.devtools_connection.close()
	# exit

	def get_crawl(self, url_list):
		"""
		Performs multiple page loads using the same profile,
			which allows cookies to be transferred across loads
			and potentially allowing for more tracking.
		"""

		# setting this globally prevents the browser
		#	from being closed after get_scan
		self.is_crawl = True

		# we return a list which is all the get_scan
		#	results we find
		results = []

		# do each url
		for url in url_list:
			result = self.get_scan(url)
			if result['success']:
				results.append(result['result'])
			else:
				error = result['result']
				self.exit()
				return ({
					'success': False,
					'result': error
				})

		# now it is ok to close the browser/ws connection
		self.exit()

		# done!
		return ({
			'success': True,
			'result': results
		})
	# get_crawl
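	# Illustrative sketch (not in the original source; the urls are placeholders
	#	and config is one of the profiles shown in Example #1):
	#
	#		driver = ChromeDriver(config)
	#		crawl  = driver.get_crawl(['https://example.org', 'https://example.org/about'])
	#		if crawl['success']: print('%s pages captured' % len(crawl['result']))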

	def get_random_crawl(self, seed_url):
		"""
		Based on an initial seed page conducts a first scan to
			get traffic and links, then loads additional pages
			on the same site based on links.

		Note the cookies from each page load carry over, thus we do
			not allow any domain-level redirects on page loads as this
			would skew our ability to categorize cookies as first
			or third-party.
		"""

		# setting this globally prevents the browser
		#	from being closed after get_scan
		self.is_crawl = True

		# we return a list which is all the get_scan
		#	results we find
		results = []

		# removing trailing /
		seed_url = re.sub('/$', '',seed_url)

		if self.debug: print(f'going to scan seed_url {seed_url}')
		result = self.get_scan(seed_url)

		if not result['success']:
			self.exit()
			return ({
				'success': False,
				'result': result["result"]
			})
		else:
			origin_url 		= result['result']['final_url']
			scanned_urls 	= [seed_url]
			results.append(result['result'])

		if self.debug: print(f'origin url is {origin_url}')

		# holds urls we may scan
		unique_urls = set()

		# look at links from the seed page, we will quit
		#	either when we exceed self.crawl_depth or run out of links
		for link in result['result']['all_links']:

			# only do internal links
			if not link['internal']: continue

			# (re)encode the url
			url = self.idna_encode_url(link['href'], no_fragment=True)

			# idna_encode failure yields a None value, skip
			if not url: continue

			# removing trailing /
			url = re.sub('/$', '',url)

			# make sure it is a real web page
			if not self.is_url_valid(url): continue

			# we already scanned this
			if url == seed_url or url == origin_url: continue

			# yay, it's usable
			unique_urls.add(url)

		# no need to do any more scans if we can't find enough urls
		if len(unique_urls) < self.crawl_depth:
			self.exit()
			return ({
				'success'	: False,
				'result'	: 'did not find enough internal links'
			})

		# we allow a certain number of failed page loads, but eventually
		#	we must give up
		failed_urls = []

		# keep scanning pages until we've done enough
		for url in unique_urls:

			# if we have enough results break
			if len(scanned_urls) == self.crawl_depth: break

			# give up!
			if len(failed_urls) > self.crawl_retries:
				self.exit()
				return ({
					'success'	: False,
					'result'	: 'reached fail limit'
				})

			# do the scan
			result = self.get_scan(url)

			# either keep result or keep track of failures
			if result['success']:
				# reject redirects based on origin_url
				is_redirect = self.is_url_internal(origin_url,result['result']['final_url'])
				if is_redirect == None or is_redirect == False:
					if self.debug: print(f"caught redirect from {url} to {result['result']['final_url']}")
					failed_urls.append(url)
				else:
					results.append(result['result'])
					scanned_urls.append(url)
			else:
				if self.debug: print(f"fail on {result['result']}")
				failed_urls.append(url)

		if self.debug:
			print('crawled urls:')
			for res in results:
				print(res['start_url'],res['final_url'])

		# now it is ok to close the browser/ws connection
		self.exit()

		# done!
		num_results = len(results)
		if num_results < self.crawl_depth:
			return ({
				'success': False,
				'result': 'unable to crawl specified number of pages'
			})
		else:
			return ({
				'success': True,
				'result': results
			})
	# get_random_crawl

	def get_scan(self, url, get_text_only=False):
		"""
		The primary function for this class, performs a number of tasks based on the config
			including, but not limited to:

			- capture network traffic
			- capture response bodies
			- capture screen shots
			- capture page text using readability

		Note that if get_text_only is true we only do basic tasks
			such as getting the policy, and we return far less content which is useful
			for doing text capture.
		"""

		# let the games begin
		if self.debug: print('starting %s' % url)

		# we can't start Chrome, return error message as result
		if not self.launched:
			return ({
				'success': False,
				'result': 'Unable to launch Chrome instance, check that Chrome is installed in the expected location, see ChromeDriver.py for details.'
			})

		# Network events are stored as lists of dictionaries which are
		#	returned.
		requests  				= []
		request_extra_headers 	= []
		responses 				= []
		response_extra_headers 	= []
		websockets 				= []
		websocket_events 		= []
		event_source_msgs 		= []
		load_finish_events 		= []

		# Response bodies are keyed to the request_id when they are
		#	returned to calling function, and we get the response bodies
		#	by issuing websocket commands so we first keep track
		#	of which command is linked to which ws_id.  Note this data is
		#	for internal processes and not returned
		ws_id_to_req_id = {}

		# When we get the websocket response we store the body keyed
		#	to the request id, this is returned
		response_bodies = {}

		# We keep dom_storage here, the dict key is a tuple of the securityOrigin,
		# 	isLocalStorage, and the domstorage key. This way we can keep only final
		#	values in cases where they are overwritten.  Note this data is
		#	for internal processes and not returned
		dom_storage_holder 	= {}

		# Before we return the result we store the unique domstorage items to a
		#	list of dicts
		dom_storage = []

		# We merge the following types of websocket events
		websocket_event_types = [
			'Network.webSocketFrameError',
			'Network.webSocketFrameReceived',
			'Network.webSocketFrameSent',
			'Network.webSocketWillSendHandshakeRequest',
			'Network.webSocketHandshakeResponseReceived',
			'Network.webSocketClosed'
		]

		# The timestamps provided by Chrome DevTools are "Monotonically increasing time
		#	in seconds since an arbitrary point in the past."  What this means is they are
		#	essentially offsets (deltas) and not real timestamps.  However, the Network.requestWillBeSent
		#	also has a "wallTime" which is a UNIX timestamp.  So what we do below is set the
		#	origin_walltime to be the earliest wallTime we've seen, as this allows us to later
		#	use the "timestamps" to determine the real-world time when an event happened.
		origin_walltime  = None
		first_timestamp	 = None

		# keeps track of what ws_id belongs to which type of command, we
		#	remove entries when we get a response
		pending_ws_id_to_cmd = {}

		# get browser version and user agent
		if self.debug: print('going to get browser version')
		response = self.get_single_ws_response('Browser.getVersion')
		if response['success'] == False:
			self.exit()
			return response
		elif 'result' not in response['result']:
			self.exit()
			return ({
				'success': False,
				'result': 'No result for ws command'
			})
		else:
			response = response['result']
		if self.debug: print(f'ws response: {response}')

		if not self.browser_type:
			self.browser_type		= re.match('^(.+)?/(.+)$',response['result']['product'])[1]
			self.browser_version 	= re.match('^(.+)?/(.+)$',response['result']['product'])[2]
			self.user_agent			= response['result']['userAgent']

		# remove 'Headless' from the user_agent
		if self.headless:
			response = self.get_single_ws_response('Network.setUserAgentOverride','"userAgent":"%s"' % self.user_agent.replace('Headless',''))
			if response['success'] == False:
				self.exit()
				return response
			elif 'result' not in response['result']:
				self.exit()
				return ({
					'success': False,
					'result': 'No result for ws command'
				})
			else:
				response = response['result']
			if self.debug: print(f'ws response: {response}')

		# enable network and domstorage when doing a network_log
		if not get_text_only:
			if self.debug: print('going to enable network logging')
			response = self.get_single_ws_response('Network.enable')
			if response['success'] == False:
				self.exit()
				return response
			elif 'result' not in response['result']:
				self.exit()
				return ({
					'success': False,
					'result': 'No result for ws command'
				})
			else:
				response = response['result']
			if self.debug: print(f'ws response: {response}')

			if self.debug: print('going to enable domstorage logging')
			response = self.get_single_ws_response('DOMStorage.enable')
			if response['success'] == False:
				self.exit()
				return response
			else:
				response = response['result']
			if self.debug: print(f'ws response: {response}')

			if self.debug: print('going to disable cache')
			response = self.get_single_ws_response('Network.setCacheDisabled','"cacheDisabled":true')
			if response['success'] == False:
				self.exit()
				return response
			else:
				response = response['result']
			if self.debug: print(f'ws response: {response}')

		# start the page load process via devtools Page.navigate, fail gracefully;
		#	the selenium approach (which gave us a timeout) is kept commented out below
		if self.debug: print(f'going to load {url}')
		# try:
		# 	self.driver.get(url)
		# except Exception as e:
		# 	# close browser/websocket
		# 	self.exit()
		# 	# return the error
		# 	return ({
		# 		'success': False,
		# 		'result': str(e)
		# 	})
		response = self.get_single_ws_response('Page.navigate','"url":"%s"' % url)
		if response['success'] == False:
			self.exit()
			return response
		else:
			response = response['result']
		if self.debug: print(f'ws response: {response}')

		# this is the main loop where we get network log data
		if not get_text_only:

			#############################
			# DEVTOOLS NETWORK LOG DATA #
			#############################

			if self.debug: print('##############################')
			if self.debug: print(' Going to process Network Log ')
			if self.debug: print('##############################')

			# Keep track of how long we've been reading ws data
			response_loop_start = datetime.datetime.now()

			# Keep track of when we last saw a Network event
			time_since_last_response = datetime.datetime.now()

			# Length of time since we last saw a Network event
			elapsed_no_event = 0

			# Keep track of what second we are on so we know
			#	when to scroll, is incremented whenever the second
			# 	changes (eg 1.99 -> 2.10 = 1 -> 2)
			last_second = 0

			# We keep collecting devtools_responses in this loop until either we haven't seen
			#	network activity for the no_event_wait value or we exceed the max_wait
			#	time.
			while True:

				# update how long we've been going
				loop_elapsed = (datetime.datetime.now()-response_loop_start).total_seconds()

				# perform a burst of scrolls once a second
				if int(loop_elapsed) > last_second:
					last_second = int(loop_elapsed)
					for i in range(0,10):
						if self.debug: print(f'{last_second} : performing scroll #{i}')
						self.do_scroll()
						self.do_scroll()

				# see if time to stop
				elapsed_no_event = (datetime.datetime.now()-time_since_last_response).total_seconds()
				if loop_elapsed < self.prewait:
					if self.debug: print(f'{loop_elapsed}: In prewait period')

				if loop_elapsed > self.prewait and (elapsed_no_event > self.no_event_wait or loop_elapsed > self.max_wait):
					if self.debug: print(f'{loop_elapsed} No event for {elapsed_no_event}, max_wait is {self.max_wait}, breaking Network log loop.')
					break

				# try to get ws response, returns None if no response
				devtools_response = self.get_next_ws_response()

				# determine how long since we last got a response with
				#	a Network event, if we didn't get a response we wait
				#	for a second
				if devtools_response:
					if 'method' in devtools_response:
						if 'Network' in devtools_response['method']:
							time_since_last_response = datetime.datetime.now()
						else:
							if self.debug: print(f'No events for {elapsed_no_event} seconds; main loop running for {loop_elapsed}')
				else:
					if self.debug: print(f'No events for {elapsed_no_event} seconds; main loop running for {loop_elapsed}')
					time.sleep(1)
					continue

				# if we make it this far devtools_response was not None
				if self.debug: print(loop_elapsed,json.dumps(devtools_response)[:100])

				# PRESENCE OF 'METHOD' MEANS WE PROCESS LOG DATA
				if 'method' in devtools_response:
					# REQUEST
					if devtools_response['method'] == 'Network.requestWillBeSent':
						cleaned_request = self.clean_request(devtools_response['params'])
						cleaned_request['event_order'] = len(requests)

						# update global start time to measure page load time and calculate offsets
						if origin_walltime == None or cleaned_request['wall_time'] < origin_walltime:
							origin_walltime = cleaned_request['wall_time']

						if first_timestamp == None or cleaned_request['timestamp'] < first_timestamp:
							first_timestamp = cleaned_request['timestamp']

						# a redirectResponse means this request was triggered by a redirect,
						#	so record the response that caused it before logging the new request
						if 'redirectResponse' in devtools_response['params']:
							redirect_response = {}
							redirect_response['response'] 		= devtools_response['params']['redirectResponse']
							redirect_response['requestId'] 		= devtools_response['params']['requestId']
							redirect_response['loaderId'] 		= devtools_response['params']['loaderId']
							redirect_response['timestamp']		= devtools_response['params']['timestamp']
							redirect_response['type'] 		 	= devtools_response['params']['type']
							redirect_response['event_order'] 	= len(responses)
							responses.append(self.clean_response(redirect_response))

							cleaned_request['redirect_response_url'] = devtools_response['params']['redirectResponse']['url']
						else:
							cleaned_request['redirect_response_url'] = None

						requests.append(cleaned_request)

					# REQUEST EXTRA INFO
					if devtools_response['method'] == 'Network.requestWillBeSentExtraInfo':
						request_extra_headers.append({
							'request_id'		: devtools_response['params']['requestId'],
							'headers'			: devtools_response['params']['headers'],
							'associated_cookies': devtools_response['params']['associatedCookies']
						})

					# RESPONSE
					if devtools_response['method'] == 'Network.responseReceived':
						responses.append(self.clean_response(devtools_response['params']))

					# RESPONSE EXTRA INFO
					if devtools_response['method'] == 'Network.responseReceivedExtraInfo':
						response_extra_headers.append({
							'request_id'		: devtools_response['params']['requestId'],
							'headers'			: devtools_response['params']['headers'],
							'blocked_cookies'	: devtools_response['params']['blockedCookies'],
						})

					# LOAD FINISHED
					if devtools_response['method'] == 'Network.loadingFinished':
						request_id = devtools_response['params']['requestId']

						load_finish_events.append({
							'encoded_data_length': 	devtools_response['params']['encodedDataLength'],
							'request_id': 			request_id,
							'timestamp': 			devtools_response['params']['timestamp'],
						})

					# WEBSOCKETS
					if devtools_response['method'] == 'Network.webSocketCreated':
						if 'initiator' in devtools_response['params']:
							this_initiator = devtools_response['params']['initiator']
						else:
							this_initiator = None

						websockets.append({
							'request_id'	: devtools_response['params']['requestId'],
							'url'			: devtools_response['params']['url'],
							'initiator'		: this_initiator,
							'event_order'	: len(websockets)
						})

					if devtools_response['method'] in websocket_event_types:
						if 'errorMessage' in devtools_response['params']:
							payload = devtools_response['params']['errorMessage']
						elif 'request' in devtools_response['params']:
							payload = devtools_response['params']['request']
						elif 'response' in devtools_response['params']:
							payload = devtools_response['params']['response']
						else:
							payload = None

						websocket_events.append({
							'request_id'	: devtools_response['params']['requestId'],
							'timestamp'		: devtools_response['params']['timestamp'],
							'event_type'	: devtools_response['method'].replace('Network.',''),
							'payload'		: payload,
							'event_order'	: len(websocket_events)
						})

					# EVENT SOURCE
					if devtools_response['method'] == 'Network.eventSourceMessageReceived':
						event_source_msgs.append({
							'internal_request_id'	: devtools_response['params']['requestId'],
							'timestamp'			: devtools_response['params']['timestamp'],
							'event_name'		: devtools_response['params']['eventName'],
							'event_id'			: devtools_response['params']['eventId'],
							'data'				: devtools_response['params']['data']
						})

					# DOMSTORAGE
					if devtools_response['method'] == 'DOMStorage.domStorageItemAdded' or devtools_response['method'] == 'DOMStorage.domStorageItemUpdated':
						dom_storage_id = devtools_response['params']['storageId']
						ds_key = (
								dom_storage_id['securityOrigin'],
								dom_storage_id['isLocalStorage'],
								devtools_response['params']['key']
						)

						dom_storage_holder[ds_key] = devtools_response['params']['newValue']

			# no need to continue processing if we got nothing back
			if len(responses) == 0:
				self.exit()
				return ({
					'success': False,
					'result': 'No responses for page'
				})

			if len(load_finish_events) == 0:
				self.exit()
				return ({
					'success': False,
					'result': 'No load_finish_events for page'
				})

			# Stop getting additional DOMStorage events
			response = self.send_ws_command('DOMStorage.disable')
			if response['success'] == False:
				self.exit()
				return response
		else:
			# if we are not getting the log we still do the prewait/scroll
			if self.debug: print(f'going to prewait for {self.prewait}')
			for i in range(0,self.prewait):
				self.do_scroll()
				time.sleep(1)

		#####################
		# DEVTOOLS COMMANDS #
		#####################

		# only issue body commands for network_log
		if not get_text_only:
			if self.return_bodies:
				if self.debug: print('######################################')
				if self.debug: print(' Going to send response body commands ')
				if self.debug: print('######################################')

				# send commands to get response bodies
				for event in load_finish_events:
					request_id = event['request_id']
					response = self.send_ws_command('Network.getResponseBody',f'"requestId":"{request_id}"')
					if response['success'] == False:
						self.exit()
						return response
					else:
						ws_id = response['result']
					ws_id_to_req_id[ws_id] = request_id
					pending_ws_id_to_cmd[ws_id] = 'response_body'

				if self.debug: print('\tdone')

			# No longer need Network domain enabled
			response = self.send_ws_command('Network.disable')
			if response['success'] == False:
				self.exit()
				return response

		if self.debug: print('###########################################')
		if self.debug: print(' Going to send devtools websocket commands ')
		if self.debug: print('###########################################')

		# send the ws commands to get above data
		response = self.send_ws_command('Page.getNavigationHistory')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']

		pending_ws_id_to_cmd[ws_id] = 'page_nav'

		response = self.send_ws_command('Runtime.evaluate',params='"expression":"document.documentElement.outerHTML","timeout":1000')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']
		pending_ws_id_to_cmd[ws_id] = 'page_src'

		response = self.send_ws_command('Runtime.evaluate',params='"expression":"document.documentElement.lang","timeout":1000')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']
		pending_ws_id_to_cmd[ws_id] = 'html_lang'

		# LINKS
		js = json.dumps("""
			var wbxr_links = (function () {
				var wbxr_processed_links = [];
				var wbxr_links 			 = document.links;
				for (var wbxr_i = 0; wbxr_i < wbxr_links.length; wbxr_i++) {
					wbxr_processed_links.push({
						'text'		: wbxr_links[wbxr_i]['innerText'],
						'href'		: wbxr_links[wbxr_i]['href'],
						'protocol'	: wbxr_links[wbxr_i]['protocol']
					});
				}
				return (wbxr_processed_links);
			}());
			wbxr_links;
		""")
		response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']
		pending_ws_id_to_cmd[ws_id] = 'links'

		# META_DESC
		js = json.dumps("""
			document.querySelector('meta[name="description" i]').content;
		""")

		response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']
		pending_ws_id_to_cmd[ws_id] = 'meta_desc'

		# PAGE_TEXT / READABILITY_HTML
		#
		# Inject the locally downloaded copy of readability into the page
		#	and extract the content. Note you must download readability on
		#	your own and place in the appropriate directory
		if self.return_page_text or get_text_only:
			# if we can't load readability it likely isn't installed, so we report an error
			try:
				readability_js = open(os.path.dirname(os.path.abspath(__file__))+'/resources/policyxray/Readability.js', 'r', encoding='utf-8').read()
				js = json.dumps(f"""
					var wbxr_readability = (function() {{
						{readability_js}
						var documentClone = document.cloneNode(true);
						var article = new Readability(documentClone).parse();
						return (article);
					}}());
					wbxr_readability;
				""")
				response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true')
				if response['success'] == False:
					self.exit()
					return response
				else:
					ws_id = response['result']
				pending_ws_id_to_cmd[ws_id] = 'page_text'
			except:
				print('\t****************************************************')
				print('\t The Readability.js library is needed for webXray to')
				print('\t  extract text, and it appears to be missing.      ')
				print()
				print('\t Please go to https://github.com/mozilla/readability')
				print('\t  download the file Readability.js and place it     ')
				print('\t  in the directory "webxray/resources/policyxray/"  ')
				print('\t****************************************************')
				self.exit()
				return ({
					'success': False,
					'result': 'Attempting to extract text but Readability.js is not found.'
				})
		else:
			page_text 			= None
			readability_html 	= None

		if self.return_screen_shot:
			# scroll back to top for screen shot
			try:
				self.driver.execute_script('window.scrollTo(0, 0);')
			except:
				pass
			response = self.send_ws_command('Page.captureScreenshot')
			if response['success'] == False:
				self.exit()
				return response
			else:
				ws_id = response['result']
			pending_ws_id_to_cmd[ws_id] = 'screen_shot'
		else:
			screen_shot = None

		# do cookies last
		response  = self.send_ws_command('Network.getAllCookies')
		if response['success'] == False:
			self.exit()
			return response
		else:
			ws_id = response['result']
		pending_ws_id_to_cmd[ws_id] = 'cookies'

		# Keep track of how long we've been reading ws data
		response_loop_start = datetime.datetime.now()

		# just to let us know how much work to do
		if self.debug: print('Pending ws requests: %s %s' % (url, len(pending_ws_id_to_cmd)))

		# Keep going until we get all the pending responses or 3min timeout
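		# How the pairing works: each send_ws_command call above handed back an id
		#	for the message it sent (stored as ws_id, an assumption based on how it
		#	is used here), and devtools echoes that same id back in its reply. We
		#	stored id -> command-name in pending_ws_id_to_cmd, so when a reply with
		#	an 'id' arrives we can tell which command it answers and pop it from
		#	the pending set.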
		while True:

			# if result is None we are either out of responses (prematurely) or
			#	we failed
			devtools_response = self.get_next_ws_response()
			if not devtools_response:
				self.exit()
				return ({
					'success': False,
					'result': 'Unable to get devtools response.'
				})

			# update how long we've been going
			loop_elapsed = (datetime.datetime.now()-response_loop_start).total_seconds()

			# if we're still processing responses after 3 min, kill it
			if loop_elapsed > 180:
				self.exit()
				return ({
					'success': False,
					'result': 'Timeout when processing devtools responses.'
				})

			if self.debug: print(loop_elapsed,json.dumps(devtools_response)[:100])

			# if response has an 'id' see which of our commands it goes to
			if 'id' in devtools_response:
				ws_id = devtools_response['id']

				# we don't care about this
				if ws_id not in pending_ws_id_to_cmd: continue

				# remove the current one from pending
				# if self.debug: print('Going to remove ws_id %s from pending' % ws_id)
				cmd = pending_ws_id_to_cmd[ws_id]
				del pending_ws_id_to_cmd[ws_id]
				if self.debug: print(f'Removing {ws_id}:{cmd}, pending ws_id count is {len(pending_ws_id_to_cmd)}')

				# NAV HISTORY/FINAL_URL
				if cmd == 'page_nav':
					try:
						final_url 	= devtools_response['result']['entries'][-1]['url']
						title 		= devtools_response['result']['entries'][-1]['title']
					except:
						self.exit()
						return ({
							'success': False,
							'result': 'Unable to get final_url,title via Devtools'
						})

					# this is the first point at which we can tell whether it is a redirect, so return now to save further wasted effort
					is_redirect = self.is_url_internal(url,final_url)
					if self.reject_redirects and (is_redirect == None or is_redirect == False):
						self.exit()
						return ({
							'success': False,
							'result': 'rejecting redirect'
						})

				# PAGE SOURCE
				elif cmd == 'page_src':
					try:
						page_source = devtools_response['result']['result']['value']
					except:
						self.exit()
						return ({
							'success': False,
							'result': 'Unable to get page_source via Devtools'
						})

				# HTML LANG
				elif cmd == 'html_lang':
					try:
						lang = devtools_response['result']['result']['value']
					except:
						self.exit()
						return ({
							'success': False,
							'result': 'Unable to get html lang via Devtools'
						})

				# RESPONSE BODIES
				elif cmd == 'response_body':
					if 'result' not in devtools_response:
						if self.debug: print('response body error: %s' % devtools_response)
						continue

					# if we are here we already know return_bodies is true so we
					#	just have to check the response is either not base64 or we
					#	do want to return base64
					if devtools_response['result']['base64Encoded'] == False or self.return_bodies_base64:
						response_bodies[ws_id_to_req_id[ws_id]] = {
								'body': 	 devtools_response['result']['body'],
								'is_base64': devtools_response['result']['base64Encoded']
						}

				# SCREENSHOT
				elif cmd == 'screen_shot':
					if 'result' in devtools_response:
						screen_shot = devtools_response['result']['data']

				# COOKIES
				elif cmd == 'cookies':
					try:
						cookies = devtools_response['result']['cookies']
					except:
						self.exit()
						return ({
							'success': False,
							'result': 'Unable to get cookies via Devtools'
						})

				# LINKS
				elif cmd == 'links':
					try:
						js_links = devtools_response['result']['result']['value']
					except:
						js_links = []

				# META_DESC
				elif cmd == 'meta_desc':
					try:
						meta_desc = devtools_response['result']['result']['value']
					except:
						meta_desc = None

				# PAGE_TEXT
				elif cmd == 'page_text':
					# if we don't get a result we don't care
					try:
						page_text 			= devtools_response['result']['result']['value']['textContent']
						readability_html 	= devtools_response['result']['result']['value']['content']
					except:
						page_text 			= None
						readability_html 	= None

			# we've gotten all the responses we need, break
			if len(pending_ws_id_to_cmd) == 0:
				if self.debug: print('Got all ws responses!')
				break
		# end ws loop

		# catch redirect to illegal url
		if not self.is_url_valid(final_url):
			self.exit()
			return ({
				'success': False,
				'result': 'Redirected to illegal url: '+final_url
			})

		# process links and mark if internal
		all_links = []
		internal_link_count = 0
		for link in js_links:
			# filtering steps
			if 'href' not in link: continue
			if len(link['href']) == 0: continue
			if link['protocol'][:4] != 'http': continue

			# get rid of trailing # and /
			if link['href'].strip()[-1:] == '#': link['href'] = link['href'].strip()[:-1]
			if link['href'].strip()[-1:] == '/': link['href'] = link['href'].strip()[:-1]

			# sometimes the text will be a dict (very rarely)
			# 	so we convert to string
			link_text = str(link['text']).strip()

			# set up the dict
			if self.is_url_internal(final_url,link['href']):
				internal_link_count += 1
				link = {
					'text'		: link_text,
					'href'		: link['href'].strip(),
					'internal'	: True
				}
			else:
				link = {
					'text'		: link_text,
					'href'		: link['href'].strip(),
					'internal'	: False
				}

			# only add unique links
			if link not in all_links:
				all_links.append(link)

		# fail if we don't have enough internal links
		if self.min_internal_links:
			if internal_link_count < self.min_internal_links:
				self.exit()
				return ({
					'success': False,
					'result': 'did not find enough internal links'
				})

		# reprocess domstorage into list of dicts if doing network_log
		if not get_text_only:
			if self.debug: print('Fixing domstorage')
			for ds_key in dom_storage_holder:
				dom_storage.append({
					'security_origin'	: ds_key[0],
					'is_local_storage'	: ds_key[1],
					'key'				: ds_key[2],
					'value'				: dom_storage_holder[ds_key]
				})

		################################################
		# FIX TIMESTAMPS: ONLY NEEDED FOR NETWORK_LOG #
		################################################
		if not get_text_only:
			# See the note above regarding how chrome timestamps work; in the blocks
			#	below we fix the timestamps to reflect real-world time.
			if self.debug: print('Fixing timestamps')

			# likely nothing was loaded
			if not first_timestamp:
				self.exit()
				return ({
					'success': False,
					'result': 'first_timestamp was None'
				})

			# Page load time is the delta between origin_walltime and final_walltime.
			#	We initialize final_walltime to None; if it never gets updated, nothing
			#	was loaded and the scan failed.
			final_walltime = None

			# As we update the load_finish_event timestamps we also update the final_walltime.
			for load_finish_event in load_finish_events:
				fixed_timestamp = self.fixed_timestamp(origin_walltime, first_timestamp, load_finish_event['timestamp'])
				load_finish_event['timestamp'] = fixed_timestamp
				if final_walltime == None or fixed_timestamp > final_walltime:
					final_walltime = fixed_timestamp

			# These timestamp fixes are straightforward
			for request in requests:
				request['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, request['timestamp'])

			for response in responses:
				response['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, response['timestamp'])

			for websocket_event in websocket_events:
				websocket_event['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, websocket_event['timestamp'])

			for event_source_msg in event_source_msgs:
				event_source_msg['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, event_source_msg['timestamp'])

			# Session cookies have an expires value of -1, so we set those to None
			for cookie in cookies:
				if cookie['expires'] and cookie['expires'] <= 0:
					cookie['expires'] = None

			# If origin_walltime or final_walltime are None that means
			#	we didn't record any Network.requestWillBeSent or
			#	Network.loadingFinished events, and this was not a successful
			#	page load
			if origin_walltime == None or final_walltime == None:
				self.exit()
				return ({
					'success': False,
					'result': 'Unable to calculate load time, possibly nothing was loaded'
				})
			else:
				# get seconds between the last time we got a load finish and
				#	the first request
				load_time = (datetime.datetime.fromtimestamp(final_walltime) - datetime.datetime.fromtimestamp(origin_walltime)).total_seconds()
				#load_time = 0
		else:
			# we only do a prewait if not doing network log
			load_time = self.prewait

		# other parts of webxray expect this data format, common to all browser drivers used
		if self.debug: print('returning data on %s' % url)
		return_dict = {
			'accessed'				: origin_walltime,
			'all_links'				: all_links,
			'client_timezone'		: '_'.join(time.tzname),
			'browser_type'			: self.browser_type,
			'browser_version'		: self.browser_version,
			'prewait'				: self.prewait,
			'no_event_wait' 		: self.no_event_wait,
			'max_wait' 				: self.max_wait,
			'start_url'				: url,
			'final_url'				: final_url,
			'title'					: title,
			'meta_desc'				: meta_desc,
			'lang'					: lang,
			'load_time'				: load_time,
			'requests'				: requests,
			'request_extra_headers'	: request_extra_headers,
			'responses'				: responses,
			'response_extra_headers': response_extra_headers,
			'load_finish_events'	: load_finish_events,
			'websockets'			: websockets,
			'websocket_events'		: websocket_events,
			'event_source_msgs'		: event_source_msgs,
			'response_bodies'		: response_bodies,
			'cookies'				: cookies,
			'dom_storage'			: dom_storage,
			'page_source'			: page_source,
			'page_text'				: page_text,
			'readability_html'		: readability_html,
			'screen_shot'			: screen_shot,
			'page_load_strategy'	: self.page_load_strategy
		}

		# Close browser and websocket connection, if doing a crawl
		#	this happens in get_crawl_traffic
		if self.is_crawl == False: self.exit()

		# done!
		return ({
			'success': True,
			'result': return_dict
		})
	# get_scan
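	# Illustrative only (not part of webXray itself): every exit path of get_scan()
	#	above returns a dict of the form {'success': bool, 'result': ...}, and on
	#	success 'result' is the return_dict built at the end. A caller-side sketch,
	#	where 'driver' is an assumed instance of this class:
	#
	#		scan = driver.get_scan(url)
	#		if scan['success']:
	#			data = scan['result']
	#			print(data['final_url'], data['load_time'], len(data['responses']))
	#		else:
	#			print('scan failed:', scan['result'])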

	def clean_request(self, request_params):
		"""
		Many of the request fields are optional, so we set them
			to None if not present and also normalize the naming
			convention.  Returns a dict.
		"""

		cleaned_request = {}

		# get non-optional values first
		cleaned_request['request_id'] 		= request_params['requestId']
		cleaned_request['loader_id'] 		= request_params['loaderId']
		cleaned_request['document_url'] 	= request_params['documentURL']
		cleaned_request['timestamp'] 		= request_params['timestamp']
		cleaned_request['wall_time'] 		= request_params['wallTime']
		cleaned_request['initiator'] 		= request_params['initiator']

		# handle optional values in main params
		if 'type' in request_params:
			cleaned_request['type'] = request_params['type']
		else:
			cleaned_request['type'] = None

		if 'frameId' in request_params:
			cleaned_request['frame_id'] = request_params['frameId']
		else:
			cleaned_request['frame_id'] = None

		if 'hasUserGesture' in request_params:
			cleaned_request['has_user_gesture'] = request_params['hasUserGesture']
		else:
			cleaned_request['has_user_gesture'] = None

		if 'redirectResponse' in request_params:
			cleaned_request['redirect_response_url'] = request_params['redirectResponse']['url']
		else:
			cleaned_request['redirect_response_url'] = None

		# for readability
		this_request = request_params['request']

		# get non-optional values first
		cleaned_request['url'] 				= this_request['url']
		cleaned_request['method'] 			= this_request['method']
		cleaned_request['headers'] 			= this_request['headers']
		cleaned_request['initial_priority'] = this_request['initialPriority']
		cleaned_request['referrer_policy'] 	= this_request['referrerPolicy']

		# handle optional values in request
		if 'urlFragment' in this_request:
			cleaned_request['url_fragment'] = this_request['urlFragment']
		else:
			cleaned_request['url_fragment'] = None

		if 'postData' in this_request:
			cleaned_request['post_data'] = this_request['postData']
		else:
			cleaned_request['post_data'] = None

		if 'mixedContentType' in this_request:
			cleaned_request['mixed_content_type'] = this_request['mixedContentType']
		else:
			cleaned_request['mixed_content_type'] = None

		if 'isLinkPreload' in this_request:
			cleaned_request['is_link_preload'] = this_request['isLinkPreload']
		else:
			cleaned_request['is_link_preload'] = None

		# done!
		return cleaned_request
	# clean_request
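	# Illustrative only: a trimmed Network.requestWillBeSent 'params' dict of the
	#	kind clean_request() expects (values are made up for the example):
	#
	#		params = {
	#			'requestId': '1000.1', 'loaderId': '1000.1',
	#			'documentURL': 'https://example.com/', 'timestamp': 1000.0,
	#			'wallTime': 1600000000.0, 'initiator': {'type': 'other'},
	#			'request': {
	#				'url': 'https://example.com/script.js', 'method': 'GET',
	#				'headers': {}, 'initialPriority': 'High',
	#				'referrerPolicy': 'no-referrer-when-downgrade'
	#			}
	#		}
	#
	#	clean_request(params) returns snake_case keys ('request_id', 'document_url',
	#	'post_data', ...) with all missing optional fields set to None.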

	def clean_response(self, response_params):
		"""
		Many of the response fields are optional, so we set them
			to None if not present and also normalize the naming
			convention.  Returns a dict.
		"""

		cleaned_response = {}

		# get non-optional param values first
		cleaned_response['request_id'] 	= response_params['requestId']
		cleaned_response['loader_id'] 	= response_params['loaderId']
		cleaned_response['timestamp'] 	= response_params['timestamp']
		cleaned_response['type'] 		= response_params['type']

		# handle optional param values
		if 'frameId' in response_params:
			cleaned_response['frame_id'] = response_params['frameId']
		else:
			cleaned_response['frame_id'] = None

		# handle non-optional response values
		this_response = response_params['response']

		cleaned_response['url'] 				= this_response['url']
		cleaned_response['status'] 				= this_response['status']
		cleaned_response['status_text']			= this_response['statusText']
		cleaned_response['response_headers'] 	= this_response['headers']
		cleaned_response['mime_type'] 			= this_response['mimeType']
		cleaned_response['connection_reused'] 	= this_response['connectionReused']
		cleaned_response['connection_id']		= this_response['connectionId']
		cleaned_response['encoded_data_length'] = this_response['encodedDataLength']
		cleaned_response['security_state'] 		= this_response['securityState']

		# handle optional response values
		if 'requestHeaders' in this_response:
			cleaned_response['request_headers'] = this_response['requestHeaders']
		else:
			cleaned_response['request_headers'] = None

		if 'remoteIPAddress' in this_response:
			cleaned_response['remote_ip_address'] = this_response['remoteIPAddress']
		else:
			cleaned_response['remote_ip_address'] = None

		if 'remotePort' in this_response:
			cleaned_response['remote_port'] = this_response['remotePort']
		else:
			cleaned_response['remote_port'] = None

		if 'fromDiskCache' in this_response:
			cleaned_response['from_disk_cache'] = this_response['fromDiskCache']
		else:
			cleaned_response['from_disk_cache'] = None

		if 'fromServiceWorker' in this_response:
			cleaned_response['from_service_worker'] = this_response['fromServiceWorker']
		else:
			cleaned_response['from_service_worker'] = None

		if 'fromPrefetchCache' in this_response:
			cleaned_response['from_prefetch_cache'] = this_response['fromPrefetchCache']
		else:
			cleaned_response['from_prefetch_cache'] = None

		if 'timing' in this_response:
			cleaned_response['timing'] = this_response['timing']
		else:
			cleaned_response['timing'] = None

		if 'protocol' in this_response:
			cleaned_response['protocol'] = this_response['protocol']
		else:
			cleaned_response['protocol'] = None

		if 'securityDetails' in this_response:
			cleaned_response['security_details'] = this_response['securityDetails']
		else:
			cleaned_response['security_details'] = None

		# done!
		return cleaned_response
	# clean_response
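	# Illustrative only: as with clean_request(), a Network.responseReceived
	#	'params' dict passed to clean_response() comes back with snake_case keys,
	#	and any optional field chrome omitted (eg 'remoteIPAddress' on a cached
	#	response) is returned as None so downstream db inserts never hit a KeyError.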

	def fixed_timestamp(self,origin_walltime,first_timestamp,timestamp):
		"""
		See notes above for details.
		"""
		# first calculate the timestamp offset
		elapsed_time = timestamp - first_timestamp

		# now add offset to the origin time to get the real time
		return origin_walltime + elapsed_time
	# fixed_timestamp
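	# Worked example (illustrative numbers): if the first request was observed at
	#	wallTime 1600000000.0 with monotonic timestamp 1000.0, then an event with
	#	timestamp 1002.5 maps to 1600000000.0 + (1002.5 - 1000.0) = 1600000002.5,
	#	i.e. real-world time 2.5 seconds after the first request.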

	def is_url_valid(self, url):
		"""
		Performs checks to verify if the url can actually be
			scanned.
		"""

		# only do http links
		if not (re.match('^https?://.+', url)): return False

		# if we can't get the url_path it is invalid
		try:
			url_path = urlsplit(url.strip().lower()).path
		except:
			return False

		# these are common file types we want to avoid
		illegal_extensions = [
			'apk',
			'dmg',
			'doc',
			'docx',
			'exe',
			'ics',
			'iso',
			'pdf',
			'ppt',
			'pptx',
			'rtf',
			'txt',
			'xls',
			'xlsx'
		]

		# if we can't parse the extension it doesn't exist and is
		#	therefore ok by our standards
		try:
			url_extension = re.search(r'\.([0-9A-Za-z]+)$', url_path).group(1)
			if url_extension in illegal_extensions: return False
		except:
			return True

		# it's good
		return True
	# is_url_valid
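	# Illustrative behaviour of is_url_valid():
	#	'https://example.com/page'       -> True  (http(s), no blocked extension)
	#	'ftp://example.com/file'         -> False (not http/https)
	#	'https://example.com/report.pdf' -> False ('pdf' is in illegal_extensions)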

	def idna_encode_url(self, url, no_fragment=False):
		"""
		Non-ascii domains will crash some browsers, so we need to convert them to
			idna/ascii/utf-8. This requires splitting apart the url, converting the
			domain to idna, and pasting it all back together.  Note that this can fail,
			particularly for idna encoding of invalid addresses (eg http://.example.com),
			so we return None in the event of failure.
		"""
		try:
			split_url = urlsplit(url.strip())
			idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
			if no_fragment:
				return urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,''))
			else:
				return urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))
		except:
			return None
	# idna_encode_url
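	# Illustrative: idna_encode_url('http://bücher.example/path?q=1') would return
	#	'http://xn--bcher-kva.example/path?q=1', while an unparseable address such
	#	as 'http://.example.com' (empty domain label) returns None.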

	def is_url_internal(self,origin_url,target_url):
		"""
		Given two urls (origin, target), determines if
			the target is internal to the origin based on
			the public suffix + 1 (registrable) domain.
		"""

		origin_domain 	= self.url_parser.get_parsed_domain_info(origin_url)
		target_domain	= self.url_parser.get_parsed_domain_info(target_url)

		# we return None to signify we couldn't parse the urls
		if not origin_domain['success'] or not target_domain['success']:
			return None
		else:
			origin_domain 	= origin_domain['result']['domain']
			target_domain  	= target_domain['result']['domain']

		if origin_domain != target_domain:
			return False
		else:
			return True
	# is_url_internal
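	# Illustrative: assuming ParseURL resolves both urls to the registrable domain
	#	'example.com', is_url_internal('https://www.example.com/',
	#	'https://blog.example.com/post') would return True, a link to a domain such
	#	as 'https://tracker.example-ads.net/x.js' would return False, and None
	#	signals that one of the urls could not be parsed.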

	def do_scroll(self):
		"""
		Performs a random scroll action on the Y axis; can be called at regular
			intervals to surface content on pages.
		"""
		self.send_ws_command('Input.dispatchMouseEvent','"x":0,"y":0,"type":"mouseWheel","deltaX":0,"deltaY":%s' % random.randrange(10,100))
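		# Illustrative: a single call composes devtools params along the lines of
		#	'"x":0,"y":0,"type":"mouseWheel","deltaX":0,"deltaY":42', nudging the
		#	page down by a random amount between 10 and 99 per call.
	# do_scroll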
Example #4
class OutputStore:
	"""	
	This class receives data from the browser, processes it, and stores it in the db
	"""

	def __init__(self, db_name, db_engine):
		self.db_name	= db_name
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
		self.debug		= False
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(self.db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()
		self.config 	= self.sql_driver.get_config()
	# __init__

	def close(self):
		"""
		Just to make sure we close the db connection.
		"""
		self.sql_driver.close()
	# close

	def store_scan(self, params):
		"""
		This function pre-processes data from the browser, inserts it into 
			database, and handles linking various entries across tables.
		"""

		# unpack params
		browser_output 	= params['browser_output']
		client_id 		= params['client_id']
		crawl_id 		= params['crawl_id']
		crawl_timestamp = params['crawl_timestamp']
		crawl_sequence	= params['crawl_sequence']

		# client_ip is optional
		if 'client_ip' in params:
			client_ip = params['client_ip']
		else:
			client_ip = None

		if self.debug: print('going to store scan %s' % browser_output['start_url'])

		# keep track of domains
		page_3p_cookie_domains 		= set()
		page_3p_dom_storage_domains = set()
		page_3p_request_domains 	= set()
		page_3p_response_domains 	= set()
		page_3p_websocket_domains 	= set()

		# convert from timestamp to datetime object that will go to the db
		accessed = datetime.fromtimestamp(browser_output['accessed'])

		# first make sure we don't have it already
		if self.sql_driver.page_exists(browser_output['start_url'],accessed): 
			return {'success': False, 'result': 'exists in db already'}

		# if we have no responses the page didn't load at all and we skip,
		#	unless we are using the basic driver, in which case it's ok
		if len(browser_output['responses']) == 0 and browser_output['browser_type'] != 'basic':
			return {'success': False, 'result': 'no responses received'}

		# ignore any malformed unicode characters
		page_source = browser_output['page_source'].encode('utf-8', 'ignore').decode()

		# store source
		if self.config['store_source']:
			if self.debug: print('going to store source %s' % browser_output['start_url'])
			page_source_md5 = self.store_file(page_source, False, 'page_source')
		else:
			page_source_md5 = None

		# store readability_html
		if self.config['store_page_text'] and browser_output['page_text']:
			if self.debug: print('going to store readability_html')
			# ignore any malformed unicode characters
			readability_html 		= browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip()
			readability_source_md5 	= self.store_file(readability_html, False, 'readability_html')

			# store_page_text handles some additional operations
			if self.debug: print('going to store page_text')
			page_text_id = self.store_page_text(readability_html,readability_source_md5)
		else:
			page_text_id 			= None

		# process info on the start_url domain
		if self.debug: print('going to parse start/final_url %s' % browser_output['start_url'])
		start_url = browser_output['start_url']
		start_url_domain_info = self.url_parser.get_parsed_domain_info(start_url)
		if start_url_domain_info['success'] == False:
			err_msg = 'unable to parse start_url_domain_info info for %s with error %s' % (browser_output['start_url'], start_url_domain_info['result'])
			if self.debug: print(err_msg)
			self.sql_driver.log_error({
				'client_id'		: client_id, 
				'target'		: start_url, 
				'task'			: 'output_store',
				'msg'			: err_msg
			})
			return {'success': False, 'result': 'could not parse start_url'}
		else:
			# needed for comparisons later on
			start_url_domain = start_url_domain_info['result']['domain']

			# add start_url domain and get id
			start_url_domain_id = self.sql_driver.add_domain(start_url_domain_info['result'])

		# process info on the final_url domain
		# note: we use the final_url domain as the benchmark for determining 1p/3p
		final_url = browser_output['final_url']
		final_url_domain_info = self.url_parser.get_parsed_domain_info(final_url)
		if final_url_domain_info['success'] == False:
			err_msg = 'unable to parse final_url_domain_info info for %s with error %s' % (browser_output['final_url'], final_url_domain_info['result'])
			if self.debug: print(err_msg)
			self.sql_driver.log_error({
				'client_id'		: client_id, 
				'target'		: start_url, 
				'task'			: 'output_store',
				'msg'			: err_msg
			})
			return {'success': False, 'result': 'could not parse final_url'}
		else:
			final_url_domain = final_url_domain_info['result']['domain']
			# self.sql_driver.add_domain both stores the new domain and returns its db row id
			# if it is already in db just return the existing id
			final_url_domain_id = self.sql_driver.add_domain(final_url_domain_info['result'])

		# check if the page has redirected to a new domain
		if start_url_domain != final_url_domain:
			page_domain_redirect = True
		else:
			page_domain_redirect = False

		# this is semi-redundant but ensures that any config changes made while
		#	a result is queued are followed
		if self.config['client_reject_redirects'] and page_domain_redirect:
			return {'success': False, 'result': 'rejecting redirect'}

		# if the final page is https (often after a redirect), mark it appropriately
		if browser_output['final_url'][:5] == 'https':
			page_is_ssl = True
		else:
			page_is_ssl = False

		# (optionally) process and store links; this allows us to go back later and do deeper scans
		#	as well as do more with policies
		
		# links starts as empty list
		links = []

		# keep track of link counts, which is helpful for filtering pages
		link_count_internal = 0
		link_count_external = 0

		if self.config['store_links']:

			if self.debug: print('going to process links %s' % browser_output['start_url'])

			# we use the list of policy_link_terms to flag that a link *might*
			# 	be for a policy; we check whether it actually is a policy in PolicyCollector.py
			policy_link_terms = self.utilities.get_policy_link_terms()

			# process links, duplicates get ignored by db
			for link in browser_output['all_links']:
				# skip if href not valid
				if not self.utilities.is_url_valid(link['href']): continue

				# unpack values and catch any unicode errors
				link_text = link['text'].encode('utf-8', 'ignore').decode()
				link_url  = link['href'].encode('utf-8', 'ignore').decode()

				# get rid of trailing # and /
				if link_url.strip()[-1:] == '#': link_url = link_url.strip()[:-1]
				if link_url.strip()[-1:] == '/': link_url = link_url.strip()[:-1]

				# sometimes the text will be a dict (very rarely)
				# 	so we convert to string
				link_text = str(link_text).strip()

				# clean up white space and remove line breaks
				link_text = re.sub(r'\n|\r|\t|\s+',' ',link_text.strip())
				link_url  = re.sub(r'\n|\r|\t|\s+',' ',link_url.strip())

				# catch nulls
				link_text = link_text.replace('\x00','NULL_REPLACED_FOR_PSQL')
				link_url  = link_url.replace('\x00','NULL_REPLACED_FOR_PSQL')

				# update counts
				if link['internal']:
					link_count_internal += 1
				else:
					link_count_external += 1

				# flag links that could be policies, default False
				link_is_policy = False

				# determine if a policy term appears in the link
				for policy_term in policy_link_terms:
					if policy_term in link_text.lower():
						link_is_policy = True
						break

				link_domain_info = self.url_parser.get_parsed_domain_info(link_url)
				if link_domain_info['success'] == False:
					# don't bother with storing errors
					link_domain_id = None
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# 	if it is already in db just return the existing id
					link_domain_id = self.sql_driver.add_domain(link_domain_info['result'])

				links.append({
					'url'			: link_url, 
					'text'			: link_text, 
					'is_internal'	: link['internal'], 
					'is_policy'		: link_is_policy, 
					'domain_id'		: link_domain_id
				})

		# if we got the screen shot we get the hash and store it to the file table
		screen_shot_md5 = None
		if browser_output['screen_shot'] and self.config['store_screen_shot']:
			if self.debug: print('going to store screen shot %s' % browser_output['start_url'])
			# store file to get md5
			screen_shot_md5 = self.store_file(browser_output['screen_shot'],True,'screen_shot')

		# if we have a crawl_timestamp it is also an 'accessed' value from
		#	a page load, so we convert it as well
		if crawl_timestamp:
			crawl_timestamp = datetime.fromtimestamp(crawl_timestamp)

		# ignore any malformed unicode characters
		if browser_output['title']:
			browser_output['title'] = browser_output['title'].encode('utf-8', 'ignore').decode()

		if browser_output['meta_desc']:
			browser_output['meta_desc'] = browser_output['meta_desc'].encode('utf-8', 'ignore').decode()

		if browser_output['lang']:
			browser_output['lang'] = browser_output['lang'].encode('utf-8', 'ignore').decode()

		# now we know link counts we can store the page
		if self.debug: print('going to store page %s' % browser_output['start_url'])
		page_id = self.sql_driver.add_page({
			'accessed'				: accessed,
			'browser_type'			: browser_output['browser_type'],
			'browser_version'		: browser_output['browser_version'],
			'browser_prewait'		: browser_output['prewait'],
			'browser_no_event_wait'	: browser_output['no_event_wait'],
			'browser_max_wait'		: browser_output['max_wait'],
			'page_load_strategy'	: browser_output['page_load_strategy'],
			'title'					: browser_output['title'],
			'meta_desc'				: browser_output['meta_desc'],
			'lang'					: browser_output['lang'],
			'start_url'				: browser_output['start_url'],
			'final_url'				: browser_output['final_url'],
			'is_ssl'				: page_is_ssl,
			'page_domain_redirect'	: page_domain_redirect,
			'link_count_internal'	: link_count_internal,
			'link_count_external'	: link_count_external,
			'load_time'				: browser_output['load_time'],
			'start_url_domain_id'	: start_url_domain_id,
			'final_url_domain_id'	: final_url_domain_id,
			'client_id'				: client_id,
			'client_timezone'		: browser_output['client_timezone'],
			'client_ip'				: client_ip,
			'page_text_id'			: page_text_id,
			'screen_shot_md5'		: screen_shot_md5,
			'page_source_md5'		: page_source_md5,
			'crawl_id'				: crawl_id,
			'crawl_timestamp'		: crawl_timestamp,
			'crawl_sequence'		: crawl_sequence
		})

		# STORE LINKS
		if self.config['store_links']:
			if self.debug: print('going to store links %s' % browser_output['start_url'])
			for link in links:
				link_id = self.sql_driver.add_link(link)
				if link_id: self.sql_driver.join_link_to_page(page_id,link_id)

		# PROCESS DOM_STORAGE
		if self.config['store_dom_storage']:
			if self.debug: print('going to process dom storage %s' % browser_output['start_url'])
			for dom_storage in browser_output['dom_storage']:
				# parse domain from the security_origin, which is equivalent to a url
				domain_info = self.url_parser.get_parsed_domain_info(dom_storage['security_origin'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (dom_storage['security_origin'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					dom_storage['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party storage
				if final_url_domain != domain_info['result']['domain']:
					dom_storage['is_3p'] = True
				else:
					dom_storage['is_3p'] = False

				# key to page
				dom_storage['page_id'] = page_id

				# replace null b/c postgres will die otherwise
				dom_storage['key']		= dom_storage['key'].replace('\x00','NULL_REPLACED_FOR_PSQL')
				dom_storage['value']	= dom_storage['value'].replace('\x00','NULL_REPLACED_FOR_PSQL')

				# there are types of illegal utf-8 characters that psql doesn't like, eg trying to store
				#	'\uded5' gives this error when storing in psql: 
				#	'UnicodeEncodeError: 'utf-8' codec can't encode character '\uded5' in position 0: surrogates not allowed'
				#
				# to overcome the above, we use python's backslashreplace to keep the original data in 
				#	a way that won't cause our queries to die
				# see https://docs.python.org/3/library/codecs.html#error-handlers
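				# eg the single character '\uded5' becomes the six literal characters
				#	'\\uded5' after the encode/decode round trip below, which psql
				#	will store without complaint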
				dom_storage['key']		= dom_storage['key'].encode('utf-8','backslashreplace')
				dom_storage['value']	= dom_storage['value'].encode('utf-8','backslashreplace')

				# now that we've encoded with backslashes we decode to get the semi-original data
				dom_storage['key']		= dom_storage['key'].decode('utf-8')
				dom_storage['value']	= dom_storage['value'].decode('utf-8')

				# all done with this item
				self.sql_driver.add_dom_storage(dom_storage)

				# update domains
				if dom_storage['is_3p']:
					page_3p_dom_storage_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# PROCESS LOAD FINISH
		if self.debug: print('going to process load finish data %s' % browser_output['start_url'])
		load_finish_data = {}
		for load_finish_event in browser_output['load_finish_events']:
			load_finish_data[load_finish_event['request_id']] = load_finish_event['encoded_data_length']

		# RESPONSE EXTRA HEADERS
		if self.debug: print('going to process response extra header data %s' % browser_output['start_url'])
		http_cookies = []
		internal_id_to_resp_ex_headers = {}
		for response_extra_header in browser_output['response_extra_headers']:
			response_extra_header['page_id'] 		= page_id
			response_extra_header['cookies_set']	= None
			
			# to check for domain leakage in headers we make a big string keyed to the internal id
			if response_extra_header['request_id'] not in internal_id_to_resp_ex_headers:
				internal_id_to_resp_ex_headers[response_extra_header['request_id']] = str(response_extra_header['headers'])
			else:
				internal_id_to_resp_ex_headers[response_extra_header['request_id']] += str(response_extra_header['headers'])

			for item in response_extra_header['headers']:
				if item.lower() == 'set-cookie':
					response_extra_header['cookies_set'] = response_extra_header['headers'][item]

					# when we add cookies later on we mark those that came from response headers,
					#	note we try/pass on this in case we can't parse
					for cookie in response_extra_header['cookies_set'].split('\n'):
						if 'domain' in cookie.lower():
							try:
								name = re.match('^(.+?)=',cookie)[0][:-1]
								domain = re.match('^.+domain=(.+?)(;|$)',cookie.lower())[1]
								if domain[0] == '.': domain = domain[1:]
								http_cookies.append((domain,name))
							except:
								pass

			if self.config['store_response_xtra_headers']:
				self.sql_driver.add_response_extra_header(response_extra_header)

		# PROCESS RESPONSES
		response_received_req_ids = []
		
		if self.debug: print('going to process response data %s' % browser_output['start_url'])
		
		for response in browser_output['responses']:
			
			# default values that may get overwritten
			response['file_md5'] 				= None
			response['is_data']  				= False
			response['is_3p'] 					= None
			response['is_ssl']					= None
			response['page_domain_in_headers'] 	= False

			# first handle non-http urls and optionally store content
			if re.match('^(data|about|chrome|blob|javascript).+', response['url']):
				if 'base64' in response['url'].lower() or 'image' in response['type'].lower():
					is_base64 = True
				else:
					is_base64 = False
					
				# store_file follows the config as far as actually storing the file goes 
				#	and will either return the md5 or None
				# make sure we're following our configuration
				if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
					response['file_md5'] = self.store_file(response['url'],is_base64,response['type'])
				else:
					response['file_md5'] = None

				response['url']	      = None
				response['is_data']   = True
				response['domain_id'] = None
			else:
				# parse, store, and get id of domain; if fails skip
				domain_info = self.url_parser.get_parsed_domain_info(response['url'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (response['url'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					response_domain = domain_info['result']['domain']
					response['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# now add ip
				if response['remote_ip_address']:
					self.sql_driver.add_domain_ip_addr(response['domain_id'],response['remote_ip_address'])

				# mark third-party responses based on final_url domain
				if response_domain != final_url_domain:
					response['is_3p'] = True
				else:
					response['is_3p'] = False

				# determine if encrypted
				if response['url'][:5] == 'https' or response['url'][:3] == 'wss':
					response['is_ssl']  = True
				else:
					response['is_ssl']  = False


			# keep track of the request ids of each response to mark as received
			response_received_req_ids.append(response['request_id'])

			# we do no more processing at this point
			if not self.config['store_responses']:
				continue

			# lower case the type, simplifies db queries
			response['type'] = response['type'].lower()

			# store the security details if they exist
			if response['security_details'] and self.config['store_security_details']:
				response['security_details_id'] = self.sql_driver.add_security_details(response['security_details'])
			else:
				response['security_details_id'] = None

			# store the size of the request
			if response['request_id'] in load_finish_data:
				response['final_data_length'] = load_finish_data[response['request_id']]
			else:
				response['final_data_length'] = None

			# parse off args/etc

			# consider anything before the "?" to be the base_url
			try:
				response['base_url'] = re.search(r'^(.+?)\?.+$', response['url']).group(1)
			except:
				response['base_url'] = response['url']

			# attempt to parse off the extension
			try:
				response['extension'] = re.search(r'\.([0-9A-Za-z]+)$', response['base_url']).group(1).lower()
			except:
				response['extension'] = None
			
			# First see if this request_id is present in response_bodies, and if
			#	the entry is not None, then we store it to the db if config says to.
			if response['request_id'] in browser_output['response_bodies']:
				if browser_output['response_bodies'][response['request_id']]:
					# make sure we're following our configuration
					is_base64 = browser_output['response_bodies'][response['request_id']]['is_base64']
					if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
						response['file_md5'] = self.store_file(
							browser_output['response_bodies'][response['request_id']]['body'],
							is_base64,
							response['type']
						)
					else:
						response['file_md5'] = None

			# link to page
			response['page_id'] = page_id

			# parse data headers, accounts for upper/lower case variations (eg 'set-cookie', 'Set-Cookie')
			response['content_type'] = None
			response['cookies_set'] = None
			
			for item in response['response_headers']:
				if item.lower() == 'content-type':
					response['content_type'] = response['response_headers'][item]
				
				if item.lower() == 'set-cookie':
					response['cookies_set']  = response['response_headers'][item]

			# if we have request_headers look for cookies sent
			response['cookies_sent']  = None
			if response['request_headers']:
				for item in response['request_headers']:
					if item.lower() == 'cookie':
						response['cookies_sent']  = response['request_headers'][item]

			# parse referer header
			response['referer'] = None
			for item in response['response_headers']:
				if item.lower() == 'referer':
					response['referer'] = response['response_headers'][item]

			# check if the page domain leaked in the response extra headers
			if response['request_id'] in internal_id_to_resp_ex_headers:
				if final_url_domain in internal_id_to_resp_ex_headers[response['request_id']]:
					response['page_domain_in_headers'] = True

			# convert from timestamp to datetime object that will go to the db
			response['timestamp'] = datetime.fromtimestamp(response['timestamp'])

			# store
			self.sql_driver.add_response(response)

			# update domains
			if response['is_3p']:
				page_3p_response_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# REQUEST EXTRA HEADERS
		if self.debug: print('going to process request extra headers data %s' % browser_output['start_url'])
		internal_id_to_req_ex_headers = {}
		for request_extra_header in browser_output['request_extra_headers']:
			request_extra_header['page_id'] 		= page_id
			request_extra_header['cookies_sent']	= None

			# to check for domain leakage in headers we make a big string keyed to the internal id
			if request_extra_header['request_id'] not in internal_id_to_req_ex_headers:
				internal_id_to_req_ex_headers[request_extra_header['request_id']] = str(request_extra_header['headers'])
			else:
				internal_id_to_req_ex_headers[request_extra_header['request_id']] += str(request_extra_header['headers'])
			
			for item in request_extra_header['headers']:
				if item.lower() == 'cookie':
					request_extra_header['cookies_sent'] = request_extra_header['headers'][item]
			
			if self.config['store_request_xtra_headers']:
				self.sql_driver.add_request_extra_header(request_extra_header)

		# PROCESS REQUESTS
		if self.config['store_requests']:
			if self.debug: print('going to process request data %s' % browser_output['start_url'])
			for request in browser_output['requests']:
				# default values that may get overwritten
				request['file_md5'] 				= None
				request['is_data']  				= False
				request['is_3p'] 					= None
				request['is_ssl']					= None
				request['page_domain_in_headers'] 	= False

				# first handle non-http urls and optionally store content
				if re.match('^(data|about|chrome|blob|javascript).+', request['url']):
					if 'base64' in request['url'].lower() or 'image' in request['url'].lower():
						is_base64 = True
					else:
						is_base64 = False
					
					# store_file follows the config as far as actually storing the file goes 
					#	and will either return the md5 or None
					# make sure we're following our configuration
					if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
						request['file_md5'] = self.store_file(request['url'],is_base64,request['type'])
					else:
						request['file_md5'] = None

					request['url']	     = None
					request['is_data']   = True
					request['domain_id'] = None
				else:
					# parse, store, and get id of domain; if fails skip
					domain_info = self.url_parser.get_parsed_domain_info(request['url'])
					if domain_info['success'] == False:
						err_msg = 'unable to parse domain info for %s with error %s' % (request['url'], domain_info['result'])
						if self.debug: print(err_msg)
						self.sql_driver.log_error({
							'client_id'		: client_id, 
							'target'		: start_url, 
							'task'			: 'output_store',
							'msg'			: err_msg
						})
						continue
					else:
						request_domain = domain_info['result']['domain']
						request['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

					# mark third-party requests based on final_url domain
					if request_domain != final_url_domain:
						request['is_3p'] = True
					else:
						request['is_3p'] = False

					# determine if encrypted
					if request['url'][:5] == 'https' or request['url'][:3] == 'wss':
						request['is_ssl']  = True
					else:
						request['is_ssl']  = False

				# replace null b/c postgres will die otherwise
				if request['post_data']:
					request['post_data'] = request['post_data'].replace('\x00','NULL_REPLACED_FOR_PSQL')

				# consider anything after the "?" to be the GET data
				try:
					get_string = re.search(r'^.+\?(.+)$', request['url']).group(1)
					get_string = get_string.replace('\x00','NULL_REPLACED_FOR_PSQL')
					get_data = {}
					for key_val in get_string.split('&'):
						get_data[key_val.split('=')[0]] = key_val.split('=')[1]
					request['get_data'] = json.dumps(get_data)
				except:
					request['get_data'] = None

				# mark if response received
				if request['request_id'] in response_received_req_ids:
					request['response_received'] = True
				else:
					request['response_received'] = None

				# mark if the loading finished
				if request['request_id'] in load_finish_data:
					request['load_finished'] = True
				else:
					request['load_finished'] = None

				# lower case the type, simplifies db queries
				if request['type']: request['type'] = request['type'].lower()

				# parse off args/etc

				# consider anything before the "?" to be the base_url
				try:
					request['base_url'] = re.search(r'^(.+?)\?.+$', request['url']).group(1)
				except:
					request['base_url'] = request['url']

				# attempt to parse off the extension
				try:
					request['extension'] = re.search(r'\.([0-9A-Za-z]+)$', request['base_url']).group(1).lower()
				except:
					request['extension'] = None

				# link to page
				request['page_id'] = page_id

				# parse referer header
				request['referer'] = None
				for item in request['headers']:
					if item.lower() == 'referer':
						request['referer'] 	 = request['headers'][item]

				# check if domain leaked in headers
				if request['request_id'] in internal_id_to_req_ex_headers:
					if final_url_domain in internal_id_to_req_ex_headers[request['request_id']]:
						request['page_domain_in_headers'] = True

				# convert from timestamp to datetime object that will go to the db
				request['timestamp'] = datetime.fromtimestamp(request['timestamp'])

				# all done
				self.sql_driver.add_request(request)

				# update domains
				if request['is_3p']:
					page_3p_request_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# PROCESS WEBSOCKETS
		if self.config['store_websockets']:
			if self.debug: print('going to process websocket data %s' % browser_output['start_url'])
			ws_id_map = {}
			for websocket in browser_output['websockets']:
				domain_info = self.url_parser.get_parsed_domain_info(websocket['url'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (websocket['url'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					websocket['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party connection
				if final_url_domain != domain_info['result']['domain']:
					websocket['is_3p'] = True
				else:
					websocket['is_3p'] = False

				websocket['page_id'] = page_id
				this_websocket_id = self.sql_driver.add_websocket(websocket)

				# update domains
				if websocket['is_3p']:
					page_3p_websocket_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

				if websocket['request_id'] not in ws_id_map:
					ws_id_map[websocket['request_id']] = this_websocket_id
				else:
					print('ERROR WS_REQ_ID ALREADY IN MAP')

		# PROCESS WEBSOCKET EVENTS
		if self.config['store_websockets'] and self.config['store_websocket_events']:
			for websocket_event in browser_output['websocket_events']:
				websocket_event['page_id'] = page_id
				if websocket_event['request_id'] in ws_id_map:
					websocket_event['websocket_id'] = ws_id_map[websocket_event['request_id']]
				else:
					websocket_event['websocket_id'] = None

				# convert from timestamp to datetime object that will go to the db
				websocket_event['timestamp'] = datetime.fromtimestamp(websocket_event['timestamp'])

				self.sql_driver.add_websocket_event(websocket_event)

		# PROCESS EVENT SOURCE MSGS
		if self.config['store_event_source_msgs']:
			if self.debug: print('going to process event source data %s' % browser_output['start_url'])
			for event_source_msg in browser_output['event_source_msgs']:
				event_source_msg['page_id'] = page_id

				# convert from timestamp to datetime object that will go to the db
				event_source_msg['timestamp'] = datetime.fromtimestamp(event_source_msg['timestamp'])

				self.sql_driver.add_event_source_msg(event_source_msg)

		# PROCESS COOKIES
		if self.config['store_cookies']:
			if self.debug: print('going to process cookies %s' % browser_output['start_url'])
			for cookie in browser_output['cookies']:
				# get the ip, fqdn, domain, pubsuffix, and tld
				# we need the domain to figure out if cookies/elements are third-party
				# note:
				#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://

				# parse domain from the security_origin, which is equivalent to a url
				domain_info = self.url_parser.get_parsed_domain_info('http://'+cookie['domain'])

				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (cookie['domain'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					cookie['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party cookie
				if final_url_domain != domain_info['result']['domain']:
					cookie['is_3p'] = True
				else:
					cookie['is_3p'] = False

				# key to page
				cookie['page_id'] = page_id

				# fix var names
				cookie['http_only'] = cookie['httpOnly']

				# attempt to convert cookie expiry from timestamp to datetime object, note we 
				#	need try/except as python datetime object cannot have year > 9999 and some 
				#	cookies do that
				cookie['expires_timestamp'] = None
				if cookie['expires']: 
					try:
						cookie['expires_timestamp'] = datetime.fromtimestamp(cookie['expires'])
					except:
						pass

				# this is optional, do fall-back
				if 'sameSite' in cookie:
					cookie['same_site'] = cookie['sameSite']
				else:
					cookie['same_site'] = None

				# see if this cookie was set via http response
				if cookie['domain'][0] == '.': 
					cookie_tuple = (cookie['domain'][1:],cookie['name'])
				else:
					cookie_tuple = (cookie['domain'],cookie['name'])
				
				if cookie_tuple in http_cookies:
					cookie['is_set_by_response'] = True
				else:
					cookie['is_set_by_response'] = False

				# all done with this cookie
				self.sql_driver.add_cookie(cookie)

				# update domains
				if cookie['is_3p']:
					page_3p_cookie_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		if self.debug: print('done storing scan %s' % browser_output['start_url'])
		return {
			'success'						: True,
			'page_id'						: page_id,
			'page_3p_request_domains'		: page_3p_request_domains,
			'page_3p_response_domains'		: page_3p_response_domains,
			'page_3p_websocket_domains'		: page_3p_websocket_domains,
			'page_3p_dom_storage_domains'	: page_3p_dom_storage_domains,
			'page_3p_cookie_domains'		: page_3p_cookie_domains
		}
	# store_scan
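	# Illustrative only: a caller could union the page_3p_*_domains sets returned
	#	above across many pages to build per-site lists of third-party
	#	(domain, domain_owner_id) pairs, while treating any {'success': False}
	#	result as a page to log and skip.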

	def store_file(self,body,is_base64,type):
		"""
		Hashes and stores file, returns file_md5.
		"""

		# in theory we shouldn't get here if it is base64, so this is a fail-safe check
		if not self.config['store_base64']:
			if is_base64 or type.lower()=='image':
				return None

		# note hash is on original data, which we modify to remove \x00 before we store
		file_md5 = hashlib.md5(body.encode()).hexdigest()

		# store to db, note query will be ignored on conflict
		#	but since we calculate the md5 as above that is fine
		self.sql_driver.add_file({
			'md5'		: file_md5,
			'body'		: body.replace('\x00','NULL_REPLACED_FOR_PSQL'),
			'type'		: type.lower(),
			'is_base64'	: is_base64
		})

		return file_md5
	# store_file
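	# Illustrative: store_file(body, False, 'page_source') hashes the original
	#	body (hashlib.md5(body.encode()).hexdigest()), stores the null-stripped
	#	copy keyed to that md5, and hands the hash back so page rows can reference
	#	the file without duplicating its content.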

	def store_policy(self, browser_output, client_id, client_ip=None):
		"""
		We attempt to figure out if the text provided is a policy; if so,
			we store it to the database.
		"""

		# keep values in a dict here
		policy = {}

		# the get_policy attempt was a success, so we extract data from the
		#	dict; since postgres cannot handle '\x00' we convert several
		#	fields to strings and use .replace('\x00',' ') to clean the input
		policy['client_id']			= client_id
		policy['client_ip']			= client_ip
		policy['browser_type']		= browser_output['browser_type']
		policy['browser_version']	= browser_output['browser_version']
		policy['browser_prewait']	= browser_output['prewait']
		policy['start_url']			= browser_output['start_url']
		policy['final_url']			= browser_output['final_url']
		policy['title']				= browser_output['title']
		policy['meta_desc']			= browser_output['meta_desc']
		policy['lang']				= browser_output['lang']
		policy['fk_score']			= None
		policy['fre_score']			= None
		policy['word_count']		= None
		policy['type']				= None
		policy['match_term']		= None
		policy['match_text']		= None
		policy['match_text_type']	= None
		policy['confidence']		= None
		policy['page_text_id']		= None
		policy['page_source_md5']	= None

		# if readability failed we bail
		if not browser_output['readability_html'] or not browser_output['page_text']:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'No readability result'
			}

		# ignore any malformed unicode characters
		readability_html 	= browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip()
		page_text 			= browser_output['page_text'].encode('utf-8', 'ignore').decode().strip()
		page_source 		= browser_output['page_source'].encode('utf-8', 'ignore').decode()

		# bail on empty text
		if len(page_text) == 0:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'Empty page text'
			}

		# load the source into lxml so we can do additional processing, 
		#	if we fail we bail
		try:
			lxml_doc = lxml.html.fromstring(readability_html)
		except Exception:
			# match the other early exits: close the db connection before bailing
			self.sql_driver.close()
			return {
				'success': False,
				'result': 'Could not parse readability_html with lxml'
			}
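		# illustrative note (markup assumed): lxml.html.fromstring accepts a fragment
		#	or a full document and returns an element we can query with cssselect, e.g.
		#	>>> doc = lxml.html.fromstring('<h1>Privacy Policy</h1><p>text</p>')
		#	>>> [h.text_content() for h in doc.cssselect('h1')]
		#	['Privacy Policy']
		#	which is how the heading checks further below extract candidate text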

		# if the text is less than 500 words we ignore it
		if len(page_text.split(' ')) < 500:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'Page text < 500 words'
			}

		# once we have the text we figure out if it is a policy;
		#	start with False and override on a match
		is_policy = False

		# first look for matches on page title
		# 	we give this confidence of 100 as it is
		#	definitely a match
		if policy['title']:
			policy_type_result = self.determine_policy_type_from_text(policy['title'])
			if policy_type_result['success'] == True:
				is_policy 		= True
				policy['type']				= policy_type_result['result']['policy_type']
				policy['match_term']		= policy_type_result['result']['match_term']
				policy['match_text']		= policy_type_result['result']['match_text']
				policy['match_text_type']	= 'title'
				policy['confidence']		= 100

		# deep checks may generate false positives, so they get a confidence
		#	of 0 until they can be verified, which may happen here or in
		#	later processing
		deep_checks = True
		if deep_checks:
			policy['confidence'] = 0
			# convert the url path to a sentence by replacing
			#	common delimiters with spaces and attempt matches	
			if self.debug: print('going to do checks on url path')
			if not is_policy:
				url_path_string = re.sub(r'[-_/.]', ' ', urlsplit(policy['start_url']).path)
				if len(url_path_string) > 0:
					policy_type_result = self.determine_policy_type_from_text(url_path_string)
					if policy_type_result['success'] == True:
						is_policy 					= True
						policy['type']				= policy_type_result['result']['policy_type']
						policy['match_term']		= policy_type_result['result']['match_term']
						policy['match_text']		= policy_type_result['result']['match_text']
						policy['match_text_type']	= 'url_path'
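			# illustrative example (path assumed): the substitution above turns a url
			#	path into space-separated words so the term matcher can scan it, e.g.
			#	>>> re.sub(r'[-_/.]', ' ', '/legal/privacy-policy.html')
			#	' legal privacy policy html'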

			if self.debug: print('going to do checks on meta desc')
			if not is_policy and policy['meta_desc']:
				policy_type_result = self.determine_policy_type_from_text(policy['meta_desc'])
				if policy_type_result['success'] == True:
					is_policy 					= True
					policy['type']				= policy_type_result['result']['policy_type']
					policy['match_term']		= policy_type_result['result']['match_term']
					policy['match_text']		= policy_type_result['result']['match_text']
					policy['match_text_type']	= 'meta_desc'

			# iterate over all types of heading tags to extract text 
			#	and check for policy matches.  note we go in order of
			#	importance (eg h1->h7->span,etc)
			if self.debug: print('going to do checks on heading tags')
			if not is_policy:
				for tag_type in ['h1','h2','h3','h4','h5','h6','h7','span','strong','em']:
					if is_policy: break
					tags = lxml_doc.cssselect(tag_type)
					if len(tags) > 0:
						for tag in tags:
							tag_text = tag.text_content()
							# if it is > 15 words it is likely not a heading, skip it
							if len(tag_text.split(' ')) > 15: continue
							policy_type_result = self.determine_policy_type_from_text(tag_text)
							if policy_type_result['success'] == True:
								is_policy 					= True
								policy['type']				= policy_type_result['result']['policy_type']
								policy['match_term']		= policy_type_result['result']['match_term']
								policy['match_text']		= policy_type_result['result']['match_text']
								policy['match_text_type']	= tag_type

		# if it is a policy we do additional processing
		#	before storing in db, otherwise we fail
		#	gracefully
		if is_policy:
			if self.debug: print('going to store readability_html')
			readability_source_md5 = self.store_file(readability_html, False, 'readability_html')

			# store_page_text handles some additional operations of its own
			if self.debug: print('going to store page_text')
			policy['page_text_id'] = self.store_page_text(readability_html, readability_source_md5)

			if self.debug: print(f"page_text_id is {policy['page_text_id']}")

			if self.debug: print('going to store page_source')
			policy['page_source_md5'] 	= self.store_file(page_source, False, 'page_source')

			if self.debug: print('going to do reading ease scores')
			# get readability scores; scores at or below zero are
			#	invalid so we null them
			policy['fre_score'] = textstat.flesch_reading_ease(page_text)
			if policy['fre_score'] <= 0:
				policy['fre_score'] = None

			policy['fk_score']  = textstat.flesch_kincaid_grade(page_text)
			if policy['fk_score'] <= 0:
				policy['fk_score'] = None
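			# illustrative note (scores depend on the text, values not asserted):
			#	both metrics come from textstat on the same plain-text input, e.g.
			#	>>> textstat.flesch_reading_ease(page_text)   # higher = easier reading
			#	>>> textstat.flesch_kincaid_grade(page_text)  # approximate US grade level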

			if self.debug: print('going to store policy')
			# add to db and get id for this policy
			policy_id  = self.sql_driver.add_policy(policy)

			if self.debug: print('going to link policy to pages')
			# attach the policy to all pages that link to this url; note we filter
			#	to internal links only
			for page_id, crawl_id in self.sql_driver.get_page_ids_from_link_url(policy['start_url'],internal_links_only=True):
				self.sql_driver.attach_policy_to_page(policy_id,page_id)
				self.sql_driver.attach_policy_to_crawl(policy_id,crawl_id)

			if self.debug: 
				print(f'\t👍 Success: {policy["start_url"]}')
			self.sql_driver.close()
			return {'success': True}
		else:
			if self.debug: 
				print(f'\t👎 Fail: {policy["start_url"]}')
			self.sql_driver.close()
			return {
				'success': False,
				'result': 'Not policy'
			}
	# store_policy

	def determine_policy_type_from_text(self, text):
		"""
		Determine if a given text fragment indicates
			a given type of policy.

		Returns dict.

		"""

		# collapse runs of whitespace into single spaces
		text = re.sub(r'\s+', ' ', text)

		# retrieve values from policy_terms.json
		policy_verification_terms = self.utilities.get_policy_verification_terms()

		# get the policy type keys so we can shuffle them below
		policy_type_keys = list(policy_verification_terms)

		# randomize the order we do our checks
		random.shuffle(policy_type_keys)

		# look for matches against verification terms
		for policy_type in policy_type_keys:
			for term in policy_verification_terms[policy_type]:
				if term in text.lower():
					return({
						'success': True,
						'result' :{
							'policy_type':	policy_type,
							'match_term':	term,
							'match_text':	text
						}
					})

		# no match
		return ({'success': False})
	# determine_policy_type_from_text
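	# illustrative usage sketch (the actual terms live in policy_terms.json, so the
	#	key and term shown here are assumed): a matching fragment might return
	#	>>> self.determine_policy_type_from_text('Our Privacy Policy')
	#	{'success': True, 'result': {'policy_type': 'privacy_policy',
	#	 'match_term': 'privacy policy', 'match_text': 'Our Privacy Policy'}}
	#	while unrelated text returns {'success': False}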

	def store_page_text(self,readability_html,readability_source_md5):
		# the actual 'page_text' output from readability doesn't properly separate words
		#	that are divided only by markup, eg '<h3>this</h3><p>that</p>' becomes 'thisthat'
		#	whereas 'this that' is what a user would see in the browser
		# to overcome the above issue we manually strip out the html and do some
		#	cleaning of our own
		page_text = re.sub(r'<!--.+-->', ' ', readability_html)
		page_text = re.sub(r'<svg.+</svg>', ' ', page_text)
		page_text = re.sub(r'<.+?>', ' ', page_text)
		page_text = re.sub(r'[\n\r]', ' ', page_text)
		page_text = re.sub(r'\s+', ' ', page_text)
		page_text = unicodedata.normalize('NFKD',html.unescape(page_text.strip()))
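		# illustrative example (markup assumed): the chain of substitutions above turns
		#	'<h3>this</h3><p>that &amp; more</p>' into 'this that & more'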

		# postgres can't handle nulls
		page_text = page_text.replace('\x00','NULL_REPLACED_FOR_PSQL')

		# return the id
		return self.sql_driver.add_page_text({
			'text'						: page_text,
			'word_count'				: len(page_text.split()),
			'readability_source_md5' 	: readability_source_md5
		})