Beispiel #1
0
	def __init__(self, db_name, db_engine, flush_domain_owners):
		"""
		Open the global db connection for the chosen engine, cache the
			page/crawl counts, and initialize domain owner data.
		"""

		# import only the driver for the engine we're actually using
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			driver_class = SQLiteDriver
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			driver_class = PostgreSQLDriver
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		# global db connection shared by all methods
		self.sql_driver = driver_class(db_name)

		# cache these once up front: they are needed frequently and
		#	re-querying the db every time would be wasteful
		self.total_pages 	= self.sql_driver.get_complex_page_count()
		self.total_crawls 	= self.sql_driver.get_crawl_count()

		# utilities needs the database details as well
		self.utilities = Utilities(db_name,db_engine)

		# mapping of domains to their owners
		self.domain_owners = self.utilities.get_domain_owner_dict()

		# refresh the owner data if the caller asked for it
		if flush_domain_owners:
			self.patch_domain_owners()

		# pull 3p domain info into memory for faster processing —
		#	make sure you have enough RAM!
		self.get_crawl_id_to_3p_domain_info()
Beispiel #2
0
    def __init__(self, ua=False, dnt=False):
        """
        Set the global browser options.

        Args:
            ua:  optional user-agent string, see get_ua_for_headless
                for details
            dnt: whether to send the Do-Not-Track header
        """

        self.dnt = dnt

        # set here if you want to use headless mode for Chrome
        self.headless = True

        # if you want to get potentially dangerous requests, set
        #	this to true.
        # false by default for hopefully obvious reasons
        self.allow_insecure = False

        # if you have trouble getting chrome to start
        #	change these values manually
        self.chromedriver_path = None
        self.chrome_binary_path = None

        # we want to give our browsers a full minute to try to
        #	download content, but gracefully timeout thereafter
        self.page_timeout_seconds = 60

        # Set ua if we have it, see get_ua_for_headless
        #	for details
        self.ua = ua

        # useful for various tasks
        self.utilities = Utilities()

        # note: no explicit 'return None' — __init__ returns None
        #	implicitly and an explicit return is unidiomatic
def rate_estimate(db_name, client_id):
    """
    Print a running estimate of scan progress for db_name,
    optionally scoped to a single client_id.
    """
    print('Showing scan rate for database %s' % db_name)

    # a falsy client_id means we report the aggregate rate, which
    #	stream_rate keys under None
    if client_id:
        print('\tclient_id is %s' % client_id)
    else:
        client_id = None

    print()
    print()

    print(
        'elapsed_minutes\tcurrent_rate\taverage_rate\tremaining_tasks\tremaining_hours'
    )
    print(
        '---------------\t------------\t------------\t---------------\t---------------'
    )

    # NOTE(review): db_engine is not defined in this function —
    #	presumably a module-level global, confirm before refactoring
    utilities = Utilities(db_name=db_name, db_engine=db_engine)

    # stream_rate yields a dict of stats keyed by client_id
    for result in utilities.stream_rate():
        stats = result[client_id]
        print('%s\t\t%s\t\t%s\t\t%s\t\t%s' % (
            stats['elapsed_minutes'],
            stats['current_rate'],
            stats['average_rate'],
            stats['remaining_tasks'],
            stats['remaining_hours']
        ))
Beispiel #4
0
    def __init__(self, db_name=None, db_engine=None, client_id=None):
        """
        When running store_results_from_queue we fetch results via the
        server_config database and no global db_name is required, hence
        the None defaults.  If a db_name *is* supplied we load the
        global config for that database here.
        """
        self.db_name = db_name
        self.db_engine = db_engine
        self.client_id = client_id
        self.debug = True
        self.utilities = Utilities()

        # without a db_name there is no global config to load
        if not db_name:
            return

        # import and connect only the driver we need
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        self.config = sql_driver.get_config()

        # keep only the client-relevant settings for the browser
        self.browser_config = {
            item: value
            for item, value in self.config.items() if 'client' in item
        }

        sql_driver.close()
Beispiel #5
0
	def __init__(self, db_name, db_engine):
		"""
		Connect to the named database and load its stored config.
		"""
		self.db_name	= db_name
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
		self.debug		= False

		# only import the driver we actually need
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			driver_class = SQLiteDriver
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			driver_class = PostgreSQLDriver
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		self.sql_driver = driver_class(self.db_name)
		self.config 	= self.sql_driver.get_config()
Beispiel #6
0
	def __init__(self,ua=False):
		"""
		Set the global browser options.

		Args:
			ua: optional user-agent string, see get_ua_for_headless
				for details
		"""

		# set here if you want to use headless mode for Chrome
		self.headless = True

		# if you want to get potentially dangerous requests, set 
		#	this to true.
		# false by default for hopefully obvious reasons
		self.allow_insecure = False

		# if you have trouble getting chrome to start
		#	change these values manually
		self.chromedriver_path  = None
		self.chrome_binary_path = None

		# we want to give our browsers a full minute to try to
		#	download content, but gracefully timeout thereafter
		self.page_timeout_seconds = 60

		# Set ua if we have it, see get_ua_for_headless
		#	for details
		self.ua = ua

		# useful for various tasks
		self.utilities = Utilities()

		# note: no explicit 'return None' — __init__ returns None
		#	implicitly and an explicit return is unidiomatic
Beispiel #7
0
class OutputStore:
	"""	
	This class receives data from the browser, processes it, and stores it in the db
	"""

	def __init__(self, db_name, db_engine):
		"""
		Set up helpers, connect to the named database, and load
			its stored config.
		"""
		self.db_name	= db_name
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
		self.debug		= False

		# lazily import whichever driver this engine requires
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			selected_driver = SQLiteDriver
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			selected_driver = PostgreSQLDriver
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		self.sql_driver = selected_driver(self.db_name)
		self.config 	= self.sql_driver.get_config()
	# __init__

	def close(self):
		"""
		Explicitly close out the underlying db connection.
		"""
		self.sql_driver.close()
	# close

	def store_scan(self, params):
		"""
		This function pre-processes data from the browser, inserts it into 
			database, and handles linking various entries across tables.
		"""

		# unpack params
		browser_output 	= params['browser_output']
		client_id 		= params['client_id']
		crawl_id 		= params['crawl_id']
		crawl_timestamp = params['crawl_timestamp']
		crawl_sequence	= params['crawl_sequence']

		# client_ip is optional
		if 'client_ip' in params:
			client_ip = params['client_ip']
		else:
			client_ip = None

		if self.debug: print('going to store scan %s' % browser_output['start_url'])

		# keep track of domains
		page_3p_cookie_domains 		= set()
		page_3p_dom_storage_domains = set()
		page_3p_request_domains 	= set()
		page_3p_response_domains 	= set()
		page_3p_websocket_domains 	= set()

		# convert from timestamp to datetime object that will go to the db
		accessed = datetime.fromtimestamp(browser_output['accessed'])

		# first make sure we don't have it already
		if self.sql_driver.page_exists(browser_output['start_url'],accessed): 
			return {'success': False, 'result': 'exists in db already'}

		# if we have no responses the page didn't load at all and we skip
		#	 unless we are using basic driver and then it's ok
		if len(browser_output['responses']) == 0 and browser_output['browser_type'] != 'basic':
			return {'success': False, 'result': 'no responses received'}

		# ignore any malformed unicode characters
		page_source = browser_output['page_source'].encode('utf-8', 'ignore').decode()

		# store source
		if self.config['store_source']:
			if self.debug: print('going to store source %s' % browser_output['start_url'])
			page_source_md5 = self.store_file(page_source, False, 'page_source')
		else:
			page_source_md5 = None

		# store readability_html
		if self.config['store_page_text'] and browser_output['page_text']:
			if self.debug: print('going to store readability_html')
			# ignore any malformed unicode characters
			readability_html 		= browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip()
			readability_source_md5 	= self.store_file(readability_html, False, 'readability_html')

			# store_page_text handles some addition operations
			if self.debug: print('going to store page_text')
			page_text_id = self.store_page_text(readability_html,readability_source_md5)
		else:
			page_text_id 			= None

		# process info on the start_url domain
		if self.debug: print('going to parse start/final_url %s' % browser_output['start_url'])
		start_url = browser_output['start_url']
		start_url_domain_info = self.url_parser.get_parsed_domain_info(start_url)
		if start_url_domain_info['success'] == False:
			err_msg = 'unable to parse start_url_domain_info info for %s with error %s' % (browser_output['start_url'], start_url_domain_info['result'])
			if self.debug: print(err_msg)
			self.sql_driver.log_error({
				'client_id'		: client_id, 
				'target'		: start_url, 
				'task'			: 'output_store',
				'msg'			: err_msg
			})
			return {'success': False, 'result': 'could not parse start_url'}
		else:
			# needed for comparisons later on
			start_url_domain = start_url_domain_info['result']['domain']

			# add start_url domain and get id
			start_url_domain_id = self.sql_driver.add_domain(start_url_domain_info['result'])

		# process info on the final_url domain
		# note: we use the final_url domain as the benchmark for determine 1p/3p
		final_url = browser_output['final_url']
		final_url_domain_info = self.url_parser.get_parsed_domain_info(final_url)
		if final_url_domain_info['success'] == False:
			err_msg = 'unable to parse final_url_domain_info info for %s with error %s' % (browser_output['final_url'], final_url_domain_info['result'])
			if self.debug: print(err_msg)
			self.sql_driver.log_error({
				'client_id'		: client_id, 
				'target'		: start_url, 
				'task'			: 'output_store',
				'msg'			: err_msg
			})
			return {'success': False, 'result': 'could not parse final_url'}
		else:
			final_url_domain = final_url_domain_info['result']['domain']
			# self.sql_driver.add_domain both stores the new domain and returns its db row id
			# if it is already in db just return the existing id
			final_url_domain_id = self.sql_driver.add_domain(final_url_domain_info['result'])

		# check if the page has redirected to a new domain
		if start_url_domain != final_url_domain:
			page_domain_redirect = True
		else:
			page_domain_redirect = False

		# this is semi-redundant but ensures that any config changes made while
		#	a result is queued are followed
		if self.config['client_reject_redirects'] and page_domain_redirect:
			return {'success': False, 'result': 'rejecting redirect'}

		# if the final page is https (often after a redirect), mark it appropriately
		if browser_output['final_url'][:5] == 'https':
			page_is_ssl = True
		else:
			page_is_ssl = False

		# (optionally) process and store links, this allows us to go back later and do deeper scans
		#	as well as do more with policies
		
		# links starts as empty list
		links = []

		# keep track of link counts as helpful for filtering pages
		link_count_internal = 0
		link_count_external = 0

		if self.config['store_links']:

			if self.debug: print('going to process links %s' % browser_output['start_url'])

			# we use the list of policy_link_terms to flag that a link *might*
			# 	be for a policy, we check if it actually is policy in PolicyCollector.py
			policy_link_terms = self.utilities.get_policy_link_terms()

			# process links, duplicates get ignored by db
			for link in browser_output['all_links']:
				# skip if href not valid
				if not self.utilities.is_url_valid(link['href']): continue

				# unpack values and catch any unicode errors
				link_text = link['text'].encode('utf-8', 'ignore').decode()
				link_url  = link['href'].encode('utf-8', 'ignore').decode()

				# get rid of trailing # and /
				if link_url.strip()[-1:] == '#': link_url = link_url.strip()[:-1]
				if link_url.strip()[-1:] == '/': link_url = link_url.strip()[:-1]

				# sometimes the text will be a dict (very rarely)
				# 	so we convert to string
				link_text = str(link_text).strip()

				# clean up white space and remove line breaks
				link_text = re.sub('\n|\r|\t|\s+',' ',link_text.strip())
				link_url  = re.sub('\n|\r|\t|\s+',' ',link_url.strip())

				# catch nulls
				link_text = link_text.replace('\x00','NULL_REPLACED_FOR_PSQL')
				link_url  = link_url.replace('\x00','NULL_REPLACED_FOR_PSQL')

				# update counts
				if link['internal']:
					link_count_internal += 1
				else:
					link_count_external += 1

				# flag links that could be policies, default False
				link_is_policy = False

				# determine if a policy term appears in the link
				for policy_term in policy_link_terms:
					if policy_term in link_text.lower():
						link_is_policy = True
						break

				link_domain_info = self.url_parser.get_parsed_domain_info(link_url)
				if link_domain_info['success'] == False:
					# don't bother with storing errors
					link_domain_id = None
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# 	if it is already in db just return the existing id
					link_domain_id = self.sql_driver.add_domain(link_domain_info['result'])

				links.append({
					'url'			: link_url, 
					'text'			: link_text, 
					'is_internal'	: link['internal'], 
					'is_policy'		: link_is_policy, 
					'domain_id'		: link_domain_id
				})

		# if we got the screen shot we get the hash and store it to the file table
		screen_shot_md5 = None
		if browser_output['screen_shot'] and self.config['store_screen_shot']:
			if self.debug: print('going to store screen shot %s' % browser_output['start_url'])
			# store file to get md5
			screen_shot_md5 = self.store_file(browser_output['screen_shot'],True,'screen_shot')

		# if we have timestamp it is also an 'accessed' field from
		#	a page load so we convert that as well
		if crawl_timestamp:
			crawl_timestamp = datetime.fromtimestamp(crawl_timestamp)

		# ignore any malformed unicode characters
		if browser_output['title']:
			browser_output['title'] = browser_output['title'].encode('utf-8', 'ignore').decode()

		if browser_output['meta_desc']:
			browser_output['meta_desc'] = browser_output['meta_desc'].encode('utf-8', 'ignore').decode()

		if browser_output['lang']:
			browser_output['lang'] = browser_output['lang'].encode('utf-8', 'ignore').decode()

		# now we know link counts we can store the page
		if self.debug: print('going to store page %s' % browser_output['start_url'])
		page_id = self.sql_driver.add_page({
			'accessed'				: accessed,
			'browser_type'			: browser_output['browser_type'],
			'browser_version'		: browser_output['browser_version'],
			'browser_prewait'		: browser_output['prewait'],
			'browser_no_event_wait'	: browser_output['no_event_wait'],
			'browser_max_wait'		: browser_output['max_wait'],
			'page_load_strategy'	: browser_output['page_load_strategy'],
			'title'					: browser_output['title'],
			'meta_desc'				: browser_output['meta_desc'],
			'lang'					: browser_output['lang'],
			'start_url'				: browser_output['start_url'],
			'final_url'				: browser_output['final_url'],
			'is_ssl'				: page_is_ssl,
			'page_domain_redirect'	: page_domain_redirect,
			'link_count_internal'	: link_count_internal,
			'link_count_external'	: link_count_external,
			'load_time'				: browser_output['load_time'],
			'start_url_domain_id'	: start_url_domain_id,
			'final_url_domain_id'	: final_url_domain_id,
			'client_id'				: client_id,
			'client_timezone'		: browser_output['client_timezone'],
			'client_ip'				: client_ip,
			'page_text_id'			: page_text_id,
			'screen_shot_md5'		: screen_shot_md5,
			'page_source_md5'		: page_source_md5,
			'crawl_id'				: crawl_id,
			'crawl_timestamp'		: crawl_timestamp,
			'crawl_sequence'		: crawl_sequence
		})

		# STORE LINKS
		if self.config['store_links']:
			if self.debug: print('going to store links %s' % browser_output['start_url'])
			for link in links:
				link_id = self.sql_driver.add_link(link)
				if link_id: self.sql_driver.join_link_to_page(page_id,link_id)

		# PROCESS DOM_STORAGE
		if self.config['store_dom_storage']:
			if self.debug: print('going to process dom storage %s' % browser_output['start_url'])
			for dom_storage in browser_output['dom_storage']:
				# parse domain from the security_origin, which is equivalent to a url
				domain_info = self.url_parser.get_parsed_domain_info(dom_storage['security_origin'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (dom_storage['security_origin'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					dom_storage['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party storage
				if final_url_domain != domain_info['result']['domain']:
					dom_storage['is_3p'] = True
				else:
					dom_storage['is_3p'] = False

				# key to page
				dom_storage['page_id'] = page_id

				# replace null b/c postgres will die otherwise
				dom_storage['key']		= dom_storage['key'].replace('\x00','NULL_REPLACED_FOR_PSQL')
				dom_storage['value']	= dom_storage['value'].replace('\x00','NULL_REPLACED_FOR_PSQL')

				# there types of illegal utf-8 characters that psql doesn't like, eg trying to store
				#	'\uded5' gives this error when storing in psql: 
				#	'UnicodeEncodeError: 'utf-8' codec can't encode character '\uded5' in position 0: surrogates not allowed'
				#
				# to overcome the above, we use python's backslashreplace to keep the original data in 
				#	a way that won't cause our queries to die
				# see https://docs.python.org/3/library/codecs.html#error-handlers
				dom_storage['key']		= dom_storage['key'].encode('utf-8','backslashreplace')
				dom_storage['value']	= dom_storage['value'].encode('utf-8','backslashreplace')

				# now that we've encoded with backslashes we decode to get the semi-original data
				dom_storage['key']		= dom_storage['key'].decode('utf-8')
				dom_storage['value']	= dom_storage['value'].decode('utf-8')

				# all done with this item
				self.sql_driver.add_dom_storage(dom_storage)

				# update domains
				if dom_storage['is_3p']:
					page_3p_dom_storage_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# PROCESS LOAD FINISH
		if self.debug: print('going to process load finish data %s' % browser_output['start_url'])
		load_finish_data = {}
		for load_finish_event in browser_output['load_finish_events']:
			load_finish_data[load_finish_event['request_id']] = load_finish_event['encoded_data_length']

		# RESPONSE EXTRA HEADERS
		if self.debug: print('going to process response extra header data %s' % browser_output['start_url'])
		http_cookies = []
		internal_id_to_resp_ex_headers = {}
		for response_extra_header in browser_output['response_extra_headers']:
			response_extra_header['page_id'] 		= page_id
			response_extra_header['cookies_set']	= None
			
			# to check for domain leakage in headers we make a big string keyed to the internal id
			if response_extra_header['request_id'] not in internal_id_to_resp_ex_headers:
				internal_id_to_resp_ex_headers[response_extra_header['request_id']] = str(response_extra_header['headers'])
			else:
				internal_id_to_resp_ex_headers[response_extra_header['request_id']] += str(response_extra_header['headers'])

			for item in response_extra_header['headers']:
				if item.lower() == 'set-cookie':
					response_extra_header['cookies_set'] = response_extra_header['headers'][item]

					# when we add cookies later on we mark those that came from response headers,
					#	note we try/pass on this in case we can't parse
					for cookie in response_extra_header['cookies_set'].split('\n'):
						if 'domain' in cookie.lower():
							try:
								name = re.match('^(.+?)=',cookie)[0][:-1]
								domain = re.match('^.+domain=(.+?)(;|$)',cookie.lower())[1]
								if domain[0] == '.': domain = domain[1:]
								http_cookies.append((domain,name))
							except:
								pass

			if self.config['store_response_xtra_headers']:
				self.sql_driver.add_response_extra_header(response_extra_header)

		# PROCESS RESPONSES
		response_received_req_ids = []
		
		if self.debug: print('going to process response data %s' % browser_output['start_url'])
		
		for response in browser_output['responses']:
			
			# defaut values that may get over-written
			response['file_md5'] 				= None
			response['is_data']  				= False
			response['is_3p'] 					= None
			response['is_ssl']					= None
			response['page_domain_in_headers'] 	= False

			# first handle non-http urls and optionally store content
			if re.match('^(data|about|chrome|blob|javascript).+', response['url']):
				if 'base64' in response['url'].lower() or 'image' in response['type'].lower():
					is_base64 = True
				else:
					is_base64 = False
					
				# store_file follows the config as far as actually storing the file goes 
				#	and will either return the md5 or None
				# make sure we're following our configuration
				if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
					response['file_md5'] = self.store_file(response['url'],is_base64,response['type'])
				else:
					response['file_md5'] = None

				response['url']	      = None
				response['is_data']   = True
				response['domain_id'] = None
			else:
				# parse, store, and get id of domain; if fails skip
				domain_info = self.url_parser.get_parsed_domain_info(response['url'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (response['url'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					response_domain = domain_info['result']['domain']
					response['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# now add ip
				if response['remote_ip_address']:
					self.sql_driver.add_domain_ip_addr(response['domain_id'],response['remote_ip_address'])

				# mark third-party responses based on final_url domain
				if response_domain != final_url_domain:
					response['is_3p'] = True
				else:
					response['is_3p'] = False

				# determine if encrypted
				if response['url'][:5] == 'https' or response['url'][:3] == 'wss':
					response['is_ssl']  = True
				else:
					response['is_ssl']  = False


			# keep track of the request ids of each reponse to mark as received
			response_received_req_ids.append(response['request_id'])

			# we do no more processing at this point
			if not self.config['store_responses']:
				continue

			# lower case the type, simplifies db queries
			response['type'] = response['type'].lower()

			# store the security details if they exist
			if response['security_details'] and self.config['store_security_details']:
				response['security_details_id'] = self.sql_driver.add_security_details(response['security_details'])
			else:
				response['security_details_id'] = None

			# store the size of the request
			if response['request_id'] in load_finish_data:
				response['final_data_length'] = load_finish_data[response['request_id']]
			else:
				response['final_data_length'] = None

			# parse off args/etc

			# consider anything before the "?" to be the element_url
			try:
				response['base_url'] = re.search('^(.+?)\?.+$', response['url']).group(1)
			except:
				response['base_url'] = response['url']

			# attempt to parse off the extension
			try:
				response['extension'] = re.search('\.([0-9A-Za-z]+)$', response['base_url']).group(1).lower()
			except:
				response['extension'] = None
			
			# First see if this request_id is present in response_bodies, and if
			#	the entry is not None, then we store it to the db if config says to.
			if response['request_id'] in browser_output['response_bodies']:
				if browser_output['response_bodies'][response['request_id']]:
					# make sure we're following our configuration
					is_base64 = browser_output['response_bodies'][response['request_id']]['is_base64']
					if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
						response['file_md5'] = self.store_file(
							browser_output['response_bodies'][response['request_id']]['body'],
							is_base64,
							response['type']
						)
					else:
						response['file_md5'] = None

			# link to page
			response['page_id'] = page_id

			# parse data headers, accounts for upper/lower case variations (eg 'set-cookie', 'Set-Cookie')
			response['content_type'] = None
			response['cookies_set'] = None
			
			for item in response['response_headers']:
				if item.lower() == 'content-type':
					response['content_type'] = response['response_headers'][item]
				
				if item.lower() == 'set-cookie':
					response['cookies_set']  = response['response_headers'][item]

			# if we have request_headers look for cookies sent
			response['cookies_sent']  = None
			if response['request_headers']:
				for item in response['request_headers']:
					if item.lower() == 'cookie':
						response['cookies_sent']  = response['request_headers'][item]

			# parse referer header
			response['referer'] = None
			for item in response['response_headers']:
				if item.lower() == 'referer':
					response['referer'] = response['response_headers'][item]

			# check if domain leaked in referer
			if response['request_id'] in internal_id_to_resp_ex_headers:
				if final_url_domain in internal_id_to_resp_ex_headers[response['request_id']]:
					response['page_domain_in_headers'] = True

			# convert from timestamp to datetime object that will go to the db
			response['timestamp'] = datetime.fromtimestamp(response['timestamp'])

			# store
			self.sql_driver.add_response(response)

			# update domains
			if response['is_3p']:
				page_3p_response_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# REQUEST EXTRA HEADERS
		if self.debug: print('going to process request extra headers data %s' % browser_output['start_url'])
		internal_id_to_req_ex_headers = {}
		for request_extra_header in browser_output['request_extra_headers']:
			request_extra_header['page_id'] 		= page_id
			request_extra_header['cookies_sent']	= None

			# to check for domain leakage in headers we make a big string keyed to the internal id
			if request_extra_header['request_id'] not in internal_id_to_req_ex_headers:
				internal_id_to_req_ex_headers[request_extra_header['request_id']] = str(request_extra_header['headers'])
			else:
				internal_id_to_req_ex_headers[request_extra_header['request_id']] += str(request_extra_header['headers'])
			
			for item in request_extra_header['headers']:
				if item.lower() == 'cookie':
					request_extra_header['cookies_sent'] = request_extra_header['headers'][item]
			
			if self.config['store_request_xtra_headers']:
				self.sql_driver.add_request_extra_header(request_extra_header)

		# PROCESS REQUESTS
		if self.config['store_requests']:
			if self.debug: print('going to process request data %s' % browser_output['start_url'])
			for request in browser_output['requests']:
				# defaut values that may get over-written
				request['file_md5'] 				= None
				request['is_data']  				= False
				request['is_3p'] 					= None
				request['is_ssl']					= None
				request['page_domain_in_headers'] 	= False

				# first handle non-http urls and optionally store content
				if re.match('^(data|about|chrome|blob|javascript).+', request['url']):
					if 'base64' in request['url'].lower() or 'image' in request['url'].lower():
						is_base64 = True
					else:
						is_base64 = False
					
					# store_file follows the config as far as actually storing the file goes 
					#	and will either return the md5 or None
					# make sure we're following our configuration
					if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False):
						request['file_md5'] = self.store_file(request['url'],is_base64,request['type'])
					else:
						request['file_md5'] = None

					request['url']	     = None
					request['is_data']   = True
					request['domain_id'] = None
				else:
					# parse, store, and get id of domain; if fails skip
					domain_info = self.url_parser.get_parsed_domain_info(request['url'])
					if domain_info['success'] == False:
						err_msg = 'unable to parse domain info for %s with error %s' % (request['url'], domain_info['result'])
						if self.debug: print(err_msg)
						self.sql_driver.log_error({
							'client_id'		: client_id, 
							'target'		: start_url, 
							'task'			: 'output_store',
							'msg'			: err_msg
						})
						continue
					else:
						request_domain = domain_info['result']['domain']
						request['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

					# mark third-party requests based on final_url domain
					if request_domain != final_url_domain:
						request['is_3p'] = True
					else:
						request['is_3p'] = False

					# determine if encrypted
					if request['url'][:5] == 'https' or request['url'][:3] == 'wss':
						request['is_ssl']  = True
					else:
						request['is_ssl']  = False

				# replace null b/c postgres will die otherwise
				if request['post_data']:
					request['post_data'] = request['post_data'].replace('\x00','NULL_REPLACED_FOR_PSQL')

				# consider anything after the "?" to be the GET data
				try:
					get_string = re.search('^.+\?(.+)$', request['url']).group(1)
					get_string = get_string.replace('\x00','NULL_REPLACED_FOR_PSQL')
					get_data = {}
					for key_val in get_string.split('&'):
						get_data[key_val.split('=')[0]] = key_val.split('=')[1]
					request['get_data'] = json.dumps(get_data)
				except:
					request['get_data'] = None

				# mark if response received
				if request['request_id'] in response_received_req_ids:
					request['response_received'] = True
				else:
					request['response_received'] = None

				# mark if the loading finished
				if request['request_id'] in load_finish_data:
					request['load_finished'] = True
				else:
					request['load_finished'] = None

				# lower case the type, simplifies db queries
				if request['type']: request['type'] = request['type'].lower()

				# parse off args/etc

				# consider anything before the "?" to be the element_url
				try:
					request['base_url'] = re.search('^(.+?)\?.+$', request['url']).group(1)
				except:
					request['base_url'] = request['url']

				# attempt to parse off the extension
				try:
					request['extension'] = re.search('\.([0-9A-Za-z]+)$', request['base_url']).group(1).lower()
				except:
					request['extension'] = None

				# link to page
				request['page_id'] = page_id

				# parse referer header
				request['referer'] = None
				for item in request['headers']:
					if item.lower() == 'referer':
						request['referer'] 	 = request['headers'][item]

				# check if domain leaked in headers
				if request['request_id'] in internal_id_to_req_ex_headers:
					if final_url_domain in internal_id_to_req_ex_headers[request['request_id']]:
						request['page_domain_in_headers'] = True

				# convert from timestamp to datetime object that will go to the db
				request['timestamp'] = datetime.fromtimestamp(request['timestamp'])

				# all done
				self.sql_driver.add_request(request)

				# update domains
				if request['is_3p']:
					page_3p_request_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		# PROCESS WEBSOCKETS
		if self.config['store_websockets']:
			if self.debug: print('going to process websocket data %s' % browser_output['start_url'])
			ws_id_map = {}
			for websocket in browser_output['websockets']:
				domain_info = self.url_parser.get_parsed_domain_info(websocket['url'])
				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (websocket['url'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					websocket['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party connection
				if final_url_domain != domain_info['result']['domain']:
					websocket['is_3p'] = True
				else:
					websocket['is_3p'] = False

				websocket['page_id'] = page_id
				this_websocket_id = self.sql_driver.add_websocket(websocket)

				# update domains
				if websocket['is_3p']:
					page_3p_websocket_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

				if websocket['request_id'] not in ws_id_map:
					ws_id_map[websocket['request_id']] = this_websocket_id
				else:
					print('ERROR WS_REQ_ID ALREADY IN MAP')

		# PROCESS WEBSOCKET EVENTS
		if self.config['store_websockets'] and self.config['store_websocket_events']:
			for websocket_event in browser_output['websocket_events']:
				websocket_event['page_id'] = page_id
				if websocket_event['request_id'] in ws_id_map:
					websocket_event['websocket_id'] = ws_id_map[websocket_event['request_id']]
				else:
					websocket_event['websocket_id'] = None

				# convert from timestamp to datetime object that will go to the db
				websocket_event['timestamp'] = datetime.fromtimestamp(websocket_event['timestamp'])

				self.sql_driver.add_websocket_event(websocket_event)

		# PROCESS EVENT SOURCE MSGS
		if self.config['store_event_source_msgs']:
			if self.debug: print('going to process event source data %s' % browser_output['start_url'])
			for event_source_msg in browser_output['event_source_msgs']:
				event_source_msg['page_id'] = page_id

				# convert from timestamp to datetime object that will go to the db
				event_source_msg['timestamp'] = datetime.fromtimestamp(event_source_msg['timestamp'])

				self.sql_driver.add_event_source_msg(event_source_msg)

		# PROCESS COOKIES
		if self.config['store_cookies']:
			if self.debug: print('going to process cookies %s' % browser_output['start_url'])
			for cookie in browser_output['cookies']:
				# get the ip, fqdn, domain, pubsuffix, and tld
				# we need the domain to figure out if cookies/elements are third-party
				# note:
				#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://

				# parse domain from the security_origin, which is equivalent to a url
				domain_info = self.url_parser.get_parsed_domain_info('http://'+cookie['domain'])

				if domain_info['success'] == False:
					err_msg = 'unable to parse domain info for %s with error %s' % (cookie['domain'], domain_info['result'])
					if self.debug: print(err_msg)
					self.sql_driver.log_error({
						'client_id'		: client_id, 
						'target'		: start_url, 
						'task'			: 'output_store',
						'msg'			: err_msg
					})
					continue
				else:
					# self.sql_driver.add_domain both stores the new domain and returns its db row id
					# if it is already in db just return the existing id
					cookie['domain_id'] = self.sql_driver.add_domain(domain_info['result'])

				# mark if third-party cookie
				if final_url_domain != domain_info['result']['domain']:
					cookie['is_3p'] = True
				else:
					cookie['is_3p'] = False

				# key to page
				cookie['page_id'] = page_id

				# fix var names
				cookie['http_only'] = cookie['httpOnly']

				# attempt to convert cookie expiry from timestamp to datetime object, note we 
				#	need try/except as python datetime object cannot have year > 9999 and some 
				#	cookies do that
				cookie['expires_timestamp'] = None
				if cookie['expires']: 
					try:
						cookie['expires_timestamp'] = datetime.fromtimestamp(cookie['expires'])
					except:
						pass

				# this is optional, do fall-back
				if 'sameSite' in cookie:
					cookie['same_site'] = cookie['sameSite']
				else:
					cookie['same_site'] = None

				# see if this cookie was set via http response
				if cookie['domain'][0] == '.': 
					cookie_tuple = (cookie['domain'][1:],cookie['name'])
				else:
					cookie_tuple = (cookie['domain'],cookie['name'])
				
				if cookie_tuple in http_cookies:
					cookie['is_set_by_response'] = True
				else:
					cookie['is_set_by_response'] = False

				# all done with this cookie
				self.sql_driver.add_cookie(cookie)

				# update domains
				if cookie['is_3p']:
					page_3p_cookie_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id']))

		if self.debug: print('done storing scan %s' % browser_output['start_url'])
		return {
			'success'						: True,
			'page_id'						: page_id,
			'page_3p_request_domains'		: page_3p_request_domains,
			'page_3p_response_domains'		: page_3p_response_domains,
			'page_3p_websocket_domains'		: page_3p_websocket_domains,
			'page_3p_dom_storage_domains'	: page_3p_dom_storage_domains,
			'page_3p_cookie_domains'		: page_3p_cookie_domains
		}
	# store_scan

	def store_file(self,body,is_base64,type):
		"""
		Hash the given file body, store it to the database, and
			return the md5 digest string.
		"""

		# fail-safe: base64/image content should not normally reach us
		#	when store_base64 is off, but double-check here anyway
		if not self.config['store_base64'] and (is_base64 or type.lower()=='image'):
			return None

		# the hash reflects the original body; the stored copy below has
		#	\x00 swapped out since postgres cannot handle nulls
		file_md5 = hashlib.md5(body.encode()).hexdigest()

		# on an md5 conflict the insert is a no-op in the db layer,
		#	which is fine since the hash is computed the same way
		self.sql_driver.add_file({
			'md5'		: file_md5,
			'body'		: body.replace('\x00','NULL_REPLACED_FOR_PSQL'),
			'type'		: type.lower(),
			'is_base64'	: is_base64
		})

		return file_md5
	# store_file

	def store_policy(self, browser_output, client_id, client_ip=None):
		"""
		We attempt to figure out if the text provided is a policy
			(eg privacy policy, terms of service), and if so we store
			it to the database and link it to matching pages/crawls.

		Returns a dict with 'success' (bool); on failure 'result'
			holds a human-readable reason.
		"""

		# keep values in a dict here
		policy = {}

		# attempt to get_policy was a success, extract data from
		#	dict, since postgres cannot handle '\x00' we convert to 
		#	string for several fields and use .replace('\x00',' ') to 
		# 	clean the input
		policy['client_id']			= client_id
		policy['client_ip']			= client_ip
		policy['browser_type']		= browser_output['browser_type']
		policy['browser_version']	= browser_output['browser_version']
		policy['browser_prewait']	= browser_output['prewait']
		policy['start_url']			= browser_output['start_url']
		policy['final_url']			= browser_output['final_url']
		policy['title']				= browser_output['title']
		policy['meta_desc']			= browser_output['meta_desc']
		policy['lang']				= browser_output['lang']
		policy['fk_score']			= None
		policy['fre_score']			= None
		policy['word_count']		= None
		policy['type']				= None
		policy['match_term']		= None
		policy['match_text']		= None
		policy['match_text_type']	= None
		policy['confidence']		= None
		policy['page_text_id']		= None
		policy['page_source_md5']	= None

		# if readability failed we bail
		if not browser_output['readability_html'] or not browser_output['page_text']:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'No readability result'
			}

		# ignore any malformed unicode characters
		readability_html 	= browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip()
		page_text 			= browser_output['page_text'].encode('utf-8', 'ignore').decode().strip()
		page_source 		= browser_output['page_source'].encode('utf-8', 'ignore').decode()

		# bail on empty text
		if len(page_text) == 0:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'Empty page text'
			}

		# load the source into lxml so we can do additional processing, 
		#	if we fail we bail
		try:
			lxml_doc = lxml.html.fromstring(readability_html)
		except:
			# close the db handle on this early exit as well, every
			#	other early return closes it and skipping it here
			#	leaked the connection
			self.sql_driver.close()
			return ({
				'success': False,
				'result': 'Could not parse readability_html with lxml'
			})

		# if the text is less than 500 words we ignore it
		if len(page_text.split(' ')) < 500:
			self.sql_driver.close()
			return {
				'success'	: False,
				'result'	: 'Page text < 500 words'
			}

		# once we have the text we figure out if it is 
		#	a policy, start false, override on match
		is_policy = False

		# first look for matches on page title
		# 	we give this confidence of 100 as it is
		#	definitely a match
		if policy['title']:
			policy_type_result = self.determine_policy_type_from_text(policy['title'])
			if policy_type_result['success'] == True:
				is_policy = True
				self._apply_policy_match(policy, policy_type_result, 'title')
				policy['confidence'] = 100

		# deep checks may generate false positives so
		#	they have confidence of 0 until they can
		#	be verified, note we may do this here
		#	or later on
		deep_checks = True
		if deep_checks:
			# NOTE(review): this overwrites the confidence=100 set on a
			#	title match above — confirm that is intended
			policy['confidence'] = 0

			# convert the url path to a sentence by replacing
			#	common delimiters with spaces and attempt matches	
			if self.debug: print('going to do checks on url path')
			if not is_policy:
				url_path_string = re.sub('[-|_|/|\.]',' ',urlsplit(policy['start_url']).path)
				if len(url_path_string) > 0:
					policy_type_result = self.determine_policy_type_from_text(url_path_string)
					if policy_type_result['success'] == True:
						is_policy = True
						self._apply_policy_match(policy, policy_type_result, 'url_path')

			if self.debug: print('going to do checks on meta desc')
			if not is_policy and policy['meta_desc']:
				policy_type_result = self.determine_policy_type_from_text(policy['meta_desc'])
				if policy_type_result['success'] == True:
					is_policy = True
					self._apply_policy_match(policy, policy_type_result, 'meta_desc')

			# iterate over all types of heading tags to extract text 
			#	and check for policy matches.  note we go in order of
			#	importance (eg h1->h7->span,etc)
			if self.debug: print('going to do checks on heading tags')
			if not is_policy:
				for tag_type in ['h1','h2','h3','h4','h5','h6','h7','span','strong','em']:
					if is_policy: break
					tags = lxml_doc.cssselect(tag_type)
					if len(tags) > 0:
						for tag in tags:
							tag_text = tag.text_content()
							# if it is > 15 words it is likely not a heading
							if len(tag_text.split(' ')) > 15: break
							policy_type_result = self.determine_policy_type_from_text(tag_text)
							if policy_type_result['success'] == True:
								is_policy = True
								self._apply_policy_match(policy, policy_type_result, tag_type)

		# if it is a policy we do additional processing
		#	before storing in db, otherwise we fail
		#	gracefully
		if is_policy:
			if self.debug: print('going to store readability_html')
			readability_source_md5 = self.store_file(readability_html, False, 'readability_html')

			# store_page_text handles some additional operations
			if self.debug: print('going to store page_text')
			policy['page_text_id'] = self.store_page_text(readability_html, readability_source_md5)

			if self.debug: print(f"page_text_id is {policy['page_text_id']}")

			if self.debug: print('going to store page_source')
			policy['page_source_md5'] 	= self.store_file(page_source, False, 'page_source')

			if self.debug: print('going to do reading ease scores')
			# get readability scores, scores below zero are
			#	invalid so we null them
			policy['fre_score'] = textstat.flesch_reading_ease(page_text)
			if policy['fre_score'] <= 0:
				policy['fre_score'] = None

			policy['fk_score']  = textstat.flesch_kincaid_grade(page_text)
			if policy['fk_score'] <= 0:
				policy['fk_score'] = None

			if self.debug: print('going to store policy')
			# add to db and get id for this policy
			policy_id  = self.sql_driver.add_policy(policy)

			if self.debug: print('going to link policy to pages')
			# attach policy to all pages that link to this url, note we
			#	only do internal links
			for page_id, crawl_id in self.sql_driver.get_page_ids_from_link_url(policy['start_url'],internal_links_only=True):
				self.sql_driver.attach_policy_to_page(policy_id,page_id)
				self.sql_driver.attach_policy_to_crawl(policy_id,crawl_id)

			if self.debug: 
				# restored thumbs-up emoji (was a mojibake replacement
				#	character) to mirror the failure branch below
				print(f'\t👍 Success: {policy["start_url"]}')
			self.sql_driver.close()
			return {'success': True}
		else:
			if self.debug: 
				print(f'\t👎 Fail: {policy["start_url"]}')
			self.sql_driver.close()
			return {
				'success': False,
				'result': 'Not policy'
			}
	# store_policy

	def _apply_policy_match(self, policy, policy_type_result, match_text_type):
		"""
		Copy the fields of a successful determine_policy_type_from_text
			result into the policy record.  Confidence is set by the
			caller since it varies by match source.
		"""
		policy['type']				= policy_type_result['result']['policy_type']
		policy['match_term']		= policy_type_result['result']['match_term']
		policy['match_text']		= policy_type_result['result']['match_text']
		policy['match_text_type']	= match_text_type
	# _apply_policy_match

	def determine_policy_type_from_text(self, text):
		"""
		Determine if a given text fragment indicates
			a given type of policy.

		Returns dict: {'success': True, 'result': {...}} on a match
			with the policy_type, matched term, and cleaned text;
			{'success': False} otherwise.
		"""

		# collapse runs of whitespace to single spaces
		text = re.sub(r'\s+',' ',text)

		# matching is case-insensitive; hoist the lowering out of
		#	the loops below so it is done once
		lower_text = text.lower()

		# retrieve values from policy_terms.json
		policy_verification_terms = self.utilities.get_policy_verification_terms()

		# randomize the order we do our checks
		policy_type_keys = list(policy_verification_terms)
		random.shuffle(policy_type_keys)

		# look for matches against verification terms
		for policy_type in policy_type_keys:
			for term in policy_verification_terms[policy_type]:
				if term in lower_text:
					return({
						'success': True,
						'result' :{
							'policy_type':	policy_type,
							'match_term':	term,
							'match_text':	text
						}
					})

		# no match
		return ({'success': False})
	# determine_policy_type_from_text

	def store_page_text(self,readability_html,readability_source_md5):
		"""
		Strip markup from the readability html, clean the resulting
			text, store it via the sql_driver, and return the new
			page_text db id.

		The actual 'page_text' output from readability doesn't properly
			seperate words that use markup as a space, eg
			'<h3>this</h3><p>that</p>' becomes 'thisthat' whereas
			'this that' is what a user would see in the browser, so
			we manually strip out html and do some cleaning of our own.
		"""
		# non-greedy + DOTALL so each comment/svg block is removed on
		#	its own; the prior greedy patterns ate everything between
		#	the first opener and the last closer, destroying content,
		#	and missed blocks spanning multiple lines
		page_text = re.sub(r'<!--.+?-->',' ', readability_html, flags=re.DOTALL)
		page_text = re.sub(r'<svg.+?</svg>',' ', page_text, flags=re.DOTALL)
		page_text = re.sub(r'<.+?>', ' ', page_text)
		# note: the old class [\n|\r] also stripped literal '|'
		#	characters, which was unintended
		page_text = re.sub(r'[\n\r]', ' ', page_text)
		page_text = re.sub(r'\s+', ' ', page_text)
		page_text = unicodedata.normalize('NFKD',html.unescape(page_text.strip()))

		# postgres can't handle nulls; after this there are no \x00
		#	left so no second replacement is needed
		page_text = page_text.replace('\x00','NULL_REPLACED_FOR_PSQL')

		# return the id
		return self.sql_driver.add_page_text({
			'text'						: page_text,
			'word_count'				: len(page_text.split()),
			'readability_source_md5' 	: readability_source_md5
		})
	# store_page_text
Beispiel #8
0
class ChromeDriver:
    """
    This class allows for using the production Chrome browser with webXray.
    Requirements are Selenium, Chrome, and ChromeDriver.

    Pros:
        Production browser which is largely identical to real-world use
        By turning headless off it is very similar to a 'real' session
        By turning headless on the CPU/Mem usage is lower than otherwise
    Cons:
        Less testing with webxray than phantomjs, does not handle many paralell instances very well
        In headless mode prior to 64.0.3254.0, the cookie database does not get created and no cookies are returned
    """
    def __init__(self, ua=False, dnt=False):
        """
        Configure global options for this driver instance.

        ua:  user-agent string override, False means browser default
             (see get_ua_for_headless for details)
        dnt: send the DNT header when True
        """

        self.dnt = dnt

        # run Chrome headless by default; set False to watch the browser
        self.headless = True

        # potentially dangerous requests are disabled unless this is
        #   explicitly flipped to True, for hopefully obvious reasons
        self.allow_insecure = False

        # manual overrides for binary locations, set these if you have
        #   trouble getting chrome to start
        self.chromedriver_path = None
        self.chrome_binary_path = None

        # give the browser a full minute to try to download content,
        #   gracefully timeout thereafter
        self.page_timeout_seconds = 60

        # user-agent override as passed in
        self.ua = ua

        # helper object used for various tasks
        self.utilities = Utilities()

        return None

    # init

    def create_chromedriver(self):
        """
        Build and return a configured Chrome webdriver instance, or
            None if the browser could not be started.

        Consolidated here since several functions need a driver.
        """

        # build up the options object from our instance settings
        opts = Options()

        # honor a manually-configured chrome binary
        if self.chrome_binary_path:
            opts.binary_location = self.chrome_binary_path

        # only allow insecure content when explicitly enabled
        if self.allow_insecure:
            opts.add_argument('--allow-running-insecure-content')

        # keep the browser quiet
        opts.add_argument('--mute-audio')

        # in headless mode we also randomize the window size a bit
        if self.headless:
            opts.add_argument('headless')
            opts.add_argument('disable-gpu')
            rand_x = random.randrange(1050, 1920)
            rand_y = random.randrange(900, 1080)
            opts.add_argument('window-size=%sx%s' % (rand_x, rand_y))

        # apply the user-agent override if we have one
        if self.ua:
            opts.add_argument('user-agent=' + self.ua)

        # enable the performance log so network traffic is captured
        caps = {'loggingPrefs': {'performance': 'ALL'}}

        # attempt to start the driver, fail gracefully with None
        try:
            if self.chromedriver_path:
                driver = webdriver.Chrome(self.chromedriver_path,
                                          desired_capabilities=caps,
                                          chrome_options=opts)
            else:
                driver = webdriver.Chrome(desired_capabilities=caps,
                                          chrome_options=opts)
        except:
            return None

        # hard cap on page load time, seperate from browser_wait
        driver.set_page_load_timeout(self.page_timeout_seconds)

        return driver

    # create_chromedriver

    def get_ua_for_headless(self):
        """
        Headless chrome reports a 'Headless' ua string; this launches
            a throw-away browser to read the real ua and strips the
            'Headless' token to help with compatability.

        Expensive (fires up and destroys a browser instance), so call
            it once and reuse the result if possible — this is not done
            in __init__ on purpose.

        Returns the cleaned ua string, or None if no browser started.
        """
        driver = self.create_chromedriver()
        if driver is None:
            return None
        ua_string = driver.execute_script('return navigator.userAgent')
        driver.quit()
        return ua_string.replace('Headless', '')

    # get_ua_for_headless

    def get_webxray_scan_data(self, url, browser_wait):
        """
        This function loads the page, monitors network traffic, and returns relevant data/logs.

        IMPORTANT: headless will miss all cookies in chrome versions < 64.0.3254.0

        This uses the chrome performance log to get network traffic details, see following for details:
            - https://gist.githubusercontent.com/klepikov/5457750/raw/ecedc6dd4eed82f318db91adb923627716fb6b58/test.py
            - https://sites.google.com/a/chromium.org/chromedriver/logging/performance-log
        """

        driver = self.create_chromedriver()
        # we can't start Chrome, return error message as result
        if driver == None:
            return ({
                'success': False,
                'result': 'Unable to launch Chrome instance'
            })

        # allow one minute before we kill it, seperate from browser_wait
        driver.set_page_load_timeout(60)

        # start the page load process, return error message if we fail
        try:
            if self.dnt:
                print("DNT on")
                driver._client.set_header_overrides(headers={'DNT': 1})
            driver.get(url)
        except:
            driver.quit()
            return ({'success': False, 'result': 'Unable to load page'})

        # if the page has an alert window open it will throw the following exception when trying
        #	to get the current_url: selenium.common.exceptions.UnexpectedAlertPresentException
        # in theory we should be able to set an option for UNEXPECTED_ALERT_BEHAVIOUR to ACCEPT
        # 	but it does not seem to be supported by chromedriver at present
        # in some cases we can catch the items we need before an alert fires, otherwise
        # 	we fail gracefully, but this is a bug that needs resolution
        try:
            final_url = driver.current_url
            title = driver.title
            page_source = driver.page_source
        except:
            # quit the driver or it will never die!
            driver.quit()
            return ({
                'success':
                False,
                'result':
                'Unable to load page, possible javascript alert issue'
            })

        # handle odd bug where title is a 'webelement' object
        if not isinstance(title, str): title = None

        # while the browser may be finished loading the page, scripts may still making
        # 	additional requests, so we wait to let all that finish
        time.sleep(browser_wait)

        # We use the Chrome performance log get network traffic. Chrome performance log outputs a
        #	number of independent 'message' events which are keyed to a 'requestId'.  What we want
        #	to send upstream is a dictionary keyed on the requested url so we do a lot of processing
        #	here to stitch together a coherent log in the format expected by wbxr.
        #
        # There are two types of network events we are concerned with: normal http
        #	requests (initiated by Network.requestWillBeSent) and websocket requests (initiated
        #	by Network.webSocketCreated).
        #
        # For normal events, we add entries to the 'requests' dictionary which we key to the requested
        #	url.  The reason for this is a single requestId may correspond with many urls in
        #	cases where a request results in redirects occuring.  However, data from the
        #	Network.loadingFinished event does not include the url, so we key that seperately
        #	in the load_finish_data dict and then attach it later on.  Note that if a request to
        #	x.com results in redirects to y.com and z.com, all three will end up sharing
        #	the same loadingFinished data.
        #
        # webSocket events are a special case in that they are not strictly HTTP events, but
        #	they do two things we are concerned with: potentially linking a user to
        #	a third-party domain and setting cookies.  The url contacted is only exposed in the
        #	first event, Network.webSocketCreated, so we must use the requestId to tie together
        #	subsequent Network.webSocketWillSendHandshakeRequest and
        #	Network.webSocketHandshakeResponseReceived events.  We use the dictionary websocket_requests
        #	to keep track of such events, and we then reprocess them to be keyed to the url in our
        #	normal requests log.  Note that to keep track of websocket request we use 'websocket'
        #	for content type, and there may be a better way to handle this.

        # http requests are keyed to URL
        requests = {}

        # these events are keyed to requestID
        load_finish_data = {}
        websocket_requests = {}

        # to get page load time we will figure out when the first request and final load finished occured
        first_start_time = None
        last_end_time = None

        # for debuging
        duplicate_keys = []

        # crunch through all the chrome logs here, the main event!
        for log_item in driver.get_log('performance'):
            for key, this_log_item in log_item.items():
                # we are only interested in message events
                if key == 'message':
                    # we have to read in this value to get json data
                    log_item_data = json.loads(this_log_item)
                    message_type = log_item_data['message']['method']

                    ################################
                    # normal http event processing #
                    ################################

                    # we have a new http event, create new empty entry keyed to url
                    # and keep track of start time info
                    if message_type == 'Network.requestWillBeSent':
                        this_request = log_item_data['message']['params'][
                            'request']
                        this_url = this_request['url']

                        # skip if not http(s)
                        if not re.match('^https?://', this_url): continue

                        # the presence of 'redirectResponse' means a prior request is redirected
                        #	so we update the status of the original request here and
                        #	then continue processing the current request
                        if 'redirectResponse' in log_item_data['message'][
                                'params']:
                            redirect_info = log_item_data['message']['params'][
                                'redirectResponse']
                            original_url = redirect_info['url']

                            # the request was received, mark it
                            requests[original_url].update({'received': True})

                            # record status code and text
                            requests[original_url].update(
                                {'status': redirect_info['status']})
                            requests[original_url].update(
                                {'status_text': redirect_info['statusText']})

                            # try to get response headers, fail gracefully as they are already None
                            try:
                                requests[this_url].update({
                                    'response_headers':
                                    this_response['headersText']
                                })
                            except:
                                pass

                            try:
                                requests[this_url].update({
                                    'content_type':
                                    this_response['headers']['Content-Type']
                                })
                            except:
                                pass

                        # if a new request we initialize entry
                        if this_url not in requests:
                            requests[this_url] = {}

                            # we use this to get the load_finish_data later on
                            requests[this_url].update({
                                'request_id':
                                log_item_data['message']['params']['requestId']
                            })

                            # we set received to false to start with
                            requests[this_url].update({'received': False})

                            # initialze response values to None in case we don't get response
                            requests[this_url].update({'end_time': None})
                            requests[this_url].update({'status': None})
                            requests[this_url].update({'status_text': None})
                            requests[this_url].update(
                                {'response_headers': None})
                            requests[this_url].update({'content_type': None})
                            requests[this_url].update({'body_size': None})
                            requests[this_url].update({'end_time': None})
                            requests[this_url].update({'user_agent': None})
                            requests[this_url].update({'referer': None})

                            # each request has a start_time, we use this to figure out the time it took to download
                            this_start_time = log_item_data['message'][
                                'params']['timestamp']
                            requests[this_url].update(
                                {'start_time': this_start_time})

                            # update global start time to measure page load time
                            if first_start_time == None or this_start_time < first_start_time:
                                first_start_time = this_start_time

                            # get the request headers
                            requests[this_url].update(
                                {'request_headers': this_request['headers']})

                            # these can fail, if so, we ignore
                            try:
                                requests[this_url].update({
                                    'user_agent':
                                    this_request['headers']['User-Agent']
                                })
                            except:
                                pass

                            try:
                                requests[this_url].update({
                                    'referer':
                                    this_request['headers']['Referer']
                                })
                            except:
                                pass
                        # this_url already exists, log
                        else:
                            duplicate_keys.append(this_url)
                            continue

                    # we have received a response to our request, update appropriately
                    if message_type == 'Network.responseReceived':
                        this_response = log_item_data['message']['params'][
                            'response']
                        this_url = this_response['url']

                        # skip if not http(s)
                        if not re.match('^https?://', this_url): continue

                        # the request was received, mark it
                        requests[this_url].update({'received': True})

                        # record status code and text
                        requests[this_url].update(
                            {'status': this_response['status']})
                        requests[this_url].update(
                            {'status_text': this_response['statusText']})

                        # try to get response headers, fail gracefully as they are already None
                        try:
                            requests[this_url].update({
                                'response_headers':
                                this_response['headersText']
                            })
                        except:
                            pass

                        try:
                            requests[this_url].update({
                                'content_type':
                                this_response['headers']['Content-Type']
                            })
                        except:
                            pass

                    # load finish events are keyed to requestId and may apply to many requested urls
                    #	so we keep this in a seperate dictionary to be relinked when we're done
                    if message_type == 'Network.loadingFinished':
                        this_request_id = log_item_data['message']['params'][
                            'requestId']
                        this_end_time = log_item_data['message']['params'][
                            'timestamp']

                        # update global end time
                        if last_end_time == None or this_end_time > last_end_time:
                            last_end_time = this_end_time

                        if this_request_id not in load_finish_data:
                            load_finish_data[this_request_id] = {}

                        # size is updated during loading and is shown in logs, but we only want the final size which is here
                        load_finish_data[this_request_id].update({
                            'body_size':
                            log_item_data['message']['params']
                            ['encodedDataLength']
                        })

                        # we use this to calculate the total time for all requests
                        load_finish_data[this_request_id].update(
                            {'end_time': this_end_time})

                    ##############################
                    # webSocket event processing #
                    ##############################

                    # we have a new websocket, create new empty entry keyed to requestId
                    # 	this will be rekeyed to url
                    # note we ignore timing data for websockets
                    if message_type == 'Network.webSocketCreated':
                        this_url = log_item_data['message']['params']['url']
                        this_request_id = log_item_data['message']['params'][
                            'requestId']

                        if this_request_id not in websocket_requests:
                            websocket_requests[this_request_id] = {}
                            websocket_requests[this_request_id].update(
                                {'url': this_url})
                            websocket_requests[this_request_id].update(
                                {'content_type': 'websocket'})
                            websocket_requests[this_request_id].update(
                                {'received': False})
                            websocket_requests[this_request_id].update(
                                {'end_time': None})
                            websocket_requests[this_request_id].update(
                                {'status': None})
                            websocket_requests[this_request_id].update(
                                {'status_text': None})
                            websocket_requests[this_request_id].update(
                                {'response_headers': None})
                            websocket_requests[this_request_id].update(
                                {'body_size': None})
                            websocket_requests[this_request_id].update(
                                {'end_time': None})
                            websocket_requests[this_request_id].update(
                                {'start_time': None})
                            websocket_requests[this_request_id].update(
                                {'user_agent': None})
                            websocket_requests[this_request_id].update(
                                {'referer': None})

                    # websocket request made, update relevant fields
                    if message_type == 'Network.webSocketWillSendHandshakeRequest':
                        this_request = log_item_data['message']['params'][
                            'request']
                        this_request_id = log_item_data['message']['params'][
                            'requestId']
                        websocket_requests[this_request_id].update(
                            {'request_headers': this_request['headers']})
                        websocket_requests[this_request_id].update({
                            'user_agent':
                            this_request['headers']['User-Agent']
                        })

                    # websocket response received, update relevant fields
                    if message_type == 'Network.webSocketHandshakeResponseReceived':
                        this_response = log_item_data['message']['params'][
                            'response']
                        this_request_id = log_item_data['message']['params'][
                            'requestId']
                        websocket_requests[this_request_id].update(
                            {'received': True})
                        websocket_requests[this_request_id].update(
                            {'status': this_response['status']})
                        websocket_requests[this_request_id].update(
                            {'status_text': this_response['statusText']})
                        websocket_requests[this_request_id].update(
                            {'response_headers': this_response['headersText']})
        # end log processing loop

        # append load finish info to requests
        for this_url in requests:
            this_request_id = requests[this_url]['request_id']
            if this_request_id in load_finish_data:
                requests[this_url].update({
                    'body_size':
                    load_finish_data[this_request_id]['body_size']
                })

                # load_time is start time minus end time,
                # 	multiplied by 1k to convert to miliseconds
                load_time = (load_finish_data[this_request_id]['end_time'] -
                             requests[this_url]['start_time']) * 1000

                # we shouldn't be getting <=0, but make it null if this happens
                if load_time <= 0:
                    requests[this_url].update({'load_time': load_time})
                else:
                    requests[this_url].update({'load_time': None})
            else:
                requests[this_url].update({'body_size': None})
                requests[this_url].update({'load_time': None})

        # append websocket data to requests data
        for item in websocket_requests:
            requests[websocket_requests[item]
                     ['url']] = websocket_requests[item]

        # return all the links for later processing
        all_links = []
        try:
            links = driver.find_elements_by_tag_name('a')
            for link in links:
                all_links.append(
                    [link.get_attribute('text'),
                     link.get_attribute('href')])
        except:
            pass

        # get the page meta description
        try:
            meta_desc = driver.find_element_by_xpath(
                "//meta[@name='description']").get_attribute("content")
        except:
            meta_desc = None

        # get the language of the page
        try:
            lang = driver.find_element_by_xpath('/html').get_attribute('lang')
        except:
            lang = None

        # get all the cookies
        # 	the selenium get_cookies method does not return third-party cookies
        #	so we open the cookie db directly from the chrome profile
        #	note that in headless mode this does not work in chrome versions
        #	prior to 64.0.3254.0 and no cookies will be returned
        cookies = []
        try:
            conn = sqlite3.connect(
                driver.capabilities['chrome']['userDataDir'] +
                '/Default/Cookies')
            c = conn.cursor()
            c.execute(
                "SELECT name,is_secure,path,host_key,expires_utc,is_httponly,value FROM cookies"
            )
            for cookie in c.fetchall():
                cookies.append({
                    'name': cookie[0],
                    'secure': cookie[1],
                    'path': cookie[2],
                    'domain': cookie[3],
                    'expiry': cookie[4],
                    'httponly': cookie[5],
                    'value': cookie[6]
                })
        except:
            return ({
                'success':
                False,
                'result':
                'Cookie database not loaded, if this message appears often something is fundamentally wrong and requires attention!'
            })

        if self.headless == True:
            browser_version = driver.capabilities['version'] + ' [headless]'
        else:
            browser_version = driver.capabilities['version']

        # other parts of webxray expect this data format, common to all browser drivers used
        return_dict = {
            'browser_type': driver.capabilities['browserName'],
            'browser_version': browser_version,
            'browser_wait': browser_wait,
            'start_url': url,
            'final_url': final_url,
            'title': title,
            'meta_desc': meta_desc,
            'lang': lang,
            'load_time': int((last_end_time - first_start_time) * 1000),
            'processed_requests': requests,
            'cookies': cookies,
            'all_links': all_links,
            'source': page_source
        }

        # quit the driver or it will never die!
        driver.quit()

        return ({'success': True, 'result': return_dict})

    # get_webxray_scan_data

    def get_page_source_text_title_lang(self, url):
        """
        Given a url, this function attempts to load the page, inject
            the Readability.js library, execute it, and extract
            only the text of a given page.

        Args:
            url: address of the page to load

        Returns:
            dict with 'success' (bool) and 'result'; on success 'result'
            holds 'page_source', 'page_text', 'page_title', and
            'page_lang', on failure it holds an error message string.
        """

        # set up browser instance
        driver = self.create_chromedriver()

        # browser hasn't started and error already printed to cli
        if driver is None:
            return ({
                'success': False,
                'result': 'Unable to launch Chrome instance'
            })

        # starts the page load process, quits driver and returns error if we fail
        try:
            driver.get(url)
        except Exception:
            driver.quit()
            return ({'success': False, 'result': 'Unable to load page'})

        # if we can't get source something is wrong, return error
        try:
            page_source = driver.page_source
        except Exception:
            driver.quit()
            return ({'success': False, 'result': 'Unable to read page source'})

        # if page title fails put in a null value
        try:
            page_title = driver.title
        except Exception:
            page_title = None

        # get the language of the page
        try:
            page_lang = driver.find_element_by_xpath('/html').get_attribute(
                'lang')
        except Exception:
            page_lang = None

        # inject the locally downloaded copy of readability into the page
        #	and extract the content
        #
        # NOTE: you must download readability on your own and place in the
        #	appropriate directory
        #
        # bugfix: use a context manager so the file handle is closed;
        #	the original open() call leaked it
        readability_path = (os.path.dirname(os.path.abspath(__file__)) +
                            '/resources/policyxray/readability.js')
        with open(readability_path, 'r', encoding='utf-8') as readability_file:
            readability_js = readability_file.read()

        try:
            # bugfix: json.dumps quotes and escapes the url so characters
            #	such as double-quotes cannot break the injected script
            #	(the original bare "%s" interpolation could)
            readability_result = json.loads(
                driver.execute_script("""
                %s
                var readabilityObj = new Readability(%s, document);
                return JSON.stringify(readabilityObj.parse(), null, 2);
            """ % (readability_js, json.dumps(url))))
            driver.quit()
        except Exception:
            driver.quit()
            return ({
                'success': False,
                'result': 'Unable to inject Readability.js'
            })

        # readability failure, return error
        if readability_result is None:
            return ({'success': False, 'result': 'Empty Readability result'})

        # readability output contains HTML formatting: strip tags and
        #	entities, then collapse runs of whitespace
        try:
            page_text = re.sub(
                r'\s+', ' ',
                re.sub('&.+?;', ' ',
                       re.sub('<.+?>', ' ', readability_result['content'])))
        except Exception:
            return ({
                'success': False,
                'result': 'Unable to remove HTML from Readability result'
            })

        # the page text is empty, return error
        if len(page_text) == 0:
            return ({
                'success': False,
                'result': 'Empty result after HTML removal'
            })

        # looks good, return the extracted data
        return ({
            'success': True,
            'result': {
                'page_source': page_source,
                'page_text': page_text,
                'page_title': page_title,
                'page_lang': page_lang
            }
        })

    # get_page_source_text_title_lang

    def get_absolute_page_links(self, url):
        """
        Returns all links on the page, changes relative links to
            absolute links as well.

        Args:
            url: address of the page to load

        Returns:
            dict with 'success' (bool); on success 'result' is a set of
            (link_text, absolute_url) tuples, on failure it is an error
            message string.
        """

        # initialize browser instance
        driver = self.create_chromedriver()

        # browser hasn't started and error already printed to cli
        if driver is None:
            return ({
                'success': False,
                'result': 'Unable to launch Chrome instance'
            })

        # allow one minute before we kill it
        driver.set_page_load_timeout(60)

        # starts the page load process, quits driver and returns error if we fail
        try:
            driver.get(url)
        except Exception:
            driver.quit()
            return ({'success': False, 'result': 'Unable to load page'})

        # page has now been loaded, we process all the links
        all_links = set()

        # either there are no links or something has gone wrong
        try:
            links = driver.find_elements_by_tag_name('a')
        except Exception:
            driver.quit()
            return ({'success': False, 'result': 'Unable to extract links'})

        # process the links
        for link in links:
            # to cope with:
            # selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
            try:
                this_link = link.get_attribute('href')
                # bugfix: the original class '[\n|\r|\t]' also matched a
                #	literal '|' and stripped pipes from link text
                this_link_text = re.sub(
                    r'\s+', ' ',
                    re.sub(r'[\n\r\t]', ' ',
                           link.get_attribute('text').strip()))
            except Exception:
                continue

            # sometimes can be empty
            if this_link is None: continue
            if len(this_link) == 0: continue

            # add in the absolute url from the link to our set
            all_links.add((this_link_text,
                           self.utilities.get_absolute_url_from_page_link(
                               url, this_link)))
        driver.quit()

        return ({'success': True, 'result': all_links})
	def __init__(self, db_engine, db_name):
		"""
		Record the database engine/name to use and set up the helper
			objects shared by the rest of this class.
		"""
		self.db_name = db_name
		self.db_engine = db_engine
		self.url_parser = ParseURL()
		self.utilities = Utilities()
Beispiel #10
0
class Collector:
    """
	This class does the main work of sorting out the page address to process

	the list of pages **must** be in the ./page_lists directory or it will not work

	when checking page addresses it skips over binary documents with known extensions
		and makes sure we aren't duplicating pages that have already been analyzed
		this means it is safe to re-run on the same list as it won't duplicate entries, but it
		*will* retry pages that may not have loaded
	"""
    def __init__(self, db_name=None, db_engine=None, client_id=None):
        """
        Set up global state for this collector.

        This class can be called to run store_results_from_queue, which
            connects to the server_config database to fetch results; in
            that case no global db_name is needed, hence the None
            defaults.  When a db_name *is* supplied we load the global
            and browser configs from that database.
        """
        self.db_name = db_name
        self.db_engine = db_engine
        self.client_id = client_id
        self.debug = True
        self.utilities = Utilities()

        # with a db_name we pull the stored config for this db
        if db_name:
            # set up database connection
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            self.config = sql_driver.get_config()

            # browser-relevant settings are those whose key mentions 'client'
            self.browser_config = {
                item: self.config[item]
                for item in self.config if 'client' in item
            }

            sql_driver.close()

    # __init__

    def process_tasks_from_queue(self, process_num):
        """
        Selects the next page from the task_queue and passes it to a
            fresh browser instance.  If the load is unsuccessful the
            page is placed back into the queue (or marked failed for
            known-permanent errors) and attempts are updated.  Returns
            once there are no pages in the queue under max_attempts.

        Args:
            process_num: integer id of this worker process, used for
                log output and as the browser debugging port offset.
        """

        print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

        # need a local connection for each queue manager
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            # bugfix: this formerly interpolated the undefined global
            #	'db_engine', raising NameError before the message printed
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # keep getting tasks from queue until none are left at max attempt level
        while sql_driver.get_task_queue_length(
                max_attempts=self.config['max_attempts'],
                unlocked_only=True) != 0:
            # it is possible for two processes to both pass the above conditional
            #	and then try to get a task from the queue at the same time.
            #	however, the second process that attempts to get a task will
            #	get an empty result (and crash), so we have a try/except block here
            #	to handle that condition gracefully
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=self.config['max_attempts'],
                    client_id=self.client_id)
            except Exception:
                break

            print('\t[p.%s]\t👉 Initializing: %s for target %s' %
                  (process_num, task, target[:50]))

            # import and set up specified browser driver
            # 	note we set up a new browser each time to
            #	get a fresh profile
            if self.browser_config['client_browser_type'] == 'chrome':
                browser_driver = ChromeDriver(self.browser_config,
                                              port_offset=process_num)
            else:
                print(
                    f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!"
                )
                return

            # does the webxray scan or policy capture
            if task == 'get_scan':
                task_result = browser_driver.get_scan(target)
            elif task == 'get_crawl':
                task_result = browser_driver.get_crawl(json.loads(target))
            elif task == 'get_policy':
                task_result = browser_driver.get_scan(target,
                                                      get_text_only=True)
            elif task == 'get_random_crawl':
                task_result = browser_driver.get_random_crawl(target)
            else:
                # bugfix: an unrecognized task formerly fell through and
                #	crashed below on an undefined task_result; unlock it
                #	so it can be retried once the client is fixed
                del browser_driver
                sql_driver.unlock_task_in_queue(target, task)
                continue

            # kill browser
            del browser_driver

            # browser has failed to get result, unlock and continue
            if task_result['success'] == False:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], task_result['result']))

                # for times we don't want to retry, such as a rejected
                #	redirect or network resolution failure, this could be expanded
                fail_cases = [
                    'reached fail limit', 'rejecting redirect',
                    'did not find enough internal links'
                ]

                if task_result[
                        'result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result[
                            'result']:
                    sql_driver.set_task_as_failed(target, task)
                else:
                    sql_driver.unlock_task_in_queue(target, task)

                # keep track of error regardless of fail/unlock
                sql_driver.log_error({
                    'client_id': 'localhost',
                    'target': target,
                    'task': task,
                    'msg': task_result['result']
                })
                continue

            # debug
            if self.debug:
                print(
                    '\t[p.%s]\t📥 Got browser result on task %s, going to store: %s'
                    % (process_num, task, target[:50]))

            # store_result also handles task queue mangement
            store_result = self.store_result({
                'target': target,
                'task': task,
                'task_result': task_result['result'],
                'client_id': self.client_id
            })

            if store_result['success'] == True:
                print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
            else:
                print(
                    f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}'
                )

        # tidy up
        sql_driver.close()
        del sql_driver

        print('\t[p.%s]\t✋ Completed process' % process_num)
        return

    # process_tasks_from_queue

    def store_result(self, params):
        """
        Handles storing task_result and removing jobs
            from the task_queue.

        Args:
            params: dict with keys 'target', 'task', 'task_result', and
                'client_id'; optionally 'client_ip' and 'db_name' (the
                latter when running in server mode).

        Returns:
            dict with 'success' (bool); on failure 'result' holds an
            error message string.
        """

        # unpack params
        target = params['target']
        task = params['task']
        task_result = params['task_result']
        client_id = params['client_id']

        # client_ip is optional
        if 'client_ip' in params:
            client_ip = params['client_ip']
        else:
            client_ip = None

        # if db_name is specified we are running in server mode and we
        #	connect to the db which corresponds to the result being
        #	processed.  otherwise, we use the global db_name as we are
        #	running in non-server mode.
        if 'db_name' in params:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(params['db_name'])
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(params['db_name'])
            else:
                # bugfix: this formerly referenced the undefined name
                #	'db_engine', raising NameError instead of printing
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()
            output_store = OutputStore(params['db_name'], self.db_engine)
        else:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                # bugfix: same undefined-name fix as above
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()

            output_store = OutputStore(self.db_name, self.db_engine)

        if task == 'get_policy':
            store_result = output_store.store_policy(task_result,
                                                     client_id,
                                                     client_ip=client_ip)
            # we never retry policies
            sql_driver.remove_task_from_queue(target, task)
            if store_result['success']:
                result = {'success': True}
            else:
                # log error
                sql_driver.log_error({
                    'client_id': client_id,
                    'task': task,
                    'target': target,
                    'msg': 'output_store fail on ' + store_result['result']
                })
                result = {'success': False, 'result': store_result['result']}
        # handles get_scan, get_crawl, and get_random_crawl
        else:
            all_crawls_ok = True

            # We want to be able to re-run random crawls, and to do so we make sure
            #	the crawl_id will match
            if task == 'get_crawl' or task == 'get_scan':
                crawl_id = target
            elif task == 'get_random_crawl':
                crawl_id = []
                for crawl_result in task_result:
                    crawl_id.append(crawl_result['start_url'])
                crawl_id = json.dumps(crawl_id)

            # tweak to account for differences between scans/crawls
            if task == 'get_scan': task_result = [task_result]

            # keep track of domains
            all_3p_cookie_domains = set()
            all_3p_dom_storage_domains = set()
            all_3p_request_domains = set()
            all_3p_response_domains = set()
            all_3p_websocket_domains = set()

            # When we store a crawl we add optional fields in the page table
            #	that allow us to connect the page loads into a single crawl.
            #	the crawl_id is a hash of the target (which is a json string
            #	derived from the url_list), and the crawl_timestamp which is the
            #	first accessed time from the crawl.
            for crawl_sequence, crawl_result in enumerate(task_result):
                store_result = output_store.store_scan({
                    'browser_output': crawl_result,
                    'client_id': client_id,
                    'crawl_id': crawl_id,
                    'crawl_timestamp': task_result[0]['accessed'],
                    'crawl_sequence': crawl_sequence,
                    'client_ip': client_ip
                })

                if store_result['success'] != True:
                    all_crawls_ok = False
                else:
                    # we are successful, create entries in page_lookup table
                    #
                    # NOTE(review): 'cookies' is filled from the dom_storage
                    #	domains — this looks like a copy-paste; confirm
                    #	whether store_scan returns a dedicated
                    #	'page_3p_cookie_domains' set that should be used
                    page_lookup_table = self.build_lookup_table(
                        'page', store_result['page_id'], {
                            'requests':
                            store_result['page_3p_request_domains'],
                            'responses':
                            store_result['page_3p_response_domains'],
                            'websockets':
                            store_result['page_3p_websocket_domains'],
                            'dom_storage':
                            store_result['page_3p_dom_storage_domains'],
                            'cookies':
                            store_result['page_3p_dom_storage_domains']
                        })

                    for lookup_item in page_lookup_table:
                        sql_driver.add_page_id_domain_lookup_item(
                            page_lookup_table[lookup_item])

                    # we are also making a lookup table for the crawl, keep joining the
                    #	sets as we go along
                    all_3p_request_domains.update(
                        store_result['page_3p_request_domains'])
                    all_3p_response_domains.update(
                        store_result['page_3p_response_domains'])
                    all_3p_websocket_domains.update(
                        store_result['page_3p_websocket_domains'])
                    all_3p_dom_storage_domains.update(
                        store_result['page_3p_dom_storage_domains'])
                    # NOTE(review): mirrors dom_storage — see note above
                    all_3p_cookie_domains.update(
                        store_result['page_3p_dom_storage_domains'])

            if all_crawls_ok:
                sql_driver.remove_task_from_queue(target, task)
                result = {'success': True}

                # build crawl lookup table
                crawl_lookup_table = self.build_lookup_table(
                    'crawl', crawl_id, {
                        'requests': all_3p_request_domains,
                        'responses': all_3p_response_domains,
                        'websockets': all_3p_websocket_domains,
                        'dom_storage': all_3p_dom_storage_domains,
                        'cookies': all_3p_cookie_domains
                    })

                # patch lookup table
                for lookup_item in crawl_lookup_table:
                    sql_driver.add_crawl_id_domain_lookup_item(
                        crawl_lookup_table[lookup_item])

            else:
                sql_driver.unlock_task_in_queue(target, task)
                # log error
                sql_driver.log_error({
                    'client_id': client_id,
                    'task': task,
                    'target': target,
                    'msg':
                    'output_store fail to store all scans for crawl_id_target '
                    + target
                })
                result = {
                    'success': False,
                    'result': 'unable to store all crawl loads'
                }

        # tidy up
        output_store.close()
        sql_driver.close()

        # done
        return result

    # store_result

    def build_lookup_table(self, type, id, domains):
        """
		Take all the domains by type and build a lookup table we
			can insert to db.  type is for either page or crawl.
		"""
        domain_lookup_table = {}

        # if given domain/type is not in lookup table we create new
        #	entry, otherwise update extant entry
        for domain, domain_owner_id in domains['requests']:
            if domain not in domain_lookup_table:
                domain_lookup_table[domain] = {
                    f'{type}_id': id,
                    'domain': domain,
                    'domain_owner_id': domain_owner_id,
                    'is_request': True,
                    'is_response': False,
                    'is_cookie': False,
                    'is_websocket': False,
                    'is_domstorage': False
                }

        for domain, domain_owner_id in domains['responses']:
            if domain not in domain_lookup_table:
                domain_lookup_table[domain] = {
                    f'{type}_id': id,
                    'domain': domain,
                    'domain_owner_id': domain_owner_id,
                    'is_request': False,
                    'is_response': True,
                    'is_cookie': False,
                    'is_websocket': False,
                    'is_domstorage': False
                }
            else:
                domain_lookup_table[domain]['is_response'] = True

        for domain, domain_owner_id in domains['websockets']:
            if domain not in domain_lookup_table:
                domain_lookup_table[domain] = {
                    f'{type}_id': id,
                    'domain': domain,
                    'domain_owner_id': domain_owner_id,
                    'is_request': False,
                    'is_response': False,
                    'is_cookie': False,
                    'is_websocket': True,
                    'is_domstorage': False
                }
            else:
                domain_lookup_table[domain]['is_websocket'] = True

        for domain, domain_owner_id in domains['dom_storage']:
            if domain not in domain_lookup_table:
                domain_lookup_table[domain] = {
                    f'{type}_id': id,
                    'domain': domain,
                    'domain_owner_id': domain_owner_id,
                    'is_request': False,
                    'is_response': False,
                    'is_cookie': False,
                    'is_websocket': False,
                    'is_domstorage': True
                }
            else:
                domain_lookup_table[domain]['is_domstorage'] = True

        for domain, domain_owner_id in domains['cookies']:
            if domain not in domain_lookup_table:
                domain_lookup_table[domain] = {
                    f'{type}_id': id,
                    'domain': domain,
                    'domain_owner_id': domain_owner_id,
                    'is_request': False,
                    'is_response': False,
                    'is_cookie': True,
                    'is_websocket': False,
                    'is_domstorage': False
                }
            else:
                domain_lookup_table[domain]['is_cookie'] = True

        return domain_lookup_table

    # build_lookup_table

    def build_crawl_task_queue(self, params):
        """
		Enter crawl tasks to the database after performing checks to 
			verify urls are valid.
		"""

        # these vars are specific to this function
        crawl_file_name = params['crawl_file_name']
        flush_crawl_task_queue = params['flush_crawl_task_queue']

        # only need this sql_driver to build the task list
        sql_driver = PostgreSQLDriver(self.db_name)

        # open list of pages
        try:
            crawl_list = json.load(
                open(os.path.dirname(os.path.abspath(__file__)) +
                     '/../crawl_lists/' + crawl_file_name,
                     'r',
                     encoding='utf-8'))
        except:
            print(
                f'Could not open {crawl_file_name}, is it correctly formatted and present in the ./crawl_lists directory?  Exiting.'
            )
            sql_driver.close()
            exit()

        # get rid of whatever is in there already
        if flush_crawl_task_queue:
            sql_driver.flush_task_queue(task='get_crawl')

        for count, url_list in enumerate(crawl_list):
            # first make sure the urls are valid, if we
            #	encounterd a non-valid url we trash the
            #	entire list
            url_list_valid = True

            # we keep our fixed urls here
            idna_url_list = []

            # look at each url
            for url in url_list:
                if self.utilities.is_url_valid(url) == False:
                    print(
                        f'{url} is not valid from {url_list}, not entering crawl to queue'
                    )
                    url_list_valid = False
                    break

                # perform idna fix
                idna_url_list.append(self.utilities.idna_encode_url(url))

            # we need to put the continue here for the outer loop
            if url_list_valid == False: continue

            # if we are allowing time series we see if page has been scanned in the
            #	specified interval, otherwise if we are *not* allowing a time series
            #	we skip anything already in the db
            if self.config['timeseries_enabled']:
                if sql_driver.crawl_exists(json.dumps(idna_url_list),
                                           timeseries_interval=self.
                                           config['timeseries_interval']):
                    print(f'\t{count} | {url[:30]}... Scanned too recently.')
                    continue
            else:
                if sql_driver.crawl_exists(json.dumps(idna_url_list)):
                    print(f'\t{count} | {url[:30]}... Exists in DB, skipping.')
                    continue

            # we have a valid list, queue it up!
            if url_list_valid:
                sql_driver.add_task_to_queue(json.dumps(idna_url_list),
                                             'get_crawl')
            print(f'\t{count} | {str(idna_url_list)[:30]}... Adding to queue.')

        # done
        sql_driver.close()

    # build_crawl_task_queue

    def build_scan_task_queue(self, params):
        """
		Takes a given list of pages and puts them into a queue
			to be scanned either by the same machine building 
			the queue, or remote machines.
		"""

        # these vars are specific to this function
        pages_file_name = params['pages_file_name']
        flush_scan_task_queue = params['flush_scan_task_queue']
        task = params['task']

        # set up sql connection used to determine if items are already in the db
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        # open list of pages
        try:
            url_list = open(os.path.dirname(os.path.abspath(__file__)) +
                            '/../page_lists/' + pages_file_name,
                            'r',
                            encoding='utf-8')
        except:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % pages_file_name)
            sql_driver.close()
            exit()

        # get list of pages already scanned
        already_scanned = []
        print('\tFetching list of pages already scanned...')
        if self.config['timeseries_enabled']:
            for url, in sql_driver.get_all_pages_exist(
                    timeseries_interval=self.config['timeseries_interval']):
                already_scanned.append(url)
        else:
            for url, in sql_driver.get_all_pages_exist():
                already_scanned.append(url)
        print(f'\t => {len(already_scanned)} pages already scanned')

        # get rid of whatever is in there already
        if flush_scan_task_queue:
            sql_driver.flush_task_queue(task=task)

        # simple counter used solely for updates to CLI
        count = 0

        print('\t---------------------')
        print('\t Building Page Queue ')
        print('\t---------------------')

        for url in url_list:
            # skip lines that are comments
            if "#" in url[0]: continue

            count += 1

            # make sure url is valid
            if self.utilities.is_url_valid(url) == False:
                print(f'\t\t{count} | {url} is invalid')
                continue

            # perform idna fix
            url = self.utilities.idna_encode_url(url)

            # if we are allowing time series we see if page has been scanned in the
            #	specified interval, otherwise if we are *not* allowing a time series
            #	we skip anything already in the db
            if url in already_scanned and self.config['timeseries_enabled']:
                print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
                continue

            elif url in already_scanned:
                print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
                continue

            # add to the queue, duplicates will be
            #	ignored
            sql_driver.add_task_to_queue(url, task)
            print(f'\t\t{count} | {url[:30]}... Adding to queue.')

        # close the db connection
        sql_driver.close()

    # build_scan_task_queue

    def build_policy_task_queue(self,
                                flush_policy_task_queue=True,
                                timeseries_interval=10080):
        """
		Takes a given list of pages and puts them into a queue
			to be scanned either by the same machine building 
			the queue, or remote machines.
		"""

        # set up new db connection
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        # get rid of whatever is in there already
        if flush_policy_task_queue:
            sql_driver.flush_task_queue(task='get_policy')

        # get list of all policies we have
        scanned_policies = []
        for policy_url, in sql_driver.get_scanned_policy_urls():
            scanned_policies.append(policy_url)

        # run the query and add to list
        for policy_url, in sql_driver.get_policies_to_collect():
            # if page has an anchor, we drop everything after
            if policy_url[-1] == '#':
                policy_url = policy_url[:-1]
            elif '#' in policy_url:
                policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

            # skip invalid links
            if not self.utilities.is_url_valid(policy_url): continue

            # already did it, skip
            if policy_url in scanned_policies: continue

            sql_driver.add_task_to_queue(policy_url, 'get_policy')

        # fyi
        print('\t%s pages in task_queue for get_policy' %
              sql_driver.get_task_queue_length(task='get_policy'))

        # we no longer need this db connection
        sql_driver.close()

    # build_policy_task_queue

    def store_results_from_queue(self, process_num):
        """
		If we are using a result queue this function will process
			all pending results.
		"""

        # set up new db connection to the server
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        server_sql_driver = PostgreSQLDriver('server_config')

        # time to sleep when queue is empty
        wait_time = 5

        # loop continues indefintely
        while True:
            result = server_sql_driver.get_result_from_queue()
            if not result:
                print(
                    f'\t[p.{process_num}]\t😴 Going to sleep for {wait_time} seconds to wait for more tasks.'
                )
                time.sleep(wait_time)
                continue

            # result is a dictionary object, unpack it
            result_id = result['result_id']
            client_id = result['client_id']
            client_ip = result['client_ip']
            mapped_db = result['mapped_db']
            target = result['target']
            task = result['task']

            # the task_result needs to be uncompressed
            task_result = json.loads(
                bz2.decompress(base64.urlsafe_b64decode(
                    result['task_result'])).decode('utf-8'))

            if self.debug:
                print(
                    f'\t[p.{process_num}]\t📥 Going to store result for {str(target)[:30]}'
                )

            # store_result also handles task queue mangement
            store_result = self.store_result({
                'target': target,
                'task': task,
                'task_result': task_result,
                'client_id': client_id,
                'client_ip': client_ip,
                'db_name': mapped_db
            })

            # we finished processing this result, remove it from result queue
            server_sql_driver.remove_result_from_queue(result_id)

            # FYI
            if store_result['success'] == True:
                print('\t[p.%s]\t👍 Success: %s' % (process_num, target[:50]))
            else:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], store_result['result']))

        # techincally we never get here...
        server_sql_driver.close()
        return

    # store_results_from_queue

    def run(self, task='process_tasks_from_queue', pool_size=None):
        """
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files), and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function  and executed in parallell

		pool_size is defined in the run_webxray.py file, see details there

		when running in slave mode the list is skipping and we got straight to scanning
		"""

        if task == 'process_tasks_from_queue':
            # set up sql connection to get queue_length
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            queue_length = sql_driver.get_task_queue_length()
            sql_driver.close()
            del sql_driver

            print('\t----------------------------------')
            print('\t%s addresses will now be webXray\'d' % queue_length)
            print('\t\t...you can go take a walk. ;-)')
            print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	this is essentially voodoo to ward off evil spirits which
        #	appear when large pool sizes are used on macOS
        # get_start_method must be set to 'allow_none', otherwise upon
        #	checking the method it gets set (!) - and if we then get/set again
        #	we get an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')
        myPool = multiprocessing.Pool(pool_size)

        # map requires we pass an argument to the function
        #	(even though we don't need to), so we create
        #	a list equal to pool_size which will
        #	spawn the desired number of processes
        process_num = []
        if pool_size == None:
            pool_size = multiprocessing.cpu_count()

        for i in range(0, pool_size):
            process_num.append(i)

        if task == 'process_tasks_from_queue':
            myPool.map(self.process_tasks_from_queue, process_num)
        elif task == 'store_results_from_queue':
            myPool.map(self.store_results_from_queue, process_num)
# Beispiel #11 (example separator left over from source aggregation)
# 0
    def __init__(self,
                 db_name,
                 db_engine,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_domain_owners=True,
                 start_date=False,
                 end_date=False):
        """
		Set up a reporting session.

		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- if we want to do per-tld reports, figures out the most common

		Args:
			db_name:             name of the database to report on
			db_engine:           'sqlite' or 'postgres'; any other value
									prints an error and quits
			num_tlds:            if truthy, how many of the most common
									tlds to build per-tld sub-reports for
			num_results:         number of rows to include in ranked reports
			tracker_threshold:   optional tracker-filter depth used by
									report generators
			flush_domain_owners: NOTE(review): not referenced in this
									method body — confirm where it is used
			start_date:          NOTE(review): not referenced in this
									method body — confirm
			end_date:            NOTE(review): not referenced in this
									method body — confirm
		"""

        # set various global vars
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection; the import is deferred so only
        #	the selected driver's dependencies are required
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # creates a new directory if it doesn't exist already
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        #	them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                if tld: print('\t\t |- %s' % tld)
        else:
            # a single None entry means "no tld filter" to the
            #	report generators
            self.top_tlds = [None]
# Beispiel #12 (example separator left over from source aggregation)
# 0
class Reporter:
    """
	Manages the production of a number of CSV reports.
	"""
    def __init__(self,
                 db_name,
                 db_engine,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_domain_owners=True,
                 start_date=False,
                 end_date=False):
        """
		Set up a reporting session.

		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- if we want to do per-tld reports, figures out the most common

		Args:
			db_name:             name of the database to report on
			db_engine:           'sqlite' or 'postgres'; any other value
									prints an error and quits
			num_tlds:            if truthy, how many of the most common
									tlds to build per-tld sub-reports for
			num_results:         number of rows to include in ranked reports
			tracker_threshold:   optional tracker-filter depth used by
									report generators
			flush_domain_owners: NOTE(review): not referenced in this
									method body — confirm where it is used
			start_date:          NOTE(review): not referenced in this
									method body — confirm
			end_date:            NOTE(review): not referenced in this
									method body — confirm
		"""

        # set various global vars
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection; the import is deferred so only
        #	the selected driver's dependencies are required
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # creates a new directory if it doesn't exist already
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        #	them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                if tld: print('\t\t |- %s' % tld)
        else:
            # a single None entry means "no tld filter" to the
            #	report generators
            self.top_tlds = [None]

    # __init__

    #####################
    # REPORT GENERATORS #
    #####################

    def generate_db_summary_report(self, print_to_cli=True):
        """
		outputs and stores report of basic data about how many records in db, etc.
		"""
        print('\t================')
        print('\t General Summary')
        print('\t================')

        # get the relevant db summary data
        db_summary = self.analyzer.get_db_summary()

        # print to cli
        if print_to_cli:
            print("\t\tTotal Crawls:\t\t\t%s" % db_summary['total_crawls_ok'])
            print("\t\tTotal Pages:\t\t\t%s" % db_summary['total_pages_ok'])
            print("\t\tTotal Tasks Fail:\t\t%s" %
                  db_summary['total_tasks_fail'])
            print("\t\tTotal Tasks Attempted:\t\t%s" %
                  db_summary['total_tasks_attempted'])
            print("\t\t%% Pages OK:\t\t\t%.2f%%" %
                  db_summary['percent_tasks_ok'])
            print("\t\tTotal Errors:\t\t\t%s" % db_summary['total_errors'])
            print("\t\tTotal Cookies:\t\t\t%s" % db_summary['total_cookies'])
            print("\t\tTotal 3P Cookies:\t\t%s" %
                  db_summary['total_3p_cookies'])
            print("\t\tTotal Dom Storage:\t\t%s" %
                  db_summary['total_dom_storage'])
            print("\t\tTotal Websockets:\t\t%s" %
                  db_summary['total_websockets'])
            print("\t\tTotal Websocket Events:\t\t%s" %
                  db_summary['total_websocket_events'])
            print("\t\tTotal Requests:\t\t\t%s" % db_summary['total_requests'])
            print("\t\tTotal Responses:\t\t%s" % db_summary['total_responses'])
            print('\t\t%% Requests Received:\t\t%.2f%%' %
                  db_summary['percent_requests_received'])
            print("\t\t3P Requests:\t\t\t%s" % db_summary['total_3p_requests'])
            print("\t\t3P Responses:\t\t\t%s" %
                  db_summary['total_3p_responses'])
            print('\t\t%% 3P Requests Received:\t\t%.2f%%' %
                  db_summary['percent_3p_requests_received'])
            print('\t\t' + '-' * 40)

        # write results to csv
        csv_rows = []
        csv_rows.append(('total_crawls_ok', db_summary['total_crawls_ok']))
        csv_rows.append(('total_pages_ok', db_summary['total_pages_ok']))
        csv_rows.append(('total_tasks_fail', db_summary['total_tasks_fail']))
        csv_rows.append(
            ('total_tasks_attempted', db_summary['total_tasks_attempted']))
        csv_rows.append(('percent_pages_ok', db_summary['percent_tasks_ok']))
        csv_rows.append(('total_errors', db_summary['total_errors']))
        csv_rows.append(('total_cookies', db_summary['total_cookies']))
        csv_rows.append(('total_3p_cookies', db_summary['total_3p_cookies']))
        csv_rows.append(('total_dom_storage', db_summary['total_dom_storage']))
        csv_rows.append(('total_websockets', db_summary['total_websockets']))
        csv_rows.append(
            ('total_websocket_events', db_summary['total_websocket_events']))
        csv_rows.append(('total_requests', db_summary['total_requests']))
        csv_rows.append(('total_responses', db_summary['total_responses']))
        csv_rows.append(('percent_requests_received',
                         db_summary['percent_requests_received']))
        csv_rows.append(('total_3p_requests', db_summary['total_3p_requests']))
        csv_rows.append(
            ('total_3p_responses', db_summary['total_3p_responses']))
        csv_rows.append(('percent_3p_requests_received',
                         db_summary['percent_3p_requests_received']))

        self.utilities.write_csv(self.report_path, 'db_summary.csv', csv_rows)

    # generate_db_summary_report

    def generate_stats_report(self):
        """
		High level stats
		"""
        print('\t=============================')
        print('\t Processing High-Level Stats ')
        print('\t=============================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            if tld_filter:
                stats = self.analyzer.get_high_level_stats(tld_filter)
            else:
                stats = self.analyzer.get_high_level_stats()

            if self.tracker_threshold:
                filter_depth = self.tracker_threshold
            else:
                filter_depth = 'no_filter_used'

            csv_rows.append(('n_pages', stats['total_pages']))
            csv_rows.append(('n_crawls', stats['total_crawls']))
            csv_rows.append(('%_pages_ssl', stats['percent_pages_ssl']))
            csv_rows.append(
                ('n_requests_received', stats['total_requests_received']))
            csv_rows.append(
                ('%_requests_received_ssl', stats['percent_requests_ssl']))
            csv_rows.append(('n_1p_requests_received',
                             stats['total_requests_received_1p']))
            csv_rows.append(('%_1p_requests_received_ssl',
                             stats['percent_1p_requests_ssl']))
            csv_rows.append(('n_3p_requests_received',
                             stats['total_requests_received_3p']))
            csv_rows.append(('%_3p_requests_received_ssl',
                             stats['percent_3p_requests_ssl']))
            csv_rows.append(
                ('average_page_load_time', stats['average_page_load_time']))
            csv_rows.append(('%_w/3p_request', stats['percent_w_3p_request']))
            csv_rows.append(('%_w/3p_cookie', stats['percent_w_3p_cookie']))
            csv_rows.append(('%_w/3p_script', stats['percent_w_3p_script']))
            csv_rows.append(('mean_3p_domains', stats['3p_domains_mean']))
            csv_rows.append(('median_3p_domains', stats['3p_domains_median']))
            csv_rows.append(('mode_3p_domains', stats['3p_domains_mode']))
            csv_rows.append(('mean_3p_cookies', stats['3p_cookies_mean']))
            csv_rows.append(('median_3p_cookies', stats['3p_cookies_median']))
            csv_rows.append(('mode_3p_cookies', stats['3p_cookies_mode']))

            if tld_filter:
                self.utilities.write_csv(self.report_path,
                                         tld_filter + '-stats.csv', csv_rows)
            else:
                self.utilities.write_csv(self.report_path, 'stats.csv',
                                         csv_rows)

    # generate_stats_report

    def generate_aggregated_tracking_attribution_report(self):
        """
		generates ranked list of which entities collect data 
			from the greatest number of crawls ('aggregated_tracking_attribution.csv')

		- entities which have subsidiaries are ranked according 
			to the crawls their subsidiaries get data from as well
		- however, parent entities only get one hit on 
			a crawl which has multiple subsidiaries present
		- for example, if a crawl has 'google analytics' and 'doubleclick' 
			that is only one hit for 'google'
		"""
        print('\t======================================')
        print('\t Processing Aggregated Tracking Report ')
        print('\t======================================')

        for tld_filter in self.top_tlds:
            csv_rows = []

            # write out data to csv
            for item in self.analyzer.get_aggregated_tracking_attribution(
                    tld_filter):
                csv_rows.append(
                    (item['percent_crawls'], item['owner_name'],
                     item['owner_country'],
                     self.utilities.get_domain_owner_lineage_combined_string(
                         item['owner_id'])))

            # we want to first sort by owner name and then by percentage
            #	 to account for cases where two owners have the same percentage value
            csv_rows.sort(key=lambda x: x[1].lower())
            csv_rows.sort(key=lambda x: x[0], reverse=True)

            # insert header row after sort
            csv_rows.insert(0, ('percentage_crawls_tracked', 'owner',
                                'owner_country', 'owner_lineage'))

            # write out csv with tld prefix if applicable
            if tld_filter:
                self.utilities.write_csv(
                    self.report_path,
                    tld_filter + '-aggregated_tracking_attribution.csv',
                    csv_rows)
            else:
                self.utilities.write_csv(
                    self.report_path, 'aggregated_tracking_attribution.csv',
                    csv_rows)

    # generate_aggregated_tracking_attribution_report

    def generate_aggregated_3p_ssl_use_report(self):
        """
        Write a csv giving, for each third-party owner, the percentage of
        requests made to that owner which were encrypted.  One file is
        written per entry in self.top_tlds.
        """

        print('\t=========================================')
        print('\t Processing Aggregated 3P SSL Use Report ')
        print('\t=========================================')

        for tld_filter in self.top_tlds:
            # one row per owner: (pct encrypted, name, country, lineage)
            report_rows = [
                (record['ssl_use'], record['owner_name'],
                 record['owner_country'],
                 self.utilities.get_domain_owner_lineage_combined_string(
                     record['owner_id']))
                for record in self.analyzer.get_aggregated_3p_ssl_use(tld_filter)
            ]

            # two stable sorts: owner name breaks ties left over from the
            # descending sort on the percentage value
            report_rows.sort(key=lambda row: row[1].lower())
            report_rows.sort(key=lambda row: row[0], reverse=True)

            # header goes in after sorting so it stays on top
            report_rows.insert(0, ('percent_requests_encrypted', 'owner',
                                   'owner_country', 'owner_lineage'))

            # prefix the file name with the tld when filtering
            if tld_filter:
                file_name = tld_filter + '-3p_ssl_use.csv'
            else:
                file_name = '3p_ssl_use.csv'
            self.utilities.write_csv(self.report_path, file_name, report_rows)

    # generate_aggregated_3p_ssl_use_report

    def generate_3p_domain_report(self):
        """
        Write a csv of the most commonly occurring third-party domains,
        one file per entry in self.top_tlds.
        """
        print('\t==============================')
        print('\t Processing 3P Domains Report ')
        print('\t==============================')

        for tld_filter in self.top_tlds:
            report_rows = [('percent_total', 'domain', 'owner',
                            'owner_country', 'owner_lineage')]

            # the analyzer returns a full list, slice to the result limit
            domain_percentages = self.analyzer.get_3p_domain_percentages(
                tld_filter)
            for record in domain_percentages[:self.num_results]:
                # only domains with a known owner get a lineage string
                owner_id = record['owner_id']
                if owner_id is not None:
                    lineage = self.utilities.get_domain_owner_lineage_combined_string(
                        owner_id)
                else:
                    lineage = None

                report_rows.append((record['percent_crawls'],
                                    record['domain'],
                                    record['owner_name'],
                                    record['owner_country'],
                                    lineage))

            # prefix the file name with the tld when filtering
            if tld_filter:
                file_name = tld_filter + '-3p_domains.csv'
            else:
                file_name = '3p_domains.csv'
            self.utilities.write_csv(self.report_path, file_name, report_rows)

    # generate_3p_domain_report

    def generate_3p_request_report(self, request_type=None):
        """
        Write a csv of the most common third-party requests, optionally
        restricted to a single request type (eg 'script').  One file is
        written per entry in self.top_tlds.
        """
        if request_type == 'script':
            print('\t=============================')
            print('\t Processing 3P Script Report ')
            print('\t=============================')
        else:
            print('\t==============================')
            print('\t Processing 3P Request Report ')
            print('\t==============================')

        for tld_filter in self.top_tlds:
            report_rows = [('percent_total', 'request', 'type', 'domain',
                            'owner', 'owner_country', 'owner_lineage')]

            # the analyzer returns a full list, slice to the result limit
            request_percentages = self.analyzer.get_3p_request_percentages(
                tld_filter, request_type)
            for record in request_percentages[:self.num_results]:
                # only requests with a known owner get a lineage string
                owner_id = record['request_owner_id']
                if owner_id is not None:
                    lineage = self.utilities.get_domain_owner_lineage_combined_string(
                        owner_id)
                else:
                    lineage = None

                report_rows.append((record['percent_crawls'],
                                    record['request_url'],
                                    record['request_type'],
                                    record['request_domain'],
                                    record['request_owner_name'],
                                    record['request_owner_country'],
                                    lineage))

            # compose the file name, prefixed with the tld when filtering
            if request_type:
                file_name = '3p_' + request_type + '.csv'
            else:
                file_name = '3p_request.csv'
            if tld_filter:
                file_name = tld_filter + '-' + file_name
            self.utilities.write_csv(self.report_path, file_name, report_rows)

    # generate_3p_request_report

    def generate_data_transfer_report(self):
        """
        Write three csv reports on how much data was transferred:

            summary:    total bytes, split first- vs third-party
            by_domain:  bytes per response domain, ranked
            aggregated: bytes per domain owner, including each owner's
                        full corporate lineage

        One set of files is written per entry in self.top_tlds.
        """

        print('\t==================================')
        print('\t Processing Data Transfer Reports ')
        print('\t==================================')

        for tld_filter in self.top_tlds:
            # set up file names, prefixed with the tld when filtering
            if tld_filter:
                summary_file_name = tld_filter + '-data_xfer_summary.csv'
                domain_file_name = tld_filter + '-data_xfer_by_domain.csv'
                aggregated_file_name = tld_filter + '-data_xfer_aggregated.csv'
            else:
                summary_file_name = 'data_xfer_summary.csv'
                domain_file_name = 'data_xfer_by_domain.csv'
                aggregated_file_name = 'data_xfer_aggregated.csv'

            # tuples of (response_domain, size, is_3p (boolean), domain_owner_id)
            # NOTE(review): this query takes no tld_filter argument, so the
            #   same data is fetched for every tld — confirm that is intended
            response_sizes = self.sql_driver.get_response_sizes()

            # running byte totals
            first_party_data = 0
            third_party_data = 0
            total_data = 0

            # Counter returns 0 for missing keys, so we can increment
            # directly instead of testing membership first
            domain_data = collections.Counter()
            owner_data = collections.Counter()

            # process each row
            for response_domain, response_size, response_is_3p, domain_owner_id in response_sizes:
                # this is the measure of all data downloaded
                total_data += response_size

                # measures for third and first party data
                if response_is_3p:
                    third_party_data += response_size
                else:
                    first_party_data += response_size

                # data by domain
                domain_data[response_domain] += response_size

                # credit the owner and every parent in its lineage,
                # only if we know the owner
                if domain_owner_id:
                    for lineage_id in self.utilities.get_domain_owner_lineage_ids(
                            domain_owner_id):
                        owner_data[lineage_id] += response_size

            # avoid divide-by-zero below
            if total_data == 0:
                print('\t\tTotal data is zero, no report')
                return

            # summary report
            summary_data_csv = [
                ('party', 'percent_total', 'data_transfered_bytes'),
                ('all', '100', total_data),
                ('First',
                 round((first_party_data / total_data) * 100,
                       self.num_decimals), first_party_data),
                ('Third',
                 round((third_party_data / total_data) * 100,
                       self.num_decimals), third_party_data),
            ]
            self.utilities.write_csv(self.report_path, summary_file_name,
                                     summary_data_csv)

            # ranked per-domain report; sort by name first so ties on size
            # come out in a stable order after the descending size sort
            ranked_domains = sorted(domain_data.items())
            ranked_domains.sort(reverse=True, key=lambda item: item[1])

            domain_data_csv = [('percent_total', 'domain',
                                'data_transfered_bytes')]
            # if num_results is None we get everything, otherwise stops at limit
            for domain, size in ranked_domains[:self.num_results]:
                domain_data_csv.append(
                    (round((size / total_data) * 100,
                           self.num_decimals), domain, size))
            self.utilities.write_csv(self.report_path, domain_file_name,
                                     domain_data_csv)

            # aggregated per-owner report, all known owners
            owner_data = self.utilities.get_most_common_sorted(owner_data)
            owner_data_csv = [('percent_total', 'owner', 'owner_country',
                               'owner_lineage', 'data_transfered_bytes')]
            for owner_id, size in owner_data:
                owner_data_csv.append(
                    (round((size / total_data) * 100, self.num_decimals),
                     self.domain_owners[owner_id]['owner_name'],
                     self.domain_owners[owner_id]['country'],
                     self.utilities.get_domain_owner_lineage_combined_string(
                         owner_id), size))
            self.utilities.write_csv(self.report_path, aggregated_file_name,
                                     owner_data_csv)

    # generate_data_transfer_report

    def generate_use_report(self):
        """
        Write a csv detailing, per use category, what percentage of crawls
        include third-party content for that use, the average number of
        occurrences per page, and the percentage of such uses involving a
        cookie or ssl.  One file per entry in self.top_tlds.
        """

        print('\t==========================')
        print('\t Processing 3P Use Report ')
        print('\t==========================')

        for tld_filter in self.top_tlds:
            use_data = self.analyzer.get_3p_use_data(tld_filter)
            all_uses = use_data['all_uses']
            percentage_by_use = use_data['percentage_by_use']
            occurances_per_page = use_data['average_use_occurance_per_crawl']
            use_w_cookie = use_data['percentage_use_w_cookie']
            use_ssl = use_data['percentage_use_ssl']

            report_rows = [('use_category', 'percent_crawls_w_use',
                            'ave_occurances_per_page',
                            'percentage_of_use_w_cookie',
                            'percentage_of_use_ssl')]
            for use in sorted(all_uses):
                # a None percentage means no data for this use category
                if percentage_by_use[use] is None:
                    report_rows.append((use, None, None, None, None))
                else:
                    report_rows.append((use, percentage_by_use[use],
                                        occurances_per_page[use],
                                        use_w_cookie[use], use_ssl[use]))

            # prefix the file name with the tld when filtering
            if tld_filter:
                file_name = tld_filter + '-3p_uses.csv'
            else:
                file_name = '3p_uses.csv'
            self.utilities.write_csv(self.report_path, file_name, report_rows)

    # generate_use_report

    def generate_per_page_network_report(self):
        """
        Write a per-page edge list for graph/network analysis: each row
        links a page to one third-party request domain and its owner.
        """

        print('\t====================================')
        print('\t Processing Per-Page Network Report ')
        print('\t====================================')

        # header row first, then one row per record
        report_rows = [('page_start_url', 'page_final_url', 'page_accessed',
                        '3p_request_domain', '3p_domain_owner',
                        '3p_domain_owner_country')]

        for record in self.analyzer.get_page_to_3p_network():
            report_rows.append((record['page_start_url'],
                                record['page_final_url'],
                                record['page_accessed'],
                                record['request_domain'],
                                record['request_owner_name'],
                                record['request_owner_country']))

        self.utilities.write_csv(self.report_path,
                                 'per_page_network_report.csv', report_rows)

    # generate_per_page_network_report

    def generate_per_site_network_report(self):
        """
        Write a per-site edge list for graph/network analysis: page
        domains linked to the request domains/owners they connect to,
        aggregated across all pages of each site.
        """

        print('\t================================')
        print('\t Processing Site Network Report ')
        print('\t================================')

        # header row first, then one row per record
        report_rows = [('page_domain', '3p_request_domain', '3p_domain_owner',
                        '3p_domain_owner_country')]

        for record in self.analyzer.get_site_to_3p_network():
            report_rows.append((record['page_domain'],
                                record['request_domain'],
                                record['request_owner_name'],
                                record['request_owner_country']))

        self.utilities.write_csv(self.report_path,
                                 'per_site_network_report.csv', report_rows)

    # generate_per_site_network_report

    def generate_all_pages_request_dump(self):
        """
        Write a full dump of requests loaded by all pages across all load
        times; the analyzer decides the scope (3p only by default per the
        original notes — confirm against get_all_pages_requests).
        """

        print('\t===================================')
        print('\t Processing All Pages request Dump ')
        print('\t===================================')

        # header row first, then one row per record
        report_rows = [('accessed', 'start_url', 'final_url', 'request_url',
                        'request_domain', 'domain_owner')]

        for record in self.analyzer.get_all_pages_requests():
            report_rows.append((record['accessed'], record['start_url'],
                                record['final_url'], record['request_url'],
                                record['request_domain'],
                                record['request_domain_owner']))

        self.utilities.write_csv(self.report_path,
                                 'all_pages_request_dump.csv', report_rows)

    # generate_all_pages_request_dump

    def generate_all_pages_cookie_dump(self):
        """
        Write a full dump of cookies loaded by all pages across all load
        times; the analyzer decides the scope (1p and 3p by default per
        the original notes — confirm against get_all_pages_cookies).
        """

        print('\t==================================')
        print('\t Processing All Pages Cookie Dump ')
        print('\t==================================')

        # header row first, then one row per record
        report_rows = [('accessed', 'start_url', 'final_url', 'cookie_domain',
                        'cookie_owner', 'cookie_name', 'cookie_value')]

        for record in self.analyzer.get_all_pages_cookies():
            report_rows.append((record['accessed'], record['start_url'],
                                record['final_url'], record['cookie_domain'],
                                record['cookie_owner'], record['cookie_name'],
                                record['cookie_value']))

        self.utilities.write_csv(self.report_path, 'all_pages_cookie_dump.csv',
                                 report_rows)

    # generate_all_pages_cookie_dump

    def generate_site_host_report(self):
        """
        First update the domain table with the owners of the various ip
        addresses (mapping pages to hosts), then write two reports: a
        site-domain-to-host network and an aggregate of host ownership.
        """
        print('\t=====================')
        print('\t Updating Site Hosts ')
        print('\t=====================')

        self.analyzer.update_site_hosts()

        print('\t==============================')
        print('\t Generating Site Host Network ')
        print('\t==============================')

        site_host_data = self.analyzer.get_site_host_network()

        # nothing to report on
        if not site_host_data:
            print('\t\tNo site host data, skipping report.')
            return

        # header row first, then one row per record
        network_rows = [('page_domain', 'host_name')]
        for record in site_host_data:
            network_rows.append((record['site_domain'], record['host_name']))

        self.utilities.write_csv(self.report_path, 'site_hosts-network.csv',
                                 network_rows)

        print('\t============================================')
        print('\t Generating Aggregate Host Ownership Report ')
        print('\t============================================')

        # rows come back as single-item tuples, unpack as we collect
        owner_occurances = [owner for owner, in self.sql_driver.get_ip_owners()]

        aggregate_rows = [('owner', 'percent_sites_w_owner')]
        for owner, count in self.utilities.get_most_common_sorted(
                owner_occurances):
            aggregate_rows.append((owner,
                                   100 * (count / len(owner_occurances))))

        self.utilities.write_csv(self.report_path, 'site_hosts-aggregated.csv',
                                 aggregate_rows)

    # generate_site_host_report

    ##############
    # POLICYXRAY #
    ##############

    def initialize_policy_reports(self):
        """
        Run the pre-production steps shared by the policy reports:
        update the 3p domain disclosure data, then build the list of
        policy types and their counts in self.policy_types.
        """

        print('\t====================================')
        print('\t Updating 3p Domain Disclosure Data ')
        print('\t====================================')

        self.analyzer.update_crawl_disclosure()

        print('\t\t...done!')

        print('\t======================================')
        print('\t Getting Policy Types List and Counts ')
        print('\t======================================')

        # a type of None means "all policies"
        self.policy_types = [{
            'type': None,
            'count': self.analyzer.get_policy_count()
        }]

        # rows come back as single-item tuples
        for policy_type, in self.sql_driver.get_available_policy_types():
            self.policy_types.append({
                'type': policy_type,
                'count': self.analyzer.get_policy_count(
                    policy_type=policy_type)
            })

        print('\t\t...done!')

    # initialize_policy_reports

    def generate_policy_summary_report(self):
        """
        Write a csv summarizing each policy type: number of policies,
        average word count, readability scores ('ave_fkg'/'ave_fre' from
        the analyzer), and the percentage of 3p domains disclosed.
        """
        print('\t==================================')
        print('\t Generating Policy Summary Report ')
        print('\t==================================')

        # header row
        report_rows = [('Type', 'N', 'Word Count', 'FK Grade', 'FRE',
                        '% 3P Disclosed')]

        # get results for each policy_type
        for policy_type in self.policy_types:
            # label the catch-all None type as 'all' for readability
            if policy_type['type'] is None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type,
                  end='',
                  flush=True)

            # fetch results
            readability_scores = self.analyzer.get_readability_scores(
                policy_type=policy_type['type'])

            report_rows.append((
                this_policy_type,
                policy_type['count'],
                self.analyzer.get_average_policy_word_count(
                    policy_type=policy_type['type']),
                readability_scores['ave_fkg'],
                readability_scores['ave_fre'],
                self.analyzer.get_percent_crawl_3p_domains_disclosed(
                    policy_type=policy_type['type'])))
            print('done!')

        self.utilities.write_csv(self.report_path, 'policy-summary.csv',
                                 report_rows)

    # generate_policy_summary_report

    def generate_policy_owner_disclosure_reports(self):
        """
        Write a csv reporting, per domain owner, how often the owner's
        domains occur, how often they are disclosed in policies, and the
        resulting percentage disclosed.

        Note: a stale placeholder header row that was built and then
        immediately overwritten has been removed (dead code).
        """

        print('\t======================================')
        print('\t Generating Company Disclosure Report ')
        print('\t======================================')

        print('\t\tProcessing ...', end='', flush=True)

        # dict keyed by owner; values are indexable as
        # [0] total occurances, [1] total disclosures, [2] percent disclosed
        company_results = self.analyzer.get_disclosure_by_request_owner()

        csv_rows = [('Domain Owner', 'Total Occurances', 'Total Disclosures',
                     'Percent Disclosed')]
        for item in company_results:
            csv_rows.append(
                (item, company_results[item][0], company_results[item][1],
                 round(company_results[item][2], 2)))

        print('done!')
        self.utilities.write_csv(self.report_path,
                                 'policy-owner_disclosure.csv', csv_rows)

    # generate_policy_owner_disclosure_reports

    def generate_policy_gdpr_report(self):
        """
        Report the percentage of each policy type containing the special
        category ("article 9") terms from the GDPR.
        """

        print('\t==============================')
        print('\t Generating GDPR Term Report ')
        print('\t==============================')

        gdpr_terms = [
            'racial or ethnic origin', 'political opinions',
            'religious or philosophical beliefs', 'trade union membership',
            'genetic data', 'biometric data', 'data concerning health',
            'sex life', 'sexual orientation'
        ]

        self.generate_terms_report('policy-gdpr_terms.csv', gdpr_terms)

    # generate_policy_gdpr_report

    def generate_policy_pacification_report(self):
        """
        Report the percentage of each policy type containing reassuring
        "pacification" phrases (eg "we value", "we respect").
        """

        print('\t=====================================')
        print('\t Generating Pacification Term Report ')
        print('\t=====================================')

        pacification_terms = [
            'we value', 'we respect', 'important to us', 'help you', 'we care',
            'committed to protecting', 'cares about', 'transparency'
        ]

        self.generate_terms_report('policy-pacification_terms.csv',
                                   pacification_terms)

    # generate_policy_pacification_report

    def generate_policy_pii_report(self):
        """
        Report the percentage of each policy type containing common terms
        for personally identifiable information (pii).
        """

        print('\t============================')
        print('\t Generating PII Term Report ')
        print('\t============================')

        pii_terms = [
            'ip address', 'internet protocol address', 'browser type',
            'operating system'
        ]

        self.generate_terms_report('policy-pii_terms.csv', pii_terms)

    # generate_policy_pii_report

    def generate_terms_report(self, report_name, term_list):
        """
        Generic helper for reporting how often terms appear in policies.
        Writes one row per policy type: the percentage of policies
        containing any of the terms, then one column per individual term.
        """

        # header: type, the any-term column, then one column per term
        csv_rows = [('Type', 'any term') + tuple(term_list)]

        # get results for each policy_type
        for policy_type in self.policy_types:
            # label the catch-all None type as 'all' for readability
            if policy_type['type'] is None:
                this_policy_type = 'all'
            else:
                this_policy_type = policy_type['type']

            print('\t\tProcessing %s...' % this_policy_type,
                  end='',
                  flush=True)

            # percentage of policies matching any term in the list
            this_row = (this_policy_type,
                        self.analyzer.get_terms_percentage(
                            term_list,
                            policy_type=policy_type['type'],
                            policy_type_count=policy_type['count']))

            # then each term individually
            for term in term_list:
                this_row += (self.analyzer.get_terms_percentage(
                    [term],
                    policy_type=policy_type['type'],
                    policy_type_count=policy_type['count']), )

            csv_rows.append(this_row)
            print('done!')

        self.utilities.write_csv(self.report_path, report_name, csv_rows)
Beispiel #13
0
class OutputStore:
    """
    This class receives data from the browser, processes it, and stores it in the db.

    A fresh sql driver is opened for each call to store() so a single
    instance can be reused across pages.
    """

    # lists of common extensions used to classify requests, can be expanded
    IMAGE_EXTENSIONS = ('png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp',
                        'tif', 'tiff', 'webp', 'srf')
    SCRIPT_EXTENSIONS = ('js', 'javascript')
    DATA_EXTENSIONS = ('json', 'jsonp', 'xml')
    FONT_EXTENSIONS = ('woff', 'ttf', 'otf')
    STATIC_EXTENSIONS = ('html', 'htm', 'shtml')
    DYNAMIC_EXTENSIONS = ('php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi',
                          'fcgi')

    def __init__(self, db_engine, db_name):
        # db connection details; drivers are opened lazily in store()
        self.db_engine = db_engine
        self.db_name = db_name
        self.utilities = Utilities()
        self.url_parser = ParseURL()

    # __init__

    def _get_sql_driver(self):
        """
        Open and return a new sql driver for the configured engine,
            exiting the process on an unknown engine.
        """
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            return MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            return SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            return PostgreSQLDriver(self.db_name)
        else:
            # bug fix: original read the undefined global 'db_engine'
            #   (NameError) and misspelled 'INVALID'
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            exit()

    # _get_sql_driver

    def store(self,
              url,
              browser_output,
              store_source=False,
              store_1p=True,
              get_file_hashes=False,
              hash_3p_only=False):
        """
        this is the primary function of this class,

        it takes the url of the given page and the request and cookie data
            generated by the browser

        data is cleaned up with some minor analysis (eg file types) and stored
            for later in-depth analysis.

        there is an option to store first party requests as well as third,
            turned on by default; to save disk space turn off store_1p

        there is also an option to get file hashes, this introduces serious
            overhead and is turned off by default

        Returns False when the page domain cannot be parsed, True otherwise.
        """

        # open up a sql connection
        sql_driver = self._get_sql_driver()

        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        origin_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

        # if we can't get page domain info we fail gracefully
        if origin_info is None:
            sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
            # bug fix: close the connection on this early exit too
            sql_driver.close()
            return False

        origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld = origin_info

        # sql_driver.add_domain both stores the new domain and returns its db row id
        # if it is already in db just return the existing id
        page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn,
                                               origin_domain, origin_pubsuffix,
                                               origin_tld)

        # locate the privacy policy link; both values are None if not found
        priv_policy_url, priv_policy_url_text = self._find_privacy_policy_link(
            url, browser_output['all_links'])

        # if the final page is https (often after a redirect), mark it appropriately
        page_is_ssl = browser_output['final_url'][:5] == 'https'

        if store_source:
            # handles issue where postgres will crash on inserting null character
            source = browser_output['source'].replace('\x00', ' ')
        else:
            source = None

        # add page
        page_id = sql_driver.add_page(
            browser_output['browser_type'], browser_output['browser_version'],
            browser_output['browser_wait'], browser_output['title'],
            browser_output['meta_desc'], url, browser_output['final_url'],
            priv_policy_url, priv_policy_url_text, page_is_ssl, source,
            browser_output['load_time'], page_domain_id)

        # store cookies and requests
        self._store_cookies(sql_driver, url, page_id, origin_domain,
                            browser_output['cookies'], store_1p)
        self._store_requests(sql_driver, url, page_id, origin_domain,
                             browser_output['processed_requests'], store_1p,
                             get_file_hashes, hash_3p_only)

        # close db connection
        sql_driver.close()

        return True

    # store

    def _find_privacy_policy_link(self, url, all_links):
        """
        Scan the page links for a privacy policy link; links are checked
            in reverse order so footer links are seen first, as that is
            where policy links tend to be.

        Returns (priv_policy_url, priv_policy_url_text), both None if
            no policy link is found.
        """

        # read in our list of privacy link terms from the json file in
        #   webxray/resources/policyxray
        privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

        # links are (text, url) tuples
        for link_text, link_url in reversed(all_links):
            # makes sure we have text, skip links without
            if not link_text:
                continue

            # need lower for string matching
            link_text = link_text.lower().strip()

            # not a link we can use
            if 'javascript' in link_text:
                continue

            # see if the link_text is in our term list
            if link_text in privacy_policy_term_list:
                # if the link_url is relative this will convert to absolute
                return (self.utilities.get_absolute_url_from_page_link(
                    url, link_url), link_text)

        return (None, None)

    # _find_privacy_policy_link

    def _store_cookies(self, sql_driver, url, page_id, origin_domain, cookies,
                       store_1p):
        """
        Parse and store each cookie set by the page; third-party cookies
            are marked based on the page's origin_domain.
        """
        for cookie in cookies:
            # get the ip, fqdn, domain, pubsuffix, and tld
            # note: url_parser fails on non-http, so a lame hack is
            #   to prepend http://
            cookie_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                'http://' + cookie['domain'])

            # something went wrong, log and fail gracefully
            if cookie_info is None:
                sql_driver.log_error(
                    url,
                    'Error parsing cookie with domain: ' + cookie['domain'])
                continue

            cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld = cookie_info

            # mark third-party cookies
            is_3p_cookie = origin_domain != cookie_domain

            # this is a first party cookie, see if we want to store it
            if not is_3p_cookie and not store_1p:
                continue

            # sql_driver.add_domain both stores the new domain and returns its id
            cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn,
                                                     cookie_domain,
                                                     cookie_pubsuffix,
                                                     cookie_tld)

            # name is required, skip cookies without one; the remaining
            #   fields are optional and default to None when absent
            if 'name' not in cookie:
                continue

            # all done with this cookie
            sql_driver.add_cookie(page_id, cookie['name'],
                                  cookie.get('secure'), cookie.get('path'),
                                  cookie_domain, cookie.get('httponly'),
                                  cookie.get('expiry'), cookie.get('value'),
                                  is_3p_cookie, cookie_domain_id)

    # _store_cookies

    def _classify_extension(self, element_extension):
        """
        Map a file extension to a coarse element type, None if unknown.
        """
        if element_extension in self.IMAGE_EXTENSIONS:
            return 'image'
        elif element_extension in self.SCRIPT_EXTENSIONS:
            return 'javascript'
        elif element_extension in self.DATA_EXTENSIONS:
            return 'data_structured'
        elif element_extension == 'css':
            return 'style_sheet'
        elif element_extension in self.FONT_EXTENSIONS:
            return 'font'
        elif element_extension in self.STATIC_EXTENSIONS:
            return 'page_static'
        elif element_extension in self.DYNAMIC_EXTENSIONS:
            # bug fix: original compared the extension to the whole list
            #   with '==', so dynamic pages were never classified
            return 'page_dynamic'
        elif element_extension in ('swf', 'fla'):
            return 'Shockwave Flash'
        else:
            return None

    # _classify_extension

    def _page_domain_in_referer(self, sql_driver, url, referer, origin_domain):
        """
        Determine whether the page domain appears in the referer header of
            a request; returns None when there is no referer or it cannot
            be parsed.
        """
        if not referer:
            return None

        referer_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
            referer)

        if referer_info is None:
            sql_driver.log_error(url,
                                 'Error parsing referer header: ' + referer)
            return None

        # index 2 is the domain
        return referer_info[2] == origin_domain

    # _page_domain_in_referer

    def _get_file_hash(self, request, request_data, referer):
        """
        Re-fetch the element and return the md5 hex digest of the body,
            None on any failure.

        Uses the same ua/referer as the actual request so we are just
            replaying the last one to get a similar response; note we
            aren't sending the same cookies so that could be an issue,
            otherwise it is equivalent to a page refresh in theory.
        """
        replay_element_request = urllib.request.Request(
            request,
            headers={
                'User-Agent': request_data['user_agent'],
                'Referer': referer,
                'Accept': '*/*'
            })
        try:
            return hashlib.md5(
                urllib.request.urlopen(replay_element_request,
                                       timeout=10).read()).hexdigest()
        except Exception:
            return None

    # _get_file_hash

    def _store_requests(self, sql_driver, url, page_id, origin_domain,
                        processed_requests, store_1p, get_file_hashes,
                        hash_3p_only):
        """
        Parse, classify, and store each request made by the page.
        """
        for request in processed_requests:
            # if the request starts with the following we can't parse anyway, so skip
            if re.match(r'^(data|about|chrome|blob).+', request):
                continue

            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if elements are third-party
            element_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                request)

            # problem with this request, log and fail gracefully
            if element_info is None:
                sql_driver.log_error(
                    url, 'Error parsing element request: ' + request)
                continue

            element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld = element_info

            # sql_driver.add_domain both stores the new domain and returns its db row id
            element_domain_id = sql_driver.add_domain(element_ip, element_fqdn,
                                                      element_domain,
                                                      element_pubsuffix,
                                                      element_tld)

            # mark third-party elements based on domain
            is_3p_element = origin_domain != element_domain

            # if we are not storing 1p elements continue
            if not is_3p_element and not store_1p:
                continue

            element_is_ssl = request[:5] == 'https'

            # optional per-request fields; absent keys become None
            request_data = processed_requests[request]
            received = request_data.get('received')
            referer = request_data.get('referer')
            start_time_offset = request_data.get('start_time_offset')
            load_time = request_data.get('load_time')
            status = request_data.get('status')
            status_text = request_data.get('status_text')
            content_type = request_data.get('content_type')
            body_size = request_data.get('body_size')

            # headers are stringified for db storage; stay None when absent
            request_headers = str(
                request_data['request_headers']
            ) if 'request_headers' in request_data else None
            response_headers = str(
                request_data['response_headers']
            ) if 'response_headers' in request_data else None

            # determine if page domain was leaked by the referer header
            page_domain_in_referer = self._page_domain_in_referer(
                sql_driver, url, referer, origin_domain)

            # consider anything before the "?" to be the element_url
            url_match = re.search(r'^(.+?)\?.+$', request)
            element_url = url_match.group(1) if url_match else request

            # consider anything after the "?" to be the args
            args_match = re.search(r'^.+(\?.+)$', request)
            element_args = args_match.group(1) if args_match else None

            # attempt to parse off the extension
            ext_match = re.search(r'\.([0-9A-Za-z]+)$', element_url)
            element_extension = ext_match.group(1).lower(
            ) if ext_match else None

            # figure out what type of element it is
            element_type = self._classify_extension(element_extension)

            # file hashing has non-trivial overhead, off by default;
            #   optionally hash only 3p elements
            if get_file_hashes and (is_3p_element or not hash_3p_only):
                file_md5 = self._get_file_hash(request, request_data, referer)
            else:
                file_md5 = None

            # store request
            sql_driver.add_element(
                page_id, request, element_url, is_3p_element, element_is_ssl,
                received, referer, page_domain_in_referer, start_time_offset,
                load_time, status, status_text, content_type, body_size,
                request_headers, response_headers, file_md5, element_extension,
                element_type, element_args, element_domain_id)

    # _store_requests
Beispiel #14
0
class Analyzer:
	"""
	This class performs analysis of our data.
	"""

	def __init__(self, db_name, db_engine, flush_domain_owners):
		"""
		Open the database connection and preload data that is
		reused across reports.
		"""

		# bail out early on an unsupported engine
		if db_engine not in ('sqlite', 'postgres'):
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		# set up global db connection
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(db_name)
		else:
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(db_name)

		# cached up front to minimize db calls later on
		self.total_pages = self.sql_driver.get_complex_page_count()
		self.total_crawls = self.sql_driver.get_crawl_count()

		# pass utilities the database info
		self.utilities = Utilities(db_name, db_engine)

		# initialize the domain owner dict
		self.domain_owners = self.utilities.get_domain_owner_dict()

		# optionally reload ownership data from disk into the db
		if flush_domain_owners:
			self.patch_domain_owners()

		# load to memory for faster processing, make sure you
		#	have enough RAM!
		self.get_crawl_id_to_3p_domain_info()
	# __init__

	def get_crawl_id_to_3p_domain_info(self):
		"""
		Many operations needed to access a mapping of crawl_ids to the
			domain name and domain_owner_ids of all types of data
			(requests, responses, cookies, and websockets).  To save
			db calls we set up a massive dictionary once to be reused
			later.
		"""

		print('\tFetching crawl 3p domain lookup info...', end='', flush=True)

		# this is a class global
		self.crawl_id_to_3p_domain_info = {}
		for crawl_id,domain,domain_owner_id in self.sql_driver.get_crawl_id_3p_domain_info():
			if crawl_id not in self.crawl_id_to_3p_domain_info:
				self.crawl_id_to_3p_domain_info[crawl_id] = [{'domain':domain,'owner_id':domain_owner_id}]
			else:
				self.crawl_id_to_3p_domain_info[crawl_id] = self.crawl_id_to_3p_domain_info[crawl_id] + [{'domain':domain,'owner_id':domain_owner_id}]

		print('done!')
	# get_crawl_id_to_3p_domain_info

	def patch_domain_owners(self):
		"""
		in order to analyze what entities receive user data, we need to update
		  the database with domain ownership records we have stored previously
		"""

		# we first clear out what is in the db in case the new data has changed,
		# 	on big dbs takes a while
		print('\tFlushing extant domain owner data...', end='', flush=True)
		self.sql_driver.reset_domain_owners()
		print('done!')

		# next we pull the owner/domain pairings from the json file in
		# 	the resources dir and add to the db
		print('\tPatching with new domain owner data...', end='', flush=True)

		# bug fix: use a context manager so the file handle is closed
		#	(the original json.load(open(...)) leaked it)
		owner_file_path = os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json'
		with open(owner_file_path, 'r', encoding='utf-8') as owner_file:
			domain_owner_data = json.load(owner_file)

		for item in domain_owner_data:
			# skipping for now, but perhaps find a way to enter this in db?
			if 'revision_date' in item: continue

			# convert lists to strings for db storage
			for list_field in ('aliases', 'site_privacy_policy_urls',
				'service_privacy_policy_urls', 'gdpr_statement_urls',
				'terms_of_use_urls', 'platforms', 'uses'):
				item[list_field] = json.dumps(item[list_field])

			self.sql_driver.add_domain_owner(item)

			for domain in item['domains']:
				self.sql_driver.update_domain_owner(item['id'], domain)

		# update the domain owner dict
		self.domain_owners = self.utilities.get_domain_owner_dict()

		print('done!')
	# patch_domain_owners

	def get_top_tlds(self, limit):
		"""
		finds the most common tlds from all the pages
		type is default to tld, but pubsuffix also works

		returns list of tlds
		"""

		# first we put all the tlds for each page into a list
		tlds = []
		for row in self.sql_driver.get_all_tlds():
			tlds.append(row[0])

		# use this to hold the top tlds
		# it starts with "None" as that means we process all the pages
		top_tlds = [None]

		# set up a global var which has the counts for each tld
		self.page_counts_by_tld = {}
		
		# cut the list to the limit to return only top tlds
		for tld,count in collections.Counter(tlds).most_common()[0:limit]:
			top_tlds.append(tld)
			self.page_counts_by_tld[tld] = count
		
		return top_tlds
	# get_top_tlds

	def get_per_crawl_3p_domain_counts(self, tld_filter = None):
		"""
		determines basic stats for the number of 3p domains contacted per-crawl
		
		note this is distinct domain+pubsuffix, not fqdns (e.g. 'sub.example.com' 
			and sub2.example.com' only count as 'example.com')
		"""

		# now we determine the number of domains each page is connected to by looking at len of list of 3p domains
		per_crawl_3p_request_counts = []
		for crawl_id,count in self.sql_driver.get_crawl_3p_domain_counts():
			per_crawl_3p_request_counts.append(count)

		# crawls that have no 3p requests are not yet in our counts
		# 	so for all uncounted pages we add in zeros
		uncounted_crawls = self.total_crawls - len(per_crawl_3p_request_counts)
		for i in range(0,uncounted_crawls):
			per_crawl_3p_request_counts.append(0)

		return per_crawl_3p_request_counts
	# get_per_crawl_3p_domain_counts

	def get_3p_domain_distribution(self, tld_filter=None):
		"""
		Determines the number of pages which have a given number of 3p domains.
		"""
		per_crawl_3p_request_counts = self.get_per_crawl_3p_domain_counts()
		domain_count_to_page_count = collections.Counter(per_crawl_3p_request_counts)
		domain_count_to_page_distribution = {}
		max_value = 0
		for domain_count in domain_count_to_page_count:
			domain_count_to_page_distribution[domain_count] = domain_count_to_page_count[domain_count]
			if domain_count > max_value:
				max_value = domain_count
		
		full_dist = []
		for domain_count in range(max_value+1):
			if domain_count in domain_count_to_page_distribution:
				full_dist.append({
					'domain_count': domain_count,
					'page_count':	domain_count_to_page_distribution[domain_count]
				})
			else:
				full_dist.append({
					'domain_count': domain_count,
					'page_count':	0
				})

		return full_dist
	# get_3p_domain_distribution

	def get_3p_cookie_distribution(self, tld_filter=None):
		"""
		Determines the number of pages which have a given number of cookies.
		"""
		per_page_3p_cookie_counts = self.get_per_crawl_3p_cookie_counts(tld_filter)
		cookie_count_to_page_count = collections.Counter(per_page_3p_cookie_counts)
		cookie_count_to_page_distribution = {}
		max_value = 0
		for cookie_count in cookie_count_to_page_count:
			cookie_count_to_page_distribution[cookie_count] = cookie_count_to_page_count[cookie_count]
			if cookie_count > max_value:
				max_value = cookie_count
		
		full_dist = []
		for cookie_count in range(max_value+1):
			if cookie_count in cookie_count_to_page_distribution:
				full_dist.append({
					'cookie_count': cookie_count,
					'page_count':	cookie_count_to_page_distribution[cookie_count]
				})
			else:
				full_dist.append({
					'cookie_count': cookie_count,
					'page_count':	0
				})

		return full_dist
	# get_3p_cookie_distribution

	def get_3p_domain_stats(self, tld_filter=None):
		"""
		Returns high-level 3p domain stats.
		"""

		# this is the data we will be getting stats for
		per_crawl_3p_request_counts = self.get_per_crawl_3p_domain_counts(tld_filter)

		# mean and median should always be ok
		mean 	= statistics.mean(per_crawl_3p_request_counts)
		median 	= statistics.median(per_crawl_3p_request_counts)

		# but mode can throw an error, so catch here
		try:
			mode = statistics.mode(per_crawl_3p_request_counts)
		except:
			mode = None

		return({
			'mean': 	mean, 
			'median':	median, 
			'mode':		mode
		})
	# get_3p_domain_stats

	def get_per_crawl_3p_cookie_counts(self, tld_filter = None):
		"""
		determines basic stats for the number of 3p cookies contacted per-crawl
			note that a single 3p many set more than one cookie
		"""
		# each page id corresponds to a list of cookie ids
		crawl_id_to_unique_cookies = {}

		# run query to get all page id, 3p cookie id, 3p cookie domain entries
		for crawl_id,cookie_name,cookie_domain in self.sql_driver.get_crawl_id_3p_cookie_id_3p_cookie_domain(tld_filter):
			# if the page id is not yet seen enter the current cookie id as a fresh list
			#	otherwise, we add to the existing list
			if crawl_id not in crawl_id_to_unique_cookies:
				crawl_id_to_unique_cookies[crawl_id] = [(cookie_name,cookie_domain)]
			else:
				if (cookie_name,cookie_domain) not in crawl_id_to_unique_cookies[crawl_id]:
					crawl_id_to_unique_cookies[crawl_id] = crawl_id_to_unique_cookies[crawl_id] + [(cookie_name,cookie_domain)]

		# determine the number of 3p cookies each crawl has by looking at len of list of cookies
		per_crawl_3p_cookie_counts = []
		for crawl_id in crawl_id_to_unique_cookies:
			per_crawl_3p_cookie_counts.append(len(crawl_id_to_unique_cookies[crawl_id]))

		# crawls that have no 3p cookies are not yet in our counts
		# 	so for all uncounted crawls we add in zeros
		uncounted_crawls = self.total_crawls - len(per_crawl_3p_cookie_counts)
		for i in range(0,uncounted_crawls):
			per_crawl_3p_cookie_counts.append(0)

		return per_crawl_3p_cookie_counts
	# get_per_crawl_3p_cookie_counts

	def get_3p_cookie_stats(self,tld_filter=None):
		"""
		Returns high-level cookie stats.
		"""

		# this is the data we will be getting stats for
		per_page_3p_cookie_counts = self.get_per_crawl_3p_cookie_counts(tld_filter)

		# mean and median should always be ok
		mean 	= statistics.mean(per_page_3p_cookie_counts)
		median 	= statistics.median(per_page_3p_cookie_counts)

		# but mode can throw an error, so catch here
		try:
			mode = statistics.mode(per_page_3p_cookie_counts)
		except:
			mode = None

		return({
			'mean': 	mean, 
			'median':	median, 
			'mode':		mode
		})
	# get_3p_cookie_stats

	def get_db_summary(self):
		"""
		Get basic data about what is in our database.
		"""

		# some of these take longer than others
		total_tasks_fail 			= self.sql_driver.get_pending_task_count()
		total_tasks_attempted 		= self.total_crawls + total_tasks_fail
		percent_tasks_ok 			= (self.total_crawls/total_tasks_attempted)*100
		total_errors 				= self.sql_driver.get_total_errors_count()
		total_cookies 				= self.sql_driver.get_total_cookie_count()
		total_3p_cookies 			= self.sql_driver.get_total_cookie_count(is_3p = True)
		total_dom_storage			= self.sql_driver.get_dom_storage_count()
		total_websockets			= self.sql_driver.get_websocket_count()
		total_websocket_events		= self.sql_driver.get_websocket_event_count()
		total_requests				= self.sql_driver.get_total_request_count()
		total_responses 			= self.sql_driver.get_total_response_count()
		total_requests_received 	= self.sql_driver.get_total_request_count(received = True)
		percent_requests_received 	= (total_requests_received/total_requests)*100
		total_3p_requests			= self.sql_driver.get_total_request_count(party='third')
		total_3p_responses			= self.sql_driver.get_total_response_count(is_3p = True)
		
		# avoid divide-by-zero
		if total_3p_requests > 0:
			total_3p_requests_received 	= self.sql_driver.get_total_request_count(received = True, party='third')
			percent_3p_requests_received = (total_3p_requests_received/total_3p_requests)*100
		else:
			percent_3p_requests_received = 0
		
		# ship it back
		return({
			'total_crawls_ok'				: self.total_crawls,
			'total_pages_ok'				: self.total_pages,
			'total_tasks_fail'				: total_tasks_fail,
			'total_tasks_attempted'			: total_tasks_attempted,
			'percent_tasks_ok'				: percent_tasks_ok,
			'total_errors'					: total_errors,
			'total_cookies'					: total_cookies,
			'total_3p_cookies'				: total_3p_cookies,
			'total_dom_storage'				: total_dom_storage,
			'total_websockets'				: total_websockets,
			'total_websocket_events'		: total_websocket_events,
			'total_requests'				: total_requests,
			'total_responses'				: total_responses,
			'percent_requests_received'		: percent_requests_received,
			'total_3p_requests'				: total_3p_requests,
			'total_3p_responses'			: total_3p_responses,
			'percent_3p_requests_received'	: percent_3p_requests_received,
		})
	# get_db_summary

	def get_high_level_stats(self, tld_filter=None):
		"""
		Get high level stats about what we found.
		"""

		crawls_w_3p_req 		= self.sql_driver.get_crawl_w_3p_req_count()
		percent_w_3p_request 	= (crawls_w_3p_req/self.total_crawls)*100
		total_crawls_cookies 	= self.sql_driver.get_crawl_w_3p_cookie_count()
		percent_w_3p_cookie 	= (total_crawls_cookies/self.total_crawls)*100
		crawls_w_3p_script 		= self.sql_driver.get_crawl_w_3p_script_count()
		percent_w_3p_script		= (crawls_w_3p_script/self.total_crawls)*100
		total_pages_ssl 		= self.sql_driver.get_ssl_page_count()
		percent_pages_ssl		= (total_pages_ssl/self.total_pages)*100

		# request info
		total_requests_received 		= self.sql_driver.get_total_request_count(received = True)
		total_requests_received_ssl		= self.sql_driver.get_total_request_count(received = True, is_ssl = True)

		total_requests_received_1p 		= self.sql_driver.get_total_request_count(received = True, party='first')
		total_requests_received_1p_ssl	= self.sql_driver.get_total_request_count(received = True, party='first', is_ssl = True)

		total_requests_received_3p 		= self.sql_driver.get_total_request_count(received = True, party='third')
		total_requests_received_3p_ssl	= self.sql_driver.get_total_request_count(received = True, party='third', is_ssl = True)

		# ssl
		if total_requests_received > 0:
			percent_requests_ssl 	= (total_requests_received_ssl/total_requests_received)*100
			percent_1p_requests_ssl	= (total_requests_received_1p_ssl/total_requests_received_1p)*100
		else:
			percent_requests_ssl 	= 0
			percent_1p_requests_ssl	= 0

		if total_requests_received_3p:
			percent_3p_requests_ssl	= (total_requests_received_3p_ssl/total_requests_received_3p)*100
		else:
			percent_3p_requests_ssl	= 0

		# load time is seconds
		average_page_load_time = self.sql_driver.get_page_ave_load_time()

		# domains and cookies
		domain_stats	= self.get_3p_domain_stats(tld_filter)
		cookie_stats 	= self.get_3p_cookie_stats(tld_filter)

		return ({
			'total_crawls'					: self.total_crawls,
			'total_pages'					: self.total_pages,
			'percent_pages_ssl'				: percent_pages_ssl,
			'total_requests_received'		: total_requests_received,
			'percent_requests_ssl'			: percent_requests_ssl,
			'total_requests_received_1p'	: total_requests_received_1p,
			'percent_1p_requests_ssl'		: percent_1p_requests_ssl,
			'total_requests_received_3p'	: total_requests_received_3p,
			'percent_3p_requests_ssl'		: percent_3p_requests_ssl,
			'average_page_load_time'		: average_page_load_time,
			'percent_w_3p_request'			: percent_w_3p_request,
			'percent_w_3p_cookie'			: percent_w_3p_cookie,
			'percent_w_3p_script'			: percent_w_3p_script,
			'3p_domains_mean'				: domain_stats['mean'],
			'3p_domains_median'				: domain_stats['median'],
			'3p_domains_mode'				: domain_stats['mode'],
			'3p_cookies_mean'				: cookie_stats['mean'],
			'3p_cookies_median'				: cookie_stats['median'],
			'3p_cookies_mode'				: cookie_stats['mode'],
		})
	# get_high_level_stats

	def get_aggregated_tracking_attribution(self, tld_filter=None):
		"""
		Generate a ranked list of which entities collect data 
			from the greatest number of crawls.

		- entities which have subsidiaries are ranked according 
			to the crawls their subsidiaries get data from as well
		- however, parent entities only get one hit on 
			a crawl which has multiple subsidiaries present
		- for example, if a crawl has 'google analytics' and 'doubleclick' 
			that is only one hit for 'google'

		Returns a list of dicts sorted by 'percent_crawls', descending.

		NOTE(review): tld_filter is currently unused here; the data comes
			from self.crawl_id_to_3p_domain_info which is loaded unfiltered.
		"""

		# one entry per (crawl, entity) hit; Counter tallies these below
		all_owner_occurances = []

		# iterate through the entire set of 3p domains for each crawl
		for crawl_id in self.crawl_id_to_3p_domain_info:

			# a set, so an entity reached via several domains/subsidiaries
			#	on the same crawl is only counted once; reset per crawl
			crawl_domain_owners = set()

			for item in self.crawl_id_to_3p_domain_info[crawl_id]:
				if item['owner_id']:
					# credit the owner and its full parent lineage
					for lineage_id in self.utilities.get_domain_owner_lineage_ids(item['owner_id']):
						crawl_domain_owners.add(lineage_id)

			# we have finished processing for this crawl so we add the owner ids to the full list
			for owner_id in crawl_domain_owners:
				all_owner_occurances.append(owner_id)

		# build the ranked list of dicts, most-common owners first
		# (dead commented-out implementation that followed the return
		#	in the original has been removed)
		ranked_aggregated_tracking_attribution = []
		for owner_id, total_crawl_occurances in collections.Counter(all_owner_occurances).most_common():
			ranked_aggregated_tracking_attribution.append({
				'owner_id':			owner_id,
				'owner_name':		self.domain_owners[owner_id]['owner_name'],
				'owner_country':	self.domain_owners[owner_id]['country'],
				'percent_crawls':	(total_crawl_occurances/self.total_crawls)*100,
			})

		return ranked_aggregated_tracking_attribution
	# get_aggregated_tracking_attribution

	def get_aggregated_3p_ssl_use(self, tld_filter=None):
		"""
		Determine the aggregated (owner + parent lineage) SSL usage
			percentage for third-party requests.

		For every received 3p request where the domain owner is known we
			record whether it used SSL against the owner and each of its
			parents, then convert the per-owner tallies to percentages.
		"""

		# maps owner_id -> list of is_ssl flags, one entry per request
		ssl_flags_by_owner = {}

		# potentially a large query: every received request is examined,
		#	since a single owner may have many requests, each of which
		#	may or may not be ssl
		for _domain, owner_id, is_ssl in self.sql_driver.get_3p_request_domain_owner_id_ssl_use(tld_filter):
			for lineage_owner_id in self.utilities.get_domain_owner_lineage_ids(owner_id):
				ssl_flags_by_owner.setdefault(lineage_owner_id, []).append(is_ssl)

		# convert the tallies into the output list of dicts
		aggregated_3p_ssl_use = []
		for owner_id, ssl_flags in ssl_flags_by_owner.items():
			aggregated_3p_ssl_use.append({
				'owner_id'			: owner_id,
				'owner_name'		: self.domain_owners[owner_id]['owner_name'],
				'owner_country'		: self.domain_owners[owner_id]['country'],
				'ssl_use'			: 100*(sum(ssl_flags)/len(ssl_flags))
			})

		return aggregated_3p_ssl_use
	# get_aggregated_3p_ssl_use

	def get_site_to_3p_network(self, domain_owner_is_known=False):
		"""
		Build the site-domain to third-party-domain network report.

		sql_driver.get_3p_network_ties returns tuples of
			(page domain, request domain, request domain owner id);
		each tuple becomes one dict in the returned list, with owner
			name/country filled in when the owner id is known.
		"""
		network = []

		for page_domain, request_domain, request_owner_id in self.sql_driver.get_3p_network_ties():
			# owner name/country stay None unless we know the owner
			request_owner_name = None
			request_owner_country = None
			if request_owner_id is not None:
				request_owner_name = self.domain_owners[request_owner_id]['owner_name']
				request_owner_country = self.domain_owners[request_owner_id]['country']

			network.append({
				'page_domain'			: page_domain,
				'request_domain'		: request_domain,
				'request_owner_id'		: request_owner_id,
				'request_owner_name'	: request_owner_name,
				'request_owner_country'	: request_owner_country
			})
		return network
	# get_site_to_3p_network

	def get_page_to_3p_network(self):
		"""
		Build the page to third-party-domain network report.

		Each record pairs a page (start url, final url, access time)
			with one third-party request domain and its owner details
			(owner fields are None when the owner is unknown).
		"""
		network = []

		for page_start_url, page_final_url, page_accessed, request_domain, request_owner_id in self.sql_driver.get_all_pages_3p_domains_and_owners():
			# owner name/country stay None unless we know the owner
			request_owner_name = None
			request_owner_country = None
			if request_owner_id is not None:
				request_owner_name = self.domain_owners[request_owner_id]['owner_name']
				request_owner_country = self.domain_owners[request_owner_id]['country']

			network.append({
				'page_start_url'		: page_start_url,
				'page_final_url'		: page_final_url,
				'page_accessed'			: page_accessed,
				'request_domain'		: request_domain,
				'request_owner_id'		: request_owner_id,
				'request_owner_name'	: request_owner_name,
				'request_owner_country'	: request_owner_country
			})
		return network
	# get_page_to_3p_network

	def get_3p_domain_percentages(self, tld_filter=None):
		"""
		Determine the percentage of crawls each third-party domain is
			found on, along with owner details where known.
		"""

		# denominator: crawls for this tld when filtered, else all crawls
		if tld_filter:
			total_crawls = self.crawl_counts_by_tld[tld_filter]
		else:
			total_crawls = self.total_crawls

		# flatten to one (domain, owner_id) tuple per occurrence
		all_3p_domains = [
			(item['domain'], item['owner_id'])
			for crawl_id in self.crawl_id_to_3p_domain_info
			for item in self.crawl_id_to_3p_domain_info[crawl_id]
		]

		domain_percentages = []
		for (domain, owner_id), domain_crawl_count in self.utilities.get_most_common_sorted(all_3p_domains):
			# owner details default to None when the owner is unknown
			owner_name = owner_country = owner_uses = owner_platforms = None
			if owner_id is not None:
				owner_record	= self.domain_owners[owner_id]
				owner_name		= owner_record['owner_name']
				owner_country	= owner_record['country']
				owner_uses		= owner_record['uses']
				owner_platforms	= owner_record['platforms']

			domain_percentages.append({
				'percent_crawls': 100*(domain_crawl_count/total_crawls),
				'domain'		: domain,
				'owner_id'		: owner_id,
				'owner_name'	: owner_name,
				'owner_country'	: owner_country,
				'owner_uses' 	: owner_uses,
				'owner_platforms': owner_platforms
			})
		return domain_percentages
	# get_3p_domain_percentages

	def get_3p_request_percentages(self,tld_filter=None,request_type=None):
		"""
		Determine what percentage of crawls a given request is found on.

		This is based on the "request_url" which is the url for a given request
			stripped of arguments.
			ex: "https://example.com/track.js?abc=123" would become "https://example.com/track.js"

		Additionally returns relevant owner information.
		"""

		all_3p_requests = []

		# total crawls for this tld, used to calculate percentages
		if tld_filter:
			total_crawls = self.crawl_counts_by_tld[tld_filter]
		else:
			total_crawls = self.total_crawls

		# NOTE: loop variable is this_request_type so it no longer shadows
		#	the request_type parameter (which is passed to the query)
		for page_id, request_url, this_request_type, request_domain, request_domain_owner in self.sql_driver.get_3p_requests(tld_filter, request_type):
			all_3p_requests.append((request_url, this_request_type, request_domain, request_domain_owner))

		request_percentages = []

		for item, request_crawl_count in self.utilities.get_most_common_sorted(all_3p_requests):
			request_url, item_request_type, request_domain, request_owner_id = item

			# if we know the owner get name and country, otherwise None
			if request_owner_id is not None:
				request_owner_name 		= self.domain_owners[request_owner_id]['owner_name']
				request_owner_country 	= self.domain_owners[request_owner_id]['country']
			else:
				request_owner_name 		= None
				request_owner_country 	= None

			request_percentages.append({
				'percent_crawls'		: 100*(request_crawl_count/total_crawls),
				'request_url'			: request_url,
				'request_type'			: item_request_type,
				'request_domain'		: request_domain,
				'request_owner_id'		: request_owner_id,
				'request_owner_name'	: request_owner_name,
				'request_owner_country'	: request_owner_country
			})
		return request_percentages
	# get_3p_request_percentages

	def get_3p_use_data(self,tld_filter=None):
		"""
		For some domains we know what they are used for on a first-party basis (eg marketing).
			This function examines the data we have collected in order to determine what percentage
			of crawls include a request to a third-party domain with a given use, how many
			such requests are made on a per-use basis per-crawl, and finally, what percentage
			of requests per-crawl set a third-party cookie.

		Data is returned as a dict, the first field of which is a set of all the
			uses we know of.
		"""

		# we first need to create a dict whereby each domain 
		#	corresponds to a list of known uses
		# domains with no known uses are not in the dict
		#
		# IMPORTANT NOTE:
		#	some domains may have several uses!
		domain_to_use_map = {}

		# a set of all known uses
		all_uses = set()

		for domain,owner_id in self.sql_driver.get_domain_owner_ids():
			if len(self.domain_owners[owner_id]['uses']) > 0:
				domain_to_use_map[domain] = self.domain_owners[owner_id]['uses']
				for use in self.domain_owners[owner_id]['uses']:
					all_uses.add(use)

		# for each crawl, create a list of the domains
		#	which set a cookie
		#
		# note that due to currently unresolved chrome issues we sometimes 
		# 	can get cookies which don't have a corresponding 3p request
		# 	this approach handles that gracefully
		crawl_cookie_domains = {}
		for crawl_id, cookie_domain in self.sql_driver.get_crawl_id_3p_cookie_domain_pairs():
			# append in place rather than rebuilding the list each time
			#	(the original list-concatenation pattern was quadratic)
			if crawl_id not in crawl_cookie_domains:
				crawl_cookie_domains[crawl_id] = []
			crawl_cookie_domains[crawl_id].append(cookie_domain)

		# next, for each crawl we want a list of uses for domains and if
		#	that domain corresponds to a cookie being set
		# NOTE: the same use may occur many times, this is desired
		# 	as it gives us our counts later on
		crawl_3p_uses = {}

		for crawl_id in self.crawl_id_to_3p_domain_info:
			for item in self.crawl_id_to_3p_domain_info[crawl_id]:
				domain = item['domain']

				# if this 3p domain has a known use we add it to a list of uses keyed to crawl id
				if domain in domain_to_use_map:
					# check if the domain of this request set a cookie for this crawl
					sets_cookie = crawl_id in crawl_cookie_domains and domain in crawl_cookie_domains[crawl_id]

					# add a tuple of (use,sets_cookie) to the list for this crawl_id
					for use in domain_to_use_map[domain]:
						if crawl_id not in crawl_3p_uses:
							crawl_3p_uses[crawl_id] = []
						crawl_3p_uses[crawl_id].append((use,sets_cookie))

		# determine how often requests for a given use are encrypted with ssl
		# 	- note that on the same crawl multiple requests for a single use may be made
		# 		and each request may or may not be ssl
		use_ssl 	= {}
		use_total 	= {}
		total_classified = 0
		for domain,domain_owner_id,is_ssl in self.sql_driver.get_3p_request_domain_owner_id_ssl_use(tld_filter):
			# only analyze domains we know the use for
			if domain in domain_to_use_map:
				total_classified += 1
				# each domain may have several uses, count for all of them
				for use in domain_to_use_map[domain]:
					# increment count of ssl usage
					if is_ssl:
						use_ssl[use] = use_ssl.get(use, 0) + 1

					# keep track of total occurances of this use
					use_total[use] = use_total.get(use, 0) + 1

		# for each use we will produce summary counts, we 
		#	initialize everything to zero here
		total_crawls_w_use 				= {}
		total_use_occurances 			= {}
		total_use_occurances_w_cookie 	= {}

		for use in all_uses:
			total_crawls_w_use[use] 			= 0
			total_use_occurances[use] 			= 0
			total_use_occurances_w_cookie[use] 	= 0

		# process each crawl and update the relevant counts
		for crawl_id in crawl_3p_uses:
			# we only want to count a use once per-crawl, so
			#	create a set and add to it as we go along
			this_crawl_use_set = set()

			# update the use occurance counters
			for use, has_cookie in crawl_3p_uses[crawl_id]:
				this_crawl_use_set.add(use)
				total_use_occurances[use] += 1
				if has_cookie:
					total_use_occurances_w_cookie[use] += 1

			# each use in the set adds one to the total crawl count
			for use in this_crawl_use_set:
				total_crawls_w_use[use] += 1

		# the last step is to calculate the relevant percentages and averages

		# denominator for percentage by use
		if tld_filter:
			total_crawls = self.crawl_counts_by_tld[tld_filter]
		else:
			total_crawls = self.total_crawls

		percentage_by_use 					= {}
		average_use_occurance_per_crawl 	= {}
		percentage_use_w_cookie 			= {}
		percentage_use_ssl 					= {}

		for use in all_uses:
			percentage_by_use[use] 				= 0
			average_use_occurance_per_crawl[use] = 0
			percentage_use_w_cookie[use] 		= 0

		for use in total_crawls_w_use:
			if total_crawls_w_use[use] > 0:
				percentage_by_use[use] 				= 100*(total_crawls_w_use[use]/total_crawls)
				average_use_occurance_per_crawl[use] = total_use_occurances[use]/total_crawls_w_use[use]
				percentage_use_w_cookie[use]		= 100*(total_use_occurances_w_cookie[use]/total_use_occurances[use])
			else:
				percentage_by_use[use] 				= None
				average_use_occurance_per_crawl[use] = None
				percentage_use_w_cookie[use]		= None

			# conditional to account for cases where no instance of a given use is ssl
			if use in use_ssl:
				percentage_use_ssl[use] 			= 100*(use_ssl[use]/use_total[use])
			else:
				percentage_use_ssl[use] 			= 0

		# send back everything as a keyed dict
		return({
			'all_uses'							: all_uses,
			'percentage_by_use'					: percentage_by_use,
			'average_use_occurance_per_crawl'	: average_use_occurance_per_crawl,
			'percentage_use_w_cookie' 			: percentage_use_w_cookie,
			'percentage_use_ssl'				: percentage_use_ssl
			})
	# get_3p_use_data

	def get_all_pages_requests(self):
		"""
		For all pages, get all of the requests associated with each
			page load.

		Returns a list of dicts, one per request, with the request
			domain's owner lineage resolved where known (None otherwise).
		"""
		records = []
		for result in self.sql_driver.get_all_pages_requests():
			# owner lookup is best-effort; narrowed from a bare except so
			#	KeyboardInterrupt/SystemExit are no longer swallowed
			try:
				domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[4])
			except Exception:
				domain_owner = None

			records.append({
				'accessed'				: result[0].isoformat(),
				'start_url'				: result[1],
				'final_url'				: result[2],
				'request_domain'		: result[3],
				'request_domain_owner'	: domain_owner,
				'request_url'			: result[5],
			})
		return records
	# get_all_pages_requests

	def get_all_pages_cookies(self):
		"""
		For all pages, get all of the cookies associated with each
			page load.

		Returns a list of dicts, one per cookie, with the cookie
			domain's owner lineage resolved where known (None otherwise).
		"""
		records = []
		for result in self.sql_driver.get_all_pages_cookies():
			# owner lookup is best-effort; narrowed from a bare except so
			#	KeyboardInterrupt/SystemExit are no longer swallowed
			try:
				cookie_owner = self.utilities.get_domain_owner_lineage_combined_string(result[4])
			except Exception:
				cookie_owner = None

			records.append({
				'accessed'		: result[0].isoformat(),
				'start_url'		: result[1],
				'final_url'		: result[2],
				'cookie_domain'	: result[3],
				'cookie_owner'	: cookie_owner,
				'cookie_name'	: result[5],
				'cookie_value'	: result[6],
			})
		return records
	# get_all_pages_cookies

	def get_single_page_request_dump(self,page_start_url):
		"""
		For a given page (defined as unique start_url) get all of the
			requests associated with every load of that page.

		Returns a list of dicts, one per request, with the request
			domain's owner lineage resolved where known (None otherwise).
		"""
		records = []
		for result in self.sql_driver.get_single_page_requests(page_start_url):
			# owner lookup is best-effort; narrowed from a bare except so
			#	KeyboardInterrupt/SystemExit are no longer swallowed
			try:
				domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[6])
			except Exception:
				domain_owner = None

			records.append({
				'page_accessed'			: result[0].isoformat(),
				'start_url'				: result[1],
				'final_url'				: result[2],
				'request_url'			: result[4],
				'request_domain'		: result[5],
				'request_domain_owner'	: domain_owner
			})
		return records
	# get_single_page_request_dump

	def get_single_page_cookie_dump(self,page_start_url):
		"""
		For a given page (defined as unique start_url) get all of the
			cookies associated with every load of that page.

		NOTE(review): result[6] is used both for the owner lineage lookup
			and as 'cookie_value' below — confirm the column order returned
			by sql_driver.get_single_page_cookies, one of these indexes
			looks suspect.
		"""
		records = []
		for result in self.sql_driver.get_single_page_cookies(page_start_url):
			# owner lookup is best-effort; narrowed from a bare except so
			#	KeyboardInterrupt/SystemExit are no longer swallowed
			try:
				domain_owner = self.utilities.get_domain_owner_lineage_combined_string(result[6])
			except Exception:
				domain_owner = None

			records.append({
				# was hard-coded to 'blah' (leftover debug stub, with the
				#	real value commented out); restore the access timestamp,
				#	falling back to the raw value if it is not a datetime
				'page_accessed'			: result[0].isoformat() if hasattr(result[0], 'isoformat') else result[0],
				'start_url'				: result[1],
				'final_url'				: result[2],
				'is_ssl'				: result[3],
				'cookie_domain'			: result[4],
				'cookie_name'			: result[5],
				'cookie_value'			: result[6],
				'cookie_domain_owner'	: domain_owner
			})
		return records
	# get_single_page_cookie_dump

	def update_site_hosts(self):
		"""
		For each FQDN corresponding to a page we find the owner of the
			associated ip_addr (via whois) and store it with
			sql_driver.update_ip_owner.
		"""

		# required, non-standard; bail out entirely if unavailable
		#	(the original fell through and hit a NameError on IPWhois)
		try:
			from ipwhois import IPWhois
		except ImportError:
			print('!!! UNABLE TO UPDATE SITE HOSTS, IPWHOIS NOT INSTALLED !!!')
			return

		page_ips_w_no_owner = self.sql_driver.get_page_ips_w_no_owner()
		total_to_update = len(page_ips_w_no_owner)

		progress = 0
		for ip, in page_ips_w_no_owner:
			progress += 1
			print('\t\t %s of %s done' % (progress,total_to_update))

			# reset per-ip so a failed lookup cannot accidentally reuse
			#	the previous iteration's owner/result (or be unbound on
			#	a first-iteration failure)
			owner = None
			result = None

			try:
				obj = IPWhois(ip)
				result = obj.lookup_whois()
				owner = result['nets'][0]['description']
			except Exception:
				print('fail on %s' % ip)

			# fall back to the asn description when whois gave us a
			#	result but no net-level owner
			if owner is None and result is not None:
				owner = result['asn_description']

			if owner:
				# normalize the owner string before storing it
				owner = owner.replace('.','')
				owner = owner.replace('"','')
				owner = owner.replace("'","")
				owner = owner.replace('\n', ' ')
				owner = owner.replace('\r', ' ')
				owner = owner.replace(' ','_')
				owner = owner.replace(',','_')
				owner = owner.lower()
				self.sql_driver.update_ip_owner(ip,owner)

	# update_site_hosts

	def get_site_host_network(self):
		"""
		Return all records where we know the owner of the ip_addr
			corresponding to a given page's fqdn.
		"""
		return [
			{
				'site_domain'	: site_domain,
				'host_name'		: host_name
			}
			for site_domain, host_name in self.sql_driver.get_site_hosts()
		]
	# get_site_host_network

	##############
	# POLICYXRAY #
	##############

	def get_policy_count(self,policy_type=None):
		"""
		Return how many policies we have of the given type; when
			policy_type is None the count covers all policies.
		"""
		policy_count = self.sql_driver.get_total_policy_count(policy_type)
		return policy_count
	# get_policy_count

	def get_average_policy_word_count(self, policy_type=None):
		"""
		Return the average policy word count, optionally filtered
			by policy_type.
		"""
		average_word_count = self.sql_driver.get_average_policy_word_count(policy_type=policy_type)
		return average_word_count
	# get_average_policy_word_count

	def update_readability_scores(self):
		"""
		Perform two English-language readability tests — Flesch Reading
			Ease and Flesch-Kincaid grade level — for any policies that do
			not have scores yet, storing the results via the sql driver.

		The textstat module handles the actual calculations; note these
			scores are meaningless for non-English language policies.
		"""

		# non-standard lib which must be installed
		from textstat.textstat import textstat

		# only policies whose readability fields are still null
		for policy_id, policy_text in self.sql_driver.get_id_and_policy_text(readability_null = True):
			self.sql_driver.update_readability_scores(
				policy_id,
				textstat.flesch_reading_ease(policy_text),
				textstat.flesch_kincaid_grade(policy_text)
			)
	# update_readability_scores

	def get_readability_scores(self, policy_type=None):
		"""
		Return the average Flesch Reading Ease ('ave_fre') and average
			Flesch-Kincaid grade ('ave_fkg') scores, optionally filtered
			by policy_type.
		"""
		return({
			'ave_fre': self.sql_driver.get_ave_fre(policy_type=policy_type),
			'ave_fkg': self.sql_driver.get_ave_fkg(policy_type=policy_type)
		})
	# get_readability_scores

	def update_crawl_disclosure(self):
		"""
		Leaving code here in case useful later, but it doesn't make sense in cases where 
			crawls are from different sites so it's staying dormant for now.

		For each crawl with an associated policy, checks whether each 3p
			domain owner (or any of its parents/aliases) is named in the
			policy text, then updates the crawl disclosure table.

		NOTE(review): disclosed / disclosed_owner_id are computed below but
			never passed to update_crawl_3p_domain_disclosure — confirm that
			call's signature; compare update_request_disclosure, which does
			pass its equivalents.
		"""

		# map crawl_id -> (policy_id, policy_text) for quick lookup below
		crawl_id_to_policy_id_text = {}
		for crawl_id, policy_id, policy_text in self.sql_driver.get_crawl_id_policy_id_policy_text():
			crawl_id_to_policy_id_text[crawl_id] = (policy_id, policy_text)

		# pull in all sets of crawl_id/domain_owner_id pairs
		for crawl_id, domain_owner_id in self.sql_driver.get_all_crawl_id_3p_request_owner_ids():
			# only process in cases we have an associated policy
			if crawl_id in crawl_id_to_policy_id_text:
				policy_id   = crawl_id_to_policy_id_text[crawl_id][0]
				policy_text = crawl_id_to_policy_id_text[crawl_id][1]
				# default values
				disclosed = False
				disclosed_owner_id = None
				# each owner may have several parent owners and aliases, we check for all of these in the policy
				for this_owner_id, this_owner_name in self.utilities.get_domain_owner_lineage_strings(domain_owner_id,get_aliases=True):
					if this_owner_name in policy_text:
						disclosed = True
						disclosed_owner_id = this_owner_id

				# done for this record, update disclosure table
				#	(see NOTE in docstring re: unused disclosed values)
				self.sql_driver.update_crawl_3p_domain_disclosure(crawl_id, domain_owner_id)
		return
	# update_crawl_disclosure

	def update_request_disclosure(self):
		"""
		For any page where we have a policy, extract all third-party
			request domains with a known owner and check whether the owner
			— or any of its parent companies or spelling "aliases" (eg
			'doubleclick' / 'double click') — is named in the policy text,
			then update the policy_request_disclosure table accordingly.
		"""

		# map page_id -> (policy_id, policy_text) for quick lookup
		policy_info_by_page = {}
		for page_id, policy_id, policy_text in self.sql_driver.get_page_id_policy_id_policy_text():
			policy_info_by_page[page_id] = (policy_id, policy_text)

		# examine every page/request-owner pair not yet in the disclosure table
		for page_id, request_owner_id in self.sql_driver.get_all_page_id_3p_request_owner_ids(not_in_disclosure_table=True):
			# skip pages that have no associated policy
			if page_id not in policy_info_by_page:
				continue

			policy_id, policy_text = policy_info_by_page[page_id]

			# scan the policy for the owner, its parents, and its aliases
			disclosed = False
			disclosed_owner_id = None
			for lineage_owner_id, lineage_owner_name in self.utilities.get_domain_owner_lineage_strings(request_owner_id, get_aliases=True):
				if lineage_owner_name in policy_text:
					disclosed = True
					disclosed_owner_id = lineage_owner_id

			# done for this record, update disclosure table
			self.sql_driver.update_request_disclosure(
					page_id, policy_id,
					request_owner_id, disclosed,
					disclosed_owner_id
			)
		return
	# update_request_disclosure

	def get_percent_crawl_3p_domains_disclosed(self, policy_type=None):
		"""
		Determine the global percentage of identified crawl 3p domains
			which are disclosed in policies.

		NOTE: policy_type is currently unused by this method.
		"""
		total_identified = self.sql_driver.get_total_crawl_3p_count()
		total_disclosed  = self.sql_driver.get_total_crawl_3p_disclosure_count()
		# guard against dividing by zero when nothing has been identified
		if total_identified:
			return(100*(total_disclosed/total_identified))
		return 0
	# get_percent_crawl_3p_domains_disclosed

	def get_percent_3p_requests_disclosed(self, policy_type=None):
		"""
		Determine the global percentage of 3p requests which are disclosed
			in privacy policies.

		NOTE: A PAGE CAN HAVE SEVERAL POLICIES WITH DISCLOSURE OCCURING IN
			SOME BUT NOT ALL, WE SHOULD ACCOUNT FOR THIS!
		"""
		total_identified = self.sql_driver.get_total_request_disclosure_count(policy_type=policy_type)
		total_disclosed  = self.sql_driver.get_total_request_disclosure_count(policy_type=policy_type,disclosed=True)
		# guard against dividing by zero when nothing has been identified
		if total_identified:
			return(100*(total_disclosed/total_identified))
		return 0
	# get_percent_3p_requests_disclosed

	def get_disclosure_by_request_owner(self):
		"""
		For each domain owner, query the policy disclosure table to find
			out if it or its subsidiaries have been disclosed, giving a
			granular per-service view on disclosure in some cases.

		Counts are distinct on the page id to avoid over-counting
			subsidiaries.

		Returns a dict keyed on owner name; each value is a tuple of
			(total, total_disclosed, percent_disclosed).
		"""
		results = {}
		for owner_id in self.domain_owners:
			child_owner_ids = self.utilities.get_domain_owner_child_ids(owner_id)

			# fold subsidiaries into the counts when the owner has any
			if child_owner_ids:
				total			= self.sql_driver.get_domain_owner_disclosure_count(owner_id, child_owner_ids=child_owner_ids)
				total_disclosed	= self.sql_driver.get_domain_owner_disclosure_count(owner_id, child_owner_ids=child_owner_ids, disclosed=True)
			else:
				total			= self.sql_driver.get_domain_owner_disclosure_count(owner_id)
				total_disclosed	= self.sql_driver.get_domain_owner_disclosure_count(owner_id, disclosed=True)

			# owners with no hits at all are omitted from the report
			if total:
				results[self.domain_owners[owner_id]['owner_name']] = (total, total_disclosed, (total_disclosed/total)*100)

		# return the dict which can be processed to a csv in the calling class
		return results
	# get_disclosure_by_request_owner

	def get_terms_percentage(self,substrings,policy_type=None,policy_type_count=None):
		"""
		Return the percentage of policies containing the given substrings,
			optionally restricted to a policy_type.

		policy_type_count is the denominator (number of policies of this
			type); when omitted we fall back to the total policy count for
			the given type.
		"""
		# count policies matching the substrings, honoring the type filter
		if policy_type:
			matches_count = self.sql_driver.get_policy_substrings_count(substrings,policy_type=policy_type)
		else:
			matches_count = self.sql_driver.get_policy_substrings_count(substrings)

		# the original fetched a total_count it never used and divided by
		#	policy_type_count unconditionally, raising TypeError when that
		#	argument was omitted; fall back to the db count instead
		if policy_type_count is None:
			policy_type_count = self.sql_driver.get_total_policy_count(policy_type=policy_type)

		# avoid ZeroDivisionError when there are no policies of this type
		if not policy_type_count:
			return 0

		return (matches_count/policy_type_count)*100
	# get_terms_percentage

	def stream_rate(self):
		"""
		Generator that yields server-sent-event strings reporting how fast
			the task_queue table is draining.

		Every wait_time seconds it re-counts the task_queue rows and yields
			a "data:..." SSE payload containing elapsed minutes and the mean
			rate across all samples taken so far.

		NOTE(review): relies on module-level names (sql_driver, time, json,
			statistics) rather than self attributes — confirm these exist in
			this module's scope.  The loop never terminates; the consumer is
			expected to stop iterating.
		"""
		wait_time = 10
		elapsed = 0
		query = 'SELECT COUNT(*) FROM task_queue'
		old_count = sql_driver.fetch_query(query)[0][0]
		all_rates = []
		while True:
			time.sleep(wait_time)
			elapsed += wait_time
			new_count = sql_driver.fetch_query(query)[0][0]
			# change in queue size over this interval, scaled by 60
			all_rates.append((old_count-new_count)*60)
			old_count = new_count
			json_data = json.dumps({
				'time': elapsed/60,
				'rate': statistics.mean(all_rates)
			})
			# SSE wire format: "data:<payload>\n\n"
			yield f"data:{json_data}\n\n"
Beispiel #15
0
	def __init__(self, db_engine, db_name):
		# store the db connection parameters for later use
		self.db_engine	= db_engine
		self.db_name	= db_name
		# helper objects: general utilities and url parsing
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
Beispiel #16
0
class OutputStore:
	"""
	This class receives data from the browser, processes it, and stores it in the db
	"""

	def __init__(self, db_engine, db_name):
		# db configuration, a fresh connection is opened per store() call
		self.db_engine	= db_engine
		self.db_name	= db_name
		# helpers for misc tasks and url parsing
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
	# init

	def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False):
		"""
		this is the primary function of this class,
		
		it takes the url of the given page and the request and cookie data generated
			by the browser

		data is cleaned up with some minor analysis (eg file types) and stored 
			for later in-depth analysis.
		
		there is an option to store first party requests as well as third, turned on by default
			to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
			and is turned off by default

		returns True on success, False if the page domain could not be parsed
		"""

		# open up a sql connection
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		else:
			# bug fix: this previously referenced the undefined bare name
			#	'db_engine' (raising a NameError instead of printing) and
			#	misspelled 'INVALID'
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			exit()

		# get the ip, fqdn, domain, pubsuffix, and tld
		# we need the domain to figure out if cookies/elements are third-party
		origin_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

		# if we can't get page domain info we fail gracefully
		if origin_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
			return False

		origin_ip 			= origin_ip_fqdn_domain_pubsuffix_tld[0]
		origin_fqdn 		= origin_ip_fqdn_domain_pubsuffix_tld[1]
		origin_domain 		= origin_ip_fqdn_domain_pubsuffix_tld[2]
		origin_pubsuffix 	= origin_ip_fqdn_domain_pubsuffix_tld[3]
		origin_tld 			= origin_ip_fqdn_domain_pubsuffix_tld[4]
		
		# sql_driver.add_domain both stores the new domain and returns its db row id
		# if it is already in db just return the existing id
		page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

		# figure out the privacy policy url and text, starts null
		priv_policy_url = None
		priv_policy_url_text = None

		# read in our list of privacy link terms from the json file in webxray/resources/policyxray
		privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

		# we reverse links return from browser to check footer links first as that is where policy links tend to be
		all_links = browser_output['all_links']
		all_links.reverse()

		# if we have links search for privacy policy
		if len(all_links) > 0:
			# links are tuple
			for link_text,link_url in all_links:
				# makes sure we have text, skip links without
				if link_text:
					# need lower for string matching
					link_text = link_text.lower().strip()
					# not a link we can use
					if 'javascript' in link_text: continue
					# see if the link_text is in our term list
					if link_text in privacy_policy_term_list:
							# if the link_url is relative this will convert to absolute
							priv_policy_url = self.utilities.get_absolute_url_from_page_link(url,link_url)
							priv_policy_url_text = link_text
							break

		# if the final page is https (often after a redirect), mark it appropriately
		if browser_output['final_url'][:5] == 'https':
			page_is_ssl = True
		else:
			page_is_ssl = False

		if store_source:
			# handles issue where postgres will crash on inserting null character
			source = browser_output['source'].replace('\x00',' ')
		else:
			source = None

		# add page
		page_id = sql_driver.add_page(
			browser_output['browser_type'],
			browser_output['browser_version'],
			browser_output['browser_wait'],
			browser_output['title'],
			browser_output['meta_desc'],
			url, 
			browser_output['final_url'],
			priv_policy_url,
			priv_policy_url_text,
			page_is_ssl,
			source,
			browser_output['load_time'],
			page_domain_id
		)

		# store cookies
		for cookie in browser_output['cookies']:
			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			# note:
			#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
			cookie_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])
			
			# something went wrong, log and fail gracefully
			if cookie_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
				continue

			# otherwise, everything went fine
			cookie_ip 			= cookie_ip_fqdn_domain_pubsuffix_tld[0]
			cookie_fqdn 		= cookie_ip_fqdn_domain_pubsuffix_tld[1]
			cookie_domain 		= cookie_ip_fqdn_domain_pubsuffix_tld[2]
			cookie_pubsuffix 	= cookie_ip_fqdn_domain_pubsuffix_tld[3]
			cookie_tld 			= cookie_ip_fqdn_domain_pubsuffix_tld[4]

			# mark third-party cookies
			if origin_domain != cookie_domain:
				is_3p_cookie = True
			else:
				is_3p_cookie = False

			# this is a first party cookie, see if we want to store it
			if is_3p_cookie is False and store_1p is False:
				continue

			# sql_driver.add_domain both stores the new domain and returns its id
			cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)
		
			# name and domain are required, so if they fail we just continue
			try: name = cookie['name']
			except: continue
		
			try: domain = cookie_domain
			except: continue
		
			# these are optional, fill with null values if fail
			try: secure = cookie['secure']
			except: secure = None
		
			try: path = cookie['path']
			except: path = None
		
			try: httponly = cookie['httponly']
			except: httponly = None
		
			try: expiry = cookie['expiry']
			except: expiry = None
		
			try: value = cookie['value']
			except: value = None
		
			# all done with this cookie
			sql_driver.add_cookie(
				page_id,
				name, secure, path, domain, 
				httponly, expiry, value, 
				is_3p_cookie, cookie_domain_id
			)

		# process requests now
		for request in browser_output['processed_requests']:
			# if the request starts with the following we can't parse anyway, so skip
			if re.match('^(data|about|chrome|blob).+', request):
				continue

			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			element_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

			# problem with this request, log and fail gracefully
			if element_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing element request: '+request)
				continue

			element_ip 			= element_ip_fqdn_domain_pubsuffix_tld[0]
			element_fqdn 		= element_ip_fqdn_domain_pubsuffix_tld[1]
			element_domain 		= element_ip_fqdn_domain_pubsuffix_tld[2]
			element_pubsuffix 	= element_ip_fqdn_domain_pubsuffix_tld[3]
			element_tld 		= element_ip_fqdn_domain_pubsuffix_tld[4]

			# sql_driver.add_domain both stores the new domain and returns its db row id
			element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

			# mark third-party elements based on domain
			if origin_domain != element_domain:
				is_3p_element = True
			else:
				is_3p_element = False

			# if we are not storing 1p elements continue
			if is_3p_element is False and store_1p is False:
				continue
			
			if request[:5] == 'https' or request[:3] == 'wss':
				element_is_ssl = True
			else:
				element_is_ssl = False

			try:
				received = browser_output['processed_requests'][request]['received']
			except:
				received = None

			# get domain of referer and determine if page leaked by referer
			try:
				referer = browser_output['processed_requests'][request]['referer']
			except:
				referer = None

			if referer and len(referer) != 0:
				referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)

				if referer_ip_fqdn_domain_pubsuffix_tld:
					if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
						page_domain_in_referer = True
					else:
						page_domain_in_referer = False
				else:
					page_domain_in_referer = None
					sql_driver.log_error(url, 'Error parsing referer header: '+referer)
			else:
				page_domain_in_referer = None

			try:
				start_time_offset = browser_output['processed_requests'][request]['start_time_offset']
			except:
				start_time_offset = None

			try:
				load_time = browser_output['processed_requests'][request]['load_time']
			except:
				load_time = None

			try:
				status = browser_output['processed_requests'][request]['status']
			except:
				status = None

			try:
				status_text = browser_output['processed_requests'][request]['status_text']
			except:
				status_text = None

			try:
				content_type = browser_output['processed_requests'][request]['content_type']
			except:
				content_type = None
			
			try:
				body_size = browser_output['processed_requests'][request]['body_size']
			except:
				body_size = None

			try:
				request_headers = str(browser_output['processed_requests'][request]['request_headers'])
			except:
				request_headers = None

			try:
				response_headers = str(browser_output['processed_requests'][request]['response_headers'])
			except:
				response_headers = None

			# consider anything before the "?" to be the element_url
			try:
				element_url = re.search(r'^(.+?)\?.+$', request).group(1)
			except:
				element_url = request

			# consider anything after the "?" to be the args
			try:
				element_args = re.search(r'^.+(\?.+)$', request).group(1) # start url args
			except:
				element_args = None

			# attempt to parse off the extension
			try:
				element_extension = re.search(r'\.([0-9A-Za-z]+)$', element_url).group(1).lower()
			except:
				element_extension = None
			
			# lists of common extensions, can be expanded
			image_extensions 	= ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
			script_extensions 	= ['js', 'javascript']
			data_extensions 	= ['json', 'jsonp', 'xml']
			font_extentions 	= ['woff', 'ttf', 'otf']
			static_extentions 	= ['html', 'htm', 'shtml']
			dynamic_extentions	= ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

			# figure out what type of element it is
			if element_extension in image_extensions:
				element_type = 'image'
			elif element_extension in script_extensions:
				element_type = 'javascript'
			elif element_extension in data_extensions:
				element_type = 'data_structured'
			elif element_extension == 'css':
				element_type = 'style_sheet'
			elif element_extension in font_extentions:
				element_type = 'font'
			elif element_extension in static_extentions:
				element_type = 'page_static'
			# bug fix: this previously compared the string to the whole list
			#	with '==', so 'page_dynamic' was never assigned
			elif element_extension in dynamic_extentions:
				element_type = 'page_dynamic'
			elif element_extension == 'swf' or element_extension == 'fla':
				element_type = 'Shockwave Flash'
			else:
				element_type = None

			# file hashing has non-trivial overhead and off by default
			#
			# what this does is uses the same ua/referer as the actual request
			# 	so we are just replaying the last one to get similar response
			# 	note that we aren't sending the same cookies so that could be an issue
			# 	otherwise it is equivalent to a page refresh in theory

			# option to hash only 3p elements observed here
			if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False):
				replay_element_request = urllib.request.Request(
					request,
					headers = {
						'User-Agent' : browser_output['processed_requests'][request]['user_agent'],
						'Referer' : referer,
						'Accept' : '*/*'
					}
				)
				try:
					file_md5 = hashlib.md5(urllib.request.urlopen(replay_element_request,timeout=10).read()).hexdigest()
				except:
					file_md5 = None
			else:
				file_md5 = None

			# final tasks is to truncate the request if it is 
			#	over 2k characters as it is likely
			#	binary data and may cause problems inserting
			#	into TEXT fields in database
			#
			#  TODO:
			#	better handle binary data in general
			if len(request) >= 2000: request = request[:2000]
			if len(element_url) >= 2000: element_url = element_url[:2000]

			# store request
			sql_driver.add_element(
				page_id,
				request, element_url,
				is_3p_element, element_is_ssl,
				received,
				referer,
				page_domain_in_referer,
				start_time_offset,
				load_time,
				status,
				status_text,
				content_type,
				body_size,
				request_headers,
				response_headers,
				file_md5,
				element_extension,
				element_type,
				element_args,
				element_domain_id
			)

		# close db connection
		sql_driver.close()

		return True
	# store
Beispiel #17
0
    pool_size = 1
elif db_engine == 'postgres':
    from webxray.PostgreSQLDriver import PostgreSQLDriver
    sql_driver = PostgreSQLDriver()

    # if we are using postgres the database can handle many
    #	connections so we set pool_size to None which sets up
    #	one process per processor core
    pool_size = None
else:
    print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
    quit()

# import our custom utilities
from webxray.Utilities import Utilities
utilities = Utilities(db_engine=db_engine)

# check for various dependencies, python version, etc.
utilities.check_dependencies()

# SET CONFIG
#
# There are a large number of setting for webXray, which are
#	set in a 'config' variable.  Two default configurations are
#	'haystack' which collects data needed for examining data transfers
#	and 'forensic' which collects everything, including images,
#	page text, and the content of files.  It is A VERY BAD IDEA
#	to conduct forensic scans on lists of random webpages
#	as you may be downloading and storing files you do not want.
#
# Only use forensic when you are TOTALLY SURE you want to retain
Beispiel #18
0
class ChromeDriver:
	"""
	This class allows for using the production Chrome browser with webXray.
	Requirements are Selenium, Chrome, and ChromeDriver.

	Pros:
		Production browser which is largely identical to real-world use
		By turning headless off it is very similar to a 'real' session
		By turning headless on the CPU/Mem usage is lower than otherwise
	Cons:
		Less testing with webxray than phantomjs, does not handle many parallel instances very well
		In headless mode prior to 64.0.3254.0, the cookie database does not get created and no cookies are returned
	"""

	def __init__(self,ua=False):
		"""
		Set up the global options used by every browser instance
			this driver creates.
		"""
		# toggle headless mode for Chrome here
		self.headless = True

		# when True, potentially dangerous requests are allowed;
		#	kept off by default for hopefully obvious reasons
		self.allow_insecure = False

		# override these manually if chrome refuses to start
		self.chromedriver_path  = None
		self.chrome_binary_path = None

		# give browsers a full minute to try to download content,
		#	but gracefully time out thereafter
		self.page_timeout_seconds = 60

		# optional user-agent string, see get_ua_for_headless
		#	for details
		self.ua = ua

		# helper object for assorted tasks
		self.utilities = Utilities()

		return None
	# init

	def create_chromedriver(self):
		"""
		Since we have many functions we can perform we consolidate
			chromedriver code here.

		Returns a configured webdriver.Chrome instance, or None if
			the driver could not be started.
		"""

		# set up options object
		chrome_options = Options()

		# if we have chrome binary set it up
		if self.chrome_binary_path:
			chrome_options.binary_location = self.chrome_binary_path
		
		# live dangerously
		if self.allow_insecure:
			chrome_options.add_argument('--allow-running-insecure-content')

		# thank god for this option
		chrome_options.add_argument('--mute-audio')
		
		# if we are headless we also mix up window size a bit
		if self.headless:
			chrome_options.add_argument('headless')
			chrome_options.add_argument('disable-gpu')
			window_x = random.randrange(1050,1920)
			window_y = random.randrange(900,1080)
			# bug fix: chrome's window-size switch takes comma-separated
			#	width,height values; the previous '%sx%s' format was not
			#	understood and the randomized size was silently ignored
			chrome_options.add_argument('window-size=%s,%s' % (window_x,window_y))

		# if we have a ua set it here
		if self.ua: 
			chrome_options.add_argument('user-agent='+self.ua)

		# 'desired_capabilities' is essentially a second way to set options for chrome
		# we set loggingPrefs to turn on the performance log which we need to analyze network traffic
		# see: https://sites.google.com/a/chromium.org/chromedriver/logging/performance-log
		# pageLoadStrategy is set to 'none' to make sure we don't get stuck on pages that never finish loading
		# once 'eager' is implemented in chromedriver that may be preferable
		# see: https://w3c.github.io/webdriver/#dfn-table-of-page-load-strategies
		chrome_capabilities = {
			'loggingPrefs': {'performance': 'ALL'}, 
			'pageLoadStrategy': 'none'
		}

		# attempt to start driver, fail gracefully otherwise
		try:
			# if we have chromedriver path set it up
			if self.chromedriver_path:
				driver = webdriver.Chrome(
					self.chromedriver_path,
					desired_capabilities=chrome_capabilities,
					chrome_options=chrome_options
				)
			else:
				driver = webdriver.Chrome(
					desired_capabilities=chrome_capabilities,
					chrome_options=chrome_options
				)
		except:
			return None

		# allow one minute before we kill it, separate from browser_wait
		driver.set_page_load_timeout(self.page_timeout_seconds)

		return driver
	# create_chromedriver

	def get_ua_for_headless(self):
		"""
		Chrome in headless mode sends a 'Headless' ua string; here we
			figure out the current ua and strip the 'Headless' token
			to help with compatability.

		Spinning up and tearing down a browser instance is expensive,
			so call this once and reuse the result where possible --
			this is deliberately not done in __init__.
		"""
		driver = self.create_chromedriver()
		if driver is None:
			return None
		ua_string = driver.execute_script('return navigator.userAgent')
		driver.quit()
		return ua_string.replace('Headless','')
	# get_ua_for_headless

	def get_webxray_scan_data(self, url, browser_wait):
		"""
		This function loads the page, monitors network traffic, and returns relevant data/logs.

		IMPORTANT: headless will miss all cookies in chrome versions < 64.0.3254.0

		This uses the chrome performance log to get network traffic details, see following for details:
			- https://gist.githubusercontent.com/klepikov/5457750/raw/ecedc6dd4eed82f318db91adb923627716fb6b58/test.py
			- https://sites.google.com/a/chromium.org/chromedriver/logging/performance-log
		"""

		driver = self.create_chromedriver()

		# we can't start Chrome, return error message as result
		if driver == None:
			return({
				'success': False,
				'result': 'Unable to launch Chrome instance'
			})

		# allow one minute before we kill it, seperate from browser_wait
		driver.set_page_load_timeout(self.page_timeout_seconds)

		# start the page load process, return error message if we fail
		try:
			driver.get(url)
		except Exception as e:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to load page: '+str(e).replace('\n', ' ')
			})

		# while the browser may be finished loading the page, scripts may still making
		# 	additional requests, so we wait to let all that finish
		time.sleep(browser_wait)

		# if the page has an alert window open it will throw the following exception when trying
		#	to get the current_url: selenium.common.exceptions.UnexpectedAlertPresentException
		# in theory we should be able to set an option for UNEXPECTED_ALERT_BEHAVIOUR to ACCEPT
		# 	but it does not seem to be supported by chromedriver at present
		# in some cases we can catch the items we need before an alert fires, otherwise
		# 	we fail gracefully, but this is a bug that needs resolution
		try:
			final_url 	= driver.current_url
			title 		= driver.title
			page_source = driver.page_source
		except:
			# quit the driver or it will never die!
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to load page, possible javascript alert issue'
			})

		# handle odd bug where title is a 'webelement' object
		if not isinstance(title, str): title = None

		# We use the Chrome performance log get network traffic. Chrome performance log outputs a 
		#	number of independent 'message' events which are keyed to a 'requestId'.  What we want
		#	to send upstream is a dictionary keyed on the requested url so we do a lot of processing
		#	here to stitch together a coherent log in the format expected by wbxr.
		#
		# There are two types of network events we are concerned with: normal http 
		#	requests (initiated by Network.requestWillBeSent) and websocket requests (initiated
		#	by Network.webSocketCreated).
		#
		# For normal events, we add entries to the 'requests' dictionary which we key to the requested
		#	url.  The reason for this is a single requestId may correspond with many urls in
		#	cases where a request results in redirects occuring.  However, data from the 
		#	Network.loadingFinished event does not include the url, so we key that seperately
		#	in the load_finish_data dict and then attach it later on.  Note that if a request to
		#	x.com results in redirects to y.com and z.com, all three will end up sharing
		#	the same loadingFinished data.
		#
		# webSocket events are a special case in that they are not strictly HTTP events, but 
		#	they do two things we are concerned with: potentially linking a user to 
		#	a third-party domain and setting cookies.  The url contacted is only exposed in the
		#	first event, Network.webSocketCreated, so we must use the requestId to tie together
		#	subsequent Network.webSocketWillSendHandshakeRequest and 
		#	Network.webSocketHandshakeResponseReceived events.  We use the dictionary websocket_requests
		#	to keep track of such events, and we then reprocess them to be keyed to the url in our
		#	normal requests log.  Note that to keep track of websocket request we use 'websocket'
		#	for content type, and there may be a better way to handle this.

		# http requests are keyed to URL
		requests 		   = {}
		
		# these events are keyed to requestID
		load_finish_data   = {}
		websocket_requests = {}

		# to get page load time we will figure out when the first request and final load finished occured
		first_start_time = None
		last_end_time 	 = None

		# for debuging
		duplicate_keys = []

		# crunch through all the chrome logs here, the main event!
		for log_item in driver.get_log('performance'):
			for key, this_log_item in log_item.items():
				# we are only interested in message events
				if key == 'message':
					# we have to read in this value to get json data
					log_item_data 	= json.loads(this_log_item)
					message_type	= log_item_data['message']['method']

					################################
					# normal http event processing #
					################################

					# we have a new http event, create new empty entry keyed to url
					# and keep track of start time info
					if message_type == 'Network.requestWillBeSent':
						this_request = log_item_data['message']['params']['request']
						this_url 	 = this_request['url']
						
						# skip if not http(s)
						if not re.match('^https?://', this_url): continue

						# the presence of 'redirectResponse' means a prior request is redirected
						#	so we update the status of the original request here and
						#	then continue processing the new request
						if 'redirectResponse' in log_item_data['message']['params']:
							redirect_info = log_item_data['message']['params']['redirectResponse']
							original_url = redirect_info['url']
							
							# the request was received, mark it
							requests[original_url].update({'received':		True})

							# record status code and text
							requests[original_url].update({'status':		redirect_info['status']})
							requests[original_url].update({'status_text':	redirect_info['statusText']})
						
							# try to get response headers, fail gracefully as they are already None
							try:
								requests[this_url].update({'response_headers':this_response['headersText']})
							except:
								pass
						
							try:
								requests[this_url].update({'content_type':this_response['headers']['Content-Type']})
							except:
								pass

						# if a new request we initialize entry
						if this_url not in requests:
							requests[this_url] = {}

							# we use this to get the load_finish_data later on
							requests[this_url].update({'request_id': log_item_data['message']['params']['requestId']})
	
							# we set received to false to start with
							requests[this_url].update({'received':			False})

							# initialze response values to None in case we don't get response
							requests[this_url].update({'end_time':			None})
							requests[this_url].update({'status':			None})
							requests[this_url].update({'status_text':		None})
							requests[this_url].update({'response_headers':	None})
							requests[this_url].update({'content_type':		None})
							requests[this_url].update({'body_size':			None})
							requests[this_url].update({'end_time':			None})
							requests[this_url].update({'user_agent':		None})
							requests[this_url].update({'referer':			None})

							# each request has a start_time, we use this to figure out the time it took to download
							this_start_time = log_item_data['message']['params']['timestamp']
							requests[this_url].update({'start_time':this_start_time})
							
							# update global start time to measure page load time
							if first_start_time == None or this_start_time < first_start_time:
								first_start_time = this_start_time

							# get the request headers
							requests[this_url].update({'request_headers':this_request['headers']})

							# these can fail, if so, we ignore
							try:
								requests[this_url].update({'user_agent':this_request['headers']['User-Agent']})
							except:
								pass

							try:
								requests[this_url].update({'referer':this_request['headers']['Referer']})
							except:
								pass
						# this_url already exists, log
						else:
							duplicate_keys.append(this_url)
							continue

					# we have received a response to our request, update appropriately
					if message_type == 'Network.responseReceived':
						this_response 	= log_item_data['message']['params']['response']
						this_url 	 	= this_response['url']

						# skip if not http(s)
						if not re.match('^https?://', this_url): continue

						# the request was received, mark it
						requests[this_url].update({'received':		True})

						# record status code and text
						requests[this_url].update({'status':		this_response['status']})
						requests[this_url].update({'status_text':	this_response['statusText']})
						
						# try to get response headers, fail gracefully as they are already None
						try:
							requests[this_url].update({'response_headers':this_response['headersText']})
						except:
							pass
						
						try:
							requests[this_url].update({'content_type':this_response['headers']['Content-Type']})
						except:
							pass

					# load finish events are keyed to requestId and may apply to many requested urls
					#	so we keep this in a seperate dictionary to be relinked when we're done
					if message_type == 'Network.loadingFinished':
						this_request_id = log_item_data['message']['params']['requestId']
						this_end_time	= log_item_data['message']['params']['timestamp']

						# update global end time
						if last_end_time == None or this_end_time > last_end_time:
							last_end_time = this_end_time

						if this_request_id not in load_finish_data:
							load_finish_data[this_request_id] = {}

						# size is updated during loading and is shown in logs, but we only want the final size which is here
						load_finish_data[this_request_id].update({'body_size':log_item_data['message']['params']['encodedDataLength']})

						# we use this to calculate the total time for all requests
						load_finish_data[this_request_id].update({'end_time':this_end_time})

					##############################
					# webSocket event processing #
					##############################

					# we have a new websocket, create new empty entry keyed to requestId
					# 	this will be rekeyed to url
					# note we ignore timing data for websockets
					if message_type == 'Network.webSocketCreated':
						this_url 		= log_item_data['message']['params']['url']
						this_request_id = log_item_data['message']['params']['requestId']

						if this_request_id not in websocket_requests:
							websocket_requests[this_request_id] = {}
							websocket_requests[this_request_id].update({'url': 				this_url})
							websocket_requests[this_request_id].update({'content_type':		'websocket'})
							websocket_requests[this_request_id].update({'received':			False})
							websocket_requests[this_request_id].update({'end_time':			None})
							websocket_requests[this_request_id].update({'status':			None})
							websocket_requests[this_request_id].update({'status_text':		None})
							websocket_requests[this_request_id].update({'response_headers':	None})
							websocket_requests[this_request_id].update({'body_size':		None})
							websocket_requests[this_request_id].update({'end_time':			None})
							websocket_requests[this_request_id].update({'start_time':		None})
							websocket_requests[this_request_id].update({'user_agent':		None})
							websocket_requests[this_request_id].update({'referer':			None})

					# websocket request made, update relevant fields
					if message_type == 'Network.webSocketWillSendHandshakeRequest':
						this_request 	= log_item_data['message']['params']['request']
						this_request_id = log_item_data['message']['params']['requestId']
						websocket_requests[this_request_id].update({'request_headers':	this_request['headers']})
						websocket_requests[this_request_id].update({'user_agent':		this_request['headers']['User-Agent']})

					# websocket response received, update relevant fields
					if message_type == 'Network.webSocketHandshakeResponseReceived':
						this_response 	= log_item_data['message']['params']['response']
						this_request_id = log_item_data['message']['params']['requestId']
						websocket_requests[this_request_id].update({'received':			True})
						websocket_requests[this_request_id].update({'status':			this_response['status']})
						websocket_requests[this_request_id].update({'status_text':		this_response['statusText']})
						websocket_requests[this_request_id].update({'response_headers':	this_response['headersText']})
		# end log processing loop

		# append load finish info to requests
		for this_url in requests:
			this_request_id = requests[this_url]['request_id']
			if this_request_id in load_finish_data:
				requests[this_url].update({'body_size': load_finish_data[this_request_id]['body_size']})
				
				# load_time is start time minus end time,
				# 	multiplied by 1k to convert to miliseconds
				load_time = (load_finish_data[this_request_id]['end_time'] - requests[this_url]['start_time'])*1000
				
				# we shouldn't be getting <=0, but make it null if this happens
				if load_time <= 0:
					requests[this_url].update({'load_time': load_time})
				else:
					requests[this_url].update({'load_time': None})
			else:
				requests[this_url].update({'body_size': None})
				requests[this_url].update({'load_time': None})

		# append websocket data to requests data
		for item in websocket_requests:
			requests[websocket_requests[item]['url']] = websocket_requests[item]

		# return all the links for later processing
		all_links = []
		try:
			links = driver.find_elements_by_tag_name('a')
			for link in links:
				all_links.append([link.get_attribute('text'),link.get_attribute('href')])
		except:
			pass

		# get the page meta description
		try:
			meta_desc = driver.find_element_by_xpath("//meta[@name='description']").get_attribute("content")
		except:
			meta_desc = None

		# get the language of the page
		try:
			lang = driver.find_element_by_xpath('/html').get_attribute('lang')
		except:
			lang = None

		# get all the cookies
		# 	the selenium get_cookies method does not return third-party cookies
		#	so we open the cookie db directly from the chrome profile
		#	note that in headless mode this does not work in chrome versions
		#	prior to 64.0.3254.0 and no cookies will be returned
		cookies = []
		try:
			conn = sqlite3.connect(driver.capabilities['chrome']['userDataDir']+'/Default/Cookies')
			c = conn.cursor()
			c.execute("SELECT name,is_secure,path,host_key,expires_utc,is_httponly,value FROM cookies")
			for cookie in c.fetchall():
				cookies.append({
					'name': 		cookie[0],
					'secure':		cookie[1],
					'path':			cookie[2],
					'domain': 		cookie[3],
					'expiry':		cookie[4],
					'httponly':		cookie[5],
					'value':		cookie[6]
				})
		except:
			return({
				'success': False,
				'result': 'Cookie database not loaded, if this message appears often something is fundamentally wrong and requires attention!'
			})

		if self.headless == True:
			browser_version = driver.capabilities['version'] + ' [headless]'
		else:
			browser_version = driver.capabilities['version']

		# other parts of webxray expect this data format, common to all browser drivers used
		return_dict = {
			'browser_type':			driver.capabilities['browserName'],
			'browser_version':		browser_version,
			'browser_wait':			browser_wait,
			'start_url':			url, 
			'final_url': 			final_url,
			'title': 				title,
			'meta_desc': 			meta_desc,
			'lang':					lang,
			'load_time': 			int((last_end_time - first_start_time)*1000),
			'processed_requests': 	requests,
			'cookies': 				cookies,
			'all_links':			all_links,
			'source':				page_source
		}
		
		# quit the driver or it will never die!
		driver.quit()

		return ({
			'success': True,
			'result': return_dict
		})
	# get_webxray_scan_data

	def get_page_source_text_title_lang(self, url):
		"""
		Given a url, this function attempts to load the page, inject
			the Readability.js library, execute it, and extract
			only the text of a given page.

		Function returns a dict with 'success' (True/False); on success
			'result' holds page_source, page_text, page_title, and
			page_lang; on failure 'result' is an error message string.
		"""

		# start the browser; create_chromedriver prints errors to cli itself
		driver = self.create_chromedriver()

		# browser hasn't started and error already printed to cli
		if driver == None:
			return({
				'success': False,
				'result': 'Unable to launch Chrome instance'
			})

		# starts the page load process, quits driver and returns error if we fail
		try:
			driver.get(url)
		except Exception as e:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to load page: '+str(e).replace('\n', ' ')
			})

		# if we can't get source something is wrong, return error
		try:
			page_source = driver.page_source
		except Exception:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to read page source'
			})

		# if page title fails put in a null value
		try:
			page_title = driver.title
		except Exception:
			page_title = None

		# get the language of the page from the <html> lang attribute
		try:
			page_lang = driver.find_element_by_xpath('/html').get_attribute('lang')
		except Exception:
			page_lang = None

		# inject the locally downloaded copy of readability into the page
		#	and extract the content
		#
		# NOTE: you must download readability on your own and place in the 
		#	appropriate directory
		#
		# BUG FIX: open the file with a context manager so the handle is
		#	closed instead of leaked
		with open(os.path.dirname(os.path.abspath(__file__))+'/resources/policyxray/readability.js', 'r', encoding='utf-8') as js_file:
			readability_js = js_file.read()

		# NOTE(review): url is interpolated into a JS string literal without
		#	escaping; a url containing a double-quote would break the script —
		#	confirm upstream urls are sanitized
		try:
			readability_result = json.loads(driver.execute_script("""
				%s
				var readabilityObj = new Readability("%s", document);
				return JSON.stringify(readabilityObj.parse(), null, 2);
			""" % (readability_js,url)))
			driver.quit()
		except Exception:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to inject Readability.js'
			})

		# readability failure, return error
		if readability_result == None:
			return({
				'success': False,
				'result': 'Empty Readability result'
			})

		# readability output has HTML formatting; strip tags, entities, and
		#	extra whitespace (raw strings avoid invalid-escape warnings)
		try:
			page_text = re.sub(r'\s+', ' ', re.sub(r'&.+?;', ' ', re.sub(r'<.+?>', ' ', readability_result['content'])))
		except Exception:
			return({
				'success': False,
				'result': 'Unable to remove HTML from Readability result'
			})

		# the page text is empty, return error
		if len(page_text) == 0: 
			return({
				'success': False,
				'result': 'Empty result after HTML removal'
			})

		# looks good, return the extracted data
		return({
			'success': True,
			'result': {
				'page_source': 	page_source,
				'page_text': 	page_text,
				'page_title': 	page_title,
				'page_lang': 	page_lang
			}
		})
	# get_page_source_text_title_lang

	def get_absolute_page_links(self,url):
		"""
		Returns all links on the page, changes relative links to
			absolute links as well.

		Returns a dict with 'success' (True/False); on success 'result'
			is a set of (link_text, absolute_url) tuples, on failure it
			is an error message string.
		"""

		# initialize browser instance
		driver = self.create_chromedriver()

		# browser hasn't started and error already printed to cli
		if driver == None:
			return({
				'success': False,
				'result': 'Unable to launch Chrome instance'
			})

		# allow one minute before we kill it
		driver.set_page_load_timeout(60)

		# starts the page load process, quits driver and returns error if we fail
		try:
			driver.get(url)
		except Exception:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to load page'
			})

		# page has now been loaded, we process all the links
		all_links = set()
		
		# either there are no links or something has gone wrong
		try:
			links = driver.find_elements_by_tag_name('a')
		except Exception:
			driver.quit()
			return({
				'success': False,
				'result': 'Unable to extract links'
			})

		# process the links
		for link in links:
			# to cope with:
			# selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
			try:
				this_link = link.get_attribute('href')
				# BUG FIX: the original character class '[\\n|\\r|\\t]' also
				#	matched literal '|' characters and stripped them from
				#	link text; only whitespace should be collapsed
				this_link_text = re.sub(r'\s+', ' ', re.sub(r'[\n\r\t]',' ',link.get_attribute('text').strip()))
			except Exception:
				continue

			# sometimes can be empty
			if this_link is None: continue
			if len(this_link) == 0: continue

			# add in the absolute url from the link to our set
			all_links.add((this_link_text,self.utilities.get_absolute_url_from_page_link(url,this_link)))
		driver.quit()

		return({
			'success': True,
			'result': all_links
		})