Beispiel #1
0
    def __init__(self, db_name=None, db_engine=None, client_id=None):
        """
        Store the connection settings and, if a database is named, load its config.

        The class may be instantiated only to run store_results_from_queue,
        which connects to the server_config database to fetch results; no
        global db_name is needed for that, hence the None default.  When a
        db_name *is* supplied, the config is read from that database into
        self.config and every entry whose key contains 'client' is copied
        into self.browser_config.
        """
        self.client_id = client_id
        self.db_engine = db_engine
        self.db_name = db_name
        self.debug = True
        self.utilities = Utilities()

        # no database named: there is no global config to fetch
        if not db_name:
            return

        # open a connection with the driver matching the engine
        if self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        self.config = sql_driver.get_config()

        # the browser only needs the client-related settings
        self.browser_config = {
            key: self.config[key]
            for key in self.config
            if 'client' in key
        }

        sql_driver.close()
Beispiel #2
0
    def build_policy_task_queue(self,
                                flush_policy_task_queue=True,
                                timeseries_interval=10080):
        """
        Queue every policy url not yet scanned as a 'get_policy' task.

        Candidate urls come from sql_driver.get_policies_to_collect().  Any
        anchor (the first '#' and everything after it) is dropped, then urls
        that are empty, invalid, or already returned by
        sql_driver.get_scanned_policy_urls() are skipped.  The queue can be
        worked by the machine building it or by remote machines.

        Args:
            flush_policy_task_queue: when true, existing 'get_policy' tasks
                are flushed from the queue before new ones are added.
            timeseries_interval: kept for backward compatibility; this
                method does not read it.
        """

        # set up new db connection
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            # the engine lives on the instance; there is no local db_engine
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # the finally clause guarantees the connection is released even
        #	if one of the queries below raises
        try:
            # get rid of whatever is in there already
            if flush_policy_task_queue:
                sql_driver.flush_task_queue(task='get_policy')

            # a set gives constant-time "already did it" checks
            scanned_policies = {
                policy_url
                for policy_url, in sql_driver.get_scanned_policy_urls()
            }

            for policy_url, in sql_driver.get_policies_to_collect():
                # if page has an anchor, we drop everything after it;
                #	partition copes with a bare '#', a leading '#', and
                #	urls without any anchor alike
                policy_url = (policy_url or '').partition('#')[0]

                # nothing left to scan
                if not policy_url:
                    continue

                # skip invalid links
                if not self.utilities.is_url_valid(policy_url):
                    continue

                # already did it, skip
                if policy_url in scanned_policies:
                    continue

                sql_driver.add_task_to_queue(policy_url, 'get_policy')

            # fyi
            print('\t%s pages in task_queue for get_policy' %
                  sql_driver.get_task_queue_length(task='get_policy'))
        finally:
            # we no longer need this db connection
            sql_driver.close()
    def process_url(self, url):
        """
        Load a url in each configured browser and store the scan data.

        For every entry in self.browser_types a fresh browser driver is
        created, the page is loaded with a wait of self.browser_wait, and the
        browser output is handed to OutputStore.store, which puts the data in
        the db for later analysis.  Failures are printed and logged through
        the sql driver; a browser that fails to load the page ends processing
        of this url.

        Args:
            url: address of the page to load.

        Returns:
            None.
        """

        # set up sql connection used to log errors and do checks
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        else:
            # without this branch sql_driver would be unbound further down
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # the finally clause closes the connection on every exit path
        try:
            # output store does the heavy lifting of analyzing browser output and storing to db
            output_store = OutputStore(self.db_engine, self.db_name)

            # support for loading same page with multiple browsers - purposefully undocumented
            for browser_type in self.browser_types:

                # import and set up specified browser driver
                # 	note we need to set up a new browser each time to
                #	get a fresh profile
                if browser_type == 'chrome':
                    browser_driver = ChromeDriver(ua=self.chrome_ua)
                else:
                    # never fall through with a missing or stale driver
                    print('\t\t%-50s Browser %s Is Not Supported' %
                          (url[:50], browser_type))
                    sql_driver.log_error(
                        url, 'Unsupported browser type: %s' % browser_type)
                    return

                # attempt to load the page, fail gracefully
                try:
                    browser_output = browser_driver.get_webxray_scan_data(
                        url, self.browser_wait)
                except Exception:
                    print('\t\t%-50s Browser %s Did Not Return' %
                          (url[:50], browser_type))
                    sql_driver.log_error(url, 'Unable to load page')
                    return

                # if there was a problem we log the error
                if not browser_output['success']:
                    print('\t\t%-50s Browser %s Error: %s' %
                          (url[:50], browser_type, browser_output['result']))
                    sql_driver.log_error(url, 'Unable to load page')
                    return

                # no error, treat result as browser output
                browser_output = browser_output['result']

                # attempt to store the output
                if output_store.store(url, browser_output):
                    print('\t\t%-50s Success with %s' %
                          (url[:50], browser_type))
                else:
                    print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
                    sql_driver.log_error(url, 'Unable to load page')
        finally:
            sql_driver.close()
	def store(self, url, browser_output, store_source=False, store_1p=True):
		"""
		Store the page, cookie, and request data the browser collected for a url.

		The origin domain of the page is parsed from url; every cookie and
		request is marked third-party when its domain differs from it.  Each
		request also gets a coarse element type derived from its file
		extension.

		Args:
			url: address of the page that was loaded.
			browser_output: dict of browser results; the keys read here are
				'browser_type', 'browser_version', 'browser_wait', 'title',
				'meta_desc', 'final_url', 'load_time', 'cookies',
				'processed_requests' and, when store_source is set, 'source'.
			store_source: when true the page source is stored with the page.
			store_1p: when false, first-party cookies and requests are
				skipped to save disk space.

		Returns:
			True once all data was handed to the database driver, False when
			the domain of url could not be parsed.
		"""

		# open up a sql connection
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			# the engine lives on the instance; there is no local db_engine
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			exit()

		# lookup table from file extension to element type, built once
		#	rather than once per request; can be expanded
		element_types = {}
		for type_name, extensions in (
			('image', ('png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf')),
			('javascript', ('js', 'javascript')),
			('data_structured', ('json', 'jsonp', 'xml')),
			('style_sheet', ('css',)),
			('font', ('woff', 'ttf', 'otf')),
			('page_static', ('html', 'htm', 'shtml')),
			('page_dynamic', ('php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi')),
			('Shockwave Flash', ('swf', 'fla')),
		):
			for extension in extensions:
				element_types[extension] = type_name

		# requests starting with these schemes can't be parsed anyway
		unparseable_request = re.compile(r'^(data|about|chrome|blob).+')

		# the finally clause closes the connection on every exit path,
		#	including the early "could not parse" return
		try:
			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			origin_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

			# if we can't get page domain info we fail gracefully
			if origin_info is None:
				sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
				return False

			origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld = origin_info[:5]

			# sql_driver.add_domain both stores the new domain and returns its db row id
			# if it is already in db just return the existing id
			page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

			# if the final page is https (often after a redirect), mark it appropriately
			page_is_ssl = browser_output['final_url'].startswith('https')

			source = browser_output['source'] if store_source else None

			# add page
			page_id = sql_driver.add_page(
				browser_output['browser_type'],
				browser_output['browser_version'],
				browser_output['browser_wait'],
				browser_output['title'],
				browser_output['meta_desc'],
				url,
				browser_output['final_url'],
				page_is_ssl,
				source,
				browser_output['load_time'],
				page_domain_id
			)

			# store cookies
			for cookie in browser_output['cookies']:
				# note:
				#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
				cookie_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])

				# something went wrong, log and fail gracefully
				if cookie_info is None:
					sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
					continue

				cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld = cookie_info[:5]

				# mark third-party cookies
				is_3p_cookie = origin_domain != cookie_domain

				# this is a first party cookie, see if we want to store it
				if not is_3p_cookie and store_1p is False:
					continue

				# sql_driver.add_domain both stores the new domain and returns its id
				cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

				# name is required, so if it is missing we just continue
				if 'name' not in cookie:
					continue

				# the remaining fields are optional, missing ones become None
				sql_driver.add_cookie(
					page_id,
					cookie['name'], cookie.get('secure'), cookie.get('path'), cookie_domain,
					cookie.get('httponly'), cookie.get('expiry'), cookie.get('value'),
					is_3p_cookie, cookie_domain_id
				)

			# process requests now
			for request in browser_output['processed_requests']:
				# if the request can't be parsed anyway, skip it
				if unparseable_request.match(request):
					continue

				# get the ip, fqdn, domain, pubsuffix, and tld
				# we need the domain to figure out if cookies/elements are third-party
				element_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

				# problem with this request, log and fail gracefully
				if element_info is None:
					sql_driver.log_error(url, 'Error parsing element request: '+request)
					continue

				element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld = element_info[:5]

				# sql_driver.add_domain both stores the new domain and returns its db row id
				element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

				# mark third-party elements based on domain
				is_3p_element = origin_domain != element_domain

				# if we are not storing 1p elements continue
				if not is_3p_element and store_1p is False:
					continue

				element_is_ssl = request.startswith('https') or request.startswith('wss')

				# per-request details; every field is optional and a missing
				#	one is stored as None
				details = browser_output['processed_requests'][request]
				if not isinstance(details, dict):
					details = {}

				# get domain of referer and determine if page leaked by referer
				referer = details.get('referer')
				page_domain_in_referer = None
				if referer:
					referer_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)
					if referer_info:
						page_domain_in_referer = referer_info[2] == origin_domain
					else:
						sql_driver.log_error(url, 'Error parsing referer header: '+referer)

				# headers are stored in their string form
				request_headers = str(details['request_headers']) if 'request_headers' in details else None
				response_headers = str(details['response_headers']) if 'response_headers' in details else None

				# consider anything before the first "?" to be the element_url
				url_match = re.search(r'^(.+?)\?.+$', request)
				element_url = url_match.group(1) if url_match else request

				# consider anything after the "?" to be the args
				# NOTE(review): the greedy prefix makes this start at the
				#	*last* "?" while element_url stops at the first one -
				#	confirm whether that is intended
				args_match = re.search(r'^.+(\?.+)$', request)
				element_args = args_match.group(1) if args_match else None

				# attempt to parse off the extension
				extension_match = re.search(r'\.([0-9A-Za-z]+)$', element_url)
				element_extension = extension_match.group(1).lower() if extension_match else None

				# figure out what type of element it is; unknown or missing
				#	extensions yield None
				element_type = element_types.get(element_extension)

				# final task is to truncate the request if it is
				#	over 2k characters as it is likely
				#	binary data and may cause problems inserting
				#	into TEXT fields in database
				#
				#  TODO:
				#	better handle binary data in general
				request = request[:2000]
				element_url = element_url[:2000]

				# store request
				sql_driver.add_element(
					page_id,
					request, element_url,
					is_3p_element, element_is_ssl,
					details.get('received'),
					referer,
					page_domain_in_referer,
					details.get('start_time_offset'),
					details.get('load_time'),
					details.get('status'),
					details.get('status_text'),
					details.get('content_type'),
					details.get('body_size'),
					request_headers,
					response_headers,
					element_extension,
					element_type,
					element_args,
					element_domain_id
				)

			return True
		finally:
			# close db connection
			sql_driver.close()
Beispiel #5
0
    def run(self, task='process_tasks_from_queue', pool_size=None):
        """
        Run the given queue-processing task in parallel with a process pool.

        For 'process_tasks_from_queue' the current task queue length is
        fetched and printed first.  A multiprocessing Pool is then created
        and one worker call is mapped per pool slot, each receiving its slot
        number as process_num.

        Args:
            task: 'process_tasks_from_queue' or 'store_results_from_queue';
                any other value creates the pool but maps nothing.
            pool_size: number of worker processes; None means
                multiprocessing.cpu_count().
        """

        if task == 'process_tasks_from_queue':
            # set up sql connection to get queue_length
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                # the engine lives on the instance; there is no local db_engine
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()

            try:
                queue_length = sql_driver.get_task_queue_length()
            finally:
                sql_driver.close()
            del sql_driver

            print('\t----------------------------------')
            print('\t%s addresses will now be webXray\'d' % queue_length)
            print('\t\t...you can go take a walk. ;-)')
            print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	this is essentially voodoo to ward off evil spirits which
        #	appear when large pool sizes are used on macOS
        # get_start_method must be set to 'allow_none', otherwise upon
        #	checking the method it gets set (!) - and if we then get/set again
        #	we get an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')

        if pool_size is None:
            pool_size = multiprocessing.cpu_count()

        # map requires we pass an argument to the function
        #	(even though we don't need to), so we create
        #	a list equal to pool_size which will
        #	spawn the desired number of processes
        process_num = list(range(pool_size))

        # map blocks until all workers are done, so the pool can be
        #	released as soon as the with-block ends
        with multiprocessing.Pool(pool_size) as myPool:
            if task == 'process_tasks_from_queue':
                myPool.map(self.process_tasks_from_queue, process_num)
            elif task == 'store_results_from_queue':
                myPool.map(self.store_results_from_queue, process_num)
Beispiel #6
0
    def process_tasks_from_queue(self, process_num):
        """
        Work through the task queue until no eligible task is left.

        Repeatedly takes the next task from the task_queue, runs it in a
        fresh browser, and passes a successful result to self.store_result.
        When the browser fails, the task is either marked as failed (for
        errors not worth retrying) or unlocked so it can be retried, and the
        error is logged.  Returns once no unlocked task under
        self.config['max_attempts'] remains.

        Args:
            process_num: number of this worker; used in log output and as
                the port offset of the browser driver.
        """

        print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

        # need a local connection for each queue manager
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            # the engine lives on the instance; there is no local db_engine
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # for times we don't want to retry, such as a rejected
        #	redirect or network resolution failure, this could be expanded
        fail_cases = [
            'reached fail limit', 'rejecting redirect',
            'did not find enough internal links'
        ]

        # the finally clause closes the connection on every exit path
        try:
            # keep getting tasks from queue until none are left at max attempt level
            while sql_driver.get_task_queue_length(
                    max_attempts=self.config['max_attempts'],
                    unlocked_only=True) != 0:
                # it is possible for two processes to both pass the above conditional
                #	and then try to get a task from the queue at the same time.
                #	however, the second process that attempts to get a task will
                #	get an empty result (and crash), so we have a try/except block here
                #	to handle that condition gracefully
                try:
                    target, task = sql_driver.get_task_from_queue(
                        max_attempts=self.config['max_attempts'],
                        client_id=self.client_id)
                except Exception:
                    break

                print('\t[p.%s]\t👉 Initializing: %s for target %s' %
                      (process_num, task, target[:50]))

                # import and set up specified browser driver
                # 	note we set up a new browser each time to
                #	get a fresh profile
                if self.browser_config['client_browser_type'] == 'chrome':
                    browser_driver = ChromeDriver(self.browser_config,
                                                  port_offset=process_num)
                else:
                    print(
                        f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!"
                    )
                    # give the task back so it does not stay locked
                    sql_driver.unlock_task_in_queue(target, task)
                    return

                # does the webxray scan or policy capture
                if task == 'get_scan':
                    task_result = browser_driver.get_scan(target)
                elif task == 'get_crawl':
                    task_result = browser_driver.get_crawl(json.loads(target))
                elif task == 'get_policy':
                    task_result = browser_driver.get_scan(target,
                                                          get_text_only=True)
                elif task == 'get_random_crawl':
                    task_result = browser_driver.get_random_crawl(target)
                else:
                    # a task type we cannot run will never succeed, so it is
                    #	failed outright instead of reusing a stale result
                    task_result = {
                        'success': False,
                        'result': 'reached fail limit'
                    }

                # kill browser
                del browser_driver

                # browser has failed to get result, unlock and continue
                if not task_result['success']:
                    print('\t[p.%s]\t👎 Error: %s %s' %
                          (process_num, target[:50], task_result['result']))

                    if (task_result['result'] in fail_cases
                            or 'ERR_NAME_NOT_RESOLVED' in task_result['result']):
                        sql_driver.set_task_as_failed(target, task)
                    else:
                        sql_driver.unlock_task_in_queue(target, task)

                    # keep track of error regardless of fail/unlock
                    sql_driver.log_error({
                        'client_id': 'localhost',
                        'target': target,
                        'task': task,
                        'msg': task_result['result']
                    })
                    continue

                # debug
                if self.debug:
                    print(
                        '\t[p.%s]\t📥 Got browser result on task %s, going to store: %s'
                        % (process_num, task, target[:50]))

                # store_result also handles task queue management
                store_result = self.store_result({
                    'target': target,
                    'task': task,
                    'task_result': task_result['result'],
                    'client_id': self.client_id
                })

                if store_result['success']:
                    print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
                else:
                    print(
                        f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}'
                    )
        finally:
            # tidy up
            sql_driver.close()
            del sql_driver

        print('\t[p.%s]\t✋ Completed process' % process_num)
        return
Beispiel #7
0
    def build_scan_task_queue(self, params):
        """
        Read a page list file and queue every new, valid url as a task.

        The file is looked up in the page_lists directory next to this
        package.  Lines whose first character is '#' are comments.  Urls that
        are invalid or already present in the db (within the timeseries
        interval, when timeseries is enabled in self.config) are skipped; all
        others are added to the task queue, to be scanned by this machine or
        by remote machines.

        Args:
            params: dict with the keys
                'pages_file_name': name of the file inside page_lists,
                'flush_scan_task_queue': when true, existing tasks of this
                    type are flushed from the queue first,
                'task': task type to queue the urls under.
        """

        # these vars are specific to this function
        pages_file_name = params['pages_file_name']
        flush_scan_task_queue = params['flush_scan_task_queue']
        task = params['task']

        # set up sql connection used to determine if items are already in the db
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            # the engine lives on the instance; there is no local db_engine
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # the finally clause closes the connection on every exit path,
        #	including the exit() taken when the file cannot be opened
        try:
            # open list of pages
            try:
                url_list = open(os.path.dirname(os.path.abspath(__file__)) +
                                '/../page_lists/' + pages_file_name,
                                'r',
                                encoding='utf-8')
            except (OSError, TypeError):
                print(
                    'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                    % pages_file_name)
                exit()

            # the with-block makes sure the file handle is released
            with url_list:
                # get list of pages already scanned
                print('\tFetching list of pages already scanned...')
                if self.config['timeseries_enabled']:
                    scanned_rows = sql_driver.get_all_pages_exist(
                        timeseries_interval=self.config['timeseries_interval'])
                else:
                    scanned_rows = sql_driver.get_all_pages_exist()
                scanned_urls = [url for url, in scanned_rows]
                print(f'\t => {len(scanned_urls)} pages already scanned')

                # a set gives constant-time membership checks in the loop
                already_scanned = set(scanned_urls)

                # get rid of whatever is in there already
                if flush_scan_task_queue:
                    sql_driver.flush_task_queue(task=task)

                # simple counter used solely for updates to CLI
                count = 0

                print('\t---------------------')
                print('\t Building Page Queue ')
                print('\t---------------------')

                for url in url_list:
                    # skip lines that are comments
                    if "#" in url[0]:
                        continue

                    count += 1

                    # make sure url is valid
                    if self.utilities.is_url_valid(url) == False:
                        print(f'\t\t{count} | {url} is invalid')
                        continue

                    # perform idna fix
                    url = self.utilities.idna_encode_url(url)

                    # if we are allowing time series we see if page has been scanned in the
                    #	specified interval, otherwise if we are *not* allowing a time series
                    #	we skip anything already in the db
                    if url in already_scanned:
                        if self.config['timeseries_enabled']:
                            print(
                                f'\t\t{count} | {url[:30]}... Scanned too recently.'
                            )
                        else:
                            print(
                                f'\t\t{count} | {url[:30]}... Exists in DB, skipping.'
                            )
                        continue

                    # add to the queue, duplicates will be
                    #	ignored
                    sql_driver.add_task_to_queue(url, task)
                    print(f'\t\t{count} | {url[:30]}... Adding to queue.')
        finally:
            # close the db connection
            sql_driver.close()
Beispiel #8
0
    def store_result(self, params):
        """
		Handles storing task_result and removing jobs
			from the task_queue.
		"""

        # unpack params
        target = params['target']
        task = params['task']
        task_result = params['task_result']
        client_id = params['client_id']

        # client_ip is optional
        if 'client_ip' in params:
            client_ip = params['client_ip']
        else:
            client_ip = None

        # if db_name is specified we are running in server mode and we
        #	connect to the db which corresponds to the result being
        #	processed.  otherwise, we use the global db_name as we are
        #	running in non-server mode.
        if 'db_name' in params:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(params['db_name'])
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(params['db_name'])
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()
            output_store = OutputStore(params['db_name'], self.db_engine)
        else:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            output_store = OutputStore(self.db_name, self.db_engine)

        if task == 'get_policy':
            store_result = output_store.store_policy(task_result,
                                                     client_id,
                                                     client_ip=client_ip)
            # we never retry policies
            sql_driver.remove_task_from_queue(target, task)
            if store_result['success']:
                result = {'success': True}
            else:
                # log error
                sql_driver.log_error({
                    'client_id':
                    client_id,
                    'task':
                    task,
                    'target':
                    target,
                    'msg':
                    'output_store fail on ' + store_result['result']
                })
                result = {'success': False, 'result': store_result['result']}
        # elif task == 'get_crawl' or task == 'get_random_crawl':
        else:
            all_crawls_ok = True

            # We want to be able to re-run random crawls, and to do so we make sure
            #	the crawl_id will match
            if task == 'get_crawl' or task == 'get_scan':
                crawl_id = target
            elif task == 'get_random_crawl':
                crawl_id = []
                for result in task_result:
                    crawl_id.append(result['start_url'])
                crawl_id = json.dumps(crawl_id)

            # tweak to account for differences between scans/crawls
            if task == 'get_scan': task_result = [task_result]

            # keep track of domains
            all_3p_cookie_domains = set()
            all_3p_dom_storage_domains = set()
            all_3p_request_domains = set()
            all_3p_response_domains = set()
            all_3p_websocket_domains = set()

            # When we store a crawl we add optional fields in the page table
            #	that allow us to connect the page loads into a single crawl.
            #	the crawl_id is a hash of the target (which is a json string
            #	derived from the url_list), and the crawl_timestamp which is the
            #	first accessed time from the crawl.
            for crawl_sequence, result in enumerate(task_result):
                store_result = output_store.store_scan({
                    'browser_output':
                    result,
                    'client_id':
                    client_id,
                    'crawl_id':
                    crawl_id,
                    'crawl_timestamp':
                    task_result[0]['accessed'],
                    'crawl_sequence':
                    crawl_sequence,
                    'client_ip':
                    client_ip
                })

                if store_result['success'] != True:
                    all_crawls_ok = False
                else:
                    # we are successful, create entries in page_lookup table
                    page_lookup_table = self.build_lookup_table(
                        'page', store_result['page_id'], {
                            'requests':
                            store_result['page_3p_request_domains'],
                            'responses':
                            store_result['page_3p_response_domains'],
                            'websockets':
                            store_result['page_3p_websocket_domains'],
                            'dom_storage':
                            store_result['page_3p_dom_storage_domains'],
                            'cookies':
                            store_result['page_3p_dom_storage_domains']
                        })

                    for lookup_item in page_lookup_table:
                        sql_driver.add_page_id_domain_lookup_item(
                            page_lookup_table[lookup_item])

                    # we are also making a lookup table for the crawl, keep joining the
                    #	sets as we go along
                    all_3p_request_domains.update(
                        store_result['page_3p_request_domains'])
                    all_3p_response_domains.update(
                        store_result['page_3p_response_domains'])
                    all_3p_websocket_domains.update(
                        store_result['page_3p_websocket_domains'])
                    all_3p_dom_storage_domains.update(
                        store_result['page_3p_dom_storage_domains'])
                    all_3p_cookie_domains.update(
                        store_result['page_3p_dom_storage_domains'])

            if all_crawls_ok:
                sql_driver.remove_task_from_queue(target, task)
                result = {'success': True}

                # build crawl lookup table
                crawl_lookup_table = self.build_lookup_table(
                    'crawl', crawl_id, {
                        'requests': all_3p_request_domains,
                        'responses': all_3p_response_domains,
                        'websockets': all_3p_websocket_domains,
                        'dom_storage': all_3p_dom_storage_domains,
                        'cookies': all_3p_cookie_domains
                    })

                # patch lookup table
                for lookup_item in crawl_lookup_table:
                    sql_driver.add_crawl_id_domain_lookup_item(
                        crawl_lookup_table[lookup_item])

            else:
                sql_driver.unlock_task_in_queue(target, task)
                # log error
                sql_driver.log_error({
                    'client_id':
                    client_id,
                    'task':
                    task,
                    'target':
                    target,
                    'msg':
                    'output_store fail to store all scans for crawl_id_target '
                    + target
                })
                result = {
                    'success': False,
                    'result': 'unable to store all crawl loads'
                }

        # tidy up
        output_store.close()
        sql_driver.close()

        # done
        return result
    def run(self, pool_size):
        """
        Build the de-duplicated list of pages to scan and process it in
        parallel.

        The page list is read from ``../page_lists/<self.pages_file_name>``
        (relative to this module).  Every line is filtered and normalised:

        * lines starting with ``#`` are comments and are ignored
        * lines not starting with ``http://`` or ``https://`` are skipped
        * the host part is converted to its idna (ascii) form; addresses
          whose host cannot be idna-encoded are skipped
        * addresses ending in a known document extension are skipped
        * addresses for which ``sql_driver.page_exists`` is true are skipped
        * duplicate addresses are queued only once

        The remaining addresses are mapped to ``self.process_url`` with a
        ``multiprocessing.Pool`` of ``pool_size`` workers, after which
        ``self.print_runtime()`` is called.

        pool_size: number of worker processes handed to multiprocessing.Pool

        Exits the process with status 1 when the page list cannot be read
        or when ``self.db_engine`` is not an engine this method can set up.
        """

        # the list of url MUST be in the page_lists directory!
        pages_file_path = (os.path.dirname(os.path.abspath(__file__)) +
                           '/../page_lists/' + self.pages_file_name)

        # read all lines up front so the file handle is closed right away;
        #	only OS-level failures (missing file, permissions) are treated
        #	as "file not available", anything else is a real bug and
        #	should propagate
        try:
            with open(pages_file_path, 'r', encoding='utf-8') as pages_file:
                url_lines = pages_file.readlines()
        except OSError:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % self.pages_file_name)
            sys.exit(1)

        # set up sql connection used to determine if items are already in the db
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        else:
            # without this branch sql_driver would be unbound and the first
            #	use below would die with a confusing NameError
            print('Unsupported db engine "%s", unable to check existing pages.  Exiting.'
                  % self.db_engine)
            sys.exit(1)

        # this set gets mapped to the Pool, very important!
        urls_to_process = set()

        # simple counter used solely for updates to CLI
        count = 0

        # compiled once, used for every line
        http_address = re.compile('^https?://.+')
        document_address = re.compile(
            '.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$')

        print('\t------------------------')
        print('\t Building List of Pages ')
        print('\t------------------------')

        try:
            for url in url_lines:
                # skip lines that are comments
                if url.startswith('#'):
                    continue

                count += 1

                # only do lines starting with https?://
                if not http_address.match(url):
                    print("\t\t%s | %-50s Not a valid address, Skipping." %
                          (count, url[:50]))
                    continue

                # non-ascii domains may cause issues, so the url is split
                #	apart, the domain is converted to idna, and everything
                #	is pasted back together
                split_url = urlsplit(url.strip())
                try:
                    idna_fixed_netloc = split_url.netloc.encode(
                        'idna').decode('utf-8')
                except UnicodeError:
                    # the idna codec rejects e.g. empty or over-long labels;
                    #	one bad line must not abort the whole run
                    print("\t\t%s | %-50s Not a valid address, Skipping." %
                          (count, url[:50]))
                    continue

                url = urlunsplit(
                    (split_url.scheme, idna_fixed_netloc, split_url.path,
                     split_url.query, split_url.fragment))

                # if it is a m$ office or other doc, skip
                if document_address.match(url):
                    print("\t\t%s | %-50s Not an HTML document, Skipping." %
                          (count, url[:50]))
                    continue

                # skip if in db already
                if sql_driver.page_exists(url):
                    print("\t\t%s | %-50s Exists in DB, Skipping." %
                          (count, url[:50]))
                    continue

                # only add if not in list already
                if url not in urls_to_process:
                    print("\t\t%s | %-50s Adding." % (count, url[:50]))
                    urls_to_process.add(url)
                else:
                    print("\t\t%s | %-50s Already queued, Skipping." %
                          (count, url[:50]))
        finally:
            # close the db connection, even if building the list failed
            sql_driver.close()

        print('\t----------------------------------')
        print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
        print('\t\tBrowser(s) are %s' % self.browser_types)
        print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
        print('\t\t...you can go take a walk. ;-)')
        print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	to avoid problems which appear when large pool sizes are used
        # get_start_method must be called with allow_none=True, otherwise
        #	checking the method also sets it (!) - and setting it again
        #	afterwards raises an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')

        # map() blocks until every url is processed; leaving the with-block
        #	then shuts the worker processes down instead of leaking them
        with multiprocessing.Pool(pool_size) as worker_pool:
            worker_pool.map(self.process_url, urls_to_process)

        # FYI
        self.print_runtime()