Example #1
    def __init__(self, db_name=None, db_engine=None):
        # if we have db params, set up a global db connection; otherwise we don't bother
        if db_name:
            if db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                self.sql_driver = SQLiteDriver(db_name)
            elif db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                self.sql_driver = PostgreSQLDriver(db_name)
            else:
                print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' %
                      db_engine)
                quit()
        elif db_engine:
            if db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                self.sql_driver = SQLiteDriver()
            elif db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                self.sql_driver = PostgreSQLDriver()
            else:
                print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' %
                      db_engine)
                quit()

        self.url_parser = ParseURL()
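
The sqlite/postgres dispatch in this constructor recurs in almost every example below. A minimal factory sketch that consolidates the pattern, assuming only that SQLiteDriver and PostgreSQLDriver accept an optional db_name as shown above (this helper is hypothetical and not part of webxray):

# hypothetical helper, not part of webxray: wraps the engine dispatch
# that recurs throughout these examples
def get_sql_driver(db_engine, db_name=None):
    if db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        return SQLiteDriver(db_name) if db_name else SQLiteDriver()
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        return PostgreSQLDriver(db_name) if db_name else PostgreSQLDriver()
    else:
        raise ValueError('INVALID DB ENGINE FOR %s' % db_engine)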
Example #2
    def __init__(self, db_name=None, db_engine=None, client_id=None):
        """
		This class can be called to run store_results_from_queue which connects
			to the server_config database to fetch results, in which case a global
			db_name isn't needed, so we have db_name=None to account for that.
			However, if we *do* have a db_name we set up a global config.
		"""
        self.db_name = db_name
        self.db_engine = db_engine
        self.client_id = client_id
        self.debug = True
        self.utilities = Utilities()

        # get global config for this db
        if db_name:
            # set up database connection
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            self.config = sql_driver.get_config()
            self.browser_config = {}

            for item in self.config:
                if 'client' in item:
                    self.browser_config[item] = self.config[item]

            sql_driver.close()
Example #3
    def __init__(self):
        """
		Set up our server configuration here.

		Note we store config details in server_config.json;
			because __init__ is run each time a worker
			processes a request, we can modify our
			config on the fly without having to
			restart the server
		"""

        # connect to server config db to get client_config
        self.server_sql_driver = PostgreSQLDriver('server_config')

        # important parts of config currently are to
        #	generate our whitelist of allowed ips
        #	and to map our clients to their respective
        #	databases
        self.whitelisted_ips = []
        self.client_id_to_db = {}
        for client in self.server_sql_driver.get_client_configs():
            if client['live']:
                if self.server_sql_driver.check_db_exist(client['mapped_db']):
                    self.whitelisted_ips.append(client['client_ip'])
                    self.client_id_to_db[
                        client['client_id']] = client['mapped_db']
                else:
                    print(
                        f"Database {client['mapped_db']} for client {client['client_id']} does not exist"
                    )
Example #4
	def __init__(self,db_name,db_engine, flush_domain_owners):

		# set up global db connection
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		# these get reused frequently, minimize db calls by doing them up here
		self.total_pages 	= self.sql_driver.get_complex_page_count()
		self.total_crawls 	= self.sql_driver.get_crawl_count()

		# pass utilities the database info
		self.utilities = Utilities(db_name,db_engine)

		# initialize the domain owner dict
		self.domain_owners = self.utilities.get_domain_owner_dict()

		# update domain owners
		if flush_domain_owners:
			self.patch_domain_owners()

		# load to memory for faster processing, make sure you
		#	have enough RAM!
		self.get_crawl_id_to_3p_domain_info()
Example #5
    def build_policy_task_queue(self,
                                flush_policy_task_queue=True,
                                timeseries_interval=10080):
        """
		Takes the policy urls we need to collect and puts them into a queue
			to be scanned either by the same machine building
			the queue, or remote machines.
		"""

        # set up new db connection
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # get rid of whatever is in there already
        if flush_policy_task_queue:
            sql_driver.flush_task_queue(task='get_policy')

        # get list of all policies we have
        scanned_policies = []
        for policy_url, in sql_driver.get_scanned_policy_urls():
            scanned_policies.append(policy_url)

        # run the query and add to list
        for policy_url, in sql_driver.get_policies_to_collect():
            # if page has an anchor, we drop everything after
            if policy_url[-1] == '#':
                policy_url = policy_url[:-1]
            elif '#' in policy_url:
                policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

            # skip invalid links
            if not self.utilities.is_url_valid(policy_url): continue

            # already did it, skip
            if policy_url in scanned_policies: continue

            sql_driver.add_task_to_queue(policy_url, 'get_policy')

        # fyi
        print('\t%s pages in task_queue for get_policy' %
              sql_driver.get_task_queue_length(task='get_policy'))

        # we no longer need this db connection
        sql_driver.close()
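
The fragment handling above strips anything after '#' with a regex; urllib.parse.urldefrag does the same job in the standard library. A short sketch of that alternative (not what build_policy_task_queue itself uses):

from urllib.parse import urldefrag

# urldefrag drops everything after '#', mirroring the regex above
policy_url, fragment = urldefrag('https://example.com/privacy#section-2')
print(policy_url)  # https://example.com/privacy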
Example #6
	def __init__(self, db_name, db_engine):
		self.db_name	= db_name
		self.utilities	= Utilities()
		self.url_parser = ParseURL()
		self.debug		= False
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(self.db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()
		self.config 	= self.sql_driver.get_config()
Example #7
    def process_tasks_from_queue(self, process_num):
        """
		Selects the next page from the task_queue and passes it to
			process_url.  If the load is unsuccessful, places the page
			back into the queue and updates attempts.  Returns once
			there are no pages in the queue under max_attempts.
		"""

        print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

        # need a local connection for each queue manager
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # keep getting tasks from queue until none are left at max attempt level
        while sql_driver.get_task_queue_length(
                max_attempts=self.config['max_attempts'],
                unlocked_only=True) != 0:
            # it is possible for two processes to both pass the above conditional
            #	and then try to get a task from the queue at the same time.
            #	however, the second process that attempts to get a task will
            #	get an empty result (and crash), so we have a try/except block here
            #	to handle that condition gracefully
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=self.config['max_attempts'],
                    client_id=self.client_id)
            except:
                break

            print('\t[p.%s]\t👉 Initializing: %s for target %s' %
                  (process_num, task, target[:50]))

            # import and set up specified browser driver
            # 	note we set up a new browser each time to
            #	get a fresh profile
            if self.browser_config['client_browser_type'] == 'chrome':
                browser_driver = ChromeDriver(self.browser_config,
                                              port_offset=process_num)
            else:
                print(
                    f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!"
                )
                return

            # does the webxray scan or policy capture
            if task == 'get_scan':
                task_result = browser_driver.get_scan(target)
            elif task == 'get_crawl':
                task_result = browser_driver.get_crawl(json.loads(target))
            elif task == 'get_policy':
                task_result = browser_driver.get_scan(target,
                                                      get_text_only=True)
            elif task == 'get_random_crawl':
                task_result = browser_driver.get_random_crawl(target)

            # kill browser
            del browser_driver

            # browser has failed to get result, unlock and continue
            if task_result['success'] == False:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], task_result['result']))

                # for times we don't want to retry, such as a rejected
                #	redirect or network resolution failure, this could be expanded
                fail_cases = [
                    'reached fail limit', 'rejecting redirect',
                    'did not find enough internal links'
                ]

                if (task_result['result'] in fail_cases
                        or 'ERR_NAME_NOT_RESOLVED' in task_result['result']):
                    sql_driver.set_task_as_failed(target, task)
                else:
                    sql_driver.unlock_task_in_queue(target, task)

                # keep track of error regardless of fail/unlock
                sql_driver.log_error({
                    'client_id': 'localhost',
                    'target': target,
                    'task': task,
                    'msg': task_result['result']
                })
                continue

            # debug
            if self.debug:
                print(
                    '\t[p.%s]\t📥 Got browser result on task %s, going to store: %s'
                    % (process_num, task, target[:50]))

            # store_result also handles task queue management
            store_result = self.store_result({
                'target': target,
                'task': task,
                'task_result': task_result['result'],
                'client_id': self.client_id
            })

            if store_result['success'] == True:
                print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
            else:
                print(
                    f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}'
                )

        # tidy up
        sql_driver.close()
        del sql_driver

        print('\t[p.%s]\t✋ Completed process' % process_num)
        return
Example #8
    def build_scan_task_queue(self, params):
        """
		Takes a given list of pages and puts them into a queue
			to be scanned either by the same machine building 
			the queue, or remote machines.
		"""

        # these vars are specific to this function
        pages_file_name = params['pages_file_name']
        flush_scan_task_queue = params['flush_scan_task_queue']
        task = params['task']

        # set up sql connection used to determine if items are already in the db
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()

        # open list of pages
        try:
            url_list = open(os.path.dirname(os.path.abspath(__file__)) +
                            '/../page_lists/' + pages_file_name,
                            'r',
                            encoding='utf-8')
        except:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % pages_file_name)
            sql_driver.close()
            exit()

        # get list of pages already scanned
        already_scanned = []
        print('\tFetching list of pages already scanned...')
        if self.config['timeseries_enabled']:
            for url, in sql_driver.get_all_pages_exist(
                    timeseries_interval=self.config['timeseries_interval']):
                already_scanned.append(url)
        else:
            for url, in sql_driver.get_all_pages_exist():
                already_scanned.append(url)
        print(f'\t => {len(already_scanned)} pages already scanned')

        # get rid of whatever is in there already
        if flush_scan_task_queue:
            sql_driver.flush_task_queue(task=task)

        # simple counter used solely for updates to CLI
        count = 0

        print('\t---------------------')
        print('\t Building Page Queue ')
        print('\t---------------------')

        for url in url_list:
            # skip lines that are comments
            if "#" in url[0]: continue

            count += 1

            # make sure url is valid
            if self.utilities.is_url_valid(url) == False:
                print(f'\t\t{count} | {url} is invalid')
                continue

            # perform idna fix
            url = self.utilities.idna_encode_url(url)

            # if we are allowing time series we see if page has been scanned in the
            #	specified interval, otherwise if we are *not* allowing a time series
            #	we skip anything already in the db
            if url in already_scanned and self.config['timeseries_enabled']:
                print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
                continue

            elif url in already_scanned:
                print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
                continue

            # add to the queue, duplicates will be
            #	ignored
            sql_driver.add_task_to_queue(url, task)
            print(f'\t\t{count} | {url[:30]}... Adding to queue.')

        # close the db connection
        sql_driver.close()
Example #9
    def build_crawl_task_queue(self, params):
        """
		Enter crawl tasks to the database after performing checks to 
			verify urls are valid.
		"""

        # these vars are specific to this function
        crawl_file_name = params['crawl_file_name']
        flush_crawl_task_queue = params['flush_crawl_task_queue']

        # only need this sql_driver to build the task list
        sql_driver = PostgreSQLDriver(self.db_name)

        # open list of pages
        try:
            crawl_list = json.load(
                open(os.path.dirname(os.path.abspath(__file__)) +
                     '/../crawl_lists/' + crawl_file_name,
                     'r',
                     encoding='utf-8'))
        except:
            print(
                f'Could not open {crawl_file_name}, is it correctly formatted and present in the ./crawl_lists directory?  Exiting.'
            )
            sql_driver.close()
            exit()

        # get rid of whatever is in there already
        if flush_crawl_task_queue:
            sql_driver.flush_task_queue(task='get_crawl')

        for count, url_list in enumerate(crawl_list):
            # first make sure the urls are valid; if we
            #	encounter an invalid url we trash the
            #	entire list
            url_list_valid = True

            # we keep our fixed urls here
            idna_url_list = []

            # look at each url
            for url in url_list:
                if self.utilities.is_url_valid(url) == False:
                    print(
                        f'{url} is not valid from {url_list}, not entering crawl to queue'
                    )
                    url_list_valid = False
                    break

                # perform idna fix
                idna_url_list.append(self.utilities.idna_encode_url(url))

            # we need to put the continue here for the outer loop
            if url_list_valid == False: continue

            # if we are allowing time series we see if page has been scanned in the
            #	specified interval, otherwise if we are *not* allowing a time series
            #	we skip anything already in the db
            if self.config['timeseries_enabled']:
                if sql_driver.crawl_exists(
                        json.dumps(idna_url_list),
                        timeseries_interval=self.config['timeseries_interval']):
                    print(f'\t{count} | {url[:30]}... Scanned too recently.')
                    continue
            else:
                if sql_driver.crawl_exists(json.dumps(idna_url_list)):
                    print(f'\t{count} | {url[:30]}... Exists in DB, skipping.')
                    continue

            # we have a valid list, queue it up!
            if url_list_valid:
                sql_driver.add_task_to_queue(json.dumps(idna_url_list),
                                             'get_crawl')
            print(f'\t{count} | {str(idna_url_list)[:30]}... Adding to queue.')

        # done
        sql_driver.close()
Example #10
# 	db_engine can be 'mysql', 'postgres', or 'sqlite'
#	sqlite requires no configuration, but mysql and postgres
#		need user/pw set up in the relevant driver in the 
#		./webxray directory
db_engine = 'sqlite'

# set up database connection
if db_engine == 'mysql':
    from webxray.MySQLDriver import MySQLDriver
    sql_driver = MySQLDriver()
elif db_engine == 'sqlite':
    from webxray.SQLiteDriver import SQLiteDriver
    sql_driver = SQLiteDriver()
elif db_engine == 'postgres':
    from webxray.PostgreSQLDriver import PostgreSQLDriver
    sql_driver = PostgreSQLDriver()
else:
    print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
    quit()

####################
# HELPER FUNCTIONS #
####################

def select_wbxr_db():
    """
    databases are stored with a prefix (default 'wbxr_'); this function helps select a database in interactive mode
    """

    # you can optionally specify a different prefix here by setting "db_prefix = '[PREFIX]'"
    wbxr_dbs = sql_driver.get_wbxr_dbs_list()
Example #11
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in the browser (currently phantomjs)
			and returns json-formatted output with relevant request data, etc.

		the output_store class then puts this data in the db for later analysis
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# output store does the heavy lifting of analyzing browser output and storing to db
		output_store = OutputStore(self.db_engine, self.db_name)

		# support for loading same page with multiple browsers - purposefully undocumented 
		for browser_type in self.browser_types:

			# import and set up specified browser driver
			# 	note we need to set up a new browser each time to 
			#	get a fresh profile
			if browser_type == 'phantomjs':
				browser_driver 	= PhantomDriver()
			elif browser_type == 'chrome':
				browser_driver 	= ChromeDriver(ua=self.chrome_ua)

			# support for timeseries collections - purposefully undocumented 
			if self.allow_timeseries:
				page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
				if page_last_accessed_browser_type:
					time_diff = datetime.now()-page_last_accessed_browser_type[0]
					if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
						print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
						continue

			# attempt to load the page, fail gracefully
			try:
				browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
			except:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return		
			
			# if there was a problem browser_output will be None
			if browser_output == None:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return

			# attempt to store the output
			if output_store.store(url, browser_output):
				print('\t\t%-50s Success with %s' % (url[:50],browser_type))
			else:
				print('\t\t%-50s Fail with %s' % (url[:50],browser_type))
				sql_driver.log_error(url, 'Unable to load page')

		sql_driver.close()
		return
Example #12
	def run(self, pool_size):
		"""
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory and cleans it
			for known issues (eg common binary files) and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there
		"""

		# the list of urls MUST be in the page_lists directory!
		try:
			url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r')
		except:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()

		# set up sql connection used to determine if items are already in the db
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# this list gets mapped to the Pool, very important!
		urls_to_process = set()

		# simple counter used solely for updates to CLI
		count = 0
		
		print('\t------------------------')
		print('\t Building List of Pages ')
		print('\t------------------------')
				
		for url in url_list:
			# skip lines that are comments
			if "#" in url[0]: continue
		
			count += 1
		
			# only do lines starting with https?://
			if not (re.match('^https?://.+', url)):
				print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
				continue

			# non-ascii domains will crash phantomjs, so we need to convert them to 
			# 	idna/ascii/utf-8
			# this requires splitting apart the url, converting the domain to idna,
			#	and pasting it all back together
			
			split_url = urlsplit(url.strip())
			idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
			url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment))

			# if it is a m$ office or other doc, skip
			if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
				print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
				continue

			# skip if in db already unless we are doing a timeseries
			if self.allow_timeseries == False:
				if sql_driver.page_exists(url):
					print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
					continue
	
			# only add if not in list already
			if url not in urls_to_process:
				print("\t\t%s | %-50s Adding." % (count, url[:50]))
				urls_to_process.add(url)
			else:
				print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

		# close the db connection
		sql_driver.close()

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(urls_to_process))
		print('\t\tBrowser(s) are %s' % self.browser_types)
		print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		# for macOS (darwin) we must specify start method as 'forkserver'
		#	this is essentially voodoo to ward off evil spirits which 
		#	appear when large pool sizes are used on macOS
		# get_start_method must be set to 'allow_none', otherwise upon
		#	checking the method it gets set (!) - and if we then get/set again
		#	we get an error
		if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
			multiprocessing.set_start_method('forkserver')
		myPool = multiprocessing.Pool(pool_size)
		myPool.map(self.process_url, urls_to_process)

		# FYI
		self.print_runtime()
Example #13
    def __init__(self,
                 db_name,
                 db_engine,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_domain_owners=True,
                 start_date=False,
                 end_date=False):
        """
		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- flushes the existing domain_owner mappings (this can be disabled)
			- if we want to do per-tld reports, figures out the most common
			- if we want to filter against a given tracker threshold, sets it 
				up here (see documentation below for tracker threshold)
		"""

        # set various global vars
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold

        # pass utilities the database info
        self.utilities = Utilities(db_name, db_engine)

        # set up the analyzer we will be using throughout
        self.analyzer = Analyzer(db_name, db_engine)

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection
        if db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        # creates a new directory if it doesn't exist already
        self.report_path = self.utilities.setup_report_dir(self.db_name)

        # this is used in various places to get owner information
        self.domain_owners = self.utilities.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        #	them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.analyzer.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld in self.top_tlds:
                if tld: print('\t\t |- %s' % tld)
        else:
            self.top_tlds = [None]
Example #14
    def store(self,
              url,
              browser_output,
              store_source=False,
              store_1p=True,
              get_file_hashes=False,
              hash_3p_only=False):
        """
		this is the primary function of this class,
		
		it takes the url of the given page and the request and cookie data generated
			by the browser

		data is cleaned up with some minor analysis (eg file types) and stored 
			for later in-depth analysis.
		
		there is an option to store first party requests as well as third, turned on by default;
			to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
			and is turned off by default
		"""

        # open up a sql connection
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            exit()

        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
            url)

        # if we can't get page domain info we fail gracefully
        if origin_ip_fqdn_domain_pubsuffix_tld is None:
            sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
            return False

        origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
        origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
        origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
        origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
        origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

        # sql_driver.add_domain both stores the new domain and returns its db row id
        # if it is already in db just return the existing id
        page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn,
                                               origin_domain, origin_pubsuffix,
                                               origin_tld)

        # figure out the privacy policy url and text, starts null
        priv_policy_url = None
        priv_policy_url_text = None

        # read in our list of privacy link terms from the json file in webxray/resources/policyxray
        privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

        # we reverse links returned from the browser to check footer links first, as that is where policy links tend to be
        all_links = browser_output['all_links']
        all_links.reverse()

        # if we have links search for privacy policy
        if len(all_links) > 0:
            # links are tuple
            for link_text, link_url in all_links:
                # makes sure we have text, skip links without
                if link_text:
                    # need lower for string matching
                    link_text = link_text.lower().strip()
                    # not a link we can use
                    if 'javascript' in link_text: continue
                    # see if the link_text is in our term list
                    if link_text in privacy_policy_term_list:
                        # if the link_url is relative this will convert to absolute
                        priv_policy_url = self.utilities.get_absolute_url_from_page_link(
                            url, link_url)
                        priv_policy_url_text = link_text
                        break

        # if the final page is https (often after a redirect), mark it appropriately
        if browser_output['final_url'][:5] == 'https':
            page_is_ssl = True
        else:
            page_is_ssl = False

        if store_source:
            # handles issue where postgres will crash on inserting null character
            source = browser_output['source'].replace('\x00', ' ')
        else:
            source = None

        # add page
        page_id = sql_driver.add_page(
            browser_output['browser_type'], browser_output['browser_version'],
            browser_output['browser_wait'], browser_output['title'],
            browser_output['meta_desc'], url, browser_output['final_url'],
            priv_policy_url, priv_policy_url_text, page_is_ssl, source,
            browser_output['load_time'], page_domain_id)

        # store cookies
        for cookie in browser_output['cookies']:
            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            # note:
            #	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
            cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                'http://' + cookie['domain'])

            # something went wrong, log and fail gracefully
            if cookie_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url,
                    'Error parsing cookie with domain: ' + cookie['domain'])
                continue

            # otherwise, everything went fine
            cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
            cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
            cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
            cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
            cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

            # mark third-party cookies
            if origin_domain != cookie_domain:
                is_3p_cookie = True
            else:
                is_3p_cookie = False

            # this is a first party cookie, see if we want to store it
            if is_3p_cookie is False and store_1p is False:
                continue

            # sql_driver.add_domain both stores the new domain and returns its id
            cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn,
                                                     cookie_domain,
                                                     cookie_pubsuffix,
                                                     cookie_tld)

            # name and domain are required, so if they fail we just continue
            try:
                name = cookie['name']
            except:
                continue

            try:
                domain = cookie_domain
            except:
                continue

            # these are optional, fill with null values if fail
            try:
                secure = cookie['secure']
            except:
                secure = None

            try:
                path = cookie['path']
            except:
                path = None

            try:
                httponly = cookie['httponly']
            except:
                httponly = None

            try:
                expiry = cookie['expiry']
            except:
                expiry = None

            try:
                value = cookie['value']
            except:
                value = None

            # all done with this cookie
            sql_driver.add_cookie(page_id, name, secure, path, domain,
                                  httponly, expiry, value, is_3p_cookie,
                                  cookie_domain_id)

        # process requests now
        for request in browser_output['processed_requests']:
            # if the request starts with the following we can't parse anyway, so skip
            if re.match('^(data|about|chrome|blob).+', request):
                continue

            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                request)

            # problem with this request, log and fail gracefully
            if element_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url, 'Error parsing element request: ' + request)
                continue

            element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
            element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
            element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
            element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
            element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

            # sql_driver.add_domain both stores the new domain and returns its db row id
            element_domain_id = sql_driver.add_domain(element_ip, element_fqdn,
                                                      element_domain,
                                                      element_pubsuffix,
                                                      element_tld)

            # mark third-party elements based on domain
            if origin_domain != element_domain:
                is_3p_element = True
            else:
                is_3p_element = False

            # if we are not storing 1p elements continue
            if is_3p_element is False and store_1p is False:
                continue

            if request[:5] == 'https':
                element_is_ssl = True
            else:
                element_is_ssl = False

            try:
                received = browser_output['processed_requests'][request][
                    'received']
            except:
                received = None

            # get domain of referer and determine if page leaked by referer
            try:
                referer = browser_output['processed_requests'][request][
                    'referer']
            except:
                referer = None

            if referer and len(referer) != 0:
                referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                    referer)

                if referer_ip_fqdn_domain_pubsuffix_tld:
                    if referer_ip_fqdn_domain_pubsuffix_tld[
                            2] == origin_domain:
                        page_domain_in_referer = True
                    else:
                        page_domain_in_referer = False
                else:
                    page_domain_in_referer = None
                    sql_driver.log_error(
                        url, 'Error parsing referer header: ' + referer)
            else:
                page_domain_in_referer = None

            try:
                start_time_offset = browser_output['processed_requests'][
                    request]['start_time_offset']
            except:
                start_time_offset = None

            try:
                load_time = browser_output['processed_requests'][request][
                    'load_time']
            except:
                load_time = None

            try:
                status = browser_output['processed_requests'][request][
                    'status']
            except:
                status = None

            try:
                status_text = browser_output['processed_requests'][request][
                    'status_text']
            except:
                status_text = None

            try:
                content_type = browser_output['processed_requests'][request][
                    'content_type']
            except:
                content_type = None

            try:
                body_size = browser_output['processed_requests'][request][
                    'body_size']
            except:
                body_size = None

            try:
                request_headers = str(browser_output['processed_requests']
                                      [request]['request_headers'])
            except:
                request_headers = None

            try:
                response_headers = str(browser_output['processed_requests']
                                       [request]['response_headers'])
            except:
                response_headers = None

            # consider anything before the "?" to be the element_url
            try:
                element_url = re.search(r'^(.+?)\?.+$', request).group(1)
            except:
                element_url = request

            # consider anything after the "?" to be the args
            try:
                element_args = re.search(r'^.+(\?.+)$',
                                         request).group(1)  # start url args
            except:
                element_args = None

            # attempt to parse off the extension
            try:
                element_extension = re.search(r'\.([0-9A-Za-z]+)$',
                                              element_url).group(1).lower()
            except:
                element_extension = None

            # lists of common extensions, can be expanded
            image_extensions = [
                'png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif',
                'tiff', 'webp', 'srf'
            ]
            script_extensions = ['js', 'javascript']
            data_extensions = ['json', 'jsonp', 'xml']
            font_extentions = ['woff', 'ttf', 'otf']
            static_extentions = ['html', 'htm', 'shtml']
            dynamic_extentions = [
                'php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi'
            ]

            # figure out what type of element it is
            if element_extension in image_extensions:
                element_type = 'image'
            elif element_extension in script_extensions:
                element_type = 'javascript'
            elif element_extension in data_extensions:
                element_type = 'data_structured'
            elif element_extension == 'css':
                element_type = 'style_sheet'
            elif element_extension in font_extentions:
                element_type = 'font'
            elif element_extension in static_extentions:
                element_type = 'page_static'
            elif element_extension in dynamic_extentions:
                element_type = 'page_dynamic'
            elif element_extension == 'swf' or element_extension == 'fla':
                element_type = 'Shockwave Flash'
            else:
                element_type = None

            # file hashing has non-trivial overhead and is off by default
            #
            # what this does is use the same ua/referer as the actual request
            # 	so we are just replaying the last one to get a similar response
            # 	note that we aren't sending the same cookies so that could be an issue
            # 	otherwise it is equivalent to a page refresh in theory

            # option to hash only 3p elements observed here
            if (get_file_hashes and hash_3p_only and is_3p_element) or (
                    get_file_hashes and hash_3p_only == False):
                replay_element_request = urllib.request.Request(
                    request,
                    headers={
                        'User-Agent': browser_output['processed_requests'][request]['user_agent'],
                        'Referer': referer,
                        'Accept': '*/*'
                    })
                try:
                    file_md5 = hashlib.md5(
                        urllib.request.urlopen(replay_element_request,
                                               timeout=10).read()).hexdigest()
                except:
                    file_md5 = None
            else:
                file_md5 = None

            # store request
            sql_driver.add_element(
                page_id, request, element_url, is_3p_element, element_is_ssl,
                received, referer, page_domain_in_referer, start_time_offset,
                load_time, status, status_text, content_type, body_size,
                request_headers, response_headers, file_md5, element_extension,
                element_type, element_args, element_domain_id)

        # close db connection
        sql_driver.close()

        return True
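
The long runs of try/except blocks above set optional cookie and request fields to None when a key is missing; dict.get() gives the same result more compactly. A small sketch under that assumption (the sample cookie dict is illustrative only):

# sketch: dict.get() returns None for missing keys, matching the
# try/except-to-None pattern used for optional cookie fields above
cookie = {'name': 'sid', 'domain': 'example.com', 'value': 'abc123'}

secure   = cookie.get('secure')
path     = cookie.get('path')
httponly = cookie.get('httponly')
expiry   = cookie.get('expiry')
value    = cookie.get('value')
print(secure, path, httponly, expiry, value)  # None None None None abc123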
Example #15
    def get_client_task(self, client_ip, client_id):
        """
		We determine what the client should be doing when it
			sends us a 'READY' message.  If we find a task
			in our queue we send it back, otherwise we send 'WAIT'
			and the client will contact us again.
		"""

        # connect to appropriate db for this client, if none found
        #	return wait command
        if client_id in self.client_id_to_db:
            sql_driver = PostgreSQLDriver(self.client_id_to_db[client_id])
        else:
            print(
                'client_id not in client_id_to_db list, returning wait command'
            )
            return {'task': 'wait'}

        # get config for this db
        config = sql_driver.get_config()

        # get client config
        client_config = {}
        for item in config:
            if 'client' in item:
                client_config[item] = config[item]

        # if we have items in task_queue we send them back, otherwise
        #	we sent a wait command
        if sql_driver.get_task_queue_length(
                max_attempts=config['max_attempts'], unlocked_only=True) != 0:
            # if this fails we wait
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=config['max_attempts'], client_id=client_id)
            except:
                print('✋ Returning command to wait.')
                return {'task': 'wait'}

            if task == 'get_scan':
                print(f'👉 Returning command to scan {target}')
                return {
                    'task': 'get_scan',
                    'target': target,
                    'client_config': client_config
                }
            elif task == 'get_crawl':
                print(f'👉 Returning command to crawl {target[:30]}...')
                return {
                    'task': 'get_crawl',
                    'target': json.loads(target),
                    'client_config': client_config
                }
            elif task == 'get_policy':
                print(f'👉 Returning command to get_policy {target}')
                return {
                    'task': 'get_policy',
                    'target': target,
                    'client_config': client_config
                }
            elif task == 'get_random_crawl':
                print(f'👉 Returning command to get_random_crawl {target}')
                return {
                    'task': 'get_random_crawl',
                    'target': target,
                    'client_config': client_config
                }
        else:
            print('✋ Returning command to wait.')
            return {'task': 'wait'}
        sql_driver.close()
        del sql_driver
Example #16
    def store_results_from_queue(self, process_num):
        """
		If we are using a result queue this function will process
			all pending results.
		"""

        # set up new db connection to the server
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        server_sql_driver = PostgreSQLDriver('server_config')

        # time to sleep when queue is empty
        wait_time = 5

        # loop continues indefinitely
        while True:
            result = server_sql_driver.get_result_from_queue()
            if not result:
                print(
                    f'\t[p.{process_num}]\t😴 Going to sleep for {wait_time} seconds to wait for more tasks.'
                )
                time.sleep(wait_time)
                continue

            # result is a dictionary object, unpack it
            result_id = result['result_id']
            client_id = result['client_id']
            client_ip = result['client_ip']
            mapped_db = result['mapped_db']
            target = result['target']
            task = result['task']

            # the task_result needs to be uncompressed
            task_result = json.loads(
                bz2.decompress(base64.urlsafe_b64decode(
                    result['task_result'])).decode('utf-8'))

            if self.debug:
                print(
                    f'\t[p.{process_num}]\t📥 Going to store result for {str(target)[:30]}'
                )

            # store_result also handles task queue management
            store_result = self.store_result({
                'target': target,
                'task': task,
                'task_result': task_result,
                'client_id': client_id,
                'client_ip': client_ip,
                'db_name': mapped_db
            })

            # we finished processing this result, remove it from result queue
            server_sql_driver.remove_result_from_queue(result_id)

            # FYI
            if store_result['success'] == True:
                print('\t[p.%s]\t👍 Success: %s' % (process_num, target[:50]))
            else:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], store_result['result']))

        # technically we never get here...
        server_sql_driver.close()
        return
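
store_results_from_queue unpacks task_result with urlsafe base64 and bz2; the matching encode step on the sending side would be the reverse chain. A round-trip sketch inferred from the decode above (not taken from the webxray client code):

import base64
import bz2
import json

# json -> utf-8 bytes -> bz2 compress -> urlsafe base64, the assumed
# counterpart to the decode in store_results_from_queue
task_result = {'success': True, 'result': {'final_url': 'https://example.com'}}
encoded = base64.urlsafe_b64encode(bz2.compress(json.dumps(task_result).encode('utf-8')))

# round-trip check mirroring the server-side decode
decoded = json.loads(bz2.decompress(base64.urlsafe_b64decode(encoded)).decode('utf-8'))
assert decoded == task_result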
Example #17
    def run(self, task='process_tasks_from_queue', pool_size=None):
        """
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory and cleans it
			for known issues (eg common binary files) and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there

		when running in slave mode the list is skipped and we go straight to scanning
		"""

        if task == 'process_tasks_from_queue':
            # set up sql connection to get queue_length
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()

            queue_length = sql_driver.get_task_queue_length()
            sql_driver.close()
            del sql_driver

            print('\t----------------------------------')
            print('\t%s addresses will now be webXray\'d' % queue_length)
            print('\t\t...you can go take a walk. ;-)')
            print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	this is essentially voodoo to ward off evil spirits which
        #	appear when large pool sizes are used on macOS
        # get_start_method must be set to 'allow_none', otherwise upon
        #	checking the method it gets set (!) - and if we then get/set again
        #	we get an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')
        myPool = multiprocessing.Pool(pool_size)

        # map requires we pass an argument to the function
        #	(even though we don't need to), so we create
        #	a list equal to pool_size which will
        #	spawn the desired number of processes
        process_num = []
        if pool_size == None:
            pool_size = multiprocessing.cpu_count()

        for i in range(0, pool_size):
            process_num.append(i)

        if task == 'process_tasks_from_queue':
            myPool.map(self.process_tasks_from_queue, process_num)
        elif task == 'store_results_from_queue':
            myPool.map(self.store_results_from_queue, process_num)
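
As the comment in run() notes, Pool.map() needs an iterable, so a list of length pool_size is passed purely to spawn that many workers. A self-contained sketch of the same trick (generic example, not webxray code):

import multiprocessing

def worker(process_num):
    # stand-in for process_tasks_from_queue(process_num)
    return process_num * process_num

if __name__ == '__main__':
    pool_size = 4
    # the iterable's only job is to spawn pool_size worker calls
    with multiprocessing.Pool(pool_size) as pool:
        print(pool.map(worker, range(pool_size)))  # [0, 1, 4, 9]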
Example #18
    def __init__(self,
                 db_engine,
                 db_name,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_owner_db=True):
        """
		This performs a few start-up tasks:
			- sets up some useful global variables
			- makes sure we have a directory to store the reports
			- flushes the existing domain_owner mappings (this can be disabled)
			- if we want to do per-tld reports, figures out the most common
			- if we want to filter against a given tracker threshold, sets it 
				up here (see documentation below for tracker threshold)
		"""

        # set various global vars
        self.db_engine = db_engine
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.top_tlds = []
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.start_time = datetime.now()

        # number of decimal places to round to in reports
        self.num_decimals = 2

        # set up global db connection
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            self.sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(self.db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            exit()

        # this is reused often, do it once to save time
        self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

        print('\t=============================')
        print('\t Checking Output Directories ')
        print('\t=============================')

        self.setup_report_dir()

        print('\t============================')
        print('\t Patching Domain Owner Data ')
        print('\t============================')

        if flush_owner_db:
            # update the domains to their owners in the db, can be overridden
            #	by changing flush_owner_db to false
            self.patch_domain_owners()
        else:
            print('\t\t\tSkipping')

        # this is used in various places to get owner information
        self.domain_owners = self.get_domain_owner_dict()

        # if we want to get sub-reports for the most frequent tlds we find
        #	them here
        if self.num_tlds:
            print('\t=====================')
            print('\t Getting top %s tlds' % self.num_tlds)
            print('\t=====================')
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for (tld, pages) in self.top_tlds:
                if tld: print('\t\t |- %s (%s)' % (tld, pages))
        else:
            # otherwise we push in a single empty entry
            self.top_tlds.append((None, self.get_pages_ok_count))

        # SPECIAL FEATURE FOR EXPERTS: tracker domain filter
        #
        # you can set a threshold of the number of sites a given 3p domain
        #	is connected to - domains connecting to many sites may correlate those visits
        #	so we call these 'tracker domains'
        #
        # the 'tracker_threshold' variable set above controls the filtering level
        #
        # on a large set of sites (e.g. >10k) this works well, but on small samples
        #  (e.g. <500) it is less reliable, as known tracker domains may only
        #  appear on a single site
        #
        # this is off by default and unless you understand what you are doing
        # 	don't use this...but because you are reading the source code for an otherwise
        #	undocumented feature you are probably competent to use it ;-)
        #
        # longer-term we may want to train off a bigger corpus to find tracker domains and
        #	have them prepackaged
        #
        # use at your own risk! (one possible filtering approach is sketched after this example)
        if tracker_threshold:
            print('\t===================================================')
            print('\t Getting tracker domains with threshold level of %s' %
                  self.tracker_threshold)
            print('\t===================================================')
            print('\t\tProcessing...', end='', flush=True)
            self.tracker_domains = self.get_tracker_domains(
                self.tracker_threshold)
            print('done!')
        else:
            # set to None so various downstream operations get skipped
            self.tracker_domains = None
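The tracker-threshold filter described in the comments above amounts to counting how many distinct monitored sites each third-party domain was seen on and keeping only the domains at or above the cutoff. The actual query lives in `get_tracker_domains`, which is not shown in this snippet; the following is only a rough sketch of one way such a filter could work, assuming a hypothetical iterable of `(page_domain, third_party_domain)` pairs pulled from the database:

from collections import defaultdict

def get_tracker_domains_sketch(domain_page_pairs, tracker_threshold):
    # count the distinct first-party pages each 3p domain appears on
    pages_per_domain = defaultdict(set)
    for page_domain, third_party_domain in domain_page_pairs:
        pages_per_domain[third_party_domain].add(page_domain)

    # a 'tracker domain' is any 3p domain seen on at least
    # tracker_threshold distinct pages
    return {
        domain
        for domain, pages in pages_per_domain.items()
        if len(pages) >= tracker_threshold
    }

# with a threshold of 2, only 'tracker.example' qualifies
pairs = [
    ('site-a.com', 'tracker.example'),
    ('site-b.com', 'tracker.example'),
    ('site-a.com', 'one-off.example'),
]
print(get_tracker_domains_sketch(pairs, 2))  # {'tracker.example'}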
Example #19
0
    def store_result(self, params):
        """
		Handles storing task_result and removing jobs
			from the task_queue.
		"""

        # unpack params
        target = params['target']
        task = params['task']
        task_result = params['task_result']
        client_id = params['client_id']

        # client_ip is optional
        if 'client_ip' in params:
            client_ip = params['client_ip']
        else:
            client_ip = None

        # if db_name is specified we are running in server mode and we
        #	connect to the db which corresponds to the result being
        #	processed.  otherwise, we use the global db_name as we are
        #	running in non-server mode.
        if 'db_name' in params:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(params['db_name'])
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(params['db_name'])
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()
            output_store = OutputStore(params['db_name'], self.db_engine)
        else:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
                quit()

            output_store = OutputStore(self.db_name, self.db_engine)

        if task == 'get_policy':
            store_result = output_store.store_policy(task_result,
                                                     client_id,
                                                     client_ip=client_ip)
            # we never retry policies
            sql_driver.remove_task_from_queue(target, task)
            if store_result['success']:
                result = {'success': True}
            else:
                # log error
                sql_driver.log_error({
                    'client_id': client_id,
                    'task': task,
                    'target': target,
                    'msg': 'output_store fail on ' + store_result['result']
                })
                result = {'success': False, 'result': store_result['result']}
        # this branch covers get_crawl, get_random_crawl, and get_scan tasks
        else:
            all_crawls_ok = True

            # We want to be able to re-run random crawls, and to do so we make sure
            #	the crawl_id will match
            if task == 'get_crawl' or task == 'get_scan':
                crawl_id = target
            elif task == 'get_random_crawl':
                crawl_id = []
                for single_result in task_result:
                    crawl_id.append(single_result['start_url'])
                crawl_id = json.dumps(crawl_id)

            # tweak to account for differences between scans/crawls
            if task == 'get_scan': task_result = [task_result]

            # keep track of domains
            all_3p_cookie_domains = set()
            all_3p_dom_storage_domains = set()
            all_3p_request_domains = set()
            all_3p_response_domains = set()
            all_3p_websocket_domains = set()

            # When we store a crawl we add optional fields in the page table
            #	that allow us to connect the page loads into a single crawl:
            #	the crawl_id, a hash of the target (which is a json string
            #	derived from the url_list), and the crawl_timestamp, which is
            #	the first accessed time from the crawl. (The shape of the
            #	lookup tables built below is sketched after this example.)
            for crawl_sequence, page_result in enumerate(task_result):
                store_result = output_store.store_scan({
                    'browser_output': page_result,
                    'client_id': client_id,
                    'crawl_id': crawl_id,
                    'crawl_timestamp': task_result[0]['accessed'],
                    'crawl_sequence': crawl_sequence,
                    'client_ip': client_ip
                })

                if store_result['success'] != True:
                    all_crawls_ok = False
                else:
                    # we are successful, create entries in page_lookup table
                    page_lookup_table = self.build_lookup_table(
                        'page', store_result['page_id'], {
                            'requests': store_result['page_3p_request_domains'],
                            'responses': store_result['page_3p_response_domains'],
                            'websockets': store_result['page_3p_websocket_domains'],
                            'dom_storage': store_result['page_3p_dom_storage_domains'],
                            'cookies': store_result['page_3p_cookie_domains']
                        })

                    for lookup_item in page_lookup_table:
                        sql_driver.add_page_id_domain_lookup_item(
                            page_lookup_table[lookup_item])

                    # we are also making a lookup table for the crawl, keep joining the
                    #	sets as we go along
                    all_3p_request_domains.update(
                        store_result['page_3p_request_domains'])
                    all_3p_response_domains.update(
                        store_result['page_3p_response_domains'])
                    all_3p_websocket_domains.update(
                        store_result['page_3p_websocket_domains'])
                    all_3p_dom_storage_domains.update(
                        store_result['page_3p_dom_storage_domains'])
                    all_3p_cookie_domains.update(
                        store_result['page_3p_cookie_domains'])

            if all_crawls_ok:
                sql_driver.remove_task_from_queue(target, task)
                result = {'success': True}

                # build crawl lookup table
                crawl_lookup_table = self.build_lookup_table(
                    'crawl', crawl_id, {
                        'requests': all_3p_request_domains,
                        'responses': all_3p_response_domains,
                        'websockets': all_3p_websocket_domains,
                        'dom_storage': all_3p_dom_storage_domains,
                        'cookies': all_3p_cookie_domains
                    })

                # patch lookup table
                for lookup_item in crawl_lookup_table:
                    sql_driver.add_crawl_id_domain_lookup_item(
                        crawl_lookup_table[lookup_item])

            else:
                sql_driver.unlock_task_in_queue(target, task)
                # log error
                sql_driver.log_error({
                    'client_id': client_id,
                    'task': task,
                    'target': target,
                    'msg': 'output_store fail to store all scans for crawl_id_target ' + target
                })
                result = {
                    'success': False,
                    'result': 'unable to store all crawl loads'
                }

        # tidy up
        output_store.close()
        sql_driver.close()

        # done
        return result
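`build_lookup_table` is referenced above but not included in this snippet. Its exact return shape is an assumption here; judging by how the loops hand each entry to `add_page_id_domain_lookup_item` / `add_crawl_id_domain_lookup_item`, it plausibly flattens the per-type domain sets into one record per (domain type, domain) pair, roughly like this sketch:

import json

def build_lookup_table_sketch(id_type, id_value, domains_by_type):
    # flatten {'requests': {...}, 'cookies': {...}, ...} into one record
    # per (domain type, domain) pair so callers can insert rows one by one
    lookup_table = {}
    for domain_type, domains in domains_by_type.items():
        for domain in domains:
            lookup_table[(domain_type, domain)] = {
                '%s_id' % id_type: id_value,
                'domain': domain,
                'domain_type': domain_type,
            }
    return lookup_table

# a crawl is identified by the json-encoded list of start urls
crawl_id = json.dumps(['https://site-a.com', 'https://site-b.com'])
table = build_lookup_table_sketch('crawl', crawl_id, {
    'requests': {'cdn.example', 'ads.example'},
    'cookies': {'ads.example'},
})
for item in table:
    print(table[item])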
Example #20
0
    def store_result(self, data):
        """
		We've gotten data from a client; attempt to store it.
		(An example payload is sketched after this function.)
		"""

        # unpack params
        client_id = data['client_id']
        client_ip = data['client_ip']
        success = data['success']
        task = data['task']
        task_result = data['task_result']

        # we only load the json string if it is
        #	not a crawl
        if task != 'get_crawl':
            target = json.loads(data['target'])
        else:
            target = data['target']

        # map the client to its database and create the db connection;
        #	reject clients we don't know about before doing the lookup
        if client_id in self.client_id_to_db:
            mapped_db = self.client_id_to_db[client_id]
            sql_driver = PostgreSQLDriver(mapped_db)
        else:
            return 'FAIL: client_id not in client_id_to_db list'

        # get config for this db
        config = sql_driver.get_config()

        # if we're not expecting this result we ignore it
        if not sql_driver.is_task_in_queue({'task': task, 'target': target}):
            return 'FAIL: task not in queue, ignoring'

        # if the browser failed we log the error and either mark the
        #	task as failed or unlock it for another attempt
        if success == False:
            print(f'👎 Error for {target}: {task_result}')

            # for times we don't want to retry, such as a rejected
            #	redirect or network resolution failure, this could be expanded
            fail_cases = [
                'reached fail limit', 'rejecting redirect',
                'did not find enough internal links'
            ]

            if task_result in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result:
                sql_driver.set_task_as_failed(target, task)
            else:
                sql_driver.unlock_task_in_queue(target, task)

            sql_driver.log_error({
                'client_id': client_id,
                'target': target,
                'task': task,
                'msg': task_result
            })
            sql_driver.close()
            del sql_driver
            return 'FAIL'

        # we only put the result in the queue here, which lets
        #	us respond to clients faster and keep the results
        #	compressed
        self.server_sql_driver.add_result_to_queue({
            'client_id': client_id,
            'client_ip': client_ip,
            'mapped_db': mapped_db,
            'target': target,
            'task': task,
            'task_result': task_result
        })

        # close out db connection and send back our response
        sql_driver.close()
        del sql_driver
        return 'OK'
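For reference, here is a sketch of the payload shape `store_result` above unpacks. The field names come from the unpacking code; the values, and the idea that results arrive as dicts like these, are illustrative assumptions only:

import json

# a successful scan result: target is a json-encoded string for
# non-crawl tasks, and task_result carries the browser output
example_success = {
    'client_id': 'client_01',
    'client_ip': '203.0.113.7',
    'success': True,
    'task': 'get_scan',
    'target': json.dumps('https://example.com'),
    'task_result': {'start_url': 'https://example.com'},
}

# a failed scan: the message matches one of the fail_cases above,
# so the task would be marked failed rather than unlocked for retry
example_failure = {
    'client_id': 'client_01',
    'client_ip': '203.0.113.7',
    'success': False,
    'task': 'get_scan',
    'target': json.dumps('https://example.com'),
    'task_result': 'rejecting redirect',
}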