コード例 #1
0
    def build_policy_task_queue(self,
                                flush_policy_task_queue=True,
                                timeseries_interval=10080):
        """
		Takes a given list of pages and puts them into a queue
			to be scanned either by the same machine building 
			the queue, or remote machines.
		"""

        # set up new db connection
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        # get rid of whatever is in there already
        if flush_policy_task_queue:
            sql_driver.flush_task_queue(task='get_policy')

        # get list of all policies we have
        scanned_policies = []
        for policy_url, in sql_driver.get_scanned_policy_urls():
            scanned_policies.append(policy_url)

        # run the query and add to list
        for policy_url, in sql_driver.get_policies_to_collect():
            # if page has an anchor, we drop everything after
            if policy_url[-1] == '#':
                policy_url = policy_url[:-1]
            elif '#' in policy_url:
                policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

            # skip invalid links
            if not self.utilities.is_url_valid(policy_url): continue

            # already did it, skip
            if policy_url in scanned_policies: continue

            sql_driver.add_task_to_queue(policy_url, 'get_policy')

        # fyi
        print('\t%s pages in task_queue for get_policy' %
              sql_driver.get_task_queue_length(task='get_policy'))

        # we no longer need this db connection
        sql_driver.close()
コード例 #2
0
    def run(self, task='process_tasks_from_queue', pool_size=None):
        """
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files), and issues with idna encoding (tricky!)

		then the page list is mapped to the process_url function  and executed in parallell

		pool_size is defined in the run_webxray.py file, see details there

		when running in slave mode the list is skipping and we got straight to scanning
		"""

        if task == 'process_tasks_from_queue':
            # set up sql connection to get queue_length
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            queue_length = sql_driver.get_task_queue_length()
            sql_driver.close()
            del sql_driver

            print('\t----------------------------------')
            print('\t%s addresses will now be webXray\'d' % queue_length)
            print('\t\t...you can go take a walk. ;-)')
            print('\t----------------------------------')

        # for macOS (darwin) we must specify start method as 'forkserver'
        #	this is essentially voodoo to ward off evil spirits which
        #	appear when large pool sizes are used on macOS
        # get_start_method must be set to 'allow_none', otherwise upon
        #	checking the method it gets set (!) - and if we then get/set again
        #	we get an error
        if sys.platform == 'darwin' and multiprocessing.get_start_method(
                allow_none=True) != 'forkserver':
            multiprocessing.set_start_method('forkserver')
        myPool = multiprocessing.Pool(pool_size)

        # map requires we pass an argument to the function
        #	(even though we don't need to), so we create
        #	a list equal to pool_size which will
        #	spawn the desired number of processes
        process_num = []
        if pool_size == None:
            pool_size = multiprocessing.cpu_count()

        for i in range(0, pool_size):
            process_num.append(i)

        if task == 'process_tasks_from_queue':
            myPool.map(self.process_tasks_from_queue, process_num)
        elif task == 'store_results_from_queue':
            myPool.map(self.store_results_from_queue, process_num)
コード例 #3
0
    def process_tasks_from_queue(self, process_num):
        """
		Selects the next page from the task_queue and passes to 
			process_url.  If load is unsucessful places page
			back into queue and updates attempts.  Returns once 
			when there are no pages in the queue under max_attempts.
		"""

        print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

        # need a local connection for each queue manager
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        # keep getting tasks from queue until none are left at max attempt level
        while sql_driver.get_task_queue_length(
                max_attempts=self.config['max_attempts'],
                unlocked_only=True) != 0:
            # it is possible for two processes to both pass the above conditional
            #	and then try to get a task from the queue at the same time.
            #	however, the second process that attempts to get a task will
            #	get an empty result (and crash), so we have a try/except block here
            #	to handle that condition gracefully
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=self.config['max_attempts'],
                    client_id=self.client_id)
            except:
                break

            print('\t[p.%s]\t👉 Initializing: %s for target %s' %
                  (process_num, task, target[:50]))

            # import and set up specified browser driver
            # 	note we set up a new browser each time to
            #	get a fresh profile
            if self.browser_config['client_browser_type'] == 'chrome':
                browser_driver = ChromeDriver(self.browser_config,
                                              port_offset=process_num)
            else:
                print(
                    f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!"
                )
                return

            # does the webxray scan or policy capture
            if task == 'get_scan':
                task_result = browser_driver.get_scan(target)
            elif task == 'get_crawl':
                task_result = browser_driver.get_crawl(json.loads(target))
            elif task == 'get_policy':
                task_result = browser_driver.get_scan(target,
                                                      get_text_only=True)
            elif task == 'get_random_crawl':
                task_result = browser_driver.get_random_crawl(target)

            # kill browser
            del browser_driver

            # browser has failed to get result, unlock and continue
            if task_result['success'] == False:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], task_result['result']))

                # for times we don't want to retry, such as a rejected
                #	redirect or network resolution failure, this could be expanded
                fail_cases = [
                    'reached fail limit', 'rejecting redirect',
                    'did not find enough internal links'
                ]

                if task_result[
                        'result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result[
                            'result']:
                    sql_driver.set_task_as_failed(target, task)
                else:
                    sql_driver.unlock_task_in_queue(target, task)

                # keep track of error regardless of fail/unlock
                sql_driver.log_error({
                    'client_id': 'localhost',
                    'target': target,
                    'task': task,
                    'msg': task_result['result']
                })
                continue

            # debug
            if self.debug:
                print(
                    '\t[p.%s]\t📥 Got browser result on task %s, going to store: %s'
                    % (process_num, task, target[:50]))

            # store_result also handles task queue mangement
            store_result = self.store_result({
                'target':
                target,
                'task':
                task,
                'task_result':
                task_result['result'],
                'client_id':
                self.client_id
            })

            if store_result['success'] == True:
                print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
            else:
                print(
                    f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}'
                )

        # tidy up
        sql_driver.close()
        del sql_driver

        print('\t[p.%s]\t✋ Completed process' % process_num)
        return