def build_policy_task_queue(self, flush_policy_task_queue=True, timeseries_interval=10080):
	"""
	Takes the policy URLs found in the database and puts them into
	a queue to be scanned either by the same machine building the
	queue, or remote machines.
	"""

	# set up new db connection
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		# note: this originally referenced a bare db_engine, which
		# would raise a NameError; it must be self.db_engine
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# get rid of whatever is in the queue already
	if flush_policy_task_queue:
		sql_driver.flush_task_queue(task='get_policy')

	# get list of all policies we have already scanned
	scanned_policies = []
	for policy_url, in sql_driver.get_scanned_policy_urls():
		scanned_policies.append(policy_url)

	# run the query and add each policy to the queue
	for policy_url, in sql_driver.get_policies_to_collect():
		# if the url has an anchor, drop everything from '#' on
		# (requires `re` imported at module level)
		if policy_url[-1] == '#':
			policy_url = policy_url[:-1]
		elif '#' in policy_url:
			policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

		# skip invalid links
		if not self.utilities.is_url_valid(policy_url):
			continue

		# already did it, skip
		if policy_url in scanned_policies:
			continue

		sql_driver.add_task_to_queue(policy_url, 'get_policy')

	# fyi
	print('\t%s policies in task_queue for get_policy' % sql_driver.get_task_queue_length(task='get_policy'))

	# we no longer need this db connection
	sql_driver.close()
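# ---------------------------------------------------------------------
# Usage sketch (assumptions, not from this file): how the method above
# might be invoked. The Collector class name, constructor signature, and
# db_name are illustrative guesses; only build_policy_task_queue and its
# keyword arguments come from the code itself. Uncomment to run against
# a configured database.
#
#	from webxray.Collector import Collector	# assumed import path
#
#	collector = Collector(db_name='wbxr_demo', db_engine='postgres')	# assumed constructor
#
#	# flush and rebuild the policy queue; the default timeseries_interval
#	# of 10080 minutes corresponds to one week
#	collector.build_policy_task_queue(flush_policy_task_queue=True)
# ---------------------------------------------------------------------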
def build_scan_task_queue(self, params):
	"""
	Takes a given list of pages and puts them into a queue
	to be scanned either by the same machine building the
	queue, or remote machines.
	"""

	# these vars are specific to this function
	pages_file_name = params['pages_file_name']
	flush_scan_task_queue = params['flush_scan_task_queue']
	task = params['task']

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		# as above, the bare db_engine reference was a NameError
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# open list of pages; requires `os` imported at module level
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + pages_file_name, 'r', encoding='utf-8')
	except OSError:
		print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % pages_file_name)
		sql_driver.close()
		exit()

	# get list of pages already scanned; if time series is enabled we
	# only fetch pages scanned within the configured interval
	already_scanned = []
	print('\tFetching list of pages already scanned...')
	if self.config['timeseries_enabled']:
		for url, in sql_driver.get_all_pages_exist(timeseries_interval=self.config['timeseries_interval']):
			already_scanned.append(url)
	else:
		for url, in sql_driver.get_all_pages_exist():
			already_scanned.append(url)
	print(f'\t => {len(already_scanned)} pages already scanned')

	# get rid of whatever is in the queue already
	if flush_scan_task_queue:
		sql_driver.flush_task_queue(task=task)

	# simple counter used solely for updates to CLI
	count = 0

	print('\t---------------------')
	print('\t Building Page Queue ')
	print('\t---------------------')

	for url in url_list:
		# skip lines that are comments
		if url.startswith('#'): continue

		# drop the trailing newline left by file iteration
		url = url.strip()

		count += 1

		# make sure url is valid
		if self.utilities.is_url_valid(url) == False:
			print(f'\t\t{count} | {url} is invalid')
			continue

		# perform idna fix
		url = self.utilities.idna_encode_url(url)

		# if time series is enabled we skip pages scanned within the
		# specified interval; otherwise we skip anything already in the db
		if url in already_scanned and self.config['timeseries_enabled']:
			print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
			continue
		elif url in already_scanned:
			print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
			continue

		# add to the queue, duplicates will be ignored
		sql_driver.add_task_to_queue(url, task)
		print(f'\t\t{count} | {url[:30]}... Adding to queue.')

	# close the db connection
	sql_driver.close()
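# ---------------------------------------------------------------------
# Usage sketch (assumptions, not from this file): build_scan_task_queue
# reads exactly three keys from its params dict, so a caller must supply
# at least those. The 'top_1k.txt' file name and the 'get_scan' task
# label are illustrative guesses; only the dict keys themselves come
# from the method body. Uncomment to run with a Collector instance such
# as the one sketched above.
#
#	collector.build_scan_task_queue({
#		'pages_file_name':       'top_1k.txt',	# must exist in ./page_lists
#		'flush_scan_task_queue': True,		# empty the queue before rebuilding
#		'task':                  'get_scan',	# assumed task label consumed by workers
#	})
# ---------------------------------------------------------------------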