Python SQLiteDriver.flush_task_queue Exemples

Langage de programmation: Python

Espace de nommage/Paquet: webxray.SQLiteDriver

Class/Type: SQLiteDriver

Méthode/Fonction: flush_task_queue

Exemples sur hotexamples.com : 2

Python SQLiteDriver.flush_task_queue - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de webxray.SQLiteDriver.SQLiteDriver.flush_task_queue extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

SQLiteDriver(17)

close(9)

log_error(4)

get_task_queue_length(3)

add_task_to_queue(2)

flush_task_queue(2)

unlock_task_in_queue(2)

add_page_id_domain_lookup_item(1)

add_page(1)

add_element(1)

add_cookie(1)

get_config(1)

get_policies_to_collect(1)

get_scanned_policy_urls(1)

get_task_from_queue(1)

add_domain(1)

add_crawl_id_domain_lookup_item(1)

page_exists(1)

remove_task_from_queue(1)

set_task_as_failed(1)

get_all_pages_exist(1)

Méthodes fréquemment utilisées

SQLiteDriver (17)

close (9)

log_error (4)

get_task_queue_length (3)

add_task_to_queue (2)

flush_task_queue (2)

unlock_task_in_queue (2)

add_page_id_domain_lookup_item (1)

add_page (1)

add_element (1)

Méthodes fréquemment utilisées

add_cookie (1)

get_config (1)

get_policies_to_collect (1)

get_scanned_policy_urls (1)

get_task_from_queue (1)

add_domain (1)

add_crawl_id_domain_lookup_item (1)

page_exists (1)

remove_task_from_queue (1)

set_task_as_failed (1)

get_all_pages_exist (1)

Méthodes fréquemment utilisées

get_all_pages_exist (1)

Exemple #1

0

Afficher le fichier

def build_policy_task_queue(self, flush_policy_task_queue=True, timeseries_interval=10080):
    """
    Queue every policy URL that still needs collecting as a 'get_policy' task.

    The queue can then be worked through either by the machine that built it
    or by remote machines.

    Args:
        flush_policy_task_queue: when true, existing 'get_policy' tasks are
            removed from the queue before new ones are added.
        timeseries_interval: accepted for interface compatibility; it is not
            used by this method.

    Reads ``self.db_engine`` ('sqlite' or 'postgres'), ``self.db_name`` and
    ``self.utilities.is_url_valid``. Any other engine name prints an error
    and terminates the process via ``quit()``.
    """

    # set up new db connection
    if self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        # the engine name lives on the instance; a bare `db_engine` here
        # would raise NameError before the message could be shown
        print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
        quit()

    # the connection is released even if one of the driver calls fails
    try:
        # get rid of whatever is in there already
        if flush_policy_task_queue:
            sql_driver.flush_task_queue(task='get_policy')

        # policies we already have; rows come back as 1-tuples, and a set
        # keeps the membership test below cheap
        scanned_policies = {
            policy_url for (policy_url,) in sql_driver.get_scanned_policy_urls()
        }

        for (policy_url,) in sql_driver.get_policies_to_collect():
            # if the url has an anchor, drop the '#' and everything after it
            policy_url = policy_url.partition('#')[0]

            # skip invalid links
            if not self.utilities.is_url_valid(policy_url):
                continue

            # already did it, skip
            if policy_url in scanned_policies:
                continue

            sql_driver.add_task_to_queue(policy_url, 'get_policy')

        # fyi
        print('\t%s pages in task_queue for get_policy' % sql_driver.get_task_queue_length(task='get_policy'))
    finally:
        # we no longer need this db connection
        sql_driver.close()

Exemple #2

0

Afficher le fichier

def build_scan_task_queue(self, params):
    """
    Read a list of page URLs from a file and queue them as scan tasks.

    The queue can then be worked through either by the machine that built it
    or by remote machines.

    Args:
        params: dict with the keys
            'pages_file_name' - name of a file in the ../page_lists
                directory (relative to this module), one URL per line;
                lines starting with '#' are treated as comments,
            'flush_scan_task_queue' - when true, existing tasks of the
                given type are removed from the queue first,
            'task' - the task name under which URLs are queued.

    Reads ``self.db_engine`` ('sqlite' or 'postgres'), ``self.db_name``,
    ``self.utilities`` and the ``self.config`` keys 'timeseries_enabled'
    and 'timeseries_interval'. An unknown engine or an unreadable page
    list prints an error and terminates the process.
    """

    # these vars are specific to this function
    pages_file_name = params['pages_file_name']
    flush_scan_task_queue = params['flush_scan_task_queue']
    task = params['task']

    # set up sql connection used to determine if items are already in the db
    if self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    else:
        # the engine name lives on the instance; a bare `db_engine` here
        # would raise NameError before the message could be shown
        print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
        quit()

    # the connection is released on every exit path, including exit() below
    try:
        pages_path = os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + pages_file_name

        # open list of pages; only I/O failures mean "file not usable"
        try:
            url_list = open(pages_path, 'r', encoding='utf-8')
        except OSError:
            print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % pages_file_name)
            exit()

        # the file handle is closed once the queue has been built
        with url_list:
            # get list of pages already scanned; rows come back as 1-tuples,
            # and a set keeps the per-url membership test below cheap
            print('\tFetching list of pages already scanned...')
            if self.config['timeseries_enabled']:
                existing_rows = sql_driver.get_all_pages_exist(
                    timeseries_interval=self.config['timeseries_interval'])
            else:
                existing_rows = sql_driver.get_all_pages_exist()
            already_scanned = {url for (url,) in existing_rows}
            print(f'\t => {len(already_scanned)} pages already scanned')

            # get rid of whatever is in there already
            if flush_scan_task_queue:
                sql_driver.flush_task_queue(task=task)

            # simple counter used solely for updates to CLI
            count = 0

            print('\t---------------------')
            print('\t Building Page Queue ')
            print('\t---------------------')

            # NOTE(review): lines keep their trailing newline here; presumably
            # the utilities helpers normalise it - confirm against Utilities.
            for url in url_list:
                # skip lines that are comments
                if url.startswith('#'):
                    continue

                count += 1

                # make sure url is valid
                if not self.utilities.is_url_valid(url):
                    print(f'\t\t{count} | {url} is invalid')
                    continue

                # perform idna fix
                url = self.utilities.idna_encode_url(url)

                # with time series enabled, already_scanned only holds pages
                # scanned within the configured interval; otherwise it holds
                # everything in the db - either way the page is skipped, only
                # the message differs
                if url in already_scanned:
                    if self.config['timeseries_enabled']:
                        print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
                    else:
                        print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
                    continue

                # add to the queue, duplicates will be ignored
                sql_driver.add_task_to_queue(url, task)
                print(f'\t\t{count} | {url[:30]}... Adding to queue.')
    finally:
        # close the db connection
        sql_driver.close()