def __init__(self, db_name=None, db_engine=None, client_id=None):
	"""
	This class can be called to run store_results_from_queue, which connects to the
	server_config database to fetch results, in which case a global db_name isn't
	needed, so we have db_name=None to account for that.  However, if we *do* have
	a db_name we set up a global config.
	"""
	self.db_name 	= db_name
	self.db_engine 	= db_engine
	self.client_id 	= client_id
	self.debug 		= True
	self.utilities 	= Utilities()

	# get global config for this db
	if db_name:
		# set up database connection
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		self.config = sql_driver.get_config()

		# keep only the client-facing options for the browser
		self.browser_config = {}
		for item in self.config:
			if 'client' in item:
				self.browser_config[item] = self.config[item]

		sql_driver.close()
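# A minimal usage sketch of the class above, illustration only: the class name
# 'Collector', the database name, and the page list file are assumptions made for
# the example, not values taken from this file.
def _example_usage():
	from webxray.Collector import Collector

	# assumes a postgres db named 'wbxr_demo' that already holds a config row
	collector = Collector(db_name='wbxr_demo', db_engine='postgres', client_id='localhost')

	# queue pages listed in ./page_lists/example_pages.txt, then scan them locally
	collector.build_scan_task_queue({
		'pages_file_name'		: 'example_pages.txt',
		'flush_scan_task_queue'	: True,
		'task'					: 'get_scan'
	})
	collector.run(task='process_tasks_from_queue', pool_size=4)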
def build_policy_task_queue(self, flush_policy_task_queue=True, timeseries_interval=10080):
	"""
	Takes the policy urls we have discovered and puts them into a queue to be
	scanned either by the same machine building the queue, or by remote machines.
	"""
	# set up new db connection
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# get rid of whatever is in there already
	if flush_policy_task_queue:
		sql_driver.flush_task_queue(task='get_policy')

	# get list of all policies we have already scanned
	scanned_policies = []
	for policy_url, in sql_driver.get_scanned_policy_urls():
		scanned_policies.append(policy_url)

	# run the query and add to list
	for policy_url, in sql_driver.get_policies_to_collect():
		# if the url has an anchor, drop everything after it
		if policy_url[-1] == '#':
			policy_url = policy_url[:-1]
		elif '#' in policy_url:
			policy_url = re.search('^(.+?)#.+$', policy_url).group(1)

		# skip invalid links
		if not self.utilities.is_url_valid(policy_url): continue

		# already did it, skip
		if policy_url in scanned_policies: continue

		sql_driver.add_task_to_queue(policy_url, 'get_policy')

	# fyi
	print('\t%s pages in task_queue for get_policy' % sql_driver.get_task_queue_length(task='get_policy'))

	# we no longer need this db connection
	sql_driver.close()
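# Quick illustration of the anchor handling above (example urls are made up):
# a trailing '#' is sliced off, otherwise everything after the '#' is dropped.
def _example_strip_anchor(policy_url='https://example.com/privacy#top'):
	import re
	if policy_url[-1] == '#':
		return policy_url[:-1]									# 'https://example.com/privacy#' -> 'https://example.com/privacy'
	elif '#' in policy_url:
		return re.search('^(.+?)#.+$', policy_url).group(1)	# -> 'https://example.com/privacy'
	return policy_url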
def process_url(self, url):
	"""
	this function takes a specified url, loads it in the browser
	and returns json-formatted output with relevant request data, etc.

	the output_store class then puts this data in the db for later analysis
	"""
	# set up sql connection used to log errors and do checks
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)

	# output store does the heavy lifting of analyzing browser output and storing to db
	output_store = OutputStore(self.db_engine, self.db_name)

	# support for loading same page with multiple browsers - purposefully undocumented
	for browser_type in self.browser_types:
		# import and set up specified browser driver
		# 	note we need to set up a new browser each time to
		#	get a fresh profile
		if browser_type == 'chrome':
			browser_driver = ChromeDriver(ua=self.chrome_ua)

		# attempt to load the page, fail gracefully
		try:
			browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
		except:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# if there was a problem we log the error
		if browser_output['success'] == False:
			print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return
		else:
			# no error, treat result as browser output
			browser_output = browser_output['result']

		# attempt to store the output
		if output_store.store(url, browser_output):
			print('\t\t%-50s Success with %s' % (url[:50], browser_type))
		else:
			print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')

	sql_driver.close()
	return
def store(self, url, browser_output, store_source=False, store_1p=True):
	"""
	this is the primary function of this class, it takes the url of the given
	page and the request and cookie data generated by the browser

	data is cleaned up with some minor analysis (eg file types) and stored
	for later in-depth analysis

	there is an option to store first-party requests as well as third-party,
	turned on by default; to save disk space turn off store_1p

	there is also an option to get file hashes, this introduces serious overhead
	and is turned off by default
	"""
	# open up a sql connection
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# get the ip, fqdn, domain, pubsuffix, and tld
	#	we need the domain to figure out if cookies/elements are third-party
	origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

	# if we can't get page domain info we fail gracefully
	if origin_ip_fqdn_domain_pubsuffix_tld is None:
		sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
		return False

	origin_ip 			= origin_ip_fqdn_domain_pubsuffix_tld[0]
	origin_fqdn 		= origin_ip_fqdn_domain_pubsuffix_tld[1]
	origin_domain 		= origin_ip_fqdn_domain_pubsuffix_tld[2]
	origin_pubsuffix 	= origin_ip_fqdn_domain_pubsuffix_tld[3]
	origin_tld 			= origin_ip_fqdn_domain_pubsuffix_tld[4]

	# sql_driver.add_domain both stores the new domain and returns its db row id
	#	if it is already in db it just returns the existing id
	page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

	# if the final page is https (often after a redirect), mark it appropriately
	if browser_output['final_url'][:5] == 'https':
		page_is_ssl = True
	else:
		page_is_ssl = False

	if store_source:
		source = browser_output['source']
	else:
		source = None

	# add page
	page_id = sql_driver.add_page(
		browser_output['browser_type'],
		browser_output['browser_version'],
		browser_output['browser_wait'],
		browser_output['title'],
		browser_output['meta_desc'],
		url,
		browser_output['final_url'],
		page_is_ssl,
		source,
		browser_output['load_time'],
		page_domain_id
	)

	# store cookies
	for cookie in browser_output['cookies']:
		# get the ip, fqdn, domain, pubsuffix, and tld
		#	we need the domain to figure out if cookies/elements are third-party
		# note:
		#	url_parser fails on non-http, we should fix this, right now a lame hack
		#	is to prepend http://
		cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])

		# something went wrong, log and fail gracefully
		if cookie_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
			continue

		# otherwise, everything went fine
		cookie_ip 			= cookie_ip_fqdn_domain_pubsuffix_tld[0]
		cookie_fqdn 		= cookie_ip_fqdn_domain_pubsuffix_tld[1]
		cookie_domain 		= cookie_ip_fqdn_domain_pubsuffix_tld[2]
		cookie_pubsuffix 	= cookie_ip_fqdn_domain_pubsuffix_tld[3]
		cookie_tld 			= cookie_ip_fqdn_domain_pubsuffix_tld[4]

		# mark third-party cookies
		if origin_domain != cookie_domain:
			is_3p_cookie = True
		else:
			is_3p_cookie = False

		# this is a first-party cookie, see if we want to store it
		if is_3p_cookie is False and store_1p is False:
			continue

		# sql_driver.add_domain both stores the new domain and returns its id
		cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

		# name and domain are required, so if they fail we just continue
		try: name = cookie['name']
		except: continue

		try: domain = cookie_domain
		except: continue

		# these are optional, fill with null values if they fail
		try: secure = cookie['secure']
		except: secure = None

		try: path = cookie['path']
		except: path = None

		try: httponly = cookie['httponly']
		except: httponly = None

		try: expiry = cookie['expiry']
		except: expiry = None

		try: value = cookie['value']
		except: value = None

		# all done with this cookie
		sql_driver.add_cookie(
			page_id,
			name, secure, path, domain,
			httponly, expiry, value,
			is_3p_cookie, cookie_domain_id
		)

	# process requests now
	for request in browser_output['processed_requests']:
		# if the request starts with the following we can't parse it anyway, so skip
		if re.match('^(data|about|chrome|blob).+', request):
			continue

		# get the ip, fqdn, domain, pubsuffix, and tld
		#	we need the domain to figure out if the element is third-party
		element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

		# problem with this request, log and fail gracefully
		if element_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Error parsing element request: '+request)
			continue

		element_ip 			= element_ip_fqdn_domain_pubsuffix_tld[0]
		element_fqdn 		= element_ip_fqdn_domain_pubsuffix_tld[1]
		element_domain 		= element_ip_fqdn_domain_pubsuffix_tld[2]
		element_pubsuffix 	= element_ip_fqdn_domain_pubsuffix_tld[3]
		element_tld 		= element_ip_fqdn_domain_pubsuffix_tld[4]

		# sql_driver.add_domain both stores the new domain and returns its db row id
		element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

		# mark third-party elements based on domain
		if origin_domain != element_domain:
			is_3p_element = True
		else:
			is_3p_element = False

		# if we are not storing 1p elements continue
		if is_3p_element is False and store_1p is False:
			continue

		if request[:5] == 'https' or request[:3] == 'wss':
			element_is_ssl = True
		else:
			element_is_ssl = False

		try:
			received = browser_output['processed_requests'][request]['received']
		except:
			received = None

		# get domain of referer and determine if page was leaked by referer
		try:
			referer = browser_output['processed_requests'][request]['referer']
		except:
			referer = None

		if referer and len(referer) != 0:
			referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)

			if referer_ip_fqdn_domain_pubsuffix_tld:
				if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
					page_domain_in_referer = True
				else:
					page_domain_in_referer = False
			else:
				page_domain_in_referer = None
				sql_driver.log_error(url, 'Error parsing referer header: '+referer)
		else:
			page_domain_in_referer = None

		try:
			start_time_offset = browser_output['processed_requests'][request]['start_time_offset']
		except:
			start_time_offset = None

		try:
			load_time = browser_output['processed_requests'][request]['load_time']
		except:
			load_time = None

		try:
			status = browser_output['processed_requests'][request]['status']
		except:
			status = None

		try:
			status_text = browser_output['processed_requests'][request]['status_text']
		except:
			status_text = None

		try:
			content_type = browser_output['processed_requests'][request]['content_type']
		except:
			content_type = None

		try:
			body_size = browser_output['processed_requests'][request]['body_size']
		except:
			body_size = None

		try:
			request_headers = str(browser_output['processed_requests'][request]['request_headers'])
		except:
			request_headers = None

		try:
			response_headers = str(browser_output['processed_requests'][request]['response_headers'])
		except:
			response_headers = None

		# consider anything before the "?" to be the element_url
		try:
			element_url = re.search('^(.+?)\?.+$', request).group(1)
		except:
			element_url = request

		# consider anything after the "?" to be the args
		try:
			element_args = re.search('^.+(\?.+)$', request).group(1) # start url args
		except:
			element_args = None

		# attempt to parse off the extension
		try:
			element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower()
		except:
			element_extension = None

		# lists of common extensions, can be expanded
		image_extensions 	= ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
		script_extensions 	= ['js', 'javascript']
		data_extensions 	= ['json', 'jsonp', 'xml']
		font_extensions 	= ['woff', 'ttf', 'otf']
		static_extensions 	= ['html', 'htm', 'shtml']
		dynamic_extensions 	= ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

		# figure out what type of element it is
		if element_extension in image_extensions:
			element_type = 'image'
		elif element_extension in script_extensions:
			element_type = 'javascript'
		elif element_extension in data_extensions:
			element_type = 'data_structured'
		elif element_extension == 'css':
			element_type = 'style_sheet'
		elif element_extension in font_extensions:
			element_type = 'font'
		elif element_extension in static_extensions:
			element_type = 'page_static'
		elif element_extension in dynamic_extensions:
			element_type = 'page_dynamic'
		elif element_extension == 'swf' or element_extension == 'fla':
			element_type = 'Shockwave Flash'
		else:
			element_type = None

		file_md5 = None

		# final task is to truncate the request if it is over 2k characters
		#	as it is likely binary data and may cause problems inserting
		#	into TEXT fields in the database
		#
		# TODO:
		#	better handle binary data in general
		if len(request) >= 2000: request = request[:2000]
		if len(element_url) >= 2000: element_url = element_url[:2000]

		# store request
		sql_driver.add_element(
			page_id,
			request, element_url,
			is_3p_element, element_is_ssl,
			received,
			referer, page_domain_in_referer,
			start_time_offset, load_time,
			status, status_text, content_type, body_size,
			request_headers, response_headers,
			element_extension, element_type,
			element_args,
			element_domain_id
		)

	# close db connection
	sql_driver.close()

	return True
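# Illustration only (added for clarity, not part of the original class): how the
# regexes above split a request into element_url, element_args, and the extension.
# The sample url is made up.
def _example_split_request(request='https://cdn.example.com/lib/tracker.js?uid=123&ref=abc'):
	import re
	element_url = re.search('^(.+?)\?.+$', request).group(1)							# 'https://cdn.example.com/lib/tracker.js'
	element_args = re.search('^.+(\?.+)$', request).group(1)							# '?uid=123&ref=abc'
	element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower()	# 'js'
	return element_url, element_args, element_extension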
def run(self, task='process_tasks_from_queue', pool_size=None):
	"""
	this function manages the parallel processing of the url list using the
	python Pool class

	the function first reads the list of urls out of the page_lists directory,
	cleans it for known issues (eg common binary files), and issues with idna
	encoding (tricky!)

	then the page list is mapped to the process_url function and executed in parallel

	pool_size is defined in the run_webxray.py file, see details there

	when running in slave mode the list is skipped and we go straight to scanning
	"""
	if task == 'process_tasks_from_queue':
		# set up sql connection to get queue_length
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			quit()

		queue_length = sql_driver.get_task_queue_length()
		sql_driver.close()
		del sql_driver

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d' % queue_length)
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

	# for macOS (darwin) we must specify start method as 'forkserver'
	#	this is essentially voodoo to ward off evil spirits which
	#	appear when large pool sizes are used on macOS
	#	get_start_method must be set to 'allow_none', otherwise upon
	#	checking the method it gets set (!) - and if we then get/set again
	#	we get an error
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	myPool = multiprocessing.Pool(pool_size)

	# map requires we pass an argument to the function
	#	(even though we don't need to), so we create
	#	a list equal to pool_size which will
	#	spawn the desired number of processes
	process_num = []

	if pool_size == None:
		pool_size = multiprocessing.cpu_count()

	for i in range(0, pool_size):
		process_num.append(i)

	if task == 'process_tasks_from_queue':
		myPool.map(self.process_tasks_from_queue, process_num)
	elif task == 'store_results_from_queue':
		myPool.map(self.store_results_from_queue, process_num)
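# Standalone sketch of the Pool pattern used above (illustrative, not webxray code).
# map() needs an iterable, so a list the size of the pool is passed purely to spawn
# that many workers, each receiving its index as process_num.
def _example_worker(process_num):
	return 'process %s done' % process_num

def _example_pool_pattern(pool_size=4):
	import multiprocessing
	import sys

	# macOS needs the 'forkserver' start method for larger pools, as noted above
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')

	with multiprocessing.Pool(pool_size) as my_pool:
		return my_pool.map(_example_worker, range(pool_size))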
def process_tasks_from_queue(self, process_num):
	"""
	Selects the next task from the task_queue and passes it to the browser driver.
	If the load is unsuccessful the task is placed back into the queue and the
	attempt count is updated.  Returns once there are no tasks in the queue
	under max_attempts.
	"""
	print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

	# need a local connection for each queue manager
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# keep getting tasks from queue until none are left under the max attempt level
	while sql_driver.get_task_queue_length(max_attempts=self.config['max_attempts'], unlocked_only=True) != 0:
		# it is possible for two processes to both pass the above conditional
		#	and then try to get a task from the queue at the same time.
		#	however, the second process that attempts to get a task will
		#	get an empty result (and crash), so we have a try/except block here
		#	to handle that condition gracefully
		try:
			target, task = sql_driver.get_task_from_queue(max_attempts=self.config['max_attempts'], client_id=self.client_id)
		except:
			break

		print('\t[p.%s]\t👉 Initializing: %s for target %s' % (process_num, task, target[:50]))

		# import and set up specified browser driver
		# 	note we set up a new browser each time to
		#	get a fresh profile
		if self.browser_config['client_browser_type'] == 'chrome':
			browser_driver = ChromeDriver(self.browser_config, port_offset=process_num)
		else:
			print(f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!")
			return

		# does the webxray scan or policy capture
		if task == 'get_scan':
			task_result = browser_driver.get_scan(target)
		elif task == 'get_crawl':
			task_result = browser_driver.get_crawl(json.loads(target))
		elif task == 'get_policy':
			task_result = browser_driver.get_scan(target, get_text_only=True)
		elif task == 'get_random_crawl':
			task_result = browser_driver.get_random_crawl(target)

		# kill browser
		del browser_driver

		# browser has failed to get result, unlock and continue
		if task_result['success'] == False:
			print('\t[p.%s]\t👎 Error: %s %s' % (process_num, target[:50], task_result['result']))

			# for times we don't want to retry, such as a rejected
			#	redirect or network resolution failure, this could be expanded
			fail_cases = [
				'reached fail limit',
				'rejecting redirect',
				'did not find enough internal links'
			]

			if task_result['result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result['result']:
				sql_driver.set_task_as_failed(target, task)
			else:
				sql_driver.unlock_task_in_queue(target, task)

			# keep track of error regardless of fail/unlock
			sql_driver.log_error({
				'client_id'	: 'localhost',
				'target'	: target,
				'task'		: task,
				'msg'		: task_result['result']
			})
			continue

		# debug
		if self.debug: print('\t[p.%s]\t📥 Got browser result on task %s, going to store: %s' % (process_num, task, target[:50]))

		# store_result also handles task queue management
		store_result = self.store_result({
			'target'		: target,
			'task'			: task,
			'task_result'	: task_result['result'],
			'client_id'		: self.client_id
		})

		if store_result['success'] == True:
			print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
		else:
			print(f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}')

	# tidy up
	sql_driver.close()
	del sql_driver

	print('\t[p.%s]\t✋ Completed process' % process_num)
	return
def build_scan_task_queue(self, params):
	"""
	Takes a given list of pages and puts them into a queue to be scanned either
	by the same machine building the queue, or by remote machines.
	"""
	# these vars are specific to this function
	pages_file_name 		= params['pages_file_name']
	flush_scan_task_queue 	= params['flush_scan_task_queue']
	task 					= params['task']

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# open list of pages
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + pages_file_name, 'r', encoding='utf-8')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % pages_file_name)
		sql_driver.close()
		exit()

	# get list of pages already scanned
	already_scanned = []
	print('\tFetching list of pages already scanned...')
	if self.config['timeseries_enabled']:
		for url, in sql_driver.get_all_pages_exist(timeseries_interval=self.config['timeseries_interval']):
			already_scanned.append(url)
	else:
		for url, in sql_driver.get_all_pages_exist():
			already_scanned.append(url)
	print(f'\t => {len(already_scanned)} pages already scanned')

	# get rid of whatever is in there already
	if flush_scan_task_queue:
		sql_driver.flush_task_queue(task=task)

	# simple counter used solely for updates to CLI
	count = 0

	print('\t---------------------')
	print('\t Building Page Queue ')
	print('\t---------------------')

	for url in url_list:
		# skip lines that are comments
		if "#" in url[0]: continue

		count += 1

		# make sure url is valid
		if self.utilities.is_url_valid(url) == False:
			print(f'\t\t{count} | {url} is invalid')
			continue

		# perform idna fix
		url = self.utilities.idna_encode_url(url)

		# if time series scans are enabled we skip pages scanned within the
		#	specified interval; otherwise we skip anything already in the db
		if url in already_scanned and self.config['timeseries_enabled']:
			print(f'\t\t{count} | {url[:30]}... Scanned too recently.')
			continue
		elif url in already_scanned:
			print(f'\t\t{count} | {url[:30]}... Exists in DB, skipping.')
			continue

		# add to the queue, duplicates will be ignored
		sql_driver.add_task_to_queue(url, task)
		print(f'\t\t{count} | {url[:30]}... Adding to queue.')

	# close the db connection
	sql_driver.close()
def store_result(self, params):
	"""
	Handles storing the task_result and removing jobs from the task_queue.
	"""
	# unpack params
	target 		= params['target']
	task 		= params['task']
	task_result = params['task_result']
	client_id 	= params['client_id']

	# client_ip is optional
	if 'client_ip' in params:
		client_ip = params['client_ip']
	else:
		client_ip = None

	# if db_name is specified we are running in server mode and we
	#	connect to the db which corresponds to the result being
	#	processed.  otherwise, we use the global db_name as we are
	#	running in non-server mode.
	if 'db_name' in params:
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(params['db_name'])
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(params['db_name'])
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			quit()

		output_store = OutputStore(params['db_name'], self.db_engine)
	else:
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			quit()

		output_store = OutputStore(self.db_name, self.db_engine)

	if task == 'get_policy':
		store_result = output_store.store_policy(task_result, client_id, client_ip=client_ip)

		# we never retry policies
		sql_driver.remove_task_from_queue(target, task)

		if store_result['success']:
			result = {'success': True}
		else:
			# log error
			sql_driver.log_error({
				'client_id'	: client_id,
				'task'		: task,
				'target'	: target,
				'msg'		: 'output_store fail on ' + store_result['result']
			})
			result = {'success': False, 'result': store_result['result']}
	# elif task == 'get_crawl' or task == 'get_random_crawl':
	else:
		all_crawls_ok = True

		# We want to be able to re-run random crawls, and to do so we make sure
		#	the crawl_id will match
		if task == 'get_crawl' or task == 'get_scan':
			crawl_id = target
		elif task == 'get_random_crawl':
			crawl_id = []
			for result in task_result:
				crawl_id.append(result['start_url'])
			crawl_id = json.dumps(crawl_id)

		# tweak to account for differences between scans/crawls
		if task == 'get_scan': task_result = [task_result]

		# keep track of domains
		all_3p_cookie_domains 		= set()
		all_3p_dom_storage_domains 	= set()
		all_3p_request_domains 		= set()
		all_3p_response_domains 	= set()
		all_3p_websocket_domains 	= set()

		# When we store a crawl we add optional fields in the page table
		#	that allow us to connect the page loads into a single crawl.
		#	the crawl_id is a hash of the target (which is a json string
		#	derived from the url_list), and the crawl_timestamp is the
		#	first accessed time from the crawl.
		for crawl_sequence, result in enumerate(task_result):
			store_result = output_store.store_scan({
				'browser_output'	: result,
				'client_id'			: client_id,
				'crawl_id'			: crawl_id,
				'crawl_timestamp'	: task_result[0]['accessed'],
				'crawl_sequence'	: crawl_sequence,
				'client_ip'			: client_ip
			})

			if store_result['success'] != True:
				all_crawls_ok = False
			else:
				# we are successful, create entries in the page_lookup table
				page_lookup_table = self.build_lookup_table('page', store_result['page_id'], {
					'requests'		: store_result['page_3p_request_domains'],
					'responses'		: store_result['page_3p_response_domains'],
					'websockets'	: store_result['page_3p_websocket_domains'],
					'dom_storage'	: store_result['page_3p_dom_storage_domains'],
					'cookies'		: store_result['page_3p_dom_storage_domains']
				})

				for lookup_item in page_lookup_table:
					sql_driver.add_page_id_domain_lookup_item(page_lookup_table[lookup_item])

				# we are also making a lookup table for the crawl, keep joining
				#	the sets as we go along
				all_3p_request_domains.update(store_result['page_3p_request_domains'])
				all_3p_response_domains.update(store_result['page_3p_response_domains'])
				all_3p_websocket_domains.update(store_result['page_3p_websocket_domains'])
				all_3p_dom_storage_domains.update(store_result['page_3p_dom_storage_domains'])
				all_3p_cookie_domains.update(store_result['page_3p_dom_storage_domains'])

		if all_crawls_ok:
			sql_driver.remove_task_from_queue(target, task)
			result = {'success': True}

			# build crawl lookup table
			crawl_lookup_table = self.build_lookup_table('crawl', crawl_id, {
				'requests'		: all_3p_request_domains,
				'responses'		: all_3p_response_domains,
				'websockets'	: all_3p_websocket_domains,
				'dom_storage'	: all_3p_dom_storage_domains,
				'cookies'		: all_3p_cookie_domains
			})

			# store the crawl lookup table
			for lookup_item in crawl_lookup_table:
				sql_driver.add_crawl_id_domain_lookup_item(crawl_lookup_table[lookup_item])
		else:
			sql_driver.unlock_task_in_queue(target, task)

			# log error
			sql_driver.log_error({
				'client_id'	: client_id,
				'task'		: task,
				'target'	: target,
				'msg'		: 'output_store fail to store all scans for crawl_id_target ' + target
			})
			result = {'success': False, 'result': 'unable to store all crawl loads'}

	# tidy up
	output_store.close()
	sql_driver.close()

	# done
	return result
def run(self, pool_size):
	"""
	this function manages the parallel processing of the url list using the
	python Pool class

	the function first reads the list of urls out of the page_lists directory,
	cleans it for known issues (eg common binary files), and issues with idna
	encoding (tricky!)

	then the page list is mapped to the process_url function and executed in parallel

	pool_size is defined in the run_webxray.py file, see details there
	"""
	# the list of urls MUST be in the page_lists directory!
	try:
		url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r', encoding='utf-8')
	except:
		print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
		exit()

	# set up sql connection used to determine if items are already in the db
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)

	# this list gets mapped to the Pool, very important!
	urls_to_process = set()

	# simple counter used solely for updates to CLI
	count = 0

	print('\t------------------------')
	print('\t Building List of Pages ')
	print('\t------------------------')

	for url in url_list:
		# skip lines that are comments
		if "#" in url[0]: continue

		count += 1

		# only do lines starting with https?://
		if not (re.match('^https?://.+', url)):
			print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
			continue

		# non-ascii domains may cause issues, so we need to convert them to
		#	idna/ascii/utf-8; this requires splitting apart the url, converting
		#	the domain to idna, and pasting it all back together
		split_url = urlsplit(url.strip())
		idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
		url = urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment))

		# if it is a m$ office or other doc, skip
		if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
			print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
			continue

		# skip if in db already
		if sql_driver.page_exists(url):
			print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
			continue

		# only add if not in list already
		if url not in urls_to_process:
			print("\t\t%s | %-50s Adding." % (count, url[:50]))
			urls_to_process.add(url)
		else:
			print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))

	# close the db connection
	sql_driver.close()

	print('\t----------------------------------')
	print('\t%s addresses will now be webXray\'d' % len(urls_to_process))
	print('\t\tBrowser(s) are %s' % self.browser_types)
	print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
	print('\t\t...you can go take a walk. ;-)')
	print('\t----------------------------------')

	# for macOS (darwin) we must specify start method as 'forkserver'
	#	this is essentially voodoo to ward off evil spirits which
	#	appear when large pool sizes are used on macOS
	#	get_start_method must be set to 'allow_none', otherwise upon
	#	checking the method it gets set (!) - and if we then get/set again
	#	we get an error
	if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
		multiprocessing.set_start_method('forkserver')
	myPool = multiprocessing.Pool(pool_size)
	myPool.map(self.process_url, urls_to_process)

	# FYI
	self.print_runtime()
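# What the idna fix above does, in isolation. The domain is a standard example of a
# non-ascii hostname, not taken from any real page list.
def _example_idna_fix(url='https://bücher.example/path?q=1'):
	from urllib.parse import urlsplit, urlunsplit
	split_url = urlsplit(url.strip())
	idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')	# 'xn--bcher-kva.example'
	# returns 'https://xn--bcher-kva.example/path?q=1'
	return urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment))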