def run(self, pool_size):
    try:
        uri_list = open('./page_lists/' + self.pages_file_name, 'r')
    except OSError:
        print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
        exit()

    sql_driver = MySQLDriver(self.db_name)

    # sort out which uris we are processing from the list
    uris_to_process = []

    count = 0

    print('\t------------------------')
    print('\t Building List of Pages ')
    print('\t------------------------')

    for uri in uri_list:
        # skip lines that are comments
        if uri[0] == '#': continue

        count += 1

        # drop trailing '/', strip whitespace, lowercase, and make the uri
        # cli-safe with parse.quote, keeping ':/' unescaped b/c of 'http://'
        uri = re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=':/').lower())

        # if it is an ms office or other binary doc, skip
        if re.match(r'.+\.(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
            print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
            continue

        # skip if in db already
        if sql_driver.page_exists(uri):
            print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
            continue

        # only add if not queued already
        if uri not in uris_to_process:
            print("\t\t%s | %-50s Adding." % (count, uri[:50]))
            uris_to_process.append(uri)
        else:
            print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

    print('\t----------------------------------')
    print('\t%s pages will now be webXray\'d' % len(uris_to_process))
    print('\t\t...you can go take a walk. ;-)')
    print('\t----------------------------------')

    myPool = Pool(pool_size)
    myPool.map(self.process_uri, uris_to_process)
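# A minimal standalone sketch (not part of the original source) of the Pool
# pattern used above: map a worker function over the queue across pool_size
# parallel processes. process_uri here is a hypothetical stand-in for the
# class's own self.process_uri, and 4 is an arbitrary pool_size.
from multiprocessing import Pool

def process_uri(uri):
    print('fetching %s' % uri)

if __name__ == '__main__':
    with Pool(4) as my_pool:
        my_pool.map(process_uri, ['http://example.com', 'http://example.org'])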
def run(self, pool_size):
    try:
        uri_list = open('./page_lists/' + self.pages_file_name, 'r')
    except OSError:
        print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
        exit()

    sql_driver = MySQLDriver(self.db_name)

    # sort out which uris we are processing from the list
    uris_to_process = []

    count = 0

    print('\t------------------------')
    print('\t Building List of Pages ')
    print('\t------------------------')

    for uri in uri_list:
        # skip lines that are comments
        if uri[0] == '#': continue

        count += 1

        # only process lines starting with https?://
        if not re.match('^https?://.+', uri):
            print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
            continue

        # drop trailing '/', strip whitespace, lowercase, and make the uri
        # cli-safe with parse.quote, keeping ':/' unescaped b/c of 'http://'
        uri = re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=':/').lower())

        # if it is an ms office or other binary doc, skip
        if re.match(r'.+\.(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
            print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
            continue

        # skip if in db already
        if sql_driver.page_exists(uri):
            print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
            continue

        # only add if not queued already
        if uri not in uris_to_process:
            print("\t\t%s | %-50s Adding." % (count, uri[:50]))
            uris_to_process.append(uri)
        else:
            print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

    print('\t----------------------------------')
    print('\t%s addresses will now be webXray\'d' % len(uris_to_process))
    print('\t\t...you can go take a walk. ;-)')
    print('\t----------------------------------')

    myPool = Pool(pool_size)
    myPool.map(self.process_uri, uris_to_process)
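# A minimal standalone sketch of the normalization chain above: strip
# whitespace, percent-encode unsafe characters (':' and '/' kept safe so
# 'http://' survives), lowercase, then drop any trailing slash.
# normalize_uri is a hypothetical helper name for illustration only.
import re
import urllib.parse

def normalize_uri(uri):
    return re.sub('/$', '', urllib.parse.quote(uri.strip(), safe=':/').lower())

print(normalize_uri('http://Example.com/A B/\n'))  # -> http://example.com/a%20b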
def run(self, pool_size): """ this function manages the parallel processing of the url list using the python Pool class the function first reads the list of urls out of the page_lists directory, cleans it for known issues (eg common binary files), and issues with idna encoding (tricky!) then the page list is mapped to the process_url function and executed in parallell pool_size is defined in the run_webxray.py file, see details there """ # the list of url MUST be in the page_lists directory! try: url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r', encoding='utf-8') except: print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name) exit() # set up sql connection used to determine if items are already in the db if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) # this list gets mapped to the Pool, very important! urls_to_process = set() # simple counter used solely for updates to CLI count = 0 print('\t------------------------') print('\t Building List of Pages ') print('\t------------------------') for url in url_list: # skip lines that are comments if "#" in url[0]: continue count += 1 # only do lines starting with https?:// if not (re.match('^https?://.+', url)): print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50])) continue # non-ascii domains will crash phantomjs, so we need to convert them to # idna/ascii/utf-8 # this requires splitting apart the url, converting the domain to idna, # and pasting it all back together split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment)) # if it is a m$ office or other doc, skip if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url): print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50])) continue # skip if in db already unless we are doing a timeseries if self.allow_timeseries == False: if sql_driver.page_exists(url): print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50])) continue # only add if not in list already if url not in urls_to_process: print("\t\t%s | %-50s Adding." % (count, url[:50])) urls_to_process.add(url) else: print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50])) # close the db connection sql_driver.close() print('\t----------------------------------') print('\t%s addresses will now be webXray\'d' % len(urls_to_process)) print('\t\tBrowser(s) are %s' % self.browser_types) print('\t\tBrowser wait time is %s seconds' % self.browser_wait) print('\t\t...you can go take a walk. ;-)') print('\t----------------------------------') # for macOS (darwin) we must specify start method as 'forkserver' # this is essentially voodoo to ward off evil spirits which # appear when large pool sizes are used on macOS # get_start_method must be set to 'allow_none', otherwise upon # checking the method it gets set (!) 
- and if we then get/set again # we get an error if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver': multiprocessing.set_start_method('forkserver') myPool = multiprocessing.Pool(pool_size) myPool.map(self.process_url, urls_to_process) # FYI self.print_runtime()
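# A small standalone demo of the idna fix above: only the hostname is
# converted to its ascii (punycode) form, while the path, query, and
# fragment pass through untouched. The example domain is hypothetical.
from urllib.parse import urlsplit, urlunsplit

parts = urlsplit('http://bücher.example/path?q=1#top')
ascii_netloc = parts.netloc.encode('idna').decode('utf-8')
print(urlunsplit((parts.scheme, ascii_netloc, parts.path, parts.query, parts.fragment)))
# -> http://xn--bcher-kva.example/path?q=1#top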
def run(self, pool_size): """ this function manages the parallel processing of the url list using the python Pool class the function first reads the list of urls out of the page_lists directory, cleans it for known issues (eg common binary files), and issues with idna encoding (tricky!) then the page list is mapped to the process_url function and executed in parallell pool_size is defined in the run_webxray.py file, see details there """ # the list of url MUST be in the page_lists directory! try: url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r') except: print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name) exit() # set up sql connection used to determine if items are already in the db if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) # this list gets mapped to the Pool, very important! urls_to_process = set() # simple counter used solely for updates to CLI count = 0 print('\t------------------------') print('\t Building List of Pages ') print('\t------------------------') for url in url_list: # skip lines that are comments if "#" in url[0]: continue count += 1 # only do lines starting with https?:// if not (re.match('^https?://.+', url)): print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50])) continue # non-ascii domains will crash phantomjs, so we need to convert them to # idna/ascii/utf-8 # this requires splitting apart the url, converting the domain to idna, # and pasting it all back together split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') url = urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment)) # if it is a m$ office or other doc, skip if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url): print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50])) continue # skip if in db already unless we are doing a timeseries if self.allow_timeseries == False: if sql_driver.page_exists(url): print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50])) continue # only add if not in list already if url not in urls_to_process: print("\t\t%s | %-50s Adding." % (count, url[:50])) urls_to_process.add(url) else: print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50])) # close the db connection sql_driver.close() print('\t----------------------------------') print('\t%s addresses will now be webXray\'d' % len(urls_to_process)) print('\t\tBrowser(s) are %s' % self.browser_types) print('\t\tBrowser wait time is %s seconds' % self.browser_wait) print('\t\t...you can go take a walk. ;-)') print('\t----------------------------------') # for macOS (darwin) we must specify start method as 'forkserver' # this is essentially voodoo to ward off evil spirits which # appear when large pool sizes are used on macOS # get_start_method must be set to 'allow_none', otherwise upon # checking the method it gets set (!) 
- and if we then get/set again # we get an error if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver': multiprocessing.set_start_method('forkserver') myPool = multiprocessing.Pool(pool_size) myPool.map(self.process_url, urls_to_process) # FYI self.print_runtime()
def run(self, pool_size):
    try:
        uri_list = open('./page_lists/' + self.pages_file_name, 'r')
    except OSError:
        print('File "%s" does not exist, file must be in ./page_lists directory. Exiting.' % self.pages_file_name)
        exit()

    sql_driver = MySQLDriver(self.db_name)

    # sort out which uris we are processing from the list
    uris_to_process = []

    count = 0

    print('\t------------------------')
    print('\t Building List of Pages ')
    print('\t------------------------')

    for uri in uri_list:
        # skip lines that are comments
        if uri[0] == '#': continue

        count += 1

        # only process lines starting with https?://
        if not re.match('^https?://.+', uri):
            print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
            continue

        # non-ascii domains will crash phantomjs, so we convert them to
        # idna/ascii/utf-8; this requires splitting apart the uri, converting
        # the domain to idna, and pasting it all back together. ugly.
        parsed_uri = urlsplit(uri.strip())
        uri = parsed_uri[0] + '://'
        uri += parsed_uri[1].encode('idna').decode('utf-8')

        # if chunks exist, glue them back together
        if len(parsed_uri[2]) != 0:
            uri += parsed_uri[2]
        if len(parsed_uri[3]) != 0:
            uri += '?' + parsed_uri[3]
        if len(parsed_uri[4]) != 0:
            uri += '#' + parsed_uri[4]

        # if it is an ms office or other binary doc, skip
        if re.match(r'.+\.(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
            print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
            continue

        # skip if in db already
        if sql_driver.page_exists(uri):
            print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
            continue

        # only add if not queued already
        if uri not in uris_to_process:
            print("\t\t%s | %-50s Adding." % (count, uri[:50]))
            uris_to_process.append(uri)
        else:
            print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))

    print('\t----------------------------------')
    print('\t%s addresses will now be webXray\'d' % len(uris_to_process))
    print('\t\t...you can go take a walk. ;-)')
    print('\t----------------------------------')

    myPool = Pool(pool_size)
    myPool.map(self.process_uri, uris_to_process)
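# The hand-gluing above reproduces what urllib.parse.urlunsplit does; a
# short sketch (with a hypothetical sample address) showing the two
# approaches agree:
from urllib.parse import urlsplit, urlunsplit

parsed = urlsplit('http://bücher.example/path?q=1#frag')
netloc = parsed[1].encode('idna').decode('utf-8')

manual = parsed[0] + '://' + netloc
if len(parsed[2]) != 0: manual += parsed[2]
if len(parsed[3]) != 0: manual += '?' + parsed[3]
if len(parsed[4]) != 0: manual += '#' + parsed[4]

assert manual == urlunsplit((parsed[0], netloc, parsed[2], parsed[3], parsed[4]))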