# NOTE: the methods below are excerpted from their enclosing classes and assume
# module-level imports along these lines (paths follow the webxray layout used
# elsewhere in this file):
#
#   import re
#   from datetime import datetime, timedelta
#   from webxray.PhantomDriver import PhantomDriver
#   from webxray.ChromeDriver import ChromeDriver
#   from webxray.OutputStore import OutputStore

def execute(self, url, browser_wait):
    """
    Main function, loads page and analyzes results.
    """

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Single Site Test On: %s' % url)
    print('\tBrowser type is %s' % self.browser_type)
    print('\tBrowser wait time is %s seconds' % browser_wait)

    # make sure it is an http(s) address
    if not re.match(r'^https?://', url):
        print('\tNot a valid url, aborting')
        return None

    # set up the specified browser driver; for headless chrome we first pull a
    # real user agent string, then rebuild the driver with it
    if self.browser_type == 'phantomjs':
        browser_driver = PhantomDriver()
    elif self.browser_type == 'chrome':
        browser_driver = ChromeDriver()
        chrome_ua = browser_driver.get_ua_for_headless()
        browser_driver = ChromeDriver(ua=chrome_ua)
    else:
        # guard added so an unrecognized type fails cleanly instead of
        # raising a NameError on browser_driver below
        print('\tUnknown browser type "%s", aborting' % self.browser_type)
        return None

    # attempt to get the page
    browser_output = browser_driver.get_webxray_scan_data(url, browser_wait)

    # if there was a problem we print the error
    if not browser_output['success']:
        print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result']))
        return None
    else:
        browser_output = browser_output['result']

    # get the ip, fqdn, domain, pubsuffix, and tld from the url;
    # we need the domain to figure out if cookies/elements are third-party
    origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

    # if we can't get page domain info we bail out
    if origin_ip_fqdn_domain_pubsuffix_tld is None:
        print('could not parse origin domain')
        return None

    origin_ip        = origin_ip_fqdn_domain_pubsuffix_tld[0]
    origin_fqdn      = origin_ip_fqdn_domain_pubsuffix_tld[1]
    origin_domain    = origin_ip_fqdn_domain_pubsuffix_tld[2]
    origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
    origin_tld       = origin_ip_fqdn_domain_pubsuffix_tld[4]

    print('\n\t------------------{ URL }------------------')
    print('\t' + url)
    print('\n\t------------------{ Final URL }------------------')
    print('\t' + browser_output['final_url'])
    print('\n\t------------------{ Domain }------------------')
    print('\t' + origin_domain)
    print('\n\t------------------{ Seconds to Complete Download }------------------')
    print('\t%s' % (browser_output['load_time'] / 1000))

    print('\n\t------------------{ 3rd Party Cookies }------------------')
    cookie_list = []
    for cookie in browser_output['cookies']:
        # get domain, pubsuffix, and tld from the cookie; we have to prepend
        # 'http://' or the parser will fail -- a hack that should be fixed upstream
        cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://' + cookie['domain'])

        # something went wrong with this cookie, skip it and keep processing the rest
        if cookie_ip_fqdn_domain_pubsuffix_tld is None:
            print('could not parse cookie')
            continue

        # otherwise, everything went fine
        cookie_ip        = cookie_ip_fqdn_domain_pubsuffix_tld[0]
        cookie_fqdn      = cookie_ip_fqdn_domain_pubsuffix_tld[1]
        cookie_domain    = cookie_ip_fqdn_domain_pubsuffix_tld[2]
        cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
        cookie_tld       = cookie_ip_fqdn_domain_pubsuffix_tld[4]

        # keep third-party cookies only
        if origin_domain not in cookie_domain:
            cookie_list.append(re.sub(r'^\.', '', cookie['domain']) + ' -> ' + cookie['name'])

    cookie_list.sort()
    count = 0
    for cookie in cookie_list:
        count += 1
        print('\t%s) %s' % (count, cookie))

    print('\n\t------------------{ 3p Domains Requested }------------------')
    element_domains = []
    for request in browser_output['processed_requests']:
        # requests with 'data:'/'about:'/'chrome:' schemes have no tld to parse, so skip them
        if re.match(r'^(data|about|chrome).+', request):
            continue

        element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

        # problem with this request, bail on it and do the next
        if element_ip_fqdn_domain_pubsuffix_tld is None:
            continue

        element_ip        = element_ip_fqdn_domain_pubsuffix_tld[0]
        element_fqdn      = element_ip_fqdn_domain_pubsuffix_tld[1]
        element_domain    = element_ip_fqdn_domain_pubsuffix_tld[2]
        element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
        element_tld       = element_ip_fqdn_domain_pubsuffix_tld[4]

        # track unique third-party domains
        if origin_domain not in element_domain:
            if element_domain not in element_domains:
                element_domains.append(element_domain)

    element_domains.sort()
    count = 0
    for domain in element_domains:
        count += 1
        if domain in self.domain_owners:
            # join the ownership chain with ' > ' separators, trimming the
            # trailing separator when printing
            lineage = ''
            for item in self.get_lineage(self.domain_owners[domain]):
                lineage += self.id_to_owner[item] + ' > '
            print('\t%s) %s [%s]' % (count, domain, lineage[:-3]))
        else:
            print('\t%s) %s [Unknown Owner]' % (count, domain))
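# Hedged usage sketch for execute() above -- not part of the original module.
# It assumes the method lives on webxray's SingleScan class, whose constructor
# takes the browser type and wires up self.url_parser, self.domain_owners, and
# self.id_to_owner internally; the url and wait time below are examples only.
if __name__ == '__main__':
    from webxray.SingleScan import SingleScan  # assumed module layout

    single_scan = SingleScan('chrome')  # or 'phantomjs'
    single_scan.execute('https://example.com', 30)  # wait 30 seconds for the page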
def process_url(self, url):
    """
    This function takes a specified url and loads it in the browser
    (phantomjs or chrome), which returns json-formatted output with
    relevant request data, etc.  The output_store class then puts
    this data in the db for later analysis.
    """

    # set up sql connection used to log errors and do timeseries checks
    if self.db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver
        sql_driver = MySQLDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    elif self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)
    else:
        # guard added so an unrecognized engine fails cleanly instead of
        # raising a NameError on sql_driver below
        print('Invalid db engine %s, aborting' % self.db_engine)
        return

    # output store does the heavy lifting of analyzing browser output and storing to db
    output_store = OutputStore(self.db_engine, self.db_name)

    # support for loading same page with multiple browsers - purposefully undocumented
    for browser_type in self.browser_types:
        # set up specified browser driver; note we need to set up a new
        # browser each time to get a fresh profile
        if browser_type == 'phantomjs':
            browser_driver = PhantomDriver()
        elif browser_type == 'chrome':
            browser_driver = ChromeDriver(ua=self.chrome_ua)

        # support for timeseries collections - purposefully undocumented
        if self.allow_timeseries:
            page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url, browser_type)
            if page_last_accessed_browser_type:
                time_diff = datetime.now() - page_last_accessed_browser_type[0]
                if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
                    print('\t\t%-50s Scanned too recently with %s' % (url[:50], browser_type))
                    continue

        # attempt to load the page, fail gracefully
        try:
            browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
        except:
            print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return

        # if there was a problem we log the error
        if not browser_output['success']:
            print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return
        else:
            # no error, treat result as browser output
            browser_output = browser_output['result']

        # attempt to store the output
        if output_store.store(url, browser_output):
            print('\t\t%-50s Success with %s' % (url[:50], browser_type))
        else:
            print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to store output')

    sql_driver.close()
    return
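# Hedged usage sketch for process_url() above -- not part of the original module.
# It assumes a Collector-style class (as in webxray) whose constructor sets
# db_engine, db_name, browser_types, chrome_ua, browser_wait, allow_timeseries,
# and interval_minutes; the constructor arguments shown are hypothetical.
# Since each call is independent, a process pool is a natural way to fan a
# page list out over process_url.
if __name__ == '__main__':
    from multiprocessing import Pool
    from webxray.Collector import Collector  # assumed module layout

    collector = Collector(db_engine='sqlite', db_name='wbxr_demo')  # hypothetical args
    urls = ['https://example.com', 'https://example.org']

    # each worker loads one page in its own browser and stores results to the shared db
    with Pool(processes=4) as pool:
        pool.map(collector.process_url, urls)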