def process_uri(self, uri):
    sql_driver = MySQLDriver(self.db_name)
    output_store = OutputStore(self.db_name)
    phantom_driver = PhantomDriver('--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

    # this can be higher or lower depending on network load;
    # generally, 90 seconds seems to be fine, so keep with it
    try:
        phantom_output = phantom_driver.execute(uri, 90)
    except Exception:
        print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
        sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
        # close our db connections before bailing out
        sql_driver.close()
        output_store.close()
        return

    if re.match(r'^FAIL.+', phantom_output):
        print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
        sql_driver.log_error(uri, phantom_output)
    else:
        print("\t\t%-50s %s" % (uri[:50], output_store.store(uri, phantom_output)))

    # closes our db connections
    sql_driver.close()
    output_store.close()
    return
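# In webXray this kind of worker is typically fanned out over many pages at
# once. A minimal sketch of such a driver loop using multiprocessing; the
# run_collection name, the collector instance, and the uris list are
# illustrative assumptions, not part of the original module.
from multiprocessing import Pool

def run_collection(collector, uris, pool_size=4):
    # each worker loads one page in phantomjs and stores the result;
    # process_uri opens and closes its own db connections, so the
    # collector instance stays cheap to pickle
    with Pool(pool_size) as pool:
        pool.map(collector.process_uri, uris)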
def analyze_url(uri: str) -> List[str]:
    """
    Analyze the given URI and get page information by using webXray.

    Arg:
        uri (str): Any URI that has not been analyzed yet.

    Return:
        list[str]: domain names of third-party cookies set by the page.
    """
    parser = ParseURI()
    pd = PhantomDriver("--ignore-ssl-errors=true --ssl-protocol=any", "wbxr_logger.js")
    output = pd.execute(uri, 25)

    if re.match("^FAIL.+", output):
        # Probably this isn't needed
        return []

    try:
        parsed_data = json.loads(re.search(r"(\{.+\})", output).group(1))
    except Exception as e:
        print(e)
        return []

    orig_domain = parser.get_domain_pubsuffix_tld(uri)[0]
    cookie_domains = map(lambda x: x["domain"], parsed_data["cookies"])
    # cookie domains carry a leading dot, hence x[1:] before parsing
    tpcookie_domains = filter(
        lambda x: parser.get_domain_pubsuffix_tld(f"http://{x[1:]}")[0] != orig_domain,
        cookie_domains,
    )
    tpcookie_domain_names = map(remove_dot, tpcookie_domains)
    return list(tpcookie_domain_names)
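# A minimal usage sketch for analyze_url; hypothetical driver code, assuming
# re, json, typing.List, webXray's ParseURI/PhantomDriver, and the remove_dot
# helper are all importable in this module.
if __name__ == "__main__":
    for uri in ["http://example.com", "http://example.org"]:
        tp_domains = analyze_url(uri)
        # one line per URI listing the third-party cookie domains found
        print(uri, "->", ", ".join(tp_domains) or "none")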
def execute(self, url, browser_wait):
    """
    Main function, loads page and analyzes results.
    """

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('Single Site Test On: %s' % url)
    print('\tBrowser type is %s' % self.browser_type)
    print('\tBrowser wait time is %s seconds' % browser_wait)

    # make sure it is an http(s) address
    if not re.match('^https?://', url):
        print('\tNot a valid url, aborting')
        return None

    # import and set up specified browser driver
    if self.browser_type == 'phantomjs':
        browser_driver = PhantomDriver()
    elif self.browser_type == 'chrome':
        browser_driver = ChromeDriver()
        chrome_ua = browser_driver.get_ua_for_headless()
        browser_driver = ChromeDriver(ua=chrome_ua)

    # attempt to get the page
    browser_output = browser_driver.get_webxray_scan_data(url, browser_wait)

    # if there was a problem we print the error
    if browser_output['success'] == False:
        print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result']))
        return
    else:
        browser_output = browser_output['result']

    # get the ip, fqdn, domain, pubsuffix, and tld from the URL
    # we need the domain to figure out if cookies/elements are third-party
    origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

    # if we can't get page domain info we bail out
    if origin_ip_fqdn_domain_pubsuffix_tld is None:
        print('could not parse origin domain')
        return None

    origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
    origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
    origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
    origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
    origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

    print('\n\t------------------{ URL }------------------')
    print('\t' + url)
    print('\n\t------------------{ Final URL }------------------')
    print('\t' + browser_output['final_url'])
    print('\n\t------------------{ Domain }------------------')
    print('\t' + origin_domain)
    print('\n\t------------------{ Seconds to Complete Download }------------------')
    print('\t%s' % (browser_output['load_time'] / 1000))

    print('\n\t------------------{ 3rd Party Cookies }------------------')
    cookie_list = []
    for cookie in browser_output['cookies']:
        # get domain, pubsuffix, and tld from cookie
        # we have to append http b/c the parser will fail, this is a lame hack, should fix
        cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://' + cookie['domain'])

        # something went wrong, but we continue to process the elements
        if cookie_ip_fqdn_domain_pubsuffix_tld is None:
            print('could not parse cookie')
            continue

        # otherwise, everything went fine
        cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
        cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
        cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
        cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
        cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

        # print external cookies
        if origin_domain not in cookie_domain:
            cookie_list.append(re.sub(r'^\.', '', cookie['domain']) + ' -> ' + cookie['name'])

    cookie_list.sort()
    count = 0
    for cookie in cookie_list:
        count += 1
        print('\t%s) %s' % (count, cookie))

    print('\n\t------------------{ 3p Domains Requested }------------------')
    element_domains = []
    for request in browser_output['processed_requests']:
        # if the request starts with 'data'/etc we can't parse the tld anyway, so skip
        if re.match('^(data|about|chrome).+', request):
            continue

        element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

        # problem with this request, bail on it and do the next
        if element_ip_fqdn_domain_pubsuffix_tld is None:
            continue

        element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
        element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
        element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
        element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
        element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

        if origin_domain not in element_domain:
            if element_domain not in element_domains:
                element_domains.append(element_domain)

    element_domains.sort()
    count = 0
    for domain in element_domains:
        count += 1
        if domain in self.domain_owners:
            lineage = ''
            for item in self.get_lineage(self.domain_owners[domain]):
                lineage += self.id_to_owner[item] + ' > '
            print('\t%s) %s [%s]' % (count, domain, lineage[:-3]))
        else:
            print('\t%s) %s [Unknown Owner]' % (count, domain))
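# Caveat on the third-party test above: "origin_domain not in cookie_domain"
# is substring containment, so an origin like "x.co" also matches inside the
# unrelated "box.com" and suppresses it from the report. A sketch of a
# stricter check, assuming both values are registered domains as returned by
# the parser; the is_third_party name is illustrative.
def is_third_party(origin_domain, candidate_domain):
    # compare registered domains exactly instead of by containment
    return candidate_domain != origin_domain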
def process_url(self, url):
    """
    this function takes a specified url, loads it in the browser
    (currently phantomjs) and returns json-formatted output with
    relevant request data, etc.

    the output_store class then puts this data in the db for later analysis
    """

    # set up sql connection used to log errors and do timeseries checks
    if self.db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver
        sql_driver = MySQLDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    elif self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)

    # output store does the heavy lifting of analyzing browser output and storing to db
    output_store = OutputStore(self.db_engine, self.db_name)

    # support for loading same page with multiple browsers - purposefully undocumented
    for browser_type in self.browser_types:
        # import and set up specified browser driver
        # note we need to set up a new browser each time to get a fresh profile
        if browser_type == 'phantomjs':
            browser_driver = PhantomDriver()
        elif browser_type == 'chrome':
            browser_driver = ChromeDriver(ua=self.chrome_ua)

        # support for timeseries collections - purposefully undocumented
        if self.allow_timeseries:
            page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url, browser_type)
            if page_last_accessed_browser_type:
                time_diff = datetime.now() - page_last_accessed_browser_type[0]
                if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
                    print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
                    continue

        # attempt to load the page, fail gracefully
        try:
            browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
        except Exception:
            print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return

        # if there was a problem we log the error
        if browser_output['success'] == False:
            print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return
        else:
            # no error, treat result as browser output
            browser_output = browser_output['result']

        # attempt to store the output
        if output_store.store(url, browser_output):
            print('\t\t%-50s Success with %s' % (url[:50], browser_type))
        else:
            print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')

    sql_driver.close()
    return
def process_url(self, url):
    """
    this function takes a specified url, loads it in the browser
    (currently phantomjs) and returns json-formatted output with
    relevant request data, etc.

    the output_store class then puts this data in the db for later analysis
    """

    # set up sql connection used to log errors and do timeseries checks
    if self.db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver
        sql_driver = MySQLDriver(self.db_name)
    elif self.db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver
        sql_driver = PostgreSQLDriver(self.db_name)
    elif self.db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver
        sql_driver = SQLiteDriver(self.db_name)

    # output store does the heavy lifting of analyzing browser output and storing to db
    output_store = OutputStore(self.db_engine, self.db_name)

    # support for loading same page with multiple browsers - purposefully undocumented
    for browser_type in self.browser_types:
        # import and set up specified browser driver
        # note we need to set up a new browser each time to get a fresh profile
        if browser_type == 'phantomjs':
            browser_driver = PhantomDriver()
        elif browser_type == 'chrome':
            browser_driver = ChromeDriver(ua=self.chrome_ua)

        # support for timeseries collections - purposefully undocumented
        if self.allow_timeseries:
            page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url, browser_type)
            if page_last_accessed_browser_type:
                time_diff = datetime.now() - page_last_accessed_browser_type[0]
                if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
                    print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
                    continue

        # attempt to load the page, fail gracefully
        try:
            browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
        except Exception:
            print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return

        # if there was a problem browser_output will be None
        if browser_output is None:
            print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')
            sql_driver.close()
            return

        # attempt to store the output
        if output_store.store(url, browser_output):
            print('\t\t%-50s Success with %s' % (url[:50], browser_type))
        else:
            print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
            sql_driver.log_error(url, 'Unable to load page')

    sql_driver.close()
    return
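# In both variants above, an unrecognized self.db_engine value leaves
# sql_driver unbound and triggers a NameError later. A sketch of a factored
# helper that fails loudly instead; the get_sql_driver name is an assumption,
# not part of webXray.
def get_sql_driver(db_engine, db_name):
    # map each supported engine to its driver class, importing lazily
    # as the original code does
    if db_engine == 'mysql':
        from webxray.MySQLDriver import MySQLDriver as Driver
    elif db_engine == 'postgres':
        from webxray.PostgreSQLDriver import PostgreSQLDriver as Driver
    elif db_engine == 'sqlite':
        from webxray.SQLiteDriver import SQLiteDriver as Driver
    else:
        raise ValueError('unsupported db_engine: %s' % db_engine)
    return Driver(db_name)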
def report(self, uri):
    phantom_driver = PhantomDriver('--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')
    phantom_output = phantom_driver.execute(uri, 90)

    if re.match(r'^FAIL.+', phantom_output):
        print("\tERROR URI: " + uri + "\n\t\tExiting on: " + phantom_output)
        exit()

    origin_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(uri)
    origin_domain = origin_domain_pubsuffix_tld[0]
    origin_pubsuffix = origin_domain_pubsuffix_tld[1]
    origin_tld = origin_domain_pubsuffix_tld[2]

    # parse out the json from our phantom_output
    try:
        data = json.loads(re.search(r'(\{.+\})', phantom_output).group(1))
    except Exception as e:
        print("\t\tException: %s" % e)
        print("\t\tphantom_output was unreadable")
        print(phantom_output[:100])
        return ''

    print("\n\t------------------{ URI }------------------")
    print("\t" + uri)
    print("\n\t------------------{ Final URI }------------------")
    print("\t" + data["final_uri"])
    print("\n\t------------------{ Domain }------------------")
    print("\t" + origin_domain)
    print("\n\t------------------{ Title }------------------")
    print("\t" + data["title"])
    print("\n\t------------------{ Description }------------------")
    print("\t" + data["meta_desc"])

    print("\n\t------------------{ 3rd Party Cookies }------------------")
    cookie_list = []
    for cookie in data["cookies"]:
        # get domain, pubsuffix, and tld from cookie
        # we have to append http b/c the parser will fail, this is a lame hack, should fix
        cookie_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld("http://" + cookie["domain"])
        cookie_domain = cookie_domain_pubsuffix_tld[0]
        cookie_pubsuffix = cookie_domain_pubsuffix_tld[1]
        cookie_tld = cookie_domain_pubsuffix_tld[2]

        # print external cookies
        if origin_domain not in cookie_domain:
            cookie_list.append(re.sub(r"^\.", "", cookie["domain"]) + " -> " + cookie["name"])  # +" = "+cookie["value"])

    cookie_list.sort()
    for cookie in cookie_list:
        print("\t" + cookie)

    print("\n\t------------------{ External Requests }------------------")
    requested_domains = []
    for request in data["requested_uris"]:
        # if the request starts with "data" we can't parse the tld anyway, so skip
        if re.match('^(data|about|chrome).+', request):
            continue

        # get domain, pubsuffix, and tld from request
        requested_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(request)
        requested_domain = requested_domain_pubsuffix_tld[0]
        requested_pubsuffix = requested_domain_pubsuffix_tld[1]
        requested_tld = requested_domain_pubsuffix_tld[2]

        if origin_domain not in requested_domain:
            if requested_domain not in requested_domains:
                requested_domains.append(requested_domain)

    requested_domains.sort()
    for domain in requested_domains:
        print("\t" + domain)
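# The comments above flag the '"http://" + cookie["domain"]' construction as
# a hack. A sketch of a small normalizer that also strips the leading dot
# cookie domains often carry; the helper name is illustrative.
def cookie_domain_to_url(cookie_domain):
    # ".example.com" -> "http://example.com", giving the URI parser a
    # well-formed address rather than a bare, dot-prefixed domain
    return 'http://' + cookie_domain.lstrip('.')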