def __init__(self, db_name=None, db_engine=None):
	# if we have db params set up global db connection, otherwise we don't bother
	if db_name:
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver(db_name)
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver(db_name)
		else:
			print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()
	elif db_engine:
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			self.sql_driver = SQLiteDriver()
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			self.sql_driver = PostgreSQLDriver()
		else:
			print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

	self.url_parser = ParseURL()
def __init__(self):
	self.url_parser = ParseURL()
	self.domain_owners = {}
	self.id_to_owner = {}
	self.id_to_parent = {}

	# set up the domain ownership dictionary
	for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')):
		if item['id'] == '-': continue
		self.id_to_owner[item['id']] = item['name']
		self.id_to_parent[item['id']] = item['parent_id']
		for domain in item['domains']:
			self.domain_owners[domain] = item['id']
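# Illustrative sketch (editor's example, not part of webXray): the three dictionaries
# built in the __init__ above support a two-step lookup from a request domain to the
# name of its owner. The dicts are passed in explicitly here to keep the example
# self-contained; the sample domain in the comment is hypothetical.
def lookup_owner_name(domain, domain_owners, id_to_owner):
	"""Return the owner name for a domain, or None if the domain is unknown."""
	owner_id = domain_owners.get(domain)
	if owner_id is None:
		return None
	return id_to_owner[owner_id]

# e.g. lookup_owner_name('some-tracker.example', self.domain_owners, self.id_to_owner)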
def __init__(self, db_name, db_engine):
	self.db_name = db_name
	self.utilities = Utilities()
	self.url_parser = ParseURL()
	self.debug = False

	if db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		self.sql_driver = SQLiteDriver(self.db_name)
	elif db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		self.sql_driver = PostgreSQLDriver(self.db_name)
	else:
		print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
		quit()

	self.config = self.sql_driver.get_config()
def __init__(self, browser_type):
	self.url_parser = ParseURL()
	self.browser_type = browser_type
	self.domain_owners = {}
	self.id_to_owner = {}
	self.id_to_parent = {}

	# set up the domain ownership dictionary
	# note: encoding is given explicitly since owner names may contain non-ASCII characters
	for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')):
		self.id_to_owner[item['id']] = item['owner_name']
		self.id_to_parent[item['id']] = item['parent_id']
		for domain in item['domains']:
			self.domain_owners[domain] = item['id']
def __init__(self, browser_type):
	self.url_parser = ParseURL()
	self.browser_type = browser_type
	self.domain_owners = {}
	self.id_to_owner = {}
	self.id_to_parent = {}

	# set up the domain ownership dictionary
	for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')):
		self.id_to_owner[item['id']] = item['owner_name']
		self.id_to_parent[item['id']] = item['parent_id']
		for domain in item['domains']:
			self.domain_owners[domain] = item['id']
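# Illustrative sketch (editor's example): id_to_parent built above encodes corporate
# ownership as a chain of parent ids. Walking it iteratively, as below, yields the
# same result as the recursive get_lineage method used elsewhere in webXray; the
# argument names are this example's own.
def ownership_lineage(owner_id, id_to_parent):
	"""Return [owner_id, parent_id, grandparent_id, ...] up to the root owner."""
	lineage = []
	while owner_id is not None:
		lineage.append(owner_id)
		owner_id = id_to_parent[owner_id]
	return lineage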
def __init__(self, db_engine, db_name):
	self.db_engine = db_engine
	self.db_name = db_name
	self.utilities = Utilities()
	self.url_parser = ParseURL()
class OutputStore:
	"""
	This class receives data from the browser, processes it, and stores it in the db
	"""

	def __init__(self, db_engine, db_name):
		self.db_engine = db_engine
		self.db_name = db_name
		self.utilities = Utilities()
		self.url_parser = ParseURL()
	# init

	def store(self, url, browser_output, store_source=False, store_1p=True):
		"""
		this is the primary function of this class, it takes the url of the given page
		and the request and cookie data generated by the browser

		data is cleaned up with some minor analysis (eg file types) and stored
		for later in-depth analysis

		there is an option to store first party requests as well as third, turned on
		by default; to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
		and is turned off by default
		"""

		# open up a sql connection
		if self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
			exit()

		# get the ip, fqdn, domain, pubsuffix, and tld
		# we need the domain to figure out if cookies/elements are third-party
		origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

		# if we can't get page domain info we fail gracefully
		if origin_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
			return False

		origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
		origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
		origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
		origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
		origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

		# sql_driver.add_domain both stores the new domain and returns its db row id
		# if it is already in db just return the existing id
		page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

		# if the final page is https (often after a redirect), mark it appropriately
		if browser_output['final_url'][:5] == 'https':
			page_is_ssl = True
		else:
			page_is_ssl = False

		if store_source:
			source = browser_output['source']
		else:
			source = None

		# add page
		page_id = sql_driver.add_page(
			browser_output['browser_type'],
			browser_output['browser_version'],
			browser_output['browser_wait'],
			browser_output['title'],
			browser_output['meta_desc'],
			url,
			browser_output['final_url'],
			page_is_ssl,
			source,
			browser_output['load_time'],
			page_domain_id
		)

		# store cookies
		for cookie in browser_output['cookies']:
			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			# note:
			#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
			cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])

			# something went wrong, log and fail gracefully
			if cookie_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
				continue

			# otherwise, everything went fine
			cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
			cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
			cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
			cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
			cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

			# mark third-party cookies
			if origin_domain != cookie_domain:
				is_3p_cookie = True
			else:
				is_3p_cookie = False

			# this is a first party cookie, see if we want to store it
			if is_3p_cookie is False and store_1p is False:
				continue

			# sql_driver.add_domain both stores the new domain and returns its id
			cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

			# name and domain are required, so if they fail we just continue
			try: name = cookie['name']
			except: continue

			try: domain = cookie_domain
			except: continue

			# these are optional, fill with null values if fail
			try: secure = cookie['secure']
			except: secure = None

			try: path = cookie['path']
			except: path = None

			try: httponly = cookie['httponly']
			except: httponly = None

			try: expiry = cookie['expiry']
			except: expiry = None

			try: value = cookie['value']
			except: value = None

			# all done with this cookie
			sql_driver.add_cookie(
				page_id,
				name, secure, path, domain,
				httponly, expiry, value,
				is_3p_cookie, cookie_domain_id
			)

		# process requests now
		for request in browser_output['processed_requests']:
			# if the request starts with the following we can't parse anyway, so skip
			if re.match('^(data|about|chrome|blob).+', request):
				continue

			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

			# problem with this request, log and fail gracefully
			if element_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing element request: '+request)
				continue

			element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
			element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
			element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
			element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
			element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

			# sql_driver.add_domain both stores the new domain and returns its db row id
			element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

			# mark third-party elements based on domain
			if origin_domain != element_domain:
				is_3p_element = True
			else:
				is_3p_element = False

			# if we are not storing 1p elements continue
			if is_3p_element is False and store_1p is False:
				continue

			if request[:5] == 'https' or request[:3] == 'wss':
				element_is_ssl = True
			else:
				element_is_ssl = False

			try: received = browser_output['processed_requests'][request]['received']
			except: received = None

			# get domain of referer and determine if page leaked by referer
			try: referer = browser_output['processed_requests'][request]['referer']
			except: referer = None

			if referer and len(referer) != 0:
				referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)
				if referer_ip_fqdn_domain_pubsuffix_tld:
					if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
						page_domain_in_referer = True
					else:
						page_domain_in_referer = False
				else:
					page_domain_in_referer = None
					sql_driver.log_error(url, 'Error parsing referer header: '+referer)
			else:
				page_domain_in_referer = None

			try: start_time_offset = browser_output['processed_requests'][request]['start_time_offset']
			except: start_time_offset = None

			try: load_time = browser_output['processed_requests'][request]['load_time']
			except: load_time = None

			try: status = browser_output['processed_requests'][request]['status']
			except: status = None

			try: status_text = browser_output['processed_requests'][request]['status_text']
			except: status_text = None

			try: content_type = browser_output['processed_requests'][request]['content_type']
			except: content_type = None

			try: body_size = browser_output['processed_requests'][request]['body_size']
			except: body_size = None

			try: request_headers = str(browser_output['processed_requests'][request]['request_headers'])
			except: request_headers = None

			try: response_headers = str(browser_output['processed_requests'][request]['response_headers'])
			except: response_headers = None

			# consider anything before the "?" to be the element_url
			try:
				element_url = re.search('^(.+?)\?.+$', request).group(1)
			except:
				element_url = request

			# consider anything after the "?" to be the args
			try:
				element_args = re.search('^.+(\?.+)$', request).group(1) # start url args
			except:
				element_args = None

			# attempt to parse off the extension
			try:
				element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower()
			except:
				element_extension = None

			# lists of common extensions, can be expanded
			image_extensions = ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
			script_extensions = ['js', 'javascript']
			data_extensions = ['json', 'jsonp', 'xml']
			font_extensions = ['woff', 'ttf', 'otf']
			static_extensions = ['html', 'htm', 'shtml']
			dynamic_extensions = ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

			# figure out what type of element it is
			if element_extension in image_extensions:
				element_type = 'image'
			elif element_extension in script_extensions:
				element_type = 'javascript'
			elif element_extension in data_extensions:
				element_type = 'data_structured'
			elif element_extension == 'css':
				element_type = 'style_sheet'
			elif element_extension in font_extensions:
				element_type = 'font'
			elif element_extension in static_extensions:
				element_type = 'page_static'
			elif element_extension in dynamic_extensions:
				element_type = 'page_dynamic'
			elif element_extension == 'swf' or element_extension == 'fla':
				element_type = 'Shockwave Flash'
			else:
				element_type = None

			file_md5 = None

			# final task is to truncate the request if it is over 2k characters
			# as it is likely binary data and may cause problems inserting
			# into TEXT fields in database
			#
			# TODO:
			#	better handle binary data in general
			if len(request) >= 2000: request = request[:2000]
			if len(element_url) >= 2000: element_url = element_url[:2000]

			# store request
			sql_driver.add_element(
				page_id,
				request, element_url,
				is_3p_element, element_is_ssl,
				received,
				referer, page_domain_in_referer,
				start_time_offset, load_time,
				status, status_text, content_type, body_size,
				request_headers, response_headers,
				element_extension, element_type,
				element_args, element_domain_id
			)

		# close db connection
		sql_driver.close()

		return True
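# Illustrative sketch (editor's example): the extension-to-type mapping inside store()
# can be exercised on its own. This standalone helper mirrors the logic above for a
# few of the types only; the sample URL is hypothetical.
def classify_element(url):
	"""Return a coarse element type ('image', 'javascript', 'style_sheet') or None."""
	import re
	match = re.search(r'\.([0-9A-Za-z]+)(?:\?.*)?$', url)
	extension = match.group(1).lower() if match else None
	if extension in ('png', 'jpg', 'jpeg', 'gif', 'svg', 'webp'):
		return 'image'
	elif extension in ('js', 'javascript'):
		return 'javascript'
	elif extension == 'css':
		return 'style_sheet'
	return None

# e.g. classify_element('https://cdn.example.com/app.js?v=2') returns 'javascript'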
class SingleScan: """ Loads and analyzes a single page, print outputs to cli Very simple and does not require a db being configured """ def __init__(self, browser_type): self.url_parser = ParseURL() self.browser_type = browser_type self.domain_owners = {} self.id_to_owner = {} self.id_to_parent = {} # set up the domain ownership dictionary for item in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')): self.id_to_owner[item['id']] = item['owner_name'] self.id_to_parent[item['id']] = item['parent_id'] for domain in item['domains']: self.domain_owners[domain] = item['id'] # end init def get_lineage(self, id): """ Find the upward chain of ownership for a given domain. """ if self.id_to_parent[id] == None: return [id] else: return [id] + self.get_lineage(self.id_to_parent[id]) # end get_lineage def execute(self, url, browser_wait): """ Main function, loads page and analyzes results. """ print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print('Single Site Test On: %s' % url) print('\tBrowser type is %s' % self.browser_type) print('\tBrowser wait time is %s seconds' % browser_wait) # make sure it is an http(s) address if not re.match('^https?://', url): print('\tNot a valid url, aborting') return None # import and set up specified browser driver if self.browser_type == 'chrome': browser_driver = ChromeDriver() chrome_ua = browser_driver.get_ua_for_headless() browser_driver = ChromeDriver(ua=chrome_ua) # attempt to get the page browser_output = browser_driver.get_webxray_scan_data( url, browser_wait) # if there was a problem we print the error if browser_output['success'] == False: print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result'])) return else: browser_output = browser_output['result'] # get the ip, fqdn, domain, pubsuffix, and tld from the URL # we need the domain to figure out if cookies/elements are third-party origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( url) # if we can't get page domain info we bail out if origin_ip_fqdn_domain_pubsuffix_tld is None: print('could not parse origin domain') return None origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0] origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1] origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2] origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3] origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4] print('\n\t------------------{ URL }------------------') print('\t' + url) print('\n\t------------------{ Final URL }------------------') print('\t' + browser_output['final_url']) print('\n\t------------------{ Domain }------------------') print('\t' + origin_domain) print( '\n\t------------------{ Seconds to Complete Download }------------------' ) print('\t%s' % (browser_output['load_time'] / 1000)) print('\n\t------------------{ 3rd Party Cookies }------------------') cookie_list = [] for cookie in browser_output['cookies']: # get domain, pubsuffix, and tld from cookie # we have to append http b/c the parser will fail, this is a lame hack, should fix cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( 'http://' + cookie['domain']) # something went wrong, but we continue to go process the elements if cookie_ip_fqdn_domain_pubsuffix_tld is None: print('could not parse cookie') continue # otherwise, everything went fine cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0] cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1] cookie_domain = 
cookie_ip_fqdn_domain_pubsuffix_tld[2] cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3] cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4] # print external cookies if origin_domain not in cookie_domain: cookie_list.append( re.sub('^\.', '', cookie['domain']) + ' -> ' + cookie['name']) cookie_list.sort() count = 0 for cookie in cookie_list: count += 1 print('\t%s) %s' % (count, cookie)) print( '\n\t------------------{ 3p Domains Requested }------------------') element_domains = [] for request in browser_output['processed_requests']: # if the request starts with 'data'/etc we can't parse tld anyway, so skip if re.match('^(data|about|chrome).+', request): continue element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( request) # problem with this request, bail on it and do the next if element_ip_fqdn_domain_pubsuffix_tld is None: continue element_ip = element_ip_fqdn_domain_pubsuffix_tld[0] element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1] element_domain = element_ip_fqdn_domain_pubsuffix_tld[2] element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3] element_tld = element_ip_fqdn_domain_pubsuffix_tld[4] if origin_domain not in element_domain: if element_domain not in element_domains: element_domains.append(element_domain) element_domains.sort() count = 0 for domain in element_domains: count += 1 if domain in self.domain_owners: lineage = '' for item in self.get_lineage(self.domain_owners[domain]): lineage += self.id_to_owner[item] + ' > ' print('\t%s) %s [%s]' % (count, domain, lineage[:-3])) else: print('\t%s) %s [Unknown Owner]' % (count, domain))
class SingleScan: """ Loads and analyzes a single page, print outputs to cli Very simple and does not require a db being configured """ def __init__(self, browser_type): self.url_parser = ParseURL() self.browser_type = browser_type self.domain_owners = {} self.id_to_owner = {} self.id_to_parent = {} # set up the domain ownership dictionary for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')): self.id_to_owner[item['id']] = item['owner_name'] self.id_to_parent[item['id']] = item['parent_id'] for domain in item['domains']: self.domain_owners[domain] = item['id'] # end init def get_lineage(self, id): """ Find the upward chain of ownership for a given domain. """ if self.id_to_parent[id] == None: return [id] else: return [id] + self.get_lineage(self.id_to_parent[id]) # end get_lineage def execute(self, url, browser_wait): """ Main function, loads page and analyzes results. """ print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') print('Single Site Test On: %s' % url) print('\tBrowser type is %s' % self.browser_type) print('\tBrowser wait time is %s seconds' % browser_wait) # make sure it is an http(s) address if not re.match('^https?://', url): print('\tNot a valid url, aborting') return None # import and set up specified browser driver if self.browser_type == 'phantomjs': browser_driver = PhantomDriver() elif self.browser_type == 'chrome': browser_driver = ChromeDriver() chrome_ua = browser_driver.get_ua_for_headless() browser_driver = ChromeDriver(ua=chrome_ua) # attempt to get the page browser_output = browser_driver.get_webxray_scan_data(url, browser_wait) # if there was a problem we print the error if browser_output['success'] == False: print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result'])) return else: browser_output = browser_output['result'] # get the ip, fqdn, domain, pubsuffix, and tld from the URL # we need the domain to figure out if cookies/elements are third-party origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url) # if we can't get page domain info we bail out if origin_ip_fqdn_domain_pubsuffix_tld is None: print('could not parse origin domain') return None origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0] origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1] origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2] origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3] origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4] print('\n\t------------------{ URL }------------------') print('\t'+url) print('\n\t------------------{ Final URL }------------------') print('\t'+browser_output['final_url']) print('\n\t------------------{ Domain }------------------') print('\t'+origin_domain) print('\n\t------------------{ Seconds to Complete Download }------------------') print('\t%s' % (browser_output['load_time']/1000)) print('\n\t------------------{ 3rd Party Cookies }------------------') cookie_list = [] for cookie in browser_output['cookies']: # get domain, pubsuffix, and tld from cookie # we have to append http b/c the parser will fail, this is a lame hack, should fix cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain']) # something went wrong, but we continue to go process the elements if cookie_ip_fqdn_domain_pubsuffix_tld is None: print('could not parse cookie') continue # otherwise, everything went fine cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0] cookie_fqdn = 
cookie_ip_fqdn_domain_pubsuffix_tld[1] cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2] cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3] cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4] # print external cookies if origin_domain not in cookie_domain: cookie_list.append(re.sub('^\.', '', cookie['domain'])+' -> '+cookie['name']) cookie_list.sort() count = 0 for cookie in cookie_list: count += 1 print('\t%s) %s' % (count,cookie)) print('\n\t------------------{ 3p Domains Requested }------------------') element_domains = [] for request in browser_output['processed_requests']: # if the request starts with 'data'/etc we can't parse tld anyway, so skip if re.match('^(data|about|chrome).+', request): continue element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request) # problem with this request, bail on it and do the next if element_ip_fqdn_domain_pubsuffix_tld is None: continue element_ip = element_ip_fqdn_domain_pubsuffix_tld[0] element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1] element_domain = element_ip_fqdn_domain_pubsuffix_tld[2] element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3] element_tld = element_ip_fqdn_domain_pubsuffix_tld[4] if origin_domain not in element_domain: if element_domain not in element_domains: element_domains.append(element_domain) element_domains.sort() count = 0 for domain in element_domains: count += 1 if domain in self.domain_owners: lineage = '' for item in self.get_lineage(self.domain_owners[domain]): lineage += self.id_to_owner[item]+' > ' print('\t%s) %s [%s]' % (count, domain, lineage[:-3])) else: print('\t%s) %s [Unknown Owner]' % (count, domain))
def __init__(self, config, port_offset=1, chrome_path=None, headless=True):
	self.debug = False

	# unpack config
	if self.debug: print(config)
	self.prewait = config['client_prewait']
	self.no_event_wait = config['client_no_event_wait']
	self.max_wait = config['client_max_wait']
	self.return_page_text = config['client_get_text']
	self.return_bodies = config['client_get_bodies']
	self.return_bodies_base64 = config['client_get_bodies_b64']
	self.return_screen_shot = config['client_get_screen_shot']
	self.reject_redirects = config['client_reject_redirects']
	self.crawl_depth = config['client_crawl_depth']
	self.crawl_retries = config['client_crawl_retries']
	self.page_load_strategy = config['client_page_load_strategy']
	self.min_internal_links = config['client_min_internal_links']
	self.headless = headless

	# custom library in /webxray
	self.url_parser = ParseURL()

	# prevents get_scan from closing browser
	# when we are doing a crawl
	self.is_crawl = False

	# gets overwritten once, so we don't have to keep
	# figuring it out when doing crawls
	self.browser_type = None
	self.browser_version = None
	self.user_agent = None

	# we can override the path here
	if chrome_path:
		chrome_cmd = chrome_path
	else:
		# if path is not specified we use the common
		# paths for each os
		if platform.system() == 'Darwin':
			chrome_cmd = '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome '
		elif platform.system() == 'Linux':
			chrome_cmd = '/usr/bin/google-chrome '
		elif platform.system() == 'Windows':
			chrome_cmd = 'start chrome '
		else:
			print('Unable to determine Operating System and therefore cannot guess correct Chrome path, see ChromeDriver.py for details.')
			exit()

	# use port offset to avoid collisions between processes
	port = 9222+port_offset

	# each process will use its own debugging port or we use default 9222
	chrome_cmd += '--remote-debugging-port=%s' % port

	# sets up blank profile
	chrome_cmd += ' --guest'

	# not sure this really does anything
	chrome_cmd += ' --disable-gpu'

	# disable sandbox to work inside docker
	chrome_cmd += ' --no-sandbox'

	# set up headless
	if self.headless: chrome_cmd += ' --headless'

	# if we're in production send the subprocess output to dev/null, None is normal
	if not self.debug:
		devnull = open(os.devnull, 'w')
	else:
		devnull = None

	# run command as subprocess
	if self.debug: print(f'going to run command: "{chrome_cmd}"')
	subprocess.Popen(chrome_cmd, shell=True, stdin=None, stdout=devnull, stderr=devnull, close_fds=True)

	# allow browser to launch
	time.sleep(5)

	# the debugger address has a 'json' path where we can find the websocket
	# address which is how we send devtools commands, thus we extract the value
	# "webSocketDebuggerUrl" from the first json object
	try:
		debuggerAddress_json = json.loads(urllib.request.urlopen('http://localhost:%s/json' % port).read().decode())
		if self.debug: print(debuggerAddress_json)
		webSocketDebuggerUrl = debuggerAddress_json[0]['webSocketDebuggerUrl']
		self.launched = True
	except Exception as e:
		self.launched = False
		return

	# third, once we have the websocket address we open a connection
	# and we are (finally) able to communicate with chrome via devtools!
	# note this connection must be closed!
	self.devtools_connection = create_connection(webSocketDebuggerUrl)

	# important, makes sure we don't get stuck
	# waiting for messages to arrive
	self.devtools_connection.settimeout(3)

	# this is incremented globally
	self.current_ws_command_id = 0

	# prevent downloading files, the /dev/null is redundant
	if self.debug: print('going to disable downloading')
	response = self.get_single_ws_response('Page.setDownloadBehavior','"behavior":"deny","downloadPath":"/dev/null"')
	if response['success'] == False:
		# note: __init__ cannot return a value, so we bail out here
		self.exit()
		return
	else:
		response = response['result']
	if self.debug: print(f'{response}')

	# done
	return
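# Illustrative sketch (editor's example): get_single_ws_response above wraps a Chrome
# DevTools Protocol exchange. At its core a command is a JSON object with an
# incrementing "id", a "method", and "params" sent over the websocket opened in
# __init__. The helper below is a simplified stand-in, not the webXray method, and
# does not match replies to command ids.
def send_devtools_command(ws_connection, command_id, method, params=None):
	"""Send one DevTools command and return the decoded reply, or None on timeout."""
	import json
	ws_connection.send(json.dumps({
		'id': command_id,
		'method': method,
		'params': params or {}
	}))
	try:
		return json.loads(ws_connection.recv())
	except Exception:
		return None

# e.g. send_devtools_command(self.devtools_connection, 1, 'Browser.getVersion')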
class OutputStore: """ This class receives data from the browser, processes it, and stores it in the db """ def __init__(self, db_engine, db_name): self.db_engine = db_engine self.db_name = db_name self.utilities = Utilities() self.url_parser = ParseURL() # init def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False): """ this is the primary function of this class, it takes the url of the given page and the request and cookie data generated by the browser data is cleaned up with some minor analysis (eg file types) and stored for later in-depth analysis. there is an option to store first party requests as well as third, turned on by default to save disk space turn off store_1p there is also an option to get file hashes, this introduces serious overhead and is turned off by default """ # open up a sql connection if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALED DB ENGINE FOR %s, QUITTING!' % db_engine) exit() # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( url) # if we can't get page domain info we fail gracefully if origin_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error(url, 'Could not parse TLD for %s' % url) return False origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0] origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1] origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2] origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3] origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld) # figure out the privacy policy url and text, starts null priv_policy_url = None priv_policy_url_text = None # read in our list of privacy link terms from the json file in webxray/resources/policyxray privacy_policy_term_list = self.utilities.get_privacy_policy_term_list( ) # we reverse links return from browser to check footer links first as that is where policy links tend to be all_links = browser_output['all_links'] all_links.reverse() # if we have links search for privacy policy if len(all_links) > 0: # links are tuple for link_text, link_url in all_links: # makes sure we have text, skip links without if link_text: # need lower for string matching link_text = link_text.lower().strip() # not a link we can use if 'javascript' in link_text: continue # see if the link_text is in our term list if link_text in privacy_policy_term_list: # if the link_url is relative this will convert to absolute priv_policy_url = self.utilities.get_absolute_url_from_page_link( url, link_url) priv_policy_url_text = link_text break # if the final page is https (often after a redirect), mark it appropriately if browser_output['final_url'][:5] == 'https': page_is_ssl = True else: page_is_ssl = False if store_source: # handles issue where postgres will crash on inserting null character source = browser_output['source'].replace('\x00', ' ') else: 
source = None # add page page_id = sql_driver.add_page( browser_output['browser_type'], browser_output['browser_version'], browser_output['browser_wait'], browser_output['title'], browser_output['meta_desc'], url, browser_output['final_url'], priv_policy_url, priv_policy_url_text, page_is_ssl, source, browser_output['load_time'], page_domain_id) # store cookies for cookie in browser_output['cookies']: # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party # note: # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http:// cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( 'http://' + cookie['domain']) # something went wrong, log and fail gracefully if cookie_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error( url, 'Error parsing cookie with domain: ' + cookie['domain']) continue # otherwise, everything went fine cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0] cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1] cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2] cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3] cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4] # mark third-party cookies if origin_domain != cookie_domain: is_3p_cookie = True else: is_3p_cookie = False # this is a first party cookie, see if we want to store it if is_3p_cookie is False and store_1p is False: continue # sql_driver.add_domain both stores the new domain and returns its id cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld) # name and domain are required, so if they fail we just continue try: name = cookie['name'] except: continue try: domain = cookie_domain except: continue # these are optional, fill with null values if fail try: secure = cookie['secure'] except: secure = None try: path = cookie['path'] except: path = None try: httponly = cookie['httponly'] except: httponly = None try: expiry = cookie['expiry'] except: expiry = None try: value = cookie['value'] except: value = None # all done with this cookie sql_driver.add_cookie(page_id, name, secure, path, domain, httponly, expiry, value, is_3p_cookie, cookie_domain_id) # process requests now for request in browser_output['processed_requests']: # if the request starts with the following we can't parse anyway, so skip if re.match('^(data|about|chrome|blob).+', request): continue # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( request) # problem with this request, log and fail gracefully if element_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error( url, 'Error parsing element request: ' + request) continue element_ip = element_ip_fqdn_domain_pubsuffix_tld[0] element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1] element_domain = element_ip_fqdn_domain_pubsuffix_tld[2] element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3] element_tld = element_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld) # mark third-party elements based on domain if origin_domain != element_domain: is_3p_element = True else: is_3p_element = False # if we are not storing 1p elements continue if is_3p_element is False and store_1p is 
False: continue if request[:5] == 'https': element_is_ssl = True else: element_is_ssl = False try: received = browser_output['processed_requests'][request][ 'received'] except: received = None # get domain of referer and determine if page leaked by referer try: referer = browser_output['processed_requests'][request][ 'referer'] except: referer = None if referer and len(referer) != 0: referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld( referer) if referer_ip_fqdn_domain_pubsuffix_tld: if referer_ip_fqdn_domain_pubsuffix_tld[ 2] == origin_domain: page_domain_in_referer = True else: page_domain_in_referer = False else: page_domain_in_referer = None sql_driver.log_error( url, 'Error parsing referer header: ' + referer) else: page_domain_in_referer = None try: start_time_offset = browser_output['processed_requests'][ request]['start_time_offset'] except: start_time_offset = None try: load_time = browser_output['processed_requests'][request][ 'load_time'] except: load_time = None try: status = browser_output['processed_requests'][request][ 'status'] except: status = None try: status_text = browser_output['processed_requests'][request][ 'status_text'] except: status_text = None try: content_type = browser_output['processed_requests'][request][ 'content_type'] except: content_type = None try: body_size = browser_output['processed_requests'][request][ 'body_size'] except: body_size = None try: request_headers = str(browser_output['processed_requests'] [request]['request_headers']) except: request_headers = None try: response_headers = str(browser_output['processed_requests'] [request]['response_headers']) except: response_headers = None # consider anything before the "?" to be the element_url try: element_url = re.search('^(.+?)\?.+$', request).group(1) except: element_url = request # consider anything after the "?" 
to be the args try: element_args = re.search('^.+(\?.+)$', request).group(1) # start url args except: element_args = None # attempt to parse off the extension try: element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower() except: element_extension = None # lists of common extensions, can be expanded image_extensions = [ 'png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf' ] script_extensions = ['js', 'javascript'] data_extensions = ['json', 'jsonp', 'xml'] font_extentions = ['woff', 'ttf', 'otf'] static_extentions = ['html', 'htm', 'shtml'] dynamic_extentions = [ 'php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi' ] # figure out what type of element it is if element_extension in image_extensions: element_type = 'image' elif element_extension in script_extensions: element_type = 'javascript' elif element_extension in data_extensions: element_type = 'data_structured' elif element_extension == 'css': element_type = 'style_sheet' elif element_extension in font_extentions: element_type = 'font' elif element_extension in static_extentions: element_type = 'page_static' elif element_extension == dynamic_extentions: element_type = 'page_dynamic' elif element_extension == 'swf' or element_extension == 'fla': element_type = 'Shockwave Flash' else: element_type = None # file hashing has non-trivial overhead and off by default # # what this does is uses the same ua/referer as the actual request # so we are just replaying the last one to get similar response # note that we aren't sending the same cookies so that could be an issue # otherwise it is equivalent to a page refresh in theory # option to hash only 3p elements observed here if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False): replay_element_request = urllib.request.Request( request, headers={ 'User-Agent': browser_output['processed_requests'][request] ['user_agent'], 'Referer': referer, 'Accept': '*/*' }) try: file_md5 = hashlib.md5( urllib.request.urlopen(replay_element_request, timeout=10).read()).hexdigest() except: file_md5 = None else: file_md5 = None # store request sql_driver.add_element( page_id, request, element_url, is_3p_element, element_is_ssl, received, referer, page_domain_in_referer, start_time_offset, load_time, status, status_text, content_type, body_size, request_headers, response_headers, file_md5, element_extension, element_type, element_args, element_domain_id) # close db connection sql_driver.close() return True
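# Illustrative usage sketch (editor's example): OutputStore is driven with the result
# dict returned by a browser driver. The wiring below is hypothetical, including the
# database name; it simply exercises the store() signature defined above, with file
# hashing restricted to third-party elements to limit overhead.
def store_scan_result(url, browser_output, db_name='wbxr_demo'):
	"""Persist one scan using the OutputStore class defined above."""
	output_store = OutputStore('postgres', db_name)
	return output_store.store(
		url,
		browser_output,
		store_source=False,
		store_1p=True,
		get_file_hashes=True,
		hash_3p_only=True
	)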
class OutputStore: """ This class receives data from the browser, processes it, and stores it in the db """ def __init__(self, db_engine, db_name): self.db_engine = db_engine self.db_name = db_name self.utilities = Utilities() self.url_parser = ParseURL() # init def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False): """ this is the primary function of this class, it takes the url of the given page and the request and cookie data generated by the browser data is cleaned up with some minor analysis (eg file types) and stored for later in-depth analysis. there is an option to store first party requests as well as third, turned on by default to save disk space turn off store_1p there is also an option to get file hashes, this introduces serious overhead and is turned off by default """ # open up a sql connection if self.db_engine == 'mysql': from webxray.MySQLDriver import MySQLDriver sql_driver = MySQLDriver(self.db_name) elif self.db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver sql_driver = SQLiteDriver(self.db_name) elif self.db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALED DB ENGINE FOR %s, QUITTING!' % db_engine) exit() # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url) # if we can't get page domain info we fail gracefully if origin_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error(url, 'Could not parse TLD for %s' % url) return False origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0] origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1] origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2] origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3] origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld) # figure out the privacy policy url and text, starts null priv_policy_url = None priv_policy_url_text = None # read in our list of privacy link terms from the json file in webxray/resources/policyxray privacy_policy_term_list = self.utilities.get_privacy_policy_term_list() # we reverse links return from browser to check footer links first as that is where policy links tend to be all_links = browser_output['all_links'] all_links.reverse() # if we have links search for privacy policy if len(all_links) > 0: # links are tuple for link_text,link_url in all_links: # makes sure we have text, skip links without if link_text: # need lower for string matching link_text = link_text.lower().strip() # not a link we can use if 'javascript' in link_text: continue # see if the link_text is in our term list if link_text in privacy_policy_term_list: # if the link_url is relative this will convert to absolute priv_policy_url = self.utilities.get_absolute_url_from_page_link(url,link_url) priv_policy_url_text = link_text break # if the final page is https (often after a redirect), mark it appropriately if browser_output['final_url'][:5] == 'https': page_is_ssl = True else: page_is_ssl = False if store_source: # handles issue where postgres will crash on inserting null character source = browser_output['source'].replace('\x00',' ') else: source 
= None # add page page_id = sql_driver.add_page( browser_output['browser_type'], browser_output['browser_version'], browser_output['browser_wait'], browser_output['title'], browser_output['meta_desc'], url, browser_output['final_url'], priv_policy_url, priv_policy_url_text, page_is_ssl, source, browser_output['load_time'], page_domain_id ) # store cookies for cookie in browser_output['cookies']: # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party # note: # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http:// cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain']) # something went wrong, log and fail gracefully if cookie_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain']) continue # otherwise, everything went fine cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0] cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1] cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2] cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3] cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4] # mark third-party cookies if origin_domain != cookie_domain: is_3p_cookie = True else: is_3p_cookie = False # this is a first party cookie, see if we want to store it if is_3p_cookie is False and store_1p is False: continue # sql_driver.add_domain both stores the new domain and returns its id cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld) # name and domain are required, so if they fail we just continue try: name = cookie['name'] except: continue try: domain = cookie_domain except: continue # these are optional, fill with null values if fail try: secure = cookie['secure'] except: secure = None try: path = cookie['path'] except: path = None try: httponly = cookie['httponly'] except: httponly = None try: expiry = cookie['expiry'] except: expiry = None try: value = cookie['value'] except: value = None # all done with this cookie sql_driver.add_cookie( page_id, name, secure, path, domain, httponly, expiry, value, is_3p_cookie, cookie_domain_id ) # process requests now for request in browser_output['processed_requests']: # if the request starts with the following we can't parse anyway, so skip if re.match('^(data|about|chrome|blob).+', request): continue # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request) # problem with this request, log and fail gracefully if element_ip_fqdn_domain_pubsuffix_tld is None: sql_driver.log_error(url, 'Error parsing element request: '+request) continue element_ip = element_ip_fqdn_domain_pubsuffix_tld[0] element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1] element_domain = element_ip_fqdn_domain_pubsuffix_tld[2] element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3] element_tld = element_ip_fqdn_domain_pubsuffix_tld[4] # sql_driver.add_domain both stores the new domain and returns its db row id element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld) # mark third-party elements based on domain if origin_domain != element_domain: is_3p_element = True else: is_3p_element = False # if we are not storing 1p elements continue if is_3p_element is False and store_1p is False: continue 
if request[:5] == 'https' or request[:3] == 'wss': element_is_ssl = True else: element_is_ssl = False try: received = browser_output['processed_requests'][request]['received'] except: received = None # get domain of referer and determine if page leaked by referer try: referer = browser_output['processed_requests'][request]['referer'] except: referer = None if referer and len(referer) != 0: referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer) if referer_ip_fqdn_domain_pubsuffix_tld: if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain: page_domain_in_referer = True else: page_domain_in_referer = False else: page_domain_in_referer = None sql_driver.log_error(url, 'Error parsing referer header: '+referer) else: page_domain_in_referer = None try: start_time_offset = browser_output['processed_requests'][request]['start_time_offset'] except: start_time_offset = None try: load_time = browser_output['processed_requests'][request]['load_time'] except: load_time = None try: status = browser_output['processed_requests'][request]['status'] except: status = None try: status_text = browser_output['processed_requests'][request]['status_text'] except: status_text = None try: content_type = browser_output['processed_requests'][request]['content_type'] except: content_type = None try: body_size = browser_output['processed_requests'][request]['body_size'] except: body_size = None try: request_headers = str(browser_output['processed_requests'][request]['request_headers']) except: request_headers = None try: response_headers = str(browser_output['processed_requests'][request]['response_headers']) except: response_headers = None # consider anything before the "?" to be the element_url try: element_url = re.search('^(.+?)\?.+$', request).group(1) except: element_url = request # consider anything after the "?" 
to be the args try: element_args = re.search('^.+(\?.+)$', request).group(1) # start url args except: element_args = None # attempt to parse off the extension try: element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower() except: element_extension = None # lists of common extensions, can be expanded image_extensions = ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf'] script_extensions = ['js', 'javascript'] data_extensions = ['json', 'jsonp', 'xml'] font_extentions = ['woff', 'ttf', 'otf'] static_extentions = ['html', 'htm', 'shtml'] dynamic_extentions = ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi'] # figure out what type of element it is if element_extension in image_extensions: element_type = 'image' elif element_extension in script_extensions: element_type = 'javascript' elif element_extension in data_extensions: element_type = 'data_structured' elif element_extension == 'css': element_type = 'style_sheet' elif element_extension in font_extentions: element_type = 'font' elif element_extension in static_extentions: element_type = 'page_static' elif element_extension == dynamic_extentions: element_type = 'page_dynamic' elif element_extension == 'swf' or element_extension == 'fla': element_type = 'Shockwave Flash' else: element_type = None # file hashing has non-trivial overhead and off by default # # what this does is uses the same ua/referer as the actual request # so we are just replaying the last one to get similar response # note that we aren't sending the same cookies so that could be an issue # otherwise it is equivalent to a page refresh in theory # option to hash only 3p elements observed here if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False): replay_element_request = urllib.request.Request( request, headers = { 'User-Agent' : browser_output['processed_requests'][request]['user_agent'], 'Referer' : referer, 'Accept' : '*/*' } ) try: file_md5 = hashlib.md5(urllib.request.urlopen(replay_element_request,timeout=10).read()).hexdigest() except: file_md5 = None else: file_md5 = None # final tasks is to truncate the request if it is # over 2k characters as it is likely # binary data and may cause problems inserting # into TEXT fields in database # # TODO: # better handle binary data in general if len(request) >= 2000: request = request[:2000] if len(element_url) >= 2000: element_url = element_url[:2000] # store request sql_driver.add_element( page_id, request, element_url, is_3p_element, element_is_ssl, received, referer, page_domain_in_referer, start_time_offset, load_time, status, status_text, content_type, body_size, request_headers, response_headers, file_md5, element_extension, element_type, element_args, element_domain_id ) # close db connection sql_driver.close() return True
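# Illustrative sketch (editor's example): the truncation step above caps very long
# request strings before they are inserted into TEXT columns. The helper below
# isolates that guard; the 2000-character limit mirrors the value used above.
def truncate_for_db(value, max_len=2000):
	"""Trim a string to max_len characters so it fits the db TEXT fields safely."""
	if value is not None and len(value) >= max_len:
		return value[:max_len]
	return value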
class SingleScan:
	"""
	Loads and analyzes a single page, prints output to the CLI.
	Very simple and does not require a db to be configured.
	"""

	def __init__(self):
		self.url_parser = ParseURL()
		self.domain_owners = {}
		self.id_to_owner = {}
		self.id_to_parent = {}

		# set up the domain ownership dictionary
		for item in json.load(open(os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json', 'r', encoding='utf-8')):
			if item['id'] == '-': continue
			self.id_to_owner[item['id']] = item['name']
			self.id_to_parent[item['id']] = item['parent_id']
			for domain in item['domains']:
				self.domain_owners[domain] = item['id']
	# end init

	def get_lineage(self, id):
		"""
		Find the upward chain of ownership for a given domain.
		"""
		if self.id_to_parent[id] is None:
			return [id]
		else:
			return [id] + self.get_lineage(self.id_to_parent[id])
	# end get_lineage

	def execute(self, url, config):
		"""
		Main function, loads page and analyzes results.
		"""
		print('\t~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
		print('\tSingle Site Test On: %s' % url)
		print('\t - Browser type is %s' % config['client_browser_type'])
		print('\t - Browser max wait time is %s seconds' % config['client_max_wait'])

		# make sure it is an http(s) address
		if not re.match('^https?://', url):
			print('\tNot a valid url, aborting')
			return None

		# import and set up specified browser driver
		if config['client_browser_type'] == 'chrome':
			from webxray.ChromeDriver import ChromeDriver
			browser_driver = ChromeDriver(config)
		else:
			print('INVALID BROWSER TYPE FOR %s, QUITTING!' % config['client_browser_type'])
			exit()

		# attempt to get the page
		browser_output = browser_driver.get_scan(url)

		# if there was a problem we print the error
		if browser_output['success'] == False:
			print('\t\t%-50s Browser Error: %s' % (url[:50], browser_output['result']))
			return
		else:
			browser_output = browser_output['result']

		# get the ip, fqdn, domain, pubsuffix, and tld from the URL
		# we need the domain to figure out if cookies/elements are third-party
		origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

		# if we can't get page domain info we bail out
		if origin_ip_fqdn_domain_pubsuffix_tld is None:
			print('could not parse origin domain')
			return None

		origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
		origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
		origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
		origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
		origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

		print('\n\t------------------{ URL }------------------')
		print('\t %s' % url)
		print('\n\t------------------{ Final URL }------------------')
		print('\t %s' % browser_output['final_url'])
		print('\n\t------------------{ Title }------------------')
		print('\t %s' % browser_output['title'])
		print('\n\t------------------{ Description }------------------')
		print('\t %s' % browser_output['meta_desc'])
		print('\n\t------------------{ Domain }------------------')
		print('\t %s' % origin_domain)
		print('\n\t------------------{ Seconds to Complete Download }------------------')
		print('\t%s' % (browser_output['load_time']))
		print('\n\t------------------{ Cookies }------------------')

		# put relevant fields from cookies into list we can sort
		cookie_list = []
		for cookie in browser_output['cookies']:
			cookie_list.append(cookie['domain']+' -> '+cookie['name']+' -> '+cookie['value'])

		cookie_list.sort()
		for count, cookie in enumerate(cookie_list):
			print(f'\t[{count}] {cookie}')

		print('\n\t------------------{ Local Storage }------------------')
		for item in browser_output['dom_storage']:
			print('\t%s (is local: %s): %s' % (item['security_origin'], item['is_local_storage'], item['key']))

		print('\n\t------------------{ Domains Requested }------------------')
		request_domains = set()

		for request in browser_output['requests']:
			# if the request starts with 'data'/etc we can't parse tld anyway, so skip
			if re.match('^(data|about|chrome).+', request['url']):
				continue

			# parse domain info from the request url
			domain_info = self.url_parser.get_parsed_domain_info(request['url'])
			if domain_info['success'] == False:
				print('\tUnable to parse domain info for %s with error %s' % (request['url'], domain_info['result']))
				continue

			# if origin_domain != domain_info['result']['domain']:
			request_domains.add(domain_info['result']['domain'])

		count = 0
		for domain in sorted(request_domains):
			count += 1
			if domain in self.domain_owners:
				lineage = ''
				for item in self.get_lineage(self.domain_owners[domain]):
					lineage += self.id_to_owner[item]+' > '
				print('\t%s) %s [%s]' % (count, domain, lineage[:-3]))
			else:
				print('\t%s) %s [Unknown Owner]' % (count, domain))
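# Illustrative usage sketch (editor's example): a single-page scan needs only a config
# dict such as the one returned by Utilities.get_default_config() further below; the
# URL here is hypothetical.
if __name__ == '__main__':
	config = Utilities().get_default_config('haystack')
	single_scan = SingleScan()
	single_scan.execute('https://example.com', config)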
def __init__(self, db_engine, db_name):
	self.db_engine = db_engine
	self.db_name = db_name
	self.url_parser = ParseURL()
class Utilities: def __init__(self, db_name=None, db_engine=None): # if we have db params set up global db connection, otherwise we don't bother if db_name: if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver(db_name) elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver(db_name) else: print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() elif db_engine: if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver() elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver() else: print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() self.url_parser = ParseURL() # __init__ def check_dependencies(self): import sys if sys.version_info[0] < 3 or sys.version_info[1] < 4: print( '******************************************************************************' ) print( ' Python 3.4 or above is required for webXray; please check your installation. ' ) print( '******************************************************************************' ) quit() try: from websocket import create_connection except: print('*******************************************************') print(' The websocket-client library is needed for webXray. ') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() try: from textstat.textstat import textstat except: print('*******************************************************') print(' The textstat library is needed for webXray. ') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() try: import lxml.html except: print('*******************************************************') print(' The lxml library is needed for webXray. 
') print(' Please try running "pip3 install -r requirements.txt" ') print('*******************************************************') quit() # check_dependencies def get_default_config(self, config_type): # the following are two pre-configured options for # haystack and forensic scans, can be tweaked as desired if config_type == 'haystack': return { 'client_browser_type': 'chrome', 'client_prewait': 10, 'client_no_event_wait': 20, 'client_max_wait': 60, 'client_get_bodies': False, 'client_get_bodies_b64': False, 'client_get_screen_shot': False, 'client_get_text': False, 'client_crawl_depth': 3, 'client_crawl_retries': 5, 'client_page_load_strategy': 'none', 'client_reject_redirects': False, 'client_min_internal_links': 5, 'max_attempts': 5, 'store_1p': True, 'store_base64': False, 'store_files': True, 'store_screen_shot': False, 'store_source': False, 'store_page_text': False, 'store_links': True, 'store_dom_storage': True, 'store_responses': True, 'store_request_xtra_headers': True, 'store_response_xtra_headers': True, 'store_requests': True, 'store_websockets': True, 'store_websocket_events': True, 'store_event_source_msgs': True, 'store_cookies': True, 'store_security_details': True, 'timeseries_enabled': True, 'timeseries_interval': 0 } elif config_type == 'forensic': return { 'client_browser_type': 'chrome', 'client_prewait': 10, 'client_no_event_wait': 20, 'client_max_wait': 60, 'client_get_bodies': True, 'client_get_bodies_b64': True, 'client_get_screen_shot': True, 'client_get_text': True, 'client_crawl_depth': 3, 'client_crawl_retries': 5, 'client_page_load_strategy': 'none', 'client_reject_redirects': True, 'client_min_internal_links': 5, 'max_attempts': 5, 'store_1p': True, 'store_base64': True, 'store_files': True, 'store_screen_shot': True, 'store_source': True, 'store_page_text': True, 'store_links': True, 'store_dom_storage': True, 'store_responses': True, 'store_request_xtra_headers': True, 'store_response_xtra_headers': True, 'store_requests': True, 'store_websockets': True, 'store_websocket_events': True, 'store_event_source_msgs': True, 'store_cookies': True, 'store_security_details': True, 'timeseries_enabled': True, 'timeseries_interval': 0 } elif config_type == 'custom': print('Create a custom config in Utilities.py') quit() else: print('Invalid config option, see Utilities.py') quit() # get_default_config def select_wbxr_db(self): """ databases are stored with a prefix (default 'wbxr_'), this function helps select a database in interactive mode """ # you can optionally specify a different prefix here by setting "db_prefix = '[PREFIX]'" wbxr_dbs = self.sql_driver.get_wbxr_dbs_list() wbxr_dbs.sort() if len(wbxr_dbs) == 0: print( '''\t\tThere are no databases to analyze, please try [C]ollecting data or import an existing wbxr-formatted database manually.''') interaction() return for index, db_name in enumerate(wbxr_dbs): print('\t\t[%s] %s' % (index, db_name)) max_index = len(wbxr_dbs) - 1 # interaction step: loop until we get acceptable input while True: selected_db_index = input("\n\tPlease select database by number: ") if selected_db_index.isdigit(): selected_db_index = int(selected_db_index) if selected_db_index >= 0 and selected_db_index <= max_index: break else: print( '\t\t You entered an invalid string, please select a number in the range 0-%s.' % max_index) continue else: print( '\t\t You entered an invalid string, please select a number in the range 0-%s.' 
% max_index) continue db_name = wbxr_dbs[selected_db_index] return db_name # select_wbxr_db def stream_rate(self, type='scan', return_json=False, client_id=None): """ This function is a generator which determines the rate at which pages are being add to the db allowing us to evaluate our rate of progress. """ # initialize dictionary to store rate data client_rate_data = {} # this diction will hold all the rates for each client so we can # easily figure out the average rate all_rates = {} # None store the aggregate data for all clients client_rate_data[None] = {} all_rates[None] = [] # add entries for each client for client_id, in self.sql_driver.get_client_list(): client_rate_data[client_id] = {} all_rates[client_id] = [] # for client_id in ['wbxr0','wbxr1','wbxr2','wbxr3','wbxr4','wbxr5']: # client_rate_data[client_id] = {} # all_rates[client_id] = [] crawl_depth = self.sql_driver.get_config()['client_crawl_depth'] # set time window we want to look at to see how many # pages have been recently added # set the time gap between updates, leaving it too short # means lots of db calls if type == 'scan' or type == 'policy': wait_seconds = 10 interval_seconds = 600 elif type == 'task': wait_seconds = 30 interval_seconds = 30 # keep track of how long we've been doing this elapsed_seconds = 0 # for tasks if type == 'task': old_task_count = self.sql_driver.get_pending_task_count() # this runs forever, no terminating condition while True: # simple increment, note we we /60 before we return # for minutes conversion elapsed_seconds += wait_seconds remaining_tasks = self.sql_driver.get_task_queue_length() total_count = 0 for client_id, count in self.sql_driver.get_recent_page_count_by_client_id( interval_seconds): total_count += count # to get rate/hour we take the number of pages we've added per # second *3600 current_rate = (count / interval_seconds) * 3600 # this list is all the rates we've seen all_rates[client_id] = all_rates[client_id] + [current_rate] # nice built-in to get the average rate average_rate = statistics.mean(all_rates[client_id]) # figure out how much longer to go, gracefully handle # a rate of zero if average_rate != 0: remaining_hours = remaining_tasks / average_rate else: remaining_hours = 0 # dictionary of the data to return client_rate_data[client_id] = { 'elapsed_minutes': round(elapsed_seconds / 60, 2), 'current_rate': round(current_rate, 2), 'average_rate': round(average_rate, 2), 'remaining_tasks': remaining_tasks, 'remaining_hours': round(remaining_hours, 2) * crawl_depth } # for overall measure total_current_rate = (total_count / interval_seconds) * 3600 all_rates[None] += [total_current_rate] total_average_rate = statistics.mean(all_rates[None]) # figure out how much longer to go, gracefully handle # a rate of zero if total_average_rate != 0: remaining_hours = round( (remaining_tasks / total_average_rate) * crawl_depth, 2) else: remaining_hours = 0 # round down for days if remaining_hours > 24: remaining_time = f'{round(remaining_hours/24,2)} days' else: remaining_time = f'{remaining_hours} hours' client_rate_data[None] = { 'elapsed_minutes': round(elapsed_seconds / 60, 2), 'current_rate': round(total_current_rate, 2), 'average_rate': round(total_average_rate, 2), 'remaining_tasks': remaining_tasks, 'remaining_hours': remaining_time } # if we are called by the flask admin_console it is # easiest to do json formatting here, otherwise # we don't. 
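# Illustrative note (not part of webXray): when return_json is True the generator
# below yields server-sent-event chunks, so a Flask admin console could,
# hypothetically, expose it as
#   return Response(utilities.stream_rate(return_json=True), mimetype='text/event-stream')
# and read it in the browser with an EventSource; the 'utilities' name here is a
# placeholder for however the caller holds this class.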
if return_json: yield f"data:{json.dumps(client_rate_data)}\n\n" else: yield client_rate_data # wait until we send a new update time.sleep(wait_seconds) # stream_rate def setup_report_dir(self, db_name): """ Create directory for where the reports go if it does not exist, returns the path. """ if os.path.exists('./reports') == False: print('\t\tMaking global reports directory at ./reports.') os.makedirs('./reports') # set global report_path report_path = './reports/' + db_name # set up subdir for this analysis if os.path.exists(report_path) == False: print('\t\tMaking subdirectory for reports at %s' % report_path) os.makedirs(report_path) print('\t\tStoring output in %s' % report_path) return report_path # setup_report_dir def write_csv(self, report_path, file_name, csv_rows, num_decimals=2): """ basic utility function to write list of csv rows to a file """ full_file_path = report_path + '/' + file_name with open(full_file_path, 'w', newline='', encoding='utf-8') as csvfile: csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL) for row in csv_rows: rounded_row = [] for item in row: # round floats and decimals if isinstance(item, float) or isinstance( item, decimal.Decimal): rounded_row.append(round(item, num_decimals)) else: rounded_row.append(item) csv_writer.writerow(rounded_row) print('\t\tOutput written to %s' % full_file_path) # write_csv def print_runtime(self, action_name, start_time): """ Just for CLI info """ print('-' * 40) print('\t%s finished in %s' % (action_name, str(datetime.now() - start_time))) print('-' * 40) # print_runtime def get_absolute_url_from_page_link(self, page_url, link_url): """ Given a page_url and a link_url from that page we determine the absolute url of the link from the page_url. """ # ex nihilo nihil fit if link_url == None: return None if len(link_url) == 0: return None # we use the info from the original url for converting # relative links to absolute parsed_page_url = urlparse(page_url) # this is an absolute url already, nothing further to do to if re.match('^https?://', link_url): return (link_url) # link with no scheme, paste it in elif re.match('^//', link_url): return (parsed_page_url.scheme + ':' + link_url) # relative link, fix it up else: if link_url[0] != '/': return (parsed_page_url.scheme + '://' + parsed_page_url.netloc + '/' + link_url) else: return (parsed_page_url.scheme + '://' + parsed_page_url.netloc + link_url) # this only happens if something breaks return None # get_absolute_url_from_link def get_most_common_sorted(self, list_in): """ takes a list, finds the most common items and then resorts alpha (b/c python's Counter will arbitrarily order items with same count), then sorts again for most-common assumes list_in contains alphanumeric tuples """ most_common_sorted = collections.Counter(list_in).most_common() most_common_sorted.sort() most_common_sorted.sort(reverse=True, key=lambda item: item[1]) return most_common_sorted # get_most_common_sorted ######################### # POLICY EXTRACTION # ######################### def get_policy_link_terms(self): """ Returns a list of terms used to indicate a link may be a policy, note languages are all mixed together. 
""" policy_link_terms = [] # go through json file and merge terms together for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): for term in lang_term_set['policy_link_terms']: policy_link_terms.append(term) return policy_link_terms # get_policy_link_terms def get_policy_verification_terms(self): """ Returns a dictionary of terms used to verify several types of policies, note languages are all mixed together. """ policy_verification_terms = {} policy_verification_terms['privacy_policy'] = [] policy_verification_terms['terms_of_service'] = [] policy_verification_terms['cookie_policy'] = [] policy_verification_terms['ad_choices'] = [] policy_verification_terms['gdpr_statement'] = [] policy_verification_terms['ccpa_statement'] = [] # go through json file and merge terms together for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): for term in lang_term_set['privacy_policy_verification_terms']: policy_verification_terms[ 'privacy_policy'] = policy_verification_terms[ 'privacy_policy'] + [term] for term in lang_term_set['terms_of_service_verification_terms']: policy_verification_terms[ 'terms_of_service'] = policy_verification_terms[ 'terms_of_service'] + [term] for term in lang_term_set['cookie_policy_verification_terms']: policy_verification_terms[ 'cookie_policy'] = policy_verification_terms[ 'cookie_policy'] + [term] for term in lang_term_set['ad_choices_verification_terms']: policy_verification_terms[ 'ad_choices'] = policy_verification_terms['ad_choices'] + [ term ] for term in lang_term_set['gdpr_statement_verification_terms']: policy_verification_terms[ 'gdpr_statement'] = policy_verification_terms[ 'gdpr_statement'] + [term] for term in lang_term_set['ccpa_statement_verification_terms']: policy_verification_terms[ 'ccpa_statement'] = policy_verification_terms[ 'ccpa_statement'] + [term] return policy_verification_terms # get_policy_verification_terms def get_lang_to_privacy_policy_term_dict(self): """ Returns a dict of privacy policy terms keyed by language code. 
""" lang_to_terms = {} for lang_term_set in json.load( open(os.path.dirname(os.path.abspath(__file__)) + '/resources/policyxray/policy_terms.json', 'r', encoding='utf-8')): lang_to_terms[ lang_term_set['lang']] = lang_term_set['policy_terms'] return lang_to_terms # get_lang_to_priv_term_dict ######################### # DOMAIN OWNERSHIP # ######################### def get_domain_owner_dict(self): """ read out everything in the domain_owner table into a dictionary so we can easily use it as a global lookup table this is purposefully independent of self.patch_domain_owners and does not assume the above has been run, however will return and empty dictionary if the db has not been patched yet reasons for above is that if user does not wish to update with the current json file historical data will remain consistent """ # domain_owners is both returned as well as made available to other class functions self.domain_owners = {} domain_owner_raw_data = self.sql_driver.get_all_domain_owner_data() if domain_owner_raw_data: for item in domain_owner_raw_data: # add everything to the dict self.domain_owners[item[0]] = { 'parent_id': item[1], 'owner_name': item[2], 'aliases': json.loads(item[3]), 'homepage_url': item[4], 'site_privacy_policy_urls': json.loads(item[5]), 'service_privacy_policy_urls': json.loads(item[6]), 'gdpr_statement_urls': json.loads(item[7]), 'terms_of_use_urls': json.loads(item[8]), 'platforms': json.loads(item[9]), 'uses': json.loads(item[10]), 'notes': item[11], 'country': item[12] } return self.domain_owners # get_domain_owner_dict def get_domain_owner_lineage_ids(self, id): """ for a given domain owner id, return the list which corresponds to its ownership lineage """ if self.domain_owners[id]['parent_id'] == None: return [id] else: return [id] + self.get_domain_owner_lineage_ids( self.domain_owners[id]['parent_id']) # get_domain_owner_lineage_ids def get_domain_owner_lineage_strings(self, owner_id, get_aliases=False): """ given an owner_id this function returns a list which is the full lineage of ownership optionally will also return aliases (e.g. 'Doubleclick' and 'Double Click') """ lineage_strings = [] for owner_id in self.get_domain_owner_lineage_ids(owner_id): lineage_strings.append( (owner_id, self.domain_owners[owner_id]['owner_name'])) if get_aliases: for alias in self.domain_owners[owner_id]['aliases']: lineage_strings.append((owner_id, alias)) return lineage_strings # get_domain_owner_lineage_strings def get_domain_owner_lineage_combined_string(self, owner_id): """ given an owner_id this function returns a single string which is the full lineage of ownership """ lineage_string = '' for item in self.get_domain_owner_lineage_strings(owner_id): lineage_string += item[1] + ' > ' return lineage_string[:-3] # get_domain_owner_lineage_combined_string def get_domain_owner_child_ids(self, id): """ for a given owner id, get all of its children/subsidiaries """ # first get all the children ids if they exist child_ids = [] for item in self.domain_owners: if self.domain_owners[item]['parent_id'] == id: child_ids.append(item) # if we have children, call recursively if len(child_ids) > 0: for child_id in child_ids: child_ids.extend(self.get_domain_owner_child_ids(child_id)) # return an empty list if no children return child_ids # get_domain_owner_child_ids def is_url_valid(self, url): """ Performs checks to verify if the url can actually be scanned. 
""" # only do http links if not (re.match('^https?://.+', url)): return False # if we can't get the url_path it is invalid try: url_path = urlsplit(url.strip().lower()).path except: return False # if we can't do idna conversion it is invalid try: idna_fixed_netloc = urlsplit( url.strip()).netloc.encode('idna').decode('utf-8') except: return False # these are common file types we want to avoid illegal_extensions = [ 'apk', 'dmg', 'doc', 'docx', 'exe', 'ics', 'iso', 'pdf', 'ppt', 'pptx', 'rtf', 'txt', 'xls', 'xlsx' ] # if we can't parse the extension it doesn't exist and is # therefore ok by our standards try: url_extension = re.search('\.([0-9A-Za-z]+)$', url_path).group(1) if url_extension in illegal_extensions: return False except: return True # it's good return True # is_url_valid def idna_encode_url(self, url, no_fragment=False): """ Non-ascii domains will crash some browsers, so we need to convert them to idna/ascii/utf-8. This requires splitting apart the url, converting the domain to idna, and pasting it all back together """ split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') if no_fragment: return urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, '')) else: return urlunsplit( (split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment)) # idna_encode_url def is_url_internal(self, origin_url, target_url): """ Given two urls (origin, target) determines if the target is internal to the origin based on subsuffix+1 domain. """ origin_domain = self.url_parser.get_parsed_domain_info(origin_url) target_domain = self.url_parser.get_parsed_domain_info(target_url) # we return None to signify we couldn't parse the urls if not origin_domain['success'] or not target_domain['success']: return None else: origin_domain = origin_domain['result']['domain'] target_domain = target_domain['result']['domain'] if origin_domain != target_domain: return False else: return True
class ChromeDriver: def __init__(self, config, port_offset=1, chrome_path=None, headless=True): self.debug = False # unpack config if self.debug: print(config) self.prewait = config['client_prewait'] self.no_event_wait = config['client_no_event_wait'] self.max_wait = config['client_max_wait'] self.return_page_text = config['client_get_text'] self.return_bodies = config['client_get_bodies'] self.return_bodies_base64 = config['client_get_bodies_b64'] self.return_screen_shot = config['client_get_screen_shot'] self.reject_redirects = config['client_reject_redirects'] self.crawl_depth = config['client_crawl_depth'] self.crawl_retries = config['client_crawl_retries'] self.page_load_strategy = config['client_page_load_strategy'] self.min_internal_links = config['client_min_internal_links'] self.headless = headless # custom library in /webxray self.url_parser = ParseURL() # prevents get_scan from closing browser # when we are doing a crawl self.is_crawl = False # gets overwritten once, so we don't have to keep # figuring it out when doing crawls self.browser_type = None self.browser_version = None self.user_agent = None # we can override the path here if chrome_path: chrome_cmd = chrome_path else: # if path is not specified we use the common # paths for each os if platform.system() == 'Darwin': chrome_cmd = '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome ' elif platform.system() == 'Linux': chrome_cmd = '/usr/bin/google-chrome ' elif platform.system() == 'Windows': chrome_cmd = 'start chrome ' else: print('Unable to determine Operating System and therefore cannot guess correct Chrome path, see ChromeDriver.py for details.') exit() # use port offset to avoid collisions between processes port = 9222+port_offset # each process uses its own debugging port, otherwise we use the default 9222 chrome_cmd += '--remote-debugging-port=%s' % port # sets up blank profile chrome_cmd += ' --guest' # not sure this really does anything chrome_cmd += ' --disable-gpu' # disable sandbox to work inside docker chrome_cmd += ' --no-sandbox' # set up headless if self.headless: chrome_cmd += ' --headless' # if we're in production send the subprocess output to /dev/null, None is normal if not self.debug: devnull = open(os.devnull, 'w') else: devnull = None # run command as a subprocess if self.debug: print(f'going to run command: "{chrome_cmd}"') subprocess.Popen(chrome_cmd,shell=True,stdin=None,stdout=devnull,stderr=devnull,close_fds=True) # allow browser to launch time.sleep(5) # the debugger address has a 'json' path where we can find the websocket # address which is how we send devtools commands, thus we extract the value # "webSocketDebuggerUrl" from the first json object try: debuggerAddress_json = json.loads(urllib.request.urlopen('http://localhost:%s/json' % port).read().decode()) if self.debug: print(debuggerAddress_json) webSocketDebuggerUrl = debuggerAddress_json[0]['webSocketDebuggerUrl'] self.launched = True except Exception as e: self.launched = False return # third, once we have the websocket address we open a connection # and we are (finally) able to communicate with chrome via devtools! # note this connection must be closed!
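# For reference: every command sent over this websocket is a JSON string of the form
#   {"id":<int>,"method":"<Domain.command>","params":{...}}
# and Chrome replies either with {"id":<same int>,"result":{...}} for a command we
# issued, or with {"method":"<Domain.event>","params":{...}} for unsolicited events
# such as Network.requestWillBeSent; the processing loops further down key off
# exactly these 'id', 'result', 'method' and 'params' fields.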
self.devtools_connection = create_connection(webSocketDebuggerUrl) # important, makes sure we don't get stuck # waiting for messages to arrive self.devtools_connection.settimeout(3) # this is incremented globally self.current_ws_command_id = 0 # prevent downloading files, the /dev/null is redundant if self.debug: print('going to disable downloading') response = self.get_single_ws_response('Page.setDownloadBehavior','"behavior":"deny","downloadPath":"/dev/null"') if response['success'] == False: self.exit() return response else: response = response['result'] if self.debug: print(f'{response}') # done return # __init__ def get_single_ws_response(self,method,params=''): """ Attempt to send ws_command and return response, note this only works if you don't have the queue being flooded with network events, handles crashes gracefully. """ self.current_ws_command_id += 1 try: self.devtools_connection.send('{"id":%s,"method":"%s","params":{%s}}' % (self.current_ws_command_id,method,params)) return ({ 'success' : True, 'result' : json.loads(self.devtools_connection.recv()) }) except: return ({ 'success' : False, 'result' : 'Crashed on get_single_ws_response.' }) # get_single_ws_response def send_ws_command(self,method,params='',override_id=None): """ Attempt to send ws_command, handle crashes gracefully. """ self.current_ws_command_id += 1 try: self.devtools_connection.send('{"id":%s,"method":"%s","params":{%s}}' % (self.current_ws_command_id,method,params)) return ({ 'success' : True, 'result' : self.current_ws_command_id }) except: return ({ 'success' : False, 'result' : 'Crashed on send_ws_command.' }) # send_ws_command def get_next_ws_response(self): """ Either get the next ws response or send None on timeout or crash. """ try: return json.loads(self.devtools_connection.recv()) except: return None # get_next_ws_response def exit(self): """ Tidy things up before exiting. """ if self.launched: self.send_ws_command('Browser.close') self.devtools_connection.close() # exit def get_crawl(self, url_list): """ Performs multiple page loads using the same profile, which allows cookies to be transferred across loads and potentially allow for more tracking. """ # setting this globally prevents the browser # from being closed after get_scan self.is_crawl = True # we return a list which is all the get_scan # results we find results = [] # do each url for url in url_list: result = self.get_scan(url) if result['success']: results.append(result['result']) else: error = result['result'] self.exit() return ({ 'success': False, 'result': error }) # now it is ok to close the browser/ws connection self.exit() # done! return ({ 'success': True, 'result': results }) # get_crawl def get_random_crawl(self, seed_url): """ Based on an intial seed page conducts a first scan to get traffic and links, then loads additional pages on the same site based on links. Note the cookies from each page load carry over, thus we do not allow any domain-level redirects on page loads as this would skew our ability to categorize cookies as first or third-party. 
""" # setting this globally prevents the browser # from being closed after get_scan self.is_crawl = True # we return a list which is all the get_scan # results we find results = [] # removing trailing / seed_url = re.sub('/$', '',seed_url) if self.debug: print(f'going to scan seed_url {seed_url}') result = self.get_scan(seed_url) if not result['success']: self.exit() return ({ 'success': False, 'result': result["result"] }) else: origin_url = result['result']['final_url'] scanned_urls = [seed_url] results.append(result['result']) if self.debug: print(f'origin url is {origin_url}') # holds urls we may scan unique_urls = set() # look at links from the seed page, we will quit # either when we exceed self.crawl_depth or run out of links for link in result['result']['all_links']: # only do internal links if not link['internal']: continue # (re)encode the url url = self.idna_encode_url(link['href'], no_fragment=True) # idna_encode failure yields a None value, skip if not url: continue # removing trailing / url = re.sub('/$', '',url) # make sure it is a real web page if not self.is_url_valid(url): continue # we already scanned this if url == seed_url or url == origin_url: continue # yay, it's usable unique_urls.add(url) # no need to do any scans if we can't find urls if len(unique_urls) < self.crawl_depth: self.exit() return ({ 'success' : False, 'result' : 'did not find enough internal links' }) # we allow a certain number of failed page loads, but eventually # we must give up failed_urls = [] # keep scanning pages until we've done enough for url in unique_urls: # if we have enough results break if len(scanned_urls) == self.crawl_depth: break # give up! if len(failed_urls) > self.crawl_retries: self.exit() return ({ 'success' : False, 'result' : 'reached fail limit' }) # do the scan result = self.get_scan(url) # either keep result or keep track of failures if result['success']: # reject redirects based on origin_url is_redirect = self.is_url_internal(origin_url,result['result']['final_url']) if is_redirect == None or is_redirect == False: if self.debug: print(f"caught redirect from {url} to {result['result']['final_url']}") failed_urls.append(url) else: results.append(result['result']) scanned_urls.append(url) else: if self.debug: print(f"fail on {result['result']}") failed_urls.append(url) if self.debug: print('crawled urls:') for res in results: print(res['start_url'],res['final_url']) # now it is ok to close the browser/ws connection self.exit() # done! num_results = len(results) if num_results < self.crawl_depth: return ({ 'success': False, 'result': 'unable to crawl specified number of pages' }) else: return ({ 'success': True, 'result': results }) # get_random_crawl def get_scan(self, url, get_text_only=False): """ The primary function for this class, performs a number of tasks based on the config including, but not limited to: - capture network traffic - capture response bodies - capture screen shots - capture page text using readability Note that if get_text_only is true we only do basic tasks such as getting the policy, and we return far less content which is useful for doing text capture. """ # let the games begin if self.debug: print('starting %s' % url) # we can't start Chrome, return error message as result if not self.launched: return ({ 'success': False, 'result': 'Unable to launch Chrome instance, check that Chrome is installed in the expected location, see ChromeDriver.py for details.' }) # Network events are stored as lists of dictionaries which are # returned. 
requests = [] request_extra_headers = [] responses = [] response_extra_headers = [] websockets = [] websocket_events = [] event_source_msgs = [] load_finish_events = [] # Response bodies are keyed to the request_id when they are # returned to the calling function, and we get the response bodies # by issuing websocket commands so we first keep track # of which command is linked to which ws_id. Note this data is # for internal processes and not returned ws_id_to_req_id = {} # When we get the websocket response we store the body keyed # to the request id, this is returned response_bodies = {} # We keep dom_storage here, the dict key is a tuple of the securityOrigin, # isLocalStorage, and the domstorage key. This way we keep only the final # values in case they are overwritten. Note this data is # for internal processes and not returned dom_storage_holder = {} # Before we return the result we store the unique domstorage items to a # list of dicts dom_storage = [] # We merge the following types of websocket events websocket_event_types = [ 'Network.webSocketFrameError', 'Network.webSocketFrameReceived', 'Network.webSocketFrameSent', 'Network.webSocketWillSendHandshakeRequest', 'Network.webSocketHandshakeResponseReceived', 'Network.webSocketClosed' ] # The timestamps provided by Chrome DevTools are "Monotonically increasing time # in seconds since an arbitrary point in the past." What this means is they are # essentially offsets (deltas) and not real timestamps. However, the Network.requestWillBeSent event # also has a "wallTime" which is a UNIX timestamp. So what we do below is set the # origin_walltime to be the earliest wallTime we've seen, as this allows us to later # use the "timestamps" to determine the real-world time when an event happened. origin_walltime = None first_timestamp = None # keeps track of which ws_id belongs to which type of command, we # remove entries when we get a response pending_ws_id_to_cmd = {} # get browser version and user agent if self.debug: print('going to get browser version') response = self.get_single_ws_response('Browser.getVersion') if response['success'] == False: self.exit() return response elif 'result' not in response['result']: self.exit() return ({ 'success': False, 'result': 'No result for ws command' }) else: response = response['result'] if self.debug: print(f'ws response: {response}') if not self.browser_type: self.browser_type = re.match('^(.+)?/(.+)$',response['result']['product'])[1] self.browser_version = re.match('^(.+)?/(.+)$',response['result']['product'])[2] self.user_agent = response['result']['userAgent'] # remove 'Headless' from the user_agent if self.headless: response = self.get_single_ws_response('Network.setUserAgentOverride','"userAgent":"%s"' % self.user_agent.replace('Headless','')) if response['success'] == False: self.exit() return response elif 'result' not in response['result']: self.exit() return ({ 'success': False, 'result': 'No result for ws command' }) else: response = response['result'] if self.debug: print(f'ws response: {response}') # enable network and domstorage when doing a network_log if not get_text_only: if self.debug: print('going to enable network logging') response = self.get_single_ws_response('Network.enable') if response['success'] == False: self.exit() return response elif 'result' not in response['result']: self.exit() return ({ 'success': False, 'result': 'No result for ws command' }) else: response = response['result'] if self.debug: print(f'ws response: {response}') if self.debug: print('going to enable domstorage logging') response = self.get_single_ws_response('DOMStorage.enable') if response['success'] == False: self.exit() return response else: response = response['result'] if self.debug: print(f'ws response: {response}') if self.debug: print('going to disable cache') response = self.get_single_ws_response('Network.setCacheDisabled','"cacheDisabled":true') if response['success'] == False: self.exit() return response else: response = response['result'] if self.debug: print(f'ws response: {response}') # start the page load process, fail gracefully; page loads were previously done # with selenium (which provided a timeout) but we now navigate via devtools if self.debug: print(f'going to load {url}') # try: # self.driver.get(url) # except Exception as e: # # close browser/websocket # self.exit() # # return the error # return ({ # 'success': False, # 'result': str(e) # }) response = self.get_single_ws_response('Page.navigate','"url":"%s"' % url) if response['success'] == False: self.exit() return response else: response = response['result'] if self.debug: print(f'ws response: {response}') # this is the main loop where we get network log data if not get_text_only: ############################# # DEVTOOLS NETWORK LOG DATA # ############################# if self.debug: print('##############################') if self.debug: print(' Going to process Network Log ') if self.debug: print('##############################') # Keep track of how long we've been reading ws data response_loop_start = datetime.datetime.now() # Keep track of when we last saw a Network event time_since_last_response = datetime.datetime.now() # Length of time since we last saw a Network event elapsed_no_event = 0 # Keep track of what second we are on so we know # when to scroll, is incremented whenever the second # changes (eg 1.99 -> 2.10 = 1 -> 2) last_second = 0 # We keep collecting devtools_responses in this loop until either we haven't seen # network activity for the no_event_wait value or we exceed the max_wait # time.
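# Worked example of the exit conditions below, using the default values from
# Utilities.get_default_config() (client_prewait=10, client_no_event_wait=20,
# client_max_wait=60): the loop always runs for at least 10 seconds, breaks early
# once 20 seconds pass without a Network event after the prewait, and otherwise
# breaks shortly after 60 seconds have elapsed in total.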
while True: # update how long we've been going loop_elapsed = (datetime.datetime.now()-response_loop_start).total_seconds() # perform two scrolls once a second if int(loop_elapsed) > last_second: last_second = int(loop_elapsed) for i in range(0,10): if self.debug: print(f'{last_second} : performing scroll #{i}') self.do_scroll() self.do_scroll() # see if time to stop elapsed_no_event = (datetime.datetime.now()-time_since_last_response).total_seconds() if loop_elapsed < self.prewait: if self.debug: print(f'{loop_elapsed}: In prewait period') if loop_elapsed > self.prewait and (elapsed_no_event > self.no_event_wait or loop_elapsed > self.max_wait): if self.debug: print(f'{loop_elapsed} No event for {elapsed_no_event}, max_wait is {self.max_wait}, breaking Network log loop.') break # try to get ws response, returns None if no response devtools_response = self.get_next_ws_response() # determine how long since we last got a response with # a Network event, if we didn't get a response we wait # for a second if devtools_response: if 'method' in devtools_response: if 'Network' in devtools_response['method']: time_since_last_response = datetime.datetime.now() else: if self.debug: print(f'No events for {elapsed_no_event} seconds; main loop running for {loop_elapsed}') else: if self.debug: print(f'No events for {elapsed_no_event} seconds; main loop running for {loop_elapsed}') time.sleep(1) continue # if we make it this far devtools_response was not None if self.debug: print(loop_elapsed,json.dumps(devtools_response)[:100]) # PRESENCE OF 'METHOD' MEANS WE PROCESS LOG DATA if 'method' in devtools_response: # REQUEST if devtools_response['method'] == 'Network.requestWillBeSent': cleaned_request = self.clean_request(devtools_response['params']) cleaned_request['event_order'] = len(requests) # update global start time to measure page load time and calculate offsets if origin_walltime == None or cleaned_request['wall_time'] < origin_walltime: origin_walltime = cleaned_request['wall_time'] if first_timestamp == None or cleaned_request['timestamp'] < first_timestamp: first_timestamp = cleaned_request['timestamp'] # DOCUMENT ME if 'redirectResponse' in devtools_response['params']: redirect_response = {} redirect_response['response'] = devtools_response['params']['redirectResponse'] redirect_response['requestId'] = devtools_response['params']['requestId'] redirect_response['loaderId'] = devtools_response['params']['loaderId'] redirect_response['timestamp'] = devtools_response['params']['timestamp'] redirect_response['type'] = devtools_response['params']['type'] redirect_response['event_order'] = len(responses) responses.append(self.clean_response(redirect_response)) cleaned_request['redirect_response_url'] = devtools_response['params']['redirectResponse']['url'] else: cleaned_request['redirect_response_url'] = None requests.append(cleaned_request) # REQUEST EXTRA INFO if devtools_response['method'] == 'Network.requestWillBeSentExtraInfo': request_extra_headers.append({ 'request_id' : devtools_response['params']['requestId'], 'headers' : devtools_response['params']['headers'], 'associated_cookies': devtools_response['params']['associatedCookies'] }) # RESPONSE if devtools_response['method'] == 'Network.responseReceived': responses.append(self.clean_response(devtools_response['params'])) # RESPONSE EXTRA INFO if devtools_response['method'] == 'Network.responseReceivedExtraInfo': response_extra_headers.append({ 'request_id' : devtools_response['params']['requestId'], 'headers' : 
devtools_response['params']['headers'], 'blocked_cookies' : devtools_response['params']['blockedCookies'], }) # LOAD FINISHED if devtools_response['method'] == 'Network.loadingFinished': request_id = devtools_response['params']['requestId'] load_finish_events.append({ 'encoded_data_length': devtools_response['params']['encodedDataLength'], 'request_id': request_id, 'timestamp': devtools_response['params']['timestamp'], }) # WEBSOCKETS if devtools_response['method'] == 'Network.webSocketCreated': if 'initiator' in devtools_response['params']: this_initiator = devtools_response['params']['initiator'] else: this_initiator = None websockets.append({ 'request_id' : devtools_response['params']['requestId'], 'url' : devtools_response['params']['url'], 'initiator' : this_initiator, 'event_order' : len(websockets) }) if devtools_response['method'] in websocket_event_types: if 'errorMessage' in devtools_response['params']: payload = devtools_response['params']['errorMessage'] elif 'request' in devtools_response['params']: payload = devtools_response['params']['request'] elif 'response' in devtools_response['params']: payload = devtools_response['params']['response'] else: payload = None websocket_events.append({ 'request_id' : devtools_response['params']['requestId'], 'timestamp' : devtools_response['params']['timestamp'], 'event_type' : devtools_response['method'].replace('Network.',''), 'payload' : payload, 'event_order' : len(websocket_events) }) # EVENT SOURCE if devtools_response['method'] == 'Network.eventSourceMessageReceived': event_source_msgs.append({ 'internal_request_id' : devtools_response['params']['requestId'], 'timestamp' : devtools_response['params']['timestamp'], 'event_name' : devtools_response['params']['eventName'], 'event_id' : devtools_response['params']['eventId'], 'data' : devtools_response['params']['data'] }) # DOMSTORAGE if devtools_response['method'] == 'DOMStorage.domStorageItemAdded' or devtools_response['method'] == 'DOMStorage.domStorageItemUpdated': dom_storage_id = devtools_response['params']['storageId'] ds_key = ( dom_storage_id['securityOrigin'], dom_storage_id['isLocalStorage'], devtools_response['params']['key'] ) dom_storage_holder[ds_key] = devtools_response['params']['newValue'] # no need to continue processing if we got nothing back if len(responses) == 0: self.exit() return ({ 'success': False, 'result': 'No responses for page' }) if len(load_finish_events) == 0: self.exit() return ({ 'success': False, 'result': 'No load_finish_events for page' }) # Stop getting additional DOMStorage events response = self.send_ws_command('DOMStorage.disable') if response['success'] == False: self.exit() return response else: # if we are not getting the log we still do the prewait/scroll if self.debug: print(f'going to prewait for {self.prewait}') for i in range(0,self.prewait): self.do_scroll() time.sleep(1) ##################### # DEVTOOLS COMMANDS # ##################### # only issue body commands for network_log if not get_text_only: if self.return_bodies: if self.debug: print('######################################') if self.debug: print(' Going to send response body commands ') if self.debug: print('######################################') # send commands to get response bodies for event in load_finish_events: request_id = event['request_id'] response = self.send_ws_command('Network.getResponseBody',f'"requestId":"{request_id}"') if response['success'] == False: self.exit() return response else: ws_id = response['result'] ws_id_to_req_id[ws_id] = request_id
pending_ws_id_to_cmd[ws_id] = 'response_body' if self.debug: print('\tdone') # No longer need Network domain enabled self.send_ws_command('Network.disable') if response['success'] == False: self.exit() return response if self.debug: print('###########################################') if self.debug: print(' Going to send devtools websocket commands ') if self.debug: print('###########################################') # send the ws commands to get above data response = self.send_ws_command('Page.getNavigationHistory') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'page_nav' response = self.send_ws_command('Runtime.evaluate',params='"expression":"document.documentElement.outerHTML","timeout":1000') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'page_src' response = self.send_ws_command('Runtime.evaluate',params='"expression":"document.documentElement.lang","timeout":1000') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'html_lang' # LINKS js = json.dumps(""" var wbxr_links = (function () { var wbxr_processed_links = []; var wbxr_links = document.links; for (var wbxr_i = 0; wbxr_i < wbxr_links.length; wbxr_i++) { wbxr_processed_links.push({ 'text' : wbxr_links[wbxr_i]['innerText'], 'href' : wbxr_links[wbxr_i]['href'], 'protocol' : wbxr_links[wbxr_i]['protocol'] }); } return (wbxr_processed_links); }()); wbxr_links; """) response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'links' # META_DESC js = json.dumps(""" document.querySelector('meta[name="description" i]').content; """) response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'meta_desc' # PAGE_TEXT / READABILITY_HTML # # Inject the locally downloaded copy of readability into the page # and extract the content. Note you must download readability on # your own and place in the appropriate directory if self.return_page_text or get_text_only: # if we can't load readability it likely isn't installed, raise error try: readability_js = open(os.path.dirname(os.path.abspath(__file__))+'/resources/policyxray/Readability.js', 'r', encoding='utf-8').read() js = json.dumps(f""" var wbxr_readability = (function() {{ {readability_js} var documentClone = document.cloneNode(true); var article = new Readability(documentClone).parse(); return (article); }}()); wbxr_readability; """) response = self.send_ws_command('Runtime.evaluate',params=f'"expression":{js},"timeout":1000,"returnByValue":true') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'page_text' except: print('\t****************************************************') print('\t The Readability.js library is needed for webXray to') print('\t extract text, and it appears to be missing. 
') print() print('\t Please go to https://github.com/mozilla/readability') print('\t download the file Readability.js and place it ') print('\t in the directory "webxray/resources/policyxray/" ') print('\t****************************************************') self.exit() return ({ 'success': False, 'result': 'Attempting to extract text but Readability.js is not found.' }) else: page_text = None readability_html = None if self.return_screen_shot: # scroll back to top for screen shot try: self.driver.execute_script('window.scrollTo(0, 0);') except: pass response = self.send_ws_command('Page.captureScreenshot') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'screen_shot' else: screen_shot = None # do cookies last response = self.send_ws_command('Network.getAllCookies') if response['success'] == False: self.exit() return response else: ws_id = response['result'] pending_ws_id_to_cmd[ws_id] = 'cookies' # Keep track of how long we've been reading ws data response_loop_start = datetime.datetime.now() # just to let us know how much work to do if self.debug: print('Pending ws requests: %s %s' % (url, len(pending_ws_id_to_cmd))) # Keep going until we get all the pending responses or 3min timeout while True: # if result is None we are either out of responses (prematurely) or # we failed devtools_response = self.get_next_ws_response() if not devtools_response: self.exit() return ({ 'success': False, 'result': 'Unable to get devtools response.' }) # update how long we've been going loop_elapsed = (datetime.datetime.now()-response_loop_start).total_seconds() # if we're still processing responses after 3 min, kill it if loop_elapsed > 180: self.exit() return ({ 'success': False, 'result': 'Timeout when processing devtools responses.' 
}) if self.debug: print(loop_elapsed,json.dumps(devtools_response)[:100]) # if response has an 'id' see which of our commands it goes to if 'id' in devtools_response: ws_id = devtools_response['id'] # we don't care about this if ws_id not in pending_ws_id_to_cmd: continue # remove the current one from pending # if self.debug: print('Going to remove ws_id %s from pending' % ws_id) cmd = pending_ws_id_to_cmd[ws_id] del pending_ws_id_to_cmd[ws_id] if self.debug: print(f'Removing {ws_id}:{cmd}, pending ws_id count is %s' % len(pending_ws_id_to_cmd)) # NAV HISTORY/FINAL_URL if cmd == 'page_nav': try: final_url = devtools_response['result']['entries'][-1]['url'] title = devtools_response['result']['entries'][-1]['title'] except: self.exit() return ({ 'success': False, 'result': 'Unable to get final_url,title via Devtools' }) # this is the first time we know it is a redirect, return now to save further wasted effort is_redirect = self.is_url_internal(url,final_url) if self.reject_redirects and (is_redirect == None or is_redirect == False): self.exit() return ({ 'success': False, 'result': 'rejecting redirect' }) # PAGE SOURCE elif cmd == 'page_src': try: page_source = devtools_response['result']['result']['value'] except: self.exit() return ({ 'success': False, 'result': 'Unable to get page_source via Devtools' }) # HTML LANG elif cmd == 'html_lang': try: lang = devtools_response['result']['result']['value'] except: self.exit() return ({ 'success': False, 'result': 'Unable to get html lang via Devtools' }) # RESPONSE BODIES elif cmd == 'response_body': if 'result' not in devtools_response: if self.debug: print('response body error: %s' % devtools_response) continue # if we are here we already know return_bodies is true so we # just have to check the reponse is either not base64 or we # do want to return base64 if devtools_response['result']['base64Encoded'] == False or self.return_bodies_base64: response_bodies[ws_id_to_req_id[ws_id]] = { 'body': devtools_response['result']['body'], 'is_base64': devtools_response['result']['base64Encoded'] } # SCREENSHOT elif cmd == 'screen_shot': if 'result' in devtools_response: screen_shot = devtools_response['result']['data'] # COOKIES elif cmd == 'cookies': try: cookies = devtools_response['result']['cookies'] except: self.exit() return ({ 'success': False, 'result': 'Unable to get cookies via Devtools' }) # LINKS elif cmd == 'links': try: js_links = devtools_response['result']['result']['value'] except: js_links = [] # META_DESC elif cmd == 'meta_desc': try: meta_desc = devtools_response['result']['result']['value'] except: meta_desc = None # PAGE_TEXT elif cmd == 'page_text': # if we don't get a result we don't care try: page_text = devtools_response['result']['result']['value']['textContent'] readability_html = devtools_response['result']['result']['value']['content'] except: page_text = None readability_html = None # we've gotten all the reponses we need, break if len(pending_ws_id_to_cmd) == 0: if self.debug: print('Got all ws responses!') break # end ws loop # catch redirect to illegal url if not self.is_url_valid(final_url): self.exit() return ({ 'success': False, 'result': 'Redirected to illegal url: '+final_url }) # process links and mark if internal all_links = [] internal_link_count = 0 for link in js_links: # filtering steps if 'href' not in link: continue if len(link['href']) == 0: continue if link['protocol'][:4] != 'http': continue # get rid of trailing # and / if link['href'].strip()[-1:] == '#': link['href'] = link['href'].strip()[:-1] if 
link['href'].strip()[-1:] == '/': link['href'] = link['href'].strip()[:-1] # sometimes the text will be a dict (very rarely) # so we convert to string link_text = str(link['text']).strip() # set up the dict if self.is_url_internal(final_url,link['href']): internal_link_count += 1 link = { 'text' : link_text, 'href' : link['href'].strip(), 'internal' : True } else: link = { 'text' : link_text, 'href' : link['href'].strip(), 'internal' : False } # only add unique links if link not in all_links: all_links.append(link) # fail if we don't have enough internal links if self.min_internal_links: if internal_link_count < self.min_internal_links: self.exit() return ({ 'success': False, 'result': 'did not find enough internal links' }) # reprocess domstorage into list of dicts if doing network_log if not get_text_only: if self.debug: print('Fixing domstorage') for ds_key in dom_storage_holder: dom_storage.append({ 'security_origin' : ds_key[0], 'is_local_storage' : ds_key[1], 'key' : ds_key[2], 'value' : dom_storage_holder[ds_key] }) ################################################ # FIX TIMESTAMPS: ONLY NEEDED FOR NETWORK_LOG # ################################################ if not get_text_only: # See note above regarding how chrome timestamps work, in the below blocks # we fix the timestamps to reflect real world time. if self.debug: print('Fixing timestamps') # likely nothing was loaded if not first_timestamp: self.exit() return ({ 'success': False, 'result': 'first_timestamp was None' }) # Page load time is the delta between the origin_walltime and the final_walltime # we initialize final_walltime to None as if it does not get updated nothing # was loaded and we failed. final_walltime = None # As we update the load_finish_event timestamps we also update the final_walltime. 
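# Worked example of the conversion applied below via fixed_timestamp() (numbers are
# illustrative): if the earliest request carried wallTime 1600000000.0 and timestamp
# 1000.5, then an event stamped 1002.25 happened at
# 1600000000.0 + (1002.25 - 1000.5) = 1600000001.75 in UNIX time.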
for load_finish_event in load_finish_events: fixed_timestamp = self.fixed_timestamp(origin_walltime, first_timestamp, load_finish_event['timestamp']) load_finish_event['timestamp'] = fixed_timestamp if final_walltime == None or fixed_timestamp > final_walltime: final_walltime = fixed_timestamp # These timestamp fixes are straightforward for request in requests: request['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, request['timestamp']) for response in responses: response['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, response['timestamp']) for websocket_event in websocket_events: websocket_event['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, websocket_event['timestamp']) for event_source_msg in event_source_msgs: event_source_msg['timestamp'] = self.fixed_timestamp(origin_walltime, first_timestamp, event_source_msg['timestamp']) # Session cookies have expires of -1 so we sent to None for cookie in cookies: if cookie['expires']: if cookie['expires'] > 0: cookie['expires'] = cookie['expires'] else: cookie['expires'] = None # If origin_walltime or final_walltime are None that means # we didn't record any Network.requestWillBeSent or # Network.loadingFinished events, and this was not a successful # page load if origin_walltime == None or final_walltime == None: self.exit() return ({ 'success': False, 'result': 'Unable to calculate load time, possible nothing was loaded' }) else: # get seconds between the last time we got a load finish and # the first request load_time = (datetime.datetime.fromtimestamp(final_walltime) - datetime.datetime.fromtimestamp(origin_walltime)).total_seconds() #load_time = 0 else: # we only do a prewait if not doing network log load_time = self.prewait # other parts of webxray expect this data format, common to all browser drivers used if self.debug: print('returning data on %s' % url) return_dict = { 'accessed' : origin_walltime, 'all_links' : all_links, 'client_timezone' : '_'.join(time.tzname), 'browser_type' : self.browser_type, 'browser_version' : self.browser_version, 'prewait' : self.prewait, 'no_event_wait' : self.no_event_wait, 'max_wait' : self.max_wait, 'start_url' : url, 'final_url' : final_url, 'title' : title, 'meta_desc' : meta_desc, 'lang' : lang, 'load_time' : load_time, 'requests' : requests, 'request_extra_headers' : request_extra_headers, 'responses' : responses, 'response_extra_headers': response_extra_headers, 'load_finish_events' : load_finish_events, 'websockets' : websockets, 'websocket_events' : websocket_events, 'event_source_msgs' : event_source_msgs, 'response_bodies' : response_bodies, 'cookies' : cookies, 'dom_storage' : dom_storage, 'page_source' : page_source, 'page_text' : page_text, 'readability_html' : readability_html, 'screen_shot' : screen_shot, 'page_load_strategy' : self.page_load_strategy } # Close browser and websocket connection, if doing a crawl # this happens in get_crawl_traffic if self.is_crawl == False: self.exit() # done! return ({ 'success': True, 'result': return_dict }) # get_scan def clean_request(self, request_params): """ Many of the request fields are optional so we make sure we make them None if not present and also normalize the naming convention. Returns a dict. 
""" cleaned_request = {} # get non-optional values first cleaned_request['request_id'] = request_params['requestId'] cleaned_request['loader_id'] = request_params['loaderId'] cleaned_request['document_url'] = request_params['documentURL'] cleaned_request['timestamp'] = request_params['timestamp'] cleaned_request['wall_time'] = request_params['wallTime'] cleaned_request['initiator'] = request_params['initiator'] # handle optional values in main params if 'type' in request_params: cleaned_request['type'] = request_params['type'] else: cleaned_request['type'] = None if 'frameId' in request_params: cleaned_request['frame_id'] = request_params['frameId'] else: cleaned_request['frame_id'] = None if 'hasUserGesture' in request_params: cleaned_request['has_user_gesture'] = request_params['hasUserGesture'] else: cleaned_request['has_user_gesture'] = None if 'redirectResponse' in request_params: cleaned_request['redirect_response_url'] = request_params['redirectResponse']['url'] else: cleaned_request['redirect_response_url'] = None # for readability this_request = request_params['request'] # get non-optional values first cleaned_request['url'] = this_request['url'] cleaned_request['method'] = this_request['method'] cleaned_request['headers'] = this_request['headers'] cleaned_request['initial_priority'] = this_request['initialPriority'] cleaned_request['referrer_policy'] = this_request['referrerPolicy'] # handle optional values in request if 'urlFragment' in this_request: cleaned_request['url_fragment'] = this_request['urlFragment'] else: cleaned_request['url_fragment'] = None if 'postData' in this_request: cleaned_request['post_data'] = this_request['postData'] else: cleaned_request['post_data'] = None if 'mixedContentType' in this_request: cleaned_request['mixed_content_type'] = this_request['mixedContentType'] else: cleaned_request['mixed_content_type'] = None if 'isLinkPreload' in this_request: cleaned_request['is_link_preload'] = this_request['isLinkPreload'] else: cleaned_request['is_link_preload'] = None # done! return cleaned_request # clean_request def clean_response(self, response_params): """ Many of the response fields are optional so we make sure we make them None if not present and also normalize the naming convention. Returns a dict. 
""" cleaned_response = {} # get non-optional param values first cleaned_response['request_id'] = response_params['requestId'] cleaned_response['loader_id'] = response_params['loaderId'] cleaned_response['timestamp'] = response_params['timestamp'] cleaned_response['type'] = response_params['type'] # handle optional param values if 'frameId' in response_params: cleaned_response['frame_id'] = response_params['frameId'] else: cleaned_response['frame_id'] = None # handle non-optional reponse values this_response = response_params['response'] cleaned_response['url'] = this_response['url'] cleaned_response['status'] = this_response['status'] cleaned_response['status_text'] = this_response['statusText'] cleaned_response['response_headers'] = this_response['headers'] cleaned_response['mime_type'] = this_response['mimeType'] cleaned_response['connection_reused'] = this_response['connectionReused'] cleaned_response['connection_id'] = this_response['connectionId'] cleaned_response['encoded_data_length'] = this_response['encodedDataLength'] cleaned_response['security_state'] = this_response['securityState'] # handle optional response values if 'requestHeaders' in this_response: cleaned_response['request_headers'] = this_response['requestHeaders'] else: cleaned_response['request_headers'] = None if 'remoteIPAddress' in this_response: cleaned_response['remote_ip_address'] = this_response['remoteIPAddress'] else: cleaned_response['remote_ip_address'] = None if 'remotePort' in this_response: cleaned_response['remote_port'] = this_response['remotePort'] else: cleaned_response['remote_port'] = None if 'fromDiskCache' in this_response: cleaned_response['from_disk_cache'] = this_response['fromDiskCache'] else: cleaned_response['from_disk_cache'] = None if 'fromServiceWorker' in this_response: cleaned_response['from_service_worker'] = this_response['fromServiceWorker'] else: cleaned_response['from_service_worker'] = None if 'fromPrefetchCache' in this_response: cleaned_response['from_prefetch_cache'] = this_response['fromPrefetchCache'] else: cleaned_response['from_prefetch_cache'] = None if 'timing' in this_response: cleaned_response['timing'] = this_response['timing'] else: cleaned_response['timing'] = None if 'protocol' in this_response: cleaned_response['protocol'] = this_response['protocol'] else: cleaned_response['protocol'] = None if 'securityDetails' in this_response: cleaned_response['security_details'] = this_response['securityDetails'] else: cleaned_response['security_details'] = None # done! return cleaned_response # clean_response def fixed_timestamp(self,origin_walltime,first_timestamp,timestamp): """ See notes above for details. """ # first calculate the timestamp offset elapsed_time = timestamp - first_timestamp # now add offset to the origin time to get the real time return origin_walltime + elapsed_time # fixed_timestamp def is_url_valid(self, url): """ Performs checks to verify if the url can actually be scanned. 
""" # only do http links if not (re.match('^https?://.+', url)): return False # if we can't get the url_path it is invalid try: url_path = urlsplit(url.strip().lower()).path except: return False # these are common file types we want to avoid illegal_extensions = [ 'apk', 'dmg', 'doc', 'docx', 'exe', 'ics', 'iso', 'pdf', 'ppt', 'pptx', 'rtf', 'txt', 'xls', 'xlsx' ] # if we can't parse the extension it doesn't exist and is # therefore ok by our standards try: url_extension = re.search('\.([0-9A-Za-z]+)$', url_path).group(1) if url_extension in illegal_extensions: return False except: return True # it's good return True # is_url_valid def idna_encode_url(self, url, no_fragment=False): """ Non-ascii domains will crash some browsers, so we need to convert them to idna/ascii/utf-8. This requires splitting apart the url, converting the domain to idna, and pasting it all back together. Note that this can fail, particularly in regards to idna encoding of invalid addresses (eg http://.example.com) so we return None in fail event. """ try: split_url = urlsplit(url.strip()) idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8') if no_fragment: return urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,'')) else: return urlunsplit((split_url.scheme,idna_fixed_netloc,split_url.path,split_url.query,split_url.fragment)) except: return None # idna_encode_url def is_url_internal(self,origin_url,target_url): """ Given two urls (origin, target) determines if the target is internal to the origin based on subsuffix+1 domain. """ origin_domain = self.url_parser.get_parsed_domain_info(origin_url) target_domain = self.url_parser.get_parsed_domain_info(target_url) # we return None to signify we couldn't parse the urls if not origin_domain['success'] or not target_domain['success']: return None else: origin_domain = origin_domain['result']['domain'] target_domain = target_domain['result']['domain'] if origin_domain != target_domain: return False else: return True # is_url_internal def do_scroll(self): """ Performs a random scroll action on Y axis, can be called at regular intervals to surface content on pages. """ self.send_ws_command('Input.dispatchMouseEvent','"x":0,"y":0,"type":"mouseWheel","deltaX":0,"deltaY":%s' % random.randrange(10,100))
class OutputStore: """ This class receives data from the browser, processes it, and stores it in the db """ def __init__(self, db_name, db_engine): self.db_name = db_name self.utilities = Utilities() self.url_parser = ParseURL() self.debug = False if db_engine == 'sqlite': from webxray.SQLiteDriver import SQLiteDriver self.sql_driver = SQLiteDriver(self.db_name) elif db_engine == 'postgres': from webxray.PostgreSQLDriver import PostgreSQLDriver self.sql_driver = PostgreSQLDriver(self.db_name) else: print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine) quit() self.config = self.sql_driver.get_config() # __init__ def close(self): """ Just to make sure we close the db connection. """ self.sql_driver.close() # close def store_scan(self, params): """ This function pre-processes data from the browser, inserts it into database, and handles linking various entries across tables. """ # unpack params browser_output = params['browser_output'] client_id = params['client_id'] crawl_id = params['crawl_id'] crawl_timestamp = params['crawl_timestamp'] crawl_sequence = params['crawl_sequence'] # client_ip is optional if 'client_ip' in params: client_ip = params['client_ip'] else: client_ip = None if self.debug: print('going to store scan %s' % browser_output['start_url']) # keep track of domains page_3p_cookie_domains = set() page_3p_dom_storage_domains = set() page_3p_request_domains = set() page_3p_response_domains = set() page_3p_websocket_domains = set() # convert from timestamp to datetime object that will go to the db accessed = datetime.fromtimestamp(browser_output['accessed']) # first make sure we don't have it already if self.sql_driver.page_exists(browser_output['start_url'],accessed): return {'success': False, 'result': 'exists in db already'} # if we have no responses the page didn't load at all and we skip # unless we are using basic driver and then it's ok if len(browser_output['responses']) == 0 and browser_output['browser_type'] != 'basic': return {'success': False, 'result': 'no responses received'} # ignore any malformed unicode characters page_source = browser_output['page_source'].encode('utf-8', 'ignore').decode() # store source if self.config['store_source']: if self.debug: print('going to store source %s' % browser_output['start_url']) page_source_md5 = self.store_file(page_source, False, 'page_source') else: page_source_md5 = None # store readability_html if self.config['store_page_text'] and browser_output['page_text']: if self.debug: print('going to store readability_html') # ignore any malformed unicode characters readability_html = browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip() readability_source_md5 = self.store_file(readability_html, False, 'readability_html') # store_page_text handles some addition operations if self.debug: print('going to store page_text') page_text_id = self.store_page_text(readability_html,readability_source_md5) else: page_text_id = None # process info on the start_url domain if self.debug: print('going to parse start/final_url %s' % browser_output['start_url']) start_url = browser_output['start_url'] start_url_domain_info = self.url_parser.get_parsed_domain_info(start_url) if start_url_domain_info['success'] == False: err_msg = 'unable to parse start_url_domain_info info for %s with error %s' % (browser_output['start_url'], start_url_domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) return {'success': 
False, 'result': 'could not parse start_url'} else: # needed for comparisons later on start_url_domain = start_url_domain_info['result']['domain'] # add start_url domain and get id start_url_domain_id = self.sql_driver.add_domain(start_url_domain_info['result']) # process info on the final_url domain # note: we use the final_url domain as the benchmark for determine 1p/3p final_url = browser_output['final_url'] final_url_domain_info = self.url_parser.get_parsed_domain_info(final_url) if final_url_domain_info['success'] == False: err_msg = 'unable to parse final_url_domain_info info for %s with error %s' % (browser_output['final_url'], final_url_domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) return {'success': False, 'result': 'could not parse final_url'} else: final_url_domain = final_url_domain_info['result']['domain'] # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id final_url_domain_id = self.sql_driver.add_domain(final_url_domain_info['result']) # check if the page has redirected to a new domain if start_url_domain != final_url_domain: page_domain_redirect = True else: page_domain_redirect = False # this is semi-redundant but ensures that any config changes made while # a result is queued are followed if self.config['client_reject_redirects'] and page_domain_redirect: return {'success': False, 'result': 'rejecting redirect'} # if the final page is https (often after a redirect), mark it appropriately if browser_output['final_url'][:5] == 'https': page_is_ssl = True else: page_is_ssl = False # (optionally) process and store links, this allows us to go back later and do deeper scans # as well as do more with policies # links starts as empty list links = [] # keep track of link counts as helpful for filtering pages link_count_internal = 0 link_count_external = 0 if self.config['store_links']: if self.debug: print('going to process links %s' % browser_output['start_url']) # we use the list of policy_link_terms to flag that a link *might* # be for a policy, we check if it actually is policy in PolicyCollector.py policy_link_terms = self.utilities.get_policy_link_terms() # process links, duplicates get ignored by db for link in browser_output['all_links']: # skip if href not valid if not self.utilities.is_url_valid(link['href']): continue # unpack values and catch any unicode errors link_text = link['text'].encode('utf-8', 'ignore').decode() link_url = link['href'].encode('utf-8', 'ignore').decode() # get rid of trailing # and / if link_url.strip()[-1:] == '#': link_url = link_url.strip()[:-1] if link_url.strip()[-1:] == '/': link_url = link_url.strip()[:-1] # sometimes the text will be a dict (very rarely) # so we convert to string link_text = str(link_text).strip() # clean up white space and remove line breaks link_text = re.sub('\n|\r|\t|\s+',' ',link_text.strip()) link_url = re.sub('\n|\r|\t|\s+',' ',link_url.strip()) # catch nulls link_text = link_text.replace('\x00','NULL_REPLACED_FOR_PSQL') link_url = link_url.replace('\x00','NULL_REPLACED_FOR_PSQL') # update counts if link['internal']: link_count_internal += 1 else: link_count_external += 1 # flag links that could be policies, default False link_is_policy = False # determine if a policy term appears in the link for policy_term in policy_link_terms: if policy_term in link_text.lower(): link_is_policy = True break 
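# Illustrative note (not from the original source): with a policy_link_terms list that
# includes, say, 'privacy', a link whose text is 'Privacy Policy' lowercases to
# 'privacy policy', matches the term, and is flagged link_is_policy = True here;
# whether it really is a policy is only verified later in PolicyCollector.py.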
link_domain_info = self.url_parser.get_parsed_domain_info(link_url) if link_domain_info['success'] == False: # don't bother with storing errors link_domain_id = None else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id link_domain_id = self.sql_driver.add_domain(link_domain_info['result']) links.append({ 'url' : link_url, 'text' : link_text, 'is_internal' : link['internal'], 'is_policy' : link_is_policy, 'domain_id' : link_domain_id }) # if we got the screen shot we get the hash and store it to the file table screen_shot_md5 = None if browser_output['screen_shot'] and self.config['store_screen_shot']: if self.debug: print('going to store screen shot %s' % browser_output['start_url']) # store file to get md5 screen_shot_md5 = self.store_file(browser_output['screen_shot'],True,'screen_shot') # if we have timestamp it is also an 'accessed' field from # a page load so we convert that as well if crawl_timestamp: crawl_timestamp = datetime.fromtimestamp(crawl_timestamp) # ignore any malformed unicode characters if browser_output['title']: browser_output['title'] = browser_output['title'].encode('utf-8', 'ignore').decode() if browser_output['meta_desc']: browser_output['meta_desc'] = browser_output['meta_desc'].encode('utf-8', 'ignore').decode() if browser_output['lang']: browser_output['lang'] = browser_output['lang'].encode('utf-8', 'ignore').decode() # now we know link counts we can store the page if self.debug: print('going to store page %s' % browser_output['start_url']) page_id = self.sql_driver.add_page({ 'accessed' : accessed, 'browser_type' : browser_output['browser_type'], 'browser_version' : browser_output['browser_version'], 'browser_prewait' : browser_output['prewait'], 'browser_no_event_wait' : browser_output['no_event_wait'], 'browser_max_wait' : browser_output['max_wait'], 'page_load_strategy' : browser_output['page_load_strategy'], 'title' : browser_output['title'], 'meta_desc' : browser_output['meta_desc'], 'lang' : browser_output['lang'], 'start_url' : browser_output['start_url'], 'final_url' : browser_output['final_url'], 'is_ssl' : page_is_ssl, 'page_domain_redirect' : page_domain_redirect, 'link_count_internal' : link_count_internal, 'link_count_external' : link_count_external, 'load_time' : browser_output['load_time'], 'start_url_domain_id' : start_url_domain_id, 'final_url_domain_id' : final_url_domain_id, 'client_id' : client_id, 'client_timezone' : browser_output['client_timezone'], 'client_ip' : client_ip, 'page_text_id' : page_text_id, 'screen_shot_md5' : screen_shot_md5, 'page_source_md5' : page_source_md5, 'crawl_id' : crawl_id, 'crawl_timestamp' : crawl_timestamp, 'crawl_sequence' : crawl_sequence }) # STORE LINKS if self.config['store_links']: if self.debug: print('going to store links %s' % browser_output['start_url']) for link in links: link_id = self.sql_driver.add_link(link) if link_id: self.sql_driver.join_link_to_page(page_id,link_id) # PROCESS DOM_STORAGE if self.config['store_dom_storage']: if self.debug: print('going to process dom storage %s' % browser_output['start_url']) for dom_storage in browser_output['dom_storage']: # parse domain from the security_origin, which is equivalent to a url domain_info = self.url_parser.get_parsed_domain_info(dom_storage['security_origin']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (dom_storage['security_origin'], domain_info['result']) if self.debug: print(err_msg) 
self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id dom_storage['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party storage if final_url_domain != domain_info['result']['domain']: dom_storage['is_3p'] = True else: dom_storage['is_3p'] = False # key to page dom_storage['page_id'] = page_id # replace null b/c postgres will die otherwise dom_storage['key'] = dom_storage['key'].replace('\x00','NULL_REPLACED_FOR_PSQL') dom_storage['value'] = dom_storage['value'].replace('\x00','NULL_REPLACED_FOR_PSQL') # there types of illegal utf-8 characters that psql doesn't like, eg trying to store # '\uded5' gives this error when storing in psql: # 'UnicodeEncodeError: 'utf-8' codec can't encode character '\uded5' in position 0: surrogates not allowed' # # to overcome the above, we use python's backslashreplace to keep the original data in # a way that won't cause our queries to die # see https://docs.python.org/3/library/codecs.html#error-handlers dom_storage['key'] = dom_storage['key'].encode('utf-8','backslashreplace') dom_storage['value'] = dom_storage['value'].encode('utf-8','backslashreplace') # now that we've encoded with backslashes we decode to get the semi-original data dom_storage['key'] = dom_storage['key'].decode('utf-8') dom_storage['value'] = dom_storage['value'].decode('utf-8') # all done with this item self.sql_driver.add_dom_storage(dom_storage) # update domains if dom_storage['is_3p']: page_3p_dom_storage_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # PROCESS LOAD FINISH if self.debug: print('going to process load finish data %s' % browser_output['start_url']) load_finish_data = {} for load_finish_event in browser_output['load_finish_events']: load_finish_data[load_finish_event['request_id']] = load_finish_event['encoded_data_length'] # RESPONSE EXTRA HEADERS if self.debug: print('going to process response extra header data %s' % browser_output['start_url']) http_cookies = [] internal_id_to_resp_ex_headers = {} for response_extra_header in browser_output['response_extra_headers']: response_extra_header['page_id'] = page_id response_extra_header['cookies_set'] = None # to check for domain leakage in headers we make a big string keyed to the internal id if response_extra_header['request_id'] not in internal_id_to_resp_ex_headers: internal_id_to_resp_ex_headers[response_extra_header['request_id']] = str(response_extra_header['headers']) else: internal_id_to_resp_ex_headers[response_extra_header['request_id']] += str(response_extra_header['headers']) for item in response_extra_header['headers']: if item.lower() == 'set-cookie': response_extra_header['cookies_set'] = response_extra_header['headers'][item] # when we add cookies later on we mark those that came from response headers, # note we try/pass on this in case we can't parse for cookie in response_extra_header['cookies_set'].split('\n'): if 'domain' in cookie.lower(): try: name = re.match('^(.+?)=',cookie)[0][:-1] domain = re.match('^.+domain=(.+?)(;|$)',cookie.lower())[1] if domain[0] == '.': domain = domain[1:] http_cookies.append((domain,name)) except: pass if self.config['store_response_xtra_headers']: self.sql_driver.add_response_extra_header(response_extra_header) # PROCESS RESPONSES response_received_req_ids = [] if self.debug: print('going 
to process response data %s' % browser_output['start_url']) for response in browser_output['responses']: # default values that may get overwritten response['file_md5'] = None response['is_data'] = False response['is_3p'] = None response['is_ssl'] = None response['page_domain_in_headers'] = False # first handle non-http urls and optionally store content if re.match('^(data|about|chrome|blob|javascript).+', response['url']): if 'base64' in response['url'].lower() or 'image' in response['type'].lower(): is_base64 = True else: is_base64 = False # store_file follows the config as far as actually storing the file goes # and will either return the md5 or None # make sure we're following our configuration if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): response['file_md5'] = self.store_file(response['url'],is_base64,response['type']) else: response['file_md5'] = None response['url'] = None response['is_data'] = True response['domain_id'] = None else: # parse, store, and get id of domain; if fails skip domain_info = self.url_parser.get_parsed_domain_info(response['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (response['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: response_domain = domain_info['result']['domain'] response['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # now add ip if response['remote_ip_address']: self.sql_driver.add_domain_ip_addr(response['domain_id'],response['remote_ip_address']) # mark third-party responses based on final_url domain if response_domain != final_url_domain: response['is_3p'] = True else: response['is_3p'] = False # determine if encrypted if response['url'][:5] == 'https' or response['url'][:3] == 'wss': response['is_ssl'] = True else: response['is_ssl'] = False # keep track of the request ids of each response to mark as received response_received_req_ids.append(response['request_id']) # we do no more processing at this point if not self.config['store_responses']: continue # lower case the type, simplifies db queries response['type'] = response['type'].lower() # store the security details if they exist if response['security_details'] and self.config['store_security_details']: response['security_details_id'] = self.sql_driver.add_security_details(response['security_details']) else: response['security_details_id'] = None # store the size of the request if response['request_id'] in load_finish_data: response['final_data_length'] = load_finish_data[response['request_id']] else: response['final_data_length'] = None # parse off args/etc # consider anything before the "?" to be the element_url try: response['base_url'] = re.search('^(.+?)\?.+$', response['url']).group(1) except: response['base_url'] = response['url'] # attempt to parse off the extension try: response['extension'] = re.search('\.([0-9A-Za-z]+)$', response['base_url']).group(1).lower() except: response['extension'] = None # First see if this request_id is present in response_bodies, and if # the entry is not None, then we store it to the db if config says to.
if response['request_id'] in browser_output['response_bodies']: if browser_output['response_bodies'][response['request_id']]: # make sure we're following our configuration is_base64 = browser_output['response_bodies'][response['request_id']]['is_base64'] if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): response['file_md5'] = self.store_file( browser_output['response_bodies'][response['request_id']]['body'], is_base64, response['type'] ) else: response['file_md5'] = None # link to page response['page_id'] = page_id # parse data headers, accounts for upper/lower case variations (eg 'set-cookie', 'Set-Cookie') response['content_type'] = None response['cookies_set'] = None for item in response['response_headers']: if item.lower() == 'content-type': response['content_type'] = response['response_headers'][item] if item.lower() == 'set-cookie': response['cookies_set'] = response['response_headers'][item] # if we have request_headers look for cookies sent response['cookies_sent'] = None if response['request_headers']: for item in response['request_headers']: if item.lower() == 'cookie': response['cookies_sent'] = response['request_headers'][item] # parse referer header response['referer'] = None for item in response['response_headers']: if item.lower() == 'referer': response['referer'] = response['response_headers'][item] # check if domain leaked in referer if response['request_id'] in internal_id_to_resp_ex_headers: if final_url_domain in internal_id_to_resp_ex_headers[response['request_id']]: response['page_domain_in_headers'] = True # convert from timestamp to datetime object that will go to the db response['timestamp'] = datetime.fromtimestamp(response['timestamp']) # store self.sql_driver.add_response(response) # update domains if response['is_3p']: page_3p_response_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # REQUEST EXTRA HEADERS if self.debug: print('going to process request extra headers data %s' % browser_output['start_url']) internal_id_to_req_ex_headers = {} for request_extra_header in browser_output['request_extra_headers']: request_extra_header['page_id'] = page_id request_extra_header['cookies_sent'] = None # to check for domain leakage in headers we make a big string keyed to the internal id if request_extra_header['request_id'] not in internal_id_to_req_ex_headers: internal_id_to_req_ex_headers[request_extra_header['request_id']] = str(request_extra_header['headers']) else: internal_id_to_req_ex_headers[request_extra_header['request_id']] += str(request_extra_header['headers']) for item in request_extra_header['headers']: if item.lower() == 'cookie': request_extra_header['cookies_sent'] = request_extra_header['headers'][item] if self.config['store_request_xtra_headers']: self.sql_driver.add_request_extra_header(request_extra_header) # PROCESS REQUESTS if self.config['store_requests']: if self.debug: print('going to process request data %s' % browser_output['start_url']) for request in browser_output['requests']: # defaut values that may get over-written request['file_md5'] = None request['is_data'] = False request['is_3p'] = None request['is_ssl'] = None request['page_domain_in_headers'] = False # first handle non-http urls and optionally store content if re.match('^(data|about|chrome|blob|javascript).+', request['url']): if 'base64' in request['url'].lower() or 'image' in request['url'].lower(): is_base64 = True else: is_base64 = False # store_file follows the config as far as actually storing the 
file goes # and will either return the md5 or None # make sure we're following our configuration if self.config['store_files'] and (self.config['store_base64'] or is_base64 == False): request['file_md5'] = self.store_file(request['url'],is_base64,request['type']) else: request['file_md5'] = None request['url'] = None request['is_data'] = True request['domain_id'] = None else: # parse, store, and get id of domain; if fails skip domain_info = self.url_parser.get_parsed_domain_info(request['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (request['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: request_domain = domain_info['result']['domain'] request['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark third-party requests based on final_url domain if request_domain != final_url_domain: request['is_3p'] = True else: request['is_3p'] = False # determine if encrypted if request['url'][:5] == 'https' or request['url'][:3] == 'wss': request['is_ssl'] = True else: request['is_ssl'] = False # replace null b/c postgres will die otherwise if request['post_data']: request['post_data'] = request['post_data'].replace('\x00','NULL_REPLACED_FOR_PSQL') # consider anything after the "?" to be the GET data try: get_string = re.search('^.+\?(.+)$', request['url']).group(1) get_string = get_string.replace('\x00','NULL_REPLACED_FOR_PSQL') get_data = {} for key_val in get_string.split('&'): get_data[key_val.split('=')[0]] = key_val.split('=')[1] request['get_data'] = json.dumps(get_data) except: request['get_data'] = None # mark if response received if request['request_id'] in response_received_req_ids: request['response_received'] = True else: request['response_received'] = None # mark if the loading finished if request['request_id'] in load_finish_data: request['load_finished'] = True else: request['load_finished'] = None # lower case the type, simplifies db queries if request['type']: request['type'] = request['type'].lower() # parse off args/etc # consider anything before the "?" 
to be the element_url try: request['base_url'] = re.search('^(.+?)\?.+$', request['url']).group(1) except: request['base_url'] = request['url'] # attempt to parse off the extension try: request['extension'] = re.search('\.([0-9A-Za-z]+)$', request['base_url']).group(1).lower() except: request['extension'] = None # link to page request['page_id'] = page_id # parse referer header request['referer'] = None for item in request['headers']: if item.lower() == 'referer': request['referer'] = request['headers'][item] # check if domain leaked in headers if request['request_id'] in internal_id_to_req_ex_headers: if final_url_domain in internal_id_to_req_ex_headers[request['request_id']]: request['page_domain_in_headers'] = True # convert from timestamp to datetime object that will go to the db request['timestamp'] = datetime.fromtimestamp(request['timestamp']) # all done self.sql_driver.add_request(request) # update domains if request['is_3p']: page_3p_request_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) # PROCESS WEBSOCKETS if self.config['store_websockets']: if self.debug: print('going to process websocket data %s' % browser_output['start_url']) ws_id_map = {} for websocket in browser_output['websockets']: domain_info = self.url_parser.get_parsed_domain_info(websocket['url']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (websocket['url'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id websocket['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party connection if final_url_domain != domain_info['result']['domain']: websocket['is_3p'] = True else: websocket['is_3p'] = False websocket['page_id'] = page_id this_websocket_id = self.sql_driver.add_websocket(websocket) # update domains if websocket['is_3p']: page_3p_websocket_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) if websocket['request_id'] not in ws_id_map: ws_id_map[websocket['request_id']] = this_websocket_id else: print('ERROR WS_REQ_ID ALREADY IN MAP') # PROCESS WEBSOCKET EVENTS if self.config['store_websockets'] and self.config['store_websocket_events']: for websocket_event in browser_output['websocket_events']: websocket_event['page_id'] = page_id if websocket_event['request_id'] in ws_id_map: websocket_event['websocket_id'] = ws_id_map[websocket_event['request_id']] else: websocket_event['websocket_id'] = None # convert from timestamp to datetime object that will go to the db websocket_event['timestamp'] = datetime.fromtimestamp(websocket_event['timestamp']) self.sql_driver.add_websocket_event(websocket_event) # PROCESS EVENT SOURCE MSGS if self.config['store_event_source_msgs']: if self.debug: print('going to process event source data %s' % browser_output['start_url']) for event_source_msg in browser_output['event_source_msgs']: event_source_msg['page_id'] = page_id # convert from timestamp to datetime object that will go to the db event_source_msg['timestamp'] = datetime.fromtimestamp(event_source_msg['timestamp']) self.sql_driver.add_event_source_msg(event_source_msg) # PROCESS COOKIES if self.config['store_cookies']: if self.debug: print('going to process cookies %s' % browser_output['start_url']) 
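# Worked example (illustrative, not from the original source): if the page's final_url
# domain is 'example.com' and a cookie arrives with domain '.cdn-tracker.net', the
# 'http://' prefix hack below lets the parser resolve it to registered domain
# 'cdn-tracker.net', so the cookie is marked is_3p = True; the leading '.' is also
# stripped before the (domain, name) tuple is checked against http_cookies to set
# is_set_by_response.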
for cookie in browser_output['cookies']: # get the ip, fqdn, domain, pubsuffix, and tld # we need the domain to figure out if cookies/elements are third-party # note: # url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http:// # parse domain from the security_origin, which is equivalent to a url domain_info = self.url_parser.get_parsed_domain_info('http://'+cookie['domain']) if domain_info['success'] == False: err_msg = 'unable to parse domain info for %s with error %s' % (cookie['domain'], domain_info['result']) if self.debug: print(err_msg) self.sql_driver.log_error({ 'client_id' : client_id, 'target' : start_url, 'task' : 'output_store', 'msg' : err_msg }) continue else: # self.sql_driver.add_domain both stores the new domain and returns its db row id # if it is already in db just return the existing id cookie['domain_id'] = self.sql_driver.add_domain(domain_info['result']) # mark if third-party cookie if final_url_domain != domain_info['result']['domain']: cookie['is_3p'] = True else: cookie['is_3p'] = False # key to page cookie['page_id'] = page_id # fix var names cookie['http_only'] = cookie['httpOnly'] # attempt to convert cookie expiry from timestamp to datetime object, note we # need try/except as python datetime object cannot have year > 9999 and some # cookies do that cookie['expires_timestamp'] = None if cookie['expires']: try: cookie['expires_timestamp'] = datetime.fromtimestamp(cookie['expires']) except: pass # this is optional, do fall-back if 'sameSite' in cookie: cookie['same_site'] = cookie['sameSite'] else: cookie['same_site'] = None # see if this cookie was set via http response if cookie['domain'][0] == '.': cookie_tuple = (cookie['domain'][1:],cookie['name']) else: cookie_tuple = (cookie['domain'],cookie['name']) if cookie_tuple in http_cookies: cookie['is_set_by_response'] = True else: cookie['is_set_by_response'] = False # all done with this cookie self.sql_driver.add_cookie(cookie) # update domains if cookie['is_3p']: page_3p_cookie_domains.add((domain_info['result']['domain'],domain_info['result']['domain_owner_id'])) if self.debug: print('done storing scan %s' % browser_output['start_url']) return { 'success' : True, 'page_id' : page_id, 'page_3p_request_domains' : page_3p_request_domains, 'page_3p_response_domains' : page_3p_response_domains, 'page_3p_websocket_domains' : page_3p_websocket_domains, 'page_3p_dom_storage_domains' : page_3p_dom_storage_domains, 'page_3p_cookie_domains' : page_3p_cookie_domains } # store_scan def store_file(self,body,is_base64,type): """ Hashes and stores file, returns file_md5. """ # in theory we shouldn't get here if it is base64, so this is a fail-safe check if not self.config['store_base64']: if is_base64 or type.lower()=='image': return None # note hash is on original data, which we modify to remove \x00 before we store file_md5 = hashlib.md5(body.encode()).hexdigest() # store to db, note query will be ignored on conflict # but since we calculate the md5 as above that is fine self.sql_driver.add_file({ 'md5' : file_md5, 'body' : body.replace('\x00','NULL_REPLACED_FOR_PSQL'), 'type' : type.lower(), 'is_base64' : is_base64 }) return file_md5 # store_file def store_policy(self, browser_output, client_id, client_ip=None): """ We attempt to figure out if the text provided is a policy, if so we store it to the database. 
""" # keep values in a dict here policy = {} # attempt to get_policy was a success, extract data from # dict, since postgres cannot handle '\x00' we convert to # string for several fields and use .replace('\x00',' ') to # clean the input policy['client_id'] = client_id policy['client_ip'] = client_ip policy['browser_type'] = browser_output['browser_type'] policy['browser_version'] = browser_output['browser_version'] policy['browser_prewait'] = browser_output['prewait'] policy['start_url'] = browser_output['start_url'] policy['final_url'] = browser_output['final_url'] policy['title'] = browser_output['title'] policy['meta_desc'] = browser_output['meta_desc'] policy['lang'] = browser_output['lang'] policy['fk_score'] = None policy['fre_score'] = None policy['word_count'] = None policy['type'] = None policy['match_term'] = None policy['match_text'] = None policy['match_text_type'] = None policy['confidence'] = None policy['page_text_id'] = None policy['page_source_md5'] = None # if readability failed we bail if not browser_output['readability_html'] or not browser_output['page_text']: self.sql_driver.close() return { 'success' : False, 'result' : 'No readability result' } # ignore any malformed unicode characters readability_html = browser_output['readability_html'].encode('utf-8', 'ignore').decode().strip() page_text = browser_output['page_text'].encode('utf-8', 'ignore').decode().strip() page_source = browser_output['page_source'].encode('utf-8', 'ignore').decode() # bail on empty text if len(page_text) == 0: self.sql_driver.close() return { 'success' : False, 'result' : 'Empty page text' } # load the source into lxml so we can do additional processing, # if we fail we bail try: lxml_doc = lxml.html.fromstring(readability_html) except: return ({ 'success': False, 'result': 'Could not parse readability_html with lxml' }) # if the text is less than 500 words we ignore it if len(page_text.split(' ')) < 500: self.sql_driver.close() return { 'success' : False, 'result' : 'Page text < 500 words' } # once we have the text we figure out if it is # a policy, start false, override on match is_policy = False # first look for matches on page title # we give this confidence of 100 as it is # definitely a match if policy['title']: policy_type_result = self.determine_policy_type_from_text(policy['title']) if policy_type_result['success'] == True: is_policy = True policy['type'] = policy_type_result['result']['policy_type'] policy['match_term'] = policy_type_result['result']['match_term'] policy['match_text'] = policy_type_result['result']['match_text'] policy['match_text_type'] = 'title' policy['confidence'] = 100 # deep checks may generate false positives so # they have confidence of 0 until they can # be verified, note we may do this here # or later on deep_checks = True if deep_checks: policy['confidence'] = 0 # convert the url path to a sentence by replacing # common delimiters with spaces and attempt matches if self.debug: print('going to do checks on url path') if not is_policy: url_path_string = re.sub('[-|_|/|\.]',' ',urlsplit(policy['start_url']).path) if len(url_path_string) > 0: policy_type_result = self.determine_policy_type_from_text(url_path_string) if policy_type_result['success'] == True: is_policy = True policy['type'] = policy_type_result['result']['policy_type'] policy['match_term'] = policy_type_result['result']['match_term'] policy['match_text'] = policy_type_result['result']['match_text'] policy['match_text_type'] = 'url_path' if self.debug: print('going to do checks on meta desc') 
if not is_policy and policy['meta_desc']: policy_type_result = self.determine_policy_type_from_text(policy['meta_desc']) if policy_type_result['success'] == True: is_policy = True policy['type'] = policy_type_result['result']['policy_type'] policy['match_term'] = policy_type_result['result']['match_term'] policy['match_text'] = policy_type_result['result']['match_text'] policy['match_text_type'] = 'meta_desc' # iterate over all types of heading tags to extract text # and check for policy matches. note we go in order of # importance (eg h1->h7->span,etc) if self.debug: print('going to do checks on heading tags') if not is_policy: for tag_type in ['h1','h2','h3','h4','h5','h6','h7','span','strong','em']: if is_policy: break tags = lxml_doc.cssselect(tag_type) if len(tags) > 0: for tag in tags: tag_text = tag.text_content() # if it is > 15 words it is likely not a heading if len(tag_text.split(' ')) > 15: break policy_type_result = self.determine_policy_type_from_text(tag_text) if policy_type_result['success'] == True: is_policy = True policy['type'] = policy_type_result['result']['policy_type'] policy['match_term'] = policy_type_result['result']['match_term'] policy['match_text'] = policy_type_result['result']['match_text'] policy['match_text_type'] = tag_type # if it is a policy we do additional processing # before storing in db, otherwise we fail # gracefully if is_policy: if self.debug: print('going to store readability_html') readability_source_md5 = self.store_file(readability_html, False, 'readability_html') # store_page_text handles some additional operations if self.debug: print('going to store page_text') policy['page_text_id'] = self.store_page_text(readability_html, readability_source_md5) if self.debug: print(f"page_text_id is {policy['page_text_id']}") if self.debug: print('going to store page_source') policy['page_source_md5'] = self.store_file(page_source, False, 'page_source') if self.debug: print('going to do reading ease scores') # get readability scores, scores below zero are # invalid so we null them policy['fre_score'] = textstat.flesch_reading_ease(page_text) if policy['fre_score'] <= 0: policy['fre_score'] = None policy['fk_score'] = textstat.flesch_kincaid_grade(page_text) if policy['fk_score'] <= 0: policy['fk_score'] = None if self.debug: print('going to store policy') # add to db and get id for this policy policy_id = self.sql_driver.add_policy(policy) if self.debug: print('going to link policy to pages') # attach policy to all pages that link to this url, note we can filter # to only do internal links for page_id, crawl_id in self.sql_driver.get_page_ids_from_link_url(policy['start_url'],internal_links_only=True): self.sql_driver.attach_policy_to_page(policy_id,page_id) self.sql_driver.attach_policy_to_crawl(policy_id,crawl_id) if self.debug: print(f'\t👍 Success: {policy["start_url"]}') self.sql_driver.close() return {'success': True} else: if self.debug: print(f'\t👎 Fail: {policy["start_url"]}') self.sql_driver.close() return { 'success': False, 'result': 'Not policy' } # store_policy def determine_policy_type_from_text(self, text): """ Determine if a given text fragment indicates a given type of policy. Returns dict.
""" # clear whitespace text = re.sub('\s+',' ',text) # retrieve values from policy_terms.json policy_verification_terms = self.utilities.get_policy_verification_terms() policy_type_keys = [] for key in policy_verification_terms: policy_type_keys.append(key) # randomize the order we do our checks random.shuffle(policy_type_keys) # look for matches against verification terms for policy_type in policy_type_keys: for term in policy_verification_terms[policy_type]: if term in text.lower(): return({ 'success': True, 'result' :{ 'policy_type': policy_type, 'match_term': term, 'match_text': text } }) # no match return ({'success': False}) # determine_policy_type_from_text def store_page_text(self,readability_html,readability_source_md5): # the actual 'page_text' output from readability doesn't properly separate words # that are delimited only by markup. eg '<h3>this</h3><p>that</p>' becomes 'thisthat' # whereas 'this that' is what a user would see in the browser # to overcome the above issue we have to manually strip out the html and do some # cleaning of our own. page_text = re.sub('<!--.+-->',' ', readability_html) page_text = re.sub('<svg.+</svg>',' ', page_text) page_text = re.sub('<.+?>', ' ', page_text) page_text = re.sub('[\n|\r]', ' ', page_text) page_text = re.sub('\s+', ' ', page_text) page_text = unicodedata.normalize('NFKD',html.unescape(page_text.strip())) # postgres can't handle nulls page_text = page_text.replace('\x00','NULL_REPLACED_FOR_PSQL') # return the id return self.sql_driver.add_page_text({ 'text' : page_text.replace('\x00',' '), 'word_count' : len(page_text.split()), 'readability_source_md5' : readability_source_md5 })