Ejemplo n.º 1
0
    def process_uri(self, uri):
        sql_driver = MySQLDriver(self.db_name)
        output_store = OutputStore(self.db_name)
        phantom_driver = PhantomDriver(
            '--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

        # this can be higher or lower depending on network load
        # generally, 90 seems to be fine, so keep with it
        try:
            phantom_output = phantom_driver.execute(uri, 90)
        except:
            print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
            sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
            return

        if re.match('^FAIL.+', phantom_output):
            print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
            sql_driver.log_error(uri, phantom_output)
        else:
            print("\t\t%-50s %s" %
                  (uri[:50], output_store.store(uri, phantom_output)))

        # closes our db connections
        sql_driver.close()
        output_store.close()
        return
Ejemplo n.º 2
0
	def process_uri(self, uri):
		sql_driver 		= MySQLDriver(self.db_name)
		output_store 	= OutputStore(self.db_name)
		phantom_driver 	= PhantomDriver('--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

		# this can be higher or lower depending on network load
		# generally, 90 seems to be fine, so keep with it
		try:
			phantom_output = phantom_driver.execute(uri, 90)
		except:
			print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
			sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
			return	

		if re.match('^FAIL.+', phantom_output):
			print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
			sql_driver.log_error(uri, phantom_output)
		else:
			print("\t\t%-50s %s" % (uri[:50], output_store.store(uri, phantom_output)))
	
		# closes our db connections
		sql_driver.close()
		output_store.close()
		return
Ejemplo n.º 3
0
class OutputStore:
	def __init__(self, dbname):
		self.uri_parser = ParseURI()
		self.sql_driver = MySQLDriver(dbname)
	# end init

	def store(self, uri, phantom_output):
		# parse out the json from our phantom_output
		# sometimes phantom prints out errors before the json, (despite us turning
		# it off!), so we match inside of the {}s to get json only
		try:
			data = json.loads(re.search('(\{.+\})', phantom_output).group(1))
		except Exception as e:
			self.sql_driver.log_error(uri, "Could Not Load JSON: %s" % e)
			return 'Could Not Load JSON'

		# we need to parse the domain to determine if requests are local or 3rd party
		# we need pubsuffix and tld for later analysis so store them now

		origin_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(uri)
		origin_domain = origin_domain_pubsuffix_tld[0]
		origin_pubsuffix = origin_domain_pubsuffix_tld[1]
		origin_tld = origin_domain_pubsuffix_tld[2]
		
		if re.match('^Exception.+', origin_domain):
			self.sql_driver.log_error(uri, 'Could not parse TLD for %s' % uri)
			return 'Could not parse TLD for %s' % uri

		page_domain_id = self.sql_driver.add_domain(origin_domain, origin_pubsuffix, origin_tld)

		# newFollowLogger is giving us the follow JSON data:
		# note source is null for now, but store anyway
# 		big_out = {
# 			final_uri: final_uri,
# 			title: page.title,
# 			meta_desc : meta_desc,
# 			requested_uris: JSON.stringify(requested_uris),
# 			received_uris: JSON.stringify(received_uris),
# 			cookies: phantom.cookies,
# 			source: 'NULL',
# 		};

		# we are now storing uris with and without args and saving args
		# we must unquote to uri to get back to original state so can parse
		start_uri_original = urllib.parse.unquote(uri)

		try:
			start_uri_no_args = re.search('^(.+?)\?.+$', start_uri_original).group(1) # start uri no args
		except:
			start_uri_no_args = uri

		try:
			start_uri_args = re.search('^.+(\?.+)$', start_uri_original).group(1) # start uri args
		except:
			start_uri_args = 'NULL'
		
		# same for the final uri (this is where we are after potential redirects)
		final_uri = re.sub('\"', '', json.dumps(data["final_uri"]))
		final_uri_original = urllib.parse.unquote(final_uri)
		
		try:
			final_uri_no_args = re.search('^(.+?)\?.+$', final_uri_original).group(1) # start uri no args
		except:
			final_uri_no_args = final_uri

		try:
			final_uri_args = re.search('^.+(\?.+)$', final_uri_original).group(1) # start uri args
		except:
			final_uri_args = 'NULL'
	
		# add page
		# json.dumps to make sure strings go out ok for db
		page_id = self.sql_driver.add_page(								 
			str(re.sub('\"', '', json.dumps(data["title"]))),
			str(re.sub('\"', '', json.dumps(data["meta_desc"]))),
			uri, 
			start_uri_no_args, 
			start_uri_args,
			final_uri, 
			final_uri_no_args, 
			final_uri_args,
			str(re.sub('\"', '', json.dumps(data["source"]))),
			str(re.sub('\"', '', json.dumps(data["requested_uris"]))),
			str(re.sub('\"', '', json.dumps(data["received_uris"]))),
			page_domain_id)

		for cookie in data["cookies"]:
			# store external cookies, uri_parser fails on non-http, we should fix this
			# right now a lame hack is to prepend http://
			cookie_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld("http://"+cookie["domain"])
			cookie_domain = cookie_domain_pubsuffix_tld[0]
			cookie_pubsuffix = cookie_domain_pubsuffix_tld[1]
			cookie_tld = cookie_domain_pubsuffix_tld[2]

			# something went wrong, but carry on...
			if re.match('^Exception.+', cookie_domain):
				self.sql_driver.log_error(uri, 'Error parsing cookie: '+cookie_domain)
				continue

			# this is a 3party cookie
			if origin_domain != cookie_domain:

				cookie_domain_id = self.sql_driver.add_domain(cookie_domain, cookie_pubsuffix, cookie_tld)
			
				# name and domain are required, so if they fail we just continue
				try: name = cookie["name"]
				except: continue
			
				try: domain = cookie_domain
				except: continue
			
				# these are optional, keep going with "N/A" vals
			
				try: secure = cookie["secure"]
				except: secure = "N/A"
			
				try: path = cookie["path"]
				except: path = "N/A"
			
				try: expires = cookie["expires"]
				except: expires = "N/A"
			
				try: httponly = cookie["httponly"]
				except: httponly = "N/A"
			
				try: expiry = cookie["expiry"]
				except: expiry = "N/A"
			
				try: value = cookie["value"]
				except: value = "N/A"
			
				cookie_id = self.sql_driver.add_cookie(	name, secure, path, domain, 
														expires, httponly, expiry, value, 
														cookie_domain_id)
				self.sql_driver.add_cookie_to_page(cookie_id, page_id)

		for request in data["requested_uris"]:
			# if the request starts with "data" we can't parse tld anyway, so skip
			if re.match('^(data|about|chrome).+', request):
				continue

			# get domain, pubsuffix, and tld from request
			requested_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(request)
			requested_domain = requested_domain_pubsuffix_tld[0]
			requested_pubsuffix = requested_domain_pubsuffix_tld[1]
			requested_tld = requested_domain_pubsuffix_tld[2]
			
			# see if we got back what we requested, if not a few things may have happened
			# 	* malformed uri
			#	* resource is gone or never existed
			#	* network latency (ie it didn't arrive in window specified)
			#	* we could be behind a firewall or censorship mechanism (eg gfw, golden shield)
			#	* our IP is blacklisted b/c we are totally a bot X-D
			# the point being, interpret this value with an open mind
			
			if request in data['received_uris']:
				recieved = '1'
			else:
				recieved = '0'
			
			# catch exceptions
			if re.match('^Exception.+', requested_domain):
				self.sql_driver.log_error(uri, 'Error parsing element request: '+request)
				continue

			# store new elements
			if origin_domain != requested_domain:
				full_uri = request
				try:
					element_uri = re.search('^(.+?)\?.+$', full_uri).group(1) # start uri no args
				except:
					element_uri = full_uri

				# attempt to parse off the extension
				try:
					element_extension = re.search('\.([0-9A-Za-z]+)$', element_uri).group(1).lower()
				except:
					element_extension = "NULL"
				
				# figure out what type of element it is
				if element_extension == 'png' or element_extension == 'jpg' or element_extension == 'jpgx' or element_extension == 'jpeg' or element_extension == 'gif' or element_extension == 'svg' or element_extension == 'bmp' or element_extension == 'tif' or element_extension == 'tiff' or element_extension == 'webp' or element_extension == 'srf':
					element_type = 'image'
				elif element_extension == 'js' or element_extension == 'javascript':
					element_type = 'javascript'
				elif element_extension == 'json' or element_extension == 'jsonp' or element_extension == 'xml':
					element_type = 'data_structured'
				elif element_extension == 'css':
					element_type = 'style_sheet'
				elif element_extension == 'woff' or  element_extension == 'ttf' or  element_extension == 'otf':
					element_type = 'font'
				elif element_extension == 'htm' or element_extension == 'html' or element_extension == 'shtml':
					element_type = 'page_static'
				elif element_extension == 'php' or element_extension == 'asp' or element_extension == 'jsp' or element_extension == 'aspx' or element_extension == 'ashx' or element_extension == 'pl' or element_extension == 'cgi' or element_extension == 'fcgi':
					element_type = 'page_dynamic'
				elif element_extension == 'swf' or element_extension == 'fla':
					element_type = 'Shockwave Flash'
				elif element_extension == 'NULL':
					element_type = 'NULL'
				else:
					element_type = 'unknown'

				try:
					args = re.search('^.+(\?.+)$', full_uri).group(1) # start uri args
				except:
					args = 'NULL'

				element_domain_id = self.sql_driver.add_domain(requested_domain, requested_pubsuffix, requested_tld)

				element_id = self.sql_driver.add_element("NULL", full_uri, element_uri, recieved, element_extension, element_type, args, element_domain_id)
				self.sql_driver.add_element_to_page(element_id, page_id)

		return 'Successfully Added to DB'
	# end report()
	
	def close(self):
		# close mysql connections
		self.uri_parser.close()
		self.sql_driver.close()
		return
Ejemplo n.º 4
0
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in the browser (currently phantomjs)
			and returns json-formatted output with relevant request data, etc.

		the output_store class then puts this data in the db for later analysis
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# output store does the heavy lifting of analyzing browser output and storing to db
		output_store = OutputStore(self.db_engine, self.db_name)

		# support for loading same page with multiple browsers - purposefully undocumented 
		for browser_type in self.browser_types:

			# import and set up specified browser driver
			# 	note we need to set up a new browser each time to 
			#	get a fresh profile
			if browser_type == 'phantomjs':
				browser_driver 	= PhantomDriver()
			elif browser_type == 'chrome':
				browser_driver 	= ChromeDriver(ua=self.chrome_ua)

			# support for timeseries collections - purposefully undocumented 
			if self.allow_timeseries:
				page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
				if page_last_accessed_browser_type:
					time_diff = datetime.now()-page_last_accessed_browser_type[0]
					if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
						print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
						continue

			# attempt to load the page, fail gracefully
			try:
				browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
			except:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return
			
			# if there was a problem we log the error
			if browser_output['success'] == False:
				print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return
			else:
				# no error, treat result as browser output
				browser_output = browser_output['result']

			# attempt to store the output
			if output_store.store(url, browser_output):
				print('\t\t%-50s Success with %s' % (url[:50],browser_type))
			else:
				print('\t\t%-50s Fail with %s' % (url[:50],browser_type))
				sql_driver.log_error(url, 'Unable to load page')

		sql_driver.close()
		return
Ejemplo n.º 5
0
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in the browser (currently phantomjs)
			and returns json-formatted output with relevant request data, etc.

		the output_store class then puts this data in the db for later analysis
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':		
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':	
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':	
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)

		# output store does the heavy lifting of analyzing browser output and storing to db
		output_store = OutputStore(self.db_engine, self.db_name)

		# support for loading same page with multiple browsers - purposefully undocumented 
		for browser_type in self.browser_types:

			# import and set up specified browser driver
			# 	note we need to set up a new browser each time to 
			#	get a fresh profile
			if browser_type == 'phantomjs':
				browser_driver 	= PhantomDriver()
			elif browser_type == 'chrome':
				browser_driver 	= ChromeDriver(ua=self.chrome_ua)

			# support for timeseries collections - purposefully undocumented 
			if self.allow_timeseries:
				page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url,browser_type)
				if page_last_accessed_browser_type:
					time_diff = datetime.now()-page_last_accessed_browser_type[0]
					if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
						print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
						continue

			# attempt to load the page, fail gracefully
			try:
				browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
			except:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return		
			
			# if there was a problem browser_output will be None
			if browser_output == None:
				print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
				sql_driver.log_error(url, 'Unable to load page')
				sql_driver.close()
				return

			# attempt to store the output
			if output_store.store(url, browser_output):
				print('\t\t%-50s Success with %s' % (url[:50],browser_type))
			else:
				print('\t\t%-50s Fail with %s' % (url[:50],browser_type))
				sql_driver.log_error(url, 'Unable to load page')

		sql_driver.close()
		return
Ejemplo n.º 6
0
class Reporter:
	def __init__(self, db_name, num_tlds, num_results, tracker_threshold = 0):
		self.db_name 			= db_name
		self.sql_driver 		= MySQLDriver(self.db_name)
		self.num_tlds 			= num_tlds
		self.num_results 		= num_results
		self.tracker_threshold	= tracker_threshold
		self.startTime			= datetime.now()
		self.pages_ok_count		= self.sql_driver.pages_ok_count()
		
		print('\t=============================')
		print('\t Checking Output Directories ')
		print('\t=============================')				
		
		self.setup_report_dir()
		
		print('\t===========================')
		print('\t Patching DB with Org Data ')
		print('\t===========================')		
		# update the domains to their owners	
		self.patch_org_data()
		print('\t\tSuccess!')
		
		# if num_tlds is 0 we don't care about sub-analysis on the tlds (which takes time)
		# so we just push in the wildcard with the total page count
		# note that this is the *default* behavior
		if self.num_tlds == 0:
			self.top_tlds = []
			self.top_tlds.append(('*',self.pages_ok_count))
		# otherwise, we *do* care about the tlds, so get them
		else:
			print('\t=====================')
			print('\t Getting top %s tlds' % self.num_tlds)
			print('\t=====================')
			print('\t\tProcessing...')
			self.top_tlds = self.get_top_tlds(self.num_tlds)
			print('\t\tSuccess!')
			print('\t\tThe top tlds are:')
			for (tld, pages) in self.top_tlds:
				print('\t\t |- %s (%s)' % (tld,pages))

		# SPECIAL SAUCE, FOR EXPERTS: tracker domains!
		#
		# idea for this is you set a threshold of the number of sites a given domain
		#	is connected to - domains connecting to many sites may correlate those visits
		#	via referer strings etc, so we call these 'tracker domains'
		#
		# on a really large set of sites (e.g. 1M+) this works well but on small samples
		#  (e.g. 500) it doesn't work well at all as known tracker domains may only
		#  appear on a single site
		# 
		# this is off by default and unless you understand what you are doing...
		# 	DON'T USE THIS!
		#
		# longer-term we may want to train off a bigger corpus to find tracker domains and
		#	have them prepackaged
		#
		if tracker_threshold:
			print('\t=========================')
			print('\t Getting tracker domains ')
			print('\t=========================')
			print('\t\tProcessing...')
			self.tracker_domains = self.get_tracker_domains(self.tracker_threshold)
			print('\t\tSuccess!')
		else:
			# set to False so various downstream operations get skipped
			self.tracker_domains = False

	# end __init__

	#########################
	#	HELPERS, GENERAL	#
	#########################

	def setup_report_dir(self):
		# create directory for where the reports go if not exist
		if os.path.exists('./reports') == False:
			print('\t\tMaking global reports directory at ./reports.')
			os.makedirs('./reports')
		
		# set global report_path, trim off the wbxr_ prefix
		self.report_path = './reports/'+self.db_name[5:]
	
		# set up subdir for this database analysis
		if os.path.exists(self.report_path) == False:
			print('\t\tMaking subdirectory for reports at %s' % self.report_path)
			os.makedirs(self.report_path)

		# just a notice
		print('\t\tStoring output in %s' % self.report_path)
	# setup_report_dir

	# reuse this a lot
	def write_csv(self, file_name, csv_rows):
		full_file_path = self.report_path+'/'+file_name 
		file_out = open(full_file_path, 'w')
		for row in csv_rows:
			file_out.write(row)
		file_out.close()
		print('\t\t'+'*'*40)
		print('\t\tOutput written to %s' % full_file_path)
	# write_csv

	# just fyi
	def print_runtime(self):
		print('~='*40)
		print("Finished!")
		print("Time to process: "+str(datetime.now()-self.startTime)+"\n")
		print('-'*80)
	# end print_runtime

	# X-(
	def fatal(self, msg):
		print('FATAL ERROR: %s' % msg)
		print('EXITING.')
		exit()
	# fatal

	#############################
	#	HELPERS, DATABASE/INIT	#
	#############################

	def patch_org_data(self):
		# in order to analyze what entities receive user data, we need to update
		#   the database with domain ownership records we have store previously
		#
		# we first clear out what is in there in case the new data has changed
		#    perhaps make this optional, on big dbs takes a while
		#
		print('\t\tFlushing extant org data...')
		self.sql_driver.reset_domains_orgs()

		# next we pull the org/domain pairings from the json file in the resources dir
		# and add to the db
		print('\t\tPatching with new org data...')
		raw_data = open(os.path.join(os.path.dirname(__file__), './resources/org_domains/org_domains.json'), 'r')
		json_data = json.load(raw_data)
	
		# the default id for orgs is 1, so we advance from there
		id = 1
		for item in json_data:
			id += 1
			self.sql_driver.add_org(id, item['organization'], item['notes'], item['country'])
			for domain in item['domains']:
				self.sql_driver.update_domain_org(id, domain)
	# end patch_org_data

	def get_top_tlds(self, limit, type = 'tld'):
		# finds the most common tlds from all the pages
		# type is default to tld, but pubsuffix also works
		# have to do some weird sorting b/c python is arbitrary and f***s up diff tests
		# returns array

		tlds = []

		for row in self.sql_driver.get_all_tlds(type):
			tlds.append(row[0])

		top_tlds = collections.Counter(tlds).most_common()
	
		# sort alphabetical
		top_tlds.sort()
	
		# sub-sort on num occurances
		top_tlds.sort(reverse=True, key=lambda item:item[1])

		# cut the array
		top_tlds = top_tlds[0:limit]

		# push in wild cards
		top_tlds.insert(0, ('*',self.pages_ok_count))
		
		return top_tlds
	# end get_top_tlds

	def get_tracker_domains(self, threshold):
		# NOTE: first determines all pairings of page domains and element domains
		#	note this is then unique on SITES, not on PAGES
		#	e.g. if you have several pages from the same site these links only
		#	count once
		#
		# RETURNS: list of domains which link at least the threshold number of sites
		all_domains = []
		for page_domain_element_domain in self.sql_driver.get_page_domain_element_domain_pairs():
			all_domains.append(page_domain_element_domain[1])

		# count up all the pairs, convert to items() so can process as tuples
		domain_counts = collections.Counter(all_domains).items()

		# put the return values here
		tracker_domains = []

		# check against threshold
		for domain_count in domain_counts:
			if domain_count[1] >= threshold:
				tracker_domains.append(domain_count[0])

		# EDGE CASE
		# 	likely due to a large threshold we have no tracker domains,
		#	so we throw warning and log error
		if len(tracker_domains) == 0:
			self.sql_driver.log_error('Analaysis Warning', 'Tracker Threshold of %s resulted in no tracking domains.' % threshold)
			print('\t\t-----------WARNING-----------')
			print('\t\tTracker Threshold of %s resulted in no tracking domains.' % threshold)
			print('\t\t-----------------------------')

		return tracker_domains
	# get_tracker_domains

	#########################
	# 	REPORTS, GENERAL	#
	#########################

	def header(self):
		# just outputs really basic data about how many records in db, etc.
		#
		print('\t=================')
		print('\t General Summary')
		print('\t=================')
	
		output_for_csv = []

		#write csv header
		output_for_csv.append('"Item","Value"\n')

		total_pages_ok = self.sql_driver.pages_ok_count()

		print("\t\tTotal Pages OK:\t\t\t%s" % total_pages_ok)
	
		output_for_csv.append('"Total Pages OK","%s"\n' % total_pages_ok)
	
		total_pages_noload = self.sql_driver.pages_noload_count()
		total_pages_attempted = total_pages_ok + total_pages_noload
	
		print("\t\tTotal Pages FAIL:\t\t%s" % total_pages_noload)
		output_for_csv.append('"Total Pages FAIL","%s"\n' % total_pages_noload)
	
		print("\t\tTotal Pages Attempted:\t\t%s" % total_pages_attempted)
		output_for_csv.append('"Total Pages Attempted","%s"\n' % total_pages_attempted)
	
		percent_pages_OK = int((total_pages_ok/total_pages_attempted)*100)
	
		print("\t\t%% Pages OK:\t\t\t%s%%" % percent_pages_OK)
		output_for_csv.append('"%% Pages OK","%s"\n' % percent_pages_OK)
	
		total_errors = self.sql_driver.total_errors_count()
		print("\t\tTotal Errors:\t\t\t%s" % total_errors)
		output_for_csv.append('"Total Errors","%s"\n' % total_errors)
	
		total_cookies = self.sql_driver.total_cookie_count()
		print("\t\tTotal Cookies:\t\t\t%s" % total_cookies)
		output_for_csv.append('"Total Cookies","%s"\n' % total_cookies)
	
		total_pages_with_cookies = self.sql_driver.pages_w_cookie_count()
		print("\t\tPages with Cookies:\t\t%s" % total_pages_with_cookies)
		output_for_csv.append('"Pages with Cookies","%s"\n' % total_pages_with_cookies)

		percent_with_cookies = (total_pages_with_cookies/total_pages_ok)*100
		print("\t\t%% Pages with Cookies:\t\t%s%%" % int(percent_with_cookies))
		output_for_csv.append('"%% Pages with Cookies","%s"\n' % int(percent_with_cookies))
	
		total_elements = self.sql_driver.total_element_count()
		print("\t\tTotal Elements Requested:\t%s" % total_elements)
		output_for_csv.append('"Total Elements Requested","%s"\n' % total_elements)

		total_elements_received = self.sql_driver.total_element_count(received = True)
		print("\t\tTotal Elements Received:\t%s" % total_elements_received)
		output_for_csv.append('"Total Elements Received","%s"\n' % total_elements_received)

		percent_element_received = int((total_elements_received/total_elements)*100)
		print('\t\t%% Elements Received:\t\t%s%%' % percent_element_received)
		output_for_csv.append('"%% Elements Received","%s"\n' % percent_element_received)

		total_pages_with_elements = self.sql_driver.pages_w_element_count()
		print("\t\tPages with Elements:\t\t%s" % total_pages_with_elements)
		output_for_csv.append('"Pages with Elements","%s"\n' % total_pages_with_elements)

		percent_with_elements = (total_pages_with_elements/total_pages_ok)*100
		print("\t\t%% Pages with Elements:\t\t%s%%" % int(percent_with_elements))
		output_for_csv.append('"%% Pages With Elements","%s"\n' % int(percent_with_elements))
		
		self.write_csv('db_summary.csv', output_for_csv)
		print('\t'+'-'*80+'\n')
	# header

	def get_network_ties(self):
		# this report generates data necessary for graph/network analysis by
		#	outputting a list of page domains and the elements/orgs they connect to

		print('\t=============================')
		print('\t Processing Network Ties ')
		print('\t=============================')

		# put output here
		output_for_csv = []
		
		# header row for csv		
		output_for_csv.append('"page_domain","3p_org","3p_domain"\n')
		
		# sql_driver.get_network_ties returns a set of tuples in the format
		#	(page domain, page org, element domain, element org)
		#	we just go through this data to produce the report
		# note: the report is currently omitting page org, but it can be altered easily
		
		for edge in self.sql_driver.get_network_ties():
			# if a page has no elements, edge[2] will be 'None' so we skip it
			#	an alternate approach would be to include as orphan nodes
			if edge[2]: output_for_csv.append('"%s","%s","%s",\n' % (edge[0],edge[3],edge[2]))

		# done!
		self.write_csv('network.csv', output_for_csv)
		print('\t'+'-'*80+'\n')
	# get_network_ties		
	
	def get_summary_by_tld(self):
		print('\t=============================')
		print('\t Processing Summaries by TLD ')
		print('\t=============================')
		output_for_csv = []
		output_for_csv.append('"TLD","N","% TOTAL","N W/3PE","% W/3PE","N W/COOKIE","% W/COOKIE","N W/JS","% W/JS","3P DOMAIN MEAN","3P DOMAIN MEDIAN","3P DOMAIN MODE","TRACKER FILTER DEPTH"\n')

		# now do per-tld numbers
		for tld in self.top_tlds:
			print('\t\tGetting summary for %s' % tld[0])

			if tld[0] != '*':
				tld_filter = tld[0]
			else:
				tld_filter = ''

			total_pages 			= self.sql_driver.get_complex_page_count(tld_filter)
			total_pages_percent 	= (total_pages/self.pages_ok_count)*100
			total_pages_elements 	= self.sql_driver.get_complex_page_count(tld_filter, 'elements', self.tracker_domains)
			percent_with_elements 	= (total_pages_elements/total_pages)*100
			total_pages_cookies 	= self.sql_driver.get_complex_page_count(tld_filter, 'cookies', self.tracker_domains)
			percent_with_cookies 	= (total_pages_cookies/total_pages)*100
			total_pages_js 			= self.sql_driver.get_complex_page_count(tld_filter, 'javascript', self.tracker_domains)
			percent_with_js 		= (total_pages_js/total_pages)*100

			if self.tracker_threshold:
				filter_depth = self.tracker_threshold
			else:
				filter_depth = 'No Tracker Filter Used'

			stats 	= self.get_page_3p_stats(tld[0])
			mean 	= stats[0]
			median 	= stats[1]
			mode 	= stats[2]

			output_for_csv.append('"%s","%s","%.2f","%s","%.2f","%s","%.2f","%s","%.2f","%.2f","%s","%s","%s"\n' % (
				tld[0], 
				total_pages, 
				total_pages_percent, 
				total_pages_elements, 
				percent_with_elements, 
				total_pages_cookies, 
				percent_with_cookies, 
				total_pages_js, 
				percent_with_js,
				mean,
				median,
				mode,
				filter_depth))		
	
		self.write_csv('summary_by_tld.csv', output_for_csv)
	# end get_summary_by_tld
	
	#####################
	#	REPORTS, MAIN	#
	#####################

	def get_reports_by_tld(self, type='', sub_type=''):
		print('\t=============================')
		print('\tProcessing Top %s %s %s' % (self.num_results, type, sub_type))
		print('\t=============================')
	
		# keep output here
		csv_output = []

		# write out the header row for the csv
		if type is 'elements':
			csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country", "Element","Extension","Type","Domain","Total Pages","Raw Count","Percent Total"\n')
		elif type is 'domains':
			csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Domain","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
		elif type is 'orgs':
			csv_output.append('"TLD","TLD Rank","Intra-TLD Rank","Organization","Country","Number of Pages","Raw Count","Percent Total"\n')
		else:
			self.fatal('Wrong type specified in get_reports_by_tld, must be "elements", "domains", or "orgs".')

		tld_count = 0
	
		for tld in self.top_tlds:
			current_tld = tld[0]
			total_pages = tld[1]

			print('\t\tcurrently on: '+current_tld)
		
			# filter on current page tld
			if tld[0] != '*':
				tld_filter = tld[0]
				tld_count += 1
			else:
				tld_filter = ''

			# get results with specified filter
			results_rows = self.get_results_rows(total_pages, type, sub_type, tld_filter)

			current_row = 0
			# loop results
			for result_row in results_rows:
				current_row += 1
				if type is 'elements':
					csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
						current_tld,
						tld_count,
						current_row, 
						result_row[0][0], 	# org_name
						result_row[0][1], 	# country
						result_row[0][2], 	# element
						result_row[0][3], 	# extension
						result_row[0][4], 	# type
						result_row[0][5], 	# domain
						total_pages,
						result_row[1], 		# raw_count
						(result_row[1]/total_pages)*100)
				elif type is 'domains':
					total_item = result_row[1]
					csv_row = '"%s","%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
						current_tld,
						tld_count,
						current_row, 
						result_row[0][0], 	# domain_name
						result_row[0][1], 	# org_name
						result_row[0][2], 	# org_country
						total_pages, 
						total_item, 
						(total_item/total_pages)*100)
				elif type is 'orgs':
					total_item = result_row[1]
					csv_row = '"%s","%s","%s","%s","%s","%s","%s","%.2f"\n' % (
						current_tld,
						tld_count,
						current_row, 
						result_row[0][0], 		# org_name
						result_row[0][1], 		# org_country
						total_pages,
						total_item, 
						(total_item/total_pages)*100)
				csv_output.append(csv_row)
				if current_row >= self.num_results: break

		# write out csv
		file_name = type + '-by-tld'
	
		# this really only applied to elements at present
		if sub_type:
			file_name += '-' + sub_type
		
		file_name += '.csv'	

		# store the report
		self.write_csv(file_name, csv_output)
	# end get_reports_by_tld	

	def get_results_rows(self, total_pages, type, sub_type = '', tld_filter=''):
		# this queries the db to get all elements, domains, or orgs
		# next they are counted to find the most common
		# and formatted to csv rows and returned
		
		# query the db
		if type is 'elements':
			# rows are (page.start_uri, org.name, element.element_uri, element.extension, element.type, element_domain.domain)
			query_results = self.sql_driver.get_elements(tld_filter, sub_type)
		elif type is 'domains':
			# rows are (page.start_uri, element_domain.domain, org.name)
			query_results = self.sql_driver.get_domains(tld_filter)
		elif type is 'orgs':
			# row are page.start_uri, org.name
			query_results = self.sql_driver.get_orgs(tld_filter)
		else:
			self.fatal('Type must be elements, domains, or orgs.')

		# count up the unique elements, domains, or orgs we are looking for
		results_counter = collections.Counter()
		for row in query_results:
			# remove first element in tuple as it is the page uri, now irrelevant
			# add rest to results counter as this is what we care about now
			results_counter[row[1:]] += 1


		# python's most_common() arbitrarily re-orders items with same value, making
		#	debugging a nightmare, so we have to double sort here
	
		# convert to list we can sort
		results_counter = results_counter.most_common()

		# sort alphabetical
		results_counter.sort()

		# sub-sort on num occurrences
		results_counter.sort(reverse=True, key=lambda item:item[1])

		return results_counter
	# end get_results_rows
	
	def get_page_3p_stats(self, tld = ''):
		# This function calls get_page_element_domain_pairs to get a list of tuples
		#	st. each tuple is a unique (page address, domain of an element) paring
		# This list of tuples is then iterated so that we count how many domains
		#	each page is linked to
		# IMPORTANT: get_page_element_domain_pairs is *already* sorted by page, otherwise
		#	 loop below would not work
		
		if tld == '*':
			tld = ''
		
		# init vars
		this_page_element_count = 0
		all_page_element_counts = []
		last_page = ''

		# run query, process rows
		for row in self.sql_driver.get_page_uri_element_domain_pairs(tld):
			# page has no tackers, count is zero
			if not row[1]:
				all_page_element_counts.append(0)
			# page has trackers, add count
			else:
				# this is the same page, increment count
				if row[0] == last_page:
					if self.tracker_domains:
						if row[1] in self.tracker_domains:
							this_page_element_count += 1
					else:
						this_page_element_count += 1
				# this is a new page, store our running count, reset to 1
				# update last_page
				else:
					if last_page != '': all_page_element_counts.append(this_page_element_count)
					last_page = row[0]
					if self.tracker_domains:
						if row[1] in self.tracker_domains:
							this_page_element_count = 1
						else:
							this_page_element_count = 0
					else:
						this_page_element_count = 1

		# if we have an outstanding value, give it an increment
		if this_page_element_count != 0:
			# enter the last value into the list
			all_page_element_counts.append(this_page_element_count)

		# mean and median should always be ok
		mean 	= statistics.mean(all_page_element_counts)
		median 	= statistics.median(all_page_element_counts)

		# but mode can throw an error, so catch here
		try:
			mode = statistics.mode(all_page_element_counts)
		except:
			mode = 'NULL'	

		return(mean, median, mode)
	# get_page_3p_stats
# end class Reporter
Ejemplo n.º 7
0
    def store(self,
              url,
              browser_output,
              store_source=False,
              store_1p=True,
              get_file_hashes=False,
              hash_3p_only=False):
        """
		this is the primary function of this class,
		
		it takes the url of the given page and the request and cookie data generated
			by the browser

		data is cleaned up with some minor analysis (eg file types) and stored 
			for later in-depth analysis.
		
		there is an option to store first party requests as well as third, turned on by default
			to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
			and is turned off by default
		"""

        # open up a sql connection
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALED DB ENGINE FOR %s, QUITTING!' % db_engine)
            exit()

        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
            url)

        # if we can't get page domain info we fail gracefully
        if origin_ip_fqdn_domain_pubsuffix_tld is None:
            sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
            return False

        origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
        origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
        origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
        origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
        origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

        # sql_driver.add_domain both stores the new domain and returns its db row id
        # if it is already in db just return the existing id
        page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn,
                                               origin_domain, origin_pubsuffix,
                                               origin_tld)

        # figure out the privacy policy url and text, starts null
        priv_policy_url = None
        priv_policy_url_text = None

        # read in our list of privacy link terms from the json file in webxray/resources/policyxray
        privacy_policy_term_list = self.utilities.get_privacy_policy_term_list(
        )

        # we reverse links return from browser to check footer links first as that is where policy links tend to be
        all_links = browser_output['all_links']
        all_links.reverse()

        # if we have links search for privacy policy
        if len(all_links) > 0:
            # links are tuple
            for link_text, link_url in all_links:
                # makes sure we have text, skip links without
                if link_text:
                    # need lower for string matching
                    link_text = link_text.lower().strip()
                    # not a link we can use
                    if 'javascript' in link_text: continue
                    # see if the link_text is in our term list
                    if link_text in privacy_policy_term_list:
                        # if the link_url is relative this will convert to absolute
                        priv_policy_url = self.utilities.get_absolute_url_from_page_link(
                            url, link_url)
                        priv_policy_url_text = link_text
                        break

        # if the final page is https (often after a redirect), mark it appropriately
        if browser_output['final_url'][:5] == 'https':
            page_is_ssl = True
        else:
            page_is_ssl = False

        if store_source:
            # handles issue where postgres will crash on inserting null character
            source = browser_output['source'].replace('\x00', ' ')
        else:
            source = None

        # add page
        page_id = sql_driver.add_page(
            browser_output['browser_type'], browser_output['browser_version'],
            browser_output['browser_wait'], browser_output['title'],
            browser_output['meta_desc'], url, browser_output['final_url'],
            priv_policy_url, priv_policy_url_text, page_is_ssl, source,
            browser_output['load_time'], page_domain_id)

        # store cookies
        for cookie in browser_output['cookies']:
            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            # note:
            #	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
            cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                'http://' + cookie['domain'])

            # something went wrong, log and fail gracefully
            if cookie_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url,
                    'Error parsing cookie with domain: ' + cookie['domain'])
                continue

            # otherwise, everything went fine
            cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
            cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
            cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
            cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
            cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

            # mark third-party cookies
            if origin_domain != cookie_domain:
                is_3p_cookie = True
            else:
                is_3p_cookie = False

            # this is a first party cookie, see if we want to store it
            if is_3p_cookie is False and store_1p is False:
                continue

            # sql_driver.add_domain both stores the new domain and returns its id
            cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn,
                                                     cookie_domain,
                                                     cookie_pubsuffix,
                                                     cookie_tld)

            # name and domain are required, so if they fail we just continue
            try:
                name = cookie['name']
            except:
                continue

            try:
                domain = cookie_domain
            except:
                continue

            # these are optional, fill with null values if fail
            try:
                secure = cookie['secure']
            except:
                secure = None

            try:
                path = cookie['path']
            except:
                path = None

            try:
                httponly = cookie['httponly']
            except:
                httponly = None

            try:
                expiry = cookie['expiry']
            except:
                expiry = None

            try:
                value = cookie['value']
            except:
                value = None

            # all done with this cookie
            sql_driver.add_cookie(page_id, name, secure, path, domain,
                                  httponly, expiry, value, is_3p_cookie,
                                  cookie_domain_id)

        # process requests now
        for request in browser_output['processed_requests']:
            # if the request starts with the following we can't parse anyway, so skip
            if re.match('^(data|about|chrome|blob).+', request):
                continue

            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                request)

            # problem with this request, log and fail gracefully
            if element_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url, 'Error parsing element request: ' + request)
                continue

            element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
            element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
            element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
            element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
            element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

            # sql_driver.add_domain both stores the new domain and returns its db row id
            element_domain_id = sql_driver.add_domain(element_ip, element_fqdn,
                                                      element_domain,
                                                      element_pubsuffix,
                                                      element_tld)

            # mark third-party elements based on domain
            if origin_domain != element_domain:
                is_3p_element = True
            else:
                is_3p_element = False

            # if we are not storing 1p elements continue
            if is_3p_element is False and store_1p is False:
                continue

            if request[:5] == 'https':
                element_is_ssl = True
            else:
                element_is_ssl = False

            try:
                received = browser_output['processed_requests'][request][
                    'received']
            except:
                received = None

            # get domain of referer and determine if page leaked by referer
            try:
                referer = browser_output['processed_requests'][request][
                    'referer']
            except:
                referer = None

            if referer and len(referer) != 0:
                referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                    referer)

                if referer_ip_fqdn_domain_pubsuffix_tld:
                    if referer_ip_fqdn_domain_pubsuffix_tld[
                            2] == origin_domain:
                        page_domain_in_referer = True
                    else:
                        page_domain_in_referer = False
                else:
                    page_domain_in_referer = None
                    sql_driver.log_error(
                        url, 'Error parsing referer header: ' + referer)
            else:
                page_domain_in_referer = None

            try:
                start_time_offset = browser_output['processed_requests'][
                    request]['start_time_offset']
            except:
                start_time_offset = None

            try:
                load_time = browser_output['processed_requests'][request][
                    'load_time']
            except:
                load_time = None

            try:
                status = browser_output['processed_requests'][request][
                    'status']
            except:
                status = None

            try:
                status_text = browser_output['processed_requests'][request][
                    'status_text']
            except:
                status_text = None

            try:
                content_type = browser_output['processed_requests'][request][
                    'content_type']
            except:
                content_type = None

            try:
                body_size = browser_output['processed_requests'][request][
                    'body_size']
            except:
                body_size = None

            try:
                request_headers = str(browser_output['processed_requests']
                                      [request]['request_headers'])
            except:
                request_headers = None

            try:
                response_headers = str(browser_output['processed_requests']
                                       [request]['response_headers'])
            except:
                response_headers = None

            # consider anything before the "?" to be the element_url
            try:
                element_url = re.search('^(.+?)\?.+$', request).group(1)
            except:
                element_url = request

            # consider anything after the "?" to be the args
            try:
                element_args = re.search('^.+(\?.+)$',
                                         request).group(1)  # start url args
            except:
                element_args = None

            # attempt to parse off the extension
            try:
                element_extension = re.search('\.([0-9A-Za-z]+)$',
                                              element_url).group(1).lower()
            except:
                element_extension = None

            # lists of common extensions, can be expanded
            image_extensions = [
                'png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif',
                'tiff', 'webp', 'srf'
            ]
            script_extensions = ['js', 'javascript']
            data_extensions = ['json', 'jsonp', 'xml']
            font_extentions = ['woff', 'ttf', 'otf']
            static_extentions = ['html', 'htm', 'shtml']
            dynamic_extentions = [
                'php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi'
            ]

            # figure out what type of element it is
            if element_extension in image_extensions:
                element_type = 'image'
            elif element_extension in script_extensions:
                element_type = 'javascript'
            elif element_extension in data_extensions:
                element_type = 'data_structured'
            elif element_extension == 'css':
                element_type = 'style_sheet'
            elif element_extension in font_extentions:
                element_type = 'font'
            elif element_extension in static_extentions:
                element_type = 'page_static'
            elif element_extension == dynamic_extentions:
                element_type = 'page_dynamic'
            elif element_extension == 'swf' or element_extension == 'fla':
                element_type = 'Shockwave Flash'
            else:
                element_type = None

            # file hashing has non-trivial overhead and off by default
            #
            # what this does is uses the same ua/referer as the actual request
            # 	so we are just replaying the last one to get similar response
            # 	note that we aren't sending the same cookies so that could be an issue
            # 	otherwise it is equivalent to a page refresh in theory

            # option to hash only 3p elements observed here
            if (get_file_hashes and hash_3p_only
                    and is_3p_element) or (get_file_hashes
                                           and hash_3p_only == False):
                replay_element_request = urllib.request.Request(
                    request,
                    headers={
                        'User-Agent':
                        browser_output['processed_requests'][request]
                        ['user_agent'],
                        'Referer':
                        referer,
                        'Accept':
                        '*/*'
                    })
                try:
                    file_md5 = hashlib.md5(
                        urllib.request.urlopen(replay_element_request,
                                               timeout=10).read()).hexdigest()
                except:
                    file_md5 = None
            else:
                file_md5 = None

            # store request
            sql_driver.add_element(
                page_id, request, element_url, is_3p_element, element_is_ssl,
                received, referer, page_domain_in_referer, start_time_offset,
                load_time, status, status_text, content_type, body_size,
                request_headers, response_headers, file_md5, element_extension,
                element_type, element_args, element_domain_id)

        # close db connection
        sql_driver.close()

        return True
Ejemplo n.º 8
0
	def store(self, url, browser_output, store_source=False, store_1p=True, get_file_hashes=False, hash_3p_only=False):
		"""
		this is the primary function of this class,
		
		it takes the url of the given page and the request and cookie data generated
			by the browser

		data is cleaned up with some minor analysis (eg file types) and stored 
			for later in-depth analysis.
		
		there is an option to store first party requests as well as third, turned on by default
			to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
			and is turned off by default
		"""

		# open up a sql connection
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		else:
			print('INVALED DB ENGINE FOR %s, QUITTING!' % db_engine)
			exit()

		# get the ip, fqdn, domain, pubsuffix, and tld
		# we need the domain to figure out if cookies/elements are third-party
		origin_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

		# if we can't get page domain info we fail gracefully
		if origin_ip_fqdn_domain_pubsuffix_tld is None:
			sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
			return False

		origin_ip 			= origin_ip_fqdn_domain_pubsuffix_tld[0]
		origin_fqdn 		= origin_ip_fqdn_domain_pubsuffix_tld[1]
		origin_domain 		= origin_ip_fqdn_domain_pubsuffix_tld[2]
		origin_pubsuffix 	= origin_ip_fqdn_domain_pubsuffix_tld[3]
		origin_tld 			= origin_ip_fqdn_domain_pubsuffix_tld[4]
		
		# sql_driver.add_domain both stores the new domain and returns its db row id
		# if it is already in db just return the existing id
		page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

		# figure out the privacy policy url and text, starts null
		priv_policy_url = None
		priv_policy_url_text = None

		# read in our list of privacy link terms from the json file in webxray/resources/policyxray
		privacy_policy_term_list = self.utilities.get_privacy_policy_term_list()

		# we reverse links return from browser to check footer links first as that is where policy links tend to be
		all_links = browser_output['all_links']
		all_links.reverse()

		# if we have links search for privacy policy
		if len(all_links) > 0:
			# links are tuple
			for link_text,link_url in all_links:
				# makes sure we have text, skip links without
				if link_text:
					# need lower for string matching
					link_text = link_text.lower().strip()
					# not a link we can use
					if 'javascript' in link_text: continue
					# see if the link_text is in our term list
					if link_text in privacy_policy_term_list:
							# if the link_url is relative this will convert to absolute
							priv_policy_url = self.utilities.get_absolute_url_from_page_link(url,link_url)
							priv_policy_url_text = link_text
							break

		# if the final page is https (often after a redirect), mark it appropriately
		if browser_output['final_url'][:5] == 'https':
			page_is_ssl = True
		else:
			page_is_ssl = False

		if store_source:
			# handles issue where postgres will crash on inserting null character
			source = browser_output['source'].replace('\x00',' ')
		else:
			source = None

		# add page
		page_id = sql_driver.add_page(
			browser_output['browser_type'],
			browser_output['browser_version'],
			browser_output['browser_wait'],
			browser_output['title'],
			browser_output['meta_desc'],
			url, 
			browser_output['final_url'],
			priv_policy_url,
			priv_policy_url_text,
			page_is_ssl,
			source,
			browser_output['load_time'],
			page_domain_id
		)

		# store cookies
		for cookie in browser_output['cookies']:
			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			# note:
			#	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
			cookie_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])
			
			# something went wrong, log and fail gracefully
			if cookie_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
				continue

			# otherwise, everything went fine
			cookie_ip 			= cookie_ip_fqdn_domain_pubsuffix_tld[0]
			cookie_fqdn 		= cookie_ip_fqdn_domain_pubsuffix_tld[1]
			cookie_domain 		= cookie_ip_fqdn_domain_pubsuffix_tld[2]
			cookie_pubsuffix 	= cookie_ip_fqdn_domain_pubsuffix_tld[3]
			cookie_tld 			= cookie_ip_fqdn_domain_pubsuffix_tld[4]

			# mark third-party cookies
			if origin_domain != cookie_domain:
				is_3p_cookie = True
			else:
				is_3p_cookie = False

			# this is a first party cookie, see if we want to store it
			if is_3p_cookie is False and store_1p is False:
				continue

			# sql_driver.add_domain both stores the new domain and returns its id
			cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)
		
			# name and domain are required, so if they fail we just continue
			try: name = cookie['name']
			except: continue
		
			try: domain = cookie_domain
			except: continue
		
			# these are optional, fill with null values if fail
			try: secure = cookie['secure']
			except: secure = None
		
			try: path = cookie['path']
			except: path = None
		
			try: httponly = cookie['httponly']
			except: httponly = None
		
			try: expiry = cookie['expiry']
			except: expiry = None
		
			try: value = cookie['value']
			except: value = None
		
			# all done with this cookie
			sql_driver.add_cookie(
				page_id,
				name, secure, path, domain, 
				httponly, expiry, value, 
				is_3p_cookie, cookie_domain_id
			)

		# process requests now
		for request in browser_output['processed_requests']:
			# if the request starts with the following we can't parse anyway, so skip
			if re.match('^(data|about|chrome|blob).+', request):
				continue

			# get the ip, fqdn, domain, pubsuffix, and tld
			# we need the domain to figure out if cookies/elements are third-party
			element_ip_fqdn_domain_pubsuffix_tld	= self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

			# problem with this request, log and fail gracefully
			if element_ip_fqdn_domain_pubsuffix_tld is None:
				sql_driver.log_error(url, 'Error parsing element request: '+request)
				continue

			element_ip 			= element_ip_fqdn_domain_pubsuffix_tld[0]
			element_fqdn 		= element_ip_fqdn_domain_pubsuffix_tld[1]
			element_domain 		= element_ip_fqdn_domain_pubsuffix_tld[2]
			element_pubsuffix 	= element_ip_fqdn_domain_pubsuffix_tld[3]
			element_tld 		= element_ip_fqdn_domain_pubsuffix_tld[4]

			# sql_driver.add_domain both stores the new domain and returns its db row id
			element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

			# mark third-party elements based on domain
			if origin_domain != element_domain:
				is_3p_element = True
			else:
				is_3p_element = False

			# if we are not storing 1p elements continue
			if is_3p_element is False and store_1p is False:
				continue
			
			if request[:5] == 'https' or request[:3] == 'wss':
				element_is_ssl = True
			else:
				element_is_ssl = False

			try:
				received = browser_output['processed_requests'][request]['received']
			except:
				received = None

			# get domain of referer and determine if page leaked by referer
			try:
				referer = browser_output['processed_requests'][request]['referer']
			except:
				referer = None

			if referer and len(referer) != 0:
				referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)

				if referer_ip_fqdn_domain_pubsuffix_tld:
					if referer_ip_fqdn_domain_pubsuffix_tld[2] == origin_domain:
						page_domain_in_referer = True
					else:
						page_domain_in_referer = False
				else:
					page_domain_in_referer = None
					sql_driver.log_error(url, 'Error parsing referer header: '+referer)
			else:
				page_domain_in_referer = None

			try:
				start_time_offset = browser_output['processed_requests'][request]['start_time_offset']
			except:
				start_time_offset = None

			try:
				load_time = browser_output['processed_requests'][request]['load_time']
			except:
				load_time = None

			try:
				status = browser_output['processed_requests'][request]['status']
			except:
				status = None

			try:
				status_text = browser_output['processed_requests'][request]['status_text']
			except:
				status_text = None

			try:
				content_type = browser_output['processed_requests'][request]['content_type']
			except:
				content_type = None
			
			try:
				body_size = browser_output['processed_requests'][request]['body_size']
			except:
				body_size = None

			try:
				request_headers = str(browser_output['processed_requests'][request]['request_headers'])
			except:
				request_headers = None

			try:
				response_headers = str(browser_output['processed_requests'][request]['response_headers'])
			except:
				response_headers = None

			# consider anything before the "?" to be the element_url
			try:
				element_url = re.search('^(.+?)\?.+$', request).group(1)
			except:
				element_url = request

			# consider anything after the "?" to be the args
			try:
				element_args = re.search('^.+(\?.+)$', request).group(1) # start url args
			except:
				element_args = None

			# attempt to parse off the extension
			try:
				element_extension = re.search('\.([0-9A-Za-z]+)$', element_url).group(1).lower()
			except:
				element_extension = None
			
			# lists of common extensions, can be expanded
			image_extensions 	= ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
			script_extensions 	= ['js', 'javascript']
			data_extensions 	= ['json', 'jsonp', 'xml']
			font_extentions 	= ['woff', 'ttf', 'otf']
			static_extentions 	= ['html', 'htm', 'shtml']
			dynamic_extentions	= ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

			# figure out what type of element it is
			if element_extension in image_extensions:
				element_type = 'image'
			elif element_extension in script_extensions:
				element_type = 'javascript'
			elif element_extension in data_extensions:
				element_type = 'data_structured'
			elif element_extension == 'css':
				element_type = 'style_sheet'
			elif element_extension in font_extentions:
				element_type = 'font'
			elif element_extension in static_extentions:
				element_type = 'page_static'
			elif element_extension == dynamic_extentions:
				element_type = 'page_dynamic'
			elif element_extension == 'swf' or element_extension == 'fla':
				element_type = 'Shockwave Flash'
			else:
				element_type = None

			# file hashing has non-trivial overhead and off by default
			#
			# what this does is uses the same ua/referer as the actual request
			# 	so we are just replaying the last one to get similar response
			# 	note that we aren't sending the same cookies so that could be an issue
			# 	otherwise it is equivalent to a page refresh in theory

			# option to hash only 3p elements observed here
			if (get_file_hashes and hash_3p_only and is_3p_element) or (get_file_hashes and hash_3p_only == False):
				replay_element_request = urllib.request.Request(
					request,
					headers = {
						'User-Agent' : browser_output['processed_requests'][request]['user_agent'],
						'Referer' : referer,
						'Accept' : '*/*'
					}
				)
				try:
					file_md5 = hashlib.md5(urllib.request.urlopen(replay_element_request,timeout=10).read()).hexdigest()
				except:
					file_md5 = None
			else:
				file_md5 = None

			# final tasks is to truncate the request if it is 
			#	over 2k characters as it is likely
			#	binary data and may cause problems inserting
			#	into TEXT fields in database
			#
			#  TODO:
			#	better handle binary data in general
			if len(request) >= 2000: request = request[:2000]
			if len(element_url) >= 2000: element_url = element_url[:2000]

			# store request
			sql_driver.add_element(
				page_id,
				request, element_url,
				is_3p_element, element_is_ssl,
				received,
				referer,
				page_domain_in_referer,
				start_time_offset,
				load_time,
				status,
				status_text,
				content_type,
				body_size,
				request_headers,
				response_headers,
				file_md5,
				element_extension,
				element_type,
				element_args,
				element_domain_id
			)

		# close db connection
		sql_driver.close()

		return True
Ejemplo n.º 9
0
class OutputStore:
    def __init__(self, dbname):
        self.uri_parser = ParseURI()
        self.sql_driver = MySQLDriver(dbname)

    # end init

    def store(self, uri, phantom_output):
        # parse out the json from our phantom_output
        # sometimes phantom prints out errors before the json, (despite us turning
        # it off!), so we match inside of the {}s to get json only
        try:
            data = json.loads(re.search('(\{.+\})', phantom_output).group(1))
        except Exception as e:
            self.sql_driver.log_error(uri, "Could Not Load JSON: %s" % e)
            return 'Could Not Load JSON'

        # we need to parse the domain to determine if requests are local or 3rd party
        # we need pubsuffix and tld for later analysis so store them now

        origin_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(
            uri)
        origin_domain = origin_domain_pubsuffix_tld[0]
        origin_pubsuffix = origin_domain_pubsuffix_tld[1]
        origin_tld = origin_domain_pubsuffix_tld[2]

        if re.match('^Exception.+', origin_domain):
            self.sql_driver.log_error(uri, 'Could not parse TLD for %s' % uri)
            return 'Could not parse TLD for %s' % uri

        page_domain_id = self.sql_driver.add_domain(origin_domain,
                                                    origin_pubsuffix,
                                                    origin_tld)

        # newFollowLogger is giving us the follow JSON data:
        # note source is null for now, but store anyway
        # 		big_out = {
        # 			final_uri: final_uri,
        # 			title: page.title,
        # 			meta_desc : meta_desc,
        # 			requested_uris: JSON.stringify(requested_uris),
        # 			received_uris: JSON.stringify(received_uris),
        # 			cookies: phantom.cookies,
        # 			source: 'NULL',
        # 		};

        # we are now storing uris with and without args and saving args
        # we must unquote to uri to get back to original state so can parse
        start_uri_original = urllib.parse.unquote(uri)

        try:
            start_uri_no_args = re.search('^(.+?)\?.+$',
                                          start_uri_original).group(
                                              1)  # start uri no args
        except:
            start_uri_no_args = uri

        try:
            start_uri_args = re.search('^.+(\?.+)$', start_uri_original).group(
                1)  # start uri args
        except:
            start_uri_args = 'NULL'

        # same for the final uri (this is where we are after potential redirects)
        final_uri = re.sub('\"', '', json.dumps(data["final_uri"]))
        final_uri_original = urllib.parse.unquote(final_uri)

        try:
            final_uri_no_args = re.search('^(.+?)\?.+$',
                                          final_uri_original).group(
                                              1)  # start uri no args
        except:
            final_uri_no_args = final_uri

        try:
            final_uri_args = re.search('^.+(\?.+)$', final_uri_original).group(
                1)  # start uri args
        except:
            final_uri_args = 'NULL'

        # add page
        # json.dumps to make sure strings go out ok for db
        page_id = self.sql_driver.add_page(
            str(re.sub('\"', '', json.dumps(data["title"]))),
            str(re.sub('\"', '',
                       json.dumps(data["meta_desc"]))), uri, start_uri_no_args,
            start_uri_args, final_uri, final_uri_no_args, final_uri_args,
            str(re.sub('\"', '', json.dumps(data["source"]))),
            str(re.sub('\"', '', json.dumps(data["requested_uris"]))),
            str(re.sub('\"', '',
                       json.dumps(data["received_uris"]))), page_domain_id)

        for cookie in data["cookies"]:
            # store external cookies, uri_parser fails on non-http, we should fix this
            # right now a lame hack is to prepend http://
            cookie_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(
                "http://" + cookie["domain"])
            cookie_domain = cookie_domain_pubsuffix_tld[0]
            cookie_pubsuffix = cookie_domain_pubsuffix_tld[1]
            cookie_tld = cookie_domain_pubsuffix_tld[2]

            # something went wrong, but carry on...
            if re.match('^Exception.+', cookie_domain):
                self.sql_driver.log_error(
                    uri, 'Error parsing cookie: ' + cookie_domain)
                continue

            # this is a 3party cookie
            if origin_domain != cookie_domain:

                cookie_domain_id = self.sql_driver.add_domain(
                    cookie_domain, cookie_pubsuffix, cookie_tld)

                # name and domain are required, so if they fail we just continue
                try:
                    name = cookie["name"]
                except:
                    continue

                try:
                    domain = cookie_domain
                except:
                    continue

                # these are optional, keep going with "N/A" vals

                try:
                    secure = cookie["secure"]
                except:
                    secure = "N/A"

                try:
                    path = cookie["path"]
                except:
                    path = "N/A"

                try:
                    expires = cookie["expires"]
                except:
                    expires = "N/A"

                try:
                    httponly = cookie["httponly"]
                except:
                    httponly = "N/A"

                try:
                    expiry = cookie["expiry"]
                except:
                    expiry = "N/A"

                try:
                    value = cookie["value"]
                except:
                    value = "N/A"

                cookie_id = self.sql_driver.add_cookie(name, secure, path,
                                                       domain, expires,
                                                       httponly, expiry, value,
                                                       cookie_domain_id)
                self.sql_driver.add_cookie_to_page(cookie_id, page_id)

        for request in data["requested_uris"]:
            # if the request starts with "data" we can't parse tld anyway, so skip
            if re.match('^(data|about|chrome).+', request):
                continue

            # get domain, pubsuffix, and tld from request
            requested_domain_pubsuffix_tld = self.uri_parser.get_domain_pubsuffix_tld(
                request)
            requested_domain = requested_domain_pubsuffix_tld[0]
            requested_pubsuffix = requested_domain_pubsuffix_tld[1]
            requested_tld = requested_domain_pubsuffix_tld[2]

            # see if we got back what we requested, if not a few things may have happened
            # 	* malformed uri
            #	* resource is gone or never existed
            #	* network latency (ie it didn't arrive in window specified)
            #	* we could be behind a firewall or censorship mechanism (eg gfw, golden shield)
            #	* our IP is blacklisted b/c we are totally a bot X-D
            # the point being, interpret this value with an open mind

            if request in data['received_uris']:
                recieved = '1'
            else:
                recieved = '0'

            # catch exceptions
            if re.match('^Exception.+', requested_domain):
                self.sql_driver.log_error(
                    uri, 'Error parsing element request: ' + request)
                continue

            # store new elements
            if origin_domain != requested_domain:
                full_uri = request
                try:
                    element_uri = re.search('^(.+?)\?.+$', full_uri).group(
                        1)  # start uri no args
                except:
                    element_uri = full_uri

                # attempt to parse off the extension
                try:
                    element_extension = re.search(
                        '\.([0-9A-Za-z]+)$', element_uri).group(1).lower()
                except:
                    element_extension = "NULL"

                # figure out what type of element it is
                if element_extension == 'png' or element_extension == 'jpg' or element_extension == 'jpgx' or element_extension == 'jpeg' or element_extension == 'gif' or element_extension == 'svg' or element_extension == 'bmp' or element_extension == 'tif' or element_extension == 'tiff' or element_extension == 'webp' or element_extension == 'srf':
                    element_type = 'image'
                elif element_extension == 'js' or element_extension == 'javascript':
                    element_type = 'javascript'
                elif element_extension == 'json' or element_extension == 'jsonp' or element_extension == 'xml':
                    element_type = 'data_structured'
                elif element_extension == 'css':
                    element_type = 'style_sheet'
                elif element_extension == 'woff' or element_extension == 'ttf' or element_extension == 'otf':
                    element_type = 'font'
                elif element_extension == 'htm' or element_extension == 'html' or element_extension == 'shtml':
                    element_type = 'page_static'
                elif element_extension == 'php' or element_extension == 'asp' or element_extension == 'jsp' or element_extension == 'aspx' or element_extension == 'ashx' or element_extension == 'pl' or element_extension == 'cgi' or element_extension == 'fcgi':
                    element_type = 'page_dynamic'
                elif element_extension == 'swf' or element_extension == 'fla':
                    element_type = 'Shockwave Flash'
                elif element_extension == 'NULL':
                    element_type = 'NULL'
                else:
                    element_type = 'unknown'

                try:
                    args = re.search('^.+(\?.+)$',
                                     full_uri).group(1)  # start uri args
                except:
                    args = 'NULL'

                element_domain_id = self.sql_driver.add_domain(
                    requested_domain, requested_pubsuffix, requested_tld)

                element_id = self.sql_driver.add_element(
                    "NULL", full_uri, element_uri, recieved, element_extension,
                    element_type, args, element_domain_id)
                self.sql_driver.add_element_to_page(element_id, page_id)

        return 'Successfully Added to DB'

    # end report()

    def close(self):
        # close mysql connections
        self.uri_parser.close()
        self.sql_driver.close()
        return