Exemple #1
0
    def __init__(self):
        """
        Load the public suffix list, set up the MySQL driver, and then either
        create the 'sub_domain_tld' db (when check_db_exist reports it is
        missing) or switch the driver over to it.
        """
        # load up the tld list now so we only hit it once
        self.pubsuffix_list = self.get_pubsuffix_list()

        # this is to speed up tld lookups with a hash table
        # we can share it among all runs over time
        self.sql_driver = MySQLDriver()

        # check if sub_domain_tld exists, if not create one
        #    this should only really happen once
        #
        # bug?
        # when using a pool this can be done several times in tandem,
        #   resulting in some lost entries - e.g. two threads see the db does
        #   not exist; the first one to break the tie will create a new blank
        #   db and add a new entry.  it is possible the slightly slower second
        #   thread will also create a blank db, erasing the entry of the first
        #   thread; however, after the first round the db should not be
        #   re-created again
        #
        # it should be emphasized this db is ONLY used to speed up parsing the tld
        #   (partially because my regex method is slow and should be refactored!)
        #   the data in this db is NOT used for analysis - thus deleting a few
        #   records will have a minor impact on speed for the first few entries,
        #   and then be wholly irrelevant
        #
        #   still...it irks me that this entire class could be smarter, so I will
        #   fix it another day

        if self.sql_driver.check_db_exist('sub_domain_tld') == False:
            self.sql_driver.create_sub_domain_tld_db()
        else:
            self.sql_driver.db_switch('sub_domain_tld')
Exemple #2
0
    def process_uri(self, uri):
        """
        Load a single uri in phantomjs and store (or log) the outcome.

        If phantomjs raises, or its output starts with 'FAIL', the problem is
        recorded with sql_driver.log_error(); otherwise the output is handed
        to OutputStore.store() and its return value is printed.

        Both db handles are closed on every exit path, including the early
        return taken when phantomjs does not return.
        """
        sql_driver = MySQLDriver(self.db_name)
        output_store = OutputStore(self.db_name)
        try:
            phantom_driver = PhantomDriver(
                '--ignore-ssl-errors=true --ssl-protocol=any',
                'wbxr_logger.js')

            # this can be higher or lower depending on network load
            # generally, 90 seems to be fine, so keep with it
            try:
                phantom_output = phantom_driver.execute(uri, 90)
            except Exception:
                # Exception (not a bare except) so ctrl-c still interrupts
                print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
                sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
                return

            if re.match('^FAIL.+', phantom_output):
                print("\t\t%-50s Phantom Error\n\t%s" %
                      (uri[:50], phantom_output))
                sql_driver.log_error(uri, phantom_output)
            else:
                print("\t\t%-50s %s" %
                      (uri[:50], output_store.store(uri, phantom_output)))
        finally:
            # closes our db connections no matter how we leave the function
            sql_driver.close()
            output_store.close()
Exemple #3
0
    def __init__(self, db_name, num_tlds, num_results, tracker_threshold=0):
        """
        Store the report settings, open the db connection, and run the
        start-up steps in order: check the report directory, patch the org
        data, fetch the top tlds, and (only when tracker_threshold is truthy)
        fetch the tracker domains.
        """
        self.db_name = db_name
        self.sql_driver = MySQLDriver(self.db_name)
        self.num_tlds = num_tlds
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.startTime = datetime.now()
        self.pages_ok_count = self.sql_driver.pages_ok_count()

        print('\n'.join((
            '\t=============================',
            '\t Checking Output Directories ',
            '\t=============================',
        )))

        self.setup_report_dir()

        print('\n'.join((
            '\t===========================',
            '\t Patching DB with Org Data ',
            '\t===========================',
        )))
        # map the domains to their owners
        self.patch_org_data()
        print('\t\tSuccess!')

        print('\n'.join((
            '\t=====================',
            '\t Getting top %s tlds' % self.num_tlds,
            '\t=====================',
            '\t\tProcessing...',
        )))
        self.top_tlds = self.get_top_tlds(self.num_tlds)
        print(self.top_tlds)
        print('\t\tSuccess!')
        print('\t\tThe top tlds are:')
        for tld_name, page_total in self.top_tlds:
            print('\t\t |- %s (%s)' % (tld_name, page_total))

        # SPECIAL SAUCE, FOR EXPERTS: tracker domains!
        #
        # the idea is that you set a threshold for the number of sites a given
        #   domain is connected to - domains connecting to many sites may
        #   correlate those visits via referer strings etc, so we call these
        #   'tracker domains'
        #
        # on a really large set of sites (e.g. 1M+) this works well, but on
        #   small samples (e.g. 500) it doesn't work well at all, as known
        #   tracker domains may only appear on a single site
        #
        # this is off by default and unless you understand what you are
        #   doing...DON'T USE THIS!
        #
        # longer-term we may want to train off a bigger corpus to find tracker
        #   domains and have them prepackaged
        #
        if not tracker_threshold:
            self.tracker_domains = []
        else:
            print('\n'.join((
                '\t=========================',
                '\t Getting tracker domains ',
                '\t=========================',
                '\t\tProcessing...',
            )))
            self.tracker_domains = self.get_tracker_domains(
                self.tracker_threshold)
            print('\t\tSuccess!')
Exemple #4
0
    def run(self, pool_size):
        """
        Build the list of uris to scan and process them in parallel.

        The uris are read from ./page_lists/<self.pages_file_name>; lines
        starting with '#' are comments.  Each uri is stripped, quoted,
        lower-cased and has one trailing '/' removed, then skipped if it
        looks like a document (pdf, doc, ...), is already in the db, or was
        already queued.  The remaining uris are mapped to self.process_uri
        on a Pool of pool_size workers.

        The page list file, the db connection, and the Pool are all released
        before this function returns.
        """
        try:
            uri_list = open('./page_lists/' + self.pages_file_name, 'r')
        except OSError:
            print(
                'File "%s" does not exist, file must be in ./page_lists directory.  Exiting.'
                % self.pages_file_name)
            exit()

        # uris are kept in file order for the Pool; the set mirrors the list
        # so the "already queued" check does not rescan the whole list
        uris_to_process = []
        queued_uris = set()

        # counter used solely for the CLI output
        count = 0

        with uri_list:
            sql_driver = MySQLDriver(self.db_name)
            try:
                print('\t------------------------')
                print('\t Building List of Pages ')
                print('\t------------------------')

                for uri in uri_list:
                    # skip lines that are comments
                    if uri.startswith('#'):
                        continue

                    count += 1

                    # drop trailing '/', clean off white space, make lower,
                    # create cli-safe uri with parse.quote, but exclude :/
                    # b/c of http://
                    uri = re.sub(
                        '/$', '',
                        urllib.parse.quote(uri.strip(), safe=":/").lower())

                    # if it is a m$ office or other doc, skip
                    if re.match(
                            '.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$',
                            uri):
                        print(
                            "\t\t%s | %-50s Not an HTML document, Skipping."
                            % (count, uri[:50]))
                        continue

                    # skip if in db already
                    if sql_driver.page_exists(uri):
                        print("\t\t%s | %-50s Exists in DB, Skipping." %
                              (count, uri[:50]))
                        continue

                    # only add if not queued already
                    if uri in queued_uris:
                        print("\t\t%s | %-50s Already queued, Skipping." %
                              (count, uri[:50]))
                        continue

                    print("\t\t%s | %-50s Adding." % (count, uri[:50]))
                    queued_uris.add(uri)
                    uris_to_process.append(uri)
            finally:
                # the workers open their own connections, so release ours
                sql_driver.close()

        print('\t----------------------------------')
        print('\t%s pages will now be webXray\'d' % len(uris_to_process))
        print('\t\t...you can go take a walk. ;-)')
        print('\t----------------------------------')

        # map() blocks until every uri is done, so tearing the pool down on
        # exit from the with-block does not cut any work short
        with Pool(pool_size) as my_pool:
            my_pool.map(self.process_uri, uris_to_process)
Exemple #5
0
async def post_analyze_url(request: XrayAnalyseRequest):
    """
    Decrypt the url carried by the request and return the cookie domains
    recorded for it.

    When history_filter(url) is truthy the db is queried for the cookie
    domains joined to the page with that start uri; if no rows come back the
    url is analyzed with analyze_url() instead.  When the filter is falsy the
    cookie list stays empty.

    Returns a JSONResponse with "status" (False if any exception was raised,
    the exception is printed) and "cookies".
    """
    print('Data recieve.')
    status = False
    cookies = []

    driver = MySQLDriver('wbxr_gayatri')
    try:
        url = unpadPKCS7(decrypt(request.url))
        url = url.decode('utf8')
        print('Encoded', url)

        if history_filter(url):
            driver.db.execute(
                "SELECT cookie.`domain` "
                "FROM page "
                "LEFT JOIN page_cookie_junction "
                "ON page.id = page_cookie_junction.page_id "
                "LEFT JOIN cookie "
                "ON page_cookie_junction.cookie_id = cookie.id "
                "WHERE page.start_uri_md5 = MD5(%s)",
                (url,),
            )
            fetched = driver.db.fetchall()
            # fall back to a live analysis when the db has nothing for this url
            cookies = fetched if fetched else analyze_url(url)
        print(cookies)
    except Exception as e:
        print(e)
    else:
        status = True
    finally:
        # one connection is opened per request, so always hand it back
        # NOTE(review): assumes this MySQLDriver exposes close() - confirm
        driver.close()

    return JSONResponse(content={"status": status, "cookies": cookies})
Exemple #6
0
# compare (major, minor) together: checking only the minor version would let
# python 2.7 through and would reject any future X.0 - X.3 release
if sys.version_info < (3, 4):
    print(
        'Python 3.4 or above is required for webXray to function; please check your installation.'
    )
    exit()

# standard python 3.4 libs
import os
import re
import time
from optparse import OptionParser

# set up a global mysql driver, in the future you could use other db drivers here
# if the mysql connector is not installed this fails gracefully
from webxray.MySQLDriver import MySQLDriver
sql_driver = MySQLDriver()


# databases are stored with a 'wbxr_' prefix, this function helps select a database in interactive mode
def select_wbxr_db():
    """
    Print a numbered menu of the available wbxr databases.

    Names are shown without their first five characters (the 'wbxr_' prefix).
    When there are no databases a notice is printed and control goes back to
    interaction().
    """
    available_dbs = sql_driver.get_wbxr_dbs_list()

    if not len(available_dbs):
        print('\t\tThere are no databases to analyze, please try [C]ollecting data or \n'
              '\t\t\t\timport an existing wbxr-formatted database manually.')
        interaction()
        return

    for position, full_name in enumerate(available_dbs):
        short_name = full_name[5:]
        print('\t\t[%s] %s' % (position, short_name))
Exemple #7
0
    def __init__(self,
                 db_engine,
                 db_name,
                 num_tlds,
                 num_results,
                 tracker_threshold=None,
                 flush_owner_db=True):
        """
        Start-up work, performed in this order:
            - record the settings passed in by the caller
            - open a connection with the driver matching db_engine
              (prints a message and exits on an unknown engine)
            - make sure the report directory is set up
            - re-patch the domain owner data unless flush_owner_db is false
            - work out the most common tlds when per-tld reports are wanted
            - fetch the tracker domains when a tracker threshold is given
              (see the notes on the tracker threshold further down)
        """

        # settings shared by the rest of the class
        self.db_engine = db_engine
        self.db_name = db_name
        self.num_tlds = num_tlds
        self.top_tlds = []
        self.num_results = num_results
        self.tracker_threshold = tracker_threshold
        self.start_time = datetime.now()

        # reports round to this many decimal places
        self.num_decimals = 2

        # the driver modules are imported lazily so only the selected
        # engine's module gets imported
        if db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            self.sql_driver = MySQLDriver(self.db_name)
        elif db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            self.sql_driver = SQLiteDriver(self.db_name)
        elif db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            self.sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            exit()

        # looked up once here because it is reused often
        self.get_pages_ok_count = self.sql_driver.get_pages_ok_count()

        print('\n'.join((
            '\t=============================',
            '\t Checking Output Directories ',
            '\t=============================',
        )))

        self.setup_report_dir()

        print('\n'.join((
            '\t============================',
            '\t Patching Domain Owner Data ',
            '\t============================',
        )))

        # the owner data in the db is refreshed by default; pass
        # flush_owner_db=False to keep what is already there
        if not flush_owner_db:
            print('\t\t\tSkipping')
        else:
            self.patch_domain_owners()

        # owner information is read from this dict in various places
        self.domain_owners = self.get_domain_owner_dict()

        if not self.num_tlds:
            # no per-tld sub-reports: push in a single empty entry instead
            self.top_tlds.append((None, self.get_pages_ok_count))
        else:
            # sub-reports are wanted for the most frequent tlds, find them
            print('\n'.join((
                '\t=====================',
                '\t Getting top %s tlds' % self.num_tlds,
                '\t=====================',
            )))
            print('\t\tProcessing...', end='', flush=True)
            self.top_tlds = self.get_top_tlds(self.num_tlds)
            print('done!')
            print('\t\tThe top tlds are:')
            for tld_name, page_total in self.top_tlds:
                if tld_name:
                    print('\t\t |- %s (%s)' % (tld_name, page_total))

        # SPECIAL FEATURE FOR EXPERTS: tracker domain filter
        #
        # you can set a threshold for the number of sites a given 3p domain
        #   is connected to - domains connecting to many sites may correlate
        #   those visits, so we call these 'tracker domains'
        #
        # the 'tracker_threshold' variable set above controls the filtering
        #   level
        #
        # on a large set of sites (e.g. >10k) this works well, but on small
        #   samples (e.g. <500) it doesn't work as well, as known tracker
        #   domains may only appear on a single site
        #
        # this is off by default and unless you understand what you are doing
        #   don't use this...but because you are reading the source code for
        #   an otherwise undocumented feature you are probably competent to
        #   use it ;-)
        #
        # longer-term we may want to train off a bigger corpus to find tracker
        #   domains and have them prepackaged
        #
        # use at your own risk!
        if not tracker_threshold:
            # None makes various downstream operations get skipped
            self.tracker_domains = None
        else:
            print('\n'.join((
                '\t===================================================',
                '\t Getting tracker domains with threshold level of %s' %
                self.tracker_threshold,
                '\t===================================================',
            )))
            print('\t\tProcessing...', end='', flush=True)
            self.tracker_domains = self.get_tracker_domains(
                self.tracker_threshold)
            print('done!')
Exemple #8
0
	def process_url(self, url):
		"""
		this function takes a specified url, loads it in each browser listed in
			self.browser_types, and hands the browser output to the output_store
			class, which puts the data in the db for later analysis

		if a browser raises or returns None the error is logged and processing
			of this url stops; a failed store is logged and the next browser is tried

		the sql connection is closed on every exit path

		raises ValueError if self.db_engine is not 'mysql', 'postgres' or 'sqlite'
		"""

		# set up sql connection used to log errors and do timeseries checks
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			# fail up front with a clear message rather than with an
			#	UnboundLocalError the first time sql_driver is touched
			raise ValueError('Unsupported db engine: %s' % self.db_engine)

		try:
			# output store does the heavy lifting of analyzing browser output and storing to db
			output_store = OutputStore(self.db_engine, self.db_name)

			# support for loading same page with multiple browsers - purposefully undocumented
			for browser_type in self.browser_types:

				# import and set up specified browser driver
				# 	note we need to set up a new browser each time to
				#	get a fresh profile
				# NOTE(review): an unrecognized browser_type leaves browser_driver
				#	unset (or pointing at the previous browser) - confirm the
				#	types are validated before they get here
				if browser_type == 'phantomjs':
					browser_driver = PhantomDriver()
				elif browser_type == 'chrome':
					browser_driver = ChromeDriver(ua=self.chrome_ua)

				# support for timeseries collections - purposefully undocumented
				if self.allow_timeseries:
					page_last_accessed_browser_type = sql_driver.get_page_last_accessed_by_browser_type(url, browser_type)
					if page_last_accessed_browser_type:
						time_diff = datetime.now() - page_last_accessed_browser_type[0]
						if time_diff < timedelta(minutes=self.interval_minutes) and page_last_accessed_browser_type[1] == browser_type:
							print("\t\t%-50s Scanned too recently with %s" % (url[:50], browser_type))
							continue

				# attempt to load the page, fail gracefully
				#	an exception is treated the same as the browser returning None
				try:
					browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
				except Exception:
					browser_output = None

				# if there was a problem browser_output will be None
				if browser_output is None:
					print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
					sql_driver.log_error(url, 'Unable to load page')
					return

				# attempt to store the output
				if output_store.store(url, browser_output):
					print('\t\t%-50s Success with %s' % (url[:50], browser_type))
				else:
					print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
					sql_driver.log_error(url, 'Unable to load page')
		finally:
			# runs for the early returns above and for unexpected exceptions
			sql_driver.close()
Exemple #9
0
	def run(self, pool_size):
		"""
		this function manages the parallel processing of the url list using the python Pool class

		the function first reads the list of urls out of the page_lists directory, cleans it
			for known issues (eg common binary files), and issues with idna encoding (tricky!)

		then the set of urls is mapped to the process_url function and executed in parallel

		pool_size is defined in the run_webxray.py file, see details there

		the page list file, the sql connection and the Pool are all released before
			this function returns

		raises ValueError if self.db_engine is not 'mysql', 'postgres' or 'sqlite'
		"""

		# the list of url MUST be in the page_lists directory!
		try:
			url_list = open(os.path.dirname(os.path.abspath(__file__)) + '/../page_lists/' + self.pages_file_name, 'r')
		except OSError:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()

		# set up sql connection used to determine if items are already in the db
		if self.db_engine == 'mysql':
			from webxray.MySQLDriver import MySQLDriver
			sql_driver = MySQLDriver(self.db_name)
		elif self.db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver
			sql_driver = PostgreSQLDriver(self.db_name)
		elif self.db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver
			sql_driver = SQLiteDriver(self.db_name)
		else:
			# fail with a clear message rather than a NameError further down
			url_list.close()
			raise ValueError('Unsupported db engine: %s' % self.db_engine)

		# this set gets mapped to the Pool, very important!
		urls_to_process = set()

		# simple counter used solely for updates to CLI
		count = 0

		try:
			print('\t------------------------')
			print('\t Building List of Pages ')
			print('\t------------------------')

			with url_list:
				for url in url_list:
					# skip lines that are comments
					if url.startswith('#'):
						continue

					count += 1

					# only do lines starting with https?://
					if not re.match('^https?://.+', url):
						print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
						continue

					# non-ascii domains will crash phantomjs, so we need to convert them to
					# 	idna/ascii/utf-8
					# this requires splitting apart the url, converting the domain to idna,
					#	and pasting it all back together
					split_url = urlsplit(url.strip())
					try:
						idna_fixed_netloc = split_url.netloc.encode('idna').decode('utf-8')
					except UnicodeError:
						# the idna codec rejects empty or over-long labels; skip the
						#	bad line instead of aborting the whole run
						print("\t\t%s | %-50s Not a valid address, Skipping." % (count, url[:50]))
						continue
					url = urlunsplit((split_url.scheme, idna_fixed_netloc, split_url.path, split_url.query, split_url.fragment))

					# if it is a m$ office or other doc, skip
					if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', url):
						print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, url[:50]))
						continue

					# skip if in db already unless we are doing a timeseries
					if self.allow_timeseries == False and sql_driver.page_exists(url):
						print("\t\t%s | %-50s Exists in DB, Skipping." % (count, url[:50]))
						continue

					# only add if not queued already
					if url in urls_to_process:
						print("\t\t%s | %-50s Already queued, Skipping." % (count, url[:50]))
						continue

					print("\t\t%s | %-50s Adding." % (count, url[:50]))
					urls_to_process.add(url)
		finally:
			# close the db connection, also when building the list fails
			sql_driver.close()

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(urls_to_process))
		print('\t\tBrowser(s) are %s' % self.browser_types)
		print('\t\tBrowser wait time is %s seconds' % self.browser_wait)
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		# for macOS (darwin) we must specify start method as 'forkserver'
		#	this is essentially voodoo to ward off evil spirits which
		#	appear when large pool sizes are used on macOS
		# get_start_method must be set to 'allow_none', otherwise upon
		#	checking the method it gets set (!) - and if we then get/set again
		#	we get an error
		if sys.platform == 'darwin' and multiprocessing.get_start_method(allow_none=True) != 'forkserver':
			multiprocessing.set_start_method('forkserver')

		# map() blocks until every url is done, so tearing the pool down on
		#	exit from the with-block does not cut any work short
		with multiprocessing.Pool(pool_size) as my_pool:
			my_pool.map(self.process_url, urls_to_process)

		# FYI
		self.print_runtime()
Exemple #10
0
	def run(self, pool_size):
		"""
		this function builds the list of uris to scan and processes them in parallel

		uris are read from ./page_lists/<self.pages_file_name>; lines starting with '#'
			are comments and lines not starting with http:// or https:// are skipped

		the domain of each uri is converted to idna, then the uri is skipped if it looks
			like a document (pdf, doc, ...), is already in the db, or was already queued

		the remaining uris are mapped to self.process_uri on a Pool of pool_size workers

		the page list file, the db connection and the Pool are all released before
			this function returns
		"""
		try:
			uri_list = open('./page_lists/'+self.pages_file_name, 'r')
		except OSError:
			print('File "%s" does not exist, file must be in ./page_lists directory.  Exiting.' % self.pages_file_name)
			exit()

		# uris are kept in file order for the Pool; the set mirrors the list so
		#	the "already queued" check does not rescan the whole list
		uris_to_process = []
		queued_uris = set()

		# counter used solely for the CLI output
		count = 0

		with uri_list:
			sql_driver = MySQLDriver(self.db_name)
			try:
				print('\t------------------------')
				print('\t Building List of Pages ')
				print('\t------------------------')

				for uri in uri_list:
					# skip lines that are comments
					if uri.startswith('#'):
						continue

					count += 1

					# only do lines starting with https?://
					if not re.match('^https?://.+', uri):
						print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
						continue

					# non-ascii domains will crash phantomjs, so we need to convert them to
					# 	idna/ascii/utf-8
					# this requires splitting apart the uri, converting the domain to idna,
					#	and pasting it all back together. ugly.
					parsed_uri = urlsplit(uri.strip())
					try:
						idna_netloc = parsed_uri[1].encode('idna').decode('utf-8')
					except UnicodeError:
						# the idna codec rejects empty or over-long labels; skip the
						#	bad line instead of aborting the whole run
						print("\t\t%s | %-50s Not a valid address, Skipping." % (count, uri[:50]))
						continue
					uri = parsed_uri[0] + "://" + idna_netloc

					# if chunks exist glue them back together
					if parsed_uri[2]:
						uri += parsed_uri[2]
					if parsed_uri[3]:
						uri += '?' + parsed_uri[3]
					if parsed_uri[4]:
						uri += '#' + parsed_uri[4]

					# if it is a m$ office or other doc, skip
					if re.match('.+(pdf|ppt|pptx|doc|docx|txt|rtf|xls|xlsx)$', uri):
						print("\t\t%s | %-50s Not an HTML document, Skipping." % (count, uri[:50]))
						continue

					# skip if in db already
					if sql_driver.page_exists(uri):
						print("\t\t%s | %-50s Exists in DB, Skipping." % (count, uri[:50]))
						continue

					# only add if not queued already
					if uri in queued_uris:
						print("\t\t%s | %-50s Already queued, Skipping." % (count, uri[:50]))
						continue

					print("\t\t%s | %-50s Adding." % (count, uri[:50]))
					queued_uris.add(uri)
					uris_to_process.append(uri)
			finally:
				# the workers open their own connections, so release ours
				sql_driver.close()

		print('\t----------------------------------')
		print('\t%s addresses will now be webXray\'d'  % len(uris_to_process))
		print('\t\t...you can go take a walk. ;-)')
		print('\t----------------------------------')

		# map() blocks until every uri is done, so tearing the pool down on
		#	exit from the with-block does not cut any work short
		with Pool(pool_size) as my_pool:
			my_pool.map(self.process_uri, uris_to_process)
Exemple #11
0
    def store(self,
              url,
              browser_output,
              store_source=False,
              store_1p=True,
              get_file_hashes=False,
              hash_3p_only=False):
        """
		this is the primary function of this class,
		
		it takes the url of the given page and the request and cookie data generated
			by the browser

		data is cleaned up with some minor analysis (eg file types) and stored 
			for later in-depth analysis.
		
		there is an option to store first party requests as well as third, turned on by default
			to save disk space turn off store_1p

		there is also an option to get file hashes, this introduces serious overhead
			and is turned off by default
		"""

        # open up a sql connection
        if self.db_engine == 'mysql':
            from webxray.MySQLDriver import MySQLDriver
            sql_driver = MySQLDriver(self.db_name)
        elif self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALED DB ENGINE FOR %s, QUITTING!' % db_engine)
            exit()

        # get the ip, fqdn, domain, pubsuffix, and tld
        # we need the domain to figure out if cookies/elements are third-party
        origin_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
            url)

        # if we can't get page domain info we fail gracefully
        if origin_ip_fqdn_domain_pubsuffix_tld is None:
            sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
            return False

        origin_ip = origin_ip_fqdn_domain_pubsuffix_tld[0]
        origin_fqdn = origin_ip_fqdn_domain_pubsuffix_tld[1]
        origin_domain = origin_ip_fqdn_domain_pubsuffix_tld[2]
        origin_pubsuffix = origin_ip_fqdn_domain_pubsuffix_tld[3]
        origin_tld = origin_ip_fqdn_domain_pubsuffix_tld[4]

        # sql_driver.add_domain both stores the new domain and returns its db row id
        # if it is already in db just return the existing id
        page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn,
                                               origin_domain, origin_pubsuffix,
                                               origin_tld)

        # figure out the privacy policy url and text, starts null
        priv_policy_url = None
        priv_policy_url_text = None

        # read in our list of privacy link terms from the json file in webxray/resources/policyxray
        privacy_policy_term_list = self.utilities.get_privacy_policy_term_list(
        )

        # we reverse links return from browser to check footer links first as that is where policy links tend to be
        all_links = browser_output['all_links']
        all_links.reverse()

        # if we have links search for privacy policy
        if len(all_links) > 0:
            # links are tuple
            for link_text, link_url in all_links:
                # makes sure we have text, skip links without
                if link_text:
                    # need lower for string matching
                    link_text = link_text.lower().strip()
                    # not a link we can use
                    if 'javascript' in link_text: continue
                    # see if the link_text is in our term list
                    if link_text in privacy_policy_term_list:
                        # if the link_url is relative this will convert to absolute
                        priv_policy_url = self.utilities.get_absolute_url_from_page_link(
                            url, link_url)
                        priv_policy_url_text = link_text
                        break

        # if the final page is https (often after a redirect), mark it appropriately
        if browser_output['final_url'][:5] == 'https':
            page_is_ssl = True
        else:
            page_is_ssl = False

        if store_source:
            # handles issue where postgres will crash on inserting null character
            source = browser_output['source'].replace('\x00', ' ')
        else:
            source = None

        # add page
        page_id = sql_driver.add_page(
            browser_output['browser_type'], browser_output['browser_version'],
            browser_output['browser_wait'], browser_output['title'],
            browser_output['meta_desc'], url, browser_output['final_url'],
            priv_policy_url, priv_policy_url_text, page_is_ssl, source,
            browser_output['load_time'], page_domain_id)

        # store cookies
        for cookie in browser_output['cookies']:
            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            # note:
            #	url_parser fails on non-http, we should fix this, right now a lame hack is to prepend http://
            cookie_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                'http://' + cookie['domain'])

            # something went wrong, log and fail gracefully
            if cookie_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url,
                    'Error parsing cookie with domain: ' + cookie['domain'])
                continue

            # otherwise, everything went fine
            cookie_ip = cookie_ip_fqdn_domain_pubsuffix_tld[0]
            cookie_fqdn = cookie_ip_fqdn_domain_pubsuffix_tld[1]
            cookie_domain = cookie_ip_fqdn_domain_pubsuffix_tld[2]
            cookie_pubsuffix = cookie_ip_fqdn_domain_pubsuffix_tld[3]
            cookie_tld = cookie_ip_fqdn_domain_pubsuffix_tld[4]

            # mark third-party cookies
            if origin_domain != cookie_domain:
                is_3p_cookie = True
            else:
                is_3p_cookie = False

            # this is a first party cookie, see if we want to store it
            if is_3p_cookie is False and store_1p is False:
                continue

            # sql_driver.add_domain both stores the new domain and returns its id
            cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn,
                                                     cookie_domain,
                                                     cookie_pubsuffix,
                                                     cookie_tld)

            # name and domain are required, so if they fail we just continue
            try:
                name = cookie['name']
            except:
                continue

            try:
                domain = cookie_domain
            except:
                continue

            # these are optional, fill with null values if fail
            try:
                secure = cookie['secure']
            except:
                secure = None

            try:
                path = cookie['path']
            except:
                path = None

            try:
                httponly = cookie['httponly']
            except:
                httponly = None

            try:
                expiry = cookie['expiry']
            except:
                expiry = None

            try:
                value = cookie['value']
            except:
                value = None

            # all done with this cookie
            sql_driver.add_cookie(page_id, name, secure, path, domain,
                                  httponly, expiry, value, is_3p_cookie,
                                  cookie_domain_id)

        # process requests now
        for request in browser_output['processed_requests']:
            # if the request starts with the following we can't parse anyway, so skip
            if re.match('^(data|about|chrome|blob).+', request):
                continue

            # get the ip, fqdn, domain, pubsuffix, and tld
            # we need the domain to figure out if cookies/elements are third-party
            element_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                request)

            # problem with this request, log and fail gracefully
            if element_ip_fqdn_domain_pubsuffix_tld is None:
                sql_driver.log_error(
                    url, 'Error parsing element request: ' + request)
                continue

            element_ip = element_ip_fqdn_domain_pubsuffix_tld[0]
            element_fqdn = element_ip_fqdn_domain_pubsuffix_tld[1]
            element_domain = element_ip_fqdn_domain_pubsuffix_tld[2]
            element_pubsuffix = element_ip_fqdn_domain_pubsuffix_tld[3]
            element_tld = element_ip_fqdn_domain_pubsuffix_tld[4]

            # sql_driver.add_domain both stores the new domain and returns its db row id
            element_domain_id = sql_driver.add_domain(element_ip, element_fqdn,
                                                      element_domain,
                                                      element_pubsuffix,
                                                      element_tld)

            # mark third-party elements based on domain
            if origin_domain != element_domain:
                is_3p_element = True
            else:
                is_3p_element = False

            # if we are not storing 1p elements continue
            if is_3p_element is False and store_1p is False:
                continue

            if request[:5] == 'https':
                element_is_ssl = True
            else:
                element_is_ssl = False

            try:
                received = browser_output['processed_requests'][request][
                    'received']
            except:
                received = None

            # get domain of referer and determine if page leaked by referer
            try:
                referer = browser_output['processed_requests'][request][
                    'referer']
            except:
                referer = None

            if referer and len(referer) != 0:
                referer_ip_fqdn_domain_pubsuffix_tld = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(
                    referer)

                if referer_ip_fqdn_domain_pubsuffix_tld:
                    if referer_ip_fqdn_domain_pubsuffix_tld[
                            2] == origin_domain:
                        page_domain_in_referer = True
                    else:
                        page_domain_in_referer = False
                else:
                    page_domain_in_referer = None
                    sql_driver.log_error(
                        url, 'Error parsing referer header: ' + referer)
            else:
                page_domain_in_referer = None

            try:
                start_time_offset = browser_output['processed_requests'][
                    request]['start_time_offset']
            except:
                start_time_offset = None

            try:
                load_time = browser_output['processed_requests'][request][
                    'load_time']
            except:
                load_time = None

            try:
                status = browser_output['processed_requests'][request][
                    'status']
            except:
                status = None

            try:
                status_text = browser_output['processed_requests'][request][
                    'status_text']
            except:
                status_text = None

            try:
                content_type = browser_output['processed_requests'][request][
                    'content_type']
            except:
                content_type = None

            try:
                body_size = browser_output['processed_requests'][request][
                    'body_size']
            except:
                body_size = None

            try:
                request_headers = str(browser_output['processed_requests']
                                      [request]['request_headers'])
            except:
                request_headers = None

            try:
                response_headers = str(browser_output['processed_requests']
                                       [request]['response_headers'])
            except:
                response_headers = None

            # consider anything before the "?" to be the element_url
            try:
                element_url = re.search('^(.+?)\?.+$', request).group(1)
            except:
                element_url = request

            # consider anything after the "?" to be the args
            try:
                element_args = re.search('^.+(\?.+)$',
                                         request).group(1)  # start url args
            except:
                element_args = None

            # attempt to parse off the extension
            try:
                element_extension = re.search('\.([0-9A-Za-z]+)$',
                                              element_url).group(1).lower()
            except:
                element_extension = None

            # lists of common extensions, can be expanded
            image_extensions = [
                'png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif',
                'tiff', 'webp', 'srf'
            ]
            script_extensions = ['js', 'javascript']
            data_extensions = ['json', 'jsonp', 'xml']
            font_extentions = ['woff', 'ttf', 'otf']
            static_extentions = ['html', 'htm', 'shtml']
            dynamic_extentions = [
                'php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi'
            ]

            # figure out what type of element it is
            if element_extension in image_extensions:
                element_type = 'image'
            elif element_extension in script_extensions:
                element_type = 'javascript'
            elif element_extension in data_extensions:
                element_type = 'data_structured'
            elif element_extension == 'css':
                element_type = 'style_sheet'
            elif element_extension in font_extentions:
                element_type = 'font'
            elif element_extension in static_extentions:
                element_type = 'page_static'
            elif element_extension == dynamic_extentions:
                element_type = 'page_dynamic'
            elif element_extension == 'swf' or element_extension == 'fla':
                element_type = 'Shockwave Flash'
            else:
                element_type = None

            # file hashing has non-trivial overhead and off by default
            #
            # what this does is uses the same ua/referer as the actual request
            # 	so we are just replaying the last one to get similar response
            # 	note that we aren't sending the same cookies so that could be an issue
            # 	otherwise it is equivalent to a page refresh in theory

            # option to hash only 3p elements observed here
            if (get_file_hashes and hash_3p_only
                    and is_3p_element) or (get_file_hashes
                                           and hash_3p_only == False):
                replay_element_request = urllib.request.Request(
                    request,
                    headers={
                        'User-Agent':
                        browser_output['processed_requests'][request]
                        ['user_agent'],
                        'Referer':
                        referer,
                        'Accept':
                        '*/*'
                    })
                try:
                    file_md5 = hashlib.md5(
                        urllib.request.urlopen(replay_element_request,
                                               timeout=10).read()).hexdigest()
                except:
                    file_md5 = None
            else:
                file_md5 = None

            # store request
            sql_driver.add_element(
                page_id, request, element_url, is_3p_element, element_is_ssl,
                received, referer, page_domain_in_referer, start_time_offset,
                load_time, status, status_text, content_type, body_size,
                request_headers, response_headers, file_md5, element_extension,
                element_type, element_args, element_domain_id)

        # close db connection
        sql_driver.close()

        return True
Exemple #12
0
 def __init__(self, dbname):
     """Set up the helper objects this instance keeps as attributes.

     Args:
         dbname: forwarded unchanged to ``MySQLDriver``; presumably the
             name of the database to work against -- TODO confirm
             against the MySQLDriver constructor.
     """
     # uri parsing helper, built with no arguments
     self.uri_parser = ParseURI()
     # db driver built from the caller-supplied dbname
     self.sql_driver = MySQLDriver(dbname)