Ejemplo n.º 1
0
    def __init__(self, db_name=None, db_engine=None):
        """
        Optionally attach a database driver, then create the URL parser.

        A driver is only created when db_name or db_engine is supplied. When
        db_name is given it is passed to the driver constructor; otherwise
        the driver is constructed without arguments. Any engine other than
        'sqlite' or 'postgres' prints an error and quits.
        """
        # with no db params at all we don't bother with a db connection
        if db_name or db_engine:
            # only hand a database name to the driver when one was supplied
            driver_args = (db_name,) if db_name else ()

            # drivers are imported inside the branch that uses them
            if db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                self.sql_driver = SQLiteDriver(*driver_args)
            elif db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                self.sql_driver = PostgreSQLDriver(*driver_args)
            else:
                print('Utilities.py: INVALID DB ENGINE FOR %s, QUITTING!' %
                      db_engine)
                quit()

        self.url_parser = ParseURL()
Ejemplo n.º 2
0
	def __init__(self):
		"""
		Build the domain ownership lookup tables from the bundled JSON file.

		Reads resources/domain_owners/domain_owners.json (resolved relative to
		this module) and fills three dictionaries:
			domain_owners: domain -> owner id
			id_to_owner: owner id -> owner name
			id_to_parent: owner id -> parent id
		"""
		self.url_parser = ParseURL()
		self.domain_owners = {}
		self.id_to_owner = {}
		self.id_to_parent = {}

		# the data file is located relative to this module
		domain_owners_path = os.path.dirname(os.path.abspath(__file__))+'/resources/domain_owners/domain_owners.json'

		# use a context manager so the file handle is closed once parsed
		with open(domain_owners_path, 'r', encoding='utf-8') as domain_owners_file:
			domain_owner_data = json.load(domain_owners_file)

		# set up the domain ownership dictionary
		for item in domain_owner_data:
			# entries whose id is '-' are skipped
			if item['id'] == '-': continue

			self.id_to_owner[item['id']] = item['name']
			self.id_to_parent[item['id']] = item['parent_id']
			for domain in item['domains']:
				self.domain_owners[domain] = item['id']
Ejemplo n.º 3
0
	def __init__(self, db_name, db_engine):
		"""
		Store the database name, create the helper objects, attach the SQL
		driver for the requested engine and read the config through it.

		db_engine must be 'sqlite' or 'postgres'; any other value prints an
		error and quits.
		"""
		self.db_name = db_name
		self.utilities = Utilities()
		self.url_parser = ParseURL()
		self.debug = False

		# pick the driver class first; each driver is imported only in the
		#	branch that selects it
		if db_engine == 'sqlite':
			from webxray.SQLiteDriver import SQLiteDriver as driver_class
		elif db_engine == 'postgres':
			from webxray.PostgreSQLDriver import PostgreSQLDriver as driver_class
		else:
			print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
			quit()

		# both drivers are constructed the same way, with the db name
		self.sql_driver = driver_class(self.db_name)

		# config is fetched via the driver's get_config()
		self.config = self.sql_driver.get_config()
Ejemplo n.º 4
0
    def __init__(self, browser_type):
        """
        Store the browser type and build the domain ownership lookup tables.

        Reads resources/domain_owners/domain_owners.json (resolved relative
        to this module) and fills three dictionaries:
            domain_owners: domain -> owner id
            id_to_owner: owner id -> owner name
            id_to_parent: owner id -> parent id
        """
        self.url_parser = ParseURL()
        self.browser_type = browser_type
        self.domain_owners = {}
        self.id_to_owner = {}
        self.id_to_parent = {}

        # the data file is located relative to this module
        domain_owners_path = (
            os.path.dirname(os.path.abspath(__file__)) +
            '/resources/domain_owners/domain_owners.json')

        # a context manager closes the file handle once parsing is done;
        # the explicit encoding avoids depending on the platform default
        with open(domain_owners_path, 'r', encoding='utf-8') as domain_owners_file:
            domain_owner_data = json.load(domain_owners_file)

        # set up the domain ownership dictionary
        for item in domain_owner_data:
            self.id_to_owner[item['id']] = item['owner_name']
            self.id_to_parent[item['id']] = item['parent_id']
            for domain in item['domains']:
                self.domain_owners[domain] = item['id']
Ejemplo n.º 5
0
	def __init__(self, db_engine, db_name):
		"""
		Remember the database settings and create the helper objects.

		db_engine and db_name are stored as given.
		"""
		# database settings, kept for later use
		self.db_engine, self.db_name = db_engine, db_name

		# helper objects
		self.utilities = Utilities()
		self.url_parser = ParseURL()
Ejemplo n.º 6
0
	def __init__(self, config, port_offset=1, chrome_path=None, headless=True):
		"""
		Launch Chrome with remote debugging enabled and connect to it over
		the devtools websocket.

		Args:
			config: dict providing the 'client_*' settings unpacked below.
			port_offset: added to 9222 to choose the remote debugging port,
				so that separate processes do not collide.
			chrome_path: optional command used to start Chrome; when not
				given, a common per-OS location is used.
			headless: when True Chrome is started with --headless.

		After construction self.launched tells whether the browser was
		started and configured; when it is False the devtools connection
		is not usable.
		"""
		self.debug = False

		# unpack config
		if self.debug: print(config)
		self.prewait = config['client_prewait']
		self.no_event_wait = config['client_no_event_wait']
		self.max_wait = config['client_max_wait']
		self.return_page_text = config['client_get_text']
		self.return_bodies = config['client_get_bodies']
		self.return_bodies_base64 = config['client_get_bodies_b64']
		self.return_screen_shot = config['client_get_screen_shot']
		self.reject_redirects = config['client_reject_redirects']
		self.crawl_depth = config['client_crawl_depth']
		self.crawl_retries = config['client_crawl_retries']
		self.page_load_strategy = config['client_page_load_strategy']
		self.min_internal_links = config['client_min_internal_links']
		self.headless = headless

		# custom library in /webxray
		self.url_parser = ParseURL()

		# prevents get_scan from closing browser
		#	when we are doing a crawl
		self.is_crawl = False

		# gets overwritten once, so we don't have to keep
		#	figuring it out when doing crawls
		self.browser_type = None
		self.browser_version = None
		self.user_agent = None

		# we can override the path here
		if chrome_path:
			# flags are appended directly below, so make sure the command
			#	ends with a separating space like the defaults do
			chrome_cmd = chrome_path if chrome_path.endswith(' ') else chrome_path + ' '
		else:
			# if path is not specified we use the common
			#	paths for each os
			if platform.system() == 'Darwin':
				# the backslashes escape the spaces for the shell
				chrome_cmd = '/Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome '
			elif platform.system() == 'Linux':
				chrome_cmd = '/usr/bin/google-chrome '
			elif platform.system() == 'Windows':
				chrome_cmd = 'start chrome '
			else:
				print('Unable to determine Operating System and therefore cannot guess correct Chrome path, see ChromeDriver.py for details.')
				exit()

		# use port offset to avoid collisions between processes
		port = 9222+port_offset

		# each process will use its own debugging port or we use default 9222
		chrome_cmd += '--remote-debugging-port=%s' % port

		# sets up blank profile
		chrome_cmd += ' --guest'

		# not sure this really does anything
		chrome_cmd += ' --disable-gpu'

		# disable sandbox to work inside docker
		chrome_cmd += ' --no-sandbox'

		# set up headless
		if self.headless: chrome_cmd += ' --headless'

		# if we're in production discard the subprocess output, None is normal;
		#	subprocess.DEVNULL avoids opening a file handle we would never close
		output_target = None if self.debug else subprocess.DEVNULL

		# run command as subprocess
		if self.debug: print(f'going to run command: "{chrome_cmd}"')
		subprocess.Popen(chrome_cmd,shell=True,stdin=None,stdout=output_target,stderr=output_target,close_fds=True)

		# allow browser to launch
		time.sleep(5)

		# the debugger address has a 'json' path where we can find the websocket
		#	address which is how we send devtools commands, thus we extract the value
		#	"webSocketDebuggerUrl" from the first json object
		try:
			debuggerAddress_json = json.loads(urllib.request.urlopen('http://localhost:%s/json' % port).read().decode())
			if self.debug: print(debuggerAddress_json)
			webSocketDebuggerUrl = debuggerAddress_json[0]['webSocketDebuggerUrl']
			self.launched = True
		except Exception as e:
			# best effort: any failure here is reported through self.launched
			if self.debug: print(f'unable to get debugger address: {e}')
			self.launched = False
			return

		# third, once we have the websocket address we open a connection
		#	and we are (finally) able to communicate with chrome via devtools!
		# note this connection must be closed!
		self.devtools_connection = create_connection(webSocketDebuggerUrl)

		# important, makes sure we don't get stuck
		#	waiting for messages to arrive
		self.devtools_connection.settimeout(3)

		# this is incremented globally
		self.current_ws_command_id = 0

		# prevent downloading files, the /dev/null is redundant
		if self.debug: print('going to disable downloading')
		response = self.get_single_ws_response('Page.setDownloadBehavior','"behavior":"deny","downloadPath":"/dev/null"')
		if response['success'] == False:
			# __init__ may only return None (anything else raises TypeError),
			#	so the failure is reported through self.launched instead
			self.exit()
			self.launched = False
			return
		else:
			response = response['result']
		if self.debug: print(f'{response}')

		# done
		return
Ejemplo n.º 7
0
	def __init__(self, db_engine, db_name):
		"""
		Remember the database settings and create the URL parser.

		db_engine and db_name are stored as given.
		"""
		# database settings, kept for later use
		self.db_engine, self.db_name = db_engine, db_name

		# helper used for url parsing
		self.url_parser = ParseURL()