def start_capture(self, pcap_path=None, pcap_filter=""):
    """Start capture. Configure sniffer if arguments are given."""
    if cm.running_in_CI:
        wl_log.debug("CI run: will not run dumpcap")
        return False
    if pcap_filter:
        self.set_capture_filter(pcap_filter)
    if pcap_path:
        self.set_pcap_path(pcap_path)
    # no sudo prefix is needed: CI runs (which cannot setcap) return above
    command = 'dumpcap -P -a duration:{} -a filesize:{} -i eth0 -s 0 -f \'{}\' -w {}'\
        .format(cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                self.pcap_filter, self.pcap_file)
    wl_log.info(command)
    self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, shell=True)
    timeout = DUMPCAP_START_TIMEOUT  # in seconds
    while timeout > 0 and not self.is_dumpcap_running():
        time.sleep(0.1)
        timeout -= 0.1
    if not self.is_dumpcap_running():  # more robust than checking timeout < 0
        raise cm.DumpcapTimeoutError()
    wl_log.debug("dumpcap started in %s seconds" %
                 (DUMPCAP_START_TIMEOUT - timeout))
    self.is_recording = True
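# A minimal, self-contained sketch of the dumpcap invocation assembled by
# start_capture() above. The constants below are stand-ins (illustrative
# assumptions, not the repo's actual cm values); -a duration/filesize are
# dumpcap autostop conditions, -f is the capture filter, -w the output file.
SOFT_VISIT_TIMEOUT = 120  # stop capturing after 120 seconds...
MAX_DUMP_SIZE = 40000     # ...or after 40000 kB, whichever comes first
command = "dumpcap -P -a duration:{} -a filesize:{} -i eth0 -s 0 -f '{}' -w {}"\
    .format(SOFT_VISIT_TIMEOUT, MAX_DUMP_SIZE, "tcp port 443", "/tmp/visit.pcap")
print(command)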
def extract_links(br):
    """Extract FP related links from the current page."""
    links_to_visit_text = list(ut.flatten(
        [br.find_elements_by_partial_link_text(linktext)
         for linktext in LINK_LABELS]))
    links_to_visit_url = list(ut.flatten(
        [br.find_elements_by_xpath('//a[contains(@href,"%s")]' % linkurl)
         for linkurl in LINK_URLS]))
    links_to_visit = [link for link in
                      links_to_visit_text + links_to_visit_url if link]
    if len(links_to_visit) < NO_OF_LINKS_TO_CLICK:
        # if we cannot find enough links by href and link text,
        # fall back to all elements with an onclick event handler
        links_to_visit += extract_onclick_elements(br)
    wl_log.info('%s links were found on %s' %
                (len(links_to_visit), br.current_url))
    return links_to_visit
def crawl_sites(url_tuples, crawler_type, num_crawl_urls=0,
                max_parallel_procs=MAX_PARALLEL_PROCESSES):
    if crawler_type == 'lazy':
        agent_cfg = AGENT_CFG_PHANTOM_MOD_HOME_PAGE
        agent = HeadlessAgent()
    elif crawler_type == 'clicker':
        agent_cfg = AGENT_CFG_PHANTOM_MOD_CLICKER
        agent = HeadlessAgent()
    elif crawler_type == 'chrome_lazy':
        agent_cfg = AGENT_CFG_CHROME_LAZY
        agent = ChromeAgent()
    elif crawler_type == 'chrome_clicker':
        agent_cfg = AGENT_CFG_CHROME_CLICKER
        agent = ChromeAgent()
    elif crawler_type == 'dnt':  # TODO: scripts should take DNT as a parameter
        agent_cfg = AGENT_CFG_DNT_PHANTOM_LAZY
        agent = HeadlessAgent()
    else:  # fail early instead of hitting a NameError below
        raise ValueError('Unknown crawler type: %s' % crawler_type)

    agent.setOptions(agent_cfg)
    cr_job = CrawlJob(agent)
    job_cfg = {
        'desc': "Crawl for browser fingerprint detection",
        'max_parallel_procs': max_parallel_procs,
        'urls': [],
        'url_tuples': url_tuples,
        'num_crawl_urls': num_crawl_urls
    }
    cr_job.setOptions(job_cfg)
    wl_log.info('Will crawl with agent config: %s and job config: %s'
                % (agent_cfg, job_cfg))
    run_crawl(cr_job)
    return cr_job.crawl_id
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # sleep for a while
    try:
        idx, url = url_tuple
        idx = str(idx)
        stdout_log = os.path.join(
            agent_cfg['job_dir'],
            fu.get_out_filename_from_url(url, str(idx), '.txt'))
        if not url[:5] in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url

        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'],
                                        agent_cfg['mitm_proxy_logs']) \
            if agent_cfg['use_mitm_proxy'] else ""

        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical(
                    'Error while visiting %s(%s) w/ command: %s: (%s) %s'
                    % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)

        sleep(2)  # make sure mitmdump has timed out before we process the network dump
        if agent_cfg['post_visit_func']:
            # this pluggable function parses the logs and does whatever we want
            agent_cfg['post_visit_func'](stdout_log,
                                         crawl_id=agent_cfg['crawl_id'])
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
def launch_tor_service(self, logfile='/dev/null'):
    """Launch Tor service and return the process."""
    self.log_file = logfile
    self.tmp_tor_data_dir = ut.clone_dir_with_timestap(
        cm.get_tor_data_path(self.tbb_version))

    self.torrc_dict.update({'DataDirectory': self.tmp_tor_data_dir,
                            'Log': ['INFO file %s' % logfile]})

    wl_log.debug("Tor config: %s" % self.torrc_dict)
    try:
        self.tor_process = stem.process.launch_tor_with_config(
            config=self.torrc_dict,
            init_msg_handler=self.tor_log_handler,
            tor_cmd=cm.get_tor_bin_path(self.tbb_version),
            timeout=270)
        self.controller = Controller.from_port()
        self.controller.authenticate()
        wl_log.info("Tor running at port {0} & controller port {1}.".format(
            cm.SOCKS_PORT, cm.CONTROLLER_PORT))
        return self.tor_process
    except stem.SocketError as exc:
        wl_log.critical("Unable to connect to tor on port %s: %s" %
                        (cm.SOCKS_PORT, exc))
        sys.exit(1)
    except Exception:
        # most of the time this is due to another instance of
        # tor running on the system
        wl_log.critical("Error launching Tor", exc_info=True)
        sys.exit(1)
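# A stand-alone sketch of the stem launch pattern used above, with an
# explicit torrc dict (the ports, paths and timeout here are illustrative
# assumptions; a tor binary must be on the PATH for this to run).
import stem.process

tor_process = stem.process.launch_tor_with_config(
    config={'SocksPort': '9050',
            'ControlPort': '9051',
            'DataDirectory': '/tmp/tor-datadir',
            'Log': ['INFO file /tmp/tor.log']},
    timeout=270)
tor_process.kill()  # clean up the test instance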
def quit(self):
    """Override the base class method, cleaning up the timestamped profile."""
    self.is_running = False
    try:
        wl_log.info("Quit: Removing profile dir")
        shutil.rmtree(self.prof_dir_path)
        super(TorBrowserDriver, self).quit()
    except CannotSendRequest:
        wl_log.error("CannotSendRequest while quitting TorBrowserDriver",
                     exc_info=False)
        # The following is copied from webdriver.firefox.webdriver.quit(),
        # which was interrupted due to an unhandled CannotSendRequest exception.
        self.binary.kill()  # kill the browser
        try:  # remove the profile folder
            shutil.rmtree(self.profile.path)
            if self.profile.tempfolder is not None:
                shutil.rmtree(self.profile.tempfolder)
        except Exception as e:
            print(str(e))
    except Exception:
        wl_log.error("Exception while quitting TorBrowserDriver",
                     exc_info=True)
def start_capture(self, pcap_path=None, pcap_filter=""):
    """Start capture. Configure sniffer if arguments are given."""
    if pcap_filter:
        self.set_capture_filter(pcap_filter)
    if pcap_path:
        self.set_pcap_path(pcap_path)
    prefix = ""
    # capture on self.netif instead of the hard-coded eth0 (e.g. the WLAN
    # interface for local tests); TODO: move the interface name into a
    # config file
    command = '{}dumpcap -P -a duration:{} -a filesize:{} -i {} -s 0 -f \"{}\" -w {}'\
        .format(prefix, cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE, self.netif,
                self.pcap_filter, self.pcap_file)
    wl_log.info(command)
    self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, shell=True)
    timeout = DUMPCAP_START_TIMEOUT  # in seconds
    while timeout > 0 and not self.is_dumpcap_running():
        time.sleep(0.1)
        timeout -= 0.1
    if not self.is_dumpcap_running():  # more robust than checking timeout < 0
        raise DumpcapTimeoutError()
    wl_log.debug("dumpcap started in %s seconds" %
                 (DUMPCAP_START_TIMEOUT - timeout))
    self.is_recording = True
def __do_instance(self):
    for self.job.visit in xrange(self.job.visits):
        ut.create_dir(self.job.path)
        wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
        with self.driver.launch():
            try:
                self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
            except WebDriverException as seto_exc:
                wl_log.error("Setting soft timeout %s", seto_exc)
            self.__do_visit()
            if self.screenshots:
                try:
                    self.driver.get_screenshot_as_file(self.job.png_file)
                except WebDriverException:
                    wl_log.error("Cannot get screenshot.")
            if self.har_export:
                try:
                    # requires the HAR Export Trigger extension in the browser
                    jscript = "return HAR.triggerExport().then(harLog => {return harLog;});"
                    har_string = self.driver.execute_script(jscript)
                    with open(self.job.har_file, 'w') as fd:
                        json.dump(har_string, fd)
                except WebDriverException:
                    wl_log.error("Cannot export HAR.")
        sleep(float(self.job.config['pause_between_visits']))
        self.post_visit()
def start_capture(self, pcap_path=None, pcap_filter=""):
    """Start capture. Configure sniffer if arguments are given."""
    if cm.running_in_CI:
        wl_log.debug("CI run: will not run dumpcap")
        return False
    if pcap_filter:
        self.set_capture_filter(pcap_filter)
    if pcap_path:
        self.set_pcap_path(pcap_path)
    # no sudo prefix is needed: CI runs (which cannot setcap) return above
    command = 'dumpcap -a duration:{} -a filesize:{} -i any -s 0 -f \'{}\' -w {}'\
        .format(cm.SOFT_VISIT_TIMEOUT, cm.MAX_DUMP_SIZE,
                self.pcap_filter, self.pcap_file)
    wl_log.info(command)
    self.p0 = subprocess.Popen(command, stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE, shell=True)
    timeout = DUMPCAP_START_TIMEOUT  # in seconds
    while timeout > 0 and not self.is_dumpcap_running():
        time.sleep(0.1)
        timeout -= 0.1
    if not self.is_dumpcap_running():  # more robust than checking timeout < 0
        raise cm.DumpcapTimeoutError()
    wl_log.debug("dumpcap started in %s seconds" %
                 (DUMPCAP_START_TIMEOUT - timeout))
    self.is_recording = True
def close_index_html(index_file):
    # wl_log.info('Will close %s' % index_file)
    # TODO: add a check to avoid closing a file twice
    if not os.path.isfile(index_file):
        fu.write_to_file(index_file, '')  # create an empty file
    index_src = fu.read_file(index_file)
    if index_src.startswith('<html'):
        wl_log.info('Index file %s already closed' % index_file)
        return

    scripts_src = """<script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery-1.9.1.min.js"></script>
        <style type="text/css" title="currentStyle">
            @import "../../js/css/demo_page.css";
            @import "../../js/css/demo_table.css";
        </style>
        <script type="text/javascript" language="javascript" src="http://homes.esat.kuleuven.be/~gacar/jscss/jquery.dataTables.min.js"></script>
        <script type="text/javascript" charset="utf-8">
            $(document).ready(function() {
                $('#results').dataTable( {
                    "aaSorting": [[ 2, "desc" ]]
                } );
            } );
        </script>"""

    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" \
        + scripts_src + "</head>\n<body><table id='results'>" \
        + "\n<thead><tr><th>Rank</th><th>Domain</th><th>Fonts</th><th>OffsetWidth</th>" \
        + "<th>OffsetHeight</th><th>FP found</th></tr></thead>" \
        + index_src + '</table></body></html>'
    fu.write_to_file(index_file, html_str)
def save_checkpoint(self):
    fname = join(cm.CRAWL_DIR, "job.chkpt")
    if isfile(fname):
        remove(fname)
    with open(fname, "w") as f:
        pickle.dump(self.job, f)
    wl_log.info("New checkpoint at %s" % fname)
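# A hedged counterpart sketch for restoring the checkpoint written by
# save_checkpoint(); this loader is not part of the original code, and the
# "job.chkpt" filename is taken from the function above.
import pickle
from os.path import isfile, join

def load_checkpoint(crawl_dir):
    """Return the pickled job, or None if no checkpoint exists."""
    fname = join(crawl_dir, "job.chkpt")
    if not isfile(fname):
        return None
    with open(fname, "rb") as f:
        return pickle.load(f)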
def generate_index_file(path):
    table_str = '<table><th>Rank</th><th>Domain</th><th># fonts requested</th>'
    fonts_dict = {}
    i = 0
    for json_file in fu.gen_find_files("*.json", path):
        i += 1
        wl_log.info("%s - %s" % (i, json_file))
        domaInfo = load_domainfo_from_json_file(json_file)
        if domaInfo.num_font_loads > FONT_LOAD_THRESHOLD or domaInfo.fp_detected:
            fonts_dict[domaInfo.log_filename] = domaInfo.num_font_loads

    sorted_font_dict = sorted(fonts_dict.iteritems(),
                              key=operator.itemgetter(1), reverse=True)
    for filename, num_font_loaded in sorted_font_dict:
        # if num_font_loaded > FONT_LOAD_THRESHOLD:
        rank, domain = get_rank_domain_from_filename(filename)
        output_filename = os.path.basename(filename)[:-4] + ".html"
        table_str += '<tr><td>' + rank + '</td><td><a href="' + output_filename \
            + '">' + domain + '</a></td><td>' + str(num_font_loaded) + '</td></tr>'

    table_str += '</table>'
    html_str = "<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8' />" \
        "</head><body>" + table_str + "</body></html>"
    index_filename = os.path.join(path, "index.html")
    fu.write_to_file(index_filename, html_str.encode('utf-8'))
def cleanup_visit(self):
    """Kill sniffer and Tor browser if they're running."""
    wl_log.info("Cleaning up visit.")
    wl_log.info("Cancelling timeout")
    ut.cancel_timeout()

    if self.sniffer and self.sniffer.is_recording:
        wl_log.info("Stopping sniffer...")
        self.sniffer.stop_capture()
    # remove non-tor traffic
    self.filter_guards_from_pcap()

    if self.tb_driver and self.tb_driver.is_running:
        # shutil.rmtree(self.tb_driver.prof_dir_path)
        wl_log.info("Quitting selenium driver...")
        self.tb_driver.quit()

    # close all open streams to prevent pollution
    self.tor_controller.close_all_streams()
    if self.xvfb and not cm.running_in_CI:
        wl_log.info("Stopping display...")
        self.vdisplay.stop()

    # after closing the driver and stopping the sniffer, run post-crawl
    self.post_crawl()
def __init__(self, batch_num, site_num, instance_num, page_url, base_dir,
             tor_controller, bg_site=None,
             experiment=cm.EXP_TYPE_WANG_AND_GOLDBERG, xvfb=False,
             capture_screen=True):
    self.batch_num = batch_num
    self.site_num = site_num
    self.instance_num = instance_num
    self.page_url = page_url
    self.bg_site = bg_site
    self.experiment = experiment
    self.base_dir = base_dir
    self.visit_dir = None
    self.visit_log_dir = None
    self.tbb_version = cm.RECOMMENDED_TBB_VERSION
    self.capture_screen = capture_screen
    self.tor_controller = tor_controller
    self.xvfb = xvfb
    self.init_visit_dir()
    self.pcap_path = os.path.join(
        self.visit_dir, "{}.pcap".format(self.get_instance_name()))

    if self.xvfb and not cm.running_in_CI:
        wl_log.info("Starting Xvfb %sx%s" % (cm.XVFB_W, cm.XVFB_H))
        self.vdisplay = Xvfb(width=cm.XVFB_W, height=cm.XVFB_H)
        self.vdisplay.start()

    # Create a new instance of TorBrowserDriver
    TorBrowserDriver.add_exception(self.page_url)
    self.tb_driver = TorBrowserDriver(
        tbb_path=cm.TBB_PATH,
        tbb_logfile_path=join(self.visit_dir, "logs", "firefox.log"))
    self.sniffer = Sniffer()  # sniffer to capture the network traffic
def stop_crawl(self, pack_results=True):
    """Cleans up crawl and kills tor process in case it's running."""
    wl_log.info("Stopping crawl...")
    if self.visit:
        self.visit.cleanup_visit()
    self.tor_controller.kill_tor_proc()
    if pack_results:
        ut.pack_crawl_data(self.crawl_dir)
def import_gpg_key(key_fp):
    """Import GPG key with the given fingerprint."""
    wl_log.info("Will import the GPG key %s" % key_fp)
    # https://www.torproject.org/docs/verifying-signatures.html.en
    ret_code = subprocess.Popen(['/usr/bin/gpg', '--keyserver',
                                 'x-hkp://pool.sks-keyservers.net',
                                 '--recv-keys', key_fp]).wait()
    return ret_code == 0
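# A hedged follow-up sketch: once the signing key is imported, a detached
# signature can be verified the same way (this helper is an assumption,
# not part of the original module; paths are illustrative).
import subprocess

def verify_signature(sig_path, file_path):
    """Return True if gpg accepts the detached signature over file_path."""
    ret_code = subprocess.Popen(['/usr/bin/gpg', '--verify',
                                 sig_path, file_path]).wait()
    return ret_code == 0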
def kill_tor_proc(self):
    """Kill Tor process."""
    if self.tor_process:
        wl_log.info("Killing tor process")
        self.tor_process.kill()
    if self.tmp_tor_data_dir and os.path.isdir(self.tmp_tor_data_dir):
        wl_log.info("Removing tmp tor data dir")
        shutil.rmtree(self.tmp_tor_data_dir)
def get_occurence_vector_from_swf(swf_filename, out_dir=''):
    cum_pattern = [0] * len(FP_ACTIONSCRIPT_STR_LIST)
    for src_file in gen_decompile_swf(swf_filename, out_dir):
        vector = fu.file_occurence_vector(src_file, FP_ACTIONSCRIPT_STR_LIST)
        cum_pattern = [x + y for (x, y) in zip(cum_pattern, vector)]
    wl_log.info("Cum Vector for %s %s" %
                (swf_filename[len(out_dir):],
                 human_readable_occ_vector(cum_pattern)))
    return cum_pattern
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    referer = msg.request.headers['Referer'][0] \
        if msg.request.headers['Referer'] else ""
    if msg.response and msg.response.content:
        wl_log.debug(msg.request.get_url())
        # the magic-number check is too broad, but the decompiler will
        # discard non-SWF files
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 for SWFs we have seen before
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
    elif '.swf' in msg.request.path:
        content_head = msg.response.content[:100] \
            if (msg.response and msg.response.content) else ''
        wl_log.warning(".swf in path but content seems non-swf %s %s" %
                       (msg.request.path, content_head))
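# ut.hash_text is not shown in this section; a plausible stand-in (an
# assumption, not necessarily the repo's implementation) derives the dedup
# key used above by hashing the raw SWF bytes.
import hashlib

def hash_text(text):
    """Hex digest of the given data (a Python 2 str; use bytes on Python 3)."""
    return hashlib.sha1(text).hexdigest()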
def _new_identity(self):
    wl_log.info("Creating a new identity...")
    try:
        ActionChains(self.driver).send_keys(
            Keys.CONTROL + Keys.SHIFT + 'U').perform()
    except WebDriverException:
        pass
    except Exception:
        wl_log.exception("Exception while creating new identity.")
def post_visit(self):
    guard_ips = set(self.controller.get_all_guard_ips())
    wl_log.debug("Found %s guards in the consensus.", len(guard_ips))
    wl_log.info("Filtering packets without a guard IP.")
    try:
        ut.filter_pcap(self.job.pcap_file, guard_ips)
    except Exception as e:
        wl_log.error("ERROR: filtering pcap file: %s.", e)
        wl_log.error("Check pcap: %s", self.job.pcap_file)
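# ut.filter_pcap is not shown here; a minimal scapy-based sketch of the
# same idea, keeping only packets to or from the guard IPs (an assumption:
# the repo may well use dpkt or tshark instead of scapy).
from scapy.all import rdpcap, wrpcap, IP

def filter_pcap(pcap_path, keep_ips):
    packets = rdpcap(pcap_path)
    kept = [p for p in packets if IP in p and
            (p[IP].src in keep_ips or p[IP].dst in keep_ips)]
    wrpcap(pcap_path, kept)  # overwrite with the filtered capture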
def init_mitmproxy(basename, timeout, logging):
    port = pid = None  # so a failed launch returns "" instead of raising NameError
    try:
        # run a mitmdump process with a timeout of timeout+1 seconds
        port, pid = run_mitmdump(basename, timeout + 1, logging)
    except Exception:
        wl_log.critical('Exception initializing mitmdump', exc_info=True)
    else:
        wl_log.info('mitmdump will listen on port %s, pid %s' % (port, pid))
    return "127.0.0.1:%s " % port if port and pid else ""
def filter_packets_without_guard_ip(self):
    guard_ips = set(self.controller.get_all_guard_ips())
    wl_log.info("Found %s guards in the consensus.", len(guard_ips))
    wl_log.info("Filtering packets without a guard IP.")
    try:
        ut.filter_tshark(self.job.tshark_file, guard_ips)
    except Exception as e:
        wl_log.error("ERROR: filtering tshark log: %s.", e)
        wl_log.error("Check tshark log: %s", self.job.tshark_file)
def stop_capture(self):
    ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
    os.kill(self.p0.pid, signal.SIGINT)
    # self.p0.kill()
    if os.path.isfile(self.logpath):
        wl_log.info('Onionperf killed. Capture size: %s Bytes %s' %
                    (os.path.getsize(self.logpath), self.logpath))
    else:
        wl_log.warning('Onionperf killed but cannot find capture file: %s'
                       % self.logpath)
def stop_capture(self):
    """Kill the dumpcap process."""
    ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
    self.p0.kill()
    self.is_recording = False
    if os.path.isfile(self.pcap_file):
        wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' %
                    (os.path.getsize(self.pcap_file), self.pcap_file))
    else:
        wl_log.warning('Dumpcap killed but cannot find capture file: %s'
                       % self.pcap_file)
def write_decomp_log(swf_path, ffdec_status, ffdec_output,
                     flare_status, flare_output):
    TIME_OUT_EXIT_CODE = -1  # return code for the Linux timeout command
    log_str = 'Decompile: %s || ' % os.path.basename(swf_path)
    log_str += 'ffdec: %d' % ffdec_status
    if ffdec_status:
        log_str += ' timeout' if ffdec_status == TIME_OUT_EXIT_CODE else ' error'
    else:
        log_str += ' OK'
    wl_log.info(log_str)
def take_screenshot(self):
    try:
        out_png = os.path.join(self.visit_dir, 'screenshot.png')
        wl_log.info("Taking screenshot of %s to %s" % (self.page_url, out_png))
        self.tb_driver.get_screenshot_as_file(out_png)
        if cm.running_in_CI:
            wl_log.debug("Screenshot data:image/png;base64,%s" %
                         self.tb_driver.get_screenshot_as_base64())
    except Exception:
        wl_log.error("Exception while taking screenshot of: %s" % self.page_url)
def _mark_failed(self):
    url = self.job.url
    if url in self.job.batch_failed and not self.job.batch_failed[url]:
        self.job.batch_failed[url] = True
        wl_log.info("Visit to %s failed in %s different batches: "
                    "won't visit the url again" % (url, MAX_BATCH_FAILED))
    else:
        self.job.batch_failed[url] = False
        wl_log.info("Visit to %s failed %s times within this batch, "
                    "will skip it for the rest of the batch" % (url, MAX_FAILED))
def download_tbb_tarball(tbb_ver, dl_dir=""):
    tbb_url = get_url_by_tbb_ver(tbb_ver)
    base_dir = dl_dir if dl_dir else cm.TBB_BASE_DIR
    tarball_path = os.path.join(base_dir, get_tbb_filename(tbb_ver))
    if not os.path.isfile(tarball_path):
        wl_log.info("Will download %s to %s" % (tbb_url, tarball_path))
        ut.download_file(tbb_url, tarball_path)
        ut.extract_tbb_tarball(tarball_path)
    if verify_tbb_tarball(tbb_ver, tarball_path, tbb_url):
        return tarball_path
    # we cannot verify the integrity of the downloaded tarball
    raise cm.TBBTarballVerificationError("Cannot verify the integrity of %s"
                                         % tarball_path)
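# Illustration of the tarball naming and URL layout these helpers rely on.
# The version and locale below are examples following the standard
# dist.torproject.org scheme; get_tbb_filename/get_url_by_tbb_ver may
# differ in detail.
tbb_ver = "7.0.11"
tbb_filename = "tor-browser-linux64-%s_en-US.tar.xz" % tbb_ver
tbb_url = "https://dist.torproject.org/torbrowser/%s/%s" % (tbb_ver, tbb_filename)
print(tbb_url)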
def get_and_sleep(br, page_url):
    """Load page and sleep for a while."""
    try:
        start_time = time()
        br.get(page_url)
        elapsed_time = time() - start_time
        wl_log.info("Page %s loaded in %s sec" % (page_url, elapsed_time))
    except Exception as exc:
        wl_log.info('Error loading page %s %s' % (page_url, exc))
        br.quit()
    else:
        # wl_log.info('Will sleep after reload %s' % page_url)
        br.execute_script("window.onbeforeunload = function(e){};")
        sleep(SLEEP_AFTER_PAGE_LOAD)
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                # the worker function takes care of db insertion, logging etc.
                worker(msg, crawl_id)
        except flow.FlowReadError:
            pass
            # wl_log.critical("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    doma_info.rank = int(os.path.basename(dumpfile).split('-')[0]) \
        if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

    # parse the crawl log, insert js info into the db
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION
    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id,
                                      db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)
    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % dumpfile)
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
def click_crawler(br, page_url):
    num_clicked_links = 0
    get_and_sleep(br, page_url)
    wl_log.info("Click crawler will click on %s" % page_url)
    for label in LINK_LABELS:
        num_clicked_links += click_to_xpath_selector(
            br, page_url, '//a[contains(text(), "' + label + '")]')
        # num_clicked_links += click_by_link_text(br, page_url, label)
    for url_string in LINK_URLS:
        num_clicked_links += click_to_xpath_selector(
            br, page_url, '//a[contains(@href,"' + url_string + '")]')
    wl_log.info("Clicked %s links, will click %s more" %
                (num_clicked_links, MAX_LINKS_TO_CLICK - num_clicked_links))
    # xrange is half-open; +1 so we really click the remaining number of links
    for i in xrange(1, MAX_LINKS_TO_CLICK - num_clicked_links + 1):
        click_to_xpath_selector(br, page_url,
                                "(//a|//*[@onclick])[position()=%s]" % i)
def _do_instance(self):
    with Onionperf(self.job.onionperf_log):
        with TShark(device=self.device, path=self.job.pcap_file,
                    filter=cm.DEFAULT_FILTER):
            sleep(1)  # make sure the sniffer is running
            with ut.timeout(cm.HARD_VISIT_TIMEOUT):
                wl_log.info("Visiting: %s" % self.job.url)
                self.driver.get(self._get_url())
                self.page_source = self.driver.page_source.encode(
                    'utf-8').strip().lower()
                self.save_page_source()
                self.check_conn_error()
                self.check_captcha()
                sleep(float(self.job.config['pause_in_site']))  # TODO
def stop_capture(self):
    """Kill the dumpcap process."""
    ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
    self.p0.kill()
    self.is_recording = False
    captcha_filepath = ut.capture_filepath_to_captcha(self.pcap_file)
    if os.path.isfile(self.pcap_file):
        wl_log.info('Sniffer killed. Capture size: %s Bytes %s' %
                    (os.path.getsize(self.pcap_file), self.pcap_file))
    elif os.path.isfile(captcha_filepath):
        wl_log.info('Sniffer killed, file renamed to captcha_*. '
                    'Capture size: %s Bytes %s' %
                    (os.path.getsize(captcha_filepath), captcha_filepath))
    else:
        wl_log.warning('Sniffer killed but cannot find capture file: %s or %s'
                       % (self.pcap_file, captcha_filepath))
def click_to_xpath_selector(br, page_url, selector):
    # wl_log.info('Will find els by selector %s on %s' % (selector, page_url))
    els = br.find_elements_by_xpath(selector)
    for el in els:
        if is_clickable(el, page_url):
            href = el.get_attribute('href') or "?"
            try:
                el.click()
            except Exception as es:
                wl_log.warning('Exception while clicking: href: %s %s %s' %
                               (href, es, page_url))
            else:
                wl_log.info('Clicked!: href: %s %s %s' %
                            (href, selector, page_url))
                sleep(WAIT_AFTER_CLICK)
                get_and_sleep(br, page_url)
                return 1
    # wl_log.debug('No clickable element found for: %s %s' % (selector, page_url))
    return 0  # we couldn't find any element to click
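# is_clickable() is referenced above but not defined in this section; a
# plausible sketch (an assumption, not the repo's implementation) treats
# visible, enabled elements as clickable.
def is_clickable(el, page_url):
    # page_url could additionally be used to skip links leaving the site
    try:
        return el.is_displayed() and el.is_enabled()
    except Exception:
        return False  # stale or detached element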
def __do_instance(self):
    for self.job.visit in xrange(self.job.visits):
        ut.create_dir(self.job.path)
        wl_log.info("*** Visit #%s to %s ***", self.job.visit, self.job.url)
        with self.driver.launch():
            try:
                self.driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
            except WebDriverException as seto_exc:
                wl_log.error("Setting soft timeout %s", seto_exc)
            self.__do_visit()
            if self.screenshots:
                try:
                    self.driver.get_screenshot_as_file(self.job.png_file)
                except WebDriverException:
                    wl_log.error("Cannot get screenshot.")
        sleep(float(self.job.config['pause_between_visits']))
        self.post_visit()
def run_mitmdump(basename, timeout, logging=False):
    """Run mitmdump as a subprocess in the background with a timeout."""
    port = get_free_port()
    if not port:  # we cannot get a free port
        return None, None
    dump_file = "%s.dmp" % basename
    cmd_re_dir = ''  # for redirecting stderr to stdout and teeing
    quiet_option = '-q'  # mitmdump option to be quiet - no log
    if logging:
        mitm_log_file = "%s.%s" % (basename, MITM_LOG_EXTENSION)
        cmd_re_dir = ' 2>&1 |tee %s' % mitm_log_file  # redirect all output to the log file
        quiet_option = ''  # we don't want to be quiet!
    # -z: try to convince servers to send us uncompressed data;
    # see mitmdump -h | grep "\-z" for info
    cmd = 'timeout %s mitmdump %s -z --anticache -p %s -w %s %s' % \
        (timeout, quiet_option, port, dump_file, cmd_re_dir)
    wl_log.info('mitmdump cmd %s' % cmd)
    subp = subprocess.Popen(cmd, shell=True)  # shell=True - must be careful
    return port, subp.pid
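# get_free_port() is used above but not shown; a common implementation
# (an assumption, not necessarily the repo's) binds an ephemeral socket
# and returns the OS-assigned port. Note the small race window between
# closing the socket and mitmdump binding the port.
import socket

def get_free_port():
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind(('127.0.0.1', 0))
        return s.getsockname()[1]
    except socket.error:
        return None
    finally:
        s.close()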
def crawl_url(crawler_type, page_url, proxy_opt):
    if 'clicker' in crawler_type:
        worker = click_crawler
    else:
        worker = lazy_crawler

    br = init_browser('chrome',
                      ['--allow-running-insecure-content',
                       '--ignore-certificate-errors',
                       '--disk-cache-size=0',
                       '--enable-logging', '--v=1',
                       "--proxy-server=%s" % proxy_opt])
    if not page_url.startswith('http') and not page_url.startswith('file:'):
        page_url = 'http://' + page_url

    wl_log.info('***Will crawl %s***' % page_url)
    try:
        ut.timeout(CRAWLER_CLICKER_VISIT_TIMEOUT)
        worker(br, page_url)  # run the worker function
    except ut.TimeExceededError as texc:
        wl_log.critical('***CRAWLER_CLICKER_VISIT_TIMEOUT at %s (%s)' %
                        (page_url, texc))
    finally:
        br.quit()
def crawl(self, job):
    """Crawls a set of urls in batches."""
    self.job = job
    wl_log.info("Starting new crawl")
    wl_log.info(pformat(self.job))
    for self.job.batch in xrange(self.job.batches):
        wl_log.info("*** Starting batch %s ***" % self.job.batch)
        self.__do_batch()
        sleep(float(self.job.config['pause_between_batches']))
def get_multitab(self):
    """Open two tabs, use one to load a background site and the other
    to load the real site."""
    PAUSE_BETWEEN_TAB_OPENINGS = 0.5
    ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to kill running procs
    # load a blank page - a page is needed to send keys to the browser
    self.tb_driver.get(BAREBONE_HOME_PAGE)
    self.sniffer.start_capture(
        self.pcap_path,
        'tcp and not host %s and not tcp port 22 and not tcp port 20'
        % LOCALHOST_IP)
    time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
    try:
        self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
    except Exception:
        wl_log.info("Exception setting a timeout {}".format(self.page_url))

    wl_log.info("Crawling URL: {} with {} in the background".
                format(self.page_url, self.bg_site))

    body = self.tb_driver.find_element_by_tag_name("body")
    body.send_keys(Keys.CONTROL + 't')  # open a new tab
    # now that the focus is on the address bar, load the background
    # site by "typing" it to the address bar and "pressing" ENTER (\n),
    # simulated by the send_keys function
    body.send_keys('%s\n' % self.bg_site)

    # the delay between the loading of background and real sites
    time.sleep(PAUSE_BETWEEN_TAB_OPENINGS)

    body = self.tb_driver.find_element_by_tag_name("body")
    body.send_keys(Keys.CONTROL + 't')  # open a new tab
    t1 = time.time()
    self.tb_driver.get(self.page_url)  # load the real site in the 2nd tab
    page_load_time = time.time() - t1
    wl_log.info("{} loaded in {} sec".format(self.page_url, page_load_time))
    time.sleep(cm.WAIT_IN_SITE)
    if self.capture_screen:
        self.take_screenshot()
    self.cleanup_visit()
def get_wang_and_goldberg(self):
    """Visit the site according to Wang and Goldberg (WPES'13) settings."""
    ut.timeout(cm.HARD_VISIT_TIMEOUT)  # set timeout to stop the visit
    self.sniffer.start_capture(
        self.pcap_path,
        'tcp and not host %s and not tcp port 22 and not tcp port 20'
        % LOCALHOST_IP)
    time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
    try:
        self.tb_driver.set_page_load_timeout(cm.SOFT_VISIT_TIMEOUT)
    except Exception:
        wl_log.info("Exception setting a timeout {}".format(self.page_url))

    wl_log.info("Crawling URL: {}".format(self.page_url))
    t1 = time.time()
    self.tb_driver.get(self.page_url)
    page_load_time = time.time() - t1
    wl_log.info("{} loaded in {} sec".format(self.page_url, page_load_time))
    time.sleep(cm.WAIT_IN_SITE)
    if self.capture_screen:
        self.take_screenshot()
    self.cleanup_visit()
def parse_crawl_log(filename, dump_fun=None, crawl_id=0, url=""):
    """Populate domain info object by parsing crawl log file of a site.

    Call dump function to output dump log.

    Logs to be parsed with this function are generated by setting the env.
    variable FC_DEBUG to 1, or come from the browser. See the fontconfig
    library for details.
    """
    origins_to_fonts = {}  # will keep origin to loaded fonts mapping
    domaInfo = DomainInfo()
    file_content = fu.read_file(filename)
    wl_log.info("Parsing log for %s %s" % (url, filename))

    # match family field of font request (not the matched one)
    fonts_by_fc_debug = re.findall(
        r"Sort Pattern.*$\W+family: \"([^\"]*)", file_content, re.MULTILINE)

    # offsetWidth / offsetHeight access attempts
    domaInfo.num_offsetWidth_calls = len(
        re.findall(r"Element::offsetWidth", file_content))
    domaInfo.num_offsetHeight_calls = len(
        re.findall(r"Element::offsetHeight", file_content))
    # TODO: add getBoundingClientRect

    # output from the modified browser
    font_and_urls = re.findall(
        r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", file_content)
    font_face_pairs = re.findall(
        r"CSSFontFace::getFontData (.*)->(.*)", file_content)
    domaInfo.log_complete = int(
        bool(re.findall(r"Finished all steps!", file_content)))

    js_log_prefix = ">>>FPLOG"
    fpd_logs = re.findall(r"%s.*" % js_log_prefix, file_content)
    domaInfo.fpd_logs = [call[len(js_log_prefix) + 1:]
                         for call in set(fpd_logs)]

    for font_name, font_url in font_and_urls:
        if font_url.startswith("http") and len(font_name) > 1 \
                and not font_name[:5] in ("data:", "http:", "https"):
            # TODO: unify Chrome source code to log as PhantomJS does,
            # then remove this line:
            # font_name = font_name.rsplit(' ', 1)[0] if font_name.endswith(' onURL:') else font_name
            font_name = font_name.lower().strip()
            # origin = pub_suffix.get_public_suffix(font_url)
            origin = font_url
            if origin in origins_to_fonts:
                origins_to_fonts[origin].add(font_name)
            else:
                origins_to_fonts[origin] = set([font_name])

    for font, face in font_face_pairs:
        font = font.lower().strip()
        face = face.lower().strip()
        # replace all occurrences of this font-family name with the face
        for fonts_by_origin in origins_to_fonts.itervalues():
            try:
                fonts_by_origin.remove(font)
            except KeyError:
                pass  # we cannot find this font in this origin's list
            else:
                fonts_by_origin.add(face)

    for origin, fonts in origins_to_fonts.iteritems():
        domaInfo.fonts_by_origins[origin] = list(fonts)
        domaInfo.fonts_loaded += domaInfo.fonts_by_origins[origin]

    # filter out the data urls and web fonts
    domaInfo.fc_dbg_font_loads = list(set(
        [font.lower() for font in fonts_by_fc_debug
         if not font[:5] in ("data:", "http:", "https")]))
    domaInfo.fonts_loaded = list(set(
        [font.lower() for font in domaInfo.fonts_loaded
         if not font[:5] in ("data:", "http:", "https")]))

    requests = re.findall(r"^requested: (http.*)", file_content, re.MULTILINE)
    if not requests and filename.endswith(MITM_LOG_EXTENSION):
        requests = re.findall(r"(http.*)", file_content, re.MULTILINE)
    responses = ""

    # populate domain info obj
    domaInfo.num_font_loads = len(domaInfo.fonts_loaded)
    domaInfo.requests = list(set(requests))
    domaInfo.responses = list(set(responses))
    domaInfo.fp_detected = get_fp_from_reqs(requests)
    domaInfo.url = url
    # the rank is only meaningful if we crawl the top Alexa sites
    domaInfo.rank = get_rank_domain_from_filename(filename)[0]
    domaInfo.log_filename = filename
    domaInfo.crawl_id = crawl_id

    # register sites with both canvas read and write events in the canvas log
    canvas_log = os.path.join(cm.BASE_FP_LOGS_FOLDER,
                              str(crawl_id) + "canvas.log")
    read = any(ev in file_content for ev in cm.CANVAS_READ_EVENTS)
    wrote = any(ev in file_content for ev in cm.CANVAS_WRITE_EVENTS)
    if read and wrote:
        wl_log.info("Found both canvas read and write events in log %s, "
                    "registering in: %s" % (filename, canvas_log))
        with open(canvas_log, "a+") as f:
            f.write(" ".join([str(domaInfo.rank), domaInfo.url]) + "\n")

    if dump_fun:  # call dump function
        try:
            dump_fun(domaInfo)
        except KeyboardInterrupt:
            raise
        except Exception as exc:
            wl_log.exception("Exception while dumping %s: %s" %
                             (domaInfo.url, exc))
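# A tiny self-contained demo of the CSSFontSelector regex used in
# parse_crawl_log, run on a fabricated log line (the line mimics the
# format the regex expects; it is not a captured browser log).
import re

line = "CSSFontSelector::getFontData: Arial http://example.com/page"
print(re.findall(r"CSSFontSelector::getFontData:? (.*) ([^\s]*)", line))
# -> [('Arial', 'http://example.com/page')]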
def parse_crawl_logs(path, no_of_procs=16):
    files = fu.gen_find_files("*.txt", path)
    log_worker = partial(parse_crawl_log, dump_fun=dump_json_and_html)
    parallelize.run_in_parallel(files, log_worker, no_of_procs)
    wl_log.info("Worker processes are finished, will generate index")