def main():
    # Parse arguments
    args = parse_arguments()
    # Read URLs
    url_list = read_list_urls(args)
    # Get torrc matching experiment type
    torrc_dict = cm.TORRC_BY_TYPE[args.experiment]
    # Instantiate crawler
    crawler = Crawler(url_list, torrc_dict, output=args.output,
                      experiment=args.experiment, xvfb=args.xvfb,
                      capture_screen=True)
    # Run the crawl
    try:
        crawler.crawl(args.batches, args.instances,
                      start_line=args.start_line - 1)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
    except Exception:
        wl_log.error("Exception: \n%s" % traceback.format_exc())
    finally:
        crawler.stop_crawl()

def run():
    # Build dirs
    build_crawl_dirs()
    # Set up environment variables for exporting keys
    os.environ['TOR_CIRCUIT_KEY_EXPORT'] = cm.CRAWL_DIR + "/circuit-key.txt"
    os.environ['TOR_SSL_KEY_EXPORT'] = cm.CRAWL_DIR + "/ssl-key.txt"
    # Parse arguments
    args, config = parse_arguments()
    # Read URLs
    url_list = parse_url_list(args.url_file, args.start, args.stop)
    host_list = [urlparse(url).hostname for url in url_list]
    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)
    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)
    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']))
    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(driver, controller, args.screenshots, args.export_har)
    # Configure crawl
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)
    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)
    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post crawl
        post_crawl()
        # Close display
        ut.stop_xvfb(xvfb_display)
    # Die
    sys.exit(0)

def check_conn_error(self):
    if self.driver.current_url == "about:newtab":
        wl_log.warning('Stuck in about:newtab, visit #%s to %s'
                       % (self.job.visit, self.job.url))
    if self.driver.is_connection_error_page:
        wl_log.warning('Connection Error, visit #%s to %s'
                       % (self.job.visit, self.job.url))
        raise cm.ConnErrorPage

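# A minimal sketch of how a predicate like is_connection_error_page could be
# implemented on top of Selenium. The markers checked below are assumptions
# for illustration, not the actual implementation behind the driver used here.
def looks_like_connection_error(driver):  # hypothetical helper
    # Firefox (and hence Tor Browser) renders network failures on
    # about:neterror pages.
    if driver.current_url.startswith("about:neterror"):
        return True
    # Fallback: the default English error-page title (locale-dependent).
    return "Problem loading page" in driver.title
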
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    referer = (msg.request.headers['Referer'][0]
               if msg.request.headers['Referer'] else "")
    if msg.response and msg.response.content:
        print(msg.request.get_url())
        # Too wide a check, but the decompiler will discard non-SWF content
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                wl_log.info("SWF saved %s referrer: %s"
                            % (os.path.basename(swf_filename), referer))
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("An SWF with the same hash exists in the DB: %s %s"
                            % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            swf_info.rank = rank  # this might be fake
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 for SWFs we have seen before
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-SWF %s %s"
                           % (msg.request.path, msg.response.content[:100]))

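# For reference, a minimal sketch of the SWF_MAGIC_NUMBERS check used above.
# SWF files start with a three-byte signature: 'FWS' (uncompressed), 'CWS'
# (zlib-compressed) or 'ZWS' (LZMA-compressed, SWF 13+). Whether the crawler's
# constant covers all three signatures is an assumption here.
SWF_MAGIC_NUMBERS = ('FWS', 'CWS', 'ZWS')


def is_swf_content(body):
    """Return True if a response body (str, as in Python 2) looks like SWF."""
    return body[:3] in SWF_MAGIC_NUMBERS
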
def run():
    # Build dirs
    build_crawl_dirs()
    # Parse arguments
    args, config = parse_arguments()
    # Read URLs
    url_list = read_list_urls(args.url_file, args.start, args.stop)
    host_list = [urlparse(url).hostname for url in url_list]
    # Configure logger
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)
    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)
    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']),
                               canvas_allowed_hosts=host_list)
    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(driver, controller, args.screenshots)
    # Configure crawl
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)
    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)
    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Post crawl
        post_crawl()
        # Close display
        ut.stop_xvfb(xvfb_display)
    # Die
    sys.exit(0)

def stop_capture(self):
    """Kill the onionperf capture process and report the capture size."""
    ut.kill_all_children(self.p0.pid)  # self.p0.pid is the shell pid
    # SIGINT instead of kill() so onionperf can shut down cleanly
    os.kill(self.p0.pid, signal.SIGINT)
    if os.path.isfile(self.logpath):
        wl_log.info('Onionperf killed. Capture size: %s Bytes %s'
                    % (os.path.getsize(self.logpath), self.logpath))
    else:
        wl_log.warning('Onionperf killed but cannot find capture file: %s'
                       % self.logpath)

def stop_capture(self): """Kill the dumpcap process.""" ut.kill_all_children(self.p0.pid) # self.p0.pid is the shell pid self.p0.kill() self.is_recording = False if os.path.isfile(self.pcap_file): wl_log.info('Dumpcap killed. Capture size: %s Bytes %s' % (os.path.getsize(self.pcap_file), self.pcap_file)) else: wl_log.warning('Dumpcap killed but cannot find capture file: %s' % self.pcap_file)
def __do_batch(self):
    """
    Must init/restart the Tor process to get a different circuit.
    If the controller is configured not to pollute the profile, each
    restart forces a switch of the entry guard.
    """
    with self.controller.launch():
        for self.job.site in xrange(len(self.job.urls)):
            if len(self.job.url) > cm.MAX_FNAME_LENGTH:
                wl_log.warning("URL is too long: %s" % self.job.url)
                continue
            self.__do_instance()
            sleep(float(self.job.config['pause_between_sites']))

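# The restart-per-batch logic above hinges on controller.launch() acting as a
# context manager around a fresh Tor process. A minimal sketch of that shape
# using stem; the port and options are assumptions, not this project's config.
from contextlib import contextmanager

import stem.process


@contextmanager
def launch_tor(socks_port=9050):
    # A fresh tor process means fresh circuits; without a persistent
    # DataDirectory it also means a freshly picked entry guard.
    tor = stem.process.launch_tor_with_config(
        config={'SocksPort': str(socks_port)})
    try:
        yield tor
    finally:
        tor.kill()  # tear tor down when the batch ends
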
def stop_capture(self): """Kill the dumpcap process.""" ut.kill_all_children(self.p0.pid) # self.p0.pid is the shell pid self.p0.kill() self.is_recording = False captcha_filepath = ut.capture_filepath_to_captcha(self.pcap_file) if os.path.isfile(self.pcap_file): wl_log.info('Sniffer killed. Capture size: %s Bytes %s' % (os.path.getsize(self.pcap_file), self.pcap_file)) elif os.path.isfile(captcha_filepath): wl_log.info( 'Sniffer killed, file renamed to captcha_*. Capture size: %s Bytes %s' % (os.path.getsize(captcha_filepath), captcha_filepath)) else: wl_log.warning( 'Sniffer killed but cannot find capture file: %s or %s' % (self.pcap_file, captcha_filepath))
def click_to_xpath_selector(br, page_url, selector):
    # wl_log.info('Will find els by selector %s on %s' % (selector, page_url))
    els = br.find_elements_by_xpath(selector)
    for el in els:
        if is_clickable(el, page_url):
            href = el.get_attribute('href') or "?"
            try:
                el.click()
            except Exception as es:
                wl_log.warning('Exception while clicking: href: %s %s %s'
                               % (href, es, page_url))
            else:
                wl_log.info('Clicked!: href: %s %s %s'
                            % (href, selector, page_url))
            sleep(WAIT_AFTER_CLICK)
            get_and_sleep(br, page_url)
            return 1
    # wl_log.debug('No clickable element found for: %s %s' % (selector, page_url))
    return 0  # we couldn't find any element to click

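# A plausible sketch of the is_clickable() predicate assumed above; the real
# helper may add extra heuristics (element size, same-origin href, etc.).
def is_clickable(el, page_url):  # page_url kept for parity with the caller
    try:
        # Selenium can only click elements that are rendered and enabled.
        return el.is_displayed() and el.is_enabled()
    except Exception:
        # Stale or detached elements raise; treat them as unclickable.
        return False
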
def _do_batch(self):
    """
    Must init/restart the Tor process to get a different circuit.
    If the controller is configured not to pollute the profile, each
    restart forces a switch of the entry guard.
    """
    while self.job.site < len(self.job.urls):
        if self.job.batch_failed.get(self.job.url):
            wl_log.info("Skipping URL because it has failed too many times")
            self.job.site += 1
            continue
        if len(self.job.url) > cm.MAX_FNAME_LENGTH:
            wl_log.warning("URL is too long: %s" % self.job.url)
            self.job.site += 1
            continue
        self._do_visits()
        sleep(float(self.job.config['pause_between_sites']))
        self.job.site += 1
    # Reset the site index so the next batch starts from the first URL
    if self.job.site == len(self.job.urls):
        self.job.site = 0

def run():
    # Parse arguments
    args, config = parse_arguments()
    # Build dirs
    build_crawl_dirs()
    # Read URLs
    if isfile(args.urls):
        url_list = parse_url_list(args.urls, args.start, args.stop)
    else:
        try:
            url_list = args.urls.split(',')
        except Exception:
            wl_log.error("ERROR: expects a comma-separated list of URLs "
                         "or a path to a file")
            sys.exit(1)
    host_list = [urlparse(url).hostname for url in url_list]
    # Configure logger
    add_log_file_handler(wl_log, cm.CRAWL_LOG_FILENAME)
    # Configure controller
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(tbb_path=args.tbb_path,
                               tor_binary_path=args.tor_binary_path,
                               tor_data_path=args.tor_data_path,
                               torrc_dict=torrc_config,
                               pollute=False)
    # Configure browser
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    ffprefs = ut.set_dict_value_types(ffprefs)
    print(ffprefs)
    addons_path = [abspath(args.addons_dir)] if args.addons_dir else []
    driver_config = {'tbb_path': cm.TBB_DIR,
                     'tor_cfg': USE_RUNNING_TOR,
                     'pref_dict': ffprefs,
                     'extensions': addons_path,
                     'socks_port': int(torrc_config['socksport']),
                     'control_port': int(torrc_config['controlport']),
                     'canvas_allowed_hosts': host_list}
    # Instantiate crawler
    crawl_type = getattr(crawler_mod, "Crawler" + args.type)
    crawler = crawl_type(controller, driver_config=driver_config,
                         device=args.device, screenshots=args.screenshots)
    # Configure crawl
    if args.recover_file is not None:
        if isfile(args.recover_file):
            # Open in binary mode: pickle checkpoints are not text
            with open(args.recover_file, 'rb') as fchkpt:
                job = pickle.load(fchkpt)
            wl_log.info("Job recovered: %s" % str(job))
        else:
            wl_log.error("Checkpoint file %s does not exist"
                         % args.recover_file)
            sys.exit(1)
    else:
        # Parse job configuration
        job_config = ut.get_dict_subconfig(config, args.config, "job")
        # Get the chunk of URLs to crawl
        chunk = int(job_config.get('chunk', 0))
        chunks = int(job_config.get('chunks', 1))
        range_chunk = len(url_list) // chunks  # floor division for slice bounds
        if chunk == chunks - 1:  # last chunk takes the remaining URLs
            url_list_chunk = url_list[chunk * range_chunk:]
        else:
            url_list_chunk = url_list[chunk * range_chunk:
                                      (chunk + 1) * range_chunk]
        job = crawler_mod.CrawlJob(job_config, url_list_chunk)
    # Run display
    xvfb_display = setup_virtual_display(args.virtual_display)
    # Run the crawl
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    except Exception as e:
        wl_log.error("ERROR: unknown exception while crawling: %s" % e)
    finally:
        # Post crawl
        post_crawl()
        # Close display
        ut.stop_xvfb(xvfb_display)
    # Die
    wl_log.info("[tbcrawler] the crawl has finished.")
    sys.exit(0)

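# A small worked example of the chunking arithmetic above: with 10 URLs and
# chunks=3, range_chunk is 10 // 3 == 3, so chunks 0 and 1 take 3 URLs each
# and the last chunk absorbs the remainder (4 URLs).
urls = ['url%d' % i for i in range(10)]
chunks = 3
range_chunk = len(urls) // chunks  # floor division, as in run() above
for chunk in range(chunks):
    if chunk == chunks - 1:  # last chunk takes the remaining URLs
        part = urls[chunk * range_chunk:]
    else:
        part = urls[chunk * range_chunk:(chunk + 1) * range_chunk]
    print(chunk, len(part))  # chunk sizes: 3, 3, 4
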
if verbose:
    wl_log.setLevel(logging.DEBUG)
else:
    wl_log.setLevel(logging.INFO)

# Validate the given arguments
# Read URLs
url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
url_list = url_list.tolist()
url_list = url_list[start_line - 1:stop_line]
torrc_dict = cm.TORRC_DEFAULT
if not tbb_version:
    tbb_version = cm.TBB_DEFAULT_VERSION
elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
    ut.die('Version of Tor browser is not recognized.')
crawler = Crawler(torrc_dict, url_list, tbb_version, xvfb, capture_screen)
wl_log.info('Command line parameters: %s' % sys.argv)

# Run the crawl
try:
    crawler.crawl(no_of_batches, no_of_instances,
                  start_line=start_line - 1)
except KeyboardInterrupt:
    wl_log.warning('Keyboard interrupt! Quitting...')
except Exception:
    wl_log.error('Exception: \n%s' % traceback.format_exc())
finally:
    crawler.stop_crawl()

def check_captcha(self):
    if CHECK_CAPTCHA and ut.has_captcha(self.page_source):
        wl_log.warning('Captcha found')
        self.job.add_captcha()

if experiment == cm.EXP_TYPE_WANG_AND_GOLDBERG:
    torrc_dict = cm.TORRC_WANG_AND_GOLDBERG
elif experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
    torrc_dict = cm.TORRC_DEFAULT
else:
    ut.die("Experiment type is not recognized."
           " Use --help to see the possible values.")

if not tbb_version:
    # Assign the last stable version of TBB
    tbb_version = cm.TBB_DEFAULT_VERSION
elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
    ut.die("Version of Tor browser is not recognized."
           " Use --help to see which are the accepted values.")

crawler = Crawler(torrc_dict, url_list, tbb_version,
                  experiment, xvfb, capture_screen)
wl_log.info("Command line parameters: %s" % sys.argv)

# Run the crawl
try:
    crawler.crawl(no_of_batches, no_of_instances,
                  start_line=start_line - 1)
except KeyboardInterrupt:
    wl_log.warning("Keyboard interrupt! Quitting...")
except Exception:
    wl_log.error("Exception: \n%s" % traceback.format_exc())
finally:
    crawler.stop_crawl(pack_results=False)

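# For illustration, a torrc dict like those selected above simply maps Tor
# option names to string values; the options and values here are assumptions,
# not the project's actual TORRC_* constants.
TORRC_EXAMPLE = {
    'SocksPort': '9050',
    'ControlPort': '9051',
}


def torrc_text(torrc_dict):
    """Serialize a torrc dict into torrc file syntax (one option per line)."""
    return '\n'.join('%s %s' % (k, v) for k, v in torrc_dict.items())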