def run():
    """Set up and execute one crawl from the command line, then exit.

    Parses the arguments and configuration, prepares the crawl directories,
    builds the Tor controller, the browser wrapper, the crawler and the crawl
    job, starts the virtual display and finally runs the crawl.

    The process always terminates through ``sys.exit``: status 0 once the
    crawl returns normally, status -1 when it is interrupted from the
    keyboard. ``post_crawl`` and ``stop_xvfb`` run in both cases, and also
    when the crawl raises any other exception.
    """
    args, config = parse_arguments()

    # Directory layout and the slice of URLs to visit.
    build_crawl_dirs(args.url_file)
    url_list = parse_url_list(args.url_file, args.start, args.stop)

    # From here on, log records also go to the crawl log file.
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Tor controller, driven by the "torrc" section of the chosen config.
    tor_settings = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(
        cm.TBB_DIR,
        torrc_dict=tor_settings,
        pollute=False,
    )

    # Browser wrapper, driven by the "ffpref" section; it connects to the
    # SOCKS port declared in the torrc settings.
    firefox_prefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    socks_port = int(tor_settings['socksport'])
    driver = TorBrowserWrapper(
        cm.TBB_DIR,
        tbb_logfile_path=cm.DEFAULT_FF_LOG,
        tor_cfg=USE_RUNNING_TOR,
        pref_dict=firefox_prefs,
        socks_port=socks_port,
    )

    crawler = crawler_mod.Crawler(
        driver, controller, args.screenshots, args.device
    )

    # Crawl job, driven by the "job" section.
    job_settings = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_settings, url_list)

    # Virtual display size: taken from the command line when given,
    # otherwise from the project defaults.
    if not args.virtual_display:
        display_height = cm.DEFAULT_XVFB_WIN_H
        display_width = cm.DEFAULT_XVFB_WIN_W
    else:
        # NOTE(review): the first 'x'-separated field is used as the height
        # and the second as the width -- confirm this matches the documented
        # format of the virtual-display argument.
        dimensions = args.virtual_display.split('x')
        display_height = int(dimensions[0])
        display_width = int(dimensions[1])
    xvfb_display = start_xvfb(display_width, display_height)

    # The crawl itself runs from inside the crawl directory.
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Cleanup happens on every exit path, including the interrupt above.
        post_crawl()
        stop_xvfb(xvfb_display)

    sys.exit(0)
def stop_capture(self):
    """Kill the dumpcap process and report on the resulting capture file.

    Kills the children of the process held in ``self.p0`` and then the
    process itself, and clears ``self.is_recording``. Afterwards it logs
    the size of ``self.pcap_file`` when that file exists, or warns and
    points to ``self.log`` when it does not.
    """
    # self.p0.pid is the shell pid, so its children are killed first.
    shell_pid = self.p0.pid
    ut.kill_all_children(shell_pid)
    self.p0.kill()
    self.is_recording = False

    capture_path = self.pcap_file
    if not os.path.isfile(capture_path):
        wl_log.warning(
            'Capture killed but cannot find capture file: %s' % capture_path
        )
        wl_log.warning('Check %s for error information!' % self.log)
        return

    capture_size = os.path.getsize(capture_path)
    wl_log.info(
        'Capture killed. Traffic size: %s Bytes %s'
        % (capture_size, capture_path)
    )
def _do_batch(self):
    """Visit every URL of the job inside a single controller launch.

    The Tor process must be initialised/restarted per batch in order to
    get a different circuit. If the controller is configured not to
    pollute the profile, each restart also forces a switch of the entry
    guard.

    URLs longer than ``cm.MAX_FNAME_LENGTH`` are logged and skipped. After
    each visited URL the method sleeps for the number of seconds given by
    the job's ``pause_between_videos`` setting.
    """
    with self.controller.launch():
        for site_index in xrange(len(self.job.urls)):
            # The job tracks the current position itself; presumably
            # self.job.url is derived from it -- confirm against CrawlJob.
            self.job.site = site_index

            if len(self.job.url) <= cm.MAX_FNAME_LENGTH:
                self._do_instance()
                sleep(float(self.job.config['pause_between_videos']))
            else:
                wl_log.warning("URL is too long: %s" % self.job.url)