def run():
    """Set up and execute one crawl, then exit the interpreter.

    Reads the command-line arguments and configuration, prepares the crawl
    directories, the Tor controller, the browser wrapper and the crawl job,
    starts a virtual display and runs the crawler inside it.

    The function never returns normally: it ends with ``sys.exit(0)``, or
    ``sys.exit(-1)`` when the crawl is interrupted from the keyboard. In
    both cases ``post_crawl()`` runs and the virtual display is stopped.
    """
    args, config = parse_arguments()

    # Crawl directories and the requested slice of the URL list.
    build_crawl_dirs(args.url_file)
    url_list = parse_url_list(args.url_file, args.start, args.stop)

    # Send crawl logging to the default crawl log file as well.
    add_log_file_handler(wl_log, cm.DEFAULT_CRAWL_LOG)

    # Tor controller, driven by the "torrc" section of the configuration.
    torrc_config = ut.get_dict_subconfig(config, args.config, "torrc")
    controller = TorController(cm.TBB_DIR,
                               torrc_dict=torrc_config,
                               pollute=False)

    # Browser wrapper, driven by the "ffpref" section; it talks to the
    # SOCKS port named in the torrc section.
    ffprefs = ut.get_dict_subconfig(config, args.config, "ffpref")
    driver = TorBrowserWrapper(cm.TBB_DIR,
                               tbb_logfile_path=cm.DEFAULT_FF_LOG,
                               tor_cfg=USE_RUNNING_TOR,
                               pref_dict=ffprefs,
                               socks_port=int(torrc_config['socksport']))

    crawler = crawler_mod.Crawler(driver, controller,
                                  args.screenshots, args.device)

    # Crawl job, driven by the "job" section.
    job_config = ut.get_dict_subconfig(config, args.config, "job")
    job = crawler_mod.CrawlJob(job_config, url_list)

    # Virtual display geometry: "<first>x<second>" on the command line,
    # where the first number is used as height and the second as width.
    if args.virtual_display:
        dimensions = args.virtual_display.split('x')
        xvfb_h = int(dimensions[0])
        xvfb_w = int(dimensions[1])
    else:
        xvfb_h, xvfb_w = cm.DEFAULT_XVFB_WIN_H, cm.DEFAULT_XVFB_WIN_W
    xvfb_display = start_xvfb(xvfb_w, xvfb_h)

    # The crawl itself runs from inside the crawl directory.
    chdir(cm.CRAWL_DIR)
    try:
        crawler.crawl(job)
    except KeyboardInterrupt:
        wl_log.warning("Keyboard interrupt! Quitting...")
        sys.exit(-1)
    finally:
        # Runs on success, on interrupt and on any other error.
        post_crawl()
        stop_xvfb(xvfb_display)

    sys.exit(0)
def __init__(self,
             take_ownership=True,  # Tor dies when the Crawler does
             torrc_config=None,
             tor_log="/var/log/tor/tor.log",
             tor_cell_log="/var/log/tor/tor_cell_seq.log",
             control_port=9051,
             socks_port=9050,
             run_in_xvfb=True,
             tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
             tb_log_path=join(_log_dir, "firefox.log"),
             tb_tor_cfg=USE_RUNNING_TOR,
             page_load_timeout=20,
             wait_on_page=5,
             wait_after_closing_circuits=0,
             restart_on_sketchy_exception=True,
             additional_control_fields=None,
             db_handler=None):
    """Start tor, optionally Xvfb, and Tor Browser for a crawl.

    Args:
        take_ownership: Forwarded to ``launch_tor_with_config``.
        torrc_config: Base torrc options. Defaults to
            ``{"CookieAuth": "1"}``. The mapping is copied before the
            ``SocksPort``, ``ControlPort`` and ``Log`` entries are added,
            so neither the caller's dict nor the default is modified.
        tor_log: File named in the torrc ``Log`` option.
        tor_cell_log: File opened in binary read mode as ``self.cell_log``.
        control_port: Requested control port, passed to ``find_free_port``.
        socks_port: Requested SOCKS port, passed to ``find_free_port``.
        run_in_xvfb: When true, start a virtual display before the browser.
        tbb_path: Tor Browser bundle directory.
        tb_log_path: Tor Browser log file path.
        tb_tor_cfg: ``tor_cfg`` value handed to ``TorBrowserDriver``.
        page_load_timeout: Page-load timeout applied to the driver.
        wait_on_page: Stored on the instance and recorded in control data.
        wait_after_closing_circuits: Stored on the instance and recorded
            in control data.
        restart_on_sketchy_exception: Stored on the instance.
        additional_control_fields: Extra fields passed to
            ``get_control_data``. Defaults to a fresh empty dict.
        db_handler: Optional handler; when truthy the crawl is registered
            with ``db_handler.add_crawl`` and the result kept as
            ``self.crawlid``.
    """
    self.logger = setup_logging(_log_dir, "crawler")

    # Work on a private copy: a shared default (or the caller's dict)
    # must not accumulate the ports/log settings of earlier instances.
    if torrc_config is None:
        torrc_config = {"CookieAuth": "1"}
    self.torrc_config = dict(torrc_config)
    if additional_control_fields is None:
        additional_control_fields = {}

    self.socks_port = find_free_port(socks_port, control_port)
    self.torrc_config.update({"SocksPort": str(self.socks_port)})
    self.control_port = find_free_port(control_port, self.socks_port)
    self.torrc_config.update({"ControlPort": str(self.control_port)})
    self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})

    self.logger.info("Starting tor process with config "
                     "{}.".format(self.torrc_config))
    self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                              take_ownership=take_ownership)
    self.authenticate_to_tor_controlport()

    self.logger.info("Opening cell log stream...")
    self.cell_log = open(tor_cell_log, "rb")

    # Always define the flag so later code can test it without an
    # AttributeError when the crawler runs on a real display.
    self.run_in_xvfb = bool(run_in_xvfb)
    if run_in_xvfb:
        self.logger.info("Starting Xvfb...")
        self.virtual_framebuffer = start_xvfb()

    self.logger.info("Starting Tor Browser...")
    self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                      tor_cfg=tb_tor_cfg,
                                      tbb_logfile_path=tb_log_path,
                                      socks_port=self.socks_port,
                                      control_port=self.control_port)

    self.wait_after_closing_circuits = wait_after_closing_circuits
    self.page_load_timeout = page_load_timeout
    self.tb_driver.set_page_load_timeout(page_load_timeout)
    self.wait_on_page = wait_on_page
    self.restart_on_sketchy_exception = restart_on_sketchy_exception

    self.control_data = self.get_control_data(page_load_timeout,
                                              wait_on_page,
                                              wait_after_closing_circuits,
                                              additional_control_fields)
    self.db_handler = db_handler
    if db_handler:
        self.crawlid = self.db_handler.add_crawl(self.control_data)
def launch_tb_with_custom_stem(tbb_dir):
    """Visit check.torproject.org through a tor process launched via stem.

    Starts a virtual display, launches the bundle's tor binary on two free
    ports with a temporary data directory, opens Tor Browser against that
    tor instance, prints the check page's headline and first paragraph and
    finally prints the tor circuits.

    The virtual display is stopped and the tor process killed even when a
    step in between raises.

    Args:
        tbb_dir: Path to the Tor Browser bundle directory.
    """
    xvfb_display = start_xvfb()
    tor_process = None
    try:
        socks_port = free_port()
        control_port = free_port()
        tor_data_dir = tempfile.mkdtemp()
        tor_binary = join(tbb_dir, cm.DEFAULT_TOR_BINARY_PATH)
        print("SOCKS port: %s, Control port: %s" % (socks_port, control_port))

        torrc = {
            'ControlPort': str(control_port),
            'SOCKSPort': str(socks_port),
            'DataDirectory': tor_data_dir,
        }
        tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir,
                                               torrc=torrc,
                                               tor_binary=tor_binary)

        with Controller.from_port(port=control_port) as controller:
            controller.authenticate()
            with TorBrowserDriver(tbb_dir,
                                  socks_port=socks_port,
                                  control_port=control_port,
                                  tor_cfg=cm.USE_STEM) as driver:
                driver.load_url("https://check.torproject.org",
                                wait_on_page=3)
                print(driver.find_element_by("h1.on").text)
                print(driver.find_element_by(".content > p").text)
            print_tor_circuits(controller)
    finally:
        # Same teardown order as before (display first, then tor), but now
        # guaranteed to run, and a failing stop_xvfb cannot skip the kill.
        try:
            stop_xvfb(xvfb_display)
        finally:
            if tor_process is not None:
                tor_process.kill()
def headless_visit(tbb_dir):
    """Load check.torproject.org in a virtual display and save a screenshot.

    The screenshot is written to ``headless_screenshot.png`` next to this
    file. The virtual display is stopped even if the browser fails to
    start or the page load raises.

    Args:
        tbb_dir: Path to the Tor Browser bundle directory.
    """
    out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")

    # start a virtual display
    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(tbb_dir) as driver:
            driver.load_url("https://check.torproject.org")
            driver.get_screenshot_as_file(out_img)
            print("Screenshot is saved as %s" % out_img)
    finally:
        stop_xvfb(xvfb_display)
def _time_page_loads(driver, out_img, column):
    """Load each URL of ``load_table`` with *driver* and record its load time.

    For every row the URL in ``row[URLS]`` is loaded, a screenshot is
    written to *out_img* (overwriting the previous one), and the elapsed
    load time in seconds is stored in ``row[column]``.
    """
    for row in load_table:
        # A monotonic clock is used because only the interval matters and
        # the wall clock may be adjusted while a page is loading.
        started_ns = time.monotonic_ns()
        driver.load_url(row[URLS])
        finished_ns = time.monotonic_ns()

        driver.get_screenshot_as_file(out_img)
        print("Screenshot is saved as %s" % out_img)

        elapsed_time = (finished_ns - started_ns) / 1000000000
        print("Load time: ", str(elapsed_time) + "s")
        row[column] = elapsed_time


def headless_visit(tbb_dir):
    """Benchmark page loads without a bridge and with each bridge type.

    Runs inside a virtual display. The URLs of ``load_table`` are first
    loaded with a plain Tor Browser (times stored in the ``VANILLA``
    column), then once per entry of ``BRIDGE_TYPE``: ``"obfs4"`` times go
    to column 2 and ``"meek-azure"`` times to column 3. Iteration over the
    bridges stops at the first bridge type that has no column. Afterwards
    the table is written out with ``write_csv()``.

    The virtual display is stopped even when a step raises.

    Args:
        tbb_dir: Path to the Tor Browser bundle directory.
    """
    out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")
    bridge_columns = {"obfs4": 2, "meek-azure": 3}

    # start a virtual display
    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(tbb_dir) as driver:
            _time_page_loads(driver, out_img, VANILLA)

        for bridge in BRIDGE_TYPE:
            # Resolve the column per bridge (never reuse the previous
            # bridge's column) and before paying for a browser launch.
            column = bridge_columns.get(bridge)
            if column is None:
                break
            with TorBrowserDriver(tbb_dir,
                                  default_bridge_type=bridge) as bdriver:
                print("%s.........." % bridge)
                _time_page_loads(bdriver, out_img, column)

        print("About to print..........")
        write_csv()
    finally:
        stop_xvfb(xvfb_display)
def main():
    """Take a screenshot of a URL with Tor Browser inside a virtual display.

    Command line: ``tbb_path output_dir [url]``. The screenshot is saved as
    ``screenshot.png`` inside ``output_dir``; ``url`` defaults to
    https://check.torproject.org.

    Returns:
        ``1`` when no URL is available, otherwise ``None``.
    """
    desc = "Take a screenshot using TorBrowserDriver"
    default_url = "https://check.torproject.org"

    parser = ArgumentParser(description=desc)
    parser.add_argument('tbb_path')
    # NOTE(review): argparse only applies ``default`` to a positional that
    # has nargs='?' or '*', so output_dir is still required here and the
    # workdir default is never used. Left as is to keep the CLI unchanged.
    parser.add_argument('output_dir', default=workdir)
    parser.add_argument('url', nargs='?', default=default_url)
    args = parser.parse_args()

    out_img = realpath(join(args.output_dir, "screenshot.png"))

    # Validate the URL that will actually be visited (an explicitly empty
    # argument is the only way it can be missing).
    if not args.url:
        print("ERROR: cannot detect main URL")
        return 1

    xvfb_display = start_xvfb()
    try:
        with TorBrowserDriver(args.tbb_path, headless=True) as driver:
            # Honour the url argument instead of always using the default.
            visit_and_screenshot(driver, args.url, out_img)
    finally:
        stop_xvfb(xvfb_display)
def pullpage(target='https://whatismyipaddress.com/',
             screenshot_path='/GMDelight/GMDelight/static/'
                             'headless_screenshot.png'):
    """Fetch a page with Firefox in a virtual display and return its source.

    The page is loaded, refreshed once, and a screenshot is written to
    *screenshot_path* before the page source is read. The browser is quit
    and the virtual display stopped even when a step raises.

    Args:
        target: URL to load. Defaults to the URL that was previously
            hard-coded.
        screenshot_path: Where the screenshot is saved. Defaults to the
            path that was previously hard-coded.

    Returns:
        The page source as a ``str``.
    """
    xvfb_display = start_xvfb()
    try:
        driver = webdriver.Firefox()
        try:
            driver.get(target)
            driver.refresh()
            driver.get_screenshot_as_file(screenshot_path)
            return str(driver.page_source)
        finally:
            driver.quit()
    finally:
        stop_xvfb(xvfb_display)
def __init__(
        self,
        take_ownership=True,  # Tor dies when the Crawler does
        torrc_config=None,
        tor_log="/var/log/tor/tor.log",
        tor_cell_log="/var/log/tor/tor_cell_seq.log",
        control_port=9051,
        socks_port=9050,
        run_in_xvfb=True,
        tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
        tb_log_path=join(_log_dir, "firefox.log"),
        tb_tor_cfg=USE_RUNNING_TOR,
        page_load_timeout=20,
        wait_on_page=5,
        wait_after_closing_circuits=0,
        restart_on_sketchy_exception=True,
        additional_control_fields=None,
        db_handler=None):
    """Start tor, optionally Xvfb, and Tor Browser for a crawl.

    Args:
        take_ownership: Forwarded to ``launch_tor_with_config``.
        torrc_config: Base torrc options. Defaults to
            ``{"CookieAuth": "1"}``. The mapping is copied before the
            ``SocksPort``, ``ControlPort`` and ``Log`` entries are added,
            so neither the caller's dict nor the default is modified.
        tor_log: File named in the torrc ``Log`` option.
        tor_cell_log: File opened in binary read mode as ``self.cell_log``.
        control_port: Requested control port, passed to ``find_free_port``.
        socks_port: Requested SOCKS port, passed to ``find_free_port``.
        run_in_xvfb: When true, start a virtual display before the browser.
        tbb_path: Tor Browser bundle directory.
        tb_log_path: Tor Browser log file path.
        tb_tor_cfg: ``tor_cfg`` value handed to ``TorBrowserDriver``.
        page_load_timeout: Page-load timeout applied to the driver.
        wait_on_page: Stored on the instance and recorded in control data.
        wait_after_closing_circuits: Stored on the instance and recorded
            in control data.
        restart_on_sketchy_exception: Stored on the instance.
        additional_control_fields: Extra fields passed to
            ``get_control_data``. Defaults to a fresh empty dict.
        db_handler: Optional handler; when truthy the crawl is registered
            with ``db_handler.add_crawl`` and the result kept as
            ``self.crawlid``.
    """
    self.logger = setup_logging(_log_dir, "crawler")
    # Set stem logging level to INFO - "high level library activity"
    stem.util.log.get_logger().setLevel(stem.util.log.Runlevel.INFO)

    # Work on a private copy: a shared default (or the caller's dict)
    # must not accumulate the ports/log settings of earlier instances.
    if torrc_config is None:
        torrc_config = {"CookieAuth": "1"}
    self.torrc_config = dict(torrc_config)
    if additional_control_fields is None:
        additional_control_fields = {}

    self.socks_port = find_free_port(socks_port, control_port)
    self.torrc_config.update({"SocksPort": str(self.socks_port)})
    self.control_port = find_free_port(control_port, self.socks_port)
    self.torrc_config.update({"ControlPort": str(self.control_port)})
    self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})

    self.logger.info("Starting tor process with config "
                     "{}.".format(self.torrc_config))
    self.tor_process = launch_tor_with_config(
        config=self.torrc_config, take_ownership=take_ownership)
    self.authenticate_to_tor_controlport()

    self.logger.info("Opening cell log stream...")
    self.cell_log = open(tor_cell_log, "rb")

    # Always define the flag so later code can test it without an
    # AttributeError when the crawler runs on a real display.
    self.run_in_xvfb = bool(run_in_xvfb)
    if run_in_xvfb:
        self.logger.info("Starting Xvfb...")
        self.virtual_framebuffer = start_xvfb()

    self.logger.info("Starting Tor Browser...")
    self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                      tor_cfg=tb_tor_cfg,
                                      tbb_logfile_path=tb_log_path,
                                      socks_port=self.socks_port,
                                      control_port=self.control_port)

    self.wait_after_closing_circuits = wait_after_closing_circuits
    self.page_load_timeout = page_load_timeout
    self.tb_driver.set_page_load_timeout(page_load_timeout)
    self.wait_on_page = wait_on_page
    self.restart_on_sketchy_exception = restart_on_sketchy_exception

    self.control_data = self.get_control_data(page_load_timeout,
                                              wait_on_page,
                                              wait_after_closing_circuits,
                                              additional_control_fields)
    self.db_handler = db_handler
    if db_handler:
        self.crawlid = self.db_handler.add_crawl(self.control_data)
def pytest_sessionstart(session):
    """Prepare shared test resources at the start of the pytest session.

    Outside Travis a virtual display is started unless the ``NO_XVFB``
    environment variable is present; on Travis a tor process is launched
    instead. The handles are stored in ``test_conf``.
    """
    on_travis = "TRAVIS" in environ

    if on_travis:
        test_conf["tor_process"] = launch_tor()
    elif "NO_XVFB" not in environ:
        test_conf["xvfb_display"] = start_xvfb()
from os.path import dirname, join, realpath

from tbselenium.tbdriver import TorBrowserDriver
from tbselenium.utils import start_xvfb, stop_xvfb

# Load check.torproject.org with Tor Browser inside a virtual display and
# save a screenshot next to this script.
out_img = join(dirname(realpath(__file__)), "headless_screenshot.png")

xvfb_display = start_xvfb()
try:
    with TorBrowserDriver(
            '/home/manivannan/pythonexamle/selenium_example/tor-browser_en-US'
    ) as driver:
        driver.load_url("https://check.torproject.org")
        driver.get_screenshot_as_file(out_img)
        print("Screenshot is saved as %s" % out_img)
finally:
    # Stop the virtual display even when the browser fails to start or
    # the page load raises.
    stop_xvfb(xvfb_display)
def pytest_sessionstart(session):
    """Prepare shared test resources at the start of the pytest session.

    A virtual display is started unless the tests run on Travis or the
    ``NO_XVFB`` environment variable is set to ``"1"``. A tor process is
    always launched; its temporary data directory and process handle are
    stored in ``test_conf`` alongside the display.
    """
    on_travis = "TRAVIS" in environ
    xvfb_disabled = environ.get("NO_XVFB") == "1"

    if not on_travis and not xvfb_disabled:
        test_conf["xvfb_display"] = start_xvfb()

    temp_data_dir, tor_process = launch_tor()
    test_conf["temp_data_dir"] = temp_data_dir
    test_conf["tor_process"] = tor_process
def prepare_driver(disable_cookies=False, tor=False, v=False, headless=False):
    """Prepares a Selenium webdriver given multiple args.

    Parameters
    ----------
    disable_cookies : Boolean
        True to use a driver in incognito mode with cookies disabled.
        Takes precedence over ``tor``: when both are True a plain Firefox
        driver is returned.
    tor : Boolean
        True to use a Tor webdriver.
    v : Boolean
        verbosity.
    headless : Boolean
        True to set the webdriver headless, which means not showing the
        Firefox window. For the Tor webdriver this is done by starting a
        virtual display.

    Returns
    -------
    WebDriver
        A selenium or tbselenium webdriver, or None when launching tor
        timed out.
    xvfb_display
        The Xvfb process for hiding the tbselenium webdriver, or None.
    tor_process
        The Stem process for running the tbselenium webdriver, or None.

    Raises
    ------
    OSError
        When launching tor fails for a reason other than a timeout.
    """
    options = Options()
    if headless and v:
        print("Setting headless mode...")
    options.headless = headless

    if disable_cookies:
        firefox_profile = webdriver.FirefoxProfile()
        # set incognito mode
        firefox_profile.set_preference("browser.privatebrowsing.autostart",
                                       True)
        # disable cookies
        firefox_profile.set_preference("network.cookie.cookieBehavior", 2)
        driver = webdriver.Firefox(options=options,
                                   firefox_profile=firefox_profile)
        return driver, None, None

    if not tor:
        return webdriver.Firefox(options=options), None, None

    if v:
        print("Configuring tor browser...")
    tbb_dir = Driver.TOR_PATH

    # Launch tor before the virtual display so that the timeout early
    # return below does not leave an orphaned Xvfb process behind.
    try:
        tor_process = launch_tbb_tor_with_stem(tbb_path=tbb_dir)
    except OSError as e:
        if 'timeout' in str(e):
            print(
                'Error: Tor connection timeout. Check URL or Internet connection'
            )
            return None, None, None
        raise

    xvfb_display = start_xvfb() if headless else None

    # Tor driver constructor
    try:
        driver = TorBrowserDriver(tbb_dir, tor_cfg=cm.USE_STEM)
    except Exception:
        # The caller never receives the handles in this case, so release
        # them here before propagating the error.
        tor_process.kill()
        if xvfb_display is not None:
            from tbselenium.utils import stop_xvfb
            stop_xvfb(xvfb_display)
        raise

    return driver, xvfb_display, tor_process
def launch_tor_with_custom_stem(datalist, browser):
    """Visit every entry of *datalist* while capturing its traffic.

    A tor proxy is set up first (retried up to ``cm.TRYCNT`` times), then a
    virtual display is started. For each entry a browser is set up (also
    retried up to ``cm.TRYCNT`` times), ``tcpdump`` is started, the page
    ``'https://' + ele[0]`` is visited under a timeout, and a log line is
    written. Per-entry errors are logged and the loop continues with the
    next entry.

    Args:
        datalist: Sequence of entries. ``ele[0]`` is the host to visit and
            ``ele[2]`` a mode value: a screenshot is taken for 0 and 2,
            and 3 selects the between-sites pause.
        browser: One of ``'TBB'``, ``'FF'`` or ``'CR'``; selects the setup
            function. Only ``'TBB'`` receives setup arguments.
    """
    print("length of data: ", len(datalist))
    tor_binary = join(cm.TorProxypath, cm.DEFAULT_TOR_BINARY_PATH)
    tor_process, controller = 0, 0
    # Bound up front: the outer ``finally`` runs even when tor setup fails
    # before the display is started.
    xvfb_display = None
    try:
        TRYTOR_CNT = cm.TRYCNT
        while TRYTOR_CNT > 0 and tor_process == 0 and controller == 0:
            print("try to setup tor:", str(TRYTOR_CNT))
            tor_process, controller = TorSetup(tor_binary)
            TRYTOR_CNT -= 1
        if tor_process == 0:
            raise TorSetupError
        print("finish tor proxy setup...")

        xvfb_display = start_xvfb()  # virtual display
        for ele in datalist:
            t = getTime()
            savepath, out_img = SetOutputPath(ele, t)
            # 0 means "not created yet"; both names are read in ``finally``.
            p, driver = 0, 0
            try:
                TRYCNT = cm.TRYCNT
                while driver == 0 and TRYCNT != 0:
                    print("try to setup tbb:", str(TRYCNT))
                    args = (cm.driverpath, controller,
                            ele[2]) if browser == 'TBB' else ()
                    options = {
                        'TBB': TBBSetup,
                        'FF': FFSetup,
                        'CR': ChromeSetup
                    }
                    driver = options[browser](*args)
                    TRYCNT -= 1
                if driver == 0:
                    raise TBBSetupError

                cmd = "tcpdump -i %s tcp and not port ssh -w %s" % (
                    cm.netInterface, savepath)
                print('cmd = ', cmd)
                cmd = cmd.split(' ')
                p = subprocess.Popen(cmd)
                try:
                    timeout(cm.VISITPAGE_TIMEOUT)
                    driver.get('https://' + ele[0])
                    cancel_timeout()
                    time.sleep(cm.DURATION_VISIT_PAGE)
                    p.terminate()
                    if (ele[2] == 0 or ele[2] == 2):
                        driver.get_screenshot_as_file(out_img)
                    writeLog(str(t) + "," + ele[0] + "," + str(ele[2]))
                    print("Finish tcpdump sleep...")
                except TimeExceededError:
                    writeLog("Error crawling," + ele[0] + "," + str(ele[2]) +
                             "\n" + str("Page visit Timeout"))
                finally:
                    cancel_timeout()
            except TBBSetupError:
                print("[crawl.py error]: unable to setup TBB")
                writeLog("[crawl.py error]: unable to setup TBB")
            except Exception as e:
                with open(cm.ErrorFilePath, 'a+') as fw:
                    fw.write(ele[0] + "," + str(e) + "\n")
                writeLog("Error crawling," + ele[0] + "," + str(ele[2]) +
                         "\n" + str(e))
            finally:
                # returncode stays None until the process is polled, so
                # this normally terminates tcpdump again; that is harmless.
                if p != 0 and p.returncode != 0:
                    try:
                        p.terminate()
                    except Exception as e:
                        writeLog("[crawl.py] tcpdump terminate error: " +
                                 str(e))
                if controller != 0:
                    cleanupStream(controller, str(ele[2]), ele[0])
                if driver != 0:
                    try:
                        # Guard against a browser that hangs on quit.
                        timeout(30)
                        driver.quit()
                        cancel_timeout()
                    except Exception as e:
                        cancel_timeout()
                        writeLog("[crawl.py] driver quit error: " + str(e))
                if ele[2] != 3:
                    time.sleep(cm.PAUSE_BETWEEN_INSTANCES)
                else:
                    time.sleep(cm.PAUSE_BETWEEN_SITES)
                RemoveTmpFile()
                RemoveProcess()
    except TorSetupError:
        print("[crawl.py] unable to set up tor proxy")
        writeLog("[crawl.py] unable to set up tor proxy")
    except Exception as e:
        print("[crawl.py]launch_tor_with_custom_stem Error")
        print("Error:", str(e))
        writeLog("[crawl.py]launch_tor_with_custom_stem Error : " + str(e))
    finally:
        if tor_process != 0:
            tor_process.kill()
        # Only stop a display that was actually started.
        if xvfb_display is not None:
            stop_xvfb(xvfb_display)