def test_browser_type():
    """An unsupported browser name is rejected; "firefox" validates cleanly."""
    params = BrowserParams()

    params.browser = "something unsupported"
    with pytest.raises(ConfigError):
        validate_browser_params(params)

    # The supported browser passes without raising.
    params.browser = "firefox"
    validate_browser_params(params)
def test_tp_cookies_opt():
    """tp_cookies accepts only the supported policy strings (e.g. "never")."""
    params = BrowserParams()

    params.tp_cookies = "something unsupported"
    with pytest.raises(ConfigError):
        validate_browser_params(params)

    # A recognized policy value passes without raising.
    params.tp_cookies = "never"
    validate_browser_params(params)
def test_num_browser_crawl_config():
    """validate_crawl_configs requires one BrowserParams per configured browser."""
    manager = ManagerParams(num_browsers=2)
    browsers = [BrowserParams()]

    # One entry for two browsers: the mismatch must be rejected.
    with pytest.raises(ConfigError):
        validate_crawl_configs(manager, browsers)

    # A second entry brings the counts into agreement.
    browsers.append(BrowserParams())
    validate_crawl_configs(manager, browsers)
def test_display_mode():
    """display_mode rejects unknown strings and non-strings, accepts "native"."""
    params = BrowserParams()

    # Neither an unrecognized string nor a non-string value is a valid mode.
    for bad_mode in ("something unsupported", []):
        params.display_mode = bad_mode
        with pytest.raises(ConfigError):
            validate_browser_params(params)

    params.display_mode = "native"
    validate_browser_params(params)
def get_test_config(
    self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless"
):
    """Load and return the default test parameters."""
    # Fall back to the fixture's temporary directory when no dir was given.
    if not data_dir:
        data_dir = self.tmpdir

    mgr = ManagerParams(num_browsers=num_browsers)
    browsers = [BrowserParams() for _ in range(num_browsers)]

    mgr.data_directory = data_dir
    mgr.log_directory = data_dir
    mgr.num_browsers = num_browsers

    for bp in browsers:
        bp.display_mode = display_mode

    # Anchor the database file inside the chosen data directory.
    mgr.database_name = join(mgr.data_directory, mgr.database_name)
    return mgr, browsers
def default_params(
    tmp_path: Path, num_browsers: int = NUM_BROWSERS
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Just a simple wrapper around task_manager.load_default_params

    Parameters
    ----------
    tmp_path
        Directory used for the crawl's data and log output.
    num_browsers
        Number of browsers to configure (defaults to NUM_BROWSERS).

    Returns
    -------
    A (ManagerParams, list of BrowserParams) pair sized to ``num_browsers``.
    """
    # BUGFIX: this previously built the params from the module-level
    # NUM_BROWSERS constant, silently ignoring the ``num_browsers`` argument
    # (and the trailing ``for i in range(num_browsers)`` loop would raise
    # IndexError whenever num_browsers > NUM_BROWSERS). Use the argument
    # consistently instead.
    manager_params = ManagerParams(
        num_browsers=num_browsers
    )  # num_browsers is necessary to let TaskManager know how many browsers to spawn
    # display_mode is set via the constructor, so no post-construction loop
    # over the list is needed.
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(num_browsers)
    ]
    manager_params.data_directory = tmp_path
    manager_params.log_path = tmp_path / "openwpm.log"
    return manager_params, browser_params
def get_test_config(
    self,
    data_dir: Optional[Path] = None,
    num_browsers: int = NUM_BROWSERS,
    display_mode: str = "headless",
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Load and return the default test parameters."""
    # Fall back to the fixture's temporary directory when no dir was given.
    if not data_dir:
        data_dir = self.tmpdir
    assert data_dir is not None  # Mypy doesn't understand this without help

    mgr = ManagerParams(num_browsers=num_browsers)
    browsers = [BrowserParams() for _ in range(num_browsers)]

    mgr.log_path = data_dir / "openwpm.log"
    mgr.num_browsers = num_browsers

    for bp in browsers:
        bp.display_mode = display_mode

    return mgr, browsers
def test_save_content_type():
    """save_content accepts a bool or a known content-type string only."""
    params = BrowserParams()

    # A list and an unrecognized string are both invalid.
    for invalid in ([], "something unsupported"):
        params.save_content = invalid
        with pytest.raises(ConfigError):
            validate_browser_params(params)

    # A bool and a known content-type string both validate cleanly.
    for valid in (False, "script"):
        params.save_content = valid
        validate_browser_params(params)
def start_webdriver(
    with_extension=True, load_browser_params=True, browser_params_file=None
):
    """Open a webdriver instance and a server for the test pages

    This is meant to be imported and run manually from a python or
    ipython shell. A webdriver instance is returned and both the webdriver
    and server will automatically clean up when the shell is exited.

    Parameters
    ----------
    with_extension : boolean
        Set to True to also load OpenWPM extension instrumentation
    load_browser_params : boolean
        Set to True to load browser_params
    browser_params_file : string
        Specify the browser_params.json to load.
        If None, default params from openwpm/config.py::BrowserParams will be loaded.

    Returns
    -------
    webdriver
        A selenium webdriver instance.
    """
    firefox_binary_path = get_firefox_binary_path()
    fb = FirefoxBinary(firefox_path=firefox_binary_path)
    # Local test-page server; shut down by the atexit hook registered below.
    server, thread = start_server()

    def register_cleanup(driver):
        # Navigate to the local test page so the session is live before return.
        driver.get(BASE_TEST_URL)

        def cleanup_server():
            # Runs at interpreter exit: stop the server, quit the driver,
            # and remove the temporary Firefox profile directory.
            print("Cleanup before shutdown...")
            server.shutdown()
            thread.join()
            print("...server shutdown")
            driver.quit()
            print("...webdriver closed")
            shutil.rmtree(driver.capabilities["moz:profile"], ignore_errors=True)
            print("...browser profile removed")

        atexit.register(cleanup_server)
        return driver

    # Fresh throwaway profile directory for this session.
    browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_"))
    fo = Options()
    fo.add_argument("-profile")
    fo.add_argument(str(browser_profile_path))

    if with_extension:
        # TODO: Restore preference for log level in a way that works in Fx 57+
        # fp.set_preference("*****@*****.**", "all")
        configure_firefox.optimize_prefs(fo)
    driver = webdriver.Firefox(firefox_binary=fb, options=fo)
    if load_browser_params is True:
        # There's probably more we could do here
        # to set more preferences and better emulate
        # what happens in TaskManager. But this lets
        # us pass some basic things.
        browser_params = BrowserParams()
        if browser_params_file is not None:
            with open(browser_params_file, "r") as f:
                browser_params.from_json(f.read())
        # Normalize the JS instrumentation settings before serializing the
        # params into the profile, where the extension reads them.
        js_request = browser_params.js_instrument_settings
        js_request_as_string = jsi.clean_js_instrumentation_settings(js_request)
        browser_params.js_instrument_settings = js_request_as_string
        with open(browser_profile_path / "browser_params.json", "w") as f:
            f.write(browser_params.to_json())

    if with_extension:
        # add openwpm extension to profile
        xpi()  # presumably builds the extension bundle — confirm in module scope
        ext_xpi = join(EXT_PATH, "dist", "openwpm-1.0.zip")
        driver.install_addon(ext_xpi, temporary=True)

    return register_cleanup(driver)
# The list of sites that we wish to crawl NUM_BROWSERS = 1 sites = [] file = open("SubPagesLess.txt", "r") for page in file: page = page.strip('\n') sites.append(page) # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams manager_params = ManagerParams( num_browsers=NUM_BROWSERS ) # num_browsers is necessary to let TaskManager know how many browsers to spawn browser_params = [ BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS) ] # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i].http_instrument = True # Record cookie changes browser_params[i].cookie_instrument = True # Record Navigations browser_params[i].navigation_instrument = True # Record JS Web API calls browser_params[i].js_instrument = True # Record the callstack of all WebRequests made browser_params[i].callstack_instrument = True # Record DNS resolution
def processSite(site):
    """Crawl a single site with one headless browser and store the results.

    Spawns a TaskManager with full instrumentation enabled, visits ``site``,
    runs the custom LinkCountingCommand, and writes everything to
    ./datadir/crawl-data.sqlite. Returns None.
    """
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            # Default argument binds the current site at definition time.
            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    return (None)
# Crawler configuration derived from environment variables. The upper-case
# names referenced below (CALLSTACK_INSTRUMENT, TIMEOUT, DWELL_TIME,
# DISPLAY_MODE, etc.) are presumably env-parsed settings defined earlier in
# the file — not visible in this excerpt.
SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()

if CALLSTACK_INSTRUMENT is True:
    # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work
    JS_INSTRUMENT = True

# Job lease is extended well past the worst-case visit duration.
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)

# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/openwpm/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]

# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i].display_mode = DISPLAY_MODE
    browser_params[i].http_instrument = HTTP_INSTRUMENT
    browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
    # SAVE_CONTENT is a string flag: "1" -> True, "0" -> False; other values
    # are handled in the else branch, which continues beyond this excerpt.
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
    else:
from openwpm.task_manager import TaskManager # The list of sites that we wish to crawl NUM_BROWSERS = 1 sites = [ "http://www.example.com", "http://www.princeton.edu", "http://citp.princeton.edu/", ] # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams manager_params = ManagerParams(num_browsers=NUM_BROWSERS) browser_params = [ BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS) ] # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i].http_instrument = True # Record cookie changes browser_params[i].cookie_instrument = True # Record Navigations browser_params[i].navigation_instrument = True # Record JS Web API calls browser_params[i].js_instrument = True # Record the callstack of all WebRequests made browser_params[i].callstack_instrument = True # Record DNS resolution
def crawl(sites, db_filename): ''' sites ihe list of sites that we wish to crawl db_filename is the file name of the output database ''' # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams NUM_BROWSERS = 12 manager_params = ManagerParams(num_browsers=NUM_BROWSERS) browser_params = [ BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS) ] # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i].http_instrument = True # Record cookie changes browser_params[i].cookie_instrument = True # Record Navigations browser_params[i].navigation_instrument = True # Record JS Web API calls browser_params[i].js_instrument = True # Record the callstack of all WebRequests made browser_params[i].callstack_instrument = True # Record DNS resolution browser_params[i].dns_instrument = True browser_params[i].bot_mitigation = True # Update TaskManager configuration (use this for crawl-wide settings) manager_params.data_directory = Path("./datadir/") manager_params.log_directory = Path("./datadir/") # Commands time out by default after 60 seconds with TaskManager( manager_params, browser_params, SQLiteStorageProvider( Path("./datadir/{}.sqlite".format(db_filename))), None, ) as manager: # Visits the sites for index, site in enumerate(sites): def callback(success: bool, val: str = site) -> None: print( f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}" ) # Parallelize sites over all number of browsers set above. command_sequence = CommandSequence( site, site_rank=index, reset=True, callback=callback, ) # Start by visiting the page command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) # Run commands across the three browsers (simple parallelization) manager.execute_command_sequence(command_sequence)