def test_output_format():
    """An unrecognized output_format must fail validation; "s3" must pass."""
    params = ManagerParams()
    # Arbitrary unsupported value -> ConfigError.
    params.output_format = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)
    # "s3" is a supported output format, so validation succeeds.
    params.output_format = "s3"
    validate_manager_params(params)
def test_log_file_extension():
    """log_file values with a bad extension or a non-string type are rejected."""
    params = ManagerParams()
    # Both an unsupported extension and a non-string value must raise.
    for bad_value in ("something.unsupported", []):
        params.log_file = bad_value
        with pytest.raises(ConfigError):
            validate_manager_params(params)
def test_failure_limit():
    """failure_limit must be either None or an int; anything else is rejected."""
    params = ManagerParams()
    # A value that is neither None nor an int -> ConfigError.
    params.failure_limit = "not None and not int"
    with pytest.raises(ConfigError):
        validate_manager_params(params)
    # Both None and an int are valid settings.
    for ok_value in (None, 2):
        params.failure_limit = ok_value
        validate_manager_params(params)
def get_test_config(
    self,
    data_dir: Optional[Path] = None,
    num_browsers: int = NUM_BROWSERS,
    display_mode: str = "headless",
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Load and return the default test parameters.

    Falls back to ``self.tmpdir`` when no data_dir is given and builds
    ``num_browsers`` browser configs, each set to ``display_mode``.
    """
    data_dir = data_dir or self.tmpdir
    assert data_dir is not None  # Mypy doesn't understand this without help
    manager_params = ManagerParams(num_browsers=num_browsers)
    manager_params.log_path = data_dir / "openwpm.log"
    manager_params.num_browsers = num_browsers
    browser_params = []
    for _ in range(num_browsers):
        config = BrowserParams()
        config.display_mode = display_mode
        browser_params.append(config)
    return manager_params, browser_params
def default_params(
    tmp_path: Path, num_browsers: int = NUM_BROWSERS
) -> Tuple[ManagerParams, List[BrowserParams]]:
    """Just a simple wrapper around task_manager.load_default_params

    Builds a ManagerParams plus ``num_browsers`` headless BrowserParams,
    with data and log output rooted at ``tmp_path``.

    Bug fix: the original ignored the ``num_browsers`` argument and used
    the module constant NUM_BROWSERS to size everything, which raised
    IndexError when ``num_browsers > NUM_BROWSERS`` and produced the
    wrong number of browser configs otherwise.
    """
    # num_browsers is necessary to let TaskManager know how many browsers to spawn
    manager_params = ManagerParams(num_browsers=num_browsers)
    # display_mode is set via the constructor, so no per-item fix-up loop is needed.
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(num_browsers)
    ]
    manager_params.data_directory = tmp_path
    manager_params.log_path = tmp_path / "openwpm.log"
    return manager_params, browser_params
def test_num_browser_crawl_config():
    """The BrowserParams list length must match manager_params.num_browsers."""
    manager_params = ManagerParams(num_browsers=2)
    # One config for two browsers: invalid.
    browser_params = [BrowserParams()]
    with pytest.raises(ConfigError):
        validate_crawl_configs(manager_params, browser_params)
    # With a second config the pair becomes valid.
    browser_params = browser_params + [BrowserParams()]
    validate_crawl_configs(manager_params, browser_params)
def get_test_config(
    self, data_dir="", num_browsers=NUM_BROWSERS, display_mode="headless"
):
    """Load and return the default test parameters.

    Uses ``self.tmpdir`` when no data_dir is supplied; the database path
    is made absolute inside the data directory.
    """
    data_dir = data_dir or self.tmpdir
    manager_params = ManagerParams(num_browsers=num_browsers)
    manager_params.data_directory = data_dir
    manager_params.log_directory = data_dir
    manager_params.num_browsers = num_browsers
    browser_params = []
    for _ in range(num_browsers):
        config = BrowserParams()
        config.display_mode = display_mode
        browser_params.append(config)
    # Anchor the database file inside the data directory.
    manager_params.database_name = join(
        manager_params.data_directory, manager_params.database_name
    )
    return manager_params, browser_params
def __init__(
    self,
    manager_params_temp: ManagerParams,
    browser_params_temp: List[BrowserParams],
    structured_storage_provider: StructuredStorageProvider,
    unstructured_storage_provider: Optional[UnstructuredStorageProvider],
    logger_kwargs: Optional[Dict[Any, Any]] = None,
) -> None:
    """Initialize the TaskManager with browser and manager config params

    Parameters
    ----------
    manager_params_temp : ManagerParams
        TaskManager configuration parameters
    browser_params_temp : list of BrowserParams
        Browser configuration parameters. It is a list which
        includes individual configurations for each browser.
    structured_storage_provider : StructuredStorageProvider
        Backend used for structured crawl records.
    unstructured_storage_provider : UnstructuredStorageProvider, optional
        Backend used for unstructured blobs; may be None.
    logger_kwargs : dict, optional
        Keyword arguments to pass to MPLogger on initialization.
    """
    # Fix: avoid the shared-mutable-default pitfall; callers that passed a
    # dict (or nothing) see identical behavior.
    if logger_kwargs is None:
        logger_kwargs = {}
    validate_crawl_configs(manager_params_temp, browser_params_temp)
    # Convert the public param objects to their internal counterparts.
    manager_params = ManagerParamsInternal.from_dict(manager_params_temp.to_dict())
    browser_params = [
        BrowserParamsInternal.from_dict(bp.to_dict()) for bp in browser_params_temp
    ]
    manager_params.screenshot_path = manager_params.data_directory / "screenshots"
    manager_params.source_dump_path = manager_params.data_directory / "sources"
    self.manager_params = manager_params
    self.browser_params = browser_params
    self._logger_kwargs = logger_kwargs
    # Create data directories if they do not exist
    if not os.path.exists(manager_params.screenshot_path):
        os.makedirs(manager_params.screenshot_path)
    if not os.path.exists(manager_params.source_dump_path):
        os.makedirs(manager_params.source_dump_path)
    # Check size of parameter dictionary
    self.num_browsers = manager_params.num_browsers
    # Parse and flesh out js_instrument_settings
    for a_browsers_params in self.browser_params:
        js_settings = a_browsers_params.js_instrument_settings
        cleaned_js_settings = clean_js_instrumentation_settings(js_settings)
        a_browsers_params.cleaned_js_instrument_settings = cleaned_js_settings
    # Flow control
    self.closing = False
    self.failure_status: Optional[Dict[str, Any]] = None
    self.threadlock = threading.Lock()
    self.failure_count = 0
    self.failure_limit = manager_params.failure_limit
    # Start logging server thread
    self.logging_server = MPLogger(
        self.manager_params.log_path,
        str(structured_storage_provider),
        **self._logger_kwargs
    )
    self.manager_params.logger_address = self.logging_server.logger_address
    self.logger = logging.getLogger("openwpm")
    # Initialize the storage controller
    self._launch_storage_controller(
        structured_storage_provider, unstructured_storage_provider
    )
    # Sets up the BrowserManager(s) + associated queues
    self.browsers = self._initialize_browsers(browser_params)
    self._launch_browsers()
    # Start the manager watchdog (daemon so it never blocks shutdown)
    thread = threading.Thread(target=self._manager_watchdog, args=())
    thread.daemon = True
    thread.name = "OpenWPM-watchdog"
    thread.start()
    # Save crawl config information to database
    openwpm_v, browser_v = get_version()
    self.storage_controller_handle.save_configuration(
        manager_params, browser_params, openwpm_v, browser_v
    )
    self.logger.info(
        get_configuration_string(
            self.manager_params, browser_params, (openwpm_v, browser_v)
        )
    )
    self.unsaved_command_sequences: Dict[int, CommandSequence] = dict()
    # Completion handler marks finished CommandSequences as complete.
    self.callback_thread = threading.Thread(
        target=self._mark_command_sequences_complete, args=()
    )
    self.callback_thread.name = "OpenWPM-completion_handler"
    self.callback_thread.start()
def test_database_file_extension():
    """A database_name with an unsupported extension must fail validation."""
    params = ManagerParams()
    params.database_name = "something.unsupported"
    with pytest.raises(ConfigError):
        validate_manager_params(params)
from openwpm.task_manager import TaskManager

# Code gotten from the OpenWPM project with slight modifications

# The list of sites that we wish to crawl
NUM_BROWSERS = 1
sites = []
# Fix: use a context manager so the file handle is always closed
# (the original left the handle open for the life of the process).
with open("SubPagesLess.txt", "r") as site_file:
    for page in site_file:
        sites.append(page.strip("\n"))

# Loads the default ManagerParams
# and NUM_BROWSERS copies of the default BrowserParams
manager_params = ManagerParams(
    num_browsers=NUM_BROWSERS
)  # num_browsers is necessary to let TaskManager know how many browsers to spawn
browser_params = [
    BrowserParams(display_mode="native") for _ in range(NUM_BROWSERS)
]

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
def processSite(site):
    """Crawl a single `site` with one headless browser, recording HTTP,
    cookie, navigation, JS, callstack, and DNS instrumentation into
    ./datadir/crawl-data.sqlite. Returns None.
    """
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]
    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]
    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True
    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")
    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True
    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            # NOTE: `val=site` binds the current site as a default arg so the
            # late-binding-closure pitfall is avoided.
            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all number of browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )
            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())
            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
    return (None)
# Crawl-worker configuration assembled from environment variables.
# NOTE(review): SENTRY_DSN, LOGGER_SETTINGS, and the *_INSTRUMENT /
# DISPLAY_MODE / SAVE_CONTENT constants are defined elsewhere in this file.
SENTRY_DSN = os.getenv("SENTRY_DSN", None)
LOGGER_SETTINGS = mp_logger.parse_config_from_env()
if CALLSTACK_INSTRUMENT is True:
    # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work
    JS_INSTRUMENT = True
# Lease time covers two full visits (timeout + dwell + 30s slack each).
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)
# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/openwpm/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]
# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i].display_mode = DISPLAY_MODE
    browser_params[i].http_instrument = HTTP_INSTRUMENT
    browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
    # SAVE_CONTENT is an env-derived string flag: "1"/"0" toggle saving;
    # any other value leaves the BrowserParams default untouched.
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
# Demo crawl script: configures one headless browser to visit a fixed
# list of sites with full instrumentation enabled.
from openwpm.config import BrowserParams, ManagerParams
from openwpm.storage.sql_provider import SQLiteStorageProvider
from openwpm.task_manager import TaskManager

# The list of sites that we wish to crawl
NUM_BROWSERS = 1
sites = [
    "http://www.example.com",
    "http://www.princeton.edu",
    "http://citp.princeton.edu/",
]

# Loads the default ManagerParams
# and NUM_BROWSERS copies of the default BrowserParams
manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
browser_params = [
    BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
]

# Update browser configuration (use this for per-browser settings)
for i in range(NUM_BROWSERS):
    # Record HTTP Requests and Responses
    browser_params[i].http_instrument = True
    # Record cookie changes
    browser_params[i].cookie_instrument = True
    # Record Navigations
    browser_params[i].navigation_instrument = True
    # Record JS Web API calls
    browser_params[i].js_instrument = True
    # Record the callstack of all WebRequests made
# Crawl-worker configuration assembled from environment variables
# (variant of the block above; MAX_JOB_RETRIES defaults to 2).
MAX_JOB_RETRIES = int(os.getenv("MAX_JOB_RETRIES", "2"))
# JS_INSTRUMENT_SETTINGS arrives as a JSON string and is parsed in place.
JS_INSTRUMENT_SETTINGS = json.loads(JS_INSTRUMENT_SETTINGS)
if CALLSTACK_INSTRUMENT is True:
    # Must have JS_INSTRUMENT True for CALLSTACK_INSTRUMENT to work
    JS_INSTRUMENT = True
# Lease time covers two full visits (timeout + dwell + 30s slack each).
EXTENDED_LEASE_TIME = 2 * (TIMEOUT + DWELL_TIME + 30)
# Loads the default manager params
# We can't use more than one browser per instance because the job management
# code below requires blocking commands. For more context see:
# https://github.com/mozilla/OpenWPM/issues/470
NUM_BROWSERS = 1
manager_params = ManagerParams()
browser_params = [BrowserParams() for _ in range(NUM_BROWSERS)]
# Browser configuration
for i in range(NUM_BROWSERS):
    browser_params[i].display_mode = DISPLAY_MODE
    browser_params[i].http_instrument = HTTP_INSTRUMENT
    browser_params[i].cookie_instrument = COOKIE_INSTRUMENT
    browser_params[i].navigation_instrument = NAVIGATION_INSTRUMENT
    browser_params[i].callstack_instrument = CALLSTACK_INSTRUMENT
    browser_params[i].js_instrument = JS_INSTRUMENT
    browser_params[i].js_instrument_settings = JS_INSTRUMENT_SETTINGS
    # SAVE_CONTENT is an env-derived string flag: "1"/"0" toggle saving;
    # any other value leaves the BrowserParams default untouched.
    if SAVE_CONTENT == "1":
        browser_params[i].save_content = True
    elif SAVE_CONTENT == "0":
        browser_params[i].save_content = False
def crawl(sites, db_filename): ''' sites ihe list of sites that we wish to crawl db_filename is the file name of the output database ''' # Loads the default ManagerParams # and NUM_BROWSERS copies of the default BrowserParams NUM_BROWSERS = 12 manager_params = ManagerParams(num_browsers=NUM_BROWSERS) browser_params = [ BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS) ] # Update browser configuration (use this for per-browser settings) for i in range(NUM_BROWSERS): # Record HTTP Requests and Responses browser_params[i].http_instrument = True # Record cookie changes browser_params[i].cookie_instrument = True # Record Navigations browser_params[i].navigation_instrument = True # Record JS Web API calls browser_params[i].js_instrument = True # Record the callstack of all WebRequests made browser_params[i].callstack_instrument = True # Record DNS resolution browser_params[i].dns_instrument = True browser_params[i].bot_mitigation = True # Update TaskManager configuration (use this for crawl-wide settings) manager_params.data_directory = Path("./datadir/") manager_params.log_directory = Path("./datadir/") # Commands time out by default after 60 seconds with TaskManager( manager_params, browser_params, SQLiteStorageProvider( Path("./datadir/{}.sqlite".format(db_filename))), None, ) as manager: # Visits the sites for index, site in enumerate(sites): def callback(success: bool, val: str = site) -> None: print( f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}" ) # Parallelize sites over all number of browsers set above. command_sequence = CommandSequence( site, site_rank=index, reset=True, callback=callback, ) # Start by visiting the page command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60) # Run commands across the three browsers (simple parallelization) manager.execute_command_sequence(command_sequence)