def test_crash_profile(self):
    manager_params, browser_params = self.get_config()
    manager_params.failure_limit = 2
    manager = TaskManager(manager_params, browser_params)
    try:
        manager.get("http://example.com")  # So we have a profile
        manager.get("example.com")  # Selenium requires scheme prefix
        manager.get("example.com")  # Selenium requires scheme prefix
        manager.get("example.com")  # Selenium requires scheme prefix
        manager.get("example.com")  # Requires two commands to shut down
    except CommandExecutionError:
        pass
    assert isfile(
        join(browser_params[0].profile_archive_dir, "profile.tar.gz")
    )
def test_profile_saved_when_launch_crashes(self):
    manager_params, browser_params = self.get_config()
    browser_params[0].proxy = True
    browser_params[0].save_content = "script"
    manager = TaskManager(manager_params, browser_params)
    manager.get("http://example.com")

    # Kill the LevelDBAggregator
    # This will cause the proxy launch to crash
    manager.ldb_status_queue.put("DIE")
    manager.browsers[0]._SPAWN_TIMEOUT = 2  # Have timeout occur quickly
    manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2  # Quick timeout
    manager.get("example.com")  # Cause a selenium crash

    # The browser will fail to launch due to the proxy crashes
    try:
        manager.get("http://example.com")
    except CommandExecutionError:
        pass
    manager.close()
    assert isfile(
        join(browser_params[0].profile_archive_dir, "profile.tar.gz")
    )
def test_custom_function(default_params, xpi, server):
    """Test `custom_function` with an inline func that collects links"""
    table_name = TableName("page_links")
    manager_params, browser_params = default_params
    path = manager_params.data_directory / "crawl-data.sqlite"
    db = sqlite3.connect(path)
    cur = db.cursor()
    cur.execute(
        """CREATE TABLE IF NOT EXISTS %s (
            top_url TEXT,
            link TEXT,
            visit_id INTEGER,
            browser_id INTEGER);"""
        % table_name
    )
    cur.close()
    db.close()

    storage_provider = SQLiteStorageProvider(path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)
    cs = command_sequence.CommandSequence(url_a)
    cs.get(sleep=0, timeout=60)
    cs.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
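# A minimal sketch of what a command like `CollectLinksCommand` could look like,
# assuming (as in OpenWPM's custom_command.py example) that custom commands
# subclass BaseCommand and implement execute(webdriver, browser_params,
# manager_params, extension_socket). Writing straight into the crawl SQLite
# database is a simplification for illustration; the real test's command may
# store rows differently. `self.visit_id` and `self.browser_id` are assumed to
# be set on the command by the TaskManager before execution.
import sqlite3
from urllib.parse import urlparse

from selenium.webdriver.common.by import By

from openwpm.commands.types import BaseCommand


class CollectLinksCommand(BaseCommand):
    """Collect links with the given scheme from the loaded page."""

    def __init__(self, table_name: str, scheme: str) -> None:
        self.table_name = table_name
        self.scheme = scheme

    def __repr__(self) -> str:
        return f"CollectLinksCommand({self.table_name!r}, {self.scheme!r})"

    def execute(self, webdriver, browser_params, manager_params, extension_socket):
        top_url = webdriver.current_url
        links = set()
        for element in webdriver.find_elements(By.TAG_NAME, "a"):
            href = element.get_attribute("href")
            if href and urlparse(href).scheme == self.scheme:
                links.add(href)

        # Insert one row per collected link into the table created by the test
        db = sqlite3.connect(manager_params.data_directory / "crawl-data.sqlite")
        with db:
            db.executemany(
                "INSERT INTO %s (top_url, link, visit_id, browser_id) "
                "VALUES (?, ?, ?, ?)" % self.table_name,
                [(top_url, link, self.visit_id, self.browser_id) for link in links],
            )
        db.close()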
def test_saving(self):
    manager_params, browser_params = self.get_config()
    manager = TaskManager(manager_params, browser_params)
    manager.get("http://example.com")
    manager.close()
    assert isfile(
        join(browser_params[0].profile_archive_dir, "profile.tar.gz")
    )
def _create_task_manager(
    params: Tuple[ManagerParams, List[BrowserParams]]
) -> Tuple[TaskManager, Path]:
    manager_params, browser_params = params
    db_path = manager_params.data_directory / "crawl-data.sqlite"
    structured_provider = SQLiteStorageProvider(db_path)
    manager = TaskManager(
        manager_params,
        browser_params,
        structured_provider,
        None,
    )
    return manager, db_path
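# A minimal sketch of how the helper above might be used in a test. The
# `default_params` fixture name and the target URL are assumptions borrowed
# from the other snippets in this section.
def test_task_manager_round_trip(default_params):
    manager, db_path = _create_task_manager(default_params)
    cs = CommandSequence("http://example.com")
    cs.get(sleep=0, timeout=60)
    manager.execute_command_sequence(cs)
    manager.close()
    # The structured provider should have created the SQLite database.
    assert db_path.is_file()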
def test_crash(self):
    manager_params, browser_params = self.get_config()
    manager_params.failure_limit = 0
    manager = TaskManager(manager_params, browser_params)
    with pytest.raises(CommandExecutionError):
        manager.get("http://example.com")  # So we have a profile
        manager.get("example.com")  # Selenium requires scheme prefix
        manager.get("example.com")  # Requires two commands to shut down
def test_display_shutdown(self):
    manager_params, browser_params = self.get_config()
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)
    port = manager.browsers[0].display_port

    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    sequence.append_command(ExceptionCommand)
    manager.execute_command_sequence(sequence)
    manager.close()
    assert not os.path.exists("/tmp/.X%s-lock" % port)
def test_local_callbacks(self):
    manager_params, browser_params = self.get_config()
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)

    def callback(argument: List[int], success: bool):
        argument.extend([1, 2, 3])

    my_list = []
    sequence = CommandSequence(
        TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list)
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()
    assert my_list == [1, 2, 3]
def test_seed_persistance(self):
    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0].seed_tar = "."
    command_sequences = []
    for _ in range(2):
        cs = CommandSequence(url="https://example.com", reset=True)
        cs.get()
        cs.append_command(TestConfigSetCommand("test_pref", True))
        command_sequences.append(cs)
    manager = TaskManager(manager_params, browser_params)
    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()

    query_result = db_utils.query_db(
        manager_params.database_name,
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
browser_params[i].callstack_instrument = True
# Record DNS resolution
browser_params[i].dns_instrument = True

# Update TaskManager configuration (use this for crawl-wide settings)
manager_params.data_directory = "~/Desktop/"
manager_params.log_directory = "~/Desktop/"

# memory_watchdog and process_watchdog are useful for large scale cloud crawls.
# Please refer to docs/Configuration.md#platform-configuration-options for more information
# manager_params.memory_watchdog = True
# manager_params.process_watchdog = True

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(manager_params, browser_params)

# Visits the sites
for site in sites:
    # Parallelize sites over all browsers set above.
    command_sequence = CommandSequence(
        site,
        reset=True,
        callback=lambda success, val=site: print("CommandSequence {} done".format(val)),
    )

    # Start by visiting the page
    command_sequence.append_command(GetCommand(url=site, sleep=10), timeout=60)
    # Have a look at custom_command.py to see how to implement your own command
def processSite(site):
    # The list of sites that we wish to crawl
    NUM_BROWSERS = 1
    sites = [site]

    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # memory_watchdog and process_watchdog are useful for large scale cloud crawls.
    # Please refer to docs/Configuration.md#platform-configuration-options for more information
    # manager_params.memory_watchdog = True
    # manager_params.process_watchdog = True

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/crawl-data.sqlite")),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)
            # Have a look at custom_command.py to see how to implement your own command
            command_sequence.append_command(LinkCountingCommand())

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)

    return None
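# A minimal sketch of how `processSite` might be driven over a list of sites,
# one short-lived TaskManager per site; the site list is an illustrative
# assumption. Calls are made sequentially here because every call writes to
# the same ./datadir/ configured above.
if __name__ == "__main__":
    sites_to_crawl = [
        "http://example.com",
        "http://mozilla.org",
    ]
    for s in sites_to_crawl:
        processSite(s)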
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY,
    token=AUTH_TOKEN,
)
unstructured = GcsUnstructuredProvider(
    project=GCP_PROJECT,
    bucket_name=GCS_BUCKET,
    base_path=CRAWL_DIRECTORY + "/data",
    token=AUTH_TOKEN,
)

# Instantiates the measurement platform
# Commands time out by default after 60 seconds
manager = TaskManager(
    manager_params,
    browser_params,
    structured,
    unstructured,
    logger_kwargs=LOGGER_SETTINGS,
)

# At this point, Sentry should already be initialized
if SENTRY_DSN:
    # Add crawler.py-specific context
    with sentry_sdk.configure_scope() as scope:
        # tags generate breakdown charts and search filters
        scope.set_tag("CRAWL_DIRECTORY", CRAWL_DIRECTORY)
        scope.set_tag("GCS_BUCKET", GCS_BUCKET)
        scope.set_tag("DISPLAY_MODE", DISPLAY_MODE)
        scope.set_tag("HTTP_INSTRUMENT", HTTP_INSTRUMENT)
        scope.set_tag("COOKIE_INSTRUMENT", COOKIE_INSTRUMENT)
        scope.set_tag("NAVIGATION_INSTRUMENT", NAVIGATION_INSTRUMENT)
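# For context, a minimal sketch of the Sentry setup that the comment above
# refers to, run before the TaskManager is created. Only the public
# sentry_sdk.init API is assumed; how the real crawler configures sampling or
# integrations is not shown in this fragment.
import sentry_sdk

if SENTRY_DSN:
    sentry_sdk.init(dsn=SENTRY_DSN)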
def test_profile_error(self):
    manager_params, browser_params = self.get_config()
    browser_params[0].seed_tar = "/tmp/NOTREAL"
    with pytest.raises(ProfileLoadError):
        TaskManager(manager_params, browser_params)  # noqa
def crawl(sites, db_filename):
    '''
    sites is the list of sites that we wish to crawl
    db_filename is the file name of the output database
    '''
    # Loads the default ManagerParams
    # and NUM_BROWSERS copies of the default BrowserParams
    NUM_BROWSERS = 12
    manager_params = ManagerParams(num_browsers=NUM_BROWSERS)
    browser_params = [
        BrowserParams(display_mode="headless") for _ in range(NUM_BROWSERS)
    ]

    # Update browser configuration (use this for per-browser settings)
    for i in range(NUM_BROWSERS):
        # Record HTTP Requests and Responses
        browser_params[i].http_instrument = True
        # Record cookie changes
        browser_params[i].cookie_instrument = True
        # Record Navigations
        browser_params[i].navigation_instrument = True
        # Record JS Web API calls
        browser_params[i].js_instrument = True
        # Record the callstack of all WebRequests made
        browser_params[i].callstack_instrument = True
        # Record DNS resolution
        browser_params[i].dns_instrument = True
        browser_params[i].bot_mitigation = True

    # Update TaskManager configuration (use this for crawl-wide settings)
    manager_params.data_directory = Path("./datadir/")
    manager_params.log_directory = Path("./datadir/")

    # Commands time out by default after 60 seconds
    with TaskManager(
        manager_params,
        browser_params,
        SQLiteStorageProvider(Path("./datadir/{}.sqlite".format(db_filename))),
        None,
    ) as manager:
        # Visits the sites
        for index, site in enumerate(sites):

            def callback(success: bool, val: str = site) -> None:
                print(
                    f"CommandSequence for {val} ran {'successfully' if success else 'unsuccessfully'}"
                )

            # Parallelize sites over all browsers set above.
            command_sequence = CommandSequence(
                site,
                site_rank=index,
                reset=True,
                callback=callback,
            )

            # Start by visiting the page
            command_sequence.append_command(GetCommand(url=site, sleep=3), timeout=60)

            # Run commands across all browsers (simple parallelization)
            manager.execute_command_sequence(command_sequence)
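# A minimal sketch of how `crawl` might be invoked; the site list and database
# name are illustrative assumptions.
if __name__ == "__main__":
    sites = [
        "http://example.com",
        "http://mozilla.org",
    ]
    crawl(sites, "example-crawl")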