def test_seed_persistence(default_params, task_manager_creator):
    """Run two visits with a seed profile tar configured on every browser
    and verify that every command recorded in crawl_history finished "ok".
    """
    manager_params, browser_params = default_params
    seed_path = Path("profile.tar.gz")
    for params in browser_params:
        params.seed_tar = seed_path
    manager, db = task_manager_creator(default_params)

    def build_sequence():
        # Each visit loads the test page, then asserts the seeded pref is set.
        sequence = CommandSequence(url=BASE_TEST_URL)
        sequence.get()
        sequence.append_command(AssertConfigSetCommand("test_pref", True))
        return sequence

    sequences = [build_sequence() for _ in range(2)]
    for sequence in sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(rows) > 0
    for row in rows:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_dump_profile_command(default_params, task_manager_creator):
    """Dump the browser profile to a tar archive via the dump_profile
    command and verify the archive exists on disk after shutdown."""
    manager_params, browser_params = default_params
    # A single browser is enough for this test.
    manager_params.num_browsers = 1
    manager, _ = task_manager_creator((manager_params, browser_params[:1]))

    tar_path = manager_params.data_directory / "profile.tar.gz"
    sequence = CommandSequence(url=BASE_TEST_URL)
    sequence.get()
    sequence.dump_profile(tar_path, True)
    manager.execute_command_sequence(sequence)
    manager.close()

    assert tar_path.is_file()
def test_display_shutdown(self):
    """Check that the virtual display's lockfile is removed when the
    TaskManager shuts down, even after a command raises an exception."""
    manager_params, browser_params = self.get_config()
    TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)
    # The display claims a port N; its lockfile lives at /tmp/.XN-lock.
    port = manager.browsers[0].display_port
    sequence = CommandSequence(TEST_SITE)
    sequence.get()
    # NOTE(review): the command *class* is appended here, not an instance —
    # the sibling fixture-based test appends ExceptionCommand(); confirm
    # which form this revision of the TaskManager API expects.
    sequence.append_command(ExceptionCommand)
    manager.execute_command_sequence(sequence)
    manager.close()
    # Lockfile must be cleaned up despite the deliberately failing command.
    assert not os.path.exists("/tmp/.X%s-lock" % port)
def test_display_shutdown(task_manager_creator, default_params):
    """Run under XVFB and verify the X display lockfile is deleted on
    shutdown, even when a command in the sequence raises."""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.display_mode = "xvfb"
    test_site = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    # XVFB display N leaves a lockfile at /tmp/.XN-lock while running.
    port = manager.browsers[0].display_port

    sequence = CommandSequence(test_site)
    sequence.get()
    # Deliberately raise inside the browser to exercise the failure path.
    sequence.append_command(ExceptionCommand())
    manager.execute_command_sequence(sequence)
    manager.close()

    assert not os.path.exists("/tmp/.X%s-lock" % port)
def test_local_callbacks(self):
    """Verify that a blocking command sequence invokes its completion
    callback exactly once with the expected side effect."""
    manager_params, browser_params = self.get_config()
    target_url = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager = TaskManager(manager_params, browser_params)

    collected: List[int] = []

    def on_finished(success: bool):
        # The sentinel values prove the callback machinery ran us.
        collected.extend([1, 2, 3])

    sequence = CommandSequence(
        target_url, reset=True, blocking=True, callback=on_finished
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()

    assert collected == [1, 2, 3]
def test_local_callbacks(default_params, task_manager_creator):
    """Exercise the storage controller and the full callback machinery:
    a blocking sequence's callback must run before close() returns."""
    manager, _ = task_manager_creator(default_params)
    target_url = BASE_TEST_URL + "/test_pages/simple_a.html"

    collected: List[int] = []

    def on_finished(success: bool) -> None:
        # The sentinel values prove the callback was actually invoked.
        collected.extend([1, 2, 3])

    sequence = CommandSequence(target_url, blocking=True, callback=on_finished)
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()

    assert collected == [1, 2, 3]
def test_s3_callbacks(self):
    """Check that by the time a blocking sequence's callback fires, the
    visit's http_requests data is already queryable from the S3 dataset."""
    target_url = BASE_TEST_URL + "/test_pages/simple_a.html"
    manager_params, browser_params = self.get_config()
    dataset = LocalS3Dataset(manager_params.s3_bucket, manager_params.s3_directory)
    manager = task_manager.TaskManager(manager_params, browser_params)
    results = Queue()

    def ensure_site_in_s3(success: bool):
        # Ensure http table is created and the visited site shows up in it;
        # push the boolean through a queue so the test body can assert on it.
        top_level_urls = dataset.load_table("http_requests").top_level_url.unique()
        results.put(target_url in top_level_urls)

    sequence = CommandSequence(
        target_url, reset=True, blocking=True, callback=ensure_site_in_s3
    )
    sequence.get()
    manager.execute_command_sequence(sequence)
    manager.close()

    assert results.get()
def test_seed_persistance(self):
    """Two stateless visits with a seed profile configured must both run
    to completion, leaving only "ok" rows in crawl_history."""
    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0].seed_tar = "."

    def build_sequence():
        # Each visit loads the page and then checks the seeded pref.
        sequence = CommandSequence(url="https://example.com", reset=True)
        sequence.get()
        sequence.append_command(TestConfigSetCommand("test_pref", True))
        return sequence

    sequences = [build_sequence() for _ in range(2)]
    manager = TaskManager(manager_params, browser_params)
    for sequence in sequences:
        manager.execute_command_sequence(sequence)
    manager.close()

    rows = db_utils.query_db(
        manager_params.database_name,
        "SELECT * FROM crawl_history;",
    )
    assert len(rows) > 0
    for row in rows:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_seed_persistance(self):
    """Visit a page twice with a seed profile (seed_tar) configured and
    verify the seeded preference is present in each fresh profile and
    that every recorded command succeeded."""

    def test_config_is_set(*args, **kwargs):
        # Custom command body: receives the selenium driver via kwargs.
        driver = kwargs["driver"]
        driver.get("about:config")
        # Read "test_pref" through the XPCOM preferences service;
        # getBoolPref throws when the pref is absent, hence the
        # try/catch that maps "missing" to false.
        result = driver.execute_script("""
            var prefs = Components
                        .classes["@mozilla.org/preferences-service;1"]
                        .getService(Components.interfaces.nsIPrefBranch);
            try {
                return prefs.getBoolPref("test_pref")
            } catch (e) {
                return false;
            }
        """)
        assert result

    manager_params, browser_params = self.get_test_config(num_browsers=1)
    # Seed tar location; "." presumably resolves relative to the test's
    # working directory — TODO confirm what the loader expects here.
    browser_params[0]["seed_tar"] = "."
    command_sequences = []
    for _ in range(2):
        # reset=True gives each visit a fresh, re-seeded profile.
        cs = CommandSequence(url="https://example.com", reset=True)
        cs.get()
        cs.run_custom_function(test_config_is_set)
        command_sequences.append(cs)
    manager = task_manager.TaskManager(manager_params, browser_params)
    for cs in command_sequences:
        manager.execute_command_sequence(cs)
    manager.close()
    query_result = db_utils.query_db(
        manager_params["db"],
        "SELECT * FROM crawl_history;",
    )
    assert len(query_result) > 0
    for row in query_result:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_cache_hits_recorded(http_params, task_manager_creator):
    """Verify all http responses are recorded, including cached responses

    Note that we expect to see all of the same requests and responses
    during the second vist (even if cached) except for images. Cached
    images do not trigger Observer Notification events.
    See Bug 634073: https://bugzilla.mozilla.org/show_bug.cgi?id=634073

    The test page includes an image which does several permanent redirects
    before returning a 404. We expect to see new requests and responses
    for this image when the page is reloaded. Additionally, the redirects
    should be cached.
    """
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    # ensuring that we only spawn one browser
    manager_params.num_browsers = 1
    manager, db = task_manager_creator((manager_params, [browser_params[0]]))
    # Visit the page twice; site_rank distinguishes first visit (0, cold
    # cache) from second visit (1, warm cache) in the queries below.
    for i in range(2):
        cs = CommandSequence(test_url, site_rank=i)
        cs.get(sleep=5)
        manager.execute_command_sequence(cs)
    manager.close()

    # Maps request_id -> full URL so responses can be joined back to
    # their originating request.
    request_id_to_url = dict()

    # HTTP Requests — only rows from the second (cached) visit.
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_requests as hr
        JOIN site_visits sv
            ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add(
            (
                # Strip query strings so records compare stably.
                row["url"].split("?")[0],
                row["top_level_url"],
                row["triggering_origin"],
                row["loading_origin"],
                row["loading_href"],
                row["is_XHR"],
                row["is_third_party_channel"],
                row["is_third_party_to_top_window"],
                row["resource_type"],
            )
        )
        request_id_to_url[row["request_id"]] = row["url"]
    assert observed_records == HTTP_CACHED_REQUESTS

    # HTTP Responses — only rows from the second (cached) visit.
    rows = db_utils.query_db(
        db,
        """
        SELECT hp.*
        FROM http_responses as hp
        JOIN site_visits sv
            ON sv.visit_id = hp.visit_id and sv.browser_id = hp.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # HACK: favicon caching is unpredictable, don't bother checking it
        if row["url"].split("?")[0].endswith("favicon.ico"):
            continue
        observed_records.add(
            (
                row["url"].split("?")[0],
                # TODO: referrer isn't available yet in the
                # webext instrumentation | row['referrer'],
                row["is_cached"],
            )
        )
        # Every response must pair with a request recorded above.
        assert row["request_id"] in request_id_to_url
        assert request_id_to_url[row["request_id"]] == row["url"]
    assert HTTP_CACHED_RESPONSES == observed_records

    # HTTP Redirects — only rows from the second (cached) visit.
    rows = db_utils.query_db(
        db,
        """
        SELECT hr.*
        FROM http_redirects as hr
        JOIN site_visits sv
            ON sv.visit_id = hr.visit_id and sv.browser_id = hr.browser_id
        WHERE sv.site_rank = 1""",
    )
    observed_records = set()
    for row in rows:
        # TODO: new_request_id isn't supported yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row["old_request_url"].split("?")[0]
        dst = row["new_request_url"].split("?")[0]
        observed_records.add((src, dst))
    assert HTTP_CACHED_REDIRECTS == observed_records
        # NOTE(review): this chunk begins mid-statement — `timeout=5)` closes
        # a job_queue call whose opening (and the enclosing loop header) lie
        # outside this view; indentation below is a best-effort reconstruction.
        timeout=5)
    if job is None:
        # No job leased within the timeout: idle briefly, then poll again.
        manager.logger.info("Waiting for work")
        time.sleep(5)
        continue
    # Track the leased job until its completion callback marks it done.
    unsaved_jobs.append(job)
    retry_number = job_queue.get_retry_number(job)
    # Jobs are encoded as "rank,site" byte strings.
    site_rank, site = job.decode("utf-8").split(",")
    if "://" not in site:
        # Sites may omit the scheme; default to plain http.
        site = "http://" + site
    manager.logger.info("Visiting %s..." % site)
    callback = get_job_completion_callback(
        manager.logger, unsaved_jobs_lock, job_queue, job
    )
    # blocking=True ensures at most one outstanding visit per worker;
    # reset=True gives each site a fresh browser profile.
    command_sequence = CommandSequence(
        site,
        blocking=True,
        reset=True,
        retry_number=retry_number,
        callback=callback,
        site_rank=int(site_rank),
    )
    command_sequence.get(sleep=DWELL_TIME, timeout=TIMEOUT)
    manager.execute_command_sequence(command_sequence)
else:
    # Loop finished without break: the job queue reported completion.
    manager.logger.info("Job queue finished, exiting.")
manager.close()

if SENTRY_DSN:
    # Report successful completion when Sentry monitoring is configured.
    sentry_sdk.capture_message("Crawl worker finished")
manager_params.log_directory = "~/Desktop/" # memory_watchdog and process_watchdog are useful for large scale cloud crawls. # Please refer to docs/Configuration.md#platform-configuration-options for more information # manager_params.memory_watchdog = True # manager_params.process_watchdog = True # Instantiates the measurement platform # Commands time out by default after 60 seconds manager = TaskManager(manager_params, browser_params) # Visits the sites for site in sites: # Parallelize sites over all number of browsers set above. command_sequence = CommandSequence( site, reset=True, callback=lambda success, val=site: print("CommandSequence {} done". format(val)), ) # Start by visiting the page command_sequence.get(sleep=3, timeout=60) # Run commands across the three browsers (simple parallelization) manager.execute_command_sequence(command_sequence) # Shuts down the browsers and waits for the data to finish logging manager.close()