def test_extension_gets_correct_visit_id(self) -> None:
    """Verify that JS instrumentation rows carry the visit_id of the page
    on which the symbol was accessed."""
    url_a = utilities.BASE_TEST_URL + "/simple_a.html"
    url_b = utilities.BASE_TEST_URL + "/simple_b.html"
    self.visit(url_a)
    db = self.visit(url_b)

    # Map each site_url to the visit_id recorded for it.
    rows = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    ua_rows = db_utils.query_db(
        db,
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.userAgent", ),
    )
    platform_rows = db_utils.query_db(
        db,
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.platform", ),
    )
    # simple_a accesses userAgent; simple_b accesses platform.
    assert visit_ids[url_a] == ua_rows[0][0]
    assert visit_ids[url_b] == platform_rows[0][0]
def test_extension_gets_correct_visit_id(self):
    """Verify that JS instrumentation rows carry the visit_id of the page
    on which the symbol was accessed (TaskManager-based variant)."""
    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    url_a = utilities.BASE_TEST_URL + "/simple_a.html"
    url_b = utilities.BASE_TEST_URL + "/simple_b.html"
    manager.get(url_a)
    manager.get(url_b)
    manager.close()

    db = manager_params.database_name
    # Map each site_url to the visit_id recorded for it.
    rows = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    ua_rows = db_utils.query_db(
        db,
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.userAgent", ),
    )
    platform_rows = db_utils.query_db(
        db,
        "SELECT visit_id FROM javascript WHERE symbol=?",
        ("window.navigator.platform", ),
    )
    # simple_a accesses userAgent; simple_b accesses platform.
    assert visit_ids[url_a] == ua_rows[0][0]
    assert visit_ids[url_b] == platform_rows[0][0]
def test_page_visit(self):
    """Visit the HTTP test page and validate the http_requests,
    http_responses, and http_redirects tables against expected records."""
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    db = self.visit(test_url)

    request_id_to_url = dict()

    # HTTP Requests
    seen_requests = set()
    for row in db_utils.query_db(db, "SELECT * FROM http_requests"):
        seen_requests.add((
            row["url"].split("?")[0],
            row["top_level_url"],
            row["triggering_origin"],
            row["loading_origin"],
            row["loading_href"],
            row["is_XHR"],
            row["is_third_party_channel"],
            row["is_third_party_to_top_window"],
            row["resource_type"],
        ))
        request_id_to_url[row["request_id"]] = row["url"]
    assert HTTP_REQUESTS == seen_requests

    # HTTP Responses
    seen_responses: Set[Tuple[str, str]] = set()
    for row in db_utils.query_db(db, "SELECT * FROM http_responses"):
        seen_responses.add((
            row["url"].split("?")[0],
            # TODO: webext-instrumentation doesn't support referrer
            # yet | row['referrer'],
            row["location"],
        ))
        # Every response must match a previously recorded request.
        assert row["request_id"] in request_id_to_url
        assert request_id_to_url[row["request_id"]] == row["url"]
    assert HTTP_RESPONSES == seen_responses

    # HTTP Redirects
    seen_redirects = set()
    for row in db_utils.query_db(db, "SELECT * FROM http_redirects"):
        # TODO: webext instrumentation doesn't support new_request_id yet
        # src = request_id_to_url[row['old_request_id']].split('?')[0]
        # dst = request_id_to_url[row['new_request_id']].split('?')[0]
        src = row["old_request_url"].split("?")[0]
        dst = row["new_request_url"].split("?")[0]
        # Pull the Location header (case-insensitive) out of the
        # JSON-encoded header list, if present.
        location = next(
            (value for header, value in json.loads(row["headers"])
             if header.lower() == "location"),
            None,
        )
        seen_redirects.add((src, dst, location))
    assert HTTP_REDIRECTS == seen_redirects
def test_profile_recovery(monkeypatch, default_params, task_manager_creator,
                          testcase, stateful, seed_tar):
    """Test browser profile recovery in various scenarios."""
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    browser_params[0].seed_tar = seed_tar
    manager, db = task_manager_creator((manager_params, browser_params[:1]))
    manager.get(BASE_TEST_URL, reset=not stateful)

    if testcase == "normal_operation":
        pass
    elif testcase == "on_crash":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
    elif testcase == "on_crash_during_launch":
        # Cause a selenium crash to force browser to restart
        manager.get("example.com", reset=not stateful)
        # This will cause browser restarts to fail
        monkeypatch.setenv("FIREFOX_BINARY", "/tmp/NOTREAL")

        # Let the launch succeed after some failed launch attempts
        def undo_monkeypatch():
            time.sleep(5)  # This should be smaller than _SPAWN_TIMEOUT
            monkeypatch.undo()

        Thread(target=undo_monkeypatch).start()
    elif testcase == "on_timeout":
        # Set a very low timeout to cause a restart
        manager.get("about:config", reset=not stateful, timeout=0.1)

    cs = CommandSequence("about:config", reset=not stateful)
    # The seeded pref should only be present if a seed tar was supplied.
    expected_value = bool(seed_tar)
    cs.append_command(AssertConfigSetCommand("test_pref", expected_value))
    tar_directory = manager_params.data_directory / "browser_profile"
    tar_path = tar_directory / "profile.tar.gz"
    cs.dump_profile(tar_path, True)
    manager.execute_command_sequence(cs)
    manager.close()

    # Check that a consistent profile is used for stateful crawls but
    # not for stateless crawls
    with tarfile.open(tar_path) as tar:
        tar.extractall(tar_directory)
    ff_db = tar_directory / "places.sqlite"
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    places = [url for (url, ) in rows]
    if stateful:
        assert BASE_TEST_URL in places
    else:
        assert BASE_TEST_URL not in places

    # Check if seed_tar was loaded on restart
    rows = db_utils.query_db(
        db,
        "SELECT command_status FROM crawl_history WHERE"
        " command='AssertConfigSetCommand'",
    )
    assert rows[0][0] == "ok"
def test_get_http_tables_valid(self, display_mode):
    """Check that get works and populates http tables correctly."""
    manager_params, browser_params = self.get_config(display_mode)
    manager = task_manager.TaskManager(manager_params, browser_params)

    # Queue one get() for each of the two test URLs, in order.
    for url in (url_a, url_b):
        seq = command_sequence.CommandSequence(url)
        seq.get(sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    db = manager_params.database_name
    # Map each site_url to the visit_id recorded for it.
    rows = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    # Each request/response row must carry the visit_id of its page visit.
    for table in ("http_requests", "http_responses"):
        for url in (url_a, url_b):
            res = db_utils.query_db(
                db,
                "SELECT visit_id FROM " + table + " WHERE url = ?",
                (url, ),
            )
            assert res[0][0] == visit_ids[url]
def test_get_site_visits_table_valid(http_params, task_manager_creator, display_mode):
    """Check that get works and populates db correctly."""
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Perform one get() per test URL, in order.
    for url in (url_a, url_b):
        seq = command_sequence.CommandSequence(url)
        seq.get(sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url FROM site_visits ORDER BY site_url",
    )
    # We had exactly two separate page visits.
    assert [row[0] for row in rows] == [url_a, url_b]
def test_browse_site_visits_table_valid(self, display_mode):
    """Check that CommandSequence.browse() populates db correctly."""
    manager_params, browser_params = self.get_config(display_mode)
    manager = task_manager.TaskManager(manager_params, browser_params)

    # Browse each URL with an explicit site_rank matching its position.
    for rank, url in enumerate((url_a, url_b)):
        seq = command_sequence.CommandSequence(url, site_rank=rank)
        seq.browse(num_links=1, sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(
        manager_params["db"],
        "SELECT site_url, site_rank FROM site_visits")
    # Two visits, each with the rank it was submitted under.
    assert len(rows) == 2
    assert tuple(rows[0]) == (url_a, 0)
    assert tuple(rows[1]) == (url_b, 1)
def test_get_site_visits_table_valid(self, display_mode):
    """Check that get works and populates db correctly."""
    manager_params, browser_params = self.get_config(display_mode)
    manager = task_manager.TaskManager(manager_params, browser_params)

    # Perform one get() per test URL, in order.
    for url in (url_a, url_b):
        seq = command_sequence.CommandSequence(url)
        seq.get(sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(manager_params["db"],
                             "SELECT site_url FROM site_visits")
    # We had exactly two separate page visits, in submission order.
    assert [row[0] for row in rows] == [url_a, url_b]
def get_post_requests_from_db(self, db):
    """Query the crawl database and return the POST requests.

    :param db: path to the crawl's SQLite database.
    :return: rows from ``http_requests`` whose method is POST.
    """
    # The previous version used a backslash line-continuation *inside* the
    # string literal, which embedded a run of stray spaces in the SQL text.
    # SQLite ignores the extra whitespace, but a single clean literal is
    # clearer and produces the intended query string.
    return db_utils.query_db(
        db,
        "SELECT * FROM http_requests WHERE method = 'POST'",
    )
def test_browse_site_visits_table_valid(http_params, task_manager_creator, display_mode):
    """Check that CommandSequence.browse() populates db correctly."""
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Browse each URL with an explicit site_rank matching its position.
    for rank, url in enumerate((url_a, url_b)):
        seq = command_sequence.CommandSequence(url, site_rank=rank)
        seq.browse(num_links=1, sleep=1)
        manager.execute_command_sequence(seq)
    manager.close()

    rows = db_utils.query_db(
        db,
        "SELECT site_url, site_rank FROM site_visits ORDER BY site_rank",
    )
    # Two visits, each with the rank it was submitted under.
    assert [tuple(row) for row in rows] == [(url_a, 0), (url_b, 1)]
def test_http_stacktrace(self):
    """Check that callstacks joined against http_requests match the
    expected stacktraces for the test page's injected resources."""
    test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html"
    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    manager.get(test_url, sleep=10)
    db = manager_params["db"]
    manager.close()

    rows = db_utils.query_db(
        db,
        (
            "SELECT hr.url, c.call_stack"
            " FROM callstacks c"
            " JOIN http_requests hr"
            " ON c.request_id=hr.request_id"
            " AND c.visit_id= hr.visit_id"
            " AND c.browser_id = hr.browser_id;"
        ),
    )
    print("Printing callstacks contents")
    # Only requests for these injected resources are of interest.
    resource_suffixes = ("inject_pixel.js", "test_image.png", "Blank.gif")
    observed = set()
    for url, call_stack in rows:
        print(call_stack)
        if url.endswith(resource_suffixes):
            observed.add(call_stack)
    assert HTTP_STACKTRACES == observed
def test_seed_persistence(default_params, task_manager_creator):
    """Check that a seeded profile's pref persists across command sequences."""
    manager_params, browser_params = default_params
    seed_path = Path("profile.tar.gz")
    for browser_param in browser_params:
        browser_param.seed_tar = seed_path
    manager, db = task_manager_creator(default_params)

    # Run the same get + assert-pref sequence twice.
    for _ in range(2):
        seq = CommandSequence(url=BASE_TEST_URL)
        seq.get()
        seq.append_command(AssertConfigSetCommand("test_pref", True))
        manager.execute_command_sequence(seq)
    manager.close()

    history = db_utils.query_db(
        db,
        "SELECT * FROM crawl_history;",
    )
    assert len(history) > 0
    for row in history:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_custom_function(default_params, xpi, server):
    """Test `custom_function` with an inline func that collects links"""
    table_name = TableName("page_links")
    manager_params, browser_params = default_params
    path = manager_params.data_directory / "crawl-data.sqlite"

    # Pre-create the results table the command will write into.
    # (Table names cannot be bound as SQL parameters; table_name is a
    # trusted, locally defined constant here.)
    db = sqlite3.connect(path)
    cur = db.cursor()
    cur.execute(
        f"""CREATE TABLE IF NOT EXISTS {table_name} (
            top_url TEXT,
            link TEXT,
            visit_id INTEGER,
            browser_id INTEGER);"""
    )
    cur.close()
    db.close()

    storage_provider = SQLiteStorageProvider(path)
    manager = TaskManager(manager_params, browser_params, storage_provider, None)
    seq = command_sequence.CommandSequence(url_a)
    seq.get(sleep=0, timeout=60)
    seq.append_command(CollectLinksCommand(table_name, "http"))
    manager.execute_command_sequence(seq)
    manager.close()

    query_result = db_utils.query_db(
        path,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
def test_content_saving(self, tmpdir):
    """check that content is saved and hashed correctly"""
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = self.get_test_config(str(tmpdir))
    browser_params[0]["http_instrument"] = True
    browser_params[0]["save_content"] = True
    manager = task_manager.TaskManager(manager_params, browser_params)
    manager.get(url=test_url, sleep=1)
    manager.close()
    db = manager_params["db"]

    # Hash each response body as read from disk and compare with the
    # recorded content_hash.
    disk_content = dict()
    for row in db_utils.query_db(db, "SELECT * FROM http_responses;"):
        url = row["url"]
        if "MAGIC_REDIRECT" in url or "404" in url:
            continue
        rel_path = urlparse(url).path[1:]
        with open(os.path.join(BASE_PATH, rel_path), "rb") as f:
            content = f.read()
        chash = sha256(content).hexdigest()
        assert chash == row["content_hash"]
        disk_content[chash] = content

    # Every on-disk body must be retrievable from LevelDB by its hash.
    ldb_content = {
        chash.decode("ascii"): content
        for chash, content in db_utils.get_content(str(tmpdir))
    }
    for k, v in disk_content.items():
        assert v == ldb_content[k]
def test_service_worker_requests(self):
    """Check correct URL attribution for requests made by service worker"""
    test_url = utilities.BASE_TEST_URL + "/http_service_worker_page.html"
    db = self.visit(test_url)

    request_id_to_url = dict()
    observed = set()
    for row in db_utils.query_db(db, "SELECT * FROM http_requests"):
        observed.add((
            row["url"].split("?")[0],
            row["top_level_url"],
            row["triggering_origin"],
            row["loading_origin"],
            row["loading_href"],
            row["is_XHR"],
            row["is_third_party_channel"],
            row["is_third_party_to_top_window"],
            row["resource_type"],
        ))
        request_id_to_url[row["request_id"]] = row["url"]
    assert HTTP_SERVICE_WORKER_REQUESTS == observed
def test_name_resolution(self):
    """Check that DNS responses for localtest.me are recorded correctly."""
    db = self.visit("http://localtest.me:8000")
    row = db_utils.query_db(db, "SELECT * FROM dns_responses")[0]
    print(row.keys())
    expected = {
        "used_address": "127.0.0.1",
        "addresses": "127.0.0.1",
        "hostname": "localtest.me",
        "canonical_name": "localtest.me",
    }
    for column, value in expected.items():
        assert row[column] == value
def test_property_enumeration(self) -> None:
    """Check that the enumerated property symbols match PROPERTIES and are
    all attributed to the test page's script URL."""
    test_url = utilities.BASE_TEST_URL + "/property_enumeration.html"
    db = self.visit(test_url)
    rows = db_utils.query_db(db, "SELECT script_url, symbol FROM javascript")
    observed = set()
    for script_url, symbol in rows:
        # Every symbol must come from the visited page itself.
        assert script_url == test_url
        observed.add(symbol)
    assert PROPERTIES == observed
def test_custom_function(self):
    """Test `custom_function` with an inline func that collects links"""
    from openwpm.socket_interface import ClientSocket

    def collect_links(table_name, scheme, **kwargs):
        """Collect links with `scheme` and save in table `table_name`"""
        driver = kwargs["driver"]
        manager_params = kwargs["manager_params"]
        command = kwargs["command"]

        prefix = scheme + "://"
        # All anchor hrefs on the page that use the requested scheme.
        link_urls = [
            href
            for href in (el.get_attribute("href")
                         for el in driver.find_elements_by_tag_name("a"))
            if href.startswith(prefix)
        ]
        current_url = driver.current_url

        # Write results through the aggregator socket.
        sock = ClientSocket()
        sock.connect(*manager_params.aggregator_address)
        create_query = ("CREATE TABLE IF NOT EXISTS %s ("
                        "top_url TEXT, link TEXT, "
                        "visit_id INTEGER, browser_id INTEGER);" % table_name)
        sock.send(("create_table", create_query))
        for link in link_urls:
            sock.send((
                table_name,
                {
                    "top_url": current_url,
                    "link": link,
                    "visit_id": command.visit_id,
                    "browser_id": command.browser_id,
                },
            ))
        sock.close()

    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    seq = command_sequence.CommandSequence(url_a)
    seq.get(sleep=0, timeout=60)
    seq.run_custom_function(collect_links, ("page_links", "http"))
    manager.execute_command_sequence(seq)
    manager.close()

    query_result = db_utils.query_db(
        manager_params.database_name,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
def test_parse_neterror_integration(default_params, task_manager_creator):
    """Check that a DNS failure is recorded as a parsed neterror."""
    manager, db = task_manager_creator(default_params)
    manager.get("http://website.invalid")
    manager.close()

    status, error = db_utils.query_db(
        db,
        "SELECT command_status, error FROM crawl_history WHERE command ='GetCommand'",
        as_tuple=True,
    )[0]
    assert status == "neterror"
    assert error == "dnsNotFound"
def test_parse_neterror_integration(self):
    """Check that a DNS failure is recorded as a parsed neterror."""
    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    manager.get("http://website.invalid")
    manager.close()

    status, error = db_utils.query_db(
        manager_params["db"],
        "SELECT command_status, error FROM crawl_history WHERE command = \"<class 'openwpm.commands.types.GetCommand'>\"",
        as_tuple=True,
    )[0]
    assert status == "neterror"
    assert error == "dnsNotFound"
def test_name_resolution(default_params, task_manager_creator):
    """Check that DNS responses for localtest.me are recorded correctly."""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.dns_instrument = True
    manager, db = task_manager_creator((manager_params, browser_params))
    manager.get("http://localtest.me:8000")
    manager.close()

    row = db_utils.query_db(db, "SELECT * FROM dns_responses")[0]
    expected = {
        "used_address": "127.0.0.1",
        "addresses": "127.0.0.1",
        "hostname": "localtest.me",
        "canonical_name": "localtest.me",
    }
    for column, value in expected.items():
        assert row[column] == value
def test_custom_function(self):
    """Test `custom_function` with an inline func that collects links"""
    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    seq = command_sequence.CommandSequence(url_a)
    seq.get(sleep=0, timeout=60)
    # NOTE(review): arguments are ("http", "page_links") here, i.e.
    # scheme-first — confirm against CollectLinksCommand's signature, since
    # other call sites in this file pass the table name first.
    seq.append_command(CollectLinksCommand("http", "page_links"))
    manager.execute_command_sequence(seq)
    manager.close()

    query_result = db_utils.query_db(
        manager_params.database_name,
        "SELECT top_url, link FROM page_links;",
        as_tuple=True,
    )
    assert PAGE_LINKS == set(query_result)
def test_command_duration(default_params, task_manager_creator):
    """Check that the recorded GetCommand duration brackets the sleep time."""
    manager, db = task_manager_creator(default_params)
    manager.get(url=TEST_URL, sleep=5)
    manager.close()

    duration = db_utils.query_db(
        db,
        "SELECT duration FROM crawl_history WHERE command = 'GetCommand'",
        as_tuple=True,
    )[0][0]
    # Lower bound: the 5 s sleep in milliseconds.
    assert duration > 5 * 1000
    # Upper bound: sleep plus 2 s of slack for command overhead (ms).
    assert duration <= (5 * 1000) + 2 * 1000
def test_command_duration(self):
    """Check that the recorded GetCommand duration brackets the sleep time."""
    manager_params, browser_params = self.get_config()
    manager = task_manager.TaskManager(manager_params, browser_params)
    manager.get(url=TEST_URL, sleep=5)
    manager.close()

    duration = db_utils.query_db(
        manager_params.database_name,
        "SELECT duration FROM crawl_history WHERE command = \"<class 'openwpm.commands.types.GetCommand'>\"",
        as_tuple=True,
    )[0][0]
    # Lower bound: the 5 s sleep in milliseconds.
    assert duration > 5 * 1000
    # Upper bound: sleep plus 2 s of slack for command overhead (ms).
    assert duration <= (5 * 1000) + 2 * 1000
def test_seed_persistance(self):
    """Check that a seeded profile's pref persists across reset sequences.

    NOTE: the "persistance" spelling is kept — renaming would change the
    public test id.
    """
    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0].seed_tar = "."

    # Build both sequences up front, then run them on a fresh manager.
    command_sequences = []
    for _ in range(2):
        seq = CommandSequence(url="https://example.com", reset=True)
        seq.get()
        seq.append_command(TestConfigSetCommand("test_pref", True))
        command_sequences.append(seq)

    manager = TaskManager(manager_params, browser_params)
    for seq in command_sequences:
        manager.execute_command_sequence(seq)
    manager.close()

    history = db_utils.query_db(
        manager_params.database_name,
        "SELECT * FROM crawl_history;",
    )
    assert len(history) > 0
    for row in history:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_js_profile_cookies(self):
    """Check that profile cookies set by JS are saved"""
    # Run the test crawl with cookie instrumentation enabled.
    manager_params, browser_params = self.get_config()
    browser_params[0].cookie_instrument = True
    manager = task_manager.TaskManager(manager_params, browser_params)
    seq = command_sequence.CommandSequence(
        utilities.BASE_TEST_URL + "/js_cookie.html")
    seq.get(sleep=3, timeout=120)
    manager.execute_command_sequence(seq)
    manager.close()

    # Check that the JS cookie we stored is recorded
    rows = db_utils.query_db(
        manager_params.database_name,
        ("SELECT visit_id, record_type, change_cause, is_http_only, "
         "is_host_only, is_session, host, is_secure, name, path, "
         "value, same_site FROM javascript_cookies"),
        as_tuple=True,
    )
    # Exactly one cookie, matching the expected record field-for-field.
    assert len(rows) == 1
    assert rows[0] == expected_js_cookie
def test_http_stacktrace(default_params, task_manager_creator):
    """Check that callstacks joined against http_requests match the
    expected stacktraces for the test page's injected resources."""
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.http_instrument = True       # HTTP requests/responses
        browser_param.js_instrument = True         # JS Web API calls
        browser_param.callstack_instrument = True  # WebRequest callstacks
    test_url = utilities.BASE_TEST_URL + "/http_stacktrace.html"
    manager, db = task_manager_creator((manager_params, browser_params))
    manager.get(test_url, sleep=10)
    manager.close()

    rows = db_utils.query_db(
        db,
        (
            "SELECT hr.url, c.call_stack"
            " FROM callstacks c"
            " JOIN http_requests hr"
            " ON c.request_id=hr.request_id"
            " AND c.visit_id= hr.visit_id"
            " AND c.browser_id = hr.browser_id;"
        ),
    )
    print("Printing callstacks contents")
    # Only requests for these injected resources are of interest.
    resource_suffixes = ("inject_pixel.js", "test_image.png", "Blank.gif")
    observed = set()
    for url, call_stack in rows:
        print(call_stack)
        if url.endswith(resource_suffixes):
            observed.add(call_stack)
    assert HTTP_STACKTRACES == observed
def test_content_saving(http_params, xpi, server):
    """check that content is saved and hashed correctly"""
    test_url = utilities.BASE_TEST_URL + "/http_test_page.html"
    manager_params, browser_params = http_params()
    for browser_param in browser_params:
        browser_param.http_instrument = True
        browser_param.save_content = True

    # Structured rows go to SQLite; response bodies go to LevelDB.
    db = manager_params.data_directory / "crawl-data.sqlite"
    structured_storage = SQLiteStorageProvider(db_path=db)
    ldb_path = Path(manager_params.data_directory) / "content.ldb"
    unstructured_storage = LevelDbProvider(db_path=ldb_path)
    manager = task_manager.TaskManager(
        manager_params, browser_params, structured_storage, unstructured_storage
    )
    manager.get(url=test_url, sleep=1)
    manager.close()

    # Hash each response body as read from disk and compare with the
    # recorded content_hash.
    disk_content = dict()
    for row in db_utils.query_db(db, "SELECT * FROM http_responses;"):
        url = row["url"]
        if "MAGIC_REDIRECT" in url or "404" in url:
            continue
        rel_path = urlparse(url).path[1:]
        with open(os.path.join(BASE_PATH, rel_path), "rb") as f:
            content = f.read()
        chash = sha256(content).hexdigest()
        assert chash == row["content_hash"]
        disk_content[chash] = content

    # Every on-disk body must be retrievable from LevelDB by its hash.
    ldb_content = {
        chash.decode("ascii"): content
        for chash, content in db_utils.get_content(ldb_path)
    }
    for k, v in disk_content.items():
        assert v == ldb_content[k]
def test_seed_persistance(self):
    """Check that a seeded profile's pref persists across reset sequences.

    NOTE: the "persistance" spelling is kept — renaming would change the
    public test id.
    """

    def test_config_is_set(*args, **kwargs):
        # Custom command: read the seeded boolean pref through the
        # preferences service and assert it is set.
        driver = kwargs["driver"]
        driver.get("about:config")
        result = driver.execute_script("""
            var prefs = Components
                        .classes["@mozilla.org/preferences-service;1"]
                        .getService(Components.interfaces.nsIPrefBranch);
            try {
                return prefs.getBoolPref("test_pref")
            } catch (e) {
                return false;
            }
        """)
        assert result

    manager_params, browser_params = self.get_test_config(num_browsers=1)
    browser_params[0]["seed_tar"] = "."

    # Build both sequences up front, then run them on a fresh manager.
    command_sequences = []
    for _ in range(2):
        seq = CommandSequence(url="https://example.com", reset=True)
        seq.get()
        seq.run_custom_function(test_config_is_set)
        command_sequences.append(seq)

    manager = task_manager.TaskManager(manager_params, browser_params)
    for seq in command_sequences:
        manager.execute_command_sequence(seq)
    manager.close()

    history = db_utils.query_db(
        manager_params["db"],
        "SELECT * FROM crawl_history;",
    )
    assert len(history) > 0
    for row in history:
        assert row["command_status"] == "ok", f"Command {tuple(row)} was not ok"
def test_browse_wrapper_http_table_valid(http_params, task_manager_creator, display_mode):
    """Check that TaskManager.browse() wrapper works and populates http
    tables correctly.

    NOTE: Since the browse command is choosing links randomly, there is a
    (very small -- 2*0.5^20) chance this test will fail with valid code.
    """
    manager_params, browser_params = http_params(display_mode)
    manager, db = task_manager_creator((manager_params, browser_params))

    # Two sequential browse commands via the wrapper.
    manager.browse(url_a, num_links=20, sleep=1)
    manager.browse(url_b, num_links=1, sleep=1)
    manager.close()

    # Map each site_url to the visit_id recorded for it.
    rows = db_utils.query_db(db, "SELECT visit_id, site_url FROM site_visits")
    visit_ids = {site_url: visit_id for visit_id, site_url in rows}

    def first_visit_id(table, url):
        # First visit_id recorded in `table` for `url`. The table name is
        # one of two trusted constants below, not external input.
        res = db_utils.query_db(
            db,
            "SELECT visit_id FROM " + table + " WHERE url = ?",
            (url, ),
        )
        return res[0][0]

    # Requests and responses for each top-level URL carry that page's visit_id.
    assert first_visit_id("http_requests", url_a) == visit_ids[url_a]
    assert first_visit_id("http_requests", url_b) == visit_ids[url_b]
    assert first_visit_id("http_responses", url_a) == visit_ids[url_a]
    assert first_visit_id("http_responses", url_b) == visit_ids[url_b]

    # Page simple_a.html has five links:
    # 1) An absolute link to simple_c.html
    # 2) A relative link to simple_d.html
    # 3) A javascript: link
    # 4) A link to www.google.com
    # 5) A link to example.com?localhost
    # We should see page visits for 1 and 2, but not 3-5.
    assert first_visit_id("http_responses", url_c) == visit_ids[url_a]
    assert first_visit_id("http_responses", url_d) == visit_ids[url_a]

    # We expect 4 urls: a, c, d and a favicon request
    count_rows = db_utils.query_db(
        db,
        "SELECT COUNT(DISTINCT url) FROM http_responses WHERE visit_id = ?",
        (visit_ids[url_a], ),
    )
    assert count_rows[0][0] == 4