def test_all(g: Graph) -> None:
    tasks = [f"windows10/opt-{chr(i)}" for i in range(len(g.vs))]

    try:
        test_scheduling.close_failing_together_db("label")
    except AssertionError:
        pass
    test_scheduling.remove_failing_together_db("label")

    # TODO: Also add some couples that are *not* failing together.
    ft: Dict[str, Dict[str, Tuple[float, float]]] = {}
    for edge in g.es:
        task1 = tasks[edge.tuple[0]]
        task2 = tasks[edge.tuple[1]]
        assert task1 < task2

        if task1 not in ft:
            ft[task1] = {}
        ft[task1][task2] = (0.1, 1.0)

    failing_together = test_scheduling.get_failing_together_db("label", False)
    for t, ts in ft.items():
        failing_together[t.encode("ascii")] = pickle.dumps(ts)
    test_scheduling.close_failing_together_db("label")

    model = TestLabelSelectModel()
    result = model.reduce(tasks, 1.0)
    hypothesis.note(f"Result: {sorted(result)}")
    # reduce should keep exactly one task per connected component of redundant tasks.
    assert len(result) == len(g.components())
def test_reduce():
    failing_together = test_scheduling.get_failing_together_db("label")
    failing_together[b"test-linux64/debug$test-windows10/debug"] = struct.pack(
        "ff", 0.1, 1.0
    )
    failing_together[b"test-linux64/debug$test-windows10/opt"] = struct.pack(
        "ff", 0.1, 1.0
    )
    failing_together[b"test-linux64/opt$test-windows10/opt"] = struct.pack(
        "ff", 0.1, 0.91
    )
    failing_together[b"test-linux64/debug$test-linux64/opt"] = struct.pack(
        "ff", 0.1, 1.0
    )
    failing_together[b"test-linux64-asan/debug$test-linux64/debug"] = struct.pack(
        "ff", 0.1, 1.0
    )
    test_scheduling.close_failing_together_db("label")

    model = TestLabelSelectModel()
    assert model.reduce({"test-linux64/debug", "test-windows10/debug"}, 1.0) == {
        "test-linux64/debug"
    }
    assert model.reduce({"test-linux64/debug", "test-windows10/opt"}, 1.0) == {
        "test-linux64/debug"
    }
    assert model.reduce({"test-linux64/opt", "test-windows10/opt"}, 1.0) == {
        "test-linux64/opt",
        "test-windows10/opt",
    }
    assert model.reduce({"test-linux64/opt", "test-windows10/opt"}, 0.9) == {
        "test-linux64/opt"
    }
    assert model.reduce({"test-linux64/opt", "test-linux64/debug"}, 1.0) == {
        "test-linux64/opt"
    }
    assert model.reduce({"test-linux64-asan/debug", "test-linux64/debug"}, 1.0) == {
        "test-linux64/debug"
    }
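# Illustration (not part of the original code): the legacy DB layout used in the
# test above keys each ordered pair as b"task1$task2" and packs
# (support, confidence) as two 32-bit floats via struct.pack("ff", ...).
# A minimal round-trip, using values matching one of the entries above.
import struct


def example_failing_together_roundtrip() -> None:
    value = struct.pack("ff", 0.1, 0.91)
    support, confidence = struct.unpack("ff", value)
    # 32-bit floats are approximate, so compare with a tolerance.
    assert abs(support - 0.1) < 1e-6
    assert abs(confidence - 0.91) < 1e-6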
def mock_schedule_tests_classify(monkeypatch):
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            f"test-{granularity}1",
            f"test-{granularity}2",
            "test-linux64/opt",
            "test-windows10/opt",
        ]
        past_failures_data.close()

    failing_together = test_scheduling.get_failing_together_db()
    failing_together[b"test-linux64/opt$test-windows10/opt"] = struct.pack(
        "ff", 0.1, 1.0
    )
    test_scheduling.close_failing_together_db()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        def classify(self, items, probabilities=False):
            assert probabilities
            results = []
            for item in items:
                runnable_name = item["test_job"]["name"]
                if self.granularity == "label":
                    if runnable_name in labels_to_choose:
                        results.append(
                            [
                                1 - labels_to_choose[runnable_name],
                                labels_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
                elif self.granularity == "group":
                    if runnable_name in groups_to_choose:
                        results.append(
                            [
                                1 - groups_to_choose[runnable_name],
                                groups_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
            return np.array(results)

        class MockModelCache:
            def get(self, model_name):
                if "group" in model_name:
                    return bugbug.models.testselect.TestGroupSelectModel()
                else:
                    return bugbug.models.testselect.TestLabelSelectModel()

        monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
        monkeypatch.setattr(
            bugbug.models.testselect.TestSelectModel, "classify", classify
        )

    return do_mock
def mock_get_config_specific_groups(
    monkeypatch: MonkeyPatch,
) -> None:
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    past_failures_data = test_scheduling.get_past_failures("group", False)
    past_failures_data["push_num"] = 1
    past_failures_data["all_runnables"] = [
        "test-group1",
        "test-group2",
    ]
    past_failures_data.close()

    try:
        test_scheduling.close_failing_together_db("config_group")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("config_group", False)
    failing_together[b"$ALL_CONFIGS$"] = pickle.dumps(
        ["test-linux1804-64/opt-*", "test-windows10/debug-*", "test-windows10/opt-*"]
    )
    failing_together[b"$CONFIGS_BY_GROUP$"] = pickle.dumps(
        {
            "test-group1": {
                "test-linux1804-64/opt-*",
                "test-windows10/debug-*",
                "test-windows10/opt-*",
            },
            "test-group2": {
                "test-linux1804-64/opt-*",
                "test-windows10/debug-*",
                "test-windows10/opt-*",
            },
        }
    )
    failing_together[b"test-group1"] = pickle.dumps(
        {
            "test-linux1804-64/opt-*": {
                "test-windows10/debug-*": (1.0, 0.0),
                "test-windows10/opt-*": (1.0, 0.0),
            },
            "test-windows10/debug-*": {
                "test-windows10/opt-*": (1.0, 1.0),
            },
        }
    )
    test_scheduling.close_failing_together_db("config_group")

    monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
def test_reduce():
    failing_together = test_scheduling.get_failing_together_db("label")
    failing_together[b"test-linux1804-64/debug"] = pickle.dumps(
        {
            "test-windows10/debug": (0.1, 1.0),
            "test-windows10/opt": (0.1, 1.0),
            "test-linux1804-64/opt": (0.1, 1.0),
        }
    )
    failing_together[b"test-linux1804-64/opt"] = pickle.dumps(
        {"test-windows10/opt": (0.1, 0.91)}
    )
    failing_together[b"test-linux1804-64-asan/debug"] = pickle.dumps(
        {"test-linux1804-64/debug": (0.1, 1.0)}
    )
    test_scheduling.close_failing_together_db("label")

    model = TestLabelSelectModel()
    assert model.reduce({"test-linux1804-64/debug", "test-windows10/debug"}, 1.0) == {
        "test-linux1804-64/debug"
    }
    assert model.reduce({"test-linux1804-64/debug", "test-windows10/opt"}, 1.0) == {
        "test-linux1804-64/debug"
    }
    assert model.reduce({"test-linux1804-64/opt", "test-windows10/opt"}, 1.0) == {
        "test-linux1804-64/opt",
        "test-windows10/opt",
    }
    assert model.reduce({"test-linux1804-64/opt", "test-windows10/opt"}, 0.9) == {
        "test-linux1804-64/opt"
    }
    assert model.reduce({"test-linux1804-64/opt", "test-linux1804-64/debug"}, 1.0) == {
        "test-linux1804-64/opt"
    }
    assert model.reduce(
        {"test-linux1804-64-asan/debug", "test-linux1804-64/debug"}, 1.0
    ) == {"test-linux1804-64/debug"}

    # Test case where the second task is not present in the failing together stats of the first.
    assert model.reduce(
        {"test-linux1804-64-asan/debug", "test-windows10/opt"}, 1.0
    ) == {"test-linux1804-64-asan/debug", "test-windows10/opt"}

    # Test case where a task is not present at all in the failing together DB.
    assert model.reduce({"test-linux1804-64-qr/debug", "test-windows10/opt"}, 1.0) == {
        "test-linux1804-64-qr/debug",
        "test-windows10/opt",
    }
def test_reduce2(failing_together: LMDBDict) -> None: failing_together[b"windows10/opt-a"] = pickle.dumps({ "windows10/opt-b": (0.1, 1.0), "windows10/opt-c": (0.1, 0.3), "windows10/opt-d": (0.1, 1.0), }) failing_together[b"windows10/opt-b"] = pickle.dumps({ "windows10/opt-c": (0.1, 1.0), "windows10/opt-d": (0.1, 0.3), }) test_scheduling.close_failing_together_db("label") assert testselect.reduce_configs( { "windows10/opt-a", "windows10/opt-b", "windows10/opt-c", "windows10/opt-d" }, 1.0, ) == { "windows10/opt-b", }
def test_reduce2(failing_together: LMDBDict) -> None: failing_together[b"windows10/opt-a"] = pickle.dumps({ "windows10/opt-b": (0.1, 1.0), "windows10/opt-c": (0.1, 0.3), "windows10/opt-d": (0.1, 1.0), }) failing_together[b"windows10/opt-b"] = pickle.dumps({ "windows10/opt-c": (0.1, 1.0), "windows10/opt-d": (0.1, 0.3), }) test_scheduling.close_failing_together_db("label") model = TestLabelSelectModel() assert model.reduce( { "windows10/opt-a", "windows10/opt-b", "windows10/opt-c", "windows10/opt-d" }, 1.0, ) == { "windows10/opt-b", }
def test_select_configs(failing_together_config_group: LMDBDict) -> None:
    past_failures_data = test_scheduling.get_past_failures("group", False)
    past_failures_data["all_runnables"] = ["group1", "group2"]
    past_failures_data.close()

    failing_together_config_group[b"group1"] = pickle.dumps(
        {
            "linux1804-64-asan/debug": {
                "linux1804-64/debug": (1.0, 0.0),
                "linux1804-64/opt": (1.0, 0.0),
                "mac/debug": (1.0, 0.0),
                "windows10/debug": (1.0, 0.0),
            },
            "linux1804-64/debug": {
                "linux1804-64/opt": (1.0, 1.0),
                "mac/debug": (1.0, 1.0),
                "windows10/debug": (1.0, 1.0),
            },
            "linux1804-64/opt": {
                "mac/debug": (1.0, 1.0),
                "windows10/debug": (1.0, 1.0),
            },
            "mac/debug": {"windows10/debug": (1.0, 1.0)},
        }
    )
    failing_together_config_group[b"group2"] = pickle.dumps(
        {
            "linux1804-64-asan/debug": {
                "linux1804-64/debug": (1.0, 1.0),
                "linux1804-64/opt": (1.0, 0.0),
                "mac/debug": (1.0, 0.0),
                "windows10/debug": (1.0, 0.0),
            },
            "linux1804-64/debug": {
                "linux1804-64/opt": (1.0, 0.0),
                "mac/debug": (1.0, 0.0),
                "windows10/debug": (1.0, 1.0),
            },
            "linux1804-64/opt": {
                "mac/debug": (1.0, 0.0),
                "windows10/debug": (1.0, 0.0),
            },
            "mac/debug": {"windows10/debug": (1.0, 0.0)},
        }
    )
    failing_together_config_group[b"$ALL_CONFIGS$"] = pickle.dumps(
        [
            "linux1804-64-asan/debug",
            "linux1804-64/debug",
            "linux1804-64/opt",
            "mac/debug",
            "windows10/debug",
        ]
    )
    failing_together_config_group[b"$CONFIGS_BY_GROUP$"] = pickle.dumps(
        {
            "group1": {
                "linux1804-64-asan/debug",
                "linux1804-64/debug",
                "linux1804-64/opt",
                "mac/debug",
                "windows10/debug",
            },
            "group2": {
                "linux1804-64-asan/debug",
                "linux1804-64/debug",
                "linux1804-64/opt",
                "mac/debug",
                "windows10/debug",
            },
        }
    )
    test_scheduling.close_failing_together_db("config_group")

    model = TestGroupSelectModel()
    result = model.select_configs(
        {
            "group1",
            "group2",
        },
        1.0,
    )
    assert len(result) == 2
    assert set(result["group1"]) == {"linux1804-64-asan/debug", "linux1804-64/opt"}
    assert set(result["group2"]) == {
        "linux1804-64/opt",
        "mac/debug",
        "linux1804-64/debug",
    }
@pytest.fixture
def failing_together_config_group() -> Iterator[LMDBDict]:
    # Provide the config_group-granularity failing-together DB and close it on teardown.
    yield test_scheduling.get_failing_together_db("config_group", False)
    test_scheduling.close_failing_together_db("config_group")
@pytest.fixture
def failing_together() -> Iterator[LMDBDict]:
    # Provide the label-granularity failing-together DB and close it on teardown.
    yield test_scheduling.get_failing_together_db("label", False)
    test_scheduling.close_failing_together_db("label")
def mock_schedule_tests_classify(
    monkeypatch: MonkeyPatch,
) -> Callable[[dict[str, float], dict[str, float]], None]:
    with open("known_tasks", "w") as f:
        f.write("prova")

    # Initialize a mock past failures DB.
    for granularity in ("label", "group"):
        past_failures_data = test_scheduling.get_past_failures(granularity, False)
        past_failures_data["push_num"] = 1
        past_failures_data["all_runnables"] = [
            "test-linux1804-64-opt-label1",
            "test-linux1804-64-opt-label2",
            "test-group1",
            "test-group2",
            "test-linux1804-64/opt",
            "test-windows10/opt",
        ]
        past_failures_data.close()

    try:
        test_scheduling.close_failing_together_db("label")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("label", False)
    failing_together[b"test-linux1804-64/opt"] = pickle.dumps(
        {
            "test-windows10/opt": (0.1, 1.0),
        }
    )
    test_scheduling.close_failing_together_db("label")

    try:
        test_scheduling.close_failing_together_db("config_group")
    except AssertionError:
        pass
    failing_together = test_scheduling.get_failing_together_db("config_group", False)
    failing_together[b"$ALL_CONFIGS$"] = pickle.dumps(
        ["test-linux1804-64/opt", "test-windows10/debug", "test-windows10/opt"]
    )
    failing_together[b"$CONFIGS_BY_GROUP$"] = pickle.dumps(
        {
            "test-group1": {
                "test-linux1804-64/opt",
                "test-windows10/debug",
                "test-windows10/opt",
            },
            "test-group2": {
                "test-linux1804-64/opt",
                "test-windows10/debug",
                "test-windows10/opt",
            },
        }
    )
    failing_together[b"test-group1"] = pickle.dumps(
        {
            "test-linux1804-64/opt": {
                "test-windows10/debug": (1.0, 0.0),
                "test-windows10/opt": (1.0, 1.0),
            },
            "test-windows10/debug": {
                "test-windows10/opt": (1.0, 0.0),
            },
        }
    )
    test_scheduling.close_failing_together_db("config_group")

    try:
        test_scheduling.close_touched_together_db()
    except AssertionError:
        pass
    test_scheduling.get_touched_together_db(False)
    test_scheduling.close_touched_together_db()

    def do_mock(labels_to_choose, groups_to_choose):
        # Add a mock test selection model.
        def classify(self, items, probabilities=False):
            assert probabilities
            results = []
            for item in items:
                runnable_name = item["test_job"]["name"]
                if self.granularity == "label":
                    if runnable_name in labels_to_choose:
                        results.append(
                            [
                                1 - labels_to_choose[runnable_name],
                                labels_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
                elif self.granularity == "group":
                    if runnable_name in groups_to_choose:
                        results.append(
                            [
                                1 - groups_to_choose[runnable_name],
                                groups_to_choose[runnable_name],
                            ]
                        )
                    else:
                        results.append([0.9, 0.1])
            return np.array(results)

        monkeypatch.setattr(bugbug_http.models, "MODEL_CACHE", MockModelCache())
        monkeypatch.setattr(
            bugbug.models.testselect.TestSelectModel, "classify", classify
        )

    return do_mock
def generate_failing_together_probabilities(push_data):
    # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
    # `task2 failure -> task1 failure` separately, as they could be different.

    count_runs = collections.Counter()
    count_single_failures = collections.Counter()
    count_both_failures = collections.Counter()

    for revisions, tasks, likely_regressions, candidate_regressions in tqdm(push_data):
        failures = set(likely_regressions + candidate_regressions)
        all_tasks = list(set(tasks) | failures)

        for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
            count_runs[(task1, task2)] += 1

            if task1 in failures:
                if task2 in failures:
                    count_both_failures[(task1, task2)] += 1
                else:
                    count_single_failures[(task1, task2)] += 1
            elif task2 in failures:
                count_single_failures[(task1, task2)] += 1

    stats = {}

    skipped = 0

    for couple, run_count in count_runs.most_common():
        failure_count = count_both_failures[couple]
        support = failure_count / run_count

        if support < 1 / 700:
            skipped += 1
            continue

        if failure_count != 0:
            confidence = failure_count / (
                count_single_failures[couple] + failure_count
            )
        else:
            confidence = 0.0

        stats[couple] = (support, confidence)

    logger.info(f"{skipped} couples skipped because their support was too low")

    logger.info("Redundancies with the highest support and confidence:")
    for couple, (support, confidence) in sorted(
        stats.items(), key=lambda k: (-k[1][1], -k[1][0])
    )[:7]:
        failure_count = count_both_failures[couple]
        run_count = count_runs[couple]
        logger.info(
            f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
        )

    logger.info("Redundancies with the highest confidence and lowest support:")
    for couple, (support, confidence) in sorted(
        stats.items(), key=lambda k: (-k[1][1], k[1][0])
    )[:7]:
        failure_count = count_both_failures[couple]
        run_count = count_runs[couple]
        logger.info(
            f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
        )

    failing_together = test_scheduling.get_failing_together_db()

    count_redundancies = collections.Counter()
    for couple, (support, confidence) in stats.items():
        if confidence == 1.0:
            count_redundancies["==100%"] += 1
        if confidence > 0.9:
            count_redundancies[">=90%"] += 1
        if confidence > 0.8:
            count_redundancies[">=80%"] += 1
        if confidence > 0.7:
            count_redundancies[">=70%"] += 1

        if confidence < 0.7:
            continue

        failing_together[f"{couple[0]}${couple[1]}".encode("utf-8")] = struct.pack(
            "ff", support, confidence
        )

    for percentage, count in count_redundancies.most_common():
        logger.info(f"{count} with {percentage} confidence")

    test_scheduling.close_failing_together_db()
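# Worked example (numbers invented for illustration) of the support/confidence
# computation performed by generate_failing_together_probabilities above.
def example_support_confidence() -> None:
    run_count = 100      # pushes where the pair of tasks both ran
    both_failures = 20   # pushes where both tasks failed
    single_failures = 5  # pushes where exactly one of the two failed

    support = both_failures / run_count
    confidence = both_failures / (single_failures + both_failures)

    assert support == 0.2     # >= 1 / 700, so the pair is not skipped
    assert confidence == 0.8  # >= 0.7, so the pair is stored in the DB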