def generate_component_connections(
    self, bug_map: Dict[int, bugzilla.BugDict], bugs: List[int]
) -> None:
    bugs_set = set(bugs)
    commits = [
        commit
        for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    commit_map = {commit["node"]: commit for commit in commits}

    # Retrieve components of test failures that occurred when landing patches to fix bugs in specific components.
    component_failures = collections.defaultdict(list)

    push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
        "group"
    )

    for revisions, _, _, possible_regressions, likely_regressions in tqdm(
        push_data_iter(), total=push_data_count
    ):
        commit_list = [
            commit_map[revision]
            for revision in revisions
            if revision in commit_map
        ]
        if len(commit_list) == 0:
            continue

        commit_bugs = [
            bug_map[commit["bug_id"]]
            for commit in commit_list
            if commit["bug_id"] in bug_map
        ]

        components = list(set(get_full_component(bug) for bug in commit_bugs))

        groups = [
            group
            for group in list(set(possible_regressions + likely_regressions))
            if group.encode("utf-8") in self.path_to_component
        ]

        for group in groups:
            for component in components:
                component_failures[component].append(
                    self.path_to_component[group.encode("utf-8")]
                    .tobytes()
                    .decode("utf-8")
                )

    # Filter out commits for which we have no bugs.
    commits = [commit for commit in commits if commit["bug_id"] in bug_map]

    # Sort commits by bug component, so we can use itertools.groupby to group them by bug component.
    commits.sort(key=lambda x: get_full_component(bug_map[x["bug_id"]]))

    commit_groups = []
    for component, commit_iter in itertools.groupby(
        commits, lambda x: get_full_component(bug_map[x["bug_id"]])
    ):
        commit_group = {
            "component": component,
            "most_common_test_failure_components": histogram(
                component_failures[component]
            )
            if component in component_failures
            else {},
        }

        self.get_prev_bugs_stats(
            commit_group,
            list(commit_iter),
            component,
        )

        commit_groups.append(commit_group)

    with open("component_connections.json", "w") as f:
        json.dump(commit_groups, f)

    repository.close_component_mapping()
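# A minimal, self-contained sketch (not part of the code above) of the
# normalized histogram behind "most_common_test_failure_components": each
# component maps to its relative frequency, so the values sum to 1.0. A
# version of this helper is defined inside go() later in this file; the
# component names below are made up for illustration.
import collections
from typing import Dict, List

def histogram_sketch(components: List[str]) -> Dict[str, float]:
    counter = collections.Counter(components)
    return {
        component: count / len(components)
        for component, count in counter.most_common()
    }

assert histogram_sketch(["Core::Layout", "Core::Layout", "Core::DOM"]) == {
    "Core::Layout": 2 / 3,
    "Core::DOM": 1 / 3,
}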
def evaluation(self):
    # Get a test set of pushes on which to test the model.
    pushes, train_push_len = self.get_pushes(False)

    # To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using
    # only failure data from the training pushes (otherwise, we'd leak training information into the test
    # set).
    if self.granularity == "label":
        print("Generate failing together DB (restricted to training pushes)")
        push_data, _ = test_scheduling.get_push_data("label")
        test_scheduling.generate_failing_together_probabilities(
            push_data, pushes[train_push_len - 1]["revs"][0]
        )

    test_pushes = pushes[train_push_len:]

    all_tasks = reduce(
        lambda x, y: x | y,
        (
            set(push["failures"]) | set(push["passes"])
            for push in test_pushes[-28:]
        ),
    )

    test_pushes_failures = sum(
        1 for push in test_pushes if len(push["failures"]) > 0
    )

    test_pushes = {push["revs"][0]: push for push in test_pushes}

    print(
        f"Testing on {len(test_pushes)} ({test_pushes_failures} with failures) out of {len(pushes)}. {len(all_tasks)} schedulable tasks."
    )

    commit_map = get_commit_map()

    past_failures_data = test_scheduling.get_past_failures(self.granularity)
    last_push_num = past_failures_data["push_num"]
    past_failures_data.close()

    # Select tests for all the pushes in the test set.
    for i, (rev, push) in enumerate(tqdm(test_pushes.items())):
        commits = tuple(
            commit_map[revision]
            for revision in push["revs"]
            if revision in commit_map
        )
        if len(commits) == 0:
            test_pushes[rev]["all_possibly_selected"] = {}
            continue

        push_num = last_push_num - (len(test_pushes) - (i + 1))

        # Note: we subtract 100 from the push number to make sure we don't use
        # past failure data for the push itself.
        # The number 100 comes from the fact that in the past failure data
        # generation we store past failures in batches of 100 pushes.
test_pushes[rev]["all_possibly_selected"] = self.select_tests( commits, 0.3, push_num - 100) reductions = [None] if self.granularity == "label": reductions += [0.9, 1.0] def do_eval(confidence_threshold, reduction, cap, minimum): for rev, push in test_pushes.items(): selected = set(name for name, confidence in push["all_possibly_selected"].items() if confidence >= confidence_threshold) if minimum is not None and len(selected) < minimum: remaining = [(name, confidence) for name, confidence in push["all_possibly_selected"].items() if name not in selected] selected.update(name for name, _ in sorted( remaining, key=lambda x: -x[1])[:minimum - len(selected)]) if reduction is not None: selected = self.reduce(selected, reduction) if cap is not None and len(selected) > cap: selected = set( sorted( ((name, confidence) for name, confidence in push["all_possibly_selected"].items() if name in selected), key=lambda x: x[1], reverse=True, )[:cap]) caught = selected & set(push["failures"]) push["number_scheduled"] = len(selected) push["caught_one"] = (len(caught) > 0 if len(push["failures"]) != 0 else None) push["some_didnt_run"] = (not selected.issubset( set(push["passes"]) | set(push["failures"])), ) push["caught_percentage"] = (len(caught) / len(push["failures"]) if len(push["failures"]) != 0 else None) min_scheduled = min(result["number_scheduled"] for result in test_pushes.values()) max_scheduled = max(result["number_scheduled"] for result in test_pushes.values()) average_scheduled = statistics.mean( result["number_scheduled"] for result in test_pushes.values()) num_failing_pushes = sum(1 for result in test_pushes.values() if result["caught_one"] is not None) num_caught_one = sum(1 for result in test_pushes.values() if result["caught_one"]) num_caught_one_or_some_didnt_run = sum( 1 for result in test_pushes.values() if result["caught_one"] or (result["caught_one"] is not None and result["some_didnt_run"])) percentage_caught_one = 100 * num_caught_one / num_failing_pushes percentage_caught_one_or_some_didnt_run = ( 100 * num_caught_one_or_some_didnt_run / num_failing_pushes) average_caught_percentage = 100 * statistics.mean( result["caught_percentage"] for result in test_pushes.values() if result["caught_percentage"] is not None) reduction_str = (f"enabled at {reduction * 100}%" if reduction is not None else "disabled") print( f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, and cap at {cap}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures." ) for minimum in [None, 10]: for cap in [None, 300, 500]: for reduction in reductions: for confidence_threshold in [ 0.5, 0.7, 0.8, 0.85, 0.9, 0.95 ]: do_eval(confidence_threshold, reduction, cap, minimum)
def generate_test_scheduling_history(
    self, granularity: str, training_months: int
) -> None:
    if granularity != "config_group":
        # Get the commits DB.
        assert db.download(repository.COMMITS_DB)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=training_months)

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )
    elif granularity == "config_group":
        test_scheduling_db = test_scheduling.TEST_CONFIG_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_CONFIG_GROUP_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB
        )

    push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
        granularity
    )

    if granularity in ("label", "config_group"):
        test_scheduling.generate_failing_together_probabilities(
            granularity, push_data_iter(), push_data_count
        )

    def generate_all_data() -> Generator[Dict[str, Any], None, None]:
        past_failures = test_scheduling.get_past_failures(granularity, False)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            commit_map[commit_data["node"]] = commit_data

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        if granularity in ("group", "config_group"):
            update_touched_together_gen = test_scheduling.update_touched_together()
            next(update_touched_together_gen)

        for (
            i,
            (
                revisions,
                fix_revision,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ),
        ) in enumerate(tqdm(push_data_iter(), total=push_data_count)):
            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            # Skip wptsync commits, since they are not like normal pushes made by developers.
            if any(repository.is_wptsync(commit) for commit in commits):
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            # - Improve shelve perf and go back to consider all files;
            # - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            # - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # We consider only the runnables which run in this push, and the possible and likely regressions
            # from this push. We can't consider all runnables because we can't be sure that a task that didn't
            # run on a push would have been successful.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync the DB every 250 pushes, so we clean up the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity in ("group", "config_group"):
                update_touched_together_gen.send(commits[0]["node"])

            result_data = []
            for data in test_scheduling.generate_data(
                granularity,
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    result_data.append(data)

            if pushdate > HISTORY_DATE_START:
                saved_nodes.add(i)
                yield {
                    "revs": revisions,
                    "data": result_data,
                }

        if granularity == "group":
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    # For the config/group granularity, we are only interested in the failing together DB.
    if granularity != "config_group":
        db.append(test_scheduling_db, generate_all_data())
        zstd_compress(test_scheduling_db)
        create_tar_zst(past_failures_db)

    if granularity == "group":
        create_tar_zst(touched_together_db)

    if granularity in ("label", "config_group"):
        create_tar_zst(failing_together_db)
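# A minimal sketch (an assumption, not the real update_touched_together) of the
# coroutine protocol used above: prime the generator with next(), feed it
# revision nodes with send(), and finish by sending None, which makes the
# generator return and raise StopIteration at the caller.
from typing import Generator, Optional

def touched_together_sketch() -> Generator[None, Optional[str], None]:
    while True:
        node = yield
        if node is None:
            return
        # ... the real implementation would update co-occurrence counts for
        # the files touched up to `node` here ...

gen = touched_together_sketch()
next(gen)  # prime the generator so it is paused at the first `yield`
gen.send("abc123")  # hypothetical revision node
try:
    gen.send(None)  # signal completion
except StopIteration:
    pass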
def go(self, bugs: List[int], meta_bugs: Optional[List[int]] = None) -> None:
    if meta_bugs is not None:
        bugs += meta_bugs + self.get_blocking_of(meta_bugs)

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs)

    component_team_mapping = bugzilla.get_component_team_mapping()

    bugs_set = set(bugs)

    commits = [
        commit
        for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    commit_map = {commit["node"]: commit for commit in commits}
    hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

    logger.info(f"{len(commits)} commits to analyze.")
    logger.info(f"{len(bugs_set)} bugs to analyze.")

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    logger.info("Retrieve Phabricator revisions linked to commits...")
    revision_ids = set(
        filter(None, (repository.get_revision_id(commit) for commit in commits))
    )

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    # Defined unconditionally, since the bug summaries below read
    # blocker_to_meta even when no meta bugs were passed.
    blocker_to_meta: Dict[int, set] = collections.defaultdict(set)
    if meta_bugs is not None:
        for meta_bug in meta_bugs:
            if meta_bug not in bug_map:
                continue

            for blocker_bug_id in bugzilla.find_blocking(
                bug_map, bug_map[meta_bug]
            ):
                blocker_to_meta[blocker_bug_id].add(meta_bug)

    def _download_past_bugs(url: str) -> dict:
        path = os.path.join("data", os.path.basename(url)[:-4])
        download_check_etag(url, path=f"{path}.zst")
        zstd_decompress(path)
        assert os.path.exists(path)
        with open(path, "r") as f:
            return json.load(f)

    past_regressions_by = {}
    past_fixed_bugs_by = {}
    past_regression_blocked_bugs_by = {}
    past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )

    path_to_component = repository.get_component_mapping()

    def get_full_component(bug):
        return "{}::{}".format(bug["product"], bug["component"])

    def histogram(components: List[str]) -> Dict[str, float]:
        counter = collections.Counter(components)
        return {
            component: count / len(components)
            for component, count in counter.most_common()
        }

    def component_histogram(bugs: List[dict]) -> Dict[str, float]:
        return histogram([bug["component"] for bug in bugs])

    def find_risk_band(risk: float) -> str:
        for name, start, end in self.risk_bands:
            if start <= risk <= end:
                return name

        assert False

    def get_prev_bugs(
        past_bugs_by: dict,
        commit: repository.CommitDict,
        component: str = None,
    ) -> List[dict]:
        paths = [
            path
            for path in commit["files"]
            if component is None
            or (
                path.encode("utf-8") in path_to_component
                and path_to_component[path.encode("utf-8")]
                == component.encode("utf-8")
            )
        ]

        past_bugs = []

        for path, f_group in commit["functions"].items():
            if path not in paths:
                continue

            if path not in past_bugs_by["function"]:
                continue

            found = False
            for f in f_group:
                if f[0] not in past_bugs_by["function"][path]:
                    continue

                found = True
                past_bugs += past_bugs_by["function"][path][f[0]]

            if found:
                paths.remove(path)

        # Iterate over a copy, since removing from a list while iterating over
        # it skips elements.
        for path in paths[:]:
            if path in past_bugs_by["file"]:
                past_bugs += past_bugs_by["file"][path]
                paths.remove(path)

        # Snapshot the remaining paths for the same reason, and to keep the
        # path/directories pairing stable while we remove matches.
        remaining_paths = paths[:]
        for path, directories in zip(
            remaining_paths, repository.get_directories(remaining_paths)
        ):
            found = False
            for directory in directories:
                if directory in past_bugs_by["directory"]:
                    found = True
                    past_bugs += past_bugs_by["directory"][directory]

            if found:
                paths.remove(path)

        components = [
            path_to_component[path.encode("utf-8")].tobytes().decode("utf-8")
            for path in paths
            if path.encode("utf-8") in path_to_component
        ]

        for component in components:
            if component in past_bugs_by["component"]:
                past_bugs += past_bugs_by["component"][component]

        return past_bugs

    def get_prev_bugs_stats(
        commit_group: dict,
        commit_list: List[repository.CommitDict],
        component: str = None,
    ) -> None:
        # Find previous regressions that occurred in the same files as those touched by these commits.
        # And find previous bugs that were fixed by touching the same files as these commits.
        # And find previous bugs that were blocked by regressions that occurred in the same files as those touched by these commits.
        # And find previous bugs that were blocked by bugs that were fixed by touching the same files as those touched by these commits.
        prev_regressions: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_regressions_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_fixed_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_fixed_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_regression_blocked_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_regression_blocked_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )
        prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = sum(
            (
                get_prev_bugs(past_fixed_bug_blocked_bugs_by, commit, component)
                for commit in commit_list
            ),
            [],
        )

        prev_regressions = _deduplicate(prev_regressions)
        prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
        prev_regression_blocked_bugs = _deduplicate(prev_regression_blocked_bugs)
        prev_fixed_bug_blocked_bugs = _deduplicate(prev_fixed_bug_blocked_bugs)

        regression_components = component_histogram(prev_regressions)
        fixed_bugs_components = component_histogram(prev_fixed_bugs)
        regression_blocked_bug_components = component_histogram(
            prev_regression_blocked_bugs
        )
        fixed_bug_blocked_bug_components = component_histogram(
            prev_fixed_bug_blocked_bugs
        )

        commit_group["most_common_regression_components"] = regression_components
        # These are only used for component connections for the time being.
        if component:
            commit_group["prev_regressions"] = prev_regressions[-3:]
            commit_group["prev_fixed_bugs"] = prev_fixed_bugs[-3:]
            commit_group["prev_regression_blocked_bugs"] = (
                prev_regression_blocked_bugs[-3:]
            )
            commit_group["prev_fixed_bug_blocked_bugs"] = (
                prev_fixed_bug_blocked_bugs[-3:]
            )
            commit_group["most_common_fixed_bugs_components"] = fixed_bugs_components
            commit_group["most_common_regression_blocked_bug_components"] = (
                regression_blocked_bug_components
            )
            commit_group["most_common_fixed_bug_blocked_bug_components"] = (
                fixed_bug_blocked_bug_components
            )

    def get_commit_data(commit_list: List[repository.CommitDict]) -> List[dict]:
        if len(commit_list) == 0:
            return []

        # Evaluate the risk of commits associated with this bug.
        probs = self.regressor_model.classify(commit_list, probabilities=True)

        commits_data = []
        for i, commit in enumerate(commit_list):
            revision_id = repository.get_revision_id(commit)
            if revision_id in revision_map:
                testing = phabricator.get_testing_project(
                    revision_map[revision_id]
                )
                if testing is None:
                    testing = "missing"
            else:
                testing = None

            commits_data.append(
                {
                    "id": commit["node"],
                    "testing": testing,
                    "risk": float(probs[i][1]),
                    "backedout": bool(commit["backedoutby"]),
                    "author": commit["author_email"],
                    "reviewers": commit["reviewers"],
                    "coverage": [
                        commit["cov_added"],
                        commit["cov_covered"],
                        commit["cov_unknown"],
                    ],
                }
            )

        return commits_data

    # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
    commits.sort(key=lambda x: x["bug_id"])

    bug_to_commits = {}
    for bug_id, commit_iter in itertools.groupby(commits, lambda x: x["bug_id"]):
        # TODO: Figure out what to do with bugs we couldn't download (security bugs).
        if bug_id not in bug_map:
            continue

        bug_to_commits[bug_id] = sorted(
            commit_iter, key=lambda x: hash_to_rev[x["node"]]
        )

    bug_summaries = []
    for bug_id in bugs:
        if bug_id not in bug_map:
            continue

        commit_list = bug_to_commits.get(bug_id, [])
        commit_data = get_commit_data(commit_list)

        bug = bug_map[bug_id]

        bug_summary = {
            "id": bug_id,
            "regressor": bug_id in regressor_bug_ids,
            "regression": len(bug["regressed_by"]) > 0
            or any(
                keyword in bug["keywords"]
                for keyword in ["regression", "talos-regression"]
            )
            or (
                "cf_has_regression_range" in bug
                and bug["cf_has_regression_range"] == "yes"
            ),
            "whiteboard": bug["whiteboard"],
            "assignee": bug["assigned_to"]
            if bug["assigned_to"] != "*****@*****.**"
            else None,
            "versions": bugzilla.get_fixed_versions(bug),
            "component": get_full_component(bug),
            "team": bugzilla.component_to_team(
                component_team_mapping, bug["product"], bug["component"]
            ),
            "summary": bug["summary"],
            "types": bug_to_types(bug),
            "severity": bug["severity"],
            "creation_date": dateutil.parser.parse(bug["creation_time"]).strftime(
                "%Y-%m-%d"
            ),
            "date": max(
                dateutil.parser.parse(commit["pushdate"]) for commit in commit_list
            ).strftime("%Y-%m-%d")
            if len(commit_list) > 0
            else None,
            "commits": commit_data,
            "meta_ids": list(blocker_to_meta[bug_id]),
            "risk_band": find_risk_band(
                max(commit["risk"] for commit in commit_data)
            )
            if len(commit_data) > 0
            else None,
        }

        get_prev_bugs_stats(bug_summary, commit_list)

        bug_summaries.append(bug_summary)

    landings_by_date = collections.defaultdict(list)
    for bug_summary in bug_summaries:
        landings_by_date[bug_summary["creation_date"]].append(bug_summary)

    with open("landings_by_date.json", "w") as f:
        output: dict = {
            "summaries": landings_by_date,
        }
        if meta_bugs is not None:
            output["featureMetaBugs"] = [
                {"id": meta_bug, "summary": bug_map[meta_bug]["summary"]}
                for meta_bug in meta_bugs
            ]

        json.dump(output, f)

    # Retrieve components of test failures that occurred when landing patches to fix bugs in specific components.
    component_failures = collections.defaultdict(list)

    push_data_iter, push_data_count, all_runnables = test_scheduling.get_push_data(
        "group"
    )

    for revisions, _, _, possible_regressions, likely_regressions in tqdm(
        push_data_iter(), total=push_data_count
    ):
        commit_list = [
            commit_map[revision]
            for revision in revisions
            if revision in commit_map
        ]
        if len(commit_list) == 0:
            continue

        commit_bugs = [
            bug_map[commit["bug_id"]]
            for commit in commit_list
            if commit["bug_id"] in bug_map
        ]

        components = list(set(get_full_component(bug) for bug in commit_bugs))

        groups = [
            group
            for group in list(set(possible_regressions + likely_regressions))
            if group.encode("utf-8") in path_to_component
        ]

        for group in groups:
            for component in components:
                component_failures[component].append(
                    path_to_component[group.encode("utf-8")]
                    .tobytes()
                    .decode("utf-8")
                )

    # Filter out commits for which we have no bugs.
    commits = [commit for commit in commits if commit["bug_id"] in bug_map]

    # Sort commits by bug component, so we can use itertools.groupby to group them by bug component.
    commits.sort(key=lambda x: get_full_component(bug_map[x["bug_id"]]))

    commit_groups = []
    for component, commit_iter in itertools.groupby(
        commits, lambda x: get_full_component(bug_map[x["bug_id"]])
    ):
        commit_group = {
            "component": component,
            "most_common_test_failure_components": histogram(
                component_failures[component]
            )
            if component in component_failures
            else {},
        }

        get_prev_bugs_stats(commit_group, list(commit_iter), component)

        commit_groups.append(commit_group)

    with open("component_connections.json", "w") as f:
        json.dump(commit_groups, f)

    repository.close_component_mapping()
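# An illustrative sketch of find_risk_band (defined in go() above), assuming
# self.risk_bands holds (name, start, end) triples; the band names and
# boundaries below are made up for illustration.
risk_bands = [("LOW", 0.0, 0.3), ("MEDIUM", 0.3, 0.7), ("HIGH", 0.7, 1.0)]

def find_risk_band_sketch(risk: float) -> str:
    for name, start, end in risk_bands:
        if start <= risk <= end:
            return name
    raise AssertionError("risk outside all bands")

assert find_risk_band_sketch(0.42) == "MEDIUM"
assert find_risk_band_sketch(1.0) == "HIGH"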
def evaluation(self) -> None:
    # Get a test set of pushes on which to test the model.
    pushes, train_push_len = self.get_pushes(False)

    # To evaluate the model with reductions enabled, we need to regenerate the failing together DB, using
    # only failure data from the training pushes (otherwise, we'd leak training information into the test
    # set).
    print("Generate failing together DB (restricted to training pushes)")
    push_data_iter, push_data_count, _ = test_scheduling.get_push_data(
        "label" if self.granularity == "label" else "config_group"
    )
    test_scheduling.generate_failing_together_probabilities(
        "label" if self.granularity == "label" else "config_group",
        push_data_iter(),
        push_data_count,
        pushes[train_push_len - 1]["revs"][0],
    )

    test_pushes_list = pushes[train_push_len:]

    all_tasks = reduce(
        lambda x, y: x | y,
        (
            set(push["failures"]) | set(push["passes"])
            for push in test_pushes_list[-28:]
        ),
    )

    all_revs = set(sum((push["revs"] for push in test_pushes_list), []))

    test_pushes_failures = sum(
        1 for push in test_pushes_list if len(push["failures"]) > 0
    )

    test_pushes = {push["revs"][0]: push for push in test_pushes_list}

    if self.granularity == "group":
        for (
            revisions,
            fix_revision,
            push_runnables,
            possible_regressions,
            likely_regressions,
        ) in tqdm(push_data_iter(), total=push_data_count):
            if revisions[0] not in test_pushes:
                continue

            test_pushes[revisions[0]]["config_group_failures"] = (
                possible_regressions + likely_regressions
            )

    print(
        f"Testing on {len(test_pushes)} ({test_pushes_failures} with failures) out of {len(pushes)}. {len(all_tasks)} schedulable tasks."
    )

    del pushes

    commit_map = get_commit_map(all_revs)

    past_failures_data = test_scheduling.get_past_failures(self.granularity, True)
    last_push_num = past_failures_data["push_num"]
    past_failures_data.close()

    # Select tests for all the pushes in the test set.
    for i, push in enumerate(tqdm(test_pushes.values())):
        commits = tuple(
            commit_map.pop(revision)
            for revision in push["revs"]
            if revision in commit_map
        )
        if len(commits) == 0:
            push["all_possibly_selected"] = {}
            continue

        push_num = last_push_num - (len(test_pushes) - (i + 1))

        # Note: we subtract 100 from the push number to make sure we don't use
        # past failure data for the push itself.
        # The number 100 comes from the fact that in the past failure data
        # generation we store past failures in batches of 100 pushes.
push["all_possibly_selected"] = self.select_tests( commits, 0.5, push_num - 100 ) def do_eval( executor: concurrent.futures.ProcessPoolExecutor, confidence_threshold: float, reduction: Optional[float], cap: Optional[int], minimum: Optional[int], ) -> None: futures: Dict[concurrent.futures.Future, Dict[str, Any]] = {} for push in test_pushes.values(): futures[ executor.submit( eval_apply_transforms, self, push, confidence_threshold, reduction, cap, minimum, ) ] = push for future in concurrent.futures.as_completed(futures): exc = future.exception() if exc is not None: print( "Exception {} while running {}".format( exc, futures[future]["revs"][0] ) ) for f in futures: f.cancel() push = futures[future] selected, group_configs = future.result() if reduction is not None and self.granularity == "group": push["number_configs"] = len( set( sum( group_configs.values(), [], ) ) ) selected_config_groups = set( (config, group) for group, configs in group_configs.items() for config in configs ) caught_config_groups = selected_config_groups & set( push["config_group_failures"] ) push["caught_one_config_group"] = ( len(caught_config_groups) > 0 if len(push["config_group_failures"]) != 0 else None ) push["caught_percentage_config_group"] = ( len(caught_config_groups) / len(push["config_group_failures"]) if len(push["config_group_failures"]) != 0 else None ) caught = selected & set(push["failures"]) push["number_scheduled"] = len(selected) push["caught_one"] = ( len(caught) > 0 if len(push["failures"]) != 0 else None ) push["some_didnt_run"] = ( not selected.issubset(set(push["passes"]) | set(push["failures"])), ) push["caught_percentage"] = ( len(caught) / len(push["failures"]) if len(push["failures"]) != 0 else None ) min_scheduled = min( result["number_scheduled"] for result in test_pushes.values() ) max_scheduled = max( result["number_scheduled"] for result in test_pushes.values() ) average_scheduled = statistics.mean( result["number_scheduled"] for result in test_pushes.values() ) num_failing_pushes = sum( 1 for result in test_pushes.values() if result["caught_one"] is not None ) num_caught_one = sum( 1 for result in test_pushes.values() if result["caught_one"] ) num_caught_one_or_some_didnt_run = sum( 1 for result in test_pushes.values() if result["caught_one"] or (result["caught_one"] is not None and result["some_didnt_run"]) ) percentage_caught_one = 100 * num_caught_one / num_failing_pushes percentage_caught_one_or_some_didnt_run = ( 100 * num_caught_one_or_some_didnt_run / num_failing_pushes ) average_caught_percentage = 100 * statistics.mean( result["caught_percentage"] for result in test_pushes.values() if result["caught_percentage"] is not None ) reduction_str = ( f"enabled at {reduction * 100}%" if reduction is not None else "disabled" ) message = f"For confidence threshold {confidence_threshold}, with reduction {reduction_str}, cap at {cap}, and minimum at {minimum}: scheduled {average_scheduled} tasks on average (min {min_scheduled}, max {max_scheduled}). In {percentage_caught_one}% of pushes we caught at least one failure ({percentage_caught_one_or_some_didnt_run}% ignoring misses when some of our selected tasks didn't run). On average, we caught {average_caught_percentage}% of all seen failures." 
        if reduction is not None and self.granularity == "group":
            average_configs = statistics.mean(
                result["number_configs"] for result in test_pushes.values()
            )
            median_configs = statistics.median(
                result["number_configs"] for result in test_pushes.values()
            )
            message += f" On average, we selected {average_configs} configs (a median of {median_configs} configs)."

            num_caught_one_config_group = sum(
                1
                for result in test_pushes.values()
                if result["caught_one_config_group"]
            )
            percentage_caught_one_config_group = (
                100 * num_caught_one_config_group / num_failing_pushes
            )
            average_caught_percentage_config_group = 100 * statistics.mean(
                result["caught_percentage_config_group"]
                for result in test_pushes.values()
                if result["caught_percentage_config_group"] is not None
            )
            message += f" In {percentage_caught_one_config_group}% of pushes we caught at least one config/group failure. On average, we caught {average_caught_percentage_config_group}% of all seen config/group failures."

        print(message)

    with concurrent.futures.ProcessPoolExecutor(
        max_workers=utils.get_physical_cpu_count()
    ) as executor:
        scenarios = [
            (None, None, None),
            (10, None, None),
            (None, 300, None),
            (None, None, 0.9),
            (None, None, 1.0),
        ]
        for minimum, cap, reduction in scenarios:
            # Pre-generate equivalence sets, so when we run the config selection in multiple processes
            # we don't risk concurrent writes to the equivalence sets file.
            if reduction is not None and self.granularity == "group":
                self._get_equivalence_sets(reduction)

            for confidence_threshold in [0.5, 0.7, 0.8, 0.85, 0.9, 0.95]:
                do_eval(executor, confidence_threshold, reduction, cap, minimum)
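# A worked example (made-up numbers) of the push_num arithmetic used when
# selecting tests above: with last_push_num = 5000 and 10 test pushes, the
# i-th test push gets push_num = 5000 - (10 - (i + 1)), i.e. 4991..5000, and
# the lookup then uses push_num - 100 so that past failure data, which is
# stored in batches of 100 pushes, cannot include the push being evaluated.
last_push_num, n_test_pushes = 5000, 10
push_nums = [last_push_num - (n_test_pushes - (i + 1)) for i in range(n_test_pushes)]
assert push_nums == list(range(4991, 5001))
lookup_nums = [p - 100 for p in push_nums]  # strictly older than the push itself
assert lookup_nums == list(range(4891, 4901))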