def get_landed_and_filed_since(self, days: int) -> List[int]:
    """Collect the IDs of bugs that landed or were filed in the last `days` days.

    A bug is included when a commit pushed on or after the cutoff references
    it, or when it was created on or after the cutoff and its resolution is
    not one of the ignored resolutions listed below.  The bugs created in the
    time span are downloaded through `bugzilla` before the stored bugs are
    scanned.

    Args:
        days: Size of the look-back window, in days, counted from
            `datetime.utcnow()`.

    Returns:
        The collected bug IDs, without duplicates.
    """
    cutoff = datetime.utcnow() - timedelta(days=days)

    # First source: bugs referenced by commits pushed inside the window.
    recent_commits = []
    for commit in repository.get_commits():
        pushed_recently = dateutil.parser.parse(commit["pushdate"]) >= cutoff
        if pushed_recently and commit["bug_id"]:
            recent_commits.append(commit)

    logger.info(f"Retrieving bug IDs since {days} days ago")
    bugzilla.download_bugs(bugzilla.get_ids_between(cutoff, datetime.utcnow()))

    ignored_resolutions = [
        "INVALID",
        "WONTFIX",
        "INACTIVE",
        "DUPLICATE",
        "INCOMPLETE",
        "MOVED",
        "WORKSFORME",
    ]

    collected = {commit["bug_id"] for commit in recent_commits}

    # Second source: stored bugs created inside the window.
    for bug in bugzilla.get_bugs():
        # The timezone is dropped so the value compares against the naive cutoff.
        created = dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None)
        if created >= cutoff and bug["resolution"] not in ignored_resolutions:
            collected.add(bug["id"])

    return list(collected)
def get_regressors_of(self, bug_ids: List[int]) -> List[int]:
    """Return the `regressed_by` IDs of the given bugs.

    The given bugs are downloaded first.  Then the `regressed_by` lists of
    every stored bug whose ID is in `bug_ids` are concatenated, in the order
    in which `bugzilla.get_bugs()` yields the bugs.  Duplicates are kept.

    Args:
        bug_ids: IDs of the bugs whose regressors are wanted.

    Returns:
        A flat list of bug IDs.
    """
    bugzilla.download_bugs(bug_ids)

    # Build the lookup set once: the membership test runs for every stored bug.
    wanted_ids = set(bug_ids)

    # A single flattening pass avoids the quadratic list copies that
    # `sum(list_of_lists, [])` performs.
    return [
        regressor_id
        for bug in bugzilla.get_bugs()
        if bug["id"] in wanted_ids
        for regressor_id in bug["regressed_by"]
    ]
def retrieve_bugs(self):
    """Download the bugs needed for training and compress the bugs file.

    Steps, in order: set the Bugzilla token, download the bugs of the
    two-year window that ends six months ago, download the labelled bugs,
    re-download inconsistent bugs (at most three rounds), and finally
    compress 'data/bugs.json'.
    """
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    # The window ends six months ago and starts two years before that.
    window_end = datetime.utcnow() - relativedelta(months=6)
    window_start = window_end - relativedelta(years=2)
    logger.info('Downloading bugs from {} to {}'.format(window_start, window_end))
    bugzilla.download_bugs_between(window_start, window_end)

    logger.info('Downloading labelled bugs')
    bugzilla.download_bugs(labels.get_all_bug_ids())

    # Inconsistent bugs are deleted and fetched again; stop as soon as a
    # round finds none, and give up after three rounds.
    for _attempt in range(3):
        bug_ids = bug_snapshot.get_inconsistencies()
        if not len(bug_ids):
            break

        logger.info(
            f'Re-downloading {len(bug_ids)} bugs, as they were inconsistent'
        )
        bugzilla.delete_bugs(bug_ids)
        bugzilla.download_bugs(bug_ids)

    self.compress_file('data/bugs.json')
def retrieve_bugs(self):
    """Download the bugs needed for training and compress the bugs file.

    Steps, in order: set the Bugzilla token obtained from `get_secret`,
    download the bugs of the two-year window that ends six months ago,
    download the labelled bugs, re-download inconsistent bugs (at most three
    rounds), and finally compress "data/bugs.json".
    """
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    # The window ends six months ago and starts two years before that.
    window_end = datetime.utcnow() - relativedelta(months=6)
    window_start = window_end - relativedelta(years=2)
    logger.info("Downloading bugs from {} to {}".format(window_start, window_end))
    bugzilla.download_bugs_between(window_start, window_end)

    logger.info("Downloading labelled bugs")
    bugzilla.download_bugs(labels.get_all_bug_ids())

    # Inconsistent bugs are deleted and fetched again; stop as soon as a
    # round finds none, and give up after three rounds.
    for _attempt in range(3):
        bug_ids = bug_snapshot.get_inconsistencies()
        if not len(bug_ids):
            break

        logger.info(
            f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(bug_ids)
        bugzilla.download_bugs(bug_ids)

    self.compress_file("data/bugs.json")
def get_blocking_of(self, bug_ids: List[int], meta_only: bool = False) -> Dict[int, List[int]]:
    """Map each given bug to the result of `bugzilla.find_blocking` for it.

    The given bugs are downloaded first, then every stored bug is indexed by
    ID so that `bugzilla.find_blocking` can look bugs up.

    Args:
        bug_ids: IDs of the bugs to look up.
        meta_only: When true, bugs whose keywords do not contain "meta" are
            left out of the result.

    Returns:
        A dict keyed by bug ID.

    Raises:
        KeyError: If one of `bug_ids` is not among the stored bugs.
    """
    bugzilla.download_bugs(bug_ids)

    bugs_by_id = {}
    for stored_bug in bugzilla.get_bugs():
        bugs_by_id[stored_bug["id"]] = stored_bug

    result = {}
    for bug_id in bug_ids:
        bug = bugs_by_id[bug_id]
        if meta_only and "meta" not in bug["keywords"]:
            continue
        result[bug_id] = bugzilla.find_blocking(bugs_by_id, bug)
    return result
def retrieve_bugs(self):
    """Download a time window of bugs plus the labelled bugs, then compress.

    The window ends 182 days before now (UTC) and starts 365 days before
    that.  After both downloads, 'data/bugs.json' is compressed.
    """
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    window_end = datetime.utcnow() - timedelta(days=182)
    # NOTE(review): the window spans one year (365 days); confirm that this
    # is the intended length.
    window_start = window_end - timedelta(days=365)
    bugzilla.download_bugs_between(window_start, window_end)

    bugzilla.download_bugs(labels.get_all_bug_ids())

    self.compress_file('data/bugs.json')
def retrieve_bugs(self):
    """Download a time window of bugs plus the labelled bugs, then compress.

    The window ends 182 days before now (UTC) and starts 365 days before
    that.  Both downloads are announced in the log; afterwards
    'data/bugs.json' is compressed.
    """
    bugzilla.set_token(secrets[secrets.BUGZILLA_TOKEN])

    window_end = datetime.utcnow() - timedelta(days=182)
    # NOTE(review): the window spans one year (365 days); confirm that this
    # is the intended length.
    window_start = window_end - timedelta(days=365)

    logger.info('Downloading bugs from {} to {}'.format(window_start, window_end))
    bugzilla.download_bugs_between(window_start, window_end)

    logger.info('Downloading labelled bugs')
    bugzilla.download_bugs(labels.get_all_bug_ids())

    self.compress_file('data/bugs.json')
def go(self, days: int) -> None:
    """Refresh the bugs of interest and generate the three reports.

    Stored bugs modified since the last run of the bugs DB are deleted so
    that the download below fetches them again.  The bugs of interest (landed
    or filed bugs, bugs attached to test infos, meta bugs and the fuzzing
    meta bug) are then downloaded, and the landings-by-date,
    component-connections and component-test-stats generators are invoked.

    Args:
        days: Look-back window, in days, forwarded to the helper methods.
    """
    bugs = self.get_landed_and_filed_since(days)
    meta_bugs = self.get_meta_bugs(days)

    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Deleting bugs modified since the last run on {last_modified}")
    # Materialize as a set: the predicate below runs once per stored bug, so
    # membership must not be a linear scan.
    changed_ids = set(
        bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        }))
    bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids)

    bugs = list(set(bugs))

    test_infos = self.retrieve_test_info(days)
    test_info_bugs: List[int] = [
        bug["id"] for test_info in test_infos.values()
        for bug in test_info["bugs"]
    ]

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs + test_info_bugs + [FUZZING_METABUG_ID] +
                           meta_bugs)

    logger.info(f"{len(bugs)} bugs to analyze.")

    bugs_set = set(bugs + test_info_bugs + meta_bugs)

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        # Only add to the map bugs we are interested in, and bugs that block other bugs (needed for the bug_to_types call).
        if bug["id"] in bugs_set or len(bug["blocks"]) > 0:
            bug_map[bug["id"]] = bug

        # Any stored bug with at least one regression counts as a regressor.
        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs,
                                   self.get_blocking_of(meta_bugs))

    self.generate_component_connections(bug_map, bugs)

    self.generate_component_test_stats(bug_map, test_infos)
def go(self, days: int) -> None:
    """Download the bugs of interest and generate the three reports.

    The bugs of interest are the landed or filed bugs, the meta bugs together
    with the bugs `get_blocking_of` associates with them, and the bugs
    attached to test infos.

    Args:
        days: Look-back window, in days, forwarded to the helper methods.
    """
    bugs = self.get_landed_and_filed_since(days)

    # `meta_bugs` maps each meta bug ID to a list of related bug IDs; both the
    # keys and the listed IDs are analyzed.
    meta_bugs = self.get_blocking_of(self.get_meta_bugs(days))
    bugs.extend(meta_bugs.keys())
    for related_ids in meta_bugs.values():
        bugs.extend(related_ids)
    bugs = list(set(bugs))

    test_infos = self.retrieve_test_info(days)
    test_info_bugs: List[int] = []
    for test_info in test_infos.values():
        test_info_bugs.extend(bug["id"] for bug in test_info["bugs"])

    logger.info("Download bugs of interest...")
    wanted_ids = bugs + test_info_bugs
    bugzilla.download_bugs(wanted_ids)

    logger.info(f"{len(bugs)} bugs to analyze.")

    bugs_set = set(wanted_ids)

    bug_map = {}
    regressor_bug_ids = set()
    for stored_bug in bugzilla.get_bugs():
        stored_id = stored_bug["id"]

        # Only add to the map bugs we are interested in, and bugs that block other bugs (needed for the bug_to_types call).
        if stored_id in bugs_set or len(stored_bug["blocks"]) > 0:
            bug_map[stored_id] = stored_bug

        if len(stored_bug["regressions"]) > 0:
            regressor_bug_ids.add(stored_id)

    self.generate_landings_by_date(bug_map, regressor_bug_ids, bugs, meta_bugs)
    self.generate_component_connections(bug_map, bugs)
    self.generate_component_test_stats(bug_map, test_infos)
def go(self, bugs: List[int], meta_bugs: Optional[List[int]] = None) -> None:
    """Generate `landings_by_date.json` and `component_connections.json`.

    For every requested bug that could be downloaded, a summary is built
    (risk of its commits, regression flags, team, past bugs touching the same
    code, ...) and the summaries are grouped by bug creation date.  Commits
    are then grouped by bug component to describe which components are
    connected through past bugs and test failures.

    Args:
        bugs: IDs of the bugs to analyze.  The list is not modified.
        meta_bugs: Optional IDs of meta bugs.  When given, the meta bugs and
            the bugs returned by `get_blocking_of` for them are analyzed too,
            and a "featureMetaBugs" section is written.
    """
    if meta_bugs is not None:
        # Build a new list rather than using `+=`, which would silently grow
        # the list owned by the caller.
        bugs = bugs + meta_bugs + self.get_blocking_of(meta_bugs)

    # Drop duplicate IDs (keeping first-seen order) so that a bug that is
    # both requested and related to a meta bug is summarized only once.
    bugs = list(dict.fromkeys(bugs))

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs)

    component_team_mapping = bugzilla.get_component_team_mapping()

    bugs_set = set(bugs)

    commits = [
        commit for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    commit_map = {commit["node"]: commit for commit in commits}
    # Position of each commit in repository order; used to restore that order
    # after the commits are re-sorted by bug ID below.
    hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

    logger.info(f"{len(commits)} commits to analyze.")

    logger.info(f"{len(bugs_set)} bugs to analyze.")

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    logger.info("Retrieve Phabricator revisions linked to commits...")
    revision_ids = set(
        filter(None,
               (repository.get_revision_id(commit) for commit in commits)))

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    # Defined unconditionally: the bug summaries below read it even when no
    # meta bugs were given (it is then simply empty).
    blocker_to_meta = collections.defaultdict(set)
    if meta_bugs is not None:
        for meta_bug in meta_bugs:
            if meta_bug not in bug_map:
                continue

            for blocker_bug_id in bugzilla.find_blocking(
                    bug_map, bug_map[meta_bug]):
                blocker_to_meta[blocker_bug_id].add(meta_bug)

    def _download_past_bugs(url: str) -> dict:
        # The URL's basename minus its last four characters is used as the
        # local file name; the download is stored with a ".zst" suffix and
        # decompressed next to it.
        path = os.path.join("data", os.path.basename(url)[:-4])
        download_check_etag(url, path=f"{path}.zst")
        zstd_decompress(path)
        assert os.path.exists(path)
        with open(path, "r") as f:
            return json.load(f)

    past_regressions_by = {}
    past_fixed_bugs_by = {}
    past_regression_blocked_bugs_by = {}
    past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension))
        past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension))
        past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension))
        past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension))

    path_to_component = repository.get_component_mapping()

    def get_full_component(bug):
        return "{}::{}".format(bug["product"], bug["component"])

    def histogram(components: List[str]) -> Dict[str, float]:
        # Relative frequency of each value, most common first.
        counter = collections.Counter(components)
        return {
            component: count / len(components)
            for component, count in counter.most_common()
        }

    def component_histogram(bugs: List[dict]) -> Dict[str, float]:
        return histogram([bug["component"] for bug in bugs])

    def find_risk_band(risk: float) -> str:
        for name, start, end in self.risk_bands:
            if start <= risk <= end:
                return name

        # No band covers this risk value.
        assert False

    def get_prev_bugs(past_bugs_by: dict,
                      commit: repository.CommitDict,
                      component: str = None) -> List[dict]:
        # Collect past bugs for the files touched by `commit`, using the most
        # specific information available for each file: function level
        # first, then file, then directory, then component.  A path is
        # removed from `paths` once a level produced a match for it.
        paths = [
            path for path in commit["files"]
            if component is None or (
                path.encode("utf-8") in path_to_component
                and path_to_component[path.encode("utf-8")] ==
                component.encode("utf-8"))
        ]

        past_bugs = []

        for path, f_group in commit["functions"].items():
            if path not in paths:
                continue

            if path not in past_bugs_by["function"]:
                continue

            found = False
            for f in f_group:
                if f[0] not in past_bugs_by["function"][path]:
                    continue

                found = True
                past_bugs += past_bugs_by["function"][path][f[0]]

            if found:
                paths.remove(path)

        # Iterate over a snapshot: removing from the list being iterated
        # would make the loop skip the element following each removal.
        for path in list(paths):
            if path in past_bugs_by["file"]:
                past_bugs += past_bugs_by["file"][path]
                paths.remove(path)

        # Same snapshot precaution for the directory level.
        for path, directories in zip(list(paths),
                                     repository.get_directories(paths)):
            found = False
            for directory in directories:
                if directory in past_bugs_by["directory"]:
                    found = True
                    past_bugs += past_bugs_by["directory"][directory]

            if found:
                paths.remove(path)

        remaining_components = [
            path_to_component[path.encode("utf-8")].tobytes().decode("utf-8")
            for path in paths if path.encode("utf-8") in path_to_component
        ]

        for remaining_component in remaining_components:
            if remaining_component in past_bugs_by["component"]:
                past_bugs += past_bugs_by["component"][remaining_component]

        return past_bugs

    def get_prev_bugs_stats(
        commit_group: dict,
        commit_list: List[repository.CommitDict],
        component: str = None,
    ) -> None:
        # Find previous regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were fixed by touching the same files as these commits.
        # And find previous bugs that were blocked by regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were blocked by bugs that were fixed by touching the same files as those touched by these commits.
        prev_regressions: List[Dict[str, Any]] = sum(
            (get_prev_bugs(past_regressions_by, commit, component)
             for commit in commit_list),
            [],
        )
        prev_fixed_bugs: List[Dict[str, Any]] = sum(
            (get_prev_bugs(past_fixed_bugs_by, commit, component)
             for commit in commit_list),
            [],
        )
        prev_regression_blocked_bugs: List[Dict[str, Any]] = sum(
            (get_prev_bugs(past_regression_blocked_bugs_by, commit, component)
             for commit in commit_list),
            [],
        )
        prev_fixed_bug_blocked_bugs: List[Dict[str, Any]] = sum(
            (get_prev_bugs(past_fixed_bug_blocked_bugs_by, commit, component)
             for commit in commit_list),
            [],
        )

        prev_regressions = _deduplicate(prev_regressions)
        prev_fixed_bugs = _deduplicate(prev_fixed_bugs)
        prev_regression_blocked_bugs = _deduplicate(
            prev_regression_blocked_bugs)
        prev_fixed_bug_blocked_bugs = _deduplicate(
            prev_fixed_bug_blocked_bugs)

        regression_components = component_histogram(prev_regressions)
        fixed_bugs_components = component_histogram(prev_fixed_bugs)
        regression_blocked_bug_components = component_histogram(
            prev_regression_blocked_bugs)
        fixed_bug_blocked_bug_components = component_histogram(
            prev_fixed_bug_blocked_bugs)

        commit_group[
            "most_common_regression_components"] = regression_components

        # These are only used for component connections for the time being.
        if component:
            commit_group["prev_regressions"] = prev_regressions[-3:]
            commit_group["prev_fixed_bugs"] = prev_fixed_bugs[-3:]
            commit_group[
                "prev_regression_blocked_bugs"] = prev_regression_blocked_bugs[
                    -3:]
            commit_group[
                "prev_fixed_bug_blocked_bugs"] = prev_fixed_bug_blocked_bugs[
                    -3:]
            commit_group[
                "most_common_fixed_bugs_components"] = fixed_bugs_components
            commit_group[
                "most_common_regression_blocked_bug_components"] = regression_blocked_bug_components
            commit_group[
                "most_common_fixed_bug_blocked_bug_components"] = fixed_bug_blocked_bug_components

    def get_commit_data(
            commit_list: List[repository.CommitDict]) -> List[dict]:
        if len(commit_list) == 0:
            return []

        # Evaluate risk of commits associated to this bug.
        probs = self.regressor_model.classify(commit_list, probabilities=True)

        commits_data = []
        for i, commit in enumerate(commit_list):
            revision_id = repository.get_revision_id(commit)
            if revision_id in revision_map:
                # A known revision without a testing project is reported as
                # "missing"; a commit without a known revision gets None.
                testing = phabricator.get_testing_project(
                    revision_map[revision_id])

                if testing is None:
                    testing = "missing"
            else:
                testing = None

            commits_data.append({
                "id": commit["node"],
                "testing": testing,
                "risk": float(probs[i][1]),
                "backedout": bool(commit["backedoutby"]),
                "author": commit["author_email"],
                "reviewers": commit["reviewers"],
                "coverage": [
                    commit["cov_added"],
                    commit["cov_covered"],
                    commit["cov_unknown"],
                ],
            })

        return commits_data

    # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
    commits.sort(key=lambda x: x["bug_id"])

    bug_to_commits = {}
    for bug_id, commit_iter in itertools.groupby(commits,
                                                 lambda x: x["bug_id"]):
        # TODO: Figure out what to do with bugs we couldn't download (security bugs).
        if bug_id not in bug_map:
            continue

        bug_to_commits[bug_id] = sorted(
            commit_iter, key=lambda x: hash_to_rev[x["node"]])

    bug_summaries = []
    for bug_id in bugs:
        if bug_id not in bug_map:
            continue

        commit_list = bug_to_commits.get(bug_id, [])
        commit_data = get_commit_data(commit_list)

        bug = bug_map[bug_id]

        bug_summary = {
            "id": bug_id,
            "regressor": bug_id in regressor_bug_ids,
            "regression": len(bug["regressed_by"]) > 0
            or any(keyword in bug["keywords"]
                   for keyword in ["regression", "talos-regression"])
            or ("cf_has_regression_range" in bug
                and bug["cf_has_regression_range"] == "yes"),
            "whiteboard": bug["whiteboard"],
            "assignee": bug["assigned_to"]
            if bug["assigned_to"] != "*****@*****.**" else None,
            "versions": bugzilla.get_fixed_versions(bug),
            "component": get_full_component(bug),
            "team": bugzilla.component_to_team(component_team_mapping,
                                               bug["product"],
                                               bug["component"]),
            "summary": bug["summary"],
            "types": bug_to_types(bug),
            "severity": bug["severity"],
            "creation_date": dateutil.parser.parse(
                bug["creation_time"]).strftime("%Y-%m-%d"),
            "date": max(
                dateutil.parser.parse(commit["pushdate"])
                for commit in commit_list).strftime("%Y-%m-%d")
            if len(commit_list) > 0 else None,
            "commits": commit_data,
            "meta_ids": list(blocker_to_meta[bug_id]),
            "risk_band": find_risk_band(
                max(commit["risk"] for commit in commit_data))
            if len(commit_data) > 0 else None,
        }

        get_prev_bugs_stats(bug_summary, commit_list)

        bug_summaries.append(bug_summary)

    landings_by_date = collections.defaultdict(list)
    for bug_summary in bug_summaries:
        landings_by_date[bug_summary["creation_date"]].append(bug_summary)

    with open("landings_by_date.json", "w") as f:
        output: dict = {
            "summaries": landings_by_date,
        }
        if meta_bugs is not None:
            output["featureMetaBugs"] = [{
                "id": meta_bug,
                "summary": bug_map[meta_bug]["summary"]
            } for meta_bug in meta_bugs]

        json.dump(output, f)

    # Retrieve components of test failures that occurred when landing patches to fix bugs in specific components.
    component_failures = collections.defaultdict(list)

    push_data_iter, push_data_count, _all_runnables = test_scheduling.get_push_data(
        "group")

    for revisions, _, _, possible_regressions, likely_regressions in tqdm(
            push_data_iter(), total=push_data_count):
        commit_list = [
            commit_map[revision] for revision in revisions
            if revision in commit_map
        ]
        if len(commit_list) == 0:
            continue

        commit_bugs = [
            bug_map[commit["bug_id"]] for commit in commit_list
            if commit["bug_id"] in bug_map
        ]

        components = list(
            set(get_full_component(bug) for bug in commit_bugs))

        groups = [
            group
            for group in list(set(possible_regressions + likely_regressions))
            if group.encode("utf-8") in path_to_component
        ]

        for group in groups:
            for component in components:
                component_failures[component].append(path_to_component[
                    group.encode("utf-8")].tobytes().decode("utf-8"))

    # Filter out commits for which we have no bugs.
    commits = [commit for commit in commits if commit["bug_id"] in bug_map]

    # Sort commits by bug component, so we can use itertools.groupby to group them by bug component.
    commits.sort(key=lambda x: get_full_component(bug_map[x["bug_id"]]))

    commit_groups = []
    for component, commit_iter in itertools.groupby(
            commits, lambda x: get_full_component(bug_map[x["bug_id"]])):
        commit_group = {
            "component": component,
            "most_common_test_failure_components": histogram(
                component_failures[component])
            if component in component_failures else {},
        }
        get_prev_bugs_stats(commit_group, list(commit_iter), component)
        commit_groups.append(commit_group)

    with open("component_connections.json", "w") as f:
        json.dump(commit_groups, f)

    repository.close_component_mapping()
def get_blocking_of(self, bug_ids: List[int]) -> List[int]:
    """Return the concatenated `bugzilla.find_blocking` results of the bugs.

    The given bugs are downloaded first, then every stored bug is indexed by
    ID so that `bugzilla.find_blocking` can look bugs up.  The results are
    concatenated in the order of `bug_ids`; duplicates are kept.

    Args:
        bug_ids: IDs of the bugs to look up.

    Returns:
        A flat list of bug IDs.

    Raises:
        KeyError: If one of `bug_ids` is not among the stored bugs.
    """
    bugzilla.download_bugs(bug_ids)

    bug_map = {bug["id"]: bug for bug in bugzilla.get_bugs()}

    # A single flattening pass avoids the quadratic list copies that
    # `sum(list_of_lists, [])` performs.
    return [
        found_id
        for bug_id in bug_ids
        for found_id in bugzilla.find_blocking(bug_map, bug_map[bug_id])
    ]
def retrieve_bugs(self, limit=None):
    """Bring the bugs DB up to date and compress it.

    The bugs to keep are: bugs of the two-year window that ends six months
    ago, labelled bugs, bugs linked to commits pushed in the last three
    years, and bugs referenced by the `regressed_by` / `regressions` fields
    of stored bugs.  Stored bugs that changed since the last run, or that
    are no longer wanted, are deleted before the download.  Inconsistent
    bugs are then re-downloaded (at most three rounds) and "data/bugs.json"
    is compressed.

    Args:
        limit: When truthy, each list of IDs gathered before the first
            download is truncated to this many entries.  When not None, the
            commits DB is not downloaded.
    """
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    def _regression_related_ids():
        # Flatten the `regressed_by` and `regressions` IDs of all stored bugs
        # in one pass; `sum(lists, [])` would copy the growing result once
        # per stored bug.
        return [
            related_id
            for bug in bugzilla.get_bugs()
            for related_id in bug["regressed_by"] + bug["regressions"]
        ]

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")
    # The deletion predicate below runs once per stored bug, so membership
    # must not be a linear scan.
    changed_ids_set = set(changed_ids)

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
    if limit is None:
        # The download is kept outside the assert statement so that it still
        # happens when assertions are disabled (python -O).
        commits_db_downloaded = db.download(repository.COMMITS_DB)
        assert commits_db_downloaded

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"]
        and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
    regressed_by_bug_ids = _regression_related_ids()
    if limit:
        regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = (
        timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids_set or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = _regression_related_ids()
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for _attempt in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")
def go(self, bugs: List[int], meta_bugs: Optional[List[int]] = None) -> None:
    """Generate `landings_by_date.json` for the commits of the given bugs.

    Commits linked to the requested bugs are grouped by bug.  For each group
    the risk of every commit is evaluated with the regressor model, past
    bugs that touched the same files are looked up, and the result is stored
    under the date of the group's most recent push.

    Args:
        bugs: IDs of the bugs to analyze.  The list is not modified.
        meta_bugs: Optional IDs of meta bugs.  When given, the meta bugs and
            the bugs returned by `get_blocking_of` for them are analyzed too,
            and a "featureMetaBugs" section is written.
    """
    if meta_bugs is not None:
        # Build a new list rather than using `+=`, which would silently grow
        # the list owned by the caller.
        bugs = bugs + meta_bugs + self.get_blocking_of(meta_bugs)

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(bugs)

    bugs_set = set(bugs)

    commits = [
        commit for commit in repository.get_commits()
        if commit["bug_id"] in bugs_set
    ]
    # Position of each commit in repository order; used to restore that order
    # inside each group after the commits are re-sorted by bug ID below.
    hash_to_rev = {commit["node"]: i for i, commit in enumerate(commits)}

    logger.info(f"{len(commits)} commits to analyze.")

    bug_ids = {commit["bug_id"] for commit in commits}

    logger.info(f"{len(bug_ids)} bugs to analyze.")

    bug_map = {}
    regressor_bug_ids = set()
    for bug in bugzilla.get_bugs():
        if bug["id"] in bugs_set:
            bug_map[bug["id"]] = bug

        if len(bug["regressions"]) > 0:
            regressor_bug_ids.add(bug["id"])

    logger.info("Retrieve Phabricator revisions linked to commits...")
    revision_ids = set(
        filter(None,
               (repository.get_revision_id(commit) for commit in commits)))

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    # Defined unconditionally: the commit groups below read it even when no
    # meta bugs were given (it is then simply empty).
    blocker_to_meta = collections.defaultdict(set)
    if meta_bugs is not None:
        for meta_bug in meta_bugs:
            if meta_bug not in bug_map:
                continue

            for blocker_bug_id in bugzilla.find_blocking(
                    bug_map, bug_map[meta_bug]):
                blocker_to_meta[blocker_bug_id].add(meta_bug)

    # TODO: Use past regressions by function information too (maybe first by function and if no results by file? or prioritize function and recentness?)

    def _download_past_bugs(url: str) -> dict:
        # The URL's basename minus its last four characters is used as the
        # local file name; the download is stored with a ".zst" suffix and
        # decompressed next to it.
        path = os.path.join("data", os.path.basename(url)[:-4])
        download_check_etag(url, path=f"{path}.zst")
        zstd_decompress(path)
        assert os.path.exists(path)
        with open(path, "r") as f:
            return json.load(f)

    past_regressions_by_file = _download_past_bugs(
        PAST_REGRESSIONS_BY_FILE_URL)
    past_fixed_bugs_by_file = _download_past_bugs(
        PAST_FIXED_BUGS_BY_FILE_URL)
    past_regression_blocked_bugs_by_file = _download_past_bugs(
        PAST_REGRESSION_BLOCKED_BUGS_BY_FILE_URL)
    past_fixed_bug_blocked_bugs_by_file = _download_past_bugs(
        PAST_FIXED_BUG_BLOCKED_BUGS_BY_FILE_URL)

    def component_histogram(bugs: List[dict]) -> Dict[str, float]:
        # Relative frequency of each component, most common first.
        counter = collections.Counter(bug["component"] for bug in bugs)
        return {
            component: count / len(bugs)
            for component, count in counter.most_common()
        }

    def _past_bugs_for(commit_list: list,
                       past_bugs_by_file: dict) -> List[Dict[str, Any]]:
        # Past bug summaries recorded for the files touched by the commits,
        # in commit order and then file order, duplicates included.
        found: List[Dict[str, Any]] = []
        for commit in commit_list:
            for path in commit["files"]:
                if path in past_bugs_by_file:
                    found.extend(past_bugs_by_file[path])
        return found

    # Sort commits by bug ID, so we can use itertools.groupby to group them by bug ID.
    commits.sort(key=lambda x: x["bug_id"])

    commit_groups = []
    for bug_id, commit_iter in itertools.groupby(commits,
                                                 lambda x: x["bug_id"]):
        # TODO: Figure out what to do with bugs we couldn't download (security bugs).
        if bug_id not in bug_map:
            continue

        commit_list = list(commit_iter)
        commit_list.sort(key=lambda x: hash_to_rev[x["node"]])

        # Find previous regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were fixed by touching the same files as these commits.
        # And find previous bugs that were blocked by regressions occurred in the same files as those touched by these commits.
        # And find previous bugs that were blocked by bugs that were fixed by touching the same files as those touched by these commits.
        prev_regressions = _deduplicate(
            _past_bugs_for(commit_list, past_regressions_by_file))
        prev_fixed_bugs = _deduplicate(
            _past_bugs_for(commit_list, past_fixed_bugs_by_file))
        prev_regression_blocked_bugs = _deduplicate(
            _past_bugs_for(commit_list,
                           past_regression_blocked_bugs_by_file))
        prev_fixed_bug_blocked_bugs = _deduplicate(
            _past_bugs_for(commit_list, past_fixed_bug_blocked_bugs_by_file))

        regression_components = component_histogram(prev_regressions)
        fixed_bugs_components = component_histogram(prev_fixed_bugs)
        regression_blocked_bug_components = component_histogram(
            prev_regression_blocked_bugs)
        fixed_bug_blocked_bug_components = component_histogram(
            prev_fixed_bug_blocked_bugs)

        # Evaluate risk of commits associated to this bug.
        probs = self.regressor_model.classify(commit_list, probabilities=True)

        commits_data = []
        for i, commit in enumerate(commit_list):
            revision_id = repository.get_revision_id(commit)
            if revision_id in revision_map:
                # A known revision without a testing project is reported as
                # "none"; a commit without a known revision gets None.
                testing = phabricator.get_testing_project(
                    revision_map[revision_id])

                if testing is None:
                    testing = "none"
            else:
                testing = None

            commits_data.append({
                "id": commit["node"],
                "testing": testing,
                "risk": float(probs[i][1]),
                "backedout": bool(commit["backedoutby"]),
                "regressor": commit["bug_id"] in regressor_bug_ids,
            })

        bug = bug_map[bug_id]

        commit_groups.append({
            "id": bug_id,
            "versions": bugzilla.get_fixed_versions(bug),
            "component": "{}::{}".format(bug["product"], bug["component"]),
            "summary": bug["summary"],
            "date": max(
                dateutil.parser.parse(commit["pushdate"])
                for commit in commit_list).strftime("%Y-%m-%d"),
            "commits": commits_data,
            "meta_ids": list(blocker_to_meta[bug_id]),
            "prev_regressions": prev_regressions[-3:],
            "prev_fixed_bugs": prev_fixed_bugs[-3:],
            "prev_regression_blocked_bugs": prev_regression_blocked_bugs[-3:],
            "prev_fixed_bug_blocked_bugs": prev_fixed_bug_blocked_bugs[-3:],
            "most_common_regression_components": regression_components,
            "most_common_fixed_bugs_components": fixed_bugs_components,
            "most_common_regression_blocked_bug_components":
            regression_blocked_bug_components,
            "most_common_fixed_bug_blocked_bug_components":
            fixed_bug_blocked_bug_components,
        })

    landings_by_date = collections.defaultdict(list)
    for commit_group in commit_groups:
        landings_by_date[commit_group["date"]].append(commit_group)

    with open("landings_by_date.json", "w") as f:
        output: dict = {
            "landings": landings_by_date,
        }
        if meta_bugs is not None:
            output["featureMetaBugs"] = [{
                "id": meta_bug,
                "summary": bug_map[meta_bug]["summary"]
            } for meta_bug in meta_bugs]

        json.dump(output, f)
def retrieve_bugs(self):
    """Bring the bugs DB up to date and compress it.

    The bugs to keep are the bugs of the two-year window that ends six months
    ago plus the labelled bugs.  Stored bugs that changed since the last run,
    or that are no longer wanted, are deleted before the download.
    Inconsistent bugs are then re-downloaded (at most three rounds) and
    "data/bugs.json" is compressed.
    """
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    # The existing DB is only downloaded when `db.is_old_version` reports
    # that it is not an old version.
    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids({
        "f1": "delta_ts",
        "o1": "greaterthaneq",
        "v1": last_modified.date()
    })
    logger.info(f"Retrieved {len(changed_ids)} IDs.")
    # The deletion predicate below runs once per stored bug, so membership
    # must not be a linear scan.
    changed_ids_set = set(changed_ids)

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago,
                                            six_months_ago)
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids_set or bug["id"] not in all_ids)

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for _attempt in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
        inconsistent_bugs = bug_snapshot.get_inconsistencies(
            inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")
def retrieve_bugs(self, limit: int = None) -> None:
    """Bring the bugs DB up to date and compress it.

    The bugs to keep are: bugs filed since two years and six months ago,
    labelled bugs, bugs linked to commits pushed in the last three years,
    bugs referenced by the `regressed_by` / `regressions` / `blocks` fields
    of stored bugs, and bugs returned by `test_scheduling.get_failure_bugs`.
    Stored bugs that changed since the last run, that belong to a component
    missing from `bugzilla.get_product_component_count`, or that are no
    longer wanted are deleted before the download.  Related bugs are then
    followed for at most seven rounds, inconsistent bugs are re-downloaded
    (at most two rounds), bugs without a "history" field are deleted, and the
    DB file is compressed.

    Args:
        limit: When truthy, each list of IDs is truncated to its last `limit`
            entries.  When not None, the commits DB is not downloaded.
    """
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    def _related_ids(source_bugs) -> List[int]:
        # Distinct IDs referenced by the regressed_by / regressions / blocks
        # fields of the given bugs.  Flattening in one pass avoids the
        # quadratic copying of `sum(lists, [])`.
        return list({
            related_id
            for bug in source_bugs
            for related_id in bug["regressed_by"] + bug["regressions"] +
            bug["blocks"]
        })

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids({
            "f1": "delta_ts",
            "o1": "greaterthaneq",
            "v1": last_modified.date()
        }))
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Bugs whose "product::component" is not among the known components are
    # treated like changed bugs, i.e. deleted below.
    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"] for bug in bugzilla.get_bugs() if "{}::{}".format(
            bug["product"], bug["component"]) not in all_components)
    logger.info(
        f"{len(deleted_component_ids)} bugs belonging to deleted components"
    )
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6)
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to avoid the integration test fail when the commits DB is bumped.
    if limit is None:
        # The download is kept outside the assert statement so that it still
        # happens when assertions are disabled (python -O).
        commits_db_downloaded = db.download(repository.COMMITS_DB)
        assert commits_db_downloaded

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(commit["bug_id"] for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date))
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(
        f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
    # and blocked bugs.
    regression_related_ids: List[int] = _related_ids(bugzilla.get_bugs())
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"] for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow())
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (timespan_ids + labelled_bug_ids + commit_bug_ids +
               regression_related_ids + test_failure_bug_ids)
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(lambda bug: bug["id"] in changed_ids or bug["id"]
                         not in all_ids_set)

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after downloading new bugs).
    for _round in range(7):
        regression_related_ids = _related_ids(new_bugs)
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all bugs we needed, break.
        # (The prebuilt set is used so the list is not re-hashed every round.)
        if set(regression_related_ids).issubset(all_ids_set):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for _attempt in range(2):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass
        inconsistent_bugs = bug_snapshot.get_inconsistencies(
            inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    # TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"]
        for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)
def go(self, days_start: int, days_end: int) -> None:
    """Print statistics about the testing tags of recently landed revisions.

    The commits returned by ``self.get_landed_since(days_start, days_end)``
    are matched with their Phabricator revisions, restricted to those whose
    revision carries a testing tag, and then three distributions of testing
    tags are printed to stdout: one for all remaining commits, one for
    commits that were backed out, and one for commits whose bug has at
    least one entry in its ``regressions`` field.

    Args:
        days_start: Forwarded unchanged to ``self.get_landed_since``.
        days_end: Forwarded unchanged to ``self.get_landed_since``.
            NOTE(review): presumably both bound a "days ago" window --
            confirm against ``get_landed_since``.
    """
    commits = self.get_landed_since(days_start, days_end)

    logger.info("Retrieve Phabricator revisions linked to commits...")
    # `filter(None, ...)` drops commits for which no revision ID was found.
    revision_ids = set(
        filter(None, (repository.get_revision_id(commit) for commit in commits))
    )

    logger.info("Download revisions of interest...")
    phabricator.download_revisions(revision_ids)

    # Only keep the revisions linked to the commits under analysis.
    revision_map = {
        revision["id"]: revision
        for revision in phabricator.get_revisions()
        if revision["id"] in revision_ids
    }

    logger.info("Download bugs of interest...")
    bugzilla.download_bugs(
        commit["bug_id"] for commit in commits if commit["bug_id"]
    )

    # Filter-out commits with no (downloaded) Phabricator revision linked to them.
    commits = [
        commit
        for commit in commits
        if repository.get_revision_id(commit) in revision_map
    ]
    logger.info(f"{len(commits)} revisions")

    # Filter-out commits with no testing tags.
    commits = [
        commit
        for commit in commits
        if phabricator.get_testing_project(
            revision_map[repository.get_revision_id(commit)]
        )
        is not None
    ]
    logger.info(f"{len(commits)} revisions with testing tags")

    def list_testing_projects(
        commits: Iterable[repository.CommitDict],
    ) -> Collection[str]:
        # One testing tag per commit; falsy tags are dropped.
        return list(
            filter(
                None,
                (
                    phabricator.get_testing_project(
                        revision_map[repository.get_revision_id(commit)]
                    )
                    for commit in commits
                ),
            )
        )

    def print_testing_project_stats(
        title: str, group: Collection[repository.CommitDict]
    ) -> None:
        # Shared reporting routine for the three commit groups below.
        # Percentages are relative to the number of testing tags found in
        # the group; an empty group prints only the header (the Counter is
        # empty, so the division is never reached).
        testing_projects = list_testing_projects(group)
        print(f"{title} (in {len(group)} revisions):")
        for testing_project, count in collections.Counter(
            testing_projects
        ).most_common():
            print(
                f"{testing_project} - {round(100 * count / len(testing_projects), 1)}%"
            )

    print_testing_project_stats("Most common testing tags", commits)

    backedout_commits = [commit for commit in commits if commit["backedoutby"]]
    print_testing_project_stats(
        "\nMost common testing tags for backed-out revisions", backedout_commits
    )

    # Bugs that have at least one recorded regression.
    regressor_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if len(bug["regressions"]) > 0
    }
    regressor_commits = [
        commit for commit in commits if commit["bug_id"] in regressor_bug_ids
    ]
    print_testing_project_stats(
        "\nMost common testing tags for revisions which caused regressions",
        regressor_commits,
    )