def print_uncaught(
    granularity: str, scheduler1: str, scheduler2: Optional[str] = None
) -> None:
    push_data_db = (
        test_scheduling.PUSH_DATA_GROUP_DB
        if granularity == "group"
        else test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
    )
    assert db.download(push_data_db)

    regressions_by_rev = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        push_data_db
    ):
        regressions_by_rev[revisions[0]] = get_regressions(
            granularity, likely_regressions, possible_regressions
        )

    for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB):
        if len(scheduler_stat["schedulers"]) == 0:
            continue

        rev = scheduler_stat["id"]
        if rev not in regressions_by_rev:
            continue

        regressions = regressions_by_rev[rev]
        if len(regressions) == 0:
            continue

        scheduled_by_scheduler = {}
        caught_by_scheduler = {}

        for scheduler in scheduler_stat["schedulers"]:
            scheduled = get_scheduled(granularity, scheduler)
            scheduled_by_scheduler[scheduler["name"]] = scheduled
            caught_by_scheduler[scheduler["name"]] = regressions & scheduled

        if scheduler1 not in caught_by_scheduler:
            continue

        if len(caught_by_scheduler[scheduler1]) == 0:
            # When no second scheduler was given, or it didn't run on this push,
            # only report scheduler1.
            if scheduler2 is None or scheduler2 not in caught_by_scheduler:
                print(
                    f"{scheduler1} didn't catch any of the {len(regressions)} regressions on {rev}"
                )
            elif len(caught_by_scheduler[scheduler2]) == 0:
                print(
                    f"{scheduler1} and {scheduler2} didn't catch any of the {len(regressions)} regressions on {rev}"
                )
            else:
                print(
                    f"{scheduler1} didn't catch any of the {len(regressions)} regressions on {rev}, while {scheduler2} did"
                )

            print(f"Regressions: {regressions}")
            print(f"Scheduled by {scheduler1}: {scheduled_by_scheduler[scheduler1]}")
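# A minimal usage sketch for print_uncaught above, assuming SHADOW_SCHEDULER_STATS_DB
# has already been downloaded (the function downloads the push data DB itself).
# The scheduler names are hypothetical placeholders; real names come from the
# "schedulers" entries stored in the shadow scheduler statistics.
if __name__ == "__main__":
    # Compare one shadow scheduler against another at group granularity.
    print_uncaught("group", "scheduler_a", "scheduler_b")
    # Or inspect a single scheduler only.
    print_uncaught("group", "scheduler_a")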
def test_delete(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 9))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)
    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
def test_append(mock_db, db_format, db_compression):
    db_path = mock_db(db_format, db_compression)

    db.write(db_path, range(1, 4))
    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def test_append_compressed(tmp_path):
    db_path = tmp_path / "prova.json.gz"
    db.register(db_path, "https://alink", 1)

    db.write(db_path, range(1, 4))
    assert list(db.read(db_path)) == [1, 2, 3]

    db.append(db_path, range(4, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def main():
    description = "Find bug-introducing commits from bug-fixing commits"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("what", choices=["to_ignore", "bug_fixing", "bug_introducing"])
    parser.add_argument(
        "--repo_dir",
        help="Path to a Gecko repository. If no repository exists, it will be cloned to this location.",
    )
    parser.add_argument(
        "--git_repo_url", help="URL to the git repository on which to run SZZ."
    )
    parser.add_argument(
        "--git_repo_dir", help="Path where the git repository will be cloned."
    )
    parser.add_argument(
        "--tokenized_git_repo_url",
        help="URL to the tokenized git repository on which to run SZZ.",
    )
    parser.add_argument(
        "--tokenized_git_repo_dir",
        help="Path where the tokenized git repository will be cloned.",
    )

    args = parser.parse_args()

    regressor_finder = RegressorFinder(
        args.repo_dir,
        args.git_repo_url,
        args.git_repo_dir,
        args.tokenized_git_repo_url,
        args.tokenized_git_repo_dir,
    )

    if args.what == "to_ignore":
        regressor_finder.get_commits_to_ignore()
    elif args.what == "bug_fixing":
        regressor_finder.find_bug_fixing_commits()
    elif args.what == "bug_introducing":
        assert args.git_repo_url or args.tokenized_git_repo_url

        if args.git_repo_url:
            assert not args.tokenized_git_repo_url
            regressor_finder.find_bug_introducing_commits(args.git_repo_dir, False)
            evaluate(db.read(BUG_INTRODUCING_COMMITS_DB))

        if args.tokenized_git_repo_url:
            assert not args.git_repo_url
            regressor_finder.find_bug_introducing_commits(
                args.tokenized_git_repo_dir, True
            )
            evaluate(db.read(TOKENIZED_BUG_INTRODUCING_COMMITS_DB))
def test_delete_compressed(tmp_path):
    db_path = tmp_path / "prova.json.gz"
    print(db_path)
    db.register(db_path, "https://alink", 1)

    db.write(db_path, range(1, 9))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7, 8]

    db.delete(db_path, lambda x: x == 4)
    assert list(db.read(db_path)) == [1, 2, 3, 5, 6, 7, 8]
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    if db.is_old_version(IGNORED_COMMITS_DB) or not db.exists(IGNORED_COMMITS_DB):
        db.download(IGNORED_COMMITS_DB, force=True)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    if len(prev_commits_to_ignore) > 0:
        rev_start = "children({})".format(prev_commits_to_ignore[-1]["rev"])
    else:
        rev_start = 0

    # 2 days more than the end date, so we can know if a commit was backed-out.
    # We have to do this as recent commits might be missing in the mercurial <-> git map,
    # otherwise we could just use "tip".
    end_date = datetime.now() - RELATIVE_END_DATE + relativedelta(days=2)

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(
            hg, rev_start, "pushdate('{}')".format(end_date.strftime("%Y-%m-%d"))
        )

    # Given that we use the pushdate, there might be cases where the starting commit
    # is returned too (e.g. if we rerun the task on the same day).
    if len(prev_commits_to_ignore) > 0:
        found_prev = -1
        for i, rev in enumerate(revs):
            if rev.decode("utf-8") == prev_commits_to_ignore[-1]["rev"]:
                found_prev = i
                break
        revs = revs[found_prev + 1 :]

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.append(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)

    return prev_commits_to_ignore + commits_to_ignore
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    db.download(IGNORED_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    # When we already have some analyzed commits, re-analyze the last 3500 to make sure
    # we didn't miss back-outs that happened since the last analysis.
    if len(prev_commits_to_ignore) > 0:
        first_commit_to_reanalyze = (
            -3500 if len(prev_commits_to_ignore) >= 3500 else 0
        )
        rev_start = "children({})".format(
            prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
        )
    else:
        rev_start = 0

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    with hglib.open(self.mercurial_repo_dir) as hg:
        repository.set_commits_to_ignore(hg, self.mercurial_repo_dir, commits)

    for commit in commits:
        commit.ignored |= commit.author_email == "*****@*****.**"

    chosen_commits = set()
    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )
            chosen_commits.add(commit.node)

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    for prev_commit in prev_commits_to_ignore[::-1]:
        if prev_commit["rev"] not in chosen_commits:
            commits_to_ignore.append(prev_commit)
            chosen_commits.add(prev_commit["rev"])

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
def get_labels(self):
    classes = {}

    regressors = set(
        r["bug_introducing_rev"]
        for r in db.read(BUG_INTRODUCING_COMMITS_DB)
        if r["bug_introducing_rev"]
    )

    for commit_data in repository.get_commits():
        if commit_data["ever_backedout"]:
            continue

        node = commit_data["node"]
        if node in regressors:
            classes[node] = 1
        else:
            push_date = dateutil.parser.parse(commit_data["pushdate"])

            # The labels we have are only from two years and six months ago (see the regressor finder script).
            if push_date < datetime.utcnow() - relativedelta(years=2, months=6):
                continue

            # We remove the last 6 months, as there could be regressions which haven't been filed yet.
            if push_date > datetime.utcnow() - relativedelta(months=6):
                continue

            classes[node] = 0

    print(
        "{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )

    print(
        "{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    )
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [push for push in pushes if push.rev not in prev_scheduler_stat_revs]

    logger.info(f"{len(pushes)} left to analyze")

    db.append(SHADOW_SCHEDULER_STATS_DB, analyze_shadow_schedulers(pushes))
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
def test_write_read(tmp_path):
    db_path = tmp_path / "prova.json"
    db.register(db_path, "https://alink", 1)

    db.write(db_path, range(1, 8))
    assert list(db.read(db_path)) == [1, 2, 3, 4, 5, 6, 7]
def main():
    description = "Find bug-introducing commits from bug-fixing commits"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("cache_root", help="Cache for repository clones.")
    parser.add_argument(
        "git_repo_url", help="URL to the git repository on which to run SZZ."
    )
    parser.add_argument(
        "git_repo_dir", help="Path where the git repository will be cloned."
    )
    parser.add_argument(
        "tokenized_git_repo_url",
        help="URL to the tokenized git repository on which to run SZZ.",
    )
    parser.add_argument(
        "tokenized_git_repo_dir",
        help="Path where the tokenized git repository will be cloned.",
    )

    args = parser.parse_args()

    regressor_finder = RegressorFinder(
        args.cache_root,
        args.git_repo_url,
        args.git_repo_dir,
        args.tokenized_git_repo_url,
        args.tokenized_git_repo_dir,
    )

    commits_to_ignore = regressor_finder.get_commits_to_ignore()

    bug_fixing_commits = regressor_finder.find_bug_fixing_commits()

    tokenized_done = regressor_finder.find_bug_introducing_commits(
        bug_fixing_commits, commits_to_ignore, True
    )
    evaluate(db.read(TOKENIZED_BUG_INTRODUCING_COMMITS_DB))

    done = regressor_finder.find_bug_introducing_commits(
        bug_fixing_commits, commits_to_ignore, False
    )
    evaluate(db.read(BUG_INTRODUCING_COMMITS_DB))

    with open("done", "w") as f:
        f.write(str(1 if tokenized_done and done else 0))
def retrieve_test_info(self, days: int) -> Dict[str, Any]:
    logger.info("Download previous test info...")
    db.download(TEST_INFOS_DB)

    dates = [
        datetime.utcnow() - timedelta(days=day) for day in reversed(range(days))
    ]

    logger.info("Get previously gathered test info...")
    test_infos = {
        test_info["date"]: test_info for test_info in db.read(TEST_INFOS_DB)
    }

    prev_skips = None
    for date in tqdm(dates):
        date_str = date.strftime("%Y-%m-%d")

        # Gather the latest three days again, as the data might have changed.
        if date_str in test_infos and date < datetime.utcnow() - timedelta(days=3):
            prev_skips = test_infos[date_str]["skips"]
            continue

        test_infos[date_str] = {
            "date": date_str,
            "bugs": [
                {"id": item["bug_id"], "count": item["bug_count"]}
                for item in test_scheduling.get_failure_bugs(date, date)
            ],
            "skips": {},
        }

        try:
            test_info = test_scheduling.get_test_info(date)

            for component in test_info["tests"].keys():
                test_infos[date_str]["skips"][component] = sum(
                    1 for test in test_info["tests"][component] if "skip-if" in test
                )
        except requests.exceptions.HTTPError:
            # If we couldn't find a test info artifact for the given date, assume the
            # number of skip-ifs didn't change from the previous day.
            assert prev_skips is not None
            test_infos[date_str]["skips"] = prev_skips

        prev_skips = test_infos[date_str]["skips"]

    db.write(
        TEST_INFOS_DB,
        (
            test_infos[date.strftime("%Y-%m-%d")]
            for date in dates
            if date.strftime("%Y-%m-%d") in test_infos
        ),
    )
    zstd_compress(TEST_INFOS_DB)

    return test_infos
def get_test_scheduling_history(granularity):
    if granularity == "label":
        test_scheduling_db = TEST_LABEL_SCHEDULING_DB
    elif granularity == "group":
        test_scheduling_db = TEST_GROUP_SCHEDULING_DB
    else:
        raise Exception(f"{granularity} granularity unsupported")

    return db.read(test_scheduling_db)
def get_test_scheduling_history(granularity):
    if granularity == "label":
        test_scheduling_db = TEST_LABEL_SCHEDULING_DB
    elif granularity == "group":
        test_scheduling_db = TEST_GROUP_SCHEDULING_DB
    else:
        raise Exception(f"{granularity} granularity unsupported")

    for obj in db.read(test_scheduling_db):
        yield obj["revs"], obj["data"]
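# A small consumption sketch for the generator version of get_test_scheduling_history
# above, assuming the label-granularity scheduling DB has already been downloaded.
# It relies only on the (revs, data) tuples the function yields; the counting is
# purely illustrative.
num_entries = 0
for revs, data in get_test_scheduling_history("label"):
    num_entries += 1
print(f"{num_entries} past scheduling entries available at label granularity")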
def test_unregistered_db(tmp_path):
    db_path = tmp_path / "prova.json"

    with pytest.raises(AssertionError):
        list(db.read(db_path))

    with pytest.raises(AssertionError):
        db.write(db_path, range(7))

    with pytest.raises(AssertionError):
        db.append(db_path, range(7))
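# A minimal lifecycle sketch of the db module as exercised by the tests above,
# written as another pytest-style test using the tmp_path fixture. The path and
# URL are placeholders; it only combines the register/write/append/delete/read
# calls already shown in the surrounding tests, nothing is downloaded.
def test_db_lifecycle_sketch(tmp_path):
    db_path = tmp_path / "example.json"

    # A database must be registered (with a download URL and a version) before
    # it can be read or written; unregistered paths raise AssertionError.
    db.register(db_path, "https://example.invalid/example.json.zst", 1)

    db.write(db_path, range(1, 4))
    db.append(db_path, range(4, 6))
    db.delete(db_path, lambda x: x == 2)

    assert list(db.read(db_path)) == [1, 3, 4, 5]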
def get_commits(
    include_no_bug: bool = False,
    include_backouts: bool = False,
    include_ignored: bool = False,
) -> Generator[CommitDict, None, None]:
    return filter_commits(
        db.read(COMMITS_DB),
        include_no_bug=include_no_bug,
        include_backouts=include_backouts,
        include_ignored=include_ignored,
    )
def get_labels(self):
    classes = {}

    regressors = set(
        r["bug_introducing_rev"]
        for r in db.read(BUG_INTRODUCING_COMMITS_DB)
        if r["bug_introducing_rev"]
    )

    regressor_bugs = set(
        sum((bug["regressed_by"] for bug in bugzilla.get_bugs()), [])
    )

    for commit_data in repository.get_commits():
        if commit_data["backedoutby"]:
            continue

        if repository.is_wptsync(commit_data):
            continue

        push_date = dateutil.parser.parse(commit_data["pushdate"])

        # Skip commits used for the evaluation phase.
        if push_date > datetime.utcnow() - relativedelta(months=EVALUATION_MONTHS):
            continue

        node = commit_data["node"]
        if node in regressors or commit_data["bug_id"] in regressor_bugs:
            classes[node] = 1
        else:
            # The labels we have are only from two years and six months ago (see the regressor finder script).
            if push_date < datetime.utcnow() - relativedelta(years=2, months=6):
                continue

            # We remove the last 6 months, as there could be regressions which haven't been filed yet.
            if push_date > datetime.utcnow() - relativedelta(months=6):
                continue

            classes[node] = 0

    print(
        "{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )

    print(
        "{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def go(days: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    prev_scheduler_stat_revs = set(
        scheduler_stat["id"] for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    )
    logger.info(
        f"Already gathered statistics for {len(prev_scheduler_stat_revs)} pushes..."
    )

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(days=days)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes = [push for push in pushes if push.rev not in prev_scheduler_stat_revs]

    logger.info(f"{len(pushes)} left to analyze")

    def compress_and_upload() -> None:
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    def results() -> Iterator[dict]:
        for i, push in enumerate(tqdm(pushes)):
            try:
                yield analyze_shadow_schedulers(push)
            except Exception:
                traceback.print_exc()

            # Upload every 42 pushes.
            if (i + 1) % 42 == 0:
                compress_and_upload()

    db.append(SHADOW_SCHEDULER_STATS_DB, results())
    compress_and_upload()
def push_data_iter() -> Iterator[PushResult]:
    return (
        (
            revisions,
            filter_runnables(
                rename_runnables(granularity, push_tasks),
                all_runnables_set,
                granularity,
            ),
            filter_runnables(
                rename_runnables(granularity, possible_regressions),
                all_runnables_set,
                granularity,
            ),
            filter_runnables(
                rename_runnables(granularity, likely_regressions),
                all_runnables_set,
                granularity,
            ),
        )
        for revisions, push_tasks, possible_regressions, likely_regressions in db.read(
            push_data_db
        )
    )
def get_test_scheduling_history():
    return db.read(TEST_SCHEDULING_DB)
def get_bugs(include_invalid: Optional[bool] = False) -> Iterator[BugDict]:
    yield from (
        bug
        for bug in db.read(BUGS_DB)
        if include_invalid or bug["product"] != "Invalid Bugs"
    )
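# An illustrative sketch of the include_invalid switch in get_bugs above,
# assuming BUGS_DB has already been downloaded. It simply counts how many
# bugs are filtered out by the default "Invalid Bugs" product check.
all_bugs = sum(1 for _ in get_bugs(include_invalid=True))
valid_bugs = sum(1 for _ in get_bugs())
print(f"{all_bugs - valid_bugs} bugs belong to the 'Invalid Bugs' product")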
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    if db.is_old_version(db_path) or not db.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    def _init(git_repo_dir):
        thread_local.git = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["rev"]))

        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])

        commit = thread_local.git.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            logger.info(
                "Skipping {} as it is too big".format(bug_fixing_commit["rev"])
            )
            return None

        bug_introducing_modifications = thread_local.git.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )

        logger.info(
            "Found {} for {}".format(
                bug_introducing_modifications, bug_fixing_commit["rev"]
            )
        )

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                try:
                    bug_introducing_commits.append(
                        {
                            "bug_fixing_rev": bug_fixing_commit["rev"],
                            "bug_introducing_rev": git_to_mercurial(
                                bug_introducing_hash
                            ),
                        }
                    )
                except Exception as e:
                    # Skip commits that are in git but not in mercurial, as they are
                    # too old (older than "Free the lizard").
                    if not str(e).startswith("Missing git commit in the VCS map"):
                        raise

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:

        def results():
            num_analyzed = 0

            bug_fixing_commits_queue = bug_fixing_commits.copy()

            # Analyze up to 500 commits at a time, to avoid the task running out of time.
            while len(bug_fixing_commits_queue) != 0 and num_analyzed != 500:
                bug_introducing_commit_futures = []
                for _ in range(
                    min(500 - num_analyzed, len(bug_fixing_commits_queue))
                ):
                    bug_introducing_commit_futures.append(
                        executor.submit(find_bic, bug_fixing_commits_queue.pop())
                    )

                logger.info(
                    f"Analyzing a chunk of {len(bug_introducing_commit_futures)} commits"
                )

                for future in tqdm(
                    concurrent.futures.as_completed(bug_introducing_commit_futures),
                    total=len(bug_introducing_commit_futures),
                ):
                    result = future.result()
                    if result is not None:
                        num_analyzed += 1
                        yield from result

            with open("done", "w") as f:
                f.write(str(1 if len(bug_fixing_commits_queue) == 0 else 0))

        db.append(db_path, results())

    zstd_compress(db_path)
def find_bug_fixing_commits(self):
    logger.info("Downloading commits database...")
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    logger.info("Downloading bugs database...")
    if db.is_old_version(bugzilla.BUGS_DB) or not db.exists(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB, force=True)

    logger.info("Download previous classifications...")
    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not db.exists(
        BUG_FIXING_COMMITS_DB
    ):
        db.download(BUG_FIXING_COMMITS_DB, force=True)

    logger.info("Get previously classified commits...")
    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
    prev_bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits
    )
    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    download_model("defectenhancementtask")
    defect_model = DefectEnhancementTaskModel.load("defectenhancementtaskmodel")

    logger.info("Downloading regression model...")
    download_model("regression")
    regression_model = RegressionModel.load("regressionmodel")

    start_date = datetime.now() - RELATIVE_START_DATE
    end_date = datetime.now() - RELATIVE_END_DATE
    logger.info(
        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
    )
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if commit_date < start_date or commit_date > end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
    )
    assert len(commit_map) > 0

    def get_relevant_bugs():
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    bug_count = sum(1 for bug in get_relevant_bugs())
    logger.info(
        f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
    )

    known_defect_labels = defect_model.get_labels()
    known_regression_labels = regression_model.get_labels()

    bug_fixing_commits = []

    def append_bug_fixing_commits(bug_id, type_):
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        # Ignore bugs which are not linked to the commits we care about.
        if bug["id"] not in commit_map:
            continue

        # If we know the label already, we don't need to apply the model.
        if (
            bug["id"] in known_regression_labels
            and known_regression_labels[bug["id"]] == 1
        ):
            append_bug_fixing_commits(bug["id"], "r")
            continue

        if bug["id"] in known_defect_labels:
            if known_defect_labels[bug["id"]] == "defect":
                append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")
            continue

        if defect_model.classify(bug)[0] == "defect":
            if regression_model.classify(bug)[0] == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")
        else:
            append_bug_fixing_commits(bug["id"], "e")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    zstd_compress(BUG_FIXING_COMMITS_DB)

    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits

    return [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["type"] in ["r", "d"]
    ]
def go(months: int) -> None:
    logger.info("Download previous shadow scheduler statistics...")
    db.download(SHADOW_SCHEDULER_STATS_DB)

    logger.info("Get previously gathered statistics...")
    scheduler_stats = {
        scheduler_stat["id"]: scheduler_stat
        for scheduler_stat in db.read(SHADOW_SCHEDULER_STATS_DB)
    }
    logger.info(f"Already gathered statistics for {len(scheduler_stats)} pushes...")

    to_date = datetime.utcnow() - relativedelta(days=3)
    from_date = to_date - relativedelta(months=months)
    pushes = mozci.push.make_push_objects(
        from_date=from_date.strftime("%Y-%m-%d"),
        to_date=to_date.strftime("%Y-%m-%d"),
        branch="autoland",
    )

    pushes_to_analyze = [push for push in pushes if push.rev not in scheduler_stats]

    logger.info(f"{len(pushes_to_analyze)} left to analyze")

    def compress_and_upload() -> None:
        db.write(
            SHADOW_SCHEDULER_STATS_DB,
            (
                scheduler_stats[push.rev]
                for push in pushes
                if push.rev in scheduler_stats
            ),
        )
        utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
        db.upload(SHADOW_SCHEDULER_STATS_DB)

    assert db.download(test_scheduling.PUSH_DATA_GROUP_DB)

    group_regressions = {}
    for revisions, _, _, possible_regressions, likely_regressions in db.read(
        test_scheduling.PUSH_DATA_GROUP_DB
    ):
        group_regressions[revisions[0]] = set(likely_regressions)

    assert db.download(test_scheduling.PUSH_DATA_CONFIG_GROUP_DB)

    config_group_regressions = {}
    for (
        revisions,
        _,
        _,
        possible_regressions,
        likely_regressions,
    ) in db.read(test_scheduling.PUSH_DATA_CONFIG_GROUP_DB):
        config_group_regressions[revisions[0]] = set(
            tuple(r) for r in likely_regressions
        )

    start_time = time.monotonic()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_push = {
            executor.submit(
                analyze_shadow_schedulers,
                group_regressions[push.rev]
                if push.rev in group_regressions
                else None,
                config_group_regressions[push.rev]
                if push.rev in config_group_regressions
                else None,
                push,
            ): push
            for push in pushes_to_analyze
            if push.rev in group_regressions or push.rev in config_group_regressions
        }

        try:
            for future in tqdm(
                concurrent.futures.as_completed(future_to_push),
                total=len(future_to_push),
            ):
                push = future_to_push[future]

                try:
                    scheduler_stats[push.rev] = future.result()
                except Exception:
                    traceback.print_exc()

                # Upload every 10 minutes.
                if time.monotonic() - start_time >= 600:
                    compress_and_upload()
                    start_time = time.monotonic()
        except Exception:
            for f in future_to_push.keys():
                f.cancel()
            raise

    compress_and_upload()
def get_issues() -> Iterator[IssueDict]:
    yield from db.read(GITHUB_ISSUES_DB)
def get_bugs():
    return db.read(BUGS_DB)
def get_bugs(include_invalid=False):
    yield from (
        bug
        for bug in db.read(BUGS_DB)
        if include_invalid or bug["product"] != "Invalid Bugs"
    )
def get_commits():
    return db.read(COMMITS_DB)
def go(self) -> None:
    logger.info("Generate map of bug ID -> bug data for all bugs which were defects")
    bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))

    bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"]
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["type"] in ("d", "r")
    )

    logger.info(f"{len(bug_fixing_commits_nodes)} bug-fixing commits to analyze")

    all_bug_ids = set(commit["bug_id"] for commit in repository.get_commits())

    bug_map = {
        bug["id"]: bug for bug in bugzilla.get_bugs() if bug["id"] in all_bug_ids
    }

    logger.info(
        "Generate a map from files/functions to the bugs which were fixed/introduced by touching them"
    )

    # TODO: Support "moving" past bugs between files when they are renamed and between
    # functions when they are moved across files.

    past_regressions_by_file: Dict[str, List[int]] = defaultdict(list)
    past_fixed_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
    past_regression_blocked_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
    past_fixed_bug_blocked_bugs_by_file: Dict[str, List[int]] = defaultdict(list)
    past_regressions_by_function: Dict[str, Dict[str, List[int]]] = defaultdict(
        lambda: defaultdict(list)
    )
    past_fixed_bugs_by_function: Dict[str, Dict[str, List[int]]] = defaultdict(
        lambda: defaultdict(list)
    )
    past_regression_blocked_bugs_by_function: Dict[
        str, Dict[str, List[int]]
    ] = defaultdict(lambda: defaultdict(list))
    past_fixed_bug_blocked_bugs_by_function: Dict[
        str, Dict[str, List[int]]
    ] = defaultdict(lambda: defaultdict(list))

    for commit in tqdm(repository.get_commits()):
        if commit["bug_id"] not in bug_map:
            continue

        bug = bug_map[commit["bug_id"]]

        if len(bug["regressions"]) > 0:
            for path in commit["files"]:
                past_regressions_by_file[path].extend(
                    bug_id for bug_id in bug["regressions"] if bug_id in bug_map
                )

                past_regression_blocked_bugs_by_file[path].extend(
                    bugzilla.find_blocked_by(bug_map, bug)
                )

            for path, f_group in commit["functions"].items():
                for f in f_group:
                    past_regressions_by_function[path][f[0]].extend(
                        bug_id for bug_id in bug["regressions"] if bug_id in bug_map
                    )

                    past_regression_blocked_bugs_by_function[path][f[0]].extend(
                        bugzilla.find_blocked_by(bug_map, bug)
                    )

        if commit["node"] in bug_fixing_commits_nodes:
            for path in commit["files"]:
                past_fixed_bugs_by_file[path].append(bug["id"])

                past_fixed_bug_blocked_bugs_by_file[path].extend(
                    bugzilla.find_blocked_by(bug_map, bug)
                )

            for path, f_group in commit["functions"].items():
                for f in f_group:
                    past_fixed_bugs_by_function[path][f[0]].append(bug["id"])

                    past_fixed_bug_blocked_bugs_by_function[path][f[0]].extend(
                        bugzilla.find_blocked_by(bug_map, bug)
                    )

    def _transform(bug_ids: List[int]) -> List[dict]:
        seen = set()
        results = []
        for bug_id in bug_ids:
            if bug_id in seen:
                continue
            seen.add(bug_id)

            bug = bug_map[bug_id]
            results.append(
                {
                    "id": bug_id,
                    "summary": bug["summary"],
                    "component": "{}::{}".format(bug["product"], bug["component"]),
                }
            )

        return results

    past_regression_summaries_by_file = {
        path: _transform(bug_ids)
        for path, bug_ids in past_regressions_by_file.items()
    }

    past_fixed_bug_summaries_by_file = {
        path: _transform(bug_ids) for path, bug_ids in past_fixed_bugs_by_file.items()
    }

    past_regression_blocked_bug_summaries_by_file = {
        path: _transform(bug_ids)
        for path, bug_ids in past_regression_blocked_bugs_by_file.items()
    }

    past_fixed_bug_blocked_bug_summaries_by_file = {
        path: _transform(bug_ids)
        for path, bug_ids in past_fixed_bug_blocked_bugs_by_file.items()
    }

    past_regression_summaries_by_function = {
        path: {func: _transform(bug_ids) for func, bug_ids in funcs_bugs.items()}
        for path, funcs_bugs in past_regressions_by_function.items()
    }

    past_fixed_bug_summaries_by_function = {
        path: {func: _transform(bug_ids) for func, bug_ids in funcs_bugs.items()}
        for path, funcs_bugs in past_fixed_bugs_by_function.items()
    }

    past_regression_blocked_bug_summaries_by_function = {
        path: {func: _transform(bug_ids) for func, bug_ids in funcs_bugs.items()}
        for path, funcs_bugs in past_regression_blocked_bugs_by_function.items()
    }

    past_fixed_bug_blocked_bug_summaries_by_function = {
        path: {func: _transform(bug_ids) for func, bug_ids in funcs_bugs.items()}
        for path, funcs_bugs in past_fixed_bug_blocked_bugs_by_function.items()
    }

    with open("data/past_regressions_by_file.json", "w") as f:
        json.dump(past_regression_summaries_by_file, f)
    zstd_compress("data/past_regressions_by_file.json")

    with open("data/past_fixed_bugs_by_file.json", "w") as f:
        json.dump(past_fixed_bug_summaries_by_file, f)
    zstd_compress("data/past_fixed_bugs_by_file.json")

    with open("data/past_regression_blocked_bugs_by_file.json", "w") as f:
        json.dump(past_regression_blocked_bug_summaries_by_file, f)
    zstd_compress("data/past_regression_blocked_bugs_by_file.json")

    with open("data/past_fixed_bug_blocked_bugs_by_file.json", "w") as f:
        json.dump(past_fixed_bug_blocked_bug_summaries_by_file, f)
    zstd_compress("data/past_fixed_bug_blocked_bugs_by_file.json")

    with open("data/past_regressions_by_function.json", "w") as f:
        json.dump(past_regression_summaries_by_function, f)
    zstd_compress("data/past_regressions_by_function.json")

    with open("data/past_fixed_bugs_by_function.json", "w") as f:
        json.dump(past_fixed_bug_summaries_by_function, f)
    zstd_compress("data/past_fixed_bugs_by_function.json")

    with open("data/past_regression_blocked_bugs_by_function.json", "w") as f:
        json.dump(past_regression_blocked_bug_summaries_by_function, f)
    zstd_compress("data/past_regression_blocked_bugs_by_function.json")

    with open("data/past_fixed_bug_blocked_bugs_by_function.json", "w") as f:
        json.dump(past_fixed_bug_blocked_bug_summaries_by_function, f)
    zstd_compress("data/past_fixed_bug_blocked_bugs_by_function.json")