def get_commits_to_ignore(self): logger.info("Download previous commits to ignore...") db.download(IGNORED_COMMITS_DB) logger.info("Get previously classified commits...") prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB)) logger.info( f"Already found {len(prev_commits_to_ignore)} commits to ignore..." ) # When we already have some analyzed commits, re-analyze the last 3500 to make sure # we didn't miss back-outs that happened since the last analysis. if len(prev_commits_to_ignore) > 0: first_commit_to_reanalyze = ( -3500 if len(prev_commits_to_ignore) >= 3500 else 0) rev_start = "children({})".format( prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]) else: rev_start = 0 with hglib.open(self.mercurial_repo_dir) as hg: revs = repository.get_revs(hg, rev_start) commits = repository.hg_log_multi(self.mercurial_repo_dir, revs) with hglib.open(self.mercurial_repo_dir) as hg: repository.set_commits_to_ignore(hg, self.mercurial_repo_dir, commits) for commit in commits: commit.ignored |= commit.author_email == "*****@*****.**" chosen_commits = set() commits_to_ignore = [] for commit in commits: if commit.ignored or commit.backedoutby: commits_to_ignore.append({ "rev": commit.node, "type": "backedout" if commit.backedoutby else "", }) chosen_commits.add(commit.node) logger.info(f"{len(commits_to_ignore)} new commits to ignore...") for prev_commit in prev_commits_to_ignore[::-1]: if prev_commit["rev"] not in chosen_commits: commits_to_ignore.append(prev_commit) chosen_commits.add(prev_commit["rev"]) logger.info(f"{len(commits_to_ignore)} commits to ignore...") logger.info("...of which {} are backed-out".format( sum(1 for commit in commits_to_ignore if commit["type"] == "backedout"))) db.write(IGNORED_COMMITS_DB, commits_to_ignore) zstd_compress(IGNORED_COMMITS_DB) db.upload(IGNORED_COMMITS_DB)
def upload_adr_cache(self):
    cache_path = os.path.splitext(ADR_CACHE_DB)[0]
    assert os.path.abspath(
        adr.config["cache"]["stores"]["file"]["path"]
    ) == os.path.abspath(cache_path)

    create_tar_zst(f"{ADR_CACHE_DB}.zst")

    db.upload(ADR_CACHE_DB)
def get_commits_to_ignore(self) -> None:
    assert db.download(repository.COMMITS_DB)

    ignored = set()
    commits_to_ignore = []
    all_commits = set()

    annotate_ignore_nodes = {
        node
        for node, label in labels.get_labels("annotateignore")
        if label == "1"
    }

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        all_commits.add(commit["node"][:12])

        if (
            commit["ignored"]
            or commit["backedoutby"]
            or not commit["bug_id"]
            or len(commit["backsout"]) > 0
            or repository.is_wptsync(commit)
            or commit["node"] in annotate_ignore_nodes
        ):
            commits_to_ignore.append(
                {
                    "rev": commit["node"],
                    "type": "backedout" if commit["backedoutby"] else "",
                }
            )
            ignored.add(commit["node"][:12])

        if len(commit["backsout"]) > 0:
            for backedout in commit["backsout"]:
                if backedout[:12] in ignored:
                    continue
                ignored.add(backedout[:12])

                commits_to_ignore.append({"rev": backedout, "type": "backedout"})

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    # Skip backed-out commits which aren't in the repository (commits which landed
    # *before* the Mercurial history started, and backouts which mentioned a bad hash
    # in their message).
    commits_to_ignore = [c for c in commits_to_ignore if c["rev"][:12] in all_commits]

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
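# A minimal consumer sketch (an assumption about downstream use, not code from the
# project), relying only on the record shape written above: each IGNORED_COMMITS_DB
# entry is {"rev": <commit node>, "type": "backedout" or ""}, so separating backed-out
# commits from the other ignored ones is a single pass over db.read().
backed_out_revs = set()
other_ignored_revs = set()
for entry in db.read(IGNORED_COMMITS_DB):
    if entry["type"] == "backedout":
        backed_out_revs.add(entry["rev"])
    else:
        other_ignored_revs.add(entry["rev"])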
def upload_adr_cache():
    cache_path = os.path.splitext(ADR_CACHE_DB)[0]
    assert os.path.abspath(
        adr.config["cache"]["stores"]["file"]["path"]
    ) == os.path.abspath(cache_path)

    with open_tar_zst(f"{ADR_CACHE_DB}.zst") as tar:
        tar.add(cache_path)

    db.upload(ADR_CACHE_DB)
def compress_and_upload() -> None:
    db.write(
        SHADOW_SCHEDULER_STATS_DB,
        (
            scheduler_stats[push.rev]
            for push in pushes
            if push.rev in scheduler_stats
        ),
    )
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
    db.upload(SHADOW_SCHEDULER_STATS_DB)
def compress_and_upload():
    zstd_compress(db_path)
    db.upload(db_path)
def find_bug_fixing_commits(self):
    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Download previous classifications...")
    db.download(BUG_FIXING_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
    prev_bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits
    )
    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    defect_model = download_and_load_model("defectenhancementtask")

    logger.info("Downloading regression model...")
    regression_model = download_and_load_model("regression")

    start_date = datetime.now() - RELATIVE_START_DATE
    end_date = datetime.now() - RELATIVE_END_DATE
    logger.info(
        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
    )
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if commit_date < start_date or commit_date > end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
    )
    assert len(commit_map) > 0

    def get_relevant_bugs():
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    bug_count = sum(1 for bug in get_relevant_bugs())
    logger.info(
        f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
    )

    known_defect_labels = defect_model.get_labels()
    known_regression_labels = regression_model.get_labels()

    bug_fixing_commits = []

    def append_bug_fixing_commits(bug_id, type_):
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        # Ignore bugs which are not linked to the commits we care about.
        if bug["id"] not in commit_map:
            continue

        # If we know the label already, we don't need to apply the model.
        if (
            bug["id"] in known_regression_labels
            and known_regression_labels[bug["id"]] == 1
        ):
            append_bug_fixing_commits(bug["id"], "r")
            continue

        if bug["id"] in known_defect_labels:
            if known_defect_labels[bug["id"]] == "defect":
                append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")
            continue

        if defect_model.classify(bug)[0] == "defect":
            if regression_model.classify(bug)[0] == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")
        else:
            append_bug_fixing_commits(bug["id"], "e")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    zstd_compress(BUG_FIXING_COMMITS_DB)
    db.upload(BUG_FIXING_COMMITS_DB)
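# A minimal sketch of reading the classifications written above (an assumption about
# downstream use, not code from the project): each BUG_FIXING_COMMITS_DB entry is
# {"rev": <commit node>, "type": "r" | "d" | "e"}, where "r" marks a commit fixing a
# regression, "d" a commit fixing another defect, and "e" an enhancement/task fix.
regression_fixing_revs = {
    entry["rev"] for entry in db.read(BUG_FIXING_COMMITS_DB) if entry["type"] == "r"
}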
def compress_and_upload() -> None:
    utils.zstd_compress(SHADOW_SCHEDULER_STATS_DB)
    db.upload(SHADOW_SCHEDULER_STATS_DB)
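# A hedged sketch of the store/compress/upload pattern the compress_and_upload helpers
# share: write the records, compress the database file with zstd, then upload it.
# "some_db_path" and "records" are hypothetical names; db.write, zstd_compress and
# db.upload are the same helpers used above.
def store_compress_upload(some_db_path, records):
    db.write(some_db_path, records)  # persist the records to the local database file
    zstd_compress(some_db_path)      # create the zstd-compressed artifact
    db.upload(some_db_path)          # push the artifact to remote storage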
def get_commits_to_ignore(self):
    logger.info("Download previous commits to ignore...")
    db.download(IGNORED_COMMITS_DB)

    logger.info("Get previously classified commits...")
    prev_commits_to_ignore = list(db.read(IGNORED_COMMITS_DB))
    logger.info(f"Already found {len(prev_commits_to_ignore)} commits to ignore...")

    # When we already have some analyzed commits, re-analyze the last 3500 to make sure
    # we didn't miss back-outs that happened since the last analysis.
    if len(prev_commits_to_ignore) > 0:
        first_commit_to_reanalyze = (
            -3500 if len(prev_commits_to_ignore) >= 3500 else 0
        )
        rev_start = "children({})".format(
            prev_commits_to_ignore[first_commit_to_reanalyze]["rev"]
        )
    else:
        rev_start = 0

    with hglib.open(self.mercurial_repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    # Drop commits which are not yet present in the mercurial <-> git mapping.
    while len(revs) > 0:
        try:
            vcs_map.mercurial_to_git(revs[-1].decode("ascii"))
            break
        except Exception as e:
            if not str(e).startswith("Missing mercurial commit in the VCS map"):
                raise

            revs.pop()

    commits = repository.hg_log_multi(self.mercurial_repo_dir, revs)

    repository.set_commits_to_ignore(self.mercurial_repo_dir, commits)

    chosen_commits = set()
    commits_to_ignore = []
    for commit in commits:
        if commit.ignored or commit.backedoutby:
            commits_to_ignore.append(
                {
                    "rev": commit.node,
                    "type": "backedout" if commit.backedoutby else "",
                }
            )
            chosen_commits.add(commit.node)

    logger.info(f"{len(commits_to_ignore)} new commits to ignore...")

    for prev_commit in prev_commits_to_ignore[::-1]:
        if prev_commit["rev"] not in chosen_commits:
            commits_to_ignore.append(prev_commit)
            chosen_commits.add(prev_commit["rev"])

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)