def __init__(self, repo_dir: str) -> None:
    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate through the DB so `commit` ends up holding the last stored commit.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = download_and_load_model("regressor")

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def retrieve_commits(self, limit):
    repository.clone(self.repo_dir)

    if limit:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)

        rev_start = 0
        for commit in repository.get_commits():
            rev_start = f"children({commit['node']})"

    with hglib.open(self.repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    chunk_size = 70000

    for i in range(0, len(revs), chunk_size):
        repository.download_commits(self.repo_dir, revs=revs[i : (i + chunk_size)])

    logger.info("commit data extracted from repository")

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    zstd_compress(repository.COMMITS_DB)
    create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
def retrieve_commits(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    repository.download_commits(self.repo_dir, two_years_and_six_months_ago)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
def __init__(self, repo_dir: str) -> None:
    if not os.path.exists(repo_dir):
        repository.clone(repo_dir)
    else:
        repository.pull(repo_dir, "mozilla-central", "tip")

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def boot_worker():
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get(
        "BUGBUG_REPO_DIR", os.path.join(tempfile.gettempdir(), "bugbug-hg")
    )

    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    assert (
        db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
def update_commit_db(self):
    repository.clone(self.repo_dir, update=True)

    assert db.download(repository.COMMITS_DB, support_files_too=True)

    for commit in repository.get_commits():
        pass

    repository.download_commits(
        self.repo_dir, rev_start="children({})".format(commit["node"])
    )
def classify(
    self,
    revision=None,
    phabricator_deployment=None,
    diff_id=None,
    runnable_jobs_path=None,
):
    if revision is not None:
        assert phabricator_deployment is None
        assert diff_id is None

    if diff_id is not None:
        assert phabricator_deployment is not None
        assert revision is None

    self.update_commit_db()

    if phabricator_deployment is not None and diff_id is not None:
        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, phabricator_deployment, diff_id)

            revision = hg.log(revrange="not public()")[0].node.decode("utf-8")

        commits = repository.download_commits(
            self.repo_dir,
            rev_start=revision,
            save=False,
            use_single_process=self.use_single_process,
        )
    else:
        commits = []

        for commit in repository.get_commits():
            if commit["node"] == revision:
                commits.append(commit)
                break

        # The commit to analyze was not in our DB, let's mine it.
        if len(commits) == 0:
            commits = repository.download_commits(
                self.repo_dir,
                revs=[revision],
                save=False,
                use_single_process=self.use_single_process,
            )

    assert len(commits) > 0, "There are no commits to analyze"

    if not self.use_test_history:
        self.classify_regressor(commits)
    else:
        self.classify_test_select(commits, runnable_jobs_path)
def boot_worker():
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")

    assert (
        db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_GROUP_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    assert (
        db.download_support_file(
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        )
        or ALLOW_MISSING_MODELS
    )

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])

        logger.info("Updating commits DB...")
        repository.download_commits(REPO_DIR, rev_start)

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
def update_commit_db(self):
    repository.clone(self.repo_dir)

    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    for commit in repository.get_commits():
        pass

    rev_start = "children({})".format(commit["node"])

    repository.download_commits(self.repo_dir, rev_start)
def retrieve_commits(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    try:
        os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
    except FileNotFoundError:
        logger.info("pushlog database doesn't exist")

    # Pull and update, to make sure the pushlog is generated.
    hg = hglib.open(self.repo_dir)
    hg.pull(update=True)
    hg.close()

    db.download_version(repository.COMMITS_DB)
    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
    )

    # We use "clean" (or "dirty") commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" (or "dirty") commits.

    if not self.use_test_history:
        probs, importance = self.model.classify(
            commits[-1],
            probabilities=True,
            importances=True,
            background_dataset=lambda v: self.X[self.y != v],
            importance_cutoff=0.05,
        )

        self.generate_feature_importance_data(probs, importance)

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        if self.model_name == "regressor" and self.method_defect_predictor_dir:
            self.classify_methods()
    else:
        # TODO: Should we consider a merge of the commits of the stack?
        commit = commits[-1]

        push_num = self.past_failures_data["push_num"]

        # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
        # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
        all_tasks = self.past_failures_data["all_tasks"]

        selected_tasks = []
        # TODO: Classify multiple commit/test at the same time.
        for data in test_scheduling.generate_data(
            self.past_failures_data, commit, push_num, all_tasks, [], []
        ):
            if not data["name"].startswith("test-"):
                continue

            commit["test_job"] = data

            probs = self.model.classify(commit, probabilities=True)

            if probs[0][1] > 0.9:
                selected_tasks.append(data["name"])

        with open("selected_tasks", "w") as f:
            f.writelines(f"{selected_task}\n" for selected_task in selected_tasks)
def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), ret=True, save=False
    )

    probs, importance = self.model.classify(
        commits[-1], probabilities=True, importances=True
    )

    feature_names = self.model.get_human_readable_feature_names()

    features = []
    for i, (val, feature_index, is_positive) in enumerate(importance["importances"]):
        features.append(
            [
                i + 1,
                feature_names[int(feature_index)],
                f'({"+" if (is_positive) else "-"}{val})',
            ]
        )

    with open("probs.json", "w") as f:
        json.dump(probs[0].tolist(), f)

    with open("importance.html", "w") as f:
        f.write(importance["html"])
def get_bugs(self, date="today", bug_ids=[]):
    self.query_url = ""

    # Ignore already analyzed commits.
    for commit in repository.get_commits():
        pass

    rev_start = f"children({commit['node']})"

    commits = repository.download_commits(self.repo_dir, rev_start, ret=True)

    commits = [commit for commit in commits if not commit["ever_backedout"]]

    probs = self.model.classify(commits, True)
    indexes = probs.argmax(axis=-1)

    result = {}
    for commit, prob, index in zip(commits, probs, indexes):
        result[commit["node"]] = {
            "id": commit["node"],
            "summary": commit["desc"].split("\n", 1)[0],
            "result": "Risky" if prob[1] > 0.5 else "Not risky",
            "confidence": nice_round(prob[index]),
        }

    return result
def classify(
    self,
    revision=None,
    phabricator_deployment=None,
    diff_id=None,
    runnable_jobs_path=None,
):
    if revision is not None:
        assert phabricator_deployment is None
        assert diff_id is None

    if diff_id is not None:
        assert phabricator_deployment is not None
        assert revision is None

    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        if phabricator_deployment is not None and diff_id is not None:
            self.apply_phab(hg, phabricator_deployment, diff_id)

            revision = hg.log(revrange="not public()")[0].node.decode("utf-8")

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=revision, save=False
    )

    if not self.use_test_history:
        self.classify_regressor(commits)
    else:
        self.classify_test_select(commits, runnable_jobs_path)
def retrieve_commits(self, limit):
    repository.clone(self.repo_dir)

    if limit:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)

        rev_start = 0
        for commit in repository.get_commits():
            rev_start = f"children({commit['node']})"

    repository.download_commits(self.repo_dir, rev_start=rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress(repository.COMMITS_DB)
    create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
def retrieve_commits(self, limit):
    repository.clone(self.repo_dir)

    if limit:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)

        rev_start = 0
        for commit in repository.get_commits():
            rev_start = f"children({commit['node']})"

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")
    zstd_compress("data/commit_experiences.pickle")
def retrieve_commits(self):
    repository.clone(self.repo_dir)

    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def retrieve_commits(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    try:
        os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
    except FileNotFoundError:
        logger.info("pushlog database doesn't exist")

    # Pull and update, to make sure the pushlog is generated.
    hg = hglib.open(self.repo_dir)
    hg.pull(update=True)
    hg.close()

    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    repository.download_commits(self.repo_dir, two_years_and_six_months_ago)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
def retrieve_commits(self, limit):
    repository.clone(self.repo_dir)

    if not db.is_old_version(repository.COMMITS_DB) and not limit:
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        if limit:
            # Mercurial revset supports negative integers starting from tip
            rev_start = -1 * limit
        else:
            rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    zstd_compress("data/commits.json")
    zstd_compress("data/commit_experiences.pickle")
def schedule_tests(branch, rev):
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.debug(f"Processing {job}")

    # Load the full stack of patches leading to that revision
    try:
        stack = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    # Apply the stack on the local repository
    try:
        revs = repository.apply_stack(REPO_DIR, stack, branch)
    except Exception as e:
        LOGGER.warning(f"Failed to apply stack {branch} @ {rev}: {e}")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.3)
    )

    # Analyze patches.
    commits = repository.download_commits(
        REPO_DIR, revs=revs, save=False, use_single_process=True
    )

    tasks = MODEL_CACHE.get("testlabelselect").select_tests(
        commits, test_selection_threshold
    )

    reduced = MODEL_CACHE.get("testlabelselect").reduce(
        set(t for t, c in tasks.items() if c >= 0.7), 1.0
    )

    data = {
        "tasks": tasks,
        "groups": MODEL_CACHE.get("testgroupselect").select_tests(
            commits, test_selection_threshold
        ),
        "reduced_tasks": {t: c for t, c in tasks.items() if t in reduced},
    }
    setkey(job.result_key, orjson.dumps(data))

    return "OK"
def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
    )

    probs, importance = self.model.classify(
        commits[-1],
        probabilities=True,
        importances=True,
        background_dataset=self.background_dataset,
    )

    features = []
    for i, (val, feature_index, is_positive) in enumerate(
        importance["importances"]["classes"][1][0]
    ):
        features.append(
            [
                i + 1,
                importance["feature_legend"][str(i + 1)],
                f'{"+" if (is_positive) else "-"}{val}',
            ]
        )

    with open("probs.json", "w") as f:
        json.dump(probs[0].tolist(), f)

    with open("importances.json", "w") as f:
        json.dump(features, f)

    with open("importance.html", "w") as f:
        f.write(importance["html"])
def test_download_commits(fake_hg_repo):
    hg, local, remote = fake_hg_repo

    responses.add(
        responses.HEAD,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        status=200,
        headers={"ETag": "123"},
    )

    responses.add(
        responses.GET,
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        status=200,
        json={
            "file1": ["Firefox", "Menus"],
            "file2": ["Firefox", "General"],
            "file3": ["Core", "General"],
        },
    )

    # Remove the mock DB generated by the mock_data fixture.
    os.remove("data/commits.json")

    with open(os.path.join(local, ".hg-annotate-ignore-revs"), "w") as f:
        f.write("not_existing_hash\n")

    add_file(hg, local, "file1", "1\n2\n3\n4\n5\n6\n7\n")
    commit(hg, date=datetime(1991, 4, 16, tzinfo=timezone.utc))
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local)
    assert len(commits) == 0
    commits = list(repository.get_commits())
    assert len(commits) == 0

    # Wait one second, to have a different pushdate.
    time.sleep(1)
    add_file(hg, local, "file2", "1\n2\n3\n4\n5\n6\n7\n")
    revision2 = commit(hg, "Bug 123 - Prova. r=moz,rev2")
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local)
    assert len(commits) == 1
    commits = list(repository.get_commits())
    assert len(commits) == 1
    assert commits[0]["node"] == revision2
    assert commits[0]["touched_prev_total_author_sum"] == 0
    assert commits[0]["seniority_author"] > 0

    # Wait one second, to have a different pushdate.
    time.sleep(1)
    add_file(hg, local, "file3", "1\n2\n3\n4\n5\n6\n7\n")
    revision3 = commit(hg, "Bug 456 - Prova. r=moz")
    hg.push(dest=bytes(remote, "ascii"))
    copy_pushlog_database(remote, local)

    commits = repository.download_commits(local, revision3)
    assert len(commits) == 1
    commits = list(repository.get_commits())
    assert len(commits) == 2
    assert commits[0]["node"] == revision2
    assert commits[0]["touched_prev_total_author_sum"] == 0
    assert commits[0]["seniority_author"] > 0
    assert commits[1]["node"] == revision3
    assert commits[1]["touched_prev_total_author_sum"] == 1
    assert commits[1]["seniority_author"] > commits[0]["seniority_author"]

    os.remove("data/commits.json")
    os.remove("data/commit_experiences.pickle")
    commits = repository.download_commits(local, f"children({revision2})")
    assert len(commits) == 1
    assert len(list(repository.get_commits())) == 1

    os.remove("data/commits.json")
    os.remove("data/commit_experiences.pickle")
    commits = repository.download_commits(local)
    assert len(list(repository.get_commits())) == 2
def schedule_tests(branch: str, rev: str) -> str:
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.info(f"Processing {job}...")

    # Pull the revision to the local repository
    LOGGER.info("Pulling commits from the remote repository...")
    repository.pull(REPO_DIR, branch, rev)

    # Load the full stack of patches leading to that revision
    LOGGER.info("Loading commits to analyze using automationrelevance...")
    try:
        revs = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.5)
    )

    # Analyze patches.
    commits = repository.download_commits(
        REPO_DIR, revs=revs, save=False, use_single_process=True, include_no_bug=True
    )

    if len(commits) > 0:
        testlabelselect_model = MODEL_CACHE.get("testlabelselect")
        testgroupselect_model = MODEL_CACHE.get("testgroupselect")

        tasks = testlabelselect_model.select_tests(commits, test_selection_threshold)

        reduced = testlabelselect_model.reduce(
            set(t for t, c in tasks.items() if c >= 0.8), 1.0
        )

        reduced_higher = testlabelselect_model.reduce(
            set(t for t, c in tasks.items() if c >= 0.9), 1.0
        )

        groups = testgroupselect_model.select_tests(commits, test_selection_threshold)

        config_groups = testgroupselect_model.select_configs(groups.keys(), 1.0)
    else:
        tasks = {}
        reduced = {}
        groups = {}
        config_groups = {}

    data = {
        "tasks": tasks,
        "groups": groups,
        "config_groups": config_groups,
        "reduced_tasks": {t: c for t, c in tasks.items() if t in reduced},
        "reduced_tasks_higher": {t: c for t, c in tasks.items() if t in reduced_higher},
        "known_tasks": get_known_tasks(),
    }
    setkey(job.result_key, orjson.dumps(data), compress=True)

    return "OK"
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
def boot_worker():
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB)
            )
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
            )
            logger.info("Failing together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB)
            )
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together():
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB)
            )
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits():
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            logger.info("Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences():
        try:
            utils.extract_file(
                os.path.join("data", repository.COMMIT_EXPERIENCES_DB)
            )
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )
            assert ALLOW_MISSING_MODELS

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks():
        # Store in a file the list of tasks in the latest autoland push.
        r = requests.get(
            "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.latest.taskgraph.decision/artifacts/public/target-tasks.json"
        )
        r.raise_for_status()
        with open("known_tasks", "w") as f:
            f.write("\n".join(r.json()))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        retrieve_schedulable_tasks_future = executor.submit(retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            rev_start = "children({})".format(commit["node"])
            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, rev_start, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")
def schedule_tests(branch, rev):
    from bugbug_http.app import JobInfo
    from bugbug_http import REPO_DIR

    job = JobInfo(schedule_tests, branch, rev)
    LOGGER.debug(f"Processing {job}")

    # Load the full stack of patches leading to that revision
    try:
        stack = get_hgmo_stack(branch, rev)
    except requests.exceptions.RequestException:
        LOGGER.warning(f"Push not found for {branch} @ {rev}!")
        return "NOK"

    # Apply the stack on the local repository
    try:
        revs = repository.apply_stack(REPO_DIR, stack, branch)
    except Exception as e:
        LOGGER.warning(f"Failed to apply stack {branch} @ {rev}: {e}")
        return "NOK"

    test_selection_threshold = float(
        os.environ.get("TEST_SELECTION_CONFIDENCE_THRESHOLD", 0.3)
    )

    # Analyze patches.
    commits = repository.download_commits(
        REPO_DIR, revs=revs, save=False, use_single_process=True
    )

    commit_data = commit_features.merge_commits(commits)

    def get_runnables(granularity):
        past_failures_data = test_scheduling.get_past_failures(granularity)

        push_num = past_failures_data["push_num"]
        all_runnables = past_failures_data["all_runnables"]

        commit_tests = []
        for data in test_scheduling.generate_data(
            past_failures_data, commit_data, push_num, all_runnables, [], []
        ):
            if granularity == "label" and not data["name"].startswith("test-"):
                continue

            commit_test = commit_data.copy()
            commit_test["test_job"] = data
            commit_tests.append(commit_test)

        probs = MODEL_CACHE.get(f"test{granularity}select").classify(
            commit_tests, probabilities=True
        )
        selected_indexes = np.argwhere(probs[:, 1] > test_selection_threshold)[:, 0]
        return {
            commit_tests[i]["test_job"]["name"]: math.floor(probs[i, 1] * 100) / 100
            for i in selected_indexes
        }

    data = {
        "tasks": get_runnables("label"),
        "groups": get_runnables("group"),
    }
    setkey(job.result_key, orjson.dumps(data))

    return "OK"
def boot_worker():
    # Clone autoland
    def clone_autoland():
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_LABEL_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        logger.info("Label-level past failures DB extracted.")

    def extract_past_failures_group():
        try:
            utils.extract_file(test_scheduling.PAST_FAILURES_GROUP_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        logger.info("Group-level past failures DB extracted.")

    def extract_touched_together():
        try:
            utils.extract_file(test_scheduling.TOUCHED_TOGETHER_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        logger.info("Touched together DB extracted.")

    def extract_commits():
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            return False

        logger.info("Commits DB extracted.")
        return True

    def extract_commit_experiences():
        try:
            utils.extract_file(repository.COMMIT_EXPERIENCES_DB)
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
        logger.info("Commit experiences DB extracted.")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            for commit in repository.get_commits():
                pass
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            rev_start = "children({})".format(commit["node"])
            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, rev_start, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

    logger.info("Worker boot done")
def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
    )

    # We use "clean" (or "dirty") commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" (or "dirty") commits.

    probs, importance = self.model.classify(
        commits[-1],
        probabilities=True,
        importances=True,
        background_dataset=lambda v: self.X[self.y != v],
        importance_cutoff=0.05,
    )

    pred_class = self.model.le.inverse_transform([probs[0].argmax()])[0]

    features = []
    for i, (val, feature_index, is_positive) in enumerate(
        importance["importances"]["classes"][pred_class][0]
    ):
        value = importance["importances"]["values"][0, int(feature_index)]

        X = self.X[:, int(feature_index)]
        y = self.y[X != 0]
        X = X[X != 0]
        spearman = spearmanr(X, y)

        buggy_X = X[y == 1]
        clean_X = X[y == 0]
        median = np.median(X)
        median_clean = np.median(clean_X)
        median_buggy = np.median(buggy_X)

        perc_buggy_values_higher_than_median = (
            buggy_X >= median
        ).sum() / buggy_X.shape[0]
        perc_buggy_values_lower_than_median = (
            buggy_X < median
        ).sum() / buggy_X.shape[0]
        perc_clean_values_higher_than_median = (
            clean_X > median
        ).sum() / clean_X.shape[0]
        perc_clean_values_lower_than_median = (
            clean_X <= median
        ).sum() / clean_X.shape[0]

        logger.info("Feature: {}".format(importance["feature_legend"][str(i + 1)]))
        logger.info("Shap value: {}{}".format("+" if (is_positive) else "-", val))
        logger.info(f"spearman: {spearman}")
        logger.info(f"value: {value}")
        logger.info(f"overall mean: {np.mean(X)}")
        logger.info(f"overall median: {np.median(X)}")
        logger.info(f"mean for y == 0: {np.mean(clean_X)}")
        logger.info(f"mean for y == 1: {np.mean(buggy_X)}")
        logger.info(f"median for y == 0: {np.median(clean_X)}")
        logger.info(f"median for y == 1: {np.median(buggy_X)}")
        logger.info(
            f"perc_buggy_values_higher_than_median: {perc_buggy_values_higher_than_median}"
        )
        logger.info(
            f"perc_buggy_values_lower_than_median: {perc_buggy_values_lower_than_median}"
        )
        logger.info(
            f"perc_clean_values_higher_than_median: {perc_clean_values_higher_than_median}"
        )
        logger.info(
            f"perc_clean_values_lower_than_median: {perc_clean_values_lower_than_median}"
        )

        features.append(
            {
                "index": i + 1,
                "name": importance["feature_legend"][str(i + 1)],
                "shap": float(f'{"+" if (is_positive) else "-"}{val}'),
                "value": importance["importances"]["values"][0, int(feature_index)],
                "spearman": spearman,
                "median": median,
                "median_bug_introducing": median_buggy,
                "median_clean": median_clean,
                "perc_buggy_values_higher_than_median": perc_buggy_values_higher_than_median,
                "perc_buggy_values_lower_than_median": perc_buggy_values_lower_than_median,
                "perc_clean_values_higher_than_median": perc_clean_values_higher_than_median,
                "perc_clean_values_lower_than_median": perc_clean_values_lower_than_median,
            }
        )

    # Group together features that are very similar to each other, so we can simplify the explanation
    # to users.
    attributes = ["Total", "Maximum", "Minimum", "Average"]
    already_added = set()
    feature_groups = []
    for i1, f1 in enumerate(features):
        if i1 in already_added:
            continue

        feature_groups.append([f1])

        for j, f2 in enumerate(features[i1 + 1 :]):
            i2 = j + i1 + 1

            f1_name = f1["name"]
            for attribute in attributes:
                if f1_name.startswith(attribute):
                    f1_name = f1_name[len(attribute) + 1 :]
                    break

            f2_name = f2["name"]
            for attribute in attributes:
                if f2_name.startswith(attribute):
                    f2_name = f2_name[len(attribute) + 1 :]
                    break

            if f1_name != f2_name:
                continue

            already_added.add(i2)
            feature_groups[-1].append(f2)

    # Pick a representative example from each group.
    features = []
    for feature_group in feature_groups:
        shap = sum(f["shap"] for f in feature_group)

        # Only select easily explainable features from the group.
        selected = [
            f
            for f in feature_group
            if (
                f["shap"] > 0
                and abs(f["value"] - f["median_bug_introducing"])
                < abs(f["value"] - f["median_clean"])
            )
            or (
                f["shap"] < 0
                and abs(f["value"] - f["median_clean"])
                < abs(f["value"] - f["median_bug_introducing"])
            )
        ]

        # If there are no easily explainable features in the group, select all features of the group.
        if len(selected) == 0:
            selected = feature_group

        def feature_sort_key(f):
            if f["shap"] > 0 and f["spearman"][0] > 0:
                return f["perc_buggy_values_higher_than_median"]
            elif f["shap"] > 0 and f["spearman"][0] < 0:
                return f["perc_buggy_values_lower_than_median"]
            elif f["shap"] < 0 and f["spearman"][0] > 0:
                return f["perc_clean_values_lower_than_median"]
            elif f["shap"] < 0 and f["spearman"][0] < 0:
                return f["perc_clean_values_higher_than_median"]

        feature = max(selected, key=feature_sort_key)
        feature["shap"] = shap

        for attribute in attributes:
            if feature["name"].startswith(attribute):
                feature["name"] = feature["name"][len(attribute) + 1 :].capitalize()
                break

        features.append(feature)

    with open("probs.json", "w") as f:
        json.dump(probs[0].tolist(), f)

    with open("importances.json", "w") as f:
        json.dump(features, f)

    # Get commit hash from 4 months before the analysis time.
    # The method-level analyzer needs 4 months of history.
    four_months_ago = datetime.utcnow() - relativedelta(months=4)
    p = subprocess.run(
        [
            "git",
            "rev-list",
            "-n",
            "1",
            "--until={}".format(four_months_ago.strftime("%Y-%m-%d")),
            "HEAD",
        ],
        check=True,
        capture_output=True,
        cwd=self.git_repo_dir,
    )

    stop_hash = p.stdout.decode().strip()

    # Run the method-level analyzer.
    subprocess.run(
        [
            "python3",
            "tester.py",
            "--repo",
            self.git_repo_dir,
            "--start",
            "HEAD",
            "--stop",
            stop_hash,
            "--output",
            os.path.abspath("method_level.csv"),
        ],
        check=True,
        cwd=self.method_defect_predictor_dir,
    )

    method_level_results = []
    try:
        with open("method_level.csv", "r") as f:
            reader = csv.DictReader(f)
            for item in reader:
                method_level_results.append(item)
    except FileNotFoundError:
        # No methods were classified.
        pass

    with open("method_level.json", "w") as f:
        json.dump(method_level_results, f)
def boot_worker() -> None:
    # Clone autoland
    def clone_autoland() -> None:
        logger.info(f"Cloning autoland in {REPO_DIR}...")
        repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    def extract_past_failures_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_LABEL_DB)
            )
            logger.info("Label-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Label-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_label() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_LABEL_DB)
            )
            logger.info("Failing together label DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together label DB not extracted, but missing models are allowed."
            )

    def extract_failing_together_config_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.FAILING_TOGETHER_CONFIG_GROUP_DB)
            )
            logger.info("Failing together config/group DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Failing together config/group DB not extracted, but missing models are allowed."
            )

    def extract_past_failures_group() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.PAST_FAILURES_GROUP_DB)
            )
            logger.info("Group-level past failures DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Group-level past failures DB not extracted, but missing models are allowed."
            )

    def extract_touched_together() -> None:
        try:
            utils.extract_file(
                os.path.join("data", test_scheduling.TOUCHED_TOGETHER_DB)
            )
            logger.info("Touched together DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Touched together DB not extracted, but missing models are allowed."
            )

    def extract_commits() -> bool:
        try:
            utils.extract_file(f"{repository.COMMITS_DB}.zst")
            logger.info("Commits DB extracted.")
            return True
        except FileNotFoundError:
            logger.info("Commits DB not extracted, but missing models are allowed.")
            assert ALLOW_MISSING_MODELS
            return False

    def extract_commit_experiences() -> None:
        try:
            utils.extract_file(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
            logger.info("Commit experiences DB extracted.")
        except FileNotFoundError:
            assert ALLOW_MISSING_MODELS
            logger.info(
                "Commit experiences DB not extracted, but missing models are allowed."
            )

    @tenacity.retry(
        stop=tenacity.stop_after_attempt(7),
        wait=tenacity.wait_exponential(multiplier=1, min=1, max=8),
    )
    def retrieve_schedulable_tasks() -> None:
        r = requests.get(
            "https://hg.mozilla.org/integration/autoland/json-pushes?version=2&tipsonly=1"
        )
        r.raise_for_status()
        revs = [
            push_obj["changesets"][0]
            for push_id, push_obj in r.json()["pushes"].items()
        ]

        logger.info(f"Retrieving known tasks from {revs}")

        # Store in a file the list of tasks in the latest autoland pushes.
        # We use more than one to protect ourselves from broken decision tasks.
        known_tasks = set()
        for rev in revs:
            r = requests.get(
                f"https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.autoland.revision.{rev}.taskgraph.decision/artifacts/public/target-tasks.json"
            )
            if r.ok:
                known_tasks.update(r.json())

        logger.info(f"Retrieved {len(known_tasks)} tasks")

        assert len(known_tasks) > 0

        with open("known_tasks", "w") as f:
            f.write("\n".join(known_tasks))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        clone_autoland_future = executor.submit(clone_autoland)

        retrieve_schedulable_tasks_future = executor.submit(retrieve_schedulable_tasks)

        commits_db_extracted = extract_commits()
        extract_commit_experiences()
        extract_touched_together()
        extract_past_failures_label()
        extract_past_failures_group()
        extract_failing_together_label()
        extract_failing_together_config_group()

        if commits_db_extracted:
            # Update the commits DB.
            logger.info("Browsing all commits...")
            nodes = collections.deque(
                (commit["node"] for commit in repository.get_commits()), maxlen=4096
            )
            nodes.reverse()
            logger.info("All commits browsed.")

            # Wait repository to be cloned, as it's required to call repository.download_commits.
            logger.info("Waiting autoland to be cloned...")
            clone_autoland_future.result()

            with hglib.open(REPO_DIR) as hg:
                # Try using nodes backwards, in case we have some node that was on central at the time
                # we mined commits, but is not yet on autoland.
                for node in nodes:
                    try:
                        revs = repository.get_revs(hg, rev_start=f"children({node})")
                        break
                    except hglib.error.CommandError as e:
                        if b"abort: unknown revision" not in e.err:
                            raise

            logger.info("Updating commits DB...")
            commits = repository.download_commits(
                REPO_DIR, revs=revs, use_single_process=True
            )
            logger.info("Commits DB updated.")

            logger.info("Updating touched together DB...")
            if len(commits) > 0:
                # Update the touched together DB.
                update_touched_together_gen = test_scheduling.update_touched_together()
                next(update_touched_together_gen)

                update_touched_together_gen.send(commits[-1]["node"])

                try:
                    update_touched_together_gen.send(None)
                except StopIteration:
                    pass
            logger.info("Touched together DB updated.")

        # Wait list of schedulable tasks to be downloaded and written to disk.
        retrieve_schedulable_tasks_future.result()

    logger.info("Worker boot done")