def retrieve_revisions(self, limit: Optional[int] = None) -> None:
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    db.download(phabricator.REVISIONS_DB)

    # Get the commits DB, as we need it to get the revision IDs linked to recent commits.
    assert db.download(repository.COMMITS_DB)

    # Get the bugs DB, as we need it to get the revision IDs linked to bugs.
    assert db.download(bugzilla.BUGS_DB)

    # Get IDs of revisions linked to commits since a year ago.
    start_date = datetime.utcnow() - relativedelta(years=1)
    revision_ids = list(
        filter(
            None,
            (
                repository.get_revision_id(commit)
                for commit in repository.get_commits()
                if dateutil.parser.parse(commit["pushdate"]) >= start_date
            ),
        )
    )
    if limit is not None:
        revision_ids = revision_ids[-limit:]

    # Get IDs of revisions linked to bugs since a year ago.
    for bug in bugzilla.get_bugs():
        if (
            dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None)
            < start_date
        ):
            continue

        revision_ids += bugzilla.get_revision_ids(bug)

    phabricator.download_revisions(revision_ids)

    zstd_compress(phabricator.REVISIONS_DB)

def __init__(self, repo_dir: str) -> None:
    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Exhaust the generator so `commit` is bound to the most recent commit
    # in the DB, then download its descendants.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = download_and_load_model("regressor")

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

def __init__(self, repo_dir: str) -> None:
    if not os.path.exists(repo_dir):
        repository.clone(repo_dir)
    else:
        repository.pull(repo_dir, "mozilla-central", "tip")

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Exhaust the generator so `commit` is bound to the most recent commit
    # in the DB (see the standalone sketch after this snippet).
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

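# The bare "for commit in ...: pass" loops above look odd at first glance: they
# exhaust the generator so that `commit` ends up bound to the last (most recent)
# commit in the DB. A minimal, self-contained sketch of the idiom, where
# `make_items` is a hypothetical stand-in for repository.get_commits():
def make_items():
    yield from ({"node": f"rev{i}"} for i in range(3))

item = None
for item in make_items():
    pass

# After the loop, `item` holds the last yielded element (or None if the
# generator was empty, a case the constructors above do not guard against).
assert item == {"node": "rev2"}
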
def apply_phab(self, hg, diff_id):
    phabricator_api = PhabricatorAPI(
        api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
    )

    diffs = phabricator_api.search_diffs(diff_id=diff_id)
    assert len(diffs) == 1, "No diff available for {}".format(diff_id)
    diff = diffs[0]

    # Get the stack of patches
    base, patches = phabricator_api.load_patches_stack(hg, diff)
    assert len(patches) > 0, "No patches to apply"

    # Load all the diff details with commit messages
    diffs = phabricator_api.search_diffs(
        diff_phid=[p[0] for p in patches], attachments={"commits": True}
    )
    commits = {
        diff["phid"]: diff["attachments"]["commits"].get("commits", [])
        for diff in diffs
    }

    # First apply patches on the local repo
    for diff_phid, patch in patches:
        commit = commits.get(diff_phid)

        message = ""
        if commit:
            message += "{}\n".format(commit[0]["message"])

        logger.info(f"Applying {diff_phid}")
        hg.import_(
            patches=io.BytesIO(patch.encode("utf-8")),
            message=message,
            user="******",
        )

def classify_test_select(self, commits, runnable_jobs_path):
    testfailure_probs = self.testfailure_model.classify(
        commits[-1], probabilities=True
    )

    logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

    if not runnable_jobs_path:
        runnable_jobs = {}
    elif runnable_jobs_path.startswith("http"):
        r = requests.get(runnable_jobs_path)
        r.raise_for_status()
        runnable_jobs = r.json()
    else:
        with open(runnable_jobs_path, "r") as f:
            runnable_jobs = json.load(f)

    # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.
    # XXX: Consider using mozilla-central built-in rules to filter some of the tasks out, e.g. SCHEDULES.

    selected_tasks = list(
        self.model.select_tests(
            commits, float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
        ).values()
    )

    # XXX: For now, restrict to linux64 test tasks only (as for runnable jobs above, we could remove the others right away).
    selected_tasks = [t for t in selected_tasks if t.startswith("test-linux1804-64/")]

    with open("failure_risk", "w") as f:
        f.write(
            "1"
            if testfailure_probs[0][1]
            > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
            else "0"
        )

    # This should be kept in sync with the test scheduling history retriever script.
    # When no runnable jobs info is available, pass the selected tasks through
    # unfiltered (otherwise `cleaned_selected_tasks` would be undefined below).
    cleaned_selected_tasks = selected_tasks
    if len(runnable_jobs) > 0:
        cleaned_selected_tasks = []
        for selected_task in selected_tasks:
            if (
                selected_task.startswith("test-linux64")
                and selected_task not in runnable_jobs
            ):
                selected_task = selected_task.replace(
                    "test-linux64-", "test-linux1804-64-"
                )

            if (
                selected_task.startswith("test-linux1804-64-")
                and selected_task not in runnable_jobs
            ):
                selected_task = selected_task.replace(
                    "test-linux1804-64-", "test-linux64-"
                )

            if selected_task in runnable_jobs:
                cleaned_selected_tasks.append(selected_task)

    # It isn't worth running the build associated with the tests if we would
    # only run fewer than three test tasks.
    if len(cleaned_selected_tasks) < 3:
        cleaned_selected_tasks = []

    with open("selected_tasks", "w") as f:
        f.writelines(
            f"{selected_task}\n" for selected_task in cleaned_selected_tasks
        )

def generate(self):
    db_path = os.path.join("data", self.git_repo_path)
    db.register(
        db_path,
        "https://s3-us-west-2.amazonaws.com/communitytc-bugbug/data/",
        VERSION,
    )

    is_old_version = db.is_old_schema(db_path)

    with ThreadPoolExecutorResult(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )

        if not is_old_version:
            executor.submit(self.clone_git_repo)
        else:
            executor.submit(self.init_git_repo)

    tenacity.retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        ),
        wait=tenacity.wait_fixed(30),
        stop=tenacity.stop_after_attempt(5),
    )()

    push_args = ["git", "push", repo_push_url, "master"]
    if is_old_version:
        push_args.append("--force")

    done = False
    while not done:
        done = generator.generate(
            self.repo_dir,
            self.git_repo_path,
            limit=COMMITS_STEP,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        tenacity.retry(
            lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True),
            wait=tenacity.wait_fixed(30),
            stop=tenacity.stop_after_attempt(5),
        )()

    # We are not using db.upload, as we don't need to upload the git repo itself.
    upload_s3([f"{db_path}.version"])

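# The inline tenacity calls above build a retrying wrapper and invoke it
# immediately. The same policy expressed in the decorator form shown in the
# tenacity docs (a sketch; the function name and arguments are illustrative):
import subprocess

import tenacity

@tenacity.retry(
    wait=tenacity.wait_fixed(30),  # wait 30 seconds between attempts
    stop=tenacity.stop_after_attempt(5),  # give up after 5 attempts
)
def push(push_args, cwd):
    subprocess.run(push_args, cwd=cwd, check=True)
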
def generate(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")

    repo_url = "https://github.com/marco-c/gecko-dev-wordified"
    repo_push_url = (
        f"https://{git_user}:{git_password}@github.com/marco-c/gecko-dev-wordified"
    )
    git_repo_path = os.path.basename(repo_url)

    retry(
        lambda: subprocess.run(["git", "clone", repo_url, git_repo_path], check=True)
    )

    try:
        retry(
            lambda: subprocess.run(
                ["git", "pull", repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            )
        )
    except subprocess.CalledProcessError as e:
        # `git pull` fails with this message when the repo is still empty;
        # any other failure should propagate.
        if b"Couldn't find remote ref master" not in e.stdout:
            raise

    done = generator.generate(self.repo_dir, git_repo_path, limit=10000)

    with open("done", "w") as f:
        f.write(str(1 if done else 0))

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    retry(
        lambda: subprocess.run(
            ["git", "push", repo_push_url, "master"], cwd=git_repo_path, check=True
        )
    )

def generate(self):
    db_path = os.path.join("data", self.git_repo_path)
    db.register(
        db_path,
        f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.microannotate_{self.git_repo_path}.latest/artifacts/public/",
        VERSION,
    )

    # TODO: Check the version again once we can run tasks for longer (https://bugzilla.mozilla.org/show_bug.cgi?id=1604175).
    is_old_version = False  # db.is_old_schema(db_path)

    with ThreadPoolExecutorResult(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )

        if not is_old_version:
            executor.submit(self.clone_git_repo)
        else:
            executor.submit(self.init_git_repo)

    tenacity.retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        ),
        wait=tenacity.wait_fixed(30),
        stop=tenacity.stop_after_attempt(5),
    )()

    push_args = ["git", "push", repo_push_url, "master"]
    if is_old_version:
        push_args.append("--force")

    done = False
    while not done:
        done = generator.generate(
            self.repo_dir,
            self.git_repo_path,
            limit=COMMITS_STEP,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        tenacity.retry(
            lambda: subprocess.run(push_args, cwd=self.git_repo_path, check=True),
            wait=tenacity.wait_fixed(30),
            stop=tenacity.stop_after_attempt(5),
        )()

def generate(self):
    repository.clone(self.repo_dir)
    logger.info("mozilla-central cloned")

    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")

    repo_push_url = self.repo_url.replace(
        "https://", f"https://{git_user}:{git_password}@"
    )

    git_repo_path = os.path.basename(self.repo_url)

    retry(
        lambda: subprocess.run(
            ["git", "clone", self.repo_url, git_repo_path], check=True
        )
    )

    try:
        retry(
            lambda: subprocess.run(
                ["git", "pull", self.repo_url, "master"],
                cwd=git_repo_path,
                capture_output=True,
                check=True,
            )
        )
    except subprocess.CalledProcessError as e:
        # `git pull` fails with this message when the repo is still empty;
        # any other failure should propagate.
        if b"Couldn't find remote ref master" not in e.stdout:
            raise

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    for i in range(STEPS):
        logger.info(f"Step {i} out of {STEPS}")

        done = generator.generate(
            self.repo_dir,
            git_repo_path,
            limit=TOTAL_COMMITS // STEPS,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(
            lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            )
        )

        if done:
            break

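# `retry` here is a project helper wrapping an operation in a fixed retry
# policy. A minimal stand-in with the same call shape (an assumption, not the
# project's exact implementation):
import time

def retry(operation, retries=5, wait_between_retries=30):
    for i in range(retries):
        try:
            return operation()
        except Exception:
            # Re-raise on the last attempt; otherwise back off and retry.
            if i == retries - 1:
                raise
            time.sleep(wait_between_retries)
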
def __init__(self, cache_root, repo_url, tokenize, remove_comments):
    git_user = get_secret("GIT_USER")
    git_password = get_secret("GIT_PASSWORD")
    self.repo_url = repo_url.replace(
        "https://", f"https://{git_user}:{git_password}@"
    )
    self.git_repo_path = os.path.basename(self.repo_url)
    self.tokenize = tokenize
    self.remove_comments = remove_comments

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

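# The push URL above splices credentials into the clone URL verbatim. If a
# password may contain URL metacharacters, percent-encoding both parts first
# is safer; a hedged variant of the same construction:
from urllib.parse import quote

def with_credentials(repo_url, user, password):
    return repo_url.replace(
        "https://", f"https://{quote(user, safe='')}:{quote(password, safe='')}@"
    )

assert (
    with_credentials("https://github.com/org/repo", "bot", "p@ss")
    == "https://bot:p%40ss@github.com/org/repo"
)
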
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        "Downloading bugs from {} to {}".format(
            two_years_and_six_months_ago, six_months_ago
        )
    )
    bugzilla.download_bugs_between(two_years_and_six_months_ago, six_months_ago)

    logger.info("Downloading labelled bugs")
    bug_ids = labels.get_all_bug_ids()
    bugzilla.download_bugs(bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    for i in range(3):
        bug_ids = bug_snapshot.get_inconsistencies()
        if len(bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(bug_ids)
        bugzilla.download_bugs(bug_ids)

    self.compress_file("data/bugs.json")

def apply_phab(self, hg, diff_id):
    phabricator_api = PhabricatorAPI(
        api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
    )

    diffs = phabricator_api.search_diffs(diff_id=diff_id)
    assert len(diffs) == 1, f"No diff available for {diff_id}"
    diff = diffs[0]

    # Get the stack of patches
    base, patches = phabricator_api.load_patches_stack(hg, diff)
    assert len(patches) > 0, "No patches to apply"

    # Load all the diff details with commit messages
    diffs = phabricator_api.search_diffs(
        diff_phid=[p[0] for p in patches], attachments={"commits": True}
    )

    diffs_data = {}
    for diff in diffs:
        revision = phabricator_api.load_revision(rev_phid=diff["revisionPHID"])
        logger.info(
            "Diff {} linked to Revision {}".format(diff["id"], revision["id"])
        )
        diffs_data[diff["phid"]] = {
            "commits": diff["attachments"]["commits"].get("commits", []),
            "revision": revision,
        }

    # First apply patches on the local repo
    for diff_phid, patch in patches:
        diff_data = diffs_data.get(diff_phid)
        commits = diff_data["commits"]
        revision = diff_data["revision"]

        # Fall back to the revision title when the diff carries no commit message.
        if commits and commits[0]["message"]:
            message = commits[0]["message"]
        else:
            message = revision["fields"]["title"]

        logger.info(f"Applying {diff_phid}")
        hg.import_(
            patches=io.BytesIO(patch.encode("utf-8")),
            message=message,
            user="******",
        )

def generate(self):
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        cloner = executor.submit(repository.clone, self.repo_dir)
        cloner.add_done_callback(
            lambda future: logger.info("mozilla-central cloned")
        )

        git_user = get_secret("GIT_USER")
        git_password = get_secret("GIT_PASSWORD")

        repo_push_url = self.repo_url.replace(
            "https://", f"https://{git_user}:{git_password}@"
        )

        git_repo_path = os.path.basename(self.repo_url)

        executor.submit(self.clone_git_repo, git_repo_path)

    retry(
        lambda: subprocess.run(
            ["git", "config", "--global", "http.postBuffer", "12M"], check=True
        )
    )

    for i in range(STEPS):
        logger.info(f"Step {i} out of {STEPS}")

        done = generator.generate(
            self.repo_dir,
            git_repo_path,
            limit=TOTAL_COMMITS // STEPS,
            tokenize=self.tokenize,
            remove_comments=self.remove_comments,
        )

        with open("done", "w") as f:
            f.write(str(1 if done else 0))

        retry(
            lambda: subprocess.run(
                ["git", "push", repo_push_url, "master"],
                cwd=git_repo_path,
                check=True,
            )
        )

        if done:
            break

def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Exhaust the generator so `commit` is bound to the most recent commit in the DB.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

def apply_phab(self, hg, phabricator_deployment, diff_id):
    if phabricator_deployment == PHAB_PROD:
        api_key = get_secret("PHABRICATOR_TOKEN")
        url = get_secret("PHABRICATOR_URL")
    else:
        api_key = get_secret("PHABRICATOR_DEV_TOKEN")
        url = get_secret("PHABRICATOR_DEV_URL")

    phabricator_api = PhabricatorAPI(api_key=api_key, url=url)

    # Get the stack of patches
    stack = phabricator_api.load_patches_stack(diff_id)
    assert len(stack) > 0, "No patches to apply"

    # Find the first unknown base revision
    needed_stack = []
    revisions = {}
    for patch in reversed(stack):
        needed_stack.insert(0, patch)

        # Stop as soon as a base revision is available
        if self.has_revision(hg, patch.base_revision):
            logger.info(
                f"Stopping at diff {patch.id} and revision {patch.base_revision}"
            )
            break

    if not needed_stack:
        logger.info("All the patches are already applied")
        return

    # Load all the diff revisions
    diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
    revisions = {
        diff["phid"]: phabricator_api.load_revision(
            rev_phid=diff["revisionPHID"], attachments={"reviewers": True}
        )
        for diff in diffs
    }

    # Update the repo to the base revision
    hg_base = needed_stack[0].base_revision
    if not self.has_revision(hg, hg_base):
        logger.warning("Missing base revision {} from Phabricator".format(hg_base))
        hg_base = "tip"

    if hg_base:
        hg.update(rev=hg_base, clean=True)
        logger.info(f"Updated repo to {hg_base}")

        if self.git_repo_dir and hg_base != "tip":
            try:
                self.git_base = tuple(
                    vcs_map.mercurial_to_git(self.git_repo_dir, [hg_base])
                )[0]
                subprocess.run(
                    ["git", "checkout", "-b", "analysis_branch", self.git_base],
                    check=True,
                    cwd=self.git_repo_dir,
                )
                logger.info(f"Updated git repo to {self.git_base}")
            except Exception as e:
                logger.info(f"Updating git repo to Mercurial {hg_base} failed: {e}")

    def load_user(phid):
        if phid.startswith("PHID-USER"):
            return phabricator_api.load_user(user_phid=phid)
        elif phid.startswith("PHID-PROJ"):
            # TODO: Support group reviewers somehow.
            logger.info(f"Skipping group reviewer {phid}")
        else:
            raise Exception(f"Unsupported reviewer {phid}")

    for patch in needed_stack:
        revision = revisions[patch.phid]

        message = "{}\n\n{}".format(
            revision["fields"]["title"], revision["fields"]["summary"]
        )

        author_name = None
        author_email = None

        if patch.commits:
            author_name = patch.commits[0]["author"]["name"]
            author_email = patch.commits[0]["author"]["email"]

        if author_name is None:
            author = load_user(revision["fields"]["authorPHID"])
            author_name = author["fields"]["realName"]
            # XXX: Figure out a way to know the email address of the author.
            author_email = author["fields"]["username"]

        reviewers = list(
            filter(
                None,
                (
                    load_user(reviewer["reviewerPHID"])
                    for reviewer in revision["attachments"]["reviewers"]["reviewers"]
                ),
            )
        )
        reviewers = set(reviewer["fields"]["username"] for reviewer in reviewers)

        if len(reviewers):
            message = replace_reviewers(message, reviewers)

        logger.info(
            f"Applying {patch.phid} from revision {revision['id']}: {message}"
        )

        hg.import_(
            patches=io.BytesIO(patch.patch.encode("utf-8")),
            message=message.encode("utf-8"),
            user=f"{author_name} <{author_email}>".encode("utf-8"),
        )

        if self.git_repo_dir:
            patch_proc = subprocess.Popen(
                ["patch", "-p1", "--no-backup-if-mismatch", "--force"],
                stdin=subprocess.PIPE,
                cwd=self.git_repo_dir,
            )
            patch_proc.communicate(patch.patch.encode("utf-8"))
            assert patch_proc.returncode == 0, "Failed to apply patch"

            subprocess.run(
                [
                    "git",
                    "-c",
                    f"user.name={author_name}",
                    "-c",
                    f"user.email={author_email}",
                    "commit",
                    "-am",
                    message,
                ],
                check=True,
                cwd=self.git_repo_dir,
            )

def retrieve_bugs(self, limit=None):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since the last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    if limit:
        timespan_ids = timespan_ids[:limit]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[:limit]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid failing the integration test when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"] and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions and bugs which caused regressions (useful for the regressor model).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    if limit:
        regressed_by_bug_ids = regressed_by_bug_ids[-limit:]
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        (bug["regressed_by"] + bug["regressions"] for bug in bugzilla.get_bugs()),
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on the following
        # passes, we only rescan the bugs found inconsistent in the previous pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)
        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    zstd_compress("data/bugs.json")

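# The re-download loop above converges by rescanning only the bugs that failed
# the previous consistency pass. The same fixed-point structure in isolation
# (`find_bad` and `refresh` are hypothetical callables):
def retry_until_consistent(items, find_bad, refresh, max_passes=3):
    for _ in range(max_passes):
        # Pass 1 scans everything; later passes only rescan the items that
        # were still inconsistent, so the candidate set shrinks monotonically.
        items = find_bad(items)
        if not items:
            break
        refresh(items)
    return items
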
def get_token() -> str:
    return get_secret("GITHUB_TOKEN")

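# `get_secret` is provided by the surrounding project. For running these
# snippets locally, a minimal stand-in (an assumption, not the real
# implementation) could read secrets from the environment:
import os

def get_secret(name):
    value = os.environ.get(name)
    if value is None:
        raise ValueError(f"Secret {name} is not set")
    return value
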
def apply_phab(self, hg, diff_id):
    def has_revision(revision):
        if not revision:
            return False
        try:
            hg.identify(revision)
            return True
        except hglib.error.CommandError:
            return False

    phabricator_api = PhabricatorAPI(
        api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
    )

    # Get the stack of patches
    stack = phabricator_api.load_patches_stack(diff_id)
    assert len(stack) > 0, "No patches to apply"

    # Find the first unknown base revision
    needed_stack = []
    revisions = {}
    for patch in reversed(stack):
        needed_stack.insert(0, patch)

        # Stop as soon as a base revision is available
        if has_revision(patch.base_revision):
            logger.info(
                f"Stopping at diff {patch.id} and revision {patch.base_revision}"
            )
            break

    if not needed_stack:
        logger.info("All the patches are already applied")
        return

    # Load all the diff revisions
    diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
    revisions = {
        diff["phid"]: phabricator_api.load_revision(
            rev_phid=diff["revisionPHID"], attachments={"reviewers": True}
        )
        for diff in diffs
    }

    # Update the repo to the base revision
    hg_base = needed_stack[0].base_revision
    if not has_revision(hg_base):
        logger.warning("Missing base revision {} from Phabricator".format(hg_base))
        hg_base = "tip"

    if hg_base:
        hg.update(rev=hg_base, clean=True)
        logger.info(f"Updated repo to {hg_base}")

        try:
            self.git_base = vcs_map.mercurial_to_git(hg_base)
            subprocess.run(
                ["git", "checkout", "-b", "analysis_branch", self.git_base],
                check=True,
                cwd=self.git_repo_dir,
            )
            logger.info(f"Updated git repo to {self.git_base}")
        except Exception as e:
            logger.info(f"Updating git repo to Mercurial {hg_base} failed: {e}")

    def load_user(phid):
        if phid.startswith("PHID-USER"):
            return phabricator_api.load_user(user_phid=phid)
        elif phid.startswith("PHID-PROJ"):
            # TODO: Support group reviewers somehow.
            logger.info(f"Skipping group reviewer {phid}")
        else:
            raise Exception(f"Unsupported reviewer {phid}")

    for patch in needed_stack:
        revision = revisions[patch.phid]

        message = "{}\n\n{}".format(
            revision["fields"]["title"], revision["fields"]["summary"]
        )

        author_name = None
        author_email = None

        if patch.commits:
            author_name = patch.commits[0]["author"]["name"]
            author_email = patch.commits[0]["author"]["email"]

        if author_name is None:
            author = load_user(revision["fields"]["authorPHID"])
            author_name = author["fields"]["realName"]
            # XXX: Figure out a way to know the email address of the author.
            author_email = author["fields"]["username"]

        reviewers = list(
            filter(
                None,
                (
                    load_user(reviewer["reviewerPHID"])
                    for reviewer in revision["attachments"]["reviewers"]["reviewers"]
                ),
            )
        )
        reviewers = set(reviewer["fields"]["username"] for reviewer in reviewers)

        if len(reviewers):
            message = replace_reviewers(message, reviewers)

        logger.info(
            f"Applying {patch.phid} from revision {revision['id']}: {message}"
        )

        hg.import_(
            patches=io.BytesIO(patch.patch.encode("utf-8")),
            message=message.encode("utf-8"),
            user=f"{author_name} <{author_email}>".encode("utf-8"),
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            temp_file = os.path.join(tmpdirname, "temp.patch")
            with open(temp_file, "w") as f:
                f.write(patch.patch)

            subprocess.run(
                ["git", "apply", "--3way", temp_file],
                check=True,
                cwd=self.git_repo_dir,
            )

        subprocess.run(
            [
                "git",
                "-c",
                f"user.name={author_name}",
                "-c",
                f"user.email={author_email}",
                "commit",
                "-am",
                message,
            ],
            check=True,
            cwd=self.git_repo_dir,
        )

def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
    )

    # We use "clean" (or "dirty") commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" (or "dirty") commits.

    if not self.use_test_history:
        probs, importance = self.model.classify(
            commits[-1],
            probabilities=True,
            importances=True,
            background_dataset=lambda v: self.X[self.y != v],
            importance_cutoff=0.05,
        )

        self.generate_feature_importance_data(probs, importance)

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        if self.model_name == "regressor" and self.method_defect_predictor_dir:
            self.classify_methods(commits[-1])
    else:
        testfailure_probs = self.testfailure_model.classify(
            commits[-1], probabilities=True
        )

        logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

        commit_data = commit_features.merge_commits(commits)

        push_num = self.past_failures_data["push_num"]

        # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
        # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
        all_tasks = self.past_failures_data["all_tasks"]

        # XXX: For now, restrict to test-linux64 tasks only.
        all_tasks = [
            t
            for t in all_tasks
            if t.startswith("test-linux64/") and "test-verify" not in t
        ]

        commit_tests = []
        for data in test_scheduling.generate_data(
            self.past_failures_data, commit_data, push_num, all_tasks, [], []
        ):
            if not data["name"].startswith("test-"):
                continue

            commit_test = commit_data.copy()
            commit_test["test_job"] = data
            commit_tests.append(commit_test)

        probs = self.model.classify(commit_tests, probabilities=True)
        selected_indexes = np.argwhere(
            probs[:, 1] > float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
        )[:, 0]
        selected_tasks = [
            commit_tests[i]["test_job"]["name"] for i in selected_indexes
        ]

        with open("failure_risk", "w") as f:
            f.write(
                "1"
                if testfailure_probs[0][1]
                > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
                else "0"
            )

        # It isn't worth running the build associated with the tests if we would
        # only run fewer than three test tasks.
        if len(selected_tasks) < 3:
            selected_tasks = []

        with open("selected_tasks", "w") as f:
            f.writelines(f"{selected_task}\n" for selected_task in selected_tasks)

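# How the index selection above behaves on a toy probability matrix: column 1
# holds P(failure), and argwhere returns the row indexes above the threshold.
import numpy as np

probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
threshold = 0.7
selected = np.argwhere(probs[:, 1] > threshold)[:, 0]
assert selected.tolist() == [1]
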
def classify_test_select(self, commits, runnable_jobs_path):
    testfailure_probs = self.testfailure_model.classify(
        commits[-1], probabilities=True
    )

    logger.info(f"Test failure risk: {testfailure_probs[0][1]}")

    commit_data = commit_features.merge_commits(commits)

    push_num = self.past_failures_data["push_num"]

    # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
    all_tasks = self.past_failures_data["all_runnables"]

    if not runnable_jobs_path:
        runnable_jobs = {task for task in all_tasks}
    elif runnable_jobs_path.startswith("http"):
        r = requests.get(runnable_jobs_path)
        r.raise_for_status()
        runnable_jobs = r.json()
    else:
        with open(runnable_jobs_path, "r") as f:
            runnable_jobs = json.load(f)

    # XXX: For now, restrict to linux64 test tasks only.
    all_tasks = [t for t in all_tasks if t.startswith("test-linux1804-64/")]

    # XXX: Remove tasks which are not in runnable jobs right away, so we avoid classifying them.

    commit_tests = []
    for data in test_scheduling.generate_data(
        self.past_failures_data, commit_data, push_num, all_tasks, [], []
    ):
        if not data["name"].startswith("test-"):
            continue

        commit_test = commit_data.copy()
        commit_test["test_job"] = data
        commit_tests.append(commit_test)

    probs = self.model.classify(commit_tests, probabilities=True)
    selected_indexes = np.argwhere(
        probs[:, 1] > float(get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD"))
    )[:, 0]
    selected_tasks = [
        commit_tests[i]["test_job"]["name"] for i in selected_indexes
    ]

    with open("failure_risk", "w") as f:
        f.write(
            "1"
            if testfailure_probs[0][1]
            > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
            else "0"
        )

    # This should be kept in sync with the test scheduling history retriever script.
    # Map between the two linux platform namings when a selected task is not
    # present in runnable jobs (see the helper sketch after this snippet).
    cleaned_selected_tasks = []
    for selected_task in selected_tasks:
        if (
            selected_task.startswith("test-linux64")
            and selected_task not in runnable_jobs
        ):
            selected_task = selected_task.replace(
                "test-linux64-", "test-linux1804-64-"
            )

        if (
            selected_task.startswith("test-linux1804-64-")
            and selected_task not in runnable_jobs
        ):
            selected_task = selected_task.replace(
                "test-linux1804-64-", "test-linux64-"
            )

        if selected_task in runnable_jobs:
            cleaned_selected_tasks.append(selected_task)

    # It isn't worth running the build associated with the tests if we would
    # only run fewer than three test tasks.
    if len(cleaned_selected_tasks) < 3:
        cleaned_selected_tasks = []

    with open("selected_tasks", "w") as f:
        f.writelines(
            f"{selected_task}\n" for selected_task in cleaned_selected_tasks
        )

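# The platform-rename dance above, factored into a pure function for clarity
# (a hypothetical helper; the naming scheme follows the snippet):
from typing import Optional

def remap_task(task: str, runnable_jobs) -> Optional[str]:
    if task.startswith("test-linux64") and task not in runnable_jobs:
        task = task.replace("test-linux64-", "test-linux1804-64-")
    if task.startswith("test-linux1804-64-") and task not in runnable_jobs:
        task = task.replace("test-linux1804-64-", "test-linux64-")
    return task if task in runnable_jobs else None

assert remap_task(
    "test-linux64-shippable/opt-xpcshell",
    {"test-linux1804-64-shippable/opt-xpcshell"},
) == "test-linux1804-64-shippable/opt-xpcshell"
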
def apply_phab(self, hg, diff_id):
    def has_revision(revision):
        if not revision:
            return False
        try:
            hg.identify(revision)
            return True
        except hglib.error.CommandError:
            return False

    phabricator_api = PhabricatorAPI(
        api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
    )

    # Get the stack of patches
    stack = phabricator_api.load_patches_stack(diff_id)
    assert len(stack) > 0, "No patches to apply"

    # Find the first unknown base revision
    needed_stack = []
    revisions = {}
    for patch in reversed(stack):
        needed_stack.insert(0, patch)

        # Stop as soon as a base revision is available
        if has_revision(patch.base_revision):
            logger.info(
                f"Stopping at diff {patch.id} and revision {patch.base_revision}"
            )
            break

    if not needed_stack:
        logger.info("All the patches are already applied")
        return

    # Load all the diff revisions
    diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
    revisions = {
        diff["phid"]: phabricator_api.load_revision(rev_phid=diff["revisionPHID"])
        for diff in diffs
    }

    # Update the repo to the base revision
    hg_base = needed_stack[0].base_revision
    if not has_revision(hg_base):
        logger.warning("Missing base revision {} from Phabricator".format(hg_base))
        hg_base = "tip"

    if hg_base:
        hg.update(rev=hg_base, clean=True)
        logger.info(f"Updated repo to {hg_base}")

        try:
            self.git_base = vcs_map.mercurial_to_git(hg_base)
            subprocess.run(
                ["git", "checkout", "-b", "analysis_branch", self.git_base],
                check=True,
                cwd=self.git_repo_dir,
            )
            logger.info(f"Updated git repo to {self.git_base}")
        except Exception as e:
            logger.info(f"Updating git repo to Mercurial {hg_base} failed: {e}")

    for patch in needed_stack:
        revision = revisions[patch.phid]

        if patch.commits:
            message = patch.commits[0]["message"]
            author_name = patch.commits[0]["author"]["name"]
            author_email = patch.commits[0]["author"]["email"]
        else:
            message = revision["fields"]["title"]
            author_name = "bugbug"
            author_email = "*****@*****.**"

        logger.info(
            f"Applying {patch.phid} from revision {revision['id']}: {message}"
        )

        hg.import_(
            patches=io.BytesIO(patch.patch.encode("utf-8")),
            message=message.encode("utf-8"),
            user=f"{author_name} <{author_email}>".encode("utf-8"),
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            temp_file = os.path.join(tmpdirname, "temp.patch")
            with open(temp_file, "w") as f:
                f.write(patch.patch)

            subprocess.run(
                ["git", "apply", "--3way", temp_file],
                check=True,
                cwd=self.git_repo_dir,
            )

        subprocess.run(
            [
                "git",
                "-c",
                f"user.name={author_name}",
                "-c",
                f"user.email={author_email}",
                "commit",
                "-am",
                message,
            ],
            check=True,
            cwd=self.git_repo_dir,
        )

def retrieve_bugs(self, limit: Optional[int] = None) -> None:
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since the last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = set(
        bugzilla.get_ids(
            {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
        )
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    all_components = bugzilla.get_product_component_count(9999)

    deleted_component_ids = set(
        bug["id"]
        for bug in bugzilla.get_bugs()
        if "{}::{}".format(bug["product"], bug["component"]) not in all_components
    )
    logger.info(f"{len(deleted_component_ids)} bugs belonging to deleted components")
    changed_ids |= deleted_component_ids

    # Get IDs of bugs between (two years and six months ago) and now.
    two_years_and_six_months_ago = datetime.utcnow() - relativedelta(
        years=2, months=6
    )
    logger.info(f"Retrieving bug IDs since {two_years_and_six_months_ago}")
    timespan_ids = bugzilla.get_ids_between(two_years_and_six_months_ago)
    if limit:
        timespan_ids = timespan_ids[-limit:]
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    if limit:
        labelled_bug_ids = labelled_bug_ids[-limit:]
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    # XXX: Temporarily avoid downloading the commits DB when a limit is set, to
    # avoid failing the integration test when the commits DB is bumped.
    if limit is None:
        assert db.download(repository.COMMITS_DB)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=3)
    commit_bug_ids = list(
        set(
            commit["bug_id"]
            for commit in repository.get_commits()
            if commit["bug_id"]
            and dateutil.parser.parse(commit["pushdate"]) >= start_date
        )
    )
    if limit:
        commit_bug_ids = commit_bug_ids[-limit:]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which are regressions, bugs which caused regressions (useful for the regressor model),
    # and blocked bugs.
    regression_related_ids: List[int] = list(
        set(
            sum(
                (
                    bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                    for bug in bugzilla.get_bugs()
                ),
                [],
            )
        )
    )
    if limit:
        regression_related_ids = regression_related_ids[-limit:]
    logger.info(
        f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
    )

    # Get IDs of bugs linked to intermittent failures.
    test_failure_bug_ids = [
        item["bug_id"]
        for item in test_scheduling.get_failure_bugs(
            two_years_and_six_months_ago, datetime.utcnow()
        )
    ]
    if limit:
        test_failure_bug_ids = test_failure_bug_ids[-limit:]
    logger.info(f"{len(test_failure_bug_ids)} bugs about test failures.")

    all_ids = (
        timespan_ids
        + labelled_bug_ids
        + commit_bug_ids
        + regression_related_ids
        + test_failure_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    new_bugs = bugzilla.download_bugs(all_ids)

    # Get regression_related_ids again (the set could have changed after downloading new bugs).
    for i in range(7):
        regression_related_ids = list(
            set(
                sum(
                    (
                        bug["regressed_by"] + bug["regressions"] + bug["blocks"]
                        for bug in new_bugs
                    ),
                    [],
                )
            )
        )
        logger.info(
            f"{len(regression_related_ids)} bugs which caused regressions fixed by commits."
        )
        if limit:
            regression_related_ids = regression_related_ids[-limit:]

        # If we got all the bugs we needed, stop.
        if set(regression_related_ids).issubset(all_ids):
            break

        new_bugs = bugzilla.download_bugs(regression_related_ids)

    # Try to re-download inconsistent bugs, up to twice.
    inconsistent_bugs = bugzilla.get_bugs(include_invalid=True)
    for i in range(2):
        # We look for inconsistencies in all bugs first; then, on the following
        # passes, we only rescan the bugs found inconsistent in the previous pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)
        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    # Delete bugs for which we couldn't retrieve the history.
    # TODO: Figure out why.
    missing_history_bug_ids = {
        bug["id"] for bug in bugzilla.get_bugs() if "history" not in bug
    }
    bugzilla.delete_bugs(lambda bug: bug["id"] in missing_history_bug_ids)
    logger.info(
        f"Deleted {len(missing_history_bug_ids)} bugs as we couldn't retrieve their history"
    )

    zstd_compress(bugzilla.BUGS_DB)

def apply_phab(self, hg, diff_id):
    def has_revision(revision):
        if not revision:
            return False
        try:
            hg.identify(revision)
            return True
        except hglib.error.CommandError:
            return False

    phabricator_api = PhabricatorAPI(
        api_key=get_secret("PHABRICATOR_TOKEN"), url=get_secret("PHABRICATOR_URL")
    )

    # Get the stack of patches
    stack = phabricator_api.load_patches_stack(diff_id)
    assert len(stack) > 0, "No patches to apply"

    # Find the first unknown base revision
    needed_stack = []
    revisions = {}
    for patch in reversed(stack):
        needed_stack.insert(0, patch)

        # Stop as soon as a base revision is available
        if has_revision(patch.base_revision):
            logger.info(
                f"Stopping at diff {patch.id} and revision {patch.base_revision}"
            )
            break

    if not needed_stack:
        logger.info("All the patches are already applied")
        return

    # Load all the diff revisions
    diffs = phabricator_api.search_diffs(diff_phid=[p.phid for p in stack])
    revisions = {
        diff["phid"]: phabricator_api.load_revision(rev_phid=diff["revisionPHID"])
        for diff in diffs
    }

    # Update the repo to the base revision
    hg_base = needed_stack[0].base_revision
    if hg_base:
        hg.update(rev=hg_base, clean=True)
        logger.info(f"Updated repo to {hg_base}")

    for patch in needed_stack:
        revision = revisions[patch.phid]

        # Fall back to the revision title when the diff carries no commit message.
        if patch.commits:
            message = patch.commits[0]["message"]
        else:
            message = revision["fields"]["title"]

        logger.info(
            f"Applying {patch.phid} from revision {revision['id']}: {message}"
        )

        hg.import_(
            patches=io.BytesIO(patch.patch.encode("utf-8")),
            message=message,
            user="******",
        )

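# A hedged usage sketch for the hg.import_ call the apply_phab variants rely
# on: python-hglib takes the patch as a file-like object of bytes. The repo
# path and patch text below are placeholders, not runnable values.
import io

import hglib

with hglib.open("/path/to/local/clone") as hg:
    patch_text = "..."  # a unified diff fetched from Phabricator
    hg.import_(
        patches=io.BytesIO(patch_text.encode("utf-8")),
        message="Example commit message",
        user="Example Author <author@example.com>",
    )
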
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since the last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
    )

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first; then, on the following
        # passes, we only rescan the bugs found inconsistent in the previous pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)
        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")

def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Exhaust the generator so `commit` is bound to the most recent commit in the DB.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )

def classify(self, diff_id):
    self.update_commit_db()

    with hglib.open(self.repo_dir) as hg:
        self.apply_phab(hg, diff_id)

        patch_rev = hg.log(revrange="not public()")[0].node

    # Analyze patch.
    commits = repository.download_commits(
        self.repo_dir, rev_start=patch_rev.decode("utf-8"), save=False
    )

    # We use "clean" (or "dirty") commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" (or "dirty") commits.

    if not self.use_test_history:
        probs, importance = self.model.classify(
            commits[-1],
            probabilities=True,
            importances=True,
            background_dataset=lambda v: self.X[self.y != v],
            importance_cutoff=0.05,
        )

        self.generate_feature_importance_data(probs, importance)

        with open("probs.json", "w") as f:
            json.dump(probs[0].tolist(), f)

        if self.model_name == "regressor" and self.method_defect_predictor_dir:
            self.classify_methods()
    else:
        backout_probs = self.backout_model.classify(commits[-1], probabilities=True)

        logger.info(f"Backout risk: {backout_probs[0][1]}")

        commit_data = commit_features.merge_commits(commits)

        push_num = self.past_failures_data["push_num"]

        # XXX: Consider using mozilla-central built-in rules to filter some of these out, e.g. SCHEDULES.
        # XXX: Consider using the runnable jobs artifact from the Gecko Decision task.
        all_tasks = self.past_failures_data["all_tasks"]

        selected_tasks = []
        # TODO: Classify multiple commits/tests at the same time.
        for data in test_scheduling.generate_data(
            self.past_failures_data, commit_data, push_num, all_tasks, [], []
        ):
            if not data["name"].startswith("test-"):
                continue

            commit_data["test_job"] = data

            probs = self.model.classify(commit_data, probabilities=True)

            if probs[0][1] > float(
                get_secret("TEST_SELECTION_CONFIDENCE_THRESHOLD")
            ):
                selected_tasks.append(data["name"])

        with open("failure_risk", "w") as f:
            f.write(
                "1"
                if backout_probs[0][1]
                > float(get_secret("TEST_FAILURE_CONFIDENCE_THRESHOLD"))
                else "0"
            )

        with open("selected_tasks", "w") as f:
            f.writelines(f"{selected_task}\n" for selected_task in selected_tasks)