def items_gen(self, classes):
    """Yield ``(commit, label)`` pairs for every commit that has a label.

    When ``self.bug_data`` is truthy, each yielded commit additionally gets a
    ``"bug"`` entry holding the bug whose ``"id"`` equals the commit's
    ``"bug_id"`` (or an empty dict when no such bug is available).

    Args:
        classes: Mapping from commit node to label.

    Yields:
        Tuples of ``(commit dict, classes[commit["node"]])``.

    Raises:
        AssertionError: If bug data is requested but no bug matches any of the
            labelled commits.
    """
    bug_map = None
    if self.bug_data:
        # Only keep the bugs referenced by labelled commits.
        wanted_bug_ids = {
            labelled["bug_id"]
            for labelled in repository.get_commits()
            if labelled["node"] in classes
        }
        bug_map = {
            bug["id"]: bug
            for bug in bugzilla.get_bugs()
            if bug["id"] in wanted_bug_ids
        }
        assert len(bug_map) > 0

    for commit in repository.get_commits():
        node = commit["node"]
        if node in classes:
            if self.bug_data:
                commit["bug"] = bug_map.get(commit["bug_id"], {})

            yield commit, classes[node]
def boot_worker():
    """Prepare a worker: preload models, clone the repository, refresh the DBs.

    The repository is cloned into ``$BUGBUG_REPO_DIR`` (defaulting to
    ``<tempdir>/bugbug-hg``). Missing downloads are tolerated only when
    ``ALLOW_MISSING_MODELS`` is truthy.

    Raises:
        AssertionError: If a required download fails while
            ``ALLOW_MISSING_MODELS`` is falsy.
    """
    # Preload models
    bugbug_http.models.preload_models()

    # Clone mozilla central
    repo_dir = os.environ.get(
        "BUGBUG_REPO_DIR", os.path.join(tempfile.gettempdir(), "bugbug-hg")
    )
    logger.info(f"Cloning mozilla-central in {repo_dir}...")
    repository.clone(repo_dir)

    # Download databases
    logger.info("Downloading test scheduling DB support file...")
    # The download is performed outside of the `assert` statement: assert
    # expressions are not evaluated at all under `python -O`, which would
    # silently skip the download.
    support_file_downloaded = db.download_support_file(
        test_scheduling.TEST_LABEL_SCHEDULING_DB,
        test_scheduling.PAST_FAILURES_LABEL_DB,
    )
    assert support_file_downloaded or ALLOW_MISSING_MODELS

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # Exhaust the iterator so that `commit` ends up bound to the last
        # commit stored in the DB.
        # NOTE: if the DB yields no commit at all, `commit` stays unbound and
        # the line below raises UnboundLocalError.
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        repository.download_commits(repo_dir, rev_start)

    logger.info("Worker boot done")
def get_landed_and_filed_since(self, days: int) -> List[int]:
    """Return IDs of bugs with recent landings plus bugs from Bugzilla's range query.

    A commit pushed within the last ``days`` days contributes its bug ID when
    at least one of the following holds:

    * it is the first commit seen for that bug;
    * the previous commit for the same bug was pushed less than 91 days
      before it;
    * at least one of its files is not a test according to
      ``repository.is_test``.

    The result also includes the IDs returned by
    ``bugzilla.get_ids_between(since, resolution=["---", "FIXED"])``.

    Args:
        days: Size of the time window, in days, counted back from now (UTC).

    Returns:
        A list of unique bug IDs (in no particular order).
    """
    since = datetime.utcnow() - timedelta(days=days)
    max_gap = timedelta(days=91)

    landed_bug_ids = []
    previous_push_by_bug: Dict[int, datetime] = {}
    for commit in repository.get_commits():
        bug_id = commit["bug_id"]
        if not bug_id:
            continue

        push_date = dateutil.parser.parse(commit["pushdate"])

        if push_date >= since:
            if (
                bug_id not in previous_push_by_bug
                or push_date - previous_push_by_bug[bug_id] < max_gap
                or any(not repository.is_test(path) for path in commit["files"])
            ):
                landed_bug_ids.append(bug_id)

        # Always remember the latest push seen for the bug, even when the
        # commit itself was not selected.
        previous_push_by_bug[bug_id] = push_date

    logger.info(f"Retrieving bug IDs since {days} days ago")
    timespan_ids = bugzilla.get_ids_between(since, resolution=["---", "FIXED"])

    return list(set(landed_bug_ids) | set(timespan_ids))
def get_landed_and_filed_since(self, days: int) -> List[int]:
    """Return IDs of bugs that had a landing or were filed in the last ``days`` days.

    Bugs in the time span are first downloaded through
    ``bugzilla.download_bugs``; filed bugs are then taken from
    ``bugzilla.get_bugs()``, skipping those whose resolution marks them as
    invalid, duplicate or otherwise not actionable.

    Args:
        days: Size of the time window, in days, counted back from now (UTC).

    Returns:
        A list of unique bug IDs (in no particular order).
    """
    since = datetime.utcnow() - timedelta(days=days)

    bug_ids = set()
    for commit in repository.get_commits():
        pushed_recently = dateutil.parser.parse(commit["pushdate"]) >= since
        if pushed_recently and commit["bug_id"]:
            bug_ids.add(commit["bug_id"])

    logger.info(f"Retrieving bug IDs since {days} days ago")
    timespan_ids = bugzilla.get_ids_between(since, datetime.utcnow())
    bugzilla.download_bugs(timespan_ids)

    excluded_resolutions = [
        "INVALID",
        "WONTFIX",
        "INACTIVE",
        "DUPLICATE",
        "INCOMPLETE",
        "MOVED",
        "WORKSFORME",
    ]
    for bug in bugzilla.get_bugs():
        # Drop the timezone so the value can be compared with the naive `since`.
        created = dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None)
        if created >= since and bug["resolution"] not in excluded_resolutions:
            bug_ids.add(bug["id"])

    return list(bug_ids)
def __init__(self, repo_dir: str) -> None:
    """Bring the local repository and the commits/revisions/bugs DBs up to date.

    Args:
        repo_dir: Path of the local repository; it is cloned when the path
            does not exist and pulled (``mozilla-central``, ``tip``) otherwise.

    Raises:
        AssertionError: If one of the databases cannot be downloaded.
    """
    if not os.path.exists(repo_dir):
        repository.clone(repo_dir)
    else:
        repository.pull(repo_dir, "mozilla-central", "tip")

    # NOTE: downloads are performed outside of the `assert` statements, since
    # assert expressions are not evaluated at all under `python -O`.
    logger.info("Downloading commits database...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    assert commits_db_downloaded

    logger.info("Updating commits DB...")
    # Exhaust the iterator so that `commit` is bound to the last stored commit.
    # NOTE: with an empty DB `commit` stays unbound and the call below raises
    # UnboundLocalError.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    revisions_db_downloaded = db.download(phabricator.REVISIONS_DB)
    assert revisions_db_downloaded

    logger.info("Downloading bugs database...")
    bugs_db_downloaded = db.download(bugzilla.BUGS_DB)
    assert bugs_db_downloaded

    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def get_labels(self):
    """Label every commit by whether it was ever backed out.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps each commit node
        to ``1`` when its ``"ever_backedout"`` field is truthy and to ``0``
        otherwise.
    """
    classes = {
        commit_data["node"]: (1 if commit_data["ever_backedout"] else 0)
        for commit_data in repository.get_commits()
    }
    return classes, [0, 1]
def get_labels(self):
    """Label commits as regressors (``1``) or non-regressors (``0``).

    Commits that were ever backed out are skipped. A commit listed in the
    "regressor" labels gets ``1``; any other commit gets ``0`` only when its
    push date lies inside the window for which labels are trustworthy.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps commit nodes to
        their label.
    """
    classes = {}

    regressors = set(r[0] for r in labels.get_labels("regressor"))

    # The cutoffs are computed once, so that every commit is compared against
    # the same instants instead of a value drifting while the loop runs.

    # The labels we have are only from 2016-11-01.
    # TODO: Automate collection of labels and somehow remove this check.
    labels_start = datetime(2016, 11, 1)
    # We remove the last 6 months, as there could be regressions which haven't been filed yet.
    six_months_ago = datetime.utcnow() - relativedelta(months=6)

    for commit_data in repository.get_commits():
        if commit_data["ever_backedout"]:
            continue

        node = commit_data["node"]
        if node in regressors:
            classes[node] = 1
            continue

        push_date = dateutil.parser.parse(commit_data["pushdate"])
        if push_date < labels_start or push_date > six_months_ago:
            continue

        classes[node] = 0

    print(
        "{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )
    print(
        "{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` pairs from the "label" test history.

    Each entry of the test scheduling history is identified by the first of
    its revisions; an identifier is processed at most once and only when it
    has a label in ``classes``.

    Args:
        classes: Mapping from the first revision of a push to its label.

    Yields:
        Tuples of ``(commit_features.merge_commits(commits), label)``.

    Raises:
        AssertionError: If the commits DB yields no commit.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    done = set()
    for test_data in test_scheduling.get_test_scheduling_history("label"):
        revs = test_data["revs"]
        first_rev = revs[0]

        if first_rev in done or first_rev not in classes:
            continue
        done.add(first_rev)

        # Revisions missing from the commits DB are silently ignored.
        commits = tuple(commit_map[rev] for rev in revs if rev in commit_map)
        if not commits:
            continue

        yield commit_features.merge_commits(commits), classes[first_rev]
def items_gen(self, classes):
    """Yield ``(bug, label)`` pairs for every bug that has a label.

    When ``self.commit_data`` is truthy, each yielded bug additionally gets a
    ``"commits"`` entry listing the commits whose ``"bug_id"`` equals the
    bug's ``"id"`` (an empty list when there are none).

    Args:
        classes: Mapping from bug ID to label.

    Yields:
        Tuples of ``(bug dict, classes[bug["id"]])``.

    Raises:
        AssertionError: If commit data is requested but no commit is linked
            to a bug.
    """
    commit_map = None
    if self.commit_data:
        commit_map = defaultdict(list)
        for commit in repository.get_commits():
            if commit["bug_id"]:
                commit_map[commit["bug_id"]].append(commit)

        assert len(commit_map) > 0

    for bug in bugzilla.get_bugs():
        bug_id = bug["id"]
        if bug_id in classes:
            if self.commit_data:
                # `.get` does not trigger the defaultdict factory, so the map
                # is left untouched for bugs without commits.
                bug["commits"] = commit_map.get(bug_id, [])

            yield bug, classes[bug_id]
def get_labels(self):
    """Label commits pushed in the last two and a half years by backout status.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps each recent
        commit node to ``1`` when its ``"backedoutby"`` field is truthy and to
        ``0`` otherwise.
    """
    oldest_allowed = datetime.utcnow() - relativedelta(years=2, months=6)

    classes = {}
    for commit_data in repository.get_commits():
        if dateutil.parser.parse(commit_data["pushdate"]) >= oldest_allowed:
            classes[commit_data["node"]] = 1 if commit_data["backedoutby"] else 0

    # Labels are only ever 0 or 1, so their sum is the number of backouts.
    backed_out_count = sum(classes.values())
    print("{} commits were backed out".format(backed_out_count))
    print("{} commits were not backed out".format(len(classes) - backed_out_count))

    return classes, [0, 1]
def retrieve_revisions(self, limit: Optional[int] = None) -> None:
    """Download the Phabricator revisions linked to recent commits and bugs.

    Revision IDs are collected from commits pushed, and from bugs created,
    during the last year; the revisions DB is then updated and compressed.

    Args:
        limit: When not ``None``, only the last ``limit`` commit-linked
            revision IDs are kept (bug-linked IDs are not limited).
            NOTE: ``limit=0`` keeps every ID, because ``ids[-0:]`` is the
            whole list.

    Raises:
        AssertionError: If the commits DB or the bugs DB cannot be downloaded.
    """
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    # The result of this download is deliberately left unchecked, as in the
    # original code.
    db.download(phabricator.REVISIONS_DB)

    # NOTE: the downloads below are performed outside of the `assert`
    # statements, since assert expressions are not evaluated under `python -O`.

    # Get the commits DB, as we need it to get the revision IDs linked to recent commits.
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    # Get the bugs DB, as we need it to get the revision IDs linked to bugs.
    bugs_db_downloaded = db.download(bugzilla.BUGS_DB)
    assert bugs_db_downloaded

    # Get IDs of revisions linked to commits since a year ago.
    start_date = datetime.utcnow() - relativedelta(years=1)
    revision_ids = []
    for commit in repository.get_commits():
        if dateutil.parser.parse(commit["pushdate"]) < start_date:
            continue

        revision_id = repository.get_revision_id(commit)
        # Commits without a linked revision produce a falsy ID and are skipped.
        if revision_id:
            revision_ids.append(revision_id)

    if limit is not None:
        revision_ids = revision_ids[-limit:]

    # Get IDs of revisions linked to bugs since a year ago.
    for bug in bugzilla.get_bugs():
        # Drop the timezone so the value can be compared with the naive `start_date`.
        created = dateutil.parser.parse(bug["creation_time"]).replace(tzinfo=None)
        if created < start_date:
            continue

        revision_ids += bugzilla.get_revision_ids(bug)

    phabricator.download_revisions(revision_ids)

    zstd_compress(phabricator.REVISIONS_DB)
def retrieve_commits(self, limit):
    """Mine commits from the local repository and package the commits DB.

    Args:
        limit: When truthy, only the last ``limit`` revisions are mined.
            Otherwise the existing commits DB is downloaded and mining resumes
            from the children of the last commit it contains (or from
            revision ``0`` when it contains none).
    """
    repository.clone(self.repo_dir)

    if limit:
        # Mercurial revset supports negative integers starting from tip
        rev_start = -limit
    else:
        db.download(repository.COMMITS_DB, support_files_too=True)

        last_node = None
        for commit in repository.get_commits():
            last_node = commit["node"]

        rev_start = 0 if last_node is None else f"children({last_node})"

    with hglib.open(self.repo_dir) as hg:
        revs = repository.get_revs(hg, rev_start)

    # Mine the revisions in fixed-size batches.
    chunk_size = 70000
    for chunk_start in range(0, len(revs), chunk_size):
        chunk = revs[chunk_start : chunk_start + chunk_size]
        repository.download_commits(self.repo_dir, revs=chunk)

    logger.info("commit data extracted from repository")

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    zstd_compress(repository.COMMITS_DB)
    create_tar_zst(os.path.join("data", repository.COMMIT_EXPERIENCES_DB))
def get_bugs(self, date="today", bug_ids=[]):
    """Classify the commits that are not yet in the commits DB.

    Commits following the last one stored in the DB are mined, those ever
    backed out are dropped, and the rest are classified with ``self.model``.

    Args:
        date: Unused by this implementation.
        bug_ids: Unused by this implementation.

    Returns:
        A dict mapping each commit node to a dict with the keys ``"id"``,
        ``"summary"`` (first line of the description), ``"result"``
        (``"Risky"`` or ``"Not risky"``) and ``"confidence"``.
    """
    self.query_url = ""

    # Ignore already analyzed commits.
    # NOTE: with an empty DB `commit` stays unbound and the next statement
    # raises UnboundLocalError.
    for commit in repository.get_commits():
        pass

    rev_start = "children({})".format(commit["node"])

    mined = repository.download_commits(self.repo_dir, rev_start, ret=True)
    commits = [mined_commit for mined_commit in mined if not mined_commit["ever_backedout"]]

    probs = self.model.classify(commits, True)
    indexes = probs.argmax(axis=-1)

    return {
        classified["node"]: {
            "id": classified["node"],
            "summary": classified["desc"].split("\n", 1)[0],
            "result": "Risky" if prob[1] > 0.5 else "Not risky",
            "confidence": nice_round(prob[index]),
        }
        for classified, prob, index in zip(commits, probs, indexes)
    }
def get_labels(self):
    """Label commits as ignorable (``1``) or not ignorable (``0``).

    Commits with a truthy ``"backedoutby"`` are skipped. Commits flagged as
    ``"ignored"`` get ``1``; other commits get ``0`` only when their bug is
    related to a regression. Manual "annotateignore" labels are applied last
    and override the automatic ones.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps commit nodes to
        their label.
    """
    classes = {}

    # Commits in regressor or regression bugs usually are not formatting changes.
    # The set is filled incrementally: `sum(lists, [])` concatenates lists
    # pairwise, which is quadratic in the total number of bug IDs.
    regression_related_bugs = set()
    for bug in bugzilla.get_bugs():
        regression_related_bugs.update(bug["regressed_by"])
        regression_related_bugs.update(bug["regressions"])

    for commit_data in repository.get_commits(include_ignored=True):
        if commit_data["backedoutby"]:
            continue

        node = commit_data["node"]

        if commit_data["ignored"]:
            classes[node] = 1
        elif commit_data["bug_id"] in regression_related_bugs:
            classes[node] = 0

    # Manual annotations take precedence over the heuristics above.
    for node, label in labels.get_labels("annotateignore"):
        classes[node] = int(label)

    print(
        "{} commits that can be ignored".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )
    print(
        "{} commits that cannot be ignored".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def get_labels(self):
    """Label commits as regressors (``1``) or non-regressors (``0``).

    Commits that were ever backed out are skipped. A commit recorded as
    bug-introducing in ``BUG_INTRODUCING_COMMITS_DB`` gets ``1``; any other
    commit gets ``0`` only when it was pushed between two and a half years
    ago and six months ago.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps commit nodes to
        their label.
    """
    classes = {}

    regressors = set(
        r["bug_introducing_rev"]
        for r in db.read(BUG_INTRODUCING_COMMITS_DB)
        if r["bug_introducing_rev"]
    )

    # The cutoffs are computed once, so that every commit is compared against
    # the same instants instead of a value drifting while the loop runs.
    now = datetime.utcnow()
    # The labels we have are only from two years and six months ago (see the regressor finder script).
    labels_start = now - relativedelta(years=2, months=6)
    # We remove the last 6 months, as there could be regressions which haven't been filed yet.
    six_months_ago = now - relativedelta(months=6)

    for commit_data in repository.get_commits():
        if commit_data["ever_backedout"]:
            continue

        node = commit_data["node"]
        if node in regressors:
            classes[node] = 1
            continue

        push_date = dateutil.parser.parse(commit_data["pushdate"])
        if push_date < labels_start or push_date > six_months_ago:
            continue

        classes[node] = 0

    print(
        "{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )
    print(
        "{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def __init__(self, repo_dir: str) -> None:
    """Clone the repository, refresh the required DBs and load the regressor model.

    Args:
        repo_dir: Path where the repository is cloned.

    Raises:
        AssertionError: If one of the databases cannot be downloaded.
    """
    repository.clone(repo_dir)

    # NOTE: downloads are performed outside of the `assert` statements, since
    # assert expressions are not evaluated at all under `python -O`.
    logger.info("Downloading commits database...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    assert commits_db_downloaded

    logger.info("Updating commits DB...")
    # Exhaust the iterator so that `commit` is bound to the last stored commit.
    # NOTE: with an empty DB `commit` stays unbound and the call below raises
    # UnboundLocalError.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    revisions_db_downloaded = db.download(phabricator.REVISIONS_DB)
    assert revisions_db_downloaded

    logger.info("Downloading bugs database...")
    bugs_db_downloaded = db.download(bugzilla.BUGS_DB)
    assert bugs_db_downloaded

    logger.info("Download commit classifications...")
    classifications_downloaded = db.download(BUG_FIXING_COMMITS_DB)
    assert classifications_downloaded

    self.regressor_model = download_and_load_model("regressor")

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def items_gen(self, classes):
    """Yield ``(merged commit data, label)`` pairs from the test scheduling history.

    History entries are read at ``self.granularity`` and are identified by
    the pair ``(first revision, test name)``; entries without a label in
    ``classes`` are skipped.

    Args:
        classes: Mapping from ``(first revision, test name)`` to label.

    Yields:
        Tuples of ``(commit data, label)``, where the commit data is the
        result of ``commit_features.merge_commits`` with the history entry
        attached under the ``"test_job"`` key.

    Raises:
        AssertionError: If the commits DB yields no commit.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0

    history = test_scheduling.get_test_scheduling_history(self.granularity)
    for test_data in history:
        revs = test_data["revs"]
        name = test_data["name"]

        label_key = (revs[0], name)
        if label_key not in classes:
            continue

        # Revisions missing from the commits DB are silently ignored.
        commits = tuple(
            commit_map[rev] for rev in test_data["revs"] if rev in commit_map
        )
        if not commits:
            continue

        commit_data = commit_features.merge_commits(commits)
        commit_data["test_job"] = test_data
        yield commit_data, classes[label_key]
def generate_data():
    """Yield per-task failure statistics for every commit that has push data.

    For each commit found in ``push_data`` and each of its tasks whose name
    starts with one of ``JOBS_TO_CONSIDER``, the number of past failures
    (total and over the previous 7/14/28/56 pushes) is computed through
    ``get_past_failures``. A record is yielded only for commits pushed after
    ``HISTORY_DATE_START``; ``past_failures`` is updated for every task
    listed as a possible or likely regression, regardless of the push date.

    Yields:
        Dicts with the keys ``"rev"``, ``"name"``, ``"failures"``,
        ``"failures_past_{7,14,28,56}_pushes"``, ``"is_possible_regression"``
        and ``"is_likely_regression"``.
    """
    commits_with_data = set()
    saved_nodes = set()

    push_num = 0
    for commit_data in tqdm(repository.get_commits()):
        node = commit_data["node"]

        if node not in push_data:
            continue

        commits_with_data.add(node)

        commit_push_data = push_data[node]

        # The push date is the same for every task of the commit: parse it
        # once per commit rather than once per task.
        pushdate = dateutil.parser.parse(commit_data["pushdate"])
        should_save = pushdate > HISTORY_DATE_START

        for task in commit_push_data[0]:
            if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                continue

            total_failures = get_past_failures(task, push_num)
            past_7_pushes_failures = total_failures - get_past_failures(
                task, push_num - 7
            )
            past_14_pushes_failures = total_failures - get_past_failures(
                task, push_num - 14
            )
            past_28_pushes_failures = total_failures - get_past_failures(
                task, push_num - 28
            )
            past_56_pushes_failures = total_failures - get_past_failures(
                task, push_num - 56
            )

            is_possible_regression = task in commit_push_data[1]
            is_likely_regression = task in commit_push_data[2]

            if should_save:
                saved_nodes.add(node)

                yield {
                    "rev": node,
                    "name": task,
                    "failures": total_failures,
                    "failures_past_7_pushes": past_7_pushes_failures,
                    "failures_past_14_pushes": past_14_pushes_failures,
                    "failures_past_28_pushes": past_28_pushes_failures,
                    "failures_past_56_pushes": past_56_pushes_failures,
                    "is_possible_regression": is_possible_regression,
                    "is_likely_regression": is_likely_regression,
                }

            # Record the failure only after the statistics for this push have
            # been computed, so a push never counts its own failure.
            if is_possible_regression or is_likely_regression:
                past_failures[task][push_num] = total_failures + 1

        push_num += 1

    logger.info(f"push data nodes: {len(push_data)}")

    logger.info(f"commits linked to push data: {len(commits_with_data)}")

    logger.info(f"saved push data nodes: {len(saved_nodes)}")
def get_commit_map():
    """Return a dict mapping every commit node to its commit.

    Raises:
        AssertionError: If the commits DB yields no commit.
    """
    commit_map = {commit["node"]: commit for commit in repository.get_commits()}
    assert len(commit_map) > 0
    return commit_map
def get_labels(self):
    """Build the backout labels for all commits.

    Returns:
        A ``(classes, [0, 1])`` tuple; ``classes[node]`` is ``1`` for commits
        with a truthy ``"ever_backedout"`` field and ``0`` for the others.
    """
    classes = {}
    for commit_data in repository.get_commits():
        if commit_data["ever_backedout"]:
            label = 1
        else:
            label = 0
        classes[commit_data["node"]] = label

    return classes, [0, 1]
def boot_worker():
    """Prepare a worker: clone autoland, refresh the DBs and preload the models.

    Missing downloads are tolerated only when ``ALLOW_MISSING_MODELS`` is
    truthy. When the commits DB is available, it is updated with the commits
    following the last stored one, and the touched-together DB is updated up
    to the newest of those commits.

    Raises:
        AssertionError: If a required download fails while
            ``ALLOW_MISSING_MODELS`` is falsy.
    """
    # Clone autoland
    logger.info(f"Cloning mozilla autoland in {REPO_DIR}...")
    repository.clone(REPO_DIR, "https://hg.mozilla.org/integration/autoland")

    # Download test scheduling DB support files.
    logger.info("Downloading test scheduling DB support files...")
    support_files = (
        (
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        ),
        (
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_GROUP_DB,
        ),
        (
            test_scheduling.TEST_GROUP_SCHEDULING_DB,
            test_scheduling.TOUCHED_TOGETHER_DB,
        ),
    )
    for db_path, support_file in support_files:
        # The download is performed outside of the `assert` statement: assert
        # expressions are not evaluated at all under `python -O`, which would
        # silently skip the download.
        downloaded = db.download_support_file(db_path, support_file)
        assert downloaded or ALLOW_MISSING_MODELS

    # Download commits DB
    logger.info("Downloading commits DB...")
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    if not ALLOW_MISSING_MODELS:
        assert commits_db_downloaded

    if commits_db_downloaded:
        # And update it
        logger.info("Browsing all commits...")
        # Exhaust the iterator so that `commit` is bound to the last stored
        # commit.
        # NOTE: with an empty DB `commit` stays unbound and the line below
        # raises UnboundLocalError.
        for commit in repository.get_commits():
            pass

        rev_start = "children({})".format(commit["node"])
        logger.info("Updating commits DB...")
        commits = repository.download_commits(
            REPO_DIR, rev_start, use_single_process=True
        )

        if len(commits) > 0:
            # Update the touched together DB.
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Advance the generator to its first `yield` before sending values.
            next(update_touched_together_gen)

            update_touched_together_gen.send(commits[-1]["node"])

            # Sending None asks the generator to finish.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

    # Preload models
    bugbug_http.models.preload_models()

    logger.info("Worker boot done")
def get_commits_to_ignore(self) -> None:
    """Compute, store and upload the list of commits to ignore.

    A commit is ignored when it is flagged as ignored, was backed out, has no
    bug, is itself a backout, is a wpt sync according to
    ``repository.is_wptsync``, or is annotated with label ``"1"`` in the
    "annotateignore" labels. Commits backed out by a backout are ignored too.
    The result is written to ``IGNORED_COMMITS_DB``, compressed and uploaded.

    Raises:
        AssertionError: If the commits DB cannot be downloaded.
    """
    # The download is performed outside of the `assert` statement: assert
    # expressions are not evaluated at all under `python -O`.
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    # Short (12 characters) hashes of the commits already marked as ignored.
    ignored = set()
    commits_to_ignore = []
    # Short hashes of every commit present in the DB.
    all_commits = set()

    annotate_ignore_nodes = {
        node for node, label in labels.get_labels("annotateignore") if label == "1"
    }

    for commit in repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    ):
        node = commit["node"]
        all_commits.add(node[:12])

        if (
            commit["ignored"]
            or commit["backedoutby"]
            or not commit["bug_id"]
            or len(commit["backsout"]) > 0
            or repository.is_wptsync(commit)
            or node in annotate_ignore_nodes
        ):
            commits_to_ignore.append(
                {
                    "rev": node,
                    "type": "backedout" if commit["backedoutby"] else "",
                }
            )
            ignored.add(node[:12])

        # Also ignore the commits backed out by this one (iterating an empty
        # list is a no-op, so no emptiness check is needed).
        for backedout in commit["backsout"]:
            if backedout[:12] in ignored:
                continue
            ignored.add(backedout[:12])

            commits_to_ignore.append({"rev": backedout, "type": "backedout"})

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    # Skip backed-out commits which aren't in the repository (commits which landed *before* the Mercurial history
    # started, and backouts which mentioned a bad hash in their message).
    commits_to_ignore = [c for c in commits_to_ignore if c["rev"][:12] in all_commits]

    logger.info(f"{len(commits_to_ignore)} commits to ignore...")

    logger.info(
        "...of which {} are backed-out".format(
            sum(1 for commit in commits_to_ignore if commit["type"] == "backedout")
        )
    )

    db.write(IGNORED_COMMITS_DB, commits_to_ignore)
    zstd_compress(IGNORED_COMMITS_DB)
    db.upload(IGNORED_COMMITS_DB)
def get_landed_since(self, days: int) -> List[int]:
    """Return the bug IDs of the commits pushed in the last ``days`` days.

    Commits without a bug ID are skipped. A bug ID appears once per matching
    commit, so the result may contain duplicates.

    Args:
        days: Size of the time window, in days, counted back from now (UTC).
    """
    since = datetime.utcnow() - timedelta(days=days)

    bug_ids = []
    for commit in repository.get_commits():
        pushed_recently = dateutil.parser.parse(commit["pushdate"]) >= since
        if pushed_recently and commit["bug_id"]:
            bug_ids.append(commit["bug_id"])

    return bug_ids
def get_landed_since(self, days_start: int,
                     days_end: int) -> Collection[repository.CommitDict]:
    """Return the commits pushed between ``days_start`` and ``days_end`` days ago.

    Commits without a bug, backouts and ignored commits are all included.
    Both bounds of the time window are inclusive.

    Args:
        days_start: Age, in days, of the oldest accepted push.
        days_end: Age, in days, of the most recent accepted push.
    """
    since = datetime.utcnow() - timedelta(days=days_start)
    until = datetime.utcnow() - timedelta(days=days_end)

    landed = []
    all_commits = repository.get_commits(
        include_no_bug=True, include_backouts=True, include_ignored=True
    )
    for commit in all_commits:
        pushed_at = dateutil.parser.parse(commit["pushdate"])
        if since <= pushed_at and pushed_at <= until:
            landed.append(commit)

    return landed
def update_commit_db(self):
    """Clone/update the repository and bring the commits DB up to date.

    The commits DB is downloaded, then extended with the commits following
    the last one it contains.

    Raises:
        AssertionError: If the commits DB cannot be downloaded.
    """
    repository.clone(self.repo_dir, update=True)

    # The download is performed outside of the `assert` statement: assert
    # expressions are not evaluated at all under `python -O`, which would
    # silently skip the download.
    commits_db_downloaded = db.download(repository.COMMITS_DB, support_files_too=True)
    assert commits_db_downloaded

    # Exhaust the iterator so that `commit` is bound to the last stored commit.
    # NOTE: with an empty DB `commit` stays unbound and the call below raises
    # UnboundLocalError.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        self.repo_dir, rev_start="children({})".format(commit["node"])
    )
def update_touched_together():
    """Incrementally update the touched-together DB (generator-based protocol).

    Usage: call ``next()`` once to reach the first ``yield``, then ``send()``
    the revision up to which commits must be analyzed. Once that revision has
    been processed the generator yields again and waits for the next end
    revision; revisions that were already seen are answered immediately.
    Sending ``None`` stops the iteration and closes the DB.

    Commits are only analyzed after the revision stored under
    ``"last_analyzed"`` by a previous run has been reached (or from the very
    first commit when there is no such revision).
    """
    touched_together = get_touched_together_db()
    last_analyzed = (
        touched_together["last_analyzed"]
        if "last_analyzed" in touched_together
        else None
    )

    # We can start once we get to the last revision we added in the previous run.
    can_start = last_analyzed is None

    seen = set()

    end_revision = yield

    i = 0

    for commit in repository.get_commits():
        seen.add(commit["node"])

        if can_start:
            touched_together["last_analyzed"] = commit["node"]

            # As in the test scheduling history retriever script, for now skip commits which are too large.
            if len(commit["files"]) <= 50 and not commit["ever_backedout"]:
                # Number of times a source file was touched together with a directory.
                for f1 in commit["files"]:
                    for d2 in set(
                        os.path.dirname(f) for f in commit["files"] if f != f1
                    ):
                        set_touched_together(f1, d2)

                # Number of times a directory was touched together with another directory.
                for d1, d2 in itertools.combinations(
                    list(set(os.path.dirname(f) for f in commit["files"])), 2
                ):
                    set_touched_together(d1, d2)

            i += 1
            # Sync once every 5000 analyzed commits. The previous condition
            # (`if i % 5000:`) was truthy for every count *except* multiples
            # of 5000, i.e. it synced on nearly every commit.
            if i % 5000 == 0:
                touched_together.sync()
        elif last_analyzed == commit["node"]:
            can_start = True

        if commit["node"] == end_revision:
            # Some commits could be in slightly different order between mozilla-central and autoland.
            # It's a small detail that shouldn't affect the features, but we need to take it into account.
            while end_revision in seen:
                end_revision = yield

            if end_revision is None:
                break

    touched_together.close()
def get_labels(self):
    """Label commits as regressors (``1``) or non-regressors (``0``).

    Backed-out commits, wpt syncs and commits pushed during the last
    ``EVALUATION_MONTHS`` months (reserved for evaluation) are skipped. A
    commit gets ``1`` when it is recorded as bug-introducing or when its bug
    is listed as the regressor of another bug; any other commit gets ``0``
    only when it was pushed between two and a half years ago and six months
    ago.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps commit nodes to
        their label.
    """
    classes = {}

    regressors = set(
        r["bug_introducing_rev"]
        for r in db.read(BUG_INTRODUCING_COMMITS_DB)
        if r["bug_introducing_rev"]
    )

    # The set is filled incrementally: `sum(lists, [])` concatenates lists
    # pairwise, which is quadratic in the total number of bug IDs.
    regressor_bugs = set()
    for bug in bugzilla.get_bugs():
        regressor_bugs.update(bug["regressed_by"])

    # The cutoffs are computed once, so that every commit is compared against
    # the same instants instead of a value drifting while the loop runs.
    now = datetime.utcnow()
    evaluation_start = now - relativedelta(months=EVALUATION_MONTHS)
    # The labels we have are only from two years and six months ago (see the regressor finder script).
    labels_start = now - relativedelta(years=2, months=6)
    # We remove the last 6 months, as there could be regressions which haven't been filed yet.
    six_months_ago = now - relativedelta(months=6)

    for commit_data in repository.get_commits():
        if commit_data["backedoutby"]:
            continue

        if repository.is_wptsync(commit_data):
            continue

        push_date = dateutil.parser.parse(commit_data["pushdate"])

        # Skip commits used for the evaluation phase.
        if push_date > evaluation_start:
            continue

        node = commit_data["node"]
        if node in regressors or commit_data["bug_id"] in regressor_bugs:
            classes[node] = 1
            continue

        if push_date < labels_start or push_date > six_months_ago:
            continue

        classes[node] = 0

    print(
        "{} commits caused regressions".format(
            sum(1 for label in classes.values() if label == 1)
        )
    )

    print(
        "{} commits did not cause regressions".format(
            sum(1 for label in classes.values() if label == 0)
        )
    )

    return classes, [0, 1]
def classify(
    self,
    revision=None,
    phabricator_deployment=None,
    diff_id=None,
    runnable_jobs_path=None,
):
    """Classify either an existing revision or a Phabricator diff.

    Exactly one source is expected: ``revision`` alone, or
    ``phabricator_deployment`` together with ``diff_id``.

    Args:
        revision: Hash of the commit to analyze.
        phabricator_deployment: Phabricator deployment hosting ``diff_id``.
        diff_id: ID of the Phabricator diff to apply and analyze.
        runnable_jobs_path: Forwarded to ``classify_test_select`` when
            ``self.use_test_history`` is truthy.

    Raises:
        AssertionError: If the arguments are inconsistent, or if no commit to
            analyze could be found.
    """
    if revision is not None:
        assert phabricator_deployment is None
        assert diff_id is None

    if diff_id is not None:
        assert phabricator_deployment is not None
        assert revision is None

    self.update_commit_db()

    analyze_diff = phabricator_deployment is not None and diff_id is not None
    if analyze_diff:
        with hglib.open(self.repo_dir) as hg:
            self.apply_phab(hg, phabricator_deployment, diff_id)

            # The first non-public revision is the start of the applied stack.
            revision = hg.log(revrange="not public()")[0].node.decode("utf-8")

        commits = repository.download_commits(
            self.repo_dir,
            rev_start=revision,
            save=False,
            use_single_process=self.use_single_process,
        )
    else:
        # Stop at the first commit of the DB matching the revision.
        known_commit = next(
            (
                candidate
                for candidate in repository.get_commits()
                if candidate["node"] == revision
            ),
            None,
        )

        if known_commit is not None:
            commits = [known_commit]
        else:
            # The commit to analyze was not in our DB, let's mine it.
            commits = repository.download_commits(
                self.repo_dir,
                revs=[revision],
                save=False,
                use_single_process=self.use_single_process,
            )

    assert len(commits) > 0, "There are no commits to analyze"

    if self.use_test_history:
        self.classify_test_select(commits, runnable_jobs_path)
    else:
        self.classify_regressor(commits)
def get_commit_map(
    revs: Optional[Set[test_scheduling.Revision]] = None,
) -> Dict[test_scheduling.Revision, repository.CommitDict]:
    """Return a dict mapping commit nodes to commits.

    Args:
        revs: When not ``None``, only the commits whose node belongs to this
            set are kept.

    Raises:
        AssertionError: If the resulting map is empty.
    """
    commit_map = {
        commit["node"]: commit
        for commit in repository.get_commits()
        if revs is None or commit["node"] in revs
    }

    assert len(commit_map) > 0
    return commit_map
def get_labels(self):
    """Label every commit by whether it was ever backed out and print the totals.

    Returns:
        A ``(classes, [0, 1])`` tuple where ``classes`` maps each commit node
        to ``1`` when its ``"ever_backedout"`` field is truthy and to ``0``
        otherwise.
    """
    classes = {
        commit_data["node"]: (1 if commit_data["ever_backedout"] else 0)
        for commit_data in repository.get_commits()
    }

    # Labels are only ever 0 or 1, so their sum is the number of backouts.
    backed_out_count = sum(classes.values())
    print("{} commits were backed out".format(backed_out_count))
    print("{} commits were not backed out".format(len(classes) - backed_out_count))

    return classes, [0, 1]
def update_commit_db(self):
    """Clone the repository and bring the commits DB up to date.

    The commits DB is re-downloaded when the local copy is reported as an old
    version or does not exist; it is then extended with the commits following
    the last one it contains.
    """
    repository.clone(self.repo_dir)

    needs_download = db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    )
    if needs_download:
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    # Exhaust the iterator so that `commit` is bound to the last stored commit.
    # NOTE: with an empty DB `commit` stays unbound and the call below raises
    # UnboundLocalError.
    for commit in repository.get_commits():
        pass

    repository.download_commits(self.repo_dir, f"children({commit['node']})")
def get_author_ids():
    """Return the set of distinct ``"author_email"`` values across all commits."""
    return {commit["author_email"] for commit in repository.get_commits()}