def test_download_version(tmp_path):
    url_zst = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.zst"
    url_version = "https://index.taskcluster.net/v1/task/project.relman.bugbug.data_commits.latest/artifacts/public/prova.json.version"

    db_path = tmp_path / "prova.json"
    db.register(db_path, url_zst, 1, support_files=[])

    responses.add(responses.HEAD, url_version, status=200, headers={"ETag": "123"})
    responses.add(responses.GET, url_version, status=200, body="42")

    db.download_version(db_path)

    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version"))
    assert os.path.exists(db_path.with_suffix(db_path.suffix + ".version.etag"))

    assert not db.is_old_version(db_path)

    db.register(db_path, url_zst, 43, support_files=[])

    assert db.is_old_version(db_path)
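# NOTE (illustrative, not part of the original test module): the responses.add()
# calls above only intercept HTTP requests while the `responses` mock is active.
# A minimal sketch, assuming the decorator form is used (a shared pytest fixture
# would work equally well); the URL below is a placeholder.
import responses


@responses.activate
def test_version_mock_sketch():
    responses.add(
        responses.GET,
        "https://example.org/prova.json.version",
        status=200,
        body="42",
    )
    # ... call db.download_version() here, as in test_download_version above ...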
def update_commit_db(self):
    repository.clone(self.repo_dir)

    db.download_version(repository.COMMITS_DB)
    if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    # Find the last commit already stored in the DB, so we only download its descendants.
    for commit in repository.get_commits():
        pass

    rev_start = "children({})".format(commit["node"])

    repository.download_commits(self.repo_dir, rev_start)
def retrieve_commits(self):
    shared_dir = self.repo_dir + "-shared"
    cmd = hglib.util.cmdbuilder(
        "robustcheckout",
        "https://hg.mozilla.org/mozilla-central",
        self.repo_dir,
        purge=True,
        sharebase=shared_dir,
        networkattempts=7,
        branch=b"tip",
    )

    cmd.insert(0, hglib.HGPATH)

    proc = hglib.util.popen(cmd)
    out, err = proc.communicate()
    if proc.returncode:
        raise hglib.error.CommandError(cmd, proc.returncode, out, err)

    logger.info("mozilla-central cloned")

    try:
        os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
    except FileNotFoundError:
        logger.info("pushlog database doesn't exist")

    # Pull and update, to make sure the pushlog is generated.
    hg = hglib.open(self.repo_dir)
    hg.pull(update=True)
    hg.close()

    db.download_version(repository.COMMITS_DB)
    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
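# Standalone illustration of what hglib.util.cmdbuilder assembles for the
# robustcheckout invocation above (the target directories are placeholders):
import hglib

cmd = hglib.util.cmdbuilder(
    "robustcheckout",
    "https://hg.mozilla.org/mozilla-central",
    "/tmp/mozilla-central",
    purge=True,
    sharebase="/tmp/mozilla-central-shared",
    networkattempts=7,
    branch=b"tip",
)
# Roughly equivalent to running:
#   hg robustcheckout https://hg.mozilla.org/mozilla-central /tmp/mozilla-central \
#      --purge --sharebase /tmp/mozilla-central-shared --networkattempts 7 --branch tip
print(cmd)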
def __init__(self):
    self.model_class = RegressorModel

    self.repo_dir = get_login_info()["repo_dir"]

    if not os.path.exists(self.repo_dir):
        cmd = hglib.util.cmdbuilder(
            "robustcheckout",
            "https://hg.mozilla.org/mozilla-central",
            self.repo_dir,
            purge=True,
            sharebase=self.repo_dir + "-shared",
            networkattempts=7,
            branch=b"tip",
        )

        cmd.insert(0, hglib.HGPATH)

        proc = hglib.util.popen(cmd)
        out, err = proc.communicate()
        if proc.returncode:
            raise hglib.error.CommandError(cmd, proc.returncode, out, err)

        logger.info("mozilla-central cloned")

        # Remove pushlog DB to make sure it's regenerated.
        try:
            os.remove(os.path.join(self.repo_dir, ".hg", "pushlog2.db"))
        except FileNotFoundError:
            logger.info("pushlog database doesn't exist")

    logger.info("Pulling and updating mozilla-central")
    with hglib.open(self.repo_dir) as hg:
        hg.pull(update=True)
    logger.info("mozilla-central pulled and updated")

    db.download_version(repository.COMMITS_DB)
    if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True, support_files_too=True)

    super().__init__()
    self.model = self.model_class.load(self.retrieve_model())
def retrieve_commits(self):
    repository.clone(self.repo_dir)

    db.download_version(repository.COMMITS_DB)
    if not db.is_old_version(repository.COMMITS_DB):
        db.download(repository.COMMITS_DB, support_files_too=True)

        for commit in repository.get_commits():
            pass

        rev_start = f"children({commit['node']})"
    else:
        rev_start = 0

    repository.download_commits(self.repo_dir, rev_start)

    logger.info("commit data extracted from repository")

    self.compress_file("data/commits.json")
    self.compress_file("data/commit_experiences.pickle")
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    # Get the commits DB, as we need it to get the bug IDs linked to recent commits.
    db.download_version(repository.COMMITS_DB)
    if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    # Get IDs of bugs linked to commits (used for some commit-based models, e.g. backout and regressor).
    start_date = datetime.now() - relativedelta(years=2, months=6)
    commit_bug_ids = [
        commit["bug_id"]
        for commit in repository.get_commits()
        if commit["bug_id"]
        and dateutil.parser.parse(commit["pushdate"]) >= start_date
    ]
    logger.info(f"{len(commit_bug_ids)} bugs linked to commits to download.")

    # Get IDs of bugs which caused regressions fixed by commits (useful for the regressor model).
    regressed_by_bug_ids = sum(
        [
            bug["regressed_by"]
            for bug in bugzilla.get_bugs()
            if bug["id"] in commit_bug_ids
        ],
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    all_ids = (
        timespan_ids + labelled_bug_ids + commit_bug_ids + regressed_by_bug_ids
    )
    all_ids_set = set(all_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids_set
    )

    bugzilla.download_bugs(all_ids)

    # Get regressed_by_bug_ids again (the set could have changed after downloading new bugs).
    regressed_by_bug_ids = sum(
        [
            bug["regressed_by"]
            for bug in bugzilla.get_bugs()
            if bug["id"] in commit_bug_ids
        ],
        [],
    )
    logger.info(
        f"{len(regressed_by_bug_ids)} bugs which caused regressions fixed by commits."
    )

    bugzilla.download_bugs(regressed_by_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")
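# The sum([...], []) construct used above for regressed_by_bug_ids is just a
# list-of-lists flattener; a tiny self-contained illustration (values are
# hypothetical bug IDs):
regressed_by_lists = [[101, 102], [], [103]]
assert sum(regressed_by_lists, []) == [101, 102, 103]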
def retrieve_bugs(self):
    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))

    db.download_version(bugzilla.BUGS_DB)
    if not db.is_old_version(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB)

    # Get IDs of bugs changed since last run.
    last_modified = db.last_modified(bugzilla.BUGS_DB)
    logger.info(
        f"Retrieving IDs of bugs modified since the last run on {last_modified}"
    )
    changed_ids = bugzilla.get_ids(
        {"f1": "delta_ts", "o1": "greaterthaneq", "v1": last_modified.date()}
    )
    logger.info(f"Retrieved {len(changed_ids)} IDs.")

    # Get IDs of bugs between (two years and six months ago) and (six months ago).
    six_months_ago = datetime.utcnow() - relativedelta(months=6)
    two_years_and_six_months_ago = six_months_ago - relativedelta(years=2)
    logger.info(
        f"Retrieving bug IDs from {two_years_and_six_months_ago} to {six_months_ago}"
    )
    timespan_ids = bugzilla.get_ids_between(
        two_years_and_six_months_ago, six_months_ago
    )
    logger.info(f"Retrieved {len(timespan_ids)} IDs.")

    # Get IDs of labelled bugs.
    labelled_bug_ids = labels.get_all_bug_ids()
    logger.info(f"{len(labelled_bug_ids)} labelled bugs to download.")

    all_ids = set(timespan_ids + labelled_bug_ids)

    # We have to redownload bugs that were changed since the last download.
    # We can remove from the DB the bugs that are outside of the considered timespan and are not labelled.
    bugzilla.delete_bugs(
        lambda bug: bug["id"] in changed_ids or bug["id"] not in all_ids
    )

    bugzilla.download_bugs(timespan_ids + labelled_bug_ids)

    # Try to re-download inconsistent bugs, up to three times.
    inconsistent_bugs = bugzilla.get_bugs()
    for i in range(3):
        # We look for inconsistencies in all bugs first, then, on following passes,
        # we only look for inconsistencies in bugs that were found to be inconsistent in the first pass.
        inconsistent_bugs = bug_snapshot.get_inconsistencies(inconsistent_bugs)
        inconsistent_bug_ids = set(bug["id"] for bug in inconsistent_bugs)

        if len(inconsistent_bug_ids) == 0:
            break

        logger.info(
            f"Re-downloading {len(inconsistent_bug_ids)} bugs, as they were inconsistent"
        )
        bugzilla.delete_bugs(lambda bug: bug["id"] in inconsistent_bug_ids)
        bugzilla.download_bugs(inconsistent_bug_ids)

    self.compress_file("data/bugs.json")
def find_bug_introducing_commits(self, bug_fixing_commits, commits_to_ignore, tokenized):
    if tokenized:
        db_path = TOKENIZED_BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.tokenized_git_repo_dir
    else:
        db_path = BUG_INTRODUCING_COMMITS_DB
        repo_dir = self.git_repo_dir

    def git_to_mercurial(rev):
        if tokenized:
            return self.tokenized_git_to_mercurial[rev]
        else:
            return vcs_map.git_to_mercurial(rev)

    def mercurial_to_git(rev):
        if tokenized:
            return self.mercurial_to_tokenized_git[rev]
        else:
            return vcs_map.mercurial_to_git(rev)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(db_path)
    if db.is_old_version(db_path) or not os.path.exists(db_path):
        db.download(db_path, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(db_path))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    hashes_to_ignore = set(commit["rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(
            "{}\n".format(mercurial_to_git(commit["rev"]))
            for commit in commits_to_ignore
            if not tokenized or commit["rev"] in self.mercurial_to_tokenized_git
        )

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in prev_bug_introducing_commits_nodes
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["rev"] not in hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    if tokenized:
        bug_fixing_commits = [
            bug_fixing_commit
            for bug_fixing_commit in bug_fixing_commits
            if bug_fixing_commit["rev"] in self.mercurial_to_tokenized_git
        ]
        logger.info(
            f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones with no git hash"
        )

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        git_fix_revision = mercurial_to_git(bug_fixing_commit["rev"])
        logger.info(f"Analyzing {git_fix_revision}...")

        commit = GIT_REPO.get_commit(git_fix_revision)

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_rev": bug_fixing_commit["rev"],
                        "bug_introducing_rev": git_to_mercurial(bug_introducing_hash),
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_rev": bug_fixing_commit["rev"],
                    "bug_introducing_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(db_path, bug_introducing_commits)
    compress_file(db_path)
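# Minimal standalone sketch of the pydriller calls driving find_bic above
# (repository path and revision are placeholders; the GitRepository API shown is
# the pydriller 1.x interface this code assumes):
from pydriller import GitRepository

repo = GitRepository("path/to/gecko-dev")
fix_commit = repo.get_commit("deadbeefcafe")
# Maps each modified file to the set of commits that last touched its changed lines.
candidates = repo.get_commits_last_modified_lines(fix_commit)
for path, hashes in candidates.items():
    print(path, sorted(hashes))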
def find_bug_fixing_commits(self):
    logger.info("Downloading commits database...")
    db.download_version(repository.COMMITS_DB)
    if db.is_old_version(repository.COMMITS_DB) or not os.path.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    logger.info("Downloading bugs database...")
    db.download_version(bugzilla.BUGS_DB)
    if db.is_old_version(bugzilla.BUGS_DB) or not os.path.exists(bugzilla.BUGS_DB):
        db.download(bugzilla.BUGS_DB, force=True)

    logger.info("Download previous classifications...")
    db.download_version(BUG_FIXING_COMMITS_DB)
    if db.is_old_version(BUG_FIXING_COMMITS_DB) or not os.path.exists(
        BUG_FIXING_COMMITS_DB
    ):
        db.download(BUG_FIXING_COMMITS_DB, force=True)

    logger.info("Get previously classified commits...")
    prev_bug_fixing_commits = list(db.read(BUG_FIXING_COMMITS_DB))
    prev_bug_fixing_commits_nodes = set(
        bug_fixing_commit["rev"] for bug_fixing_commit in prev_bug_fixing_commits
    )
    logger.info(f"Already classified {len(prev_bug_fixing_commits)} commits...")

    # TODO: Switch to the pure Defect model, as it's better in this case.
    logger.info("Downloading defect/enhancement/task model...")
    download_model("defectenhancementtask")
    defect_model = DefectEnhancementTaskModel.load("defectenhancementtaskmodel")

    logger.info("Downloading regression model...")
    download_model("regression")
    regression_model = RegressionModel.load("regressionmodel")

    start_date = datetime.now() - RELATIVE_START_DATE
    end_date = datetime.now() - RELATIVE_END_DATE
    logger.info(
        f"Gathering bug IDs associated to commits (since {start_date} and up to {end_date})..."
    )
    commit_map = defaultdict(list)
    for commit in repository.get_commits():
        if commit["node"] in prev_bug_fixing_commits_nodes:
            continue

        commit_date = dateutil.parser.parse(commit["pushdate"])
        if commit_date < start_date or commit_date > end_date:
            continue

        commit_map[commit["bug_id"]].append(commit["node"])

    logger.info(
        f"{sum(len(commit_list) for commit_list in commit_map.values())} commits found, {len(commit_map)} bugs linked to commits"
    )
    assert len(commit_map) > 0

    def get_relevant_bugs():
        return (bug for bug in bugzilla.get_bugs() if bug["id"] in commit_map)

    bug_count = sum(1 for bug in get_relevant_bugs())
    logger.info(
        f"{bug_count} bugs in total, {len(commit_map) - bug_count} bugs linked to commits missing"
    )

    known_defect_labels = defect_model.get_labels()
    known_regression_labels = regression_model.get_labels()

    bug_fixing_commits = []

    def append_bug_fixing_commits(bug_id, type_):
        for commit in commit_map[bug_id]:
            bug_fixing_commits.append({"rev": commit, "type": type_})

    for bug in tqdm(get_relevant_bugs(), total=bug_count):
        # Ignore bugs which are not linked to the commits we care about.
        if bug["id"] not in commit_map:
            continue

        # If we know the label already, we don't need to apply the model.
        if (
            bug["id"] in known_regression_labels
            and known_regression_labels[bug["id"]] == 1
        ):
            append_bug_fixing_commits(bug["id"], "r")
            continue

        if bug["id"] in known_defect_labels:
            if known_defect_labels[bug["id"]] == "defect":
                append_bug_fixing_commits(bug["id"], "d")
            else:
                append_bug_fixing_commits(bug["id"], "e")
            continue

        if defect_model.classify(bug)[0] == "defect":
            if regression_model.classify(bug)[0] == 1:
                append_bug_fixing_commits(bug["id"], "r")
            else:
                append_bug_fixing_commits(bug["id"], "d")
        else:
            append_bug_fixing_commits(bug["id"], "e")

    db.append(BUG_FIXING_COMMITS_DB, bug_fixing_commits)
    compress_file(BUG_FIXING_COMMITS_DB)

    bug_fixing_commits = prev_bug_fixing_commits + bug_fixing_commits

    return [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["type"] in ["r", "d"]
    ]
def find_bug_introducing_commits(cache_dir, git_repo_dir):
    mercurial_repo_dir = os.path.join(cache_dir, "mozilla-central")

    logger.info("Downloading Mercurial <-> git mapping file...")
    vcs_map.download_mapfile()

    logger.info(f"Cloning mercurial repository to {mercurial_repo_dir}...")
    repository.clone(mercurial_repo_dir)

    logger.info(f"Cloning git repository to {git_repo_dir}...")
    clone_gecko_dev(git_repo_dir)

    logger.info("Download previously found bug-introducing commits...")
    db.download_version(BUG_INTRODUCING_COMMITS_DB)
    if db.is_old_version(BUG_INTRODUCING_COMMITS_DB) or not os.path.exists(
        BUG_INTRODUCING_COMMITS_DB
    ):
        db.download(BUG_INTRODUCING_COMMITS_DB, force=True)

    logger.info("Get previously found bug-introducing commits...")
    prev_bug_introducing_commits = list(db.read(BUG_INTRODUCING_COMMITS_DB))
    prev_bug_introducing_commits_nodes = set(
        bug_introducing_commit["bug_fixing_mercurial_rev"]
        for bug_introducing_commit in prev_bug_introducing_commits
    )
    logger.info(f"Already classified {len(prev_bug_introducing_commits)} commits...")

    commits_to_ignore = get_commits_to_ignore(mercurial_repo_dir)

    git_hashes_to_ignore = set(commit["git_rev"] for commit in commits_to_ignore)

    with open("git_hashes_to_ignore", "w") as f:
        f.writelines(f"{git_hash}\n" for git_hash in git_hashes_to_ignore)

    bug_fixing_commits = find_bug_fixing_commits()

    logger.info(f"{len(bug_fixing_commits)} commits to analyze")

    # Skip already found bug-introducing commits.
    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["mercurial_rev"] not in prev_bug_introducing_commits_nodes
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping already analyzed ones"
    )

    bug_fixing_commits = [
        bug_fixing_commit
        for bug_fixing_commit in bug_fixing_commits
        if bug_fixing_commit["git_rev"] not in git_hashes_to_ignore
    ]
    logger.info(
        f"{len(bug_fixing_commits)} commits left to analyze after skipping the ones in the ignore list"
    )

    def _init(git_repo_dir):
        global GIT_REPO
        GIT_REPO = GitRepository(git_repo_dir)

    def find_bic(bug_fixing_commit):
        logger.info("Analyzing {}...".format(bug_fixing_commit["git_rev"]))

        commit = GIT_REPO.get_commit(bug_fixing_commit["git_rev"])

        # Skip huge changes, we'll likely be wrong with them.
        if len(commit.modifications) > MAX_MODIFICATION_NUMBER:
            return [None]

        bug_introducing_modifications = GIT_REPO.get_commits_last_modified_lines(
            commit, hashes_to_ignore_path=os.path.realpath("git_hashes_to_ignore")
        )
        logger.info(bug_introducing_modifications)

        bug_introducing_commits = []
        for bug_introducing_hashes in bug_introducing_modifications.values():
            for bug_introducing_hash in bug_introducing_hashes:
                bug_introducing_commits.append(
                    {
                        "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                        "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                        "bug_introducing_mercurial_rev": vcs_map.git_to_mercurial(
                            bug_introducing_hash
                        ),
                        "bug_introducing_git_rev": bug_introducing_hash,
                    }
                )

        # Add an empty result, just so that we don't reanalyze this again.
        if len(bug_introducing_commits) == 0:
            bug_introducing_commits.append(
                {
                    "bug_fixing_mercurial_rev": bug_fixing_commit["mercurial_rev"],
                    "bug_fixing_git_rev": bug_fixing_commit["git_rev"],
                    "bug_introducing_mercurial_rev": "",
                    "bug_introducing_git_rev": "",
                }
            )

        return bug_introducing_commits

    with concurrent.futures.ThreadPoolExecutor(
        initializer=_init, initargs=(git_repo_dir,), max_workers=os.cpu_count() + 1
    ) as executor:
        bug_introducing_commits = executor.map(find_bic, bug_fixing_commits)
        bug_introducing_commits = tqdm(
            bug_introducing_commits, total=len(bug_fixing_commits)
        )
        bug_introducing_commits = list(
            itertools.chain.from_iterable(bug_introducing_commits)
        )

    total_results_num = len(bug_introducing_commits)
    bug_introducing_commits = list(filter(None, bug_introducing_commits))
    logger.info(
        f"Skipped {total_results_num - len(bug_introducing_commits)} commits as they were too big"
    )

    db.append(BUG_INTRODUCING_COMMITS_DB, bug_introducing_commits)
    compress_file(BUG_INTRODUCING_COMMITS_DB)
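# Self-contained sketch of the executor pattern used above: an initializer sets a
# per-worker global once, then executor.map() streams results lazily (names here
# are illustrative, not bugbug's):
import concurrent.futures
import os

WORKER_STATE = None


def _init(value):
    global WORKER_STATE
    WORKER_STATE = value


def work(item):
    return (WORKER_STATE, item)


with concurrent.futures.ThreadPoolExecutor(
    initializer=_init, initargs=("shared",), max_workers=os.cpu_count() + 1
) as executor:
    results = list(executor.map(work, range(3)))

print(results)  # [('shared', 0), ('shared', 1), ('shared', 2)]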