def __init__(self, cache_root):
    self.cache_root = cache_root
    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")
        zstd_decompress("regressormodel")
        assert os.path.exists("regressormodel"), "Decompressed model exists"

    if not os.path.exists("regressormodel_data_X"):
        download_check_etag(URL, "regressormodel_data_X.zst")
        zstd_decompress("regressormodel_data_X")
        assert os.path.exists(
            "regressormodel_data_X"
        ), "Decompressed X dataset exists"

    if not os.path.exists("regressormodel_data_y"):
        download_check_etag(URL, "regressormodel_data_y.zst")
        zstd_decompress("regressormodel_data_y")
        assert os.path.exists(
            "regressormodel_data_y"
        ), "Decompressed y dataset exists"

    self.model = RegressorModel.load("regressormodel")

    # We use "clean" commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" commits.
    X = joblib.load("regressormodel_data_X")
    y = joblib.load("regressormodel_data_y")
    self.background_dataset = X[y == 0]
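# Illustration only (not part of the original code): a background dataset of "clean"
# commits like the one built above is the kind of reference set typically passed to a
# SHAP explainer, so that feature attributions for a new commit are computed relative
# to clean commits. The names `classifier`, `.clf`, and `commit_features` below are
# assumptions for this sketch, not confirmed parts of this codebase.
import shap

explainer = shap.TreeExplainer(classifier.model.clf, classifier.background_dataset)
shap_values = explainer.shap_values(commit_features)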
def __init__(self, cache_root):
    self.cache_root = cache_root
    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")
        zstd_decompress("regressormodel")
        assert os.path.exists("regressormodel"), "Decompressed file exists"

    self.model = RegressorModel.load("regressormodel")
def __init__(self, cache_root):
    self.cache_root = cache_root
    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")
        dctx = zstandard.ZstdDecompressor()
        with open("regressormodel.zst", "rb") as input_f:
            with open("regressormodel", "wb") as output_f:
                dctx.copy_stream(input_f, output_f)
        assert os.path.exists("regressormodel"), "Decompressed file exists"

    self.model = RegressorModel.load("regressormodel")
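# For reference: the zstd_decompress helper called in the other variants is presumably
# equivalent to the inline zstandard usage above. A minimal sketch, assuming the helper
# takes the destination path and reads the "<path>.zst" file next to it; this is an
# inferred shape, not the confirmed implementation.
import zstandard


def zstd_decompress(path):
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as input_f:
        with open(path, "wb") as output_f:
            dctx.copy_stream(input_f, output_f)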
def __init__(self, cache_root, git_repo_dir, method_defect_predictor_dir):
    self.cache_root = cache_root
    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    regressormodel_path = "regressormodel"
    if not os.path.exists(regressormodel_path):
        download_check_etag(
            URL.format(f"{regressormodel_path}.zst"), f"{regressormodel_path}.zst"
        )
        zstd_decompress(regressormodel_path)
        assert os.path.exists(regressormodel_path), "Decompressed model exists"

    regressormodel_data_X_path = "regressormodel_data_X"
    if not os.path.exists(regressormodel_data_X_path):
        download_check_etag(
            URL.format(f"{regressormodel_data_X_path}.zst"),
            f"{regressormodel_data_X_path}.zst",
        )
        zstd_decompress(regressormodel_data_X_path)
        assert os.path.exists(
            regressormodel_data_X_path
        ), "Decompressed X dataset exists"

    regressormodel_data_y_path = "regressormodel_data_y"
    if not os.path.exists(regressormodel_data_y_path):
        download_check_etag(
            URL.format(f"{regressormodel_data_y_path}.zst"),
            f"{regressormodel_data_y_path}.zst",
        )
        zstd_decompress(regressormodel_data_y_path)
        assert os.path.exists(
            regressormodel_data_y_path
        ), "Decompressed y dataset exists"

    self.model = RegressorModel.load(regressormodel_path)

    self.X = to_array(joblib.load(regressormodel_data_X_path))
    self.y = to_array(joblib.load(regressormodel_data_y_path))

    self.method_defect_predictor_dir = method_defect_predictor_dir
    self.clone_git_repo(
        "https://github.com/lucapascarella/MethodDefectPredictor",
        method_defect_predictor_dir,
        "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
    )
    self.git_repo_dir = git_repo_dir
    self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit already in the DB, then download only its children.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )
def __init__(self, repo_dir: str) -> None:
    self.risk_bands = sorted(
        (
            parse_risk_band(risk_band)
            for risk_band in get_secret("REGRESSOR_RISK_BANDS").split(";")
        ),
        key=lambda x: x[1],
    )

    repository.clone(repo_dir)

    logger.info("Downloading commits database...")
    assert db.download(repository.COMMITS_DB, support_files_too=True)

    logger.info("Updating commits DB...")
    # Iterate to the last commit already in the DB, then download only its children.
    for commit in repository.get_commits():
        pass

    repository.download_commits(
        repo_dir,
        rev_start="children({})".format(commit["node"]),
    )

    # Some commits that were already in the DB from the previous run might need
    # to be updated (e.g. coverage information).
    repository.update_commits()

    logger.info("Downloading revisions database...")
    assert db.download(phabricator.REVISIONS_DB)

    logger.info("Downloading bugs database...")
    assert db.download(bugzilla.BUGS_DB)

    logger.info("Downloading commit classifications...")
    assert db.download(BUG_FIXING_COMMITS_DB)

    self.regressor_model = cast(
        RegressorModel, RegressorModel.load(download_model("regressor"))
    )

    bugzilla.set_token(get_secret("BUGZILLA_TOKEN"))
    phabricator.set_api_key(
        get_secret("PHABRICATOR_URL"), get_secret("PHABRICATOR_TOKEN")
    )

    self.path_to_component = repository.get_component_mapping()

    self.past_regressions_by = {}
    self.past_fixed_bugs_by = {}
    self.past_regression_blocked_bugs_by = {}
    self.past_fixed_bug_blocked_bugs_by = {}

    for dimension in ["component", "directory", "file", "function"]:
        self.past_regressions_by[dimension] = _download_past_bugs(
            PAST_REGRESSIONS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_regression_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_REGRESSION_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
        self.past_fixed_bug_blocked_bugs_by[dimension] = _download_past_bugs(
            PAST_FIXED_BUG_BLOCKED_BUGS_BY_URL.format(dimension=dimension)
        )
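# Sketch only: parse_risk_band is not shown in this section. Given that the risk bands
# come from a ";"-separated secret and the result is sorted by x[1], a plausible shape
# is "NAME-START-END" with numeric bounds. The helper below is an assumption for
# illustration, not the confirmed implementation.
def parse_risk_band(risk_band: str) -> tuple[str, float, float]:
    name, start, end = risk_band.split("-")
    return name, float(start), float(end)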