def classify_bugs(model_name, classifier, bug_id):
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")
        try:
            download_check_etag(
                f"https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst",
                f"{model_file_name}.zst",
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]}'
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )

            with open("importance.html", "w") as f:
                f.write(importance["html"])
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        if np.argmax(probas) == 1:
            print(f"Positive! {probas}")
        else:
            print(f"Negative! {probas}")

        # Wait for a key press before classifying the next bug.
        input()
def download_model(model_url, file_path):
    logger.info(f"Downloading model from {model_url!r} and saving it in {file_path!r}")
    download_check_etag(model_url)
    zstd_decompress(file_path)
    logger.info(f"Written model in {file_path}")
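# Illustrative call (not from the original source; the URL and file name are
# hypothetical): model_url points at the compressed .zst artifact, while
# file_path is the name of the decompressed model on disk.
download_model(
    "https://example.com/artifacts/defectmodel.zst",  # hypothetical URL
    "defectmodel",  # hypothetical local file name
)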
def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset doesn't exist"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset doesn't exist"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)

        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True

        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label")

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
def main(args):
    model_file_name = f"{similarity.model_name_to_class[args.algorithm].__name__.lower()}.similaritymodel"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model...")
        try:
            download_check_etag(URL.format(model_file_name))
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model = similarity.model_name_to_class[args.algorithm].load(model_file_name)

    bug_ids = model.get_similar_bugs(bugzilla.get(args.bug_id)[args.bug_id])

    bugs = {}
    for bug in bugzilla.get_bugs():
        if bug["id"] in bug_ids or bug["id"] == args.bug_id:
            bugs[bug["id"]] = bug

    print("{}: {}".format(args.bug_id, bugs[args.bug_id]["summary"]))
    for bug_id in bug_ids:
        print("{}: {}".format(bug_id, bugs[bug_id]["summary"]))
def test_download_check_etag_unchanged():
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug/prova.txt"

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={"ETag": "123", "Last-Modified": "2019-04-16"},
    )

    responses.add(responses.GET, url, status=200, body="prova")

    responses.add(
        responses.HEAD,
        url,
        status=200,
        headers={"ETag": "123", "Last-Modified": "2019-04-16"},
    )

    responses.add(responses.GET, url, status=200, body="prova2")

    utils.download_check_etag(url, "prova.txt")

    assert os.path.exists("prova.txt")
    with open("prova.txt", "r") as f:
        assert f.read() == "prova"

    # The ETag is unchanged, so the second download is skipped and the file
    # keeps its original contents ("prova", not "prova2").
    utils.download_check_etag(url, "prova.txt")

    assert os.path.exists("prova.txt")
    with open("prova.txt", "r") as f:
        assert f.read() == "prova"
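# A rough sketch of the caching behavior the test above exercises. This is an
# assumption inferred from the test, not the actual utils.download_check_etag
# implementation: compare the remote ETag against the one saved alongside the
# file, and skip the GET when they match.
import os

import requests


def download_if_etag_changed(url: str, path: str) -> bool:
    head = requests.head(url, allow_redirects=True)
    head.raise_for_status()
    etag = head.headers.get("ETag")

    etag_path = f"{path}.etag"
    if os.path.exists(path) and os.path.exists(etag_path):
        with open(etag_path, "r") as f:
            if f.read() == etag:
                # Unchanged since the last download: keep the existing file.
                return False

    response = requests.get(url)
    response.raise_for_status()
    with open(path, "wb") as f:
        f.write(response.content)

    if etag is not None:
        with open(etag_path, "w") as f:
            f.write(etag)

    # Downloaded (or re-downloaded after an ETag change).
    return True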
def test_download_check_missing():
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug/prova.txt"

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={"ETag": "123", "Last-Modified": "2019-04-16"},
    )

    responses.add(
        responses.GET, url, status=404, body=requests.exceptions.HTTPError("HTTP error")
    )

    url_fallback = url.replace(
        "https://community-tc.services.mozilla.com/api/index",
        "https://index.taskcluster.net",
    )

    responses.add(
        responses.HEAD,
        url_fallback,
        status=404,
        headers={"ETag": "123"},
    )

    responses.add(
        responses.GET,
        url_fallback,
        status=404,
        body=requests.exceptions.HTTPError("HTTP error"),
    )

    with pytest.raises(requests.exceptions.HTTPError, match="HTTP error"):
        utils.download_check_etag(url, "prova.txt")

    assert not os.path.exists("prova.txt")
def download_model(model_name):
    if not os.path.exists(f"{model_name}model"):
        url = BASE_URL.format(model_name=model_name)
        logger.info(f"Downloading {url}...")
        download_check_etag(url, f"{model_name}model.zst")
        zstd_decompress(f"{model_name}model")
        assert os.path.exists(f"{model_name}model"), "Decompressed file doesn't exist"
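# Illustrative usage, assuming BASE_URL is this module's artifact URL template
# and "defect" is a trained model name: the first call downloads and
# decompresses "defectmodel"; later calls find the file and do nothing.
download_model("defect")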
def test_download_check_etag_fallback(tmp_path):
    url = "https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug/prova.txt"

    responses.add(
        responses.HEAD,
        url,
        status=404,
        headers={"ETag": "123", "Last-Modified": "2019-04-16"},
    )

    responses.add(
        responses.GET, url, status=404, body=requests.exceptions.HTTPError("HTTP error")
    )

    url_fallback = url.replace(
        "https://community-tc.services.mozilla.com/api/index",
        "https://index.taskcluster.net",
    )

    responses.add(
        responses.HEAD,
        url_fallback,
        status=200,
        headers={"ETag": "123"},
    )

    responses.add(responses.GET, url_fallback, status=200, body="prova")

    utils.download_check_etag(url, "prova.txt")

    assert os.path.exists("prova.txt")
    with open("prova.txt", "r") as f:
        assert f.read() == "prova"
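# A rough sketch of the host fallback the two 404 tests above exercise; again
# an assumption based on the tests rather than the real implementation. When
# the community-tc URL fails, the same artifact is retried on the legacy
# index.taskcluster.net host before giving up.
def download_with_fallback(url: str, path: str) -> bool:
    try:
        return download_if_etag_changed(url, path)  # sketch defined above
    except requests.exceptions.HTTPError:
        fallback_url = url.replace(
            "https://community-tc.services.mozilla.com/api/index",
            "https://index.taskcluster.net",
        )
        # If the fallback 404s too, this re-raises and no file is written,
        # matching test_download_check_missing.
        return download_if_etag_changed(fallback_url, path)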
def _download_past_bugs(url: str) -> dict:
    path = os.path.join("data", os.path.basename(url)[:-4])
    download_check_etag(url, path=f"{path}.zst")
    zstd_decompress(path)
    assert os.path.exists(path)
    with open(path, "r") as f:
        return json.load(f)
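# Illustrative usage (the URL is hypothetical): the [:-4] slice above strips
# the ".zst" suffix, so the URL must point at a .zst-compressed JSON file.
past_bugs = _download_past_bugs("https://example.com/past_fixed_bugs.json.zst")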
def retrieve_push_data(self):
    # Download previous cache.
    cache_path = os.path.abspath("data/adr_cache")
    if not os.path.exists(cache_path):
        cache_available = True
        try:
            download_check_etag(ADR_CACHE_URL, "adr_cache.tar.xz")
        except requests.exceptions.HTTPError:
            logger.info("The adr cache is not available yet, trying fallback...")
            try:
                download_check_etag(OLD_ADR_CACHE_URL, "adr_cache.tar.xz")
            except requests.exceptions.HTTPError:
                logger.info("The adr cache is not available yet")
                cache_available = False

        if cache_available:
            with tarfile.open("adr_cache.tar.xz", "r:xz") as tar:
                tar.extractall()
            assert os.path.exists("data/adr_cache"), "Decompressed adr cache doesn't exist"

    # Set up the adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
        f.write(
            f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
"""
        )

    # We'll use the past TRAINING_MONTHS months only for training the model,
    # but we use 3 months more than that to calculate the failure statistics.
    subprocess.run(
        [
            "run-adr",
            "ahal/ci-recipes",
            "recipe",
            "-o",
            os.path.abspath("push_data.json"),
            "-f",
            "json",
            "push_data",
            "--",
            "--from",
            f"today-{TRAINING_MONTHS + 3}month",
            "--to",
            "today-2day",
            "--branch",
            "autoland",
        ],
        check=True,
        stdout=subprocess.DEVNULL,  # Redirect to /dev/null, as the logs are too big otherwise.
    )

    with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
        tar.add("data/adr_cache")

    zstd_compress("push_data.json")
def download_support_file(path, file_name):
    url = urljoin(DATABASES[path]["url"], file_name)
    path = os.path.join(os.path.dirname(path), file_name)
    print(f"Downloading {url} to {path}")
    utils.download_check_etag(url, path)
    if path.endswith(".xz"):
        extract_file(path[:-3])
def __init__(self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir):
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = self.load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        if not os.path.exists(model_data_X_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
            )
            zstd_decompress(model_data_X_path)
            assert os.path.exists(model_data_X_path), "Decompressed X dataset doesn't exist"

        model_data_y_path = f"{model_name}model_data_y"
        if not os.path.exists(model_data_y_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
            )
            zstd_decompress(model_data_y_path)
            assert os.path.exists(model_data_y_path), "Decompressed y dataset doesn't exist"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

    if model_name == "testselect":
        self.use_test_history = True

        assert db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        self.past_failures_data = test_scheduling.get_past_failures()

        self.backout_model = self.load_model("backout")
        assert self.backout_model is not None
def load_model(self, model_name):
    model_path = f"{model_name}model"
    if not os.path.exists(model_path):
        download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_path}.zst")
        )
        zstd_decompress(model_path)
        assert os.path.exists(model_path), "Decompressed model doesn't exist"

    return get_model_class(model_name).load(model_path)
def get_labels(file_name):
    path = os.path.join(get_labels_dir(), f"{file_name}.csv")

    if not os.path.exists(path) and file_name in LABELS_URLS:
        utils.download_check_etag(LABELS_URLS[file_name], path)

    with open(path, "r") as f:
        reader = csv.reader(f)
        next(reader)  # Skip the header row.
        yield from reader
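# Illustrative usage ("bug_nobug" is a hypothetical label file name): the
# generator yields CSV rows with the header row already skipped.
for label_row in get_labels("bug_nobug"):
    print(label_row)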
def download_model(model_name):
    if not os.path.exists(f"{model_name}model"):
        url = BASE_URL.format(model_name=model_name)
        logger.info(f"Downloading {url}...")
        download_check_etag(url, f"{model_name}model.zst")

        dctx = zstandard.ZstdDecompressor()
        with open(f"{model_name}model.zst", "rb") as input_f:
            with open(f"{model_name}model", "wb") as output_f:
                dctx.copy_stream(input_f, output_f)

        assert os.path.exists(f"{model_name}model"), "Decompressed file doesn't exist"
def __init__(self, cache_root):
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")
        zstd_decompress("regressormodel")
        assert os.path.exists("regressormodel"), "Decompressed file doesn't exist"

    self.model = RegressorModel.load("regressormodel")
def download_support_file(path, file_name):
    try:
        url = urljoin(DATABASES[path]["url"], file_name)
        path = os.path.join(os.path.dirname(path), file_name)
        print(f"Downloading {url} to {path}")
        utils.download_check_etag(url, path)

        if path.endswith(".zst"):
            extract_file(path)
    except requests.exceptions.HTTPError:
        print(f"{file_name} is not yet available to download for {path}")
def download_component_mapping():
    global path_to_component

    utils.download_check_etag(
        "https://index.taskcluster.net/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        "data/component_mapping.json",
    )

    with open("data/component_mapping.json", "r") as f:
        path_to_component = json.load(f)

    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }
def retrieve_test_scheduling_history(self):
    os.makedirs("data", exist_ok=True)

    # Download previous cache.
    cache_path = os.path.abspath("data/adr_cache")
    if not os.path.exists(cache_path):
        try:
            download_check_etag(URL, "data/adr_cache.tar.xz")
            with tarfile.open("data/adr_cache.tar.xz", "r:xz") as tar:
                tar.extractall()
            assert os.path.exists("data/adr_cache"), "Decompressed adr cache doesn't exist"
        except requests.exceptions.HTTPError:
            logger.info("The adr cache is not available yet")

    # Set up the adr cache configuration.
    os.makedirs(os.path.expanduser("~/.config/adr"), exist_ok=True)
    with open(os.path.expanduser("~/.config/adr/config.toml"), "w") as f:
        f.write(
            f"""[adr.cache.stores]
file = {{ driver = "file", path = "{cache_path}" }}
"""
        )

    # TODO: Increase timespan when https://github.com/ahal/ci-recipes/issues/6 is fixed.
    subprocess.run(
        [
            "run-adr",
            "ahal/ci-recipes",
            "recipe",
            "-o",
            os.path.abspath("data/test_scheduling_history.json"),
            "-f",
            "json",
            "push_data",
            "--",
            "--from",
            "today-3month",
            "--to",
            "today-2day",
            "--branch",
            "autoland",
        ],
        check=True,
    )

    zstd_compress("data/test_scheduling_history.json")

    with tarfile.open("data/adr_cache.tar.xz", "w:xz") as tar:
        tar.add("data/adr_cache")
def download_component_mapping():
    path_to_component = get_component_mapping(False)

    utils.download_check_etag(
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        "data/component_mapping.json",
    )

    with open("data/component_mapping.json", "r") as f:
        data = json.load(f)

    for path, component in data.items():
        path_to_component[path.encode("utf-8")] = "::".join(component).encode("utf-8")

    close_component_mapping()
def __init__(self, cache_root):
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")

        dctx = zstandard.ZstdDecompressor()
        with open("regressormodel.zst", "rb") as input_f:
            with open("regressormodel", "wb") as output_f:
                dctx.copy_stream(input_f, output_f)

        assert os.path.exists("regressormodel"), "Decompressed file doesn't exist"

    self.model = RegressorModel.load("regressormodel")
def download(path, support_files_too=False, extract=True):
    # If a DB with the current schema is not available yet, we can't download.
    if is_different_schema(path):
        return False

    zst_path = f"{path}.zst"

    url = DATABASES[path]["url"]
    try:
        logger.info(f"Downloading {url} to {zst_path}")
        updated = utils.download_check_etag(url, zst_path)

        if extract and updated:
            utils.extract_file(zst_path)
            os.remove(zst_path)

        successful = True
        if support_files_too:
            for support_file in DATABASES[path]["support_files"]:
                # All support file downloads must succeed for the overall
                # download to be considered successful.
                successful &= download_support_file(path, support_file, extract)

        return successful
    except requests.exceptions.HTTPError:
        logger.info(f"{url} is not yet available to download", exc_info=True)
        return False
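# Illustrative usage, assuming BUGS_DB is a path registered in DATABASES:
# download() returns False when no artifact with the current schema exists.
if not download(BUGS_DB, support_files_too=True):
    print("DB not available yet; it needs to be generated locally.")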
def download_component_mapping():
    global path_to_component

    if path_to_component is not None:
        return

    utils.download_check_etag(
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/gecko.v2.mozilla-central.latest.source.source-bugzilla-info/artifacts/public/components.json",
        "data/component_mapping.json",
    )

    with open("data/component_mapping.json", "r") as f:
        path_to_component = json.load(f)

    path_to_component = {
        path: "::".join(component) for path, component in path_to_component.items()
    }
def download(path, force=False, support_files_too=False):
    if os.path.exists(path) and not force:
        return

    xz_path = f"{path}.xz"

    # Only download if the xz file is not there yet.
    if not os.path.exists(xz_path) or force:
        url = DATABASES[path]["url"]
        print(f"Downloading {url} to {xz_path}")
        utils.download_check_etag(url, xz_path)

    extract_file(path)

    if support_files_too:
        for support_file in DATABASES[path]["support_files"]:
            download_support_file(path, support_file)
def download_coverage_mapping() -> None:
    commit_to_coverage = get_coverage_mapping(False)

    utils.download_check_etag(
        "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/commit_coverage.json.zst",
        "data/coverage_mapping.json.zst",
    )

    zstd_decompress("data/coverage_mapping.json")
    assert os.path.exists("data/coverage_mapping.json")

    with open("data/coverage_mapping.json", "r") as f:
        data = json.load(f)

    for commit_hash, commit_stats in data.items():
        commit_to_coverage[commit_hash.encode("utf-8")] = pickle.dumps(commit_stats)

    close_coverage_mapping()
def download_similarity_model(model_name):
    path = f"{model_name_to_class[model_name].__name__.lower()}.similaritymodel"
    url = f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_similarity.latest/artifacts/public/{path}.zst"
    logger.info(f"Downloading similarity model from {url}...")
    updated = download_check_etag(url)
    if updated:
        zstd_decompress(path)
        os.remove(f"{path}.zst")
    assert os.path.exists(path), "Decompressed file doesn't exist"
    return path
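# Illustrative usage ("neighbors_tfidf" is assumed to be a key in
# model_name_to_class): the returned path is the decompressed model file.
model_path = download_similarity_model("neighbors_tfidf")
model = model_name_to_class["neighbors_tfidf"].load(model_path)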
def __init__(self, cache_root):
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    if not os.path.exists("regressormodel"):
        download_check_etag(URL, "regressormodel.zst")
        zstd_decompress("regressormodel")
        assert os.path.exists("regressormodel"), "Decompressed model doesn't exist"

    if not os.path.exists("regressormodel_data_X"):
        download_check_etag(URL, "regressormodel_data_X.zst")
        zstd_decompress("regressormodel_data_X")
        assert os.path.exists("regressormodel_data_X"), "Decompressed X dataset doesn't exist"

    if not os.path.exists("regressormodel_data_y"):
        download_check_etag(URL, "regressormodel_data_y.zst")
        zstd_decompress("regressormodel_data_y")
        assert os.path.exists("regressormodel_data_y"), "Decompressed y dataset doesn't exist"

    self.model = RegressorModel.load("regressormodel")

    # We use "clean" commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" commits.
    X = joblib.load("regressormodel_data_X")
    y = joblib.load("regressormodel_data_y")
    self.background_dataset = X[y == 0]
def download(path, force=False, support_files_too=False):
    if os.path.exists(path) and not force:
        return

    zst_path = f"{path}.zst"

    # Only download if the file is not there yet.
    if not os.path.exists(zst_path) or force:
        url = DATABASES[path]["url"]
        try:
            print(f"Downloading {url} to {zst_path}")
            utils.download_check_etag(url, zst_path)
        except requests.exceptions.HTTPError:
            print(f"{url} is not yet available to download")
            return

    extract_file(zst_path)

    if support_files_too:
        for support_file in DATABASES[path]["support_files"]:
            download_support_file(path, support_file)
def download(path, force=False, support_files_too=False):
    if os.path.exists(path) and not force:
        return

    zst_path = f"{path}.zst"
    xz_path = f"{path}.xz"

    # Only download if the file is not there yet.
    if (not os.path.exists(zst_path) and not os.path.exists(xz_path)) or force:
        url = DATABASES[path]["url"]
        try:
            path_compressed = zst_path
            print(f"Downloading {url} to {path_compressed}")
            utils.download_check_etag(url, path_compressed)
        except requests.exceptions.HTTPError:
            try:
                url_xz = f"{os.path.splitext(url)[0]}.xz"
                path_compressed = xz_path
                print(f"Downloading {url_xz} to {path_compressed} instead")
                utils.download_check_etag(url_xz, path_compressed)
            except requests.exceptions.HTTPError:
                print(f"{url} is not yet available to download")
                raise
    else:
        if os.path.exists(zst_path) or not os.path.exists(xz_path):
            path_compressed = zst_path
        else:
            path_compressed = xz_path

    extract_file(path_compressed)

    if support_files_too:
        for support_file in DATABASES[path]["support_files"]:
            download_support_file(path, support_file)
def download_support_file(path, file_name):
    # If a DB with the current schema is not available yet, we can't download.
    if is_old_schema(path):
        return False

    try:
        url = urljoin(DATABASES[path]["url"], file_name)
        path = os.path.join(os.path.dirname(path), file_name)
        logger.info(f"Downloading {url} to {path}")
        updated = utils.download_check_etag(url, path)

        if updated and path.endswith(".zst"):
            extract_file(path)

        return True
    except requests.exceptions.HTTPError:
        logger.info(
            f"{file_name} is not yet available to download for {path}", exc_info=True
        )
        return False