def download_model(model_name):
    """Make sure the ``{model_name}model`` file is present in the working directory.

    When the file is missing, the compressed model is fetched from
    ``BASE_URL`` (formatted with *model_name*), saved as
    ``{model_name}model.zst`` and decompressed.

    Args:
        model_name: Name used to build both the download URL and the local
            file name.

    Raises:
        AssertionError: If the model file is still missing after decompression.
    """
    model_path = f"{model_name}model"

    # Nothing to do when a previous run already produced the file.
    if os.path.exists(model_path):
        return

    url = BASE_URL.format(model_name=model_name)
    logger.info(f"Downloading {url}...")
    download_check_etag(url, f"{model_path}.zst")
    zstd_decompress(model_path)
    assert os.path.exists(model_path), "Decompressed file exists"
def retrieve_model(name):
    """Ensure an up-to-date copy of the model *name* exists in ``MODELS_DIR``.

    The remote ETag (obtained with a HEAD request) is compared with the one
    stored in ``<model path>.etag``. The compressed model is downloaded and
    decompressed when the ETags differ, or when the decompressed model file
    is missing locally even though the stored ETag matches.

    Args:
        name: Model name; the local file is ``{name}model`` inside
            ``MODELS_DIR``.

    Returns:
        The path of the decompressed model file.

    Raises:
        requests.HTTPError: If the HEAD request returns an error status.
        KeyError: If the response carries no ``ETag`` header.
    """
    os.makedirs(MODELS_DIR, exist_ok=True)

    file_name = f"{name}model"
    file_path = os.path.join(MODELS_DIR, file_name)
    etag_path = f"{file_path}.etag"

    base_model_url = BASE_URL.format(name, f"v{get_bugbug_version()}")
    model_url = f"{base_model_url}/{file_name}.zst"
    LOGGER.info(f"Checking ETAG of {model_url}")
    r = requests.head(model_url, allow_redirects=True)
    r.raise_for_status()
    new_etag = r.headers["ETag"]

    try:
        with open(etag_path, "r") as f:
            old_etag = f.read()
    except IOError:
        # No ETag recorded yet (first run) or it is unreadable: force a download.
        old_etag = None

    # A matching ETag alone is not sufficient: if the decompressed model was
    # removed (or a previous decompression never completed) we would otherwise
    # hand back a path that does not exist.
    if old_etag != new_etag or not os.path.exists(file_path):
        LOGGER.info(f"Downloading the model from {model_url}")
        urlretrieve(model_url, f"{file_path}.zst")

        zstd_decompress(file_path)
        LOGGER.info(f"Written model in {file_path}")

        # Record the ETag only after the model has been written successfully.
        with open(etag_path, "w") as f:
            f.write(new_etag)
    else:
        LOGGER.info(f"ETAG for {model_url} is ok")

    return file_path
def download_model(model_url, file_path):
    """Download the compressed model at *model_url* and decompress it.

    The archive is stored as ``{file_path}.zst``; ``zstd_decompress`` is then
    invoked with *file_path*.

    Args:
        model_url: URL of the compressed model.
        file_path: Target path, without the ``.zst`` extension.
    """
    logger.info(f"Downloading model from {model_url!r} and save it in {file_path!r}")

    compressed_path = f"{file_path}.zst"
    urlretrieve(model_url, compressed_path)
    zstd_decompress(file_path)

    logger.info(f"Written model in {file_path}")
def classify_bugs(model_name, classifier, bug_id):
    """Interactively classify one bug (or every known bug) with a trained model.

    The model file is downloaded and decompressed first when it is not present
    in the working directory. For each bug, its URL and summary are printed,
    followed by the verdict; the function then waits on ``input()`` before
    moving to the next bug.

    Args:
        model_name: Base name of the model.
        classifier: Classifier variant, or ``"default"`` for the plain model.
        bug_id: Bug to classify; when falsy, all bugs returned by
            ``bugzilla.get_bugs()`` are classified.

    Raises:
        AssertionError: If a non-default classifier is requested for a model
            not listed in ``MODELS_WITH_TYPE``, if the decompressed model is
            missing, or if *bug_id* is not found.
        SystemExit: If the pre-trained model cannot be downloaded.
    """
    if classifier == "default":
        model_file_name = f"{model_name}model"
    else:
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        model_url = f"https://index.taskcluster.net/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst"
        try:
            download_check_etag(model_url, f"{model_file_name}.zst")
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model = get_model_class(model_name).load(model_file_name)

    if not bug_id:
        bugs = bugzilla.get_bugs()
    else:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if not model.calculate_importance:
            probas = model.classify(bug, probabilities=True, importances=False)
        else:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )
            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
            # The HTML report is overwritten for every bug.
            with open("importance.html", "w") as report:
                report.write(importance["html"])

        verdict = "Positive" if np.argmax(probas) == 1 else "Negative"
        print(f"{verdict}! {probas}")

        # Pause until the user presses Enter.
        input()
def extract_file(path):
    """Decompress the zstd archive at *path*.

    The extension is split off *path* and the remaining stem is handed to
    ``zstd_decompress``.

    Args:
        path: Path of the compressed file, including its extension.

    Raises:
        AssertionError: If the extension of *path* is not ``.zst``.
    """
    inner_path, compression_type = os.path.splitext(path)

    # Raise explicitly instead of using ``assert False``: assert statements
    # are removed under ``python -O``, which would make unsupported files be
    # ignored silently. The exception type is kept for existing callers.
    if compression_type != ".zst":
        raise AssertionError(f"Unexpected compression type: {compression_type}")

    zstd_decompress(inner_path)
def main(args):
    """Print the summaries of the bugs similar to ``args.bug_id``.

    The similarity model selected by ``args.algorithm`` is loaded from
    ``<lowercased class name>.similaritymodel``; it is downloaded and
    decompressed first when that file is missing.

    Args:
        args: Object exposing ``algorithm`` (a key of
            ``similarity.model_name_to_class``) and ``bug_id``.

    Raises:
        SystemExit: If the pre-trained model cannot be downloaded.
        AssertionError: If the model file is missing after decompression.
    """
    model_class = similarity.model_name_to_class[args.algorithm]
    model_file_name = f"{model_class.__name__.lower()}.similaritymodel"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(URL.format(model_file_name))
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model = similarity.model_name_to_class[args.algorithm].load(model_file_name)

    target_bug = bugzilla.get(args.bug_id)[args.bug_id]
    bug_ids = model.get_similar_bugs(target_bug)

    # Collect the target bug and its similar bugs in a single pass.
    bugs = {
        bug["id"]: bug
        for bug in bugzilla.get_bugs()
        if bug["id"] in bug_ids or bug["id"] == args.bug_id
    }

    print(f"{args.bug_id}: {bugs[args.bug_id]['summary']}")
    for bug_id in bug_ids:
        print(f"{bug_id}: {bugs[bug_id]['summary']}")
def download_model(model_url, file_path):
    """Download the compressed model at *model_url* and decompress it to *file_path*.

    Args:
        model_url: URL of the compressed model.
        file_path: Path of the decompressed model, without the ``.zst``
            extension.
    """
    logger.info(f"Downloading model from {model_url!r} and save it in {file_path!r}")

    # Pass the destination explicitly. Without it, download_check_etag picks
    # its own default location, which need not be where the decompression
    # step below looks when file_path points to another directory.
    # NOTE(review): assumes zstd_decompress(path) reads "<path>.zst", as the
    # other call sites that pair these two helpers suggest - confirm.
    download_check_etag(model_url, path=f"{file_path}.zst")
    zstd_decompress(file_path)

    logger.info(f"Written model in {file_path}")
def _download_past_bugs(url: str) -> dict:
    """Download, decompress and parse the JSON file published at *url*.

    The local file lives in the ``data`` directory and is named after the last
    URL component with its final four characters dropped.

    Args:
        url: Location of the compressed JSON file.

    Returns:
        The parsed JSON content.

    Raises:
        AssertionError: If the decompressed file is missing.
    """
    archive_name = os.path.basename(url)
    # The four trailing characters are presumably the ".zst" extension.
    path = os.path.join("data", archive_name[:-4])

    download_check_etag(url, path=f"{path}.zst")
    zstd_decompress(path)
    assert os.path.exists(path)

    with open(path, "r") as past_bugs_file:
        past_bugs = json.load(past_bugs_file)
    return past_bugs
def __init__(self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir):
    """Load the model and the resources the selected model needs.

    Args:
        model_name: Name of the model to load. ``"regressor"`` additionally
            loads the X/y datasets; ``"testselect"`` additionally loads the
            past failures data and the ``"backout"`` model.
        cache_root: Existing directory; ``mozilla-central`` inside it is used
            as the repository directory.
        git_repo_dir: If truthy, gecko-dev is cloned there.
        method_defect_predictor_dir: If truthy, MethodDefectPredictor is
            cloned there at a pinned revision.

    Raises:
        AssertionError: If *cache_root* is not a directory, a model could not
            be loaded, or a downloaded artifact is missing.
    """
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = self.load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # Fetch the X dataset first, then the y dataset, skipping the ones
        # that are already on disk.
        dataset_paths = {}
        for axis in ("X", "y"):
            dataset_path = f"{model_name}model_data_{axis}"
            if not os.path.exists(dataset_path):
                dataset_url = URL.format(
                    model_name=model_name, file_name=f"{dataset_path}.zst"
                )
                download_check_etag(dataset_url)
                zstd_decompress(dataset_path)
                assert os.path.exists(
                    dataset_path
                ), f"Decompressed {axis} dataset exists"
            dataset_paths[axis] = dataset_path

        self.X = to_array(joblib.load(dataset_paths["X"]))
        self.y = to_array(joblib.load(dataset_paths["y"]))

    if model_name == "testselect":
        self.use_test_history = True

        support_file_ready = db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        assert support_file_ready
        self.past_failures_data = test_scheduling.get_past_failures()

        self.backout_model = self.load_model("backout")
        assert self.backout_model is not None
def extract_file(path):
    """Extract the compressed file at *path* according to its extension.

    ``.tar.zst`` files are passed to ``extract_tar_zst`` and plain ``.zst``
    files to ``zstd_decompress``; in both cases the helper receives *path*
    with its last extension removed.

    Args:
        path: Path of the compressed file (``str`` or path-like).

    Raises:
        AssertionError: If *path* does not end with ``.zst``.
    """
    path_str = str(path)
    inner_path, _ = os.path.splitext(path)

    # ".tar.zst" must be tested before ".zst", which it also ends with.
    if path_str.endswith(".tar.zst"):
        extract_tar_zst(inner_path)
    elif path_str.endswith(".zst"):
        zstd_decompress(inner_path)
    else:
        # Raise explicitly instead of ``assert False``: assert statements are
        # removed under ``python -O``, which would silently skip unsupported
        # files. The exception type is kept for existing callers.
        raise AssertionError(f"Unexpected compression type for {path}")
def load_model(self, model_name):
    """Load the model *model_name*, downloading it first when it is missing.

    Args:
        model_name: Name of the model; the local file is ``{model_name}model``.

    Returns:
        The object returned by ``get_model_class(model_name).load``.

    Raises:
        AssertionError: If the model file is missing after decompression.
    """
    model_path = f"{model_name}model"

    if not os.path.exists(model_path):
        model_url = URL.format(model_name=model_name, file_name=f"{model_path}.zst")
        download_check_etag(model_url)
        zstd_decompress(model_path)
        assert os.path.exists(model_path), "Decompressed model exists"

    model_class = get_model_class(model_name)
    return model_class.load(model_path)
def download_similarity_model(model_name):
    """Download the trained similarity model for *model_name*.

    The file name is the lowercased class name registered for *model_name* in
    ``model_name_to_class``, followed by ``.similaritymodel``. When
    ``download_check_etag`` reports an update, the archive is decompressed and
    the ``.zst`` file is deleted.

    Args:
        model_name: Key of ``model_name_to_class``.

    Returns:
        The path of the decompressed similarity model.

    Raises:
        AssertionError: If the decompressed model file does not exist.
    """
    class_name = model_name_to_class[model_name].__name__
    path = f"{class_name.lower()}.similaritymodel"
    url = f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_similarity.latest/artifacts/public/{path}.zst"

    logger.info(f"Downloading similarity model from {url}...")
    if download_check_etag(url):
        zstd_decompress(path)
        # The archive is not needed once its content has been extracted.
        os.remove(f"{path}.zst")

    assert os.path.exists(path), "Decompressed file exists"
    return path
def __init__(self, cache_root):
    """Set up the repository directory and load the regressor model.

    Args:
        cache_root: Existing directory; ``mozilla-central`` inside it is used
            as the repository directory.

    Raises:
        AssertionError: If *cache_root* is not a directory or the model file
            is missing after decompression.
    """
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    model_path = "regressormodel"
    # Download the model only when no local copy exists.
    if not os.path.exists(model_path):
        download_check_etag(URL, f"{model_path}.zst")
        zstd_decompress(model_path)
        assert os.path.exists(model_path), "Decompressed file exists"

    self.model = RegressorModel.load(model_path)
def test_zstd_compress_decompress(tmp_path):
    """Check that a JSON file survives a zstd compress/decompress round trip."""
    payload = {"Hello": "World"}
    path = tmp_path / "prova"
    compressed_path = path.with_suffix(".zst")

    with path.open("w") as source_file:
        json.dump(payload, source_file)

    utils.zstd_compress(path)
    assert compressed_path.exists()

    # Delete the original so the decompression has to recreate it.
    path.unlink()
    utils.zstd_decompress(path)

    with path.open("r") as restored_file:
        restored = json.load(restored_file)

    assert restored == payload
def download_coverage_mapping() -> None:
    """Download the commit coverage data and store it in the coverage mapping.

    The compressed JSON is downloaded to ``data/coverage_mapping.json.zst``
    and decompressed. Every ``commit hash -> stats`` entry is then written to
    the mapping returned by ``get_coverage_mapping(False)``, with the
    UTF-8 encoded hash as key and the pickled stats as value.

    Raises:
        AssertionError: If the decompressed JSON file is missing.
    """
    coverage_json_path = "data/coverage_mapping.json"

    commit_to_coverage = get_coverage_mapping(False)
    try:
        utils.download_check_etag(
            "https://firefox-ci-tc.services.mozilla.com/api/index/v1/task/project.relman.code-coverage.production.cron.latest/artifacts/public/commit_coverage.json.zst",
            f"{coverage_json_path}.zst",
        )

        zstd_decompress(coverage_json_path)
        assert os.path.exists(coverage_json_path)

        with open(coverage_json_path, "r") as f:
            data = json.load(f)

        for commit_hash, commit_stats in data.items():
            commit_to_coverage[commit_hash.encode("utf-8")] = pickle.dumps(
                commit_stats
            )
    finally:
        # Always release the mapping, including when the download, the
        # decompression or the parsing fails; previously it was left open on
        # any error.
        close_coverage_mapping()
def __init__(self, cache_root):
    """Load the regressor model and the background dataset.

    Args:
        cache_root: Existing directory; ``mozilla-central`` inside it is used
            as the repository directory.

    Raises:
        AssertionError: If *cache_root* is not a directory or one of the
            artifacts is missing after decompression.
    """
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    # (local path, message of the post-decompression assertion)
    artifacts = (
        ("regressormodel", "Decompressed model exists"),
        ("regressormodel_data_X", "Decompressed X dataset exists"),
        ("regressormodel_data_y", "Decompressed y dataset exists"),
    )
    # NOTE(review): the same URL is used for all three artifacts - confirm
    # this is intended and not a missing per-file URL.
    for artifact_path, assertion_message in artifacts:
        if os.path.exists(artifact_path):
            continue
        download_check_etag(URL, f"{artifact_path}.zst")
        zstd_decompress(artifact_path)
        assert os.path.exists(artifact_path), assertion_message

    self.model = RegressorModel.load("regressormodel")

    # We use "clean" commits as the background dataset for feature importance.
    # This way, we can see the features which are most important in differentiating
    # the current commit from the "clean" commits.
    X = joblib.load("regressormodel_data_X")
    y = joblib.load("regressormodel_data_y")
    clean_rows = y == 0
    self.background_dataset = X[clean_rows]
def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
    """Load the model and the resources the selected model needs.

    Args:
        model_name: Name of the model to load. ``"regressor"`` additionally
            loads the X/y datasets and the past bugs by function;
            ``"testlabelselect"`` additionally loads the past failures data
            and the ``"testfailure"`` model.
        repo_dir: Repository directory, stored as-is.
        git_repo_dir: If truthy, gecko-dev is cloned there.
        method_defect_predictor_dir: If truthy, MethodDefectPredictor is
            cloned there at a pinned revision.

    Raises:
        AssertionError: If a model could not be loaded or a downloaded
            artifact is missing.
    """
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # Refresh the X dataset first, then the y dataset; decompress only
        # when download_check_etag reports an update.
        dataset_paths = {}
        for axis in ("X", "y"):
            dataset_path = f"{model_name}model_data_{axis}"
            dataset_url = URL.format(
                model_name=model_name, file_name=f"{dataset_path}.zst"
            )
            if download_check_etag(dataset_url):
                zstd_decompress(dataset_path)
            assert os.path.exists(dataset_path), f"Decompressed {axis} dataset exists"
            dataset_paths[axis] = dataset_path

        self.X = to_array(joblib.load(dataset_paths["X"]))
        self.y = to_array(joblib.load(dataset_paths["y"]))

        # Unlike the datasets above, this file is decompressed on every run.
        past_bugs_path = "data/past_bugs_by_function.pickle"
        download_check_etag(PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_path}.zst")
        zstd_decompress(past_bugs_path)
        assert os.path.exists(past_bugs_path)
        # NOTE(review): this unpickles downloaded data - only safe if the
        # source is trusted.
        with open(past_bugs_path, "rb") as past_bugs_file:
            self.past_bugs_by_function = pickle.load(past_bugs_file)

    if model_name == "testlabelselect":
        self.use_test_history = True

        support_file_ready = db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        assert support_file_ready
        self.past_failures_data = test_scheduling.get_past_failures("label")

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
def __init__(self, cache_root, git_repo_dir, method_defect_predictor_dir):
    """Load the regressor model with its datasets and clone the needed repositories.

    Args:
        cache_root: Existing directory; ``mozilla-central`` inside it is used
            as the repository directory.
        git_repo_dir: Directory where gecko-dev is cloned.
        method_defect_predictor_dir: Directory where MethodDefectPredictor is
            cloned at a pinned revision.

    Raises:
        AssertionError: If *cache_root* is not a directory or one of the
            artifacts is missing after decompression.
    """
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    regressormodel_path = "regressormodel"
    regressormodel_data_X_path = "regressormodel_data_X"
    regressormodel_data_y_path = "regressormodel_data_y"

    # (local path, message of the post-decompression assertion)
    artifacts = (
        (regressormodel_path, "Decompressed model exists"),
        (regressormodel_data_X_path, "Decompressed X dataset exists"),
        (regressormodel_data_y_path, "Decompressed y dataset exists"),
    )
    for artifact_path, assertion_message in artifacts:
        if os.path.exists(artifact_path):
            continue
        compressed_name = f"{artifact_path}.zst"
        download_check_etag(URL.format(compressed_name), compressed_name)
        zstd_decompress(artifact_path)
        assert os.path.exists(artifact_path), assertion_message

    self.model = RegressorModel.load(regressormodel_path)
    self.X = to_array(joblib.load(regressormodel_data_X_path))
    self.y = to_array(joblib.load(regressormodel_data_y_path))

    # Both repositories are cloned unconditionally.
    self.method_defect_predictor_dir = method_defect_predictor_dir
    self.clone_git_repo(
        "https://github.com/lucapascarella/MethodDefectPredictor",
        method_defect_predictor_dir,
        "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
    )

    self.git_repo_dir = git_repo_dir
    self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    """Load the model and the resources the selected model needs.

    Args:
        model_name: Name of the model to load. ``"regressor"`` additionally
            loads the X/y datasets and the past bugs by function;
            ``"testlabelselect"`` additionally loads the past failures data
            and the ``"testfailure"`` model.
        repo_dir: Repository directory, stored as-is.
        git_repo_dir: If truthy, mozilla-central is cloned there.
        method_defect_predictor_dir: If truthy, MethodDefectPredictor is
            cloned there at a pinned revision.
        use_single_process: Stored as-is on the instance.
        skip_feature_importance: Stored as-is on the instance.

    Raises:
        AssertionError: If a model could not be loaded or a downloaded
            artifact is missing.
    """
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        # Refresh the X dataset first, then the y dataset; decompress only
        # when download_check_etag reports an update.
        dataset_paths = {}
        for axis in ("X", "y"):
            dataset_path = f"{model_name}model_data_{axis}"
            dataset_url = URL.format(
                model_name=model_name, file_name=f"{dataset_path}.zst"
            )
            if download_check_etag(dataset_url):
                zstd_decompress(dataset_path)
            assert os.path.exists(dataset_path), f"Decompressed {axis} dataset exists"
            dataset_paths[axis] = dataset_path

        # NOTE(review): these unpickle downloaded data - only safe if the
        # source is trusted.
        with open(dataset_paths["X"], "rb") as dataset_file:
            self.X = to_array(pickle.load(dataset_file))

        with open(dataset_paths["y"], "rb") as dataset_file:
            self.y = to_array(pickle.load(dataset_file))

        # Unlike the datasets above, this file is decompressed on every run.
        past_bugs_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_path}.zst")
        zstd_decompress(past_bugs_path)
        assert os.path.exists(past_bugs_path)
        with open(past_bugs_path, "r") as past_bugs_file:
            self.past_bugs_by_function = json.load(past_bugs_file)

    if model_name == "testlabelselect":
        self.use_test_history = True

        support_file_ready = db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        assert support_file_ready
        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        testfailure_model = TestFailureModel.load(download_model("testfailure"))
        self.testfailure_model = cast(TestFailureModel, testfailure_model)
        assert self.testfailure_model is not None
def __init__(
    self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir
):
    """Load the model and the resources the selected model needs.

    Args:
        model_name: Name of the model to load. ``"regressor"`` additionally
            loads the X/y datasets and the past bugs by function;
            ``"testselect"`` additionally loads the past failures data and
            the ``"testfailure"`` model.
        cache_root: Existing directory; ``mozilla-central`` inside it is used
            as the repository directory.
        git_repo_dir: If truthy, gecko-dev is cloned there.
        method_defect_predictor_dir: If truthy, MethodDefectPredictor is
            cloned there at a pinned revision.

    Raises:
        AssertionError: If *cache_root* is not a directory, a model could not
            be loaded, or a downloaded artifact is missing.
    """
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        # Refresh the X dataset first, then the y dataset; decompress only
        # when download_check_etag reports an update.
        dataset_paths = {}
        for axis in ("X", "y"):
            dataset_path = f"{model_name}model_data_{axis}"
            dataset_url = URL.format(
                model_name=model_name, file_name=f"{dataset_path}.zst"
            )
            if download_check_etag(dataset_url):
                zstd_decompress(dataset_path)
            assert os.path.exists(dataset_path), f"Decompressed {axis} dataset exists"
            dataset_paths[axis] = dataset_path

        self.X = to_array(joblib.load(dataset_paths["X"]))
        self.y = to_array(joblib.load(dataset_paths["y"]))

        # Unlike the datasets above, this file is decompressed on every run.
        past_bugs_path = "data/past_bugs_by_function.pickle"
        download_check_etag(PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_path}.zst")
        zstd_decompress(past_bugs_path)
        assert os.path.exists(past_bugs_path)
        # NOTE(review): this unpickles downloaded data - only safe if the
        # source is trusted.
        with open(past_bugs_path, "rb") as past_bugs_file:
            self.past_bugs_by_function = pickle.load(past_bugs_file)

    if model_name == "testselect":
        self.use_test_history = True

        support_file_ready = db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        assert support_file_ready
        self.past_failures_data = test_scheduling.get_past_failures()

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
def generate_test_scheduling_history(self, granularity):
    """Build the test scheduling history DB for the given granularity.

    Steps, in order:
      1. Download (and, when updated, decompress) the push data JSON.
      2. Download the commits DB and the existing test scheduling DB.
      3. Append to the test scheduling DB the entries produced by the nested
         ``generate_all_data`` generator.
      4. Compress the test scheduling DB and archive the support DBs.

    Args:
        granularity: ``"label"`` or ``"group"``. NOTE(review): any other
            value leaves ``test_scheduling_db`` unbound, so the later
            ``db.download`` call would fail with a NameError - confirm
            callers only pass these two values.

    Raises:
        AssertionError: If the push data file is missing or the commits DB
            download reports failure.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
        # The archive is removed once decompressed.
        os.remove(f"{push_data_path}.zst")
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB.
    assert db.download(repository.COMMITS_DB)

    # Only pushes more recent than this date are stored in the generated data.
    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    # Select the DBs matching the granularity. The failing-together DB is
    # only defined for "label", the touched-together DB only for "group".
    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
        failing_together_db = os.path.join(
            "data", test_scheduling.FAILING_TOGETHER_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )

    db.download(test_scheduling_db, support_files_too=True)

    # Remember the first revision of the last entry already present in the
    # history, so that generation can resume after it.
    last_node = None
    for revs, _ in test_scheduling.get_test_scheduling_history(granularity):
        last_node = revs[0]

    def generate_failing_together_probabilities(push_data):
        """Compute and store how often pairs of tasks fail together.

        For each pair of tasks seen in a push:
          - support = pushes where both failed / pushes where the pair appeared;
          - confidence = pushes where both failed /
            (pushes where exactly one failed + pushes where both failed).
        Pairs with support below 1/700 are skipped; pairs with confidence of
        at least 0.7 are written to the failing-together DB.
        """
        # TODO: we should consider the probabilities of `task1 failure -> task2 failure` and
        # `task2 failure -> task1 failure` separately, as they could be different.

        count_runs = collections.Counter()
        count_single_failures = collections.Counter()
        count_both_failures = collections.Counter()

        # NOTE(review): the caller builds tuples as (revisions, tasks,
        # possible_regressions, likely_regressions); the names used here are
        # in a different order, which is harmless since both lists are merged.
        for revisions, tasks, likely_regressions, candidate_regressions in tqdm(
            push_data
        ):
            failures = set(likely_regressions + candidate_regressions)
            all_tasks = list(set(tasks) | failures)

            # Sorting makes each pair appear in a single canonical order.
            for task1, task2 in itertools.combinations(sorted(all_tasks), 2):
                count_runs[(task1, task2)] += 1

                if task1 in failures:
                    if task2 in failures:
                        count_both_failures[(task1, task2)] += 1
                    else:
                        count_single_failures[(task1, task2)] += 1
                elif task2 in failures:
                    count_single_failures[(task1, task2)] += 1

        # couple -> (support, confidence)
        stats = {}

        skipped = 0

        for couple, run_count in count_runs.most_common():
            failure_count = count_both_failures[couple]
            support = failure_count / run_count

            if support < 1 / 700:
                skipped += 1
                continue

            if failure_count != 0:
                confidence = failure_count / (
                    count_single_failures[couple] + failure_count
                )
            else:
                confidence = 0.0

            stats[couple] = (support, confidence)

        logger.info(f"{skipped} couples skipped because their support was too low")

        # Log the top 7 pairs by descending confidence, then descending support.
        logger.info("Redundancies with the highest support and confidence:")
        for couple, (support, confidence) in sorted(
            stats.items(), key=lambda k: (-k[1][1], -k[1][0])
        )[:7]:
            failure_count = count_both_failures[couple]
            run_count = count_runs[couple]
            logger.info(
                f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
            )

        # Log the top 7 pairs by descending confidence, then ascending support.
        logger.info("Redundancies with the highest confidence and lowest support:")
        for couple, (support, confidence) in sorted(
            stats.items(), key=lambda k: (-k[1][1], k[1][0])
        )[:7]:
            failure_count = count_both_failures[couple]
            run_count = count_runs[couple]
            logger.info(
                f"{couple[0]} - {couple[1]} redundancy confidence {confidence}, support {support} ({failure_count} over {run_count})."
            )

        failing_together = test_scheduling.get_failing_together_db()
        count_redundancies = collections.Counter()
        for couple, (support, confidence) in stats.items():
            # NOTE(review): the labels say ">=" while the comparisons are
            # strict ">" - confirm which one is intended.
            if confidence == 1.0:
                count_redundancies["==100%"] += 1
            if confidence > 0.9:
                count_redundancies[">=90%"] += 1
            if confidence > 0.8:
                count_redundancies[">=80%"] += 1
            if confidence > 0.7:
                count_redundancies[">=70%"] += 1

            if confidence < 0.7:
                continue

            # Key: "task1$task2" encoded as UTF-8; value: support and
            # confidence packed as two floats.
            failing_together[
                f"{couple[0]}${couple[1]}".encode("utf-8")
            ] = struct.pack("ff", support, confidence)

        for percentage, count in count_redundancies.most_common():
            logger.info(f"{count} with {percentage} confidence")

        test_scheduling.close_failing_together_db()

    def generate_all_data():
        """Yield one ``{"revs": ..., "data": [...]}`` dict per push to store.

        Pushes up to and including the one starting with ``last_node`` are
        skipped. The past failures DB is updated along the way and closed
        once the generator is exhausted.
        """
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        # node -> commit data, for the commits following last_node.
        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = set(
            sum((push_runnables for _, push_runnables, _, _ in push_data[-28:]), [])
        )
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # NOTE(review): this rebuilds the set from itself, so it is unchanged;
        # possibly `set(all_runnables)` (the filtered list) was intended - confirm.
        all_runnables_set = set(all_runnables_set)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        push_data = [
            (
                revisions,
                filter_runnables(push_tasks, all_runnables_set, granularity),
                filter_runnables(
                    possible_regressions, all_runnables_set, granularity
                ),
                filter_runnables(
                    likely_regressions, all_runnables_set, granularity
                ),
            )
            for revisions, push_tasks, possible_regressions, likely_regressions in push_data
        ]

        if granularity == "label":
            generate_failing_together_probabilities(push_data)

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        # Holds the loop indices (not the revisions) of the pushes yielded.
        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = True if last_node is None else False

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Advance the generator to its first yield so it can receive values.
            next(update_touched_together_gen)

        for i in tqdm(range(len(push_data))):
            # Entries are popped from the front as they are consumed, so
            # push_data shrinks while iterating.
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop(0)

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # We consider only the runnables which run in this push, and the possible and likely regressions
            # from this push. We can't consider all runnables because we can't be sure that a task that didn't
            # run on a push would have been successful.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            result = {
                "revs": revisions,
                "data": [],
            }
            # generate_data is always iterated to the end, even for pushes
            # older than HISTORY_DATE_START whose data is not kept.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    result["data"].append(data)

            if pushdate > HISTORY_DATE_START:
                saved_nodes.add(i)
                yield result

        if granularity == "group":
            # Sending None presumably tells the generator to finish - confirm
            # against update_touched_together; StopIteration is expected here.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    # The *_db paths end with ".tar.zst"; the entry added to each archive is
    # the same path with that suffix stripped.
    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])

    if granularity == "label":
        with open_tar_zst(failing_together_db) as tar:
            tar.add(failing_together_db[: -len(".tar.zst")])
def count(is_first_task, is_second_task):
    """Compare the failures of two groups of tasks over the label push data.

    A push is considered only when at least one task name (the second
    ``/``-separated component) runs in both groups. Failures are the likely
    and possible regressions ending with one of those shared names.

    Args:
        is_first_task: Predicate telling whether a task belongs to the first
            group.
        is_second_task: Predicate telling whether a task belongs to the second
            group.

    Returns:
        A tuple ``(count_runs, count_any_of_the_two,
        count_first_but_not_second, count_second_but_not_first)``.
    """
    push_data_url = test_scheduling.PUSH_DATA_URL.format(granularity="label")
    if download_check_etag(push_data_url):
        zstd_decompress("push_data_label.json")

    with open("push_data_label.json", "r") as push_data_file:
        push_data = json.load(push_data_file)

    print(f"Analyzing {len(push_data)} pushes...")

    all_tasks = {task for _, push_tasks, _, _ in push_data for task in push_tasks}

    print(f"Considering {len(all_tasks)} tasks...")

    count_runs = 0
    count_any_of_the_two = 0
    count_first_but_not_second = 0
    count_second_but_not_first = 0

    for _revisions, push_tasks, possible_regressions, likely_regressions in push_data:
        first_group_names = {
            task.split("/")[1] for task in push_tasks if is_first_task(task)
        }
        second_group_names = {
            task.split("/")[1] for task in push_tasks if is_second_task(task)
        }

        # Only consider pushes where tasks run in both groups. An empty
        # intersection also covers the case where both groups are empty.
        shared_names = first_group_names & second_group_names
        if not shared_names:
            continue

        count_runs += 1

        shared_suffixes = tuple(shared_names)
        failures = [
            task
            for task in likely_regressions + possible_regressions
            if task.endswith(shared_suffixes)
        ]

        first_failures = [task for task in failures if is_first_task(task)]
        second_failures = [task for task in failures if is_second_task(task)]

        if first_failures or second_failures:
            count_any_of_the_two += 1

        if first_failures and not second_failures:
            count_first_but_not_second += 1
        elif second_failures and not first_failures:
            count_second_but_not_first += 1

    return (
        count_runs,
        count_any_of_the_two,
        count_first_but_not_second,
        count_second_but_not_first,
    )
def generate_test_scheduling_history(self):
    """Append per-commit, per-task failure history rows to the test scheduling DB.

    Steps, in order:
      1. Download and decompress ``push_data.json`` if it is not on disk.
      2. Download the commits DB if it is outdated or missing.
      3. Resume after the last revision stored in the test scheduling DB
         (when that DB is not an old version); otherwise start from scratch.
      4. For every commit with push data, update the past-failure counters kept
         in the ``data/past_failures.shelve`` shelf and yield one row per
         considered task whose push date is newer than ``HISTORY_DATE_START``.
      5. Compress the test scheduling DB and the shelf.

    Raises:
        AssertionError: if ``push_data.json`` is missing after decompression.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    # Revision -> (all tasks, possible regressions, likely regressions).
    # The first row of the file is skipped.
    push_data = {row[0]: (row[1], row[2], row[3]) for row in data[1:]}

    logger.info(f"push data nodes: {len(push_data)}")

    HISTORICAL_TIMESPAN = 56
    # Look-back windows (in pushes) reported for every feature family.
    PAST_WINDOWS = (7, 14, 28, 56)
    # (past_failures key type, output key suffix, commit_data key holding the items).
    # A commit_data key of None means the single pseudo-item "all".
    FEATURE_SOURCES = (
        ("all", "", None),
        ("type", "_in_types", "types"),
        ("file", "_in_files", "files"),
        ("directory", "_in_directories", "directories"),
        ("component", "_in_components", "components"),
    )

    # last_node stays None when there is no usable previous run, including the
    # case where the downloaded history is empty (which previously raised a
    # NameError because the loop variable was never bound).
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["rev"]

    past_failures = shelve.open(
        "data/past_failures.shelve",
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return (total, past 7, past 14, past 28, past 56) failure counts.

        Counts are summed over ``items``; when ``is_regression`` is true each
        item's counter at ``push_num`` is incremented after being read.
        """
        sums = [0] * (len(PAST_WINDOWS) + 1)

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            sums[0] += value
            for position, window in enumerate(PAST_WINDOWS, start=1):
                sums[position] += value - cur[push_num - window]

            if is_regression:
                cur[push_num] = value + 1

        return tuple(sums)

    def generate_data():
        """Yield one feature row per (commit, considered task) in the history window."""
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node == last_node:
                can_start = True
                continue

            if not can_start or node not in push_data:
                continue

            commits_with_data.add(node)

            all_tasks, possible_regressions, likely_regressions = push_data[node]

            # The push date is the same for every task of this commit.
            pushdate = dateutil.parser.parse(commit_data["pushdate"])
            in_history_window = pushdate > HISTORY_DATE_START

            for task in all_tasks:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_possible_regression = task in possible_regressions
                is_likely_regression = task in likely_regressions
                is_regression = is_possible_regression or is_likely_regression

                row = {"rev": node, "name": task}
                # The counters must be updated for every task, even when the
                # row ends up outside the history window and is not yielded.
                for type_, suffix, items_key in FEATURE_SOURCES:
                    items = ["all"] if items_key is None else commit_data[items_key]
                    total, *previous = get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )
                    row[f"failures{suffix}"] = total
                    for window, failures in zip(PAST_WINDOWS, previous):
                        row[f"failures_past_{window}_pushes{suffix}"] = failures

                row["is_possible_regression"] = is_possible_regression
                row["is_likely_regression"] = is_likely_regression

                if in_history_window:
                    saved_nodes.add(node)
                    yield row

            # We no longer need the push data for this node, we can free the memory.
            del push_data[node]

            push_num += 1

            # Sync DB every 1000 commits with data, so we cleanup the shelve cache
            # (we'd run OOM otherwise!). Checking here rather than at the top of
            # the loop avoids syncing on every skipped commit while the count
            # sits on a multiple of 1000 (including 0).
            if len(commits_with_data) % 1000 == 0:
                past_failures.sync()

        logger.info(f"commits linked to push data: {len(commits_with_data)}")

        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    past_failures["push_num"] = push_num
    past_failures.close()
    zstd_compress("data/past_failures.shelve")
def generate_test_scheduling_history(self):
    """Append per-push failure history rows to the test scheduling DB.

    Downloads ``push_data.json`` (if missing), the commits DB and the existing
    test scheduling DB, resumes after the first revision of the last stored
    push, and feeds ``test_scheduling.generate_data`` with each remaining push.
    Rows whose push date is newer than ``HISTORY_DATE_START`` are appended to
    the DB. Finally the DB is compressed and ``data/past_failures.lmdb`` is
    archived into ``data/past_failures.lmdb.tar.zst``.

    Raises:
        AssertionError: if the decompressed push data is missing or the commits
            DB download reports failure.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL)
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB. The download is performed outside of the assert
    # statement so that it still happens when assertions are disabled (-O).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history():
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the feature rows of every push not yet present in the DB."""
        past_failures = test_scheduling.get_past_failures()

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        all_tasks_set = set().union(
            *(push_tasks for _, push_tasks, _, _ in push_data[-28:])
        )
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # Store all tasks in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_tasks"] = all_tasks
        # XXX: Should we recreate the DB from scratch if the previous all_tasks are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        # Pushes are consumed oldest-first and dropped as we go to free memory.
        # Reversing once and popping from the end keeps that order while
        # avoiding the O(n) shift of list.pop(0) on every iteration.
        push_data.reverse()

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop()

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if len(tasks_to_consider) == 0:
                skipped_no_tasks += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            # The generator is always drained, even for pushes outside the
            # history window; only the rows inside the window are yielded.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                tasks_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_all_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")
def classify_bugs(model_name, classifier, bug_id):
    """Classify one bug, or every bug of the bugs DB, and print the prediction.

    The model file is downloaded and decompressed when it is not on disk.
    After each bug the function waits on ``input()`` before moving on.

    Args:
        model_name: name of the model; combined with ``classifier`` (unless it
            is "default") to build the model file name and the model class name.
        classifier: classifier type, or "default".
        bug_id: id of the bug to classify; when falsy, the bugs DB is
            downloaded and all of its bugs are classified.

    Raises:
        AssertionError: if ``classifier`` is not "default" and ``model_name``
            is not in ``MODELS_WITH_TYPE``, if the decompressed model is
            missing, if the requested bug is not found, or if the bugs DB
            download reports failure.
        SystemExit: if downloading the pre-trained model raises
            ``requests.HTTPError``.
    """
    if classifier != "default":
        assert (
            model_name in MODELS_WITH_TYPE
        ), f"{classifier} is not a valid classifier type for {model_name}"

        model_file_name = f"{model_name}{classifier}model"
        model_name = f"{model_name}_{classifier}"
    else:
        model_file_name = f"{model_name}model"

    if not os.path.exists(model_file_name):
        logger.info(f"{model_file_name} does not exist. Downloading the model....")
        try:
            download_check_etag(
                f"https://community-tc.services.mozilla.com/api/index/v1/task/project.relman.bugbug.train_{model_name}.latest/artifacts/public/{model_file_name}.zst"
            )
        except requests.HTTPError:
            logger.error(
                "A pre-trained model is not available, you will need to train it yourself using the trainer script"
            )
            raise SystemExit(1)

        zstd_decompress(model_file_name)
        assert os.path.exists(model_file_name), "Decompressed file doesn't exist"

    model_class = get_model_class(model_name)
    model = model_class.load(model_file_name)

    if bug_id:
        bugs = bugzilla.get(bug_id).values()
        assert bugs, f"A bug with a bug id of {bug_id} was not found"
    else:
        # The download is performed outside of the assert statement so that it
        # still happens when assertions are disabled (python -O).
        bugs_db_downloaded = db.download(bugzilla.BUGS_DB)
        assert bugs_db_downloaded
        bugs = bugzilla.get_bugs()

    for bug in bugs:
        print(
            f'https://bugzilla.mozilla.org/show_bug.cgi?id={bug["id"]} - {bug["summary"]} '
        )

        if model.calculate_importance:
            probas, importance = model.classify(
                bug, probabilities=True, importances=True
            )

            model.print_feature_importances(
                importance["importances"], class_probabilities=probas
            )
        else:
            probas = model.classify(bug, probabilities=True, importances=False)

        probability = probas[0]
        pred_index = np.argmax(probability)
        if len(probability) > 2:
            # More than two classes: map the index back through the model's
            # ``le`` attribute to get the class name.
            pred_class = model.le.inverse_transform([pred_index])[0]
        else:
            pred_class = "Positive" if pred_index == 1 else "Negative"
        print(f"{pred_class} {probability}")
        # Pause until the user presses enter before showing the next bug.
        input()
def generate_test_scheduling_history(self, granularity):
    """Append per-push failure history rows to the DB of the given granularity.

    Downloads the push data for ``granularity`` (decompressing it when it was
    updated), the commits DB and the existing test scheduling DB, resumes after
    the first revision of the last stored push, and feeds
    ``test_scheduling.generate_data`` with each remaining push. Rows whose push
    date is newer than ``HISTORY_DATE_START`` are appended to the DB. Finally
    the DB is compressed and the past-failures DB (plus, for "group", the
    touched-together DB) is archived.

    Args:
        granularity: "label" or "group".

    Raises:
        ValueError: if ``granularity`` is neither "label" nor "group".
        AssertionError: if the decompressed push data is missing or the commits
            DB download reports failure.
    """
    push_data_path = f"push_data_{granularity}.json"
    updated = download_check_etag(
        test_scheduling.PUSH_DATA_URL.format(granularity=granularity)
    )
    if updated:
        zstd_decompress(push_data_path)
    assert os.path.exists(push_data_path), "Decompressed push data file exists"

    # Get the commits DB. The download is performed outside of the assert
    # statement so that it still happens when assertions are disabled (-O).
    commits_db_downloaded = db.download(repository.COMMITS_DB)
    assert commits_db_downloaded

    HISTORY_DATE_START = datetime.now() - relativedelta(
        months=TRAINING_MONTHS[granularity]
    )

    if granularity == "label":
        test_scheduling_db = test_scheduling.TEST_LABEL_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_LABEL_DB
        )
    elif granularity == "group":
        test_scheduling_db = test_scheduling.TEST_GROUP_SCHEDULING_DB
        past_failures_db = os.path.join(
            "data", test_scheduling.PAST_FAILURES_GROUP_DB
        )
        touched_together_db = os.path.join(
            "data", test_scheduling.TOUCHED_TOGETHER_DB
        )
    else:
        # Fail with a clear message instead of an unbound-local error below.
        raise ValueError(f"Unsupported granularity: {granularity!r}")

    db.download(test_scheduling_db, support_files_too=True)

    last_node = None
    for test_data in test_scheduling.get_test_scheduling_history(granularity):
        last_node = test_data["revs"][0]

    def generate_all_data():
        """Yield the feature rows of every push not yet present in the DB."""
        past_failures = test_scheduling.get_past_failures(granularity)

        push_num = past_failures["push_num"] if "push_num" in past_failures else 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                continue

            commit_map[commit_data["node"]] = commit_data

        with open(push_data_path, "r") as f:
            push_data = json.load(f)

        logger.info(f"push data nodes: {len(push_data)}")

        if granularity == "label":
            push_data = [
                (
                    revisions,
                    rename_tasks(push_tasks),
                    rename_tasks(possible_regressions),
                    rename_tasks(likely_regressions),
                )
                for revisions, push_tasks, possible_regressions, likely_regressions in push_data
            ]

        # In the last 28 pushes, we definitely run all possible runnables.
        all_runnables_set = set().union(
            *(push_runnables for _, push_runnables, _, _ in push_data[-28:])
        )
        # Filter runnables we don't need.
        all_runnables = filter_runnables(
            list(all_runnables_set), all_runnables_set, granularity
        )
        # Rebuild the set from the *filtered* runnables. Copying the unfiltered
        # set here would silently discard the result of the filtering above.
        all_runnables_set = set(all_runnables)
        logger.info(f"{len(all_runnables_set)} runnables run in the last 28 pushes")

        # Store all runnables in the past_failures DB so it can be used in the evaluation phase.
        past_failures["all_runnables"] = all_runnables
        # XXX: Should we recreate the DB from scratch if the previous all_runnables are not the
        # same as the current ones?

        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_runnables = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        if granularity == "group":
            update_touched_together_gen = test_scheduling.update_touched_together()
            # Advance the generator to its first yield so it can accept send().
            next(update_touched_together_gen)

        # Pushes are consumed oldest-first and dropped as we go to free memory.
        # Reversing once and popping from the end keeps that order while
        # avoiding the O(n) shift of list.pop(0) on every iteration.
        push_data.reverse()

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_runnables,
                possible_regressions,
                likely_regressions,
            ) = push_data.pop()

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                continue

            push_num += 1

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if len(commits) == 0:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 50:
                skipped_too_big_commits += 1
                continue

            # If we considered all_runnables, we'd generate a huge amount of data.
            # So we consider only the runnables which run in this push, and the possible and likely regressions
            # from this push.
            runnables_to_consider = list(
                set(push_runnables + possible_regressions + likely_regressions)
            )
            runnables_to_consider = filter_runnables(
                runnables_to_consider, all_runnables_set, granularity
            )

            if len(runnables_to_consider) == 0:
                skipped_no_runnables += 1
                continue

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            if i % 250 == 0:
                past_failures.sync()

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            if granularity == "group":
                update_touched_together_gen.send(commits[0]["node"])

            # The generator is always drained, even for pushes outside the
            # history window; only the rows inside the window are yielded.
            for data in test_scheduling.generate_data(
                past_failures,
                merged_commits,
                push_num,
                runnables_to_consider,
                possible_regressions,
                likely_regressions,
            ):
                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)
                    data["revs"] = revisions
                    yield data

        if granularity == "group":
            # Sending None signals the end of the input; the generator is
            # expected to finish, which surfaces here as StopIteration.
            try:
                update_touched_together_gen.send(None)
            except StopIteration:
                pass

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_runnables} (no interesting runnables)")

        past_failures["push_num"] = push_num
        past_failures.close()

    db.append(test_scheduling_db, generate_all_data())

    zstd_compress(test_scheduling_db)

    # The archive names end in ".tar.zst"; the archived path is the same name
    # without that suffix.
    with open_tar_zst(past_failures_db) as tar:
        tar.add(past_failures_db[: -len(".tar.zst")])

    if granularity == "group":
        with open_tar_zst(touched_together_db) as tar:
            tar.add(touched_together_db[: -len(".tar.zst")])
def generate_test_scheduling_history(self):
    """Append per-commit, per-task failure history rows to the test scheduling DB.

    Steps, in order:
      1. Download and decompress ``push_data.json`` if it is not on disk.
      2. Download the commits DB if it is outdated or missing.
      3. Resume after the last revision stored in the test scheduling DB
         (when that DB is not an old version); otherwise start from scratch.
      4. Load the past-failure counters from ``data/past_failures.pickle``
         (starting empty when the file does not exist), update them for every
         commit with push data and yield one row per considered task whose
         push date is newer than ``HISTORY_DATE_START``.
      5. Compress the test scheduling DB, then dump and compress the counters.

    Raises:
        AssertionError: if ``push_data.json`` is missing after decompression.
    """
    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists("push_data.json"), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    with open("push_data.json", "r") as f:
        data = json.load(f)

    # Revision -> (all tasks, possible regressions, likely regressions).
    # The first row of the file is skipped.
    push_data = {row[0]: (row[1], row[2], row[3]) for row in data[1:]}

    HISTORICAL_TIMESPAN = 56
    # Look-back windows (in pushes) reported for every feature family.
    PAST_WINDOWS = (7, 14, 28, 56)
    # (past_failures key type, output key suffix, commit_data key holding the items).
    # A commit_data key of None means the single pseudo-item "all".
    FEATURE_SOURCES = (
        ("all", "", None),
        ("type", "_in_types", "types"),
        ("file", "_in_files", "files"),
        ("directory", "_in_directories", "directories"),
        ("component", "_in_components", "components"),
    )

    # last_node stays None when there is no usable previous run, including the
    # case where the downloaded history is empty (which previously raised a
    # NameError because the loop variable was never bound).
    last_node = None
    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        for test_data in test_scheduling.get_test_scheduling_history():
            last_node = test_data["rev"]

    # NOTE(review): pickle.load executes arbitrary code from the file; this is
    # only safe as long as the pickle is produced by this script itself.
    try:
        with open("data/past_failures.pickle", "rb") as f:
            past_failures, push_num = pickle.load(f)
    except FileNotFoundError:
        past_failures = {}
        push_num = 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return (total, past 7, past 14, past 28, past 56) failure counts.

        Counts are summed over ``items``; when ``is_regression`` is true each
        item's counter at ``push_num`` is incremented after being read.
        """
        queues = past_failures.setdefault(type_, {}).setdefault(task, {})

        sums = [0] * (len(PAST_WINDOWS) + 1)

        for item in items:
            if item not in queues:
                queues[item] = ExpQueue(push_num, HISTORICAL_TIMESPAN + 1, 0)
            cur = queues[item]

            value = cur[push_num]

            sums[0] += value
            for position, window in enumerate(PAST_WINDOWS, start=1):
                sums[position] += value - cur[push_num - window]

            if is_regression:
                cur[push_num] = value + 1

        return tuple(sums)

    def generate_data():
        """Yield one feature row per (commit, considered task) in the history window."""
        nonlocal push_num
        commits_with_data = set()
        saved_nodes = set()

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        for commit_data in tqdm(repository.get_commits()):
            node = commit_data["node"]

            if node == last_node:
                can_start = True
                continue

            if not can_start or node not in push_data:
                continue

            commits_with_data.add(node)

            all_tasks, possible_regressions, likely_regressions = push_data[node]

            # The push date is the same for every task of this commit.
            pushdate = dateutil.parser.parse(commit_data["pushdate"])
            in_history_window = pushdate > HISTORY_DATE_START

            for task in all_tasks:
                if not any(task.startswith(j) for j in JOBS_TO_CONSIDER):
                    continue

                is_possible_regression = task in possible_regressions
                is_likely_regression = task in likely_regressions
                is_regression = is_possible_regression or is_likely_regression

                row = {"rev": node, "name": task}
                # The counters must be updated for every task, even when the
                # row ends up outside the history window and is not yielded.
                for type_, suffix, items_key in FEATURE_SOURCES:
                    items = ["all"] if items_key is None else commit_data[items_key]
                    total, *previous = get_and_update_past_failures(
                        type_, task, items, push_num, is_regression
                    )
                    row[f"failures{suffix}"] = total
                    for window, failures in zip(PAST_WINDOWS, previous):
                        row[f"failures_past_{window}_pushes{suffix}"] = failures

                row["is_possible_regression"] = is_possible_regression
                row["is_likely_regression"] = is_likely_regression

                if in_history_window:
                    saved_nodes.add(node)
                    yield row

            push_num += 1

        logger.info(f"push data nodes: {len(push_data)}")

        logger.info(f"commits linked to push data: {len(commits_with_data)}")

        logger.info(f"saved push data nodes: {len(saved_nodes)}")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    with open("data/past_failures.pickle", "wb") as f:
        pickle.dump((past_failures, push_num), f, protocol=pickle.HIGHEST_PROTOCOL)

    zstd_compress("data/past_failures.pickle")
def generate_test_scheduling_history(self):
    """Append past-failure statistics for new pushes to the test scheduling DB.

    Steps, in order:

    1. Download and decompress ``push_data.json`` if it is not on disk.
    2. Download the commits DB if it is missing or of an old version.
    3. If the test scheduling DB is of the current version, download it
       (with its support files) and resume after the first revision of its
       last stored entry; otherwise start from scratch.
    4. Walk the pushes in ``push_data.json`` and, for every task considered
       for a push, read and update the failure counters kept in the
       ``data/past_failures.lmdb`` shelf, yielding one entry per
       (push, task) whose push date is more recent than ``TRAINING_MONTHS``
       months ago.
    5. Append the entries to the test scheduling DB, compress it, store the
       final push counter in the shelf and archive the shelf as
       ``data/past_failures.lmdb.tar.zst``.

    Raises:
        AssertionError: if ``push_data.json`` does not exist after the
            download/decompression step.
    """
    # Function-scope import: only the push loop below needs it.
    from collections import deque

    if not os.path.exists("push_data.json"):
        download_check_etag(PUSH_DATA_URL, "push_data.json.zst")
        zstd_decompress("push_data.json")
        assert os.path.exists(
            "push_data.json"
        ), "Decompressed push data file exists"

    # Get the commits DB.
    if db.is_old_version(repository.COMMITS_DB) or not db.exists(
        repository.COMMITS_DB
    ):
        db.download(repository.COMMITS_DB, force=True)

    # Only pushes more recent than this date are emitted; older ones still
    # update the failure counters.
    HISTORY_DATE_START = datetime.now() - relativedelta(months=TRAINING_MONTHS)

    # Largest look-back distance (in pushes) queried below. Each ExpQueue is
    # created with HISTORICAL_TIMESPAN + 1 as its second argument
    # (presumably its capacity -- confirm against ExpQueue).
    HISTORICAL_TIMESPAN = 56

    if not db.is_old_version(test_scheduling.TEST_SCHEDULING_DB):
        db.download(test_scheduling.TEST_SCHEDULING_DB, support_files_too=True)

        # Exhaust the history to reach its last entry. Pre-binding the loop
        # variable avoids a NameError when the existing DB is empty, in
        # which case we simply start from scratch.
        test_data = None
        for test_data in test_scheduling.get_test_scheduling_history():
            pass

        last_node = test_data["revs"][0] if test_data is not None else None
    else:
        last_node = None

    past_failures = shelve.Shelf(
        LMDBDict("data/past_failures.lmdb"),
        protocol=pickle.HIGHEST_PROTOCOL,
        writeback=True,
    )

    push_num = past_failures["push_num"] if "push_num" in past_failures else 0

    def get_and_update_past_failures(type_, task, items, push_num, is_regression):
        """Return summed failure counts for ``task`` over ``items``.

        One counter is kept per ``f"{type_}${task}${item}"`` key in the
        ``past_failures`` shelf and is created on first use. When
        ``is_regression`` is true, each counter's value at ``push_num`` is
        incremented after being read, so the returned numbers do not
        include the current push.

        Returns:
            A 5-tuple: the sum over ``items`` of the value at ``push_num``,
            followed by the sums of the differences between that value and
            the values 7, 14, 28 and 56 pushes earlier.
        """
        total = 0
        prev_7 = 0
        prev_14 = 0
        prev_28 = 0
        prev_56 = 0

        key = f"{type_}${task}$"

        for item in items:
            full_key = key + item

            if full_key not in past_failures:
                cur = past_failures[full_key] = ExpQueue(
                    push_num, HISTORICAL_TIMESPAN + 1, 0
                )
            else:
                cur = past_failures[full_key]

            value = cur[push_num]

            total += value
            prev_7 += value - cur[push_num - 7]
            prev_14 += value - cur[push_num - 14]
            prev_28 += value - cur[push_num - 28]
            prev_56 += value - cur[push_num - 56]

            if is_regression:
                cur[push_num] = value + 1

        return (total, prev_7, prev_14, prev_28, prev_56)

    def generate_data():
        """Yield one test scheduling entry per (push, task) pair."""
        nonlocal push_num
        saved_nodes = set()
        skipped_no_commits = 0
        skipped_too_big_commits = 0
        skipped_no_tasks = 0

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        commit_map = {}
        for commit_data in tqdm(repository.get_commits()):
            if not can_start:
                if last_node == commit_data["node"]:
                    can_start = True

                # The matching commit itself is skipped too.
                continue

            commit_map[commit_data["node"]] = commit_data

        with open("push_data.json", "r") as f:
            # The first element of the JSON array is dropped.
            push_data = json.load(f)[1:]

        logger.info(f"push data nodes: {len(push_data)}")

        # In the last 28 pushes, we definitely run all possible tasks.
        # A set comprehension flattens the task lists in linear time.
        all_tasks_set = {
            task
            for _, last_push_tasks, _, _ in push_data[-28:]
            for task in last_push_tasks
        }
        # Filter tasks we don't need.
        all_tasks = filter_tasks(list(all_tasks_set), all_tasks_set)
        all_tasks_set = set(all_tasks)
        logger.info(f"{len(all_tasks_set)} tasks run in the last 28 pushes")

        # We can start once we get to the last revision we added in the previous run.
        can_start = last_node is None

        # Pushes are consumed from the front one at a time (presumably so
        # the processed ones can be released); a deque makes each removal
        # O(1), whereas list.pop(0) is O(n).
        push_data = deque(push_data)

        for i in tqdm(range(len(push_data))):
            (
                revisions,
                push_tasks,
                possible_regressions,
                likely_regressions,
            ) = push_data.popleft()

            if not can_start:
                if last_node == revisions[0]:
                    can_start = True

                # The matching push itself is skipped too.
                continue

            push_num += 1

            # Sync DB every 250 pushes, so we cleanup the shelve cache (we'd run OOM otherwise!).
            # This is done before the skip checks below, so that a skipped
            # push can never postpone the flush.
            if i % 250 == 0:
                past_failures.sync()

            # XXX: Some commits are skipped in the repository mining, e.g. merges and backouts. Maybe we should not skip them.
            commits = tuple(
                commit_map.pop(revision)
                for revision in revisions
                if revision in commit_map
            )
            if not commits:
                skipped_no_commits += 1
                continue

            merged_commits = commit_features.merge_commits(commits)

            # XXX: For now, skip commits which are too large.
            # In the future we can either:
            #  - Improve shelve perf and go back to consider all files;
            #  - Consider only files which appear with a given frequency, like the "files" feature in commit_features;
            #  - Keep a limit of number of files.
            if len(merged_commits["files"]) > 20:
                skipped_too_big_commits += 1
                continue

            # If we considered all_tasks, we'd generate a huge amount of data.
            # So we consider only the tasks which run in this push, and the possible and likely regressions
            # from this push.
            tasks_to_consider = list(
                set(push_tasks + possible_regressions + likely_regressions)
            )
            tasks_to_consider = filter_tasks(tasks_to_consider, all_tasks_set)

            if len(tasks_to_consider) == 0:
                skipped_no_tasks += 1
                continue

            pushdate = dateutil.parser.parse(merged_commits["pushdate"])

            for task in tasks_to_consider:
                is_regression = (
                    task in possible_regressions or task in likely_regressions
                )

                # The counters are always updated, even for pushes that are
                # too old to be emitted.
                (
                    total_failures,
                    past_7_pushes_failures,
                    past_14_pushes_failures,
                    past_28_pushes_failures,
                    past_56_pushes_failures,
                ) = get_and_update_past_failures(
                    "all", task, ["all"], push_num, is_regression
                )

                (
                    total_types_failures,
                    past_7_pushes_types_failures,
                    past_14_pushes_types_failures,
                    past_28_pushes_types_failures,
                    past_56_pushes_types_failures,
                ) = get_and_update_past_failures(
                    "type", task, merged_commits["types"], push_num, is_regression
                )

                (
                    total_files_failures,
                    past_7_pushes_files_failures,
                    past_14_pushes_files_failures,
                    past_28_pushes_files_failures,
                    past_56_pushes_files_failures,
                ) = get_and_update_past_failures(
                    "file", task, merged_commits["files"], push_num, is_regression
                )

                (
                    total_directories_failures,
                    past_7_pushes_directories_failures,
                    past_14_pushes_directories_failures,
                    past_28_pushes_directories_failures,
                    past_56_pushes_directories_failures,
                ) = get_and_update_past_failures(
                    "directory",
                    task,
                    merged_commits["directories"],
                    push_num,
                    is_regression,
                )

                (
                    total_components_failures,
                    past_7_pushes_components_failures,
                    past_14_pushes_components_failures,
                    past_28_pushes_components_failures,
                    past_56_pushes_components_failures,
                ) = get_and_update_past_failures(
                    "component",
                    task,
                    merged_commits["components"],
                    push_num,
                    is_regression,
                )

                if pushdate > HISTORY_DATE_START:
                    saved_nodes.add(i)

                    yield {
                        "revs": revisions,
                        "name": task,
                        "failures": total_failures,
                        "failures_past_7_pushes": past_7_pushes_failures,
                        "failures_past_14_pushes": past_14_pushes_failures,
                        "failures_past_28_pushes": past_28_pushes_failures,
                        "failures_past_56_pushes": past_56_pushes_failures,
                        "failures_in_types": total_types_failures,
                        "failures_past_7_pushes_in_types": past_7_pushes_types_failures,
                        "failures_past_14_pushes_in_types": past_14_pushes_types_failures,
                        "failures_past_28_pushes_in_types": past_28_pushes_types_failures,
                        "failures_past_56_pushes_in_types": past_56_pushes_types_failures,
                        "failures_in_files": total_files_failures,
                        "failures_past_7_pushes_in_files": past_7_pushes_files_failures,
                        "failures_past_14_pushes_in_files": past_14_pushes_files_failures,
                        "failures_past_28_pushes_in_files": past_28_pushes_files_failures,
                        "failures_past_56_pushes_in_files": past_56_pushes_files_failures,
                        "failures_in_directories": total_directories_failures,
                        "failures_past_7_pushes_in_directories": past_7_pushes_directories_failures,
                        "failures_past_14_pushes_in_directories": past_14_pushes_directories_failures,
                        "failures_past_28_pushes_in_directories": past_28_pushes_directories_failures,
                        "failures_past_56_pushes_in_directories": past_56_pushes_directories_failures,
                        "failures_in_components": total_components_failures,
                        "failures_past_7_pushes_in_components": past_7_pushes_components_failures,
                        "failures_past_14_pushes_in_components": past_14_pushes_components_failures,
                        "failures_past_28_pushes_in_components": past_28_pushes_components_failures,
                        "failures_past_56_pushes_in_components": past_56_pushes_components_failures,
                        "is_possible_regression": task in possible_regressions,
                        "is_likely_regression": task in likely_regressions,
                    }

        logger.info(f"saved push data nodes: {len(saved_nodes)}")
        logger.info(f"skipped {skipped_no_commits} (no commits in our DB)")
        logger.info(f"skipped {skipped_too_big_commits} (too big commits)")
        logger.info(f"skipped {skipped_no_tasks} (no interesting tasks)")

    db.append(test_scheduling.TEST_SCHEDULING_DB, generate_data())

    zstd_compress(test_scheduling.TEST_SCHEDULING_DB)

    # Store the push counter so that the next run can continue from it.
    past_failures["push_num"] = push_num
    past_failures.close()
    with open_tar_zst("data/past_failures.lmdb.tar.zst") as tar:
        tar.add("data/past_failures.lmdb")