def __init__(self, model_name, repo_dir, git_repo_dir, method_defect_predictor_dir):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label")

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
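
# The ETag-gated download pattern above (download_check_etag returning whether
# the artifact changed, followed by zstd_decompress) avoids re-downloading
# unchanged model data. A minimal sketch of the idea, assuming a plain HTTP
# server that honors If-None-Match; this helper is hypothetical, not the real
# bugbug implementation.
import os
import requests

def download_if_changed(url: str, path: str, etag_path: str) -> bool:
    """Download `url` into `path` unless the server's ETag matches the cached one.

    Returns True if a new file was written.
    """
    headers = {}
    if os.path.exists(etag_path):
        with open(etag_path) as f:
            headers["If-None-Match"] = f.read().strip()

    response = requests.get(url, headers=headers)
    if response.status_code == 304:
        return False  # Cached copy is still current.
    response.raise_for_status()

    with open(path, "wb") as f:
        f.write(response.content)
    with open(etag_path, "w") as f:
        f.write(response.headers.get("ETag", ""))
    return True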
def __init__(self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir):
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = self.load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        if not os.path.exists(model_data_X_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
            )
            zstd_decompress(model_data_X_path)
            assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        if not os.path.exists(model_data_y_path):
            download_check_etag(
                URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
            )
            zstd_decompress(model_data_y_path)
            assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

    if model_name == "testselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        self.past_failures_data = test_scheduling.get_past_failures()

        self.backout_model = self.load_model("backout")
        assert self.backout_model is not None
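
# clone_git_repo above pins MethodDefectPredictor to an exact commit so results
# are reproducible across runs. A minimal sketch of such a helper, assuming git
# is on PATH; the name and structure are illustrative, not necessarily bugbug's
# own implementation.
import os
import subprocess
from typing import Optional

def clone_git_repo(repo_url: str, repo_dir: str, rev: Optional[str] = None) -> None:
    # Clone only if the target directory doesn't already hold a checkout.
    if not os.path.isdir(repo_dir):
        subprocess.run(["git", "clone", repo_url, repo_dir], check=True)
    if rev is not None:
        # Detach at the pinned revision for a reproducible working tree.
        subprocess.run(["git", "checkout", rev], cwd=repo_dir, check=True)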
def __init__(self, cache_root, git_repo_dir, method_defect_predictor_dir):
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    regressormodel_path = "regressormodel"
    if not os.path.exists(regressormodel_path):
        download_check_etag(
            URL.format(f"{regressormodel_path}.zst"), f"{regressormodel_path}.zst"
        )
        zstd_decompress(regressormodel_path)
        assert os.path.exists(regressormodel_path), "Decompressed model exists"

    regressormodel_data_X_path = "regressormodel_data_X"
    if not os.path.exists(regressormodel_data_X_path):
        download_check_etag(
            URL.format(f"{regressormodel_data_X_path}.zst"),
            f"{regressormodel_data_X_path}.zst",
        )
        zstd_decompress(regressormodel_data_X_path)
        assert os.path.exists(
            regressormodel_data_X_path
        ), "Decompressed X dataset exists"

    regressormodel_data_y_path = "regressormodel_data_y"
    if not os.path.exists(regressormodel_data_y_path):
        download_check_etag(
            URL.format(f"{regressormodel_data_y_path}.zst"),
            f"{regressormodel_data_y_path}.zst",
        )
        zstd_decompress(regressormodel_data_y_path)
        assert os.path.exists(
            regressormodel_data_y_path
        ), "Decompressed y dataset exists"

    self.model = RegressorModel.load(regressormodel_path)
    self.X = to_array(joblib.load(regressormodel_data_X_path))
    self.y = to_array(joblib.load(regressormodel_data_y_path))

    self.method_defect_predictor_dir = method_defect_predictor_dir

    self.clone_git_repo(
        "https://github.com/lucapascarella/MethodDefectPredictor",
        method_defect_predictor_dir,
        "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
    )

    self.git_repo_dir = git_repo_dir

    self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)
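
# zstd_decompress above turns the downloaded "<name>.zst" archive into "<name>".
# A minimal sketch using the `zstandard` package, streaming so large model
# datasets never need to fit in memory twice; an assumed implementation, shown
# for illustration only.
import zstandard

def zstd_decompress(path: str) -> None:
    dctx = zstandard.ZstdDecompressor()
    with open(f"{path}.zst", "rb") as src, open(path, "wb") as dst:
        # copy_stream decompresses chunk by chunk from src into dst.
        dctx.copy_stream(src, dst)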
def print_feature_importances(self, important_features, class_probabilities=None):
    feature_names = self.get_human_readable_feature_names()

    # extract importance values from the top features for the predicted class
    # when classifying
    if class_probabilities is not None:
        # shap_values are stored in class 1 for binary classification
        if len(class_probabilities[0]) != 2:
            predicted_class_index = class_probabilities.argmax(axis=-1)[0]
        else:
            predicted_class_index = 0

        predicted_class = self.class_names[predicted_class_index]
        imp_values = important_features["classes"][predicted_class][0]
        shap_val = []
        top_feature_names = []

        for importance, index, is_positive in imp_values:
            if is_positive:
                shap_val.append("+" + str(importance))
            else:
                shap_val.append("-" + str(importance))

            feature_value = np.squeeze(
                to_array(important_features["values"])[:, int(index)]
            )
            top_feature_names.append(
                f"{feature_names[int(index)]} = {feature_value.round(decimals=5)}"
            )
        shap_val = [[predicted_class] + shap_val]

    # extract importance values from the top features for all the classes
    # when training
    else:
        top_feature_names = [
            feature_names[int(index)]
            for importance, index, is_pos in important_features["average"]
        ]
        shap_val = [
            [class_name] + imp_values[1]
            for class_name, imp_values in important_features["classes"].items()
        ]

    # allow maximum of 5 columns in a row to fit the page better
    print("Top {} features:".format(len(top_feature_names)))
    for i in range(0, len(top_feature_names), 5):
        table = []
        for item in shap_val:
            table.append(item[i : i + 5])
        print(
            tabulate(
                table,
                headers=(["classes"] + top_feature_names)[i : i + 5],
                tablefmt="grid",
            ),
            end="\n\n",
        )
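
# The 5-column chunking above keeps wide importance tables readable on a page.
# A small standalone demonstration of the same tabulate pattern with toy data
# (feature names and values here are made up for illustration):
from tabulate import tabulate

feature_names = [f"feat_{i}" for i in range(8)]
row = ["positive"] + [
    f"+{v:.3f}" for v in (0.91, 0.52, 0.33, 0.21, 0.13, 0.08, 0.05, 0.02)
]

for i in range(0, len(feature_names), 5):
    print(
        tabulate(
            [row[i : i + 5]],
            headers=(["classes"] + feature_names)[i : i + 5],
            tablefmt="grid",
        ),
        end="\n\n",
    )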
def classify(
    self,
    items,
    probabilities=False,
    importances=False,
    importance_cutoff=0.15,
    background_dataset=None,
):
    assert items is not None
    assert (
        self.extraction_pipeline is not None and self.clf is not None
    ), "The module needs to be initialized first"

    if not isinstance(items, list):
        items = [items]

    assert isinstance(items[0], dict) or isinstance(items[0], tuple)

    X = self.extraction_pipeline.transform(lambda: items)
    if probabilities:
        classes = self.clf.predict_proba(X)
    else:
        classes = self.clf.predict(X)

    classes = self.overwrite_classes(items, classes, probabilities)

    if importances:
        pred_class_index = classes.argmax(axis=-1)[0]
        pred_class = self.le.inverse_transform([pred_class_index])[0]

        if background_dataset is None:
            explainer = shap.TreeExplainer(self.clf)
        else:
            explainer = shap.TreeExplainer(
                self.clf,
                to_array(background_dataset(pred_class)),
                feature_perturbation="interventional",
            )

        shap_values = explainer.shap_values(to_array(X))

        # In the binary case, sometimes shap returns a single shap values matrix.
        if len(classes[0]) == 2 and not isinstance(shap_values, list):
            shap_values = [-shap_values, shap_values]

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )
        important_features["values"] = X

        top_indexes = [
            int(index)
            for _, index, _ in important_features["classes"][pred_class][0]
        ]

        feature_names = self.get_human_readable_feature_names()

        feature_legend = {
            str(i + 1): feature_names[feature_i]
            for i, feature_i in enumerate(top_indexes)
        }

        return (
            classes,
            {"importances": important_features, "feature_legend": feature_legend},
        )

    return classes
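
# Depending on the shap version, TreeExplainer returns either a single matrix
# or a list of per-class matrices for binary classifiers; classify() above
# normalizes to the list form. A self-contained sketch of that normalization,
# assuming an sklearn tree ensemble (illustrative, not bugbug's code):
import numpy as np
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X, y)

shap_values = shap.TreeExplainer(clf).shap_values(X)
if not isinstance(shap_values, list):
    # Binary case: class-0 contributions are the negation of class-1's.
    shap_values = [-shap_values, shap_values]
print(len(shap_values))  # 2, one entry per class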
def train(self, importance_cutoff=0.15, limit=None):
    classes, self.class_names = self.get_labels()
    self.class_names = sort_class_names(self.class_names)

    # Get items and labels, filtering out those for which we have no labels.
    X_gen, y = split_tuple_generator(lambda: self.items_gen(classes))

    # Extract features from the items.
    X = self.extraction_pipeline.fit_transform(X_gen)

    # Calculate labels.
    y = np.array(y)

    if limit:
        X = X[:limit]
        y = y[:limit]

    print(f"X: {X.shape}, y: {y.shape}")

    is_multilabel = isinstance(y[0], np.ndarray)
    is_binary = len(self.class_names) == 2

    # Split dataset in training and test.
    X_train, X_test, y_train, y_test = self.train_test_split(X, y)

    if self.sampler is not None:
        pipeline = make_pipeline(self.sampler, self.clf)
    else:
        pipeline = self.clf

    tracking_metrics = {}

    # Use k-fold cross validation to evaluate results.
    if self.cross_validation_enabled:
        scorings = ["accuracy"]
        if len(self.class_names) == 2:
            scorings += ["precision", "recall"]

        scores = cross_validate(pipeline, X_train, y_train, scoring=scorings, cv=5)

        print("Cross Validation scores:")
        for scoring in scorings:
            score = scores[f"test_{scoring}"]
            tracking_metrics[f"test_{scoring}"] = {
                "mean": score.mean(),
                "std": score.std() * 2,
            }
            print(f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})")

    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

    # Training on the resampled dataset if sampler is provided.
    if self.sampler is not None:
        X_train, y_train = self.sampler.fit_resample(X_train, y_train)
        print(f"resampled X_train: {X_train.shape}, y_train: {y_train.shape}")

    print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

    self.clf.fit(X_train, y_train)
    print("Model trained")

    feature_names = self.get_human_readable_feature_names()
    if self.calculate_importance and len(feature_names):
        explainer = shap.TreeExplainer(self.clf)
        shap_values = explainer.shap_values(X_train)

        # In the binary case, sometimes shap returns a single shap values matrix.
        if is_binary and not isinstance(shap_values, list):
            shap_values = [-shap_values, shap_values]
            summary_plot_value = shap_values[1]
            summary_plot_type = "layered_violin"
        else:
            summary_plot_value = shap_values
            summary_plot_type = None

        shap.summary_plot(
            summary_plot_value,
            to_array(X_train),
            feature_names=feature_names,
            class_names=self.class_names,
            plot_type=summary_plot_type,
            show=False,
        )

        # Label the axis before saving, so the label actually ends up in the file.
        matplotlib.pyplot.xlabel("Impact on model output")
        matplotlib.pyplot.savefig("feature_importance.png", bbox_inches="tight")
        matplotlib.pyplot.clf()

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )

        self.print_feature_importances(important_features)

        # Save the important features in the metric report too.
        feature_report = self.save_feature_importances(
            important_features, feature_names
        )
        tracking_metrics["feature_report"] = feature_report

    print("Training Set scores:")
    y_pred = self.clf.predict(X_train)
    if not is_multilabel:
        print(
            classification_report_imbalanced(y_train, y_pred, labels=self.class_names)
        )

    print("Test Set scores:")
    # Evaluate results on the test set.
    y_pred = self.clf.predict(X_test)

    if is_multilabel:
        assert isinstance(
            y_pred[0], np.ndarray
        ), "The predictions should be multilabel"

    print(f"No confidence threshold - {len(y_test)} classified")
    if is_multilabel:
        confusion_matrix = metrics.multilabel_confusion_matrix(y_test, y_pred)
    else:
        confusion_matrix = metrics.confusion_matrix(
            y_test, y_pred, labels=self.class_names
        )
        print(
            classification_report_imbalanced(y_test, y_pred, labels=self.class_names)
        )
        report = classification_report_imbalanced_values(
            y_test, y_pred, labels=self.class_names
        )
        tracking_metrics["report"] = report

    print_labeled_confusion_matrix(
        confusion_matrix, self.class_names, is_multilabel=is_multilabel
    )

    tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

    confidence_thresholds = [0.6, 0.7, 0.8, 0.9]

    if is_binary:
        confidence_thresholds = [0.1, 0.2, 0.3, 0.4] + confidence_thresholds

    # Evaluate results on the test set for some confidence thresholds.
    for confidence_threshold in confidence_thresholds:
        y_pred_probas = self.clf.predict_proba(X_test)
        confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]

        y_pred_filter = []
        classified_indices = []
        for i in range(0, len(y_test)):
            if not is_binary:
                argmax = np.argmax(y_pred_probas[i])
            else:
                argmax = 1 if y_pred_probas[i][1] > confidence_threshold else 0

            if y_pred_probas[i][argmax] < confidence_threshold:
                if not is_multilabel:
                    y_pred_filter.append("__NOT_CLASSIFIED__")
                continue

            classified_indices.append(i)
            if is_multilabel:
                y_pred_filter.append(y_pred[i])
            else:
                y_pred_filter.append(argmax)

        if not is_multilabel:
            y_pred_filter = np.array(y_pred_filter)
            y_pred_filter[classified_indices] = self.le.inverse_transform(
                np.array(y_pred_filter[classified_indices], dtype=int)
            )

        classified_num = sum(1 for v in y_pred_filter if v != "__NOT_CLASSIFIED__")

        print(
            f"\nConfidence threshold > {confidence_threshold} - {classified_num} classified"
        )
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test[classified_indices], np.asarray(y_pred_filter)
            )
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test.astype(str),
                y_pred_filter.astype(str),
                labels=confidence_class_names,
            )
            print(
                classification_report_imbalanced(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                )
            )
        print_labeled_confusion_matrix(
            confusion_matrix, confidence_class_names, is_multilabel=is_multilabel
        )

    self.evaluation()

    if self.entire_dataset_training:
        print("Retraining on the entire dataset...")

        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X, y)
        else:
            X_train = X
            y_train = y

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")

        self.clf.fit(X_train, y_train)

    with open(self.__class__.__name__.lower(), "wb") as f:
        pickle.dump(self, f, protocol=pickle.HIGHEST_PROTOCOL)

    if self.store_dataset:
        with open(f"{self.__class__.__name__.lower()}_data_X", "wb") as f:
            pickle.dump(X, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(f"{self.__class__.__name__.lower()}_data_y", "wb") as f:
            pickle.dump(y, f, protocol=pickle.HIGHEST_PROTOCOL)

    return tracking_metrics
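
# The threshold sweep above rejects low-confidence predictions into a synthetic
# "__NOT_CLASSIFIED__" bucket before scoring. The core filtering step, distilled
# with toy probabilities (illustrative only):
import numpy as np

y_pred_probas = np.array([[0.95, 0.05], [0.55, 0.45], [0.20, 0.80]])
confidence_threshold = 0.7

y_pred_filter = []
for probas in y_pred_probas:
    argmax = int(np.argmax(probas))
    if probas[argmax] < confidence_threshold:
        y_pred_filter.append("__NOT_CLASSIFIED__")
    else:
        y_pred_filter.append(argmax)

print(y_pred_filter)  # [0, '__NOT_CLASSIFIED__', 1]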
def __init__(
    self,
    model_name: str,
    repo_dir: str,
    git_repo_dir: str,
    method_defect_predictor_dir: str,
    use_single_process: bool,
    skip_feature_importance: bool,
):
    self.model_name = model_name
    self.repo_dir = repo_dir

    self.model = Model.load(download_model(model_name))
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo(
            "hg::https://hg.mozilla.org/mozilla-central", git_repo_dir
        )

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "8cc47f47ffb686a29324435a0151b5fabd37f865",
        )

    self.use_single_process = use_single_process
    self.skip_feature_importance = skip_feature_importance

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        with open(model_data_X_path, "rb") as fb:
            self.X = to_array(pickle.load(fb))

        with open(model_data_y_path, "rb") as fb:
            self.y = to_array(pickle.load(fb))

        past_bugs_by_function_path = "data/past_fixed_bugs_by_function.json"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "r") as f:
            self.past_bugs_by_function = json.load(f)

    if model_name == "testlabelselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PAST_FAILURES_LABEL_DB,
        )
        self.past_failures_data = test_scheduling.get_past_failures("label", True)

        self.testfailure_model = cast(
            TestFailureModel, TestFailureModel.load(download_model("testfailure"))
        )
        assert self.testfailure_model is not None
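
# to_array above is applied before handing data to shap and numpy, which expect
# dense arrays, while feature extraction pipelines typically produce scipy
# sparse matrices. A plausible minimal implementation; an assumption for
# illustration, not necessarily the real helper:
import numpy as np
from scipy.sparse import issparse

def to_array(obj):
    # Densify sparse matrices; pass dense inputs through as ndarrays.
    return obj.toarray() if issparse(obj) else np.asarray(obj)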
def classify(
    self,
    items,
    probabilities=False,
    importances=False,
    importance_cutoff=0.15,
    background_dataset=None,
):
    assert items is not None
    assert (
        self.extraction_pipeline is not None and self.clf is not None
    ), "The module needs to be initialized first"

    if not isinstance(items, list):
        items = [items]

    assert isinstance(items[0], dict) or isinstance(items[0], tuple)

    X = self.extraction_pipeline.transform(lambda: items)
    if probabilities:
        classes = self.clf.predict_proba(X)
    else:
        classes = self.clf.predict(X)

    classes = self.overwrite_classes(items, classes, probabilities)

    if importances:
        if background_dataset is None:
            explainer = shap.TreeExplainer(self.clf)
        else:
            explainer = shap.TreeExplainer(
                self.clf,
                to_array(background_dataset),
                feature_dependence="independent",
            )
        shap_values = explainer.shap_values(to_array(X))

        important_features = self.get_important_features(
            importance_cutoff, shap_values
        )
        important_features["values"] = X

        # Workaround: handle multi class case for force_plot to work correctly
        if len(classes[0]) > 2:
            pred_class_index = classes.argmax(axis=-1)[0]
            explainer.expected_value = explainer.expected_value[pred_class_index]
            shap_values = shap_values[pred_class_index]
        else:
            pred_class_index = 0

        pred_class = self.class_names[pred_class_index]
        top_indexes = [
            int(index)
            for importance, index, is_positive in important_features["classes"][
                pred_class
            ][0]
        ]

        feature_names = self.get_human_readable_feature_names()

        feature_legend = {
            str(i + 1): feature_names[feature_i]
            for i, feature_i in enumerate(top_indexes)
        }

        return (
            classes,
            {"importances": important_features, "feature_legend": feature_legend},
        )

    return classes
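
# Note the older keyword in this variant: early shap releases spelled the
# option feature_dependence="independent", which later versions renamed to
# feature_perturbation="interventional" (as used in the newer classify()
# variant above). A version-tolerant construction, as an illustrative sketch:
import shap

def make_explainer(clf, background=None):
    if background is None:
        return shap.TreeExplainer(clf)
    try:
        return shap.TreeExplainer(
            clf, background, feature_perturbation="interventional"
        )
    except TypeError:
        # Fall back to the pre-rename keyword on old shap versions.
        return shap.TreeExplainer(clf, background, feature_dependence="independent")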
def __init__(
    self, model_name, cache_root, git_repo_dir, method_defect_predictor_dir
):
    self.model_name = model_name
    self.cache_root = cache_root

    assert os.path.isdir(cache_root), f"Cache root {cache_root} is not a dir."
    self.repo_dir = os.path.join(cache_root, "mozilla-central")

    self.model = download_and_load_model(model_name)
    assert self.model is not None

    self.git_repo_dir = git_repo_dir
    if git_repo_dir:
        self.clone_git_repo("https://github.com/mozilla/gecko-dev", git_repo_dir)

    self.method_defect_predictor_dir = method_defect_predictor_dir
    if method_defect_predictor_dir:
        self.clone_git_repo(
            "https://github.com/lucapascarella/MethodDefectPredictor",
            method_defect_predictor_dir,
            "fa5269b959d8ddf7e97d1e92523bb64c17f9bbcd",
        )

    if model_name == "regressor":
        self.use_test_history = False

        model_data_X_path = f"{model_name}model_data_X"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_X_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_X_path)
        assert os.path.exists(model_data_X_path), "Decompressed X dataset exists"

        model_data_y_path = f"{model_name}model_data_y"
        updated = download_check_etag(
            URL.format(model_name=model_name, file_name=f"{model_data_y_path}.zst")
        )
        if updated:
            zstd_decompress(model_data_y_path)
        assert os.path.exists(model_data_y_path), "Decompressed y dataset exists"

        self.X = to_array(joblib.load(model_data_X_path))
        self.y = to_array(joblib.load(model_data_y_path))

        past_bugs_by_function_path = "data/past_bugs_by_function.pickle"
        download_check_etag(
            PAST_BUGS_BY_FUNCTION_URL, path=f"{past_bugs_by_function_path}.zst"
        )
        zstd_decompress(past_bugs_by_function_path)
        assert os.path.exists(past_bugs_by_function_path)
        with open(past_bugs_by_function_path, "rb") as f:
            self.past_bugs_by_function = pickle.load(f)

    if model_name == "testselect":
        self.use_test_history = True
        assert db.download_support_file(
            test_scheduling.TEST_SCHEDULING_DB, test_scheduling.PAST_FAILURES_DB
        )
        self.past_failures_data = test_scheduling.get_past_failures()

        self.testfailure_model = download_and_load_model("testfailure")
        assert self.testfailure_model is not None
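
# download_and_load_model above fetches a trained model artifact and
# deserializes it in one step; an earlier variant spells out the same
# composition as Model.load(download_model(model_name)). A minimal sketch of
# that composition, relying on the download_model and Model.load helpers used
# in the snippets above (so not runnable standalone):
def download_and_load_model(model_name):
    path = download_model(model_name)  # fetch (and decompress) the model file
    return Model.load(path)            # deserialize the trained model object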