import json
import logging
import os

import numpy as np
import pandas as pd

# Module-level logger used by the report helpers below (assumed setup).
logger = logging.getLogger(__name__)


def add_shap_binary(fout, model_path, fold_cnt, repeat_cnt):
    try:
        # SHAP dependence plots
        dep_plots = [
            f for f in os.listdir(model_path) if "_shap_dependence.png" in f
        ]
        if not len(dep_plots):
            return

        fout.write("\n\n## SHAP Dependence plots\n")
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_shap_dependence.png"
                if fname in dep_plots:
                    fout.write(f"\n### Dependence (Fold {fold+1}{repeat_str})\n")
                    fout.write(
                        f"![SHAP Dependence from Fold {fold+1}{repeat_str}]({fname})"
                    )

        # SHAP decision plots
        dec_plots = [
            f
            for f in os.listdir(model_path)
            if "_shap_class" in f and "decisions.png" in f
        ]
        if not len(dec_plots):
            return

        fout.write("\n\n## SHAP Decision plots\n")
        for target in [0, 1]:
            for decision_type in ["worst", "best"]:
                for repeat in range(repeat_cnt):
                    repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                    for fold in range(fold_cnt):
                        learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                        fname = (
                            learner_name
                            + f"_shap_class_{target}_{decision_type}_decisions.png"
                        )
                        if fname in dec_plots:
                            fout.write(
                                f"\n### Top-10 {decision_type.capitalize()} decisions for class {target} (Fold {fold+1}{repeat_str})\n"
                            )
                            fout.write(
                                f"![SHAP {decision_type} decisions class {target} from Fold {fold+1}{repeat_str}]({fname})"
                            )
    except Exception as e:
        logger.error(
            f"Exception while saving SHAP explanations. {str(e)}\nContinuing ..."
        )
def add_linear_coefs(fout, model_path, fold_cnt, repeat_cnt):
    coef_files = [f for f in os.listdir(model_path) if "_coefs.csv" in f]
    if not len(coef_files):
        return

    # Skip the section if there are too many coefficients to display.
    df = pd.read_csv(os.path.join(model_path, coef_files[0]), index_col=0)
    if df.shape[0] > 100:
        return

    # Check if multiclass: more than one column means one set of coefficients per class.
    multiclass = df.shape[1] > 1

    if multiclass:
        fout.write("\n\n## Coefficients\n")
        for repeat in range(repeat_cnt):
            repeat_str = f", repeat #{repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_coefs.csv"
                if fname in coef_files:
                    fout.write(f"\n### Coefficients learner #{fold+1}{repeat_str}\n")
                    df = pd.read_csv(os.path.join(model_path, fname), index_col=0)
                    fout.write(df.to_markdown() + "\n")
    else:
        df_all = []
        for repeat in range(repeat_cnt):
            repeat_str = f"_Repeat_{repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_coefs.csv"
                if fname in coef_files:
                    df = pd.read_csv(os.path.join(model_path, fname), index_col=0)
                    df.columns = [f"Learner_{fold+1}{repeat_str}"]
                    df_all += [df]

        df = pd.concat(df_all, axis=1)
        df["m"] = df.mean(axis=1)
        df = df.sort_values("m", axis=0, ascending=False)
        df = df.drop("m", axis=1)

        fout.write("\n\n## Coefficients\n")
        fout.write(df.to_markdown() + "\n")
def add_tree_viz(fout, model_path, fold_cnt, repeat_cnt):
    tree_viz = [f for f in os.listdir(model_path) if "_tree.svg" in f]
    if len(tree_viz):
        fout.write("\n\n## Tree visualizations\n")
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat #{repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_tree.svg"
                if fname in tree_viz:
                    fout.write(f"\n### Tree #{fold+1}{repeat_str}\n")
                    fout.write(f"![Tree {fold+1}{repeat_str}]({fname})")
def add_shap_importance(fout, model_path, fold_cnt, repeat_cnt):
    try:
        # SHAP importance
        imp_data = [
            f for f in os.listdir(model_path) if "_shap_importance.csv" in f
        ]
        if not len(imp_data):
            return

        df_all = []
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_shap_importance.csv"
                if fname in imp_data:
                    df = pd.read_csv(os.path.join(model_path, fname), index_col=0)
                    df.columns = [f"Learner {fold+1}{repeat_str}"]
                    df_all += [df]

        df = pd.concat(df_all, axis=1)
        df["m"] = df.mean(axis=1)
        df = df.sort_values(by="m", ascending=False)
        df = df.drop("m", axis=1)

        # limit to max 25 features in the plot
        ax = df.head(25).plot.barh(figsize=(10, 7))
        ax.invert_yaxis()
        ax.set_xlabel("mean(|SHAP value|) average impact on model output magnitude")
        fig = ax.get_figure()
        fig.tight_layout(pad=2.0)
        if df.shape[0] > 25:
            ax.set_title("SHAP Top-25 important features")
        else:
            ax.set_title("SHAP feature importance")
        fig.savefig(os.path.join(model_path, "shap_importance.png"))

        fout.write("\n\n## SHAP Importance\n")
        fout.write("![SHAP Importance](shap_importance.png)")
    except Exception as e:
        logger.error(
            f"Exception while saving SHAP importance. {str(e)}\nContinuing ..."
        )
def test_repeated_kfold(self):
    REPEATS = 3
    FOLDS = 2

    a = AutoML(
        results_path=self.automl_dir,
        total_time_limit=10,
        algorithms=["Random Forest"],
        train_ensemble=False,
        validation_strategy={
            "validation_type": "kfold",
            "k_folds": FOLDS,
            "repeats": REPEATS,
            "shuffle": True,
            "stratify": True,
        },
        start_random_models=1,
    )
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=5,
        n_informative=4,
        n_redundant=1,
        n_classes=2,
        n_clusters_per_class=3,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

    a.fit(X, y)

    result_files = os.listdir(
        os.path.join(self.automl_dir, "1_Default_RandomForest")
    )
    cnt = 0
    for repeat in range(REPEATS):
        for fold in range(FOLDS):
            learner_name = construct_learner_name(fold, repeat, REPEATS)
            self.assertTrue(f"{learner_name}.random_forest" in result_files)
            self.assertTrue(f"{learner_name}_training.log" in result_files)
            cnt += 1
    # every fold of every repeat should produce one learner
    self.assertEqual(cnt, REPEATS * FOLDS)
def add_tree_viz(fout, model_path, fold_cnt, repeat_cnt):
    tree_viz = [f for f in os.listdir(model_path) if "_tree.svg" in f]
    if len(tree_viz):
        fout.write("\n\n## Decision Tree \n")
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat #{repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_tree.svg"
                if fname in tree_viz:
                    fout.write(f"\n### Tree #{fold+1}{repeat_str}\n")
                    fout.write(f"![Tree {fold+1}{repeat_str}]({fname})")

                    try:
                        fname = os.path.join(model_path, learner_name + "_rules.txt")
                        if os.path.exists(fname):
                            fout.write("\n\n### Rules\n\n")
                            with open(fname, "r") as fin:
                                fout.write(fin.read() + "\n\n")
                    except Exception as e:
                        logger.info("Problem with adding rules to report. " + str(e))
def add_shap_multiclass(fout, model_path, fold_cnt, repeat_cnt):
    try:
        # SHAP dependence plots
        dep_plots = [
            f for f in os.listdir(model_path) if "_shap_dependence" in f
        ]
        if not len(dep_plots):
            return

        # Recover the class labels from the plot file names: take the tokens
        # after "class", join them, and strip the ".png" extension.
        start_ind = 0
        for i, token in enumerate(dep_plots[0].split("_")):
            if token == "class":
                start_ind = i + 1
                break
        classes = []
        for plot_name in dep_plots:
            parts = plot_name.split("_")
            classes += ["".join(parts[start_ind:])[:-4]]
        classes = np.unique(classes)

        fout.write("\n\n## SHAP Dependence plots\n")
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                for t in classes:
                    fname = learner_name + f"_shap_dependence_class_{t}.png"
                    if fname in dep_plots:
                        fout.write(
                            f"\n### Dependence {t} (Fold {fold+1}{repeat_str})\n"
                        )
                        fout.write(
                            f"![SHAP Dependence from fold {fold+1}{repeat_str}]({fname})"
                        )

        # SHAP decision plots
        dec_plots = [
            f
            for f in os.listdir(model_path)
            if "_sample_" in f and "decisions.png" in f
        ]
        if not len(dec_plots):
            return

        fout.write("\n\n## SHAP Decision plots\n")
        for decision_type in ["worst", "best"]:
            for sample in [0, 1, 2, 3]:
                for repeat in range(repeat_cnt):
                    repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                    for fold in range(fold_cnt):
                        learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                        fname = (
                            learner_name
                            + f"_sample_{sample}_{decision_type}_decisions.png"
                        )
                        if fname in dec_plots:
                            fout.write(
                                f"\n### {decision_type.capitalize()} decisions for selected sample {sample+1} (Fold {fold+1}{repeat_str})\n"
                            )
                            fout.write(
                                f"![SHAP {decision_type} decisions from Fold {fold+1}{repeat_str}]({fname})"
                            )
    except Exception as e:
        logger.error(
            f"Exception while saving SHAP explanations. {str(e)}\nContinuing ..."
        )
def add_permutation_importance(fout, model_path, fold_cnt, repeat_cnt):
    # permutation importance
    imp_data = [
        f
        for f in os.listdir(model_path)
        if "_importance.csv" in f and "shap" not in f
    ]
    if not len(imp_data):
        return

    df_all = []
    for repeat in range(repeat_cnt):
        repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
        for fold in range(fold_cnt):
            learner_name = construct_learner_name(fold, repeat, repeat_cnt)
            fname = learner_name + "_importance.csv"
            if fname in imp_data:
                df = pd.read_csv(os.path.join(model_path, fname), index_col=0)
                df.columns = [f"Learner {fold+1}{repeat_str}"]
                df_all += [df]

    df = pd.concat(df_all, axis=1)
    df["m"] = df.mean(axis=1)
    df = df.sort_values(by="m", ascending=False)
    df = df.drop("m", axis=1)

    # limit to max 25 features in the plot
    ax = df.head(25).plot.barh(figsize=(10, 7))
    ax.invert_yaxis()
    ax.set_xlabel("Mean of feature importance")
    fig = ax.get_figure()
    fig.tight_layout(pad=2.0)
    if df.shape[0] > 25:
        ax.set_title("Top-25 important features")
    else:
        ax.set_title("Feature importance")
    fig.savefig(os.path.join(model_path, "permutation_importance.png"))

    fout.write("\n\n## Permutation-based Importance\n")
    fout.write("![Permutation-based Importance](permutation_importance.png)")

    if "random_feature" in df.index.tolist():
        # Count, per learner, how many features score no better than the random feature.
        df["counter"] = 0
        df = df.fillna(0)  # features not used by some learners show up as NaN
        max_counter = 0.0
        for col in df.columns:
            if "Learner" not in col:
                continue
            score = max(0, df[col]["random_feature"]) + 1e-6
            df["counter"] += (df[col] <= score).astype(int)
            max_counter += 1.0

        """ version 1
        df["min_score"] = df.min(axis=1)
        df["max_score"] = df.max(axis=1)
        random_feature_score = max(
            0.0, float(df["max_score"]["random_feature"])
        )  # it should be at least 0
        drop_features = df.index[
            df["min_score"] < random_feature_score + 1e-6
        ].tolist()
        """
        # version 2 - should be better: drop a feature if at least half of the
        # learners rank it no higher than the random feature
        threshold = max_counter / 2.0
        drop_features = df.index[df["counter"] >= threshold].tolist()

        fname = os.path.join(os.path.dirname(model_path), "drop_features.json")
        # use a separate handle so the report file handle `fout` is not shadowed
        with open(fname, "w") as fjson:
            fjson.write(json.dumps(drop_features, indent=4))

        fname = os.path.join(
            os.path.dirname(model_path),
            f"features_scores_threshold_{threshold}.csv",
        )
        df.to_csv(fname, index=False)
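# A minimal usage sketch, not the library's actual report builder: the add_* helpers
# above all share the signature (fout, model_path, fold_cnt, repeat_cnt) and each one
# returns early when its artifacts are missing, so a caller can safely append every
# explanation section to a model's markdown report. The function name
# `write_explanations_report` and the "README.md" file name are assumptions for
# illustration only.
def write_explanations_report(model_path, fold_cnt, repeat_cnt):
    report_path = os.path.join(model_path, "README.md")
    with open(report_path, "a") as fout:
        add_permutation_importance(fout, model_path, fold_cnt, repeat_cnt)
        add_shap_importance(fout, model_path, fold_cnt, repeat_cnt)
        add_shap_binary(fout, model_path, fold_cnt, repeat_cnt)
        add_shap_multiclass(fout, model_path, fold_cnt, repeat_cnt)
        add_linear_coefs(fout, model_path, fold_cnt, repeat_cnt)
        add_tree_viz(fout, model_path, fold_cnt, repeat_cnt)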
def set_learner_name(self, fold, repeat, repeats):
    self.name = construct_learner_name(fold, repeat, repeats)
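# `construct_learner_name` is used throughout this section but defined elsewhere in
# the package. Based on how it is called above and on the per-learner file names it
# feeds (e.g. "<learner_name>_training.log"), a minimal sketch could look like the
# following; the exact naming scheme is an assumption, not the package's guaranteed
# format.
def construct_learner_name(fold, repeat, repeats):
    # add a repeat suffix only when repeated validation is used
    repeat_str = f"_repeat_{repeat}" if repeats > 1 else ""
    return f"learner_fold_{fold}{repeat_str}"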