Beispiel #1
0
    def add_shap_binary(fout, model_path, fold_cnt, repeat_cnt):
        """Append SHAP dependence and decision plot sections to a report.

        Looks in ``model_path`` for ``<learner>_shap_dependence.png`` and
        ``<learner>_shap_class_<0|1>_<worst|best>_decisions.png`` images and,
        for every learner (fold/repeat) whose image exists, writes a Markdown
        heading followed by an image link to ``fout``.

        Args:
            fout: Writable text handle of the report being assembled.
            model_path: Directory that is scanned for the plot images.
            fold_cnt: Number of folds to iterate over per repeat.
            repeat_cnt: Number of repeats; the repeat number is only mentioned
                in headings when this is greater than one.

        Any exception is logged and swallowed so the caller can keep going.
        """
        try:
            artifacts = os.listdir(model_path)

            # Dependence SHAP
            dep_plots = {f for f in artifacts if "_shap_dependence.png" in f}
            # The two sections are independent of each other: missing
            # dependence plots must not suppress the decision plots below.
            if dep_plots:
                fout.write("\n\n## SHAP Dependence plots\n")
                for repeat in range(repeat_cnt):
                    repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                    for fold in range(fold_cnt):
                        learner_name = construct_learner_name(
                            fold, repeat, repeat_cnt)
                        fname = learner_name + "_shap_dependence.png"
                        if fname in dep_plots:
                            fout.write(
                                f"\n### Dependence (Fold {fold+1}{repeat_str})\n")
                            fout.write(
                                f"![SHAP Dependence from Fold {fold+1}{repeat_str}]({fname})"
                            )

            # SHAP Decisions
            dec_plots = {
                f for f in artifacts
                if "_shap_class" in f and "decisions.png" in f
            }
            if not dec_plots:
                return

            fout.write("\n\n## SHAP Decision plots\n")
            for target in [0, 1]:
                for decision_type in ["worst", "best"]:
                    for repeat in range(repeat_cnt):
                        repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                        for fold in range(fold_cnt):
                            learner_name = construct_learner_name(
                                fold, repeat, repeat_cnt)
                            fname = (
                                learner_name +
                                f"_shap_class_{target}_{decision_type}_decisions.png"
                            )
                            if fname in dec_plots:
                                fout.write(
                                    f"\n### Top-10 {decision_type.capitalize()} decisions for class {target} (Fold {fold+1}{repeat_str})\n"
                                )
                                fout.write(
                                    f"![SHAP {decision_type} decisions class {target} from Fold {fold+1}{repeat_str}]({fname})"
                                )

        except Exception as e:
            # Best effort: a broken SHAP section must not abort the report.
            logger.error(
                f"Exception while saving SHAP explanations. {str(e)}\nContinuing ..."
            )
Beispiel #2
0
    def add_linear_coefs(fout, model_path, fold_cnt, repeat_cnt):
        """Append a "Coefficients" section built from ``*_coefs.csv`` files.

        The first coefficient file found in ``model_path`` decides the layout:
        if it has more than one value column, one Markdown table is written
        per learner; otherwise all learners are joined side by side into a
        single table whose rows are sorted by their mean value, descending.
        Nothing is written when no coefficient file exists or when the first
        file has more than 100 rows.

        Args:
            fout: Writable text handle of the report being assembled.
            model_path: Directory that is scanned for coefficient CSV files.
            fold_cnt: Number of folds to iterate over per repeat.
            repeat_cnt: Number of repeats; the repeat number is only mentioned
                in labels when this is greater than one.
        """
        coef_files = [f for f in os.listdir(model_path) if "_coefs.csv" in f]
        if not coef_files:
            return

        # check if multiclass
        df = pd.read_csv(os.path.join(model_path, coef_files[0]), index_col=0)
        if df.shape[0] > 100:
            # Too many rows to render as a readable Markdown table.
            return
        multiclass = df.shape[1] > 1

        if multiclass:
            fout.write("\n\n## Coefficients\n")

            for repeat in range(repeat_cnt):
                repeat_str = f", repeat #{repeat+1}" if repeat_cnt > 1 else ""
                for fold in range(fold_cnt):
                    learner_name = construct_learner_name(
                        fold, repeat, repeat_cnt)
                    fname = learner_name + "_coefs.csv"
                    if fname in coef_files:
                        fout.write(
                            f"\n### Coefficients learner #{fold+1}{repeat_str}\n"
                        )
                        df = pd.read_csv(os.path.join(model_path, fname),
                                         index_col=0)
                        fout.write(df.to_markdown() + "\n")

        else:
            df_all = []
            for repeat in range(repeat_cnt):
                repeat_str = f"_Repeat_{repeat+1}" if repeat_cnt > 1 else ""
                for fold in range(fold_cnt):
                    learner_name = construct_learner_name(
                        fold, repeat, repeat_cnt)
                    fname = learner_name + "_coefs.csv"
                    if fname in coef_files:
                        df = pd.read_csv(os.path.join(model_path, fname),
                                         index_col=0)
                        df.columns = [f"Learner_{fold+1}{repeat_str}"]
                        df_all.append(df)

            if not df_all:
                # None of the files on disk matched an expected learner name;
                # pd.concat would raise ValueError on an empty list.
                return

            df = pd.concat(df_all, axis=1)

            # Temporary helper column used only to order the rows.
            df["m"] = df.mean(axis=1)
            df = df.sort_values("m", axis=0, ascending=False)
            df = df.drop("m", axis=1)

            fout.write("\n\n## Coefficients\n")
            fout.write(df.to_markdown() + "\n")
    def add_tree_viz(fout, model_path, fold_cnt, repeat_cnt):
        """Append a "Tree visualizations" section linking each ``*_tree.svg``.

        Writes nothing when ``model_path`` contains no file whose name
        includes ``_tree.svg``. Otherwise writes the section heading and, for
        every fold/repeat whose SVG exists, a sub-heading plus an image link.
        """
        svg_files = [
            entry for entry in os.listdir(model_path) if "_tree.svg" in entry
        ]
        if not svg_files:
            return

        fout.write("\n\n## Tree visualizations\n")
        for repeat_idx in range(repeat_cnt):
            # The repeat number is only shown when there are several repeats.
            suffix = ""
            if repeat_cnt > 1:
                suffix = f", Repeat #{repeat_idx+1}"
            for fold_idx in range(fold_cnt):
                base_name = construct_learner_name(
                    fold_idx, repeat_idx, repeat_cnt)
                svg_name = base_name + "_tree.svg"
                if svg_name not in svg_files:
                    continue
                fout.write(f"\n### Tree #{fold_idx+1}{suffix}\n")
                fout.write(f"![Tree {fold_idx+1}{suffix}]({svg_name})")
Beispiel #4
0
    def add_shap_importance(fout, model_path, fold_cnt, repeat_cnt):
        """Plot aggregated SHAP importance and link the image in the report.

        Reads every ``<learner>_shap_importance.csv`` found in ``model_path``,
        joins them column-wise (one column per learner), orders the rows by
        their mean across learners, saves a horizontal bar chart of the first
        25 rows as ``shap_importance.png`` in ``model_path`` and writes a
        "SHAP Importance" section referencing it to ``fout``.

        Any exception is logged and swallowed so the caller can keep going.
        """
        try:
            # SHAP Importance
            csv_names = [
                entry for entry in os.listdir(model_path)
                if "_shap_importance.csv" in entry
            ]
            if not csv_names:
                return

            frames = []
            for repeat_idx in range(repeat_cnt):
                suffix = f", Repeat {repeat_idx+1}" if repeat_cnt > 1 else ""
                for fold_idx in range(fold_cnt):
                    base_name = construct_learner_name(
                        fold_idx, repeat_idx, repeat_cnt)
                    csv_name = base_name + "_shap_importance.csv"
                    if csv_name not in csv_names:
                        continue
                    frame = pd.read_csv(os.path.join(model_path, csv_name),
                                        index_col=0)
                    frame.columns = [f"Learner {fold_idx+1}{suffix}"]
                    frames.append(frame)

            importance = pd.concat(frames, axis=1)

            # "m" is a temporary column used only to order the rows.
            importance["m"] = importance.mean(axis=1)
            importance = importance.sort_values(by="m", ascending=False)
            importance = importance.drop("m", axis=1)

            # limit to max 25 features in the plot
            top_rows = importance.head(25)
            ax = top_rows.plot.barh(figsize=(10, 7))
            ax.invert_yaxis()
            ax.set_xlabel(
                "mean(|SHAP value|) average impact on model output magnitude")
            fig = ax.get_figure()
            fig.tight_layout(pad=2.0)
            if importance.shape[0] > 25:
                title = "SHAP Top-25 important features"
            else:
                title = "SHAP feature importance"
            ax.set_title(title)

            png_path = os.path.join(model_path, "shap_importance.png")
            fig.savefig(png_path)
            fout.write("\n\n## SHAP Importance\n")
            fout.write("![SHAP Importance](shap_importance.png)")
        except Exception as e:
            logger.error(
                f"Exception while saving SHAP importance. {str(e)}\nContinuing ..."
            )
Beispiel #5
0
    def test_repeated_kfold(self):
        """Repeated k-fold validation saves one learner per (repeat, fold).

        Fits a Random Forest with 2 folds and 3 repeats and checks that the
        model directory contains a saved learner and a training log for each
        of the expected learner names.
        """
        REPEATS = 3
        FOLDS = 2

        a = AutoML(
            results_path=self.automl_dir,
            total_time_limit=10,
            algorithms=["Random Forest"],
            train_ensemble=False,
            validation_strategy={
                "validation_type": "kfold",
                "k_folds": FOLDS,
                "repeats": REPEATS,
                "shuffle": True,
                "stratify": True,
            },
            start_random_models=1,
        )

        X, y = datasets.make_classification(
            n_samples=100,
            n_features=5,
            n_informative=4,
            n_redundant=1,
            n_classes=2,
            n_clusters_per_class=3,
            n_repeated=0,
            shuffle=False,
            random_state=0,
        )
        X = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])

        a.fit(X, y)

        result_files = os.listdir(
            os.path.join(self.automl_dir, "1_Default_RandomForest"))

        cnt = 0
        for repeat in range(REPEATS):
            for fold in range(FOLDS):
                learner_name = construct_learner_name(fold, repeat, REPEATS)
                # assertIn reports the missing file name on failure.
                self.assertIn(f"{learner_name}.random_forest", result_files)
                self.assertIn(f"{learner_name}_training.log", result_files)
                cnt += 1
        # assertTrue(cnt, 6) would treat 6 as the failure message and pass for
        # any non-zero count; compare the value explicitly instead.
        self.assertEqual(cnt, REPEATS * FOLDS)
    def add_tree_viz(fout, model_path, fold_cnt, repeat_cnt):
        """Append a "Decision Tree" section with tree images and rule texts.

        Writes nothing when ``model_path`` contains no file whose name
        includes ``_tree.svg``. Otherwise, for every fold/repeat, links the
        learner's ``_tree.svg`` if present and, independently of that, inlines
        the content of the learner's ``_rules.txt`` if that file exists.
        Problems while adding rules are logged at info level and ignored.
        """
        svg_files = [
            entry for entry in os.listdir(model_path) if "_tree.svg" in entry
        ]
        if not svg_files:
            return

        fout.write("\n\n## Decision Tree \n")
        for repeat_idx in range(repeat_cnt):
            # The repeat number is only shown when there are several repeats.
            suffix = ""
            if repeat_cnt > 1:
                suffix = f", Repeat #{repeat_idx+1}"
            for fold_idx in range(fold_cnt):
                base_name = construct_learner_name(fold_idx, repeat_idx, repeat_cnt)

                svg_name = base_name + "_tree.svg"
                if svg_name in svg_files:
                    fout.write(f"\n### Tree #{fold_idx+1}{suffix}\n")
                    fout.write(f"![Tree {fold_idx+1}{suffix}]({svg_name})")

                # Rules are attempted for every learner, even when no SVG
                # was linked above.
                try:
                    rules_path = os.path.join(model_path, base_name + "_rules.txt")
                    if os.path.exists(rules_path):
                        fout.write("\n\n### Rules\n\n")
                        with open(rules_path, "r") as rules_file:
                            rules_text = rules_file.read()
                            fout.write(rules_text + "\n\n")
                except Exception as e:
                    logger.info("Problem with adding rules to report. " + str(e))
Beispiel #7
0
    def add_shap_multiclass(fout, model_path, fold_cnt, repeat_cnt):
        """Append SHAP dependence and decision plot sections to a report.

        Dependence plots are expected as
        ``<learner>_shap_dependence_class_<label>.png`` and decision plots as
        ``<learner>_sample_<0..3>_<worst|best>_decisions.png`` inside
        ``model_path``. For every learner (fold/repeat) whose image exists, a
        Markdown heading followed by an image link is written to ``fout``.

        Args:
            fout: Writable text handle of the report being assembled.
            model_path: Directory that is scanned for the plot images.
            fold_cnt: Number of folds to iterate over per repeat.
            repeat_cnt: Number of repeats; the repeat number is only mentioned
                in headings when this is greater than one.

        Any exception is logged and swallowed so the caller can keep going.
        """
        try:
            artifacts = os.listdir(model_path)

            # Dependence SHAP
            dep_plots = {f for f in artifacts if "_shap_dependence" in f}
            # The two sections are independent of each other: missing
            # dependence plots must not suppress the decision plots below.
            if dep_plots:
                # Recover the class labels from the file names. Everything
                # between the marker and the ".png" extension is the label, so
                # labels that themselves contain "_" are kept intact.
                marker = "_shap_dependence_class_"
                extension = ".png"
                classes = sorted({
                    f.split(marker, 1)[1][:-len(extension)]
                    for f in dep_plots
                    if marker in f and f.endswith(extension)
                })

                fout.write("\n\n## SHAP Dependence plots\n")

                for repeat in range(repeat_cnt):
                    repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                    for fold in range(fold_cnt):
                        learner_name = construct_learner_name(
                            fold, repeat, repeat_cnt)
                        for t in classes:
                            fname = learner_name + f"_shap_dependence_class_{t}.png"
                            if fname in dep_plots:
                                fout.write(
                                    f"\n### Dependence {t} (Fold {fold+1}{repeat_str})\n"
                                )
                                fout.write(
                                    f"![SHAP Dependence from fold {fold+1}{repeat_str}]({fname})"
                                )

            # SHAP Decisions
            dec_plots = {
                f for f in artifacts
                if "_sample_" in f and "decisions.png" in f
            }
            if not dec_plots:
                return

            fout.write("\n\n## SHAP Decision plots\n")
            for decision_type in ["worst", "best"]:
                for sample in [0, 1, 2, 3]:
                    for repeat in range(repeat_cnt):
                        repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
                        for fold in range(fold_cnt):
                            learner_name = construct_learner_name(
                                fold, repeat, repeat_cnt)
                            fname = (
                                learner_name +
                                f"_sample_{sample}_{decision_type}_decisions.png"
                            )
                            if fname in dec_plots:
                                fout.write(
                                    f"\n### {decision_type.capitalize()} decisions for selected sample {sample+1} (Fold {fold+1}{repeat_str})\n"
                                )
                                fout.write(
                                    f"![SHAP {decision_type} decisions from Fold {fold+1}{repeat_str}]({fname})"
                                )
        except Exception as e:
            # Best effort: a broken SHAP section must not abort the report.
            logger.error(
                f"Exception while saving SHAP explanations. {str(e)}\nContinuing ..."
            )
Beispiel #8
0
    def add_permutation_importance(fout, model_path, fold_cnt, repeat_cnt):
        """Plot aggregated permutation importance and derive features to drop.

        Reads every ``<learner>_importance.csv`` in ``model_path`` (files with
        "shap" in their name are excluded), joins them column-wise (one column
        per learner), orders the rows by their mean across learners, saves a
        horizontal bar chart of the first 25 rows as
        ``permutation_importance.png`` in ``model_path`` and writes a section
        referencing it to ``fout``.

        If the index contains a ``random_feature`` row, each feature is
        counted once per learner in which its score does not exceed
        ``max(0, random_feature score) + 1e-6``. Features counted in at least
        half of the learners are written to ``drop_features.json`` and the
        scored table to ``features_scores_threshold_<threshold>.csv``, both in
        the parent directory of ``model_path``.

        Args:
            fout: Writable text handle of the report being assembled.
            model_path: Directory that is scanned for importance CSV files.
            fold_cnt: Number of folds to iterate over per repeat.
            repeat_cnt: Number of repeats; the repeat number is only mentioned
                in column labels when this is greater than one.
        """
        # permutation importance
        imp_data = [
            f for f in os.listdir(model_path)
            if "_importance.csv" in f and "shap" not in f
        ]
        if not imp_data:
            return

        df_all = []
        for repeat in range(repeat_cnt):
            repeat_str = f", Repeat {repeat+1}" if repeat_cnt > 1 else ""
            for fold in range(fold_cnt):
                learner_name = construct_learner_name(fold, repeat, repeat_cnt)
                fname = learner_name + "_importance.csv"
                if fname in imp_data:
                    df = pd.read_csv(os.path.join(model_path, fname),
                                     index_col=0)
                    df.columns = [f"Learner {fold+1}{repeat_str}"]
                    df_all.append(df)

        if not df_all:
            # None of the files on disk matched an expected learner name;
            # pd.concat would raise ValueError on an empty list.
            return

        df = pd.concat(df_all, axis=1)

        # "m" is a temporary column used only to order the rows.
        df["m"] = df.mean(axis=1)
        df = df.sort_values(by="m", ascending=False)
        df = df.drop("m", axis=1)

        # limit to max 25 features in the plot
        ax = df.head(25).plot.barh(figsize=(10, 7))
        ax.invert_yaxis()
        ax.set_xlabel("Mean of feature importance")
        fig = ax.get_figure()
        fig.tight_layout(pad=2.0)
        if df.shape[0] > 25:
            ax.set_title("Top-25 important features")
        else:
            ax.set_title("Feature importance")

        fig.savefig(os.path.join(model_path, "permutation_importance.png"))
        fout.write("\n\n## Permutation-based Importance\n")
        fout.write(
            "![Permutation-based Importance](permutation_importance.png)")

        if "random_feature" in df.index.tolist():

            df["counter"] = 0
            # there might be not-used features between different learners
            df = df.fillna(0)
            max_counter = 0.0
            for col in df.columns:
                if "Learner" not in col:
                    # Skips the helper "counter" column.
                    continue
                # Score to beat: the random feature's score, but at least 0.
                score = max(0, df[col]["random_feature"]) + 1e-6
                df["counter"] += (df[col] <= score).astype(int)
                max_counter += 1.0

            # An earlier approach compared each feature's minimum score with
            # the random feature's maximum score. The per-learner vote below
            # replaced it.
            threshold = max_counter / 2.0
            drop_features = df.index[df["counter"] >= threshold].tolist()

            results_dir = os.path.dirname(model_path)

            # A distinct handle name keeps the report handle ``fout`` intact.
            drop_path = os.path.join(results_dir, "drop_features.json")
            with open(drop_path, "w") as drop_file:
                drop_file.write(json.dumps(drop_features, indent=4))

            scores_path = os.path.join(
                results_dir,
                f"features_scores_threshold_{threshold}.csv",
            )
            # NOTE(review): index=False omits the index (which holds
            # "random_feature" and the other row labels) from this file;
            # confirm that is what consumers expect.
            df.to_csv(scores_path, index=False)
 def set_learner_name(self, fold, repeat, repeats):
     """Store the name built by ``construct_learner_name`` in ``self.name``."""
     learner_name = construct_learner_name(fold, repeat, repeats)
     self.name = learner_name