Example #1
    def save_regression(additional_metrics, model_desc, model_path, fold_cnt,
                        repeat_cnt):
        # Write the model's README.md: description, a Markdown metrics table,
        # then optional sections (learning curves, explanations, SHAP plots).
        max_metrics = additional_metrics["max_metrics"]
        with open(os.path.join(model_path, "README.md"), "w",
                  encoding="utf-8") as fout:
            fout.write(model_desc)
            fout.write("\n### Metric details:\n{}\n\n".format(
                tabulate(max_metrics.values,
                         max_metrics.columns,
                         tablefmt="pipe")))
            AdditionalMetrics.add_learning_curves(fout)
            AdditionalMetrics.add_tree_viz(fout, model_path, fold_cnt,
                                           repeat_cnt)
            AdditionalMetrics.add_linear_coefs(fout, model_path, fold_cnt,
                                               repeat_cnt)
            AdditionalMetrics.add_permutation_importance(
                fout, model_path, fold_cnt, repeat_cnt)

            plots = additional_metrics.get("additional_plots")
            if plots is not None:
                AdditionalPlots.append(fout, model_path, plots)

            AdditionalMetrics.add_shap_importance(fout, model_path, fold_cnt,
                                                  repeat_cnt)
            AdditionalMetrics.add_shap_regression(fout, model_path, fold_cnt,
                                                  repeat_cnt)

            fout.write("\n\n[<< Go back](../README.md)\n")
Example #2
    def save_multiclass_classification(additional_metrics, model_desc,
                                       model_path, fold_cnt, repeat_cnt):
        # Write the model's README.md: description, metrics table, confusion
        # matrix, then optional sections (learning curves, SHAP plots).
        max_metrics = additional_metrics["max_metrics"].transpose()
        conf_matrix = additional_metrics["confusion_matrix"]

        with open(os.path.join(model_path, "README.md"), "w",
                  encoding="utf-8") as fout:
            fout.write(model_desc)
            fout.write("\n### Metric details\n{}\n\n".format(
                max_metrics.to_markdown()))
            fout.write("\n## Confusion matrix\n{}".format(
                confusion_matrix.to_markdown()))
            AdditionalMetrics.add_learning_curves(fout)
            AdditionalMetrics.add_tree_viz(fout, model_path, fold_cnt,
                                           repeat_cnt)
            AdditionalMetrics.add_linear_coefs(fout, model_path, fold_cnt,
                                               repeat_cnt)
            AdditionalMetrics.add_permutation_importance(
                fout, model_path, fold_cnt, repeat_cnt)

            plots = additional_metrics.get("additional_plots")
            if plots is not None:
                AdditionalPlots.append(fout, model_path, plots)

            AdditionalMetrics.add_shap_importance(fout, model_path, fold_cnt,
                                                  repeat_cnt)
            AdditionalMetrics.add_shap_multiclass(fout, model_path, fold_cnt,
                                                  repeat_cnt)

            fout.write("\n\n[<< Go back](../README.md)\n")
Example #3
    def regression(target, predictions, sample_weight=None):
        # Score predictions with standard regression metrics; RMSE is taken
        # as the square root of MSE. The metric functions come from sklearn,
        # np/pd are numpy/pandas, and AdditionalPlots is a module helper.
        regression_metrics = {
            "MAE": mean_absolute_error,
            "MSE": mean_squared_error,
            "RMSE": lambda t, p, sample_weight=None: np.sqrt(
                mean_squared_error(t, p, sample_weight=sample_weight)),
            "R2": r2_score,
            "MAPE": mean_absolute_percentage_error,
        }
        max_metrics = {
            k: v(target, predictions, sample_weight=sample_weight)
            for k, v in regression_metrics.items()
        }

        return {
            "max_metrics": pd.DataFrame({
                "Metric": list(max_metrics.keys()),
                "Score": list(max_metrics.values()),
            }),
            "additional_plots": AdditionalPlots.plots_regression(target,
                                                                 predictions),
        }
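Because the returned dict also carries plots from AdditionalPlots, the function is not standalone. A self-contained sketch of just the metric computation on toy data (mean_absolute_percentage_error requires scikit-learn 0.24 or newer):

    # Compute the same five regression metrics on toy data, with the
    # AdditionalPlots dependency omitted so the snippet runs on its own.
    import numpy as np
    from sklearn.metrics import (mean_absolute_error,
                                 mean_absolute_percentage_error,
                                 mean_squared_error, r2_score)

    target = np.array([1.0, 2.0, 3.0, 4.0])
    predictions = np.array([1.1, 1.9, 3.2, 3.7])

    scores = {
        "MAE": mean_absolute_error(target, predictions),
        "MSE": mean_squared_error(target, predictions),
        "RMSE": np.sqrt(mean_squared_error(target, predictions)),
        "R2": r2_score(target, predictions),
        "MAPE": mean_absolute_percentage_error(target, predictions),
    }
    print(scores)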
Example #4
    def multiclass_classification(target, predictions, sample_weight=None):
        # Probability columns are named "prediction_<label>"; strip the
        # 11-character "prediction_" prefix to recover the class labels.
        all_labels = [i[11:] for i in predictions.columns.tolist()[:-1]]

        # All columns except the final "label" one hold class probabilities.
        predicted_probas = predictions[predictions.columns[:-1]]
        ll = logloss(target, predicted_probas, sample_weight=sample_weight)

        if "target" in target.columns.tolist():
            # multiclass coding with integer
            labels = {i: l for i, l in enumerate(all_labels)}
            target = target["target"].map(labels)
        else:
            # multiclass coding with one-hot encoding
            old_columns = target.columns
            t = target[old_columns[0]]
            for l in all_labels:
                t[target[f"target_{l}"] == 1] = l

            target = pd.DataFrame({"target": t})

        # Build the confusion matrix on string labels so the dtypes of
        # targets and predictions line up
        predicted_labels = predictions["label"]
        predictions = predicted_labels
        if not pd.api.types.is_string_dtype(predictions):
            predictions = predictions.astype(str)

        if not pd.api.types.is_string_dtype(target):
            target = target.astype(str)

        conf_matrix = confusion_matrix(target,
                                       predictions,
                                       labels=all_labels,
                                       sample_weight=sample_weight)

        rows = [f"Predicted as {a}" for a in all_labels]
        cols = [f"Labeled as {a}" for a in all_labels]

        conf_matrix = pd.DataFrame(conf_matrix, columns=rows, index=cols)

        max_metrics = classification_report(
            target,
            predictions,
            digits=6,
            labels=all_labels,
            output_dict=True,
            sample_weight=sample_weight,
        )
        max_metrics["logloss"] = ll

        return {
            "max_metrics": pd.DataFrame(max_metrics).transpose(),
            "confusion_matrix": conf_matrix,
            "additional_plots": AdditionalPlots.plots_multiclass(
                target, predicted_labels, predicted_probas),
        }
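The [11:] slice and the trailing "label" column imply a particular input layout: one probability column per class named prediction_<label>, followed by a column with the predicted label. A small sketch of that assumed layout:

    # Hypothetical predictions frame in the layout the function expects:
    # "prediction_<label>" probability columns plus a final "label" column.
    import pandas as pd

    predictions = pd.DataFrame({
        "prediction_cat": [0.7, 0.2],
        "prediction_dog": [0.3, 0.8],
        "label": ["cat", "dog"],
    })
    all_labels = [c[11:] for c in predictions.columns.tolist()[:-1]]
    print(all_labels)  # ['cat', 'dog']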
Example #5
    def binary_classification(target, predictions, sample_weight=None):
        # Recover the original class labels when they are encoded in the
        # prediction column name (e.g. "..._0_for_<neg>_1_for_<pos>").
        negative_label, positive_label = "0", "1"
        mapping = None
        try:
            pred_col = predictions.columns[0]
            if "_0_for_" in pred_col and "_1_for_" in pred_col:
                t = pred_col.split("_0_for_")[1]
                t = t.split("_1_for_")
                negative_label, positive_label = t[0], t[1]
                mapping = {0: negative_label, 1: positive_label}
        except Exception:
            pass

        predictions = np.array(predictions)
        sorted_predictions = np.sort(predictions)
        STEPS = 100  # candidate thresholds; lower for a faster, coarser scan
        details = {
            "threshold": [],
            "f1": [],
            "accuracy": [],
            "precision": [],
            "recall": [],
            "mcc": [],
        }
        samples_per_step = max(1, np.floor(predictions.shape[0] / STEPS))

        for i in range(STEPS):
            idx = int(i * samples_per_step)
            if idx + 1 >= predictions.shape[0]:
                break
            if i == 0:
                # start below the smallest prediction, so that essentially
                # all samples are classified as positive
                th = 0.9 * np.min(sorted_predictions)
            else:
                # candidate threshold: midpoint between two consecutive
                # sorted predictions
                th = float(
                    0.5 *
                    (sorted_predictions[idx] + sorted_predictions[idx + 1]))

            if np.sum(predictions > th) < 1:
                break
            response = (predictions > th).astype(int)

            details["threshold"] += [th]
            details["f1"] += [
                f1_score(target, response, sample_weight=sample_weight)
            ]
            details["accuracy"] += [
                accuracy_score(target, response, sample_weight=sample_weight)
            ]
            details["precision"] += [
                precision_score(target, response, sample_weight=sample_weight)
            ]
            details["recall"] += [
                recall_score(target, response, sample_weight=sample_weight)
            ]
            if i == 0:
                # MCC is undefined when only one class is predicted
                details["mcc"] += [0.0]
            else:
                details["mcc"] += [
                    matthews_corrcoef(target,
                                      response,
                                      sample_weight=sample_weight)
                ]

        # best value of each metric over the scanned thresholds, together
        # with the threshold at which it was achieved
        max_metrics = {
            "logloss": {
                "score": logloss(target, predictions,
                                 sample_weight=sample_weight),
                "threshold": None,  # there is no threshold for LogLoss
            },
            "auc": {
                "score": roc_auc_score(target, predictions,
                                       sample_weight=sample_weight),
                "threshold": None,  # there is no threshold for AUC
            },
            "f1": {
                "score": np.max(details["f1"]),
                "threshold": details["threshold"][np.argmax(details["f1"])],
            },
            "accuracy": {
                "score": np.max(details["accuracy"]),
                "threshold": details["threshold"][np.argmax(details["accuracy"])],
            },
            "precision": {
                "score": np.max(details["precision"]),
                "threshold": details["threshold"][np.argmax(details["precision"])],
            },
            "recall": {
                "score": np.max(details["recall"]),
                "threshold": details["threshold"][np.argmax(details["recall"])],
            },
            "mcc": {
                "score": np.max(details["mcc"]),
                "threshold": details["threshold"][np.argmax(details["mcc"])],
            },
        }

        # use the accuracy-maximizing threshold as the operating point
        threshold = float(max_metrics["accuracy"]["threshold"])

        # confusion matrix

        conf_matrix = confusion_matrix(target,
                                       predictions > threshold,
                                       sample_weight=sample_weight)

        conf_matrix = pd.DataFrame(
            conf_matrix,
            columns=[
                f"Predicted as {negative_label}",
                f"Predicted as {positive_label}",
            ],
            index=[
                f"Labeled as {negative_label}", f"Labeled as {positive_label}"
            ],
        )

        predicted_labels = pd.Series(
            (predictions.ravel() > threshold).astype(int))
        predicted_probas = pd.DataFrame({
            "proba_0": 1 - predictions.ravel(),
            "proba_1": predictions.ravel(),
        })

        if mapping is not None:
            labeled_target = target["target"].map(mapping)
            predicted_labels = predicted_labels.map(mapping)
        else:
            labeled_target = target

        return {
            "metric_details": pd.DataFrame(details),
            "max_metrics": pd.DataFrame(max_metrics),
            "confusion_matrix": conf_matrix,
            "threshold": threshold,
            "additional_plots": AdditionalPlots.plots_binary(
                labeled_target, predicted_labels, predicted_probas),
        }
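Stripped of bookkeeping, the core idea is a threshold scan: candidate cut-offs are midpoints between consecutive sorted probabilities, each metric is evaluated at every cut-off, and the best value plus its threshold are reported. A self-contained sketch of that scan on toy data:

    # Scan candidate thresholds (midpoints between consecutive sorted
    # probabilities) and keep the F1-optimal cut-off. Toy data only.
    import numpy as np
    from sklearn.metrics import f1_score

    target = np.array([0, 0, 1, 1, 1])
    predictions = np.array([0.1, 0.4, 0.35, 0.8, 0.9])
    sorted_preds = np.sort(predictions)

    best_f1, best_th = -1.0, None
    for lo, hi in zip(sorted_preds[:-1], sorted_preds[1:]):
        th = 0.5 * (lo + hi)
        score = f1_score(target, (predictions > th).astype(int))
        if score > best_f1:
            best_f1, best_th = score, th
    print(best_th, best_f1)  # 0.225 0.857...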