Ejemplo n.º 1
0
def get_preds_probas(est: ClassifierMixin, X_test: DataFrame, y_test: Series,
                     mapper_dict: Dict) -> DataFrame:
    """
    Summarise true labels, predicted labels and, when available, the
    prediction probabilities for a test set.

    Parameters
    ----------
    est : ClassifierMixin
        Fitted classifier or pipeline. If it exposes ``named_steps`` with a
        ``"clf"`` step, that step decides whether probabilities are
        available; otherwise ``est`` itself is inspected.
    X_test : DataFrame
        Test features. Its index is used as the index of the result.
    y_test : Series
        True labels, inserted as the first column (``true_label``).
    mapper_dict : Dict
        Mapping from a predicted label to the column position of that class
        in the ``predict_proba`` output. Only used for the consistency
        check between ``predict`` and ``predict_proba``.

    Returns
    -------
    DataFrame
        With probabilities: columns ``true_label``, ``predicted_label``,
        ``probability_of_<i>`` (one per probability column),
        ``max_class_number_manually`` and ``probability_of_max_class``.
        Without probabilities: ``true_label`` followed by the unnamed
        prediction column (labelled ``0``).

    Raises
    ------
    AssertionError
        If the class with the highest probability differs from the mapped
        predicted label for any row.
    """
    df_preds = DataFrame(est.predict(X_test), index=X_test.index)

    # Pipelines keep the classifier under the "clf" step; a bare classifier
    # has no named_steps and is inspected directly.
    steps = getattr(est, "named_steps", None)
    clf = steps["clf"] if steps is not None and "clf" in steps else est

    if hasattr(clf, "predict_proba"):
        # Get prediction probabilities (if available)
        df_probas = DataFrame(est.predict_proba(X_test), index=X_test.index)

        # Append prediction and prediction probabilities. The column names
        # are sized from the probability matrix itself: y_test may not
        # contain every class the estimator was trained on.
        df_summ = concat([df_preds, df_probas], axis=1)
        df_summ.columns = ["predicted_label"] + [
            f"probability_of_{i}" for i in range(df_probas.shape[1])
        ]

        # Get label (class) with maximum prediction probability for each row
        df_summ["max_class_number_manually"] = df_probas.idxmax(axis=1)
        df_summ["probability_of_max_class"] = df_probas.max(axis=1)

        # Sanity check: .predict() must agree with the arg-max of
        # .predict_proba() once labels are mapped to class positions
        lhs = df_summ["max_class_number_manually"]
        rhs = df_summ["predicted_label"].replace(mapper_dict)
        assert (lhs == rhs).eq(True).all(), (
            "predict() and the arg-max of predict_proba() disagree"
        )
    else:
        df_summ = df_preds.copy()
    # Get true label
    df_summ.insert(0, "true_label", y_test)
    return df_summ
Ejemplo n.º 2
0
def test_probabilities(model: ClassifierMixin,
                       X: np.array,
                       y: pd.Series,
                       bins: int = 10,
                       threshold: float = 0.5):
    """Print confusion matrix based on class probability."""
    probs = [p[1] for p in model.predict_proba(X)]
    print('\tProbabilities')
    df = pd.DataFrame({'prob': probs, 'label': y})
    step = 1 / bins
    cut_labels = [round(step * f, 1) for f in range(10)]
    by_prob = (df.groupby(pd.cut(df['prob'], bins,
                                 labels=cut_labels)).agg(['sum',
                                                          'count'])['label'])
    print('\t\tprobs\t1\t0\tacc')
    for index, row in by_prob.iloc[::-1].iterrows():
        ones = row['sum']
        if math.isnan(ones):
            ones = 0
        else:
            ones = int(ones)
        count = row['count']
        zeros = int(count) - ones
        if count > 0:
            acc = zeros / count if index < threshold else ones / count
        else:
            acc = 0.0
        print(f'\t\t{index}\t{ones}\t{zeros}\t{acc:.3f}')
Ejemplo n.º 3
0
    def _predict_oof_model(
        self,
        estimator: ClassifierMixin,
        X: ArrayLike,
    ) -> NDArray:
        """
        Predict probabilities of a test set from a fitted estimator.

        Parameters
        ----------
        estimator : ClassifierMixin
            Fitted estimator.
        X : ArrayLike
            Test set.

        Returns
        -------
        ArrayLike
            Predicted probabilities.
        """
        y_pred_proba = estimator.predict_proba(X)
        # we enforce y_pred_proba to contain all labels included y
        if len(estimator.classes_) != self.n_classes_:
            y_pred_proba = self._fix_number_of_classes(
                estimator.classes_,
                y_pred_proba
            )
        return y_pred_proba
Ejemplo n.º 4
0
def plot_decision_boundary(
        X: pd.DataFrame,
        y: pd.Series,
        clf: "ClassifierMixin | None" = None,
        title: str = "Decision Boundary Logistic Regression",
        legend_title: str = "Legend",
        h: float = 0.05,
        figsize: tuple = (11.7, 8.27),
):
    """Generate a simple plot of the decision boundary of a classifier.

    The classifier is fitted on ``X`` and ``y`` inside this function, then
    evaluated on a regular mesh spanning the data range padded by 1 on every
    side. Each mesh point is drawn in the colour of its predicted class with
    a marker size that grows with the highest predicted class probability.

    Parameters
    ----------
    X : pandas.DataFrame, shape (n_samples, 2)
        Feature matrix with exactly two columns. The column names are used
        as axis labels.
    y : pandas.Series, shape (n_samples)
        Target relative to X for classification. The dtype must be integer,
        object or category; the values are converted to category codes
        before fitting.
    clf : scikit-learn classifier, optional
        An unfitted object that has the `fit`, `predict` and `predict_proba`
        methods. It is fitted in place. When omitted, a fresh
        `sklearn.linear_model.LogisticRegression()` is created for each call.
    title : string
        Title for the plot.
    legend_title : string
        Legend title for the plot.
    h : float (default: 0.05)
        Step size in the mesh
    figsize: tuple (default: (11.7, 8.27))
        Width and height of the figure in inches

    Returns
    -------
    boundaries: Figure
        Properties of the figure can be changed later, e.g. use `boundaries.axes[0].set_ylim(0,100)` to change ylim
    ax: Axes
        The axes associated with the boundaries Figure.

    Raises
    ------
    ValueError
        If X does not have exactly two columns, or if y has more distinct
        labels than the default seaborn colour palette has colours.
    TypeError
        If the dtype of y is not integer, object or category.

    Examples
    --------
    >>> import seaborn as sns
    >>> from sklearn.svm import SVC
    >>> data = sns.load_dataset("iris")
    >>> # string targets are accepted: they are encoded as category codes internally
    >>> y = data["species"]
    >>> X = data[["sepal_length", "sepal_width"]]
    >>> clf = SVC(kernel="rbf", gamma=2, C=1, probability=True)
    >>> _ = plot_decision_boundary(X=X, y=y, clf=clf, title = 'Decision Boundary', legend_title = "Species")

    """

    if X.shape[1] != 2:
        raise ValueError("X must contains only two features.")

    # isinstance on CategoricalDtype replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if not (pd.api.types.is_integer_dtype(y) or pd.api.types.is_object_dtype(y)
            or isinstance(getattr(y, "dtype", None), pd.CategoricalDtype)):
        raise TypeError(
            "The target variable y can only have the following dtype: [int, object, category]."
        )

    # Build the default estimator per call: a default created in the
    # signature would be one shared instance that every call refits.
    if clf is None:
        clf = sklearn.linear_model.LogisticRegression()

    label_0, label_1 = X.columns.tolist()

    # Work on plain arrays; labels become integer category codes.
    X = X.values
    y = y.astype("category").cat.codes.values

    full_col_list = list(sns.color_palette())
    n_classes = len(np.unique(y))

    if n_classes > len(full_col_list):
        raise ValueError(
            "More labels in the data then colors in the color list. Either reduce the number of labels or expend the color list"
        )

    # One colour per class, so mesh points and training points share a map.
    cmap_bold = ListedColormap(full_col_list[:n_classes])

    # Try to include a mapping in a later release (+ show categorical labels in the legend)

    clf.fit(X, y)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid).reshape(xx.shape)
    # Highest class probability at every mesh point (drives the marker size)
    Z_max = clf.predict_proba(grid).max(axis=1).reshape(xx.shape)

    # Put the result into a color plot
    boundaries, ax = plt.subplots(figsize=figsize)
    ax.contour(xx, yy, Z, cmap=cmap_bold)
    ax.scatter(xx,
               yy,
               s=(Z_max**2 / h),
               c=Z,
               cmap=cmap_bold,
               alpha=1,
               edgecolors="none")

    # Plot also the training points
    training = ax.scatter(X[:, 0],
                          X[:, 1],
                          c=y,
                          cmap=cmap_bold,
                          edgecolors="black")
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_title(title)
    # Leave room on the right for the two legends
    boundaries.subplots_adjust(right=0.8)
    ax.set_xlabel(label_0)
    ax.set_ylabel(label_1)

    # Add legend colors
    leg1 = ax.legend(
        *training.legend_elements(),
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 0.5),
        handlelength=2,
        handletextpad=1,
        title=legend_title,
    )

    # Add legend sizes: empty scatters sized like the mesh markers
    size_handles = [
        ax.scatter([], [], c="black", s=proba**2 / h, edgecolors="none")
        for proba in (0.4, 0.6, 0.8, 1)
    ]
    labels = ["0.4", "0.6", "0.8", "1"]
    ax.legend(
        size_handles,
        labels,
        frameon=False,
        fontsize=12,
        borderaxespad=0,
        bbox_to_anchor=(1, 1),
        handlelength=2,
        handletextpad=1,
        title="Probabilities",
        scatterpoints=1,
    )
    # The second legend call replaced the first; add the colour legend back
    ax.add_artist(leg1)

    return boundaries, ax