Example #1
0
def feature_correlation_plot(df,
                             y_column,
                             model,
                             feature_column,
                             columns_to_exclude=()):
    """Plot the correlation between a feature and the actual label.

       The feature type is detected automatically and the matching plot is produced. The correlation is defined as
       the fraction of data that has a positive true label (a.k.a. average of true label for binary label). The
       predicted positive class probability is plotted as well.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str, or 1d array-like
        Name of the feature column to plot correlation on. If passed in as 1d array-like, the features will be treated
        as one-hot encoded.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """

    # Get X, y array representation and feature indices of data
    X, y, name_to_idx = df_to_arrays(df,
                                     y_column,
                                     columns_to_exclude,
                                     return_index=True)

    # isinstance (not an exact type comparison) so that str subclasses, e.g. numpy.str_, are handled as a single
    # column name instead of being iterated character by character in the one-hot branch.
    if isinstance(feature_column, str):
        # Set up feature values and feature names
        feature_idx = name_to_idx[feature_column]
        feature_values = X[:, feature_idx]

        # Determine the feature type and plot accordingly
        if _feature_type(feature_values) == "categorical":
            return _categorical_fc_plot(X, y, model, feature_idx,
                                        feature_column)
        return _numerical_fc_plot(X, y, model, feature_idx, feature_column)

    # One-hot features: every entry of feature_column names one encoded column
    feature_idx = np.array([name_to_idx[cat] for cat in feature_column])
    return _categorical_fc_plot(X,
                                y,
                                model,
                                feature_idx,
                                feature_column,
                                one_hot=True)
Example #2
0
def _feature_importance(df,
                        y_column,
                        model,
                        n_jobs,
                        columns_to_exclude=(),
                        n_samples=100):
    """Compute all feature importances, optionally with multiprocessing.

    Parameters
    ----------
    df : DataFrame
        Data to evaluate the model on

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    n_jobs : int
        Level of multiprocessing. 1 runs everything in the current process; a negative value creates the pool with
        ``None`` so that multiprocessing picks the number of workers; any other value is passed to ``Pool`` as is.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    n_samples : int, optional (default=100)
        Number of samples to evaluate; each sample uses ``X.shape[0] // n_samples`` rows.

    Returns
    -------
    importance : list of (name, statistic) tuples
        One entry per feature, where ``statistic`` is the result of ``_comp_mean_ci`` on that feature's normalized
        values. Sorted by descending first element of ``statistic`` (presumably the mean - confirm against
        ``_comp_mean_ci``).
    """
    X, y, name_to_idx = df_to_arrays(df,
                                     y_column,
                                     columns_to_exclude,
                                     return_index=True)
    n_jobs = None if n_jobs < 0 else n_jobs

    sample_importance_func = partial(_sample_feature_importance,
                                     X=X,
                                     y_true=y,
                                     model=model,
                                     sample_size=X.shape[0] // n_samples)

    if n_jobs == 1:
        sys.stderr.write("Going single process")
        stats = [sample_importance_func(i) for i in range(n_samples)]
    else:
        with Pool(n_jobs) as executor_instance:
            n_workers = len(executor_instance._pool)

            # Ceiling division so that every worker receives a single chunk
            chunksize, extra = divmod(n_samples, n_workers)
            if extra:
                chunksize += 1

            sys.stderr.write("Start Multiprocessing, num_processes=%d" %
                             n_workers)
            stats = executor_instance.map(sample_importance_func,
                                          range(n_samples), chunksize)

    # Sanity check against the requested number of samples (previously hard-coded to 100, which broke any call
    # using a different n_samples).
    assert len(stats) == n_samples, \
        "expected %d sample results, got %d" % (n_samples, len(stats))

    stats = np.array(stats)

    # Min-max normalization over all samples and features.
    # NOTE: when every value in stats is identical, np.ptp(stats) is 0 and this division produces NaN.
    normalized = (stats - np.min(stats)) / np.ptp(stats)

    importance = [(name, _comp_mean_ci(normalized[:, idx]))
                  for name, idx in name_to_idx.items()]
    return sorted(importance, key=lambda x: -x[1][0])
Example #3
0
def feature_ale_plot(df,
                     y_column,
                     model,
                     feature_column,
                     predictor=None,
                     columns_to_exclude=(),
                     bins=100):
    """Draw the Accumulated Local Effect (ALE) plot of a single feature.

       See https://christophm.github.io/interpretable-ml-book/ale.html for a detailed explanation of ALE.
       Feature values equal to -1 are left out of both the quantile computation and the event plot.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str
        Name of the feature column to plot ALE on

    predictor : function, optional (default=None)
        The prediction function, which should take in the feature matrix and return an array of predictions.
        It should output positive class probabilities for a classification task, and actual predicted values
        for a regression task.
        If not specified, defaults to a function equivalent to: lambda X: model.predict_proba(X)[:, 1], which is for
        classification.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    bins : int, optional (default=100)
        The number of intervals for the ALE plot

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """
    # Array form of the data plus the column-name -> column-index mapping
    features, _, column_index = df_to_arrays(df,
                                             y_column,
                                             columns_to_exclude,
                                             return_index=True)
    target_col = column_index[feature_column]

    if predictor is None:

        def predictor(data):
            return model.predict_proba(data)[:, 1]

    # Distinct feature values, without the -1 entries
    distinct_vals = np.unique(features[:, target_col])
    keep = distinct_vals != -1
    distinct_vals = distinct_vals[keep]

    # bins + 1 evenly spaced percentile levels from 0 to 100 give the interval edges
    percent_levels = [step * 100 / bins for step in range(0, bins + 1)]
    quantiles = np.percentile(distinct_vals, percent_levels)
    ale, counts = _ale_num(target_col, features, predictor, quantiles)

    # Mid-point of every interval is used as its x position
    xs = (quantiles[1:] + quantiles[:-1]) / 2

    with plt.style.context(style_path):
        fig = plt.figure(figsize=(12, 9))
        grid = GridSpec(2, 1, height_ratios=[10, 1], hspace=0)

        # Upper panel: the ALE curve
        ax1 = plt.subplot(grid[0])
        fig.add_subplot(ax1)
        line_plot(ax1,
                  xs,
                  ale,
                  line_label=False,
                  xticks=[],
                  ylabel="ALE of %s" % y_column)

        # Lower panel: distribution of the observed feature values
        ax2 = plt.subplot(grid[1])
        fig.add_subplot(ax2, sharex=ax1)
        observed = features[:, target_col]
        event_plot(ax2,
                   observed[observed != -1],
                   0.5,
                   1,
                   xlabel=feature_column,
                   yticks=[],
                   ylim=(-0.2, 1.2))
    plt.show()

    plot_data = {
        "quantiles": quantiles,
        "ale": ale,
        "quantile_distribution": counts
    }
    return PlotWrapper(fig, (ax1, ax2), plot_data)
Example #4
0
def density_plot(df, y_column, models, model_names=(), columns_to_exclude=()):
    """Draw, for every model, the density of the predicted positive class probability on the actual positive data
       and on the actual negative data, all in one figure. The difference between the two distributions is also
       measured with Bhattacharyya distance, KL distance, and cross entropy (a.k.a. log-loss).

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Label of the class column

    models : array-like
        The model objects to be evaluated

    model_names : array-like
        The name of the models to be shown in the legends. When empty, names of the form "model <k>" are generated.

    columns_to_exclude : tuple, optional (default=())
        Labels of unwanted columns

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If models is empty or models and model_names does not have the same length
    """

    # Array representation of the data and boolean masks for both classes
    features, labels = df_to_arrays(df, y_column, columns_to_exclude)
    is_positive = labels == 1
    is_negative = labels == 0
    model_count = len(models)

    if model_count < 1:
        raise ValueError("no models to evaluate")

    if not len(model_names):
        model_names = [
            "model %d" % number for number in range(1, model_count + 1)
        ]

    if len(model_names) != model_count:
        raise ValueError("models and model_names must have the same length")

    # Number of evaluation points of every density curve
    n_points = 1000

    # Per-model results: one row per model for the curves, one entry per model for the scores
    pos_curves = np.empty((0, n_points))
    neg_curves = np.empty((0, n_points))
    bhattacharyya_scores = []
    kl_scores = []
    cross_entropy_scores = []

    with plt.style.context(style_path):
        fig = plt.figure(figsize=(12, 9))
        grid = GridSpec(2, 1, height_ratios=[3.5, 3.5], hspace=0)
        ax1 = fig.add_subplot(grid[0])
        ax2 = fig.add_subplot(grid[1])
        score_lines = []

        for current_model, current_name in zip(models, model_names):
            positive_proba = current_model.predict_proba(features)[:, 1]

            # Gaussian kernel density estimate for each class
            kde_pos = st.gaussian_kde(positive_proba[is_positive])
            kde_neg = st.gaussian_kde(positive_proba[is_negative])

            xs = np.arange(n_points) / n_points
            pos_curve = kde_pos(xs)
            neg_curve = kde_neg(xs)

            # Scale each curve to sum to one and turn it into a single-row 2d array
            pos_norm = (pos_curve / pos_curve.sum())[np.newaxis, :]
            neg_norm = (neg_curve / neg_curve.sum())[np.newaxis, :]

            # The three distribution-difference scores
            bd = _bhattacharyya_distance(pos_norm, neg_norm, normalize=True)
            kl = st.entropy(pos_norm[0], neg_norm[0])
            ce = _cross_entropy(pos_norm, neg_norm, normalize=True)

            # Positive density goes to the upper axes, negative density to the lower one
            line_plot(ax1,
                      xs,
                      pos_curve,
                      legend=current_name,
                      line_color=None,
                      line_label=False)
            line_plot(ax2, xs, neg_curve, line_color=None, line_label=False)

            score_lines.append(
                "%s: Bhattacharyya Distance: %.4f, KL Distance: %.4f, Cross-Entropy: %.4f"
                % (current_name, bd, kl, ce))

            # Record the results of this model
            pos_curves = np.vstack((pos_curves, pos_curve))
            neg_curves = np.vstack((neg_curves, neg_curve))
            bhattacharyya_scores.append(bd)
            kl_scores.append(kl)
            cross_entropy_scores.append(ce)

        # Both axes share the same y range, with some head room above the highest curve
        ylim_max = max(pos_curves.max(), neg_curves.max()) * 1.1
        ylim_min = round(-ylim_max * 0.05, 1)
        shared_ylim = (ylim_min, ylim_max)

        config_axes(ax1,
                    xticks=[],
                    ylabel="Positive Density",
                    ylim=shared_ylim)

        # The scores are appended to the x label of the lower axes
        config_axes(ax2,
                    y_invert=True,
                    xlabel="Probability\n" + "\n".join(score_lines),
                    ylabel="Negative Density",
                    ylim=shared_ylim)
    plt.show()

    plot_data = {
        "probability": xs,
        "pos_density": pos_curves,
        "neg_density": neg_curves,
        "Bhattacharyya": np.array(bhattacharyya_scores),
        "KL": np.array(kl_scores),
        "cross_entropy": np.array(cross_entropy_scores)
    }
    return PlotWrapper(fig, (ax1, ax2), plot_data)
Example #5
0
def decile_plot(df, y_column, model, columns_to_exclude=(), num_deciles=10):
    """Sort the data points by predicted positive class probability, divide them into bins, and plot the bins
     based on the cumulative precision and recall in two plots.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    num_deciles : int, optional (default=10)
        Number of bars to be plotted, each bar represents about 1/num_deciles of the data

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot

    Raises
    ------
    ValueError
        If the number of deciles is smaller than 1 or exceeds 50
    """

    # Validation check. The lower bound prevents a ZeroDivisionError (num_deciles == 0) or an obscure reshape error
    # (negative values) further down.
    if num_deciles < 1:
        raise ValueError("The number of deciles must be at least 1")
    if num_deciles > 50:
        raise ValueError("The number of deciles cannot exceed 50")

    # Get X, y array representation of data
    X, y = df_to_arrays(df, y_column, columns_to_exclude)

    # Get predicted probability and sort the row indices by descending positive class probability
    y_prob = model.predict_proba(X)
    indices = np.argsort(y_prob[:, 1])[::-1]

    # Split the sorted indices into num_deciles equally sized groups; the rows that do not fit evenly are appended
    # to the last group.
    n_rows = indices.shape[0]
    evenly_divisible = n_rows - n_rows % num_deciles
    deciles = list(indices[:evenly_divisible].reshape(
        (num_deciles, n_rows // num_deciles)))
    deciles[-1] = np.concatenate((deciles[-1], indices[evenly_divisible:]))

    # Number of rows with label 1 and total number of rows in each group
    true_counts = np.array(
        [np.bincount(y[decile], minlength=2)[1] for decile in deciles])
    decile_size = np.array([decile.shape[0] for decile in deciles])

    # Cumulative recall and cumulative precision over the groups
    cum_true_counts = np.cumsum(true_counts)
    cum_recall_score = cum_true_counts / true_counts.sum()
    cum_precision_score = cum_true_counts / np.cumsum(decile_size)

    # Create decile plot
    with plt.style.context(style_path):
        fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 9))
        xticks = np.arange(0, 11) / 10
        xtick_labels = ["%d%%" % tick for tick in xticks * 100]
        xs = np.arange(num_deciles) / num_deciles

        # Draw bar plot
        bar_plot(ax1,
                 xs,
                 cum_precision_score,
                 width=1 / num_deciles,
                 align='edge',
                 ylim=(0, np.max(cum_precision_score) * 1.2),
                 ylabel="Cumulative precision",
                 edge_color='w',
                 bar_label=False)

        # Create cumulative decile plot
        # NOTE(review): ax2 was already returned by plt.subplots above; confirm this second call is still needed.
        ax2 = plt.subplot(2, 1, 2, sharex=ax1)

        # Draw bar plot
        bar_plot(ax2,
                 xs,
                 cum_recall_score,
                 width=1 / num_deciles,
                 align='edge',
                 xticks=xticks,
                 xticklabels=xtick_labels,
                 xlim=(0, 1),
                 xlabel="Deciles",
                 ylim=(0, np.max(cum_recall_score) * 1.2),
                 ylabel="Cumulative recall",
                 bar_color=clr.main[0],
                 edge_color='w',
                 bar_label=False)

    plt.show()

    return PlotWrapper(
        fig, (ax1, ax2), {
            "shared_x": xs,
            "cum_recall_score": cum_recall_score,
            "cum_precision_score": cum_precision_score
        })
Example #6
0
def partial_dependence_plot(df,
                            y_column,
                            model,
                            feature_column,
                            predictor=None,
                            columns_to_exclude=(),
                            n_jobs=-1):
    """Create the Partial Dependence plot (PDP) of the target feature.

       Visit https://christophm.github.io/interpretable-ml-book/pdp.html for more detailed explanation of PDP.

    Parameters
    ----------
    df : DataFrame
        Data to be plotted

    y_column : str
        Name of the class column

    model : Scikitlearn-like-model
        The model object to be evaluated

    feature_column : str, or 1d array-like
        Name of the feature column to plot PDP on. If passed in as 1d array-like, the features will be treated as one-
        hot encoded.

    predictor : function, optional (default=None)
        The prediction function, which should take in the feature matrix and return an array of predictions
        The function should output positive class probabilities for a classification task, and actual predicted values
        for a regression task.
        If not specified, defaults to a function equivalent to: lambda X: model.predict_proba(X)[:, 1], which is for
        classification.

    columns_to_exclude : tuple, optional (default=())
        Names of unwanted columns

    n_jobs : int, optional (default=-1)
        Level of multiprocessing, 1 means single-process, -1 means unlimited (actually number of processes depends on
        the machine)

    Returns
    -------
    plot_wrapper : pytalite.plotwrapper.PlotWrapper
        The PlotWrapper object that contains the information and data of the plot
    """
    # Get X array representation and feature indices of data; the labels are not needed here
    X, _, name_to_idx = df_to_arrays(df,
                                     y_column,
                                     columns_to_exclude,
                                     return_index=True)

    if predictor is None:

        def predictor(X):
            return model.predict_proba(X)[:, 1]

    # isinstance (not an exact type comparison) so that str subclasses, e.g. numpy.str_, are handled as a single
    # column name instead of being iterated character by character in the one-hot branch.
    if isinstance(feature_column, str):
        feature_idx = name_to_idx[feature_column]
        feature_values = X[:, feature_idx]

        # Determine the feature type and plot accordingly
        if _feature_type(feature_values) == "categorical":
            return _partial_dependence_plot_cat(X, predictor, feature_idx,
                                                y_column, feature_column,
                                                n_jobs)
        return _partial_dependence_plot_num(X, predictor, feature_idx,
                                            y_column, feature_column,
                                            n_jobs)

    # One-hot features: every entry of feature_column names one encoded column
    feature_idx = np.array([name_to_idx[cat] for cat in feature_column])
    return _partial_dependence_plot_cat(X,
                                        predictor,
                                        feature_idx,
                                        y_column,
                                        feature_column,
                                        n_jobs,
                                        one_hot=True)