Example #1
def eda_transform_log1(self, data, name, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates a two-panel visualization. The left panel shows the log + 1 transformed
            distribution overlaid on a fitted normal distribution. The right panel shows a
            QQ (probability) plot of the log + 1 transformed values against a straight reference line.

        ---
        Parameters:
            data : Pandas Series
                Target variable data object.
            name : str
                Name of target variable.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="dist/kde - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=223,
    )

    # add distribution / kernel density plot to canvas
    p.dist_plot(
        np.log1p(data), color=style.style_grey, fit=stats.norm, x_rotate=True, ax=ax
    )

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="probability plot - {} (log+1)".format(name),
        x_label="",
        y_label="",
        y_shift=0.8,
        position=224,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(np.log1p(data), plot=ax)

    # turn off x and y ticks
    plt.xticks([])
    plt.yticks([])
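PrettierPlot supplies the canvas and plotting helpers used above. For readers without that dependency, here is a minimal sketch of the same two-panel idea using only matplotlib and scipy; the sample data and variable names are illustrative assumptions, not part of the original example.

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# illustrative right-skewed sample standing in for the target variable
rng = np.random.default_rng(0)
data = rng.lognormal(mean=0.0, sigma=1.0, size=500)

# log + 1 transform
log_data = np.log1p(data)

fig, (ax_dist, ax_prob) = plt.subplots(1, 2, figsize=(10, 4))

# left panel: transformed distribution with a fitted normal curve overlaid
ax_dist.hist(log_data, bins=30, density=True, color="grey", alpha=0.7)
loc, scale = stats.norm.fit(log_data)
grid = np.linspace(log_data.min(), log_data.max(), 200)
ax_dist.plot(grid, stats.norm.pdf(grid, loc, scale))
ax_dist.set_title("dist/kde - target (log+1)")

# right panel: QQ / probability plot of the transformed values against a straight line
stats.probplot(log_data, plot=ax_prob)
ax_prob.set_title("probability plot - target (log+1)")

plt.show()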
Example #2
def eda_num_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=131,
                       title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        x_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # determine y-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature),
                       position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()
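A maintenance note on this example (and the ones that follow): DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. The sketch below shows one way the same skew/kurtosis summary table could be built with pd.concat on current pandas; the sample Series is an illustrative assumption.

import numpy as np
import pandas as pd
from scipy import stats

# illustrative feature values standing in for bi_df[feature]
rng = np.random.default_rng(0)
feature_values = pd.Series(rng.normal(loc=50, scale=10, size=200), name="feature")

# describe() output as a two-column frame, mirroring describe_df above
describe_df = pd.DataFrame(feature_values.describe()).reset_index()

# skew and kurtosis rows added via pd.concat instead of the removed DataFrame.append
extra_rows = pd.DataFrame(
    [
        {"index": "skew", "feature": stats.skew(feature_values.values, nan_policy="omit")},
        {"index": "kurtosis", "feature": stats.kurtosis(feature_values.values, nan_policy="omit")},
    ]
)
describe_df = pd.concat([describe_df, extra_rows], ignore_index=True)
describe_df = describe_df.rename(columns={"index": ""})
print(describe_df)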
Example #3
def eda_cat_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            outliers_out_of_scope=None,
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a categorical target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis upper limit so that outliers are out of scope of the visualization.
                The x-axis upper limit is reset to the maximum non-outlier value.

                To identify outliers, the IQR is calculated, and values below the first quartile
                minus the IQR, or above the third quartile plus the IQR, are designated as outliers.
                If True is passed, the IQR that is subtracted/added is multiplied by 5. If a float or
                int is passed, the IQR is multiplied by that value instead. Higher values require
                observations to be more extreme before they are flagged as outliers.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """
    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # bivariate summary statistics
    bi_summ_stats_df = pd.DataFrame(
        columns=["Class", "Count", "Proportion", "Mean", "StdDev"])

    # for each unique class label
    for labl in np.unique(self.target):

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        # append summary statistics for feature values associated with class label
        bi_summ_stats_df = bi_summ_stats_df.append(
            {
                "Class": labl,
                "Count": len(feature_slice),
                "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
                "Mean": np.mean(feature_slice),
                "StdDev": np.std(feature_slice),
            },
            ignore_index=True,
        )

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(np.int)

    ## Feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add count of missing values
    describe_df = describe_df.append(
        {
            "index": "missing",
            feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
        },
        ignore_index=True,
    )

    # add skew
    describe_df = describe_df.append(
        {
            "index":
            "skew",
            feature:
            np.round(stats.skew(bi_df[feature].values, nan_policy="omit"), 5),
        },
        ignore_index=True,
    )
    # add kurtosis
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test
    if len(np.unique(self.target)) == 2:
        # split feature values by the two class labels
        s1 = bi_df[bi_df[self.target.name] == bi_df[self.target.name].unique()[0]][feature]
        s2 = bi_df[bi_df[self.target.name] == bi_df[self.target.name].unique()[1]][feature]
        if len(s1) > 30 and len(s2) > 30:

            # perform z-test, return z-statistic and p-value
            z, p_val = ztest(s1, s2)

            # add z-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "z-test statistic": z,
                    "p-value": p_val
                }],
                columns=["z-test statistic", "p-value"],
                index=[feature],
            ).round(4)
        else:
            # perform t-test, return t-score and p-value
            t, p_val = stats.ttest_ind(s1, s2)

            # add t-statistic and p-value to DataFrame
            stat_test_df = pd.DataFrame(
                data=[{
                    "t-test statistic": t,
                    "p-value": p_val
                }],
                columns=["t-test statistic", "p-value"],
                index=[feature],
            ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=[
                "Feature summary", "Feature vs. target summary",
                "Statistical test"
            ],
        )
    else:

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # if boolean is passed to outliers_out_of_scope
    if isinstance(outliers_out_of_scope, bool):
        # if outliers_out_of_scope = True
        if outliers_out_of_scope:

            # identify outliers using IQR method and an IQR step of 5
            outliers = self.outlier_IQR(self.data[feature], iqr_step=5)

            # reset x-axis minimum and maximum
            x_axis_min = self.data[feature].drop(index=outliers).min()
            x_axis_max = self.data[feature].drop(index=outliers).max()
    # if outliers_out_of_scope is a float or int
    elif isinstance(outliers_out_of_scope, float) or isinstance(
            outliers_out_of_scope, int):
        # identify outliers using IQR method and an IQR step equal to the float/int passed
        outliers = self.outlier_IQR(self.data[feature],
                                    iqr_step=outliers_out_of_scope)

        # reset x-axis minimum and maximum
        x_axis_min = self.data[feature].drop(index=outliers).min()
        x_axis_max = self.data[feature].drop(index=outliers).max()

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    ## dynamically determine precision of x-units
    # capture min and max feature values
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # reset x-axis limits if outlier truncation was requested
    if outliers_out_of_scope:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    ## dynamically determine precision of x-units
    # capture min and max feature values
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # generate color list
    color_list = style.color_gen(name=color_map,
                                 num=len(np.unique(self.target)))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=legend_labels if legend_labels is not None else
            np.arange(len(np.unique(self.target))),
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # reset x-axis limits if outlier truncation was requested
    if outliers_out_of_scope:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    ## dynamically determine precision of x-units
    # capture min and max feature values
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # add horizontal box plot to canvas
    p.box_plot_h(x=feature,
                 y=self.target.name,
                 data=bi_df,
                 alpha=0.7,
                 x_units=x_units,
                 legend_labels=legend_labels,
                 bbox=(1.2, 1.0),
                 suppress_outliers=True,
                 ax=ax)

    # reset x-axis limits if outlier truncation was requested
    if outliers_out_of_scope:
        plt.xlim(x_axis_min - (x_axis_min * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)

    plt.show()
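The x-axis truncation above relies on self.outlier_IQR, which is not shown in this example. The helper below is a hypothetical stand-in illustrating one way an IQR-based filter that returns an index of outlier rows (the return convention implied by the Series.drop(index=outliers) calls above) might be written; the name outlier_iqr and the sample data are assumptions.

import numpy as np
import pandas as pd

def outlier_iqr(series, iqr_step=1.5):
    """Return index labels of values outside [Q1 - iqr_step * IQR, Q3 + iqr_step * IQR].

    Hypothetical stand-in for the unshown outlier_IQR helper used above.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - iqr_step * iqr
    upper = q3 + iqr_step * iqr
    return series[(series < lower) | (series > upper)].index

# illustrative usage: drop outliers before resetting the x-axis limits
rng = np.random.default_rng(0)
feature = pd.Series(np.append(rng.normal(size=200), [250.0, -300.0]), name="feature")
outliers = outlier_iqr(feature, iqr_step=5)
x_axis_min = feature.drop(index=outliers).min()
x_axis_max = feature.drop(index=outliers).max()
print(x_axis_min, x_axis_max)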
Example #4
def eda_num_target_num_feat(self,
                            feature,
                            training_data=True,
                            color_map="viridis",
                            chart_scale=15,
                            save_plots=False):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
            save_plots : boolean, default=False
                Controls whether plot images are saved to the experiment directory.
    """
    # dynamically choose training data objects or validation data objects
    data, target, mlm_dtypes = self.training_or_validation_dataset(
        training_data)

    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df
    describe_df = describe_df.append(
        {
            "index": "skew",
            feature: stats.skew(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.append(
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
        ignore_index=True,
    )
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Feature distribution\n* {feature}",
                       position=131,
                       title_scale=1.2)

    # determine x-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        x_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # determine y-units precision based on magnitude of max value
    if -1 <= np.nanmax(bi_df[feature].values) <= 1:
        y_units = "fff"
    elif -10 <= np.nanmax(bi_df[feature].values) <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    # x rotation
    if -10000 < np.nanmax(bi_df[feature].values) < 10000:
        x_rotate = 0
    else:
        x_rotate = 45

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=bi_df[feature].values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()
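The only structural difference from Example #2 is the closing save-or-show branch controlled by save_plots. The sketch below isolates that pattern with plain matplotlib; the output directory, feature name, and sample plot are illustrative assumptions (the original writes into self.eda_object_dir).

import os

import matplotlib.pyplot as plt
import numpy as np

save_plots = True
output_dir = "eda_plots"     # stands in for self.eda_object_dir
feature = "fare/adjusted"    # a "/" would break the file path, hence the replace below

plt.plot(np.arange(10), np.arange(10) ** 2)

if save_plots:
    # ensure the output directory exists for this standalone sketch
    os.makedirs(output_dir, exist_ok=True)
    plot_path = os.path.join(output_dir, f"{feature}.jpg".replace("/", ""))
    plt.tight_layout()
    plt.savefig(plot_path)
    plt.close()
else:
    plt.show()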