Python PrettierPlot.dist_plotの例

プログラミング言語: Python

名前空間/パッケージ名: prettierplot.plotter

クラス/型: PrettierPlot

メソッド/関数: dist_plot

hotexamples.comのコード掲載数: 5

Python PrettierPlot.dist_plot - 5件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのprettierplot.plotter.PrettierPlot.dist_plotの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

PrettierPlot(17)

make_canvas(17)

bar_v(6)

dist_plot(5)

prob_plot(4)

reg_plot(4)

facet_cat(2)

kde_plot(2)

roc_curve_plot(2)

tree_map(2)

box_plot_h(1)

box_plot_v(1)

scatter_2d(1)

stacked_bar_h(1)

コード例 #1

ファイルを表示

ファイル: eda_preprocessing.py プロジェクト: o7s8r6/mlmachine

def eda_transform_log1(self, data, name, chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Draws two panels for the log(1 + x) transform of a target variable. The
            first panel is the transformed distribution with a fitted normal curve;
            the second is a probability (QQ) plot of the same transformed values.
            Tick marks are removed from both panels.

        ---
        Parameters:
            data : Pandas Series
                Target variable data object.
            name : str
                Name of target variable, inserted into both panel titles.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    plotter = PrettierPlot(chart_scale=chart_scale)

    # canvas settings common to both panels; only title and grid position differ
    shared_canvas_kwargs = dict(x_label="", y_label="", y_shift=0.8)

    # panel at grid position 223: distribution / kernel density of log1p(data)
    dist_ax = plotter.make_canvas(
        title="dist/kde - {} (log+1)".format(name),
        position=223,
        **shared_canvas_kwargs,
    )
    plotter.dist_plot(
        np.log1p(data),
        color=style.style_grey,
        fit=stats.norm,
        x_rotate=True,
        ax=dist_ax,
    )

    # hide ticks on the current (distribution) axes
    plt.xticks([])
    plt.yticks([])

    # panel at grid position 224: QQ / probability plot of log1p(data)
    prob_ax = plotter.make_canvas(
        title="probability plot - {} (log+1)".format(name),
        position=224,
        **shared_canvas_kwargs,
    )
    plotter.prob_plot(np.log1p(data), plot=prob_ax)

    # hide ticks on the current (probability plot) axes
    plt.xticks([])
    plt.yticks([])

コード例 #2

ファイルを表示

ファイル: eda_suite.py プロジェクト: o7s8r6/mlmachine

def eda_num_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target. Displays a summary table (describe()
            output plus skew and kurtosis) and then a three-panel figure: feature distribution,
            probability plot, and feature vs. target regression plot.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
                NOTE(review): not referenced by this function body - confirm whether it
                should be forwarded to one of the plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
    """
    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[self.target.name] = bi_df[self.target.name].astype(float)

    # raw feature values are reused by the summaries and by every plot below
    feature_values = bi_df[feature].values

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df; DataFrame.append was removed in
    # pandas 2.0, so the extra rows are concatenated instead
    extra_rows = pd.DataFrame([
        {
            "index": "skew",
            feature: stats.skew(feature_values, nan_policy="omit"),
        },
        {
            "index": "kurtosis",
            feature: stats.kurtosis(feature_values, nan_policy="omit"),
        },
    ])
    describe_df = pd.concat([describe_df, extra_rows], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Feature distribution\n* {}".format(feature),
                       position=131,
                       title_scale=1.2)

    # determine axis-unit precision based on magnitude of the feature's max value
    feature_max = np.nanmax(feature_values)
    if -1 <= feature_max <= 1:
        x_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # NOTE(review): y-unit precision has always been derived from the feature's
    # max value (same rule as x), not from the target - confirm this is intended.
    y_units = x_units

    # rotate x tick labels only when the max value is 10000 or more in magnitude
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        feature_values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title="Probability plot\n* {}".format(feature),
                       position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=feature_values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Regression plot - feature vs. target\n* {}".format(feature),
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=self.target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )
    plt.show()

コード例 #3

ファイルを表示

ファイル: eda_suite.py プロジェクト: o7s8r6/mlmachine

def eda_cat_target_num_feat(self,
                            feature,
                            color_map="viridis",
                            outliers_out_of_scope=None,
                            legend_labels=None,
                            chart_scale=15):
    """
    Documentation:

        ---
        Description:
            Creates exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a categorical target. Displays summary tables (feature
            summary, per-class summary and, for a two-class target, a z-test or t-test) and
            a four-panel figure: feature distribution, probability plot, distribution by
            class, and box plot by class.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
            outliers_out_of_scope : boolean, float or int, default=None
                Truncates the x-axis limits so that outliers are out of scope of the visualization.
                The x-axis limits are reset to the minimum/maximum non-outlier values.

                To identify outliers, the IQR is calculated, and values that are below the first quartile
                minus the IQR, or above the third quartile plus the IQR are designated as outliers. If True
                is passed as a value, the IQR that is subtracted/added is multiplied by 5. If a float or int is
                passed, the IQR is multiplied by that value. Higher values increase how extreme values need
                to be to be identified as outliers. None or False leaves the x-axis limits untouched.
            legend_labels : list, default=None
                Class labels displayed in plot legend.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """
    ### data summaries
    ## bivariate roll_up table
    # combine feature column and target
    bi_df = pd.concat([self.data[feature], self.target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # bivariate summary statistics, one record per unique class label.
    # Records are collected in a list because DataFrame.append was removed in pandas 2.0.
    class_records = []
    for labl in np.unique(self.target):

        # get feature values associated with single class label
        feature_slice = bi_df[bi_df[self.target.name] == labl][feature]

        class_records.append({
            "Class": labl,
            "Count": len(feature_slice),
            "Proportion": len(feature_slice) / len(bi_df[feature]) * 100,
            "Mean": np.mean(feature_slice),
            "StdDev": np.std(feature_slice),
        })
    bi_summ_stats_df = pd.DataFrame(
        class_records,
        columns=["Class", "Count", "Proportion", "Mean", "StdDev"])

    # apply custom legend labels, or set dtype to int if column values are numeric
    if legend_labels is not None:
        bi_summ_stats_df["Class"] = legend_labels
    elif is_numeric_dtype(bi_summ_stats_df["Class"]):
        # builtin int replaces the np.int alias, which was removed in NumPy 1.24
        bi_summ_stats_df["Class"] = bi_summ_stats_df["Class"].astype(int)

    ## Feature summary
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add missing count (rows dropped for null feature values), skew and kurtosis
    extra_rows = pd.DataFrame([
        {
            "index": "missing",
            feature: np.round(self.data.shape[0] - bi_df[feature].shape[0], 5),
        },
        {
            "index": "skew",
            feature: np.round(
                stats.skew(bi_df[feature].values, nan_policy="omit"), 5),
        },
        {
            "index": "kurtosis",
            feature: stats.kurtosis(bi_df[feature].values, nan_policy="omit"),
        },
    ])
    describe_df = pd.concat([describe_df, extra_rows], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # execute z-test or t-test when the target has exactly two classes
    if len(np.unique(self.target)) == 2:
        observed_classes = bi_df[self.target.name].unique()
        s1 = bi_df[bi_df[self.target.name] == observed_classes[0]][feature]
        s2 = bi_df[bi_df[self.target.name] == observed_classes[1]][feature]

        # z-test when both samples have more than 30 observations, otherwise t-test
        if len(s1) > 30 and len(s2) > 30:
            statistic, p_val = ztest(s1, s2)
            stat_name = "z-test statistic"
        else:
            statistic, p_val = stats.ttest_ind(s1, s2)
            stat_name = "t-test statistic"

        # add test statistic and p-value to DataFrame
        stat_test_df = pd.DataFrame(
            data=[{
                stat_name: statistic,
                "p-value": p_val
            }],
            columns=[stat_name, "p-value"],
            index=[feature],
        ).round(4)

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df, stat_test_df),
            names=[
                "Feature summary", "Feature vs. target summary",
                "Statistical test"
            ],
        )
    else:

        # display summary tables
        self.df_side_by_side(
            dfs=(describe_df, bi_summ_stats_df),
            names=["Feature summary", "Feature vs. target summary"],
        )

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_standard")

    # resolve the IQR multiplier used for outlier detection. bool is checked before
    # int/float because bool is a subclass of int. True means a step of 5; False,
    # None or any unsupported type means "do not truncate the x-axis".
    iqr_step = None
    if isinstance(outliers_out_of_scope, bool):
        if outliers_out_of_scope:
            iqr_step = 5
    elif isinstance(outliers_out_of_scope, (float, int)):
        iqr_step = outliers_out_of_scope

    # the x-axis limits are only defined (and only applied below) when a step was
    # resolved; previously False reached plt.xlim with the limits undefined
    truncate_x_axis = iqr_step is not None
    if truncate_x_axis:
        # identify outliers using IQR method
        outliers = self.outlier_IQR(self.data[feature], iqr_step=iqr_step)

        # reset x-axis minimum and maximum to the non-outlier range
        in_scope_values = self.data[feature].drop(index=outliers)
        x_axis_min = in_scope_values.min()
        x_axis_max = in_scope_values.max()

    ## dynamically determine precision of x-units
    # capture min and max feature values; bi_df is not modified below, so the
    # precision is computed once and shared by the three plots that use it
    dist_min = bi_df[feature].values.min()
    dist_max = bi_df[feature].values.max()

    # determine x-units precision based on min and max values in feature
    if -3 < dist_min < 3 and -3 < dist_max < 3 and dist_max / dist_min < 10:
        x_units = "fff"
    elif -30 < dist_min < 30 and -30 < dist_max < 30 and dist_max / dist_min < 3:
        x_units = "fff"
    elif -5 < dist_min < 5 and -5 < dist_max < 5 and dist_max / dist_min < 10:
        x_units = "ff"
    elif -90 < dist_min < 90 and -90 < dist_max < 90 and dist_max / dist_min < 5:
        x_units = "ff"
    else:
        x_units = "f"

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Feature distribution\n* {}".format(feature),
        title_scale=0.85,
        position=221,
    )

    # add distribution plot to canvas
    p.dist_plot(
        bi_df[feature].values,
        color=style.style_grey,
        y_units="f",
        x_units=x_units,
        ax=ax,
    )

    # optionally reset x-axis limits
    if truncate_x_axis:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Probability plot\n* {}".format(feature),
        title_scale=0.85,
        position=222,
    )

    # add QQ / probability plot to canvas
    p.prob_plot(
        x=bi_df[feature].values,
        plot=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Distribution by class\n* {}".format(feature),
        title_scale=0.85,
        position=223,
    )

    # generate color list, one color per unique target class
    n_classes = len(np.unique(self.target))
    color_list = style.color_gen(name=color_map, num=n_classes)

    # fall back to 0..n_classes-1 as legend entries when no labels are supplied
    class_legend_labels = (legend_labels if legend_labels is not None else
                           np.arange(n_classes))

    # add one distribution plot to canvas for each category class
    for ix, labl in enumerate(np.unique(bi_df[self.target.name].values)):
        p.dist_plot(
            bi_df[bi_df[self.target.name] == labl][feature].values,
            color=color_list[ix],
            y_units="f",
            x_units=x_units,
            legend_labels=class_legend_labels,
            alpha=0.4,
            bbox=(1.0, 1.0),
            ax=ax,
        )

    # optionally reset x-axis limits
    if truncate_x_axis:
        plt.xlim(x_axis_min, x_axis_max)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title="Boxplot by class\n* {}".format(feature),
        title_scale=0.85,
        position=224,
    )

    # add horizontal box plot to canvas
    p.box_plot_h(x=feature,
                 y=self.target.name,
                 data=bi_df,
                 alpha=0.7,
                 x_units=x_units,
                 legend_labels=legend_labels,
                 bbox=(1.2, 1.0),
                 suppress_outliers=True,
                 ax=ax)

    # optionally reset x-axis limits; the lower limit is shifted by 10% of its own value
    if truncate_x_axis:
        plt.xlim(x_axis_min - (x_axis_min * 0.1), x_axis_max)

    # apply position adjustment to subplots
    plt.subplots_adjust(bottom=-0.1)

    plt.show()

コード例 #4

ファイルを表示

def regression_panel(self,
                     model,
                     X_train,
                     y_train,
                     X_valid=None,
                     y_valid=None,
                     n_folds=None,
                     title_scale=1.0,
                     color_map="viridis",
                     random_state=1,
                     chart_scale=15):
    """
    Documentation:
        Description:
            Creates a set of residual plots and a pandas DataFrame in which each row captures
            summary statistics pertaining to a model's performance. The model is fit on the
            training data and evaluated on it first. If validation data is provided, the fitted
            model is then evaluated on the validation data. Otherwise, if n_folds is an int,
            cross-validation is performed on the training dataset. The summary DataFrame is
            displayed at the end in every case.
        Parameters:
            model : model object
                Instantiated model object. Must expose fit, predict, estimator_name and
                model_iter.
            X_train : Pandas DataFrame
                Training data observations.
            y_train : Pandas Series
                Training target data.
            X_valid : Pandas DataFrame, default=None
                Validation data observations.
            y_valid : Pandas Series, default=None
                Validation target data. Required when X_valid is provided.
            n_folds : int, default=None
                Number of cross-validation folds to use. If validation data is provided through
                X_valid/y_valid, n_folds is ignored.
            title_scale : float, default=1.0
                Controls the scaling up (higher value) and scaling down (lower value) of the size of
                the main chart title, the x_axis title and the y_axis title.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
                NOTE(review): not referenced by this function body - confirm whether it
                should be forwarded to one of the plots.
            random_state : int, default=1
                Random number seed used when generating cross-validation folds.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates larger plots
                and increases visual elements proportionally.
    """

    print("*" * 55)
    print("* Estimator: {}".format(model.estimator_name))
    print("* Parameter set: {}".format(model.model_iter))
    print("*" * 55)

    print("\n" + "*" * 55)
    print("Training data evaluation")

    # fit model on training data
    model.fit(X_train.values, y_train.values)

    ## training dataset
    # generate predictions using training data and calculate residuals
    y_pred = model.predict(X_train.values)
    residuals = y_pred - y_train.values

    # dynamically size precision of x-units based on magnitude of maximum
    # predicted values
    pred_max = np.nanmax(y_pred)
    if -1 <= pred_max <= 1:
        x_units = "fff"
    elif -100 <= pred_max <= 100:
        x_units = "ff"
    else:
        x_units = "f"

    # dynamically size precision of y-units based on magnitude of maximum
    # residual
    resid_max = np.nanmax(residuals)
    if -0.1 <= resid_max <= 0.1:
        y_units = "ffff"
    elif -1 <= resid_max <= 1:
        y_units = "fff"
    elif -10 <= resid_max <= 10:
        y_units = "ff"
    else:
        y_units = "f"

    def _plot_residual_panel(panel_pred, panel_residuals, label):
        # Draw the residual scatter plot and residual distribution for one dataset.
        # Axis-unit precision is the one derived from the training data above, so
        # every panel (training, validation, CV folds) is formatted the same way.

        # chart_scale is forwarded so the documented parameter takes effect
        # NOTE(review): assumes PrettierPlot's own default chart_scale is also 15,
        # so default output is unchanged - confirm against the prettierplot version in use.
        p = PrettierPlot(chart_scale=chart_scale,
                         plot_orientation="wide_narrow")

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual plot - {}\nModel: {}\nParameter set: {}".format(
                label,
                model.estimator_name,
                model.model_iter,
            ),
            x_label="Predicted values",
            y_label="Residuals",
            y_shift=0.55,
            title_scale=title_scale,
            position=121,
        )

        # add 2-dimensional scatter plot to canvas
        p.scatter_2d(
            x=panel_pred,
            y=panel_residuals,
            size=7,
            color=style.style_grey,
            y_units=y_units,
            x_units=x_units,
            ax=ax,
        )

        # plot horizontal line at y=0
        plt.hlines(y=0,
                   xmin=np.min(panel_pred),
                   xmax=np.max(panel_pred),
                   color=style.style_grey,
                   lw=2)

        # add canvas to prettierplot object
        ax = p.make_canvas(
            title="Residual distribution - {}\nModel: {}\nParameter set: {}".
            format(
                label,
                model.estimator_name,
                model.model_iter,
            ),
            title_scale=title_scale,
            position=122,
        )

        # add distribution plot to canvas
        p.dist_plot(
            panel_residuals,
            fit=stats.norm,
            color=style.style_grey,
            y_units="ff",
            x_units="fff",
            ax=ax,
        )
        plt.show()

    _plot_residual_panel(y_pred, residuals, "training data")

    # generate regression_stats using training data and predictions
    results = self.regression_stats(
        model=model,
        y_true=y_train.values,
        y_pred=y_pred,
        feature_count=X_train.shape[1],
    )

    # one result record per evaluated dataset; the summary DataFrame is built from
    # this list because DataFrame.append was removed in pandas 2.0
    result_records = [results]

    ## validation dataset
    # if validation data is provided...
    if X_valid is not None:
        print("\n" + "*" * 55)
        print("Validation data evaluation")

        # generate predictions with validation data and calculate residuals
        # (previously this branch re-evaluated the training data by mistake)
        y_pred = model.predict(X_valid.values)
        residuals = y_pred - y_valid.values

        _plot_residual_panel(y_pred, residuals, "validation data")

        # generate regression_stats using validation data and predictions
        results = self.regression_stats(
            model=model,
            y_true=y_valid.values,
            y_pred=y_pred,
            feature_count=X_valid.shape[1],
            data_type="validation",
        )
        result_records.append(results)

    # if n_folds are provided, indicating cross-validation
    elif isinstance(n_folds, int):

        # generate cross-validation indices
        cv = list(
            KFold(n_splits=n_folds, shuffle=True,
                  random_state=random_state).split(X_train, y_train))

        print("\n" + "*" * 55)
        print("Cross validation evaluation")

        # iterate through cross-validation indices
        for i, (train_ix, valid_ix) in enumerate(cv):
            X_train_cv = X_train.iloc[train_ix]
            y_train_cv = y_train.iloc[train_ix]
            X_valid_cv = X_train.iloc[valid_ix]
            y_valid_cv = y_train.iloc[valid_ix]

            # fit model on training data and generate predictions using holdout observations
            y_pred = model.fit(X_train_cv.values,
                               y_train_cv.values).predict(X_valid_cv.values)

            # calculate residuals
            residuals = y_pred - y_valid_cv.values

            _plot_residual_panel(y_pred, residuals,
                                 "CV fold {}".format(i + 1))

            # generate regression_stats using holdout observations and predictions
            results = self.regression_stats(
                model=model,
                y_true=y_valid_cv,
                y_pred=y_pred,
                feature_count=X_valid_cv.shape[1],
                data_type="validation",
                fold=i + 1,
            )
            result_records.append(results)

        print("\n" + "*" * 55)
        print("Summary")

    # build and display the summary; columns follow the key order of the records
    regression_results_summary = pd.DataFrame(result_records)
    display(regression_results_summary)

コード例 #5

ファイルを表示

def eda_num_target_num_feat(self,
                            feature,
                            training_data=True,
                            color_map="viridis",
                            chart_scale=15,
                            save_plots=False):
    """
    Documentation:

        ---
        Description:
            Produces exploratory data visualizations and statistical summaries for a numeric
            feature in the context of a numeric target. Displays a summary table (describe()
            output plus skew and kurtosis) and then a three-panel figure: feature distribution,
            probability plot, and feature vs. target regression plot. The figure is either
            shown or saved as a .jpg file.

        ---
        Parameters:
            feature : str
                Feature to visualize.
            training_data : boolean, default=True
                Controls which dataset (training or validation) is used for visualization.
            color_map : str specifying built-in matplotlib colormap, default="viridis"
                Color map applied to plots.
                NOTE(review): not referenced by this function body - confirm whether it
                should be forwarded to one of the plots.
            chart_scale : int or float, default=15
                Controls size and proportions of chart and chart elements. Higher value creates
                larger plots and increases visual elements proportionally.
            save_plots : boolean, default=False
                Controls whether the figure is saved to self.eda_object_dir instead of
                being shown.
    """
    # dynamically choose training data objects or validation data objects;
    # the third returned object is not needed here
    data, target, _ = self.training_or_validation_dataset(training_data)

    ### data summaries
    ## feature summary
    # combine feature column and target
    bi_df = pd.concat([data[feature], target], axis=1)

    # remove any rows with nulls
    bi_df = bi_df[bi_df[feature].notnull()]

    # cast target as float
    bi_df[target.name] = bi_df[target.name].astype(float)

    # raw feature values are reused by the summaries and by every plot below
    feature_values = bi_df[feature].values

    # create summary statistic table
    describe_df = pd.DataFrame(bi_df[feature].describe()).reset_index()

    # add skew and kurtosis to describe_df; DataFrame.append was removed in
    # pandas 2.0, so the extra rows are concatenated instead
    extra_rows = pd.DataFrame([
        {
            "index": "skew",
            feature: stats.skew(feature_values, nan_policy="omit"),
        },
        {
            "index": "kurtosis",
            feature: stats.kurtosis(feature_values, nan_policy="omit"),
        },
    ])
    describe_df = pd.concat([describe_df, extra_rows], ignore_index=True)
    describe_df = describe_df.rename(columns={"index": ""})

    # display summary tables
    display(describe_df)

    ### visualizations
    # create prettierplot object
    p = PrettierPlot(chart_scale=chart_scale, plot_orientation="wide_narrow")

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Feature distribution\n* {feature}",
                       position=131,
                       title_scale=1.2)

    # determine axis-unit precision based on magnitude of the feature's max value
    feature_max = np.nanmax(feature_values)
    if -1 <= feature_max <= 1:
        x_units = "fff"
    elif -10 <= feature_max <= 10:
        x_units = "ff"
    else:
        x_units = "f"

    # NOTE(review): y-unit precision has always been derived from the feature's
    # max value (same rule as x), not from the target - confirm this is intended.
    y_units = x_units

    # rotate x tick labels only when the max value is 10000 or more in magnitude
    x_rotate = 0 if -10000 < feature_max < 10000 else 45

    # add distribution plot to canvas
    p.dist_plot(
        feature_values,
        color=style.style_grey,
        y_units=y_units,
        x_rotate=x_rotate,
        ax=ax,
    )

    # add canvas to prettierplot object
    ax = p.make_canvas(title=f"Probability plot\n* {feature}", position=132)

    # add QQ / probability plot to canvas
    p.prob_plot(x=feature_values, plot=ax)

    # add canvas to prettierplot object
    ax = p.make_canvas(
        title=f"Regression plot - feature vs. target\n* {feature}",
        position=133,
        title_scale=1.5)

    # add regression plot to canvas
    p.reg_plot(
        x=feature,
        y=target.name,
        data=bi_df,
        x_jitter=0.1,
        x_rotate=x_rotate,
        x_units=x_units,
        y_units=y_units,
        ax=ax,
    )

    # save plots or show
    if save_plots:
        # forward slashes are stripped from the file name so the feature name
        # cannot introduce extra path components
        plot_path = os.path.join(
            self.eda_object_dir,
            f"{feature}.jpg".replace("/", ""),
        )
        plt.tight_layout()
        plt.savefig(plot_path)
        plt.close()
    else:
        plt.show()