Ejemplo n.º 1
0
def plot_many_univariates(df, y_var_name):
    """ Draws one univariate plot against 'y_var_name' for every continuous column of the dataframe.
        Candidate columns are the continuous features reported by
        autoregression.sort_features(df); columns with two or fewer distinct
        values are skipped. Each remaining column is drawn by
        plot_one_univariate on its own axis: a two-column grid when there are
        several columns, a single wide axis when there is exactly one.
        INPUT:
            df:
                dataframe holding the feature columns and the y column
            y_var_name:
                string, the column name of the dependent y variable in the dataframe
        OUTPUT:
            None. The figure is created through matplotlib.pyplot; the list of
            continuous features (or a notice that there are none to plot) is
            printed.
    """
    # NOTE(review): y_var_name is not removed from the candidate list here, so
    # it is plotted against itself if sort_features reports it as continuous.
    continuous_features, _ = autoregression.sort_features(df)
    print(continuous_features)
    plottable = [
        name for name in continuous_features if len(df[name].unique()) > 2
    ]

    if not plottable:
        print('No Continous Features to Plot')
        return

    if len(plottable) == 1:
        only_feature = plottable[0]
        fig, ax = plt.subplots(1, 1, figsize=(14, 4.5))
        plot_one_univariate(ax, df, only_feature, y_var_name)
        ax.set_title("{}: Univariate Plot".format(only_feature))
        fig.set_tight_layout(tight=True)  # this doesn't work!!!
        # 'tight_layout' must be used in calling script as well
        fig.tight_layout(pad=2)
        return

    num_plot_rows = int(np.ceil(len(plottable) / 2.0))
    fig, axs = plt.subplots(num_plot_rows,
                            2,
                            figsize=(14, 3 * num_plot_rows))
    # Flatten once instead of on every access inside the loop.
    flat_axes = axs.flatten()
    # Wrap the list (not the enumerate object) so tqdm knows the total.
    for i, continuous_feature in enumerate(tqdm.tqdm(plottable)):
        plot_one_univariate(flat_axes[i], df, continuous_feature, y_var_name)
        flat_axes[i].set_title(f"{continuous_feature}: Univariate Plot")
    # With an odd number of features the last grid cell has nothing drawn in
    # it; hide it rather than showing an empty frame.
    for spare_ax in flat_axes[len(plottable):]:
        spare_ax.set_visible(False)
Ejemplo n.º 2
0
def drop_categories_exeeding_limit(df, y_var_name, category_limit):
    """ Drops categorical columns whose number of unique values exceeds the limit.
        INPUT:
            df:
                A dataframe of independent features and one dependent y feature
            y_var_name:
                string, the column name of the dependent y variable in the dataframe
                (excluded from the columns that are considered for dropping)
            category_limit:
                the largest number of unique values a categorical column may
                have and still be kept
        OUTPUT:
            df:
                A dataframe without the categorical columns that exceed the
                limit. The caller's dataframe is not modified; a notice is
                printed for every dropped column.
    """
    _, category_features = autoregression.sort_features(
        df.drop(y_var_name, axis=1))
    for cat in category_features:
        if len(df[cat].unique()) > category_limit:
            # DataFrame.drop returns a new frame by default, so the result
            # must be rebound -- otherwise the column is never removed.
            df = df.drop(cat, axis=1)
            print('Too many unique values in categorical feature "' + cat +
                  '", dropping "' + cat + '"')
    return df
Ejemplo n.º 3
0
def clean_df_X(df_X):
    """ Cleans a dataframe of independent variables in three passes.
        1. Every continuous column is passed through add_feature_continuous_null.
        2. Every categorical column is passed through category_clean_null_and_inf.
        3. Every continuous column left with at most one distinct value is dropped.
        The column split comes from autoregression.sort_features(df_X). The two
        helpers are defined elsewhere; presumably they replace nulls / np.infs
        and add True/False indicator columns (Ex: -np.inf => "was_neg_inf") --
        confirm against their definitions.
        INPUT:
            df_X:
                A dataframe of independent variables.
        OUTPUT:
            df_X:
                The cleaned dataframe, without the continuous columns that
                hold a single value.
    """
    numeric_cols, label_cols = autoregression.sort_features(df_X)

    for col in numeric_cols:
        df_X = add_feature_continuous_null(df_X, col)

    for col in label_cols:
        df_X = category_clean_null_and_inf(df_X, col)

    for col in numeric_cols:
        distinct_count = len(df_X[col].unique())
        if distinct_count > 1:
            continue
        # A column with one (or zero) distinct values carries no information.
        df_X = df_X.drop(col, axis=1)

    return df_X
Ejemplo n.º 4
0
def plot_scatter_matrix(df, y_var_name=None):
    """ Plots a series of scatter matrices of the continuous variables.
        The continuous columns (as reported by autoregression.sort_features,
        with the y column excluded) are plotted in groups with
        pandas.plotting.scatter_matrix. Each plot uses a random sample of at
        most 300 rows. When 'y_var_name' is given, the y column is included
        as the first column of every matrix.
        INPUT:
            df:
                dataframe
            y_var_name:
                string, the column name of the dependent y variable in the
                dataframe, or None to plot the continuous columns only
        OUTPUT:
            None. Every full group is shown with plt.show(); the last (partial)
            group is drawn but not shown, so the caller decides when to show it.
    """
    has_y = y_var_name is not None
    # Only drop the y column when one was named: df.drop(None, axis=1) raises.
    feature_df = df.drop(y_var_name, axis=1) if has_y else df
    continuous_features, _ = autoregression.sort_features(feature_df)
    continuous_features = list(continuous_features)

    sample_limit = min(len(df), 300)
    leading_columns = [y_var_name] if has_y else []

    # NOTE(review): each window takes six features but the list advances by
    # five, so consecutive matrices share one feature. Kept as in the original;
    # confirm whether the overlap is intended.
    while len(continuous_features) > 5:
        plot_sample_df = df[leading_columns +
                            continuous_features[:6]].sample(n=sample_limit)
        # Figure size scales with the number of sampled rows.
        pd.plotting.scatter_matrix(plot_sample_df,
                                   figsize=(len(plot_sample_df) * .07,
                                            len(plot_sample_df) * .07))
        plt.show()
        continuous_features = continuous_features[5:]

    remaining_columns = leading_columns + continuous_features
    if not remaining_columns:
        # No y column and no continuous features: nothing to plot.
        return
    plot_sample_df = df[remaining_columns].sample(n=sample_limit)
    pd.plotting.scatter_matrix(plot_sample_df,
                               figsize=(len(plot_sample_df) * .1,
                                        len(plot_sample_df) * .1))