def plot_many_univariates(df, y_var_name):
    """Draw one univariate plot per continuous column of the dataframe.

    Columns are classified with ``autoregression.sort_features``; only the
    continuous columns holding more than two unique values are plotted. Each
    panel is drawn by ``plot_one_univariate`` and titled
    "<feature>: Univariate Plot".

    INPUT:
        df:
            dataframe holding the feature columns and the y column
        y_var_name:
            string, the column name of the dependent y variable in the
            dataframe
    OUTPUT:
        None. Creates a matplotlib figure (two panels per row when more than
        one column qualifies, a single panel when exactly one qualifies) or
        prints a message when no column qualifies.

    NOTE(review): y_var_name is not removed from the candidate columns, so
    if sort_features classifies it as continuous it gets a panel against
    itself -- confirm whether that is intended.
    """
    continuous_features, _ = autoregression.sort_features(df)
    print(continuous_features)

    # Columns with only one or two distinct values are skipped.
    plottable = [
        feature for feature in continuous_features
        if len(df[feature].unique()) > 2
    ]

    if not plottable:
        print('No Continous Features to Plot')
        return

    if len(plottable) == 1:
        feature = plottable[0]
        fig, ax = plt.subplots(1, 1, figsize=(14, 4.5))
        plot_one_univariate(ax, df, feature, y_var_name)
        ax.set_title("{}: Univariate Plot".format(feature))
        # Original author's note: this does not take effect on its own;
        # 'tight_layout' must be used in the calling script as well.
        fig.set_tight_layout(tight=True)
        fig.tight_layout(pad=2)
        return

    num_plot_rows = int(np.ceil(len(plottable) / 2.0))
    fig, axs = plt.subplots(num_plot_rows, 2,
                            figsize=(14, 3 * num_plot_rows))
    flat_axes = axs.flatten()
    # tqdm wraps the list (not the enumerate object) so it can read the
    # length and show a total on the progress bar.
    for i, feature in enumerate(tqdm.tqdm(plottable)):
        plot_one_univariate(flat_axes[i], df, feature, y_var_name)
        flat_axes[i].set_title(f"{feature}: Univariate Plot")
def drop_categories_exeeding_limit(df, y_var_name, category_limit):
    """Drop categorical columns whose number of unique values exceeds a limit.

    The y column is excluded before the columns are classified with
    ``autoregression.sort_features``, so it is never a drop candidate.

    INPUT:
        df:
            A dataframe of independent features and one dependent y feature
        y_var_name:
            string, the column name of the dependent y variable in the
            dataframe
        category_limit:
            a categorical column is dropped when its count of unique values
            is strictly greater than this number
    OUTPUT:
        df:
            A dataframe without the categorical columns that exceed the
            limit. The dataframe passed in is not modified.
    """
    _, category_features = autoregression.sort_features(
        df.drop(y_var_name, axis=1))
    for cat in category_features:
        if len(df[cat].unique()) > category_limit:
            # DataFrame.drop returns a new frame; rebind it so the drop is
            # actually reflected in the returned value.
            df = df.drop(cat, axis=1)
            print(f'Too many unique values in categorical feature "{cat}"'
                  f', dropping "{cat}"')
    return df
def clean_df_X(df_X):
    """Clean the independent-variable dataframe column by column.

    Columns are split into continuous and categorical groups by
    ``autoregression.sort_features``. Every continuous column is passed
    through ``add_feature_continuous_null`` and every categorical column
    through ``category_clean_null_and_inf``. Afterwards, continuous columns
    left with at most one unique value are dropped.

    Per the original description, the helpers replace nulls / np.infs with
    means or labels and add True/False indicator features
    (Ex: -np.inf => "was_neg_inf") -- their implementation is not shown in
    this block.

    INPUT:
        df_X:
            A dataframe of independent variables.
    OUTPUT:
        df_X:
            The cleaned dataframe as returned by the helpers, minus the
            continuous columns that carry a single value.
    """
    numeric_cols, label_cols = autoregression.sort_features(df_X)

    for col in numeric_cols:
        df_X = add_feature_continuous_null(df_X, col)

    for col in label_cols:
        df_X = category_clean_null_and_inf(df_X, col)

    for col in numeric_cols:
        if len(df_X[col].unique()) > 1:
            continue
        # A column with one (or zero) distinct values carries no signal.
        df_X = df_X.drop(col, axis=1)

    return df_X
def plot_scatter_matrix(df, y_var_name=None):
    """Plot scatter matrices of the continuous columns, a few at a time.

    Continuous columns are found with ``autoregression.sort_features``.
    While more than five remain, a scatter matrix of the next six (plus the
    y column, when given) is drawn and shown with ``plt.show()``; the
    remaining columns (plus y) are then drawn in a final scatter matrix,
    which is not shown by this function.

    Each matrix is drawn from a random sample of at most 300 rows, so the
    points plotted differ between calls.

    INPUT:
        df:
            dataframe
        y_var_name:
            string, the column name of the dependent y variable in the
            dataframe, or None to plot the continuous features only
    OUTPUT:
        None. Draws the scatter matrices with pandas / matplotlib.

    NOTE(review): each batch shows six features but the window advances by
    five, so the last feature of one batch is repeated as the first of the
    next -- confirm whether that overlap is intended.
    """
    has_y = y_var_name is not None

    # Only drop the y column when one was actually supplied; dropping None
    # raises.
    feature_df = df.drop(y_var_name, axis=1) if has_y else df
    continuous_features, _ = autoregression.sort_features(feature_df)

    # Work on a copy and make sure y never appears among the x features.
    remaining = [name for name in continuous_features
                 if not has_y or name != y_var_name]
    leading = [y_var_name] if has_y else []

    sample_limit = min(len(df), 300)

    while len(remaining) > 5:
        batch = df[leading + remaining[:6]].sample(n=sample_limit)
        side = len(batch) * .07
        pd.plotting.scatter_matrix(batch, figsize=(side, side))
        plt.show()
        remaining = remaining[5:]

    final_columns = leading + remaining
    if not final_columns:
        # No y column and no continuous features: nothing to draw.
        return
    final_sample = df[final_columns].sample(n=sample_limit)
    side = len(final_sample) * .1
    pd.plotting.scatter_matrix(final_sample, figsize=(side, side))