def plot_scatter_matrix(df, y_continuous=True, y_var_name=None, colors=None):
    """
    Draw scatter matrices of the continuous features, five features at a time.

    INPUT:
        df:
            dataframe
        y_continuous:
            passed through unchanged to plot_one_scatter_matrix
        y_var_name:
            string, the column name of the dependent y variable in the
            dataframe; when falsy, the first column of df is used
        colors:
            passed through unchanged to plot_one_scatter_matrix
    OUTPUT:
        None. One scatter matrix is drawn per batch of at most five
        continuous features. plt.show() is called after every batch except
        the last one, which is left for the caller to show.
    """
    y_var_name = y_var_name or df.columns[0]

    # Only the continuous features are plotted; the categorical ones are
    # returned by sort_features but not used here.
    remaining, _ = sort_features(df.drop(y_var_name, axis=1))
    color_wheel = make_color_wheel(df, y_var_name)
    sample_df = take_sample(df)
    # NOTE(review): the subsample may end up with a single unique value in
    # some column, which is a possible source of errors downstream.

    def draw_batch(batch):
        # y is always the first column of the frame that gets plotted.
        batch_df = sample_df[[y_var_name] + batch]
        plot_one_scatter_matrix(batch_df, sample_df, y_var_name,
                                color_wheel, colors, y_continuous)

    # Full batches of five are shown immediately; the final batch (five
    # features or fewer) is drawn after the loop without plt.show().
    while len(remaining) > 5:
        draw_batch(remaining[:5])
        plt.show()
        remaining = remaining[5:]

    draw_batch(remaining)
    return None
def make_models(df, df_X, y, y_var_name, univariates, alphas=np.logspace(start=-2, stop=5, num=5)):
    """
    Pick the model family that matches the type of the y variable.

    INPUT:
        df:
            dataframe, passed to plot_many_univariates when univariates is set
        df_X:
            dataframe whose columns are split by sort_features
        y:
            the dependent variable; only its number of unique values is
            printed here
        y_var_name:
            string, the column name of the dependent y variable
        univariates:
            when truthy and y is continuous, univariate plots are drawn
            and shown
        alphas:
            passed to make_cont_models / make_cat_models
    OUTPUT:
        (names_models, continuous_features, category_features, models,
         scoring, is_continuous, alphas)
    """
    print(len(y.unique()))

    continuous_features, category_features = sort_features(df_X)
    # y counts as continuous only when sort_features lists it among the
    # continuous columns of df_X.
    # NOTE(review): this presumes df_X still contains y_var_name - confirm
    # against the caller.
    is_continuous = y_var_name in continuous_features

    y_kind = 'CONTINUOUS' if is_continuous else 'CATEGORICAL'
    print('Y VARIABLE: "' + y_var_name + '" IS ' + y_kind)
    print()

    if is_continuous:
        if univariates:
            plot_many_univariates(df, y_var_name)
            plt.show()
        names_models, scoring = make_cont_models(alphas), 'neg_mean_squared_error'
    else:
        names_models, scoring = make_cat_models(alphas), 'accuracy'

    # Each entry of names_models carries the model at index 1.
    models = [entry[1] for entry in names_models]
    return (names_models, continuous_features, category_features, models,
            scoring, is_continuous, alphas)
def auto_spline_pipeliner(df_X, knots=10):
    """
    Build a feature pipeline from the columns of df_X.

    Continuous columns (as classified by sort_features) each get a spline
    specification with `knots` evenly spaced knots between the column's
    minimum and maximum; the union of those is then standardized.
    Categorical columns each get a category specification built from the
    column's unique values.

    INPUT:
        df_X:
            dataframe of features
        knots:
            int, number of knots per continuous feature
    OUTPUT:
        - a FeatureUnion of the category specifications when there are only
          categorical features
        - a Pipeline (spline FeatureUnion followed by StandardScaler) when
          there are only continuous features
        - a FeatureUnion of both when both kinds are present
        - the string "(continuous_features == []) & (category_features == [])"
          when there are no features at all (kept as-is for backward
          compatibility; callers must check for it)
    """
    continuous_features, category_features = sort_features(df_X)
    has_continuous = len(continuous_features) > 0
    has_category = len(category_features) > 0

    if not has_continuous and not has_category:
        # Historical sentinel value: a message string instead of a pipeline.
        return "(continuous_features == []) & (category_features == [])"

    continuous_pipelet = []
    for name in continuous_features:
        column = df_X[name]
        knotspace = list(np.linspace(column.min(), column.max(), knots))
        continuous_pipelet.append(
            (name + '_fit', simple_spline_specification(name, knotspace)))

    category_pipelet = [
        (name + '_spec',
         simple_category_specification(name, list(df_X[name].unique())))
        for name in category_features
    ]

    # The category union is only built when there is something to put in it:
    # older scikit-learn releases validate the transformer list inside the
    # FeatureUnion constructor and reject an empty one, which used to break
    # the continuous-only path.
    category_features_pipe = None
    if has_category:
        category_features_pipe = FeatureUnion(category_pipelet)

    if not has_continuous:
        return category_features_pipe

    continuous_features_scaled = Pipeline([
        ('continuous_features', FeatureUnion(continuous_pipelet)),
        ('standardizer', StandardScaler()),
    ])
    if not has_category:
        return continuous_features_scaled

    return FeatureUnion([
        ('continuous_features', continuous_features_scaled),
        ('category_features', category_features_pipe),
    ])
def plot_many_univariates(df, y_var_name):
    """
    Draw one univariate plot for every continuous column of df that has
    more than two unique values.

    INPUT:
        df:
            dataframe
        y_var_name:
            string, the column name of the dependent y variable in the
            dataframe; passed to plot_one_univariate for every plot
    OUTPUT:
        None. With several features the plots are laid out on a grid two
        columns wide; with a single feature one axis is used and a tight
        layout is requested on the figure.
    RAISES:
        ValueError when no continuous column has more than two unique values.
    """
    continuous_features, _ = sort_features(df)
    plottable = [name for name in continuous_features
                 if len(df[name].unique()) > 2]
    n_plottable = len(plottable)

    if n_plottable == 0:
        raise ValueError('No Continous Features to Plot')

    if n_plottable == 1:
        only_feature = plottable[0]
        fig, ax = plt.subplots(1, 1, figsize=(14, 4.5))
        plot_one_univariate(ax, df, only_feature, y_var_name)
        ax.set_title("{}: Univariate Plot".format(only_feature))
        # Original author's note: set_tight_layout does not work on its
        # own; 'tight_layout' must be used in the calling script as well.
        fig.set_tight_layout(tight=True)
        fig.tight_layout(pad=2)
        return None

    # Two plots per row, rounding up so an odd count still fits.
    n_rows = int(np.ceil(n_plottable / 2.0))
    fig, axs = plt.subplots(n_rows, 2, figsize=(14, 3 * n_rows))
    flat_axes = axs.flatten()
    for position, name in tqdm.tqdm(enumerate(plottable)):
        ax = flat_axes[position]
        plot_one_univariate(ax, df, name, y_var_name)
        ax.set_title(f"{name}: Univariate Plot")
    return None
def plot_continuous_error_graphs(df, y, y_var_name, model, is_continuous, sample_limit=300, predicteds_vs_actuals=True, residuals=True):
    """
    Plot predicted-vs-actual and residual graphs for a continuous y on a
    random subsample of df.

    INPUT:
        df:
            dataframe containing the y_var_name column
        y:
            the actual values of the dependent variable. When it has one
            entry per row of df it is restricted to the sampled rows so it
            stays aligned with the predictions; when it already has one
            entry per sampled row it is used as given.
        y_var_name:
            string, the column name of the dependent y variable; dropped
            from the sample before predicting
        model:
            fitted object exposing predict()
        is_continuous:
            when falsy nothing is sampled, predicted or plotted
        sample_limit:
            maximum number of rows to sample (capped at len(df))
        predicteds_vs_actuals:
            when truthy, draw the predicted-vs-actual plots
        residuals:
            when truthy, draw the residual plot against the first feature
            column of the sample
    OUTPUT:
        None. A message is printed instead of plotting when y is empty or
        its length matches neither df nor the sample.
    """
    # Local import so the block does not rely on names it did not use before.
    import numpy as np

    if not is_continuous:
        return None
    if len(y) == 0:
        print('No y, so no regressions included')
        return None

    # Sample by position, never asking for more rows than exist
    # (df.sample(n) raises when n > len(df)).
    n_rows = len(df)
    n_sample = min(sample_limit, n_rows)
    positions = np.random.permutation(n_rows)[:n_sample]
    df_X_sample = df.iloc[positions].drop(y_var_name, axis=1)

    if len(y) == n_rows:
        # Sampling shuffles the rows, so y has to be picked with the same
        # positions or actuals and predictions would be paired up wrongly.
        if hasattr(y, 'iloc'):
            y_sample = y.iloc[positions]
        else:
            y_sample = np.asarray(y)[positions]
    elif len(y) == n_sample:
        # NOTE(review): y is assumed to be already aligned with a sample of
        # this size - confirm against the caller.
        y_sample = y
    else:
        print('len(y) != len(y_hat), so no regressions included')
        return None

    y_hat_sample = np.asarray(model.predict(df_X_sample)).reshape(-1)

    if predicteds_vs_actuals:
        continuous_features, _ = sort_features(df_X_sample)
        timeit(plot_many_predicteds_vs_actuals, df_X_sample,
               continuous_features, y_sample, y_hat_sample, n_bins=50)
        plt.show()
        # TODO: add feature to jitter plot to categorical features
        # TODO: add cdf???

    if residuals:
        fig, ax = plt.subplots()
        # np.asarray makes reshape work for pandas Series as well as arrays.
        timeit(plot_residual_error, ax,
               df_X_sample.values[:, 0].reshape(-1),
               np.asarray(y_sample).reshape(-1),
               y_hat_sample, s=30)
        plt.show()

    return None
def compare_predictions(df, y_var_name, percent_data=None, category_limit=11, knots=3, alphas=np.logspace(start=-2, stop=10, num=50), corr_matrix=True, scatter_matrix=True, bootstrap_coefs=True, feature_importances=True, partial_dep=True, actual_vs_predicted=True, residuals=True, univariates=True, compare_models=True, ROC=True, bootstraps=10):
    """
    Clean a dataframe, transform it, fit a series of models on it and plot
    diagnostics for each model.

    INPUT:
        df:
            dataframe containing the y_var_name column
        y_var_name:
            string, the column name of the dependent y variable
        percent_data:
            passed to clean_dataframe
        category_limit:
            passed to drop_category_exeeding_limit
        knots:
            accepted for backward compatibility; not used in this function
        alphas:
            passed to make_models; the alphas it returns are used when
            choosing alpha for the 'RR' and 'LASSO' models
        corr_matrix, scatter_matrix:
            draw the correlation / scatter matrix when there is at least
            one continuous feature
        bootstrap_coefs:
            plot bootstrapped coefficients (continuous y, models with coef_)
        feature_importances:
            plot feature importances for models exposing them
        partial_dep:
            plot partial dependences
        actual_vs_predicted, residuals:
            enable the predicted-vs-actual and residual plots
        univariates:
            passed to make_models
        compare_models:
            draw the model comparison plots
        ROC:
            plot ROC curves when y is not continuous
        bootstraps:
            number of bootstraps passed to bootstrap_train_premade
    OUTPUT:
        (names, results, fit_models, pipeline, df_X, y_hats, errors)
    """
    starttotal = time()
    df, sample_limit = clean_dataframe(df, y_var_name, percent_data)

    # Remember the untransformed features for the partial dependence plots.
    df_X_unpiped = df.drop(y_var_name, axis=1)
    (unpiped_continuous_features,
     unpiped_category_features) = sort_features(df_X_unpiped)
    has_continuous_features = len(unpiped_continuous_features) > 0

    # REMOVE CATEGORICAL VARIABLES THAT HAVE TOO MANY CATEGORIES TO BE USEFUL
    df = drop_category_exeeding_limit(df, y_var_name, category_limit)

    # SHOW CORRELATION MATRIX
    if corr_matrix and has_continuous_features:
        timeit(plt.matshow, df.sample(sample_limit).corr())

    # MAKE SCATTER MATRIX
    if scatter_matrix and has_continuous_features:
        # y_var_name must go by keyword: the second positional parameter of
        # plot_scatter_matrix is y_continuous, not y_var_name.
        timeit(plot_scatter_matrix, df, y_var_name=y_var_name, colors=True)
        plt.show()

    # TRANSFORM DATAFRAME
    print('DF COLUMNS: \n' + str(list(df.columns)) + '\n')
    df, df_X, X, y, pipeline = use_spline(df, y_var_name)
    print('DF COLUMNS AFTER TRANSFORM: \n' + str(list(df.columns)) + '\n')

    # MAKE MODELS
    (names_models, continuous_features, category_features, models, scoring,
     is_continuous, alphas) = make_models(df, df_X, y, y_var_name,
                                          univariates, alphas)

    # The folds were never shuffled, so the random_state that used to be
    # passed had no effect; recent scikit-learn rejects that combination.
    kfold = model_selection.KFold(n_splits=10)

    # evaluate each model in turn
    fit_models, results, names, y_hats, errors = [], [], [], [], []
    for name, model in tqdm.tqdm(names_models):
        if name == 'RR' or name == 'LASSO':
            # For these two, `model` is a factory taking the chosen alpha.
            alpha, cv_results = timeit(plot_choose_alpha, df, model,
                                       y_var_name, alphas, kfold, scoring)
            model = model(alpha)
        else:
            cv_results = timeit(cross_val_score, model, X, y, cv=kfold,
                                scoring=scoring)
        results.append(cv_results)
        names.append(name)
        print("%s: mean=%f std=%f" % (name, cv_results.mean(),
                                      cv_results.std()))

        # FIT MODEL WITH ALL DATA
        model.fit(X, y)
        fit_models.append(model)

        # PLOT PREDICTED VS ACTUALS
        if is_continuous and actual_vs_predicted:
            timeit(plot_predicted_vs_actuals, df, model, y_var_name,
                   sample_limit)
            plt.show()

        # MAKE BOOTSTRAPS (only read below under the same two flags)
        if bootstrap_coefs or partial_dep:
            bootstrap_models = bootstrap_train_premade(
                model, X, y, bootstraps=bootstraps, fit_intercept=False)

        # PLOT COEFFICIENTS
        if hasattr(model, "coef_"):
            coefs = model.coef_
            columns = list(df.drop(y_var_name, axis=1).columns)
            # Unwrap nested coefficient containers down to a flat list.
            while isinstance(coefs[0], (list, np.ndarray)):
                coefs = list(coefs[0])
            timeit(plot_coefs, coefs=coefs, columns=columns, graph_name=name)
            plt.show()

            # PLOT BOOTSTRAP COEFFICIENTS
            if is_continuous and bootstrap_coefs:
                fig, axs = timeit(plot_bootstrap_coefs, bootstrap_models,
                                  df_X.columns, n_col=4)
                fig.tight_layout()
                plt.show()

        # PLOT FEATURE IMPORTANCES
        if feature_importances and hasattr(model, 'feature_importances_'):
            timeit(plot_feature_importances, model, df_X)
            plt.show()

        # PLOT PARTIAL DEPENDENCIES
        if partial_dep:
            timeit(plot_partial_dependences, model, X=df_X_unpiped,
                   var_names=unpiped_continuous_features, y=y,
                   bootstrap_models=bootstrap_models, pipeline=pipeline,
                   n_points=250)
            plt.tight_layout()
            plt.show()

        # PLOT PREDICTED VS ACTUALS AND RESIDUALS, honouring the caller's
        # flags instead of always forcing both plots on.
        plot_continuous_error_graphs(df, y, y_var_name, model, is_continuous,
                                     sample_limit,
                                     predicteds_vs_actuals=actual_vs_predicted,
                                     residuals=residuals)
        df_X = df.drop(y_var_name, axis=1)

        # GET ERROR
        y_hat, error = get_error(name, model, df_X, y, is_continuous)
        y_hats.append(y_hat)
        errors.append(error)

    # --COMPARE MODELS--
    if compare_models:
        choose_box_and_violin_plots(names, scoring, compare_models, results,
                                    is_continuous)

    # ROC CURVE
    if ROC and not is_continuous:
        timeit(plot_rocs, models, df_X, y)
        plt.show()

    print(f'MAKE SUBSAMPLE TIME: {time() - starttotal}')
    return names, results, fit_models, pipeline, df_X, y_hats, errors