def __init__(self, X, y, linear_modelcv, name):
    """Store private copies of the data and the CV model, and set up
    placeholder attributes that are populated by later pipeline steps."""
    # Copy so downstream transforms never mutate the caller's frames.
    self.X = X.copy()
    self.y = y.copy()
    self.model_cv = linear_modelcv
    # Standardized data and train/test splits are filled in later.
    self.X_std = None
    self.y_std = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.type = self.model_cv.__class__.__name__
    # NOTE: the `name` argument is not used; the display name is
    # derived from the CV model's class name instead.
    self.name = self.type
    self.hyperparameters = pd.Series()
    self.scaler = XyScaler()
def __init__(self, data, model, predict, name):
    """Initialize on top of the parent dataset, attaching a model and a
    display name; path-related attributes start empty."""
    super().__init__(data, predict)
    self.model = model
    self.name = name
    # Regularization-path placeholders, populated during fitting.
    self.alphas = None
    self.log_alphas = None
    self.columns = None
    self.scaler = XyScaler()
def lasso_model(X_train, X_hold, y_train, y_hold):
    """Cross-validate a Lasso over an alpha grid, fit the best model on the
    standardized training set, and evaluate it on the standardized holdout.

    Parameters
    ----------
    X_train, X_hold : pd.DataFrame
        Training and holdout predictor matrices.
    y_train, y_hold : pd.Series
        Training and holdout targets.

    Returns
    -------
    tuple
        (fitted Lasso, standardized holdout targets, standardized holdout
        predictions, residuals, holdout MSE).
    """
    lasso_alphas = np.logspace(-2, 2, num=40)
    lasso_cv_errors_train, lasso_cv_errors_test = train_at_various_alphas(
        X_train.values, y_train.values, Lasso, lasso_alphas)
    # Select the alpha minimizing mean test error across folds.
    lasso_mean_cv_errors_test = lasso_cv_errors_test.mean(axis=0)
    lasso_optimal_alpha = get_optimal_alpha(lasso_mean_cv_errors_test)

    # Standardize using training data only, then apply the same transform
    # to the holdout set (prevents leakage).
    standardizer = XyScaler()
    standardizer.fit(X_train.values, y_train.values)
    X_train_std, y_train_std = standardizer.transform(X_train.values, y_train.values)
    X_hold_std, y_hold_std = standardizer.transform(X_hold.values, y_hold.values)

    final_lasso = Lasso(alpha=lasso_optimal_alpha).fit(X_train_std, y_train_std)
    y_hold_pred_std = final_lasso.predict(X_hold_std)
    final_lasso_mse = mse(y_hold_std, y_hold_pred_std)
    r2 = r2_score(y_hold_std, y_hold_pred_std)
    ress = y_hold_std - y_hold_pred_std
    print("Lasso R2 score is: ", r2)
    # Fixed label: the reported value is a mean squared error, not RSS.
    print("Final Lasso MSE: ", final_lasso_mse)
    print("Optimal Lasso Alpha: ", lasso_optimal_alpha)
    return (final_lasso, y_hold_std, y_hold_pred_std, ress, final_lasso_mse)
def cross_valid(X, y, base_estimator, n_folds, random_seed=154):
    """Estimate in- and out-of-sample error via shuffled K-fold CV.

    Parameters
    ----------
    X : np.array
        Matrix of predictors.
    y : np.array
        Target array.
    base_estimator : sklearn estimator
        Cloned and re-fit on each fold; the caller's object is untouched.
    n_folds : int
        Number of cross-validation folds.
    random_seed : int
        Seed for the shuffled K-fold split, for repeatability.

    Returns
    -------
    train_cv_errors, test_cv_errors : tuple of np.array
        Per-fold MSE on the standardized scale.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    test_cv_errors, train_cv_errors = np.empty(n_folds), np.empty(n_folds)
    for idx, (train, test) in enumerate(kf.split(X)):
        # Split into train and test
        X_cv_train, y_cv_train = X[train], y[train]
        X_cv_test, y_cv_test = X[test], y[test]
        # Standardize inside the fold to avoid train/test leakage.
        standardizer = XyScaler()
        standardizer.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = standardizer.transform(
            X_cv_train, y_cv_train)
        X_cv_test_std, y_cv_test_std = standardizer.transform(
            X_cv_test, y_cv_test)
        # Fit a fresh clone so state never carries across folds.
        estimator = clone(base_estimator)
        estimator.fit(X_cv_train_std, y_cv_train_std)
        # Measure performance on both splits.
        y_hat_train = estimator.predict(X_cv_train_std)
        y_hat_test = estimator.predict(X_cv_test_std)
        # Record the error metrics.
        train_cv_errors[idx] = mse(y_cv_train_std, y_hat_train)
        test_cv_errors[idx] = mse(y_cv_test_std, y_hat_test)
    return train_cv_errors, test_cv_errors
def ridge_model(X_train, X_hold, y_train, y_hold):
    """Cross-validate a Ridge over an alpha grid, fit the best model on the
    standardized training set, and evaluate it on the standardized holdout.

    Parameters
    ----------
    X_train, X_hold : pd.DataFrame
        Training and holdout predictor matrices.
    y_train, y_hold : pd.Series
        Training and holdout targets.

    Returns
    -------
    tuple
        (fitted Ridge, standardized holdout targets, standardized holdout
        predictions, residuals, holdout MSE, standardized holdout X).
    """
    ridge_alphas = np.logspace(-2, 4, num=40)
    ridge_cv_errors_train, ridge_cv_errors_test = train_at_various_alphas(
        X_train.values, y_train.values, Ridge, ridge_alphas)
    # Select the alpha minimizing mean test error across folds.
    ridge_mean_cv_errors_test = ridge_cv_errors_test.mean(axis=0)
    ridge_optimal_alpha = get_optimal_alpha(ridge_mean_cv_errors_test)

    # Standardize using training data only, then apply the same transform
    # to the holdout set (prevents leakage).
    standardizer = XyScaler()
    standardizer.fit(X_train.values, y_train.values)
    X_train_std, y_train_std = standardizer.transform(X_train.values, y_train.values)
    X_hold_std, y_hold_std = standardizer.transform(X_hold.values, y_hold.values)

    final_ridge = Ridge(alpha=ridge_optimal_alpha).fit(X_train_std, y_train_std)
    y_hold_pred_std = final_ridge.predict(X_hold_std)
    final_ridge_mse = mse(y_hold_std, y_hold_pred_std)
    r2 = r2_score(y_hold_std, y_hold_pred_std)
    ress = y_hold_std - y_hold_pred_std
    print("Ridge R2 score is: ", r2)
    # Fixed label: the reported value is a mean squared error, not RSS.
    print("Final Ridge MSE: ", final_ridge_mse)
    print("Optimal Ridge Alpha: ", ridge_optimal_alpha)
    return (final_ridge, y_hold_std, y_hold_pred_std, ress, final_ridge_mse,
            X_hold_std)
def cv(X, y, base_estimator, n_folds, random_seed=None):
    """Estimate the in- and out-of-sample error of a model using cross
    validation.

    Parameters
    ----------
    X: np.array
      Matrix of predictors.
    y: np.array
      Target array.
    base_estimator: sklearn model object.
      The estimator to fit.  Must have fit and predict methods.
    n_folds: int
      The number of folds in the cross validation.
    random_seed: int
      A seed for the random number generator, for repeatability.  When
      None (the default), folds are contiguous and unshuffled as before.

    Returns
    -------
    train_cv_errors, test_cv_errors: tuple of arrays
      The training and testing errors for each fold of cross validation.
    """
    # Honor random_seed (it was previously accepted but ignored).  KFold
    # only allows a random_state when shuffle=True, so shuffle exactly
    # when a seed is supplied.
    kf = KFold(n_splits=n_folds,
               shuffle=random_seed is not None,
               random_state=random_seed)
    test_cv_errors, train_cv_errors = np.zeros(n_folds), np.zeros(n_folds)
    X = np.array(X)
    y = np.array(y)
    # Bug fix: the original split `X_train` -- a name not defined in this
    # function (it only worked if a global leaked in) -- instead of `X`.
    for idx, (train, test) in enumerate(kf.split(X)):
        # Split into train and test
        X_cv_train, y_cv_train = X[train], y[train]
        X_cv_test, y_cv_test = X[test], y[test]
        # Standardize inside the fold to avoid train/test leakage.
        standardizer = XyScaler()
        standardizer.fit(X_cv_train, y_cv_train)
        X_cv_train_std, y_cv_train_std = standardizer.transform(X_cv_train, y_cv_train)
        X_cv_test_std, y_cv_test_std = standardizer.transform(X_cv_test, y_cv_test)
        # Clone so repeated fits never mutate the caller's estimator
        # (consistent with cross_valid above).
        model = clone(base_estimator)
        model.fit(X_cv_train_std, y_cv_train_std)
        # Measure performance on both splits.
        y_hat_train = model.predict(X_cv_train_std)
        y_hat_test = model.predict(X_cv_test_std)
        # Record the error metrics.
        train_cv_errors[idx] = mean_squared_error(y_cv_train_std, y_hat_train)
        test_cv_errors[idx] = mean_squared_error(y_cv_test_std, y_hat_test)
    return train_cv_errors, test_cv_errors
def linear_model(X_train, X_hold, y_train, y_hold):
    """Cross-validate a plain linear regression, then fit it on the
    standardized training set and report R^2 and MSE on the standardized
    holdout set.

    Returns (fitted model, standardized holdout targets, standardized
    holdout predictions, residuals, holdout MSE).
    """
    base = LinearRegression()
    errs_train, errs_test = cross_valid(
        X_train.values, y_train.values, base, 10)
    linear_mean_cv_errors_train = errs_train.mean(axis=0)
    linear_mean_cv_errors_test = errs_test.mean(axis=0)

    # Fit the scaler on training data only, then reuse it for the holdout.
    scaler = XyScaler()
    scaler.fit(X_train.values, y_train.values)
    X_tr_std, y_tr_std = scaler.transform(X_train.values, y_train.values)
    X_ho_std, y_ho_std = scaler.transform(X_hold.values, y_hold.values)

    final_linear = LinearRegression().fit(X_tr_std, y_tr_std)
    preds = final_linear.predict(X_ho_std)
    final_linear_mse = mse(y_ho_std, preds)
    r2 = r2_score(y_ho_std, preds)
    residuals = y_ho_std - preds
    print("Linear R2 Score: ", r2)
    print("Final Linear MSE: ", final_linear_mse)
    return (final_linear, y_ho_std, preds, residuals, final_linear_mse)
class LinearDataset:
    """Adds functionalities to linear_model.

    Wraps a predictor frame, a target series, and an sklearn LinearModelCV
    estimator, and provides scaling, splitting, diagnostics, fitting,
    prediction, and plotting helpers.  Typical usage is `set_up()`, which
    chains the processing steps in order.
    """

    def __init__(self, X, y, linear_modelcv, name):
        """Copy the data, attach the CV model, and initialize placeholders."""
        self.X = X.copy()
        self.X_std = None
        self.y_std = None
        self.y = y.copy()
        self.model_cv = linear_modelcv
        # Train/test splits are created later by test_split().
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.type = self.model_cv.__class__.__name__
        # NOTE(review): the `name` argument is never used; the name is
        # derived from the CV model's class name -- confirm intended.
        self.name = self.type
        self.hyperparameters = pd.Series()
        self.scaler = XyScaler()

    def scale_X(self):
        """Scales X and y jointly via XyScaler, storing results as
        X_std (DataFrame, original columns/index preserved) and y_std."""
        self.scaler.fit(self.X, self.y)
        X_std, y_std = self.scaler.transform(self.X, self.y)
        self.X_std = pd.DataFrame(data=X_std,
                                  columns=self.X.columns,
                                  index=self.X.index)
        self.y_std = pd.Series(data=y_std)

    def add_constant(self):
        """Add an intercept column of ones to the *unscaled* X.

        NOTE(review): set_up() calls this after scale_X(), so the constant
        never reaches X_std (which test_split uses) -- confirm intended.
        """
        self.X['constant'] = 1

    def goldfeldtquandt(self):
        """Conducts a Goldfeld-Quandt test for heteroscedasticity, with
        the null hypothesis of homoscedastic (constant-variance) errors.

        Returns a dict with the test's F statistic and p-value."""
        het_F_stat, het_p_stat, z = sm.stats.diagnostic.het_goldfeldquandt(
            self.y, self.X)
        return {"F": het_F_stat, "p": het_p_stat}

    def vif(self):
        """Returns the variance inflation factor for dataframe of features
        to test for multicolinearity, scores of 5 and up indicate
        multicolinearity"""
        vif = pd.DataFrame()
        vif["Features"] = self.X_std.columns
        vif["VIF Factor"] = [
            variance_inflation_factor(self.X_std.values, i)
            for i in range((self.X_std.shape[1]))
        ]
        return vif

    def test_split(self, ratio=0.25):
        """Splits the *standardized* data into train/test sets with the
        given test-set ratio (fixed random_state=1 for repeatability)."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X_std, self.y_std, test_size=ratio, random_state=1)

    def log_transform_y(self):
        """Log transforms the target vector in place (natural log)."""
        self.y = np.log(self.y)

    def get_coefs(self):
        """Returns model coefficients for sklearn models, indexed by the
        training feature names."""
        df = pd.DataFrame(self.model_cv.coef_, index=self.X_train.columns)
        return df

    def fit_cross_val(self, cv=10, alphas=None, l1_ratio=None):
        """Fits sklearn LinearModelCV and adds hyperparameters to object name.

        NOTE(review): the cv, alphas, and l1_ratio parameters are accepted
        but not used by this implementation -- confirm intended.
        """
        self.model_cv.fit(self.X_train, self.y_train)
        if self.name in ['ElasticNetCV', 'RidgeCV', 'LassoCV']:
            # Record the chosen regularization strength (and mix, for
            # elastic net), then append them to the display name.
            self.hyperparameters['a'] = self.model_cv.alpha_
            if self.name == 'ElasticNetCV':
                self.hyperparameters['l1_ratio'] = self.model_cv.l1_ratio_
            self.name += str(dict(self.hyperparameters.round(3)))

    def plot_MSE(self, ax=plt, c_title=''):
        """Plots the mean squared error for various alphas in a
        cross-validation.  `ax` defaults to the pyplot module itself, so
        either a module or an Axes object works."""
        alphas = self.model_cv.alphas_
        m_log_alphas = -np.log10(alphas)
        # Pad the y-limits by 10% on each side of the MSE path range.
        ymin, ymax = self.model_cv.mse_path_.min(
        ) * 0.9, self.model_cv.mse_path_.max() * 1.1
        ax.plot(m_log_alphas, self.model_cv.mse_path_, ':')
        ax.axvline(-np.log10(self.model_cv.alpha_),
                   linestyle='--',
                   color='k',
                   label='alpha: CV estimate')
        ax.legend(list(range(1, self.model_cv.mse_path_.shape[1] + 1)),
                  title='Fold')
        ax.set_xlabel('-log(alpha)')
        ax.set_ylabel('Mean square error')
        ax.set_title(c_title + ' Mean square error on each fold')
        ax.set_ylim(ymin, ymax)

    def plot_coeff_paths(self, ax=plt, c_title=''):
        """Plots coefficient paths for various alphas in a cross-validation.

        NOTE(review): this uses +log10(alpha) on the x-axis while plot_MSE
        uses -log10(alpha) -- confirm the sign difference is intended.
        """
        alphas = self.model_cv.alphas_
        m_log_alphas = np.log10(alphas)
        # path(...)[1] is the coefficient matrix along the alpha path.
        coeffs = self.model_cv.path(self.X_train, self.y_train)[1]
        ymin, ymax = coeffs.min(), coeffs.max()
        ax.plot(m_log_alphas, coeffs.T)
        ax.legend(self.X_train.columns,
                  title='Feature',
                  loc='upper left',
                  bbox_to_anchor=(1, 1))
        ax.set_xlabel('log(alpha)')
        ax.set_title(c_title + ' Coefficient Descent')
        ax.set_ylim(ymin, ymax)

    def plot_actual_predicted(self, ax=plt, y_log=True):
        """Plots model predicted values versus actual values; requires
        predict() to have populated y_hat_test."""
        y_hat_test = self.y_hat_test
        y_test = self.y_test
        if y_log:
            y_test = np.log(self.y_test)
            y_hat_test = np.log(self.y_hat_test)
        ax.scatter(y_test, y_hat_test)
        model_name = self.model_cv.__class__.__name__
        ax.set_title(model_name + ' Actual vs. Predicted')
        # NOTE(review): actual values are plotted on the x-axis and
        # predictions on the y-axis, so these labels look swapped -- confirm.
        ax.set_ylabel('Actual')
        ax.set_xlabel('Predicted')
        plt.subplots_adjust(hspace=.300, wspace=.200)

    def _rss(self, y, y_hat):
        """Returns the *mean* of squared residuals (an MSE, despite the
        name suggesting a residual sum of squares)."""
        return np.mean((y - y_hat)**2)

    def predict(self):
        """Returns standardized and unstandardized predictions, storing
        them as attributes (y_hat_train, y_hat_test, and their
        *_unstandardized counterparts)."""
        self.y_hat_train = self.model_cv.predict(self.X_train)
        self.y_hat_test = self.model_cv.predict(self.X_test)
        self.X_train_unstandardized, self.y_hat_train_unstandardized = self.scaler.inverse_transform(
            self.X_train, self.y_hat_train)
        self.X_test_unstandardized, self.y_hat_test_unstandardized = self.scaler.inverse_transform(
            self.X_test, self.y_hat_test)
        # NOTE(review): this overwrites the X_test_unstandardized computed
        # just above (same X input) -- confirm the duplication is intended.
        self.X_test_unstandardized, self.y_test_unstandardized = self.scaler.inverse_transform(
            self.X_test, self.y_test)

    def test_and_train_errs(self):
        """Returns [r2_train, r2_test, rss_train, rss_test,
        rss_train_unstandardized, rss_test_unstandardized].

        NOTE(review): the *_unstandardized errors compare standardized
        targets against unstandardized predictions -- confirm this mix of
        scales is intended.
        """
        rss_train = self._rss(self.y_train, self.y_hat_train)
        rss_test = self._rss(self.y_test, self.y_hat_test)
        rss_train_unstandardized = self._rss(self.y_train,
                                             self.y_hat_train_unstandardized)
        rss_test_unstandardized = self._rss(self.y_test,
                                            self.y_hat_test_unstandardized)
        r2_train = self.model_cv.score(self.X_train, self.y_train)
        r2_test = self.model_cv.score(self.X_test, self.y_test)
        return [
            r2_train, r2_test, rss_train, rss_test, rss_train_unstandardized,
            rss_test_unstandardized
        ]

    def find_residuals(self):
        """Returns difference of actual and predicted targets for training."""
        return self.y_train - self.model_cv.predict(self.X_train)

    def plot_qqplot(self):
        """Creates quantile-quantile plots based on the training residuals."""
        qqplot(self.find_residuals())

    def set_up(self, y_log=True, ratio=0.30):
        """Automates processing steps: optional log-transform of y, joint
        scaling, intercept column, train/test split, CV fit, prediction."""
        if y_log:
            self.log_transform_y()
        self.scale_X()
        self.add_constant()
        self.test_split(ratio=ratio)
        self.fit_cross_val()
        self.predict()
def train_at_various_alphas(X, y, model, alphas, n_folds=10, **kwargs):
    """Train a regularized regression model using cross validation at various
    values of alpha.

    Parameters
    ----------
    X: np.array
      Matrix of predictors.
    y: np.array
      Target array.
    model: sklearn model class
      A class in sklearn that can be used to create a regularized regression
      object.  Options are `Ridge` and `Lasso`.
    alphas: numpy array
      An array of regularization parameters.
    n_folds: int
      Number of cross validation folds.
    **kwargs:
      Extra keyword arguments forwarded to the model constructor
      (previously accepted but ignored).

    Returns
    -------
    cv_errors_train, cv_errors_test: tuple of DataFrame
      DataFrames containing the training and testing errors for each value
      of alpha and each cross validation fold.  Each row represents a CV
      fold, and each column a value of alpha.
    """
    cv_errors_train = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                   columns=alphas)
    cv_errors_test = pd.DataFrame(np.empty(shape=(n_folds, len(alphas))),
                                  columns=alphas)
    X = np.array(X)
    y = np.array(y)
    # An unshuffled KFold is deterministic, so building it once gives every
    # alpha the same fold boundaries (identical to rebuilding it per alpha).
    kf = KFold(n_splits=n_folds)
    for alpha in alphas:
        for idx, (train, test) in enumerate(kf.split(X)):
            X_cv_train, y_cv_train = X[train], y[train]
            X_cv_test, y_cv_test = X[test], y[test]
            # Standardize inside the fold to avoid train/test leakage.
            standardizer = XyScaler()
            standardizer.fit(X_cv_train, y_cv_train)
            X_cv_train_std, y_cv_train_std = standardizer.transform(
                X_cv_train, y_cv_train)
            X_cv_test_std, y_cv_test_std = standardizer.transform(
                X_cv_test, y_cv_test)
            m = model(alpha=alpha, **kwargs)
            m.fit(X_cv_train_std, y_cv_train_std)
            y_hat_train = m.predict(X_cv_train_std)
            y_hat_test = m.predict(X_cv_test_std)
            # Bug fix: .loc replaces chained indexing (df[col][row] = ...),
            # which is unreliable under pandas copy-on-write.
            cv_errors_train.loc[idx, alpha] = mean_squared_error(
                y_cv_train_std, y_hat_train)
            cv_errors_test.loc[idx, alpha] = mean_squared_error(
                y_cv_test_std, y_hat_test)
    return cv_errors_train, cv_errors_test
# ---- Aggregate the per-fold CV errors (column-wise mean over folds) ----
avg_errors_train = ridge_cv_errors_train.mean()
avg_errors_test = ridge_cv_errors_test.mean()
avg_errors_lasso_train = lasso_cv_errors_train.mean()
avg_errors_lasso_test = lasso_cv_errors_test.mean()

# Calculate Optimal Alpha
ridge_optimal_alpha = get_optimal_alpha(avg_errors_test)  # Optimal Alpha: 11.51395399
lasso_optimal_alpha = get_optimal_alpha(avg_errors_lasso_test)  # Optimal Alpha: .0001

# ---- Graphing Lambdas: ridge coefficient paths across the alpha grid ----
# Placeholder DataFrame for coefficients (one row per alpha).
features = X.columns
df_coefs = pd.DataFrame(np.empty(shape=(len(ridge_alphas), X.shape[1])),
                        columns=features,
                        index=ridge_alphas)

standardizer = XyScaler()
X_train = np.array(X_train)
y_train = np.array(y_train)
standardizer.fit(X_train, y_train)
# Bug fix: the original transformed `X_cv_train`/`y_cv_train`, stale
# variables left over from an earlier CV loop; transform the full
# training set the scaler was just fit on.
X_train_std, y_train_std = standardizer.transform(X_train, y_train)

# Re-fit a ridge at every alpha and record its coefficients.
for idx, alpha in enumerate(ridge_alphas):
    model = Ridge(alpha=alpha)
    model.fit(X_train_std, y_train_std)
    df_coefs.iloc[idx] = model.coef_

# Plot each feature's coefficient path against log(alpha).
fig, ax = plt.subplots(figsize=(30, 30))
for col in df_coefs.columns:
    ax.plot(np.log(df_coefs.index), df_coefs[col], label=str(col))