import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection


def testFunc8(savepath='Results/bikeshare_RidgeCV_AlphaSelection.png'):
    '''Use AlphaSelection on the bike share data.'''
    data = pd.read_csv('fixtures/bikeshare/bikeshare.csv')
    X = data[[
        "season", "month", "hour", "holiday", "weekday", "workingday",
        "weather", "temp", "feelslike", "humidity", "windspeed"
    ]]
    Y = data["riders"]

    alphas = np.logspace(-10, 1, 200)
    visualizer = AlphaSelection(RidgeCV(alphas=alphas))
    visualizer.fit(X, Y)
    visualizer.poof(outpath=savepath)
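
# Note: AlphaSelection only accepts estimators that perform their own
# cross-validation (RidgeCV, LassoCV, ElasticNetCV, LassoLarsCV, ...) and
# raises a YellowbrickTypeError otherwise. For a plain regressor, Yellowbrick
# provides ManualAlphaSelection, which runs the cross-validation itself.
# A minimal sketch, reusing the X and Y loaded inside testFunc8 above:
from sklearn.linear_model import Ridge
from yellowbrick.regressor import ManualAlphaSelection

manual = ManualAlphaSelection(
    Ridge(),
    alphas=np.logspace(-10, 1, 200),
    cv=5,
    scoring='neg_mean_squared_error',
)
manual.fit(X, Y)
manual.poof()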
def alphas(ax):
    from sklearn.linear_model import RidgeCV
    from yellowbrick.regressor import AlphaSelection

    features = [
        "relative compactness", "surface area", "wall area", "roof area",
        "overall height", "orientation", "glazing area",
        "glazing area distribution"
    ]
    target = "heating load"
    # target = "cooling load"

    X, y = load_data("energy", cols=features, target=target)

    estimator = RidgeCV(scoring="neg_mean_squared_error")
    visualizer = AlphaSelection(estimator, ax=ax)
    visualizer.title = ""
    visualizer.fit(X, y)
    return visualizer
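
# A minimal driver for the alphas(ax) helper above (a sketch assuming
# matplotlib and the module's own load_data helper are importable):
import matplotlib.pyplot as plt

_, ax = plt.subplots()
viz = alphas(ax)
viz.finalize()  # render labels/legend without opening a window
plt.savefig("alpha_selection_energy.png")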
def hyperparameter_tuning(fname="hyperparameter_tuning.png"):
    # Create side-by-side axes grid
    _, axes = plt.subplots(ncols=2, figsize=(18, 6))

    # Load the concrete dataset
    data = load_concrete(split=False)

    # Create a list of alphas to cross-validate against
    alphas = np.logspace(-10, 1, 400)

    # Add AlphaSelection to the left
    oz = AlphaSelection(LassoCV(alphas=alphas), ax=axes[0])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Add LearningCurve to the right
    oz = LearningCurve(RandomForestRegressor(), scoring='r2', ax=axes[1])
    oz.fit(data.X, data.y)
    oz.finalize()

    # Save figure
    path = os.path.join(FIGURES, fname)
    plt.tight_layout()
    plt.savefig(path)
def alphas():
    X, y = load_concrete()
    alphas = np.logspace(-10, 1, 400)
    oz = AlphaSelection(LassoCV(alphas=alphas), ax=newfig())
    oz.fit(X, y)
    savefig(oz, "alpha_selection")
def main(processed_path="data/processed",
         models_path="models",
         visualizations_path="visualizations"):
    """Creates visualizations."""

    # logging
    logger = logging.getLogger(__name__)

    # normalize paths
    processed_path = os.path.normpath(processed_path)
    logger.debug("Path to processed data normalized: {}"
                 .format(processed_path))
    models_path = os.path.normpath(models_path)
    logger.debug("Path to models normalized: {}"
                 .format(models_path))
    visualizations_path = os.path.normpath(visualizations_path)
    logger.debug("Path to visualizations normalized: {}"
                 .format(visualizations_path))

    #%% load selected_df
    selected_df = pd.read_pickle(os.path.join(processed_path,
                                              'selected_df.pkl'))
    logger.info("Loaded selected_df. Shape of df: {}"
                .format(selected_df.shape))

    # load models
    mod = pickle.load(open(
        os.path.join(models_path, 'sklearn_ElasticNetCV.pkl'), 'rb'))
    logger.info("Loaded sklearn_ElasticNetCV.pkl.")
    mod_sm = pickle.load(open(
        os.path.join(models_path, 'sm_OLS_fit_regularized.pkl'), 'rb'))
    logger.info("Loaded sm_OLS_fit_regularized.")

    #%% split selected_df into dependent and independent variables
    teams_df = selected_df.iloc[:, :9]
    y = selected_df.iloc[:, 9:10]
    X = selected_df.iloc[:, 10:]
    yX = pd.concat([y, X], axis=1)
    logger.debug("Split selected_df into teams_df, y, X and yX.")

    #%% start visualization
    start = time()
    sns.set_context('paper')
    logger.debug("Set seaborn context to 'paper'.")
    rcParams.update({'figure.autolayout': True})
    logger.debug("Set figure.autolayout to True.")

    #%% correlation coefficient matrix
    logger.info("Start visualizing correlation_coefficient_matrix.png.")
    corr = yX.corr()

    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(240, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    fig = sns.heatmap(corr, mask=mask, cmap=cmap,
                      vmin=-1, vmax=1, center=0, square=True,
                      linewidths=.5, cbar_kws={"shrink": .5}).get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'correlation_coefficient_matrix.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing correlation_coefficient_matrix.png.")

    #%% histograms of transformation
    sns.set_style("darkgrid")
    logger.debug("Set seaborn style to darkgrid.")
    logger.info("Start visualizing histograms.")

    # histogram of ranking
    fig = sns.distplot(teams_df.Ranking, rug=True,
                       axlabel='ranking').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking.png'), dpi=300)
    fig.clear()
    plt.close()

    # histogram of ranking_log
    fig = sns.distplot(y, rug=True, axlabel='ranking_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_ranking_log.png'), dpi=300)
    fig.clear()
    plt.close()

    # histogram of loc_max
    fig = sns.distplot(np.e**X.loc_max_log, rug=True,
                       axlabel='loc_max').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_loc_max.png'), dpi=300)
    fig.clear()
    plt.close()

    # histogram of loc_max_log
    fig = sns.distplot(X.loc_max_log, rug=True,
                       axlabel='loc_max_log').get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'histogram_loc_max_log.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing histograms.")

    #%% standardize
    logger.info("Start standardizing X.")
    scaler = StandardScaler()
    not_standardize = ['core', 'visualization',
                       'machine_learning', 'deep_learning']
    X_standardized = scaler.fit_transform(X
                                          .drop(columns=not_standardize)
                                          .values)
    X_standardized = pd.DataFrame(X_standardized,
                                  index=X.index,
                                  columns=X.columns.drop(not_standardize))
    X_not_standardized = X[not_standardize]
    X = pd.concat([X_standardized, X_not_standardized], axis=1)
    logger.debug("After standardization:\n{}"
                 .format(X.describe().to_string()))

    # update yX
    yX = pd.concat([y, X], axis=1)
    logger.info("Finished standardizing X.")

    #%% boxplot
    logger.info("Start visualizing boxplot.png.")
    f, ax = plt.subplots(figsize=(12, 8))
    fig = sns.boxplot(data=yX)
    fig.set_xticklabels(fig.get_xticklabels(), rotation=270)
    fig.get_figure().savefig(os.path.join(visualizations_path,
                                          'boxplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing boxplot.png.")

    #%% residual plot
    logger.info("Start visualizing residplot.png.")
    f, ax = plt.subplots(figsize=(5, 5))
    fig = sns.residplot(x=mod_sm.fittedvalues, y=y, data=X).get_figure()
    fig.savefig(os.path.join(visualizations_path,
                             'residplot.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing residplot.png.")

    #%% plot ElasticNetCV results
    # need to refit model with fixed l1_ratio (to best l1_ratio)
    # in order to visualize correctly
    mod.set_params(l1_ratio=mod.l1_ratio_)
    logger.info("Fixed l1_ratio to {}".format(mod.l1_ratio_))
    mod.fit(X.values, y.values)
    logger.info("Refitted ElasticNetCV model.")

    # plot MSE's across folds
    logger.info("Start visualizing ElasticNetCV_MSE_per_fold.png.")
    alphas = mod.alphas_
    fig = plt.figure()
    plt.plot(alphas, mod.mse_path_, ':')
    plt.plot(alphas, mod.mse_path_.mean(axis=-1), 'b',
             label='Average over the folds')
    plt.axvline(mod.alpha_, linestyle='--', color='k',
                label="$\\alpha={:0.3f}$".format(mod.alpha_))
    plt.legend()
    plt.xlabel('alpha')
    plt.ylabel('error (or score)')
    plt.title('ElasticNetCV Alpha Error (per CV-fold)')
    plt.axis('tight')
    fig.savefig(os.path.join(visualizations_path,
                             'ElasticNetCV_MSE_per_fold.png'), dpi=300)
    fig.clear()
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE_per_fold.png.")

    # plot R^2 errors (minimization equivalent to MSE)
    logger.info("Start visualizing ElasticNetCV_MSE.png.")
    visualizer = AlphaSelection(mod)
    visualizer.fit(X, y)
    visualizer.poof(outpath=os.path.join(visualizations_path,
                                         'ElasticNetCV_MSE.png'), dpi=300)
    plt.close()
    logger.info("Finished visualizing ElasticNetCV_MSE.png.")

    #%% pairplot not performed since too big
    # X_used = X.loc[:, mod.coef_ != 0]
    # fig = sns.pairplot(pd.concat([y, X_used], axis=1), kind='reg')
    # fig.savefig(os.path.join(visualizations_path,
    #                          'pairplot.png'), dpi=100)
    # fig.clear()
    # plt.close()

    #%% logging time passed
    end = time()
    time_passed = pd.Timedelta(seconds=end - start).round(freq='s')
    logger.info("Time needed to create visualizations: {}"
                .format(time_passed))
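
# Why main() refits the ElasticNetCV with a fixed l1_ratio: when several
# l1_ratio values are searched, mse_path_ gains a leading dimension (one
# error surface per ratio) and can no longer be plotted directly against
# the alphas. A hedged sketch on toy data illustrating the shape change:
import numpy as np
from sklearn.linear_model import ElasticNetCV

rng = np.random.RandomState(0)
X_toy, y_toy = rng.randn(50, 4), rng.randn(50)

multi = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=3).fit(X_toy, y_toy)
single = ElasticNetCV(l1_ratio=0.5, cv=3).fit(X_toy, y_toy)
print(multi.mse_path_.shape)   # 3-D: (n_l1_ratio, n_alphas, n_folds)
print(single.mse_path_.shape)  # 2-D: (n_alphas, n_folds)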
# min_mse = min(mse_list)
# min_mse_index = mse_list.index(min_mse)
# optimal_alpha = alphas[min_mse_index]
# print("Optimal Alpha: ", optimal_alpha)
# print("Minimum MSE: ", min_mse)
#
# plt.scatter(alphas, mse_list)
# plt.ylim((min(mse_list) - 0.0001, max(mse_list) + 0.0001))
# plt.show()

# Yellowbrick Regressor - Predict optimal alpha
ytrain = np.reshape(ytrain, (ytrain.shape[0]))
alphas = np.logspace(-10, 1, 200)
visualizer = AlphaSelection(RidgeCV(alphas=alphas))
visualizer.fit(xtrain, ytrain)
visualizer.show()

# Optimal model
optimal_alpha = 4.103
ridge_reg = RidgeCV(alphas=np.array([optimal_alpha]))
ridge_reg.fit(xtrain, ytrain)
# print("Coefficients: ", ridge_reg.coef_)
y_pred = ridge_reg.predict(xtest)
err = mean_squared_error(ytest, y_pred)
print("MSE for optimal model: ", err)

# Yellowbrick Regressor - Plot error
visualizer = PredictionError(ridge_reg)
visualizer.fit(xtrain, ytrain)
visualizer.score(xtest, ytest)
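
# Hedged aside: optimal_alpha = 4.103 above was transcribed from the plot.
# The same value can be read programmatically right after the AlphaSelection
# fit (before `visualizer` is reused for PredictionError), since the wrapped
# RidgeCV is refit during visualizer.fit():
optimal_alpha = visualizer.estimator.alpha_
print("Optimal Alpha: ", optimal_alpha)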
plt.scatter(range(X_poly.shape[1]), ridge.coef_,
            c=np.sign(ridge.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import RidgeCV

### Find optimal alpha
alphas = np.logspace(-10, 1, 400)
ridge_alpha = RidgeCV(alphas=alphas)
ridge_yb = AlphaSelection(ridge_alpha)
ridge_yb.fit(X, y)
ridge_yb.poof()

### RVF plot
ridge_yb = ResidualsPlot(ridge, hist=True)
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()

### Prediction Error
ridge_yb = PredictionError(ridge, hist=True)
ridge_yb.fit(X_train, y_train)
ridge_yb.score(X_test, y_test)
ridge_yb.poof()
# Alpha usually settles around 5
alphas = np.logspace(-2, 1, 250)

# Instantiate model
cv = KFold(n_splits=5, shuffle=True, random_state=7)
lasso = LassoCV(alphas=alphas, n_alphas=250, fit_intercept=True,
                normalize=False, cv=cv, tol=0.0001, n_jobs=-1, verbose=1)

# Cross-validation
cv_score(lasso)

from yellowbrick.regressor import AlphaSelection
visualizer = AlphaSelection(lasso)
visualizer.fit(Xtrain, ytrain)
g = visualizer.poof()

# Which variables were selected?
lasso.fit(Xtrain, ytrain)

# Put coefficients and variable names in df
lassodf = pd.DataFrame(lasso.coef_, index=Xtrain.columns)

# Select nonzeros
results = lassodf[(lassodf.T != 0).any()]

# Sort by magnitude
results['sorted'] = results[0].abs()
results.sort_values(by='sorted', inplace=True, ascending=False)
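
# After fit, the cross-validation results LassoCV averaged are available
# directly on the estimator; a small sketch using the fitted `lasso` above:
print("Selected alpha:", lasso.alpha_)   # penalty chosen by CV
mean_mse = lasso.mse_path_.mean(axis=1)  # mean MSE per alpha over the folds
print("Min mean CV MSE:", mean_mse.min())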
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV
from yellowbrick.regressor import AlphaSelection

if __name__ == '__main__':
    # Load the regression data set
    df = pd.read_csv("../../../examples/data/concrete/concrete.csv")

    feature_names = ['cement', 'slag', 'ash', 'water', 'splast',
                     'coarse', 'fine', 'age']
    target_name = 'strength'

    # Get the X and y data from the DataFrame
    X = df[feature_names]
    y = df[target_name]

    # Instantiate the linear model and visualizer
    alphas = np.logspace(-10, 1, 400)
    visualizer = AlphaSelection(LassoCV(alphas=alphas))
    visualizer.fit(X, y)
    g = visualizer.poof(outpath="images/alpha_selection.png")
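
# Version note: Yellowbrick 1.0 renamed poof() to show() (poof remained a
# deprecated alias for a while), so on current releases the last line reads:
g = visualizer.show(outpath="images/alpha_selection.png")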
lasso_reg = LassoCV(cv=5, alphas=[0.011], max_iter=15000)  # Previously optimised

## Model Evaluation & Hyperparameter Tuning ##

# CV Root Mean Squared Error on Training Set (Robust Scaled)
cv_rmse(lasso_reg, X_scaled, np.ravel(y))    # LASSO: 0.319
cv_rmse(elastic_reg, X_scaled, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.317

# CV Root Mean Squared Error on Training Set (Standardised)
cv_rmse(lasso_reg, X_standard, np.ravel(y))    # LASSO: 0.2992
cv_rmse(elastic_reg, X_standard, np.ravel(y))  # Elastic Net (ratio = 0.5): 0.3012

# Alpha Selection
alphas = np.logspace(-10, 1, 400)
visualizer = AlphaSelection(elastic_reg)
visualizer.fit(X_scaled, y)
visualizer.show()  # Optimal Alpha = 0.020

alphas = np.logspace(-10, 1, 400)
visualizer = AlphaSelection(elastic_reg)
visualizer.fit(X_standard, y)
visualizer.show()  # Optimal Alpha = 0.020

# Search Algorithms to Further Tune our Hyperparameters
# RandomizedSearchCV to narrow search space
rnd_params = {"l1_ratio": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
              "alphas": [[0.1], [0.2], [0.3], [0.4], [0.5]],
              "max_iter": [15000],
              "normalize": [False]}
rnd_src = RandomizedSearchCV(elastic_reg, param_distributions=rnd_params,
                             n_iter=100, scoring="neg_mean_squared_error",
                             n_jobs=-1)
rnd_src.fit(X_scaled, np.ravel(y))
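
# Hypothetical follow-up: inspect the configuration RandomizedSearchCV picked.
# best_score_ is a negated MSE because of the scoring argument above.
print(rnd_src.best_params_)
print("CV RMSE:", np.sqrt(-rnd_src.best_score_))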
#                 columns='param_l1_ratio')  # tail of a truncated call
sns.heatmap(res, annot=True, cmap="YlGnBu")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import ElasticNetCV

### Find optimal alpha
alphas = np.logspace(-10, 1, 400)
elastic_alpha = ElasticNetCV(alphas=alphas)
elastic_yb = AlphaSelection(elastic_alpha)
elastic_yb.fit(X, y)
elastic_yb.poof()

### RVF plot
elastic_yb = ResidualsPlot(elastic, hist=True)
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()

### Prediction Error
elastic_yb = PredictionError(elastic, hist=True)
elastic_yb.fit(X_train, y_train)
elastic_yb.score(X_test, y_test)
elastic_yb.poof()
    r.append(mean_squared_log_error(y_test, y_pred))
    r.append(np.sqrt(r[0]))
    r.append(r2_score(y_test, y_pred))
    r.append(round(r2_score(y_test, y_pred) * 100, 4))
    return r

""" dataframe that stores the performance of each model """
accu = pd.DataFrame(index=['MSLE', 'Root MSLE', 'R2 Score', 'Accuracy(%)'])

""" RIDGE REGRESSION METHOD """

""" predicting value of alpha """
alphas = 10**np.linspace(10, -2, 400)
model = RidgeCV(alphas=alphas)
visualizer = AlphaSelection(model)
visualizer.fit(X_train, y_train)
visualizer.show()

""" model object and fitting model """
RR = Ridge(alpha=1.109, solver='auto')
RR.fit(X_train, y_train)
y_pred = RR.predict(X_test)

""" model evaluation """
y_test_2, y_pred_2 = remove_neg(y_test, y_pred)
r2_ridge = result(y_test_2, y_pred_2)
print("MSLE      : {}".format(r2_ridge[0]))
print("Root MSLE : {}".format(r2_ridge[1]))
print("R2 Score  : {} or {}%".format(r2_ridge[2], r2_ridge[3]))
accu['Ridge Regression'] = r2_ridge

""" Visualization of Feature Importance """
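
# Note: 10**np.linspace(10, -2, 400) is just an explicit spelling of the
# np.logspace grid used in the other snippets:
assert np.allclose(10**np.linspace(10, -2, 400), np.logspace(10, -2, 400))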
lasso_lars = grid.best_estimator_
plt.scatter(range(X_poly.shape[1]), lasso_lars.coef_,
            c=np.sign(lasso_lars.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoLarsCV

### Find optimal alpha
lassolars_yb = AlphaSelection(LassoLarsCV())
lassolars_yb.fit(X, y)
lassolars_yb.poof()

### RVF plot
lasso_yb = ResidualsPlot(lasso_lars, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction Error
lasso_yb = PredictionError(lasso_lars, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()
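
# Note: unlike the LassoCV/ElasticNetCV snippets, no alphas grid is passed to
# LassoLarsCV above; the LARS algorithm derives the regularization path from
# the data itself, so AlphaSelection can wrap the bare estimator.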
import numpy as np
import bikeshare

from sklearn.linear_model import RidgeCV
from yellowbrick.regressor import AlphaSelection

alphas = np.logspace(-10, 1, 200)
visualizer = AlphaSelection(RidgeCV(alphas=alphas))
visualizer.fit(bikeshare.X, bikeshare.y)
visualizer.poof()
plt.scatter(range(X_poly.shape[1]), lasso.coef_,
            c=np.sign(lasso.coef_), cmap="bwr_r")

######## Yellowbrick
from yellowbrick.regressor import AlphaSelection, ResidualsPlot, PredictionError
from sklearn.linear_model import LassoCV

### Find optimal alpha
alphas = np.logspace(-10, 1, 400)
lasso_alpha = LassoCV(alphas=alphas)
lasso_yb = AlphaSelection(lasso_alpha)
lasso_yb.fit(X, y)
lasso_yb.poof()

### RVF plot
lasso_yb = ResidualsPlot(lasso, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()

### Prediction Error
lasso_yb = PredictionError(lasso, hist=True)
lasso_yb.fit(X_train, y_train)
lasso_yb.score(X_test, y_test)
lasso_yb.poof()
#############################################################################
############################### MODELING ###################################
#############################################################################

########################### RIDGE REGRESSION ###############################

####### a) looking for best parameters
# Run it to find the best alpha

# Set a range for the alphas
alphas_range = np.arange(1, 200, 5)

# Cross-validate for the best alpha
regr_cv = RidgeCV(alphas=alphas_range)

# Visualize alpha selection
visualizer = AlphaSelection(regr_cv)

# Fit the linear regression
visualizer.fit(X, y)
g = visualizer.poof()
visualizer.alpha_  # best parameter turns out to be 81

####### b) Implement Ridge Regression
ridge = Ridge(alpha=visualizer.alpha_)  # this parameter is chosen by RidgeCV
ridge.fit(X_train, y_train)  # Fit a ridge regression on the training data

coefs_ridge = pd.DataFrame(ridge.coef_.T, index=[X.columns])  # Coefficients
coefs_ridge = coefs_ridge.rename(columns={0: 'coef_value'})

# TRAIN SET
pred_train = ridge.predict(X_train)  # Use this model to predict the train data

# Calculate RMSE and R^2 on train
print("RMSE for Train:", sqrt(mean_squared_error(y_train, pred_train)))  # RMSE
print("R^2 for Train:", ridge.score(X_train, y_train))  # R2
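
# Note on visualizer.alpha_ above: recent Yellowbrick releases forward unknown
# attributes on a ModelVisualizer to the wrapped estimator, so this reads the
# alpha_ chosen by RidgeCV. If an older release raises AttributeError, the
# explicit form is:
best_alpha = visualizer.estimator.alpha_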
visualizer.score(X_test, y_test)
visualizer.show()

visualizer = ResidualsPlot(svmReg, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

visualizer = ResidualsPlot(adaReg, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

visualizer = ResidualsPlot(rfReg, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

visualizer = ResidualsPlot(mlpReg, size=(1080, 720))
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

from yellowbrick.regressor import AlphaSelection
from sklearn.linear_model import RidgeCV

model = AlphaSelection(RidgeCV())
model.fit(X_train, y_train)
model.show()
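
# A denser grid usually gives a more informative curve than the three default
# RidgeCV alphas (0.1, 1.0, 10.0); a hedged variant of the block above:
import numpy as np

model = AlphaSelection(RidgeCV(alphas=np.logspace(-10, 1, 200)))
model.fit(X_train, y_train)
model.show()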