def make_ridge():
    # X = StandardScaler().fit_transform(all_training_data)
    X = all_training_data
    y = train_labels
    n_alphas = 200
    alphas = np.logspace(-6, 6, n_alphas)
    ridge = Ridge(fit_intercept=False)

    # Earlier coefficient sweep, kept for reference (the original referenced
    # clf before it was defined; ridge is used here instead):
    # coefs = []
    # for a in alphas:
    #     ridge.set_params(alpha=a)
    #     ridge.fit(X, y)
    #     coefs.append(ridge.coef_)
    #     print(ridge.coef_)
    #     make_prediction(ridge, all_testing_data, test_labels)

    scores = list()
    scores_std = list()
    n_folds = 3
    for i, alpha in enumerate(alphas):
        print(i)
        ridge.alpha = alpha
        this_scores = cross_val_score(ridge, X, y, cv=n_folds, n_jobs=1)
        scores.append(np.mean(this_scores))
        scores_std.append(np.std(this_scores))
        clf = Ridge(fit_intercept=False)
        clf.alpha = alpha
        clf.fit(X, y)
        print(clf.coef_)
        make_prediction(clf, all_testing_data, test_labels)
    scores, scores_std = np.array(scores), np.array(scores_std)

    plt.figure().set_size_inches(8, 6)
    plt.semilogx(alphas, scores)

    # plot error lines showing +/- std. errors of the scores
    std_error = scores_std / np.sqrt(n_folds)
    plt.semilogx(alphas, scores + std_error, 'b--')
    plt.semilogx(alphas, scores - std_error, 'b--')

    # alpha=0.2 controls the translucency of the fill color
    plt.fill_between(alphas, scores + std_error, scores - std_error, alpha=0.2)
    plt.ylabel('CV score +/- std error')
    plt.xlabel('alpha')
    plt.axhline(np.max(scores), linestyle='--', color='.5')
    print(scores.argmax(axis=0))
    plt.xlim([alphas[0], alphas[-1]])
    plt.show()
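# Side note on the sweep above: scikit-learn's RidgeCV performs the same kind
# of alpha search internally. A minimal sketch under the same assumed globals
# (all_training_data, train_labels), not a drop-in replacement for make_ridge:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 200), fit_intercept=False)
ridge_cv.fit(all_training_data, train_labels)
print(ridge_cv.alpha_)  # alpha selected by the built-in cross-validation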
def main(data_dir='./data/', N=10, cv_test_size=0.2, files_to_use='all',
         submit_name='submission.csv'):
    if files_to_use == 'all':
        files_to_use = [
            'dswrf_sfc', 'dlwrf_sfc', 'uswrf_sfc', 'ulwrf_sfc', 'ulwrf_tatm',
            'pwat_eatm', 'tcdc_eatm', 'apcp_sfc', 'pres_msl', 'spfh_2m',
            'tcolc_eatm', 'tmax_2m', 'tmin_2m', 'tmp_2m', 'tmp_sfc'
        ]
    train_sub_str = '_latlon_subset_19940101_20071231.nc'
    test_sub_str = '_latlon_subset_20080101_20121130.nc'

    print 'Loading training data...'
    trainX = load_GEFS_data(data_dir, files_to_use, train_sub_str)
    times, trainY = load_csv_data(os.path.join(data_dir, 'train.csv'))
    print 'Training data shape', trainX.shape, trainY.shape

    # Gotta pick a scikit-learn model
    model = Ridge(normalize=True)  # Normalizing is usually a good idea

    print 'Finding best regularization value for alpha...'
    alphas = np.logspace(-3, 1, 8, base=10)  # List of alphas to check
    alphas = np.array((0.1, 0.2, 0.3, 0.4, 0.5, 0.6))  # overrides the logspace grid
    maes = []
    for alpha in alphas:
        model.alpha = alpha
        mae = cv_loop(trainX, trainY, model, N)
        maes.append(mae)
        print 'alpha %.4f mae %.4f' % (alpha, mae)
    best_alpha = alphas[np.argmin(maes)]
    print 'Best alpha of %s with mean absolute error of %s' % (best_alpha, np.min(maes))

    print 'Fitting model with best alpha...'
    model.alpha = best_alpha
    model.fit(trainX, trainY)

    print 'Loading test data...'
    testX = load_GEFS_data(data_dir, files_to_use, test_sub_str)
    print 'Test data shape', testX.shape

    print 'Predicting...'
    preds = model.predict(testX)

    print 'Saving to csv...'
    save_submission(preds, submit_name, data_dir)
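# cv_loop is defined elsewhere in this project. A minimal sketch consistent
# with the call cv_loop(trainX, trainY, model, N) -- assuming it averages MAE
# over N random hold-out splits of size cv_test_size -- might look like:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

def cv_loop(X, y, model, N, test_size=0.2):
    maes = []
    for i in range(N):
        X_tr, X_cv, y_tr, y_cv = train_test_split(X, y, test_size=test_size,
                                                  random_state=i)
        model.fit(X_tr, y_tr)
        maes.append(mean_absolute_error(y_cv, model.predict(X_cv)))
    return np.mean(maes)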
def compare(X, y, ridge_alpha, lasso_alpha, k, plot):
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
    knn_errors = []
    ridge_errors = []
    lasso_errors = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        pred_y = knn.predict(X_test)
        knn_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

        lasso = Lasso(normalize=True)
        lasso.alpha = lasso_alpha
        lasso.fit(X_train, y_train)
        pred_y = lasso.predict(X_test)
        lasso_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

        ridge = Ridge(normalize=True)
        ridge.alpha = ridge_alpha
        ridge.fit(X_train, y_train)
        pred_y = ridge.predict(X_test)
        ridge_errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))

    if plot:
        plt.plot([0, 1, 2],
                 [np.mean(knn_errors), np.mean(ridge_errors), np.mean(lasso_errors)],
                 'ro')
        plt.title("Comparison")
        plt.xlabel('models (knn - 0, ridge - 1, lasso - 2)')
        plt.ylabel('MSE')
        # plt.xscale('log')
        plt.show()
    return np.mean(knn_errors), np.mean(ridge_errors), np.mean(lasso_errors)
def process_optimized_ridge(data):
    c_alpha = 0.001
    step = 0.01
    max_alpha = 20
    min_mean_sqr_error = 10000000
    max_r2_score = 0
    global optimized_ridge_alpha
    # Grid-search alpha over [0.001, 20], keeping the value with the best R^2
    while c_alpha <= max_alpha:
        model = Ridge()
        model.alpha = c_alpha
        model.fit(data["X_train"], data["y_train"])
        predicted_values = model.predict(data["X_test"])
        mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
        r2_score_calc = r2_score(data["y_test"], predicted_values)
        if max_r2_score < abs(r2_score_calc):
            min_mean_sqr_error = mean_sqr_error
            max_r2_score = r2_score_calc
            optimized_ridge_alpha = c_alpha
        c_alpha = c_alpha + step
    dict_result = {
        "name": "RR",
        'data': {"alpha": optimized_ridge_alpha},
        'mean_sqr_err': min_mean_sqr_error,
        'r2_score': max_r2_score
    }
    return dict_result
def regularization_ridge(X, y):
    # Setup the array of alphas and lists to store scores
    alpha_space = np.logspace(-4, 0, 50)
    ridge_scores = []
    ridge_scores_std = []

    # Create a ridge regressor: ridge
    ridge = Ridge(normalize=True)

    # Compute scores over range of alphas
    for alpha in alpha_space:
        # Specify the alpha value to use: ridge.alpha
        ridge.alpha = alpha
        # Perform 10-fold CV: ridge_cv_scores
        ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
        # Append the mean of ridge_cv_scores to ridge_scores
        ridge_scores.append(np.mean(ridge_cv_scores))
        # Append the std of ridge_cv_scores to ridge_scores_std
        ridge_scores_std.append(np.std(ridge_cv_scores))

    # Display the plot
    display_plot(ridge_scores, ridge_scores_std)
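# display_plot is assumed to be defined elsewhere; one plausible implementation
# (mirroring the version defined in a later snippet, with alpha_space made an
# explicit parameter) is:
import numpy as np
import matplotlib.pyplot as plt

def display_plot(cv_scores, cv_scores_std, alpha_space=np.logspace(-4, 0, 50)):
    cv_scores = np.array(cv_scores)
    cv_scores_std = np.array(cv_scores_std)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)  # standard error for 10-fold CV
    ax.fill_between(alpha_space, cv_scores + std_error,
                    cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
    plt.show()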
def process_optimized_ridge_step2(data):
    # Refit a ridge model with the alpha found by process_optimized_ridge
    model = Ridge()
    model.alpha = optimized_ridge_alpha
    model.fit(data["X_train"], data["y_train"])
    predicted_values = model.predict(data["X_test"])
    mean_sqr_error = mean_squared_error(data["y_test"], predicted_values)
    r2_score_calc = r2_score(data["y_test"], predicted_values)
    dict_result = {
        "name": "RR",
        'data': {"alpha": optimized_ridge_alpha},
        'mean_sqr_err': mean_sqr_error,
        'r2_score': r2_score_calc
    }
    return dict_result
def main():
    '''Linear regression minimizes a loss function and chooses a coefficient
    for each feature variable. Large coefficients can lead to overfitting;
    regularization penalizes large coefficients. This function uses
    RIDGE REGRESSION.'''
    # Create a dataframe from the .csv file
    df = pd.read_csv('gapminderstats.csv')
    # Create an array for the target variable
    y = np.array(df['life'])
    # Drop the target variable column from the data frame
    df_X = df.drop('life', axis=1)
    # Get the column names
    # df_columns = df_X.dtypes.index
    # Create an array for the features
    X = np.array(df_X)

    # Setup the array of alphas and lists to store scores
    alpha_space = np.logspace(-4, 0, 50)
    ridge_scores = []
    ridge_scores_std = []

    # Create a ridge regressor: ridge
    ridge = Ridge(normalize=True)

    # Compute scores over range of alphas
    for alpha in alpha_space:
        # Specify the alpha value to use: ridge.alpha
        ridge.alpha = alpha
        # Perform 10-fold CV: ridge_cv_scores
        ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
        # Append the mean of ridge_cv_scores to ridge_scores
        ridge_scores.append(np.mean(ridge_cv_scores))
        # Append the std of ridge_cv_scores to ridge_scores_std
        ridge_scores_std.append(np.std(ridge_cv_scores))

    # Display the plot
    display_plot(ridge_scores, ridge_scores_std, alpha_space)
def calc_ridge(X, y, alphas, plot):
    kf = KFold(n_splits=10)
    kf.get_n_splits(X)
    mses = []
    for alpha in alphas:
        errors = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            ridge = Ridge(normalize=True)
            ridge.alpha = alpha
            ridge.fit(X_train, y_train)
            pred_y = ridge.predict(X_test)
            errors.append(mean_squared_error(y_true=y_test, y_pred=pred_y))
        mses.append(np.mean(errors))
    if plot:
        plt.plot(alphas, mses, 'ro')
        plt.title("MSE for different alpha levels for Ridge Regression")
        plt.xlabel('alpha')
        plt.ylabel('MSE')
        plt.xscale('log')
        plt.show()
    return mses
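# Example call for calc_ridge -- X and y are assumed to be a pandas DataFrame
# and Series, since the function indexes them with .iloc:
# mses = calc_ridge(X, y, alphas=np.logspace(-4, 0, 20), plot=True)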
_ = plt.margins(0.02)
plt.show()

# Ridge (-> first choice for regression models!)
# adds sum of squared coefficients to the loss function
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge = Ridge(normalize=True)  # Model

# which alpha??
alpha_space = np.logspace(-4, 0, 50)  # array of alphas
ridge_scores = []  # lists to store scores
ridge_scores_std = []
for alpha in alpha_space:  # compute scores over range of alphas
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha  # equivalent to Ridge(alpha=alpha)
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))
display_plot(ridge_scores, ridge_scores_std)

# ElasticNet() (-> see Tuning section)
#endregion (REGULARIZED REGRESSION)

#region CROSS VALIDATION
# problem 1: performance depends on the way the data is split
# problem 2: overfitting to (one) sample
         label='Test data')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-10, xmax=50, color='black', lw=2)
plt.xlim([-10, 50])
plt.show()

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
MSE2_test_scores = []
MSE2_train_scores = []
R22_test_scores = []
R22_train_scores = []
for alpha in alpha_space:
    sr.alpha = alpha
    sr.fit(X_train, y_train)  # fit on the training split only (fitting on all of X leaks test data)
    y_train_pred2 = sr.predict(X_train)
    y_test_pred2 = sr.predict(X_test)
    MSE2_test_scores.append(mean_squared_error(y_test, y_test_pred2))
    MSE2_train_scores.append(mean_squared_error(y_train, y_train_pred2))
    R22_test_scores.append(r2_score(y_test, y_test_pred2))
    R22_train_scores.append(r2_score(y_train, y_train_pred2))

plt.plot(alpha_space, MSE2_test_scores)
plt.xlabel('alpha_space')
plt.ylabel('MSE2_test_scores')
plt.show()

plt.plot(alpha_space, MSE2_train_scores)
plt.xlabel('alpha_space')
plt.ylabel('MSE2_train_scores')
# Lasso is great for feature selection, but when building regression models,
# Ridge regression should be your first choice.
# Recall that lasso performs regularization by adding to the loss function a
# penalty term of the absolute value of each coefficient multiplied by some
# alpha. This is also known as L1 regularization because the regularization
# term is the L1 norm of the coefficients. If instead you took the sum of the
# squared values of the coefficients multiplied by some alpha - like in Ridge
# regression - you would be computing the L2 norm.

## Regularization RIDGE
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)
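# To see the L1-vs-L2 difference described above in practice: on the same data,
# Lasso drives some coefficients exactly to zero while Ridge only shrinks them.
# A minimal sketch on synthetic data (all names here are illustrative):
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Lasso, Ridge

X_demo, y_demo = make_regression(n_samples=100, n_features=10,
                                 n_informative=3, noise=5.0, random_state=0)
lasso_demo = Lasso(alpha=1.0).fit(X_demo, y_demo)
ridge_demo = Ridge(alpha=1.0).fit(X_demo, y_demo)
print('zero Lasso coefs:', np.sum(lasso_demo.coef_ == 0))  # several exact zeros
print('zero Ridge coefs:', np.sum(ridge_demo.coef_ == 0))  # typically none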
I'll start off with using Ridge to perform a regularization of the regression model.
"""
# import modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

alpha_space = np.logspace(-4, 0, 50)  # 50 alphas between 1e-4 and 1 (was 1, which yields a single value)
ridge_scores = []
ridge_scores_std = []
ridge = Ridge(normalize=True)
for alpha in alpha_space:
    ridge.alpha = alpha  # was ridge.alpha = alpha_space, which assigned the whole array
    ridge_cv_scores = cross_val_score(ridge, X_train, y_train, cv=10)
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))

"""Then build a decision tree for my model using XGBRegressor, which comes with a built-in tree parameter."""
from sklearn.metrics import mean_squared_error

# Instantiating the XGBRegressor ('reg:linear' is deprecated in recent
# xgboost releases in favor of 'reg:squarederror')
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10)
# Fitting the regressor to the training set
xg_reg.fit(X_train, y_train)
# Making predictions
preds1 = xg_reg.predict(X_test)

"""Visualizing the trees and feature importance"""
xgb.plot_tree(xg_reg, num_trees=0, rankdir="LR")
def oppgave_6(o=15, seed=4, test=True):
    # Load the terrain
    terrain = imread("{}SRTM_data_Norway_1.tif".format(image_path))

    # Show the terrain
    plt.figure()
    plt.title('Terrain Norway 1, Original')
    plt.imshow(terrain, cmap='gray')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()

    # Pick out a small square to analyze if test is set to True
    if test:
        square_size = 100
        x_shift = np.random.randint(0, 1801 - square_size)
        y_shift = np.random.randint(0, 3601 - square_size)
        terrain = terrain[y_shift:y_shift + square_size,
                          x_shift:x_shift + square_size]
        plt.figure()
        plt.title('Terrain part 1, Original {} pt box'.format(square_size))
        plt.imshow(terrain, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()
    else:
        # Use settings determined by analysing small squares for analysis
        # on the entire dataset. Attempting to rebuild the image from
        # a model based on evenly spaced datapoints.

        # Set model parameters
        order = 15
        # Ridge parameter
        lmd = 0.0001
        # Lasso parameter
        alph = 0.0001
        # Set the coarseness of the sample grid
        coarseness = 5

        x_dimension_original = len(terrain[0, :])
        y_dimension_original = len(terrain[:, 0])
        x_dimension = x_dimension_original // coarseness
        y_dimension = y_dimension_original // coarseness
        terrain_points = np.zeros((y_dimension, x_dimension))
        for x_axis in range(x_dimension):
            for y_axis in range(y_dimension):
                terrain_points[y_axis, x_axis] = terrain[y_axis * coarseness,
                                                         x_axis * coarseness]

        # Create mesh grid for training data, selected points
        x = np.linspace(0, 1, x_dimension)
        y = np.linspace(0, 1, y_dimension)
        x_grid, y_grid = np.meshgrid(x, y)

        # Create mesh grid for original data
        x_original = np.linspace(0, 1, x_dimension_original)
        y_original = np.linspace(0, 1, y_dimension_original)
        x_grid_original, y_grid_original = np.meshgrid(x_original, y_original)

        # Flatten grids
        data = np.ravel(terrain_points)
        data_original = np.ravel(terrain)
        x = np.ravel(x_grid)
        y = np.ravel(y_grid)
        x_original = np.ravel(x_grid_original)
        y_original = np.ravel(y_grid_original)

        # Create a scaler to normalize data
        scaler = MinMaxScaler()
        print("Running time: {} seconds".format(time() - t0))
        scaler.fit(data.reshape(-1, 1))

        # Normalizing training data
        normalized_data = scaler.transform(data.reshape(-1, 1))
        normalized_data = normalized_data[:, 0]

        # Normalizing original data (used below for the rebuild MSE)
        normalized_data_original = scaler.transform(data_original.reshape(-1, 1))
        normalized_data_original = normalized_data_original[:, 0]

        # Initiate instances of the regressors
        linear_regression = LinearRegression()
        ridge_regression = Ridge(solver="svd", alpha=lmd)
        lasso_regression = Lasso(alpha=alph)
        print("Running time: {} seconds".format(time() - t0))

        # Create training matrix
        A = design_matrix(order, x, y)
        # Remove intercept
        A = A[:, 1:]
        print("Running time: {} seconds".format(time() - t0))

        # Create prediction matrix
        X_test = design_matrix(order, x_original, y_original)
        X_test = X_test[:, 1:]
        print("Running time: {} seconds".format(time() - t0))

        # Make prediction using OLS model
        linear_regression.fit(A, normalized_data)
        rebuilt = linear_regression.predict(X_test)
        print("OLS MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        fig_rebuild = plt.figure(figsize=(9, 5))
        ax1 = fig_rebuild.add_subplot(131)
        ax2 = fig_rebuild.add_subplot(132)
        ax3 = fig_rebuild.add_subplot(133)
        plt.title('Terrain Norway 1, rebuild')
        ax1.imshow(rebuilt, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')

        # Make prediction using Ridge model
        ridge_regression.fit(A, normalized_data)
        rebuilt = ridge_regression.predict(X_test)
        print("Ridge MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        print("Running time: {} seconds".format(time() - t0))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        ax2.imshow(rebuilt, cmap='gray')

        # Make prediction using LASSO model
        lasso_regression.fit(A, normalized_data)
        rebuilt = lasso_regression.predict(X_test)
        print("LASSO MSE: ", MSE(normalized_data_original, rebuilt))
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        print("Running time: {} seconds".format(time() - t0))
        rebuilt = np.reshape(rebuilt, y_grid_original.shape)
        ax3.imshow(rebuilt, cmap='gray')

        fig_rebuild.savefig("{}TerrainRebuilOrder{}P4.png".format(plots_path, order))
        return ()

    # Get dimensions of data set and make a grid to base the model on
    y_dimension = len(terrain[:, 0])
    x_dimension = len(terrain[0, :])
    x = np.linspace(0, 1, x_dimension)
    y = np.linspace(0, 1, y_dimension)
    x_grid, y_grid = np.meshgrid(x, y)

    # Flatten grid
    data = np.ravel(terrain)
    x = np.ravel(x_grid)
    y = np.ravel(y_grid)

    # Set random seed
    np.random.seed(seed)

    # Create a scaler to normalize data
    scaler = MinMaxScaler()
    scaler.fit(data.reshape(-1, 1))
    normalized_data = scaler.transform(data.reshape(-1, 1))
    normalized_data = normalized_data[:, 0]

    # Create an instance of the sklearn KFold class to split data for k-fold CV
    splits = 5
    kfold = KFold(n_splits=splits, shuffle=True)

    # Set a range of polynomial orders to fit to the data
    polynomial_order = np.arange(o) + 1

    # ---------OLS------------------------------
    # ------------------------------------------
    # Solve using OLS
    linear_regression = LinearRegression()
    dta = list()
    for order in polynomial_order:
        print("Using polynomial order {}".format(order))
        # Creating design matrix
        A = design_matrix(order, x, y)
        mse_test = np.zeros(splits)
        mse_train = np.zeros(splits)
        counter = 0
        # Initiating k-fold CV
        for train_index, test_index in kfold.split(normalized_data):
            print("Calculating fold {} of {}".format(counter + 1, splits))
            X_train, X_test = A[train_index], A[test_index]
            y_train, y_test = normalized_data[train_index], normalized_data[test_index]
            # Using current polynomial order and fold to solve using OLS
            linear_regression.fit(X_train, y_train)
            ytilde = linear_regression.predict(X_train)
            ypredict = linear_regression.predict(X_test)
            # Get MSE metric for training and testing data
            mse_test[counter] = MSE(y_test, ypredict)
            mse_train[counter] = MSE(y_train, ytilde)
            counter = counter + 1
            print(counter)
        print("Running time: {} seconds".format(time() - t0))
        dta.append(["{}".format(order), mse_test.mean(), mse_train.mean()])
    '''
    rebuilt = linear_regression.predict(A)
    rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
    rebuilt = np.reshape(rebuilt, y_grid.shape)
    plt.figure()
    plt.title('Terrain Norway 1, rebuild')
    plt.imshow(rebuilt, cmap='gray')
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
    '''
    df = pd.DataFrame(dta, columns=["Polynomial", "MSE test set", "MSE training set"])

    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    ax1.plot(df["Polynomial"], df["MSE training set"], label="Training OLS")
    ax2.plot(df["Polynomial"], df["MSE test set"], label="Test OLS")
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainOLStrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainOLStestSeed{}.png".format(plots_path, seed))

    # ---------RIDGE----------------------------
    # ------------------------------------------
    # Create a dictionary to store dataframes for each Ridge parameter
    dataframe_dic = dict()
    ridge_regression = Ridge(solver="svd")
    # Set a range of shrinkage factors for the Ridge regression
    lambdas = np.logspace(-5, -1, 10)
    for lmd in lambdas:
        print("Calculating Ridge, lambda: {}".format(lmd))
        # Create a list to store the results of each iteration in
        dta = list()
        for order in polynomial_order:
            print("Using polynomial order {}".format(order))
            # Creating design matrix
            A = design_matrix(order, x, y)
            # Removing intercept
            A = A[:, 1:]
            lambda_mse_test = np.zeros(splits)
            lambda_mse_train = np.zeros(splits)
            counter = 0
            # Initiating k-fold CV
            for train_index, test_index in kfold.split(normalized_data):
                X_train, X_test = A[train_index], A[test_index]
                y_train, y_test = normalized_data[train_index], normalized_data[test_index]
                # Using current lambda and polynomial order, solve using Ridge
                ridge_regression.alpha = lmd
                ridge_regression.fit(X_train, y_train)
                # Estimate testing and training data
                ypredict = ridge_regression.predict(X_test)
                ytilde = ridge_regression.predict(X_train)
                # Get MSE metric for training and testing data
                lambda_mse_test[counter] = MSE(y_test, ypredict)
                lambda_mse_train[counter] = MSE(y_train, ytilde)
                print("Calculating fold {} of {}".format(counter + 1, splits))
                counter = counter + 1
            print("Running time: {} seconds".format(time() - t0))
            dta.append(["{}".format(order), lambda_mse_test.mean(),
                        lambda_mse_train.mean()])
        '''
        rebuilt = ridge_regression.predict(A)
        rebuilt = scaler.inverse_transform(rebuilt.reshape(-1, 1))
        rebuilt = np.reshape(rebuilt, y_grid.shape)
        plt.figure()
        plt.title('Terrain Norway 1, rebuild')
        plt.imshow(rebuilt, cmap='gray')
        plt.xlabel('X')
        plt.ylabel('Y')
        plt.show()
        '''
        df = pd.DataFrame(dta, columns=["Polynomial", "MSE test set", "MSE training set"])
        dataframe_dic[lmd] = df

    cmap = plt.get_cmap('jet_r')
    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    n = 0
    for df in dataframe_dic:
        ax1.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE training set"],
                 color=cmap(float(n) / len(lambdas)),
                 label="Alpha=%10.2E" % (df))
        ax2.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE test set"],
                 color=cmap(float(n) / len(lambdas)),
                 label="Alpha=%10.2E" % (df))
        n = n + 1
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainRidgetrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainRidgetestSeed{}.png".format(plots_path, seed))

    # ---------LASSO----------------------------
    # ------------------------------------------
    # Create an instance of the Lasso class from sklearn
    lasso_regression = Lasso()
    # Set a range of shrinkage factors for the LASSO regression
    alphas = np.logspace(-5, -2, 10)
    dataframe_dic = dict()
    for alph in alphas:
        print("Calculating LASSO, alpha: {}".format(alph))
        # Create a list to store the results of each iteration in
        dta = list()
        for order in polynomial_order:
            print("Using polynomial order {}".format(order))
            # Creating design matrix
            A = design_matrix(order, x, y)
            # Removing intercept
            A = A[:, 1:]
            alpha_mse_test = np.zeros(splits)
            alpha_mse_train = np.zeros(splits)
            counter = 0
            # Initiating k-fold CV
            for train_index, test_index in kfold.split(normalized_data):
                X_train, X_test = A[train_index], A[test_index]
                y_train, y_test = normalized_data[train_index], normalized_data[test_index]
                # Using current alpha and polynomial order, solve using Lasso
                lasso_regression.alpha = alph
                lasso_regression.fit(X_train, y_train)
                # Estimate testing and training data
                ypredict = lasso_regression.predict(X_test)
                ytilde = lasso_regression.predict(X_train)
                # Get MSE metric for training and testing data
                alpha_mse_test[counter] = MSE(y_test, ypredict)
                alpha_mse_train[counter] = MSE(y_train, ytilde)
                print("Calculating fold {} of {}".format(counter + 1, splits))
                counter = counter + 1
            print("Running time: {} seconds".format(time() - t0))
            dta.append(["{}".format(order), alpha_mse_test.mean(),
                        alpha_mse_train.mean()])
        df = pd.DataFrame(dta, columns=["Polynomial", "MSE test set", "MSE training set"])
        dataframe_dic[alph] = df

    cmap = plt.get_cmap('jet_r')
    plt.figure()
    fig1 = plt.figure(figsize=(8, 4))
    ax1 = fig1.add_subplot(111)
    ax1.set_position([0.1, 0.1, 0.6, 0.8])
    ax1.set_xlabel("Polynomial order")
    ax1.set_ylabel("Training MSE")
    fig2 = plt.figure(figsize=(8, 4))
    ax2 = fig2.add_subplot(111)
    ax2.set_position([0.1, 0.1, 0.6, 0.8])
    ax2.set_xlabel("Polynomial order")
    ax2.set_ylabel("Testing MSE")
    n = 0
    for df in dataframe_dic:
        ax1.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE training set"],
                 color=cmap(float(n) / len(alphas)),
                 label="Alpha=%10.2E" % (df))
        ax2.plot(dataframe_dic[df]["Polynomial"],
                 dataframe_dic[df]["MSE test set"],
                 color=cmap(float(n) / len(alphas)),
                 label="Alpha=%10.2E" % (df))
        n = n + 1
    fig1.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig2.legend(bbox_to_anchor=(0.71, 0.5), loc="center left", borderaxespad=0)
    fig1.savefig("{}TerrainLASSOtrainSeed{}.png".format(plots_path, seed))
    fig2.savefig("{}TerrainLASSOtestSeed{}.png".format(plots_path, seed))
def main(lat, lon, station_index):
    files = ['dswrf_sfc', 'dlwrf_sfc', 'uswrf_sfc', 'ulwrf_sfc', 'ulwrf_tatm',
             'pwat_eatm', 'tcdc_eatm', 'apcp_sfc', 'pres_msl', 'spfh_2m',
             'tcolc_eatm', 'tmax_2m', 'tmin_2m', 'tmp_2m', 'tmp_sfc']
    train_sub_strings = '_latlon_subset_19940101_20071231.nc'
    # test_sub_str = '_latlon_subset_20080101_20121130.nc'

    # Load csv solar energy
    print 'Importing solar energy training data'
    energy = np.genfromtxt('train.csv', delimiter=',', dtype="float")
    energy = np.squeeze(energy[:, station_index])
    energy = np.delete(energy, 0, 0)

    # Split into train and test data
    print 'Splitting solar energy data into test and train data'
    energy_split = np.split(energy, [4018, 5113])
    train_energy = energy_split[0]
    test_energy = energy_split[1]

    # Load netCDF4 data for a specific point (lat, lon)
    train_matrix = loadNetCDF4(files, train_sub_strings, 5113, lat, lon)
    # Delete zero column
    train_matrix = np.delete(train_matrix, 0, 1)

    # Split into train and test data
    print 'Splitting weather data into test and train data'
    train_split = np.split(train_matrix, [4018, 5113])
    train_matrix = train_split[0]
    test_matrix = train_split[1]

    # Build csv train
    np.savetxt(str(lat) + '_' + str(lon) + '_' + str(station_index) + '_train.csv',
               train_matrix, delimiter=",", fmt="%.06f")

    print 'Setting up Regressor'
    ridge = Ridge()
    # Prepare a range of alpha values to test
    alphas = np.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
    # Printing alphas, taken from scikit
    # print_alpha(ridge, train_matrix, train_energy, alphas)

    # Create and fit a ridge regression model, testing each alpha (taken from scikit)
    grid = GridSearchCV(estimator=ridge, param_grid=dict(alpha=alphas))
    grid.fit(train_matrix, train_energy)
    print 'Best estimated alpha: '
    print(grid.best_estimator_.alpha)
    ridge.alpha = grid.best_estimator_.alpha

    print 'Training the Regressor'
    ridge.fit(train_matrix, train_energy)

    print 'Predicting Energy'
    prediction_matrix = ridge.predict(test_matrix)
    # Save csv prediction
    np.savetxt(str(lat) + '_' + str(lon) + '_' + str(station_index) + '_prediction.csv',
               prediction_matrix, delimiter=",", fmt="%d")

    # Plotting
    # Setting up date x-axis
    time = pd.date_range('2005-01-01', periods=1095)
    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    xfmt = mdates.DateFormatter('%d-%m-%y')
    ax.xaxis.set_major_formatter(xfmt)

    # Plot prediction and actual values
    ax = plt.gca()
    ax.plot(time, prediction_matrix, linewidth=0.5)
    ax.plot(time, test_energy, linewidth=0.5)

    # Labels and legend
    plt.xlabel('Time')
    plt.ylabel('Joules per square meter')
    plt.title('Solar Energy of Tahlequah (Oklahoma)' +
              ' (lat: ' + str(lat) + ' lon: ' + str(lon - 360) + ')')
    plt.axis('tight')
    prediction_patch = mpatches.Patch(color='blue', label='Prediction')
    measured_patch = mpatches.Patch(color='orange', label='Measured')
    plt.legend(handles=[prediction_patch, measured_patch])
    plt.show()

    # Plot difference graph
    difference = np.subtract(prediction_matrix, test_energy)
    fig, ax = plt.subplots(1)
    fig.autofmt_xdate()
    xfmt = mdates.DateFormatter('%d-%m-%y')
    ax.xaxis.set_major_formatter(xfmt)
    ax = plt.gca()
    ax.plot(time, difference, linewidth=0.5)
    # ax.plot(time, np.full((1095,), 4000000), color='orange')
    # ax.plot(time, np.full((1095,), -4000000), color='orange')
    plt.xlabel('Time')
    plt.ylabel('Joules per square meter')
    plt.title('Solar Energy of Tahlequah (Oklahoma)' +
              ' (lat: ' + str(lat) + ' lon: ' + str(lon - 360) + ')')
    plt.axis('tight')
    difference_patch = mpatches.Patch(color='blue', label='Difference')
    plt.legend(handles=[difference_patch])
    plt.show()
# Fitting ridge regression models over a range of different alphas, and
# plotting cross-validated R^2 scores for each

# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)
reg2 = Ridge(alpha=1)
reg3 = Lasso(alpha=1)
reg1.fit(trainX, trainy)
reg1.coef_
reg2.fit(trainX, trainy)
reg2.coef_
reg3.fit(trainX, trainy)
reg3.coef_

alphas = np.logspace(-3, 3, 30)  # generate 30 alpha values
linear_r2 = reg1.score(validX, validy)
result = pd.DataFrame(index=alphas, columns=['Ridge', 'Lasso'])
for alpha in alphas:
    reg2.alpha = alpha
    reg3.alpha = alpha
    reg2.fit(trainX, trainy)
    result.loc[alpha, 'Ridge'] = reg2.score(validX, validy)
    reg3.fit(trainX, trainy)
    result.loc[alpha, 'Lasso'] = reg3.score(validX, validy)

plt.plot(np.log(alphas), result['Ridge'], label="Ridge")
plt.plot(np.log(alphas), result['Lasso'], label="Lasso")
plt.hlines(linear_r2, np.log(alphas[0]), np.log(alphas[-1]),
           ls=':', color="k", label='Ordinary')
plt.legend()
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from math import sqrt
import pandas as pd
import numpy as np

filename = 'task1a_lm1d1z/train.csv'
data = pd.read_csv(filename)
y = data['y']
X = data.drop(['Id', 'y'], axis=1)

lam = [0.1, 1, 10, 100, 1000]
ridge = Ridge(normalize=False)
rms = []
for parameter in lam:
    ridge.alpha = parameter
    predicted = cross_val_predict(ridge, X, y, cv=10)
    # RMSE; sqrt was imported but unused, and the variable name rms suggests
    # the root was intended
    rms.append(sqrt(mean_squared_error(y, predicted)))
print(rms)
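# Note: cross_val_predict pools predictions from all folds before scoring,
# which is not the same as the mean of per-fold errors. An alternative sketch
# that scores each fold separately and averages (the
# 'neg_root_mean_squared_error' scoring string is available in recent
# scikit-learn versions):
from sklearn.model_selection import cross_val_score

for parameter in lam:
    ridge.alpha = parameter
    fold_rmse = -cross_val_score(ridge, X, y, cv=10,
                                 scoring='neg_root_mean_squared_error')
    print(parameter, fold_rmse.mean())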
print('MSE train: %.3f, test: %.3f' % (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred)))

########################### Ridge
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []
ridge = Ridge(normalize=True)
for alpha in alpha_space:
    ridge.alpha = alpha  # set the scalar alpha (not a new Ridge instance)
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)  # score the estimator itself
    ridge_scores.append(np.mean(ridge_cv_scores))
    ridge_scores_std.append(np.std(ridge_cv_scores))

def display_plot(cv_scores, cv_scores_std):
    # convert to arrays so the +/- std-error arithmetic below works on lists too
    cv_scores = np.array(cv_scores)
    cv_scores_std = np.array(cv_scores_std)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(alpha_space, cv_scores)
    std_error = cv_scores_std / np.sqrt(10)
    ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2)
    ax.set_ylabel('CV Score +/- Std Error')
    ax.set_xlabel('Alpha')
    ax.axhline(np.max(cv_scores), linestyle='--', color='.5')
    ax.set_xlim([alpha_space[0], alpha_space[-1]])
    ax.set_xscale('log')
reg3 = Lasso(alpha=1)
reg1.fit(train_X, train_y)
reg1.score(test_X, test_y)
reg2.fit(train_X, train_y)
reg2.score(test_X, test_y)
reg3.fit(train_X, train_y)
reg3.score(test_X, test_y)

# Search for the alpha parameter using logspace
alphas = np.logspace(-3, 3, 30)
result = pd.DataFrame(index=alphas, columns=['Ridge', 'Lasso'])
for alpha in alphas:
    reg2.alpha = alpha
    reg3.alpha = alpha
    reg2.fit(train_X, train_y)
    result.loc[alpha, 'Ridge'] = reg2.score(test_X, test_y)
    reg3.fit(train_X, train_y)
    result.loc[alpha, 'Lasso'] = reg3.score(test_X, test_y)

param_Ridge = 0.78804
param_Lasso = 0.001

## Test 5-fold cross validation
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for train, test in kf.split(train_data):
    print(train, test)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge3_scores = []
ridge6_scores = []
ridge9_scores = []
alpha_space = np.logspace(-4, 0, 50)
ridge3 = Ridge()
ridge6 = Ridge()
ridge9 = Ridge()

# finding the best alpha
for alpha in alpha_space:
    ridge3.alpha = alpha
    ridge3_cv_scores = cross_val_score(ridge3, X, y3, cv=10)
    ridge3_scores.append(np.mean(ridge3_cv_scores))
    ridge6.alpha = alpha
    ridge6_cv_scores = cross_val_score(ridge6, X, y6, cv=10)
    ridge6_scores.append(np.mean(ridge6_cv_scores))
    ridge9.alpha = alpha
    ridge9_cv_scores = cross_val_score(ridge9, X, y9, cv=10)
    ridge9_scores.append(np.mean(ridge9_cv_scores))

print("The best alpha value is: ", alpha_space[np.argmax(ridge3_scores)])
print("The best alpha value is: ", alpha_space[np.argmax(ridge6_scores)])
print("The best alpha value is: ", alpha_space[np.argmax(ridge9_scores)])

ridge3 = Ridge(alpha=alpha_space[np.argmax(ridge3_scores)])
def main(data_dir='./data/', N=10, cv_test_size=0.3, files_to_use='all',
         submit_name='submission.csv'):
    if files_to_use == 'all':
        files_to_use = [
            'dswrf_sfc', 'dlwrf_sfc', 'uswrf_sfc', 'ulwrf_sfc', 'ulwrf_tatm',
            'pwat_eatm', 'tcdc_eatm', 'apcp_sfc', 'pres_msl', 'spfh_2m',
            'tcolc_eatm', 'tmax_2m', 'tmin_2m', 'tmp_2m', 'tmp_sfc'
        ]
    train_sub_str = '_latlon_subset_19940101_20071231.nc'
    test_sub_str = '_latlon_subset_20080101_20121130.nc'

    print('Loading training data...')
    trainX = load_GEFS_data(data_dir, files_to_use, train_sub_str)  # training samples
    times, trainY = load_csv_data(os.path.join(data_dir, 'train.csv'))  # training targets
    print('Training data shape', trainX.shape, trainY.shape)

    # Gotta pick a scikit-learn model
    model = Ridge(normalize=True)  # Normalizing is usually a good idea

    print('Finding best regularization value for alpha...')
    alphas = np.logspace(-3, 1, 8, base=10)  # List of alphas to check
    alphas = np.array((0.1, 0.2, 0.3, 0.4, 0.5, 0.6))  # overrides the logspace grid
    maes = []
    for alpha in alphas:
        model.alpha = alpha
        mae = cv_loop(trainX, trainY, model, N)
        maes.append(mae)
        print('alpha %.4f mae %.4f' % (alpha, mae))
    best_alpha = alphas[np.argmin(maes)]
    print('Best alpha of %s with mean absolute error of %s' % (best_alpha, np.min(maes)))

    print('Fitting model with best alpha...')
    model.alpha = best_alpha
    model.fit(trainX, trainY)

    print('Loading test data...')
    testX = load_GEFS_data(data_dir, files_to_use, test_sub_str)
    print('Raw test data shape', testX.shape)

    # predictions_rf = run_random_forest(trainX, trainY, testX)
    # predictions_svr = run_svr(trainX, trainY, testX)
    # predictions_ridge = run_ridge(trainX, trainY, testX)
    # predictions_gbr = run_gbr(trainX, trainY, testX)
    #
    # parameters = {
    #     "loss": 'ls',
    #     "n_estimators": 3000,
    #     "learning_rate": 0.035,
    #     "max_features": 80,
    #     "max_depth": 7,
    #     "subsample": 0.5
    # }
    # model = GradientBoostingRegressor(parameters)
    # print("CV loop ", cv_loop(trainX, trainY[:, ], model, 10))

    print('Predicting...')
    preds = model.predict(testX)

    print('Saving to csv...')
    save_submission(preds, submit_name, data_dir)
#################################
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                    random_state=508)

# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)
ridge.alpha = 0.75
ridge.fit(X_train, y_train)

# Calling the score method, which compares the predicted values to the actual values
y_score = ridge.score(X_test, y_test)

# The score is directly comparable to R-Square
print(y_score)

# Predict on the test data: y_pred
y_pred = ridge.predict(X_test)

# Compute and print R^2 and RMSE
print("R^2: {}".format(ridge.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Import necessary modules
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Setup the array of alphas and lists to store scores
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# Create a ridge regressor: ridge
ridge = Ridge(normalize=True)

# Compute scores over range of alphas
for alpha in alpha_space:
    # Specify the alpha value to use: ridge.alpha
    ridge.alpha = alpha
    # Perform 10-fold CV: ridge_cv_scores
    ridge_cv_scores = cross_val_score(ridge, X, y, cv=10)
    # Append the mean of ridge_cv_scores to ridge_scores
    ridge_scores.append(np.mean(ridge_cv_scores))
    # Append the std of ridge_cv_scores to ridge_scores_std
    ridge_scores_std.append(np.std(ridge_cv_scores))

# Display the plot
display_plot(ridge_scores, ridge_scores_std)