def one_fold_cv(train_data, validation_data, test_data, zip_):
    """Pick the best ``p`` for one ZIP code on a validation fold, then score on test.

    For every candidate ``p`` in ``range(11, 19)`` the function calls
    ``get_predictions(train_data, zip_, 12, p=p)`` and compares element ``[0]``
    of the result against the ``PK_Count`` values of the validation rows whose
    ``ZIP`` equals ``zip_``.  The candidate with the lowest validation MSE is
    then used on ``train_data`` and ``validation_data`` concatenated, and that
    prediction is scored against the test rows of the same ZIP code.

    Parameters
    ----------
    train_data, validation_data, test_data
        Frames exposing ``ZIP`` and ``PK_Count`` columns
        (``train_data`` is only passed on to ``get_predictions``).
    zip_
        ZIP code to evaluate.

    Returns
    -------
    tuple
        ``(zip_, best_p, test_prediction, mse, r2, test_mse, test_r2)`` where
        ``mse`` / ``r2`` are the per-candidate validation scores (in candidate
        order) and ``test_prediction`` is the full return value of
        ``get_predictions`` for the selected ``p``.

    Raises
    ------
    ValueError
        If no candidate produced a comparable (non-NaN) validation MSE.
    """
    # NOTE(review): the literal 12 is forwarded untouched to get_predictions;
    # presumably a forecast horizon -- confirm against that helper.
    # The validation target does not depend on p, so look it up once.
    actual = validation_data[validation_data.ZIP == zip_].PK_Count.values

    # Start from +inf so that *any* finite MSE is accepted.  The former 1e6
    # sentinel left ``best_p`` unbound whenever every candidate scored >= 1e6.
    best_mse = float("inf")
    best_p = None
    mse = []
    r2 = []
    for p in tqdm(range(11, 19)):
        predictions = get_predictions(train_data, zip_, 12, p=p)[0]
        # Metric signature is (y_true, y_pred).  MSE is symmetric, but R2 is
        # not, so the ground truth must come first (as in the test scoring
        # below).
        mse_value = MSE(actual, predictions)
        r2_value = R2(actual, predictions)
        mse.append(mse_value)
        r2.append(r2_value)
        if mse_value < best_mse:
            best_mse = mse_value
            best_p = p

    if best_p is None:
        raise ValueError(
            "no candidate p produced a valid validation MSE for ZIP %r" % (zip_,))

    # Refit on train + validation with the selected p and score on the test fold.
    test_prediction = get_predictions(
        pd.concat([train_data, validation_data]), zip_, 12, p=best_p)
    test_actual = test_data[test_data.ZIP == zip_].PK_Count.values
    test_mse = MSE(test_actual, test_prediction[0])
    test_r2 = R2(test_actual, test_prediction[0])
    return zip_, best_p, test_prediction, mse, r2, test_mse, test_r2
def metrics(measured, estimated, name='test'):
    """Summarise the agreement between measured and estimated values.

    Parameters
    ----------
    measured, estimated
        numpy 1D arrays of NORMALISED records (``estimated`` must support
        ``.flatten()``).
    name : str
        Name of the test that was run; used as the index label of the result.

    Returns
    -------
    pandas.DataFrame
        A single-row frame indexed by ``name`` with the columns
        ``'mae [-]'``, ``'mse [-]'``, ``'root_mse [-]'``, ``'relative_mse [-]'``,
        ``'r2 [-]'``, ``'re_avg '``, ``'re_std'`` and ``'TL'``.  The last three
        are the mean, the standard deviation and three times the standard
        deviation of the percentage relative error.
    """
    # Metric helpers take (y_true, y_pred).  MAE and MSE are symmetric, so
    # their values are unchanged by the reordering; R2 is NOT symmetric and
    # must receive the measured values as ground truth.
    mae = MAE(measured, estimated)
    mse = MSE(measured, estimated)
    root_mse = np.sqrt(mse)
    # NOTE(review): the weights are built from ``estimated``; a conventional
    # relative error would normalise by ``measured`` -- confirm the intent.
    relative_mse = MSE(measured, estimated,
                       sample_weight=1 / (estimated.flatten()**2))
    r2 = R2(measured, estimated)

    # Percentage relative error; both series are shifted by +1 before the
    # division (kept exactly as in the original expression).
    re = (((estimated + 1) - (measured + 1)) / (measured + 1)) * 100

    vals = [
        mae, mse, root_mse, relative_mse, r2,
        re.mean(), re.std(), 3 * re.std()
    ]
    # The trailing blank in 're_avg ' is kept: callers may index by this label.
    cols = [
        'mae [-]', 'mse [-]', 'root_mse [-]', 'relative_mse [-]', 'r2 [-]',
        're_avg ', 're_std', 'TL'
    ]
    df = pd.DataFrame([vals], columns=cols, index=[name])
    return df
def RunModel(model, data, columns, Predict):
    """Fit ``model`` on a train split of ``data`` and score it on the held-out part.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    data
        Table indexable by ``columns`` and ``Predict``.
    columns
        Selector for the feature columns.
    Predict
        Selector for the target column.

    Returns
    -------
    tuple
        ``(mse, r2, mae, acc, con_met)`` computed on the test split with the
        module-level helpers ``MSE``, ``R2``, ``MAE``, ``AS`` and ``CM``.
    """
    # ``train`` and ``test`` are module-level split sizes defined outside this
    # function.
    # NOTE(review): AS / CM look like accuracy_score / confusion_matrix, which
    # only make sense for discrete targets -- confirm.
    features = data[columns]
    labels = data[Predict]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=train, test_size=test, random_state=42)

    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    return (
        MSE(y_test, y_hat),
        R2(y_test, y_hat),
        MAE(y_test, y_hat),
        AS(y_test, y_hat),
        CM(y_test, y_hat),
    )
kernel_indep = gp.kernel_ if k_mode == 'RBF': try: indep_lengthscale[i] += kernel_indep.k2.length_scale except: indep_lengthscale[i] += kernel_indep.k2.k2.length_scale if k_mode == 'RBF': indep_lengthscale[i] /= T # Compute performance if denormalize_y: # De-standardize predictions Y_pred_base = scaler_Y.inverse_transform(Y_pred_base_norm) rmse = RMSE(Y_tst, Y_pred_base) r2 = R2(Y_tst, Y_pred_base) else: rmse = RMSE(Y_tst_norm, Y_pred_base_norm) r2 = R2(Y_tst_norm, Y_pred_base_norm) rmse_base.append(rmse) rmse_norm_base.append(rmse_norm) r2_base.append(r2) print(f'RMSE = {rmse}') print(f'R2 = {r2}') if train_bonilla: # BONILLA MODEL if exists_bonilla: pass else:
"""Ridge Regression"""
# Six regularisation strengths, logarithmically spaced from 1e-5 to 1.
lamda = np.logspace(-5, 0, 6)

# One train/test score slot per regularisation strength.
MSE_train_Ridge = np.zeros(lamda.shape)
MSE_test_Ridge = np.zeros(lamda.shape)
r2_train_Ridge = np.zeros(lamda.shape)
r2_test_Ridge = np.zeros(lamda.shape)

for i, alpha in enumerate(lamda):
    clf = skl.Ridge(alpha=alpha, fit_intercept=True).fit(X_train_scaled,
                                                         y_train)
    y_train_pred = clf.predict(X_train_scaled)
    y_test_pred = clf.predict(X_test_scaled)

    # Train scores first, then test scores, for both metrics.
    MSE_train_Ridge[i] = MSE(y_train, y_train_pred)
    MSE_test_Ridge[i] = MSE(y_test, y_test_pred)
    r2_train_Ridge[i] = R2(y_train, y_train_pred)
    r2_test_Ridge[i] = R2(y_test, y_test_pred)

#%%
"""OLS Regression"""
# OLS has no regularisation strength; its score arrays share the shape of
# ``lamda`` and hold the same value in every slot.
MSE_train_OLS = np.zeros(lamda.shape)
MSE_test_OLS = np.zeros(lamda.shape)
r2_train_OLS = np.zeros(lamda.shape)
r2_test_OLS = np.zeros(lamda.shape)

clf = skl.LinearRegression(fit_intercept=True).fit(X_train_scaled, y_train)
y_train_pred = clf.predict(X_train_scaled)
y_test_pred = clf.predict(X_test_scaled)

MSE_train_OLS[:] = MSE(y_train, y_train_pred)
MSE_test_OLS[:] = MSE(y_test, y_test_pred)
def metrics1(y_true, y_pred):
    """Return ``(mse, r2)`` for ``y_pred`` scored against ``y_true``.

    Both values come from the module-level helpers ``MSE_score`` and ``R2``.
    The hand-written equivalents noted alongside the original code are::

        mse = np.mean((y_true - y_pred)**2)
        r2  = 1 - np.sum((y_true - y_pred)**2)
                  / np.sum((y_true - np.mean(y_true))**2)
    """
    return MSE_score(y_true, y_pred), R2(y_true, y_pred)
random_state=12) z_pred_train = np.zeros((len(z_train), n_bootstrap)) z_pred_test = np.zeros((len(z_test), n_bootstrap)) for k in range(n_bootstrap): X_, z_ = resample(X_train, z_train) clf = skl.Ridge(alpha=lamda[j], fit_intercept=False).fit(X_, z_) z_pred_test[:, k] = clf.predict(X_test).flatten() z_pred_train[:, k] = clf.predict(X_train).flatten() mse_ridge_boot[i, j, 0] = MSE(z_train, np.mean(z_pred_train, axis=1, keepdims=True)) mse_ridge_boot[i, j, 1] = MSE(z_test, np.mean(z_pred_test, axis=1, keepdims=True)) r2_ridge_boot[i, j, 0] = R2(z_train, np.mean(z_pred_train, axis=1, keepdims=True)) r2_ridge_boot[i, j, 1] = R2(z_test, np.mean(z_pred_test, axis=1, keepdims=True)) #%% """Declare arrays to store MSE and r2-score for CV""" mse_ridge_CV = np.zeros((polynomial.shape[0], lamda.shape[0], 2)) r2_ridge_CV = np.zeros((polynomial.shape[0], lamda.shape[0], 2)) #%% """Split data into training and testing data and perform regression with CV""" k = 10 #define number of folds kfold = KFold(n_splits=k) #introduce k-fold cross-validation
# Print results of the model print('GrLivArea Coefficient: %.3f' % lr.coef_[0]) print('Intercept: %.3f' % lr.intercept_) ''' Our coefficient for "GrLivArea" is 100.273. This means, for every unit increase living area, the sale price increases by $100.273. ''' # Generate and print RMSE and R-Squared values print('Simple Linear Regression RMSE train: %.3f, test: %.3f' % ( MSE(y_train, y_train_pred)**(1/2), MSE(y_test, y_test_pred)**(1/2))) print('Simple Linear Regression R^2 train: %.3f, test: %.3f' % ( R2(y_train, y_train_pred), R2(y_test, y_test_pred))) ''' The performance of our model was as follows: - Simple Linear Regression RMSE train: 55953.432, test: 56605.876 - Simple Linear Regression R^2 train: 0.474, test: 0.551 The RMSE will be used for comparison with other models. However, we can examine the R-Squared value and see that our predictor explained 47.4% and 55.1% of our response in the train and test set respectively. Though this is perfect, considering that this is only ONE predictor variable, this means it does a good job in predicting. The next lines of code uses the "fit_plot" function we created earlier. This helps us visualise the line of best fit of our model.
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import r2_score as R2
from sklearn.model_selection import train_test_split
import sys
import pickle
import pandas as pd

# Load the dataset.
dataset_df = pd.read_csv("dataset.csv")

# Extract the explanatory variables and the regression target
# (here the target is "Water Solubility").
X = dataset_df[["MaxEStateIndex", "MinEStateIndex"]]
y = dataset_df["Water Solubility"]

# Split the data into a training part and an evaluation part (80 % / 20 %).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Linear regression.
model = LR()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate the model with the coefficient of determination (R2).
print("Train score: ", R2(y_train, y_train_pred))
print("Test score: ", R2(y_test, y_test_pred))

# Save the model.  The context manager guarantees the file is flushed and
# closed even if pickling fails; the previous bare ``open(...)`` handle was
# never closed explicitly.
with open("src/mymodel.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
def plot_calculate_errors_weekday(model_name, model, train, test, plot=False, target='Débit horaire'):
    """Fit and score ``model`` once per weekday, print a report, store the means.

    For each of the seven weekdays the model is fitted on
    ``train_weekdays[i]`` and evaluated on ``test_weekdays[i]`` (both are
    module-level collections).  RMSE and R^2 are collected per weekday,
    printed, and their means are written to the module-level dictionaries
    ``rrmse_mean`` and ``r2_mean`` under ``model_name``.

    Parameters
    ----------
    model_name : str
        Label used in the report and as key in the result dictionaries.
    model
        Estimator exposing ``fit`` and ``predict``; it is refitted in place
        for every weekday.
    train, test, target
        Accepted but not read by the body.
        NOTE(review): the data comes from the globals ``train_weekdays`` /
        ``test_weekdays`` and the target column name is hard-coded -- confirm
        whether these parameters were meant to be used.
    plot : bool
        When true, draw prediction vs. test data in a 4x2 grid of subplots.

    Returns
    -------
    None
    """
    # Columns removed from the frame to obtain the feature matrix.
    non_feature_columns = [
        'Débit horaire', 'Taux d\'occupation', 'Date et heure de comptage',
        'index'
    ]

    if plot:
        plt.figure(figsize=(25, 25))

    rmse_per_day = []
    r2_per_day = []
    for day in range(7):
        day_train = train_weekdays[day]
        day_test = test_weekdays[day]

        # Targets are selected with a list, so they stay two-dimensional.
        y_fit = day_train[['Débit horaire']].values
        X_fit = day_train.drop(columns=non_feature_columns).values
        y_eval = day_test[['Débit horaire']].values
        X_eval = day_test.drop(columns=non_feature_columns).values
        eval_dates = day_test['Date et heure de comptage']

        search = model.fit(X_fit, y_fit)
        y_pred = model.predict(X_eval)

        rmse = MSE(y_eval, y_pred)**(1 / 2)
        score = R2(y_eval, y_pred)

        if model_name == 'xgbrabo weekday':
            # Reads ``best_params_`` from the second step of the fitted object.
            print(" [.] Params:", search.steps[1][1].best_params_)

        if plot:
            plt.subplot(4, 2, day + 1)
            # plot test and prediction values
            plt.plot(eval_dates, y_pred, label='prediction')
            plt.plot(eval_dates, y_eval, label='test', alpha=0.5)
            plt.ylabel('Débit horaire')
            plt.title('Débit horaire (prévision x real data) for ' +
                      calendar.day_name[day])
            plt.xticks(rotation=45)
            plt.legend()

        # TODO: calculate the mean of every weekday (needed for a relative RMSE)
        rmse_per_day.append(rmse)
        r2_per_day.append(score)

    plt.show()

    rmse_avg = float(np.mean(rmse_per_day))
    r2_avg = float(np.mean(r2_per_day))

    print(" [+] Model Name:", model_name)
    print(' [.] RMSE: {}'.format(str(rmse_per_day)))
    print(' [.] RMSE mean: {}'.format(rmse_avg))
    print(' [.] R^2: {}'.format(str(r2_per_day)))
    print(' [.] R^2 Mean: {:.2%}'.format(r2_avg))
    print("-----------------------------------------------------")

    # NOTE(review): ``rrmse_mean`` (double r) receives the plain RMSE mean --
    # confirm this is the intended dictionary.
    rrmse_mean[model_name] = rmse_avg
    r2_mean[model_name] = r2_avg
def plot_calculate_errors(model_name, model, n, train_data, test_data, test_dates, plot=False):
    """Fit and score ``model`` on each train/test split, report and return the fits.

    For every ``(X_train, y_train)`` pair in ``train_data`` the model is
    fitted and evaluated on ``test_data[i]`` (``[0]`` features, ``[1]``
    targets).  RMSE, relative RMSE (RMSE divided by the mean of
    ``df_analysis[target]``) and R^2 are collected per split, printed, and
    their means stored in the module-level dictionaries ``rmse_mean``,
    ``relative_rmse_mean`` and ``r2_mean`` under ``model_name``.

    Parameters
    ----------
    model_name : str
        Label used in the report and as key in the result dictionaries.
    model
        Estimator exposing ``fit`` and ``predict``; refitted in place per split.
    n : int
        Number of splits; used to size the subplot grid.
    train_data
        Iterable of ``(X_train, y_train)`` pairs.
    test_data
        Sequence of ``(X_test, y_test)`` pairs aligned with ``train_data``.
    test_dates
        Sequence of x-axis values for plotting, aligned with ``test_data``.
    plot : bool
        When true, draw prediction vs. test data, two subplots per row.

    Returns
    -------
    list
        One fitted model per split, in split order.
    """
    # Local import keeps the file runnable without touching its import block.
    import copy

    if plot:
        plt.figure(figsize=(25, 15))
    # Round the row count up so that an odd ``n`` (or n == 1) still has a
    # slot for the last subplot; identical to ``n // 2`` for even ``n``.
    n_rows = (n + 1) // 2

    relative_errors = []
    errors = []
    r2_list = []
    models = []
    # ``df_analysis`` and ``target`` are module-level names; the mean does not
    # change between splits, but it is only needed when a split exists.
    for i, (X_train, y_train) in tqdm(enumerate(train_data)):
        search = model.fit(X_train, y_train)
        # ``fit`` mutates ``model`` in place, so appending ``model`` itself
        # would leave every list entry pointing at the last fit.  Store an
        # independent snapshot of the current fit instead.
        models.append(copy.deepcopy(model))
        if model_name == 'xgboost':
            # Reads ``best_params_`` from the second step of the fitted object.
            print(" [.] Params:", search.steps[1][1].best_params_)

        y_pred = model.predict(test_data[i][0])
        error = MSE(test_data[i][1], y_pred)**(1 / 2)
        r2 = R2(test_data[i][1], y_pred)

        if plot:
            plt.subplot(n_rows, 2, i + 1)
            # plot test and prediction values
            plt.plot(test_dates[i], y_pred, label='prediction')
            plt.plot(test_dates[i], test_data[i][1], label='test', alpha=0.5)
            plt.ylabel('Débit horaire')
            plt.title('Débit horaire (prévision x real data)')
            plt.xticks(rotation=45)
            plt.legend()

        relative_error = error / df_analysis[target].mean()
        errors.append(error)
        relative_errors.append(relative_error)
        r2_list.append(r2)

    plt.show()

    print(" [+] Model Name:", model_name)
    print(' [.] RMSE: {}'.format(str(errors)))
    print(' [.] RMSE mean: {}'.format(float(np.mean(errors))))
    print(' [.] Relative RMSE: {}'.format(str(relative_errors)))
    print(' [.] Relative RMSE Mean: {:.2%}'.format(
        float(np.mean(relative_errors))))
    print(' [.] R^2: {}'.format(str(r2_list)))
    print(' [.] R^2 Mean: {:.2%}'.format(float(np.mean(r2_list))))
    print("-----------------------------------------------------")

    rmse_mean[model_name] = float(np.mean(errors))
    relative_rmse_mean[model_name] = float(np.mean(relative_errors))
    r2_mean[model_name] = float(np.mean(r2_list))
    return models
penalty, activation="tanh") NN.set_learning_params(a1, a2) NN.fit(X_train, Z_train, n_minibatches, n_epochs, std_W=std_W, const_b=const_b, track_cost=[X_test, Z_test]) Z_pred = NN.predict(X_test) print(f"Neural Network with penalty lambda = {penalty}") print(" MSE score =", MSE(Z_test, Z_pred)) print(" R2 score =", R2(Z_test, Z_pred)) plt.plot(np.arange(1, n_epochs + 1), NN.cost, label=f"$\lambda={penalty:.2f}$") plt.xlabel("Number of epochs", fontsize=12) plt.ylabel("Cost function", fontsize=12) #plt.xscale("log") #plt.yscale("log") plt.title("Evolution of cost function", fontsize=15) plt.legend() #plt.savefig("Figures/NN_sgd_cost_function.png", dpi=300) plt.show() # test with Ridge
# Fit a random forest on the bias-correction output of the training targets
# and score its predictions on the test portion.
# NOTE(review): the exact meaning of the three values returned by biasCorr is
# not visible here -- confirm against its definition.
trainExo, testExo, errorBias = biasCorr(trainTar, testTar)
feats = RandomForestRegressor(n_estimators=1000, max_features=0.7, min_samples_split=100, oob_score=True)
# The training target is flattened to 1-D before fitting.
feats.fit(trainIn, np.array(trainExo).flatten())

# Commented out section provides feature importance values
## Print out the feature and importances
#importances = (feats.feature_importances_*100).round(2)
#
## List of tuples with variable and importance
#feature_importances = [(feature, importance)
#    for feature, importance in zip(trainIn.columns, importances)]
#
## Sort the feature importances by most important first
#feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
#print('Oob Score:',feats.oob_score_.round(2), '\n')
#
#[print('Variable: {:20} Importance: {}%'.format(*pair)) for pair in feature_importances];

# Predict on the test inputs and compare with the flattened test values.
testErrPred = feats.predict(testIn)
real = np.array(testExo).flatten()
# What is left of ``real`` after subtracting the forest's prediction.
remainder = real-testErrPred
# NOTE(review): sqrt(x**2).mean() is the mean absolute value, not a
# root-mean-square, despite the variable names -- confirm which is intended.
rmse = (np.sqrt(real**2)).mean()
newrmse = (np.sqrt(remainder**2)).mean()
# R2 of the prediction, expressed in percent and rounded to one decimal.
r = np.round(R2(real, testErrPred)*100, 1)