コード例 #1
0
def one_fold_cv(train_data, validation_data, test_data, zip_):
    """Pick ``p`` on a validation fold for one ZIP code, then score on test.

    For every ``p`` in ``range(11, 19)`` the first element returned by
    ``get_predictions(train_data, zip_, 12, p=p)`` is scored against the
    ``PK_Count`` values of the ``validation_data`` rows whose ``ZIP`` equals
    ``zip_``.  The ``p`` with the lowest validation MSE is then used for a
    final ``get_predictions`` call on train + validation, whose first
    element is scored against the matching ``test_data`` rows.

    Parameters
    ----------
    train_data, validation_data, test_data
        Objects accepted by ``pd.concat`` / boolean-mask indexing;
        ``validation_data`` and ``test_data`` must expose ``ZIP`` and
        ``PK_Count`` columns.
    zip_
        ZIP value used to filter the validation and test rows; it is also
        forwarded to ``get_predictions``.

    Returns
    -------
    tuple
        ``(zip_, best_p, test_prediction, mse, r2, test_mse, test_r2)``
        where ``test_prediction`` is the full return value of the final
        ``get_predictions`` call and ``mse`` / ``r2`` are the per-``p``
        validation scores in iteration order.

    Raises
    ------
    ValueError
        If no candidate ``p`` yields a comparable (non-NaN) validation MSE.
    """
    # The validation target does not depend on p, so select it only once.
    actual = validation_data[validation_data.ZIP == zip_].PK_Count.values

    # Start from +inf rather than an arbitrary "large" number: with a finite
    # sentinel, a series whose MSE exceeds it would never set best_p.
    best_mse = float("inf")
    best_p = None
    mse = []
    r2 = []
    for p in tqdm(range(11, 19)):
        # The literal 12 is forwarded as-is; its meaning is defined by
        # get_predictions (presumably the forecast horizon — TODO confirm).
        predictions = get_predictions(train_data, zip_, 12, p=p)[0]

        # Ground truth first, prediction second — the same order used for
        # the test scores below.  R2 is not symmetric in its arguments.
        mse_value = MSE(actual, predictions)
        r2_value = R2(actual, predictions)
        mse.append(mse_value)
        r2.append(r2_value)

        if mse_value < best_mse:
            best_mse = mse_value
            best_p = p

    if best_p is None:
        raise ValueError(
            "no candidate p produced a valid validation MSE for ZIP "
            "{!r}".format(zip_))

    test_prediction = get_predictions(pd.concat([train_data, validation_data]),
                                      zip_,
                                      12,
                                      p=best_p)
    test_actual = test_data[test_data.ZIP == zip_].PK_Count.values
    test_mse = MSE(test_actual, test_prediction[0])
    test_r2 = R2(test_actual, test_prediction[0])

    return zip_, best_p, test_prediction, mse, r2, test_mse, test_r2
コード例 #2
0
def metrics(measured, estimated, name='test'):
    '''
    Build a one-row summary table comparing estimated against measured values.

    measured, estimated: numpy 1D array of NORMALISED records
    name: string naming the test ran (used as the row index)

    Returns a pandas DataFrame with a single row holding, in this order:
    MAE, MSE, root MSE, MSE weighted by 1 / estimated**2, R2, the mean and
    standard deviation of the percentage relative error, and three times
    that standard deviation (column 'TL').
    '''

    # Ground truth first, estimate second.  MAE and MSE are symmetric in
    # their two arguments, but R2 is not, so the order matters there.
    mae = MAE(measured, estimated)
    mse = MSE(measured, estimated)
    root_mse = np.sqrt(mse)
    relative_mse = MSE(measured,
                       estimated,
                       sample_weight=1 / (estimated.flatten()**2))
    r2 = R2(measured, estimated)

    # Percentage relative error; both series are shifted by +1 before the
    # division (presumably to keep the denominator away from zero for
    # normalised data — TODO confirm).
    rel_err = (((estimated + 1) - (measured + 1)) / (measured + 1)) * 100

    vals = [
        mae, mse, root_mse, relative_mse, r2,
        rel_err.mean(),
        rel_err.std(), 3 * rel_err.std()
    ]
    # Column labels are kept exactly as before (including the trailing
    # space in 're_avg ') because callers may select columns by name.
    cols = [
        'mae [-]', 'mse [-]', 'root_mse [-]', 'relative_mse [-]', 'r2 [-]',
        're_avg ', 're_std', 'TL'
    ]

    df = pd.DataFrame([vals], columns=cols, index=[name])

    return df
コード例 #3
0
def RunModel(model, data, columns, Predict):
    """Fit ``model`` on a split of ``data`` and score it on the held-out part.

    ``data[columns]`` is used as the feature matrix and ``data[Predict]`` as
    the target.  The split sizes come from the module-level names ``train``
    and ``test``; the split itself is seeded with ``random_state=42``.

    Returns the tuple ``(mse, r2, mae, acc, con_met)`` obtained by calling
    ``MSE``, ``R2``, ``MAE``, ``AS`` and ``CM`` — in that order — on the
    held-out target and the model's predictions.  Note that ``model`` itself
    is fitted in place.
    """
    features = data[columns]
    target = data[Predict]

    split = train_test_split(features,
                             target,
                             train_size=train,
                             test_size=test,
                             random_state=42)
    X_train, X_test, y_train, y_test = split

    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    return (
        MSE(y_test, y_hat),
        R2(y_test, y_hat),
        MAE(y_test, y_hat),
        AS(y_test, y_hat),
        CM(y_test, y_hat),
    )
コード例 #4
0
                kernel_indep = gp.kernel_
                if k_mode == 'RBF':
                    try:
                        indep_lengthscale[i] += kernel_indep.k2.length_scale
                    except:
                        indep_lengthscale[i] += kernel_indep.k2.k2.length_scale
            if k_mode == 'RBF':
                indep_lengthscale[i] /= T

            # Compute performance
            if denormalize_y:
                # De-standardize predictions
                Y_pred_base = scaler_Y.inverse_transform(Y_pred_base_norm)

                rmse = RMSE(Y_tst, Y_pred_base)
                r2 = R2(Y_tst, Y_pred_base)
            else:
                rmse = RMSE(Y_tst_norm, Y_pred_base_norm)
                r2 = R2(Y_tst_norm, Y_pred_base_norm)

            rmse_base.append(rmse)
            rmse_norm_base.append(rmse_norm)
            r2_base.append(r2)
            print(f'RMSE = {rmse}')
            print(f'R2 = {r2}')

    if train_bonilla:
        # BONILLA MODEL
        if exists_bonilla:
            pass
        else:
コード例 #5
0
"""Ridge Regression"""

# Six regularisation strengths, one per decade from 1e-5 to 1e0.
lamda = np.logspace(-5, 0, 6)

# One result slot per regularisation strength for every metric/split pair.
MSE_train_Ridge, MSE_test_Ridge, r2_train_Ridge, r2_test_Ridge = (
    np.zeros(lamda.shape) for _ in range(4))

for i, lmb in enumerate(lamda):
    # Fit on the scaled training data, then predict both splits.
    clf = skl.Ridge(alpha=lmb,
                    fit_intercept=True).fit(X_train_scaled, y_train)
    y_train_pred = clf.predict(X_train_scaled)
    y_test_pred = clf.predict(X_test_scaled)

    MSE_train_Ridge[i] = MSE(y_train, y_train_pred)
    MSE_test_Ridge[i] = MSE(y_test, y_test_pred)
    r2_train_Ridge[i] = R2(y_train, y_train_pred)
    r2_test_Ridge[i] = R2(y_test, y_test_pred)

#%%
"""OLS Regression"""

# Same length as the Ridge arrays; the single OLS score is broadcast into
# every slot below.  Only the MSE arrays are filled in this cell.
MSE_train_OLS, MSE_test_OLS, r2_train_OLS, r2_test_OLS = (
    np.zeros(lamda.shape) for _ in range(4))

clf = skl.LinearRegression(fit_intercept=True).fit(X_train_scaled, y_train)
y_train_pred = clf.predict(X_train_scaled)
y_test_pred = clf.predict(X_test_scaled)

MSE_train_OLS[:] = MSE(y_train, y_train_pred)
MSE_test_OLS[:] = MSE(y_test, y_test_pred)
def metrics1(y_true, y_pred):
    """Return ``(mse, r2)`` for ``y_pred`` measured against ``y_true``.

    Both scores are delegated to ``MSE_score`` and ``R2``.  The original
    author noted these hand-written forms alongside the calls:

        mse = np.mean((y_true - y_pred)**2)
        r2  = 1 - np.sum((y_true - y_pred)**2)
                / np.sum((y_true - np.mean(y_true))**2)
    """
    return MSE_score(y_true, y_pred), R2(y_true, y_pred)
コード例 #7
0
                                                    random_state=12)
        z_pred_train = np.zeros((len(z_train), n_bootstrap))
        z_pred_test = np.zeros((len(z_test), n_bootstrap))
        for k in range(n_bootstrap):
            X_, z_ = resample(X_train, z_train)
            clf = skl.Ridge(alpha=lamda[j], fit_intercept=False).fit(X_, z_)
            z_pred_test[:, k] = clf.predict(X_test).flatten()
            z_pred_train[:, k] = clf.predict(X_train).flatten()
        mse_ridge_boot[i, j,
                       0] = MSE(z_train,
                                np.mean(z_pred_train, axis=1, keepdims=True))
        mse_ridge_boot[i, j,
                       1] = MSE(z_test,
                                np.mean(z_pred_test, axis=1, keepdims=True))
        r2_ridge_boot[i, j,
                      0] = R2(z_train,
                              np.mean(z_pred_train, axis=1, keepdims=True))
        r2_ridge_boot[i, j,
                      1] = R2(z_test,
                              np.mean(z_pred_test, axis=1, keepdims=True))

#%%
"""Declare arrays to store MSE and r2-score for CV"""

# One cell per (polynomial entry, lambda value) pair; the last axis has
# length 2 (presumably train/test — TODO confirm against the code that
# fills these arrays).
mse_ridge_CV = np.zeros((polynomial.shape[0], lamda.shape[0], 2))
r2_ridge_CV = np.zeros_like(mse_ridge_CV)

#%%
"""Split data into training and testing data and perform regression with CV"""

# Number of folds, and the k-fold splitter built from it.
k = 10
kfold = KFold(n_splits=k)
コード例 #8
0
# Report the fitted slope (first coefficient) and the intercept of `lr`
print('GrLivArea Coefficient: %.3f' % (lr.coef_[0],))
print('Intercept: %.3f' % (lr.intercept_,))

'''
Our coefficient for "GrLivArea" is 100.273. This means that, for every unit
increase in living area, the sale price increases by $100.273.
'''

# Report RMSE (square root of the MSE) and R-Squared for both splits
print('Simple Linear Regression RMSE train: %.3f, test: %.3f'
      % (MSE(y_train, y_train_pred) ** 0.5,
         MSE(y_test, y_test_pred) ** 0.5))
print('Simple Linear Regression R^2 train: %.3f, test: %.3f'
      % (R2(y_train, y_train_pred),
         R2(y_test, y_test_pred)))

'''
The performance of our model was as follows:
    - Simple Linear Regression RMSE train: 55953.432, test: 56605.876
    - Simple Linear Regression R^2 train: 0.474, test: 0.551

The RMSE will be used for comparison with other models. However, we can examine
the R-Squared value and see that our predictor explained 47.4% and 55.1% of the
variance in our response in the train and test sets respectively. Though this
is not perfect, considering that this is only ONE predictor variable, it does a
good job of predicting.

The next lines of code use the "fit_plot" function we created earlier. This
helps us visualise the line of best fit of our model.
コード例 #9
0
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import r2_score as R2
from sklearn.model_selection import train_test_split
import sys
import pickle
import pandas as pd

# Load the dataset
dataset_df = pd.read_csv("dataset.csv")

# Extract the explanatory variables and the regression target from the
# dataset (here the target is Water Solubility)
X = dataset_df[["MaxEStateIndex", "MinEStateIndex"]]
y = dataset_df["Water Solubility"]

# Split the data into a training part and an evaluation part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Linear regression
model = LR()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate the model with the coefficient of determination (R2)
print("Train score: ", R2(y_train, y_train_pred))
print("Test score: ", R2(y_test, y_test_pred))

# Save the model.  The context manager guarantees the file is flushed and
# closed even if pickling fails; the previous bare open() left that to the
# garbage collector.
with open("src/mymodel.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

コード例 #10
0
def plot_calculate_errors_weekday(model_name,
                                  model,
                                  train,
                                  test,
                                  plot=False,
                                  target='Débit horaire'):
    """Fit and score ``model`` once per weekday, optionally plotting each fit.

    For each ``i`` in ``0..6`` the model is fitted on ``train_weekdays[i]``
    and evaluated on ``test_weekdays[i]``; RMSE and R^2 are collected per
    weekday, printed, and their means stored in the module-level dicts
    ``rrmse_mean`` and ``r2_mean`` under ``model_name``.

    Parameters
    ----------
    model_name : str
        Label used in the printed report and as the key in the result
        dicts.  The value ``'xgbrabo weekday'`` additionally prints
        ``search.steps[1][1].best_params_`` after every fit.
    model
        Estimator exposing ``fit`` and ``predict``.  It is refitted in
        place for every weekday.
    train, test
        Accepted for interface compatibility but not read: the data come
        from the module-level ``train_weekdays`` / ``test_weekdays``
        containers.  NOTE(review): confirm whether these parameters were
        meant to be split per weekday here.
    plot : bool
        When true, draw prediction vs. test data in a 4x2 subplot grid.
    target : str
        Name of the column to predict.  It is used for the y values, is
        excluded from the features, and labels the plots.

    Returns
    -------
    None
    """
    # Columns that are never fed to the model as features.
    non_feature_columns = [
        target, 'Taux d\'occupation', 'Date et heure de comptage', 'index'
    ]

    if plot:
        plt.figure(figsize=(25, 25))

    errors = []
    r2_list = []
    for i in range(7):
        weekday_train = train_weekdays[i]
        weekday_test = test_weekdays[i]

        # Double brackets keep the target two-dimensional, as before.
        weekday_y_train = weekday_train[[target]].values
        weekday_X_train = weekday_train.drop(
            columns=non_feature_columns).values

        weekday_y_test = weekday_test[[target]].values
        weekday_dates_test = weekday_test['Date et heure de comptage']
        weekday_X_test = weekday_test.drop(columns=non_feature_columns).values

        search = model.fit(weekday_X_train, weekday_y_train)

        y_pred = model.predict(weekday_X_test)
        error = MSE(weekday_y_test, y_pred)**(1 / 2)
        r2 = R2(weekday_y_test, y_pred)

        if model_name == 'xgbrabo weekday':
            # Assumes fit() returned a pipeline whose second step exposes
            # best_params_ (e.g. a parameter search) — TODO confirm.
            print(" [.] Params:", search.steps[1][1].best_params_)

        if plot:
            plt.subplot(4, 2, i + 1)

            # plot test and prediction values
            plt.plot(weekday_dates_test, y_pred, label='prediction')
            plt.plot(weekday_dates_test,
                     weekday_y_test,
                     label='test',
                     alpha=0.5)

            plt.ylabel(target)
            plt.title(target + ' (prévision x real data) for ' +
                      calendar.day_name[i])
            plt.xticks(rotation=45)
            plt.legend()

        # TODO: calculate the mean of every weekday so that a relative
        # RMSE (error / mean of the target) can be reported as well.
        errors.append(error)
        r2_list.append(r2)

    plt.show()

    print(" [+] Model Name:", model_name)
    print(' [.] RMSE: {}'.format(str(errors)))
    print(' [.] RMSE mean: {}'.format(float(np.mean(errors))))
    print(' [.] R^2: {}'.format(str(r2_list)))
    print(' [.] R^2 Mean: {:.2%}'.format(float(np.mean(r2_list))))
    print("-----------------------------------------------------")

    # NOTE(review): the dict is spelled `rrmse_mean` here; confirm this is
    # the intended module-level name and not a typo for `rmse_mean`.
    rrmse_mean[model_name] = float(np.mean(errors))
    r2_mean[model_name] = float(np.mean(r2_list))
コード例 #11
0
def plot_calculate_errors(model_name,
                          model,
                          n,
                          train_data,
                          test_data,
                          test_dates,
                          plot=False):
    """Fit and score ``model`` on each train/test pair, optionally plotting.

    For every ``(X_train, y_train)`` in ``train_data`` the model is fitted
    and evaluated on ``test_data[i]`` (``[0]`` = features, ``[1]`` =
    target).  RMSE, RMSE relative to ``df_analysis[target].mean()`` and R^2
    are collected, printed, and their means stored in the module-level
    dicts ``rmse_mean``, ``relative_rmse_mean`` and ``r2_mean`` under
    ``model_name``.  ``df_analysis`` and ``target`` are module-level names.

    Parameters
    ----------
    model_name : str
        Label for the report and key in the result dicts.  The value
        ``'xgboost'`` additionally prints
        ``search.steps[1][1].best_params_`` after every fit.
    model
        Estimator exposing ``fit`` and ``predict``.  It is refitted in
        place for each dataset, so after the call it holds the last fit.
    n : int
        Number of datasets; only used to size the subplot grid.
    train_data, test_data
        Parallel sequences of ``(X, y)`` pairs.
    test_dates
        Sequence of x-axis values, one entry per dataset, used for plots.
    plot : bool
        When true, draw prediction vs. test data for every dataset.

    Returns
    -------
    list
        One fitted model per dataset, in iteration order.
    """
    # Local import: only needed for the per-dataset snapshots below.
    import copy

    if plot:
        plt.figure(figsize=(25, 15))

    # Round the row count up so an odd `n` still has a cell for the last
    # subplot (n // 2 rows x 2 columns is one cell short when n is odd).
    n_rows = (n + 1) // 2

    relative_errors = []
    errors = []
    r2_list = []
    models = []
    for i, (X_train, y_train) in tqdm(enumerate(train_data)):
        search = model.fit(X_train, y_train)
        # `model` is refitted on every iteration, so appending the object
        # itself would leave the list holding n references to the final
        # fit.  Store an independent snapshot of the current fit instead.
        models.append(copy.deepcopy(model))

        if model_name == 'xgboost':
            # Assumes fit() returned a pipeline whose second step exposes
            # best_params_ (e.g. a parameter search) — TODO confirm.
            print(" [.] Params:", search.steps[1][1].best_params_)

        y_pred = model.predict(test_data[i][0])
        error = MSE(test_data[i][1], y_pred)**(1 / 2)
        r2 = R2(test_data[i][1], y_pred)

        if plot:
            plt.subplot(n_rows, 2, i + 1)

            # plot test and prediction values
            plt.plot(test_dates[i], y_pred, label='prediction')
            plt.plot(test_dates[i], test_data[i][1], label='test', alpha=0.5)

            plt.ylabel('Débit horaire')
            plt.title('Débit horaire (prévision x real data)')
            plt.xticks(rotation=45)
            plt.legend()

        relative_error = error / df_analysis[target].mean()
        errors.append(error)
        relative_errors.append(relative_error)
        r2_list.append(r2)

    plt.show()

    print(" [+] Model Name:", model_name)
    print(' [.] RMSE: {}'.format(str(errors)))
    print(' [.] RMSE mean: {}'.format(float(np.mean(errors))))
    print(' [.] Relative RMSE: {}'.format(str(relative_errors)))
    print(' [.] Relative RMSE Mean: {:.2%}'.format(
        float(np.mean(relative_errors))))
    print(' [.] R^2: {}'.format(str(r2_list)))
    print(' [.] R^2 Mean: {:.2%}'.format(float(np.mean(r2_list))))
    print("-----------------------------------------------------")

    rmse_mean[model_name] = float(np.mean(errors))
    relative_rmse_mean[model_name] = float(np.mean(relative_errors))
    r2_mean[model_name] = float(np.mean(r2_list))

    return models
コード例 #12
0
                       penalty,
                       activation="tanh")
    NN.set_learning_params(a1, a2)
    NN.fit(X_train,
           Z_train,
           n_minibatches,
           n_epochs,
           std_W=std_W,
           const_b=const_b,
           track_cost=[X_test, Z_test])

    Z_pred = NN.predict(X_test)

    print(f"Neural Network with penalty lambda = {penalty}")
    print("  MSE score =", MSE(Z_test, Z_pred))
    print("  R2 score  =", R2(Z_test, Z_pred))

    plt.plot(np.arange(1, n_epochs + 1),
             NN.cost,
             label=f"$\lambda={penalty:.2f}$")

# Finish the cost-function figure: title and axis labels, then the legend.
plt.title("Evolution of cost function", fontsize=15)
plt.xlabel("Number of epochs", fontsize=12)
plt.ylabel("Cost function", fontsize=12)
# Optional logarithmic axes, left disabled:
#plt.xscale("log")
#plt.yscale("log")
plt.legend()
# Optional export of the figure, left disabled:
#plt.savefig("Figures/NN_sgd_cost_function.png", dpi=300)
plt.show()

# test with Ridge
コード例 #13
0
trainExo, testExo, errorBias = biasCorr(trainTar, testTar)

# Random forest trained to predict the flattened `trainExo` values from
# `trainIn`; oob_score=True makes the out-of-bag score available after fit.
feats = RandomForestRegressor(n_estimators=1000,
                             max_features=0.7,
                             min_samples_split=100,
                             oob_score=True)

feats.fit(trainIn, np.array(trainExo).flatten())

# Commented out section provides feature importance values
## Print out the feature and importances
#importances = (feats.feature_importances_*100).round(2)
#
## List of tuples with variable and importance
#feature_importances = [(feature, importance)
#                       for feature, importance in zip(trainIn.columns, importances)]
#
## Sort the feature importances by most important first
#feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
#print('Oob Score:',feats.oob_score_.round(2), '\n')
#
#[print('Variable: {:20} Importance: {}%'.format(*pair)) for pair in feature_importances];

testErrPred = feats.predict(testIn)
real = np.array(testExo).flatten()

# What is left of `real` once the forest's prediction is subtracted.
remainder = real-testErrPred

# Root-mean-square values: square, average, THEN take the root.  The former
# np.sqrt(x**2).mean() took the root element-wise first, which is the mean
# absolute value rather than an RMSE.
rmse = np.sqrt((real**2).mean())
newrmse = np.sqrt((remainder**2).mean())

# R^2 of the prediction against `real`, as a percentage with one decimal.
r = np.round(R2(real, testErrPred)*100, 1)