Example #1
0
def mlp_synthetic(X_train,
                  X_test,
                  y_train,
                  y_test,
                  L2reg=0.01,
                  hidden_width=50,
                  mini_batchsize=5):
    """Train a three-hidden-layer MLP regressor with mini-batch SGD.

    The network (three hidden layers of ``hidden_width`` units and a single
    output unit) is built by ``model``; its cost comes from ``MSE_reg`` and
    the parameter updates from ``sgd``.  The number of passes over the
    training data is read from the module-level name ``epochs``.

    Args:
        X_train: Training inputs; ``X_train.shape[1]`` sets the input width.
        X_test: Test inputs.
        y_train: Training targets, one value per row of ``X_train``.
        y_test: Test targets.
        L2reg: Regularisation strength forwarded to ``MSE_reg``.
        hidden_width: Number of units in each hidden layer.
        mini_batchsize: Rows per SGD step.  Only full batches are used; a
            trailing partial batch is skipped.

    Returns:
        Tuple ``(fin_cost_train, fin_cost_test, train_transformed,
        test_transformed, test_predictions)``: the final ``MSE`` on the
        train and test sets, the ``model_act`` output for the train and test
        inputs, and the network predictions for ``X_test``.
    """
    X = T.fmatrix(name='X')
    Y = T.fmatrix(name='Y')

    input_size = X_train.shape[1]
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(input_size)

    w_h1 = uniform_weights((input_size, hidden_width))
    w_h2 = uniform_weights((hidden_width, hidden_width))
    w_h3 = uniform_weights((hidden_width, hidden_width))
    b_h1 = init_bias(hidden_width)
    b_h2 = init_bias(hidden_width)
    b_h3 = init_bias(hidden_width)

    w_o = uniform_weights((hidden_width, 1))
    b_o = init_bias(1)

    op = model(X, w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o)
    params = [w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o]
    cost = MSE_reg(Y, op, params, L2reg=L2reg)
    updates = sgd(cost, params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates,
                            allow_input_downcast=True,
                            name='train')
    predict = theano.function(inputs=[X],
                              outputs=op,
                              allow_input_downcast=True)

    # Per-epoch errors, useful when plotting learning curves while debugging.
    test_costs = []
    train_costs = []

    n_train = len(X_train)
    for _ in range(epochs):
        # The stop value ``n_train - mini_batchsize + 1`` keeps every full
        # batch, including the last one when n_train is an exact multiple
        # of mini_batchsize (the previous zip-of-ranges dropped it).
        for start in range(0, n_train - mini_batchsize + 1, mini_batchsize):
            end = start + mini_batchsize
            yd = floatX(y_train[start:end]).reshape(mini_batchsize, 1)
            train(X_train[start:end], yd)

        test_costs.append(MSE(predict(X_test), y_test))
        train_costs.append(MSE(predict(X_train), y_train))

    fin_cost_test = MSE(predict(X_test), y_test)
    fin_cost_train = MSE(predict(X_train), y_train)

    h3 = model_act(X, w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o)
    transform = theano.function(inputs=[X],
                                outputs=h3,
                                allow_input_downcast=True)

    test_transformed = transform(X_test)
    train_transformed = transform(X_train)
    test_predictions = predict(X_test)
    # returns the transformed test data, ie the activations from the third hidden layer
    return fin_cost_train, fin_cost_test, train_transformed, test_transformed, test_predictions
Example #2
0
    def reconstruction(ncomp, U, S, V, var=1):
        """Report reconstruction error and plot the explained variance ratio.

        Reads ``mode``, ``matrix`` and ``vip_figsize`` from the enclosing
        scope.  Prints the reconstruction error (except in ``'eigen'`` mode),
        draws the individual and cumulative explained variance ratio (EVR)
        for all components plus a zoom on the first components, and prints
        the cumulative EVR reached with ``ncomp`` components.

        Args:
            ncomp: Number of principal components of interest.
            U, S, V: Factors of the decomposition; ``S`` holds the singular
                values (or their equivalent for the chosen ``mode``).
            var: In the fallback branch only, any value other than 1
                reverses the order of the explained variance ratios.
        """
        # Per-component explained variance, see
        # https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
        exp_var = (S**2) / (S.shape[0] - 1)

        if mode == 'lapack':
            rec_matrix = np.dot(U[:, :ncomp],
                                np.dot(np.diag(S[:ncomp]), V[:ncomp]))
            rec_matrix = rec_matrix.T
            print('  Matrix reconstruction with {} PCs:'.format(ncomp))
            print('  Mean Absolute Error =', MAE(matrix, rec_matrix))
            print('  Mean Squared Error =', MSE(matrix, rec_matrix))
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / np.sum(exp_var)
        elif mode == 'eigen':
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / np.sum(exp_var)
        else:
            rec_matrix = np.dot(U, np.dot(np.diag(S), V))
            print('  Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
            full_var = np.var(matrix, axis=0).sum()
            # % of variance explained by each PC
            explained_variance_ratio = exp_var / full_var
            if var != 1:
                explained_variance_ratio = explained_variance_ratio[::-1]
            msg = '  This info makes sense when the matrix is mean centered '
            msg += '(temp-mean scaling)'
            print(msg)

        ratio_cumsum = np.cumsum(explained_variance_ratio)
        n_pcs = explained_variance_ratio.shape[0]

        lw = 2
        alpha = 0.4
        fig = plt.figure(figsize=vip_figsize)
        fig.subplots_adjust(wspace=0.4)

        # Left panel: all components.
        ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
        ax1.step(range(n_pcs),
                 explained_variance_ratio,
                 alpha=alpha,
                 where='mid',
                 label='Individual EVR',
                 lw=lw)
        ax1.plot(ratio_cumsum,
                 '.-',
                 alpha=alpha,
                 label='Cumulative EVR',
                 lw=lw)
        ax1.legend(loc='best', frameon=False, fontsize='medium')
        ax1.set_ylabel('Explained variance ratio (EVR)')
        ax1.set_xlabel('Principal components')
        ax1.grid(linestyle='solid', alpha=0.2)
        ax1.set_xlim(-10, n_pcs + 10)
        ax1.set_ylim(0, 1)

        # Right panel: zoom on the first components.  Clamp to the number
        # of available components so x and y have the same length when
        # fewer than 20 components exist.
        trunc = min(20, n_pcs)
        ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
        ax2.step(range(trunc),
                 explained_variance_ratio[:trunc],
                 alpha=alpha,
                 where='mid',
                 lw=lw)
        ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
        ax2.set_xlabel('Principal components')
        ax2.grid(linestyle='solid', alpha=0.2)
        ax2.set_xlim(-2, trunc + 2)
        ax2.set_ylim(0, 1)

        msg = '  Cumulative explained variance ratio for {} PCs = {:.5f}'
        # plt.savefig('figure.pdf', dpi=300, bbox_inches='tight')
        print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))
Example #3
0
# Naive method: forecast every point with the last observed training value.
# ``.iloc[-1]`` selects that value by position, so it does not depend on the
# index labels of ``train`` (the moving-average forecasts below already use
# the same positional access).
y_hat['Count'] = train['Count'].iloc[-1]

# Visualize Naive method predictions
plt.figure(figsize=(40, 20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label='Naive Forecast')
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()

# One row per forecasting method; rows are appended at position len(rmse).
rmse = pd.DataFrame(columns=['Method', 'RMSE'])

# Calculate RMSE for Naive method
rmse.loc[len(rmse)] = "Naive", sqrt(MSE(valid.Count, y_hat.Count))

# Moving Average Method to predict time series:
# the forecast is the mean of the last N training observations.

# last 10 observations
y_hat['Count'] = train['Count'].rolling(10).mean().iloc[-1]
# Calculate RMSE for Moving average 10 days
rmse.loc[len(rmse)] = "Moving Average 10D", sqrt(MSE(valid.Count, y_hat.Count))

# last 20 observations
y_hat['Count'] = train['Count'].rolling(20).mean().iloc[-1]
# Calculate RMSE for Moving average 20 days
rmse.loc[len(rmse)] = "Moving Average 20D", sqrt(MSE(valid.Count, y_hat.Count))

# last 50 observations
y_hat['Count'] = train['Count'].rolling(50).mean().iloc[-1]
Example #4
0
    loss_test = criterion(test_predict, testY)
    
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %f, test loss: %f" % (epoch, loss.detach().numpy(), loss_test.item()))


# Test: run the trained network on the full data set and plot it against
# the targets.
lstm.eval()
train_predict = lstm(dataX)
data_predict = train_predict.data.numpy()
dataY_plot = dataY.data.numpy()

# Map predictions and targets back through the scaler ``sc``
# (presumably to the original units -- confirm where ``sc`` is fitted).
data_predict = sc.inverse_transform(data_predict)
dataY_plot = sc.inverse_transform(dataY_plot)

print(MSE(data_predict, dataY_plot))

# Vertical marker at x = train_size
# (presumably the train/test boundary -- confirm against the split code).
plt.axvline(x=train_size, c='r', linestyle='--')

plt.plot(dataY_plot)
plt.plot(data_predict)
plt.suptitle('Time-Series Prediction')
plt.show()


# MSE on train data
# NOTE(review): despite the ``test_`` prefixes, these variables hold
# predictions and targets for ``trainX`` / ``trainY``.
test_predict = lstm(trainX)
test_predict = test_predict.data.numpy()
testY_plot = trainY.data.numpy()

test_predict = sc.inverse_transform(test_predict)
# Print RMSE_CV (computed before this excerpt), formatted to two decimals
print('CV RMSE: {:.2f}'.format(RMSE_CV))

#################################           Evaluate the training error           #################################

# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Fit dt to the training set
dt.fit(X_train, y_train)

# Predict the labels of the training set
y_pred_train = dt.predict(X_train)

# Evaluate the training set RMSE of dt (square root of the MSE)
RMSE_train = (MSE(y_train, y_pred_train))**(0.5)

# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))

#################################           Define the ensemble           #################################

# Set seed for reproducibility
SEED = 1

# Instantiate lr
lr = LogisticRegression(random_state=SEED)

# Instantiate knn with 27 neighbours
knn = KNN(n_neighbors=27)
                               max_depth=3,
                               min_child_weight=0,
                               gamma=0,
                               subsample=0.7,
                               colsample_bytree=0.7,
                               objective='reg:linear',
                               nthread=-1,
                               scale_pos_weight=1,
                               seed=27,
                               reg_alpha=0.00006)
# first pass: fit the regressor on the training split
housing_xgb.fit(X_train, y_train, verbose=True)

# first pass 0.63
pred = housing_xgb.predict(X_test)
# RMSE computed on log(1 + value) of targets and predictions
rmse = np.sqrt(MSE(np.log(y_test + 1), np.log(pred + 1)))
print("RMSE : % f" % (rmse))

# Residual sum of squares
rss = sum((y_test - pred)**2)

# Total sum of squares around the mean of the test targets
tss = sum((y_test - np.mean(y_test))**2)

# Coefficient of determination (R^2)
rsq = 1 - (rss / tss)

# Makes a table of features with their importance
df_import = pd.DataFrame({
    'cols':
    X_test.columns,
    'feat_import':
    pd.Series(housing_xgb.feature_importances_)
})
Example #7
0
def autoencoder(dataset, logfile, random_state=1910299034):
    """Train a one-hidden-layer autoencoder on a ratings matrix and score it.

    The ratings file is read from ``~/.surprise_data/<dataset>/<dataset>/``,
    pivoted into a dense matrix (column 0 as rows, column 1 as columns,
    column 2 as values, missing entries filled with 0), split 80/20 and fed
    to a sigmoid autoencoder trained with Adam on the mean squared
    reconstruction error.  Progress and results are written through ``log``.

    Args:
        dataset: Dataset name.  ``'ml-100k'`` selects the tab-separated
            ``u.data`` layout; any other value selects the ``ratings.dat``
            layout with ``::`` separators.
        logfile: Destination forwarded to ``log``.
        random_state: Seed for the train/test split and weight initialisers.

    Returns:
        list: ``[mse, runtime]`` -- the test-set ``MSE`` rounded to 3
        decimals and the duration in seconds of the final test forward
        pass rounded to 4 decimals.
    """
    # Save home path
    home = str(Path.home())

    # Hyperparameters
    hidden_layer_nodes = 32
    learn_rate = 0.001

    # Per-dataset file layout and training schedule
    if dataset == 'ml-100k':
        datafile = 'u.data'
        separator = "\t"
        input_layer_nodes = 1682
        batch_size = 20
        epochs = 200
    else:
        datafile = 'ratings.dat'
        separator = "::"
        input_layer_nodes = 3706
        batch_size = 80
        epochs = 100
    output_layer_nodes = input_layer_nodes

    ratings = pd.read_csv('{}/.surprise_data/{}/{}/{}'.format(
        home, dataset, dataset, datafile),
                          sep=separator,
                          header=None,
                          engine='python')

    # Create DataFrame without timestamps
    ratings_pivot = pd.pivot_table(ratings[[0, 1, 2]],
                                   values=2,
                                   index=0,
                                   columns=1).fillna(0)

    # 80-20 split
    X_train, X_test = sk_train_test_split(ratings_pivot,
                                          test_size=0.2,
                                          random_state=random_state)

    # Initialize weights; the extra row in each matrix multiplies the
    # constant bias column appended to that layer's input.
    hidden_layer_weights = {
        'weights':
        tf.Variable(
            tf.random_normal([input_layer_nodes + 1, hidden_layer_nodes],
                             seed=random_state))
    }
    output_layer_weights = {
        'weights':
        tf.Variable(
            tf.random_normal([hidden_layer_nodes + 1, output_layer_nodes],
                             seed=random_state))
    }

    # Set input placeholder
    input_layer = tf.placeholder('float', [None, input_layer_nodes])

    # Add bias to input
    bias = tf.fill([tf.shape(input_layer)[0], 1], 1.0)
    input_layer_concat = tf.concat([input_layer, bias], 1)

    # Forward and activate with Sigmoid
    hidden_activations = tf.nn.sigmoid(
        tf.matmul(input_layer_concat, hidden_layer_weights['weights']))

    # Add bias
    bias = tf.fill([tf.shape(hidden_activations)[0], 1], 1.0)
    hidden_activations = tf.concat([hidden_activations, bias], 1)

    # Forward for final output
    output_layer = tf.matmul(hidden_activations,
                             output_layer_weights['weights'])

    # Set output placeholder
    output_true = tf.placeholder('float', [None, output_layer_nodes])

    # Loss
    mse_loss = tf.reduce_mean(tf.square(output_layer - output_true))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(learn_rate).minimize(mse_loss)

    init = tf.global_variables_initializer()
    n_batches = X_train.shape[0] // batch_size

    # The context manager closes the session (releasing its resources) on
    # exit, including when training raises.
    with tf.Session() as sess:
        sess.run(init)

        # Running model
        for epoch in range(epochs):
            epoch_loss = 0

            for i in range(n_batches):
                batch_X = X_train[i * batch_size:(i + 1) * batch_size]
                # Autoencoder: the input is also the reconstruction target.
                _, c = sess.run([optimizer, mse_loss],
                                feed_dict={
                                    input_layer: batch_X,
                                    output_true: batch_X
                                })
                epoch_loss += c

            output_train = sess.run(output_layer,
                                    feed_dict={input_layer: X_train})
            output_test = sess.run(output_layer,
                                   feed_dict={input_layer: X_test})

            log(
                logfile,
                'MSE train ' + str(round(MSE(output_train, X_train), 2)) +
                ' MSE test ' + str(round(MSE(output_test, X_test), 2)))
            log(
                logfile, 'Epoch ' + str(epoch) + '/' + str(epochs) +
                ' loss: ' + str(round(epoch_loss, 2)))

        # Final test: time a single forward pass over the test set
        time_start = time.time()
        output_test = sess.run(output_layer, feed_dict={input_layer: X_test})
        time_stop = time.time()

    runtime = round(time_stop - time_start, 4)
    log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))
    mse = round(MSE(output_test, X_test), 3)
    log(logfile, 'MSE test: ' + str(mse) + '\n')
    return [mse, runtime]
Example #8
0
# model.compile(optimizer=SGD(), loss='mean_squared_error')

# Verify that model contains information from compiling
# (the concatenation requires ``model.loss`` to be a string)
print("Loss function: " + model.loss)

# Fit the model, holding out 20% of the training rows for validation
model.fit(X_train.values, y_train, validation_split=0.2, epochs=100)

#%%
#
# =============================================================================
model.summary()

# NOTE(review): training used ``X_train.values`` while prediction receives
# ``X_test`` directly -- confirm both are accepted by the model.
y_pred = model.predict(X_test)

print('RMSE:', np.sqrt(MSE(y_test, y_pred)))

#%%
#                               Classification
# =============================================================================

from keras.utils import to_categorical

titanic_df = pd.read_csv("data/titanic.csv")

# Convert the target to categorical (one-hot) form: y
y = to_categorical(titanic_df.survived)
# Every column except the target is used as a feature
X = titanic_df.drop('survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
Example #9
0
# Evaluate the gini tree and print it next to the entropy tree's accuracy
# (``accuracy_entropy`` is computed before this excerpt).
y_pred = dt_gini.predict(X_test)  # Use dt_gini
accuracy_gini = accuracy_score(y_test, y_pred)  # Evaluate accuracy_gini
print('Accuracy achieved by using entropy: ', accuracy_entropy)  # Print accuracy_entropy
print('Accuracy achieved by using the gini index: ', accuracy_gini)  # Print accuracy_gini
#####################################################
# Decision tree for Regression
#####################################################
# Regression tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor(max_depth=8,
                           min_samples_leaf=0.13,
                           random_state=3)
dt.fit(X_train, y_train)
from sklearn.metrics import mean_squared_error as MSE
y_pred = dt.predict(X_test)  # Compute y_pred
mse_dt = MSE(y_test, y_pred)  # Compute mse_dt
# Float exponent: ``1/2`` evaluates to 0 under Python 2 integer division,
# which would turn every RMSE into 1.0.
rmse_dt = mse_dt ** 0.5  # Compute rmse_dt
print("Test set RMSE of dt: {:.2f}".format(rmse_dt))
# Linear regression
y_pred_lr = lr.predict(X_test)  # Predict test
mse_lr = MSE(y_test, y_pred_lr)  # Compute mse_lr (true values first, as above)
rmse_lr = mse_lr ** 0.5  # Compute rmse_lr
print('Linear Regression test set RMSE: {:.2f}'.format(rmse_lr))
print('Regression Tree test set RMSE: {:.2f}'.format(rmse_dt))
################################################
# Supervised Learning
    # Fit the model f(x) that best approximates the data (f(x) can be logistic regression, decision tree, neural network)
    # discard noise as much as possible
    # low predictive error on unseen dataset
# Difficulties
    # overfitting: predictive power is low
Example #10
0
regr_training.fit(X, y)

# Validation design matrix: a column of ones followed by v_CDD and v_HDD
inter2 = np.ones((len(v_CDD), 1))
X_vals = np.column_stack((inter2, v_CDD, v_HDD))

y_vals = regr_training.predict(X_vals)

# plotting actual demand vs. simulated demand
demand = df_validation.loc[:, 'demand']

plt.figure()
# Actual demand goes on x and the prediction on y so the data matches the
# axis labels below (the arguments were previously the other way round).
plt.scatter(demand, y_vals)
plt.xlabel('Actual Electricity Demand (MWh)')
plt.ylabel('Predicted Electricity Demand (MWh)')

# calculating R^2 value

Rsq = r2_score(demand, y_vals)

# calculating mean square error

number5 = MSE(demand, y_vals)

# plot actual demand vs. residuals (prediction minus actual)

residuals = y_vals - demand
plt.figure()
plt.scatter(demand, residuals)
plt.xlabel('Actual Demand (MWh)')
plt.ylabel('Residuals (MWh)')
Example #11
0
# Grid over the kernel bandwidth ``sigma`` (0.1 to 4 in steps of 0.01)
params_IGRNN = {
    'kernel': ["RBF"],
    'sigma': list(np.arange(0.1, 4, 0.01)),
    'calibration': ['None']
}
grid_IGRNN = GridSearchCV(estimator=IGRNN,
                          param_grid=params_IGRNN,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1,
                          n_jobs=-1)

# Search on the selected feature subset
grid_IGRNN.fit(X_train_BestSet, Ytrain.ravel())
# Use the best model to perform prediction, and compute mse
best_model = grid_IGRNN.best_estimator_
Ypred_IGRNN = best_model.predict(X_test_BestSet)
mse_IGRNN = MSE(Ytest, Ypred_IGRNN)
Ypred_IGRNN = np.round(Ypred_IGRNN, 0)

# Same search on the full feature set (``Xtrain`` / ``Xtest``)
grid_IGRNN.fit(Xtrain, Ytrain.ravel())
best_model = grid_IGRNN.best_estimator_
Ypred_IGRNN_be = best_model.predict(Xtest)
mse_IGRNN_be = MSE(Ytest, Ypred_IGRNN_be)
# Round this run's own predictions (previously this rounded ``Ypred_IGRNN``,
# overwriting the full-feature predictions with the subset ones).
Ypred_IGRNN_be = np.round(Ypred_IGRNN_be, 0)
#print(accuracy_score(Ytest, Ypred_IGRNN))

AnisotropicSelector = FS.Anisotropic_selector()
start = time.time()
AnisotropicSelector.max_dist(Xtrain, Ytrain.ravel(), feature_names=featnames)
print('Time to complete the feature selection [s]: ' +
      str(time.time() - start))

AGRNN = GRNN()
Example #12
0
                                                    test_size=0.999,
                                                    random_state=1)

# Odd neighbour counts 1, 3, ..., 49
nbs = np.arange(1, 51, 2)

# Let's start with regression

# Train / test MSE for each neighbour count, in the same order as ``nbs``
reg_train_mse = []
reg_test_mse = []

for n in nbs:
    knnreg = KNeighborsRegressor(n_neighbors=n)
    knnreg.fit(X_train, y_train)
    y_hat_train = knnreg.predict(X_train)
    y_hat_test = knnreg.predict(X_test)
    train_mse = MSE(y_train, y_hat_train)
    reg_train_mse.append(train_mse)
    test_mse = MSE(y_test, y_hat_test)
    reg_test_mse.append(test_mse)
    # Progress indicator: the neighbour count just evaluated
    print(n)

# Let us plot the train and test errors:

plt.plot(nbs, reg_train_mse, marker='.', color='blue', label='Train MSE')
plt.plot(nbs, reg_test_mse, marker='.', color='orange', label='Test MSE')
plt.xticks(nbs)
plt.xlabel('# of neighbors')
plt.ylabel('MSE')
plt.title('kNN regression')
plt.legend()
Example #13
0
#

train_file = '../data/train.csv'
split_at = -365  # the last 365 rows form the test set

d = pd.read_csv(train_file)

# some regions have other date spans than others, so average the rate over
# all regions reporting on each date and keep a single row per date
d['mean_mortality_rate'] = d.groupby('date').mortality_rate.transform('mean')
d = d.drop_duplicates('date')

# Model the per-date mean computed above.  Selecting the raw
# ``mortality_rate`` column here would silently use whichever region's row
# survived ``drop_duplicates`` and leave the mean unused.
d = d[['date', 'mean_mortality_rate']].copy()
d.columns = ['ds', 'y']

# Fit in log space; predictions are mapped back with exp before scoring
d['y'] = np.log(d['y'])

train = d[:split_at].copy().reset_index(drop=True)
test = d[split_at:].copy().reset_index(drop=True)

prophet = Prophet()
prophet.fit(train)
p = prophet.predict(test)

score = sqrt(MSE(np.exp(test.y), np.exp(p.yhat)))
# Single-argument call form prints the same on Python 2 and Python 3
print('RMSE: {:.2%}'.format(score))

prophet.plot(p)
prophet.plot_components(p)
plt.show()
Example #14
0
def MSE_reference(y_test):
    """Return the ``MSE`` of predicting the mean of ``y_test`` everywhere.

    Builds an array shaped like ``y_test`` filled with ``np.mean(y_test)``
    and scores it against ``y_test``; this is the error of a constant
    mean predictor.
    """
    constant_prediction = np.mean(y_test) * np.ones(y_test.shape)
    return MSE(y_test, constant_prediction)
"""
Evaluate the training error
You'll now evaluate the training set RMSE achieved by the regression tree dt that you instantiated in a previous exercise.

In addition to dt, X_train and y_train are available in your workspace.

INSTRUCTION
-----------
Import mean_squared_error as MSE from sklearn.metrics.
Fit dt to the training set.
Predict dt's training set labels and assign the result to y_pred_train.
Evaluate dt's training set RMSE and assign it to RMSE_train.
"""

# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Fit dt to the training set
dt.fit(X_train, y_train)

# Predict the labels of the training set
y_pred_train = dt.predict(X_train)

# Evaluate the training set RMSE of dt.  The exponent is a float literal:
# ``1 / 2`` evaluates to 0 under Python 2 integer division, which would
# make the result 1.0 regardless of the error.
RMSE_train = MSE(y_train, y_pred_train) ** 0.5

# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))
# Hold out 30% of the rows for testing, with a fixed seed
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

# In[5]:

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)

# In[6]:

# Score on the test set; by default this returns the R-squared metric
reg.score(xtest, ytest)

# In[7]:

# Mean squared error
MSE(ytest, reg.predict(xtest))

# In[8]:

y.mean()
# The MSE is roughly one third of the mean of y, so the fit is mediocre

# In[9]:

# Tree models expose feature-importance scores, which can be used for
# embedded feature selection (select from model)
reg.feature_importances_

# # Comparison using cross-validation

# In[10]:
def RMSE(y_true, y_pred):
    """Return the square root of ``MSE(y_true, y_pred)``."""
    mean_squared = MSE(y_true, y_pred)
    return sqrt(mean_squared)
#####################################
#### KNN model for classification
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn.score(X_test, y_test)  # performance score of the fitted model (shown in an interactive session)

#####################################
#### CART (classification and regression tree) - regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)
# Float exponent: ``1/2`` is 0 under Python 2 integer division
rmse_dt = mse_dt ** 0.5
# using CV with regression tree needs some tweak below
#   (score is for maximization while MSE is for minimization,
#    so the negated-MSE scores are negated back)
MSE_CV = - cross_val_score(dt, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
print(MSE_CV.mean())
#### CART (classification and regression tree) - classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dt = DecisionTreeClassifier(max_depth=2, min_samples_split=2, min_samples_leaf=1,
                            criterion='gini', random_state=1)  # decision tree
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)

#####################################
# ``rmse`` is computed before this excerpt
print("Root Mean Squared Error: {}".format(rmse))


# DECISION TREE REGRESSION

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dt = DecisionTreeRegressor(max_depth=4,
                           min_samples_leaf=0.1,  # Each leaf must contain AT LEAST 10% of the training data
                           random_state=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)  # Compute test MSE
# Float exponent: ``1/2`` is 0 under Python 2 integer division
rmse_dt = mse_dt ** 0.5  # Compute test RMSE
print(rmse_dt)



###########################################################

# 4) CROSS VALIDATION

from sklearn.model_selection import cross_val_score
reg = linear_model.LinearRegression()
# One score per fold (5 folds, the estimator's default scorer)
cv_results = cross_val_score(reg, X, y, cv=5)
print(cv_results)
# Average the scores just computed; the name ``cv_scores`` used here before
# is never defined and raised NameError.
print("Average 5-Fold CV Score: {}".format(np.mean(cv_results)))
Example #20
0
# One independent copy of the data per region
for r in sorted(d.region.unique()):
    regions[r] = d[d.region == r].copy()
    # Formatting into a single string prints "<region> <rows>" identically
    # under the Python 2 print statement and the Python 3 print function.
    print('{} {}'.format(r, len(regions[r])))

# Fit and score one Prophet model per region
for r, df in regions.items():
    df = df[['date', 'mortality_rate']].copy()
    df.columns = ['ds', 'y']
    # Fit in log space; predictions are mapped back with exp before scoring
    df['y'] = np.log(df['y'])

    trains[r] = df[:split_at].copy().reset_index(drop=True)
    tests[r] = df[split_at:].copy().reset_index(drop=True)
    prophets[r] = Prophet()

    prophets[r].fit(trains[r])
    predictions[r] = prophets[r].predict(tests[r])
    scores[r] = sqrt(MSE(np.exp(tests[r].y), np.exp(predictions[r].yhat)))

    print('{} RMSE: {:.2%}'.format(r, scores[r]))
    prophets[r].plot(predictions[r])
    prophets[r].plot_components(predictions[r])

# Summary in region order, with a titled components plot for each region
for r in sorted(regions):
    print('{} RMSE: {:.2%}'.format(r, scores[r]))
    prophets[r].plot_components(predictions[r])
    plt.title(r)

# ``list(...)`` is required on Python 3, where ``dict.values()`` is a view
# that ``np.mean`` cannot reduce.
print('\nAverage RMSE: {:.2%}'.format(np.mean(list(scores.values()))))
plt.show()

"""
E12000001 RMSE: 25.44%
Example #21
0
                       n_jobs=1,
                       verbose=0,
                       scoring="neg_mean_squared_error",
                       return_train_score=True)
# NOTE(review): the search is fitted on ``X, y`` but evaluated on
# ``X_test`` below -- confirm the test rows are not part of ``X``.
rf_grid.fit(X, y)
print(rf_grid.best_params_)

# Extract best model from 'rf_grid'
best_model = rf_grid.best_estimator_

# Predict the test set labels
y_pred = best_model.predict(X_test)

from sklearn.metrics import mean_squared_error as MSE
# Evaluate the test set RMSE.  Float exponent: ``1 / 2`` is 0 under
# Python 2 integer division.
rmse_test = MSE(y_test, y_pred) ** 0.5
# Print the test set RMSE
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
#----------------------

# Score under the search's own metric (configured as negated MSE above)
print(rf_grid.score(X_test, y_test))

# xgboost

# First 1000 rows for training, the remainder for testing
train = df.iloc[:1000, :]
test = df.iloc[1000:, :]

x = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']
x_test = test.drop(['SalePrice'], axis=1)
y_test = test['SalePrice']
Example #22
0
    del trainX["tt"]
    testX = tmp[tmp["tt"] == 0]
    del testX["tt"]
    y_train = tmp[tmp["tt"] == 1]["y"]
    y_test = tmp[tmp["tt"] == 0]["y"]

    model1, model2 = learning(trainX, y_train)

    pred_train = model1.predict(trainX["days"].values.reshape(
        -1, 1)) + model2.predict(trainX.iloc[:,
                                             ~trainX.columns.str.match("y")])
    pred_test = model1.predict(testX["days"].values.reshape(
        -1, 1)) + model2.predict(testX.iloc[:, ~testX.columns.str.match("y")])

    print("TRAIN:",
          MSE(y_train, pred_train)**0.5, "VARIDATE",
          MSE(y_test, pred_test)**0.5)
    trains.append(MSE(y_train, pred_train)**0.5)
    tests.append(MSE(y_test, pred_test)**0.5)
print("AVG")
print(numpy.array(trains).mean(), numpy.array(tests).mean())
# %%
# Columns kept for modelling; "y" is the target and "t" the split flag
cols = ["precipitation", "weather", "days", "fun", "curry", "y", "t"]
# One-hot encode the categorical columns among ``cols``
tmp = pandas.get_dummies(dat[cols])
# Rows flagged t == 1 form the training set, t == 0 the test set;
# the flag column itself is dropped from both.
# NOTE(review): "y" stays in trainX / testX -- confirm it is excluded
# before fitting.
trainX = tmp[tmp["t"] == 1]
del trainX["t"]
testX = tmp[tmp["t"] == 0]
del testX["t"]
y_train = tmp[tmp["t"] == 1]["y"]
y_test = tmp[tmp["t"] == 0]["y"]
model = lgbm.LGBMRegressor(**params)

# train model
model.fit(
    X_train,
    y_train,
    categorical_feature=categorical_features,
    # eval_metric=RMSLE,
)

# make predictions
y_pred = model.predict(X_test)

# print params and metric
print('Test RMSLE:', RMSLE(y_test, y_pred))
# RMSE as the square root of the MSE.  This avoids the ``squared=False``
# keyword, which newer scikit-learn releases deprecate and then reject;
# for a single-output target the value is the same.
print('Test RMSE:', MSE(y_test, y_pred) ** 0.5)
print('Test MAE:', MAE(y_test, y_pred))
print('=' * 75)
print('\n')

print('Saving model... \n')
save_model(model, 'lgbm_model.pkl')

#%%

# CROSS VALIDATE SCORE


def cross_val_model(model, X_train, y_train):

    result = cross_val_score(
Evaluate the optimal forest
In this last exercise of the course, you'll evaluate the test set RMSE of grid_rf's optimal model.

The dataset is already loaded and processed for you and is split into 80% train and 20% test. In your environment are available X_test, y_test and the function mean_squared_error from sklearn.metrics under the alias MSE. In addition, we have also loaded the trained GridSearchCV object grid_rf that you instantiated in the previous exercise. Note that grid_rf was trained as follows:

grid_rf.fit(X_train, y_train)
Instructions
100 XP
Import mean_squared_error as MSE from sklearn.metrics.

Extract the best estimator from grid_rf and assign it to best_model.

Predict best_model's test set labels and assign the result to y_pred.

Compute best_model's test set RMSE.
'''
SOLUTION
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Extract the best estimator
best_model = grid_rf.best_estimator_

# Predict test set labels
y_pred = best_model.predict(X_test)

# Compute rmse_test (true values first, the documented argument order).
# Float exponent: ``1 / 2`` is 0 under Python 2 integer division.
rmse_test = MSE(y_test, y_pred) ** 0.5

# Print rmse_test
print('Test RMSE of best model: {:.3f}'.format(rmse_test))
def rmse(y_true, y_pred):
    """Return the root of ``MSE(y_true, y_pred)``, rounded to 3 decimals."""
    mean_sq_err = MSE(y_true, y_pred)
    root = np.sqrt(mean_sq_err)
    return round(root, 3)
Example #26
0
    def get_model(self):
        ''' Fit, plot and score a voting ensemble, then return it.

        The branch is selected by ``self.mod``:

        * ``"regressor"``  -- fits a gradient-boosting, a random-forest and a
          decision-tree regressor plus a ``VotingRegressor`` over them, plots
          their predictions for the first 50 training samples and prints
          RMSE / MAE on the validation set for each of the four models.
        * ``"classifier"`` -- fits a logistic regression, a random forest and
          a Gaussian naive Bayes classifier plus a soft ``VotingClassifier``
          over them, plots the class probabilities predicted for the first
          training sample and prints validation accuracy for each model.

        Reads ``self.x_train``, ``self.y_train``, ``self.x_val``,
        ``self.y_val`` and ``self.weights``; stores the fitted estimators on
        ``self`` (``mod1..mod3``/``vtr`` or ``clf1..clf3``/``vtc``).

        Returns:
            The fitted voting estimator. Any other ``self.mod`` value falls
            through both branches and yields ``None``.
        '''

        if self.mod == "regressor":

            from sklearn.tree import DecisionTreeRegressor
            from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
            from sklearn.metrics import mean_squared_error as MSE
            import sklearn.metrics as SM

            # NOTE(review): criterion='mae' appears to be rejected by recent
            # scikit-learn releases -- confirm against the pinned version.
            self.mod1 = GradientBoostingRegressor(criterion='mae',
                                                  n_estimators=200,
                                                  max_depth=5)
            self.mod2 = RandomForestRegressor(criterion='mae',
                                              n_estimators=200,
                                              max_depth=5)
            self.mod3 = DecisionTreeRegressor(criterion='mae',
                                              splitter='best',
                                              max_depth=5)

            # The 'lr' key names the decision tree; it is kept unchanged
            # because callers may look the estimator up by that name.
            self.vtr = VotingRegressor(estimators=[('gb', self.mod1),
                                                   ('rf', self.mod2),
                                                   ('lr', self.mod3)],
                                       weights=self.weights)

            # Each member is fitted on its own as well, so it can be plotted
            # and scored individually below.
            self.mod1 = self.mod1.fit(self.x_train, self.y_train)
            self.mod2 = self.mod2.fit(self.x_train, self.y_train)
            self.mod3 = self.mod3.fit(self.x_train, self.y_train)
            self.vtr = self.vtr.fit(self.x_train, self.y_train)

            # Only the first 50 training samples are plotted.
            xt = self.x_train[:50]

            plt.figure(figsize=(20, 10))
            plt.plot(self.mod1.predict(xt),
                     'gd',
                     label='GradientBoostingRegressor')
            plt.plot(self.mod2.predict(xt),
                     'b^',
                     label='RandomForestRegressor')
            plt.plot(self.mod3.predict(xt),
                     'ys',
                     label='DecisionTreeRegressor')
            plt.plot(self.vtr.predict(xt), 'r*', label='VotingRegressor')

            plt.tick_params(axis='x',
                            which='both',
                            bottom=False,
                            top=False,
                            labelbottom=False)
            plt.ylabel('predicted')
            plt.xlabel('training samples')
            plt.legend(loc="best")
            plt.title('Comparison of individual predictions with averaged')
            plt.show()

            # Same validation report for the ensemble and each member.
            reports = (
                ("Model Voting", self.vtr),
                ("Model GradientBoostingRegressor", self.mod1),
                ("Model RandomForestRegressor", self.mod2),
                ("Model DecisionTreeRegressor", self.mod3),
            )
            for title, estimator in reports:
                print(title)
                val_pred = estimator.predict(self.x_val)
                rmse_val = np.sqrt(MSE(val_pred, self.y_val))
                mae_val = SM.mean_absolute_error(val_pred, self.y_val)

                print("RMSE on val = ", rmse_val.round(4))
                # The value is a mean absolute error, so it is labelled MAE
                # (it was previously printed under the label "MAPE").
                print("MAE on val = ", mae_val)
                print("")

            return self.vtr

        elif self.mod == "classifier":

            from sklearn.linear_model import LogisticRegression
            from sklearn.naive_bayes import GaussianNB
            from sklearn.ensemble import RandomForestClassifier, VotingClassifier
            import sklearn.metrics as SM

            self.clf1 = LogisticRegression(max_iter=3000,
                                           random_state=42,
                                           solver='lbfgs')
            self.clf2 = RandomForestClassifier(n_estimators=100,
                                               random_state=123)
            self.clf3 = GaussianNB()

            self.vtc = VotingClassifier(estimators=[('lr', self.clf1),
                                                    ('rf', self.clf2),
                                                    ('gnb', self.clf3)],
                                        voting='soft',
                                        weights=self.weights)

            # predict class probabilities for all classifiers
            # (this is also where every classifier gets fitted)
            probas = [
                c.fit(self.x_train, self.y_train).predict_proba(self.x_train)
                for c in (self.clf1, self.clf2, self.clf3, self.vtc)
            ]

            # get class probabilities for the first sample in the dataset
            class1_1 = [pr[0, 0] for pr in probas]
            class2_1 = [pr[0, 1] for pr in probas]

            # plotting

            N = 4  # number of groups
            ind = np.arange(N)  # group positions
            width = 0.35  # bar width

            fig, ax = plt.subplots(figsize=(20, 10))

            # bars for classifier 1-3
            p1 = ax.bar(ind,
                        np.hstack(([class1_1[:-1], [0]])),
                        width,
                        color='green',
                        edgecolor='k')
            p2 = ax.bar(ind + width,
                        np.hstack(([class2_1[:-1], [0]])),
                        width,
                        color='lightgreen',
                        edgecolor='k')

            # bars for VotingClassifier
            ax.bar(ind, [0, 0, 0, class1_1[-1]],
                   width,
                   color='blue',
                   edgecolor='k')
            ax.bar(ind + width, [0, 0, 0, class2_1[-1]],
                   width,
                   color='steelblue',
                   edgecolor='k')

            # plot annotations
            plt.axvline(2.8, color='k', linestyle='dashed')
            ax.set_xticks(ind + width)
            # Labels follow the order of `probas` and of the estimator list
            # (lr, rf, gnb), so each bar group shows its own name and weight.
            ax.set_xticklabels([
                f'LogisticRegression\nweight {self.weights[0]}',
                f'RandomForestClassifier\nweight {self.weights[1]}',
                f'GaussianNB\nweight {self.weights[2]}',
                'VotingClassifier\n(average probabilities)'
            ],
                               rotation=40,
                               ha='right')

            plt.ylim([0, 1])
            plt.title(
                'Class probabilities for sample 1 by different classifiers')
            plt.legend([p1[0], p2[0]], ['class 1', 'class 2'],
                       loc='upper left')
            plt.tight_layout()
            plt.show()

            # Same validation report for the ensemble and each member.
            reports = (
                ("Model VotingClassifier", self.vtc),
                ("Model LogisticRegression", self.clf1),
                ("Model GaussianNB", self.clf3),
                ("Model RandomForestClassifier", self.clf2),
            )
            for title, estimator in reports:
                print(title)
                vote_pred = estimator.predict(self.x_val)
                score = SM.accuracy_score(vote_pred, self.y_val)

                # Built-in round() works whether the score is a NumPy scalar
                # or a plain Python float (which has no .round() method).
                print("Accuracy = ", round(score, 4))
                print("")

            return self.vtc
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Shallow regression tree; min_samples_leaf=0.1 requires every leaf to hold
# at least 10% of the data points.
regtree0 = DecisionTreeRegressor(max_depth=4,
                                 min_samples_leaf=0.1,
                                 random_state=22)
# Reference repr of a differently-configured tree:
# DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
#     max_leaf_nodes=None, min_impurity_decrease=0.0,
#     min_impurity_split=None, min_samples_leaf=0.13,
#     min_samples_split=2, min_weight_fraction_leaf=0.0,
#     presort=False, random_state=3, splitter='best')

# Train on the training split
regtree0.fit(X_train, y_train)

# Evaluate on the test split: predictions -> MSE -> RMSE
y_pred = regtree0.predict(X_test)
mse_regtree0 = MSE(y_test, y_pred)
rmse_regtree0 = mse_regtree0 ** 0.5
print(f"Test set RMSE of regtree0: {rmse_regtree0:.2f}")

#%%
# Baseline for comparison: ordinary least squares on the same split
from sklearn import linear_model

# fit() hands back the fitted estimator, so construction and training chain
olspizza = linear_model.LinearRegression().fit(X_train, y_train)

# Test-set predictions of the linear model
y_pred_ols = olspizza.predict(X_test)

# MSE first, then its square root for the RMSE
mse_ols = MSE(y_test, y_pred_ols)
rmse_ols = mse_ols ** 0.5

print(f'Linear Regression test set RMSE: {rmse_ols:.2f}')
# ---- Regression tree: build and train -------------------------------------

# Tree limited to depth 8 with min_samples_leaf=0.13; fit() returns the
# fitted estimator, so it is chained onto the constructor.
dt = DecisionTreeRegressor(max_depth=8,
                           min_samples_leaf=0.13,
                           random_state=3).fit(X_train, y_train)

# ---- Evaluate the regression tree ------------------------------------------

# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Test-set predictions of the tree
y_pred = dt.predict(X_test)

# MSE, then RMSE as its square root
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5

print(f"Test set RMSE of dt: {rmse_dt:.2f}")

# ---- Linear regression vs regression tree ----------------------------------

# Test-set predictions of the linear model `lr` (defined outside this snippet)
y_pred_lr = lr.predict(X_test)

# MSE of the linear model on the test set
mse_lr = MSE(y_test, y_pred_lr)
            n_estimators=200,                                
            random_state=2)

# Train the SGB regressor
# Train the stochastic gradient boosting regressor `sgbr` instantiated in the
# previous exercise and predict the test set labels.
#
# The bike sharing demand dataset is already loaded and processed; it is split
# into 80% train and 20% test. X_train / X_test, y_train / y_test and the
# model instance sgbr are available in the workspace.

# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE

# Fit on the training split
sgbr.fit(X_train, y_train)

# Labels predicted for the test split
y_pred = sgbr.predict(X_test)

# Evaluate the SGB regressor
# Test-set MSE first, then the RMSE as its square root.
mse_test = MSE(y_test, y_pred)
rmse_test = mse_test ** 0.5

# Report the RMSE with three decimals
print(f'Test set RMSE of sgbr: {rmse_test:.3f}')
# Test set RMSE of sgbr: 49.979
# The stochastic gradient boosting regressor achieves a lower test set RMSE
# than the gradient boosting regressor (which was 52.065)!
Example #30
0
# plt.plot(axisx, rs, c='green', label='XGB')
# plt.legend()
# plt.show()

# Score each booster type on the test split with otherwise identical settings.
for booster in ('gbtree', 'gblinear', 'dart'):
    reg = XGBR(
        n_estimators=180,
        learning_rate=0.1,
        random_state=0,
        booster=booster,
        objective='reg:squarederror',
    )
    reg = reg.fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))

# Reference model with the default booster; the two bare expressions only
# display a value in an interactive session.
reg = XGBR(n_estimators=180, objective='reg:squarederror')
reg = reg.fit(Xtrain, Ytrain)
reg.score(Xtest, Ytest)
MSE(Ytest, reg.predict(Xtest))

import xgboost as xgb

# Same data wrapped in DMatrix objects for the native training API.
dtrain = xgb.DMatrix(Xtrain, Ytrain)
dtest = xgb.DMatrix(Xtest, Ytest)

# Native-API settings: 180 boosting rounds with eta 0.1.
param = dict(silent=False, objective='reg:squarederror', eta=0.1)
num_round = 180
bst = xgb.train(param, dtrain, num_round)
from sklearn.metrics import r2_score

axisx = np.arange(0, 5, 0.05)
rs = []
var = []
ge = []