def mlp_synthetic(X_train, X_test, y_train, y_test, L2reg=0.01, hidden_width=50, mini_batchsize=5):
    """Train a Theano MLP regressor and report its train/test errors.

    Weights are created for three hidden layers of ``hidden_width`` units and
    a single output unit, and are handed to ``model``.  The network is
    trained with ``sgd`` on ``MSE_reg`` for ``epochs`` passes over the
    training data.  ``epochs`` is NOT a parameter: it is read from module
    scope and must be defined before this function is called.

    Only full mini-batches are used, because the targets are reshaped to
    ``(mini_batchsize, 1)``; trailing samples that do not fill a batch are
    skipped.

    Args:
        X_train: 2-D training features; its column count sets the input size.
        X_test: 2-D test features.
        y_train: training targets.
        y_test: test targets.
        L2reg: regularisation strength forwarded to ``MSE_reg``.
        hidden_width: number of units in each hidden layer.
        mini_batchsize: number of samples per SGD step.

    Returns:
        ``(fin_cost_train, fin_cost_test, train_transformed,
        test_transformed, test_predictions)`` -- the final ``MSE`` on the
        train and test sets, the output of ``model_act`` for the train and
        test inputs (the third hidden layer's activations, according to the
        original author's comment), and the predictions for ``X_test``.
    """
    X = T.fmatrix(name='X')
    Y = T.fmatrix(name='Y')

    input_size = X_train.shape[1]
    print(input_size)

    w_h1 = uniform_weights((input_size, hidden_width))
    w_h2 = uniform_weights((hidden_width, hidden_width))
    w_h3 = uniform_weights((hidden_width, hidden_width))
    b_h1 = init_bias(hidden_width)
    b_h2 = init_bias(hidden_width)
    b_h3 = init_bias(hidden_width)
    w_o = uniform_weights((hidden_width, 1))
    b_o = init_bias(1)

    op = model(X, w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o)
    params = [w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o]
    cost = MSE_reg(Y, op, params, L2reg=L2reg)
    updates = sgd(cost, params)

    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                            allow_input_downcast=True, name='train')
    predict = theano.function(inputs=[X], outputs=op, allow_input_downcast=True)

    # Per-epoch learning curves (kept for plotting/debugging).
    test_costs = []
    train_costs = []

    n_samples = len(X_train)
    for _ in range(epochs):
        # Stop at n_samples - mini_batchsize so that every *full* batch is
        # visited, including the last one when n_samples is an exact multiple
        # of the batch size (the previous zip-of-ranges form dropped it).
        for start in range(0, n_samples - mini_batchsize + 1, mini_batchsize):
            end = start + mini_batchsize
            yd = floatX(y_train[start:end]).reshape(mini_batchsize, 1)
            train(X_train[start:end], yd)

        test_costs.append(MSE(predict(X_test), y_test))
        train_costs.append(MSE(predict(X_train), y_train))

    # Final errors with the trained parameters; the test predictions are
    # computed once and reused for the return value.
    test_predictions = predict(X_test)
    fin_cost_test = MSE(test_predictions, y_test)
    fin_cost_train = MSE(predict(X_train), y_train)

    # Expose the hidden representation produced by ``model_act``.
    h3 = model_act(X, w_h1, w_h2, w_h3, w_o, b_h1, b_h2, b_h3, b_o)
    transform = theano.function(inputs=[X], outputs=h3, allow_input_downcast=True)
    test_transformed = transform(X_test)
    train_transformed = transform(X_train)

    return fin_cost_train, fin_cost_test, train_transformed, test_transformed, test_predictions
def reconstruction(ncomp, U, S, V, var=1):
    """Print reconstruction errors and plot the explained variance ratio (EVR).

    Reads ``mode``, ``matrix`` and ``vip_figsize`` from the enclosing/module
    scope.  Depending on ``mode``:

    * ``'lapack'``: rebuilds the matrix from the first ``ncomp`` components
      (transposed) and prints its MAE and MSE against ``matrix``.
    * ``'eigen'``: no reconstruction, only the variance ratios.
    * anything else: rebuilds the matrix from all components, prints its MAE,
      and normalises by the total column variance of ``matrix``.

    Args:
        ncomp: number of principal components of interest.
        U, S, V: factors of the decomposition; ``S`` holds the singular
            values used for the variance estimate.
        var: only used in the fallback mode; any value other than 1 reverses
            the order of the explained variance ratios.

    Side effects:
        Prints diagnostics and draws a two-panel matplotlib figure (all
        components on the left, the first 20 at most on the right).
    """
    # The per-component variance estimate is identical in every mode.
    exp_var = (S**2) / (S.shape[0] - 1)

    if mode == 'lapack':
        rec_matrix = np.dot(U[:, :ncomp], np.dot(np.diag(S[:ncomp]), V[:ncomp]))
        rec_matrix = rec_matrix.T
        print(' Matrix reconstruction with {} PCs:'.format(ncomp))
        print(' Mean Absolute Error =', MAE(matrix, rec_matrix))
        print(' Mean Squared Error =', MSE(matrix, rec_matrix))
        # see https://github.com/scikit-learn/scikit-learn/blob/c3980bcbabd9d2527548820581725df2904e4a0d/sklearn/decomposition/pca.py
        full_var = np.sum(exp_var)
        # % of variance explained by each PC
        explained_variance_ratio = exp_var / full_var
    elif mode == 'eigen':
        full_var = np.sum(exp_var)
        # % of variance explained by each PC
        explained_variance_ratio = exp_var / full_var
    else:
        rec_matrix = np.dot(U, np.dot(np.diag(S), V))
        print(' Matrix reconstruction MAE =', MAE(matrix, rec_matrix))
        full_var = np.var(matrix, axis=0).sum()
        # % of variance explained by each PC
        explained_variance_ratio = exp_var / full_var
        if var != 1:
            explained_variance_ratio = explained_variance_ratio[::-1]
        msg = ' This info makes sense when the matrix is mean centered '
        msg += '(temp-mean scaling)'
        print(msg)

    ratio_cumsum = np.cumsum(explained_variance_ratio)
    n_pcs = explained_variance_ratio.shape[0]

    lw = 2
    alpha = 0.4
    fig = plt.figure(figsize=vip_figsize)
    fig.subplots_adjust(wspace=0.4)

    # Left panel: every component.
    ax1 = plt.subplot2grid((1, 3), (0, 0), colspan=2)
    ax1.step(range(n_pcs), explained_variance_ratio, alpha=alpha, where='mid',
             label='Individual EVR', lw=lw)
    ax1.plot(ratio_cumsum, '.-', alpha=alpha, label='Cumulative EVR', lw=lw)
    ax1.legend(loc='best', frameon=False, fontsize='medium')
    ax1.set_ylabel('Explained variance ratio (EVR)')
    ax1.set_xlabel('Principal components')
    ax1.grid(linestyle='solid', alpha=0.2)
    ax1.set_xlim(-10, n_pcs + 10)
    ax1.set_ylim(0, 1)

    # Right panel: zoom on the leading components.  Clamp to the number of
    # components available, otherwise step() receives x and y of different
    # lengths and raises when there are fewer than 20 components.
    trunc = min(20, n_pcs)
    ax2 = plt.subplot2grid((1, 3), (0, 2), colspan=1)
    ax2.step(range(trunc), explained_variance_ratio[:trunc], alpha=alpha,
             where='mid', lw=lw)
    ax2.plot(ratio_cumsum[:trunc], '.-', alpha=alpha, lw=lw)
    ax2.set_xlabel('Principal components')
    ax2.grid(linestyle='solid', alpha=0.2)
    ax2.set_xlim(-2, trunc + 2)
    ax2.set_ylim(0, 1)

    msg = ' Cumulative explained variance ratio for {} PCs = {:.5f}'
    print(msg.format(ncomp, ratio_cumsum[ncomp - 1]))
# Naive method: forecast every validation point with the last observed
# training value.  ``.iloc[-1]`` selects it by position, so this also works
# when ``train`` does not carry a 0..n-1 integer index.
y_hat['Count'] = train['Count'].iloc[-1]

# Visualize Naive method predictions
plt.figure(figsize=(40, 20))
plt.plot(train.Datetime, train['Count'], label='train')
plt.plot(valid.Datetime, valid['Count'], label='validation')
plt.plot(y_hat.Datetime, y_hat['Count'], label='Naive Forecast')
plt.xlabel('Datetime')
plt.ylabel('Passenger count')
plt.legend(loc='best')
plt.show()

# One row per forecasting method
rmse = pd.DataFrame(columns=['Method', 'RMSE'])

# Calculate RMSE for Naive method
rmse.loc[len(rmse)] = "Naive", sqrt(MSE(valid.Count, y_hat.Count))

# Moving Average Method to predict time series: forecast with the mean of
# the last `window` training observations, then score it.
for window in (10, 20):
    y_hat['Count'] = train['Count'].rolling(window).mean().iloc[-1]
    rmse.loc[len(rmse)] = "Moving Average {}D".format(window), sqrt(MSE(valid.Count, y_hat.Count))

# last 50 days
y_hat['Count'] = train['Count'].rolling(50).mean().iloc[-1]
loss_test = criterion(test_predict, testY) if epoch % 100 == 0: print("Epoch: %d, loss: %f, test loss: %f" % (epoch, loss.detach().numpy(), loss_test.item())) # Test lstm.eval() train_predict = lstm(dataX) data_predict = train_predict.data.numpy() dataY_plot = dataY.data.numpy() data_predict = sc.inverse_transform(data_predict) dataY_plot = sc.inverse_transform(dataY_plot) print(MSE(data_predict, dataY_plot)) plt.axvline(x=train_size, c='r', linestyle='--') plt.plot(dataY_plot) plt.plot(data_predict) plt.suptitle('Time-Series Prediction') plt.show() # MSE on train data test_predict = lstm(trainX) test_predict = test_predict.data.numpy() testY_plot = trainY.data.numpy() test_predict = sc.inverse_transform(test_predict)
# Report the cross-validated RMSE
print('CV RMSE: {:.2f}'.format(RMSE_CV))

# ------------------------ Evaluate the training error ------------------------

from sklearn.metrics import mean_squared_error as MSE

# Train the tree on the whole training set ...
dt.fit(X_train, y_train)

# ... and predict the very samples it was trained on
y_pred_train = dt.predict(X_train)

# Training RMSE is the square root of the training MSE
RMSE_train = MSE(y_train, y_pred_train) ** 0.5

print('Train RMSE: {:.2f}'.format(RMSE_train))

# ---------------------------- Define the ensemble ----------------------------

# Fixed seed so the results are reproducible
SEED = 1

# Base learners
lr = LogisticRegression(random_state=SEED)
knn = KNN(n_neighbors=27)
max_depth=3, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.7, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=27, reg_alpha=0.00006) # first pass housing_xgb.fit(X_train, y_train, verbose=True) # first pass 0.63 pred = housing_xgb.predict(X_test) rmse = np.sqrt(MSE(np.log(y_test + 1), np.log(pred + 1))) print("RMSE : % f" % (rmse)) rss = sum((y_test - pred)**2) tss = sum((y_test - np.mean(y_test))**2) rsq = 1 - (rss / tss) #makes a list of featurew with thier importance df_import = pd.DataFrame({ 'cols': X_test.columns, 'feat_import': pd.Series(housing_xgb.feature_importances_) })
def autoencoder(dataset, logfile, random_state=1910299034):
    """Train a one-hidden-layer autoencoder on a ratings matrix and time it.

    The ratings file is read from ``~/.surprise_data/<dataset>/<dataset>/``,
    pivoted into a matrix (index: column 0, columns: column 1, values:
    column 2, missing entries filled with 0) and split 80/20.  The network
    reconstructs each row through a 32-unit sigmoid hidden layer; biases are
    implemented by appending a constant 1 column before each matmul.

    Args:
        dataset: ``'ml-100k'`` selects ``u.data`` (tab separated, 1682
            input nodes, batch size 20, 200 epochs); any other value selects
            ``ratings.dat`` (``::`` separated, 3706 input nodes, batch size
            80, 100 epochs).
        logfile: forwarded to ``log`` for every progress message.
        random_state: seed for the train/test split and weight initialisers.

    Returns:
        ``[mse, runtime]`` -- the test reconstruction MSE rounded to 3
        decimals and the duration in seconds of the final test forward pass,
        rounded to 4 decimals.
    """
    # Save home path
    home = str(Path.home())

    # Hyperparameters
    hidden_layer_nodes = 32
    learn_rate = 0.001

    # Dataset-specific settings
    if dataset == 'ml-100k':
        datafile = 'u.data'
        input_layer_nodes = 1682
        separator = "\t"
        batch_size = 20
        epochs = 200
    else:
        datafile = 'ratings.dat'
        input_layer_nodes = 3706
        separator = "::"
        batch_size = 80
        epochs = 100
    output_layer_nodes = input_layer_nodes

    ratings = pd.read_csv('{}/.surprise_data/{}/{}/{}'.format(
        home, dataset, dataset, datafile),
                          sep=separator,
                          header=None,
                          engine='python')

    # Create DataFrame without timestamps
    ratings_pivot = pd.pivot_table(ratings[[0, 1, 2]],
                                   values=2,
                                   index=0,
                                   columns=1).fillna(0)

    # 80-20 split
    X_train, X_test = sk_train_test_split(ratings_pivot,
                                          test_size=0.2,
                                          random_state=random_state)

    # Initialize weights; the extra row in each matrix is the bias weight
    hidden_layer_weights = {
        'weights':
        tf.Variable(
            tf.random_normal([input_layer_nodes + 1, hidden_layer_nodes],
                             seed=random_state))
    }
    output_layer_weights = {
        'weights':
        tf.Variable(
            tf.random_normal([hidden_layer_nodes + 1, output_layer_nodes],
                             seed=random_state))
    }

    # Set input placeholder
    input_layer = tf.placeholder('float', [None, input_layer_nodes])

    # Add bias to input
    bias = tf.fill([tf.shape(input_layer)[0], 1], 1.0)
    input_layer_concat = tf.concat([input_layer, bias], 1)

    # Forward and activate with Sigmoid
    hidden_activations = tf.nn.sigmoid(
        tf.matmul(input_layer_concat, hidden_layer_weights['weights']))

    # Add bias
    bias = tf.fill([tf.shape(hidden_activations)[0], 1], 1.0)
    hidden_activations = tf.concat([hidden_activations, bias], 1)

    # Forward for final output
    output_layer = tf.matmul(hidden_activations,
                             output_layer_weights['weights'])

    # Set output placeholder
    output_true = tf.placeholder('float', [None, output_layer_nodes])

    # Loss
    mse_loss = tf.reduce_mean(tf.square(output_layer - output_true))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(learn_rate).minimize(mse_loss)

    init = tf.global_variables_initializer()

    # The context manager closes the session (and frees its resources) even
    # if training raises; previously the session was never closed.
    with tf.Session() as sess:
        sess.run(init)

        # Running model
        n_batches = X_train.shape[0] // batch_size
        for epoch in range(epochs):
            epoch_loss = 0
            for i in range(n_batches):
                batch_X = X_train[i * batch_size:(i + 1) * batch_size]
                # The autoencoder's target is its own input
                _, c = sess.run([optimizer, mse_loss],
                                feed_dict={
                                    input_layer: batch_X,
                                    output_true: batch_X
                                })
                epoch_loss += c

            output_train = sess.run(output_layer,
                                    feed_dict={input_layer: X_train})
            output_test = sess.run(output_layer,
                                   feed_dict={input_layer: X_test})

            log(
                logfile, 'MSE train ' +
                str(round(MSE(output_train, X_train), 2)) + ' MSE test ' +
                str(round(MSE(output_test, X_test), 2)))
            log(
                logfile, 'Epoch ' + str(epoch) + '/' + str(epochs) +
                ' loss: ' + str(round(epoch_loss, 2)))

        # Final test: time a single forward pass over the test set
        time_start = time.time()
        output_test = sess.run(output_layer, feed_dict={input_layer: X_test})
        time_stop = time.time()

    runtime = round(time_stop - time_start, 4)
    log(logfile, 'Test time: {0:f}'.format(runtime).strip('0'))

    mse = round(MSE(output_test, X_test), 3)
    log(logfile, 'MSE test: ' + str(mse) + '\n')

    return [mse, runtime]
# model.compile(optimizer=SGD(), loss='mean_squared_error')

# Check that compiling attached a loss to the model
print("Loss function: " + model.loss)

# Train, holding out 20% of the training data for validation
model.fit(X_train.values, y_train, validation_split=0.2, epochs=100)

#%%
# =============================================================================
model.summary()

# Test-set RMSE of the fitted network
y_pred = model.predict(X_test)
print('RMSE:', np.sqrt(MSE(y_test, y_pred)))

#%%
# Classification
# =============================================================================
from keras.utils import to_categorical

titanic_df = pd.read_csv("data/titanic.csv")

# One-hot encode the target; every other column is a feature
y = to_categorical(titanic_df.survived)
X = titanic_df.drop('survived', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Accuracy of the gini-based tree on the test set
y_pred = dt_gini.predict(X_test)
accuracy_gini = accuracy_score(y_test, y_pred)

# Compare the two split criteria
print('Accuracy achieved by using entropy: ', accuracy_entropy)
print('Accuracy achieved by using the gini index: ', accuracy_gini)

#####################################################
# Decision tree for Classification
#####################################################
# Regression tree
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13, random_state=3)
dt.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error as MSE

# Test-set RMSE of the regression tree
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5
print("Test set RMSE of dt: {:.2f}".format(rmse_dt))

# Linear regression: test-set RMSE of the previously fitted `lr`
y_pred_lr = lr.predict(X_test)
mse_lr = MSE(y_pred_lr, y_test)
rmse_lr = mse_lr ** 0.5

# Side-by-side comparison
print('Linear Regression test set RMSE: {:.2f}'.format(rmse_lr))
print('Regression Tree test set RMSE: {:.2f}'.format(rmse_dt))

################################################
# Supervised learning
# Goal: fit the model f(x) that best approximates the data (f(x) can be a
# logistic regression, a decision tree, a neural network, ...),
# discarding noise as much as possible and
# achieving a low predictive error on unseen data.
# Difficulties:
#   overfitting: predictive power is low
regr_training.fit(X, y)

# Design matrix for the validation period: intercept column, CDD, HDD
inter2 = np.ones((len(v_CDD), 1))
X_vals = np.column_stack((inter2, v_CDD, v_HDD))
y_vals = regr_training.predict(X_vals)

# plotting actual demand vs. simulated demand
demand = df_validation.loc[:, 'demand']
plt.figure()
# Actual demand goes on x and the prediction on y so that the data match the
# axis labels (they were swapped before).
plt.scatter(demand, y_vals)
plt.xlabel('Actual Electricity Demand (MWh)')
plt.ylabel('Predicted Electricity Demand (MWh)')

# calculating R^2 value
Rsq = r2_score(demand, y_vals)

# calculating mean square error
number5 = MSE(demand, y_vals)

# plot actual demand vs. residuals (residual = predicted - actual)
residuals = y_vals - demand
plt.figure()
plt.scatter(demand, residuals)
plt.xlabel('Actual Demand (MWh)')
plt.ylabel('Residuals (MWh)')
# Hyper-parameter grid for the IGRNN estimator: sweep sigma from 0.1 to 4
# (exclusive) in steps of 0.01
params_IGRNN = {
    'kernel': ["RBF"],
    'sigma': list(np.arange(0.1, 4, 0.01)),
    'calibration': ['None']
}
grid_IGRNN = GridSearchCV(estimator=IGRNN,
                          param_grid=params_IGRNN,
                          scoring='neg_mean_squared_error',
                          cv=5,
                          verbose=1,
                          n_jobs=-1)

# --- Search on the selected feature subset ---
grid_IGRNN.fit(X_train_BestSet, Ytrain.ravel())

# Use the best model to perform prediction, and compute mse
best_model = grid_IGRNN.best_estimator_
Ypred_IGRNN = best_model.predict(X_test_BestSet)
mse_IGRNN = MSE(Ytest, Ypred_IGRNN)  # MSE uses the un-rounded predictions
Ypred_IGRNN = np.round(Ypred_IGRNN, 0)

# --- Same search on the full feature set ---
grid_IGRNN.fit(Xtrain, Ytrain.ravel())
best_model = grid_IGRNN.best_estimator_
Ypred_IGRNN_be = best_model.predict(Xtest)
mse_IGRNN_be = MSE(Ytest, Ypred_IGRNN_be)
# Round this run's own predictions; the previous code rounded Ypred_IGRNN
# here, overwriting the full-feature predictions with the subset ones.
Ypred_IGRNN_be = np.round(Ypred_IGRNN_be, 0)
#print(accuracy_score(Ytest, Ypred_IGRNN))

# Time the feature selection step
AnisotropicSelector = FS.Anisotropic_selector()
start = time.time()
AnisotropicSelector.max_dist(Xtrain, Ytrain.ravel(), feature_names=featnames)
print('Time to complete the feature selection [s]: ' + str(time.time() - start))

AGRNN = GRNN()
test_size=0.999, random_state=1) nbs = np.arange(1, 51, 2) # Let's start with regression reg_train_mse = [] reg_test_mse = [] for n in nbs: knnreg = KNeighborsRegressor(n_neighbors=n) knnreg.fit(X_train, y_train) y_hat_train = knnreg.predict(X_train) y_hat_test = knnreg.predict(X_test) train_mse = MSE(y_train, y_hat_train) reg_train_mse.append(train_mse) test_mse = MSE(y_test, y_hat_test) reg_test_mse.append(test_mse) print(n) # Let us plot the train and test errors: plt.plot(nbs, reg_train_mse, marker='.', color='blue', label='Train MSE') plt.plot(nbs, reg_test_mse, marker='.', color='orange', label='Test MSE') plt.xticks(nbs) plt.xlabel('# of neighbors') plt.ylabel('MSE') plt.title('kNN regression') plt.legend()
# train_file = '../data/train.csv'
split_at = -365  # hold out the last 365 rows for testing

d = pd.read_csv(train_file)

# some regions have other date spans than others, so collapse all regions
# into a single series: the mean mortality rate per date
d['mean_mortality_rate'] = d.groupby('date').mortality_rate.transform('mean')
d = d.drop_duplicates('date')

# Keep the per-date mean.  Selecting 'mortality_rate' here (as before) would
# discard the mean just computed and keep only the rate of whichever region
# happened to come first for each date.
d = d[['date', 'mean_mortality_rate']].copy()
d.columns = ['ds', 'y']

# Model the log of the rate; predictions are mapped back with exp below
d['y'] = np.log(d['y'])

train = d[:split_at].copy().reset_index(drop=True)
test = d[split_at:].copy().reset_index(drop=True)

prophet = Prophet()
prophet.fit(train)
p = prophet.predict(test)

# RMSE on the original (non-log) scale
score = sqrt(MSE(np.exp(test.y), np.exp(p.yhat)))
print('RMSE: {:.2%}'.format(score))

prophet.plot(p)
prophet.plot_components(p)
plt.show()
def MSE_reference(y_test):
    """Return the MSE of a constant predictor that always outputs the mean of ``y_test``.

    ``y_test`` must expose ``.shape`` (e.g. a NumPy array); the baseline
    prediction has that same shape.
    """
    baseline = np.mean(y_test) * np.ones(y_test.shape)
    return MSE(y_test, baseline)
"""
Evaluate the training error

You'll now evaluate the training set RMSE achieved by the regression tree dt
that you instantiated in a previous exercise.

In addition to dt, X_train and y_train are available in your workspace.

INSTRUCTION
-----------
Import mean_squared_error as MSE from sklearn.metrics.
Fit dt to the training set.
Predict dt's training set labels and assign the result to y_pred_train.
Evaluate dt's training set RMSE and assign it to RMSE_train.
"""

from sklearn.metrics import mean_squared_error as MSE

# Train the tree on the whole training set
dt.fit(X_train, y_train)

# Predict the samples the tree was trained on
y_pred_train = dt.predict(X_train)

# Training RMSE is the square root of the training MSE
RMSE_train = MSE(y_train, y_pred_train) ** 0.5

print('Train RMSE: {:.2f}'.format(RMSE_train))
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420) # In[5]: reg = XGBR(n_estimators=100).fit(xtrain, ytrain) reg.predict(xtest) # In[6]: # 测试集的结果分数,默认是返回R平方指标 reg.score(xtest, ytest) # In[7]: # 均方误差 MSE(ytest, reg.predict(xtest)) # In[8]: y.mean() # 均方误差结果大约占y均值的三分之一,效果一般 # In[9]: # 树模型可以查看模型的重要性分数,可以使用嵌入法(select from model)进行特征选择 reg.feature_importances_ # # 使用交叉验证来进行对比 # In[10]:
def RMSE(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mean_sq_err = MSE(y_true, y_pred)
    return sqrt(mean_sq_err)
#####################################
#### KNN model for classification
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Performance score of the fitted model (a bare expression: the value is only
# displayed in an interactive session)
knn.score(X_test, y_test)

#####################################
#### CART (classification and regression tree) - regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Test MSE and its square root
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5

# Cross-validation with a regression tree: scorers are maximised while MSE is
# minimised, so the scorer returns the negated MSE and the sign is flipped
# back here.
MSE_CV = -cross_val_score(dt,
                          X_train,
                          y_train,
                          cv=10,
                          scoring='neg_mean_squared_error',
                          n_jobs=-1)
print(MSE_CV.mean())

#### CART (classification and regression tree) - classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# decision tree
dt = DecisionTreeClassifier(max_depth=2,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            criterion='gini',
                            random_state=1)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
accuracy_score(y_test, y_pred)
#####################################
print("Root Mean Squared Error: {}".format(rmse))

# DECISION TREE REGRESSION
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

dt = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=0.1,  # Each leaf must contain AT LEAST 10% of the training data
    random_state=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

mse_dt = MSE(y_test, y_pred)  # Compute test MSE
rmse_dt = mse_dt ** 0.5  # Compute test RMSE
print(rmse_dt)

###########################################################
# 4) CROSS VALIDATION
from sklearn.model_selection import cross_val_score

reg = linear_model.LinearRegression()
cv_results = cross_val_score(reg, X, y, cv=5)
print(cv_results)
# Average the scores just computed; the previous code referenced an undefined
# name `cv_scores` and raised NameError.
print("Average 5-Fold CV Score: {}".format(np.mean(cv_results)))
for r in sorted( d.region.unique()): regions[r] = d[ d.region == r ].copy() print r, len( regions[r] ) for r, df in regions.items(): df = df[['date', 'mortality_rate']] df.columns = ['ds', 'y'] df.y = np.log( df.y ) trains[r] = df[:split_at].copy().reset_index( drop = True ) tests[r] = df[split_at:].copy().reset_index( drop = True ) prophets[r] = Prophet() prophets[r].fit( trains[r] ) predictions[r] = prophets[r].predict( tests[r] ) scores[r] = sqrt( MSE( np.exp( tests[r].y ), np.exp( predictions[r].yhat ))) print '{} RMSE: {:.2%}'.format( r, scores[r] ) prophets[r].plot( predictions[r] ) prophets[r].plot_components( predictions[r] ) for r in sorted( regions ): print '{} RMSE: {:.2%}'.format( r, scores[r] ) prophets[r].plot_components( predictions[r] ) plt.title( r ) print '\nAverage RMSE: {:.2%}'.format( np.mean( scores.values())) plt.show() """ E12000001 RMSE: 25.44%
n_jobs=1, verbose=0, scoring="neg_mean_squared_error", return_train_score=True) rf_grid.fit(X, y) print(rf_grid.best_params_) # Extract best model from 'rf_grid' best_model = rf_grid.best_estimator_ # Predict the test set labels y_pred = best_model.predict(X_test) from sklearn.metrics import mean_squared_error as MSE # Evaluate the test set RMSE rmse_test = MSE(y_test, y_pred)**(1 / 2) # Print the test set RMSE print('Test set RMSE of rf: {:.2f}'.format(rmse_test)) #---------------------- print(rf_grid.score(X_test, y_test)) # xgboost train = df.iloc[:1000, :] test = df.iloc[1000:, :] x = train.drop(['SalePrice'], axis=1) y = train['SalePrice'] x_test = test.drop(['SalePrice'], axis=1) y_test = test['SalePrice']
del trainX["tt"] testX = tmp[tmp["tt"] == 0] del testX["tt"] y_train = tmp[tmp["tt"] == 1]["y"] y_test = tmp[tmp["tt"] == 0]["y"] model1, model2 = learning(trainX, y_train) pred_train = model1.predict(trainX["days"].values.reshape( -1, 1)) + model2.predict(trainX.iloc[:, ~trainX.columns.str.match("y")]) pred_test = model1.predict(testX["days"].values.reshape( -1, 1)) + model2.predict(testX.iloc[:, ~testX.columns.str.match("y")]) print("TRAIN:", MSE(y_train, pred_train)**0.5, "VARIDATE", MSE(y_test, pred_test)**0.5) trains.append(MSE(y_train, pred_train)**0.5) tests.append(MSE(y_test, pred_test)**0.5) print("AVG") print(numpy.array(trains).mean(), numpy.array(tests).mean()) # %% cols = ["precipitation", "weather", "days", "fun", "curry", "y", "t"] tmp = pandas.get_dummies(dat[cols]) trainX = tmp[tmp["t"] == 1] del trainX["t"] testX = tmp[tmp["t"] == 0] del testX["t"] y_train = tmp[tmp["t"] == 1]["y"] y_test = tmp[tmp["t"] == 0]["y"]
model = lgbm.LGBMRegressor(**params) # train model model.fit( X_train, (y_train), categorical_feature=categorical_features, # eval_metric=RMSLE, ) # make predictions y_pred = (model.predict(X_test)) # print params and metric print('Test RMSLE:', RMSLE(y_test, y_pred)) print('Test RMSE:', MSE(y_test, y_pred, squared=False)) print('Test MAE:', MAE(y_test, y_pred)) print('=' * 75) print('\n') print('Saving model... \n') save_model(model, 'lgbm_model.pkl') #%% # CROSS VALIDATE SCORE def cross_val_model(model, X_train, y_train): result = cross_val_score(
Evaluate the optimal forest In this last exercise of the course, you'll evaluate the test set RMSE of grid_rf's optimal model. The dataset is already loaded and processed for you and is split into 80% train and 20% test. In your environment are available X_test, y_test and the function mean_squared_error from sklearn.metrics under the alias MSE. In addition, we have also loaded the trained GridSearchCV object grid_rf that you instantiated in the previous exercise. Note that grid_rf was trained as follows: grid_rf.fit(X_train, y_train) Instructions 100 XP Import mean_squared_error as MSE from sklearn.metrics. Extract the best estimator from grid_rf and assign it to best_model. Predict best_model's test set labels and assign the result to y_pred. Compute best_model's test set RMSE. ''' SOLUTION # Import mean_squared_error from sklearn.metrics as MSE from sklearn.metrics import mean_squared_error as MSE # Extract the best estimator best_model = grid_rf.best_estimator_ # Predict test set labels y_pred = best_model.predict(X_test) # Compute rmse_test rmse_test = MSE(y_pred, y_test)**(1 / 2) # Print rmse_test print('Test RMSE of best model: {:.3f}'.format(rmse_test))
def rmse(y_true, y_pred):
    """Return the root mean squared error, rounded to three decimals."""
    root_mse = np.sqrt(MSE(y_true, y_pred))
    return round(root_mse, 3)
def get_model(self):
    """Fit, plot and evaluate a voting ensemble, then return it.

    Reads ``self.mod``, ``self.weights``, ``self.x_train``, ``self.y_train``,
    ``self.x_val`` and ``self.y_val``.

    * ``self.mod == "regressor"``: fits a gradient boosting, a random forest
      and a decision tree regressor plus a ``VotingRegressor`` over them,
      plots their predictions for the first 50 training samples and prints
      validation RMSE and MAE for each model.
    * ``self.mod == "classifier"``: fits a logistic regression, a random
      forest and a Gaussian naive Bayes classifier plus a soft
      ``VotingClassifier`` over them, plots the class probabilities of the
      first training sample and prints validation accuracy for each model.

    The fitted estimators are stored on ``self`` (``mod1``..``mod3``/``vtr``
    or ``clf1``..``clf3``/``vtc``).

    Returns:
        The fitted ``VotingRegressor`` or ``VotingClassifier``; ``None`` when
        ``self.mod`` is neither of the two supported values.
    """
    if self.mod == "regressor":
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor
        from sklearn.metrics import mean_squared_error as MSE
        import sklearn.metrics as SM

        def report_regressor(title, estimator):
            # Print validation RMSE and MAE of one fitted regressor.
            print("Model " + title)
            prediction = estimator.predict(self.x_val)
            rmse_val = np.sqrt(MSE(self.y_val, prediction))
            mae_val = SM.mean_absolute_error(self.y_val, prediction)
            print("RMSE on val = ", round(rmse_val, 4))
            # This value is the mean absolute error; it used to be printed
            # under the misleading label "MAPE".
            print("MAE on val = ", mae_val)
            print("")

        # NOTE(review): criterion='mae' was renamed/removed in recent
        # scikit-learn releases -- confirm against the pinned version.
        self.mod1 = GradientBoostingRegressor(criterion='mae', n_estimators=200, max_depth=5)
        self.mod2 = RandomForestRegressor(criterion='mae', n_estimators=200, max_depth=5)
        self.mod3 = DecisionTreeRegressor(criterion='mae', splitter='best', max_depth=5)
        # NOTE(review): the third estimator is registered under the name
        # 'lr' although it is a decision tree; kept so parameter names that
        # callers may rely on do not change.
        self.vtr = VotingRegressor(estimators=[('gb', self.mod1),
                                               ('rf', self.mod2),
                                               ('lr', self.mod3)],
                                   weights=self.weights)

        self.mod1 = self.mod1.fit(self.x_train, self.y_train)
        self.mod2 = self.mod2.fit(self.x_train, self.y_train)
        self.mod3 = self.mod3.fit(self.x_train, self.y_train)
        self.vtr = self.vtr.fit(self.x_train, self.y_train)

        # Compare the individual predictions with the ensemble on the first
        # 50 training samples.
        xt = self.x_train[:50]
        plt.figure(figsize=(20, 10))
        plt.plot(self.mod1.predict(xt), 'gd', label='GradientBoostingRegressor')
        plt.plot(self.mod2.predict(xt), 'b^', label='RandomForestRegressor')
        plt.plot(self.mod3.predict(xt), 'ys', label='DecisionTreeRegressor')
        plt.plot(self.vtr.predict(xt), 'r*', label='VotingRegressor')
        plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
        plt.ylabel('predicted')
        plt.xlabel('training samples')
        plt.legend(loc="best")
        plt.title('Comparison of individual predictions with averaged')
        plt.show()

        report_regressor("Voting", self.vtr)
        report_regressor("GradientBoostingRegressor", self.mod1)
        report_regressor("RandomForestRegressor", self.mod2)
        report_regressor("DecisionTreeRegressor", self.mod3)

        return self.vtr

    elif self.mod == "classifier":
        from sklearn.linear_model import LogisticRegression
        from sklearn.naive_bayes import GaussianNB
        from sklearn.ensemble import RandomForestClassifier, VotingClassifier
        import sklearn.metrics as SM

        def report_classifier(title, estimator):
            # Print validation accuracy of one fitted classifier.
            print("Model " + title)
            prediction = estimator.predict(self.x_val)
            score = SM.accuracy_score(self.y_val, prediction)
            # round() works whether accuracy_score returns a NumPy scalar or
            # a plain Python float (which has no .round() method).
            print("Accuracy = ", round(score, 4))
            print("")

        self.clf1 = LogisticRegression(max_iter=3000, random_state=42, solver='lbfgs')
        self.clf2 = RandomForestClassifier(n_estimators=100, random_state=123)
        self.clf3 = GaussianNB()
        self.vtc = VotingClassifier(estimators=[('lr', self.clf1),
                                                ('rf', self.clf2),
                                                ('gnb', self.clf3)],
                                    voting='soft',
                                    weights=self.weights)

        # predict class probabilities for all classifiers (fitting each one
        # first); order: clf1, clf2, clf3, voting ensemble
        probas = [
            c.fit(self.x_train, self.y_train).predict_proba(self.x_train)
            for c in (self.clf1, self.clf2, self.clf3, self.vtc)
        ]

        # get class probabilities for the first sample in the dataset
        class1_1 = [pr[0, 0] for pr in probas]
        class2_1 = [pr[0, 1] for pr in probas]

        # plotting
        N = 4  # number of groups
        ind = np.arange(N)  # group positions
        width = 0.35  # bar width

        fig, ax = plt.subplots(figsize=(20, 10))

        # bars for classifier 1-3
        p1 = ax.bar(ind, np.hstack(([class1_1[:-1], [0]])), width,
                    color='green', edgecolor='k')
        p2 = ax.bar(ind + width, np.hstack(([class2_1[:-1], [0]])), width,
                    color='lightgreen', edgecolor='k')

        # bars for VotingClassifier
        ax.bar(ind, [0, 0, 0, class1_1[-1]], width,
               color='blue', edgecolor='k')
        ax.bar(ind + width, [0, 0, 0, class2_1[-1]], width,
               color='steelblue', edgecolor='k')

        # plot annotations
        plt.axvline(2.8, color='k', linestyle='dashed')
        ax.set_xticks(ind + width)
        # Labels follow the order of `probas` and of the VotingClassifier
        # estimators (lr, rf, gnb), so each bar group is paired with the
        # right model name and weight.
        ax.set_xticklabels([
            f'LogisticRegression\nweight {self.weights[0]}',
            f'RandomForestClassifier\nweight {self.weights[1]}',
            f'GaussianNB\nweight {self.weights[2]}',
            'VotingClassifier\n(average probabilities)'
        ], rotation=40, ha='right')
        plt.ylim([0, 1])
        plt.title('Class probabilities for sample 1 by different classifiers')
        plt.legend([p1[0], p2[0]], ['class 1', 'class 2'], loc='upper left')
        plt.tight_layout()
        plt.show()

        report_classifier("VotingClassifier", self.vtc)
        report_classifier("LogisticRegression", self.clf1)
        report_classifier("GaussianNB", self.clf3)
        report_classifier("RandomForestClassifier", self.clf2)

        return self.vtc
# Regression tree; min_samples_leaf=0.1 sets the minimum leaf to contain at
# least 10% of the data points
regtree0 = DecisionTreeRegressor(max_depth=4,
                                 min_samples_leaf=0.1,
                                 random_state=22)
# DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
#                       max_leaf_nodes=None, min_impurity_decrease=0.0,
#                       min_impurity_split=None, min_samples_leaf=0.13,
#                       min_samples_split=2, min_weight_fraction_leaf=0.0,
#                       presort=False, random_state=3, splitter='best')

# Train the tree
regtree0.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error as MSE

# evaluation: test-set MSE and its square root
y_pred = regtree0.predict(X_test)
mse_regtree0 = MSE(y_test, y_pred)
rmse_regtree0 = mse_regtree0 ** 0.5
print("Test set RMSE of regtree0: {:.2f}".format(rmse_regtree0))

#%%
# Let us compare the performance with OLS
from sklearn import linear_model

olspizza = linear_model.LinearRegression()
olspizza.fit(X_train, y_train)

# Same metric for the linear model
y_pred_ols = olspizza.predict(X_test)
mse_ols = MSE(y_test, y_pred_ols)
rmse_ols = mse_ols ** 0.5
print('Linear Regression test set RMSE: {:.2f}'.format(rmse_ols))
# Regression tree: depth capped at 8, each leaf holds at least 13% of the
# training samples
dt = DecisionTreeRegressor(max_depth=8,
                           min_samples_leaf=0.13,
                           random_state=3)

# Train it
dt.fit(X_train, y_train)

# ----------------------- Evaluate the regression tree -----------------------

from sklearn.metrics import mean_squared_error as MSE

# Test-set predictions, MSE and RMSE of the tree
y_pred = dt.predict(X_test)
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5

print("Test set RMSE of dt: {:.2f}".format(rmse_dt))

# ------------------- Linear regression vs regression tree -------------------

# Test-set predictions and MSE of the linear model `lr`
y_pred_lr = lr.predict(X_test)
mse_lr = MSE(y_test, y_pred_lr)
n_estimators=200, random_state=2) # Train the SGB regressor # In this exercise, you'll train the SGBR sgbr instantiated in the previous exercise and predict the test set labels. # The bike sharing demand dataset is already loaded processed for you; it is split into 80% train and 20% test. The feature matrices X_train and X_test, the arrays of labels y_train and y_test, and the model instance sgbr that you defined in the previous exercise are available in your workspace. # Fit sgbr to the training set sgbr.fit(X_train, y_train) # Predict test set labels y_pred = sgbr.predict(X_test) # Evaluate the SGB regressor # You have prepared the ground to determine the test set RMSE of sgbr which you shall evaluate in this exercise. # y_pred and y_test are available in your workspace. # Import mean_squared_error as MSE from sklearn.metrics import mean_squared_error as MSE # Compute test set MSE mse_test = MSE(y_test, y_pred) # Compute test set RMSE rmse_test = mse_test ** (0.5) # Print rmse_test print('Test set RMSE of sgbr: {:.3f}'.format(rmse_test)) # Test set RMSE of sgbr: 49.979 # The stochastic gradient boosting regressor achieves a lower test set RMSE than the gradient boosting regressor (which was 52.065)!
# plt.plot(axisx, rs, c='green', label='XGB')
# plt.legend()
# plt.show()

# Compare the three booster types with otherwise identical settings; print
# each booster's name followed by its test-set score
for booster in ['gbtree', 'gblinear', 'dart']:
    reg = XGBR(n_estimators=180,
               learning_rate=0.1,
               random_state=0,
               booster=booster,
               objective='reg:squarederror').fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))

# Reference model through the scikit-learn wrapper (bare expressions: values
# are only displayed in an interactive session)
reg = XGBR(n_estimators=180, objective='reg:squarederror').fit(Xtrain, Ytrain)
reg.score(Xtest, Ytest)
MSE(Ytest, reg.predict(Xtest))

# Same kind of model through the native xgboost API
import xgboost as xgb

dtrain = xgb.DMatrix(Xtrain, Ytrain)
dtest = xgb.DMatrix(Xtest, Ytest)

param = {'silent': False, 'objective': 'reg:squarederror', 'eta': 0.1}
num_round = 180
bst = xgb.train(param, dtrain, num_round)

from sklearn.metrics import r2_score

# Grid of values from 0 to 5 (exclusive) in steps of 0.05, and empty result
# accumulators
axisx = np.arange(0, 5, 0.05)
rs = []
var = []
ge = []