def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
def bagging(x_train, y_train):
    model = BaggingRegressor(base_estimator=SVR(),
                             n_estimators=10,
                             random_state=0)
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)
    return score
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                        diabetes.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert abs(test_score - clf.oob_score_) < 0.1

    # Test with few estimators
    warn_msg = (
        "Some inputs do not have OOB scores. This probably means too few "
        "estimators were used to compute any reliable oob estimates.")
    with pytest.warns(UserWarning, match=warn_msg):
        regr = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                n_estimators=1,
                                bootstrap=True,
                                oob_score=True,
                                random_state=rng)
        regr.fit(X_train, y_train)
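# For reference, a minimal sketch (not from the test suite) of the OOB
# attributes the assertions above rely on; dataset loading via
# sklearn.datasets is an assumption of this example.
from sklearn.datasets import load_diabetes
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = load_diabetes(return_X_y=True)
model = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                         n_estimators=50,
                         oob_score=True,  # enables the out-of-bag estimate
                         random_state=0).fit(X, y)
print(model.oob_score_)           # OOB R^2, a proxy for the test score
print(model.oob_prediction_[:5])  # per-sample out-of-bag predictions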
class _BaggingRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
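# A minimal plain-scikit-learn sketch of the same column-preserving idea the
# wrapper above implements with lale's >> combinator; the helper name and
# make_pipeline usage are illustrative, not the wrapper's actual internals.
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def fit_dataframe_bagging(X: pd.DataFrame, y, base_estimator):
    # BaggingRegressor hands each base estimator a plain numpy slice, so
    # re-attach the column names before the wrapped estimator sees the data
    # (valid for the default max_features=1.0, where all columns are kept).
    restore_columns = FunctionTransformer(
        func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
        check_inverse=False,
    )
    pipeline = make_pipeline(restore_columns, base_estimator)
    return BaggingRegressor(base_estimator=pipeline).fit(X, y)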
def test_oob_score_regression():
    # Check that oob prediction is a good estimation of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                           n_estimators=50,
                           bootstrap=True,
                           oob_score=True,
                           random_state=rng).fit(X_train, y_train)

    test_score = clf.score(X_test, y_test)

    assert_less(abs(test_score - clf.oob_score_), 0.1)

    # Test with few estimators
    assert_warns(UserWarning,
                 BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  n_estimators=1,
                                  bootstrap=True,
                                  oob_score=True,
                                  random_state=rng).fit,
                 X_train, y_train)
def bagging_regressor(self, data):
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    # print("The train set has {} rows and {} columns".format(x_tr.shape[0], x_tr.shape[1]))
    # print("The validation set has {} rows and {} columns".format(x_val.shape[0], x_val.shape[1]))

    print('Start training BaggingRegressor...')
    start_time = self.timer()

    bg = BaggingRegressor(oob_score=True, verbose=1)
    bg.fit(x_tr, y_tr)
    print("The R2 is: {}".format(bg.score(x_tr, y_tr)))
    # print("The alpha chosen by CV is: {}".format(krrl.alpha_))
    self.timer(start_time)

    print("Making prediction on validation data")
    # the targets were log1p-transformed upstream, so invert with expm1
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(bg.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("The mean absolute error is {}".format(mae))

    print('Saving model into a pickle')
    try:
        os.mkdir('pickles')
    except OSError:
        pass

    with open('pickles/bg.pkl', 'wb') as f:
        pickle.dump(bg, f)

    print('Making prediction and saving into a csv')
    y_test = bg.predict(self.x_test)
    return y_test
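# Hypothetical counterpart for reloading the model pickled above; the path
# mirrors the one written by bagging_regressor.
import pickle

with open('pickles/bg.pkl', 'rb') as f:
    bg = pickle.load(f)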
def models(xtrain, ytrain):
    # Logistic Regression; used when the target only has 2 values
    from sklearn.linear_model import LogisticRegression
    lin = LogisticRegression(random_state=0)
    print(sum(cross_val_score(lin, xtrain, ytrain, cv=5)) / 5)
    lin.fit(xtrain, ytrain)

    # Decision Tree
    from sklearn.tree import DecisionTreeClassifier
    tree = DecisionTreeClassifier(criterion="entropy", random_state=0)
    print(sum(cross_val_score(tree, xtrain, ytrain, cv=5)) / 5)
    tree.fit(xtrain, ytrain)

    # Random Forest classifier
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=10, criterion='entropy',
                                    random_state=0)
    print(sum(cross_val_score(forest, xtrain, ytrain, cv=5)) / 5)
    forest.fit(xtrain, ytrain)

    # Bagging regressor
    from sklearn.ensemble import BaggingRegressor
    bag = BaggingRegressor()
    # was cross-validating `forest` again by mistake; score `bag` instead
    print(sum(cross_val_score(bag, xtrain, ytrain, cv=5)) / 5)
    bag.fit(xtrain, ytrain)

    # Training-set scores (note: for the regressor, score() is R^2)
    print('Logistic Regression accuracy:', lin.score(xtrain, ytrain))
    print('Decision Tree Classifier accuracy:', tree.score(xtrain, ytrain))
    print('Random Forest Classifier accuracy:', forest.score(xtrain, ytrain))
    print('Bagging Regressor R2:', bag.score(xtrain, ytrain))

    return lin, tree, forest, bag
def KNeighborsBagging(neigh):
    kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform')
    kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance')  # unused alternative
    bgg = BaggingRegressor(kn1,
                           n_estimators=10,
                           max_samples=0.7,
                           max_features=0.9,
                           verbose=0)  # , max_features=0.5
    bgg.fit(X_train, y_train)
    print(bgg.score(X_train, y_train))

    y_pred = bgg.predict(X_test)

    # Generate ROC curve values: fpr, tpr, thresholds. This treats the
    # continuous regression output as a score, so y_test must be binary
    # for the curve to be meaningful.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    # Plot ROC curve
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
def run():
    print("Bagged Decision Tree Regression started...")

    # Preparing training data
    dir_path = ""
    train_file_path = dir_path + "train.csv"
    train_file = read_csv(train_file_path, skiprows=1, header=None)
    train_file = train_file.drop(train_file.columns[0], axis=1)
    train_file = train_file.values

    # Combining the previous 5 time steps' data into one row
    train_X_temp = train_file[5:50000, :-1]
    train_Y = train_file[6:50001, -1]
    train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
    for i in range(train_X_temp.shape[0]):
        for j in range(5):
            for k in range(8):
                train_X[i][j * 8 + k] = train_X_temp[i - j][k]

    # Preparing testing data
    test_file_name = dir_path + "test2.csv"
    test_file = read_csv(test_file_name, skiprows=1, header=None)
    test_file = test_file.values
    test_X = np.array(test_file[:, :-1])
    test_y = test_file[:, -1]

    # print("\nSimple Decision Tree:")
    # dec_tree = DecisionTreeRegressor(max_depth=5)
    # dec_tree.fit(train_X, train_Y)
    # prediction = dec_tree.predict(test_X)
    # print("Predictions: \n", prediction)
    # print("Score: ", dec_tree.score(test_X, test_y))

    # print("\nAdaBoost Decision Tree:")
    # ada_boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5), n_estimators=10)
    # ada_boost.fit(train_X, train_Y)
    # prediction = ada_boost.predict(test_X)
    # print("Predictions: \n", prediction)
    # print("Score: ", ada_boost.score(test_X, test_y))

    # Model training and prediction
    print("\nBagged Decision Tree:")
    start = time.time()
    bag_reg = BaggingRegressor(DecisionTreeRegressor(),
                               n_jobs=2,
                               random_state=0).fit(train_X, train_Y)
    # bag_reg.set_params(n_jobs=1)

    # Calculating and printing results
    prediction = bag_reg.predict(test_X)
    mse = np.mean((prediction - test_y) ** 2)
    print("MSE: ", mse)
    # print("Predictions: \n", prediction)
    print("Score: ", bag_reg.score(test_X, test_y))
    print("Time: ", (time.time() - start))
    print("Decision Tree Regressor done...\n")
def Bagging(x_train, y_train, x_test, y_test):
    estimator = BaggingRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
    estimator.fit(x_train, y_train)
    t = estimator.score(x_train, y_train)
    y_pred = estimator.predict(x_test)
    mse_score = mse(y_test, y_pred)
    print("mse_score: " + str(mse_score))
    r2_score = r2(y_test, y_pred)
    print("r2_score: " + str(r2_score))
    print(t)
def makeBaggingBoostDefaultDecisionTreePrediction(n_est):
    global y_t_pred, result
    print("Prediction #estimators = %s and Decision Trees" % (n_est))
    prefix = "%s_BaggingBoost_n_est%s_DefaultTree" % (name, n_est)
    model = BaggingRegressor(n_estimators=n_est)
    x1 = x[:, :]      # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    return prefix, model
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(diabetes.data,
                                                        diabetes.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=False,
        random_state=rng,
    ).fit(X_train, y_train)
    assert base_estimator.score(X_train, y_train) == ensemble.score(X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(
        base_estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=True,
        random_state=rng,
    ).fit(X_train, y_train)
    assert base_estimator.score(X_train, y_train) > ensemble.score(
        X_train, y_train)

    # check that each sampling corresponds to a complete bootstrap resample:
    # each bootstrap should be the same size as the input data, but the
    # data should differ (checked using a hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
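# The DummySizeEstimator referenced above is a scikit-learn test helper; this
# is a sketch of the interface the assertions rely on, not the verbatim
# source.
import joblib
from sklearn.base import BaseEstimator


class DummySizeEstimator(BaseEstimator):
    """Record the size and a hash of the bootstrap sample it was fitted on."""

    def fit(self, X, y):
        self.training_size_ = X.shape[0]
        self.training_hash_ = joblib.hash(X)
        return self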
def KNeighborsBaggingResul(neigh):
    kn1 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='uniform')
    kn2 = neighbors.KNeighborsRegressor(n_neighbors=neigh, weights='distance')
    bgg = BaggingRegressor(kn1,
                           n_estimators=10,
                           max_samples=0.7,
                           max_features=0.9,
                           verbose=0)  # , max_features=0.5
    bgg.fit(X_train, y_train)
    resul.append(bgg.score(X_train, y_train))
def makeBaggingBoostGaussianProcessPrediction(n_est, maxfs):
    global y_t_pred, result
    print("Prediction #estimators = %s and Gaussian Process" % (n_est))
    prefix = "%s_BaggingBoost_max_features%s_GP" % (name, maxfs)
    # GaussianProcessRegressor(kernel=RationalQuadratic(), n_restarts_optimizer=9)
    model = BaggingRegressor(n_estimators=n_est)  # , max_features=maxfs
    x1 = x[:, :]      # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    return prefix, model
def test_bootstrap_samples():
    # Test that bootstrapping samples generate non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=False,
                                random_state=rng).fit(X_train, y_train)

    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                max_samples=1.0,
                                bootstrap=True,
                                random_state=rng).fit(X_train, y_train)

    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))

    # check that each sampling corresponds to a complete bootstrap resample:
    # each bootstrap should be the same size as the input data, but the
    # data should differ (checked using a hash of the data).
    ensemble = BaggingRegressor(base_estimator=DummySizeEstimator(),
                                bootstrap=True).fit(X_train, y_train)
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)
def run():
    print("Decision Tree Regression started...")

    # Preparing training data
    dir_path = ""
    train_file_path = dir_path + "train.csv"
    train_file = read_csv(train_file_path, skiprows=1, header=None)
    train_file = train_file.drop(train_file.columns[0], axis=1)
    train_file = train_file.values
    train_X_temp = train_file[5:50000, :-1]
    train_Y = train_file[6:50001, -1]

    # Combining the previous 5 time steps' data into one row
    train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
    for i in range(train_X_temp.shape[0]):
        for j in range(5):
            for k in range(8):
                train_X[i][j * 8 + k] = train_X_temp[i - j][k]

    # Preparing testing data
    test_file_name = dir_path + "test2.csv"
    test_file = read_csv(test_file_name, skiprows=1, header=None)
    test_file = test_file.values
    test_X = np.array(test_file[:, :-1])
    test_y = test_file[:, -1]

    # Model training and prediction for different numbers of trees
    estimators = np.arange(10, 100, 10)
    print("\nBagged Decision Tree:")
    bag_reg = BaggingRegressor(DecisionTreeRegressor(),
                               n_jobs=2,
                               random_state=0).fit(train_X, train_Y)
    scores = []
    prediction = []
    for n in estimators:
        bag_reg.set_params(n_estimators=n)
        bag_reg.fit(train_X, train_Y)
        score = bag_reg.score(test_X, test_y)
        print(score)
        scores.append(score)
        # prediction.append(bag_reg.predict(test_X))

    # Plotting the effect of an increasing number of trees on the score
    plt.title("Effect of n_estimators")
    plt.xlabel("n_estimator")
    plt.ylabel("score")
    plt.plot(estimators, scores)
    plt.show()
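# The loop above refits the whole ensemble at every step. A hedged
# alternative sketch: BaggingRegressor's warm_start flag keeps the trees
# already grown and only adds the newly requested ones (the same data
# variables as above are assumed to be in scope).
warm_bag = BaggingRegressor(DecisionTreeRegressor(), warm_start=True,
                            n_jobs=2, random_state=0)
warm_scores = []
for n in np.arange(10, 100, 10):
    warm_bag.set_params(n_estimators=n)
    warm_bag.fit(train_X, train_Y)  # grows the ensemble instead of rebuilding
    warm_scores.append(warm_bag.score(test_X, test_y))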
def Bagging(Xtrain, Ytrain, Xtest, Ytest):
    """ Apply the bagging regressor """
    from sklearn.ensemble import BaggingRegressor
    print('\nBagging regressor:')
    clf = BaggingRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    print('Accuracy: {0}'.format(clf.score(Xtrain, Ytrain)))

    # find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    # find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
def run_tree_models(X, y):
    '''
    Get an overview of the performance of different tree models.

    Tree models: Decision tree, AdaBoost, Bagged tree

    INPUT: Dataframe with features (X) and target variable dataframe (y)
    OUTPUT: Scores of each tree model
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)
    print('Decision Tree Score: ' + str(dt.score(X_test, y_test)))

    ada = AdaBoostRegressor(LinearRegression())
    ada.fit(X_train, y_train)
    print('AdaBoost Regressor Score: ' + str(ada.score(X_test, y_test)))

    # Train and score bagged tree regressor (ensemble learner)
    bagged_tree = BaggingRegressor(DecisionTreeRegressor())
    bagged_tree.fit(X_train, y_train)
    print('Bagged Tree Score: ' + str(bagged_tree.score(X_test, y_test)))
def bag_svr(X_train=pd.DataFrame(), yinzi=[], y_train=pd.DataFrame(),
            n_iter=10, n_jobs=2):
    from sklearn.svm import SVR
    from sklearn.ensemble import BaggingRegressor
    bag_rg = BaggingRegressor(base_estimator=SVR(),
                              n_estimators=n_iter,
                              max_samples=0.5,
                              max_features=0.5,
                              bootstrap=True,
                              bootstrap_features=True,
                              random_state=0,
                              n_jobs=n_jobs)
    bag_rg = bag_rg.fit(X_train[yinzi], y_train)
    score0 = bag_rg.score(X_train[yinzi], y_train)
    print(f"bag_rg score: {score0}")
    pre = bag_rg.predict(X_train[yinzi])
    X_train = pd.DataFrame(X_train)
    # '预测值' means 'predicted value'
    X_train.loc[:, '预测值'] = pd.Series(pre, index=X_train.index)
    return bag_rg, X_train, score0
def bagging(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = BaggingRegressor(base_estimator=BayesianRidge(n_iter=1000),
                            n_estimators=20, random_state=0,
                            max_samples=1.0, max_features=0.7, n_jobs=15)
    # regr = BaggingRegressor(base_estimator=SVR(C=40, gamma=0.01),
    #                         n_estimators=100, random_state=0,
    #                         max_samples=0.8, max_features=0.8, n_jobs=15)
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = valid_split.__next__()
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        regr.fit(X_trainval, y_trainval)
        print((regr.score(X_trainval, y_trainval)) ** 0.5)  # sqrt of train R^2
        test_pre = regr.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre)) ** 0.5)
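# An equivalent, shorter formulation (a sketch, not from the original): let
# cross_val_score drive the same KFold loop and return one R^2 per split.
from sklearn.model_selection import KFold, cross_val_score


def bagging_cv_scores(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = BaggingRegressor(base_estimator=BayesianRidge(n_iter=1000),
                            n_estimators=20, random_state=0,
                            max_samples=1.0, max_features=0.7, n_jobs=15)
    return cross_val_score(regr, X, y, cv=kfold, scoring='r2')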
print('******************************************')
if name == 'Boston':
    # Regression problem
    rfr = RandomForestRegressor(**params)
    rfr.fit(X, y)
    scores_rfr = cross_val_score(rfr, X, y, cv=5)
    br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth),
                          n_estimators=n_estimators)
    br.fit(X, y)
    scores_br = cross_val_score(br, X, y, cv=5)
    boston[i, 1] = rfr.score(X, y)
    boston[i, 2] = np.mean(scores_rfr)
    boston[i, 3] = np.std(scores_rfr)
    boston[i, 4] = br.score(X, y)
    boston[i, 5] = np.mean(scores_br)  # was np.mean(np.mean(...)), redundant
    boston[i, 6] = np.std(scores_br)
    print('Score RandomForestRegressor = %s' % (boston[i, 1]))
    print('Cross Val : mean = %s' % (boston[i, 2]))
    print('Cross Val : std = %s' % (boston[i, 3]))
    print('Score BaggingRegressor = %s' % (boston[i, 4]))
    print('Cross Val : mean = %s' % (boston[i, 5]))
    print('Cross Val : std = %s' % (boston[i, 6]))

if name == 'Diabetes':
    # Regression problem
    rfr = RandomForestRegressor(**params)
    rfr.fit(X, y)
    scores_rfr = cross_val_score(rfr, X, y, cv=5)
                      encoding="GBK")
data_cat_df = dataXFCA[[
    'area', 'province', 'city', 'year', 'month', 'day', 'industry'
]].astype(str)
y_data = dataXFCA['fcA']
data_num_df = dataXFCA[['gcA']]
train, y_data = preprocess(data_cat_df, data_num_df, y_data)
trainXF, testXF, trainyF, testyF = train_test_split(train,
                                                    y_data,
                                                    test_size=0.1,
                                                    random_state=1)

# Bagging regression
ridge = Ridge(15)
clf = BaggingRegressor(n_estimators=15, base_estimator=ridge)
clf.fit(trainXF, trainyF)
y_FCA = clf.predict(testXF)
# print(y_FCA)

# deal with scaling
FCA_pred = scaling(y_FCA)
print(FCA_pred)
scores = clf.score(testXF, testyF)
# clf_Ada and the salary frames are defined earlier in the original script
scores_c = cross_val_score(clf_Ada, train_Salary, train_Salary_y)
scores_cv = np.mean(scores_c)
print('Baggingregression:', scores)
print('BaggingRegression_CV', scores_cv)
print('finished with the FCA (tradable share capital prediction)')
y = merge(sal_pred, FCA_pred)
print('Industry benchmark:', y)
print('ok')
print('******************************************')
print(name)
print('******************************************')
if name == 'Boston' or name == 'Diabetes':
    # Regression problem
    rfr = RandomForestRegressor(**params)
    rfr.fit(X, y)
    print('Score RandomForestRegressor = %s' % (rfr.score(X, y)))
    scores_rfr = cross_val_score(rfr, X, y, cv=5)
    print('Cross Val Score RandomForestRegressor = %s' % (np.mean(scores_rfr)))
    br = BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=max_depth),
                          n_estimators=n_estimators)
    br.fit(X, y)
    print('Score BaggingRegressor = %s' % (br.score(X, y)))
    scores_br = cross_val_score(br, X, y, cv=5)
    print('Cross Val Scores of BR = %s' % (np.mean(scores_br)))

if name == 'Iris' or name == 'Digits':
    # Classification problem
    rfc = RandomForestClassifier(**params)
    rfc.fit(X, y)
    print('Score RandomForestClassifier = %s' % (rfc.score(X, y)))
    scores_rfc = cross_val_score(rfc, X, y, cv=5)
    print('Cross Val Scores of RandomForestClassifier = %s' % (np.mean(scores_rfc)))
    bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=max_depth),
                           n_estimators=n_estimators)
    bc.fit(X, y)
    print('Score BaggingClassifier = %s' % (bc.score(X, y)))
    scores_bc = cross_val_score(bc, X, y, cv=5)
# In[197]:

from sklearn.ensemble import BaggingRegressor
bgcl = BaggingRegressor(n_estimators=10, random_state=1)
bgcl = bgcl.fit(X_train, y_train)

# In[198]:

y_predict = bgcl.predict(X_test)
acc_BG_train = bgcl.score(X_train, y_train)
print("Bagging - Train Accuracy:", acc_BG_train)
acc_BG = bgcl.score(X_test, y_test)
print("Bagging - Test Accuracy:", acc_BG)
# kfold is assumed to be defined in an earlier cell, e.g. KFold(n_splits=10)
results = cross_val_score(bgcl, X, y, cv=kfold, scoring='r2')
print(results)
kf_res_mean = results.mean() * 100.0
kf_res_std = results.std() * 100.0

# In[200]:

# Store the accuracy results for each model in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model': ['Bagging'], 'Training_Score': acc_BG_train,
def bagging(df1, features, pred_var, df2):
    lr = BaggingRegressor()
    lr.fit(df1[features], df1[pred_var])
    print('BaggingRegressor Score: ', lr.score(df2[features], df2[pred_var]))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import preprocessing

data = pd.read_csv("clean_data.csv",
                   sep=",",
                   index_col=None,
                   prefix=None,
                   skip_blank_lines=True,
                   header=0)
X = data.loc[:, [
    "Quartier", "Commune", "Etage", "Superficie", "Piece", "Electricite",
    "Gaz", "Eau", "Acte notarie", "Jardin", "Livret foncier", "Meuble",
    "Garage"
]].values
Y = data.loc[:, "Prix"].values

X = pd.DataFrame(X)
le = preprocessing.LabelEncoder()
X = X.apply(le.fit_transform)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

regressor = BaggingRegressor(DecisionTreeRegressor(max_depth=4))
regressor.fit(X_train, Y_train)

score = regressor.score(X_test, Y_test)
print(score)
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
from sklearn.ensemble import BaggingRegressor
import pickle

temp = 0

# ID,ATM Name,Transaction Date,No Of Withdrawals,No Of CUB Card Withdrawals,No Of Other Card Withdrawals,Total amount Withdrawn,
# Amount withdrawn CUB Card,Amount withdrawn Other Card,averageWithdrawals,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,
# WorkingDay,H,N,C,M,NH,HWH,HHW,WWH,WHH,HWW,WWW,WHW,HHH,Rounded Amount Withdrawn,class,AvgAmountPerWithdrawal
df = pd.read_csv('ClassificationData.csv')
df = df[df['ATM Name'] == 'Big Street ATM']
df.drop(['ID', 'ATM Name', 'Transaction Date', 'No Of Withdrawals',
         'No Of CUB Card Withdrawals', 'No Of Other Card Withdrawals',
         'class', 'Amount withdrawn CUB Card', 'Amount withdrawn Other Card',
         'Rounded Amount Withdrawn'], axis=1, inplace=True)

X = np.array(df.drop('Total amount Withdrawn', axis=1))
X = preprocessing.scale(X)
y = np.array(df['Total amount Withdrawn'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = BaggingRegressor(n_estimators=200)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print('Accuracy: ', accuracy)

# with open('gradientBoosting.pickle','wb') as f:
#     pickle.dump(clf,f)
# pickle_in = open('gradientBoosting.pickle','rb')
# clf = pickle.load(pickle_in)
# temp += accuracy
# print("Average Accuracy is: ", temp)
        tmpSCR = adaBoostC.score(testX, yTest)
    else:
        adaBoostR.fit(trainX, yTrain)
        tmpSCR = adaBoostR.score(testX, yTest)
    scores['adaBoost'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['adaBoost'][label].append(tTOT)

    t0 = time.time()
    print("start bagging withOUT out-of-bag")
    if cnt < 2:
        bagCoobN.fit(trainX, yTrain)
        tmpSCR = bagCoobN.score(testX, yTest)
    else:
        bagRoobN.fit(trainX, yTrain)
        tmpSCR = bagRoobN.score(testX, yTest)
    scores['bagging (NO out of bag)'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['bagging (NO out of bag)'][label].append(tTOT)

    t0 = time.time()
    print("start bagging WITH out-of-bag")
    if cnt < 2:
        bagCoobY.fit(trainX, yTrain)
        tmpSCR = bagCoobY.score(testX, yTest)
    else:
        bagRoobY.fit(trainX, yTrain)
        tmpSCR = bagRoobY.score(testX, yTest)
    scores['bagging (YES out of bag)'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['bagging (YES out of bag)'][label].append(tTOT)
reg8.fit(X_train, y_train)
reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)
ereg.fit(X_train, y_train)
reg4.fit(X_train, y_train)
reg5.fit(X_train, y_train)
reg6.fit(X_train, y_train)
# reg7.fit(X_train, y_train)

print("GradientBoostingRegressor:", reg1.score(X_test, y_test))
print("RandomForestRegressor:", reg2.score(X_test, y_test))
print("LinearRegression:", reg3.score(X_test, y_test))
print("VotingRegressor:", ereg.score(X_test, y_test))
print("AdaBoostRegressor:", reg4.score(X_test, y_test))
print("BaggingRegressor:", reg5.score(X_test, y_test))
print("ExtraTreesRegressor:", reg6.score(X_test, y_test))
# print("StackingRegressor:", reg7.score(X_test, y_test))
print("XGBRegressor:", reg8.score(X_test, y_test))

XGBpredictions = reg8.predict(X_test)
MAE = mean_absolute_error(y_test, XGBpredictions)
print('XGBoost validation MAE = ', MAE)

xx = []
# try:
#     file = open('regression.csv', 'w', newline='')
#     file_w = csv.writer(file)
# except Exception:
#     print('regression.csv open failed')
#     exit()
# names = ['test', 'prediction']
def timeseries(company_name):
    df = pd.read_csv('data_files/WIKI-' + company_name + '.csv')
    print(len(df))
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Volume', 'Adj. Close']]
    df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0
    df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

    df_timeseries_open = df[df.columns[0]]
    df_timeseries_high = df[df.columns[1]]
    df_timeseries_low = df[df.columns[2]]
    df_timeseries_vol = df[df.columns[3]]
    df_timeseries_close = df[df.columns[4]]
    df_timeseries_HL_PCT = df[df.columns[5]]
    df_timeseries_PCT_change = df[df.columns[6]]

    x1, train_size = timer(df_timeseries_open)
    print("done 1 ", train_size, " ", len(x1))
    x2, train_size = timer(df_timeseries_high)
    print("done 2 ", train_size)
    x3, train_size = timer(df_timeseries_low)
    print("done 3 ", train_size)
    x4, train_size = timer(df_timeseries_vol)
    print("done 4 ", train_size)
    x6, train_size = timer(df_timeseries_HL_PCT)
    print("done 6 ", train_size)
    x7, train_size = timer(df_timeseries_PCT_change)
    print("done 7 ", train_size)

    """
    np.savetxt('open.txt', x1, fmt='%d')
    np.savetxt('high.txt', x2, fmt='%d')
    np.savetxt('low.txt', x3, fmt='%d')
    np.savetxt('volume.txt', x4, fmt='%d')
    np.savetxt('Close.txt', x5, fmt='%d')
    np.savetxt('HL_PCT.txt', x6, fmt='%d')
    np.savetxt('PCT_change.txt', x7, fmt='%d')
    """
    x1 = np.loadtxt('open.txt', dtype=int)
    x2 = np.loadtxt('high.txt', dtype=int)
    x3 = np.loadtxt('low.txt', dtype=int)
    x4 = np.loadtxt('volume.txt', dtype=int)
    x6 = np.loadtxt('HL_PCT.txt', dtype=int)
    x7 = np.loadtxt('PCT_change.txt', dtype=int)

    dfform = {'Adj. Open': df['Adj. Open'],
              'Adj. High': df['Adj. High'],
              'Adj. Low': df['Adj. Low'],
              'Adj. Volume': df['Adj. Volume'],
              'Adj. Close': df['Adj. Close'],
              'HL_PCT': df['HL_PCT'],
              'PCT_change': df['PCT_change']}
    df1 = pd.DataFrame(dfform)

    data = {'Adj. Open': x1,
            'Adj. High': x2,
            'Adj. Low': x3,
            'Adj. Volume': x4,
            'HL_PCT': x6,
            'PCT_change': x7}
    df_test = pd.DataFrame(data)

    y = np.array(df1['Adj. Close'])
    y_train = y[0:train_size]
    y_test = y[train_size:]
    x_test = np.array(df_test)
    del df1['Adj. Close']
    x_train = np.array(df1[0:train_size])

    from sklearn.linear_model import Ridge
    clf1 = Ridge(alpha=1.0)
    print("fitting ridge")
    clf1.fit(x_train, y_train)
    predicted = clf1.predict(x_test)
    print("score")
    confidence1 = clf1.score(x_test, y_test)
    print("Ridge : %.3f%%" % (confidence1 * 100.0))
    print(" E N D")

    clf = LinearRegression()
    print("fitting LR")
    clf.fit(x_train, y_train)
    print("score")
    confidence2 = clf.score(x_test, y_test)
    print("LinearRegressor : %.3f%%" % (confidence2 * 100.0))
    print(" E N D")

    from sklearn.ensemble import BaggingRegressor
    clfy = BaggingRegressor(base_estimator=None, n_estimators=10)
    print("fitting bagging")
    clfy.fit(x_train, y_train)
    predictedy = clfy.predict(x_test)
    print("score")
    confidencey = clfy.score(x_test, y_test)
    print("BAGGING : %.3f%%" % (confidencey * 100.0))

    from sklearn.ensemble import GradientBoostingRegressor
    clfz = GradientBoostingRegressor()
    print("GradientBoostingRegressor")
    clfz.fit(x_train, y_train)
    predictedz = clfz.predict(x_test)
    print("score")
    confidencez = clfz.score(x_test, y_test)
    print("BOOSTING : %.3f%%" % (confidencez * 100.0))

    import matplotlib.pyplot as plt
    plt.plot(predictedz, label='predicted')
    plt.plot(y_test, label='Actual')
    plt.legend()
    plt.xlabel('Time')
    plt.ylabel('Price')
    plt.savefig('fea/' + str(company_name) + '.png', dpi=200, bbox_inches='tight')
from sklearn.model_selection import cross_val_score
clf = RandomForestRegressor()
scores = cross_val_score(clf, X_test, y_test, cv=5)
scores.mean()

# MAE in $ (the original named this `mse`, but it is the mean absolute error)
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is:$", mae)

# checking r^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
# Appendix - Datasets description section at the end of this MOOC.
# ```

# %% [markdown]
# Create a `BaggingRegressor` and provide a `DecisionTreeRegressor`
# to its parameter `base_estimator`. Train the regressor and evaluate its
# statistical performance on the testing set.

# %%
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor

tree = DecisionTreeRegressor()
bagging = BaggingRegressor(base_estimator=tree, n_jobs=-1)
bagging.fit(data_train, target_train)
test_score = bagging.score(data_test, target_test)
print(f"Basic R2 score of the bagging regressor:\n"
      f"{test_score:.2f}")

# %% [markdown]
# Now, create a `RandomizedSearchCV` instance using the previous model and
# tune the important parameters of the bagging regressor. Find the best
# parameters and check if you are able to find a set of parameters that
# improve the default regressor.
#
# ```{tip}
# You can list the bagging regressor's parameters using the `get_params`
# method.
# ```

# %%
for param in bagging.get_params().keys():
    print(param)
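# %% [markdown]
# A possible solution sketch for the search (the parameter ranges below are
# illustrative choices, not prescribed by the exercise):

# %%
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    "n_estimators": randint(10, 30),
    "max_samples": [0.5, 0.8, 1.0],
    "max_features": [0.5, 0.8, 1.0],
    "base_estimator__max_depth": randint(3, 10),
}
search = RandomizedSearchCV(bagging, param_distributions,
                            n_iter=20, scoring="r2", n_jobs=-1)
search.fit(data_train, target_train)
print(search.best_params_)
print(f"R2 score after tuning: {search.score(data_test, target_test):.2f}")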
def impute_missing_values(df, var_deviation_tolerance=0.97,
                          actual_or_gaussian_residuals='actual',
                          col_floor_ceiling_dict=None, scores=False):
    '''Impute missing values while minimizing distortion of variable
    distribution by creating a bagged model using other variables and
    adding residuals to output values

    Parameters:
        df: dataframe with missing values
        var_deviation_tolerance: target percent deviation from original
            variable distributions
        actual_or_gaussian_residuals: apply residuals to model outputs from
            the actual residual distribution or from a gaussian distribution
            based on the residuals' means and variances
        col_floor_ceiling_dict: a dictionary with the variable name and a
            tuple of the min and max for variables with a finite range. Use
            float('inf') or float('-inf') for variables that are limited in
            only one direction
        scores: return accuracy score of models per variable

    Returns:
        df: df with imputed values
        problems: columns that failed to impute
        column_scores: accuracy scores of imputation model on
            non-missing values
    '''
    df = df.copy()
    columns = df.columns
    type_dict = df.dtypes.to_dict()
    missing_columns = list(
        df.isna().sum()[df.isna().sum() > 0].sort_values().index)
    have_columns = [i for i in columns if i not in missing_columns]
    column_scores = {}
    problems = []
    for col in tqdm.tqdm(missing_columns):
        try:
            percent_missing = df[col].isna().sum() / df.shape[0]
            # number of estimators needed to stay within the deviation
            # tolerance (the original hard-coded .97 here, which ignored
            # the var_deviation_tolerance parameter)
            m = math.ceil(percent_missing /
                          ((1 / var_deviation_tolerance) - 1))
            other_columns = [i for i in columns if i != col]
            na_index = df[df[col].isna() == 1].index
            have_index = [i for i in df.index if i not in na_index]
            na_have_cols = set(
                df.loc[na_index, other_columns].dropna(axis=1).columns)
            have_have_cols = set(
                df.loc[have_index, other_columns].dropna(axis=1).columns)
            both_cols = na_have_cols.intersection(have_have_cols)
            int_df = pd.get_dummies(df.loc[:, both_cols], drop_first=True)
            X_have = int_df.loc[have_index, :]
            y_have = df[col][have_index]
            X_na = int_df.loc[na_index, :]
            if type_dict[col] == 'object':
                le = LabelEncoder()
                y_have = le.fit_transform(y_have)
                df[col][have_index] = y_have
                rf = RandomForestClassifier()
                bagc = BaggingClassifier(base_estimator=rf, n_estimators=m)
                bagc.fit(X_have, y_have)
                column_scores[col] = bagc.score(X_have, y_have)
                resid_preds = bagc.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagc.predict(X_na)
            else:
                bagr = BaggingRegressor(n_estimators=m)
                bagr.fit(X_have, y_have)
                column_scores[col] = bagr.score(X_have, y_have)
                resid_preds = bagr.predict(X_have)
                residuals = y_have - resid_preds
                preds = bagr.predict(X_na)
            if actual_or_gaussian_residuals == 'actual':
                rand_resids = np.random.choice(residuals, len(X_na),
                                               replace=True)
            else:
                rand_resids = np.random.normal(residuals.mean(),
                                               residuals.std(), len(X_na))
            preds = preds + rand_resids
            if type_dict[col] == 'object':
                preds = preds.round()
            if col_floor_ceiling_dict is not None:
                if col in col_floor_ceiling_dict.keys():
                    preds = np.clip(preds, col_floor_ceiling_dict[col][0],
                                    col_floor_ceiling_dict[col][1])
            df[col][na_index] = preds
            have_columns.append(col)
        except Exception:
            problems.append(col)
    if scores == False:
        return df, problems
    else:
        return df, problems, column_scores
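# Hypothetical usage of the imputer above; the dataframe and the column
# names are invented for illustration.
raw_df = pd.DataFrame({
    'age': [34, None, 51, 29, None, 47, 38, 62, 55, 41],
    'income': [52000, 61000, 59000, 48000, 75000,
               58000, 62000, 91000, 67000, 50000],
})
imputed_df, problems, column_scores = impute_missing_values(
    raw_df,
    col_floor_ceiling_dict={'age': (0, float('inf'))},
    scores=True,
)
print(problems, column_scores)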
# Bagging Methods ###########################################################
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)

# usual knn
knn.fit(xtrain, ytrain)
print(knn.score(xtrain, ytrain), knn.score(xtest, ytest))

# full bagging
# n_estimators: number of models; max_samples: resampling fraction (0.5
# draws half the rows, with replacement); max_features: fraction of features
# drawn per model; bootstrap=True: sample rows with replacement;
# bootstrap_features=True: draw features with replacement
bf = BaggingRegressor(knn, n_estimators=100, max_samples=1.0,
                      max_features=1.0, random_state=0)
bf.fit(xtrain, ytrain)
print(bf.score(xtrain, ytrain), bf.score(xtest, ytest))

# bagging with subsampling and feature randomization
bf = BaggingRegressor(knn, n_estimators=500, max_samples=0.5, max_features=0.5)
bf.fit(xtrain, ytrain)
print(bf.score(xtrain, ytrain), bf.score(xtest, ytest))

# effect of estimators
np.random.seed(0)
n_list = [1, 5, 10, 20, 30, 50, 100, 200, 500, 1000]
s = np.zeros((len(n_list), 2))
for i in range(len(n_list)):
    bf = BaggingRegressor(knn, n_estimators=n_list[i],
                          max_samples=0.5, max_features=0.5)
    bf.fit(xtrain, ytrain)
    s[i, 0] = bf.score(xtrain, ytrain)
    s[i, 1] = bf.score(xtest, ytest)
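# A possible follow-up, not in the original snippet: plot the train/test
# curves collected in `s` to visualize the effect of n_estimators.
import matplotlib.pyplot as plt

plt.plot(n_list, s[:, 0], label='train')
plt.plot(n_list, s[:, 1], label='test')
plt.xscale('log')
plt.xlabel('n_estimators')
plt.ylabel('R^2 score')
plt.legend()
plt.show()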