def ada():
    print("----------------------Ada----------------------------")
    t = DecisionTreeRegressor(max_depth=7, criterion='mse')
    ada = AdaBoostRegressor(base_estimator=t, n_estimators=150, random_state=seed)
    ada.fit(Xtrain, Ytrain)
    valiada = cross_val_score(ada, Xvalid, Yvalid, cv=kf, verbose=1, n_jobs=-1)
    test_score = ada.score(Xvalid, Yvalid)
    Y = ada.predict(Xvalid)
    print(test_score)
    # squared=False makes mean_squared_error return the root, so label it RMSE
    print("RMSE:", mean_squared_error(Yvalid, Y, squared=False))
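# In newer scikit-learn the snippet above no longer runs as written:
# `base_estimator` was renamed to `estimator`, `criterion='mse'` to
# 'squared_error', and `squared=False` gave way to root_mean_squared_error.
# A self-contained sketch of the same model against the modern API (assumes
# scikit-learn >= 1.4; the synthetic data stands in for the original splits):
from sklearn.datasets import make_regression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=500, noise=10.0, random_state=0)
Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(X, y, random_state=0)

tree = DecisionTreeRegressor(max_depth=7, criterion="squared_error")
ada = AdaBoostRegressor(estimator=tree, n_estimators=150, random_state=0)
ada.fit(Xtrain, Ytrain)
print(ada.score(Xvalid, Yvalid))  # R^2 on the validation split
print("RMSE:", root_mean_squared_error(Yvalid, ada.predict(Xvalid)))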
def test_boston():
    # Check consistency on dataset boston house prices.
    reg = AdaBoostRegressor(random_state=0)
    reg.fit(boston.data, boston.target)
    score = reg.score(boston.data, boston.target)
    assert score > 0.85

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
def test_diabetes(loss):
    # Check consistency on dataset diabetes.
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)
    score = reg.score(diabetes.data, diabetes.target)
    assert score > 0.6

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        if base_estimator is None:
            estimator_impl = None
        else:
            estimator_impl = _FitSpecProxy(base_estimator)
        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = _FitSpecProxy(
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
def DecisionTreeAdaBoost(X_train, y_train, X_test, y_test):
    # Create the plain and boosted decision tree regressors
    tree_1 = DecisionTreeRegressor()
    tree_2 = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200,
                               learning_rate=.1)

    # Train the models using the training sets
    tree_1.fit(X_train, y_train)
    tree_2.fit(X_train, y_train)

    # Score the decision tree model
    tree_1.score(X_test, y_test)
    # Score the boosted decision tree model
    boosted_tree_score = tree_2.score(X_test, y_test)

    # Make predictions using the testing set
    tree_1_pred = tree_1.predict(X_test)
    tree_2_pred = tree_2.predict(X_test)

    # The root mean squared error
    tree2RMSE = sqrt(mean_squared_error(y_test, tree_2_pred))
    print("Root mean squared error: %.2f" % tree2RMSE)
    # The mean absolute error
    print("Mean absolute error: %.2f" % mean_absolute_error(y_test, tree_2_pred))
    # R^2 (coefficient of determination): 1 is perfect prediction
    print('R-squared decision tree: %.2f' % r2_score(y_test, tree_2_pred))

    # NOTE: relies on a module-level `X` DataFrame for the feature names
    features = X.columns
    importances = tree_2.feature_importances_
    indices = np.argsort(importances)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')
    plt.show()

    plt.scatter(y_test, tree_1_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Decision Tree Predicted vs Actual')
    plt.show()
    chart_regression(tree_1_pred, y_test, 'Decision tree')

    plt.scatter(y_test, tree_2_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Boosted Decision Tree Predicted vs Actual')
    plt.show()
    chart_regression(tree_2_pred, y_test, 'Adaboost + DT')

    return boosted_tree_score, tree2RMSE
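# The function above calls a `chart_regression` helper that is not defined in
# this section. A minimal, hypothetical stand-in (an assumption about its
# behavior, not the original code): overlay predicted and actual values by
# sample index.
import matplotlib.pyplot as plt
import numpy as np

def chart_regression(pred, y, title):
    idx = np.arange(len(y))
    plt.plot(idx, np.asarray(y), label='actual')
    plt.plot(idx, np.asarray(pred), label='predicted')
    plt.title(title)
    plt.legend()
    plt.show()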
def ApplyAdaBoostRegressor(self, train, test, cross_validation, full_train, config):
    ABR = AdaBoostRegressor(loss=config['loss'],
                            n_estimators=config['n_estimators'])
    target_train = train[['Hazard']]
    cross_validation_test = cross_validation[['Hazard']]
    prepared_train = train[train.columns.difference(['Id', 'Hazard'])]
    print("prepared_train meta")
    print("shape", prepared_train.shape)
    print(prepared_train.head(3))

    ABR.fit(prepared_train, target_train)
    dt = ABR.predict(test[test.columns.difference(['Id'])])

    print("prediction score on cross validation")
    print(ABR.score(
        cross_validation[cross_validation.columns.difference(['Id', 'Hazard'])],
        cross_validation_test))
    dt_cv = ABR.predict(
        cross_validation[cross_validation.columns.difference(['Id', 'Hazard'])])

    test['Hazard'] = self.clipForecastValue(dt)
    cross_validation['predicted_Hazard'] = self.clipForecastValue(dt_cv)

    names = prepared_train.columns.values
    print("sorted feature importance")
    print(sorted(zip(map(lambda x: round(x, 4), ABR.feature_importances_), names),
                 reverse=True))

    # compute the normalized Gini score
    print("the Gini score")
    print(self.gini_normalized(
        np.ravel(cross_validation[['Hazard']]),
        np.ravel(cross_validation[['predicted_Hazard']])))
    return test, cross_validation
def adaboost():
    # train = genfromtxt(open('./data/PCA_train_scored.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # house_prices = genfromtxt(open('./data/train_scored_y.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # test_data = genfromtxt(open('./data/PCA_test_scored.csv', 'r'), delimiter=',', dtype='f8')[1:]
    train = genfromtxt(open(
        './data/feature_engineering_test/filtered_train_new_scored.csv', 'r'),
        delimiter=',', dtype='f8')[1:]
    house_prices = genfromtxt(open('./data/train_scored_y.csv', 'r'),
                              delimiter=',', dtype='f8')[1:]
    test = genfromtxt(open(
        './data/feature_engineering_test/filtered_test_new_scored.csv', 'r'),
        delimiter=',', dtype='f8')[1:]
    # train_data = genfromtxt(open('./data/feature_engineering_test/PCA_train_new_scored.csv', 'r'), delimiter=',', dtype='f8')[1:1320,1:]
    # house_prices_data = genfromtxt(open('./data/train_scored_y.csv', 'r'), delimiter=',', dtype='f8')[1:]
    # test_data = genfromtxt(open('./data/feature_engineering_test/PCA_test_new_scored.csv', 'r'), delimiter=',', dtype='f8')[1:,1:]

    totalCols = 100
    train_data = train[:1320, 1:]
    house_prices_data = house_prices[:1320]
    validation_data = train[1320:, 1:]
    house_prices_validation = house_prices[1320:]
    test_data = test[0:, 1:]

    # Fit regression model
    regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=12),
                               n_estimators=500, loss='square', learning_rate=1)
    regr_2.fit(train_data, house_prices_data)

    # Predict on the validation split
    y_validation = regr_2.predict(validation_data)
    mse = mean_squared_error(house_prices_validation, y_validation)
    print("AdaBoost MSE: %.4f" % mse)
    # score against the true validation labels; scoring against the model's own
    # predictions (as the original did) always yields ~1.0
    print("AdaBoost R^2: %.4f" % regr_2.score(validation_data, house_prices_validation))

    # Predict on the test set
    y_2 = regr_2.predict(test_data)
    writeOutput(y_2, 'AdaBoost')
def adaboost_regression():
    '''
    runs an adaboost regression over the data set
    runs one with just the cleaned data
    we are already certain that cleaned data significantly outperforms raw data
    so we will not waste any more time training models with raw data
    warning: this takes a REALLY long time to run; running it is not
    recommended, especially because the results are not amazing
    '''
    # adaboost parameters
    kFold = 5
    param_grid = {
        'loss': np.array(['linear', 'square', 'exponential']),
        'learning_rate': np.arange(1, 101, 5) / 100,
        'n_estimators': np.arange(40, 400, 20)
    }
    adaboost_grid = GridSearchCV(AdaBoostRegressor(), param_grid, cv=kFold)

    # test using the cleaned data
    x_np, y_np, df = load_data()
    y_np_c, x_np_c, df_c = clean_data(df)
    x_train, x_test, y_train, y_test = split_data(y_np_c, x_np_c)
    adaboost_grid.fit(x_train, y_train)
    best_learn = adaboost_grid.best_params_['learning_rate']
    best_loss = adaboost_grid.best_params_['loss']
    best_n = adaboost_grid.best_params_['n_estimators']
    print("Best learning rate: %f" % best_learn)
    print("Best loss function: %s" % best_loss)
    print("Best n estimators: %f" % best_n)

    # train a model using these best parameters
    adaboost_model = AdaBoostRegressor(n_estimators=best_n,
                                       learning_rate=best_learn,
                                       loss=best_loss)
    adaboost_model.fit(x_train, y_train)
    y_predict = adaboost_model.predict(x_test)
    mse = mean_squared_error(y_predict, y_test)
    r2 = adaboost_model.score(x_test, y_test)
    print("Performance of adaboost regression with removed day labels and normalized")
    print("Mean Squared Error: %f" % mse)
    print("RMSE: %f" % (mse**0.5))
    print("R^2: %f" % r2)
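# The grid above spans 3 losses x 20 learning rates x 18 estimator counts =
# 1080 fits per CV fold, which is why the docstring warns about runtime. A
# common way to cut this down (a sketch, not the author's code) is a
# randomized search over the same space:
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

param_distributions = {
    'loss': ['linear', 'square', 'exponential'],
    'learning_rate': np.arange(1, 101, 5) / 100,
    'n_estimators': np.arange(40, 400, 20),
}
adaboost_search = RandomizedSearchCV(AdaBoostRegressor(), param_distributions,
                                     n_iter=50, cv=5, random_state=0)
# adaboost_search.fit(x_train, y_train)  # same interface as GridSearchCV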
def adaboost_regression():
    '''
    runs an adaboost regression over the data set
    warning: this will probably take a long time to run
    '''
    print("Currently running AdaBoost Regression")
    train_x, train_y, _ = load_data()

    # convert to np array
    train_x = train_x.values
    train_y = train_y.values

    # convert the y values to log
    train_y = log_transform(train_y, "forward")

    # split the data
    x_train, x_test, y_train, y_test = split_data(train_x, train_y)

    # adaboost parameters
    kFold = 5
    param_grid = {
        'loss': np.array(['linear', 'square', 'exponential']),
        'learning_rate': np.arange(1, 101, 5) / 100,
        'n_estimators': np.arange(40, 400, 20)
    }
    adaboost_grid = GridSearchCV(AdaBoostRegressor(), param_grid, cv=kFold)

    # test using the training data
    adaboost_grid.fit(x_train, y_train)
    best_learn = adaboost_grid.best_params_['learning_rate']
    best_loss = adaboost_grid.best_params_['loss']
    best_n = adaboost_grid.best_params_['n_estimators']
    print("Best learning rate: %f" % best_learn)
    print("Best loss function: %s" % best_loss)
    print("Best n estimators: %f" % best_n)

    # train a model using these best parameters
    adaboost_model = AdaBoostRegressor(n_estimators=best_n,
                                       learning_rate=best_learn,
                                       loss=best_loss)
    adaboost_model.fit(x_train, y_train)
    y_predict = adaboost_model.predict(x_test)
    mse = mean_squared_error(y_predict, y_test)
    r2 = adaboost_model.score(x_test, y_test)
    print("Performance of adaboost regression")
    print("Mean Squared Error: %f" % mse)
    print("RMSE: %f" % (mse**0.5))
    print("R^2: %f" % r2)
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        estimator_impl = base_estimator
        if isinstance(estimator_impl, lale.operators.Operator):
            if isinstance(estimator_impl, lale.operators.IndividualOp):
                estimator_impl = estimator_impl._impl_instance()
                wrapped_model = getattr(estimator_impl, "_wrapped_model", None)
                if wrapped_model is not None:
                    estimator_impl = wrapped_model
            else:
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
def generate_model(X_train, X_test, y_train, y_test):
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=100),
                              n_estimators=200,
                              learning_rate=0.01)
    '''
    model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                      max_depth=1, random_state=0, loss='ls')
    model = ExtraTreesRegressor(n_estimators=200)
    '''
    model.fit(X_train, y_train)
    print("model score ", model.score(X_test, y_test))
    return model
def run_tree_regressor():
    from sklearn.tree import DecisionTreeRegressor
    # sklearn.cross_validation was removed; these now live in model_selection
    from sklearn.model_selection import cross_val_score, train_test_split
    import numpy as np
    from sklearn.ensemble import AdaBoostRegressor

    print("running me")
    # load the text files
    X = np.genfromtxt("/home/john/Downloads/kaggle.X1.train.txt", delimiter=",")
    Y = np.genfromtxt("/home/john/Downloads/kaggle.Y.train.txt", delimiter=",")
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    rng = np.random.RandomState(1)
    depth = 35  # current lowest
    for estimators in [130, 235, 300, 345, 450]:
        treeAdaBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=depth),
                                         n_estimators=estimators,
                                         random_state=rng)
        treeAdaBoost.fit(x_train, y_train)
        print("adaboost estimators @ " + str(estimators) + ":",
              treeAdaBoost.score(x_test, y_test))
def Adaboost(Xtrain, Ytrain, Xtest, Ytest):
    """ Apply the adaboost algorithm """
    from sklearn.ensemble import AdaBoostRegressor
    print('\nAdaboost:')
    clf = AdaBoostRegressor(n_estimators=1000).fit(Xtrain, Ytrain)
    # score() returns R^2 for a regressor, not classification accuracy
    print('R^2 (train): {0}'.format(clf.score(Xtrain, Ytrain)))

    # find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    # find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
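# With n_estimators=1000 it is worth checking whether the later boosting
# rounds still help. A sketch that could be appended inside the function
# above, using AdaBoostRegressor.staged_predict, which yields the ensemble's
# predictions after each boosting round:
import numpy as np
from sklearn.metrics import mean_squared_error

test_errors = [mean_squared_error(Ytest, pred)
               for pred in clf.staged_predict(Xtest)]
best_round = int(np.argmin(test_errors)) + 1
print('Lowest test MSE after {0} rounds'.format(best_round))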
def boosting(X, y, k_cv):
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = AdaBoostRegressor(base_estimator=SVR(C=40, gamma=0.01),
                             random_state=319, n_estimators=40,
                             learning_rate=0.01, loss="square")
    valid_split = kfold.split(y)
    for i in range(k_cv):
        split_index = valid_split.__next__()
        test_index = split_index[1]
        y_test = y[test_index]
        trainval_index = split_index[0]
        X_trainval = X[trainval_index, :]
        X_test = X[test_index, :]
        y_trainval = y[trainval_index]
        regr.fit(X_trainval, y_trainval)
        print((regr.score(X_trainval, y_trainval))**0.5)
        test_pre = regr.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
def makeAdaDefaultBaseEstimatorPrediction(n_est):
    global y_t_pred, result
    print("Prediction and #estimators = %s" % (n_est))
    prefix = "%s_AdaBoost_n_est%s_DefaultDecisionTree" % (name, n_est)
    model = AdaBoostRegressor(n_estimators=n_est)
    x1 = x[:, :]      # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    print("Estimator weights: %s..." % model.estimator_weights_)
    bla1 = sorted(enumerate(model.estimator_weights_),
                  key=lambda x: -abs(x[1]))[:5]
    print("Abs-Val largest est-weights: %s..." % bla1)
    plt.clf()
    plt.plot(model.estimator_weights_, "ro")
    plt.title("Most relevant coef:%s" % (bla1))
    plt.savefig(prefix + "_est_weights.png")
    plt.show()
    return prefix, model
def makeAdaLassoPrediction(al, n_est):
    global y_t_pred, result, alpha
    alpha = al
    print("Prediction with alpha = %s and #estimators = %s" % (alpha, n_est))
    prefix = "%s_AdaBoost_Lasso_alpha%s" % (name, alpha)
    model = AdaBoostRegressor(Lasso(alpha=alpha), n_estimators=n_est)
    x1 = x[:, :]      # use all data
    x_t1 = x_t[:, :]  # use all data
    y_t_pred = model.fit(x1, y).predict(x_t1)
    r = model.score(x1, y)
    print("score r = %s" % r)
    print("Estimator weights: %s..." % model.estimator_weights_)
    bla1 = sorted(enumerate(model.estimator_weights_),
                  key=lambda x: -abs(x[1]))[:5]
    print("Abs-Val largest est-weights: %s..." % bla1)
    plt.clf()
    plt.plot(model.estimator_weights_, "ro")
    plt.title("Most relevant coef:%s" % (bla1))
    plt.savefig(prefix + "_est_weights.png")
    plt.show()
    return prefix, model
class _AdaBoostRegressorImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        if isinstance(base_estimator, lale.operators.Operator):
            if isinstance(base_estimator, lale.operators.IndividualOp):
                base_estimator = base_estimator._impl_instance()
                wrapped_model = getattr(base_estimator, "_wrapped_model", None)
                if wrapped_model is not None:
                    base_estimator = wrapped_model
            else:
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
        self._hyperparams = {
            "base_estimator": base_estimator,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
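# A sketch of exercising the wrapper above directly with a plain scikit-learn
# base estimator (bypassing Lale operators). It assumes SKLModel aliases
# sklearn.ensemble.AdaBoostRegressor and that lale is importable, as in the
# module these impl classes come from.
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, noise=5.0, random_state=0)
impl = _AdaBoostRegressorImpl(
    base_estimator=DecisionTreeRegressor(max_depth=3), n_estimators=25)
impl.fit(X, y)
print(impl.score(X, y))  # R^2 on the training data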
class ABRegressor():
    def __init__(self, dataset):
        self.dataset = dataset
        self.adaboost = AdaBoostRegressor(**DEFAULTS[dataset]['ab']['defaults'])
        print("""
        ************************* Ada Boost Regressor ************************
        """)

    def train_and_predict(self, X, y, X_test):
        ''' fit training dataset and predict values for test dataset '''
        self.adaboost.fit(X, y)
        self.adaboost.predict(X_test)

    def score(self, X, X_test, y, y_test):
        ''' Returns the score of Ada Boost by fitting training data '''
        self.train_and_predict(X, y, X_test)
        return self.adaboost.score(X_test, y_test)

    def create_new_instance(self, values):
        return AdaBoostRegressor(**{**values})

    def param_grid(self, is_random=False):
        ''' dictionary of hyper-parameters to get good values for each one of them '''
        # random search only accepts a dict for params, whereas grid search
        # can take either a dict or a list of dicts
        return DEFAULTS[self.dataset]['ab']['param_grid']

    def get_sklearn_model_class(self):
        return self.adaboost

    def __str__(self):
        return "AdaBoostRegressor"
def run_tree_models(X, y):
    '''
    Get an overview of performances of different tree models.
    Tree models: Decision tree, AdaBoost, Bagged tree
    INPUT: Dataframe with features (X) and target variable dataframe (y)
    OUTPUT: Scores of each tree model
    '''
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)
    print('Decision Tree Score: ' + str(dt.score(X_test, y_test)))

    ada = AdaBoostRegressor(LinearRegression())
    ada.fit(X_train, y_train)
    print('AdaBoost Regressor Score: ' + str(ada.score(X_test, y_test)))

    # Train and Score Bagged Tree Regressor (ensemble learner)
    bagged_tree = BaggingRegressor(DecisionTreeRegressor())
    bagged_tree.fit(X_train, y_train)
    print('Bagged Tree Score: ' + str(bagged_tree.score(X_test, y_test)))
def test_adaboostregressor_sample_weight():
    # check that giving weight will have an influence on the error computed
    # for a weak learner
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(base_estimator=LinearRegression(),
                                        n_estimators=1, random_state=0)
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)
from sklearn.tree import DecisionTreeRegressor

## DECISION TREES
regressor = DecisionTreeRegressor(max_leaf_nodes=9072)
regressor.fit(X_train, Y_train)

#%%
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

## BOOSTING
regBoost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=300), loss='square')
regBoost.fit(X_train, Y_train)
# keep the predictions separate from the labels; overwriting Y_test with the
# model's own predictions (as the original did) makes score(X_test, Y_test)
# trivially 1.0
Y_pred = regBoost.predict(X_test)
regBoost.score(X_train, Y_train)
regBoost.score(X_test, Y_test)
Y_pred = Y_pred.astype(int)

#%%
regr_3 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=200), n_estimators=300)
regr_3.fit(X_train, Y_train)
y_3 = regr_3.predict(X_test)

#%%
Y_pred = regressor.predict(X_test)
print('Training Score : ', reg_ridge.score(X_train, y_train))
print('Testing Score : ', reg_ridge.score(X_test, y_test))
print('Mean Square Error:', mean_squared_error(y_test, y_pred_ridge))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_ridge))
print('Root Mean Square Error:', mean_squared_error(y_test, y_pred_ridge)**0.5)
r2_ridge = r2_score(y_test, y_pred_ridge)
r2_ridge = 1 - (((1 - r2_ridge) * (n - 1)) / (n - p - 1))
print('R2 adjusted:', r2_ridge)
r2_scores.append(r2_ridge)

# AdaBoost Regression
reg_ada = AdaBoostRegressor(n_estimators=1000)
reg_ada.fit(X_train, y_train)
y_pred_ada = reg_ada.predict(X_test)
print('5.AdaBoost Regression')
print('Training Score : ', reg_ada.score(X_train, y_train))
print('Testing Score : ', reg_ada.score(X_test, y_test))
print('Mean Square Error:', mean_squared_error(y_test, y_pred_ada))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_ada))
print('Root Mean Square Error:', mean_squared_error(y_test, y_pred_ada)**0.5)
r2_ada = r2_score(y_test, y_pred_ada)
r2_ada = 1 - (((1 - r2_ada) * (n - 1)) / (n - p - 1))
print('R2 adjusted:', r2_ada)
r2_scores.append(r2_ada)

# Gradient Boost Regression
reg_gradient = GradientBoostingRegressor(n_estimators=500, learning_rate=0.1,
                                         max_depth=1, random_state=0,
                                         loss='ls', verbose=1)
reg_gradient.fit(X_train, y_train)
y_pred_gradient = reg_gradient.predict(X_test)
print('6.Gradient Boosting Regression')
print('Training Score : ', reg_gradient.score(X_train, y_train))
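# The script applies the same adjusted-R^2 correction after every model.
# Factored into a helper it is easier to audit (a sketch; n = number of test
# samples and p = number of predictors, as in the surrounding script):
def adjusted_r2(r2, n, p):
    # adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    return 1 - ((1 - r2) * (n - 1)) / (n - p - 1)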
def test_boston():
    # Check consistency on dataset boston house prices.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
        tmpSCR = randForrC.score(testX, yTest)
    else:
        randForrR.fit(trainX, yTrain)
        tmpSCR = randForrR.score(testX, yTest)
    scores['rand Forest'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['rand Forest'][label].append(tTOT)

    print("start adaBoost")
    t0 = time.time()
    if cnt < 2:
        adaBoostC.fit(trainX, yTrain)
        tmpSCR = adaBoostC.score(testX, yTest)
    else:
        adaBoostR.fit(trainX, yTrain)
        tmpSCR = adaBoostR.score(testX, yTest)
    scores['adaBoost'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['adaBoost'][label].append(tTOT)

    t0 = time.time()
    print("start bagging withOUT out-of-bag")
    if cnt < 2:
        bagCoobN.fit(trainX, yTrain)
        tmpSCR = bagCoobN.score(testX, yTest)
    else:
        bagRoobN.fit(trainX, yTrain)
        tmpSCR = bagRoobN.score(testX, yTest)
    scores['bagging (NO out of bag)'][label].append(tmpSCR)
    tTOT = time.time() - t0
    times['bagging (NO out of bag)'][label].append(tTOT)
# In[74]:

from sklearn.ensemble import RandomForestRegressor

# In[75]:

rf = RandomForestRegressor(n_estimators=200, random_state=45)
rf.fit(train_x, train_y)

# In[76]:

pred = rf.predict(test_x)
pred

# In[77]:

from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
model.fit(train_x, train_y)
print(model.score(train_x, train_y))
abpred = model.predict(test_x)
print(abpred)
model.score(test_x, test_y)

# In[78]:

# sklearn.externals.joblib was removed; import joblib directly
import joblib
joblib.dump(abpred, 'abpredsave.obj')

# In[ ]:
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor()
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=8,
                                               min_samples_split=2,
                                               random_state=rnd),
                         n_estimators=n, learning_rate=0.1,
                         random_state=rnd, loss='exponential')
# r = DecisionTreeRegressor(random_state=rnd).get_params('random_state')

regr.fit(training_data, training_labels)
# n_features_ is only available after fitting, so query it here
print('n feat: ', regr.n_features_)
input('ENTER')

# Predict
y = regr.predict(training_data)
z = regr.predict(testing_data)
sy = regr.score(training_data, training_labels)
sz = regr.score(testing_data, testing_labels)
training_scores[i] = sy
testing_scores[i] = sz
# print ' '
# print 'Training scores --> n_est = %0.2f , ts = %0.2f : ' % (n, t), sy
# print 'Testing scores --> n_est = %0.2f , ts = %0.2f: ' % (n, t), sz
# print ' '

scores_z = cross_val_score(regr, testing_data, testing_labels)
scores_y = cross_val_score(regr, training_data, training_labels)
# print ' accuracy training: %0.2f (+/- %0.2f) ' % (scores_y.mean(), scores_y.std() * 2)
# print ' accuracy testing: %0.2f (+/- %0.2f) ' % (scores_z.mean(), scores_z.std() * 2)
                                                    test_size=0.1,
                                                    random_state=42)
X_train

# In[ ]:

# In[40]:

from sklearn.ensemble import AdaBoostRegressor
Ada_reg = AdaBoostRegressor(random_state=42)
Ada_reg.fit(X_train, y_train)
Ada_reg.score(X_train, y_train)

# In[41]:

# call get_params() to see the hyperparameters; without the parentheses this
# cell only displays the bound method
Ada_reg.get_params()

# In[42]:

param_grid_xg = [
    {
        'n_estimators': [45, 50, 55],
        'learning_rate': [0.75, 1, 1.25]
    },
]
                    param_grid=parameters_ada, cv=5,
                    scoring='neg_mean_squared_error')
clf2.fit(X_train, y_train)
print(clf2.best_params_)

regr2 = AdaBoostRegressor(regr1, n_estimators=100, loss='exponential',
                          learning_rate=0.7)
regr2.fit(X_train, y_train)
print(regr2.score(X_test, y_test))

""" Ridge model """
ridge_regr = Ridge()
parameters_ridge = {
    'alpha': [0.25, 0.5, 0.75, 1, 1.25, 2],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']
}
ridge_regr = GridSearchCV(ridge_regr, param_grid=parameters_ridge, cv=5,
                          scoring='neg_mean_squared_error')
ridge_regr.fit(X_train, y_train)
ridge_regr.best_params_
train_Salary, train_Salary_y = preprocess(all_cat, all_num, all_y)

# AdaBoostRegressor with a Ridge base estimator
ridge = Ridge(15)
trainX_Sal, testX_Sal, trainy_Sal, testy_Sal = train_test_split(
    train_Salary, train_Salary_y, test_size=0.1, random_state=1)
clf_Ada = AdaBoostRegressor(n_estimators=10, base_estimator=ridge)
clf_Ada.fit(trainX_Sal, trainy_Sal)
# scores = clf_Ada.score(testXF, testyF)
y_Sal = clf_Ada.predict(testX_Sal)
# print(y_Sal)
sal_pred = scaling(y_Sal)
print(sal_pred)
scores_Sal_C = cross_val_score(clf_Ada, train_Salary, train_Salary_y)
scores_Sal_CV = np.mean(scores_Sal_C)
scores_Sal = clf_Ada.score(testX_Sal, testy_Sal)
print('AdaBoostRegression:', scores_Sal)
print('AdaBoostRegression_cv:', scores_Sal_CV)
print('finished with the mean-Salary (average-salary prediction OK)')

# predict the tradable share capital (fcA) of the stock data
dataXFCA = pd.read_csv('/Users/huanghuaixian/desktop/final.csv', encoding="GBK")
data_cat_df = dataXFCA[[
    'area', 'province', 'city', 'year', 'month', 'day', 'industry'
]].astype(str)
y_data = dataXFCA['fcA']
data_num_df = dataXFCA[['gcA']]
train, y_data = preprocess(data_cat_df, data_num_df, y_data)
trainXF, testXF, trainyF, testyF = train_test_split(train, y_data,
def test_boston():
    """Check consistency on dataset boston house prices."""
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(boston.data, boston.target)
    score = clf.score(boston.data, boston.target)
    assert score > 0.85
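# The Boston housing tests above no longer run as-is: load_boston was removed
# in scikit-learn 1.2. A sketch of the same kind of consistency check ported
# to the California housing data (the threshold is a placeholder, not a value
# verified for this dataset):
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import AdaBoostRegressor

def test_california():
    data = fetch_california_housing()
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(data.data, data.target)
    score = clf.score(data.data, data.target)
    assert score > 0.0  # placeholder threshold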
trainlabel = pd.read_csv("Produce_Data/University_data_cluster.csv")

# use different regression methods
est = AdaBoostRegressor(DecisionTreeRegressor())
# parameters used for label training
col_predic = ["UniversityNo", "Topic", "Year", "Lowest", "Last_Ranking",
              "Average_Ranking"]
df2 = df.merge(trainlabel[col_predic], on=["UniversityNo", "Topic", "Year"])
df2 = df2[~np.isnan(df2.Lowest)]
df2 = df2[~(df2["Average_Ranking"] == 0)]

# the training feature columns
X = df2[["UniversityNo", "Year", "Topic", "Lowest", "Ranking_Scores",
         "Last_Ranking", "Average_Ranking"]]
y = df2.Label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
est.fit(X_train, y_train)
print("----------------")
print("Prediction score: " + str(round(est.score(X_test, y_test) * 1000) / 10) + "%")
print("----------------")

# obtain the prediction label for future ranking prediction; use column
# assignment, since attribute assignment does not create a new DataFrame column
df2['New_Label'] = est.predict(X)

# sum up the enrolled student numbers whose label is smaller than or equal to
# the label; the predicted ranking numbers are mainly based on this parameter
for y in range(2011, 2016):  # no 2010 because it has no average ranking
    for t in range(2):
        s = 0
        while sum(df2.Label >= s):
            df2.loc[(df2.Year == y) & (df2.Topic == t) & (df2.Label >= s),
                    'Plan_Number_Total'] += sum(
                df2.loc[(df2.Year == y) & (df2.Topic == t) &
                        (np.round(df2.Label) == s), 'Plan_Number'])
            s += 1

dfsave = df2
dfsave.to_csv("Produce_Data/University.csv")
# AdaBoost Regression
import numpy as np
from sklearn import datasets
from sklearn.ensemble import AdaBoostRegressor

# load the diabetes dataset
dataset = datasets.load_diabetes()
# fit an AdaBoost model to the data
model = AdaBoostRegressor()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
mse = np.mean((predicted - expected)**2)
print(mse)
print(model.score(dataset.data, dataset.target))
for i in index:
    if scores2[i] < 0.88:
        list_index2.append(list_index[i])
n = len(list_index2)

b_train = True
b_test = True
for i in range(n):
    b_train = b_train & (X_train[:, index_adt] ==
                         np.unique(X_train[:, index_adt])[list_index2[i]])
    b_test = b_test & (X_test[:, index_adt] ==
                       np.unique(X_test[:, index_adt])[list_index2[i]])

reg2 = AdaBoostRegressor(RandomForestRegressor())
reg2.fit(X_train[b_train], y_train[b_train])
reg2.score(X_test[b_test], y_test[b_test])
Qt[b_train] = Qt[b_train] - reg2.predict(X_train[b_train])
Q[b_test] = Q[b_test] - reg2.predict(X_test[b_test])
Q2[b_test] = reg2.predict(X_test[b_test])

for i in range(n):
    s = "Pred/fit_adt_" + str(list_index2[i] + 1) + ".pickle"
    fid = open(s, 'wb')
    pickle.dump(reg2, fid)
    fid.close()

r = AdaBoostRegressor(RandomForestRegressor())
r.fit(X_train, Qt)
r.score(X_test, Q)
# Fit regression models
regr_dt = DecisionTreeRegressor(criterion='mse', max_depth=4)
regr_abdt = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth=4),
    n_estimators=300,
)
regr_dt.fit(X_train_std, y_train)
regr_abdt.fit(X_train_std, y_train)

# compute the test predictions before using them in the error metrics
# (the original computed the MSEs before defining these variables)
y_test_predict_dt = regr_dt.predict(X_test_std)
y_test_predict_abdt = regr_abdt.predict(X_test_std)

Rsquare_dt = regr_dt.score(X_train_std, y_train)
mse_dt = mean_squared_error(y_test.values, y_test_predict_dt)
y_train_predict_dt = regr_dt.predict(X_train_std)

Rsquare_abdt = regr_abdt.score(X_train_std, y_train)
mse_abdt = mean_squared_error(y_test.values, y_test_predict_abdt)
y_train_predict_abdt = regr_abdt.predict(X_train_std)

# In[ ]:

# boosting model (gradient boosting)

# In[43]:

regr_gb = GradientBoostingRegressor(n_estimators=100, max_depth=1, loss='ls')
regr_gb.fit(X_train_std, y_train)
if __name__ == '__main__':
    np.set_printoptions(edgeitems=5)

    # Read dataset
    data = np.genfromtxt("shuffled.csv", delimiter=',', skip_header=1,
                         usecols=range(1, 385))
    reference = np.genfromtxt("shuffled.csv", delimiter=',', skip_header=1,
                              usecols=(385))
    testData = np.genfromtxt("test.csv", delimiter=',', skip_header=1,
                             usecols=range(1, 385))
    validationData = np.genfromtxt("train.csv", delimiter=',', skip_header=1,
                                   usecols=range(1, 385), max_rows=5000)
    validationReference = np.genfromtxt("train.csv", delimiter=',',
                                        skip_header=1, usecols=(385),
                                        max_rows=5000)

    numberOfTrainingData = data.shape[0]
    numberOfFeatures = data.shape[1]
    numberOfTestData = testData.shape[0]
    numberOfVldtData = validationData.shape[0]

    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
    bdt = AdaBoostRegressor(base_estimator=ExtraTreeRegressor(),
                            n_estimators=1000)
    #bdt = RandomForestRegressor(n_estimators=50)
    #bdt = GradientBoostingRegressor()
    bdt.fit(data, reference)
    print("FINISH FITTING")

    predict = bdt.predict(testData).reshape(numberOfTestData, 1)
    score = bdt.score(validationData, validationReference)
    print(score)

    with open('adaboostResult.csv', 'w') as file:
        file.write("id,reference\n")
        for i in range(0, numberOfTestData):
            file.write("%d,%f\n" % (i, predict[i]))
def adaboost_regressor(train_data, train_label, test_data, test_label, parameters):
    min_error = 10000000000
    error = []
    learn_rate = [1e-2, 1e-1, 1, 10, 100, 500, 1000]
    n_est = [20, 40, 60, 80, 100]
    comb = list(itertools.product(learn_rate, n_est))
    # print comb
    fin_learn = 0
    fin_est = 0
    for i in range(0, len(comb)):
        regr = AdaBoostRegressor(n_estimators=comb[i][1],
                                 learning_rate=comb[i][0],
                                 random_state=random_state)
        regr.fit(train_data, train_label)
        predict = regr.predict(test_data)
        # reshape to a column vector; a list comprehension works in Python 3,
        # where map() returns a lazy iterator
        predict = [[x] for x in predict]
        mse = MSE(np.array(predict), test_label)
        error.append(mse)
        # print mse[0]
        if mse[0] < min_error:
            min_error = mse[0]
            # print comb[i]
            fin_learn = comb[i][0]
            fin_est = comb[i][1]

    plt.figure(figsize=(10, 12))
    plt.title('MSE vs (learning rate, n_estimate)')
    plt.plot(range(len(comb)), error)
    plt.xticks(np.arange(len(comb)), comb, rotation=90)
    plt.xlabel('(learning rate, n_estimate)')
    plt.ylabel('MSE')
    directory = './adaboost/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + 'MSE' + parameters + '.png')
    plt.close()

    regr = AdaBoostRegressor(n_estimators=80, learning_rate=1,
                             random_state=random_state)
    regr.fit(train_data, train_label)
    score = regr.score(test_data, test_label)
    predict = regr.predict(test_data)
    predict = np.array([[x] for x in predict])
    mse = MSE(np.array(predict), test_label)
    print('MSE ' + parameters + ' ' + str(mse[0]))

    df = pd.Series(predict.flatten(), index=test_label.index)
    # Series.append was removed in pandas 2.0; pd.concat is the replacement
    price = pd.concat([train_label, test_label])
    plt.title('AdaBoost on ' + parameters)
    plt.plot(price[1000:-1], label='actual price')
    plt.plot(df, label='predicted price')
    plt.legend(loc='lower right')
    plt.xlabel('Dates')
    plt.ylabel('Price')
    # plt.show()
    directory = './adaboost/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + parameters + '.png')
    plt.close()
    return score
#trn2=train
# sklearn.cross_validation was removed; train_test_split now lives in
# sklearn.model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trn2[feature_cols],
                                                    trn2['Hazard'],
                                                    random_state=1)

# fit the model and predict
# model = AdaBoostRegressor(base_estimator=RandomForestRegressor())
model = AdaBoostRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
coef = giniscore.Gini(y_pred, y_test)
print('Gini coefficient is ', coef)
model.score(X_train, y_train)

# score with 100 rows RF estimator is .92
# gini with default columns, default estimator is 0.12
# gini with 1000 rows all columns, default estimator is 0.188
# gini with 10000 rows all columns, default estimator is 0.1802
# gini with all rows all columns, default estimator is 0.12759
# gini with 100 rows RF estimator is .098
# gini with 1000 rows RF estimator is .0876
# ugh, using LassoCV 32 the score is only .19 to .21, so it must want all the columns
# benchmark is .20; Kaggle public LB says .263387
# < 14 97.5% benchmark is .172
# < 10 90.5% benchmark is .1472
# mean_absolute_error returns the MAE, so name the variable accordingly
mae = mean_absolute_error(y_test, y_pred)
print("The mean absolute error is: $", mae)

# checking r^2
from sklearn.metrics import r2_score
print("r_Score:", r2_score(y_test, y_pred))

bg = BaggingRegressor(RandomForestRegressor(), n_estimators=10)
bg.fit(X_train, y_train)
bg.score(X_train, y_train)
bg.score(X_test, y_test)

# AdaBoost
regr = AdaBoostRegressor()
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

# Decision tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
dt.score(X_test, y_test)

# Gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)
gb.score(X_test, y_test)

# KNN
print "train score: ", dtr.score(data_0am_train_xx,data_0am_train_yy) print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train print "test error: ", np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test rng = np.random.RandomState(1) abr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5), n_estimators=300, random_state=rng) abr.fit(data_0am_train_xx,data_0am_train_yy) data_0am_train_predy = abr.predict(data_0am_train_xx) abr_train_predy = abr.predict(data_0am_train_xx) data_0am_test_predy = abr.predict(data_0am_test_x) abr_test_predy = abr.predict(data_0am_test_x) print "ABR report" print "train score: ", abr.score(data_0am_train_xx,data_0am_train_yy) print "train error: " , np.sqrt(np.mean((data_0am_train_predy-data_0am_train_yy)**2))/nom_train print "test error: ", np.sqrt(np.mean((data_0am_test_predy-data_0am_test_y)**2))/nom_test # print lasso_train_predy.shape combine_train_predy = np.concatenate(( np.atleast_2d(linear_train_predy), np.atleast_2d(lasso_train_predy), np.atleast_2d(DTR_train_predy), np.atleast_2d(svr_train_predy), np.atleast_2d(abr_train_predy)),axis=0) # print combine_train_predy.shape combine_train_predy= np.mean(combine_train_predy,axis=0) # print combine_train_predy.shape