class Regressor(BaseEstimator):
    """Boosted random forest: 100 AdaBoost rounds over 100-tree forests."""

    def __init__(self):
        # Base learner: a deep, wide random forest; AdaBoost stacks 100 of them.
        self.clf = AdaBoostRegressor(
            RandomForestRegressor(n_estimators=100, max_depth=40, max_features=25),
            n_estimators=100)
        # Alternatives previously tried:
        # self.clf_Boost = GradientBoostingRegressor(n_estimators=500, max_features=20)
        # self.clf_Regression = LinearRegression()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def train_learning_model_decision_tree_ada_boost(df):
    """Train a plain decision tree and an AdaBoost-boosted tree on `df`,
    then print train/test MSE for both.  (code taken from sklearn)"""
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth=6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                                      n_estimators=500, learning_rate=0.01,
                                      random_state=1)
    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    # Test-set error.
    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    # Train-set error (to eyeball overfitting).
    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))

    print("MSE tree: %.4f " % mse_tree)
    print("MSE ada: %.4f " % mse_ada)
    print("MSE tree train: %.4f " % mse_tree_train)
    print("MSE ada train: %.4f " % mse_ada_train)
def main():
    """Train an AdaBoost regressor and report train/validation error.

    Returns (rmse_train, rmse_val, logloss_train, logloss_val).
    """
    ab = AdaBoostRegressor(base_estimator=None, n_estimators=50,
                           learning_rate=1.0, loss='exponential',
                           random_state=None)
    ab.fit(X_train, y_train)

    # Evaluation on the training set.
    pred_train = ab.predict(X_train)
    mse_train = mean_squared_error(y_train, pred_train)
    rmse_train = np.sqrt(mse_train)
    # NOTE(review): log_loss expects class probabilities; applying it to raw
    # regression outputs is dubious — confirm this is intended.
    logloss_train = log_loss(y_train, pred_train)

    # Evaluation on the validation set.
    pred_val = ab.predict(X_val)
    mse_val = mean_squared_error(y_val, pred_val)
    rmse_val = np.sqrt(mse_val)
    logloss_val = log_loss(y_val, pred_val)

    # BUG FIX: the original ended with the metric names as bare expressions,
    # which evaluate and discard them.  Report and return them instead.
    print("RMSE train:", rmse_train)
    print("RMSE val:", rmse_val)
    print("log loss train:", logloss_train)
    print("log loss val:", logloss_val)
    return rmse_train, rmse_val, logloss_train, logloss_val
def fit(self, start_date, end_date):
    """Grid-search AdaBoost hyper-parameters per ticker and keep, for each
    ticker, the model with the lowest cross-validation MSE.

    Returns self for chaining.
    """
    for ticker in self.tickers:
        self.stocks[ticker] = Stock(ticker)
    params_ada = [{
        'n_estimators': [25, 50, 100],
        'learning_rate': [0.01, 0.1, 1, 10],
        'loss': ['linear', 'square', 'exponential']
    }]
    params = ParameterGrid(params_ada)
    # Find the split for training and CV.
    mid_date = train_test_split(start_date, end_date)
    for ticker, stock in self.stocks.items():
        X_train, y_train = stock.get_data(start_date, mid_date, fit=True)
        # X_train = self.pca.fit_transform(X_train.values)
        X_train = X_train.values
        X_cv, y_cv = stock.get_data(mid_date, end_date)
        # X_cv = self.pca.transform(X_cv.values)
        X_cv = X_cv.values
        lowest_mse = np.inf
        for i, param in enumerate(params):
            ada = AdaBoostRegressor(**param)
            ada.fit(X_train, y_train.values)
            mse = mean_squared_error(y_cv, ada.predict(X_cv))
            if mse <= lowest_mse:
                # BUG FIX: lowest_mse was never updated, so every candidate
                # passed the test and the *last* grid point — not the best —
                # ended up stored in self.models[ticker].
                lowest_mse = mse
                self.models[ticker] = ada
    return self
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values,learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        # FIX: Python-2 print statements converted to print() calls.
        print("Fitting Ada...")
        # Train on log1p-transformed votes to damp the long tail.
        ada.fit(train_x, np.log(train_y + 1))
        ada_pred = np.exp(ada.predict(pred_x)) - 1
        Votes = ada_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        # create submission csv for Kaggle
        submission_ada = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ada.csv", submission_ada, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost",
                            train_x, np.log(train_y + 1.0))
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost",
                              train_x, np.log(train_y + 1.0),
                              param_name="n_estimators",
                              param_range=[2, 5, 10, 15, 20, 25, 30])
def round2(X_df, featurelist):
    """5-fold cross-validated RMSE of a default AdaBoost regressor on the
    engineered features; returns the per-fold RMSE list."""
    # Set parameters
    model = AdaBoostRegressor()
    y_df = X_df['target']
    n = len(y_df)
    # Perform 5-fold cross validation
    scores = []
    kf = KFold(n, n_folds=5, shuffle=True)
    # Calculate mean absolute deviation for train/test for each fold
    for train_idx, test_idx in kf:
        X_train, X_test = X_df.iloc[train_idx, :], X_df.iloc[test_idx, :]
        # y_train, y_test = y_df[train_idx], y_df[test_idx]
        X_train, X_test = applyFeatures(X_train, X_test, featurelist)
        Xtrain_array, ytrain_array, Xtest_array, ytest_array = dfToArray(X_train, X_test)
        model.fit(Xtrain_array, ytrain_array)
        prediction = model.predict(Xtest_array)
        rmse = np.sqrt(mean_squared_error(ytest_array, prediction))
        scores.append(rmse)
        # FIX: Python-2 print statements converted to print() calls.
        print(rmse)
        print("Finish fold")
    return scores
def Round2(X, y):
    """Try each AdaBoost loss and keep the one with the lowest mean 5-fold
    RMSE; returns {'loss': ..., 'scores': [...]} for the winner."""
    min_score = {}
    for loss in ['linear', 'square', 'exponential']:
        model = AdaBoostRegressor(loss=loss)
        n = len(y)
        # Perform 5-fold cross validation
        scores = []
        kf = KFold(n, n_folds=5, shuffle=True)
        # Calculate RMSE for train/test for each fold
        for train_idx, test_idx in kf:
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            prediction = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, prediction))
            # score = model.score(X_test, y_test)
            scores.append(rmse)
        if len(min_score) == 0:
            min_score['loss'] = loss
            min_score['scores'] = scores
        else:
            if np.mean(scores) < np.mean(min_score['scores']):
                min_score['loss'] = loss
                min_score['scores'] = scores
        # FIX: Python-2 print statements converted to print() calls.
        print("Loss:", loss)
        print(scores)
        print(np.mean(scores))
    return min_score
def predict(tour_data):
    """Vectorize tour data, fit a plain and an AdaBoost-boosted decision tree
    on the 'rating' target, and return both models' in-sample predictions."""
    vec = DictVectorizer()
    # NOTE(review): the parameter is immediately overwritten here — confirm
    # whether `tour_data` should be used instead of get_tour_data().
    tour_data = get_tour_data()
    transformed = vec.fit_transform(tour_data).toarray()
    categories = vec.get_feature_names()
    # 'rating' column is the target; everything else is a feature.
    y = transformed[:, [categories.index('rating')]]
    X = transformed[:, np.arange(transformed.shape[1]) != categories.index('rating')]

    reg_tree = DecisionTreeRegressor()
    addboost_tree = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                      n_estimators=300, random_state=rng)
    # BUG FIX: was `red_tree.fit(...)` (undefined name — typo for reg_tree)
    # and `addboost_tree(X, y)` (missing the .fit call).
    reg_tree.fit(X, y)
    addboost_tree.fit(X, y)

    # Predict
    y_1 = reg_tree.predict(X)
    y_2 = addboost_tree.predict(X)
    # BUG FIX: `prediction` was never defined; return both predictions.
    return y_1, y_2
def predict_volatility_1year_ahead(rows, day, num_days):
    """Predict volatility one year (252 trading days) ahead of `day`.

    Trains on all data up to exactly one year before `day`; the newest
    `num_days` rows from `day` form the prediction feature vector.
    Requires at least (252 + num_days) rows of data after `day`
    (newest data is day=0).
    """
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # Columns used as predictors for each day in the window.
    cols = (7, 8, 9, 10, 11, 12, 13)

    def day_window(start):
        # Flatten `num_days` consecutive rows into one feature vector.
        out = []
        for off in range(num_days):
            r = rows[start + off]
            out.extend(float(r[c]) for c in cols)
        return out

    # Compile training features and targets (target = column 9 one year later).
    feature_sets = []
    value_sets = []
    for ii in range(day + num_days + 252, len(rows) - num_days):
        feature_sets.append(day_window(ii))
        value_sets.append(float(rows[ii - 252][9]))

    num_features = 16  # NOTE(review): unused in the original; kept for parity.
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=3, random_state=rng)
    regr.fit(feature_sets, value_sets)

    # Predict from the window starting at `day`.
    return float(regr.predict([day_window(day)]))
class Regressor(BaseEstimator):
    """AdaBoost (40 rounds) over 500-tree random forests."""

    def __init__(self):
        forest = RandomForestRegressor(n_estimators=500, max_depth=78,
                                       max_features=10)
        self.clf = AdaBoostRegressor(forest, n_estimators=40)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
class Regressor(BaseEstimator):
    """AdaBoost (100 rounds) over a small random forest (10 trees, depth 10)."""

    def __init__(self):
        base_forest = RandomForestRegressor(n_estimators=10, max_depth=10,
                                            max_features=10)
        self.clf = AdaBoostRegressor(base_estimator=base_forest, n_estimators=100)

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def AdaBoost(xTrain, yTrain, xTest, yTest, treeNum):
    """Fit one AdaBoost regressor per estimator count in `treeNum` and return
    (best_n_estimators, its_test_RMSE)."""
    rms = dict()
    for trees in treeNum:
        ab = AdaBoostRegressor(n_estimators=trees)
        ab.fit(xTrain, yTrain)
        yPred = ab.predict(xTest)
        rms[trees] = sqrt(mean_squared_error(yTest, yPred))
    # FIX: dict.iteritems() is Python-2-only; min() with a key also avoids
    # sorting the whole dict just to take its first element.
    bestRegressor, rmse = min(rms.items(), key=operator.itemgetter(1))
    return bestRegressor, rmse
def performAdaBoostReg(train, test, features, output):
    """
    Ada Boost Regression: fit on `train`, plot predicted vs. actual on
    `test`, and return (MSE, R^2) for the test set.
    """
    clf = AdaBoostRegressor()
    clf.fit(train[features], train[output])
    predicted = clf.predict(test[features])
    # Overlay actual (default colour) and predicted (red) series.
    plt.plot(test[output])
    plt.plot(predicted, color='red')
    plt.show()
    return (mean_squared_error(test[output], predicted),
            r2_score(test[output], predicted))
def do_adaboost(filename):
    """Fit AdaBoost on the merged dataset and score the first 200 trips.

    Ideas: create a feature for accelerations / deaccelerations.  The default
    base regressor (decision tree) is kept; extra trees were tried with
    catastrophic results.
    """
    df, Y = create_merged_dataset(filename)
    # ada = AdaBoostRegressor(n_estimators=350, learning_rate=0.05)
    ada = AdaBoostRegressor(n_estimators=500, learning_rate=1)
    # X = df.drop(['driver', 'trip', 'prob_points', 'prob_speed', 'prob_distance', 'prob_acceleration'], 1)
    X = df.drop(['driver', 'trip'], 1)
    ada.fit(X, Y)
    probs = ada.predict(X[:200])
    return pd.DataFrame({'driver': df['driver'][:200],
                         'trip': df['trip'][:200],
                         'probs': probs})
def test_sparse_regression():
    """Check regression with sparse input."""

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=100, n_features=50, n_targets=1,
                                    random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix,
                          dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format.
        # FIX: SVR accepts no `probability` parameter (that is an SVC option);
        # passing it raises TypeError on modern scikit-learn.
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format.
        # FIX: removed the dead chained assignment `dense_classifier =
        # dense_results = ...`; dense_results was overwritten below anyway.
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Every base estimator must have been trained on CSC or CSR data.
        types = [i.data_type_ for i in sparse_classifier.estimators_]
        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
def predict_volatility_1year_ahead(rows, day):
    """Predict volatility one year (252 trading days) ahead of `day`.

    Trains on all data up to exactly one year before `day`; the newest 10
    rows starting one year after `day` form the prediction feature vector.
    Requires at least (252 + 10) rows of data after `day` (newest data is
    day=0).
    """
    num_days = 10

    # enforce that `day` is in the required range
    assert len(rows) >= 252+num_days + day, 'You need to have AT LEAST 252+%d rows AFTER the day index. See predict_volatility_1year_ahead() for details.' % num_days
    assert day >= 0

    # Columns used as predictors for each day in the window.
    cols = (1, 2, 3, 5, 7, 8, 9)

    def window_features(start):
        # Flatten `num_days` consecutive rows into one feature vector.
        feats = []
        for offset in range(num_days):
            r = rows[start + offset]
            feats.extend(float(r[c]) for c in cols)
        return feats

    # Compile features (X) and values (Y); target = column 9 one year later.
    feature_sets = []
    value_sets = []
    value_sets_index = []
    for ii in range(day + 252, len(rows) - num_days):
        feature_sets.append(window_features(ii))
        value_sets.append(float(rows[ii - 252][9]))
        value_sets_index.append([ii - 252])

    # fit
    # regr = linear_model.Lasso(alpha=0.01, fit_intercept=False, normalize=False, max_iter=10000000)
    rng = np.random.RandomState(1)
    regr = AdaBoostRegressor(CustomClassifier(), n_estimators=4, random_state=rng)
    # regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=2, random_state=rng)
    # regr = DecisionTreeRegressor(max_depth=4)
    regr.fit(feature_sets, value_sets)

    # Prediction window starts one year after `day` (note the +252 shift).
    return float(regr.predict([window_features(day + 252)]))
def ada_boost(df, significant_cols, target, cat_cols, num_cols):
    """Grid-search an AdaBoost(decision-tree) regressor, then report CV and
    hold-out metrics.

    Returns (r2, rmse, r2_variance, rmse_variance, r2_validation,
    rmse_validation, params) where the CV values are 3-fold means/variances
    and the validation values come from a 10% hold-out split.
    """
    scaler = StandardScaler()
    encoder = OneHotEncoder(drop='first', sparse=False)
    X = df[significant_cols]
    y = df[target]

    base = DecisionTreeRegressor(max_depth=3, random_state=0)
    estimator = AdaBoostRegressor(base_estimator=base, random_state=0)
    params = {
        'n_estimators': np.arange(5, int(X.shape[0] * 0.1)),
        'learning_rate': np.arange(0.1, 1.1, 0.1),
        'loss': ['linear', 'square', 'exponential'],
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                        random_state=0)

    # One-hot encode categoricals, scale numericals (fitted on train only),
    # then stack the two groups column-wise.
    train_data = np.c_[encoder.fit_transform(X_train[cat_cols]),
                       scaler.fit_transform(X_train[num_cols])]
    test_data = np.c_[encoder.transform(X_test[cat_cols]),
                      scaler.transform(X_test[num_cols])]

    gs = GridSearchCV(estimator, params, scoring='r2', cv=3)
    gs.fit(train_data, y_train)
    estimator = gs.best_estimator_

    r2_cv_scores = cross_val_score(estimator, train_data, y_train,
                                   scoring='r2', cv=3, n_jobs=-1)
    rmse_cv_scores = cross_val_score(estimator, train_data, y_train,
                                     scoring='neg_root_mean_squared_error',
                                     cv=3, n_jobs=-1)
    params = estimator.get_params()
    r2 = np.mean(r2_cv_scores)
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.abs(np.var(rmse_cv_scores, ddof=1))

    # Final fit on the whole training split, scored on the hold-out.
    estimator.fit(train_data, y_train)
    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))
    return (r2, rmse, r2_variance, rmse_variance,
            r2_validation, rmse_validation, params)
def train_predict_loan11_2_20(trainData, predictData, maxdepth=50, goaladdress=None):
    """Fit AdaBoost / RandomForest / XGBoost / Bagging regressors on the
    training csv and return one DataFrame with their predictions for the
    prediction csv (columns: uid + one per model)."""
    train_df = pd.read_csv(trainData)
    X_train = np.array(train_df.iloc[:, 2:])
    uid_sum = np.array(train_df.iloc[:, 0:2])
    # Column 1 of the leading pair is the regression target.
    y_train = uid_sum[:, 1]

    rng = np.random.RandomState(1)
    reg_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                                n_estimators=1000, random_state=rng)
    reg_rf = RandomForestRegressor(max_depth=maxdepth, n_estimators=1000,
                                   random_state=rng)
    reg_xgb = xgb.XGBRegressor(max_depth=maxdepth, n_estimators=1000,
                               random_state=1)
    reg_bag = BaggingRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                               n_estimators=1000, random_state=rng)
    # Fit in the same order the models were declared.
    for model in (reg_ada, reg_rf, reg_xgb, reg_bag):
        model.fit(X_train, y_train)

    pred_df = pd.read_csv(predictData)
    X_pred = np.array(pred_df.iloc[:, 1:])
    uid_col = np.array(pred_df.iloc[:, 0:1])

    frames = [pd.DataFrame(uid_col, columns=["uid"]),
              pd.DataFrame(reg_ada.predict(X_pred), columns=["AdaBoost_pre"]),
              pd.DataFrame(reg_rf.predict(X_pred), columns=["RandomFrost_pre"]),
              pd.DataFrame(reg_xgb.predict(X_pred), columns=["XGB"]),
              pd.DataFrame(reg_bag.predict(X_pred), columns=["Bagg"])]
    # pdd.to_csv(goaladdress, index=False)
    return pd.concat(frames, axis=1)
def test_sparse_regression():
    """Check regression with sparse input."""

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit caries data type for later verification."""
            super(CustomSVR, self).fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(n_samples=15, n_features=50, n_targets=1,
                                    random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [
            csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix
    ]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format.
        # FIX: SVR accepts no `probability` parameter (that is an SVC option);
        # passing it raises TypeError on modern scikit-learn.
        sparse_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1).fit(X_train_sparse, y_train)

        # Trained on dense format.
        # FIX: removed the dead chained assignment `dense_classifier =
        # dense_results = ...`; dense_results was overwritten below anyway.
        dense_classifier = AdaBoostRegressor(
            base_estimator=CustomSVR(),
            random_state=1).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Every base estimator must have been trained on CSC or CSR data.
        types = [i.data_type_ for i in sparse_classifier.estimators_]
        assert all([(t == csc_matrix or t == csr_matrix) for t in types])
def AdaBoost(self, args):
    """Adaptive Boosting: fit on (X_train, y_train), predict on X_data, and
    store the scores in self.data['boosting_score'].  Returns self."""
    logger.info("Running Adaptive Boosting ... ")
    # Initialilze the adaboost estimator for the requested mode.
    if args.predictor.lower() == 'classifier':
        from sklearn.tree import DecisionTreeClassifier as dtree
        from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
    elif args.predictor.lower() == 'regressor':
        from sklearn.tree import DecisionTreeRegressor as dtree
        from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
    dtree_model = dtree(max_depth=3)
    if args.predictor.lower() == 'classifier':
        # BUG FIX: this branch previously built an AdaBoostRegressor even
        # though a DecisionTreeClassifier base and AdaBoostClassifier were
        # imported for exactly this case.  (AdaBoostClassifier takes no
        # `loss` parameter, so that argument is dropped.)
        ada_model = AdaBoostClassifier(base_estimator=dtree_model,
                                       n_estimators=20000,
                                       random_state=23)
    elif args.predictor.lower() == 'regressor':
        ada_model = AdaBoostRegressor(base_estimator=dtree_model,
                                      n_estimators=20000,
                                      loss='exponential',
                                      random_state=23,
                                      learning_rate=0.1)
    # Fit ada to the training set
    ada_model.fit(self.X_train, self.y_train)
    # Get the predicted values
    self.y_pred = ada_model.predict(self.X_data)
    ## The inverse logit transform, \mathrm{invlogit}(x) = \frac{1}{1 + \exp(-x)}, is given in R by: plogis(x)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = ada_model
    return self
def test_onnxt_iris_adaboost_regressor_dt(self):
    """ONNX conversion of an AdaBoost(decision-tree) regressor must reproduce
    scikit-learn's predictions on a few iris rows."""
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, __ = train_test_split(X, y, random_state=11)
    y_train = y_train.astype(numpy.float32)

    clr = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                            n_estimators=3)
    clr.fit(X_train, y_train)

    # Keep three rows from each end of the test split.
    X_test = X_test.astype(numpy.float32)
    X_test = numpy.vstack([X_test[:3], X_test[-3:]])
    expected = clr.predict(X_test).astype(numpy.float32)

    model_def = to_onnx(clr, X_train.astype(numpy.float32))
    oinf = OnnxInference(model_def, runtime='python')
    got = oinf.run({'X': X_test})
    self.assertEqualArray(expected, got['variable'].ravel())
def main():
    """Compare a plain decision-tree regressor with an AdaBoost-boosted one
    on the Boston housing data, then plot feature importances."""
    # Load the housing-price data; each sample has 13 input features.
    housing_data = datasets.load_boston()
    # .data holds the inputs, .target the corresponding prices.
    x, y = shuffle(housing_data.data, housing_data.target, random_state=7)
    # Split into training and test sets (80/20).
    num_training = int(0.8 * len(x))
    x_train, y_train = x[:num_training], y[:num_training]
    x_test, y_test = x[num_training:], y[num_training:]

    # Fit a plain decision-tree regression model.
    dt_regressor = DecisionTreeRegressor(max_depth=4)
    dt_regressor.fit(x_train, y_train)
    # Fit a decision-tree model boosted with AdaBoost.
    ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                     n_estimators=400, random_state=7)
    ab_regressor.fit(x_train, y_train)

    # Evaluate the plain decision-tree regressor.
    y_pred_dt = dt_regressor.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_dt)
    evs = explained_variance_score(y_test, y_pred_dt)
    print("\n### Decision Tree performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score = ", round(evs, 2))

    # Evaluate the AdaBoost-improved model.
    y_pred_ab = ab_regressor.predict(x_test)
    mse = mean_squared_error(y_test, y_pred_ab)
    evs = explained_variance_score(y_test, y_pred_ab)
    # BUG FIX: this header previously repeated "Decision Tree performance",
    # mislabelling the AdaBoost results.
    print("\n### AdaBoost performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score = ", round(evs, 2))

    # Plot the relative feature importances.
    # plt.figure(figsize=(10, 8), dpi=100)  # size and resolution, if needed
    plot_feature_importances(dt_regressor.feature_importances_,
                             'Decision Tree regressor',
                             housing_data.feature_names, 1)
    plot_feature_importances(ab_regressor.feature_importances_,
                             'AdaBoost regressor',
                             housing_data.feature_names, 2)
def ada_boost_regressor(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values,learning curve, validation curve
    """
    ada = AdaBoostRegressor(n_estimators=5)
    if get_model:
        # FIX: Python-2 print statements converted to print() calls.
        print("Fitting Ada...")
        # Train on log1p-transformed votes to damp the long tail.
        ada.fit(train_x, np.log(train_y + 1))
        ada_pred = np.exp(ada.predict(pred_x)) - 1
        Votes = ada_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        # create submission csv for Kaggle
        submission_ada = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ada.csv", submission_ada, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    # plot validation and learning curves
    if l_curve:
        print("Working on Learning Curves")
        plot_learning_curve(AdaBoostRegressor(), "Learning curve: Adaboost",
                            train_x, np.log(train_y + 1.0))
    if v_curve:
        print("Working on Validation Curves")
        plot_validation_curve(AdaBoostRegressor(), "Validation Curve: Adaboost",
                              train_x, np.log(train_y + 1.0),
                              param_name="n_estimators",
                              param_range=[2, 5, 10, 15, 20, 25, 30])
def Bayesian(dataTrain, dataTest, TestCol, outputpath):
    """AdaBoost over BayesianRidge for rain prediction.

    Prints several regression metrics, writes the per-row predictions to a
    csv and appends a metrics summary to a text file under `outputpath`.
    """
    modelName = 'RandomForestRegress'
    start_time = datetime.datetime.now()

    X_train = dataTrain.drop('Rain', axis=1)
    y_train = dataTrain['Rain']  # .loc[dataTrain['Rain'] >= 0, 'Rain']
    X_test = dataTest.drop('Rain', axis=1)
    y_test = dataTest['Rain']  # .loc[dataTest['Rain'] >= 0, 'Rain']
    # Restrict to the requested feature columns (drops the date column).
    X_train_Nodate = X_train[TestCol]
    X_test_Nodeate = X_test[TestCol]

    model = AdaBoostRegressor(linear_model.BayesianRidge(), n_estimators=300)
    model.fit(X_train_Nodate, y_train)
    y_predict = model.predict(X_test_Nodeate)

    print("Baysian performance:")
    MAE = round(sm.mean_absolute_error(y_test, y_predict), 2)
    MSE = round(sm.mean_squared_error(y_test, y_predict), 2)
    MedienAE = round(sm.median_absolute_error(y_test, y_predict), 2)
    R2score = round(sm.r2_score(y_test, y_predict), 2)
    print("Mean absolute error =", MAE)
    print("Mean squared error =", MSE)
    print("Median absolute error =", MedienAE)
    print("Explained variance score =",
          round(sm.explained_variance_score(y_test, y_predict), 2))
    print("R2 score =", R2score)

    # Persist the rounded predictions next to their coordinates.
    y_pred = numpy.round(y_predict, 1)
    X_test.loc[:, 'Predict_Rain'] = pandas.Series(y_pred, index=X_test.index)
    outFile = pandas.DataFrame(X_test[['Date', 'Lat', 'Long', 'Predict_Rain']])
    outFile.to_csv(outputpath + 'Result_predict_{0}.csv'.format(modelName),
                   index=False)

    # Append the metrics summary (with-block replaces manual open/close).
    with open(outputpath + 'Result_{0}.txt'.format(modelName), mode='a') as text:
        text.write("Mean absolute error ={0}\n".format(MAE))
        text.write("Mean squared error ={0}\n".format(MSE))
        text.write("Median absolute error ={0}\n".format(MedienAE))
        text.write("R2 score ={0}\n".format(R2score))
        end_time = datetime.datetime.now()
        text.write("Total Time:{0}".format(end_time - start_time))
def test_loansum_predict_0_1(trainpath, trainData, goalpath, goalfile):
    """Compare AdaBoost / RandomForest / GradientBoosting test RMSE against a
    naive three-column-mean baseline and write predictions to a csv."""
    print(str(trainData))
    trainData = pd.read_csv(trainpath + trainData)
    X_feature = np.array(trainData.iloc[:, 0:4])
    X_goal = np.array(trainData.iloc[:, 4:5])
    x_train, x_test, y_train, y_test = train_test_split(X_feature, X_goal,
                                                        train_size=0.7)
    y_train = y_train.reshape(y_train.shape[0],)
    maxdepth = 500
    assert y_train.shape[0] == x_train.shape[0], " 怎么搞的"

    # Column 0 is the uid; columns 1..3 are the actual features.
    rng = np.random.RandomState(1)
    reg_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                                n_estimators=1000, random_state=rng)
    reg_rf = RandomForestRegressor(max_depth=maxdepth, n_estimators=1000,
                                   random_state=rng)
    reg_grad = GradientBoostingRegressor(loss="ls", n_estimators=1000,
                                         max_depth=maxdepth)
    reg_ada.fit(x_train[:, 1:4], y_train)
    reg_rf.fit(x_train[:, 1:4], y_train)
    reg_grad.fit(x_train[:, 1:4], y_train)
    y1 = reg_ada.predict(x_test[:, 1:4])
    y2 = reg_rf.predict(x_test[:, 1:4])
    y3 = reg_grad.predict(x_test[:, 1:4])

    # Per-model RMSE on the hold-out split.
    for y_hat, label in zip((y1, y2, y3), ("Ada", "Ran", "Grad")):
        sq_err = 0
        for i in range(x_test.shape[0]):
            sq_err = sq_err + np.power(y_hat[i] - y_test[i, 0], 2)
        print("{0}'s RMSE :{1}".format(label, math.sqrt(sq_err / x_test.shape[0])))

    # Baseline: mean of the three feature columns.
    baseline_err = 0
    for i in range(x_test.shape[0]):
        baseline_err = baseline_err + np.power(
            (x_test[i, 1] + x_test[i, 2] + x_test[i, 3]) / 3 - y_test[i, 0], 2)
    print("meansum's RMSE:{0}".format(math.sqrt(baseline_err / x_test.shape[0])))
    print()

    out = pd.concat([pd.DataFrame(x_test[:, 0], columns=["uid"]),
                     pd.DataFrame(y_test, columns=["goal"]),
                     pd.DataFrame(y1, columns=["AdaBoost_pre"]),
                     pd.DataFrame(y2, columns=["RandomFrost_pre"]),
                     pd.DataFrame(y3, columns=["Grad"])], axis=1)
    out.to_csv(goalpath + goalfile, index=False)
def AdaBoostRegr(train, test, labels, ground_truth):
    """Fit a boosted decision-tree regressor and score its test predictions
    with RegPrediction."""
    # Candidate grid kept for reference; randomized search is disabled.
    params = {'n_estimators': [50, 100],
              'learning_rate': [0.01, 0.05, 0.1, 0.3, 1],
              'loss': ['linear', 'square', 'exponential']}
    # adaBoostRS = RandomizedSearchCV(AdaBoostRegressor(DecisionTreeRegressor()),
    #                                 param_distributions=params,
    #                                 cv=10, n_iter=50, n_jobs=-1)
    model = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=50)
    model = model.fit(train, labels)
    print(model.feature_importances_)
    print(labels)
    predictions = model.predict(test)
    return RegPrediction(ground_truth, predictions)
def recspre(exstr, predata, datadict, zhe, count=100):
    """Train AdaBoost from a "trees-depth" spec string (e.g. "300-6"),
    predict on `predata`, and plot predicted vs. true values for `count`
    randomly sampled rows."""
    tree, te = exstr.split('-')
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=int(te)),
                              n_estimators=int(tree), learning_rate=0.8)
    model.fit(datadict[zhe]['train'][:, :-1], datadict[zhe]['train'][:, -1])

    # Predict on the evaluation rows (last column is the true value).
    yucede = model.predict(predata[:, :-1])

    # Sample `count` points so the plot stays readable.
    zongleng = np.arange(len(yucede))
    randomnum = np.random.choice(zongleng, count, replace=False)
    yucede_se = list(np.array(yucede)[randomnum])
    yuce_re = list(np.array(predata[:, -1])[randomnum])

    # Top panel: predicted (dashed red) vs. true (blue dots).
    plt.figure(figsize=(17, 9))
    plt.subplot(2, 1, 1)
    plt.plot(list(range(len(yucede_se))), yucede_se, 'r--', label='预测', lw=2)
    plt.scatter(list(range(len(yuce_re))), yuce_re, c='b', marker='.',
                label='真实', lw=2)
    plt.xlim(-1, count + 1)
    plt.legend()
    plt.title('预测和真实值对比[最大树数%d]' % int(tree))

    # Bottom panel: residuals (true - predicted).
    plt.subplot(2, 1, 2)
    plt.plot(list(range(len(yucede_se))),
             np.array(yuce_re) - np.array(yucede_se),
             'k--', marker='s', label='真实-预测', lw=2)
    plt.legend()
    plt.title('预测和真实值相对误差')
    plt.savefig(r'C:\Users\GWT9\Desktop\duibi.jpg')
    return '预测真实对比完毕'
def round1(X, y):
    """Baseline: per-fold RMSE of a default AdaBoost regressor under 5-fold
    shuffled cross-validation."""
    model = AdaBoostRegressor()
    fold_scores = []
    kf = KFold(len(y), n_folds=5, shuffle=True)
    for train_idx, test_idx in kf:
        model.fit(X[train_idx], y[train_idx])
        prediction = model.predict(X[test_idx])
        fold_scores.append(np.sqrt(mean_squared_error(y[test_idx], prediction)))
    return fold_scores
def svm_smooth(data, residual_imf, period):
    """Smooth `data` with an AdaBoost-wrapped SVR trained on rolling windows
    of length period+1 against `residual_imf` targets; the first `period`+1
    points are passed through unchanged."""
    windows = []
    targets = []
    for i in range(period, len(residual_imf) - 20):
        windows.append(data[i - period:i + 1])
        targets.append(residual_imf[i])

    rng = np.random.RandomState(1)
    clf = AdaBoostRegressor(svm.SVR(), n_estimators=1, random_state=rng)
    clf.fit(windows, targets)

    smooth_data = []
    for i in range(len(data)):
        if i <= period:
            # Not enough history for a full window yet; keep the raw value.
            smooth_data.append(data[i])
        else:
            smooth_data.append(clf.predict([data[i - period:i + 1]])[0])
    return smooth_data
def train_model(training, testing, window=5, n=5):
    """Train four regressors, print each model's test MSE, ensemble their
    predictions with fixed weights, and return the result frame with
    trend/pred/pred_date columns filled in."""
    X_train, y_train = prepare_data(training)
    X_test, y_test = prepare_data(testing)

    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    predrf = rf.predict(X_test)
    # FIX: Python-2 print statements converted to print() calls.
    print("mse for random forest regressor: ", mean_squared_error(predrf, y_test))

    gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.025)
    gb.fit(X_train, y_train)
    predgb = gb.predict(X_test)
    print("mse for gradient boosting regressor: ", mean_squared_error(predgb, y_test))

    ## plot feature importance using GBR results
    fx_imp = pd.Series(gb.feature_importances_,
                       index=['bb', 'momentum', 'sma', 'volatility'])
    fx_imp /= fx_imp.max()  # normalize
    # NOTE(review): Series.sort() and .ix below require an old pandas;
    # modern pandas uses sort_values() / .iloc — confirm target version.
    fx_imp.sort()
    ax = fx_imp.plot(kind='barh')
    fig = ax.get_figure()
    fig.savefig("output/feature_importance.png")

    adb = AdaBoostRegressor(DecisionTreeRegressor())
    adb.fit(X_train, y_train)
    predadb = adb.predict(X_test)
    print("mse for adaboosting decision tree regressor: ", mean_squared_error(predadb, y_test))

    scale = StandardScaler()
    scale.fit(X_train)
    X_trainscale = scale.transform(X_train)
    X_testscale = scale.transform(X_test)

    knn = BaggingRegressor(KNeighborsRegressor(n_neighbors=10),
                           max_samples=0.5, max_features=0.5)
    knn.fit(X_trainscale, y_train)
    predknn = knn.predict(X_testscale)
    print("mse for bagging knn regressor: ", mean_squared_error(predknn, y_test))

    # Fixed-weight ensemble of the four models.
    pred_test = 0.1*predrf + 0.2*predgb + 0.1*predadb + 0.6*predknn
    print("mse for ensemble all the regressors: ", mean_squared_error(pred_test, y_test))

    result = testing.copy()
    result.ix[5:-5, 'trend'] = pred_test
    result.ix[10:, 'pred'] = pred_test * result.ix[5:-5, 'IBM'].values
    result.ix[:-5, 'pred_date'] = result.index[5:]
    return result
def boosting(X, y, k_cv):
    """k-fold evaluation of an AdaBoost-wrapped SVR: prints the square root
    of the train score and of r_2 on each fold's test split."""
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    regr = AdaBoostRegressor(base_estimator=SVR(C=40, gamma=0.01),
                             random_state=319, n_estimators=40,
                             learning_rate=0.01, loss="square")
    splits = kfold.split(y)
    for _ in range(k_cv):
        train_index, test_index = next(splits)
        X_trainval = X[train_index, :]
        y_trainval = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        regr.fit(X_trainval, y_trainval)
        print((regr.score(X_trainval, y_trainval))**0.5)
        test_pre = regr.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
class gaussProcess_classifier(Classifier):
    """Gaussian-process price-move predictor.

    In binary mode it thresholds the up-probability at `risk`; otherwise it
    regresses the target value.  Optionally wraps the model in AdaBoost.
    """

    def __init__(self, ticker, inputSize=5, binary=True, risk=0.5,
                 numTrainDays=300, adaboost=False):
        self.type = 'Gaussian Process'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.risk_thresh = risk
        self.adaboost = adaboost
        self.numTrainDays = numTrainDays
        if binary:
            self.clf = GaussianProcessClassifier()
        else:
            self.clf = GaussianProcessRegressor()
        if adaboost:
            # NOTE(review): AdaBoostRegressor wraps the model even when the
            # base is a classifier — confirm this is intended.
            self.clf = AdaBoostRegressor(base_estimator=self.clf, n_estimators=100)

    def trainClf(self, endDay=None, numTrainDays=100):
        # BUG FIX: the default was `endDay=date.today()`, which Python
        # evaluates ONCE at class-definition time, so a long-running process
        # kept training against a stale "today".  Resolve it at call time.
        if endDay is None:
            endDay = date.today()
        X, Y = self.processData(endDay, self.numTrainDays)
        self.fit(X, Y)

    def predict(self, inputArray):
        inputArray = np.array(inputArray)
        inputArray.reshape([1, -1])  # NOTE(review): result discarded; no-op
        if self.binary:
            pred = self.clf.predict_proba(inputArray)
            # Convert up-probability to a 0/1 signal at the risk threshold.
            pred = (np.array(pred)[:, 1] > self.risk_thresh) * 1
        else:
            pred = self.clf.predict(inputArray)
        return pred

    def fit(self, X, Y):
        self.clf.fit(X, Y)
def adaBoost(X_train, X_test, y_train, y_test, Xscaler, yscaler):
    """Fit an AdaBoosted decision-tree regressor and return its test
    predictions mapped back to the original target scale.

    Parameters kept for interface compatibility even where unused:
    `Xscaler` and `y_test` are never read. Returns an (n, 1) array from
    `yscaler.inverse_transform`.

    BUG FIX: the original also fitted a standalone DecisionTreeRegressor,
    predicted with it, and inverse-transformed that prediction — all dead
    work whose result (`y_1`) was never used or returned. Removed.
    """
    rng = np.random.RandomState(1)
    model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=40),
                              n_estimators=3000, random_state=rng)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    # Undo the target scaling so callers receive real-valued predictions.
    return yscaler.inverse_transform(np.array(preds).reshape(-1, 1))
def predict_solution(X_train, y_train, X_test):
    """
    This method uses prepared data and AdaBoost including Decision tree
    to generate predictions from the given test set.
    """
    # Scale both splits with statistics learned from the training set only.
    scaler = MinMaxScaler()
    scaler.fit(X_train.values)
    train_scaled = scaler.transform(X_train.values)
    test_scaled = scaler.transform(X_test.values)

    # AdaBoost over a deep friedman-mse decision tree.
    base_tree = DecisionTreeRegressor(max_depth=30, criterion='friedman_mse')
    booster = AdaBoostRegressor(base_estimator=base_tree,
                                n_estimators=50,
                                learning_rate=0.5,
                                loss='square')
    booster.fit(train_scaled, y_train)
    return booster.predict(test_scaled)
class k_meansCluster(Classifier):
    """Two-cluster K-means model for a single ticker, optionally AdaBoosted."""

    def __init__(self, ticker, inputSize=5, binary=True, adaboost=False):
        self.type = 'K-Means'
        self.ticker = ticker
        self.days = inputSize
        self.inputSize = inputSize
        self.binary = binary
        self.adaboost = adaboost
        self.clf = KMeans(n_clusters=2, random_state=0)
        if self.adaboost:
            # NOTE(review): KMeans is not a regressor; AdaBoostRegressor over
            # it fails at fit time in scikit-learn — confirm intent.
            self.clf = AdaBoostRegressor(base_estimator=self.clf, n_estimators=100)

    def predict(self, inputArray):
        """Return cluster assignments for `inputArray`."""
        inputArray = np.array(inputArray)
        # BUG FIX: ndarray.reshape returns a new array — the original call's
        # result was discarded. Promote only 1-D single samples so existing
        # 2-D callers are unaffected.
        if inputArray.ndim == 1:
            inputArray = inputArray.reshape(1, -1)
        pred = self.clf.predict(inputArray)
        return pred

    def fit(self, X, Y):
        self.clf.fit(X, Y)
def test_my_select_feature_train21_23():
    """Build a training matrix from three monthly loan CSVs, fit an
    AdaBoosted decision tree on the loan-amount target, and write
    (uid, goal, prediction) rows to a CSV.

    Cleanup: removed the unused `filename21` tuple and the unused
    `columns` name-building loop from the original.
    """
    filename = ("train_data_loan11_21.csv",
                "train_data_loan11_22.csv",
                "train_data_loan11_23.csv")
    maxfeaturenum = 20
    # Placeholder zero row so np.concatenate has a seed; deleted below.
    trainData = np.zeros((1, maxfeaturenum + 2))
    print(trainData.shape)
    path = "./temporaryData/"
    for fn in filename:
        Datarray, Datauid = feature_Select_del_similar(path, filename=fn)
        # Column layout: [uid columns | selected feature columns].
        trainDatatem = np.array(np.concatenate([Datauid, Datarray], axis=1))
        trainData = np.concatenate([trainData, trainDatatem])
    trainData = np.delete(trainData, 0, 0)  # drop the placeholder row
    print(trainData.shape)
    # Columns 0-1 are identifiers/targets, the rest are features.
    X_train, X_test, y_train, y_test = train_test_split(
        trainData[:, 2:], trainData[:, 0:2], test_size=0.3)
    print(X_train.shape)
    print(y_test.shape)
    print(y_train[:, 1])
    maxdepth = 20
    rng = np.random.RandomState(1)
    reg2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=maxdepth),
                             n_estimators=300, random_state=rng)
    reg2.fit(X_train, y_train[:, 1])  # column 1 is the regression target
    y2 = reg2.predict(X_test)
    pd1 = pd.DataFrame(y_test, columns=["uid", "goal"])
    pd2 = pd.DataFrame(y2, columns=["pre"])
    pdd = pd.concat([pd1, pd2], axis=1)
    pdd.to_csv(path + "train_data_loan11_21_fs.csv", index=False)
def xgb_train(x_train, x_label, x_test): model = 'xgb' #model = 'adaboost' #if model.count('xgb') >0: params = {} params["objective"] = "reg:linear" params["eta"] = 0.005 # [0,1] params["min_child_weight"] = 6 params["subsample"] = 0.7 params["colsample_bytree"] = 0.7 params["scale_pos_weight"] = 1.0 params["silent"] = 1 params["max_depth"] = 9 if config.nthread > 1: params["nthread"] = 1 num_rounds = 10000 xgtrain = xgb.DMatrix(x_train, label=x_label) xgval = xgb.DMatrix(x_test) #train using early stopping and predict watchlist = [(xgtrain, "train")] #model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120, feval=gini_metric) model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120) pred1 = model.predict( xgval ) #clf = RandomForestRegressor() #clf = LogisticRegression() #clf = GradientBoostingRegressor() clf = AdaBoostRegressor( ExtraTreesRegressor(max_depth=9), n_estimators=200 ) clf.fit(x_train, x_label) pred2 = clf.predict(x_test) #pred = pred1 * pred2 / (pred1 + pred2) #pred = 0.7 * (pred1**0.01) + 0.3 * (pred2**0.01) #pred = (pred1.argsort() + pred2.argsort()) / 2 pred = 0.6 * pred1 + 0.4 * pred2 return pred
def train_and_predict_adab_stacked_gbr (train, labels, test, feature_names = None) :
    """Train an AdaBoost ensemble whose base model is a
    GradientBoostingRegressor and return predictions for `test`.

    Relies on module-level `gridSearch` and `randomState` being in scope.
    `feature_names` is only used for the importance report.
    """
    print ("Training ADABoost with GBR as base model")
    # NOTE(review): time.clock() was removed in Python 3.8; this file appears
    # to target an older interpreter.
    t0 = time.clock()
    if (gridSearch) :
        params_dict = {'adab__learning_rate' : [0.1, 0.3]}
        # NOTE(review): the GridSearchCV construction is commented out, so on
        # this branch `model` is never bound and `model.fit` below raises
        # NameError — confirm whether to restore the search (needs `regr` and
        # `kfObject` in scope) or drop the branch.
        #model = GridSearchCV(regr, params_dict, n_jobs = 3, cv = kfObject, verbose = 10, scoring = 'mean_squared_error')
    else :
        base = GradientBoostingRegressor(random_state = randomState, learning_rate = 0.1, n_estimators = 1500, max_depth = 6, subsample = 0.95, max_features = 1, verbose = 10)
        # Few boosting rounds with a tiny learning rate: the heavy lifting is
        # done by the 1500-tree GBR base model.
        model = AdaBoostRegressor(random_state = randomState, base_estimator = base, n_estimators = 3, learning_rate = 0.005)
    model.fit(train, labels)
    print ("Model fit completed in %.3f sec " %(time.clock() - t0))
    if (gridSearch) :
        print ("Best estimator: ", model.best_estimator_)
        print ("Best MSLE scores: %.4f" %(model.best_score_))
        print ("Best RMSLE score: %.4f" %(math.sqrt(-model.best_score_)))
    else :
        float_formatter = lambda x: "%.4f" %(x)
        print ("Feature importances: ", sorted(zip([float_formatter(x) for x in model.feature_importances_], feature_names), reverse=True))
    return model.predict(test)
def test_regression_toy():
    # Check regression on a toy dataset (seeded for determinism).
    model = AdaBoostRegressor(random_state=0)
    model.fit(X, y_regr)
    predictions = model.predict(T)
    assert_array_equal(predictions, y_t_regr)
# Shuffle the Boston housing data and split 70/30 into train/test.
X, y = shuffle(boston.data, boston.target)
offset = int(0.7*len(X))
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# We will vary the number of base learners from 2 to 300
max_learners = arange(2, 300)
train_err = zeros(len(max_learners))
test_err = zeros(len(max_learners))

for i, l in enumerate(max_learners):
    # Set up a Adaboost Regression Learner with l base learners
    regressor = AdaBoostRegressor(n_estimators=l)

    # Fit the learner to the training data
    regressor.fit(X_train, y_train)

    # Find the MSE on the training set
    train_err[i] = mean_squared_error(y_train, regressor.predict(X_train))

    # Find the MSE on the testing set
    test_err[i] = mean_squared_error(y_test, regressor.predict(X_test))

# Plot training and test error as a function of the number of base learners
pl.figure()
pl.title('Boosting: Performance vs Number of Learners')
pl.plot(max_learners, test_err, lw=2, label = 'test error')
pl.plot(max_learners, train_err, lw=2, label = 'training error')
pl.legend()
pl.xlabel('Number of Learners')
# BUG FIX: the curves are mean squared error (no sqrt is taken above), but the
# axis was labelled 'RMS Error' — corrected to match what is plotted.
pl.ylabel('Mean Squared Error')
pl.show()
def test_regression_toy():
    """Check regression on a toy dataset."""
    clf = AdaBoostRegressor()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
month=i%12 Train_X.append([month,i//12,1 if month==0 else 0, 1 if month==1 else 0,1 if month==2 else 0, 1 if month==3 else 0,1 if month==4 else 0, 1 if month==5 else 0,1 if month==6 else 0, 1 if month==7 else 0,1 if month==8 else 0, 1 if month==9 else 0,1 if month==10 else 0, 1 if month==11 else 0,1 if month==12 else 0]) Test_X=Train_X[-12:] Train_X=Train_X[:-12] clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 3), n_estimators = 37, learning_rate = 2).fit(Train_X,Train_Y) Test_Y=clf.predict(Test_X)-90000 if local: filename = "SampleOutput.txt" f = open(filename) dtot=0.0 k=0 for x in Test_Y: actual=int(f.readline()) d=(abs(actual-int(x))/float(actual))/12 print int(x),actual,d dtot+=d print 2.5*max(40-(dtot*100),0) else: for x in Test_Y:
#!/usr/bin/env python2 # -*- coding: utf-8 -*- """ Created on Sat Mar 25 17:19:04 2017 @author: carrey """ import numpy as np from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor import matplotlib.pyplot as plt from feature_format import feature_format x_train, x_test, y_train, y_test = feature_format(task = 2) rng = np.random.RandomState(1) regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 21), n_estimators=11, random_state = rng) regr.fit(x_train, y_train) y_pred = regr.predict(x_test) mape = np.mean(np.abs((y_pred - y_test)/y_test)) print x_test print mape print y_pred,y_test
# Build a noisy 1-D regression target: sum of two sinusoids plus Gaussian noise.
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])

# Fit regression model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

single_tree = DecisionTreeRegressor(max_depth=4)
boosted_trees = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                                  n_estimators=300, random_state=rng)
single_tree.fit(X, y)
boosted_trees.fit(X, y)

# Predict on the training grid for plotting.
pred_single = single_tree.predict(X)
pred_boosted = boosted_trees.predict(X)

# Plot the results: samples, single tree, and boosted ensemble.
plt.figure()
plt.scatter(X, y, c="k", label="training samples")
plt.plot(X, pred_single, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, pred_boosted, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
def decision_tree(X, y1, y2, y3):
    """Train five tree ensembles separately on registered (y2) and casual (y3)
    demand, then print the RMSLE of each summed prediction against the total
    demand y1 on the second half of the data.

    NOTE(review): min_samples_split=1 is only valid in old scikit-learn
    (modern versions require >= 2); this file targets that legacy API.
    """
    n, _ = X.shape
    nTrain = int(0.5*n) #training on 50% of the data
    Xtrain = X[:nTrain,:]
    ytrain = y1[:nTrain]  # NOTE(review): assigned but never used below
    ytrain_registered = y2[:nTrain]
    ytest_registered = y2[nTrain:]  # NOTE(review): assigned but never used below
    ytrain_casual = y3[:nTrain]
    ytest_casual = y3[nTrain:]  # NOTE(review): assigned but never used below
    Xtest = X[nTrain:,:]
    ytest = y1[nTrain:]
    #regular
    clf_1 = DecisionTreeRegressor(max_depth=None)
    clf_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=None), n_estimators=500)
    clf_4 = RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_5 = ExtraTreesRegressor(n_estimators=500, max_depth=None, min_samples_split=1, random_state=0)
    clf_3 = GradientBoostingRegressor(n_estimators=500, max_depth=None, random_state=0)
    print "finished generating tree"
    # First pass: fit every model on the registered-user target.
    clf_1.fit(Xtrain, ytrain_registered)
    clf_2.fit(Xtrain, ytrain_registered)
    clf_3.fit(Xtrain, ytrain_registered)
    clf_4.fit(Xtrain, ytrain_registered)
    clf_5.fit(Xtrain, ytrain_registered)
    print 'Finished fitting'
    dt_regular = clf_1.predict(Xtest)
    ada_regular = clf_2.predict(Xtest)
    grad_regular = clf_3.predict(Xtest)
    rf_regular = clf_4.predict(Xtest)
    et_regular = clf_5.predict(Xtest)
    #casual
    print "finished generating tree"
    # Second pass: the same estimator objects are refit on the casual target,
    # overwriting the registered-target fits.
    clf_1.fit(Xtrain, ytrain_casual)
    clf_2.fit(Xtrain, ytrain_casual)
    clf_3.fit(Xtrain, ytrain_casual)
    clf_4.fit(Xtrain, ytrain_casual)
    clf_5.fit(Xtrain, ytrain_casual)
    print 'Finished fitting'
    dt_casual = clf_1.predict(Xtest)
    ada_casual = clf_2.predict(Xtest)
    grad_casual = clf_3.predict(Xtest)
    rf_casual = clf_4.predict(Xtest)
    et_casual = clf_5.predict(Xtest)
    # NOTE(review): importances come from the last (casual) random-forest fit.
    feature_imps = clf_4.feature_importances_
    # Registered + casual predictions are summed to approximate total demand.
    print "regular decision tree"
    print rmsle(ytest, dt_regular + dt_casual)
    print "boosted decision tree"
    print rmsle(ytest, ada_regular + ada_casual)
    print "gradient tree boosting"
    print rmsle(ytest, grad_regular + grad_casual)
    print "random forest classifier"
    print rmsle(ytest, rf_regular + rf_casual)
    print "extra trees classifier"
    print rmsle(ytest, et_casual + et_regular)
    print "feature importances"
    print feature_imps
if s == current: # test X_predict.append(v_features[id]) Y_predict.append(v_map[id]) else: # train X_train.append(v_features[id]) Y_train.append(v_map[id]) assert len(X_train) == len(Y_train) assert len(X_predict) == len(Y_predict) X_train = np.array(X_train) Y_train = np.array(Y_train) X_predict = np.array(X_predict) Y_predict = np.array(Y_predict) # X_train = scale(X_train, axis=0) # X_predict = scale(X_predict, axis=0) regr = AdaBoostRegressor(n_estimators=150, learning_rate=0.1) # regr = SVR(C=0.02, epsilon=0.5) regr.fit(X_train, Y_train) y = regr.predict(X_predict) scores_mse[current] = mean_squared_error(Y_predict, y) scores_r[current] = pearsonr(Y_predict, y)[0] # print(sum(scores) / 5) print(scores_mse, sum(scores_mse) / 5) print(scores_r, sum(scores_r) / 5)
### visualization code (prettyPicture) to show you the decision boundary from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostRegressor from sklearn.metrics import accuracy_score from sklearn.metrics import r2_score # abc = AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, # algorithm='SAMME.R', random_state=None) # # abc.fit(features_train, labels_train) # predicted = abc.predict(features_test) # accuracy = accuracy_score(labels_test, predicted) # print accuracy abr = AdaBoostRegressor(base_estimator=None, n_estimators=500, learning_rate=1.0, loss='linear', random_state=None) abr.fit(features_train, labels_train) predicted_test = abr.predict(features_test) test_score = r2_score(labels_test, predicted_test) print test_score try: prettyPicture(abr, features_test, labels_test) except NameError: pass