import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error


def RMSE(y_pred, y_true):
    # Root mean squared error; helper assumed by the objective below.
    return mean_squared_error(y_true, y_pred) ** 0.5


def fun_bag_fs(x, *args):
    X, y, flag, n_splits, random_seed = args
    n_samples, n_var = X.shape

    # x[0] selects the base estimator (only the BaggingRegressor default is
    # wired in here) and x[1] sets the number of estimators.
    _estimator = [None, None]
    base_estimator = _estimator[int(round(x[0]))]
    n_estimators = int(round(x[1]))

    clf = BaggingRegressor(random_state=random_seed)
    # NOTE: 'base_estimator' was renamed to 'estimator' in scikit-learn 1.2.
    p = {
        'base_estimator': base_estimator,
        'n_estimators': n_estimators,
    }
    clf.set_params(**p)

    # Remaining decision variables are binary feature-selection flags:
    # a value above 0.5 keeps the corresponding column.
    if len(x) <= 2:
        # No flags supplied: keep every column.
        ft = np.where(np.ones(n_var) > 0.5)
    else:
        ft = np.where(np.array([1 if k > 0.5 else 0 for k in x[2:]]) > 0.5)

    try:
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=int(random_seed))
        y_p = cross_val_predict(clf, X[:, ft].squeeze(), y, cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
    except Exception:
        # Penalize infeasible parameter vectors with a huge objective value.
        y_p = [None]
        r = 1e12

    if flag == 'eval':
        return r
    else:
        clf.fit(X[:, ft].squeeze(), y)
        return {
            'Y_TRUE': y,
            'Y_PRED': y_p,
            'EST_PARAMS': p,
            'PARAMS': x,
            'EST_NAME': 'BAG',
            'ESTIMATOR': clf,
            'ACTIVE_VAR': ft,
            'DATA': X,
            'SEED': random_seed,
        }
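# A minimal usage sketch for fun_bag_fs on synthetic data. The parameter
# vector, the make_regression data, and the flag values below are
# illustrative assumptions, not part of the original snippet; it also assumes
# a scikit-learn version that still accepts 'base_estimator' (< 1.4).
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=6, noise=0.1,
                                 random_state=0)
# Default base estimator, 15 trees, and flags keeping features 0, 2, and 5.
x_demo = [0, 15, 1, 0, 1, 0, 0, 1]
rmse = fun_bag_fs(x_demo, X_demo, y_demo, 'eval', 5, 0)      # objective only
result = fun_bag_fs(x_demo, X_demo, y_demo, 'run', 5, 0)     # fitted model dict
print(rmse, result['EST_PARAMS'], result['ACTIVE_VAR'])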
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor


def run():
    print("Decision Tree Regression started...")

    # Prepare the training data.
    dir_path = ""
    train_file_path = dir_path + "train.csv"
    train_file = read_csv(train_file_path, skiprows=1, header=None)
    train_file = train_file.drop(train_file.columns[0], axis=1)
    train_file = train_file.values
    train_X_temp = train_file[5:50000, :-1]
    train_Y = train_file[6:50001, -1]

    # Combine the previous 5 time steps (8 features each) into one row.
    # Row i of train_X_temp is global row i + 5, so the j-th lag is global
    # row i + 5 - j; indexing the full feature matrix avoids the negative
    # (wrap-around) indices the sliced array would give for the first rows.
    features = train_file[:, :-1]
    train_X = np.zeros((train_X_temp.shape[0], 8 * 5))
    for i in range(train_X.shape[0]):
        for j in range(5):
            for k in range(8):
                train_X[i][j * 8 + k] = features[i + 5 - j][k]

    # Prepare the testing data.
    test_file_name = dir_path + "test2.csv"
    test_file = read_csv(test_file_name, skiprows=1, header=None)
    test_file = test_file.values
    test_X = np.array(test_file[:, :-1])
    test_y = test_file[:, -1]

    # Train and score the model for different numbers of trees.
    estimators = np.arange(10, 100, 10)
    print("\nBagged Decision Tree:")
    bag_reg = BaggingRegressor(DecisionTreeRegressor(), n_jobs=2, random_state=0)
    scores = []
    for n in estimators:
        bag_reg.set_params(n_estimators=n)
        bag_reg.fit(train_X, train_Y)
        score = bag_reg.score(test_X, test_y)
        print(score)
        scores.append(score)

    # Plot the effect of the number of trees on the R^2 score.
    plt.title("Effect of n_estimators")
    plt.xlabel("n_estimators")
    plt.ylabel("score")
    plt.plot(estimators, scores)
    plt.show()
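# The triple loop above fills the lag window one cell at a time; the same
# 40-column matrix can be built with array slicing. A minimal sketch under
# the same assumptions (8 features, 5 lags, rows 5..49999); build_lag_window
# is a hypothetical helper name, and `features` is the matrix from run().
def build_lag_window(features, start=5, stop=50000, n_lags=5):
    # Column block j holds the features lagged by j steps, matching
    # train_X[i][j*8+k] = features[i + start - j][k] in the loop version.
    blocks = [features[start - j:stop - j] for j in range(n_lags)]
    return np.hstack(blocks)

# Usage: train_X = build_lag_window(features)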
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import load_diabetes
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import check_random_state

diabetes = load_diabetes()


def test_parallel_regression():
    # Check that parallel regression matches the sequential results.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    ensemble = BaggingRegressor(
        DecisionTreeRegressor(), n_jobs=3, random_state=0
    ).fit(X_train, y_train)

    # Predictions must not depend on n_jobs.
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    # A model fitted sequentially must agree with the parallel one.
    ensemble = BaggingRegressor(
        DecisionTreeRegressor(), n_jobs=1, random_state=0
    ).fit(X_train, y_train)
    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)
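# A self-contained variant of the check above on synthetic data;
# make_regression stands in for the diabetes fixture and is an assumption
# of this sketch, not part of the original test.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
reg = BaggingRegressor(DecisionTreeRegressor(), n_jobs=2, random_state=0).fit(X, y)
pred_parallel = reg.predict(X)
reg.set_params(n_jobs=1)
pred_serial = reg.predict(X)
# n_jobs only controls how the work is scheduled, never the result.
assert np.allclose(pred_parallel, pred_serial)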