def test_gradient_boosting_estimator_with_smooth_quantile_loss():
    """A Booster over bagged Earth models trained with a smoothed quantile
    loss should beat sklearn's GradientBoostingRegressor (quantile loss) on
    both the exact quantile loss and r2, and its predictions should cover
    approximately the requested quantile of the test targets."""
    np.random.seed(0)
    n_samples, n_features = 15000, 10
    quantile = .8

    # Log-normal targets driven by a random linear signal.
    X = np.random.normal(size=(n_samples, n_features))
    coef = np.random.normal(size=n_features)
    y = np.random.lognormal(np.dot(X, coef))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33333333333333)

    smooth_loss = SmoothQuantileLossFunction(1, quantile, .0001)
    exact_loss = QuantileLossFunction(1, quantile)
    booster = Booster(
        BaggingRegressor(Earth(max_degree=2, verbose=False, use_fast=True,
                               max_terms=10)),
        smooth_loss,
        n_estimators=150,
        stopper=stop_after_n_iterations_without_percent_improvement_over_threshold(3, .01),
        verbose=True)

    # Predicting before fitting must raise NotFittedError.
    assert_raises(NotFittedError, lambda: booster.predict(X_train))

    booster.fit(X_train, y_train)
    boosted_pred = booster.predict(X_test)

    # Reference: sklearn's quantile gradient boosting at the same quantile.
    reference = GradientBoostingRegressor(loss='quantile', alpha=quantile)
    reference.fit(X_train, y_train)
    reference_pred = reference.predict(X_test)

    assert_less(exact_loss(y_test, boosted_pred),
                exact_loss(y_test, reference_pred))
    assert_greater(r2_score(y_test, boosted_pred),
                   r2_score(y_test, reference_pred))
    # Empirical coverage should sit close to the target quantile.
    coverage = np.mean(y_test <= boosted_pred)
    assert_less(np.abs(coverage - quantile), .05)
    assert_greater(booster.score_, 0.)
    assert_approx_equal(booster.score(X_train, y_train), booster.score_)
def get_hypotheses(self, candidate_data, seed_data=None):
    """Fit a bagged GP pipeline on the seed data, score candidates by an
    uncertainty-adjusted stability estimate, and return the indices of the
    top ``n_query`` candidates within ``hull_distance``."""
    X_cand, X_seed, y_seed = self.update_data(candidate_data, seed_data)

    pipeline = Pipeline([('scaler', StandardScaler()), ('GP', self.GP)])
    bag = BaggingRegressor(base_estimator=pipeline,
                           n_estimators=self.n_estimators,
                           max_samples=self.max_samples,
                           bootstrap=self.bootstrap,
                           verbose=True,
                           n_jobs=self.n_jobs)

    # 3-fold CV MAE of the (unbagged) pipeline on the seed data.
    self.cv_score = np.mean(-1.0 * cross_val_score(
        pipeline, X_seed, y_seed, cv=KFold(3, shuffle=True),
        scoring='neg_mean_absolute_error'))

    bag.fit(X_seed, y_seed)

    # Ensemble prediction: mean over members, paired with the smallest
    # (most optimistic) per-point std across members.
    # TODO: make this a static method.
    member_preds, member_stds = [], []
    for member in bag.estimators_:
        pred, std = member.predict(X_cand, return_std=True)
        member_preds.append(pred)
        member_stds.append(std)
    preds = np.mean(np.array(member_preds), axis=0)
    stds = np.min(np.array(member_stds), axis=0)

    # Predicted Hf lowered by alpha * uncertainty.
    expected = preds - stds * self.alpha

    # Update candidate data frame with predictions.
    self.update_candidate_stabilities(expected, sort=True, floor=-6.0)

    # Keep the most stable candidates, up to n_query, within hull_distance.
    within_hull = self.candidate_data[
        self.candidate_data['pred_stability'] < self.hull_distance]
    self.indices_to_compute = within_hull.head(self.n_query).index.tolist()
    return self.indices_to_compute
def evalOne(parameters):
    """Evaluate a bagged SVR with the given hyper-parameters using
    per-location cross-validation; returns the pooled RMSE component
    from rmseEval."""
    observed, predicted = [], []
    for location in locations:
        trainX, testX, trainY, testY = splitDataForXValidation(
            location, "location", data, all_features, "target")

        # Standardise features and target using training statistics only.
        feature_scaler = StandardScaler()
        trainX = feature_scaler.fit_transform(trainX)
        testX = feature_scaler.transform(testX)
        target_scaler = StandardScaler()
        trainY = target_scaler.fit_transform(trainY)
        testY = target_scaler.transform(testY)

        model = BaggingRegressor(
            base_estimator=SVR(kernel='rbf', C=parameters["C"],
                               cache_size=5000),
            max_samples=parameters["max_samples"],
            n_estimators=parameters["n_estimators"],
            verbose=0,
            n_jobs=-1)
        model.fit(trainX, trainY)

        # Undo the target scaling before pooling results across locations.
        prediction = target_scaler.inverse_transform(model.predict(testX))
        testY = target_scaler.inverse_transform(testY)
        observed.extend(testY)
        predicted.extend(prediction)
    return rmseEval(observed, predicted)[1]
def trainSVM(data, columns, targetColumn, parameters):
    """Train a bagged SVR on the columnar *data* and wrap it in an SVMModel.

    data : mapping of column name -> sequence of values (all equal length)
    columns : candidate feature columns; targetColumn is excluded
    targetColumn : name of the column holding the regression target
    parameters : dict with keys "C", "max_samples", "n_estimators"

    Returns an SVMModel holding the fitted regressor and the feature
    column names (in the order used to build each training record).
    """
    # Feature columns are everything except the target.
    modelColumns = [column for column in columns if column != targetColumn]

    # Row-major training matrix: one record per target value.
    modelData = [[data[column][i] for column in modelColumns]
                 for i in range(len(data[targetColumn]))]

    model = BaggingRegressor(
        base_estimator=SVR(kernel='rbf', C=parameters["C"], cache_size=5000),
        max_samples=parameters["max_samples"],
        n_estimators=parameters["n_estimators"],
        verbose=0,
        n_jobs=-1)
    model.fit(modelData, data[targetColumn])
    return SVMModel(model, modelColumns)
output.write("location,observation,prediction\n")
for location in locations:
    print(str(location))

    # Hold out this location; train on the rest.
    trainX, testX, trainY, testY = splitDataForXValidation(
        location, "location", data, all_features, "target")

    # Scale features and target using training statistics only.
    feature_scaler = StandardScaler()
    trainX = feature_scaler.fit_transform(trainX)
    testX = feature_scaler.transform(testX)
    target_scaler = StandardScaler()
    trainY = target_scaler.fit_transform(trainY)
    testY = target_scaler.transform(testY)

    model = BaggingRegressor(
        base_estimator=SVR(kernel='rbf', C=40, cache_size=5000),
        max_samples=4200, n_estimators=10, verbose=0, n_jobs=-1)
    model.fit(trainX, trainY)

    # Report observation/prediction pairs on the original target scale.
    prediction = target_scaler.inverse_transform(model.predict(testX))
    testY = target_scaler.inverse_transform(testY)
    for observed, predicted in zip(testY, prediction):
        output.write(str(location) + "," + str(observed) + ","
                     + str(predicted) + "\n")
print(auto_X.dtype, auto_y.dtype)

def build_auto(regressor, name):
    """Fit *regressor* on the auto dataset, persist the fitted model as a
    pickle, and store its mpg predictions as a CSV — both under *name*."""
    fitted = regressor.fit(auto_X, auto_y)
    store_pkl(fitted, name + ".pkl")
    mpg = DataFrame(fitted.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")

# One export per model type; random_state fixed for reproducible artifacts.
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5),
           "DecisionTreeAuto")
build_auto(BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                                  min_samples_leaf=5),
                            random_state=13, n_estimators=3,
                            max_features=0.5),
           "DecisionTreeEnsembleAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
build_auto(LassoCV(random_state=13), "LassoAuto")
build_auto(LinearRegression(), "LinearRegressionAuto")
build_auto(BaggingRegressor(LinearRegression(), random_state=13,
                            max_features=0.5),
           "LinearRegressionEnsembleAuto")
build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=5),
           "RandomForestAuto")
build_auto(RidgeCV(), "RidgeAuto")
(RidgeCV(), ['predict'], create_regression_problem_1()), (SGDRegressor(), ['predict'], create_regression_problem_1()), (Lasso(), ['predict'], create_regression_problem_1()), (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]), ['predict', 'predict_proba'], create_weird_classification_problem_1()), (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))], transformer_weights={ 'earth': 1, 'earth2': 2 }), ['transform'], create_weird_classification_problem_1()), (RandomForestRegressor(), ['predict'], create_regression_problem_1()), (CalibratedClassifierCV(LogisticRegression(), 'isotonic'), ['predict_proba'], create_weird_classification_problem_1()), (AdaBoostRegressor(), ['predict'], create_regression_problem_1()), (BaggingRegressor(), ['predict'], create_regression_problem_1()), (BaggingClassifier(), ['predict_proba'], create_weird_classification_problem_1()), (GradientBoostingRegressor(verbose=True), ['predict'], create_regression_problem_1(m=100000, n=200)), (XGBRegressor(), ['predict'], create_regression_problem_for_xgb_1()) ] # Create tests for numpy_flat language def create_case_numpy_flat(estimator, methods, fit_data, predict_data, export_predict_data): def test_case(self): model = clone(estimator) model.fit(**fit_data) for method in methods:
from sklearn.linear_model.theil_sen import TheilSenRegressor from sklearn.mixture.dpgmm import VBGMM from sklearn.feature_selection.variance_threshold import VarianceThreshold import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) clf_dict = {'ARDRegression':ARDRegression(), 'AdaBoostClassifier':AdaBoostClassifier(), 'AdaBoostRegressor':AdaBoostRegressor(), 'AdditiveChi2Sampler':AdditiveChi2Sampler(), 'AffinityPropagation':AffinityPropagation(), 'AgglomerativeClustering':AgglomerativeClustering(), 'BaggingClassifier':BaggingClassifier(), 'BaggingRegressor':BaggingRegressor(), 'BayesianGaussianMixture':BayesianGaussianMixture(), 'BayesianRidge':BayesianRidge(), 'BernoulliNB':BernoulliNB(), 'BernoulliRBM':BernoulliRBM(), 'Binarizer':Binarizer(), 'Birch':Birch(), 'CCA':CCA(), 'CalibratedClassifierCV':CalibratedClassifierCV(), 'DBSCAN':DBSCAN(), 'DPGMM':DPGMM(), 'DecisionTreeClassifier':DecisionTreeClassifier(), 'DecisionTreeRegressor':DecisionTreeRegressor(), 'DictionaryLearning':DictionaryLearning(), 'ElasticNet':ElasticNet(), 'ElasticNetCV':ElasticNetCV(),
model_catb = CatBoostRegressor(random_state=2020, loss_function='MAPE', task_type='GPU') # Bagging base_estimator = [model_lgbm, model_catb, model_rf] bagging_params = { 'base_estimator': base_estimator, 'n_estimators': n_estimators, 'max_samples': max_samples, 'max_features': max_features } model_bagging = BaggingRegressor() def random_search(model, params, X_train, y_train, X_val, y_val, i, name=''): print('-' * 100) start_time = datetime.datetime.now() print('Start Time : {}'.format(start_time)) rnd_search = RandomizedSearchCV(model, param_distributions=params, n_iter=100, cv=2, scoring='neg_mean_absolute_error', verbose=2, n_jobs=2, random_state=2020)
def validate(params):
    """Assemble the transformer + estimator pipeline described by *params*
    and evaluate it with cross-validation.

    params keys used: 'transf_type', 'est_type', 'bagging',
    'per_group_regr', 'n_folds', plus the type-specific hyper-parameters
    referenced below (e.g. 'n_pca_components', 'n_bag_estimators').

    Returns the result of cv_test on the assembled pipeline.
    Raises AttributeError for an unknown transformer or estimator type.
    """
    transf_type = params['transf_type']
    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(drop_transform, SimpleImputer())
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(),
                               StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']
    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'],
                                  n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # Bug fix: the original constructed the BaggingRegressor without
        # assigning it, so the 'bagging' option was silently a no-op.
        est = BaggingRegressor(est,
                               n_estimators=params['n_bag_estimators'],
                               max_features=1.,
                               max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq',
                                                'memSize_MB'],
                               n_jobs=1,
                               verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
def moudle_select(X, test_A, y, moudelselect, threshold=False, Rate=False):
    '''
    Train the selected regressor on five random train/test splits, predict
    test_A with each trained model, blend the per-split predictions using
    MSE-derived weights, and write the results to Excel files.

    X : training features
    test_A : data to produce final predictions for
    y : training labels (NOTE: rebound to the blended prediction below)
    moudelselect : which model to use:
        1  XGBRegressor
        2  ensemble.RandomForestRegressor
        3  linear_model.Lasso
        4  LinearRegression
        5  linear_model.BayesianRidge
        6  DecisionTreeRegressor
        7  ensemble.RandomForestRegressor
        8  ensemble.GradientBoostingRegressor
        9  ensemble.AdaBoostRegressor
        10 BaggingRegressor
        11 ExtraTreeRegressor
        12 SVR
        13 MLPRegressor
        other: MLPRegressor
    threshold : if True, keep a split's predictions only when its test MSE
        is at most 0.03; if False, keep every split's predictions
    Rate : unused (kept for interface compatibility)
    '''
    mse = []            # per-kept-split test MSEs
    sum_mse = 0.0       # running total of test MSE over all 5 splits
    # Column ``index`` holds the predictions from split ``index``.
    # NOTE(review): assumes test_A has 100 rows — confirm against caller.
    predict_A = pd.DataFrame(np.zeros((100, 10)))
    for index in range(5):
        # Fresh random split each iteration (no fixed random_state).
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        if (moudelselect == 1):
            # NOTE(review): passing a nested XGBRegressor as ``model=`` looks
            # unintended; ``min_child_weigh`` is likely a typo for
            # ``min_child_weight`` and is silently ignored — verify.
            model = xgb.XGBRegressor(
                model=xgb.XGBRegressor(max_depth=17,
                                       min_child_weigh=5,
                                       eta=0.025,
                                       gamma=0.06,
                                       subsample=1,
                                       learning_rate=0.1,
                                       n_estimators=100,
                                       silent=0,
                                       n_jobs=-1,
                                       objective='reg:linear'))
        elif (moudelselect == 2):
            # Small random forest (25 trees, subsampled features).
            model = ensemble.RandomForestRegressor(
                n_estimators=25,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features=0.95,
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 3):
            model = linear_model.Lasso(alpha=0.1,
                                       max_iter=1000,
                                       normalize=False)
        elif (moudelselect == 4):
            model = LinearRegression(fit_intercept=False,
                                     n_jobs=1,
                                     normalize=False)
        elif (moudelselect == 5):
            model = linear_model.BayesianRidge(alpha_1=1e-06,
                                               alpha_2=1e-06,
                                               compute_score=False,
                                               copy_X=True,
                                               fit_intercept=True,
                                               lambda_1=1e-06,
                                               lambda_2=1e-06,
                                               n_iter=500,
                                               normalize=False,
                                               tol=10,
                                               verbose=False)
        elif (moudelselect == 6):
            model = DecisionTreeRegressor(criterion='mse',
                                          splitter='best',
                                          max_depth=3,
                                          min_samples_split=0.1,
                                          min_samples_leaf=0.1,
                                          min_weight_fraction_leaf=0.1,
                                          max_features=None,
                                          random_state=None,
                                          max_leaf_nodes=None,
                                          presort=False)
        elif (moudelselect == 7):
            # Large random forest (1000 trees) variant of option 2.
            model = ensemble.RandomForestRegressor(
                n_estimators=1000,
                criterion='mse',
                max_depth=14,
                min_samples_split=0.1,
                min_samples_leaf=2,
                min_weight_fraction_leaf=0.0,
                max_features='auto',
                max_leaf_nodes=None,
                min_impurity_split=1e-07,
                bootstrap=True,
                oob_score=False,
                n_jobs=-1,
                random_state=None,
                verbose=0,
                warm_start=False)
        elif (moudelselect == 8):
            model = ensemble.GradientBoostingRegressor(n_estimators=800,
                                                       learning_rate=0.1,
                                                       max_depth=4,
                                                       random_state=0,
                                                       loss='ls')
        elif (moudelselect == 9):
            model = ensemble.AdaBoostRegressor(base_estimator=None,
                                               n_estimators=120,
                                               learning_rate=1,
                                               loss='linear',
                                               random_state=None)
        elif (moudelselect == 10):
            # Default base estimator (a decision tree) bagged 500 times.
            model = BaggingRegressor(base_estimator=None,
                                     n_estimators=500,
                                     max_samples=1.0,
                                     max_features=1.0,
                                     bootstrap=True)
        elif (moudelselect == 11):
            model = ExtraTreeRegressor(criterion='mse',
                                       splitter='random',
                                       max_depth=3,
                                       min_samples_split=0.1,
                                       min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.01,
                                       max_features='auto',
                                       random_state=None,
                                       max_leaf_nodes=None,
                                       min_impurity_split=1e-07)
        elif (moudelselect == 12):
            model = SVR(kernel='rbf',
                        degree=3,
                        gamma='auto',
                        coef0=0.1,
                        tol=0.001,
                        C=1,
                        epsilon=0.1,
                        shrinking=True,
                        cache_size=200,
                        verbose=False,
                        max_iter=-1)
        elif (moudelselect == 13):
            model = MLPRegressor(hidden_layer_sizes=(100, ),
                                 activation='relu',
                                 solver='adam',
                                 alpha=0.0001,
                                 batch_size='auto',
                                 learning_rate='constant',
                                 learning_rate_init=0.001,
                                 power_t=0.5,
                                 max_iter=200,
                                 shuffle=True,
                                 random_state=None,
                                 tol=0.0001,
                                 verbose=False,
                                 warm_start=False,
                                 momentum=0.9,
                                 nesterovs_momentum=True,
                                 early_stopping=False,
                                 validation_fraction=0.1,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)
        else:
            # Fallback: small lbfgs-trained MLP.
            model = MLPRegressor(activation='relu',
                                 alpha=0.001,
                                 solver='lbfgs',
                                 max_iter=90,
                                 hidden_layer_sizes=(11, 11, 11),
                                 random_state=1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("index: ", index, mean_squared_error(y_test, y_pred))
        sum_mse += mean_squared_error(y_test, y_pred)
        #
        #
        if (threshold == False):
            # Keep every split's predictions on test_A.
            # NOTE(review): DataFrame.ix is deprecated — .iloc/.loc preferred.
            y_predict = model.predict(test_A)
            predict_A.ix[:, index] = y_predict
            mse.append(mean_squared_error(y_test, y_pred))
        else:
            # Keep this split only if its test MSE clears the cutoff.
            if (mean_squared_error(y_test, y_pred) <= 0.03000):
                y_predict = model.predict(test_A)
                predict_A.ix[:, index] = y_predict
                mse.append(mean_squared_error(y_test, y_pred))
    # if(Rate==False):
    #     mse_rate = mse / np.sum(mse)
    #     #predict_A = predict_A.ix[:,~(data==0).all()]
    #     for index in range(len(mse_rate)):
    #         y+=predict_A.ix[:,index]*mse_rate[index]
    # NOTE(review): ``y`` (the labels parameter) is rebound here to
    # accumulate the blended prediction — confirm this is intentional.
    y = 0.0
    # Normalise kept MSEs into weights.
    mse = mse / np.sum(mse)
    mse = pd.Series(mse)
    # Pair the largest weights with the smallest-MSE prediction columns:
    # mse_rate_asc holds weights in descending order, indexs lists the
    # prediction columns in ascending-MSE order.
    mse_rate_asc = mse.sort_values(ascending=False)
    mse_rate_asc = mse_rate_asc.reset_index(drop=True)
    mse_rate_desc = mse.sort_values(ascending=True)
    indexs = list(mse_rate_desc.index)
    for index in range(len(mse)):
        y += mse_rate_asc.ix[index] * predict_A.ix[:, indexs[index]]
    print("y_predict_mean: ", y.mean())
    print("y_predict_var: ", y.var())
    y = pd.DataFrame(y)
    # Persist the blended prediction and the raw per-split predictions.
    y.to_excel("H:/java/python/src/machinelearning/test/predict.xlsx",
               index=False)
    predict_A.to_excel(
        "H:/java/python/src/machinelearning/test/predict_testA.xlsx",
        index=False)
    print("Averge mse:", sum_mse / len(mse))