def get_regressor(training_set):
    """
    Estimate the value function V: S -> R with a regression algorithm.
    The training set contains tuples of (state, score).
    """
    clf = ExtraTreeRegressor()
    clf.fit(*zip(*training_set))
    return clf
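# Minimal usage sketch for get_regressor(). Assumes states are fixed-length
# feature vectors; the toy values below are illustrative, not from the
# original code.
training_set = [([0.0, 1.0], 0.5), ([1.0, 0.0], 0.2), ([1.0, 1.0], 0.9)]
value_fn = get_regressor(training_set)
print(value_fn.predict([[0.5, 0.5]]))  # approximate V(s) for an unseen state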
def init_ML_models():
    models = dict()
    models['lr'] = LinearRegression()
    models['lasso'] = Lasso()
    models['ridge'] = Ridge()
    models['en'] = ElasticNet()
    models['huber'] = HuberRegressor()
    models['llars'] = LassoLars()
    models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    # models['knn'] = KNeighborsRegressor(n_neighbors=5)
    models['cart'] = DecisionTreeRegressor()
    models['extra'] = ExtraTreeRegressor()
    models['svmr'] = SVR()
    n_trees = 100
    models['ada'] = AdaBoostRegressor(n_estimators=n_trees)
    models['bag'] = BaggingRegressor(n_estimators=n_trees)
    models['rf'] = RandomForestRegressor(n_estimators=n_trees)
    models['et'] = ExtraTreesRegressor(n_estimators=n_trees)
    models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    return models
def get_models(models=dict()):
    # linear models
    models['lr'] = LinearRegression()
    alpha = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for a in alpha:
        models['lasso-' + str(a)] = Lasso(alpha=a)
    for a in alpha:
        models['ridge-' + str(a)] = Ridge(alpha=a)
    for a1 in alpha:
        for a2 in alpha:
            name = 'en-' + str(a1) + '-' + str(a2)
            models[name] = ElasticNet(alpha=a1, l1_ratio=a2)
    models['huber'] = HuberRegressor()
    models['lars'] = Lars()
    models['llars'] = LassoLars()
    models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
    models['ransac'] = RANSACRegressor()
    models['sgd'] = SGDRegressor(max_iter=1000, tol=1e-3)
    models['theil'] = TheilSenRegressor()
    # non-linear models
    n_neighbors = range(1, 21)
    for k in n_neighbors:
        models['knn-' + str(k)] = KNeighborsRegressor(n_neighbors=k)
    models['cart'] = DecisionTreeRegressor()
    models['extra'] = ExtraTreeRegressor()
    models['svml'] = SVR(kernel='linear')
    models['svmp'] = SVR(kernel='poly')
    c_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    for c in c_values:
        models['svmr' + str(c)] = SVR(C=c)
    # ensemble models
    n_trees = 100
    models['ada'] = AdaBoostRegressor(n_estimators=n_trees)
    models['bag'] = BaggingRegressor(n_estimators=n_trees)
    models['rf'] = RandomForestRegressor(n_estimators=n_trees)
    models['et'] = ExtraTreesRegressor(n_estimators=n_trees)
    models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    print('Defined %d models' % len(models))
    return models
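# Hedged sketch of how a dict like the one returned by get_models() might be
# scored. X, y and the scoring choice are illustrative assumptions, not part
# of the original code.
from sklearn.model_selection import cross_val_score

def evaluate_models(models, X, y):
    results = {}
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=5,
                                 scoring='neg_mean_absolute_error')
        results[name] = scores.mean()
    return results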
def __init__(self, n_estimators=100, type_weight='nearest_neighbors',
             n_neighbors='auto', max_knn_sample="auto", max_samples="auto",
             contamination="auto", max_features=1., bootstrap=False,
             n_jobs=None, behaviour='deprecated', random_state=None,
             verbose=0, warm_start=False, pos_label=None):
    super().__init__(
        base_estimator=ExtraTreeRegressor(max_features=1,
                                          splitter='random',
                                          random_state=random_state),
        # max_features above is the tree's own setting and is unrelated
        # to self.max_features
        bootstrap=bootstrap,
        bootstrap_features=False,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        warm_start=warm_start,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose)

    self.behaviour = behaviour
    self.contamination = contamination
    self.n_neighbors = n_neighbors
    self.max_knn_sample = max_knn_sample
    self.pos_label = pos_label
    self.type_weight = type_weight
    self.nn_ = None
    self.depths_ = None
    self.nn_weight_ = None
def ExtraTree(X_train, X_test, y_train, y_test):
    reg = ExtraTreeRegressor()
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    printMetrics(y_true=y_test, y_pred=y_pred)
    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)

    y_pred = reg.predict(X=X_train)
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)
    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="ExtraTree", reg=reg, metrics=metrics,
            val_metrics=val_metrics)
def fit_data(X, Y, type):
    global rnd
    error_list = []
    if type == "LinearRegression":
        data_fit = LinearRegression()
    elif type == "ExtraTreeRegressor":
        data_fit = ExtraTreeRegressor()
    elif type == "DecisionTreeRegressor":
        data_fit = DecisionTreeRegressor()
    elif type == "RandomForestRegressor":
        data_fit = RandomForestRegressor()
    elif type == "GradientBoostingRegressor":
        data_fit = GradientBoostingRegressor()
    elif type == "XGBRegressor":
        data_fit = XGBRegressor()
    for i in range(100, 1000, 1):
        rnd.append(i)
        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=.3, random_state=i)
        data_fit.fit(x_train, y_train)
        result = data_fit.predict(x_test)
        error_list.append(np.sqrt(mean_squared_error(y_test, result)))
    print("using " + type + " error is:")
    print(min(error_list))
    print("using random_state =", rnd[error_list.index(min(error_list))])
def model_train(x, y, delay, params_dict, version_):
    if params_dict['model_name'] == 'LinearRegression':
        model = LinearRegression(fit_intercept=True, normalize=False,
                                 n_jobs=-1)  # 1. Linear regression
    elif params_dict['model_name'] == 'RandomForestRegressor':
        model = RandomForestRegressor(
            n_estimators=params_dict['n_estimators'],
            n_jobs=7,
            random_state=2017,
            max_depth=params_dict['max_depth'],
            oob_score=True,
            min_samples_split=params_dict['min_samples_split'],
            min_samples_leaf=params_dict['min_samples_leaf'])
    elif params_dict['model_name'] == 'ExtraTreeRegressor':
        model = ExtraTreeRegressor(
            max_depth=params_dict['max_depth'],
            random_state=2017,
            min_samples_split=params_dict['min_samples_split'],
            min_samples_leaf=params_dict['min_samples_leaf']
        )  # 9. ExtraTree (extremely randomized tree) regression
    else:
        model = RandomForestRegressor()

    model_n = ('Model_' + str(delay + 1) + '_' + params_dict['model_name'] +
               '_V' + str(version_) + '.model')
    model_dir = fgl.MODEL_DIR + '/' + model_n

    print('Start Training Model ' + str(delay + 1) + '...', 'Time:',
          datetime.datetime.now())
    print(list(x.columns))
    best_model = model.fit(x, y)
    # print(best_model.feature_importances_)
    print('Model ' + str(delay + 1) + ' is ok!', 'Time:',
          datetime.datetime.now())
    joblib.dump(best_model, model_dir)
    print('Model ' + str(delay + 1) + ' saved!', 'Time:',
          datetime.datetime.now())
    return model_n, best_model, '-'.join(list(x.columns.values))
def get_models_(models=dict()):
    # non-linear models
    models['knn'] = KNeighborsRegressor(n_neighbors=8)
    models['cart'] = DecisionTreeRegressor()
    models['extra'] = ExtraTreeRegressor()
    # ensemble models
    n_trees = 100  # 500
    # models['ada'] = AdaBoostRegressor(n_estimators=n_trees)
    models['bag'] = BaggingRegressor(n_estimators=n_trees)
    models['rf'] = RandomForestRegressor(n_estimators=n_trees)
    # models['et'] = ExtraTreesRegressor(n_estimators=n_trees)
    models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    models['xgb'] = XGBRegressor(max_depth=8, n_estimators=n_trees,
                                 min_child_weight=300, colsample_bytree=0.8,
                                 subsample=0.8, eta=0.3, seed=42, silent=True)
    models['lgbm'] = LGBMRegressor(n_jobs=-1, random_state=0,
                                   n_estimators=n_trees, learning_rate=0.001,
                                   num_leaves=2**6, subsample=0.9,
                                   subsample_freq=1, colsample_bytree=1.)
    return models
def run(self, configuration: Configuration) -> None:
    inputs = torch.from_numpy(np.load(configuration.inputs))
    missing_mask = torch.from_numpy(np.load(configuration.missing_mask))
    assert inputs.shape == missing_mask.shape

    # the model needs np.nan in the missing positions to work
    inputs = compose_with_mask(missing_mask,
                               where_one=torch.empty_like(inputs).fill_(np.nan),
                               where_zero=inputs,
                               differentiable=False)  # cannot be differentiable with nans!

    # create the model
    model = IterativeImputer(random_state=configuration.get("seed", 0),
                             estimator=ExtraTreeRegressor(),
                             missing_values=np.nan)

    # go back to numpy (annoying)
    model.fit(inputs.numpy())

    # save the model
    with open(create_parent_directories_if_needed(configuration.outputs), "wb") as model_file:
        pickle.dump(model, model_file)
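# Hedged follow-up sketch: how the pickled imputer could be reloaded and
# applied later. The file path and array names are illustrative assumptions,
# not part of the original code.
import pickle
import numpy as np

with open("imputer.pkl", "rb") as model_file:
    imputer = pickle.load(model_file)

incomplete = np.load("new_inputs.npy")     # array containing np.nan gaps
completed = imputer.transform(incomplete)  # ExtraTree-based imputation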
def __init__(self, n_estimators=100, max_samples=256, max_features=1.,
             bootstrap=True, n_jobs=1, random_state=None, verbose=0):
    super(IsolationForest, self).__init__(
        base_estimator=ExtraTreeRegressor(
            max_depth=int(np.ceil(np.log2(max(max_samples, 2)))),
            max_features=1,
            splitter='random',
            random_state=random_state),
        # max_features above is the tree's own setting and is unrelated
        # to self.max_features
        bootstrap=bootstrap,
        bootstrap_features=False,
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose)
def run(type):
    if type == 'decision_tree':
        from sklearn import tree
        model = tree.DecisionTreeRegressor()
    elif type == 'linear':
        from sklearn import linear_model
        model = linear_model.LinearRegression()
    elif type == 'svm':
        from sklearn import svm
        model = svm.SVR()
    elif type == 'KNN':
        from sklearn import neighbors
        model = neighbors.KNeighborsRegressor()
    elif type == 'random_forest':
        from sklearn import ensemble
        model = ensemble.RandomForestRegressor(n_estimators=20)
    elif type == 'adaboost':
        from sklearn import ensemble
        model = ensemble.AdaBoostRegressor(n_estimators=50)
    elif type == 'extra_tree':
        from sklearn.tree import ExtraTreeRegressor
        model = ExtraTreeRegressor()
    method(model, model)
def fit(self, trend, seasonality):
    self.fit = None
    self.trend = trend
    self.seasonality = seasonality
    self.lags = [1, seasonality]
    # try:
    xtree = ExtraTreeRegressor()
    self.feature_selection(model=xtree)
    param_grid = {
        'max_depth': [2, 4, 6, 8]
    }
    model = GridSearchCV(estimator=xtree, param_grid=param_grid)
    self.fit = model.fit(self.selected_X, self.y.to_numpy())
    # except:
    #     print('ERROR: Could not run Extra Tree Regressor! Please check your input data. \n')
    return self.fit
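# Hedged usage sketch: inspecting the GridSearchCV result returned by fit().
# The `forecaster` instance and the argument values are illustrative
# assumptions, not part of the original code.
fitted = forecaster.fit(trend=True, seasonality=12)
print(fitted.best_params_)     # e.g. {'max_depth': 4}
print(fitted.best_estimator_)  # the tuned ExtraTreeRegressor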
def get_model(model_name):
    model_dict = {
        # regression
        "model_DecisionTreeRegressor": tree.DecisionTreeRegressor(),  # decision tree
        "model_LinearRegression": linear_model.LinearRegression(),  # linear regression
        "model_SVR": svm.SVR(),  # SVM
        "model_KNeighborsRegressor": neighbors.KNeighborsRegressor(),  # KNN
        "model_RandomForestRegressor": ensemble.RandomForestRegressor(
            n_estimators=20),  # random forest with 20 trees
        "model_AdaBoostRegressor": ensemble.AdaBoostRegressor(
            n_estimators=50),  # AdaBoost with 50 trees
        "model_GradientBoostingRegressor": ensemble.GradientBoostingRegressor(
            n_estimators=100),  # GBRT with 100 trees
        "model_BaggingRegressor": BaggingRegressor(),  # bagging regression
        "model_ExtraTreeRegressor": ExtraTreeRegressor(),  # ExtraTree (extremely randomized tree) regression
        # classification
        "model_LogisticRegression_weight": LogisticRegression(
            C=1000, class_weight={0: 0.8, 1: 0.2}),  # logistic regression with class weights
        "model_LogisticRegression": LogisticRegression(C=1000),  # logistic regression (unweighted)
        "model_SVC": svm.SVC(class_weight="balanced"),  # support vector classifier
        "model_RandomForestClassifier": RandomForestClassifier(
            n_estimators=7, class_weight="balanced")  # random forest classifier
    }
    return model_dict[model_name]
def GET_ORDINARY_MODELS(prediction_type=None):
    if prediction_type == "C":
        return {
            "LR": LogisticRegression(),
            "LDA": LinearDiscriminantAnalysis(),
            "GNB": GaussianNB(),
            "KNC": KNeighborsClassifier(),
            "SVC": SVC(),
            "ETC": ExtraTreeClassifier(),
            "DTC": DecisionTreeClassifier()
        }
    elif prediction_type == "R":
        return {
            "LR": LinearRegression(),
            "RIDGE": Ridge(),
            "LASSO": Lasso(),
            "EN": ElasticNet(),
            "KNR": KNeighborsRegressor(),
            "SVR": SVR(),
            "ETR": ExtraTreeRegressor(),
            "DTR": DecisionTreeRegressor()
        }
    else:
        raise Exception("prediction_type must be 'C' or 'R'")
def __init__(self, n_estimators=10, max_depth=5, min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.,
             max_leaf_nodes=None, sparse_output=True, n_jobs=1,
             random_state=None, verbose=0, warm_start=False,
             use_one_hot=True):
    super(RandomTreesEmbeddingUnsupervised, self).__init__(
        base_estimator=ExtraTreeRegressor(),
        n_estimators=n_estimators,
        estimator_params=("criterion", "max_depth", "min_samples_split",
                          "min_samples_leaf", "min_weight_fraction_leaf",
                          "max_features", "max_leaf_nodes", "random_state"),
        bootstrap=False,
        oob_score=False,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start)

    self.criterion = 'mse'
    self.max_depth = max_depth
    self.min_samples_split = min_samples_split
    self.min_samples_leaf = min_samples_leaf
    self.min_weight_fraction_leaf = min_weight_fraction_leaf
    self.max_features = 1
    self.max_leaf_nodes = max_leaf_nodes
    self.sparse_output = sparse_output
    self.use_one_hot = use_one_hot
def get_linear_model(model_name):
    if model_name == "LinearRegression":
        reg = LinearRegression()
    elif model_name == "RandomForestRegressor":
        reg = RandomForestRegressor()
    elif model_name == "DecisionTreeRegressor":
        reg = DecisionTreeRegressor()
    elif model_name == "GaussianProcessRegressor":
        reg = GaussianProcessRegressor()
    elif model_name == "ExtraTreeRegressor":
        reg = ExtraTreeRegressor()
    elif model_name == "LGBMRegressor":
        reg = lgbm.sklearn.LGBMRegressor()
    elif model_name == "BaggingRegressor":
        reg = BaggingRegressor()
    elif model_name == "KNeighborsRegressor":
        reg = KNeighborsRegressor()
    elif model_name == "Lars":
        reg = Lars()
    elif model_name == "SVR":
        reg = SVR()
    elif model_name == "NuSVR":
        reg = NuSVR()
    else:
        raise ValueError("Unknown model name: " + model_name)
    return reg
def _GET_ORDINARY_MODEL_DICT(_Type="classification"):
    # Fine!
    if _Type == "classification":
        return {
            "LR": LogisticRegression(),
            "LDA": LinearDiscriminantAnalysis(),
            "GNB": GaussianNB(),
            "KNC": KNeighborsClassifier(),
            "DTC": DecisionTreeClassifier(),
            "ETC": ExtraTreeClassifier(),
            "SVC": SVC()
        }
    elif _Type == "regression":
        return {
            "LR": LinearRegression(),
            "RIDGE": Ridge(),
            "LASSO": Lasso(),
            "EN": ElasticNet(),
            "KNR": KNeighborsRegressor(),
            "DTR": DecisionTreeRegressor(),
            "ETR": ExtraTreeRegressor(),
            "SVR": SVR()
        }
    else:
        raise Exception("_Type must be 'classification' or 'regression'")
    else:
        pipeline.verify(auto_X.sample(n = 3, random_state = 13))
    pipeline.configure(**pmml_options)
    store_pmml(pipeline, name)
    if isinstance(regressor, IsolationForest):
        decision_function = DataFrame(pipeline.decision_function(auto_X), columns = ["decisionFunction"])
        outlier = DataFrame(pipeline.predict(auto_X), columns = ["outlier"])
        outlier['outlier'] = outlier['outlier'].apply(lambda x: str(bool(x == -1)).lower())
        store_csv(pandas.concat((decision_function, outlier), axis = 1), name)
    else:
        mpg = DataFrame(pipeline.predict(auto_X), columns = ["mpg"])
        store_csv(mpg, name)

if "Auto" in datasets:
    build_auto(AdaBoostRegressor(n_estimators = 31, random_state = 13), "AdaBoostAuto")
    build_auto(DecisionTreeRegressor(random_state = 13), "DecisionTreeAuto", compact = False, flat = True)
    build_auto(GradientBoostingRegressor(n_estimators = 31, random_state = 13), "GradientBoostingAuto")
    build_auto(IsolationForest(n_estimators = 31, random_state = 13), "IsolationForestAuto")
    build_auto(LGBMRegressor(objective = "regression", n_estimators = 31, random_state = 13), "LightGBMAuto")
    build_auto(LinearRegression(), "LinearRegressionAuto")
    build_auto(RandomForestRegressor(n_estimators = 17, random_state = 13), "RandomForestAuto", compact = False, flat = False)
    build_auto(VotingRegressor(estimators = [("major", DecisionTreeRegressor(max_depth = 8, random_state = 13)), ("minor", ExtraTreeRegressor(max_depth = 5, random_state = 13))], weights = [0.7, 0.3]), "VotingEnsembleAuto")
    build_auto(XGBRegressor(objective = "reg:squarederror", n_estimators = 31, random_state = 13), "XGBoostAuto")
    sparsify("Auto")

auto_X, auto_y = load_auto("AutoNA")

if ("Auto" in datasets) or ("AutoNA" in datasets):
    build_auto(LGBMRegressor(objective = "regression", n_estimators = 31, random_state = 13), "LightGBMAutoNA")
    build_auto(XGBRegressor(objective = "reg:squarederror", n_estimators = 31, random_state = 13), "XGBoostAutoNA")
# print cols[11]
# print cols[0]
X1 = cols[0:11]
# X1 = preprocessing.normalize(X1)
X = list(zip(*X1))
Y = cols[11]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=rn)
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)
# print(y_test)
lin_reg_mod = ExtraTreeRegressor()
lin_reg_mod.fit(X_train, y_train)
pred = lin_reg_mod.predict(X_test)
# print(pred)
# print(y_test)
test_set_r2 = r2_score(y_test, pred)
# print(test_set_r2)
tr2 += test_set_r2
# abs_er = mean_absolute_error(y_test, pred)
# tabse += abs_er
temp = []
for (i, j) in zip(y_test, pred):
    t = (abs(i - j)) / float(i)
from math import *
import pandas as pd
import numpy as np
from sklearn.tree import ExtraTreeRegressor
import matplotlib.pyplot as plt
import re, os

data = pd.read_csv('ice.csv')
x = data[['temp', 'street']]
y = data['ice']

clf = ExtraTreeRegressor()
clf.fit(x, y)
p = clf.predict(x)
print(clf.score(x, y))

t = np.arange(0.0, 31.0)
plt.plot(t, data['ice'], '--', t, p, '-')
plt.show()
nca_val = NeighborhoodComponentsAnalysis(random_state=0)
nca_ar.fit(X_train, ar_train)
nca_val.fit(X_train, val_train)
X_train_ar = nca_ar.transform(X_train)
X_train_val = nca_val.transform(X_train)
X_test_ar = nca_ar.transform(X_test)
X_test_val = nca_val.transform(X_test)
# X_train_ar = X_train
# X_train_val = X_train
# X_test_ar = X_test
# X_test_val = X_test

parameters = {"n_estimators": [50, 75, 100],
              "learning_rate": [0.1, 0.5, 1.]}
reg_ar = AdaBoostRegressor(ExtraTreeRegressor(max_depth=5, random_state=0),
                           random_state=0)
reg_val = AdaBoostRegressor(ExtraTreeRegressor(max_depth=5, random_state=0),
                            random_state=0)
clf_ar = GridSearchCV(reg_ar, parameters)
clf_val = GridSearchCV(reg_val, parameters)
clf_ar.fit(X_train_ar, ar_train)
clf_val.fit(X_train_val, val_train)

print(clf_ar.score(X_train_ar, ar_train))
print(clf_val.score(X_train_val, val_train))
print("--------------Test-------------------")
print(clf_ar.score(X_test_ar, ar_test))
print(clf_val.score(X_test_val, val_test))
def Build_MapMean_Model(self):
    MapMean = ExtraTreeRegressor()
    MapMean.fit(self.MapFeature_list, self.MapMean_list)
    self.Dump_Model('Model/MapMean.model', MapMean)
    print(MapMean.feature_importances_)
model_RandomForestRegressor = ensemble.RandomForestRegressor(
    n_estimators=20)  # 20 decision trees

#### 3.6 AdaBoost regression ####
from sklearn import ensemble
model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 decision trees

#### 3.7 GBRT regression ####
from sklearn import ensemble
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    n_estimators=100)  # 100 decision trees

#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor
model_BaggingRegressor = BaggingRegressor()

#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor
model_ExtraTreeRegressor = ExtraTreeRegressor()

# In[48]:

def getHeadFromFile():
    # paths = path[0:path.index(".")]
    # names = paths.split("-")
    # names = ['date', 'sales_num']
    return names

def getDataFromFile(jsonstring):
    # df = pd.read_csv(path, sep='\t', low_memory=False)
    # new_df = df.replace('?', np.nan)
    # datas = new_df.dropna(how = 'any')
def extra_tree_regressor(self):
    x_train, x_test, y_train, y_test = self.preprocessing()
    model = ExtraTreeRegressor()
    y_pred = model.fit(x_train, y_train).predict(x_test)
    self.printing(y_test, y_pred, 'Extra Tree')
    ri_MakingLT_labels_train, test_size=0.25, random_state=42)

### **AdaBoost (ExtraDecisionTree)** ###
# Train the AdaBoost (ExtraDecisionTree) model
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import ExtraTreeRegressor

ada_et_tree_reg = AdaBoostRegressor(
    base_estimator=ExtraTreeRegressor(ccp_alpha=0.0,
                                      criterion='mse',
                                      max_depth=11,
                                      max_features='auto',
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      random_state=42,
                                      splitter='random'),
    learning_rate=0.1,
    loss='exponential',
    n_estimators=150,
    random_state=42)

ada_et_tree_reg.fit(ri_MakingLT_prepared_train, ri_MakingLT_labels_train)
ri_MakingLT_predicted = ada_et_tree_reg.predict(ri_MakingLT_prepared_test)

from sklearn.metrics import mean_squared_error
ada_et_tree_reg_mse = mean_squared_error(ri_MakingLT_labels_test,
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=700, random_state=42)
rf_model.fit(x_train, y_train)
y_predict = rf_model.predict(x_test)
r2_score(y_test, y_predict.ravel())

# ### ExtraTreeRegressor

# In[85]:

from sklearn.tree import ExtraTreeRegressor
extratree_model = ExtraTreeRegressor(random_state=42)
extratree_model.fit(x_train, y_train)
y_predict = extratree_model.predict(x_test)
r2_score(y_test, y_predict.ravel())

# ### Result
#
# From here we can conclude that, out of the multiple models, the
# RandomForestRegressor works well with 90.66% accuracy, which is very good.

# In[86]:

# Using pickle we will save our model so that we can use it later
import pickle
pickle.dump(extratree_model, open('model.pkl', 'wb'))
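# Hedged follow-up sketch: reloading the pickled regressor for inference.
# The file name matches the dump above; the sample slice is an illustrative
# assumption.
loaded_model = pickle.load(open('model.pkl', 'rb'))
print(loaded_model.predict(x_test[:5]))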
def fit(self, training_queries, training_query_weights=None,
        validation_queries=None, validation_query_weights=None):
    '''
    Train a LambdaRandomForest model on given training queries. Optionally,
    use validation queries for finding an optimal number of trees using
    early stopping.

    Parameters
    ----------
    training_queries : Queries instance
        The set of queries from which the model will be trained.

    training_query_weights : array of floats, shape = [n_queries], or None
        The weight given to each training query, which is used to measure
        its importance. Queries with 0.0 weight will never be used
        in training.

    validation_queries : Queries instance or None
        The set of queries used for early stopping.

    validation_query_weights : array of floats, shape = [n_queries], or None
        The weight given to each validation query, which is used to measure
        its importance. Queries with 0.0 weight will never be used
        in validation.

    Returns
    -------
    self : object
        Returns self.
    '''
    metric = MetricFactory(self.metric,
                           queries=aslist(training_queries, validation_queries),
                           random_state=self.random_state)

    # If the metric used for training is normalized, it is advantageous
    # to precompute the scaling factor for each query in advance.
    training_query_scales = metric.compute_scaling(
        training_queries, query_weights=training_query_weights)

    if validation_queries is None:
        validation_queries = training_queries
        validation_query_scales = training_query_scales.copy()
    else:
        validation_query_scales = metric.compute_scaling(
            validation_queries, query_weights=validation_query_weights)

    # The first row is reserved for the ranking scores computed in the
    # previous fold of regression trees (see below how the tree ensemble
    # is evaluated), that is why +1.
    validation_ranking_scores = np.zeros(
        (self.n_jobs + 1, validation_queries.document_count()),
        dtype=np.float64)

    logger.info('Training of LambdaRandomForest model has started.')

    estimators = []

    if self.random_thresholds:
        for k in range(self.n_estimators):
            estimators.append(
                ExtraTreeRegressor(
                    max_depth=self.max_depth,
                    max_leaf_nodes=self.max_leaf_nodes,
                    min_samples_split=self.min_samples_split,
                    min_samples_leaf=self.min_samples_leaf,
                    max_features=self.max_features,
                    random_state=self.random_state))
    else:
        for k in range(self.n_estimators):
            estimators.append(
                DecisionTreeRegressor(
                    max_depth=self.max_depth,
                    max_leaf_nodes=self.max_leaf_nodes,
                    min_samples_split=self.min_samples_split,
                    min_samples_leaf=self.min_samples_leaf,
                    max_features=self.max_features,
                    random_state=self.random_state))

    # Best performance and index of the last tree.
    best_performance = -np.inf
    best_performance_k = -1

    # Counts how many trees have been trained since the last
    # improvement on the validation set.
    performance_not_improved = 0

    # Partition the training into a proper number of folds
    # to benefit from the parallelization at best.
    if self.n_estimators > self.n_jobs:
        estimator_indices = np.array_split(
            np.arange(self.n_estimators, dtype=np.intc),
            (self.n_estimators + self.n_jobs - 1) // self.n_jobs)
    else:
        estimator_indices = [np.arange(self.n_estimators)]

    for fold_indices in estimator_indices:
        # Train all trees in the current fold...
        fold_estimators = \
            Parallel(n_jobs=self.n_jobs, backend='threading')(
                delayed(parallel_build_trees, check_pickle=False)(
                    i, estimators[i], self.n_estimators, metric.copy(),
                    training_queries, training_query_scales,
                    training_query_weights, self.use_newton_method,
                    self.bootstrap, self.subsample_queries,
                    self.subsample_documents,
                    self.random_state.randint(1, np.iinfo('i').max),
                    self.sigma, validation_queries,
                    validation_ranking_scores[i - fold_indices[0] + 1])
                for i in fold_indices)

        self.estimators.extend(fold_estimators)

        # Compute the ranking score of validation queries for every
        # new tree that has been just trained.
        np.cumsum(validation_ranking_scores[:(len(fold_indices) + 1)],
                  out=validation_ranking_scores[:(len(fold_indices) + 1)],
                  axis=0)

        for i, ranking_scores in enumerate(validation_ranking_scores[1:, :]):
            # Get the performance of the current model consisting
            # of `fold_indices[0] + i + 1` number of trees.
            validation_performance = metric.evaluate_queries(
                validation_queries, ranking_scores,
                scales=validation_query_scales)

            logger.info('#%08d: %s (%s): %11.8f'
                        % (fold_indices[i],
                           'training' if validation_queries is training_queries
                           else 'validation',
                           metric, validation_performance))

            if validation_performance > best_performance:
                best_performance = validation_performance
                best_performance_k = fold_indices[i]
                performance_not_improved = 0
            else:
                performance_not_improved += 1

            if (performance_not_improved >= self.estopping and
                    self.min_n_estimators <= fold_indices[i] + 1):
                break

        if (performance_not_improved >= self.estopping and
                self.min_n_estimators <= fold_indices[i] + 1):
            logger.info('Stopping early since no improvement on %s '
                        'queries has been observed for %d iterations '
                        '(since iteration %d)'
                        % ('training' if validation_queries is training_queries
                           else 'validation',
                           self.estopping, best_performance_k + 1))
            break

        # Copy the last ranking scores for the next validation "fold".
        validation_ranking_scores[0, :] = \
            validation_ranking_scores[len(fold_indices), :]

    if validation_queries is not training_queries:
        logger.info('Final model performance (%s) on validation queries: '
                    '%11.8f' % (metric, best_performance))
    else:
        logger.info('Final model performance (%s) on training queries: '
                    '%11.8f' % (metric, best_performance))

    # Make sure the model has the wanted size.
    best_performance_k = max(best_performance_k, self.min_n_estimators - 1)

    # Leave the estimators that led to the best performance,
    # either on training or validation set.
    del self.estimators[best_performance_k + 1:]

    # Correct the number of trees.
    self.n_estimators = len(self.estimators)

    self.best_performance = best_performance

    logger.info('Training of LambdaRandomForest model has finished.')

    return self
def predict_extra_tree(train_X, train_Y, test, param=30):
    clf = ExtraTreeRegressor(min_samples_leaf=param, min_samples_split=1,
                             criterion='mse')
    clf.fit(train_X, train_Y)
    preds = clf.predict(test)
    return preds
def build_lonely_tree_regressor(X, y, max_features, max_depth, min_samples_split):
    clf = ExtraTreeRegressor(max_features=max_features,
                             max_depth=max_depth,
                             min_samples_split=min_samples_split)
    clf = clf.fit(X, y)
    return clf
def doregress(X_train, y_train, n_train, X_test, y_test, n_test, band, fnames):
    lin = LinearRegression()
    lin.fit(X_train, y_train)
    lres = lin.predict(X_test) - y_test
    zl, ml, sl, fl = summstats(z_test, lres, n_test)

    # gbr1 = GradientBoostingRegressor(loss="ls")
    # gbr2 = GradientBoostingRegressor(loss="lad")
    # gbr1.fit(X_train, y_train)
    # gbr2.fit(X_train, y_train)
    # g1res = gbr1.predict(X_test) - y_test
    # g2res = gbr2.predict(X_test) - y_test
    # g1z, g1med, g1std = summstats(z_test, g1res)
    # g2z, g2med, g2std = summstats(z_test, g2res)

    # ada = AdaBoostRegressor()
    # ada.fit(X_train, y_train)
    # ares = ada.predict(X_test) - y_test
    # az, amed, astd = summstats(z_test, ares)

    # Some of these appear to be unstable,
    # i.e. feature importance changes.
    # for extension in ("A", "B", "C", "D", "E"):
    for extension in ("A",):
        print("# Regressing", extension)

        xtr = ExtraTreeRegressor()
        xtr.fit(X_train, y_train)
        zx, mx, sx, fx = doplot(xtr, X_test, y_test, z_test, n_test, fnames,
                                "%s-band ExtraTreeRegressor" % (band),
                                "R_%s_%s_ext.png" % (band, extension))

        xtrw = ExtraTreeRegressor()
        xtrw.fit(X_train, y_train, sample_weight=np.log10(n_train))
        zxw, mxw, sxw, fxw = doplot(xtrw, X_test, y_test, z_test, n_test, fnames,
                                    "%s-band weighted ExtraTreeRegressor" % (band),
                                    "R_%s_%s_ext_weight.png" % (band, extension))

        ####
        tree = DecisionTreeRegressor()
        tree.fit(X_train, y_train)
        zt, mt, st, ft = doplot(tree, X_test, y_test, z_test, n_test, fnames,
                                "%s-band DecisionTreeRegressor" % (band),
                                "R_%s_%s_tree.png" % (band, extension))

        treew = DecisionTreeRegressor()
        treew.fit(X_train, y_train, sample_weight=np.log10(n_train))
        ztw, mtw, stw, ftw = doplot(treew, X_test, y_test, z_test, n_test, fnames,
                                    "%s-band weighted DecisionTreeRegressor" % (band),
                                    "R_%s_%s_tree_weight.png" % (band, extension))

        ####
        weights = n_train
        nt = 50
        rfr = RandomForestRegressor(n_estimators=nt)
        rfr.fit(X_train, y_train)
        zr, mr, sr, fr = doplot(rfr, X_test, y_test, z_test, n_test, fnames,
                                "%s-band RandomForestRegressor" % (band),
                                "R_%s_%s_%d_rfr.png" % (band, extension, nt))

        rfrw = RandomForestRegressor(n_estimators=nt)
        rfrw.fit(X_train, y_train, sample_weight=weights)
        zrw, mrw, srw, frw = doplot(rfrw, X_test, y_test, z_test, n_test, fnames,
                                    "%s-band weighted RandomForestRegressor" % (band),
                                    "R_%s_%s_%d_rfr_weight.png" % (band, extension, nt))

        print("RF %d : %.5e +/- %.5e vs weighted %.5e +/- %.5e" % (
            nt,
            np.median(fr), 0.741 * (np.percentile(fr, 75) - np.percentile(fr, 25)),
            np.median(frw), 0.741 * (np.percentile(frw, 75) - np.percentile(frw, 25))))

        ####
        # Compare all models
        fig, (sp1, sp2, sp3) = plt.subplots(3, 1, sharex=True, figsize=(16, 12))
        sp1.plot(zl, ml, "r-", label="LinearRegression")
        sp1.plot(zt, mt, "b-", label="DecisionTreeRegressor")
        sp1.plot(zr, mr, "g-", label="RandomForestRegressor")
        sp1.plot(zx, mx, "m-", label="ExtraTreeRegressor")

        sp2.plot(zl[np.where(sl > 0.)], sl[np.where(sl > 0.)], "r-")
        sp2.plot(zt[np.where(st > 0.)], st[np.where(st > 0.)], "b-")
        sp2.plot(zr[np.where(sr > 0.)], sr[np.where(sr > 0.)], "g-")
        sp2.plot(zx[np.where(sx > 0.)], sx[np.where(sx > 0.)], "m-")
        ymin, ymax = sp2.get_ylim()
        sp2.set_ylim(max(1e-7, ymin), 1e-1)

        sp3.plot(zl[np.where(fl > 0.)], fl[np.where(fl > 0.)], "r-")
        sp3.plot(zt[np.where(ft > 0.)], ft[np.where(ft > 0.)], "b-")
        sp3.plot(zr[np.where(fr > 0.)], fr[np.where(fr > 0.)], "g-")
        sp3.plot(zx[np.where(fx > 0.)], fx[np.where(fx > 0.)], "m-")
        ymin, ymax = sp3.get_ylim()
        sp3.set_ylim(max(1e-7, ymin), 1.1)

        sp1.legend(loc=2, fancybox=True)
        sp1.set_title("Mean refraction residual (arcsec)", weight="bold")
        sp2.set_ylabel("RMS residual (arcsec)", weight="bold")
sp3.set_ylabel("f_tot with dR>%.3f"%(dcrLevel), weight="bold") sp3.set_xlabel("Zenith distance (deg)", weight="bold") sp1.axhline(y=0, c='k', linestyle='--', alpha=0.5) sp2.axhline(y=dcrLevel, c='k', linestyle='--', alpha=0.5) sp3.axhline(y=0.01, c='k', linestyle='--', alpha=0.5) sp2.semilogy() sp3.semilogy() plt.savefig("R_%s_%s.png" % (band, extension)) ### fig, (sp1, sp2, sp3) = plt.subplots(3, 1, sharex=True, figsize=(16,12)) sp1.plot(zl, ml, "r-", label="LinearRegression") sp1.plot(ztw, mtw, "b-", label="DecisionTreeRegressor weighted") sp1.plot(zrw, mrw, "g-", label="RandomForestRegressor weighted") sp1.plot(zxw, mxw, "m-", label="ExtraTreeRegressor weighted") sp2.plot(zl[np.where(sl>0.)], sl[np.where(sl>0.)], "r-") sp2.plot(ztw[np.where(stw>0.)], stw[np.where(stw>0.)], "b-") sp2.plot(zrw[np.where(srw>0.)], srw[np.where(srw>0.)], "g-") sp2.plot(zxw[np.where(sxw>0.)], sxw[np.where(sxw>0.)], "m-") ymin, ymax = sp2.get_ylim() sp2.set_ylim(max(1e-7,ymin), 1e-1) sp3.plot(zl[np.where(fl>0.)], fl[np.where(fl>0.)], "r-") sp3.plot(ztw[np.where(ftw>0.)], ftw[np.where(ftw>0.)], "b-") sp3.plot(zrw[np.where(frw>0.)], frw[np.where(frw>0.)], "g-") sp3.plot(zxw[np.where(fxw>0.)], fxw[np.where(fxw>0.)], "m-") ymin, ymax = sp3.get_ylim() sp3.set_ylim(max(1e-7,ymin), 1.1) sp1.legend(loc=2, fancybox=True) sp1.set_title("Mean refraction residual (arcsec)", weight="bold") sp2.set_ylabel("RMS residual (arcsec)", weight="bold") sp3.set_ylabel("f_tot with dR>%.3f"%(dcrLevel), weight="bold") sp3.set_xlabel("Zenith distance (deg)", weight="bold") sp1.axhline(y=0, c='k', linestyle='--', alpha=0.5) sp2.axhline(y=dcrLevel, c='k', linestyle='--', alpha=0.5) sp3.axhline(y=0.01, c='k', linestyle='--', alpha=0.5) sp2.semilogy() sp3.semilogy() plt.savefig("R_%s_%s_weight.png" % (band, extension))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
new_X = pd.DataFrame(scaled_X, columns=X.columns)
new_X.head()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.33,
                                                    random_state=42)

from sklearn.tree import ExtraTreeRegressor
model = ExtraTreeRegressor()
model.fit(X_train, y_train)

# check r2 score accuracy on the train data
print(model.score(X_train, y_train))

# check r2 score accuracy on the test data (same fitted model, no refitting on test)
print(model.score(X_test, y_test))

print(model.feature_importances_)
imp_feat = pd.Series(model.feature_importances_, index=X.columns)
imp_feat.nlargest(5).plot(kind='barh')
plt.show()

from sklearn.linear_model import LinearRegression
    'mse_train', 'mse_test', 'mae_train', 'mae_test', 'mdae_train', 'mdae_test'
]

reg = [
    linear_model.LinearRegression(),
    linear_model.Ridge(max_iter=800),
    linear_model.RidgeCV(),
    linear_model.Lasso(max_iter=800),
    linear_model.LassoLarsCV(max_iter=800),
    linear_model.RANSACRegressor(),
    linear_model.BayesianRidge(),
    linear_model.ARDRegression(),
    linear_model.HuberRegressor(max_iter=800),
    linear_model.TheilSenRegressor(max_iter=800),
    PLSRegression(),
    DecisionTreeRegressor(),
    ExtraTreeRegressor(),
    BaggingRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    linear_model.PassiveAggressiveRegressor(max_iter=800, tol=.001),
    linear_model.ElasticNet(max_iter=800),
    linear_model.SGDRegressor(max_iter=800, tol=.001),
    svm.SVR(),
    KNeighborsRegressor(),
    RadiusNeighborsRegressor(radius=1.5),
    GaussianProcessRegressor()
]

listreg = [
    'LinearRegression', 'Ridge', 'RidgeCV', 'Lasso', 'LassoLarsCV',
from sklearn import ensemble
model_adaboost_regressor = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 decision trees

# 7. GBRT regression
from sklearn import ensemble
model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(
    n_estimators=100)  # 100 decision trees

# 8. Bagging regression
from sklearn import ensemble
model_bagging_regressor = ensemble.BaggingRegressor()

# 9. ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor
model_extra_tree_regressor = ExtraTreeRegressor()

# 10. Polynomial regression
model_Polynomial = make_pipeline(PolynomialFeatures(3), Ridge())

# 11. Gaussian processes
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
kernel = DotProduct() + WhiteKernel()
model_GaussianProcessRegressor = GaussianProcessRegressor(kernel=kernel,
                                                          random_state=0)

# regression part
def try_different_method(model):
    model.fit(x_train, y_train)
def Build_MapMean_Model(self):
    knn_MapMean = ExtraTreeRegressor()
    knn_MapMean.fit(self.MapFeature_list, self.MapMean_list)
    print(knn_MapMean.feature_importances_)
    self.Dump_Model('Model/MapMean.model', knn_MapMean)
def __init__(self, **kwargs):
    super().__init__(ExtraTreeRegressor(**kwargs))
    pass

def fselection_bfs():
    pass

def fselection_add_del():
    pass

if __name__ == '__main__':
    X, y = load_boston(return_X_y=True)
    X = X[:20, :]
    y = y[:20]
    alg = ExtraTreeRegressor()
    cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)
    n = X.shape[1]
    int_scores = {}
    ext_scores = {}
    for i in range(1, n + 1):
        int_score_tmp1 = inf
        ext_score_tmp1 = inf
        for features in combinations(range(n), i):
            X_cuted = X[:, features]
            int_score_tmp2 = inf
            ext_score_tmp2 = inf
            for train_index, test_index in cv.split(X_cuted):
                X_train, X_test = X_cuted[train_index], X_cuted[test_index]
model_maps['random_forest'] = ensemble.RandomForestRegressor(
    n_estimators=10)  # 10 decision trees

#### 3.6 AdaBoost regression ####
from sklearn import ensemble
model_maps['adaboost'] = ensemble.AdaBoostRegressor(
    n_estimators=50)  # 50 decision trees

#### 3.7 GBRT regression ####
from sklearn import ensemble
model_maps['gradient_boosting'] = ensemble.GradientBoostingRegressor(
    n_estimators=100)  # 100 decision trees

#### 3.8 Bagging regression ####
from sklearn.ensemble import BaggingRegressor
model_maps['bagging'] = BaggingRegressor()

#### 3.9 ExtraTree (extremely randomized tree) regression ####
from sklearn.tree import ExtraTreeRegressor
model_maps['extra_tree'] = ExtraTreeRegressor()

def get_score(data_path, model_name):
    data = np.memmap(data_path, dtype='float64', mode='r')
    data = np.array(data.reshape((int(len(data) / 46), 46)))
    bad_row = []
    for i in range(data.shape[0]):
        if data[i, 0] == 0:
            bad_row.append(i)
    data = np.delete(data, bad_row, axis=0)
    X = data[:, 2:-1]
    Y = data[:, -1].reshape(-1, 1)
    split_at = int(X.shape[0] * 0.8)
    X_tr = X[:split_at, :]
    X_te = X[split_at:, :]
def Build_MapMean_Model(self):
    MapMean_Model = ExtraTreeRegressor()
    MapMean_Model.fit(self.MapFeature_list, self.MapMean_list)
    self.Dump_Model('Model/MapMean.model', MapMean_Model)