class ExtraTreesRegressorImpl():
    """Thin adapter that stores hyperparameters and delegates to an ``Op`` model.

    The constructor records every keyword argument in ``self._hyperparams`` and
    immediately builds the wrapped model from them; ``fit`` and ``predict``
    simply forward to that wrapped model.
    """

    def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
                 min_samples_split=2, min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0, max_features='auto',
                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_impurity_split=None, bootstrap=False, oob_score=False,
                 n_jobs=None, random_state=None, verbose=0, warm_start=False):
        """Record the hyperparameters and build the wrapped ``Op`` from them."""
        self._hyperparams = dict(
            n_estimators=n_estimators,
            criterion=criterion,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            min_impurity_decrease=min_impurity_decrease,
            min_impurity_split=min_impurity_split,
            bootstrap=bootstrap,
            oob_score=oob_score,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            warm_start=warm_start,
        )
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model on ``X`` (and ``y`` when given); return ``self``."""
        # Only forward ``y`` when the caller supplied one.
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        model = self._wrapped_model
        return model.predict(X)
class ExtremelyRandomizeTreeEstimator(Estimator):
    """Value estimator backed by an ``ExtraTreesRegressor`` with 30 trees.

    Until ``train`` has completed successfully at least once, calling the
    estimator returns ``0`` instead of querying the (unfitted) regressor.
    """

    def __init__(self):
        """Create the unfitted regressor and mark the estimator as untrained."""
        self.estimator = ExtraTreesRegressor(n_estimators=30)
        self.initialized = False

    def __call__(self, state, action):
        """Predict the value of taking ``action`` in ``state``.

        Parameters
        ----------
        state : sequence of feature values (list, tuple or 1-D array).
        action : indexable with at least two entries; only ``action[0]`` and
            ``action[1]`` are used.

        Returns
        -------
        ``0`` while the estimator is untrained, otherwise the regressor's
        prediction for the single sample ``state + action[:2]``.
        """
        if not self.initialized:
            return 0
        # ``list(state)`` guarantees concatenation: a bare ``state + [...]``
        # would add element-wise for ndarrays and raise for tuples.
        features = list(state) + [action[0], action[1]]
        x = np.array(features).reshape(1, -1)
        return self.estimator.predict(x)[0]

    def train(self, train_in, train_out):
        """Fit the regressor on ``train_in`` / ``train_out``.

        ``train_in`` is converted with ``np.array`` before fitting.
        """
        train_in_formatted = np.array(train_in)
        self.estimator.fit(train_in_formatted, train_out)
        # Flip the flag only after a successful fit so a failed training run
        # does not make ``__call__`` query an unfitted model.
        self.initialized = True
def fit(self, X, y=None):
    """Build a fresh ``SKLModel`` from the stored hyperparameters and fit it.

    ``y`` is forwarded only when it is not ``None``. The fitted model is kept
    in ``self._sklearn_model`` and ``self`` is returned.
    """
    model = SKLModel(**self._hyperparams)
    self._sklearn_model = model
    fit_args = (X,) if y is None else (X, y)
    model.fit(*fit_args)
    return self
def __init__(
    self,
    sc=None,
    partitions="auto",
    n_estimators=100,
    criterion="mse",
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features="auto",
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=False,
    oob_score=False,
    n_jobs=None,
    random_state=None,
    verbose=0,
    warm_start=False,
):
    """Initialise the parent ``ExtraTreesRegressor`` and store ``sc`` / ``partitions``.

    Every forest hyperparameter is forwarded unchanged to
    ``ExtraTreesRegressor.__init__``; ``sc`` and ``partitions`` are kept as
    plain attributes on the instance.
    """
    forest_params = {
        "n_estimators": n_estimators,
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "min_impurity_split": min_impurity_split,
        "bootstrap": bootstrap,
        "oob_score": oob_score,
        "n_jobs": n_jobs,
        "random_state": random_state,
        "verbose": verbose,
        "warm_start": warm_start,
    }
    ExtraTreesRegressor.__init__(self, **forest_params)
    self.sc = sc
    self.partitions = partitions
def create_model(min_split=186, njobs=1, verbose=False):
    """Return an unfitted 50-tree ``ExtraTreesRegressor``.

    ``min_split`` is passed as ``min_samples_split``, ``njobs`` as ``n_jobs``
    and ``verbose`` as ``verbose``; the criterion is ``'mse'`` and
    ``min_samples_leaf`` is 1.
    """
    return ExtraTreesRegressor(
        n_estimators=50,
        criterion='mse',
        min_samples_split=min_split,
        min_samples_leaf=1,
        n_jobs=njobs,
        verbose=verbose,
    )
def __init__(self, n_estimators=10, criterion='mse', max_depth=None,
             min_samples_split=2, min_samples_leaf=1,
             min_weight_fraction_leaf=0.0, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, bootstrap=False, oob_score=False,
             n_jobs=None, random_state=None, verbose=0, warm_start=False):
    """Store all hyperparameters and build the wrapped ``Op`` model from them."""
    self._hyperparams = dict(
        n_estimators=n_estimators,
        criterion=criterion,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        min_impurity_split=min_impurity_split,
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
    )
    # The wrapped model receives exactly the recorded hyperparameters.
    self._wrapped_model = Op(**self._hyperparams)
# Tail of a list of model display names, followed by the `classifiers` list.
# Despite its name, every visible entry of `classifiers` is a regression
# estimator; all seeded entries share `random_state=randomstate`.
# NOTE(review): the visible names appear to parallel the order of the
# estimators below, but the start of the names list and the end of
# `classifiers` are outside this span - confirm the two stay index-aligned.
"Passive Aggressive", "SGD", "Theil-Sen", "RANSAC", "K-Neighbors", "Radius Neighbors", "MLP", "Decision Tree", "Extra Tree", "SVR" ] classifiers = [ RandomForestRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate), # GradientBoostingRegressor(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] # HistGradientBoostingClassifier(random_state=randomstate), # learning_rate is a hyper-parameter in the range (0.0, 1.0] AdaBoostRegressor(n_estimators=200, random_state=randomstate), GaussianProcessRegressor(normalize_y=True), ARDRegression(), # HuberRegressor(), # epsilon: greater than 1.0, default 1.35 LinearRegression(n_jobs=5), PassiveAggressiveRegressor( random_state=randomstate), # C: 0.25, 0.5, 1, 5, 10 SGDRegressor(random_state=randomstate), TheilSenRegressor(n_jobs=5, random_state=randomstate), RANSACRegressor(random_state=randomstate), KNeighborsRegressor( weights='distance'), # n_neighbors: 3, 6, 9, 12, 15, 20 RadiusNeighborsRegressor(weights='distance'), # radius: 1, 2, 5, 10, 15
# Benchmark sections: each one fits a regressor on (X_train, y_train),
# predicts X_test, then prints the test mean squared error followed by
# `end - start`. `end` is taken right after `fit`, so for the sections whose
# `start` is visible the printed duration covers fitting only, not prediction.
# NOTE(review): the constructor call and the `start` timestamp of the first
# section begin before this span.
max_depth=7, n_estimators=200, min_child_weight=10, subsample=0.7, colsample_bytree=0.7, reg_alpha=0, reg_lambda=0.5) reg.fit(X_train, y_train) end = time.time() y_pred_lgb = reg.predict(X_test) print(metrics.mean_squared_error(y_test, y_pred_lgb)) print(end - start) start = time.time() reg = ExtraTreesRegressor(n_estimators=100, max_depth=7, min_samples_leaf=10, n_jobs=8) reg.fit(X_train, y_train) end = time.time() y_pred = reg.predict(X_test) print(metrics.mean_squared_error(y_test, y_pred)) print(end - start) start = time.time() reg = KNeighborsRegressor(n_neighbors=4, algorithm='kd_tree') reg.fit(X_train, y_train) end = time.time() y_pred = reg.predict(X_test) print(metrics.mean_squared_error(y_test, y_pred)) print(end - start)
# Training script section: the target is Yeo-Johnson transformed, `sel` ranks
# features and the top `numfeat` columns are gathered in `X_train_selected`,
# then an ExtraTreesRegressor is fitted, dumped to `clfpath`, and applied to
# the validation feature table.
# NOTE(review): `clf.fit` is given the full `X_train`, not `X_train_selected`,
# so the feature selection only affects the printed column names - confirm
# this is intended.
# NOTE(review): the model is trained on the transformed target and no
# `qtfm.inverse_transform` is visible in this span, so `y_pred_validation` is
# presumably still on the transformed scale - verify downstream.
numfeat = 10 qtfm = PowerTransformer(method='yeo-johnson') y_train = np.squeeze(qtfm.fit_transform(y_train_tmp.values.reshape(-1, 1))) selidx, selscore, _ = sel(X_train.values, y_train, n_selected_features=numfeat) selscoredf = pd.DataFrame(data=np.transpose( np.vstack((X_train.columns[selidx].values, selscore))), columns=['Feature', 'Score']) X_train_selected = X_train.iloc[:, selidx[0:numfeat]] print(X_train_selected.columns.values) print("Train classifier...") clf = ExtraTreesRegressor(n_estimators=200, n_jobs=5, random_state=randomstate) clf.fit(X_train, y_train) # save classifier for further use dump(clf, clfpath) print("Training complete...") # clf = load(clfpath) # VALIDATION SET # load validation data validationfeatures = pd.read_csv( "/media/yannick/c4a7e8d3-9ac5-463f-b6e6-92e216ae6ac0/BRATS/BraTS2020/validationfeat_normalized.csv", index_col="ID") y_pred_validation = clf.predict(validationfeatures) pred_validation_df = pd.DataFrame(data=zip(validationfeatures.index.values, y_pred_validation),
# Fits `regressor` on (auto_X, auto_y), stores it as "<name>.pkl", and stores
# its predictions for the same `auto_X` as a one-column "mpg" table in
# "<name>.csv". The `build_auto(...)` calls that follow run this for a series
# of regressors, most of them seeded with random_state=13.
# NOTE(review): the first statements appear to be the body of `build_auto`,
# whose `def` line lies outside this span.
regressor = regressor.fit(auto_X, auto_y) store_pkl(regressor, name + ".pkl") mpg = DataFrame(regressor.predict(auto_X), columns=["mpg"]) store_csv(mpg, name + ".csv") build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), "DecisionTreeAuto") build_auto( BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAuto") build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto") build_auto(LassoCV(random_state=13), "LassoAuto") build_auto(LinearRegression(), "LinearRegressionAuto") build_auto( BaggingRegressor(LinearRegression(), random_state=13, max_features=0.5), "LinearRegressionEnsembleAuto") build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=5), "RandomForestAuto") build_auto(RidgeCV(), "RidgeAuto") build_auto(XGBRegressor(objective="reg:linear"), "XGBAuto") housing_df = load_csv("Housing.csv")
# Registry entries: each key is an estimator class name and each value is an
# instance of that class built with its default constructor arguments.
# NOTE(review): several names here (DPGMM, GMM, GaussianProcess, GraphLasso)
# are not available in recent scikit-learn releases - confirm the pinned
# version before touching the imports this mapping relies on.
'Binarizer':Binarizer(), 'Birch':Birch(), 'CCA':CCA(), 'CalibratedClassifierCV':CalibratedClassifierCV(), 'DBSCAN':DBSCAN(), 'DPGMM':DPGMM(), 'DecisionTreeClassifier':DecisionTreeClassifier(), 'DecisionTreeRegressor':DecisionTreeRegressor(), 'DictionaryLearning':DictionaryLearning(), 'ElasticNet':ElasticNet(), 'ElasticNetCV':ElasticNetCV(), 'EmpiricalCovariance':EmpiricalCovariance(), 'ExtraTreeClassifier':ExtraTreeClassifier(), 'ExtraTreeRegressor':ExtraTreeRegressor(), 'ExtraTreesClassifier':ExtraTreesClassifier(), 'ExtraTreesRegressor':ExtraTreesRegressor(), 'FactorAnalysis':FactorAnalysis(), 'FastICA':FastICA(), 'FeatureAgglomeration':FeatureAgglomeration(), 'FunctionTransformer':FunctionTransformer(), 'GMM':GMM(), 'GaussianMixture':GaussianMixture(), 'GaussianNB':GaussianNB(), 'GaussianProcess':GaussianProcess(), 'GaussianProcessClassifier':GaussianProcessClassifier(), 'GaussianProcessRegressor':GaussianProcessRegressor(), 'GaussianRandomProjection':GaussianRandomProjection(), 'GenericUnivariateSelect':GenericUnivariateSelect(), 'GradientBoostingClassifier':GradientBoostingClassifier(), 'GradientBoostingRegressor':GradientBoostingRegressor(), 'GraphLasso':GraphLasso(),
def __init__(self):
    """Start untrained, holding an unfitted 30-tree ``ExtraTreesRegressor``."""
    self.initialized = False
    self.estimator = ExtraTreesRegressor(n_estimators=30)
def run_tuning(dataset, nmin, half, n_jobs=1, output_path='', output_name='',
               track_file_name='', rt_file_name='', data_path=''):
    """Grid-search ``min_samples_leaf`` for an ``ExtraTreesRegressor``.

    Parameters
    ----------
    dataset : table indexable by column names; when empty it is rebuilt with
        ``prepare_dataset`` from ``<data_path>/<track_file_name>.csv`` and
        ``<data_path>/<rt_file_name>.csv``.
    nmin : candidate values for ``min_samples_leaf``.
    half : when truthy, the sample indices are shuffled and split into two
        halves, and one grid search is fitted per half.
    n_jobs : forwarded to the regressor.
    output_path, output_name : when ``output_path`` is not ``''`` the result
        is pickled to ``<output_path>/<output_name>.pkl``.

    Returns
    -------
    A list of two fitted ``GridSearchCV`` objects when ``half`` is truthy,
    otherwise a single fitted ``GridSearchCV``.
    """
    if len(dataset) == 0:
        # Create dataset
        track_csv = os.path.join(data_path, track_file_name + '.csv')
        rt_csv = os.path.join(data_path, rt_file_name + '.csv')
        dataset, _ = prepare_dataset(track_csv, rt_csv,
                                     reward_function='progress',
                                     knn_actions=True)

    X = dataset[state_cols + action_cols].values
    t = dataset['r'].values
    n_samples = len(t)
    ids = list(range(n_samples))

    if half:
        np.random.shuffle(ids)
        midpoint = math.floor(n_samples / 2)
        subsets = [ids[:midpoint], ids[midpoint:]]
    else:
        subsets = [ids]

    mdl = ExtraTreesRegressor(n_estimators=100, criterion='mse', n_jobs=n_jobs)

    # Fit one grid search per index subset (two when `half`, otherwise one).
    searches = []
    for subset in subsets:
        search = GridSearchCV(mdl, {'min_samples_leaf': nmin}, cv=10,
                              scoring='neg_mean_squared_error')
        search.fit(X[subset, :], t[subset])
        searches.append(search)

    to_save = searches if half else searches[0]

    if output_path != '':
        # Save the results
        pickle_path = os.path.join(output_path, output_name + '.pkl')
        with open(pickle_path, 'wb') as out:
            pickle.dump(to_save, out, pickle.HIGHEST_PROTOCOL)
        print('Saved cross val results as {}'.format(output_name))

    return to_save
def _blend_log_predictions(pred_a, weight_a, pred_b, weight_b):
    """Blend two log1p-scale prediction arrays; the result is log1p-scale too.

    Each input is mapped back with ``expm1``, the two are mixed with the given
    weights, and the mixture is mapped forward again with ``log1p``.
    """
    return np.log1p(weight_a * np.expm1(pred_a) + weight_b * np.expm1(pred_b))


def test_run(fn, features, type):
    """
    load dataset, build feature set, and do learning

    Five regressors are fitted (decision tree, extra trees, random forest,
    gradient boosting, SVR), each on its own feature list, and four weighted
    blends of their predictions are evaluated. The last blend
    (0.6 * gradient boosting + 0.4 * SVR) is the one written to disk.

    Parameters
    ----------
    fn: file name of dataset
    features: a list of list, each of which is a feature list for different
        models; ``features[i]`` is used for the i-th estimator, so at least
        five entries are required
    type: str for indicating feature set; only used in printed output.
        (The name shadows the ``type`` builtin but is kept for callers that
        pass it by keyword.)

    Returns
    -------
    None. Predictions are saved to ``../output/prediction_blended.csv`` and
    the feature-engineered dataset to ``../data/hour_ext.csv``.
    """
    np.set_printoptions(precision=4)
    print('test_run ' + type)
    df = load_data(fn)
    check_df(df)
    df = feature_engineering(df)
    print(df.columns)

    estimators = (
        DecisionTreeRegressor(min_samples_split=20),
        # min_samples_split must be at least 2 as an integer; the previous
        # value of 1 is rejected by current scikit-learn releases.
        ExtraTreesRegressor(n_estimators=100, max_depth=None,
                            min_samples_split=2, random_state=1234),
        RandomForestRegressor(n_estimators=1000, max_depth=15,
                              random_state=1234, min_samples_split=3,
                              n_jobs=-1),
        # NOTE(review): loss='ls' was renamed 'squared_error' in newer
        # scikit-learn; left unchanged here - confirm the pinned version.
        GradientBoostingRegressor(n_estimators=150, max_depth=10,
                                  random_state=0, min_samples_leaf=20,
                                  learning_rate=0.1, subsample=0.7,
                                  loss='ls'),
        svm.SVR(C=30),
    )

    y_pred_list = []
    for i, est in enumerate(estimators):
        # split_data also returns the (possibly updated) frame, which is fed
        # back into the next iteration.
        (df, X_train, X_test, y_train, y_test,
         _, _, _, _, time_test) = split_data(df, features=features[i])
        y_pred, mse = predict_evaluate(est, X_train, y_train, X_test, y_test)
        est_name = str(est).split('(')[0]
        print(type, est_name, np.round(mse, 4))
        y_pred_list.append([est_name, mse, y_pred])

    # blending models: (index_a, weight_a, index_b, weight_b) into y_pred_list.
    # The printed label is built from the same weights used in the
    # computation so the two can no longer disagree.
    blends = (
        (2, .2, 3, .8),
        (1, .3, 3, .7),
        (3, .3, 4, .7),
        (3, .6, 4, .4),
    )
    for idx_a, weight_a, idx_b, weight_b in blends:
        name_a, pred_a = y_pred_list[idx_a][0], y_pred_list[idx_a][2]
        name_b, pred_b = y_pred_list[idx_b][0], y_pred_list[idx_b][2]
        y_pred_blend = _blend_log_predictions(pred_a, weight_a,
                                              pred_b, weight_b)
        blend_mse = metrics.mean_squared_error(y_test, y_pred_blend)
        # np.round works whether the metric returns a NumPy or a plain float.
        print(
            '{} blending: {}*{} + {}*{}'.format(
                type, weight_a, name_a, weight_b, name_b),
            np.round(blend_mse, 4))

    # y_pred_blend now holds the last blend in `blends`; that is what is saved.
    dff = pd.DataFrame({
        'datetime': time_test[:, 0],
        'mnth': time_test[:, 1],
        'hr': time_test[:, 2],
        'cnt': np.expm1(y_test),
        'prediction': y_pred_blend
    })
    dff.to_csv('../output/prediction_blended.csv', index=False,
               columns=['datetime', 'mnth', 'hr', 'cnt', 'prediction'])
    print('blended predictions saved in ../output/prediction_blended.csv')
    df.to_csv('../data/hour_ext.csv')
    print('extended dataset saved in ../data/hour_ext.csv')