def applyTPOT(X_train, y_train, X_test, y_test, SavePath, popSize=20,
              number_Generations=5, kFolders=0, TPOTSingleMinutes=1,
              TPOTFullMinutes=10, useSavedModels=True):
    """Optimise a TPOT regression pipeline, export it and predict on the test set.

    A new ``tpot.TPOTRegressor`` search is run unless ``useSavedModels`` is true
    and ``SavePath`` already exists. The search result is exported as Python
    source to ``SavePath``; it is not reloaded automatically on later calls.

    Args:
        X_train, y_train: Training features and targets passed to ``fit``.
        X_test, y_test: Hold-out data used for the printed score/MAE and for
            the returned predictions.
        SavePath: File the optimised pipeline is exported to; its existence
            (together with ``useSavedModels``) decides whether a search is run.
        popSize: Forwarded as ``population_size``.
        number_Generations: Forwarded as ``generations``.
        kFolders: Forwarded as ``cv``.
            NOTE(review): the default of 0 is presumably not a usable fold
            count for scikit-learn splitters -- confirm callers override it.
        TPOTSingleMinutes: Forwarded as ``max_eval_time_mins``.
        TPOTFullMinutes: Forwarded as ``max_time_mins``.
        useSavedModels: When true and ``SavePath`` exists, skip the search.

    Returns:
        Predictions of the optimised pipeline for ``X_test``.

    Raises:
        NotImplementedError: If the search is skipped because an export
            already exists; the exported pipeline code has to be pasted into
            this function by hand before that path can produce predictions.
    """
    if useSavedModels and os.path.isfile(SavePath):
        print("######### PLACE THE EXPORTED PIPELINE CODE HERE ########")
        # Example of what the pasted export looks like:
        # from sklearn.ensemble import ExtraTreesRegressor
        # from sklearn.pipeline import make_pipeline
        # from sklearn.preprocessing import PolynomialFeatures
        # # Average CV score on the training set was: -0.04784394817861738
        # pipeline_optimizer = make_pipeline(
        #     PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
        #     ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=12,
        #                         min_samples_split=13, n_estimators=100)
        # )
        # pipeline_optimizer.fit(X_train, y_train)
        #
        # Without pasted code there is no estimator to score; fail with a
        # clear message instead of an UnboundLocalError further down.
        raise NotImplementedError(
            "useSavedModels=True and an exported pipeline exists at {0!r}, but "
            "its code has not been pasted into applyTPOT; paste it or call "
            "with useSavedModels=False.".format(SavePath)
        )

    pipeline_optimizer = tpot.TPOTRegressor(
        generations=number_Generations,        # number of iterations to run the training
        population_size=popSize,               # number of individuals to train
        cv=kFolders,                           # number of cross-validation folds
        max_eval_time_mins=TPOTSingleMinutes,  # time in minutes for each trial
        max_time_mins=TPOTFullMinutes,         # time in minutes for whole optimization
        scoring="neg_mean_absolute_error",
    )
    # Fit the pipeline optimizer - can take a long time.
    pipeline_optimizer.fit(X_train, y_train)
    pipeline_optimizer.export(SavePath)

    # The scorer is a negated error, so flip the sign for display.
    print("TPOT - Score: {0}".format(-pipeline_optimizer.score(X_test, y_test)))
    y_hat = pipeline_optimizer.predict(X_test)
    print("MAE: %.4f" % mean_absolute_error(y_test, y_hat))
    return y_hat
def train(
    self,
    train_file: Union[str, Path],
    validation_file: Optional[Union[str, Path]] = None,
    workdir: Optional[Union[str, Path]] = None,
) -> Dict[str, float]:
    """Fit a TPOT model on ``train_file`` and return metrics parsed from its log.

    Depending on ``self._task`` a ``tpot.TPOTClassifier`` or a
    ``tpot.TPOTRegressor`` is built from ``self._kwargs``. TPOT's log output is
    written to ``tpot.log`` (and echoed to stdout), the best pipeline is
    exported as ``pipeline.py`` and pickled as ``fitted_pipeline.pkl``; the
    fitted pipeline is also kept on ``self._estimator``.

    Args:
        train_file: Training data, read through ``self.load_data``.
        validation_file: Optional validation data; when given, the model's
            score on it is added to the metrics as ``"validation_score"``.
        workdir: Directory for the log and pipeline artefacts. It is created
            if missing. When omitted, a temporary directory is used and
            removed again before returning.

    Returns:
        The metrics extracted by ``self._get_metrics_from_log``, plus
        ``"validation_score"`` when a validation file was supplied.
    """
    X_train, y_train = self.load_data(train_file)
    assert y_train is not None

    with tempfile.TemporaryDirectory() as tempdir:
        workdir = Path(workdir or tempdir)
        # A caller-supplied directory may not exist yet; opening the log file
        # below would otherwise fail with FileNotFoundError.
        workdir.mkdir(parents=True, exist_ok=True)

        log_file_name = workdir / "tpot.log"
        pipeline_file_name = workdir / "fitted_pipeline.pkl"
        pipeline_code_file_name = workdir / "pipeline.py"

        with open(log_file_name, "w") as log_file:
            teeing_log_file = TeeingIO(log_file, sys.stdout)
            if self._task == "classification":
                model = tpot.TPOTClassifier(
                    log_file=teeing_log_file,
                    **self._kwargs,
                )
            else:
                model = tpot.TPOTRegressor(log_file=teeing_log_file, **self._kwargs)
            model.fit(X_train, y_train)

        # Read the log back only after the writer is closed, so buffered
        # output has reached the file.
        with open(log_file_name) as log_file:
            tpot_log = log_file.read()

        model.export(str(pipeline_code_file_name))
        self._estimator = model.fitted_pipeline_
        with open(pipeline_file_name, "wb") as pipeline_file:
            pickle.dump(self._estimator, pipeline_file)

        metrics = self._get_metrics_from_log(tpot_log)
        if validation_file is not None:
            X_val, y_val = self.load_data(validation_file)
            assert y_val is not None
            metrics["validation_score"] = model.score(X_val, y_val)

    return metrics
'xgboost.XGBRegressor': { 'n_estimators': [100], 'max_depth': range(1, 11), 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], 'subsample': np.arange(0.05, 1.01, 0.05), 'min_child_weight': range(1, 21), 'nthread': [1], 'objective': ['reg:squarederror'] } } tpot_estimator = tpot.TPOTRegressor(generations=5, population_size=100, offspring_size=250, verbosity=2, early_stop=3, config_dict=regressor_config_dict, cv = 5, scoring = scoring) tpot_estimator.fit(X_train, y_train) print(tpot_estimator.score(X_train, y_train)) print(tpot_estimator.fitted_pipeline_) print(tpot_estimator._optimized_pipeline) print(tpot_estimator.evaluated_individuals_) X_test = house3[house_train.shape[0]:] X_test1 = utils.select_features(lasso_selector, X_test) pca_test_data = lpca.transform(X_test1) pca_test_data.shape house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data)) house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv", columns=["Id", "SalePrice"], index=False)
# NOTE(review): this fragment appears to have lost its line breaks. As written,
# everything after the first '#' on the line below is a comment; restore the
# original line structure before running it.
#
# Intended flow, as far as the visible statements show:
#   1. Compute the weekday of the record's (year, month, day) and project the
#      point built from crime['properties']['X'/'Y']; both projected
#      coordinates are passed through truncate(..., 2).
#   2. If the coordinates fall inside [min_lat, max_lat] x [min_long, max_long],
#      increment lat_long_dict[latitude, longitude, year, month, day_of_week,
#      hour]. A KeyError (key not already present) is silently ignored.
#   3. Build `data` from the dict's keys and `target` from its values, fit a
#      tpot.TPOTRegressor (n_jobs=-1, verbosity=3, generations=50,
#      population_size=50) on them and export the result to
#      'crime_predictor_classifier.py'.
#
# NOTE(review): `row_offset` and `row` are assigned but not used in the visible code.
# NOTE(review): `latitude` is read from point.x and `longitude` from point.y --
# confirm this matches the axis order of the projection.
day_of_week = datetime.datetime(year, int(month), int(day)).weekday() # day_of_week = 0 if day_of_week < 5 else 1 row_offset = (year - start_year) * 12 * 7 * 24 row = row_offset + int(month) + int(day_of_week) point = Point(crime['properties']['X'], crime['properties']['Y']) point = transform(project, point) latitude = float(truncate(float(point.x), 2)) longitude = float(truncate(float(point.y), 2)) if (min_lat <= latitude <= max_lat) and (min_long <= longitude <= max_long): try: lat_long_dict[latitude, longitude, year, int(month), day_of_week, hour] += 1 except KeyError: pass # print('starting training...') data = np.array(list(lat_long_dict.keys())) target = np.array(list(lat_long_dict.values())) # X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data, target) # regressor = KNeighborsRegressor(n_jobs=-1) # regressor.fit(X_train, y_train) # expected = y_test # predicted = regressor.predict(X_test) # joblib.dump(regressor, 'k_neighbors.pkl') # print(classification_report(expected, predicted)) print('Training') crime_classifier = tpot.TPOTRegressor(n_jobs=-1, verbosity=3, generations=50, population_size=50) crime_classifier.fit(data, target) crime_classifier.export('crime_predictor_classifier.py')
# 'sklearn.preprocessing.StandardScaler': { # }, # 'sklearn.feature_selection.VarianceThreshold': { # 'threshold': np.arange(0.05, 1.01, 0.05) # }, # # 'sklearn.feature_selection.SelectFromModel': { # 'estimator': { # 'sklearn.linear_model.Lasso': { # 'alpha': np.linspace(0.001, 1.0, 100), # 'normalize': [True, False] # } # } # }, } #y=np.log1p(y) model = tpot.TPOTRegressor(generations=50, population_size=50, verbosity=3, config_dict=config_dict) model.fit(X, y) model.export('diabetes-tpot-result.py') pipe = model._toolbox.compile(expr=model._optimized_pipeline) cv_pred = cross_val_predict(pipe, X, y, cv=5) print("R2 score: %.4f" % r2_score(y, cv_pred))
y_trans = np.log1p(y_train) sns.distplot(y_trans) params = { 'n_estimators': [100], 'max_depth': range(1, 11), 'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.], 'subsample': np.arange(0.05, 1.01, 0.05), 'min_child_weight': range(1, 21), 'nthread': [1], 'objective': ['reg:squarederror'] } tpot_estimator = tpot.TPOTRegressor( generations=5, population_size=100, offspring_size=250, verbosity=2, early_stop=3, config_dict={'xgboost.XGBRegressor': params}, cv=5) tpot_estimator.fit(pca_data, y_trans) print(tpot_estimator.score(X_train, y_train)) print(tpot_estimator.fitted_pipeline_) print(tpot_estimator._optimized_pipeline) print(tpot_estimator.evaluated_individuals_) X_test = house3[house_train.shape[0]:] X_test1 = utils.select_features(lasso_selector, X_test) pca_test_data = lpca.transform(X_test1) pca_test_data.shape house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data)) house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
('pca', decomposition.PCA(n_components=0.95)), ('tsne', manifold.TSNE(2))]) tsne_data = viz_pipeline.fit_transform(house_train1) rutils.plot_data_3d_regression(tsne_data, house_train['SalePrice']) X_train = preprocess_pipeline.fit_transform(house_train1) y_train = house_train['SalePrice'] scoring = metrics.make_scorer(log_rmse, greater_is_better=False) tpot_estimator = tpot.TPOTRegressor(generations=10, population_size=40, verbosity=2, early_stop=2, random_state=100, cv=5, scoring=scoring, config_dict=None, warm_start=True, periodic_checkpoint_folder='I:/checkpoint') tpot_estimator.fit(X_train, y_train) print(tpot_estimator.score(X_train, y_train)) print(tpot_estimator.evaluated_individuals_) print(tpot_estimator.fitted_pipeline_) #read test data house_test = pd.read_csv(os.path.join(path, "test.csv")) house_test.shape house_test.info() house_test['SalePrice'] = None