def applyTPOT(X_train, y_train, X_test, y_test, SavePath, popSize=20,
              number_Generations=5, kFolders=0, TPOTSingleMinutes=1,
              TPOTFullMinutes = 10, useSavedModels = True):
    """Run a TPOT regression search, export the best pipeline and predict.

    A new search is run when ``useSavedModels`` is false or when no file
    exists at ``SavePath``; the resulting pipeline code is exported to
    ``SavePath``.  Otherwise the function expects the previously exported
    pipeline code to have been pasted into the ``else`` branch below.

    Args:
        X_train, y_train: Training features and targets passed to ``fit``.
        X_test, y_test: Held-out data used for the printed score/MAE and
            for the returned predictions.
        SavePath: File the optimized pipeline code is exported to; its
            existence also decides whether a new search is run.
        popSize: TPOT ``population_size``.
        number_Generations: TPOT ``generations``.
        kFolders: TPOT ``cv``.
        TPOTSingleMinutes: TPOT ``max_eval_time_mins``.
        TPOTFullMinutes: TPOT ``max_time_mins``.
        useSavedModels: When true and ``SavePath`` exists, skip the search.

    Returns:
        Predictions of the pipeline for ``X_test``.

    Raises:
        NotImplementedError: If the search is skipped but no pipeline code
            has been pasted into the ``else`` branch, so there is no model
            to score or predict with.
    """
    # Stays None unless one of the branches below builds a pipeline.
    pipeline_optimizer = None

    if not useSavedModels or not os.path.isfile(SavePath):
        # NOTE(review): the default kFolders=0 is forwarded unchanged as
        # TPOT's ``cv`` -- confirm callers always pass a valid fold count.
        pipeline_optimizer = tpot.TPOTRegressor(generations=number_Generations, #number of iterations to run the training
                                                population_size=popSize, #number of individuals to train
                                                cv=kFolders, #number of cross-validation folds
                                                max_eval_time_mins=TPOTSingleMinutes, #time in minutes for each trial
                                                max_time_mins=TPOTFullMinutes, #time in minutes for whole optimization
                                                scoring="neg_mean_absolute_error")

        pipeline_optimizer.fit(X_train, y_train) #fit the pipeline optimizer - can take a long time
        pipeline_optimizer.export(SavePath)

    else:
        print("######### PLACE THE EXPORTED PIPELINE CODE HERE ########")
        # Template showing the shape of the code to paste in; it must bind
        # ``pipeline_optimizer`` to a fitted estimator:
        #
        # from sklearn.ensemble import ExtraTreesRegressor
        # from sklearn.pipeline import make_pipeline
        # from sklearn.preprocessing import PolynomialFeatures

        # # Average CV score on the training set was: -0.04784394817861738
        # pipeline_optimizer = make_pipeline(
        #     PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
        #     ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=12, min_samples_split=13, n_estimators=100)
        # )
        # pipeline_optimizer.fit(X_train, y_train)

    if pipeline_optimizer is None:
        # Previously this path fell through and crashed with an
        # UnboundLocalError on the first use of ``pipeline_optimizer``.
        raise NotImplementedError(
            "A pipeline file already exists at {0!r}, but no exported pipeline "
            "code has been placed in applyTPOT; paste it into the else branch "
            "or call with useSavedModels=False to run a new search.".format(SavePath)
        )

    # The scorer is the negated MAE, so the sign is flipped for display.
    print("TPOT - Score: {0}".format(-pipeline_optimizer.score(X_test, y_test)))
    y_hat = pipeline_optimizer.predict(X_test)
    print("MAE: %.4f" % mean_absolute_error(y_test, y_hat))

    return y_hat
Ejemplo n.º 2
0
    def train(
        self,
        train_file: Union[str, Path],
        validation_file: Optional[Union[str, Path]] = None,
        workdir: Optional[Union[str, Path]] = None,
    ) -> Dict[str, float]:
        """Fit a TPOT model on ``train_file`` and return the metrics from its log.

        A ``TPOTClassifier`` is used when ``self._task`` is
        ``"classification"``, otherwise a ``TPOTRegressor``; ``self._kwargs``
        is forwarded to the constructor.  The fitted pipeline is stored in
        ``self._estimator``.

        Artifacts written to the working directory:
            * ``tpot.log`` -- the TPOT log (also passed through ``TeeingIO``
              together with ``sys.stdout``),
            * ``pipeline.py`` -- the exported pipeline code,
            * ``fitted_pipeline.pkl`` -- the pickled fitted pipeline.

        Args:
            train_file: Data passed to ``self.load_data`` for training; it
                must yield targets.
            validation_file: Optional data scored with the fitted model.
            workdir: Directory for the artifacts.  It is created if missing.
                When omitted, a temporary directory is used and the
                artifacts are discarded when this method returns.

        Returns:
            The metrics produced by ``self._get_metrics_from_log`` plus, when
            ``validation_file`` is given, a ``"validation_score"`` entry.
        """
        X_train, y_train = self.load_data(train_file)
        assert y_train is not None

        with tempfile.TemporaryDirectory() as tempdir:
            workdir = Path(workdir or tempdir)
            # A caller-supplied directory may not exist yet; without this the
            # open() of the log file below fails with FileNotFoundError.
            workdir.mkdir(parents=True, exist_ok=True)
            log_file_name = workdir / "tpot.log"
            pipeline_file_name = workdir / "fitted_pipeline.pkl"
            pipeline_code_file_name = workdir / "pipeline.py"

            with open(log_file_name, "w") as log_file:
                teeing_log_file = TeeingIO(log_file, sys.stdout)
                if self._task == "classification":
                    model = tpot.TPOTClassifier(
                        log_file=teeing_log_file,
                        **self._kwargs,
                    )
                else:
                    model = tpot.TPOTRegressor(log_file=teeing_log_file,
                                               **self._kwargs)

                model.fit(X_train, y_train)

            # Read the log back while the (possibly temporary) directory
            # still exists; the metrics are parsed from it afterwards.
            with open(log_file_name) as log_file:
                tpot_log = log_file.read()

            model.export(str(pipeline_code_file_name))
            self._estimator = model.fitted_pipeline_
            with open(pipeline_file_name, "wb") as pipeline_file:
                pickle.dump(self._estimator, pipeline_file)

        metrics = self._get_metrics_from_log(tpot_log)

        if validation_file is not None:
            X_val, y_val = self.load_data(validation_file)
            assert y_val is not None
            metrics["validation_score"] = model.score(X_val, y_val)

        return metrics
    'xgboost.XGBRegressor': {
        'n_estimators': [100],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': np.arange(0.05, 1.01, 0.05),
        'min_child_weight': range(1, 21),
        'nthread': [1],
        'objective': ['reg:squarederror']
    }
    
}

# Search for a regression pipeline restricted to the operators listed in
# regressor_config_dict, evaluating candidates with 5-fold CV and `scoring`.
tpot_estimator = tpot.TPOTRegressor(
    config_dict=regressor_config_dict,
    scoring=scoring,
    cv=5,
    generations=5,
    population_size=100,
    offspring_size=250,
    early_stop=3,
    verbosity=2,
)
tpot_estimator.fit(X_train, y_train)

# Report the training-set score, then dump what the search produced.
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

# Everything in house3 past the first house_train.shape[0] rows is used as
# the test features, then pushed through the feature selector and PCA.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

# Predictions are passed through expm1 before being written out.
# NOTE(review): they come from final_rf_model, not from tpot_estimator --
# confirm this is intended.
house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data))
house_test.to_csv(
    "C:\\Users\\Algorithmica\\Downloads\\submission.csv",
    columns=["Id", "SalePrice"],
    index=False,
)
Ejemplo n.º 4
0
        day_of_week = datetime.datetime(year, int(month), int(day)).weekday()
        # day_of_week = 0 if day_of_week < 5 else 1
        row_offset = (year - start_year) * 12 * 7 * 24
        row = row_offset + int(month) + int(day_of_week)
        point = Point(crime['properties']['X'], crime['properties']['Y'])
        point = transform(project, point)
        latitude = float(truncate(float(point.x), 2))
        longitude = float(truncate(float(point.y), 2))
        if (min_lat <= latitude <= max_lat) and (min_long <= longitude <= max_long):
            try:
                lat_long_dict[latitude, longitude, year, int(month), day_of_week, hour] += 1
            except KeyError:
                pass
#
print('starting training...')
# Feature rows are the dictionary keys, targets are the stored counts.
data = np.array(list(lat_long_dict))
target = np.array([*lat_long_dict.values()])

# Earlier k-nearest-neighbours experiment, kept for reference:
# X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(data, target)
# regressor = KNeighborsRegressor(n_jobs=-1)
# regressor.fit(X_train, y_train)
# expected = y_test
# predicted = regressor.predict(X_test)
# joblib.dump(regressor, 'k_neighbors.pkl')
# print(classification_report(expected, predicted))

print('Training')
# Despite the variable name, this is a TPOT *regressor*.
crime_classifier = tpot.TPOTRegressor(
    generations=50,
    population_size=50,
    n_jobs=-1,
    verbosity=3,
)
crime_classifier.fit(data, target)
# Write the best pipeline found as standalone Python code.
crime_classifier.export('crime_predictor_classifier.py')

    # 'sklearn.preprocessing.StandardScaler': {
    # },

    # 'sklearn.feature_selection.VarianceThreshold': {
    #     'threshold': np.arange(0.05, 1.01, 0.05)
    # },
    #
    # 'sklearn.feature_selection.SelectFromModel': {
    #     'estimator': {
    #         'sklearn.linear_model.Lasso': {
    #             'alpha': np.linspace(0.001, 1.0, 100),
    #             'normalize': [True, False]
    #         }
    #     }
    # },
}

# Optional log-transform of the target (currently disabled):
#y=np.log1p(y)

# Run the TPOT search over the operators allowed by config_dict.
model = tpot.TPOTRegressor(
    config_dict=config_dict,
    generations=50,
    population_size=50,
    verbosity=3,
)
model.fit(X, y)

# Save the winning pipeline as standalone Python code.
model.export('diabetes-tpot-result.py')

# Compile the winning individual through TPOT's private toolbox, then
# evaluate it with out-of-fold predictions from 5-fold cross-validation.
pipe = model._toolbox.compile(expr=model._optimized_pipeline)
cv_pred = cross_val_predict(pipe, X, y, cv=5)
print("R2 score: %.4f" % r2_score(y, cv_pred))
Ejemplo n.º 6
0
# Model the target on a log1p scale; predictions are mapped back with expm1.
y_trans = np.log1p(y_train)
# NOTE(review): distplot is deprecated in recent seaborn releases -- consider
# histplot/displot once the minimum seaborn version is confirmed.
sns.distplot(y_trans)

# Hyper-parameter grid TPOT may sample from for the single allowed operator.
params = {
    'n_estimators': [100],
    'max_depth': range(1, 11),
    'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
    'subsample': np.arange(0.05, 1.01, 0.05),
    'min_child_weight': range(1, 21),
    'nthread': [1],
    'objective': ['reg:squarederror']
}
tpot_estimator = tpot.TPOTRegressor(
    generations=5,
    population_size=100,
    offspring_size=250,
    verbosity=2,
    early_stop=3,
    config_dict={'xgboost.XGBRegressor': params},
    cv=5)
tpot_estimator.fit(pca_data, y_trans)
# Score on the same features and target scale the estimator was fitted on.
# Scoring against (X_train, y_train) compared log-scale predictions with the
# untransformed target, on a different feature matrix.
print(tpot_estimator.score(pca_data, y_trans))
print(tpot_estimator.fitted_pipeline_)
print(tpot_estimator._optimized_pipeline)
print(tpot_estimator.evaluated_individuals_)

# Everything in house3 past the first house_train.shape[0] rows is used as
# the test features, then pushed through the feature selector and PCA.
X_test = house3[house_train.shape[0]:]
X_test1 = utils.select_features(lasso_selector, X_test)
pca_test_data = lpca.transform(X_test1)
pca_test_data.shape

# NOTE(review): predictions come from final_rf_model, not from
# tpot_estimator -- confirm this is intended.
house_test['SalePrice'] = np.expm1(final_rf_model.predict(pca_test_data))
house_test.to_csv("C:\\Users\\Algorithmica\\Downloads\\submission.csv",
Ejemplo n.º 7
0
                                  ('pca',
                                   decomposition.PCA(n_components=0.95)),
                                  ('tsne', manifold.TSNE(2))])

# Project the training data with the visualisation pipeline and plot it
# against the sale price.
tsne_data = viz_pipeline.fit_transform(house_train1)
rutils.plot_data_3d_regression(tsne_data, house_train['SalePrice'])

# Model inputs: preprocessed features and the raw SalePrice column.
X_train = preprocess_pipeline.fit_transform(house_train1)
y_train = house_train['SalePrice']

# log_rmse is an error measure, so it is registered as lower-is-better.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
tpot_estimator = tpot.TPOTRegressor(
    scoring=scoring,
    cv=5,
    generations=10,
    population_size=40,
    early_stop=2,
    random_state=100,
    config_dict=None,
    warm_start=True,
    periodic_checkpoint_folder='I:/checkpoint',
    verbosity=2,
)
tpot_estimator.fit(X_train, y_train)

# Report the training-set score and what the search produced.
print(tpot_estimator.score(X_train, y_train))
print(tpot_estimator.evaluated_individuals_)
print(tpot_estimator.fitted_pipeline_)

# Load the test data and add an empty SalePrice column to be filled in later.
house_test = pd.read_csv(os.path.join(path, "test.csv"))
house_test.shape
house_test.info()
house_test['SalePrice'] = None