Beispiel #1
0
 def test_sparse_matrix_lr(self):
     """Run AutoML with the "lrl1"/"lrl2" estimators on random sparse data.

     ``fit`` is called twice on the same data: first with
     ``train_time_limit=1`` and a ``time_budget`` of 3, then with the
     budget raised to 5 and no ``train_time_limit``. Nothing is asserted;
     the fitted state is printed for inspection.
     """
     automl = AutoML()
     settings = dict(
         time_budget=3,
         metric="f1",
         task="classification",
         log_file_name="test/sparse_classification.log",
         estimator_list=["lrl1", "lrl2"],
         log_type="all",
         n_jobs=1,
     )
     X_train = scipy.sparse.random(3000, 3000, density=0.1)
     y_train = np.random.randint(2, size=3000)
     data = {"X_train": X_train, "y_train": y_train}

     automl.fit(**data, train_time_limit=1, **settings)

     # Second pass reuses the same settings dict with a larger budget.
     settings["time_budget"] = 5
     automl.fit(**data, **settings)

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("lrl2"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))
Beispiel #2
0
 def test_sparse_matrix_regression(self):
     """Fit a regression AutoML run on sparse train/validation matrices.

     Asserts that the validation matrix stored on the search state has
     the same shape as the one passed to ``fit``; everything else is
     printed for inspection only.
     """
     n_features = 900
     X_train = scipy.sparse.random(300, n_features, density=0.0001)
     y_train = np.random.uniform(size=300)
     X_val = scipy.sparse.random(100, n_features, density=0.0001)
     y_val = np.random.uniform(size=100)

     automl = AutoML()
     settings = dict(
         time_budget=2,
         metric="mae",
         task="regression",
         log_file_name="test/sparse_regression.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         verbose=0,
         early_stop=True,
     )
     automl.fit(
         X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
     )

     stored_shape = automl._state.X_val.shape
     assert stored_shape == X_val.shape

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("rf"))
     for attr in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(automl, attr))
Beispiel #3
0
 def test_regression_xgboost(self):
     """Fit a sparse regression task using two custom learners.

     Registers ``MyXGB1`` and ``MyXGB2`` under the names "my_xgb1" and
     "my_xgb2", restricts the search to them, and asserts that the
     validation matrix kept on the search state has the shape that was
     passed in. The remaining results are printed only.
     """
     n_features = 900
     X_train = scipy.sparse.random(300, n_features, density=0.0001)
     y_train = np.random.uniform(size=300)
     X_val = scipy.sparse.random(100, n_features, density=0.0001)
     y_val = np.random.uniform(size=100)

     automl = AutoML()
     # Registration order matches the estimator_list below.
     for learner_name, learner_class in (("my_xgb1", MyXGB1), ("my_xgb2", MyXGB2)):
         automl.add_learner(learner_name=learner_name, learner_class=learner_class)

     settings = dict(
         time_budget=2,
         estimator_list=["my_xgb1", "my_xgb2"],
         task="regression",
         log_file_name="test/regression_xgboost.log",
         n_jobs=1,
         model_history=True,
         keep_search_state=True,
         early_stop=True,
     )
     automl.fit(
         X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val, **settings
     )

     stored_shape = automl._state.X_val.shape
     assert stored_shape == X_val.shape

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("my_xgb2"))
     for attr in (
         "best_iteration",
         "best_estimator",
         "best_config",
         "best_loss",
         "best_config_train_time",
     ):
         print(getattr(automl, attr))
Beispiel #4
0
    def test_random_skip_oom(self):
        """Run random-search AutoML with the custom "large_lgbm" learner.

        Uses ``hpo_method="random"`` with two concurrent trials on a
        900000 x 900000 sparse identity matrix. If the run raises
        ``ImportError`` the test prints a skip message and ends without
        failing.
        """
        automl = AutoML()
        automl.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
        settings = dict(
            time_budget=2,
            task="classification",
            log_file_name="test/sparse_classification_oom.log",
            estimator_list=["large_lgbm"],
            log_type="all",
            n_jobs=1,
            hpo_method="random",
            n_concurrent_trials=2,
        )
        n_samples = 900000
        X_train = scipy.sparse.eye(n_samples)
        y_train = np.random.randint(2, size=n_samples)

        try:
            automl.fit(X_train=X_train, y_train=y_train, **settings)
            print(automl.predict(X_train))
            for attr in ("model", "config_history"):
                print(getattr(automl, attr))
            print(automl.best_model_for_estimator("large_lgbm"))
            for attr in ("best_iteration", "best_estimator"):
                print(getattr(automl, attr))
        except ImportError:
            # Nothing follows the try block, so falling through ends the test.
            print("skipping concurrency test as ray is not installed")
Beispiel #5
0
    def test_parallel_xgboost(self, hpo_method=None):
        """Run xgboost-only AutoML with two concurrent trials via ray.

        The training matrix is placed in the ray object store with
        ``ray.put`` and the resulting reference is what ``fit`` receives.
        An ``ImportError`` anywhere in the run ends the test silently.

        Args:
            hpo_method: Forwarded to ``fit`` as the ``hpo_method`` setting.
        """
        automl = AutoML()
        settings = dict(
            time_budget=10,
            metric="ap",
            task="classification",
            log_file_name="test/sparse_classification.log",
            estimator_list=["xgboost"],
            log_type="all",
            n_jobs=1,
            n_concurrent_trials=2,
            hpo_method=hpo_method,
        )
        n_samples = 900000
        X_train = scipy.sparse.eye(n_samples)
        y_train = np.random.randint(2, size=n_samples)
        try:
            import ray

            automl.fit(X_train=ray.put(X_train), y_train=y_train, **settings)
            # Prediction uses the local matrix, not the ray reference.
            print(automl.predict(X_train))
            for attr in ("model", "config_history"):
                print(getattr(automl, attr))
            print(automl.best_model_for_estimator("xgboost"))
            for attr in ("best_iteration", "best_estimator"):
                print(getattr(automl, attr))
        except ImportError:
            return
Beispiel #6
0
    def test_custom_metric(self):
        """Fit on the iris dataframe with a user-supplied metric callable.

        After fitting, an estimator is rebuilt from record 0 of the log on
        a fresh ``AutoML`` instance and the logged metric history is
        printed. If ray can be imported, the same settings are fitted once
        more with the dataframe stored via ``ray.put`` and
        ``use_ray=True``; an ``ImportError`` in that step is ignored.
        """
        df, y = load_iris(return_X_y=True, as_frame=True)
        df["label"] = y
        automl = AutoML()
        log_path = "test/iris_custom.log"
        settings = dict(
            dataframe=df,
            label="label",
            time_budget=5,
            eval_method="cv",
            metric=custom_metric,
            task="classification",
            log_file_name=log_path,
            log_training_metric=True,
            log_type="all",
            n_jobs=1,
            model_history=True,
            sample_weight=np.ones(len(y)),
            pred_time_limit=1e-5,
            ensemble=True,
        )
        automl.fit(**settings)
        for attr in ("classes_", "model", "config_history"):
            print(getattr(automl, attr))
        print(automl.best_model_for_estimator("rf"))
        for attr in ("best_iteration", "best_estimator"):
            print(getattr(automl, attr))

        # Fresh instance: the estimator is rebuilt from the log alone.
        automl = AutoML()
        estimator = automl.get_estimator_from_log(log_path, record_id=0, task="multi")
        print(estimator)

        # Five values are returned; only the metric history is displayed.
        (
            _time_history,
            _best_valid_loss_history,
            _valid_loss_history,
            _config_history,
            metric_history,
        ) = get_output_from_log(filename=log_path, time_budget=6)
        print(metric_history)

        try:
            import ray

            settings["dataframe"] = ray.put(df)
            settings["use_ray"] = True
            automl.fit(**settings)
        except ImportError:
            pass
Beispiel #7
0
 def test_custom_learner(self):
     """Fit on the wine dataset with a custom "RGF" learner registered.

     The search covers "RGF", "lgbm", "rf" and "xgboost"; the best model
     found for "RGF" is printed afterwards.
     """
     automl = AutoML()
     automl.add_learner(
         learner_name="RGF", learner_class=MyRegularizedGreedyForest
     )
     X_train, y_train = load_wine(return_X_y=True)
     settings = dict(
         time_budget=10,  # total running time in seconds
         estimator_list=["RGF", "lgbm", "rf", "xgboost"],
         task="classification",  # task type
         sample=True,  # whether to subsample training data
         log_file_name="test/wine.log",
         log_training_metric=True,  # whether to log training metric
         n_jobs=1,
     )
     # The main flaml automl API
     automl.fit(X_train=X_train, y_train=y_train, **settings)
     # print the best model found for RGF
     rgf_model = automl.best_model_for_estimator("RGF")
     print(rgf_model)
Beispiel #8
0
 def test_regression(self):
     """Fit a regression run on California housing with an explicit holdout.

     The first nine tenths of the rows are used for training and the
     remainder for validation; the test asserts that the resulting
     evaluation method is "holdout". It then retrains from the log on the
     full data twice, with ``time_budget`` 1 and then 0.
     """
     automl = AutoML()
     settings = dict(
         time_budget=2,
         task="regression",
         log_file_name="test/california.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     X_train, y_train = fetch_california_housing(return_X_y=True)
     split = len(y_train) * 9 // 10
     automl.fit(
         X_train=X_train[:split],
         y_train=y_train[:split],
         X_val=X_train[split:],
         y_val=y_train[split:],
         **settings,
     )
     assert automl._state.eval_method == "holdout"

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("xgboost"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))

     log_path = settings["log_file_name"]
     print(get_output_from_log(log_path, 1))

     # Retrain on all rows, first with a 1-second budget and then with 0.
     for budget in (1, 0):
         automl.retrain_from_log(
             task="regression",
             log_file_name=log_path,
             X_train=X_train,
             y_train=y_train,
             train_full=True,
             time_budget=budget,
         )
Beispiel #9
0
 def test_sparse_matrix_classification(self):
     """Fit a 3-class classification run on an integer sparse matrix.

     Uses ``metric="auto"`` and ``split_type="uniform"``; class labels,
     predicted probabilities and the search results are printed, nothing
     is asserted.
     """
     automl = AutoML()
     settings = dict(
         time_budget=2,
         metric="auto",
         task="classification",
         log_file_name="test/sparse_classification.log",
         split_type="uniform",
         n_jobs=1,
         model_history=True,
     )
     n_samples = 1554
     X_train = scipy.sparse.random(n_samples, 21, dtype=int)
     y_train = np.random.randint(3, size=n_samples)
     automl.fit(X_train=X_train, y_train=y_train, **settings)

     print(automl.classes_)
     print(automl.predict_proba(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("extra_tree"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))
Beispiel #10
0
    def test_custom_learner(self):
        """Fit with a custom "RGF" learner, then refit with an empty search space.

        The first ``fit`` searches over "RGF", "lgbm", "rf" and "xgboost" on
        the wine dataset and prints the best "RGF" model. The second ``fit``
        runs while ``MyRegularizedGreedyForest.search_space`` is replaced by
        a callable that returns an empty dict.

        The replacement is scoped with ``mock.patch.object`` so the class
        attribute is restored when the block exits (even if ``fit`` raises).
        Assigning it directly would leave the learner class patched for
        every test that runs afterwards.
        """
        from unittest import mock

        automl = AutoML()
        automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 8,  # total running time in seconds
            "estimator_list": ["RGF", "lgbm", "rf", "xgboost"],
            "task": "classification",  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
            "n_jobs": 1,
        }

        # The main flaml automl API
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        # print the best model found for RGF
        print(automl.best_model_for_estimator("RGF"))

        # Temporarily give the custom learner an empty search space; the
        # original attribute is put back on exit from the with-block.
        with mock.patch.object(
            MyRegularizedGreedyForest,
            "search_space",
            lambda data_size, task: {},
        ):
            automl.fit(X_train=X_train, y_train=y_train, **settings)
Beispiel #11
0
 def test_sparse_matrix_xgboost(self):
     """Fit xgboost-only AutoML on a 900000 x 900000 sparse identity matrix.

     Uses the "ap" metric for a binary classification task with random
     labels. Results are printed, not asserted.
     """
     automl = AutoML()
     settings = dict(
         time_budget=3,
         metric="ap",
         task="classification",
         log_file_name="test/sparse_classification.log",
         estimator_list=["xgboost"],
         log_type="all",
         n_jobs=1,
     )
     n_samples = 900000
     X_train = scipy.sparse.eye(n_samples)
     y_train = np.random.randint(2, size=n_samples)
     automl.fit(X_train=X_train, y_train=y_train, **settings)

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("xgboost"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))
Beispiel #12
0
 def test_classification(self, as_frame=False):
     """Fit on iris, then retrain from record 0 of the log on a new instance.

     Args:
         as_frame: When true, iris is loaded as a DataFrame, its columns
             are renamed to integer positions and an extra all-zero
             column is appended before fitting.
     """
     automl = AutoML()
     settings = dict(
         time_budget=4,
         metric="accuracy",
         task="classification",
         log_file_name="test/iris.log",
         log_training_metric=True,
         n_jobs=1,
         model_history=True,
     )
     X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
     if as_frame:
         # test drop column
         n_cols = X_train.shape[1]
         X_train.columns = range(n_cols)
         X_train[n_cols] = np.zeros(len(y_train))
     automl.fit(X_train=X_train, y_train=y_train, **settings)

     print(automl.classes_)
     print(automl.predict(X_train)[:5])
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("catboost"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))

     # Drop the fit-only entries; only the log file name is read below.
     for key in ("metric", "model_history", "log_training_metric"):
         del settings[key]

     automl = AutoML(task="classification")
     duration = automl.retrain_from_log(
         log_file_name=settings["log_file_name"],
         X_train=X_train,
         y_train=y_train,
         train_full=True,
         record_id=0,
     )
     print(duration)
     print(automl.model)
     print(automl.predict_proba(X_train)[:5])
Beispiel #13
0
 def test_sparse_matrix_regression_holdout(self):
     """Fit a holdout-evaluated regression run on an 8 x 100 sparse matrix.

     Passes unit sample weights (one per training row) and the "mse"
     metric. Results are printed, not asserted.
     """
     X_train = scipy.sparse.random(8, 100)
     y_train = np.random.uniform(size=8)
     automl = AutoML()
     unit_weights = np.ones(len(y_train))
     settings = dict(
         time_budget=1,
         eval_method="holdout",
         task="regression",
         log_file_name="test/sparse_regression.log",
         n_jobs=1,
         model_history=True,
         metric="mse",
         sample_weight=unit_weights,
         early_stop=True,
     )
     automl.fit(X_train=X_train, y_train=y_train, **settings)

     print(automl.predict(X_train))
     for attr in ("model", "config_history"):
         print(getattr(automl, attr))
     print(automl.best_model_for_estimator("rf"))
     for attr in ("best_iteration", "best_estimator"):
         print(getattr(automl, attr))
Beispiel #14
0
 def test_parallel(self, hpo_method=None):
     """Run a regression search with ``n_concurrent_trials=10``.

     An ``ImportError`` raised during the run ends the test silently.

     Args:
         hpo_method: Forwarded to ``fit`` as the ``hpo_method`` setting.
     """
     automl = AutoML()
     settings = dict(
         time_budget=10,
         task="regression",
         log_file_name="test/california.log",
         log_type="all",
         n_jobs=1,
         n_concurrent_trials=10,
         hpo_method=hpo_method,
     )
     X_train, y_train = fetch_california_housing(return_X_y=True)
     try:
         automl.fit(X_train=X_train, y_train=y_train, **settings)
         print(automl.predict(X_train))
         for attr in ("model", "config_history"):
             print(getattr(automl, attr))
         print(automl.best_model_for_estimator("xgboost"))
         for attr in ("best_iteration", "best_estimator"):
             print(getattr(automl, attr))
     except ImportError:
         return
Beispiel #15
0
    def test_training_log(self,
                          path="test_training_log.log",
                          estimator_list="auto",
                          use_ray=False):
        """Check that a training log is written and that a logged config can be replayed.

        Runs a short regression search that logs to a file inside a
        temporary directory, asserts the file exists, retrains the best
        estimator/config, and compares that model with one produced by a
        fresh ``AutoML`` started from the same config. Finally reads the
        log back and exercises a few additional ``fit`` calls.

        Args:
            path: File name of the log, joined onto the temporary directory.
            estimator_list: Forwarded to the first ``fit`` as ``estimator_list``.
            use_ray: Forwarded as ``use_ray`` to the ``fit`` call that starts
                from the saved config.
        """

        with TemporaryDirectory() as d:
            filename = os.path.join(d, path)

            # Run a simple job.
            automl = AutoML()
            automl_settings = {
                "time_budget": 1,
                "metric": "mse",
                "task": "regression",
                "log_file_name": filename,
                "log_training_metric": True,
                "mem_thres": 1024 * 1024,
                "n_jobs": 1,
                "model_history": True,
                "train_time_limit": 0.1,
                "verbose": 3,
                # "ensemble": True,
                "keep_search_state": True,
                "estimator_list": estimator_list,
            }
            X_train, y_train = fetch_california_housing(return_X_y=True)
            automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
            # Check if the training log file is populated.
            self.assertTrue(os.path.exists(filename))
            # Everything in this branch needs a best estimator from the search.
            if automl.best_estimator:
                estimator, config = automl.best_estimator, automl.best_config
                model0 = automl.best_model_for_estimator(estimator)
                print(model0.params["n_estimators"], config)

                # train on full data with no time limit
                automl._state.time_budget = None
                model, _ = automl._state._train_with_config(estimator, config)

                # assuming estimator & config are saved and loaded as follows
                automl = AutoML()
                automl.fit(
                    X_train=X_train,
                    y_train=y_train,
                    max_iter=1,
                    task="regression",
                    estimator_list=[estimator],
                    n_jobs=1,
                    starting_points={estimator: config},
                    use_ray=use_ray,
                )
                print(automl.best_config)
                # then the fitted model should be equivalent to model
                # `and` binds tighter than `or`, so this reads as:
                #   string forms equal
                #   OR (xgboost AND dumps equal)
                #   OR (catboost AND all-params equal)
                assert (str(model.estimator) == str(automl.model.estimator)
                        or estimator == "xgboost"
                        and str(model.estimator.get_dump()) == str(
                            automl.model.estimator.get_dump())
                        or estimator == "catboost"
                        and str(model.estimator.get_all_params()) == str(
                            automl.model.estimator.get_all_params()))
                # Same single-iteration fit, but starting from an empty config.
                automl.fit(
                    X_train=X_train,
                    y_train=y_train,
                    max_iter=1,
                    task="regression",
                    estimator_list=[estimator],
                    n_jobs=1,
                    starting_points={estimator: {}},
                )
                print(automl.best_config)

                # Read the log back and require at least one record.
                with training_log_reader(filename) as reader:
                    count = 0
                    for record in reader.records():
                        print(record)
                        count += 1
                    self.assertGreater(count, 0)

            # Fit again with an empty log file name.
            # NOTE(review): presumably this disables log writing - confirm.
            automl_settings["log_file_name"] = ""
            automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
            # NOTE(review): calls a private attribute with a None first
            # argument; the intent is not visible from this test - confirm.
            automl._selected.update(None, 0)
            # Fresh instance fitted with max_iter=0.
            automl = AutoML()
            automl.fit(X_train=X_train,
                       y_train=y_train,
                       max_iter=0,
                       task="regression")