Example 1
    def test_random_skip_oom(self):
        automl_experiment = AutoML()
        automl_experiment.add_learner(learner_name="large_lgbm",
                                      learner_class=MyLargeLGBM)
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/sparse_classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "n_jobs": 1,
            "hpo_method": "random",
            "n_concurrent_trials": 2,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)

        try:
            automl_experiment.fit(X_train=X_train,
                                  y_train=y_train,
                                  **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.best_model_for_estimator("large_lgbm"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            print("skipping concurrency test as ray is not installed")
            return
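The MyLargeLGBM learner registered above is not defined in this listing. A minimal sketch, assuming FLAML's documented pattern of subclassing LGBMEstimator and overriding search_space to force very large models (the class body and bound values here are illustrative assumptions):

from flaml.model import LGBMEstimator
from flaml import tune


class MyLargeLGBM(LGBMEstimator):
    # Search space deliberately initialized at huge values so that trials
    # are likely to exhaust memory and exercise the OOM-skipping logic.
    @classmethod
    def search_space(cls, data_size, **params):
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            },
            "num_leaves": {
                "domain": tune.lograndint(lower=4, upper=32768),
                "init_value": 32768,
                "low_cost_init_value": 4,
            },
        }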
Example 2
 def test_sparse_matrix_lr(self):
     automl_experiment = AutoML()
     automl_settings = {
         "time_budget": 3,
         "metric": "f1",
         "task": "classification",
         "log_file_name": "test/sparse_classification.log",
         "estimator_list": ["lrl1", "lrl2"],
         "log_type": "all",
         "n_jobs": 1,
     }
     X_train = scipy.sparse.random(3000, 3000, density=0.1)
     y_train = np.random.randint(2, size=3000)
     automl_experiment.fit(X_train=X_train,
                           y_train=y_train,
                           train_time_limit=1,
                           **automl_settings)
     automl_settings["time_budget"] = 5
     automl_experiment.fit(X_train=X_train,
                           y_train=y_train,
                           **automl_settings)
     print(automl_experiment.predict(X_train))
     print(automl_experiment.model)
     print(automl_experiment.config_history)
     print(automl_experiment.best_model_for_estimator("lrl2"))
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)
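The snippets in this listing omit their imports. A representative header, with module paths inferred from the calls they make (treat the exact locations as assumptions), could be:

import os
import numpy as np
import scipy.sparse
from sklearn.datasets import (
    load_iris, load_wine, load_breast_cancer,
    load_boston,  # requires an older scikit-learn release
    fetch_california_housing,
)
from sklearn.model_selection import train_test_split
from flaml import AutoML
from flaml.data import get_output_from_log  # tuning history used in several examples
from flaml.training_log import training_log_reader  # used by the training-log example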
Example 3
    def test_parallel_xgboost(self, hpo_method=None):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            "metric": "ap",
            "task": "classification",
            "log_file_name": "test/sparse_classification.log",
            "estimator_list": ["xgboost"],
            "log_type": "all",
            "n_jobs": 1,
            "n_concurrent_trials": 2,
            "hpo_method": hpo_method,
        }
        X_train = scipy.sparse.eye(900000)
        y_train = np.random.randint(2, size=900000)
        try:
            import ray

            X_train_ref = ray.put(X_train)
            automl_experiment.fit(X_train=X_train_ref,
                                  y_train=y_train,
                                  **automl_settings)
            print(automl_experiment.predict(X_train))
            print(automl_experiment.model)
            print(automl_experiment.config_history)
            print(automl_experiment.best_model_for_estimator("xgboost"))
            print(automl_experiment.best_iteration)
            print(automl_experiment.best_estimator)
        except ImportError:
            return
Example 4
    def test_classification(self, as_frame=False):

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 4,
            "metric": 'accuracy',
            "task": 'classification',
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        automl_experiment.fit(X_train=X_train,
                              y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train)[:5])
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        del automl_settings["metric"]
        del automl_settings["model_history"]
        del automl_settings["log_training_metric"]
        automl_experiment = AutoML()
        duration = automl_experiment.retrain_from_log(
            log_file_name=automl_settings["log_file_name"],
            X_train=X_train,
            y_train=y_train,
            train_full=True,
            record_id=0)
        print(duration)
        print(automl_experiment.model)
        print(automl_experiment.predict_proba(X_train)[:5])
Example 5
    def test_training_log(self):

        with TemporaryDirectory() as d:
            filename = os.path.join(d, 'test_training_log.log')

            # Run a simple job.
            automl_experiment = AutoML()
            automl_settings = {
                "time_budget": 2,
                "metric": 'mse',
                "task": 'regression',
                "log_file_name": filename,
                "log_training_metric": True,
                "mem_thres": 1024*1024,
                "n_jobs": 1,
                "model_history": True
            }
            X_train, y_train = load_boston(return_X_y=True)            
            automl_experiment.fit(X_train=X_train, y_train=y_train,
                                  **automl_settings)

            # Check if the training log file is populated.
            self.assertTrue(os.path.exists(filename))
            with training_log_reader(filename) as reader:
                count = 0
                for record in reader.records():
                    print(record)
                    count += 1
                self.assertGreater(count, 0)
Example 6
    def test_micro_macro_f1(self):
        automl_experiment_micro = AutoML()
        automl_experiment_macro = AutoML()
        automl_settings = {
            "time_budget": 2,
            "task": "classification",
            "log_file_name": "test/micro_macro_f1.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment_micro.fit(
            X_train=X_train, y_train=y_train, metric="micro_f1", **automl_settings
        )
        automl_experiment_macro.fit(
            X_train=X_train, y_train=y_train, metric="macro_f1", **automl_settings
        )
        estimator = automl_experiment_macro.model
        y_pred = estimator.predict(X_train)
        y_pred_proba = estimator.predict_proba(X_train)
        from flaml.ml import norm_confusion_matrix, multi_class_curves

        print(norm_confusion_matrix(y_train, y_pred))
        from sklearn.metrics import roc_curve, precision_recall_curve

        print(multi_class_curves(y_train, y_pred_proba, roc_curve))
        print(multi_class_curves(y_train, y_pred_proba, precision_recall_curve))
Example 7
    def test_ray_classification(self):
        X, y = load_breast_cancer(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25)

        automl = AutoML()
        try:
            automl.fit(
                X_train,
                y_train,
                X_val=X_test,
                y_val=y_test,
                time_budget=10,
                task="classification",
                use_ray=True,
            )
            automl.fit(
                X_train,
                y_train,
                X_val=X_test,
                y_val=y_test,
                time_budget=10,
                task="classification",
                n_concurrent_trials=2,
            )
        except ImportError:
            return
Example 8
    def test_sparse_matrix_regression(self):

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mae',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(300, 900, density=0.0001)
        y_train = np.random.uniform(size=300)
        X_val = scipy.sparse.random(100, 900, density=0.0001)
        y_val = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train,
                              y_train=y_train,
                              X_val=X_val,
                              y_val=y_val,
                              **automl_settings)
        assert automl_experiment.X_val.shape == X_val.shape
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(automl_experiment.best_config)
        print(automl_experiment.best_loss)
        print(automl_experiment.best_config_train_time)
Example 9
    def test_custom_metric(self):

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 10,
            'eval_method': 'holdout',
            "metric": custom_metric,
            "task": 'classification',
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            'log_type': 'all',
            "model_history": True
        }
        X_train, y_train = load_iris(return_X_y=True)
        automl_experiment.fit(X_train=X_train,
                              y_train=y_train,
                              **automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.predict_proba(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0, objective='multi')
        print(estimator)
        time_history, best_valid_loss_history, valid_loss_history, \
            config_history, train_loss_history = get_output_from_log(
                filename=automl_settings['log_file_name'], time_budget=6)
        print(train_loss_history)
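The custom_metric referenced here (and again in Example 19) is not shown. A sketch following FLAML's custom-metric convention, where the function returns the loss to minimize plus a dict of extra metrics to log; the particular train/validation penalty below is an illustrative assumption:

def custom_metric(
    X_val, y_val, estimator, labels,
    X_train, y_train, weight_val=None, weight_train=None, *args
):
    from sklearn.metrics import log_loss

    val_loss = log_loss(
        y_val, estimator.predict_proba(X_val), labels=labels, sample_weight=weight_val
    )
    train_loss = log_loss(
        y_train, estimator.predict_proba(X_train), labels=labels, sample_weight=weight_train
    )
    alpha = 0.5
    # penalize overfitting: a large train/validation gap increases the reported loss
    return val_loss * (1 + alpha) - alpha * train_loss, {
        "val_loss": val_loss,
        "train_loss": train_loss,
    }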
Example 10
 def test_regression_xgboost(self):
     X_train = scipy.sparse.random(300, 900, density=0.0001)
     y_train = np.random.uniform(size=300)
     X_val = scipy.sparse.random(100, 900, density=0.0001)
     y_val = np.random.uniform(size=100)
     automl_experiment = AutoML()
     automl_experiment.add_learner(learner_name="my_xgb1",
                                   learner_class=MyXGB1)
     automl_experiment.add_learner(learner_name="my_xgb2",
                                   learner_class=MyXGB2)
     automl_settings = {
         "time_budget": 2,
         "estimator_list": ["my_xgb1", "my_xgb2"],
         "task": "regression",
         "log_file_name": "test/regression_xgboost.log",
         "n_jobs": 1,
         "model_history": True,
         "keep_search_state": True,
         "early_stop": True,
     }
     automl_experiment.fit(X_train=X_train,
                           y_train=y_train,
                           X_val=X_val,
                           y_val=y_val,
                           **automl_settings)
     assert automl_experiment._state.X_val.shape == X_val.shape
     print(automl_experiment.predict(X_train))
     print(automl_experiment.model)
     print(automl_experiment.config_history)
     print(automl_experiment.best_model_for_estimator("my_xgb2"))
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)
     print(automl_experiment.best_config)
     print(automl_experiment.best_loss)
     print(automl_experiment.best_config_train_time)
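MyXGB1 and MyXGB2 are custom XGBoost learners not defined in the listing. A sketch, assuming they differ only in the objective passed through FLAML's XGBoostEstimator (the logistic objective below is an illustrative assumption):

import numpy as np
from flaml.model import XGBoostEstimator


def logregobj(preds, dtrain):
    # custom logistic objective: gradient and hessian of the log loss
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


class MyXGB1(XGBoostEstimator):
    """XGBoostEstimator with logregobj as the objective."""

    def __init__(self, **config):
        super().__init__(objective=logregobj, **config)


class MyXGB2(XGBoostEstimator):
    """XGBoostEstimator with 'reg:squarederror' as the objective."""

    def __init__(self, **config):
        super().__init__(objective="reg:squarederror", **config)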
Example 11
def test_numpy():
    X_train = np.arange("2014-01", "2021-01", dtype="datetime64[M]")
    y_train = np.random.random(size=len(X_train))
    automl = AutoML()
    automl.fit(
        X_train=X_train[:72],  # a single column of timestamp
        y_train=y_train[:72],  # value for each timestamp
        period=12,  # time horizon to forecast, e.g., 12 months
        task="ts_forecast",
        time_budget=3,  # time budget in seconds
        log_file_name="test/ts_forecast.log",
        n_splits=3,  # number of splits
    )
    print(automl.predict(X_train[72:]))

    automl = AutoML()
    automl.fit(
        X_train=X_train[:72],  # a single column of timestamp
        y_train=y_train[:72],  # value for each timestamp
        period=12,  # time horizon to forecast, e.g., 12 months
        task="ts_forecast",
        time_budget=1,  # time budget in seconds
        estimator_list=["arima", "sarimax"],
        log_file_name="test/ts_forecast.log",
    )
    print(automl.predict(X_train[72:]))
    # an alternative way to specify predict steps for arima/sarimax
    print(automl.predict(12))
Example 12
 def test_sparse_matrix_regression(self):
     X_train = scipy.sparse.random(300, 900, density=0.0001)
     y_train = np.random.uniform(size=300)
     X_val = scipy.sparse.random(100, 900, density=0.0001)
     y_val = np.random.uniform(size=100)
     automl_experiment = AutoML()
     automl_settings = {
         "time_budget": 2,
         "metric": "mae",
         "task": "regression",
         "log_file_name": "test/sparse_regression.log",
         "n_jobs": 1,
         "model_history": True,
         "keep_search_state": True,
         "verbose": 0,
         "early_stop": True,
     }
     automl_experiment.fit(X_train=X_train,
                           y_train=y_train,
                           X_val=X_val,
                           y_val=y_val,
                           **automl_settings)
     assert automl_experiment._state.X_val.shape == X_val.shape
     print(automl_experiment.predict(X_train))
     print(automl_experiment.model)
     print(automl_experiment.config_history)
     print(automl_experiment.best_model_for_estimator("rf"))
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)
     print(automl_experiment.best_config)
     print(automl_experiment.best_loss)
     print(automl_experiment.best_config_train_time)
Example 13
    def test_regression(self):

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            "metric": 'mse',
            "task": 'regression',
            "log_file_name": "test/boston.log",
            "log_training_metric": True,
            "model_history": True
        }
        X_train, y_train = load_boston(return_X_y=True)
        n = len(y_train)
        automl_experiment.fit(X_train=X_train[:n >> 1],
                              y_train=y_train[:n >> 1],
                              X_val=X_train[n >> 1:],
                              y_val=y_train[n >> 1:],
                              **automl_settings)
        assert automl_experiment.y_val.shape[0] == n - (n >> 1)
        assert automl_experiment.eval_method == 'holdout'
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        print(get_output_from_log(automl_settings["log_file_name"], 1))
Example 14
def run(dataset, config):
    log.info(f"\n**** FLAML [v{__version__}] ****\n")

    X_train, y_train = dataset.train.X, dataset.train.y.squeeze()
    X_test, y_test = dataset.test.X, dataset.test.y.squeeze()

    is_classification = config.type == 'classification'
    time_budget = config.max_runtime_seconds
    n_jobs = config.framework_params.get('_n_jobs', config.cores)
    log.info("Running FLAML with {} number of cores".format(config.cores))
    aml = AutoML()

    # Mapping of benchmark metrics to flaml metrics
    metrics_mapping = dict(
        acc='accuracy',
        auc='roc_auc',
        f1='f1',
        logloss='log_loss',
        mae='mae',
        mse='mse',
        rmse='rmse',
        r2='r2',
    )
    perf_metric = metrics_mapping.get(config.metric, 'auto')
    if config.metric not in metrics_mapping:
        log.warning("Performance metric %s not supported; using 'auto'.", config.metric)

    training_params = {
        k: v
        for k, v in config.framework_params.items() if not k.startswith('_')
    }

    log_dir = output_subdir("logs", config)
    flaml_log_file_name = os.path.join(log_dir, "flaml.log")
    with Timer() as training:
        aml.fit(X_train,
                y_train,
                metric=perf_metric,
                task=config.type,
                n_jobs=n_jobs,
                log_file_name=flaml_log_file_name,
                time_budget=time_budget,
                **training_params)

    with Timer() as predict:
        predictions = aml.predict(X_test)
    probabilities = aml.predict_proba(X_test) if is_classification else None
    labels = aml.classes_ if is_classification else None
    return result(
        output_file=config.output_predictions_file,
        probabilities=probabilities,
        predictions=predictions,
        truth=y_test,
        models_count=len(aml.config_history),
        training_duration=training.duration,
        predict_duration=predict.duration,
        probabilities_labels=labels,
    )
Example 15
    def test_fit_w_starting_point(self, as_frame=True):
        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 3,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=as_frame)
        if as_frame:
            # test drop column
            X_train.columns = range(X_train.shape[1])
            X_train[X_train.shape[1]] = np.zeros(len(y_train))
        automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
        automl_val_accuracy = 1.0 - automl_experiment.best_loss
        print("Best ML leaner:", automl_experiment.best_estimator)
        print("Best hyperparmeter config:", automl_experiment.best_config)
        print("Best accuracy on validation data: {0:.4g}".format(automl_val_accuracy))
        print(
            "Training duration of best run: {0:.4g} s".format(
                automl_experiment.best_config_train_time
            )
        )

        starting_points = automl_experiment.best_config_per_estimator
        print("starting_points", starting_points)
        print("loss of the starting_points", automl_experiment.best_loss_per_estimator)
        automl_settings_resume = {
            "time_budget": 2,
            "metric": "accuracy",
            "task": "classification",
            "log_file_name": "test/iris_resume.log",
            "log_training_metric": True,
            "n_jobs": 1,
            "model_history": True,
            "log_type": "all",
            "starting_points": starting_points,
        }
        new_automl_experiment = AutoML()
        new_automl_experiment.fit(
            X_train=X_train, y_train=y_train, **automl_settings_resume
        )

        new_automl_val_accuracy = 1.0 - new_automl_experiment.best_loss
        print("Best ML leaner:", new_automl_experiment.best_estimator)
        print("Best hyperparmeter config:", new_automl_experiment.best_config)
        print(
            "Best accuracy on validation data: {0:.4g}".format(new_automl_val_accuracy)
        )
        print(
            "Training duration of best run: {0:.4g} s".format(
                new_automl_experiment.best_config_train_time
            )
        )
Example 16
def _test_custom_data():
    from flaml import AutoML
    import requests
    import pandas as pd

    try:
        train_dataset = pd.read_csv("data/input/train.tsv",
                                    delimiter="\t",
                                    quoting=3)
        dev_dataset = pd.read_csv("data/input/dev.tsv",
                                  delimiter="\t",
                                  quoting=3)
        test_dataset = pd.read_csv("data/input/test.tsv",
                                   delimiter="\t",
                                   quoting=3)
    except requests.exceptions.HTTPError:
        return

    custom_sent_keys = ["#1 String", "#2 String"]
    label_key = "Quality"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    X_val = dev_dataset[custom_sent_keys]
    y_val = dev_dataset[label_key]

    X_test = test_dataset[custom_sent_keys]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "data/output/",
        "ckpt_per_epoch": 1,
    }

    automl.fit(X_train=X_train,
               y_train=y_train,
               X_val=X_val,
               y_val=y_val,
               **automl_settings)
    automl.predict(X_test)
    automl.predict(["test test"])
    automl.predict([
        ["test test", "test test"],
        ["test test", "test test"],
        ["test test", "test test"],
    ])
Example 17
def _test_ray_classification():
    from sklearn.datasets import make_classification

    X, y = make_classification(1000, 10)
    automl = AutoML()
    automl.fit(X,
               y,
               time_budget=10,
               task="classification",
               n_concurrent_trials=2)
Example 18
 def test_roc_auc_ovo(self):
     automl_experiment = AutoML()
     automl_settings = {
         "time_budget": 1,
         "metric": "roc_auc_ovo",
         "task": "classification",
         "log_file_name": "test/roc_auc_ovo.log",
         "log_training_metric": True,
         "n_jobs": 1,
         "model_history": True,
     }
     X_train, y_train = load_iris(return_X_y=True)
     automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
Example 19
    def test_custom_metric(self):
        df, y = load_iris(return_X_y=True, as_frame=True)
        df["label"] = y
        automl_experiment = AutoML()
        automl_settings = {
            "dataframe": df,
            "label": "label",
            "time_budget": 5,
            "eval_method": "cv",
            "metric": custom_metric,
            "task": "classification",
            "log_file_name": "test/iris_custom.log",
            "log_training_metric": True,
            "log_type": "all",
            "n_jobs": 1,
            "model_history": True,
            "sample_weight": np.ones(len(y)),
            "pred_time_limit": 1e-5,
            "ensemble": True,
        }
        automl_experiment.fit(**automl_settings)
        print(automl_experiment.classes_)
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.best_model_for_estimator("rf"))
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)
        automl_experiment = AutoML()
        estimator = automl_experiment.get_estimator_from_log(
            automl_settings["log_file_name"], record_id=0, task="multi"
        )
        print(estimator)
        (
            time_history,
            best_valid_loss_history,
            valid_loss_history,
            config_history,
            metric_history,
        ) = get_output_from_log(
            filename=automl_settings["log_file_name"], time_budget=6
        )
        print(metric_history)
        try:
            import ray

            df = ray.put(df)
            automl_settings["dataframe"] = df
            automl_settings["use_ray"] = True
            automl_experiment.fit(**automl_settings)
        except ImportError:
            pass
Example 20
def test_cv():
    from flaml import AutoML
    import pandas as pd
    import requests

    train_data = {
        "sentence1": [
            'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
            "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
            "They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .",
            "Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .",
        ],
        "sentence2": [
            'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
            "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .",
            "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .",
            "Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .",
        ],
        "label": [1, 0, 1, 0],
        "idx": [0, 1, 2, 3],
    }
    train_dataset = pd.DataFrame(train_data)

    custom_sent_keys = ["sentence1", "sentence2"]
    label_key = "label"

    X_train = train_dataset[custom_sent_keys]
    y_train = train_dataset[label_key]

    automl = AutoML()

    automl_settings = {
        "gpu_per_trial": 0,
        "max_iter": 3,
        "time_budget": 5,
        "task": "seq-classification",
        "metric": "accuracy",
        "n_splits": 3,
    }

    automl_settings["custom_hpo_args"] = {
        "model_path": "google/electra-small-discriminator",
        "output_dir": "test/data/output/",
        "ckpt_per_epoch": 1,
        "fp16": False,
    }

    try:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    except requests.exceptions.HTTPError:
        return
Example 21
def test_numpy_large():
    import numpy as np
    import pandas as pd
    from flaml import AutoML

    X_train = pd.date_range("2017-01-01", periods=70000, freq="T")
    y_train = pd.DataFrame(np.random.randint(6500, 7500, 70000))
    automl = AutoML()
    automl.fit(
        X_train=X_train[:-10].values,  # a single column of timestamp
        y_train=y_train[:-10].values,  # value for each timestamp
        period=10,  # time horizon to forecast: the next 10 time steps (minutes here)
        task="ts_forecast",
        time_budget=10,  # time budget in seconds
    )
Example 22
 def test_roc_auc_ovr(self):
     automl_experiment = AutoML()
     X_train, y_train = load_iris(return_X_y=True)
     automl_settings = {
         "time_budget": 1,
         "metric": "roc_auc_ovr",
         "task": "classification",
         "log_file_name": "test/roc_auc_ovr.log",
         "log_training_metric": True,
         "n_jobs": 1,
         "sample_weight": np.ones(len(y_train)),
         "eval_method": "holdout",
         "model_history": True,
     }
     automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
Example 23
 def test_binary(self):
     automl_experiment = AutoML()
     automl_settings = {
         "time_budget": 1,
         "task": "binary",
         "log_file_name": "test/breast_cancer.log",
         "log_training_metric": True,
         "n_jobs": 1,
         "model_history": True,
     }
     X_train, y_train = load_breast_cancer(return_X_y=True)
     automl_experiment.fit(X_train=X_train,
                           y_train=y_train,
                           **automl_settings)
     _ = automl_experiment.predict(X_train)
Example 24
 def test_custom_learner(self):
     automl = AutoML()
     automl.add_learner(learner_name='RGF',
                        learner_class=MyRegularizedGreedyForest)
     X_train, y_train = load_wine(return_X_y=True)
     settings = {
         "time_budget": 10,  # total running time in seconds
         "estimator_list": ['RGF', 'lgbm', 'rf', 'xgboost'],
         "task": 'classification',  # task type    
         "sample": True,  # whether to subsample training data
         "log_file_name": "test/wine.log",
         "log_training_metric": True,  # whether to log training metric
         "n_jobs": 1,
     }
     '''The main flaml automl API'''
     automl.fit(X_train=X_train, y_train=y_train, **settings)
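MyRegularizedGreedyForest, used here and again in Example 28, is not defined in the listing. A condensed sketch, assuming FLAML's SKLearnEstimator interface and the rgf package; the class name comes from the example, while the search-space bounds and the treatment of data_size as the number of training rows are assumptions:

from flaml.model import SKLearnEstimator
from flaml import tune


class MyRegularizedGreedyForest(SKLearnEstimator):
    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        # pick the rgf estimator matching the task (classification vs. regression)
        if task in ("binary", "multi", "classification"):
            from rgf.sklearn import RGFClassifier

            self.estimator_class = RGFClassifier
        else:
            from rgf.sklearn import RGFRegressor

            self.estimator_class = RGFRegressor

    @classmethod
    def search_space(cls, data_size, task):
        # data_size is assumed to be the number of training rows
        return {
            "max_leaf": {
                "domain": tune.lograndint(lower=4, upper=data_size),
                "init_value": 4,
            },
            "n_iter": {
                "domain": tune.lograndint(lower=1, upper=data_size),
                "init_value": 1,
            },
            "learning_rate": {"domain": tune.loguniform(lower=0.01, upper=20.0)},
        }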
Example 25
def test_mlflow():
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])
    import mlflow
    from flaml.data import load_openml_task

    try:
        X_train, X_test, y_train, y_test = load_openml_task(
            task_id=7592, data_dir="test/"
        )
    except (OpenMLServerException, ChunkedEncodingError) as e:
        print(e)
        return
    """ import AutoML class from flaml package """
    from flaml import AutoML

    automl = AutoML()
    settings = {
        "time_budget": 5,  # total running time in seconds
        "metric": "accuracy",  # primary metrics can be chosen from: ['accuracy','roc_auc','roc_auc_ovr','roc_auc_ovo','f1','log_loss','mae','mse','r2']
        "estimator_list": ["lgbm", "rf", "xgboost"],  # list of ML learners
        "task": "classification",  # task type
        "sample": False,  # whether to subsample training data
        "log_file_name": "adult.log",  # flaml log file
    }
    mlflow.set_experiment("flaml")
    with mlflow.start_run() as run:
        automl.fit(X_train=X_train, y_train=y_train, **settings)
        mlflow.sklearn.log_model(automl, "automl")
    loaded_model = mlflow.pyfunc.load_model(f"{run.info.artifact_uri}/automl")
    print(loaded_model.predict(X_test))
    automl._mem_thres = 0
    print(automl.trainable(automl.points_to_evaluate[0]))

    settings["use_ray"] = True
    try:
        with mlflow.start_run() as run:
            automl.fit(X_train=X_train, y_train=y_train, **settings)
            mlflow.sklearn.log_model(automl, "automl")
        automl = mlflow.sklearn.load_model(f"{run.info.artifact_uri}/automl")
        print(automl.predict_proba(X_test))
    except ImportError:
        pass
Example 26
    def _test_memory_limit(self):
        automl_experiment = AutoML()
        automl_experiment.add_learner(
            learner_name="large_lgbm", learner_class=MyLargeLGBM
        )
        automl_settings = {
            "time_budget": -1,
            "task": "classification",
            "log_file_name": "test/classification_oom.log",
            "estimator_list": ["large_lgbm"],
            "log_type": "all",
            "hpo_method": "random",
        }
        X_train, y_train = load_iris(return_X_y=True, as_frame=True)

        automl_experiment.fit(
            X_train=X_train, y_train=y_train, max_iter=1, **automl_settings
        )
        print(automl_experiment.model)
Example 27
    def test_logging_level(self):

        from flaml import logger, logger_formatter

        with tempfile.TemporaryDirectory() as d:

            training_log = os.path.join(d, "training.log")

            # Configure logging for the FLAML logger
            # and add a handler that outputs to a buffer.
            logger.setLevel(logging.INFO)
            buf = io.StringIO()
            ch = logging.StreamHandler(buf)
            ch.setFormatter(logger_formatter)
            logger.addHandler(ch)

            # Run a simple job.
            automl = AutoML()
            automl_settings = {
                "time_budget": 1,
                "metric": 'mse',
                "task": 'regression',
                "log_file_name": training_log,
                "log_training_metric": True,
                "n_jobs": 1,
                "model_history": True,
            }
            X_train, y_train = load_boston(return_X_y=True)
            n = len(y_train) >> 1
            automl.fit(X_train=X_train[:n],
                       y_train=y_train[:n],
                       X_val=X_train[n:],
                       y_val=y_train[n:],
                       **automl_settings)

            # Check if the log buffer is populated.
            self.assertTrue(len(buf.getvalue()) > 0)

        import pickle
        with open('automl.pkl', 'wb') as f:
            pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
        print(automl.__version__)
Example 28
    def test_ensemble(self):
        automl = AutoML()
        automl.add_learner(learner_name="RGF", learner_class=MyRegularizedGreedyForest)
        X_train, y_train = load_wine(return_X_y=True)
        settings = {
            "time_budget": 5,  # total running time in seconds
            "estimator_list": ["rf", "xgboost", "catboost"],
            "task": "classification",  # task type
            "sample": True,  # whether to subsample training data
            "log_file_name": "test/wine.log",
            "log_training_metric": True,  # whether to log training metric
            "ensemble": {
                "final_estimator": MyRegularizedGreedyForest(),
                "passthrough": False,
            },
            "n_jobs": 1,
        }

        """The main flaml automl API"""
        automl.fit(X_train=X_train, y_train=y_train, **settings)
Example 29
 def test_regression(self):
     automl_experiment = AutoML()
     automl_settings = {
         "time_budget": 2,
         "task": "regression",
         "log_file_name": "test/california.log",
         "log_training_metric": True,
         "n_jobs": 1,
         "model_history": True,
     }
     X_train, y_train = fetch_california_housing(return_X_y=True)
     n = int(len(y_train) * 9 // 10)
     automl_experiment.fit(X_train=X_train[:n],
                           y_train=y_train[:n],
                           X_val=X_train[n:],
                           y_val=y_train[n:],
                           **automl_settings)
     assert automl_experiment._state.eval_method == "holdout"
     print(automl_experiment.predict(X_train))
     print(automl_experiment.model)
     print(automl_experiment.config_history)
     print(automl_experiment.best_model_for_estimator("xgboost"))
     print(automl_experiment.best_iteration)
     print(automl_experiment.best_estimator)
     print(get_output_from_log(automl_settings["log_file_name"], 1))
     automl_experiment.retrain_from_log(
         task="regression",
         log_file_name=automl_settings["log_file_name"],
         X_train=X_train,
         y_train=y_train,
         train_full=True,
         time_budget=1,
     )
     automl_experiment.retrain_from_log(
         task="regression",
         log_file_name=automl_settings["log_file_name"],
         X_train=X_train,
         y_train=y_train,
         train_full=True,
         time_budget=0,
     )
Example 30
    def test_sparse_matrix_regression_cv(self):

        automl_experiment = AutoML()
        automl_settings = {
            "time_budget": 2,
            'eval_method': 'cv',
            "task": 'regression',
            "log_file_name": "test/sparse_regression.log",
            "model_history": True
        }
        X_train = scipy.sparse.random(100, 100)
        y_train = np.random.uniform(size=100)
        automl_experiment.fit(X_train=X_train,
                              y_train=y_train,
                              **automl_settings)
        print(automl_experiment.predict(X_train))
        print(automl_experiment.model)
        print(automl_experiment.config_history)
        print(automl_experiment.model_history)
        print(automl_experiment.best_iteration)
        print(automl_experiment.best_estimator)