def test_fit_third_party_feature(self):
        """Search with a custom model creator and verify the pipeline.

        One sample is searched with ``model_creator`` as the model. The
        returned ``TSPipeline`` must predict the same values as the raw
        best model, a saved-and-reloaded pipeline must reproduce the
        evaluation and prediction results, and the reloaded pipeline must
        accept a further ``fit`` call.
        """
        from sklearn.preprocessing import StandardScaler
        std_scaler = StandardScaler()
        # The scaler is fitted on the training set (fit=True) and only
        # applied to the validation set (fit=False).
        train_set = get_tsdataset().gen_dt_feature().scale(std_scaler, fit=True)
        valid_set = get_tsdataset().gen_dt_feature().scale(
            std_scaler, fit=False)

        tuned_params = {
            'hidden_dim': hp.grid_search([32, 64]),
            'dropout': hp.uniform(0.1, 0.2),
        }
        estimator_kwargs = dict(
            model=model_creator,
            search_space=tuned_params,
            past_seq_len=hp.randint(4, 6),
            future_seq_len=1,
            selected_features="auto",
            metric="mse",
            loss=torch.nn.MSELoss(),
            cpus_per_trial=2,
        )
        estimator = AutoTSEstimator(**estimator_kwargs)

        pipeline = estimator.fit(
            data=train_set,
            epochs=1,
            batch_size=hp.choice([32, 64]),
            validation_data=valid_set,
            n_sampling=1,
        )
        chosen_config = estimator.get_best_config()
        raw_model = estimator._get_best_automl_model()
        lookback = chosen_config["past_seq_len"]
        assert 4 <= lookback <= 6
        assert isinstance(pipeline, TSPipeline)

        # Predict with the bare best model on manually rolled data.
        valid_set.roll(lookback=lookback,
                       horizon=0,
                       feature_col=chosen_config["selected_features"])
        features, _ = valid_set.to_numpy()
        raw_prediction = valid_set.unscale_numpy(raw_model.predict(features))

        # Evaluate and predict through the pipeline on the same dataset.
        metrics_before = pipeline.evaluate(valid_set)
        prediction = pipeline.predict(valid_set)

        # Both prediction paths have to agree.
        np.testing.assert_almost_equal(prediction, raw_prediction)

        # Round-trip the pipeline through disk.
        saved_at = "/tmp/auto_trainer/autots_tmp_model_3rdparty"
        pipeline.save(saved_at)
        restored = TSPipeline.load(saved_at)

        # The restored pipeline must reproduce the earlier results.
        metrics_after = restored.evaluate(valid_set)
        prediction_after = restored.predict(valid_set)
        np.testing.assert_almost_equal(metrics_before[0], metrics_after[0])
        np.testing.assert_almost_equal(prediction, prediction_after)

        # Incremental training on the restored pipeline.
        restored.fit(valid_set)
Exemple #2
0
    def test_fit_lstm_data_creator(self):
        """Fit the 'lstm' model from data creators with a fixed lookback.

        Training and validation data both come from ``get_data_creator()``.
        After a one-sample search the best config must still carry the
        fixed ``past_seq_len`` of 7.
        """
        n_features = 4
        n_targets = 2  # 2 targets are generated in get_tsdataset

        lstm_space = dict(
            hidden_dim=hp.grid_search([32, 64]),
            layer_num=hp.randint(1, 3),
            lr=hp.choice([0.001, 0.003, 0.01]),
            dropout=hp.uniform(0.1, 0.2),
        )
        estimator = AutoTSEstimator(
            model='lstm',
            search_space=lstm_space,
            past_seq_len=7,
            future_seq_len=1,
            input_feature_num=n_features,
            output_target_num=n_targets,
            selected_features="auto",
            metric="mse",
            loss=torch.nn.MSELoss(),
            logs_dir="/tmp/auto_trainer",
            cpus_per_trial=2,
            name="auto_trainer",
        )
        estimator.fit(
            data=get_data_creator(),
            epochs=1,
            batch_size=hp.choice([32, 64]),
            validation_data=get_data_creator(),
            n_sampling=1,
        )
        # past_seq_len was given as a constant, so it must come back as is.
        assert estimator.get_best_config()["past_seq_len"] == 7
Exemple #3
0
    def test_fit_data_creator(self):
        """Fit AutoLSTM from dataloader creators and check the best config.

        The sampled ``dropout``, ``batch_size`` and ``layer_num`` of the
        best model must lie inside the ranges they were searched over.
        """
        lstm_kwargs = dict(
            input_feature_num=input_feature_dim,
            output_target_num=output_feature_dim,
            past_seq_len=5,
            optimizer='Adam',
            loss=torch.nn.MSELoss(),
            metric="mse",
            hidden_dim=hp.grid_search([32, 64]),
            layer_num=hp.randint(1, 3),
            lr=hp.choice([0.001, 0.003, 0.01]),
            dropout=hp.uniform(0.1, 0.2),
            logs_dir="/tmp/auto_lstm",
            cpus_per_trial=2,
            name="auto_lstm",
        )
        searcher = AutoLSTM(**lstm_kwargs)

        searcher.fit(data=train_dataloader_creator,
                     epochs=1,
                     batch_size=hp.choice([32, 64]),
                     validation_data=valid_dataloader_creator,
                     n_sampling=1)

        winner = searcher.get_best_model()
        # Each checked value has to fall inside its search range.
        assert 0.1 <= winner.config['dropout'] <= 0.2
        assert winner.config['batch_size'] in (32, 64)
        assert 1 <= winner.config['layer_num'] < 3
Exemple #4
0
 def get_past_seq_config(look_back):
     """
     Generate the past sequence config based on look_back.

     :param look_back: look_back configuration. Either a single int
         (a fixed length, at least 2) or a tuple of two ints
         (min_len, max_len) to sample the length from.
     :return: search configuration for the past sequence: look_back itself
         when it is an int, otherwise ``hp.randint(lower, max_len + 1)``
         where ``lower`` is min_len raised to 2 when it is smaller.
     :raises ValueError: if max_len is below 2, if min_len exceeds max_len,
         if an int look_back is below 2, or if look_back is neither an int
         nor a tuple of two ints.
     """
     is_int_pair = (isinstance(look_back, tuple) and len(look_back) == 2
                    and isinstance(look_back[0], int)
                    and isinstance(look_back[1], int))
     if is_int_pair:
         min_len, max_len = look_back
         if max_len < 2:
             raise ValueError(
                 "The max look back value should be at least 2")
         if min_len < 2:
             print("The input min look back value is smaller than 2. "
                   "We sample from range (2, {}) instead.".format(max_len))
             # Actually apply the lower bound announced in the message.
             min_len = 2
         if min_len > max_len:
             raise ValueError(
                 "The min look back value {} should not be larger than "
                 "the max look back value {}".format(min_len, max_len))
         return hp.randint(min_len, max_len + 1)

     if isinstance(look_back, int):
         if look_back < 2:
             # Format the value into the message; passing it as a second
             # exception argument would render the error as a tuple.
             raise ValueError(
                 "look back value should not be smaller than 2. "
                 "Current value is {}".format(look_back))
         return look_back

     raise ValueError(
         "look back is {}.\n "
         "look_back should be either a tuple with 2 int values:"
         " (min_len, max_len) or a single int".format(look_back))
 def test_fit(self):
     """Fit AutoARIMA for one sampled trial and require a best model.

     Data and validation data come from ``get_data()``. The search runs
     for a single epoch and a single sample.
     """
     data, validation_data = get_data()
     auto_arima = AutoARIMA(metric="mse",
                            p=hp.randint(0, 4),
                            q=hp.randint(0, 4),
                            seasonality_mode=hp.choice([True, False]),
                            P=hp.randint(5, 12),
                            Q=hp.randint(5, 12),
                            m=hp.choice([4, 7]))
     auto_arima.fit(
         data=data,
         validation_data=validation_data,
         epochs=1,
         n_sampling=1,
     )
     best_model = auto_arima.get_best_model()
     # Without this assertion the test would pass even if no model came
     # back from the search.
     assert best_model is not None
 def test_num_channels(self):
     """Check that an explicit ``num_channels`` list reaches the best config.

     AutoTCN is built with ``num_channels=[8] * 2`` next to ``hidden_units``
     and a searched ``levels``. After a one-sample fit a best model must
     exist and the best config must report the same channel list.
     """
     tcn_kwargs = dict(
         input_feature_num=input_feature_dim,
         output_target_num=output_feature_dim,
         past_seq_len=past_seq_len,
         future_seq_len=future_seq_len,
         optimizer='Adam',
         loss=torch.nn.MSELoss(),
         metric="mse",
         hidden_units=4,
         levels=hp.randint(1, 3),
         num_channels=[8] * 2,
         kernel_size=hp.choice([2, 3]),
         lr=hp.choice([0.001, 0.003, 0.01]),
         dropout=hp.uniform(0.1, 0.2),
         logs_dir="/tmp/auto_tcn",
         cpus_per_trial=2,
         name="auto_tcn",
     )
     searcher = AutoTCN(**tcn_kwargs)
     searcher.fit(
         data=train_dataloader_creator,
         epochs=1,
         batch_size=hp.choice([32, 64]),
         validation_data=valid_dataloader_creator,
         n_sampling=1,
     )
     assert searcher.get_best_model()
     chosen_config = searcher.get_best_config()
     # Compared against a fresh list rather than the one passed in.
     assert chosen_config['num_channels'] == [8, 8]
def get_auto_estimator():
    """Construct an AutoLSTM with a lookback of 5 and a small search space.

    ``hidden_dim``, ``layer_num``, ``lr`` and ``dropout`` are given as
    ``hp`` samplers; everything else is fixed. Feature and target counts
    are read from the module-level ``input_feature_dim`` and
    ``output_feature_dim``.

    :return: the configured AutoLSTM instance (not fitted here).
    """
    lstm_kwargs = {
        "input_feature_num": input_feature_dim,
        "output_target_num": output_feature_dim,
        "past_seq_len": 5,
        "optimizer": 'Adam',
        "loss": torch.nn.MSELoss(),
        "metric": "mse",
        "hidden_dim": hp.grid_search([32, 64]),
        "layer_num": hp.randint(1, 3),
        "lr": hp.choice([0.001, 0.003, 0.01]),
        "dropout": hp.uniform(0.1, 0.2),
        "logs_dir": "/tmp/auto_lstm",
        "cpus_per_trial": 2,
        "name": "auto_lstm",
    }
    return AutoLSTM(**lstm_kwargs)
Exemple #8
0
 def _gen_sample_func(self, ranges, param_name):
     if isinstance(ranges, tuple):
         assert len(ranges) == 2, \
             f"length of tuple {param_name} should be 2 while get {len(ranges)} instead."
         assert param_name != "teacher_forcing", \
             f"type of {param_name} can only be a list while get a tuple"
         if param_name in ["lr"]:
             return hp.loguniform(lower=ranges[0], upper=ranges[1])
         if param_name in [
                 "lstm_hidden_dim", "lstm_layer_num", "batch_size"
         ]:
             return hp.randint(lower=ranges[0], upper=ranges[1])
         if param_name in ["dropout"]:
             return hp.uniform(lower=ranges[0], upper=ranges[1])
     if isinstance(ranges, list):
         return hp.grid_search(ranges)
     raise RuntimeError(f"{param_name} should be either a list or a tuple.")
Exemple #9
0
def get_auto_estimator():
    """Construct an AutoTCN with 8 hidden units and a small search space.

    ``levels``, ``kernel_size``, ``lr`` and ``dropout`` are given as ``hp``
    samplers. Feature and target counts and the sequence lengths are read
    from module-level names.

    :return: the configured AutoTCN instance (not fitted here).
    """
    return AutoTCN(
        input_feature_num=input_feature_dim,
        output_target_num=output_feature_dim,
        past_seq_len=past_seq_len,
        future_seq_len=future_seq_len,
        optimizer='Adam',
        loss=torch.nn.MSELoss(),
        metric="mse",
        hidden_units=8,
        levels=hp.randint(1, 3),
        kernel_size=hp.choice([2, 3]),
        lr=hp.choice([0.001, 0.003, 0.01]),
        dropout=hp.uniform(0.1, 0.2),
        logs_dir="/tmp/auto_tcn",
        cpus_per_trial=2,
        name="auto_tcn",
    )
Exemple #10
0
def get_auto_estimator():
    """Construct an AutoSeq2Seq with teacher forcing off.

    ``lr``, ``lstm_hidden_dim``, ``lstm_layer_num`` and ``dropout`` are
    given as ``hp`` samplers. Feature and target counts and the sequence
    lengths are read from module-level names.

    :return: the configured AutoSeq2Seq instance (not fitted here).
    """
    seq2seq_kwargs = dict(
        input_feature_num=input_feature_dim,
        output_target_num=output_feature_dim,
        past_seq_len=past_seq_len,
        future_seq_len=future_seq_len,
        optimizer='Adam',
        loss=torch.nn.MSELoss(),
        metric="mse",
        lr=hp.choice([0.001, 0.003, 0.01]),
        lstm_hidden_dim=hp.grid_search([32, 64, 128]),
        lstm_layer_num=hp.randint(1, 4),
        dropout=hp.uniform(0.1, 0.3),
        teacher_forcing=False,
        logs_dir="/tmp/auto_seq2seq",
        cpus_per_trial=2,
        name="auto_seq2seq",
    )
    return AutoSeq2Seq(**seq2seq_kwargs)
    def test_select_feature(self):
        """Fit the 'lstm' model on a random single-target series.

        A frame of 100 to 199 daily rows with one random ``value`` column
        and a constant ``id`` is split into train/validation/test parts by
        ``TSDataset.from_pandas``. After a one-sample search the best
        config must keep the fixed ``past_seq_len`` of 6.
        """
        n_rows = np.random.randint(100, 200)
        timestamps = pd.date_range('1/1/2019', periods=n_rows)
        values = np.random.randn(n_rows)
        ids = np.array(['00'] * n_rows)
        frame = pd.DataFrame({
            "datetime": timestamps,
            "value": values,
            "id": ids,
        })
        # The third split returned by from_pandas is not needed here.
        train_part, valid_part, _ = TSDataset.from_pandas(
            frame,
            target_col=['value'],
            dt_col='datetime',
            id_col='id',
            with_split=True,
            val_ratio=0.1,
        )

        lstm_space = dict(
            hidden_dim=hp.grid_search([32, 64]),
            layer_num=hp.randint(1, 3),
            lr=hp.choice([0.001, 0.003, 0.01]),
            dropout=hp.uniform(0.1, 0.2),
        )

        n_features, n_targets = 1, 1
        estimator = AutoTSEstimator(
            model='lstm',
            search_space=lstm_space,
            past_seq_len=6,
            future_seq_len=1,
            input_feature_num=n_features,
            output_target_num=n_targets,
            selected_features="auto",
            metric="mse",
            loss=torch.nn.MSELoss(),
            cpus_per_trial=2,
            name="auto_trainer",
        )

        estimator.fit(
            data=train_part,
            epochs=1,
            batch_size=hp.choice([32, 64]),
            validation_data=valid_part,
            n_sampling=1,
        )
        assert estimator.get_best_config()['past_seq_len'] == 6
                      cores=args.cores,
                      memory=args.memory,
                      num_nodes=num_nodes,
                      init_ray_on_spark=True)

    tsdata_train, tsdata_valid, tsdata_test = get_nyc_taxi_tsdataset(
        args.datadir)

    auto_lstm = AutoLSTM(input_feature_num=1,
                         output_target_num=1,
                         past_seq_len=14,
                         optimizer='Adam',
                         loss=torch.nn.MSELoss(),
                         metric="mse",
                         hidden_dim=hp.grid_search([32, 64]),
                         layer_num=hp.randint(1, 3),
                         lr=hp.choice([0.001, 0.003, 0.01]),
                         dropout=hp.uniform(0.1, 0.2),
                         logs_dir="/tmp/auto_lstm",
                         cpus_per_trial=args.cpus_per_trial,
                         name="auto_lstm")
    auto_lstm.fit(
        data=get_data_creator(tsdata_train),
        epochs=args.epoch,
        batch_size=hp.choice([32, 64]),
        validation_data=get_data_creator(tsdata_valid),
        n_sampling=args.n_sampling,
    )
    best_model = auto_lstm.get_best_model()
    best_config = auto_lstm.get_best_config()
def get_xgb_search_space():
    """Return a search space over ``n_estimators``, ``max_depth`` and ``lr``.

    :return: a dict mapping each hyperparameter name to its ``hp`` sampler:
        ``hp.randint`` for the two integer parameters and
        ``hp.loguniform`` for ``lr``.
    """
    space = dict(
        n_estimators=hp.randint(5, 10),
        max_depth=hp.randint(2, 5),
        lr=hp.loguniform(1e-4, 1e-1),
    )
    return space
    def test_fit_tcn_feature(self):
        """Search the 'tcn' model with AutoTSTrainer and verify the pipeline.

        The returned ``TSPipeline`` must predict the same values as the raw
        best model, a saved-and-reloaded pipeline must reproduce the
        evaluation and prediction results, and the reloaded pipeline must
        accept a further ``fit`` call.
        """
        n_features = 11  # This param will not be used
        n_targets = 2  # 2 targets are generated in get_tsdataset

        from sklearn.preprocessing import StandardScaler
        std_scaler = StandardScaler()
        # The scaler is fitted on the training set (fit=True) and only
        # applied to the validation set (fit=False).
        train_set = get_tsdataset().gen_dt_feature().scale(std_scaler, fit=True)
        valid_set = get_tsdataset().gen_dt_feature().scale(
            std_scaler, fit=False)

        tcn_space = {
            'hidden_units': hp.grid_search([32, 64]),
            'levels': hp.randint(4, 6),
            'kernel_size': hp.randint(3, 5),
            'dropout': hp.uniform(0.1, 0.2),
            'lr': hp.loguniform(0.001, 0.01),
        }
        trainer_kwargs = dict(
            model='tcn',
            search_space=tcn_space,
            past_seq_len=hp.randint(4, 6),
            future_seq_len=1,
            input_feature_num=n_features,
            output_target_num=n_targets,
            selected_features="auto",
            metric="mse",
            optimizer="Adam",
            loss=torch.nn.MSELoss(),
            logs_dir="/tmp/auto_trainer",
            cpus_per_trial=2,
            name="auto_trainer",
        )
        trainer = AutoTSTrainer(**trainer_kwargs)
        pipeline = trainer.fit(
            data=train_set,
            epochs=1,
            batch_size=hp.choice([32, 64]),
            validation_data=valid_set,
            n_sampling=1,
        )
        chosen_config = trainer.get_best_config()
        raw_model = trainer.get_best_model()
        lookback = chosen_config["past_seq_len"]
        assert 4 <= lookback <= 6
        assert isinstance(pipeline, TSPipeline)

        # Predict with the bare best model on manually rolled data.
        valid_set.roll(lookback=lookback,
                       horizon=0,
                       feature_col=chosen_config["selected_features"])
        features, _ = valid_set.to_numpy()
        raw_prediction = valid_set.unscale_numpy(raw_model.predict(features))

        # Evaluate and predict through the pipeline on the same dataset.
        metrics_before = pipeline.evaluate(valid_set)
        prediction = pipeline.predict(valid_set)

        # Both prediction paths have to agree.
        np.testing.assert_almost_equal(prediction, raw_prediction)

        # Round-trip the pipeline through disk.
        saved_at = "/tmp/auto_trainer/autots_tmp_model_tcn"
        pipeline.save(saved_at)
        restored = TSPipeline.load(saved_at)

        # The restored pipeline must reproduce the earlier results.
        metrics_after = restored.evaluate(valid_set)
        prediction_after = restored.predict(valid_set)
        np.testing.assert_almost_equal(metrics_before[0], metrics_after[0])
        np.testing.assert_almost_equal(prediction, prediction_after)

        # Incremental training on the restored pipeline.
        restored.fit(valid_set)
    def test_fit_seq2seq_feature(self):
        """Search the 'seq2seq' model and verify the resulting pipeline.

        Uses the "minimal" search space. The returned ``TSPipeline`` must
        predict the same values as the raw best model, and a
        saved-and-reloaded pipeline must reproduce the evaluation and
        prediction results. When ``onnx`` and ``onnxruntime`` are
        importable, the ONNX evaluate/predict paths must also match to 5
        decimals. Finally the reloaded pipeline is fitted again.
        """
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        # The scaler is fitted on the training set (fit=True) and only
        # applied to the validation set (fit=False).
        tsdata_train = get_tsdataset().gen_dt_feature().scale(scaler, fit=True)
        tsdata_valid = get_tsdataset().gen_dt_feature().scale(scaler,
                                                              fit=False)

        auto_estimator = AutoTSEstimator(model='seq2seq',
                                         search_space="minimal",
                                         past_seq_len=hp.randint(4, 6),
                                         future_seq_len=1,
                                         selected_features="auto",
                                         metric="mse",
                                         optimizer="Adam",
                                         loss=torch.nn.MSELoss(),
                                         logs_dir="/tmp/auto_trainer",
                                         cpus_per_trial=2,
                                         name="auto_trainer")
        ts_pipeline = auto_estimator.fit(data=tsdata_train,
                                         epochs=1,
                                         batch_size=hp.choice([32, 64]),
                                         validation_data=tsdata_valid,
                                         n_sampling=1)
        best_config = auto_estimator.get_best_config()
        best_model = auto_estimator._get_best_automl_model()
        assert 4 <= best_config["past_seq_len"] <= 6

        assert isinstance(ts_pipeline, TSPipeline)

        # use the raw base model to predict and evaluate
        tsdata_valid.roll(lookback=best_config["past_seq_len"],
                          horizon=0,
                          feature_col=best_config["selected_features"])
        x_valid, _ = tsdata_valid.to_numpy()
        y_pred_raw = best_model.predict(x_valid)
        y_pred_raw = tsdata_valid.unscale_numpy(y_pred_raw)

        # use the pipeline to predict and evaluate
        eval_result = ts_pipeline.evaluate(tsdata_valid)
        y_pred = ts_pipeline.predict(tsdata_valid)

        # check if they are the same
        np.testing.assert_almost_equal(y_pred, y_pred_raw)

        # save and load
        ts_pipeline.save("/tmp/auto_trainer/autots_tmp_model_seq2seq")
        new_ts_pipeline = TSPipeline.load(
            "/tmp/auto_trainer/autots_tmp_model_seq2seq")

        # check if the loaded pipeline is the same as the previous one
        eval_result_new = new_ts_pipeline.evaluate(tsdata_valid)
        y_pred_new = new_ts_pipeline.predict(tsdata_valid)
        np.testing.assert_almost_equal(eval_result[0], eval_result_new[0])
        np.testing.assert_almost_equal(y_pred, y_pred_new)

        # check if the loaded pipeline is the same as the previous one
        # with onnx. Only the availability imports are guarded: an
        # ImportError raised inside the ONNX evaluate/predict calls must
        # fail the test rather than being silently skipped.
        try:
            import onnx  # noqa: F401
            import onnxruntime  # noqa: F401
        except ImportError:
            # The ONNX packages are optional; skip the ONNX checks.
            pass
        else:
            eval_result_new_onnx = new_ts_pipeline.evaluate_with_onnx(
                tsdata_valid)
            y_pred_new_onnx = new_ts_pipeline.predict_with_onnx(tsdata_valid)
            np.testing.assert_almost_equal(eval_result[0],
                                           eval_result_new_onnx[0],
                                           decimal=5)
            np.testing.assert_almost_equal(y_pred, y_pred_new_onnx, decimal=5)

        # use the pipeline to incrementally train
        new_ts_pipeline.fit(tsdata_valid)
Exemple #16
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import zoo.orca.automl.hp as hp


# Model name strings listed as supported. "lstm" and "tcn" also appear as
# keys of AUTO_MODEL_DEFAULT_SEARCH_SPACE (presumably "seq2seq" does too;
# TODO confirm).
AUTO_MODEL_SUPPORT_LIST = ["lstm", "tcn", "seq2seq"]

AUTO_MODEL_DEFAULT_SEARCH_SPACE = {
    "lstm": {"minimal": {"hidden_dim": hp.grid_search([16, 32]),
                         "layer_num": hp.randint(1, 2),
                         "lr": hp.loguniform(0.001, 0.005),
                         "dropout": hp.uniform(0.1, 0.2)},
             "normal": {"hidden_dim": hp.grid_search([16, 32, 64]),
                        "layer_num": hp.grid_search([1, 2]),
                        "lr": hp.loguniform(0.0005, 0.01),
                        "dropout": hp.uniform(0, 0.2)},
             "large": {"hidden_dim": hp.grid_search([16, 32, 64, 128]),
                       "layer_num": hp.grid_search([1, 2, 3, 4]),
                       "lr": hp.loguniform(0.0005, 0.01),
                       "dropout": hp.uniform(0, 0.3)}},

    "tcn": {"minimal": {"hidden_units": hp.grid_search([16, 32]),
                        "levels": hp.randint(4, 6),
                        "kernel_size": 3,
                        "lr": hp.loguniform(0.001, 0.005),