def _hp_search(self, input_df, validation_df, metric, recipe, mc,
               resources_per_trial, remote_dir):
    """
    Run a Ray Tune hyper-parameter search and build a pipeline from the best trial.

    :param input_df: one data frame or a list of data frames
    :param validation_df: validation data, forwarded to the search engine
    :param metric: metric name, forwarded to the search engine
    :param recipe: provides search_space(), runtime_params(),
        search_algorithm_params(), search_algorithm() and fixed_params()
    :param mc: forwarded unchanged to the search engine
    :param resources_per_trial: forwarded to RayTuneSearchEngine
    :param remote_dir: forwarded to RayTuneSearchEngine and _make_pipeline
    :return: the pipeline produced by _make_pipeline for the best trial
    """
    ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                        self.dt_col,
                                        self.target_col,
                                        self.extra_features_col,
                                        self.drop_missing)
    # For a list of data frames the feature list is taken from the first one.
    reference_df = input_df[0] if isinstance(input_df, list) else input_df
    feature_list = ft.get_feature_list(reference_df)

    model = TimeSequenceModel(check_optional_config=False,
                              future_seq_len=self.future_seq_len)

    # prepare parameters for search engine
    search_space = recipe.search_space(feature_list)
    # Every runtime parameter except 'num_samples' is used as a stop criterion.
    stop = dict(recipe.runtime_params())
    num_samples = stop.pop('num_samples')
    search_algorithm_params = recipe.search_algorithm_params()
    search_algorithm = recipe.search_algorithm()
    fixed_params = recipe.fixed_params()

    engine_kwargs = dict(logs_dir=self.logs_dir,
                         resources_per_trial=resources_per_trial,
                         name=self.name,
                         remote_dir=remote_dir)
    searcher = RayTuneSearchEngine(**engine_kwargs)
    searcher.compile(input_df,
                     search_space=search_space,
                     stop=stop,
                     search_algorithm_params=search_algorithm_params,
                     search_algorithm=search_algorithm,
                     fixed_params=fixed_params,
                     feature_transformers=ft,
                     future_seq_len=self.future_seq_len,
                     validation_df=validation_df,
                     metric=metric,
                     mc=mc,
                     num_samples=num_samples)
    searcher.run()

    # get the best one trial, later could be n
    best_trials = searcher.get_best_trials(k=1)
    best = best_trials[0]
    return self._make_pipeline(best,
                               feature_transformers=ft,
                               model=model,
                               remote_dir=remote_dir)
def setup_method(self, method):
    """
    Build rolled train/validation/test samples, a training config and a
    fresh VanillaLSTM before each test.
    """
    past_seq_len = 6
    future_seq_len = 1
    roll_kwargs = dict(past_seq_len=past_seq_len,
                       future_seq_len=future_seq_len)

    train_data = pd.DataFrame(data=np.random.randn(64, 4))
    val_data = pd.DataFrame(data=np.random.randn(16, 4))
    test_data = pd.DataFrame(data=np.random.randn(16, 4))

    # use roll method in time_sequence
    roller = TimeSequenceFeatureTransformer()
    self.x_train, self.y_train = roller._roll_train(train_data, **roll_kwargs)
    self.x_val, self.y_val = roller._roll_train(val_data, **roll_kwargs)
    self.x_test = roller._roll_test(test_data, past_seq_len=past_seq_len)

    self.config = {
        'epochs': 1,
        "lr": 0.001,
        "lstm_1_units": 16,
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": 0.2,
        "batch_size": 32,
    }
    self.model = VanillaLSTM(check_optional_config=False,
                             future_seq_len=future_seq_len)
def test_predict(self):
    """Fit a VanillaLSTM and check that predict returns one value per test sample."""
    past_seq_len = 6
    future_seq_len = 1

    train_data = pd.DataFrame(data=np.random.randn(64, 4))
    test_data = pd.DataFrame(data=np.random.randn(16, 4))

    # use roll method in time_sequence
    roller = TimeSequenceFeatureTransformer()
    x_train, y_train = roller._roll_train(train_data,
                                          past_seq_len=past_seq_len,
                                          future_seq_len=future_seq_len)
    x_test = roller._roll_test(test_data, past_seq_len=past_seq_len)

    config = {
        'epochs': 2,
        "lr": 0.001,
        "lstm_1_units": 16,
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": 0.2,
        "batch_size": 32,
    }
    model = VanillaLSTM(check_optional_config=False,
                        future_seq_len=future_seq_len)
    model.fit_eval(x_train, y_train, **config)

    y_pred = model.predict(x_test)
    expected_shape = (x_test.shape[0], 1)
    assert y_pred.shape == expected_shape
def test_evaluate(self):
    """Fit a VanillaLSTM and print its evaluation on rolled validation data."""
    past_seq_len = 6
    future_seq_len = 1
    roll_kwargs = dict(past_seq_len=past_seq_len,
                       future_seq_len=future_seq_len)

    train_data = pd.DataFrame(data=np.random.randn(64, 4))
    val_data = pd.DataFrame(data=np.random.randn(16, 4))

    # use roll method in time_sequence
    roller = TimeSequenceFeatureTransformer()
    x_train, y_train = roller._roll_train(train_data, **roll_kwargs)
    x_val, y_val = roller._roll_train(val_data, **roll_kwargs)

    config = {
        'epochs': 1,
        "lr": 0.001,
        "lstm_1_units": 16,
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": 0.2,
        "batch_size": 32,
    }
    model = VanillaLSTM(check_optional_config=False,
                        future_seq_len=future_seq_len)
    model.fit_eval(x_train, y_train, **config)

    result = model.evaluate(x_val, y_val)
    print("evaluate:", result)
def setup_method(self, method):
    """Reset the Keras session and prepare data, model and config for each test."""
    tf.keras.backend.clear_session()
    self.ft = TimeSequenceFeatureTransformer()
    # create_data() sets self.long_num and self.time_step, which the config reads.
    self.create_data()
    self.model = MTNetKeras()

    config = {"long_num": self.long_num, "time_step": self.time_step}
    config["ar_window"] = np.random.randint(1, 3)
    config["cnn_height"] = np.random.randint(1, 3)
    config["epochs"] = 1
    self.config = config
class TestZouwuModelMTNetForecaster(TestCase):
    """Smoke test for MTNetForecaster on randomly generated rolled samples."""

    def setUp(self):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def tearDown(self):
        pass

    def create_data(self):
        """Generate random train/validation/test samples via the transformer's roll methods."""
        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1

        train_df = pd.DataFrame(np.random.randn(64, 4))
        self.x_train, self.y_train = self.ft._roll_train(
            train_df, past_seq_len=look_back, future_seq_len=look_forward)

        val_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_val, self.y_val = self.ft._roll_train(
            val_df, past_seq_len=look_back, future_seq_len=look_forward)

        test_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_test = self.ft._roll_test(test_df, past_seq_len=look_back)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        target_dim = 1
        model = MTNetForecaster(target_dim=target_dim,
                                feature_dim=self.x_train.shape[-1],
                                long_series_num=self.long_num,
                                series_length=self.time_step)

        # preprocess_input returns a (long, short) pair for each data set.
        train_inputs = list(model.preprocess_input(self.x_train))
        val_inputs = list(model.preprocess_input(self.x_val))
        test_inputs = list(model.preprocess_input(self.x_test))

        model.fit(train_inputs,
                  self.y_train,
                  validation_data=(val_inputs, self.y_val),
                  batch_size=32,
                  distributed=False)
        assert model.evaluate(val_inputs, self.y_val)

        predict_result = model.predict(test_inputs)
        assert predict_result.shape == (self.x_test.shape[0], target_dim)
def _hp_search(self, input_df, validation_df, metric, recipe, mc,
               resources_per_trial, remote_dir):
    """
    Run a Ray Tune hyper-parameter search and build a pipeline from its analysis.

    :param input_df: one data frame or a list of data frames
    :param validation_df: validation data, forwarded to the search engine
    :param metric: metric name; also used to derive the metric mode
    :param recipe: provides search_space() and is forwarded to the search engine
    :param mc: forwarded unchanged to the search engine
    :param resources_per_trial: forwarded to RayTuneSearchEngine
    :param remote_dir: forwarded to RayTuneSearchEngine and _make_pipeline
    :return: the pipeline produced by _make_pipeline
    """
    ft = TimeSequenceFeatureTransformer(self.future_seq_len,
                                        self.dt_col,
                                        self.target_col,
                                        self.extra_features_col,
                                        self.drop_missing)
    # For a list of data frames the feature list is taken from the first one.
    reference_df = input_df[0] if isinstance(input_df, list) else input_df
    feature_list = ft.get_feature_list(reference_df)

    def model_create_func():
        return TimeSequenceModel(check_optional_config=False,
                                 future_seq_len=self.future_seq_len)

    model = model_create_func()

    # prepare parameters for search engine
    search_space = recipe.search_space(feature_list)
    metric_mode = TimeSequencePredictor._get_metric_mode(metric)

    searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                   resources_per_trial=resources_per_trial,
                                   name=self.name,
                                   remote_dir=remote_dir)
    # NOTE(review): the argument below is a second model instance returned by
    # model_create_func(), not the factory itself. Confirm against
    # RayTuneSearchEngine.compile whether a callable is expected here.
    compile_kwargs = dict(model_create_func=model_create_func(),
                          search_space=search_space,
                          recipe=recipe,
                          feature_transformers=ft,
                          future_seq_len=self.future_seq_len,
                          validation_df=validation_df,
                          metric=metric,
                          metric_mode=metric_mode,
                          mc=mc)
    searcher.compile(input_df, **compile_kwargs)

    analysis = searcher.run()
    return self._make_pipeline(analysis,
                               metric_mode,
                               feature_transformers=ft,
                               model=model,
                               remote_dir=remote_dir)
def fit_with_fixed_configs(self, input_df, validation_df=None, mc=False, **user_configs):
    """
    Fit the pipeline with fixed configs.

    A new feature transformer and a new model are created from the configs
    and the model is trained from initialization. The configs hold both
    identity configs (Eg. "future_seq_len", "dt_col", "target_col", "metric")
    and automl tunable configs (Eg. "past_seq_len", "batch_size"). Call
    get_default_configs to see the names and default values of the configs
    you can specify.

    :param input_df: one data frame or a list of data frames
    :param validation_df: one data frame or a list of data frames
    :param mc: forwarded to the model's fit_eval
    :param user_configs: overwrite or add configs, Eg. "epochs"
    :return: None
    """
    if self.config is None:
        self.config = self.get_default_configs()
    # user_configs is a **kwargs dict and therefore never None.
    self.config.update(user_configs)

    transformer_keys = ('future_seq_len', 'dt_col', 'target_col',
                        'extra_features_col', 'drop_missing')
    transformer_kwargs = {key: self.config[key] for key in transformer_keys}
    self.feature_transformers = TimeSequenceFeatureTransformer(**transformer_kwargs)

    model_keys = ('future_seq_len',)
    model_kwargs = {key: self.config[key] for key in model_keys}
    self.model = TimeSequenceModel(check_optional_config=False, **model_kwargs)

    # Train on every feature the transformer reports for the input.
    all_available_features = self.feature_transformers.get_feature_list(input_df)
    self.config.update({"selected_features": all_available_features})
    x_train, y_train = self.feature_transformers.fit_transform(input_df, **self.config)

    validation_data = None
    if self._is_val_df_valid(validation_df):
        validation_data = self.feature_transformers.transform(validation_df)

    self.model.fit_eval(x_train,
                        y_train,
                        validation_data=validation_data,
                        mc=mc,
                        verbose=1,
                        **self.config)
def __init__(self, feature_transformers=None, model=None, config=None):
    """
    initialize a pipeline

    When no feature transformer is given, model and config must be None as
    well, and a default transformer and a VanillaLSTM are created.

    :param feature_transformers: the feature transformers
    :param model: the internal model
    :param config: the config stored on the pipeline
    """
    creating_new = feature_transformers is None
    if creating_new:
        assert model is None and config is None
        feature_transformers = TimeSequenceFeatureTransformer()
        model = VanillaLSTM(check_optional_config=False)
        print("Initialize new time sequence pipeline.")
    self.feature_transformers = feature_transformers
    self.model = model
    self.config = config
def test_evaluate_predict_future_more_1(self):
    """
    With future_seq_len > 1, check the evaluate/predict output shapes and that
    metrics computed from predictions match pipeline.evaluate.
    """
    target_col = "values"
    metrics = ["mse", "r2"]
    future_seq_len = np.random.randint(2, 6)
    train_df, test_df, tsp, test_sample_num = self.get_input_tsp(future_seq_len,
                                                                 target_col)
    pipeline = tsp.fit(train_df, test_df)

    mse, rs = pipeline.evaluate(test_df, metrics=metrics)
    assert len(mse) == future_seq_len
    assert len(rs) == future_seq_len

    y_pred = pipeline.predict(test_df)
    expected_rows = test_sample_num - default_past_seq_len + 1
    assert y_pred.shape == (expected_rows, future_seq_len + 1)

    # Predict on the frame without its last future_seq_len rows and compare
    # against the rolled ground truth.
    y_pred_df = pipeline.predict(test_df[:-future_seq_len])
    columns = []
    for step in range(future_seq_len):
        columns.append("{}_{}".format(target_col, step))
    y_pred_value = y_pred_df[columns].values

    y_df = test_df[default_past_seq_len:]
    y_value = TimeSequenceFeatureTransformer()._roll_test(y_df[target_col],
                                                          future_seq_len)

    mse_pred_eval, rs_pred_eval = [
        Evaluator.evaluate(name, y_value, y_pred_value) for name in metrics
    ]
    mse_eval, rs_eval = pipeline.evaluate(test_df, metrics)
    assert_array_almost_equal(mse_pred_eval, mse_eval, decimal=2)
    assert_array_almost_equal(rs_pred_eval, rs_eval, decimal=2)
def create_feature_transformer(self):
    """
    Create a TimeSequenceFeatureTransformer from this object's settings.

    :return: a new TimeSequenceFeatureTransformer
    """
    init_args = (self.future_seq_len,
                 self.dt_col,
                 self.target_col,
                 self.extra_features_col,
                 self.drop_missing)
    return TimeSequenceFeatureTransformer(*init_args)
class TestZouwuModelLSTMForecaster(TestCase):
    """Smoke test for LSTMForecaster on randomly generated rolled samples."""

    def setUp(self):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def tearDown(self):
        pass

    def create_data(self):
        """Generate random train/validation/test samples via the transformer's roll methods."""
        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1

        train_df = pd.DataFrame(np.random.randn(64, 4))
        self.x_train, self.y_train = self.ft._roll_train(
            train_df, past_seq_len=look_back, future_seq_len=look_forward)

        val_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_val, self.y_val = self.ft._roll_train(
            val_df, past_seq_len=look_back, future_seq_len=look_forward)

        test_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_test = self.ft._roll_test(test_df, past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        feature_dim = self.x_train.shape[-1]
        model = LSTMForecaster(target_dim=1, feature_dim=feature_dim)
        validation_data = (self.x_val, self.y_val)
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=validation_data,
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)
def load_ts_pipeline(file):
    """
    Restore a TimeSequencePipeline from a saved pipeline file.

    :param file: the file passed to restore_zip
    :return: a TimeSequencePipeline holding the restored transformer, model
        and the config returned by restore_zip
    """
    transformer = TimeSequenceFeatureTransformer()
    restored_model = TimeSequenceModel(check_optional_config=False)
    # restore_zip is handed the empty transformer and model, and returns the config.
    restored_config = restore_zip(file, transformer, restored_model)
    pipeline = TimeSequencePipeline(feature_transformers=transformer,
                                    model=restored_model,
                                    config=restored_config)
    print("Restore pipeline from", file)
    return pipeline
def test_save_restore(self):
    """Save a fitted VanillaLSTM, restore it, and check predictions are unchanged."""
    past_seq_len = 6
    future_seq_len = 1

    train_data = pd.DataFrame(data=np.random.randn(64, 4))
    test_data = pd.DataFrame(data=np.random.randn(16, 4))

    # use roll method in time_sequence
    roller = TimeSequenceFeatureTransformer()
    x_train, y_train = roller._roll_train(train_data,
                                          past_seq_len=past_seq_len,
                                          future_seq_len=future_seq_len)
    x_test = roller._roll_test(test_data, past_seq_len=past_seq_len)

    config = {
        'epochs': 2,
        "lr": 0.001,
        "lstm_1_units": 16,
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": 0.2,
        "batch_size": 32,
    }

    dirname = tempfile.mkdtemp(prefix="automl_test_vanilla")
    try:
        model = VanillaLSTM(check_optional_config=False,
                            future_seq_len=future_seq_len)
        model.fit_eval(x_train, y_train, **config)
        predict_before = model.predict(x_test)

        model_path = os.path.join(dirname, "testmodel.h5")
        config_path = os.path.join(dirname, "local_config.json")
        model.save(model_path=model_path, config_path=config_path)

        # Merge the saved config into the training config before restoring.
        config.update(load_config(config_path))
        model.restore(model_path=model_path, **config)

        predict_after = model.predict(x_test)
        assert np.allclose(predict_before, predict_after)
    finally:
        shutil.rmtree(dirname)
def setup_method(self, method):
    """Create random data frames and two LSTMSeq2Seq models (future_seq_len 1 and 2)."""
    self.train_data = pd.DataFrame(data=np.random.randn(64, 4))
    self.val_data = pd.DataFrame(data=np.random.randn(16, 4))
    self.test_data = pd.DataFrame(data=np.random.randn(16, 4))

    self.past_seq_len = 6
    self.future_seq_len_1, self.future_seq_len_2 = 1, 2

    # use roll method in time_sequence
    self.feat = TimeSequenceFeatureTransformer()

    self.config = {'batch_size': 32, 'epochs': 1}

    self.model_1 = LSTMSeq2Seq(check_optional_config=False,
                               future_seq_len=self.future_seq_len_1)
    self.model_2 = LSTMSeq2Seq(check_optional_config=False,
                               future_seq_len=self.future_seq_len_2)

    self.fitted = False
    self.predict_1 = self.predict_2 = None
def test_dataframe_input_with_datetime(self):
    """Run a search on data frames with a datetime column and check a best trial exists."""
    train_df, validation_df, future_seq_len = get_ts_input()
    transformer = TimeSequenceFeatureTransformer(future_seq_len=future_seq_len,
                                                 dt_col="datetime",
                                                 target_col="value")
    search_data = {'df': train_df, 'val_df': validation_df}
    searcher = prepare_searcher(data=search_data,
                                model_creator=LSTM_model_creator,
                                name='test_ray_dateframe_with_datetime_with_val',
                                feature_transformer=transformer)
    searcher.run()
    assert searcher.get_best_trials(k=1) is not None
class TestSeq2Seq(ZooTestCase):
    """Tests for LSTMSeq2Seq with future_seq_len of 1 and 2 on random data."""

    def setup_method(self, method):
        # super().setup_method(method)
        self.train_data = pd.DataFrame(data=np.random.randn(64, 4))
        self.val_data = pd.DataFrame(data=np.random.randn(16, 4))
        self.test_data = pd.DataFrame(data=np.random.randn(16, 4))

        self.past_seq_len = 6
        self.future_seq_len_1 = 1
        self.future_seq_len_2 = 2

        # use roll method in time_sequence
        self.feat = TimeSequenceFeatureTransformer()

        self.config = {'batch_size': 32, 'epochs': 1}

        self.model_1 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_1)
        self.model_2 = LSTMSeq2Seq(check_optional_config=False,
                                   future_seq_len=self.future_seq_len_2)

        self.fitted = False
        self.predict_1 = None
        self.predict_2 = None

    def teardown_method(self, method):
        pass

    def _rolled_xy(self, data, future_seq_len):
        """Roll ``data`` into (x, y) samples using the shared past_seq_len."""
        return self.feat._roll_train(data,
                                     past_seq_len=self.past_seq_len,
                                     future_seq_len=future_seq_len)

    def _rolled_test_x(self):
        """Roll the test data into x samples using the shared past_seq_len."""
        return self.feat._roll_test(self.test_data,
                                    past_seq_len=self.past_seq_len)

    @staticmethod
    def _assert_same_prediction(before, after):
        """Assert two predictions agree to 2 decimals, reporting both on failure."""
        # The message has to go through err_msg: written after a comma it only
        # builds a tuple and is never shown.
        assert_array_almost_equal(
            before, after, decimal=2,
            err_msg="Prediction values are not the same after restore: "
                    "predict before is {}, and predict after is {}".format(before, after))

    def test_fit_eval_1(self):
        x_train_1, y_train_1 = self._rolled_xy(self.train_data, self.future_seq_len_1)
        print("fit_eval_future_seq_len_1:",
              self.model_1.fit_eval(x_train_1, y_train_1, **self.config))
        assert self.model_1.past_seq_len == 6
        assert self.model_1.feature_num == 4
        assert self.model_1.future_seq_len == 1
        assert self.model_1.target_col_num == 1

    def test_fit_eval_2(self):
        x_train_2, y_train_2 = self._rolled_xy(self.train_data, self.future_seq_len_2)
        print("fit_eval_future_seq_len_2:",
              self.model_2.fit_eval(x_train_2, y_train_2, **self.config))
        assert self.model_2.future_seq_len == 2

        self.fitted = True

    def test_evaluate_1(self):
        x_train_1, y_train_1 = self._rolled_xy(self.train_data, self.future_seq_len_1)
        x_val_1, y_val_1 = self._rolled_xy(self.val_data, self.future_seq_len_1)

        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        print("evaluate_future_seq_len_1:",
              self.model_1.evaluate(x_val_1, y_val_1, metric=['mse', 'r2']))

    def test_evaluate_2(self):
        x_train_2, y_train_2 = self._rolled_xy(self.train_data, self.future_seq_len_2)
        x_val_2, y_val_2 = self._rolled_xy(self.val_data, self.future_seq_len_2)

        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        print("evaluate_future_seq_len_2:",
              self.model_2.evaluate(x_val_2, y_val_2, metric=['mse', 'r2']))

    def test_predict_1(self):
        x_train_1, y_train_1 = self._rolled_xy(self.train_data, self.future_seq_len_1)
        x_test_1 = self._rolled_test_x()
        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        predict_1 = self.model_1.predict(x_test_1)
        assert predict_1.shape == (x_test_1.shape[0], self.future_seq_len_1)

    def test_predict_2(self):
        x_train_2, y_train_2 = self._rolled_xy(self.train_data, self.future_seq_len_2)
        x_test_2 = self._rolled_test_x()
        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        predict_2 = self.model_2.predict(x_test_2)
        assert predict_2.shape == (x_test_2.shape[0], self.future_seq_len_2)

    def test_save_restore_1(self):
        x_train_1, y_train_1 = self._rolled_xy(self.train_data, self.future_seq_len_1)
        x_test_1 = self._rolled_test_x()
        self.model_1.fit_eval(x_train_1, y_train_1, **self.config)

        predict_1_before = self.model_1.predict(x_test_1)
        new_model_1 = LSTMSeq2Seq(check_optional_config=False)

        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_1)
            restore(dirname, model=new_model_1, config=self.config)
            predict_1_after = new_model_1.predict(x_test_1)
            self._assert_same_prediction(predict_1_before, predict_1_after)
            # The restored model must also be trainable again.
            new_config = {'epochs': 1}
            new_model_1.fit_eval(x_train_1, y_train_1, **new_config)
        finally:
            shutil.rmtree(dirname)

    def test_save_restore_2(self):
        x_train_2, y_train_2 = self._rolled_xy(self.train_data, self.future_seq_len_2)
        x_test_2 = self._rolled_test_x()
        self.model_2.fit_eval(x_train_2, y_train_2, **self.config)

        predict_2_before = self.model_2.predict(x_test_2)
        new_model_2 = LSTMSeq2Seq(check_optional_config=False)

        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_2)
            restore(dirname, model=new_model_2, config=self.config)
            predict_2_after = new_model_2.predict(x_test_2)
            self._assert_same_prediction(predict_2_before, predict_2_after)
            # The restored model must also be trainable again.
            new_config = {'epochs': 2}
            new_model_2.fit_eval(x_train_2, y_train_2, **new_config)
        finally:
            shutil.rmtree(dirname)

    def test_predict_with_uncertainty(self, ):
        x_train_2, y_train_2 = self._rolled_xy(self.train_data, self.future_seq_len_2)
        x_test_2 = self._rolled_test_x()
        self.model_2.fit_eval(x_train_2, y_train_2, mc=True, **self.config)

        expected_shape = (x_test_2.shape[0], self.future_seq_len_2)
        prediction, uncertainty = self.model_2.predict_with_uncertainty(x_test_2, n_iter=2)
        assert prediction.shape == expected_shape
        assert uncertainty.shape == expected_shape
        assert np.any(uncertainty)

        new_model_2 = LSTMSeq2Seq(check_optional_config=False)
        dirname = tempfile.mkdtemp(prefix="automl_test_feature")
        try:
            save(dirname, model=self.model_2)
            restore(dirname, model=new_model_2, config=self.config)
            prediction, uncertainty = new_model_2.predict_with_uncertainty(x_test_2, n_iter=2)
            assert prediction.shape == expected_shape
            assert uncertainty.shape == expected_shape
            assert np.any(uncertainty)
        finally:
            shutil.rmtree(dirname)
class TestMTNetKeras(ZooTestCase):
    """Tests for MTNetKeras fit/evaluate, save/restore and uncertainty prediction."""

    def setup_method(self, method):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        # create_data() sets self.long_num and self.time_step, which the config reads.
        self.create_data()
        self.model = MTNetKeras()
        self.config = {"long_num": self.long_num,
                       "time_step": self.time_step,
                       "ar_window": np.random.randint(1, 3),
                       "cnn_height": np.random.randint(1, 3),
                       "epochs": 1}

    def teardown_method(self, method):
        pass

    def create_data(self):
        """Generate random train/validation/test samples via the transformer's roll methods."""
        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(data=np.random.randn(64, 4),
                                                      past_seq_len=look_back,
                                                      future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(data=np.random.randn(16, 4),
                                                  past_seq_len=look_back,
                                                  future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_fit_evaluate(self):
        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            **self.config)
        self.model.evaluate(self.x_val, self.y_val)

    def test_save_restore(self):
        # Imported locally so the test does not depend on a module-level import.
        import tempfile

        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            **self.config)
        y_pred = self.model.predict(self.x_test)
        assert y_pred.shape == (self.x_test.shape[0], self.y_train.shape[1])

        restored_model = MTNetKeras()
        # A unique temp directory avoids clashes with an existing "tmp" folder,
        # and always exists when the cleanup in `finally` runs.
        dirname = tempfile.mkdtemp(prefix="automl_test_mtnet")
        try:
            save(dirname, model=self.model)
            restore(dirname, model=restored_model, config=self.config)
            predict_after = restored_model.predict(self.x_test)
            # The message has to go through err_msg: written after a comma it
            # only builds a tuple and is never shown.
            assert_array_almost_equal(
                y_pred, predict_after, decimal=2,
                err_msg="Prediction values are not the same after restore: "
                        "predict before is {}, and predict after is {}".format(y_pred,
                                                                              predict_after))
            # The restored model must also be trainable again.
            restored_model.fit_eval(self.x_train, self.y_train, epochs=1)
            restored_model.evaluate(self.x_val, self.y_val)
        finally:
            shutil.rmtree(dirname)

    def test_predict_with_uncertainty(self):
        self.model.fit_eval(self.x_train, self.y_train,
                            validation_data=(self.x_val, self.y_val),
                            mc=True,
                            **self.config)
        pred, uncertainty = self.model.predict_with_uncertainty(self.x_test, n_iter=2)
        assert pred.shape == (self.x_test.shape[0], self.y_train.shape[1])
        assert uncertainty.shape == pred.shape
        assert np.any(uncertainty)
def setUp(self):
    """Reset the Keras session, create the feature transformer and generate test data."""
    tf.keras.backend.clear_session()
    # create_data() uses self.ft, so the transformer is assigned first.
    transformer = TimeSequenceFeatureTransformer()
    self.ft = transformer
    self.create_data()
class TestZouwuModelForecast(ZooTestCase):
    """Smoke tests for the LSTM, MTNet and TCMF forecasters on random data."""

    def setup_method(self, method):
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        pass

    def create_data(self):
        """Generate random train/validation/test samples via the transformer's roll methods."""
        self.long_num = 6
        self.time_step = 2
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1

        train_df = pd.DataFrame(np.random.randn(64, 4))
        self.x_train, self.y_train = self.ft._roll_train(
            train_df, past_seq_len=look_back, future_seq_len=look_forward)

        val_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_val, self.y_val = self.ft._roll_train(
            val_df, past_seq_len=look_back, future_seq_len=look_forward)

        test_df = pd.DataFrame(np.random.randn(16, 4))
        self.x_test = self.ft._roll_test(test_df, past_seq_len=look_back)

    def test_forecast_lstm(self):
        # TODO hacking to fix a bug
        feature_dim = self.x_train.shape[-1]
        model = LSTMForecaster(target_dim=1, feature_dim=feature_dim)
        validation_data = (self.x_val, self.y_val)
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=validation_data,
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)

    def test_forecast_mtnet(self):
        # TODO hacking to fix a bug
        model = MTNetForecaster(target_dim=1,
                                feature_dim=self.x_train.shape[-1],
                                lb_long_steps=self.long_num,
                                lb_long_stepsize=self.time_step)

        # preprocess_input returns a (long, short) pair for each data set.
        train_inputs = list(model.preprocess_input(self.x_train))
        val_inputs = list(model.preprocess_input(self.x_val))
        test_inputs = list(model.preprocess_input(self.x_test))

        model.fit(train_inputs,
                  self.y_train,
                  validation_data=(val_inputs, self.y_val),
                  batch_size=32,
                  distributed=False)
        model.evaluate(val_inputs, self.y_val)
        model.predict(test_inputs)

    def test_forecast_tcmf(self):
        from zoo.zouwu.model.forecast import TCMFForecaster

        tcmf_kwargs = dict(max_y_iterations=1,
                           init_XF_epoch=1,
                           max_FX_epoch=1,
                           max_TCN_epoch=1,
                           alt_iters=2)
        model = TCMFForecaster(**tcmf_kwargs)

        x = np.random.rand(300, 480)
        model.fit(x)
        model.predict(x=None, horizon=24)

        target_value = np.random.rand(300, 24)
        model.evaluate(x=None, target_value=target_value, metric=['mse'])
class TimeSequencePipeline(Pipeline):
    """Pipeline bundling a feature transformer, a model and their config."""

    def __init__(self, feature_transformers=None, model=None, config=None, name=None):
        """
        initialize a pipeline

        :param feature_transformers: the feature transformers
        :param model: the internal model
        :param config: the config dict used by the pipeline
        :param name: pipeline name, used in the default save file names
        """
        self.feature_transformers = feature_transformers
        self.model = model
        self.config = config
        self.name = name
        # Creation timestamp; part of the default file names in save()/config_save().
        self.time = time.strftime("%Y%m%d-%H%M%S")

    def describe(self):
        """Print the initialization-related entries of the config."""
        init_info = [
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        ]
        print("**** Initialization info ****")
        for info in init_info:
            print(info + ":", self.config[info])
        print("")

    def fit(self, input_df, validation_df=None, mc=False, epoch_num=20):
        """
        Fit the existing model on new data.

        :param input_df: training data, passed to the feature transformer
        :param validation_df: one data frame or a list of data frames; ignored
            when it is None, empty, or not a data frame / list
        :param mc: forwarded to the model's fit_eval
        :param epoch_num: number of epochs, passed to fit_eval as 'epochs'
        """
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        # Same validity check as fit_with_fixed_configs, so a list of data
        # frames no longer fails on a missing `.empty` attribute.
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(validation_df)
        else:
            validation_data = None
        new_config = {'epochs': epoch_num}
        self.model.fit_eval(x, y, validation_data, mc=mc, verbose=1, **new_config)
        print('Fit done!')

    def _is_val_df_valid(self, validation_df):
        """
        Tell whether validation_df holds usable validation data.

        :param validation_df: a data frame, a list of data frames, or None
        :return: True for a non-empty data frame, or a non-empty list in which
            at least one data frame is not empty; False otherwise
        """
        if isinstance(validation_df, pd.DataFrame):
            return not validation_df.empty
        if isinstance(validation_df, list):
            return bool(validation_df) and not all(d.empty for d in validation_df)
        return False

    def _check_configs(self):
        """
        Check required configs and report which optional configs are unset.

        :raises ValueError: if any required config is missing from self.config
        """
        required_configs = {'future_seq_len'}
        # A set difference catches every missing key; an intersection test
        # would pass as soon as any single required key is present.
        if required_configs.difference(self.config):
            raise ValueError("Missing required parameters in configuration. " +
                             "Required parameters are: " + str(required_configs))
        default_config = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mse'
        }
        for config, value in default_config.items():
            if config not in self.config:
                print('Config: \'{}\' is not specified. '
                      'A default value of {} will be used.'.format(config, value))

    def get_default_configs(self):
        """
        Print and return the default configs.

        :return: a new dict with the default config values
        """
        default_configs = {
            'dt_col': 'datetime',
            'target_col': 'value',
            'extra_features_col': None,
            'drop_missing': True,
            'future_seq_len': 1,
            'past_seq_len': 2,
            'batch_size': 64,
            'lr': 0.001,
            'dropout': 0.2,
            'epochs': 10,
            'metric': 'mean_squared_error'
        }
        print("**** default config: ****")
        for config in default_configs:
            print(config + ":", default_configs[config])
        print(
            "You can change any fields in the default configs by passing into "
            "fit_with_fixed_configs(). Otherwise, the default values will be used."
        )
        return default_configs

    def fit_with_fixed_configs(self, input_df, validation_df=None, mc=False, **user_configs):
        """
        Fit pipeline with fixed configs. The model will be trained from initialization
        with the hyper-parameter specified in configs. The configs contain both identity configs
        (Eg. "future_seq_len", "dt_col", "target_col", "metric") and automl tunable configs
        (Eg. "past_seq_len", "batch_size").
        We recommend calling get_default_configs to see the name and default values of configs
        you can specify.

        :param input_df: one data frame or a list of data frames
        :param validation_df: one data frame or a list of data frames
        :param mc: forwarded to the model's fit_eval
        :param user_configs: you can overwrite or add more configs with user_configs. Eg. "epochs"
        :return: None
        """
        # self._check_configs()
        if self.config is None:
            self.config = self.get_default_configs()
        if user_configs is not None:
            self.config.update(user_configs)

        ft_id_config_set = {
            'future_seq_len', 'dt_col', 'target_col', 'extra_features_col',
            'drop_missing'
        }
        ft_id_configs = {a: self.config[a] for a in ft_id_config_set}
        self.feature_transformers = TimeSequenceFeatureTransformer(**ft_id_configs)

        model_id_config_set = {'future_seq_len'}
        model_id_configs = {a: self.config[a] for a in model_id_config_set}
        self.model = TimeSequenceModel(check_optional_config=False, **model_id_configs)

        # Train on every feature the transformer reports for the input.
        all_available_features = self.feature_transformers.get_feature_list(input_df)
        self.config.update({"selected_features": all_available_features})
        (x_train, y_train) = self.feature_transformers.fit_transform(input_df, **self.config)
        if self._is_val_df_valid(validation_df):
            validation_data = self.feature_transformers.transform(validation_df)
        else:
            validation_data = None

        self.model.fit_eval(x_train,
                            y_train,
                            validation_data=validation_data,
                            mc=mc,
                            verbose=1,
                            **self.config)

    def evaluate(self, input_df, metrics=None, multioutput='raw_values'):
        """
        evaluate the pipeline

        :param input_df: data to evaluate on
        :param metrics: a metric name or a list of metric names understood by
            Evaluator.evaluate; defaults to ["mse"]
        :param multioutput: string in ['raw_values', 'uniform_average']
            'raw_values' :
                Returns a full set of errors in case of multioutput input.
            'uniform_average' :
                Errors of all outputs are averaged with uniform weight.
            Forced to 'uniform_average' when the prediction has a single column.
        :return: a list with one evaluation result per metric
        """
        # None stands in for the default to avoid a shared mutable default argument.
        if metrics is None:
            metrics = ["mse"]
        if isinstance(metrics, str):
            metrics = [metrics]
        x, _ = self.feature_transformers.transform(input_df, is_train=True)
        y_pred = self.model.predict(x)
        if y_pred.shape[1] == 1:
            multioutput = 'uniform_average'
        y_unscale, y_pred_unscale = self.feature_transformers.post_processing(
            input_df, y_pred, is_train=True)

        return [
            Evaluator.evaluate(m, y_unscale, y_pred_unscale, multioutput=multioutput)
            for m in metrics
        ]

    def predict(self, input_df):
        """
        predict test data with the pipeline fitted

        :param input_df: data to predict on
        :return: the output of the feature transformer's post_processing
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(input_df, y_pred, is_train=False)
        return y_output

    def predict_with_uncertainty(self, input_df, n_iter=100):
        """
        predict test data and the uncertainty of the prediction

        :param input_df: data to predict on
        :param n_iter: forwarded to the model's predict_with_uncertainty
        :return: (post-processed prediction, unscaled uncertainty)
        """
        x, _ = self.feature_transformers.transform(input_df, is_train=False)
        y_pred, y_pred_uncertainty = self.model.predict_with_uncertainty(x=x, n_iter=n_iter)
        y_output = self.feature_transformers.post_processing(input_df, y_pred, is_train=False)
        y_uncertainty = self.feature_transformers.unscale_uncertainty(y_pred_uncertainty)
        return y_output, y_uncertainty

    def save(self, ppl_file=None):
        """
        save pipeline to file, contains feature transformer, model, trial config.

        :param ppl_file: target file; defaults to "<name>_<time>.ppl" under DEFAULT_PPL_DIR
        :return: the file the pipeline was saved to
        """
        ppl_file = ppl_file or os.path.join(DEFAULT_PPL_DIR,
                                            "{}_{}.ppl".format(self.name, self.time))
        save_zip(ppl_file, self.feature_transformers, self.model, self.config)
        print("Pipeline is saved in", ppl_file)
        return ppl_file

    def config_save(self, config_file=None):
        """
        save all configs to file.

        :param config_file: target file; defaults to "<name>_<time>.json" under
            DEFAULT_CONFIG_DIR
        :return: the file the configs were saved to
        """
        config_file = config_file or os.path.join(DEFAULT_CONFIG_DIR,
                                                  "{}_{}.json".format(self.name, self.time))
        save_config(config_file, self.config, replace=True)
        return config_file
def _hp_search(self, input_df, validation_df, metric):
    """
    Run the hyper-parameter search and build a pipeline from the best trial.

    :param input_df: training data frame; used to list the candidate
        features and handed to the search engine
    :param validation_df: validation data frame handed to the search engine
    :param metric: metric handed to the search engine
    :return: the pipeline built by self._make_pipeline from the best trial
    """
    ft = TimeSequenceFeatureTransformer(self.future_seq_len, self.dt_col,
                                        self.target_col,
                                        self.extra_features_col,
                                        self.drop_missing)
    feature_list = ft.get_feature_list(input_df)

    # model
    model = VanillaLSTM(check_optional_config=False,
                        future_seq_len=self.future_seq_len)

    search_space = {
        # -------- feature related parameters
        # NOTE: np.random.randint excludes `high`, so each sample draws
        # between 3 and len(feature_list) - 1 features, and the draw raises
        # ValueError unless at least 4 features are available.
        "selected_features": RandomSample(lambda spec: np.random.choice(
            feature_list,
            size=np.random.randint(low=3, high=len(feature_list), size=1),
            replace=False)),

        # --------- model related parameters
        'out_units': self.future_seq_len,
        "lr": 0.001,
        "lstm_1_units": GridSearch([16, 32]),
        "dropout_1": 0.2,
        "lstm_2_units": 10,
        "dropout_2": RandomSample(lambda spec: np.random.uniform(0.2, 0.5)),
        "batch_size": 1024,
    }

    stop = {"reward_metric": -0.05, "training_iteration": 10}

    searcher = RayTuneSearchEngine(logs_dir=self.logs_dir,
                                   ray_num_cpus=6,
                                   resources_per_trial={"cpu": 2})
    searcher.compile(input_df,
                     search_space=search_space,
                     stop=stop,
                     feature_transformers=ft,
                     model=model,
                     validation_df=validation_df,
                     metric=metric)
    # The return value of run() is not needed; the best trial is queried
    # from the searcher afterwards.
    searcher.run()
    best = searcher.get_best_trials(k=1)[0]  # best single trial; later could be n

    # Build the final model with the same future_seq_len as the model used
    # during the search, instead of falling back to VanillaLSTM's default.
    pipeline = self._make_pipeline(
        best,
        feature_transformers=ft,
        model=VanillaLSTM(check_optional_config=False,
                          future_seq_len=self.future_seq_len))
    return pipeline
class TestZouwuModelForecast(ZooTestCase):
    """Smoke tests that fit, evaluate and predict with the LSTM and MTNet forecasters."""

    def setup_method(self, method):
        """Reset the Keras session and create fresh random fixtures."""
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        """Nothing to clean up after a test."""

    def create_data(self):
        """Roll random data into train, validation and test samples."""
        self.long_num = 6
        self.time_step = 2
        # One short series plus long_num long series, each time_step long.
        past_len = (self.long_num + 1) * self.time_step
        future_len = 1

        def roll_for_training(values):
            frame = pd.DataFrame(values)
            rolled_x, rolled_y = self.ft._roll_train(
                frame, past_seq_len=past_len, future_seq_len=future_len)
            return rolled_x, rolled_y

        def roll_for_testing(values):
            frame = pd.DataFrame(values)
            return self.ft._roll_test(frame, past_seq_len=past_len)

        self.x_train, self.y_train = roll_for_training(
            np.random.randn(64, 4))
        self.x_val, self.y_val = roll_for_training(np.random.randn(16, 4))
        self.x_test = roll_for_testing(np.random.randn(16, 4))

    def test_forecast_lstm(self):
        """Fit, evaluate and predict with LSTMForecaster."""
        # TODO hacking to fix a bug
        feature_dim = self.x_train.shape[-1]
        forecaster = LSTMForecaster(horizon=1, feature_dim=feature_dim)
        validation = (self.x_val, self.y_val)
        forecaster.fit(self.x_train,
                       self.y_train,
                       validation_data=validation,
                       batch_size=8,
                       distributed=False)
        forecaster.evaluate(self.x_val, self.y_val)
        forecaster.predict(self.x_test)

    def test_forecast_mtnet(self):
        """Fit, evaluate and predict with MTNetForecaster on long/short inputs."""
        # TODO hacking to fix a bug
        feature_dim = self.x_train.shape[-1]
        forecaster = MTNetForecaster(horizon=1,
                                     feature_dim=feature_dim,
                                     lb_long_steps=self.long_num,
                                     lb_long_stepsize=self.time_step)
        train_long, train_short = forecaster.preprocess_input(self.x_train)
        val_long, val_short = forecaster.preprocess_input(self.x_val)
        test_long, test_short = forecaster.preprocess_input(self.x_test)

        val_inputs = [val_long, val_short]
        forecaster.fit([train_long, train_short],
                       self.y_train,
                       validation_data=(val_inputs, self.y_val),
                       batch_size=32,
                       distributed=False)
        forecaster.evaluate([val_long, val_short], self.y_val)
        forecaster.predict([test_long, test_short])
def setup_method(self, method):
    """Reset the Keras session, then create the transformer and the test data."""
    keras_backend = tf.keras.backend
    keras_backend.clear_session()
    # The transformer must exist before create_data() is called.
    self.ft = TimeSequenceFeatureTransformer()
    self.create_data()
class TestZouwuModelForecast(ZooTestCase):
    """Tests for the Zouwu LSTM, MTNet and TCMF forecasters on random data."""

    def setup_method(self, method):
        """Reset the Keras session and create fresh random fixtures."""
        tf.keras.backend.clear_session()
        self.ft = TimeSequenceFeatureTransformer()
        self.create_data()

    def teardown_method(self, method):
        """Nothing to clean up after a test."""

    def create_data(self):
        """Roll random data into train, validation and test samples."""

        def gen_train_sample(data, past_seq_len, future_seq_len):
            data = pd.DataFrame(data)
            x, y = self.ft._roll_train(data,
                                       past_seq_len=past_seq_len,
                                       future_seq_len=future_seq_len)
            return x, y

        def gen_test_sample(data, past_seq_len):
            test_data = pd.DataFrame(data)
            x = self.ft._roll_test(test_data, past_seq_len=past_seq_len)
            return x

        self.long_num = 6
        self.time_step = 2
        # One short series plus long_num long series, each time_step long.
        look_back = (self.long_num + 1) * self.time_step
        look_forward = 1
        self.x_train, self.y_train = gen_train_sample(
            data=np.random.randn(64, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_val, self.y_val = gen_train_sample(
            data=np.random.randn(16, 4),
            past_seq_len=look_back,
            future_seq_len=look_forward)
        self.x_test = gen_test_sample(data=np.random.randn(16, 4),
                                      past_seq_len=look_back)

    def test_forecast_lstm(self):
        """Fit, evaluate and predict with LSTMForecaster."""
        # TODO hacking to fix a bug
        model = LSTMForecaster(target_dim=1,
                               feature_dim=self.x_train.shape[-1])
        model.fit(self.x_train,
                  self.y_train,
                  validation_data=(self.x_val, self.y_val),
                  batch_size=8,
                  distributed=False)
        model.evaluate(self.x_val, self.y_val)
        model.predict(self.x_test)

    def test_forecast_mtnet(self):
        """Fit, evaluate and predict with MTNetForecaster on long/short inputs."""
        # TODO hacking to fix a bug
        model = MTNetForecaster(target_dim=1,
                                feature_dim=self.x_train.shape[-1],
                                long_series_num=self.long_num,
                                series_length=self.time_step)
        x_train_long, x_train_short = model.preprocess_input(self.x_train)
        x_val_long, x_val_short = model.preprocess_input(self.x_val)
        x_test_long, x_test_short = model.preprocess_input(self.x_test)

        model.fit([x_train_long, x_train_short],
                  self.y_train,
                  validation_data=([x_val_long, x_val_short], self.y_val),
                  batch_size=32,
                  distributed=False)
        model.evaluate([x_val_long, x_val_short], self.y_val)
        model.predict([x_test_long, x_test_short])

    def test_forecast_tcmf(self):
        """Exercise TCMFForecaster on an ndarray input that carries ids."""
        from zoo.zouwu.model.forecast import TCMFForecaster
        import tempfile

        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)
        horizon = np.random.randint(1, 50)
        # construct data
        ids = np.arange(300)
        data = np.random.rand(300, 480)

        # An input without the `y` key is rejected.
        with self.assertRaises(Exception) as context:
            model.fit({'data': data})
        self.assertIn("key `y` doesn't exist in x", str(context.exception))

        train_input = {'id': ids, 'y': data}
        with self.assertRaises(Exception) as context:
            model.is_distributed()
        self.assertIn('You should run fit before calling is_distributed()',
                      str(context.exception))

        model.fit(train_input)
        assert not model.is_distributed()

        # Fitting twice and incremental fitting are both rejected.
        with self.assertRaises(Exception) as context:
            model.fit(train_input)
        self.assertIn('This model has already been fully trained',
                      str(context.exception))
        with self.assertRaises(Exception) as context:
            model.fit(train_input, incremental=True)
        self.assertIn('NotImplementedError',
                      context.exception.__class__.__name__)

        # Keep the directory alive until the loaded model has predicted.
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
            yhat = model.predict(x=None, horizon=horizon)
            yhat_loaded = loaded_model.predict(x=None, horizon=horizon)

        yhat_id = yhat_loaded["id"]
        assert (yhat_id == ids).all()
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (300, horizon)
        assert (yhat == yhat_loaded).all()

        target_value = np.random.rand(300, horizon)
        target_value = dict({"y": target_value})
        model.evaluate(x=None, target_value=target_value, metric=['mse'])

    def test_forecast_tcmf_without_id(self):
        """Exercise TCMFForecaster on an ndarray input without ids."""
        from zoo.zouwu.model.forecast import TCMFForecaster
        import tempfile

        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)
        horizon = np.random.randint(1, 50)
        # construct data; the id array is deliberately shorter than the data
        ids = np.arange(200)
        data = np.random.rand(300, 480)

        with self.assertRaises(Exception) as context:
            model.fit({'y': "abc"})
        self.assertIn("the value of y should be an ndarray",
                      str(context.exception))

        with self.assertRaises(Exception) as context:
            model.fit({'id': ids, 'y': data})
        self.assertIn(
            "the length of the id array should be equal to the number of",
            str(context.exception))

        train_input = {'y': data}
        model.fit(train_input)
        assert not model.is_distributed()
        with self.assertRaises(Exception) as context:
            model.fit(train_input)
        self.assertIn('This model has already been fully trained',
                      str(context.exception))

        # Keep the directory alive until the loaded model has predicted.
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname)
            loaded_model = TCMFForecaster.load(tempdirname, distributed=False)
            yhat = model.predict(x=None, horizon=horizon)
            yhat_loaded = loaded_model.predict(x=None, horizon=horizon)

        assert "id" not in yhat_loaded
        yhat = yhat["prediction"]
        yhat_loaded = yhat_loaded["prediction"]
        assert yhat.shape == (300, horizon)
        assert (yhat == yhat_loaded).all()

        target_value = np.random.rand(300, horizon)
        target_value_fake = dict({"data": target_value})
        with self.assertRaises(Exception) as context:
            model.evaluate(x=None,
                           target_value=target_value_fake,
                           metric=['mse'])
        self.assertIn("key y doesn't exist in y", str(context.exception))
        target_value = dict({"y": target_value})
        model.evaluate(x=None, target_value=target_value, metric=['mse'])

    def test_forecast_tcmf_xshards(self):
        """Exercise TCMFForecaster on XShards read with the pandas backend."""
        from zoo.orca import OrcaContext
        import zoo.orca.data.pandas
        import tempfile

        OrcaContext.pandas_read_backend = "pandas"
        try:
            # The file is kept until all checks are done so that shards
            # derived from it never outlive their source.
            with tempfile.NamedTemporaryFile() as temp:
                data = np.random.rand(300, 480)
                df = pd.DataFrame(data)
                df.to_csv(temp.name)
                shard = zoo.orca.data.pandas.read_csv(temp.name)
                shard.cache()
                self._check_tcmf_on_xshards(shard)
        finally:
            # Reset the global read backend even when an assertion fails,
            # so the "pandas" setting cannot leak into other tests.
            OrcaContext.pandas_read_backend = "spark"

    def _check_tcmf_on_xshards(self, shard):
        """Run the fit/save/load/predict checks of the xshards test on shard."""
        from zoo.zouwu.model.forecast import TCMFForecaster
        import tempfile

        def preprocessing(df, id_name, y_name):
            ids = df.index
            data = df.to_numpy()
            result = dict({id_name: ids, y_name: data})
            return result

        def postprocessing(pred_results, output_dt_col_name):
            id_arr = pred_results["id"]
            pred_results = pred_results["prediction"]
            pred_results = np.concatenate(
                (np.expand_dims(id_arr, axis=1), pred_results), axis=1)
            final_df = pd.DataFrame(pred_results,
                                    columns=["id"] + output_dt_col_name)
            final_df.id = final_df.id.astype("int")
            final_df = final_df.set_index("id")
            final_df.columns.name = "datetime"
            final_df = final_df.unstack().reset_index().rename(
                {0: "prediction"}, axis=1)
            return final_df

        def get_pred(d):
            return d["prediction"]

        model = TCMFForecaster(max_y_iterations=1,
                               init_FX_epoch=1,
                               max_FX_epoch=1,
                               max_TCN_epoch=1,
                               alt_iters=2)

        # Shards without the `y` key, then without the `id` key, are rejected.
        shard_train = shard.transform_shard(preprocessing, 'id', 'data')
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertIn("key `y` doesn't exist in x", str(context.exception))
        shard_train = shard.transform_shard(preprocessing, 'cid', 'y')
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertIn("key `id` doesn't exist in x", str(context.exception))

        with self.assertRaises(Exception) as context:
            model.is_distributed()
        self.assertIn('You should run fit before calling is_distributed()',
                      str(context.exception))

        shard_train = shard.transform_shard(preprocessing, 'id', 'y')
        model.fit(shard_train)
        assert model.is_distributed()

        # Fitting twice and incremental fitting are both rejected.
        with self.assertRaises(Exception) as context:
            model.fit(shard_train)
        self.assertIn('This model has already been fully trained',
                      str(context.exception))
        with self.assertRaises(Exception) as context:
            model.fit(shard_train, incremental=True)
        self.assertIn('NotImplementedError',
                      context.exception.__class__.__name__)

        # Keep the model directory alive until every result derived from
        # the loaded model has been collected.
        with tempfile.TemporaryDirectory() as tempdirname:
            model.save(tempdirname + "/model")
            loaded_model = TCMFForecaster.load(tempdirname + "/model",
                                               distributed=True)
            horizon = np.random.randint(1, 50)
            yhat_shard_origin = model.predict(x=None, horizon=horizon)
            yhat_list_origin = yhat_shard_origin.collect()
            yhat_list_origin = list(map(get_pred, yhat_list_origin))
            yhat_shard = loaded_model.predict(x=None, horizon=horizon)
            yhat_list = yhat_shard.collect()
            yhat_list = list(map(get_pred, yhat_list))
            yhat_origin = np.concatenate(yhat_list_origin)
            yhat = np.concatenate(yhat_list)
            assert yhat.shape == (300, horizon)
            assert (yhat == yhat_origin).all()

            output_dt_col_name = pd.date_range(start='2020-05-01',
                                               periods=horizon,
                                               freq='H').to_list()
            yhat_df_shards = yhat_shard.transform_shard(postprocessing,
                                                        output_dt_col_name)
            final_df_list = yhat_df_shards.collect()
            final_df = pd.concat(final_df_list)
            final_df.sort_values("datetime", inplace=True)
            assert final_df.shape == (300 * horizon, 3)
def setup_method(self, method):
    """Create the feature transformer, then build the test data with it."""
    transformer = TimeSequenceFeatureTransformer()
    self.ft = transformer
    # create_data() is called only after self.ft has been set.
    self.create_data()
class TimeSequencePipeline(Pipeline):
    """Pipeline that bundles a feature transformer, a model and a config."""

    def __init__(self, feature_transformers=None, model=None, config=None):
        """
        initialize a pipeline
        :param feature_transformers: the feature transformers; when None, a
            default TimeSequenceFeatureTransformer and VanillaLSTM are
            created, and model and config must be None as well
        :param model: the internal model
        :param config: the config kept with the pipeline and written by save
        """
        if feature_transformers is None:
            assert model is None and config is None
            self.feature_transformers = TimeSequenceFeatureTransformer()
            self.model = VanillaLSTM(check_optional_config=False)
            print("Initialize new time sequence pipeline.")
        else:
            self.feature_transformers = feature_transformers
            self.model = model
        # Always define the attribute (None for a freshly initialized
        # pipeline) so later access cannot raise AttributeError.
        self.config = config

    def evaluate(self, input_df, metric=["mean_squared_error"]):
        """
        evaluate the pipeline
        :param input_df: data to evaluate on; transformed with is_train=True
        :param metric: forwarded to the model's evaluate
        :return: the result of the model's evaluate
        """
        x, y = self.feature_transformers.transform(input_df, is_train=True)
        return self.model.evaluate(x, y, metric)

    def predict(self, input_df):
        """
        predict with the pipeline
        :param input_df: data to predict on; transformed with is_train=False
        :return: the model prediction after the feature transformer's
            post_processing
        """
        # there might be no y in the data, TODO needs to fix in TimeSquenceFeatures
        x = self.feature_transformers.transform(input_df, is_train=False)
        y_pred = self.model.predict(x)
        y_output = self.feature_transformers.post_processing(y_pred)
        return y_output

    def save(self, file):
        """
        save pipeline to file, contains feature transformer, model, trial config.
        :param file: directory to save into; created (including missing
            parent directories) when it does not exist
        :return:
        """
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(file, exist_ok=True)
        model_path = os.path.join(file, "weights_tune.h5")
        config_path = os.path.join(file, "all_config.json")
        self.feature_transformers.save(config_path, replace=True)
        self.model.save(model_path, config_path)
        # check if ** is needed
        save_config(config_path, self.config)

    def restore(self, file):
        """
        restore pipeline from file
        :param file: directory previously written by save
        :return:
        """
        model_path = os.path.join(file, "weights_tune.h5")
        config_path = os.path.join(file, "all_config.json")
        all_config = load_config(config_path)
        self.model.restore(model_path, **all_config)
        self.feature_transformers.restore(**all_config)