def test_valuate(self):
    """Fit on a random series and evaluate the pipeline with two metrics."""
    # past_seq_len defaults to 50, so the training frame must be longer.
    train_len = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_len),
        "value": np.random.randn(train_len),
    })
    test_len = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_len),
        "value": np.random.randn(test_len),
    })
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      extra_features_col=None)
    pipeline = predictor.fit(train_df)
    metrics = ["mean_squared_error", "r_square"]
    print("evaluate:", pipeline.evaluate(test_df, metric=metrics))
def test_predict(self):
    """Predict on a held-out frame and check the output shape."""
    # past_seq_len defaults to 50, so the training frame must be longer.
    train_len = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_len),
        "value": np.random.randn(train_len),
    })
    test_sample_num = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_sample_num),
        "value": np.random.randn(test_sample_num),
    })
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      extra_features_col=None)
    pipeline = predictor.fit(train_df)
    y_pred = pipeline.predict(test_df)
    default_past_seq_len = 50
    # One output row per rolling window over the test frame; 2 columns
    # (datetime + predicted value).
    assert y_pred.shape == (test_sample_num - default_past_seq_len + 1, 2)
def test_fit_BayesRecipe(self):
    """Fit with BayesRecipe and verify bayes-internal keys don't leak out."""
    from zoo.automl.config.recipe import BayesRecipe
    train_df, _, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      future_seq_len=future_seq_len,
                                      extra_features_col=None)
    recipe = BayesRecipe(num_samples=1,
                         training_iteration=2,
                         epochs=1,
                         look_back=(3, 5))
    pipeline = predictor.fit(train_df, recipe=recipe)
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
    assert "epochs" in pipeline.config
    # Intermediate bayes-search config entries must be stripped from the
    # final pipeline config.
    assert not [k for k in pipeline.config if k.startswith('bayes_feature')]
    assert not [k for k in pipeline.config if k.endswith('float')]
    # past_seq_len must land inside the searched look_back bounds.
    assert 'past_seq_len' in pipeline.config
    assert 3 <= pipeline.config["past_seq_len"] <= 5
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    """Train an AutoML time-series pipeline and write n_pred forecasts to CSV.

    :param train_path: CSV file holding the training history
    :param pred_path: destination CSV for the forecast
    :param n_pred: number of future steps to predict
    :param dt: name of the datetime column
    :param target: name of the target column
    :param time_limit_min: wall-time budget for the search, in minutes
    """
    # The trial runner reads its wall-time budget (in seconds) from the env.
    os.environ["TRIALRUNNER_WALLTIME_LIMIT"] = str(time_limit_min * 60)

    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    sc = init_spark_on_local(cores=mp.cpu_count(), spark_log_level="ERROR")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()

    # Any column that is neither the datetime nor the target is treated as
    # an extra feature; an empty list becomes None (no extra features).
    extra_features_col = list(set(df_train.columns) - set([dt, target])) or None

    tsp = TimeSequencePredictor(dt_col=dt,
                                target_col=target,
                                extra_features_col=extra_features_col,
                                future_seq_len=n_pred)
    pipeline = tsp.fit(df_train,
                       resources_per_trial={"cpu": 4},
                       recipe=BayesRecipe(num_samples=100000))

    # Predict from the tail of the history; row 0 holds the forecast values,
    # with the datetime in the first column.
    df_pred = pipeline.predict(df_train[-2:])
    x_pred = pd.date_range(df_pred.iloc[0][dt],
                           periods=n_pred,
                           freq=pd.infer_freq(df_train[dt]))
    y_pred = df_pred.iloc[0][1:]
    pd.DataFrame({dt: x_pred, target: y_pred}).to_csv(pred_path, index=False)
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """

    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        """
        # Normalize a single target-column name into a list, since the
        # underlying predictor expects a list of targets.
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )

    def fit(self,
            train_df,
            validation_df=None,
            metric="mse",
            recipe: Recipe = None,
            uncertainty: bool = False,
            distributed: bool = False,
            hdfs_url=None):
        """
        Fit a time series forecasting pipeline w/ automl
        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param recipe: the configuration of searching; defaults to SmokeRecipe()
        :param metric: the evaluation metric to optimize
        :param uncertainty: whether to enable uncertainty calculation
               (will output an uncertainty sigma)
        :param hdfs_url: the hdfs_url to use for storing trail and intermediate results
        :param distributed: whether to enable distributed training
        :return a TSPipeline
        """
        # Construct the default recipe per call. A `recipe=SmokeRecipe()`
        # default in the signature would be instantiated exactly once at
        # import time and shared by every fit() call (the mutable default
        # argument pitfall) — any state the recipe accumulates would bleed
        # across fits.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         distributed=distributed,
                                         hdfs_url=hdfs_url)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl
def test_fit_SmokeRecipe(self):
    """Fit with the default (smoke) recipe and sanity-check the pipeline."""
    train_df, validation_df, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      future_seq_len=future_seq_len,
                                      extra_features_col=None)
    pipeline = predictor.fit(train_df, validation_df)
    # fit() must yield a fully assembled pipeline with a resolved config.
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
def test_fit_LSTMGridRandomRecipe(self):
    """Fit with LSTMGridRandomRecipe; a fixed look_back pins past_seq_len."""
    from zoo.automl.config.recipe import LSTMGridRandomRecipe
    train_df, _, future_seq_len = self.create_dataset()
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      future_seq_len=future_seq_len,
                                      extra_features_col=None)
    recipe = LSTMGridRandomRecipe(lstm_2_units=[4],
                                  batch_size=[1024],
                                  num_rand_samples=5,
                                  look_back=2,
                                  training_iteration=1,
                                  epochs=1)
    pipeline = predictor.fit(train_df, recipe=recipe)
    assert isinstance(pipeline, TimeSequencePipeline)
    assert isinstance(pipeline.feature_transformers,
                      TimeSequenceFeatureTransformer)
    assert isinstance(pipeline.model, BaseModel)
    assert pipeline.config is not None
    # A scalar look_back leaves no room for search: the chosen
    # past_seq_len must equal it exactly.
    assert 'past_seq_len' in pipeline.config
    assert pipeline.config["past_seq_len"] == 2
def test_save_restore(self):
    """Round-trip a fitted pipeline through save/restore; outputs must match."""
    train_len = 100
    train_df = pd.DataFrame({
        "datetime": pd.date_range('1/1/2019', periods=train_len),
        "value": np.random.randn(train_len),
    })
    test_len = 64
    test_df = pd.DataFrame({
        "datetime": pd.date_range('1/10/2019', periods=test_len),
        "value": np.random.randn(test_len),
    })
    predictor = TimeSequencePredictor(dt_col="datetime",
                                      target_col="value",
                                      extra_features_col=None)
    pipeline = predictor.fit(train_df)
    pred = pipeline.predict(test_df)
    dirname = tempfile.mkdtemp(prefix="saved_pipeline")
    try:
        pipeline.save(dirname)
        restored = TimeSequencePipeline()
        restored.restore(dirname)
        new_pred = restored.predict(test_df)
        # The restored pipeline must reproduce the original predictions.
        np.testing.assert_allclose(pred["value"].values,
                                   new_pred["value"].values)
    finally:
        # Always clean up the temp dir, even if an assertion fails.
        shutil.rmtree(dirname)
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """

    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 logs_dir="~/zoo_automl_logs",
                 extra_features_col=None,
                 search_alg=None,
                 search_alg_params=None,
                 scheduler=None,
                 scheduler_params=None,
                 name="automl"):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast
        :param extra_features_col: extra feature columns
        """
        # Normalize a single target-column name into a list, since the
        # underlying predictor expects a list of targets.
        target_col_list = target_col
        if isinstance(target_col, str):
            target_col_list = [target_col]
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            logs_dir=logs_dir,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
            search_alg=search_alg,
            search_alg_params=search_alg_params,
            scheduler=scheduler,
            scheduler_params=scheduler_params,
            name=name)

    def fit(
            self,
            train_df,
            validation_df=None,
            metric="mse",
            recipe: Recipe = None,
            uncertainty: bool = False,
            upload_dir=None,
    ):
        """
        Fit a time series forecasting pipeline w/ automl
        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param recipe: the configuration of searching; defaults to SmokeRecipe()
        :param metric: the evaluation metric to optimize
        :param uncertainty: whether to enable uncertainty calculation
               (will output an uncertainty sigma)
        :param upload_dir: Optional URI to sync training results and checkpoints.
               We only support hdfs URI for now.
        :return a TSPipeline
        """
        # Construct the default recipe per call. A `recipe=SmokeRecipe()`
        # default in the signature would be instantiated exactly once at
        # import time and shared by every fit() call (the mutable default
        # argument pitfall) — any state the recipe accumulates would bleed
        # across fits.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         upload_dir=upload_dir)
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl