def test_valuate(self):
        """Fit a predictor on random data and print its evaluation scores."""
        # Each frame must hold more rows than past_seq_len (default: 50).
        train_rows = 100
        test_rows = 64
        train_df = pd.DataFrame({
            "datetime": pd.date_range('1/1/2019', periods=train_rows),
            "value": np.random.randn(train_rows),
        })
        test_df = pd.DataFrame({
            "datetime": pd.date_range('1/10/2019', periods=test_rows),
            "value": np.random.randn(test_rows),
        })

        predictor = TimeSequencePredictor(dt_col="datetime",
                                          target_col="value",
                                          extra_features_col=None)
        fitted = predictor.fit(train_df)
        scores = fitted.evaluate(test_df,
                                 metric=["mean_squared_error", "r_square"])
        print("evaluate:", scores)
    def test_predict(self):
        """Check the shape of the prediction from a default-configured fit."""
        # Each frame must hold more rows than past_seq_len (default: 50).
        train_rows = 100
        test_rows = 64
        train_df = pd.DataFrame({
            "datetime": pd.date_range('1/1/2019', periods=train_rows),
            "value": np.random.randn(train_rows),
        })
        test_df = pd.DataFrame({
            "datetime": pd.date_range('1/10/2019', periods=test_rows),
            "value": np.random.randn(test_rows),
        })

        predictor = TimeSequencePredictor(dt_col="datetime",
                                          target_col="value",
                                          extra_features_col=None)
        fitted = predictor.fit(train_df)
        prediction = fitted.predict(test_df)

        default_past_seq_len = 50
        expected_rows = test_rows - default_past_seq_len + 1
        assert prediction.shape == (expected_rows, 2)
 def test_fit_BayesRecipe(self):
     """Fit with a BayesRecipe and check the pipeline and its config."""
     from zoo.automl.config.recipe import BayesRecipe
     train_df, _, future_seq_len = self.create_dataset()
     predictor = TimeSequencePredictor(dt_col="datetime",
                                       target_col="value",
                                       future_seq_len=future_seq_len,
                                       extra_features_col=None)
     bayes_recipe = BayesRecipe(num_samples=1,
                                training_iteration=2,
                                epochs=1,
                                look_back=(3, 5))
     pipeline = predictor.fit(train_df, recipe=bayes_recipe)

     # The fitted pipeline must expose the expected component types.
     assert isinstance(pipeline, TimeSequencePipeline)
     assert isinstance(pipeline.feature_transformers,
                       TimeSequenceFeatureTransformer)
     assert isinstance(pipeline.model, BaseModel)

     assert pipeline.config is not None
     assert "epochs" in pipeline.config
     # No config name may start with 'bayes_feature' or end with 'float'.
     assert not any(key.startswith('bayes_feature')
                    for key in pipeline.config)
     assert not any(key.endswith('float') for key in pipeline.config)
     # past_seq_len must fall inside the look_back range given above.
     assert 'past_seq_len' in pipeline.config
     assert 3 <= pipeline.config["past_seq_len"] <= 5
# Example #4
# 0
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    """
    Train a time sequence predictor on a CSV file and write a forecast CSV.

    :param train_path: path of the training CSV file
    :param pred_path: path the forecast CSV is written to
    :param n_pred: number of steps to forecast (passed as future_seq_len)
    :param dt: name of the datetime column
    :param target: name of the target column
    :param time_limit_min: multiplied by 60 and exported through the
        TRIALRUNNER_WALLTIME_LIMIT environment variable
    """
    os.environ["TRIALRUNNER_WALLTIME_LIMIT"] = str(time_limit_min * 60)

    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    # Every column other than the datetime and target columns is an extra
    # feature. Iterating the columns (instead of a set difference) keeps the
    # feature order identical to the CSV on every run.
    extra_features_col = [
        col for col in df_train.columns if col not in (dt, target)
    ] or None

    sc = init_spark_on_local(cores=mp.cpu_count(), spark_log_level="ERROR")
    ray_ctx = RayContext(sc=sc)
    ray_ctx.init()
    try:
        tsp = TimeSequencePredictor(dt_col=dt,
                                    target_col=target,
                                    extra_features_col=extra_features_col,
                                    future_seq_len=n_pred)
        pipeline = tsp.fit(df_train,
                           resources_per_trial={"cpu": 4},
                           recipe=BayesRecipe(num_samples=100000))
        df_pred = pipeline.predict(df_train[-2:])
    finally:
        # Shut Ray down even when the search or the prediction fails, so the
        # started Ray context is not left behind.
        ray_ctx.stop()

    # The first predicted row supplies the start date (its dt column) and the
    # forecast values (every column after the first).
    # NOTE(review): presumably that row holds exactly n_pred values - confirm.
    first_row = df_pred.iloc[0]
    x_pred = pd.date_range(first_row[dt],
                           periods=n_pred,
                           freq=pd.infer_freq(df_train[dt]))
    y_pred = first_row[1:]
    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
# Example #5
# 0
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 extra_features_col=None):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast; a single column
            name given as a string is wrapped into a one-element list
        :param extra_features_col: extra feature columns
        """
        if isinstance(target_col, str):
            target_col_list = [target_col]
        else:
            target_col_list = target_col
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
        )

    def fit(self,
            train_df,
            validation_df=None,
            metric="mse",
            recipe: Recipe = None,
            uncertainty: bool = False,
            distributed: bool = False,
            hdfs_url=None):
        """
        Fit a time series forecasting pipeline w/ automl

        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param metric: the evaluation metric to optimize
        :param recipe: the configuration of searching; when omitted, a new
            SmokeRecipe is created for this call
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param distributed: whether to enable distributed training
        :param hdfs_url: the hdfs_url to use for storing trial and
            intermediate results
        :return: a TSPipeline
        """
        # Build the default recipe per call: a SmokeRecipe() in the signature
        # would be created once at definition time and shared by every call.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         distributed=distributed,
                                         hdfs_url=hdfs_url)
        # Wrap the fitted pipeline in the TSPipeline facade.
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl
 def test_fit_SmokeRecipe(self):
     """Fit with the default recipe and check the pipeline components."""
     train_df, validation_df, future_seq_len = self.create_dataset()
     predictor = TimeSequencePredictor(dt_col="datetime",
                                       target_col="value",
                                       future_seq_len=future_seq_len,
                                       extra_features_col=None)
     fitted = predictor.fit(train_df, validation_df)

     # The fitted pipeline must expose the expected component types.
     assert isinstance(fitted, TimeSequencePipeline)
     assert isinstance(fitted.feature_transformers,
                       TimeSequenceFeatureTransformer)
     assert isinstance(fitted.model, BaseModel)
     assert fitted.config is not None
 def test_fit_LSTMGridRandomRecipe(self):
     """Fit with an LSTMGridRandomRecipe and check pipeline and look-back."""
     from zoo.automl.config.recipe import LSTMGridRandomRecipe
     train_df, _, future_seq_len = self.create_dataset()
     predictor = TimeSequencePredictor(dt_col="datetime",
                                       target_col="value",
                                       future_seq_len=future_seq_len,
                                       extra_features_col=None)
     grid_recipe = LSTMGridRandomRecipe(lstm_2_units=[4],
                                        batch_size=[1024],
                                        num_rand_samples=5,
                                        look_back=2,
                                        training_iteration=1,
                                        epochs=1)
     fitted = predictor.fit(train_df, recipe=grid_recipe)

     # The fitted pipeline must expose the expected component types.
     assert isinstance(fitted, TimeSequencePipeline)
     assert isinstance(fitted.feature_transformers,
                       TimeSequenceFeatureTransformer)
     assert isinstance(fitted.model, BaseModel)
     assert fitted.config is not None
     # The fixed look_back of 2 must appear as past_seq_len in the config.
     assert 'past_seq_len' in fitted.config
     assert fitted.config["past_seq_len"] == 2
    def test_save_restore(self):
        """Check that a saved and restored pipeline predicts the same values."""
        train_rows = 100
        test_rows = 64
        train_df = pd.DataFrame({
            "datetime": pd.date_range('1/1/2019', periods=train_rows),
            "value": np.random.randn(train_rows),
        })
        test_df = pd.DataFrame({
            "datetime": pd.date_range('1/10/2019', periods=test_rows),
            "value": np.random.randn(test_rows),
        })

        predictor = TimeSequencePredictor(dt_col="datetime",
                                          target_col="value",
                                          extra_features_col=None)
        fitted = predictor.fit(train_df)
        expected = fitted.predict(test_df)

        # The temporary directory is removed on exit, even if a check fails.
        with tempfile.TemporaryDirectory(prefix="saved_pipeline") as save_dir:
            fitted.save(save_dir)

            restored = TimeSequencePipeline()
            restored.restore(save_dir)

            actual = restored.predict(test_df)
            np.testing.assert_allclose(expected["value"].values,
                                       actual["value"].values)
# Example #9
# 0
class AutoTSTrainer:
    """
    The Automated Time Series Forecast Trainer
    """
    def __init__(self,
                 horizon=1,
                 dt_col="datetime",
                 target_col="value",
                 logs_dir="~/zoo_automl_logs",
                 extra_features_col=None,
                 search_alg=None,
                 search_alg_params=None,
                 scheduler=None,
                 scheduler_params=None,
                 name="automl"):
        """
        Initialize the AutoTS Trainer.

        :param horizon: steps to look forward
        :param dt_col: the datetime column
        :param target_col: the target column to forecast; a single column
            name given as a string is wrapped into a one-element list
        :param logs_dir: forwarded unchanged to TimeSequencePredictor
        :param extra_features_col: extra feature columns
        :param search_alg: forwarded unchanged to TimeSequencePredictor
        :param search_alg_params: forwarded unchanged to TimeSequencePredictor
        :param scheduler: forwarded unchanged to TimeSequencePredictor
        :param scheduler_params: forwarded unchanged to TimeSequencePredictor
        :param name: forwarded unchanged to TimeSequencePredictor
        """
        if isinstance(target_col, str):
            target_col_list = [target_col]
        else:
            target_col_list = target_col
        self.internal = TimeSequencePredictor(
            dt_col=dt_col,
            target_col=target_col_list,
            logs_dir=logs_dir,
            future_seq_len=horizon,
            extra_features_col=extra_features_col,
            search_alg=search_alg,
            search_alg_params=search_alg_params,
            scheduler=scheduler,
            scheduler_params=scheduler_params,
            name=name)

    def fit(
        self,
        train_df,
        validation_df=None,
        metric="mse",
        recipe: Recipe = None,
        uncertainty: bool = False,
        upload_dir=None,
    ):
        """
        Fit a time series forecasting pipeline w/ automl

        :param train_df: the input dataframe (as pandas.dataframe)
        :param validation_df: the validation dataframe (as pandas.dataframe)
        :param metric: the evaluation metric to optimize
        :param recipe: the configuration of searching; when omitted, a new
            SmokeRecipe is created for this call
        :param uncertainty: whether to enable uncertainty calculation
                            (will output an uncertainty sigma)
        :param upload_dir: Optional URI to sync training results and
            checkpoints. We only support hdfs URI for now.
        :return: a TSPipeline
        """
        # Build the default recipe per call: a SmokeRecipe() in the signature
        # would be created once at definition time and shared by every call.
        if recipe is None:
            recipe = SmokeRecipe()
        zoo_pipeline = self.internal.fit(train_df,
                                         validation_df,
                                         metric,
                                         recipe,
                                         mc=uncertainty,
                                         upload_dir=upload_dir)
        # Wrap the fitted pipeline in the TSPipeline facade.
        ppl = TSPipeline()
        ppl.internal = zoo_pipeline
        return ppl