def test_chain():
    """Check that ``equals`` distinguishes identical and differing chains.

    A chain must compare equal to its clone, unequal to a clone whose
    ``trans`` list was replaced by an empty one, and unequal to a chain
    that differs only in the ``log_scale`` flag of its age feature.
    """

    def build_chain(log_scale):
        # Both chains under test share everything except ``log_scale``.
        time_features = transform.AddTimeFeatures(
            start_field=FieldName.START,
            target_field=FieldName.TARGET,
            output_field="time_feat",
            time_features=[
                time_feature.DayOfWeek(),
                time_feature.DayOfMonth(),
                time_feature.MonthOfYear(),
            ],
            pred_length=10,
        )
        age_feature = transform.AddAgeFeature(
            target_field=FieldName.TARGET,
            output_field="age",
            pred_length=10,
            log_scale=log_scale,
        )
        observed_indicator = transform.AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field="observed_values",
        )
        return transform.Chain(
            trans=[time_features, age_feature, observed_indicator]
        )

    chain = build_chain(True)

    assert equals(chain, clone(chain))
    assert not equals(chain, clone(chain, {"trans": []}))

    another_chain = build_chain(False)
    assert not equals(chain, another_chain)
def create_transformation(self) -> transform.Transformation:
    """Assemble the preprocessing chain, ending with an instance splitter.

    Reads ``self.freq``, ``self.prediction_length`` and
    ``self.context_length``.

    Returns
    -------
    transform.Transformation
        A ``transform.Chain`` holding the steps below, in order.
    """
    target_as_array = transform.AsNumpyArray(
        field=FieldName.TARGET, expected_ndim=1
    )
    time_features = transform.AddTimeFeatures(
        start_field=transform.FieldName.START,
        target_field=transform.FieldName.TARGET,
        output_field=transform.FieldName.FEAT_TIME,
        time_features=time_features_from_frequency_str(self.freq),
        pred_length=self.prediction_length,
    )
    stacked_dynamic = transform.VstackFeatures(
        output_field=FieldName.FEAT_DYNAMIC_REAL,
        input_fields=[FieldName.FEAT_TIME],
    )
    # Supply a default static categorical feature when the entry has none.
    default_static_cat = transform.SetFieldIfNotPresent(
        field=FieldName.FEAT_STATIC_CAT, value=[0.0]
    )
    static_cat_as_array = transform.AsNumpyArray(
        field=FieldName.FEAT_STATIC_CAT, expected_ndim=1
    )
    splitter = transform.InstanceSplitter(
        target_field=transform.FieldName.TARGET,
        is_pad_field=transform.FieldName.IS_PAD,
        start_field=transform.FieldName.START,
        forecast_start_field=transform.FieldName.FORECAST_START,
        train_sampler=ExpectedNumInstanceSampler(num_instances=1),
        past_length=self.context_length,
        future_length=self.prediction_length,
        time_series_fields=[FieldName.FEAT_DYNAMIC_REAL],
    )

    steps = [
        target_as_array,
        time_features,
        stacked_dynamic,
        default_static_cat,
        static_cat_as_array,
        splitter,
    ]
    return transform.Chain(trans=steps)
def test_ExpectedNumInstanceSampler():
    """Compare the scale histogram of sampled windows with fixed counts.

    The dataset is run through an ``InstanceSplitter`` driven by an
    ``ExpectedNumInstanceSampler`` ``repetition`` times; bin ``i`` of the
    resulting histogram must hold exactly ``2**i * repetition`` entries
    for ``i`` in ``1..N-1``.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    sampler = transform.ExpectedNumInstanceSampler(
        num_instances=4, min_future=pred_length
    )
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=sampler,
        past_length=train_length,
        future_length=pred_length,
    )
    t = transform.Chain(trans=[splitter])

    assert_serializable(t)

    scale_hist = ScaleHistogram()
    repetition = 2

    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            past = data["past_target"]
            # For simplicity, discard zeros so that they are not confused
            # with padding.
            scale_hist.add(past[past > 0])

    expected_values = {i: 2**i * repetition for i in range(1, N)}

    assert expected_values == scale_hist.bin_counts
def create_transformation(self) -> transform.Transformation:
    """Assemble the preprocessing chain (no instance splitting).

    Reads ``self.freq`` and ``self.prediction_length``.

    Returns
    -------
    transform.Transformation
        A ``transform.Chain`` holding the steps below, in order.
    """
    target_as_array = transform.AsNumpyArray(
        field=FieldName.TARGET, expected_ndim=1
    )
    time_features = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field=FieldName.FEAT_TIME,
        time_features=time_features_from_frequency_str(self.freq),
        pred_length=self.prediction_length,
    )
    stacked_dynamic = transform.VstackFeatures(
        output_field=FieldName.FEAT_DYNAMIC_REAL,
        input_fields=[FieldName.FEAT_TIME],
    )
    # Supply a default static categorical feature when the entry has none.
    default_static_cat = transform.SetFieldIfNotPresent(
        field=FieldName.FEAT_STATIC_CAT, value=[0.0]
    )
    static_cat_as_array = transform.AsNumpyArray(
        field=FieldName.FEAT_STATIC_CAT, expected_ndim=1
    )

    steps = [
        target_as_array,
        time_features,
        stacked_dynamic,
        default_static_cat,
        static_cat_as_array,
    ]
    return transform.Chain(trans=steps)
def test_Transformation():
    """Run a full feature + splitting chain over a single daily series.

    The chain must be serializable, and iterating it in training mode
    must work; every produced entry is printed.
    """
    train_length = 100
    ds = gluonts.dataset.common.ListDataset(
        [{"start": "2012-01-01", "target": [0.2] * train_length}], freq="1D"
    )
    pred_length = 10

    fields = transform.FieldName

    add_time = transform.AddTimeFeatures(
        start_field=fields.START,
        target_field=fields.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=fields.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=fields.TARGET,
        output_field="observed_values",
    )
    stack_dynamic = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split = transform.InstanceSplitter(
        target_field=fields.TARGET,
        is_pad_field=fields.IS_PAD,
        start_field=fields.START,
        forecast_start_field=fields.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
    )

    t = transform.Chain(
        trans=[add_time, add_age, add_observed, stack_dynamic, split]
    )

    assert_serializable(t)

    for u in t(iter(ds), is_train=True):
        print(u)
def test_forking_sequence_with_features(is_train) -> None:
    """Check output shapes of ``ForkingSequenceSplitter`` with features.

    Parameters
    ----------
    is_train
        Forwarded to the transformation; ``future_target`` is only checked
        when it is truthy.
    """

    def build_dataset(N, train_length):
        # Builds 2 ** N - 1 series; series ``i`` is row ``i`` of
        # ``np.arange(n * train_length)`` reshaped to (n, train_length).
        n = 2 ** N - 1
        targets = np.arange(n * train_length).reshape((n, train_length))
        entries = [
            {"start": "2012-01-01", "target": targets[i, :]}
            for i in range(n)
        ]
        return ListDataset(entries, freq="D")

    ds = build_dataset(1, 20)

    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field=FieldName.FEAT_AGE,
        pred_length=10,
    )
    add_time = transform.AddTimeFeatures(
        start_field=FieldName.START,
        target_field=FieldName.TARGET,
        output_field=FieldName.FEAT_TIME,
        time_features=time_features_from_frequency_str("D"),
        pred_length=10,
    )
    splitter = ForkingSequenceSplitter(
        train_sampler=TSplitSampler(),
        enc_len=5,
        dec_len=3,
        encoder_series_fields=[
            FieldName.FEAT_AGE,
            FieldName.FEAT_TIME,
        ],
        decoder_series_fields=[FieldName.FEAT_TIME],
    )
    trans = transform.Chain(trans=[add_age, add_time, splitter])

    out = trans(iter(ds), is_train=is_train)
    transformed_data = next(iter(out))

    expected_shapes = [
        ("past_target", (5, 1)),
        ("past_feat_dynamic_age", (5, 1)),
        ("past_time_feat", (5, 3)),
        ("future_time_feat", (5, 3, 3)),
    ]
    if is_train:
        # The future target is only checked in training mode.
        expected_shapes.append(("future_target", (5, 3)))

    for key, shape in expected_shapes:
        assert transformed_data[key].shape == shape
def test_forking_sequence_splitter() -> None:
    """Check target and past-age outputs of ``ForkingSequenceSplitter``.

    The first training instance must reproduce the hard-coded forked
    future targets, and its ``past_age`` must match the corresponding
    slice of ``log10(2 + arange(len_ts))``.
    """
    len_ts = 20
    ds = make_dataset(1, len_ts)
    enc_len = 5
    dec_len = 3

    add_age = transform.AddAgeFeature(
        target_field=FieldName.TARGET,
        output_field="age",
        pred_length=dec_len,
    )
    splitter = ForkingSequenceSplitter(
        train_sampler=TSplitSampler(),
        enc_len=enc_len,
        dec_len=dec_len,
        encoder_series_fields=["age"],
    )
    trans = transform.Chain([add_age, splitter])

    out = trans(ds, is_train=True)
    transformed_data = next(iter(out))

    future_target = np.array(
        [
            [13.0, 14.0, 15.0],
            [14.0, 15.0, 16.0],
            [15.0, 16.0, 17.0],
            [16.0, 17.0, 18.0],
            [17.0, 18.0, 19.0],
        ]
    )

    target_error = np.linalg.norm(
        future_target - transformed_data["future_target"]
    )
    assert (
        target_error < 1e-5
    ), "the forking sequence target should be computed correctly."

    age = np.log10(2.0 + np.arange(len_ts))
    # Window of ``enc_len`` age values ending ``dec_len`` steps before the end.
    expected_past_age = age[-(enc_len + dec_len) : -dec_len]
    age_error = np.linalg.norm(
        expected_past_age - transformed_data["past_age"].flatten()
    )
    assert (
        age_error < 1e-5
    ), "the forking sequence past feature should be computed correctly."
def test_BucketInstanceSampler():
    """Check that ``BucketInstanceSampler`` fills scale bins about evenly.

    The dataset is pushed through an ``InstanceSplitter`` driven by a
    ``BucketInstanceSampler`` ``repetition`` times. Each of the bins
    ``1..N-1`` of the resulting scale histogram must then hold roughly
    ``repetition`` entries, within a 30% relative tolerance on either side.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    dataset_stats = calculate_dataset_statistics(ds)

    t = transform.Chain(
        trans=[
            transform.InstanceSplitter(
                target_field=transform.FieldName.TARGET,
                is_pad_field=transform.FieldName.IS_PAD,
                start_field=transform.FieldName.START,
                forecast_start_field=transform.FieldName.FORECAST_START,
                train_sampler=transform.BucketInstanceSampler(
                    dataset_stats.scale_histogram
                ),
                past_length=train_length,
                future_length=pred_length,
                pick_incomplete=True,
            )
        ]
    )

    assert_serializable(t)

    scale_hist = ScaleHistogram()

    repetition = 200
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            target_values = data["past_target"]
            # For simplicity, discard zeros so that they are not confused
            # with padding.
            target_values = target_values[target_values > 0]
            scale_hist.add(target_values)

    expected_values = {i: repetition for i in range(1, N)}
    found_values = scale_hist.bin_counts

    for i in range(1, N):
        # abs() must wrap the difference only. Wrapping the whole comparison
        # takes abs() of a bool, which makes the check one-sided and lets
        # over-populated bins pass unnoticed.
        deviation = abs(expected_values[i] - found_values[i])
        assert deviation < expected_values[i] * 0.3
def test_target_dim_indicator():
    """Check that ``TargetDimIndicator`` emits ``[0, 1, 2, 3]`` for 4 rows."""
    target = np.array([0, 2, 3, 10]).tolist()
    # Four identical rows make up the multi-dimensional target.
    multi_dim_target = np.array([target, target, target, target])

    entry = {"start": "2012-01-01", "target": multi_dim_target}
    dataset = gluonts.dataset.common.ListDataset(
        data_iter=[entry],
        freq="1D",
        one_dim_target=False,
    )

    indicator = transform.TargetDimIndicator(
        target_field=FieldName.TARGET,
        field_name="target_dimensions",
    )
    t = transform.Chain(trans=[indicator])

    expected_dims = np.array([0, 1, 2, 3])
    for data_entry in t(dataset, is_train=True):
        assert (data_entry["target_dimensions"] == expected_dims).all()
def test_multi_dim_transformation(is_train):
    """Check shapes and NaN handling of a chain on a 2-dimensional target.

    Parameters
    ----------
    is_train
        Selects training or inference mode; the only expectation that
        differs between the two is the shape of ``future_target``.
    """
    train_length = 10

    first_dim = np.arange(1, 11, 1).tolist()
    first_dim[-1] = "NaN"

    second_dim = np.arange(11, 21, 1).tolist()
    second_dim[0] = "NaN"

    ds = gluonts.dataset.common.ListDataset(
        data_iter=[{"start": "2012-01-01", "target": [first_dim, second_dim]}],
        freq="1D",
        one_dim_target=False,
    )
    pred_length = 2

    # Looks weird - but this is necessary to assert the nan entries correctly.
    first_dim[-1] = np.nan
    second_dim[0] = np.nan

    fields = transform.FieldName

    add_time = transform.AddTimeFeatures(
        start_field=fields.START,
        target_field=fields.TARGET,
        output_field="time_feat",
        time_features=[
            time_feature.DayOfWeek(),
            time_feature.DayOfMonth(),
            time_feature.MonthOfYear(),
        ],
        pred_length=pred_length,
    )
    add_age = transform.AddAgeFeature(
        target_field=fields.TARGET,
        output_field="age",
        pred_length=pred_length,
        log_scale=True,
    )
    add_observed = transform.AddObservedValuesIndicator(
        target_field=fields.TARGET,
        output_field="observed_values",
        convert_nans=False,
    )
    stack_dynamic = transform.VstackFeatures(
        output_field="dynamic_feat",
        input_fields=["age", "time_feat"],
        drop_inputs=True,
    )
    split = transform.InstanceSplitter(
        target_field=fields.TARGET,
        is_pad_field=fields.IS_PAD,
        start_field=fields.START,
        forecast_start_field=fields.FORECAST_START,
        train_sampler=transform.ExpectedNumInstanceSampler(num_instances=4),
        past_length=train_length,
        future_length=pred_length,
        time_series_fields=["dynamic_feat", "observed_values"],
        output_NTC=False,
    )

    t = transform.Chain(
        trans=[add_time, add_age, add_observed, stack_dynamic, split]
    )

    assert_serializable(t)

    training = bool(is_train)
    # Training and inference expectations differ only here.
    future_shape = (2, 2) if training else (2, 0)

    expected_observed = np.array([[1.0] * 9 + [0.0], [0.0] + [1.0] * 9])
    expected_target = np.array([first_dim, second_dim])

    for u in t(iter(ds), is_train=training):
        assert_shape(u["past_target"], (2, 10))
        assert_shape(u["past_dynamic_feat"], (4, 10))
        assert_shape(u["past_observed_values"], (2, 10))
        assert_shape(u["future_target"], future_shape)

        assert_padded_array(
            u["past_observed_values"],
            expected_observed,
            u["past_is_pad"],
        )
        assert_padded_array(
            u["past_target"],
            expected_target,
            u["past_is_pad"],
        )
def test_cdf_to_gaussian_transformation():
    """Round-trip a target through ``CDFtoGaussianTransform``.

    The ``past_target_cdf`` output is fed to
    ``cdf_to_gaussian_forward_transform`` as a fake model output, and the
    result must be close to the original target.
    """

    def make_test_data():
        values = np.array(
            [0, 0, 0, 0, 10, 10, 20, 20, 30, 30,
             40, 50, 59, 60, 60, 70, 80, 90, 100]
        ).tolist()

        np.random.shuffle(values)

        length = len(values)
        multi_dim_target = np.array([values, values]).transpose()
        past_is_pad = np.array([[0] * length]).transpose()
        past_observed_target = np.array(
            [[1] * length, [1] * length]
        ).transpose()

        # Mimic output from InstanceSplitter
        entry = {
            "start": "2012-01-01",
            "target": multi_dim_target,
            "past_target": multi_dim_target,
            "future_target": multi_dim_target,
            "past_is_pad": past_is_pad,
            f"past_{FieldName.OBSERVED_VALUES}": past_observed_target,
        }
        return gluonts.dataset.common.ListDataset(
            data_iter=[entry],
            freq="1D",
            one_dim_target=False,
        )

    def make_fake_output(u: DataEntry):
        # Prepend two leading axes to the CDF-transformed past target.
        with_one_axis = np.expand_dims(u["past_target_cdf"], axis=0)
        return np.expand_dims(with_one_axis, axis=0)

    ds = make_test_data()

    cdf_transform = transform.CDFtoGaussianTransform(
        target_field=FieldName.TARGET,
        observed_values_field=FieldName.OBSERVED_VALUES,
        max_context_length=20,
        target_dim=2,
    )
    t = transform.Chain(trans=[cdf_transform])

    for u in t(iter(ds), is_train=False):
        fake_output = make_fake_output(u)

        # Fake transformation chain output
        for key in ("past_target_sorted", "slopes", "intercepts"):
            u[key] = mx.nd.array(np.expand_dims(u[key], axis=0))

        back_transformed = transform.cdf_to_gaussian_forward_transform(
            u, fake_output
        )

        # Get any sample/batch (they are all the same)
        back_transformed = back_transformed[0][0]

        original_target = u["target"]

        # Original target and back-transformed target should be the same
        assert np.allclose(original_target, back_transformed)
def test_forking_sequence_splitter() -> None:
    """Check ``ForkingSequenceSplitter`` targets for two window settings.

    The first chain uses ``enc_len=5, dec_len=3`` and is compared with
    hard-coded forked targets; the second uses ``enc_len=20, dec_len=20``
    and only the sum of its future target is checked.
    """

    def build_dataset(N, train_length):
        # Builds 2 ** N - 1 series; series ``i`` is row ``i`` of
        # ``np.arange(n * train_length)`` reshaped to (n, train_length).
        n = 2 ** N - 1
        targets = np.arange(n * train_length).reshape((n, train_length))
        entries = [
            {"start": "2012-01-01", "target": targets[i, :]}
            for i in range(n)
        ]
        return ListDataset(entries, freq="D")

    def build_chain(enc_len, dec_len):
        # Age feature followed by the forking splitter under test.
        add_age = transform.AddAgeFeature(
            target_field=transform.FieldName.TARGET,
            output_field="age",
            pred_length=10,
        )
        splitter = ForkingSequenceSplitter(
            train_sampler=TestSplitSampler(),
            time_series_fields=["age"],
            enc_len=enc_len,
            dec_len=dec_len,
        )
        return transform.Chain(trans=[add_age, splitter])

    ds = build_dataset(1, 20)

    trans = build_chain(5, 3)
    out = trans(iter(ds), is_train=True)
    transformed_data = next(iter(out))

    future_target = np.array(
        [
            [13.0, 14.0, 15.0],
            [14.0, 15.0, 16.0],
            [15.0, 16.0, 17.0],
            [16.0, 17.0, 18.0],
            [17.0, 18.0, 19.0],
        ]
    )

    target_error = np.linalg.norm(
        future_target - transformed_data["future_target"]
    )
    assert (
        target_error < 1e-5
    ), "the forking sequence target should be computed correctly."

    trans_oob = build_chain(20, 20)
    transformed_data_oob = next(iter(trans_oob(iter(ds), is_train=True)))

    # NOTE(review): this comparison is one-sided (no abs()), so any sum that
    # is smaller than sum(arange(20)) also passes - confirm that is intended.
    sum_difference = np.sum(transformed_data_oob["future_target"]) - np.sum(
        np.arange(20)
    )
    assert (
        sum_difference < 1e-5
    ), "the forking sequence target should be computed correctly."