def test_ExpectedNumInstanceSampler():
    """Check the scale histogram of windows cut by ExpectedNumInstanceSampler.

    The dataset is passed through an ``InstanceSplitter`` several times, the
    strictly positive ``past_target`` values of every emitted instance are
    accumulated in a ``ScaleHistogram``, and the resulting bin counts are
    compared against ``{i: 2**i * repetition}`` for ``i`` in ``1..N-1``.
    """
    N = 6
    train_length = 2
    pred_length = 1
    repetition = 2

    ds = make_dataset(N, train_length)

    sampler = transform.ExpectedNumInstanceSampler(
        num_instances=4,
        min_future=pred_length,
    )
    splitter = transform.InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=sampler,
        past_length=train_length,
        future_length=pred_length,
    )
    t = transform.Chain(trans=[splitter])

    assert_serializable(t)

    scale_hist = ScaleHistogram()
    for _ in range(repetition):
        for entry in t(iter(ds), is_train=True):
            past = entry["past_target"]
            # Keep only strictly positive values so that zeros (which may be
            # padding) do not end up in the histogram.
            scale_hist.add(past[past > 0])

    expected_values = {}
    for exponent in range(1, N):
        expected_values[exponent] = 2**exponent * repetition

    assert expected_values == scale_hist.bin_counts
def test_dataset_statistics(self) -> None:
    """Compare ``calculate_dataset_statistics`` with hand-built statistics.

    Two random integer series plus one empty series are turned into time
    series entries; the statistics computed from them must equal a
    ``DatasetStatistics`` object assembled directly from the raw targets.
    """
    n = 2  # number of non-empty series
    T = 10  # length of each non-empty series
    dynamic_feat_count = 2

    # use integers to avoid float conversion that can fail comparison
    np.random.seed(0)
    targets = np.random.randint(0, 10, (n, T))

    # Reference histogram: one entry per non-empty series, then the empty one.
    scale_histogram = ScaleHistogram()
    for row in range(n):
        scale_histogram.add(targets[row, :])
    scale_histogram.add([])

    expected = DatasetStatistics(
        integer_dataset=True,
        num_time_series=n + 1,
        num_time_observations=targets.size,
        mean_target_length=T * n / (n + 1),
        min_target=targets.min(),
        mean_target=targets.mean(),
        mean_abs_target=targets.mean(),
        max_target=targets.max(),
        cats=[{0}, {1, 2}],
        num_dynamic_feat=dynamic_feat_count,
        num_missing_values=0,
        scale_histogram=scale_histogram,
    )

    # (target, categorical features) for each series, in dataset order.
    series_specs = [
        (targets[0, :], [0, 1]),
        (targets[1, :], [0, 2]),
        (np.array([]), [0, 2]),
    ]

    # FIXME: the cast below is a hack to make mypy happy
    timeseries = cast(
        Dataset,
        [
            make_time_series(
                target=series_target,
                cat=series_cat,
                num_dynamic_feat=dynamic_feat_count,
            )
            for series_target, series_cat in series_specs
        ],
    )

    found = calculate_dataset_statistics(timeseries)

    assert expected == found
def test_BucketInstanceSampler():
    """Check that BucketInstanceSampler yields roughly equal counts per bin.

    The dataset's scale histogram is handed to a ``BucketInstanceSampler``
    used inside an ``InstanceSplitter``. After many passes over the dataset,
    the strictly positive ``past_target`` values of all emitted instances are
    collected in a fresh ``ScaleHistogram``; every bin ``1..N-1`` must hold
    ``repetition`` entries within a 30% relative tolerance.
    """
    N = 6
    train_length = 2
    pred_length = 1
    ds = make_dataset(N, train_length)

    dataset_stats = calculate_dataset_statistics(ds)

    t = transform.Chain(
        trans=[
            transform.InstanceSplitter(
                target_field=transform.FieldName.TARGET,
                is_pad_field=transform.FieldName.IS_PAD,
                start_field=transform.FieldName.START,
                forecast_start_field=transform.FieldName.FORECAST_START,
                train_sampler=transform.BucketInstanceSampler(
                    dataset_stats.scale_histogram
                ),
                past_length=train_length,
                future_length=pred_length,
                pick_incomplete=True,
            )
        ]
    )

    assert_serializable(t)

    scale_hist = ScaleHistogram()

    repetition = 200
    for _ in range(repetition):
        for data in t(iter(ds), is_train=True):
            target_values = data["past_target"]
            # for simplicity, discard values that are zeros to avoid
            # confusion with padding
            target_values = target_values[target_values > 0]
            scale_hist.add(target_values)

    expected_values = {i: repetition for i in range(1, N)}
    found_values = scale_hist.bin_counts

    for i in range(1, N):
        # abs() must wrap only the difference. Wrapping the whole comparison
        # takes abs() of a bool, which makes the check one-sided and lets
        # arbitrarily large counts pass.
        deviation = abs(expected_values[i] - found_values[i])
        assert deviation < expected_values[i] * 0.3, (
            f"bin {i}: found {found_values[i]}, "
            f"expected about {expected_values[i]}"
        )
def test_dataset_statistics(self) -> None:
    """Compare ``calculate_dataset_statistics`` with hand-built statistics.

    Two random integer series plus one empty series, each carrying static
    and dynamic features, are turned into time series entries; the statistics
    computed from them must equal a ``DatasetStatistics`` object assembled
    directly from the raw targets.
    """
    num_time_series = 3
    num_time_observations = 10
    num_missing_values = 0

    # The empty series contributes no observations.
    populated = num_time_series - 1

    # Dynamic-feature counts shared by the expected statistics and by every
    # generated time series.
    dynamic_counts = dict(
        num_feat_dynamic_cat=2,
        num_feat_dynamic_real=2,
        num_past_feat_dynamic_real=3,
    )

    # use integers to avoid float conversion that can fail comparison
    np.random.seed(0)
    targets = np.random.randint(0, 10, (populated, num_time_observations))

    # Reference histogram: one entry per non-empty series, then the empty one.
    scale_histogram = ScaleHistogram()
    for row in range(populated):
        scale_histogram.add(targets[row, :])
    scale_histogram.add([])

    expected = DatasetStatistics(
        integer_dataset=True,
        num_time_series=num_time_series,  # includes empty array
        num_time_observations=targets.size,
        mean_target_length=num_time_observations * populated / num_time_series,
        max_target_length=num_time_observations,
        min_target=targets.min(),
        mean_target=targets.mean(),
        mean_abs_target=targets.mean(),
        max_target=targets.max(),
        feat_static_real=[{0.1}, {0.2, 0.3}],
        feat_static_cat=[{1}, {2, 3}],
        num_missing_values=num_missing_values,
        scale_histogram=scale_histogram,
        **dynamic_counts,
    )

    # (target, static categorical, static real) for each series, in order.
    series_specs = [
        (targets[0, :], [1, 2], [0.1, 0.2]),
        (targets[1, :], [1, 3], [0.1, 0.3]),
        (np.array([]), [1, 3], [0.1, 0.3]),
    ]

    # FIXME: the cast below is a hack to make mypy happy
    timeseries = cast(
        Dataset,
        [
            make_time_series(
                target=series_target,
                feat_static_cat=static_cat,
                feat_static_real=static_real,
                **dynamic_counts,
            )
            for series_target, static_cat, static_real in series_specs
        ],
    )

    found = calculate_dataset_statistics(timeseries)

    assert expected == found