def test_training_loader_batch_size_hard_constraint() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    train_dataset_loader_1 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
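        # stack_fn (batchify here) collates the transformed entries of a batch into arrays on the given MXNet context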
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    train_dataset_loader_2 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
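        # pseudo-shuffle: entries pass through a buffer of 3 * BATCH_SIZE samples before being batched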
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    batches_1 = list(islice(train_dataset_loader_1, 30))
    batches_2 = list(islice(train_dataset_loader_2, 30))

    assert all(
        [len(batch["item_id"]) == BATCH_SIZE for batch in batches_1]
    ), "Not every batch from training loader is right size."

    assert all(
        [len(batch["item_id"]) == BATCH_SIZE for batch in batches_2]
    ), "Not every batch from training loader is right size, with shuffling on."
Example #2
    def train_model(self, training_data: Dataset) -> TrainOutput:
        transformation = self.create_transformation()

        transformation.estimate(iter(training_data))

        training_data_loader = TrainDataLoader(
            dataset=training_data,
            transform=transformation,
            batch_size=self.trainer.batch_size,
            num_batches_per_epoch=self.trainer.num_batches_per_epoch,
            ctx=self.trainer.ctx,
            dtype=self.dtype,
        )

        # ensure that the training network is created within the same MXNet
        # context as the one that will be used during training
        with self.trainer.ctx:
            trained_net = self.create_training_network()

        self.trainer(
            net=trained_net,
            input_names=get_hybrid_forward_input_names(trained_net),
            train_iter=training_data_loader,
        )

        with self.trainer.ctx:
            # ensure that the prediction network is created within the same MXNet
            # context as the one that was used during training
            return TrainOutput(
                transformation=transformation,
                trained_net=trained_net,
                predictor=self.create_predictor(transformation, trained_net),
            )
Example #3
    def train_model(
            self,
            training_data: Dataset) -> Tuple[Transformation, HybridBlock]:
        transformation = self.create_transformation()

        transformation.estimate(iter(training_data))

        training_data_loader = TrainDataLoader(
            dataset=training_data,
            transform=transformation,
            batch_size=self.trainer.batch_size,
            num_batches_per_epoch=self.trainer.num_batches_per_epoch,
            ctx=self.trainer.ctx,
            float_type=self.float_type,
        )

        # ensure that the training network is created within the same MXNet
        # context as the one that will be used during training
        with self.trainer.ctx:
            trained_net = self.create_training_network()

        self.trainer(
            net=trained_net,
            input_names=get_hybrid_forward_input_names(trained_net),
            train_iter=training_data_loader,
        )

        return transformation, trained_net
Example #4
    def train_model(
        self,
        training_data: Dataset,
        validation_data: Optional[Dataset] = None,
        num_workers: Optional[int] = None,
        num_prefetch: Optional[int] = None,
        shuffle_buffer_length: Optional[int] = None,
        **kwargs,
    ) -> TrainOutput:
        transformation = self.create_transformation()

        training_data_loader = TrainDataLoader(
            dataset=training_data,
            transform=transformation,
            batch_size=self.trainer.batch_size,
            num_batches_per_epoch=self.trainer.num_batches_per_epoch,
            stack_fn=partial(
                batchify, ctx=self.trainer.ctx, dtype=self.dtype,
            ),
            num_workers=num_workers,
            num_prefetch=num_prefetch,
            shuffle_buffer_length=shuffle_buffer_length,
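            # decode_fn moves each finished batch onto the trainer's MXNet context (relevant when worker processes build the batches)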
            decode_fn=partial(as_in_context, ctx=self.trainer.ctx),
            **kwargs,
        )

        validation_data_loader = None
        if validation_data is not None:
            validation_data_loader = ValidationDataLoader(
                dataset=validation_data,
                transform=transformation,
                batch_size=self.trainer.batch_size,
                stack_fn=partial(
                    batchify, ctx=self.trainer.ctx, dtype=self.dtype,
                ),
                num_workers=num_workers,
                num_prefetch=num_prefetch,
                **kwargs,
            )

        # ensure that the training network is created within the same MXNet
        # context as the one that will be used during training
        with self.trainer.ctx:
            trained_net = self.create_training_network()

        self.trainer(
            net=trained_net,
            input_names=get_hybrid_forward_input_names(trained_net),
            train_iter=training_data_loader,
            validation_iter=validation_data_loader,
        )

        with self.trainer.ctx:
            # ensure that the prediction network is created within the same MXNet
            # context as the one that was used during training
            return TrainOutput(
                transformation=transformation,
                trained_net=trained_net,
                predictor=self.create_predictor(transformation, trained_net),
            )
def test_training_loader_soft_constraint_02() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 02: NOT EVERY TS VISITED ONCE

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(0.5 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert not all([
        k in transformation_counts for k in range(CD_NUM_TIME_SERIES)
    ]), "It should not have been possible to process every time series once. "
Example #6
    def test_dataset(dataset):
        transformation = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            instance_sampler=ExactlyOneSampler(),
            past_length=10,
            future_length=5,
            dummy_value=1.0,
        )

        dl = TrainDataLoader(
            dataset=dataset,
            transform=transformation,
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=num_workers,
        )

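        # count how many times each item shows up across all epochs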
        item_ids = defaultdict(int)

        for epoch in range(num_epochs):
            for batch in islice(dl, num_batches_per_epoch):
                for item_id in batch["item_id"]:
                    item_ids[item_id] += 1

        for i in range(len(dataset)):
            assert num_passes - 1 <= item_ids[i] <= num_passes + 1
Example #7
    def create_training_data_loader(self, dataset, **kwargs):
        instance_splitter = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            instance_sampler=ExpectedNumInstanceSampler(
                num_instances=1,
                min_future=self.prediction_length,
            ),
            past_length=self.context_length + 1,
            future_length=self.prediction_length,
            time_series_fields=[
                FieldName.FEAT_DYNAMIC_REAL,
                FieldName.OBSERVED_VALUES,
            ],
        )
        input_names = get_hybrid_forward_input_names(MyProbTrainRNN)
        return TrainDataLoader(
            dataset=dataset,
            transform=instance_splitter + SelectFields(input_names),
            batch_size=self.batch_size,
            stack_fn=functools.partial(batchify, ctx=self.trainer.ctx, dtype=self.dtype),
            decode_fn=functools.partial(as_in_context, ctx=self.trainer.ctx),
            **kwargs,
        )
Example #8
def create_trainset_loader(
    n_data_per_group,
    batch_size,
    past_length=4 * 7,
    prediction_length=2 * 7,
    n_groups=5,
    dataset_name="synthetic_issm",
):
    input_transform = create_input_transform(
        prediction_length=prediction_length,
        past_length=past_length,
        use_feat_static_cat=True,
        use_feat_dynamic_real=False,
        freq="D",
        time_features=None,
        is_train=True,
        extract_tail_chunks_for_train=True,
    )
    dataset = get_dataset(
        subset="train",
        dataset_name=dataset_name,
        n_data_per_group=n_data_per_group,
    )
    dataloader = TrainDataLoader(
        dataset=dataset,
        transform=input_transform,
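        # one epoch sweeps the whole synthetic training set once: ceil(n_data_per_group * n_groups / batch_size) batches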
        num_batches_per_epoch=math.ceil(n_data_per_group * n_groups /
                                        batch_size),
        batch_size=batch_size,
        ctx=None,  # mx.context.cpu(),
        dtype=np.float32,
    )
    return dataloader
Example #9
    def train_loader(
        dataset: ListDataset,
        prediction_interval_length: float,
        context_interval_length: float,
        is_train: bool = True,
        override_args: dict = None,
    ) -> DataLoader:

        if override_args is None:
            override_args = {}

        splitter = ContinuousTimeInstanceSplitter(
            future_interval_length=prediction_interval_length,
            past_interval_length=context_interval_length,
            train_sampler=ContinuousTimeUniformSampler(num_instances=10),
        )

        kwargs: Dict[str, Any] = dict(
            dataset=dataset,
            transform=splitter,
            batch_size=10,
            stack_fn=partial(batchify,
                             ctx=mx.cpu(),
                             dtype=np.float32,
                             variable_length=True),
        )
        kwargs.update(override_args)

        if is_train:
            return TrainDataLoader(num_batches_per_epoch=22,
                                   num_workers=None,
                                   **kwargs)
        else:
            return InferenceDataLoader(num_workers=None, **kwargs)
def test_training_loader_soft_constraint_01() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 01: EVERY TS VISITED AT LEAST ONCE

    train_dataset_loader_01 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * exp_num_batches),
    )

    # give all the workers a little time to get ready, so they can start at the same time
    time.sleep(1.5)

    # multi-processed training dataset
    mp_training_data_loader_result_01 = list(train_dataset_loader_01)

    # should contain an entry for every time series id
    transformation_counts_01 = get_transformation_counts(
        mp_training_data_loader_result_01)

    assert all([
        k in transformation_counts_01 for k in range(CD_NUM_TIME_SERIES)
    ]), "Not every time series processed at least once."
def test_training_loader_soft_constraint_02() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 02: NOT EVERY TS VISITED ONCE

    train_dataset_loader_02 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(0.5 * exp_num_batches),
    )

    # multi-processed training dataset
    mp_training_data_loader_result_02 = list(train_dataset_loader_02)

    # should NOT contain an entry for every time series id
    transformation_counts_02 = get_transformation_counts(
        mp_training_data_loader_result_02)

    assert not all([
        k in transformation_counts_02 for k in range(CD_NUM_TIME_SERIES)
    ]), "It should not have been possible to process every time series once. "
def test_training_loader_soft_constraint_01() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 01: EVERY TS VISITED AT LEAST ONCE

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(3 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert all(
        [k in transformation_counts for k in range(CD_NUM_TIME_SERIES)]
    ), "Not every time series processed at least once."
def test_training_loader_soft_constraint_03() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 03: ONE WORKER TRAVERSES ALL

    train_dataset_loader_03 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=1,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * exp_num_batches),
    )

    # multi-processed training dataset
    mp_training_data_loader_result_03 = list(train_dataset_loader_03)

    # should contain an entry for every time series id
    transformation_counts_03 = get_transformation_counts(
        mp_training_data_loader_result_03)

    assert all(
        k in transformation_counts_03 for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
    def test_dataset(dataset):
        class ExactlyOneSampler(InstanceSampler):
            def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
                window_size = b - a + 1
                assert window_size > 0
                return np.array([a])

        transformation = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=ExactlyOneSampler(),
            past_length=10,
            future_length=5,
            dummy_value=1.0,
        )

        dl = TrainDataLoader(
            dataset=dataset,
            transform=transformation,
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            num_workers=num_workers,
        )

        item_ids = defaultdict(int)

        for epoch in range(num_epochs):
            for batch in islice(dl, num_batches_per_epoch):
                for item_id in batch["item_id"]:
                    item_ids[item_id] += 1

        for i in range(len(dataset)):
            assert num_passes - 1 <= item_ids[i] <= num_passes + 1
def test_training_loader_soft_constraint_03() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 03: ONE WORKER TRAVERSES ALL

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=1,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(3 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert all(
        k in transformation_counts for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
Example #16
    def train(
        self, training_data: Dataset, validation_data: Optional[Dataset] = None
    ) -> Predictor:
        has_negative_data = any(np.any(d["target"] < 0) for d in training_data)
        low = -10.0 if has_negative_data else 0
        high = 10.0
        bin_centers = np.linspace(low, high, self.num_bins)
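        # bin edges sit halfway between consecutive bin centers, with +/-1e20 sentinels so every value falls into some bin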
        bin_edges = np.concatenate(
            [[-1e20], (bin_centers[1:] + bin_centers[:-1]) / 2.0, [1e20]]
        )

        logging.info(
            f"using training windows of length = {self.train_window_length}"
        )

        transformation = self.create_transformation(
            bin_edges, pred_length=self.train_window_length
        )

        transformation.estimate(iter(training_data))

        training_data_loader = TrainDataLoader(
            dataset=training_data,
            transform=transformation,
            batch_size=self.trainer.batch_size,
            num_batches_per_epoch=self.trainer.num_batches_per_epoch,
            ctx=self.trainer.ctx,
        )

        validation_data_loader = None
        if validation_data is not None:
            validation_data_loader = ValidationDataLoader(
                dataset=validation_data,
                transform=transformation,
                batch_size=self.trainer.batch_size,
                ctx=self.trainer.ctx,
                dtype=self.dtype,
            )

        # ensure that the training network is created within the same MXNet
        # context as the one that will be used during training
        with self.trainer.ctx:
            params = self._get_wavenet_args(bin_centers)
            params.update(pred_length=self.train_window_length)
            trained_net = WaveNet(**params)

        self.trainer(
            net=trained_net,
            input_names=get_hybrid_forward_input_names(trained_net),
            train_iter=training_data_loader,
            validation_iter=validation_data_loader,
        )

        # ensure that the prediction network is created within the same MXNet
        # context as the one that was used during training
        with self.trainer.ctx:
            return self.create_predictor(
                transformation, trained_net, bin_centers
            )
def test_training_loader_batch_size_hard_constraint() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    train_dataset_loader_01 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
    )

    train_dataset_loader_02 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    # multi-processed training dataset
    mp_training_data_loader_result_01 = list(train_dataset_loader_01)

    # multi-processed training dataset
    mp_training_data_loader_result_02 = list(train_dataset_loader_02)

    assert all(
        [
            len(batch["item_id"]) == BATCH_SIZE
            for batch in mp_training_data_loader_result_01
        ]
    ), "Not every batch from training loader is right size."

    assert all(
        [
            len(batch["item_id"]) == BATCH_SIZE
            for batch in mp_training_data_loader_result_02
        ]
    ), "Not every batch from training loader is right size, with shuffling on."
def test_simple_model():
    dsinfo, training_data, test_data = default_synthetic()

    freq = dsinfo.metadata.freq
    prediction_length = dsinfo.prediction_length
    context_length = 2 * prediction_length
    hidden_dimensions = [10, 10]

    net = LightningFeedForwardNetwork(
        freq=freq,
        prediction_length=prediction_length,
        context_length=context_length,
        hidden_dimensions=hidden_dimensions,
        distr_output=NormalOutput(),
        batch_norm=True,
        scaling=mean_abs_scaling,
    )

    transformation = Chain([
        AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field=FieldName.OBSERVED_VALUES,
        ),
        InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=ExpectedNumInstanceSampler(num_instances=1),
            past_length=context_length,
            future_length=prediction_length,
            time_series_fields=[FieldName.OBSERVED_VALUES],
        ),
    ])

    data_loader = TrainDataLoader(
        training_data,
        batch_size=8,
        stack_fn=batchify,
        transform=transformation,
        num_batches_per_epoch=5,
    )

    trainer = pl.Trainer(max_epochs=3, callbacks=[], weights_summary=None)
    trainer.fit(net, train_dataloader=data_loader)

    predictor = net.get_predictor(transformation)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test_data,
        predictor=predictor,
        num_samples=100,
    )

    evaluator = Evaluator(quantiles=[0.5, 0.9], num_workers=None)

    agg_metrics, _ = evaluator(ts_it, forecast_it)
Example #19
 def create_training_data_loader(self, data: Dataset, network: nn.Module,
                                 **kwargs):
     data_loader = TrainDataLoader(
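         # Cached(data) keeps the dataset in memory after the first full iteration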
         Cached(data),
         batch_size=self.batch_size,
         stack_fn=batchify,
         transform=self.create_transformation() +
         self._create_instance_splitter("training"),
         num_batches_per_epoch=self.num_batches_per_epoch,
     )
     return data_loader
Example #20
    def train(self, training_data: Dataset) -> Predictor:
        has_negative_data = any(np.any(d["target"] < 0) for d in training_data)
        mean_length = int(np.mean([len(d["target"]) for d in training_data]))
        low = -10.0 if has_negative_data else 0
        high = 10.0
        bin_centers = np.linspace(low, high, self.num_bins)
        bin_edges = np.concatenate([[-1e20],
                                    (bin_centers[1:] + bin_centers[:-1]) / 2.0,
                                    [1e20]])

        # Here we override the prediction length for training.
        # This computes the loss over longer windows and makes the convolutions more
        # efficient, since calculations are reused.
        pred_length = min(mean_length, self.train_window_length)

        logging.info(f"mean series length = {mean_length}")
        logging.info(f"using training windows of length = {pred_length}")

        transformation = self.create_transformation(bin_edges,
                                                    pred_length=pred_length)

        transformation.estimate(iter(training_data))

        training_data_loader = TrainDataLoader(
            dataset=training_data,
            transform=transformation,
            batch_size=self.trainer.batch_size,
            num_batches_per_epoch=self.trainer.num_batches_per_epoch,
            ctx=self.trainer.ctx,
        )

        # ensure that the training network is created within the same MXNet
        # context as the one that will be used during training
        with self.trainer.ctx:
            params = self._get_wavenet_args(bin_centers)
            params.update(pred_length=pred_length)
            trained_net = WaveNet(**params)

        self.trainer(
            net=trained_net,
            input_names=get_hybrid_forward_input_names(trained_net),
            train_iter=training_data_loader,
        )

        # ensure that the prediction network is created within the same MXNet
        # context as the one that was used during training
        with self.trainer.ctx:
            return self.create_predictor(transformation, trained_net,
                                         bin_centers)
Example #21
 def create_training_data_loader(
     self,
     data: Dataset,
     **kwargs,
 ) -> DataLoader:
     with env._let(max_idle_transforms=maybe_len(data) or 0):
         train_transform = (self._create_instance_splitter("training") +
                            self._create_post_split_transform() +
                            SelectFields(["past_target", "valid_length"]))
     return TrainDataLoader(
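         # Cyclic(data) repeats the dataset endlessly, so the loader can keep producing batches across epochs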
         train_transform.apply(Cyclic(data)),
         batch_size=self.batch_size,
         stack_fn=self._stack_fn(),
         decode_fn=partial(as_in_context, ctx=self.trainer.ctx),
     )
Example #22
 def create_training_data_loader(
     self,
     data: Dataset,
     **kwargs,
 ) -> DataLoader:
     input_names = get_hybrid_forward_input_names(DeepFactorTrainingNetwork)
     instance_splitter = self._create_instance_splitter("training")
     return TrainDataLoader(
         dataset=data,
         transform=instance_splitter + SelectFields(input_names),
         batch_size=self.batch_size,
         stack_fn=partial(batchify, ctx=self.trainer.ctx, dtype=self.dtype),
         decode_fn=partial(as_in_context, ctx=self.trainer.ctx),
         **kwargs,
     )
def data_loader(estimator, dataset, batch):
    dataset = get_dataset(dataset)
    data = dataset.train
    epochs = 5
    batch_size = batch
    num_batches_per_epoch = 10
    bin_edges = np.array([-1e20, -1e10, 1, 1e20])
    transform = (
        estimator.create_transformation(
            bin_edges=bin_edges,
            pred_length=dataset.metadata.prediction_length,
        )
        if estimator.__class__.__name__ == 'WaveNetEstimator'
        else estimator.create_transformation()
    )
    loader = TrainDataLoader(
        data,
        transform=transform,
        batch_size=batch_size,
        ctx=mx.cpu(),
        num_batches_per_epoch=num_batches_per_epoch,
    )
Example #24
 def create_training_data_loader(
     self,
     data: Dataset,
     **kwargs,
 ) -> DataLoader:
     input_names = get_hybrid_forward_input_names(CanonicalTrainingNetwork)
     with env._let(max_idle_transforms=maybe_len(data) or 0):
         instance_splitter = self._create_instance_splitter("training")
     return TrainDataLoader(
         dataset=data,
         transform=instance_splitter + SelectFields(input_names),
         batch_size=self.batch_size,
         stack_fn=partial(batchify, ctx=self.trainer.ctx, dtype=self.dtype),
         decode_fn=partial(as_in_context, ctx=self.trainer.ctx),
         **kwargs,
     )
Example #25
def test_distribution():
    """
    Makes sure additional tensors can be accessed and have expected shapes
    """
    prediction_length = ds_info.prediction_length
    estimator = DeepAREstimator(
        freq=freq,
        prediction_length=prediction_length,
        input_size=15,
        trainer=Trainer(epochs=1, num_batches_per_epoch=1),
        distr_output=StudentTOutput(),
    )

    train_output = estimator.train_model(train_ds)

    # todo adapt loader to anomaly detection use-case
    batch_size = 2
    num_samples = 3

    training_data_loader = TrainDataLoader(
        train_ds,
        transform=train_output.transformation
        + estimator.create_instance_splitter("training"),
        batch_size=batch_size,
        num_batches_per_epoch=estimator.trainer.num_batches_per_epoch,
        stack_fn=batchify,
    )

    seq_len = 2 * ds_info.prediction_length

    for data_entry in islice(training_data_loader, 1):
        input_names = get_module_forward_input_names(train_output.trained_net)

        distr = train_output.trained_net.distribution(
            *[data_entry[k] for k in input_names]
        )

        assert distr.sample((num_samples,)).shape == (
            num_samples,
            batch_size,
            seq_len,
        )
Example #26
    def train_loader(
        dataset: ListDataset,
        prediction_interval_length: float,
        context_interval_length: float,
        is_train: bool = True,
        override_args: dict = None,
    ) -> Iterable[DataBatch]:

        if override_args is None:
            override_args = {}

        if is_train:
            sampler = ContinuousTimeUniformSampler(
                num_instances=10,
                min_past=context_interval_length,
                min_future=prediction_interval_length,
            )
        else:
            sampler = ContinuousTimePredictionSampler(
                min_past=context_interval_length)

        splitter = ContinuousTimeInstanceSplitter(
            future_interval_length=prediction_interval_length,
            past_interval_length=context_interval_length,
            instance_sampler=sampler,
            freq=dataset.freq,
        )

        kwargs = dict(
            dataset=dataset,
            transform=splitter,
            batch_size=10,
            stack_fn=partial(batchify, dtype=np.float32, variable_length=True),
        )

        kwargs.update(override_args)

        if is_train:
            return itertools.islice(
                TrainDataLoader(num_workers=None, **kwargs), NUM_BATCHES)
        else:
            return InferenceDataLoader(**kwargs)
Example #27
    def __init__(self, dictionary_of_hyperparameters):
        search_config = {}
        search_config['learning_rate'] = ag.Real(1e-3, 1e-2, log=True)
        search_config['epochs'] = ag.Choice(40, 80)
        self.dictionary_of_hyperparameters = dictionary_of_hyperparameters
        for config in search_config.keys():
            if config not in self.dictionary_of_hyperparameters.keys():
                self.dictionary_of_hyperparameters[config] = search_config[config]

        self.dataset = dataset
        self.init_estimator = SimpleFeedForwardEstimator(
            num_hidden_dimensions=[10],
            prediction_length=dataset.metadata.prediction_length,
            context_length=100,
            freq=dataset.metadata.freq,
            trainer=Trainer(ctx="cpu",
                            epochs=5,
                            learning_rate=1e-3,
                            num_batches_per_epoch=100
                           )
            )
        transformation = self.init_estimator.create_transformation()
        dtype = np.float32
        num_workers = None
        num_prefetch = None
        shuffle_buffer_length = None
        trainer = Trainer(ctx="cpu",
                          epochs=1,
                          learning_rate=0.01,
                          num_batches_per_epoch=100
                          )
        self.training_data_loader = TrainDataLoader(
            dataset=dataset.train,
            transform=transformation,
            batch_size=trainer.batch_size,
            num_batches_per_epoch=trainer.num_batches_per_epoch,
            ctx=trainer.ctx,
            dtype=dtype,
            num_workers=num_workers,
            num_prefetch=num_prefetch,
        )
def test_training_data_loader(dataset_context, num_workers):
    with dataset_context as dataset:
        dataset_length = len(list(dataset))

        batch_size = 4

        dl = TrainDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=num_workers,
        )

        num_epochs = 20
        epoch_length = 2

        passes_through_dataset = int(
            (num_epochs * epoch_length * batch_size) / dataset_length
        )

        # these are to make sure that the test makes sense:
        # we want to go over the dataset multiple times
        assert passes_through_dataset >= 10
        # we want each epoch to be shorter than the dataset
        assert epoch_length * batch_size < dataset_length

        batches = []

        for epoch in range(num_epochs):
            for batch in islice(dl, epoch_length):
                assert all(x is True for x in batch["is_train"])
                batches.append(batch)

        counter = count_item_ids(batches)

        if num_workers is None or num_workers == 1:
            for entry in dataset:
                assert counter[entry[FieldName.ITEM_ID]] >= 1
def test_shape():
    """
    Makes sure additional tensors can be accessed and have expected shapes
    """
    prediction_length = ds_info.prediction_length
    estimator = DeepAREstimator(
        freq=freq,
        prediction_length=prediction_length,
        trainer=Trainer(epochs=1, num_batches_per_epoch=1),
        distr_output=StudentTOutput(),
    )

    training_transformation, trained_net = estimator.train_model(train_ds)

    # todo adapt loader to anomaly detection use-case
    batch_size = 2
    training_data_loader = TrainDataLoader(
        dataset=train_ds,
        transform=training_transformation,
        batch_size=batch_size,
        num_batches_per_epoch=estimator.trainer.num_batches_per_epoch,
        ctx=mx.cpu(),
    )

    seq_len = 2 * ds_info.prediction_length

    for data_entry in islice(training_data_loader, 1):
        input_names = get_hybrid_forward_input_names(trained_net)

        loss, likelihoods, *distr_args = trained_net(
            *[data_entry[k] for k in input_names])

        distr = StudentT(*distr_args)

        assert likelihoods.shape == (batch_size, seq_len)
        assert distr.mu.shape == (batch_size, seq_len)
        assert distr.sigma.shape == (batch_size, seq_len)
        assert distr.nu.shape == (batch_size, seq_len)
Example #30
def test_distribution():
    """
    Makes sure additional tensors can be accessed and have expected shapes
    """
    prediction_length = ds_info.prediction_length
    estimator = DeepAREstimator(
        freq=freq,
        prediction_length=prediction_length,
        trainer=Trainer(epochs=2, num_batches_per_epoch=1),
        distr_output=StudentTOutput(),
    )

    train_output = estimator.train_model(train_ds, test_ds)

    # todo adapt loader to anomaly detection use-case
    batch_size = 2
    num_samples = 3

    training_data_loader = TrainDataLoader(
        dataset=train_ds,
        transform=train_output.transformation,
        batch_size=batch_size,
        stack_fn=partial(batchify, ctx=mx.cpu()),
    )

    seq_len = 2 * ds_info.prediction_length

    for data_entry in islice(training_data_loader, 1):
        input_names = get_hybrid_forward_input_names(train_output.trained_net)

        distr = train_output.trained_net.distribution(
            *[data_entry[k] for k in input_names])

        assert distr.sample(num_samples).shape == (
            num_samples,
            batch_size,
            seq_len,
        )
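# The snippets above all build a TrainDataLoader the same way: create a
# transformation that slices training windows out of each series, hand it to
# TrainDataLoader together with a stack_fn that collates entries into batches,
# and then draw a bounded number of batches from the (endless) loader.
# A minimal, self-contained sketch of that shared pattern follows; it assumes
# a GluonTS version where InstanceSplitter takes instance_sampler (older
# examples above still use train_sampler) and uses a toy ListDataset with
# made-up lengths purely for illustration.
from functools import partial
from itertools import islice

import numpy as np
from mxnet.context import current_context

from gluonts.dataset.common import ListDataset
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.loader import TrainDataLoader
from gluonts.mx.batchify import as_in_context, batchify
from gluonts.transform import ExpectedNumInstanceSampler, InstanceSplitter

toy_dataset = ListDataset(
    [{"start": "2021-01-01", "target": np.random.randn(100)} for _ in range(8)],
    freq="D",
)

splitter = InstanceSplitter(
    target_field=FieldName.TARGET,
    is_pad_field=FieldName.IS_PAD,
    start_field=FieldName.START,
    forecast_start_field=FieldName.FORECAST_START,
    instance_sampler=ExpectedNumInstanceSampler(num_instances=1, min_future=7),
    past_length=14,
    future_length=7,
)

loader = TrainDataLoader(
    toy_dataset,
    transform=splitter,
    batch_size=4,
    stack_fn=partial(batchify, ctx=current_context()),
    decode_fn=partial(as_in_context, ctx=current_context()),
)

# The loader cycles over the dataset indefinitely, so bound the iteration.
for batch in islice(loader, 10):
    assert batch["past_target"].shape == (4, 14)
    assert batch["future_target"].shape == (4, 7)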