Example #1
    def test_dataset(dataset):
        transformation = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            instance_sampler=ExactlyOneSampler(),
            past_length=10,
            future_length=5,
            dummy_value=1.0,
        )

        dl = TrainDataLoader(
            dataset=dataset,
            transform=transformation,
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=num_workers,
        )

        item_ids = defaultdict(int)

        for epoch in range(num_epochs):
            for batch in islice(dl, num_batches_per_epoch):
                for item_id in batch["item_id"]:
                    item_ids[item_id] += 1

        for i in range(len(dataset)):
            assert num_passes - 1 <= item_ids[i] <= num_passes + 1
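
The helper above closes over batch_size, num_workers, num_epochs, num_batches_per_epoch and num_passes, which the surrounding test supplies. A minimal sketch of such a harness with purely illustrative values; the numbers below are assumptions, not the original test's configuration.

# A sketch of the enclosing harness; these concrete values are assumptions,
# not the ones used in the original test suite.
batch_size = 4
num_workers = None          # or an int > 0 to exercise the multiprocessing path
num_epochs = 5
num_batches_per_epoch = 10

# ExactlyOneSampler yields exactly one instance per series per sweep, so the
# number of full passes over the dataset is roughly
#     num_passes = num_epochs * num_batches_per_epoch * batch_size / len(dataset)
# which is what the closing assertion checks, with a tolerance of one.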
Example #2
def test_training_loader_soft_constraint_03() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 03: ONE WORKER TRAVERSES ALL

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=1,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(3 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert all(
        k in transformation_counts for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
Example #3
def test_training_loader_batch_size_hard_constraint() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    train_dataset_loader_1 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    train_dataset_loader_2 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    batches_1 = list(islice(train_dataset_loader_1, 30))
    batches_2 = list(islice(train_dataset_loader_2, 30))

    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in batches_1
    ), "Not every batch from the training loader is the right size."

    assert all(
        len(batch["item_id"]) == BATCH_SIZE for batch in batches_2
    ), "Not every batch from the training loader is the right size, with shuffling on."
Example #4
def test_training_loader_soft_constraint_02() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 02: NOT EVERY TS VISITED ONCE

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(0.5 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert not all(
        k in transformation_counts for k in range(CD_NUM_TIME_SERIES)
    ), "It should not have been possible to process every time series once."
Example #5
def test_inference_loader_equivalence() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()
    current_desired_context = current_context()

    # reference: inference dataset processed without multiprocessing
    inference_loader_data_transformed_original = list(
        InferenceDataLoader(
            dataset=list_dataset,
            transform=transformation,
            batch_size=BATCH_SIZE,
            num_workers=0,  # This is the crucial difference
            ctx=current_context(),
        )
    )

    inference_loader = InferenceDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
    )

    # multi-processed inference dataset
    mp_inf_data_loader_result_01 = list(inference_loader)

    # multi-processed inference dataset, second iteration/pass through
    mp_inf_data_loader_result_02 = list(inference_loader)

    # ASSERTIONS:

    assert get_transformation_counts(
        mp_inf_data_loader_result_01
    ) == get_transformation_counts(
        inference_loader_data_transformed_original
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."

    assert get_transformation_counts(
        mp_inf_data_loader_result_02
    ) == get_transformation_counts(
        inference_loader_data_transformed_original
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."

    assert (
        len(mp_inf_data_loader_result_02[1]["item_id"]) == BATCH_SIZE
    ), "Incorrect batch size from multiprocessing."

    assert (
        mp_inf_data_loader_result_02[0]["past_target"].context
        == current_desired_context
    ), "Batches in incorrect context"
Example #6
def get_dataset_and_transformation():
    # don't recompute, since it's expensive
    global _data_cache
    if _data_cache is not None:
        return _data_cache

    # create constant dataset with each time series having
    # variable length and unique constant integer entries
    dataset = ConstantDataset(num_steps=CD_NUM_STEPS,
                              num_timeseries=CD_NUM_TIME_SERIES)
    list_dataset = list(dataset.train)
    for i, ts in enumerate(list_dataset):
        ts["start"] = pd.Timestamp(ts_input=ts["start"], freq=dataset.freq)
        # get randomness in the ts lengths
        ts["target"] = np.array(
            ts["target"] * random.randint(1, CD_MAX_LEN_MULTIPLICATION_FACTOR))
    list_dataset = ListDataset(data_iter=list_dataset, freq=dataset.freq)
    list_dataset_pred_length = dataset.prediction_length

    # use every possible time point to split the time series
    transformation = InstanceSplitter(
        target_field=FieldName.TARGET,
        is_pad_field=FieldName.IS_PAD,
        start_field=FieldName.START,
        forecast_start_field=FieldName.FORECAST_START,
        instance_sampler=UniformSplitSampler(
            p=1.0,
            min_future=list_dataset_pred_length,
        ),
        past_length=CONTEXT_LEN,
        future_length=list_dataset_pred_length,
        dummy_value=1.0,
    )

    # reference: validation dataset processed without multiprocessing
    train_data_transformed_original = list(
        ValidationDataLoader(
            dataset=list_dataset,
            transform=transformation,
            batch_size=BATCH_SIZE,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=None,  # This is the crucial difference
        ))

    _data_cache = (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    )

    return _data_cache
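
The assertions in these tests rely on a helper get_transformation_counts that the listing does not show. Because ConstantDataset gives every series a unique constant integer target, the originating series of each sampled window can be recovered from the target values alone. A plausible stand-in under that assumption, not the original implementation:

from collections import Counter

def get_transformation_counts(batches) -> dict:
    # Plausible stand-in for the helper used by the assertions above.
    # It assumes every ConstantDataset series carries a unique constant
    # integer target, so the last past_target value identifies the series.
    counts = Counter()
    for batch in batches:
        for past_target in batch["past_target"].asnumpy():
            counts[int(round(float(past_target[-1])))] += 1
    return dict(counts)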
Example #7
def _prepare_image(img, nrow=8, padding=2, square_image=False):
    """Given an image of format HW, CHW, or NCHW, returns a image of format HWC.
    If the input is a batch of images, a grid of images is made by stitching them together.
    If data type is float, values must be in the range [0, 1], and then they are rescaled to
    range [0, 255]. If data type is 'uint8`, values are unchanged.
    """
    if isinstance(img, np.ndarray):
        img = nd.array(img, dtype=img.dtype, ctx=current_context())
    if not isinstance(img, NDArray):
        raise TypeError('expected MXNet NDArray or numpy.ndarray, '
                        'while received type {}'.format(str(type(img))))
    assert img.ndim == 2 or img.ndim == 3 or img.ndim == 4

    if img.dtype == np.uint8:
        return make_image_grid(
            img, nrow=nrow, padding=padding, square_image=square_image).transpose((1, 2, 0))
    elif img.dtype == np.float32 or img.dtype == np.float64:
        min_val = img.min().asscalar()
        max_val = img.max().asscalar()
        if min_val < 0.0:
            raise ValueError('expected non-negative min value from img, '
                             'while received {}'.format(min_val))
        if max_val > 1.0:
            raise ValueError('expected max value from img not greater than 1, '
                             'while received {}'.format(max_val))
        img = make_image_grid(img, nrow=nrow, padding=padding, square_image=square_image) * 255.0
        return img.astype(np.uint8).transpose((1, 2, 0))
    else:
        raise ValueError('expected input image dtype is one of uint8, float32, '
                         'and float64, received dtype {}'.format(str(img.dtype)))
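
A small usage sketch for _prepare_image, assuming make_image_grid stitches an NCHW batch into a single CHW grid as in mxboard; the shapes and values below are illustrative only:

import numpy as np

# NCHW batch of four RGB images with values in [0, 1]
batch = np.random.rand(4, 3, 32, 32).astype(np.float32)

grid = _prepare_image(batch, nrow=2, padding=2)
assert grid.dtype == np.uint8                    # rescaled to [0, 255]
assert grid.ndim == 3 and grid.shape[-1] == 3    # HWC layout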
Example #8
def test_training_loader_soft_constraint_01() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 01: EVERY TS VISITED AT LEAST ONCE

    train_dataset_loader = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    batches = list(islice(train_dataset_loader, int(3 * exp_num_batches)))
    transformation_counts = get_transformation_counts(batches)

    assert all(
        k in transformation_counts for k in range(CD_NUM_TIME_SERIES)
    ), "Not every time series was processed at least once."
Example #9
    def test_dataset(dataset):
        class ExactlyOneSampler(InstanceSampler):
            def __call__(self, ts: np.ndarray, a: int, b: int) -> np.ndarray:
                window_size = b - a + 1
                assert window_size > 0
                return np.array([a])

        transformation = InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=ExactlyOneSampler(),
            past_length=10,
            future_length=5,
            dummy_value=1.0,
        )

        dl = TrainDataLoader(
            dataset=dataset,
            transform=transformation,
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            num_workers=num_workers,
        )

        item_ids = defaultdict(int)

        for epoch in range(num_epochs):
            for batch in islice(dl, num_batches_per_epoch):
                for item_id in batch["item_id"]:
                    item_ids[item_id] += 1

        for i in range(len(dataset)):
            assert num_passes - 1 <= item_ids[i] <= num_passes + 1
Example #10
def test_validation_data_loader(dataset_context):
    with dataset_context as dataset:
        dataset_length = len(list(dataset))
        counter = defaultdict(lambda: 0)

        dl = ValidationDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=4,
            stack_fn=partial(batchify, ctx=current_context()),
        )

        batches = list(dl)

        for batch in batches:
            assert all(x is True for x in batch["is_train"])

        counter = count_item_ids(batches)

        for entry in dataset:
            assert counter[entry[FieldName.ITEM_ID]] == 1

        batches_again = list(dl)

        # a bare generator expression is always truthy; compare a concrete
        # field so the check actually verifies that the two passes match
        assert all(
            b1["item_id"] == b2["item_id"]
            for b1, b2 in zip(batches, batches_again)
        )
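
count_item_ids is another helper the listing does not include; from its use here and in the training and inference tests it simply tallies how often each item_id occurs across batches. A minimal sketch under that assumption:

from collections import Counter

def count_item_ids(batches) -> Counter:
    # tally how many times each item_id appears across all batches;
    # assumes each batch carries its ids in a list under "item_id"
    counter = Counter()
    for batch in batches:
        counter.update(batch["item_id"])
    return counter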
Example #11
def test_training_loader_soft_constraint_01() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 01: EVERY TS VISITED AT LEAST ONCE

    train_dataset_loader_01 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * exp_num_batches),
    )

    # give all the workers a little time to get ready, so they can start at the same time
    time.sleep(1.5)

    # multi-processed training dataset
    mp_training_data_loader_result_01 = list(train_dataset_loader_01)

    # should contain an entry for every time series id
    transformation_counts_01 = get_transformation_counts(
        mp_training_data_loader_result_01)

    assert all(
        k in transformation_counts_01 for k in range(CD_NUM_TIME_SERIES)
    ), "Not every time series was processed at least once."
Example #12
def test_training_loader_soft_constraint_03() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 03: ONE WORKER TRAVERSES ALL

    train_dataset_loader_03 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=1,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(3 * exp_num_batches),
    )

    # multi-processed training dataset
    mp_training_data_loader_result_03 = list(train_dataset_loader_03)

    # should contain an entry for every time series id
    transformation_counts_03 = get_transformation_counts(
        mp_training_data_loader_result_03)

    assert all(
        k in transformation_counts_03 for k in range(CD_NUM_TIME_SERIES)
    ), "One worker should be able to traverse all in one sweep, and should not deplete its iterator."
Example #13
def test_training_loader_soft_constraint_02() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    # the expected number of batches
    exp_num_batches = len(train_data_transformed_original)

    # CASE 02: NOT EVERY TS VISITED ONCE

    train_dataset_loader_02 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=int(0.5 * exp_num_batches),
    )

    # multi-processed training dataset
    mp_training_data_loader_result_02 = list(train_dataset_loader_02)

    # should NOT contain an entry for every time series id
    transformation_counts_02 = get_transformation_counts(
        mp_training_data_loader_result_02)

    assert not all(
        k in transformation_counts_02 for k in range(CD_NUM_TIME_SERIES)
    ), "It should not have been possible to process every time series once."
Example #14
def test_training_loader_batch_size_hard_constraint() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    train_dataset_loader_01 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
    )

    train_dataset_loader_02 = TrainDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
        ctx=current_context(),
        num_batches_per_epoch=30,
        shuffle_buffer_length=3 * BATCH_SIZE,
    )

    # multi-processed training dataset
    mp_training_data_loader_result_01 = list(train_dataset_loader_01)

    # multi-processed training dataset
    mp_training_data_loader_result_02 = list(train_dataset_loader_02)

    assert all(
        len(batch["item_id"]) == BATCH_SIZE
        for batch in mp_training_data_loader_result_01
    ), "Not every batch from the training loader is the right size."

    assert all(
        len(batch["item_id"]) == BATCH_SIZE
        for batch in mp_training_data_loader_result_02
    ), "Not every batch from the training loader is the right size, with shuffling on."
Example #15
def test_validation_loader_equivalence() -> None:
    (
        list_dataset,
        transformation,
        list_dataset_pred_length,
        train_data_transformed_original,
    ) = get_dataset_and_transformation()

    validation_dataset_loader = ValidationDataLoader(
        dataset=list_dataset,
        transform=transformation,
        batch_size=BATCH_SIZE,
        stack_fn=partial(batchify, ctx=current_context()),
        decode_fn=partial(as_in_context, ctx=current_context()),
        num_workers=NUM_WORKERS_MP,  # This is the crucial difference
    )

    # multi-processed validation dataset
    mp_val_data_loader_result_01 = list(validation_dataset_loader)

    # multi-processed validation dataset NR2, second iteration/pass through
    mp_val_data_loader_result_02 = list(validation_dataset_loader)

    # ASSERTIONS:

    assert len(list_dataset.list_data) == len(
        get_transformation_counts(mp_val_data_loader_result_01)
    ), "The dataloaders do not cover the whole dataset. Check that each time series was assigned at least one worker."

    assert get_transformation_counts(
        mp_val_data_loader_result_01
    ) == get_transformation_counts(
        train_data_transformed_original
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."

    assert get_transformation_counts(
        mp_val_data_loader_result_02
    ) == get_transformation_counts(
        train_data_transformed_original
    ), "The multiprocessing ValidationDataLoader should yield equivalent result to the non multiprocessing one."

    assert (mp_val_data_loader_result_02[0]["past_target"].context ==
            current_context()), "Batches in incorrect context"
Example #16
def test_training_data_loader(dataset_context, num_workers):
    with dataset_context as dataset:
        dataset_length = len(list(dataset))

        batch_size = 4

        dl = TrainDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=batch_size,
            stack_fn=partial(batchify, ctx=current_context()),
            decode_fn=partial(as_in_context, ctx=current_context()),
            num_workers=num_workers,
        )

        num_epochs = 20
        epoch_length = 2

        passes_through_dataset = int(
            (num_epochs * epoch_length * batch_size) / dataset_length
        )

        # these are to make sure that the test makes sense:
        # we want to go over the dataset multiple times
        assert passes_through_dataset >= 10
        # we want each epoch to be shorter than the dataset
        assert epoch_length * batch_size < dataset_length

        batches = []

        for epoch in range(num_epochs):
            for batch in islice(dl, epoch_length):
                assert all(x is True for x in batch["is_train"])
                batches.append(batch)

        counter = count_item_ids(batches)

        if num_workers is None or num_workers == 1:
            for entry in dataset:
                assert counter[entry[FieldName.ITEM_ID]] >= 1
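
The final coverage check is guarded because, with several workers, the dataset is split between worker processes and a short run of epochs can end before every worker has contributed a batch; only in the single-process case is every item_id guaranteed to have been seen at least once.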
Example #17
def _make_sprite_image(images, save_path):
    """Given an NDArray as a batch images, make a sprite image out of it following the rule
    defined in
    https://www.tensorflow.org/programmers_guide/embedding
    and save it in sprite.png under the path provided by the user."""
    if isinstance(images, np.ndarray):
        images = nd.array(images, dtype=images.dtype, ctx=current_context())
    elif not isinstance(images, (NDArray, np.ndarray)):
        raise TypeError('images must be an MXNet NDArray or numpy.ndarray,'
                        ' while received type {}'.format(str(type(images))))

    assert isinstance(images, NDArray)
    shape = images.shape
    nrow = int(np.ceil(np.sqrt(shape[0])))
    _save_image(images, os.path.join(save_path, 'sprite.png'), nrow=nrow, padding=0)
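
A small usage sketch for _make_sprite_image; the output directory and batch shape are assumptions for illustration:

import os
import numpy as np

save_path = "./logs"                     # hypothetical output directory
os.makedirs(save_path, exist_ok=True)

# ten grayscale 28x28 images in NCHW layout, values in [0, 1]
images = np.random.rand(10, 1, 28, 28).astype(np.float32)
_make_sprite_image(images, save_path)    # writes ./logs/sprite.png, 4 images per row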
Example #18
def test_flatten_slice_after_conv():
    data = mx.symbol.Variable('data')
    weight = mx.symbol.Variable('weight')
    bias = mx.symbol.Variable('bias')
    conv1 = mx.symbol.Convolution(data=data, weight=weight, bias=bias, name='conv1',
                                  num_filter=64, kernel=(3, 3), stride=(1, 1))
    flatten1 = mx.symbol.flatten(data=conv1)
    slice1 = mx.symbol.slice(data=flatten1, begin=0, end=1)

    shape = (2, 16, 16, 16)
    val = np.random.rand(2, 16, 16, 16).astype(np.float32)
    exe = slice1._simple_bind(context.current_context(), data=shape)
    exe.arg_arrays[0][:] = val
    exe.arg_arrays[1][:] = np.random.normal(size=exe.arg_arrays[1].shape)
    exe.arg_arrays[2][:] = np.random.normal(size=exe.arg_arrays[2].shape)
    p = exe.forward(is_train=False)
    p[0].wait_to_read()
    print(p[0])
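
The executor is bound with _simple_bind on context.current_context(), so the conv, flatten and slice run on whatever device is currently the default, and the test only checks that the chain executes and yields a readable result. For reference, a hypothetical sanity check of the same computation at the ndarray level (not part of the original test):

import mxnet as mx

ctx = mx.context.current_context()
x = mx.nd.random.uniform(shape=(2, 16, 16, 16), ctx=ctx)
w = mx.nd.random.normal(shape=(64, 16, 3, 3), ctx=ctx)
b = mx.nd.zeros((64,), ctx=ctx)

y = mx.nd.Convolution(data=x, weight=w, bias=b, num_filter=64,
                      kernel=(3, 3), stride=(1, 1))            # (2, 64, 14, 14)
first_row = mx.nd.slice(mx.nd.flatten(y), begin=(0,), end=(1,))
print(first_row.shape)                                         # (1, 64 * 14 * 14)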
Example #19
def test_inference_data_loader(dataset_context):
    with dataset_context as dataset:
        dataset_length = len(list(dataset))
        counter = defaultdict(lambda: 0)

        dl = InferenceDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=4,
            stack_fn=partial(batchify, ctx=current_context()),
        )

        batches = list(dl)

        for batch in batches:
            assert all(x is False for x in batch["is_train"])

        counter = count_item_ids(batches)

        for entry in dataset:
            assert counter[entry[FieldName.ITEM_ID]] == 1
Example #20
def test_context():
    ctx_list = []
    ctx_list.append(context.current_context())

    def f():
        set_default_context(mx.gpu(11))
        ctx_list.append(context.current_context())

    thread = threading.Thread(target=f)
    thread.start()
    thread.join()
    assert Context.devtype2str[ctx_list[0].device_typeid] == "cpu"
    assert ctx_list[0].device_id == 0
    assert Context.devtype2str[ctx_list[1].device_typeid] == "gpu"
    assert ctx_list[1].device_id == 11

    e1 = threading.Event()
    e2 = threading.Event()
    status = [False]

    def g():
        with mx.cpu(10):
            e2.set()
            e1.wait()
            if context.current_context().device_id == 10:
                status[0] = True

    thread = threading.Thread(target=g)
    thread.start()
    e2.wait()
    with Context("cpu", 11):
        e1.set()
        thread.join()
    e1.clear()
    e2.clear()
    assert status[0], "Spawned thread didn't set the correct context"
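
The point of this test is that MXNet contexts are thread-local: set_default_context or a with-block in one thread does not leak into another. A minimal CPU-only sketch of the same behaviour, requiring no GPU:

import threading
import mxnet as mx

observed = []

def worker():
    # the with-block changes the current context for this thread only
    with mx.Context("cpu", 7):
        observed.append(mx.context.current_context())

t = threading.Thread(target=worker)
t.start()
t.join()

print(mx.context.current_context())  # main thread: cpu(0) by default
print(observed[0])                   # worker thread saw cpu(7)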
Example #21
def g():
    with mx.cpu(10):
        e2.set()
        e1.wait()
        if context.current_context().device_id == 10:
            status[0] = True
Example #22
def f():
    set_default_context(mx.gpu(11))
    ctx_list.append(context.current_context())