Example 1
def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    if on_ddf:
        dataset = dataset.to_ddf()

    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    data_loader = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
    )

    # Materialize the buffered iterators and concatenate them into DataFrames
    df1 = _concat(list(data_loader._buff.itr))
    df2 = _concat(list(data_loader.epochs(epochs)._buff.itr))

    # Check that the DataFrame sizes and rows make sense
    assert len(df2) == epochs * len(df1)
    assert_eq(
        _concat([df1 for i in range(epochs)]).reset_index(drop=True),
        df2.reset_index(drop=True),
    )
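
The behaviour asserted above can be restated with plain pandas: calling epochs(n) should yield the data n times over. The snippet below is only a conceptual re-statement of the assertions, not loader code; using pandas here is an assumption for illustration.

import pandas as pd

# Stand-in for one pass over the data (df1) versus `epochs` passes (df2)
df1 = pd.DataFrame({"x": [1, 2, 3]})
epochs = 2
df2 = pd.concat([df1] * epochs)  # what iterating .epochs(2) should produce overall

assert len(df2) == epochs * len(df1)
pd.testing.assert_frame_equal(
    pd.concat([df1 for _ in range(epochs)]).reset_index(drop=True),
    df2.reset_index(drop=True),
)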
Example 2
    def __len__(self):
        """
        recreating since otherwise Keras yells at you
        """
        # TODO: what's a better way to do this inheritance
        # of the appropriate methods? A Metaclass?
        DataLoader.stop(self)
        return DataLoader.__len__(self)
Example 3
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        seed_fn=None,
        buffer_size=0.1,
        device=None,
        parts_per_chunk=1,
        reader_kwargs=None,
        global_size=None,
        global_rank=None,
        drop_last=False,
        sparse_names=None,
        sparse_max=None,
        sparse_as_dense=False,
    ):
        dataset = _validate_dataset(
            paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
        )
        cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

        # sort the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412)
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        device = device or 0
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            seed_fn=seed_fn,
            parts_per_chunk=parts_per_chunk,
            device=device,
            global_size=global_size,
            global_rank=global_rank,
            drop_last=drop_last,
            sparse_names=sparse_names,
            sparse_max=sparse_max,
            sparse_as_dense=sparse_as_dense,
        )
        self._map_fns = []
Example 4
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names=None,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        seed_fn=None,
        buffer_size=0.1,
        device=None,
        parts_per_chunk=1,
        reader_kwargs=None,
        global_size=None,
        global_rank=None,
        drop_last=False,
        sparse_names=None,
        sparse_max=None,
        sparse_as_dense=False,
        schema=None,
    ):
        dataset = _validate_dataset(paths_or_dataset, batch_size, buffer_size,
                                    engine, reader_kwargs)
        schema = _get_schema(dataset) if not schema else schema
        cat_names, cont_names = _validate_schema(feature_columns,
                                                 cat_names,
                                                 cont_names,
                                                 schema=schema)

        device = device or 0
        device = "cpu" if not HAS_GPU else device
        DataLoader.__init__(
            self,
            dataset,
            batch_size,
            shuffle,
            cat_names=cat_names,
            cont_names=cont_names,
            label_names=label_names,
            seed_fn=seed_fn,
            parts_per_chunk=parts_per_chunk,
            device=device,
            global_size=global_size,
            global_rank=global_rank,
            drop_last=drop_last,
            sparse_names=sparse_names,
            sparse_max=sparse_max,
            sparse_as_dense=sparse_as_dense,
        )
        self._map_fns = []
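
The signature above suggests a construction call like the sketch below. The import path and class name (KerasSequenceLoader from nvtabular.loader.tensorflow) are an assumption about where this __init__ lives, and the paths, batch size, and column names are placeholders borrowed from the test in Example 1.

from nvtabular.loader.tensorflow import KerasSequenceLoader  # assumed home of this __init__

train_loader = KerasSequenceLoader(
    "/path/to/train/*.parquet",       # paths_or_dataset (placeholder path)
    batch_size=65536,
    label_names=["label"],
    cat_names=["name-string", "name-cat"],
    cont_names=["x", "y", "id"],
    engine="parquet",
    shuffle=True,
    buffer_size=0.1,                  # default shown in the signature above
    parts_per_chunk=1,
)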
Example 5
    def __getitem__(self, idx):
        """
        implemented exclusively for consistency
        with Keras model.fit. Does not leverage
        passed idx in any way
        """
        return DataLoader.__next__(self)
Example 6
    def __getitem__(self, idx):
        """
        implemented exclusively for consistency
        with Keras model.fit. Does not leverage
        passed idx in any way
        """
        try:
            return DataLoader.__next__(self)
        except StopIteration:
            # TODO: I would like to do a check for idx == 0
            # here, but that requires that tf.keras.Model.fit
            # be called with shuffle=False, and that seems
            # small enough that it would be too easy to miss
            # for many users. That said, blind reinitialization
            # is probably irresponsible, so worth thinking
            # of something better here
            DataLoader.__iter__(self)
            return DataLoader.__next__(self)
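
The difference from Example 5 is the StopIteration handler: when the underlying iterator is exhausted, the loader is re-initialized so that Keras-style epoch loops keep receiving batches. Below is a standalone sketch of that restart pattern, using a plain list of batches instead of the real DataLoader (all names are illustrative, not part of the library).

class RestartingBatches:
    def __init__(self, batches):
        self._batches = batches
        self._it = iter(self._batches)

    def __getitem__(self, idx):
        # idx is ignored, just as in Examples 5 and 6
        try:
            return next(self._it)
        except StopIteration:
            # blindly re-create the iterator so the next "epoch" starts over
            self._it = iter(self._batches)
            return next(self._it)


loader = RestartingBatches(["batch-0", "batch-1", "batch-2"])
print([loader[i] for i in range(7)])  # wraps around after every third access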
Example 7
    def __init__(
        self,
        paths_or_dataset,
        batch_size,
        label_names,
        feature_columns=None,
        cat_names=None,
        cont_names=None,
        engine=None,
        shuffle=True,
        buffer_size=0.1,
        workflows=None,
        devices=None,
        parts_per_chunk=1,
        reader_kwargs=None,
    ):
        dataset = _validate_dataset(paths_or_dataset, batch_size, buffer_size,
                                    engine, reader_kwargs)
        cat_names, cont_names = _validate_schema(feature_columns, cat_names,
                                                 cont_names)

        # sort the columns to avoid getting incorrect output
        # (https://github.com/NVIDIA/NVTabular/issues/412)
        cat_names = _get_embedding_order(cat_names)
        cont_names = _get_embedding_order(cont_names)

        # TODO: figure out multi-gpu support
        assert devices is None or len(devices) == 1
        devices = devices or [0]
        DataLoader.__init__(
            self,
            dataset,
            cat_names,
            cont_names,
            label_names,
            batch_size,
            shuffle,
            parts_per_chunk=parts_per_chunk,
            workflows=workflows,
            devices=devices,
        )
Example 8
def test_dataloader_empty_error(datasets, engine, batch_size):
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    with pytest.raises(ValueError) as exc_info:
        DataLoader(
            dataset,
            batch_size=batch_size,
            label_names=["label"],
            shuffle=False,
        )
    assert "Neither Categorical or Continuous columns were found by the dataloader. " in str(
        exc_info.value)
Example 9
def test_dataloader_seeding(datasets, engine, batch_size):
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    # Define a seed function that returns the same seed on all workers
    seed_fragments = []

    def seed_fn():
        # Capturing the next random number generated allows us to check
        # that different workers have different random states when this
        # function is called
        next_rand = _generate_local_seed(0, 1)
        seed_fragments.append(next_rand)

        # But since we don't actually want to run two data loaders in
        # parallel in this test, we'll cheat and return a static seed
        # instead of combining the fragments into a new seed
        return 5678

    # Set up two dataloaders with different global ranks
    data_loader_0 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=0,
        seed_fn=seed_fn,
    )

    data_loader_1 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=1,
        seed_fn=seed_fn,
    )

    # Starting from the same random state, run a shuffle on each worker
    # and capture the results
    np.random.seed(1234)

    data_loader_0._shuffle_indices()

    dl0_rng_state = _get_random_state()
    dl0_next_rand = dl0_rng_state.tomaxint(size=1)
    dl0_indices = data_loader_0.indices

    np.random.seed(1234)

    data_loader_1._shuffle_indices()

    dl1_next_rand = _generate_local_seed(0, 1)
    dl1_indices = data_loader_1.indices

    # Test that the seed function actually gets called in each data loader
    assert len(seed_fragments) == 2

    # Test that each data loader had different random state
    # when seed_fn was called
    assert seed_fragments[0] != seed_fragments[1]

    # Test that the shuffle has the same result on both workers
    # (i.e. the random seeds are the same when the shuffle happens)
    for idx, element in enumerate(dl0_indices):
        assert dl0_indices[idx] == dl1_indices[idx]

    # Test that after the shuffle each worker generates different random numbers
    # (i.e. the random seeds are different on each worker after the shuffle)
    assert dl0_next_rand != dl1_next_rand
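
As the comments in the test note, its seed_fn cheats by returning a constant. In a real multi-worker setup the function would combine each worker's fragment into one shared seed so that every rank shuffles identically. A hedged sketch of that idea, with a stand-in allgather callable rather than any NVTabular or communication-library API:

import numpy as np

def make_seed_fn(allgather):
    # allgather is a placeholder for whatever collective the workers share
    def seed_fn():
        local_fragment = int(np.random.randint(0, 2**31 - 1))
        fragments = allgather(local_fragment)     # every rank sees every fragment
        return int(sum(fragments) % (2**31 - 1))  # same combined seed on all ranks
    return seed_fn

# single-process stand-in: the only "worker" is this one
seed_fn = make_seed_fn(lambda fragment: [fragment])
print(seed_fn())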