def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    """Iterating `epochs` passes must yield exactly `epochs` copies of one pass."""
    dataset = Dataset(str(datasets["parquet"]), engine=engine)
    if on_ddf:
        dataset = dataset.to_ddf()

    loader = DataLoader(
        dataset,
        cat_names=["name-string", "name-cat"],
        cont_names=["x", "y", "id"],
        batch_size=batch_size,
        label_names=["label"],
        shuffle=False,
    )

    # Drain a single pass and an `epochs`-long pass into DataFrames.
    base_df = _concat(list(loader._buff.itr))
    repeated_df = _concat(list(loader.epochs(epochs)._buff.itr))

    # Size and row content must match `epochs` concatenated copies of the base pass.
    assert len(repeated_df) == epochs * len(base_df)
    assert_eq(
        _concat([base_df] * epochs).reset_index(drop=True),
        repeated_df.reset_index(drop=True),
    )
def __len__(self):
    """Report length for Keras; delegates to the base ``DataLoader``.

    Keras requires ``__len__`` on the object passed to ``model.fit``, so it is
    re-declared here and forwarded explicitly to the base class.
    """
    # TODO: a cleaner way to inherit just the needed methods (metaclass?).
    DataLoader.stop(self)
    return DataLoader.__len__(self)
def __init__(
    self,
    paths_or_dataset,
    batch_size,
    label_names,
    feature_columns=None,
    cat_names=None,
    cont_names=None,
    engine=None,
    shuffle=True,
    seed_fn=None,
    buffer_size=0.1,
    device=None,
    parts_per_chunk=1,
    reader_kwargs=None,
    global_size=None,
    global_rank=None,
    drop_last=False,
    sparse_names=None,
    sparse_max=None,
    sparse_as_dense=False,
):
    """Build the underlying dataset/schema and initialize the base DataLoader.

    `paths_or_dataset` is validated (and wrapped if it is a path) by
    `_validate_dataset`; categorical/continuous column names are resolved from
    either `feature_columns` or the explicit `cat_names`/`cont_names` lists.
    """
    dataset = _validate_dataset(
        paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs
    )
    cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

    # sort the columns to avoid getting incorrect output
    # (https://github.com/NVIDIA/NVTabular/issues/412)
    cat_names = _get_embedding_order(cat_names)
    cont_names = _get_embedding_order(cont_names)

    # Default to GPU device 0 when none was given.
    device = device or 0
    # NOTE: the base-class call is positional (dataset, cats, conts, labels,
    # batch_size, shuffle) — keep the order in sync with DataLoader.__init__.
    DataLoader.__init__(
        self,
        dataset,
        cat_names,
        cont_names,
        label_names,
        batch_size,
        shuffle,
        seed_fn=seed_fn,
        parts_per_chunk=parts_per_chunk,
        device=device,
        global_size=global_size,
        global_rank=global_rank,
        drop_last=drop_last,
        sparse_names=sparse_names,
        sparse_max=sparse_max,
        sparse_as_dense=sparse_as_dense,
    )
    # User-registered per-batch map functions (applied downstream).
    self._map_fns = []
def __init__(
    self,
    paths_or_dataset,
    batch_size,
    label_names=None,
    feature_columns=None,
    cat_names=None,
    cont_names=None,
    engine=None,
    shuffle=True,
    seed_fn=None,
    buffer_size=0.1,
    device=None,
    parts_per_chunk=1,
    reader_kwargs=None,
    global_size=None,
    global_rank=None,
    drop_last=False,
    sparse_names=None,
    sparse_max=None,
    sparse_as_dense=False,
    schema=None,
):
    """Resolve the dataset, schema, and device, then initialize the base loader.

    When no `schema` is supplied, it is derived from the validated dataset.
    On hosts without a GPU the device is forced to ``"cpu"``; otherwise it
    defaults to GPU 0.
    """
    dataset = _validate_dataset(paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs)
    if not schema:
        schema = _get_schema(dataset)
    cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names, schema=schema)

    # Pick the compute device: GPU index (default 0) when available, else CPU.
    if HAS_GPU:
        device = device or 0
    else:
        device = "cpu"

    DataLoader.__init__(
        self,
        dataset,
        batch_size,
        shuffle,
        cat_names=cat_names,
        cont_names=cont_names,
        label_names=label_names,
        seed_fn=seed_fn,
        parts_per_chunk=parts_per_chunk,
        device=device,
        global_size=global_size,
        global_rank=global_rank,
        drop_last=drop_last,
        sparse_names=sparse_names,
        sparse_max=sparse_max,
        sparse_as_dense=sparse_as_dense,
    )
    # User-registered per-batch map functions (applied downstream).
    self._map_fns = []
def __getitem__(self, idx):
    """Return the next batch, ignoring ``idx``.

    Present only so Keras ``model.fit`` can index this object; batches are
    always produced in iteration order via the base ``DataLoader``.
    """
    return DataLoader.__next__(self)
def __getitem__(self, idx):
    """Return the next batch, ignoring ``idx`` (Keras ``model.fit`` compatibility).

    If iteration is exhausted, the loader is re-initialized and the first
    batch of the new pass is returned.
    """
    try:
        return DataLoader.__next__(self)
    except StopIteration:
        # TODO: ideally we'd verify idx == 0 before restarting, but that
        # would require model.fit(shuffle=False), which is easy for users
        # to forget. Blind reinitialization is questionable — revisit.
        DataLoader.__iter__(self)
        return DataLoader.__next__(self)
def __init__(
    self,
    paths_or_dataset,
    batch_size,
    label_names,
    feature_columns=None,
    cat_names=None,
    cont_names=None,
    engine=None,
    shuffle=True,
    buffer_size=0.1,
    workflows=None,
    devices=None,
    parts_per_chunk=1,
    reader_kwargs=None,
):
    """Validate inputs and initialize the base DataLoader (single device only).

    `paths_or_dataset` is validated/wrapped by `_validate_dataset`; the
    categorical/continuous column lists come from either `feature_columns`
    or the explicit `cat_names`/`cont_names` arguments.
    """
    dataset = _validate_dataset(paths_or_dataset, batch_size, buffer_size, engine, reader_kwargs)
    cat_names, cont_names = _validate_schema(feature_columns, cat_names, cont_names)

    # sort the columns to avoid getting incorrect output
    # (https://github.com/NVIDIA/NVTabular/issues/412)
    cat_names = _get_embedding_order(cat_names)
    cont_names = _get_embedding_order(cont_names)

    # NOTE(review): `assert` is stripped under `python -O`; consider raising
    # ValueError if multi-device input should be rejected in production.
    assert devices is None or len(devices) == 1  # TODO: figure out multi-gpu support
    devices = devices or [0]
    # NOTE: the base-class call is positional (dataset, cats, conts, labels,
    # batch_size, shuffle) — keep the order in sync with DataLoader.__init__.
    DataLoader.__init__(
        self,
        dataset,
        cat_names,
        cont_names,
        label_names,
        batch_size,
        shuffle,
        parts_per_chunk=parts_per_chunk,
        workflows=workflows,
        devices=devices,
    )
def test_dataloader_empty_error(datasets, engine, batch_size):
    """Constructing a loader with no feature columns must raise ValueError."""
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    with pytest.raises(ValueError) as exc_info:
        DataLoader(
            dataset,
            batch_size=batch_size,
            label_names=["label"],
            shuffle=False,
        )

    expected_msg = "Neither Categorical or Continuous columns were found by the dataloader. "
    assert expected_msg in str(exc_info.value)
def test_dataloader_seeding(datasets, engine, batch_size):
    """Two ranked workers seeded via `seed_fn` must shuffle identically, then diverge.

    Verifies that (a) `seed_fn` is invoked once per data loader, (b) the
    workers are in different random states when `seed_fn` is called, (c) the
    shuffle produces the same index order on both workers, and (d) the
    workers' RNG streams differ again after the shuffle.
    """
    cont_names = ["x", "y", "id"]
    cat_names = ["name-string", "name-cat"]
    label_name = ["label"]

    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    # Define a seed function that returns the same seed on all workers
    seed_fragments = []

    def seed_fn():
        # Capturing the next random number generated allows us to check
        # that different workers have different random states when this
        # function is called
        next_rand = _generate_local_seed(0, 1)
        seed_fragments.append(next_rand)

        # But since we don't actually want to run two data loaders in
        # parallel in this test, we'll cheat and return a static seed
        # instead of combining the fragments into a new seed
        return 5678

    # Set up two dataloaders with different global ranks
    data_loader_0 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=0,
        seed_fn=seed_fn,
    )

    data_loader_1 = DataLoader(
        dataset,
        cat_names=cat_names,
        cont_names=cont_names,
        batch_size=batch_size,
        label_names=label_name,
        shuffle=False,
        global_size=2,
        global_rank=1,
        seed_fn=seed_fn,
    )

    # Starting from the same random state, run a shuffle on each worker
    # and capture the results.
    # NOTE: statement order below is load-bearing — the RNG state is sampled
    # immediately after each shuffle, before anything else consumes it.
    np.random.seed(1234)
    data_loader_0._shuffle_indices()

    dl0_rng_state = _get_random_state()
    dl0_next_rand = dl0_rng_state.tomaxint(size=1)
    dl0_indices = data_loader_0.indices

    np.random.seed(1234)
    data_loader_1._shuffle_indices()

    dl1_next_rand = _generate_local_seed(0, 1)
    dl1_indices = data_loader_1.indices

    # Test that the seed function actually gets called in each data loader
    assert len(seed_fragments) == 2

    # Test that each data loader had different random state
    # when seed_fn was called
    assert seed_fragments[0] != seed_fragments[1]

    # Test that the shuffle has the same result on both workers
    # (i.e. the random seeds are the same when the shuffle happens)
    for idx, element in enumerate(dl0_indices):
        assert dl0_indices[idx] == dl1_indices[idx]

    # Test that after the shuffle each worker generates different random numbers
    # (i.e. the random seeds are different on each worker after the shuffle)
    assert dl0_next_rand != dl1_next_rand