def test_dataloader_epochs(datasets, engine, batch_size, epochs, on_ddf):
    """Check that ``DataLoader.epochs(n)`` yields the one-epoch data ``n`` times.

    The loader is built without shuffling, either directly from the
    ``Dataset`` or (when ``on_ddf`` is truthy) from the result of
    ``Dataset.to_ddf()``. The rows produced for ``epochs`` epochs must equal
    the rows of a single pass concatenated ``epochs`` times.
    """
    source = Dataset(str(datasets["parquet"]), engine=engine)
    if on_ddf:
        source = source.to_ddf()

    loader = DataLoader(
        source,
        cat_names=["name-string", "name-cat"],
        cont_names=["x", "y", "id"],
        batch_size=batch_size,
        label_names=["label"],
        shuffle=False,
    )

    # Drain the loader's internal buffer iterator for a single pass, then for
    # the multi-epoch view, concatenating each set of chunks into one frame.
    one_epoch = _concat(list(loader._buff.itr))
    all_epochs = _concat(list(loader.epochs(epochs)._buff.itr))

    # The multi-epoch output has exactly ``epochs`` times as many rows ...
    assert len(all_epochs) == epochs * len(one_epoch)

    # ... and its content is the single pass repeated back to back.
    expected = _concat([one_epoch] * epochs).reset_index(drop=True)
    actual = all_epochs.reset_index(drop=True)
    assert_eq(expected, actual)
def test_dataloader_empty_error(datasets, engine, batch_size):
    """Check that a loader given only label columns fails with a ``ValueError``.

    No ``cat_names`` or ``cont_names`` are passed to ``DataLoader``; the
    raised error's text must mention that neither kind of column was found.
    """
    expected_fragment = "Neither Categorical or Continuous columns were found by the dataloader. "

    dataset = Dataset(str(datasets["parquet"]), engine=engine)
    loader_kwargs = {
        "batch_size": batch_size,
        "label_names": ["label"],
        "shuffle": False,
    }

    with pytest.raises(ValueError) as exc_info:
        DataLoader(dataset, **loader_kwargs)

    raised_message = str(exc_info.value)
    assert expected_fragment in raised_message
def test_dataloader_seeding(datasets, engine, batch_size):
    """Check seeding behaviour of two loaders that differ only in global rank.

    Both loaders share one ``seed_fn``. The test asserts that:

    * ``seed_fn`` ends up being called exactly twice across the two
      ``_shuffle_indices()`` calls,
    * the random values captured inside those two calls differ,
    * both loaders hold the same ``indices`` after shuffling, and
    * the next random value drawn after each shuffle differs between ranks.
    """
    dataset = Dataset(str(datasets["parquet"]), engine=engine)

    # Values drawn inside ``seed_fn``, one entry per invocation.
    observed_draws = []

    def seed_fn():
        # Record the next locally generated random value so the test can
        # later verify that the callers were in different random states when
        # they invoked this function.
        observed_draws.append(_generate_local_seed(0, 1))
        # The two loaders are not actually run in parallel here, so rather
        # than combining the recorded fragments into a new seed we hand back
        # the same constant to every caller.
        return 5678

    # Everything except ``global_rank`` is identical for the two loaders.
    shared_kwargs = {
        "cat_names": ["name-string", "name-cat"],
        "cont_names": ["x", "y", "id"],
        "batch_size": batch_size,
        "label_names": ["label"],
        "shuffle": False,
        "global_size": 2,
        "seed_fn": seed_fn,
    }
    rank0_loader = DataLoader(dataset, global_rank=0, **shared_kwargs)
    rank1_loader = DataLoader(dataset, global_rank=1, **shared_kwargs)

    # Rank 0: reset NumPy's global seed, shuffle, then sample the RNG once
    # and snapshot the resulting indices.
    np.random.seed(1234)
    rank0_loader._shuffle_indices()
    rank0_next = _get_random_state().tomaxint(size=1)
    rank0_indices = rank0_loader.indices

    # Rank 1: same starting seed and the same sequence of steps.
    # NOTE(review): the post-shuffle draw here goes through
    # `_generate_local_seed(0, 1)` while rank 0 uses
    # `_get_random_state().tomaxint(size=1)`; presumably both sample the same
    # local RNG -- confirm against the helpers' definitions.
    np.random.seed(1234)
    rank1_loader._shuffle_indices()
    rank1_next = _generate_local_seed(0, 1)
    rank1_indices = rank1_loader.indices

    # The seed function must have been invoked once per loader.
    assert len(observed_draws) == 2

    # The two invocations must have observed different random values.
    first_draw, second_draw = observed_draws[0], observed_draws[1]
    assert first_draw != second_draw

    # The shuffle itself must agree position by position across both ranks
    # (i.e. the seed in effect during the shuffle was the same).
    for idx, _ in enumerate(rank0_indices):
        assert rank0_indices[idx] == rank1_indices[idx]

    # After the shuffle the ranks must diverge again (i.e. each one produces
    # a different next random number).
    assert rank0_next != rank1_next