def test_data_storage_in_raw_data_with_data_size_limit(self):
    config_dict = {
        'output_path': self.output_dir,
        'max_size': 25,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    first_info = generate_dummy_dataset(self.data_saver, num_runs=2)
    self.assertEqual(sum(first_info['episode_lengths']),
                     self.data_saver._frame_counter)
    self.data_saver.update_saving_directory()
    second_info = generate_dummy_dataset(self.data_saver, num_runs=2)
    # the generated data exceeds max_size, yet the frame counter stays within the limit
    self.assertTrue(
        (sum(first_info['episode_lengths']) +
         sum(second_info['episode_lengths'])) > config_dict['max_size'])
    self.assertTrue(
        self.data_saver._frame_counter <= config_dict['max_size'])
    # count the observation frames actually stored on disk
    raw_data_dir = os.path.dirname(self.data_saver.get_saving_directory())
    count_actual_frames = sum([
        len(
            os.listdir(
                os.path.join(raw_data_dir, episode_dir, 'observation')))
        for episode_dir in os.listdir(raw_data_dir)
    ])
    self.assertEqual(count_actual_frames, self.data_saver._frame_counter)
def generate_random_dataset_in_raw_data(
        output_dir: str,
        num_runs: int = 20,
        input_size: tuple = (100, 100, 3),
        output_size: tuple = (1, ),
        continuous: bool = True,
        fixed_input_value: Union[float, np.ndarray] = None,
        fixed_output_value: Union[float, np.ndarray] = None,
        store_hdf5: bool = False) -> dict:
    """Generate dummy data and store it in the raw_data directory of output_dir."""
    data_saver = DataSaver(config=DataSaverConfig().create(
        config_dict={
            'output_path': output_dir,
            'store_hdf5': store_hdf5,
            'separate_raw_data_runs': True
        }))
    info = generate_dummy_dataset(data_saver,
                                  num_runs=num_runs,
                                  input_size=input_size,
                                  output_size=output_size,
                                  continuous=continuous,
                                  fixed_input_value=fixed_input_value,
                                  fixed_output_value=fixed_output_value,
                                  store_hdf5=store_hdf5)
    return info
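# Minimal usage sketch for the helper above. The output path is hypothetical;
# the returned dict keys ('episode_lengths', 'episode_directories') are the
# ones consumed by the tests in this file.
#
#     info = generate_random_dataset_in_raw_data(output_dir='/tmp/example_output',
#                                                num_runs=5,
#                                                input_size=(64, 64, 3),
#                                                store_hdf5=True)
#     print(info['episode_lengths'])
#     print(info['episode_directories'])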
def test_create_hdf5_files_subsampled_in_time(self):
    num_runs = 10
    split = 1.0
    subsample = 3
    config_dict = {
        'output_path': self.output_dir,
        'training_validation_split': split,
        'store_hdf5': True,
        'subsample_hdf5': subsample,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
    self.data_saver.create_train_validation_hdf5_files()

    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
        })
    training_data_loader = DataLoader(config=config)
    training_data_loader.load_dataset()
    training_data = training_data_loader.get_dataset()

    # expected number of frames per episode after subsampling in time
    self.assertEqual(
        len(training_data),
        sum([
            np.ceil((el - 1) / subsample) + 1
            for el in info['episode_lengths']
        ]))
def test_big_data_hdf5_loop(self):
    # create 3 datasets stored as hdf5 files
    hdf5_files = []
    infos = []
    for index in range(3):
        output_path = os.path.join(self.output_dir, f'ds{index}')
        os.makedirs(output_path, exist_ok=True)
        config_dict = {
            'output_path': output_path,
            'store_hdf5': True,
            'training_validation_split': 1.0
        }
        config = DataSaverConfig().create(config_dict=config_dict)
        self.data_saver = DataSaver(config=config)
        infos.append(
            generate_dummy_dataset(self.data_saver,
                                   num_runs=2,
                                   input_size=(3, 10, 10),
                                   fixed_input_value=(0.3 * index) *
                                   np.ones((3, 10, 10)),
                                   store_hdf5=True))
        self.assertTrue(
            os.path.isfile(os.path.join(output_path, 'train.hdf5')))
        hdf5_files.append(os.path.join(output_path, 'train.hdf5'))
        # also add a non-existent hdf5 file to the list of training files
        hdf5_files.append(os.path.join(output_path, 'wrong.hdf5'))

    # create a data loader that loops over the three hdf5 training sets
    conf = {
        'output_path': self.output_dir,
        'hdf5_files': hdf5_files,
        'batch_size': 15,
        'loop_over_hdf5_files': True
    }
    loader = DataLoader(DataLoaderConfig().create(config_dict=conf))

    # sample batches and check that the loader advances to the next hdf5 file
    # after each pass, wrapping back to the first file at the end
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3, 2)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6, 2)
    for batch in loader.get_data_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.3, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0.6, 2)
    for batch in loader.sample_shuffled_batch():
        self.assertAlmostEqual(batch.observations[0][0, 0, 0].item(), 0, 2)
def setUp(self) -> None:
    self.output_dir = f'{os.environ["PWD"]}/test_dir/{get_filename_without_extension(__file__)}'
    if not os.path.isdir(self.output_dir):
        os.makedirs(self.output_dir)
    config_dict = {'output_path': self.output_dir, 'store_hdf5': True}
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
def test_sample_batch(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    max_num_batches = 2
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    first_batch = []
    index = 0
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        if index == 0:
            first_batch = deepcopy(batch)
        self.assertEqual(len(batch), config_dict['batch_size'])
    self.assertEqual(index, max_num_batches - 1)

    # a different sampling seed should yield a different first batch
    config_dict['random_seed'] = 2
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    second_batch = []
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        second_batch = deepcopy(batch)
        break
    self.assertNotEqual(np.sum(np.asarray(first_batch.observations[0])),
                        np.sum(np.asarray(second_batch.observations[0])))

    # the original seed should reproduce the first batch
    config_dict['random_seed'] = 1
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    third_batch = []
    for index, batch in enumerate(
            data_loader.sample_shuffled_batch(
                max_number_of_batches=max_num_batches)):
        third_batch = deepcopy(batch)
        break
    self.assertEqual(np.sum(np.asarray(first_batch.observations[0])),
                     np.sum(np.asarray(third_batch.observations[0])))
def test_store_in_ram(self):
    config_dict = {
        'output_path': self.output_dir,
        'store_on_ram_only': True,
        'max_size': 10
    }
    number_of_runs = 10
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=number_of_runs)
    data = self.data_saver.get_dataset()
    self.assertEqual(len(data), config_dict['max_size'])
    for lst in [data.observations, data.actions, data.rewards, data.done]:
        self.assertEqual(len(lst), config_dict['max_size'])
        self.assertTrue(isinstance(lst[0], torch.Tensor))
def test_empty_saving_directory(self):
    config_dict = {
        'output_path': self.output_dir,
        'separate_raw_data_runs': True
    }
    number_of_runs = 5
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=number_of_runs)
    self.assertEqual(
        len(os.listdir(os.path.join(self.output_dir, 'raw_data'))),
        number_of_runs)
    self.data_saver.empty_raw_data_in_output_directory()
    self.assertEqual(
        len(os.listdir(os.path.join(self.output_dir, 'raw_data'))), 0)
def test_data_loader_from_raw_path_dirs(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': [self.output_dir],
        'output_path': self.output_dir,
    }
    config = DataLoaderConfig().create(config_dict=config_dict)
    data_loader = DataLoader(config=config)
    data_loader.load_dataset()
    config = DataLoaderConfig().create(config_dict=config_dict)
    for d in config.data_directories:
        self.assertTrue(os.path.isdir(d))
def test_data_batch(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    for batch in data_loader.get_data_batch():
        self.assertEqual(len(batch), config_dict['batch_size'])
        break
def test_data_loading(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
    }
    config = DataLoaderConfig().create(config_dict=config_dict)
    data_loader = DataLoader(config=config)
    data_loader.load_dataset()
    # assert that none of the dataset fields are empty
    for k in ['observations', 'actions', 'rewards', 'done']:
        data = getattr(data_loader.get_dataset(), k)
        self.assertTrue(len(data) > 0)
        self.assertTrue(sum(data[0].shape) > 0)
def test_data_storage_in_raw_data(self):
    config_dict = {
        'output_path': self.output_dir,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=2)
    for total, episode_dir in zip(info['episode_lengths'],
                                  info['episode_directories']):
        self.assertEqual(
            len(
                os.listdir(
                    os.path.join(self.output_dir, 'raw_data', episode_dir,
                                 'observation'))), total)
        with open(
                os.path.join(self.output_dir, 'raw_data', episode_dir,
                             'action.data')) as f:
            expert_controls = f.readlines()
        self.assertEqual(len(expert_controls), total)
def test_data_subsample(self):
    self.info = generate_dummy_dataset(self.data_saver,
                                       num_runs=20,
                                       input_size=(100, 100, 3),
                                       output_size=(3, ),
                                       continuous=False)
    subsample = 4
    config_dict = {
        'data_directories': self.info['episode_directories'],
        'output_path': self.output_dir,
        'random_seed': 1,
        'batch_size': 3,
        'subsample': subsample
    }
    data_loader = DataLoader(config=DataLoaderConfig().create(
        config_dict=config_dict))
    data_loader.load_dataset()
    # expected number of frames per episode after subsampling in time
    self.assertEqual(
        sum([
            np.ceil((el - 1) / subsample) + 1
            for el in self.info['episode_lengths']
        ]),
        len(data_loader.get_dataset()))
def test_create_train_validation_hdf5_files(self):
    num_runs = 10
    split = 0.7
    config_dict = {
        'output_path': self.output_dir,
        'training_validation_split': split,
        'store_hdf5': True,
        'separate_raw_data_runs': True
    }
    config = DataSaverConfig().create(config_dict=config_dict)
    self.data_saver = DataSaver(config=config)
    info = generate_dummy_dataset(self.data_saver, num_runs=num_runs)
    self.data_saver.create_train_validation_hdf5_files()

    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
        })
    training_data_loader = DataLoader(config=config)
    training_data_loader.load_dataset()
    training_data = training_data_loader.get_dataset()

    config = DataLoaderConfig().create(
        config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [os.path.join(self.output_dir, 'validation.hdf5')]
        })
    validation_data_loader = DataLoader(config=config)
    validation_data_loader.load_dataset()
    validation_data = validation_data_loader.get_dataset()

    self.assertEqual(len(training_data),
                     sum(info['episode_lengths'][:int(split * num_runs)]))
    self.assertEqual(len(validation_data),
                     sum(info['episode_lengths'][int(split * num_runs):]))