def test_clip_first_x_frames(self):
    """Cleaning with remove_first_n_timestamps=5 and subsample=2 yields the expected size."""
    dataset_info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                       num_runs=20,
                                                       input_size=(100, 100, 3),
                                                       output_size=(1, ),
                                                       continuous=True,
                                                       store_hdf5=False)
    cleaning_config = DataCleaningConfig().create(config_dict={
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': dataset_info['episode_directories'],
            'subsample': 2
        },
        'training_validation_split': 1.0,
        'remove_first_n_timestamps': 5,
    })
    DataCleaner(config=cleaning_config).clean()

    loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
    }))
    loader.load_dataset()
    # each episode drops its first 5 frames, then keeps every 2nd of the rest
    # (the first remaining frame is always kept, hence the +1)
    expected_size = sum(int((length - 5) / 2) + 1
                        for length in dataset_info['episode_lengths'])
    self.assertEqual(expected_size, len(loader.get_dataset()))
def test_create_dataset_and_clean(self):
    """A 0.7 training/validation split should produce a train fraction in (0.6, 0.8)."""
    dataset_info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                       num_runs=20,
                                                       input_size=(100, 100, 3),
                                                       output_size=(1, ),
                                                       continuous=True,
                                                       store_hdf5=False)
    cleaning_config = DataCleaningConfig().create(config_dict={
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': dataset_info['episode_directories'],
            'input_size': (150, 150, 1)
        },
        'training_validation_split': 0.7,
    })
    DataCleaner(config=cleaning_config).clean()

    # load both resulting splits back in
    train_loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
    }))
    train_loader.load_dataset()
    validation_loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'hdf5_files': glob(f'{self.output_dir}/validation*.hdf5')
    }))
    validation_loader.load_dataset()

    train_size = len(train_loader.get_dataset())
    validation_size = len(validation_loader.get_dataset())
    ratio = train_size / (train_size + validation_size)
    # the split is stochastic, so only check a band around the requested 0.7
    self.assertTrue(ratio > 0.6)
    self.assertTrue(ratio < 0.8)
def test_split_hdf5_chunks(self):
    """With max_hdf5_size set, every produced train chunk stays under the size cap."""
    dataset_info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                                       num_runs=20,
                                                       input_size=(100, 100, 3),
                                                       output_size=(1, ),
                                                       continuous=True,
                                                       store_hdf5=False)
    cleaning_config = DataCleaningConfig().create(config_dict={
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': dataset_info['episode_directories'],
        },
        'training_validation_split': 1.0,
        'max_hdf5_size': 5 * 10**6
    })
    DataCleaner(config=cleaning_config).clean()

    # each chunk is loaded separately and checked against the cap (with slack)
    for chunk_file in glob(f'{self.output_dir}/train*.hdf5'):
        chunk_loader = DataLoader(config=DataLoaderConfig().create(config_dict={
            'output_path': self.output_dir,
            'hdf5_files': [chunk_file]
        }))
        chunk_loader.load_dataset()
        self.assertTrue(
            chunk_loader.get_dataset().get_memory_size() < 6 * 10**6)
def test_train_model_on_external_dataset_as_hdf5(self):
    """Run a full experiment on hdf5 files stored outside the experiment output dir.

    Generates a random raw dataset (with train/validation hdf5 files) in an
    external directory, points the trainer and evaluator data loaders at those
    files, runs the experiment and checks that checkpoints were written.
    """
    network = eval(self.experiment_config['architecture_config']['architecture']).Net(
        config=ArchitectureConfig().create(config_dict=self.experiment_config['architecture_config'])
    )
    external_dataset = f'{os.environ["PWD"]}/test_dir/external_dataset'
    os.makedirs(external_dataset, exist_ok=True)
    info = generate_random_dataset_in_raw_data(output_dir=external_dataset,
                                               num_runs=5,
                                               input_size=network.input_size,
                                               output_size=network.output_size,
                                               continuous=not network.discrete,
                                               store_hdf5=True)
    self.assertTrue(os.path.isfile(os.path.join(external_dataset, 'train.hdf5')))
    self.assertTrue(os.path.isfile(os.path.join(external_dataset, 'validation.hdf5')))
    self.experiment_config["trainer_config"]["data_loader_config"]["hdf5_files"] = \
        [os.path.join(external_dataset, 'train.hdf5')]
    self.experiment_config["evaluator_config"]["data_loader_config"]["hdf5_files"] = \
        [os.path.join(external_dataset, 'validation.hdf5')]
    experiment = Experiment(config=ExperimentConfig().create(config_dict=self.experiment_config))
    experiment.run()
    # BUG FIX: the original called assertTrue(len(...), 4) — the second argument of
    # assertTrue is the failure *message*, so that assertion passed for any non-empty
    # (even empty) listing. Assert a real condition instead. The original comment
    # claimed "5 + 2 checkpoints" while the stray argument was 4, so we only assert
    # that at least one checkpoint was stored rather than pin an uncertain count.
    checkpoints = [f for f in os.listdir(os.path.join(self.output_dir, 'torch_checkpoints'))
                   if f.endswith('ckpt')]
    self.assertGreater(len(checkpoints), 0)
    shutil.rmtree(external_dataset, ignore_errors=True)
def test_train_model_on_generated_dataset_with_tensorboard(self):
    """Running an experiment with tensorboard enabled must write at least one event file."""
    network = eval(self.experiment_config['architecture_config']['architecture']).Net(
        config=ArchitectureConfig().create(config_dict=self.experiment_config['architecture_config'])
    )
    info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                               num_runs=5,
                                               input_size=network.input_size,
                                               output_size=network.output_size,
                                               continuous=not network.discrete,
                                               store_hdf5=True)
    self.experiment_config['tensorboard'] = True
    experiment = Experiment(config=ExperimentConfig().create(config_dict=self.experiment_config))
    experiment.run()
    # tensorboard writes 'events.*' files into the output directory
    event_files = glob(os.path.join(self.output_dir, 'events.*'))
    self.assertGreater(len(event_files), 0)
def test_train_model_on_generated_dataset(self):
    """Run a full experiment on a generated dataset and check the checkpoint count.

    Generates a random raw dataset matching the configured network's input/output
    sizes, runs the experiment and verifies exactly 5 'ckpt' files were stored in
    the torch_checkpoints directory.
    """
    network = eval(self.experiment_config['architecture_config']['architecture']).Net(
        config=ArchitectureConfig().create(config_dict=self.experiment_config['architecture_config'])
    )
    info = generate_random_dataset_in_raw_data(output_dir=self.output_dir,
                                               num_runs=5,
                                               input_size=network.input_size,
                                               output_size=network.output_size,
                                               continuous=not network.discrete,
                                               store_hdf5=True)
    experiment = Experiment(config=ExperimentConfig().create(config_dict=self.experiment_config))
    experiment.run()
    # (removed a leftover debug print of the checkpoint directory listing)
    # check if checkpoints were stored in torch_checkpoints
    checkpoint_files = [f for f in os.listdir(os.path.join(self.output_dir, 'torch_checkpoints'))
                        if f.endswith('ckpt')]
    self.assertEqual(5, len(checkpoint_files))
def setUp(self) -> None:
    """Prepare the output directory, a checksum network and a random raw dataset."""
    self.output_dir = f'{os.environ["PWD"]}/test_dir/{get_filename_without_extension(__file__)}'
    os.makedirs(self.output_dir, exist_ok=True)
    # point the shared base configs at this test's output directory
    architecture_base_config['output_path'] = self.output_dir
    trainer_base_config['output_path'] = self.output_dir
    # checksum network
    self.network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(config_dict=architecture_base_config))
    dataset_info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=5,
        input_size=self.network.input_size,
        output_size=self.network.output_size,
        continuous=not self.network.discrete)
    trainer_base_config['data_loader_config'] = {
        'data_directories': dataset_info['episode_directories'],
    }
def test_evaluate_model_on_dataset(self):
    """Evaluating a network on a random dataset should not produce NaN errors."""
    network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(
            config_dict=architecture_base_config))
    dataset_info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        input_size=network.input_size,
        output_size=network.output_size,
        continuous=not network.discrete)
    # generate evaluator with correct data-loader
    evaluator_base_config['data_loader_config'] = {
        'data_directories': dataset_info['episode_directories'],
        'batch_size': 5
    }
    evaluator = Evaluator(
        config=EvaluatorConfig().create(config_dict=evaluator_base_config),
        network=network)
    # evaluate: the returned message must not report NaN values
    error_msg = evaluator.evaluate()
    self.assertFalse('nan' in error_msg)
def test_generate_random_dataset_with_train_validation_hdf5(self):
    """The generated train.hdf5 split must contain at least one done-flagged sample."""
    num_runs = 10
    # generate network
    network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(
            config_dict=architecture_base_config))
    # generate dummy dataset, storing train/validation hdf5 files as well
    info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=num_runs,
        input_size=network.input_size,
        output_size=network.output_size,
        continuous=not network.discrete,
        store_hdf5=True)
    loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'hdf5_files': [os.path.join(self.output_dir, 'train.hdf5')]
    }))
    loader.load_dataset()
    done_count = sum(d != 0 for d in loader.get_dataset().done)
    self.assertNotEqual(done_count, 0)
def test_generate_random_dataset_in_raw_data(self):
    """Each generated run must end with exactly one done flag (one per episode)."""
    num_runs = 10
    # generate network
    network = eval(architecture_base_config['architecture']).Net(
        config=ArchitectureConfig().create(
            config_dict=architecture_base_config))
    # generate dummy dataset in raw-data form (no hdf5)
    info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=num_runs,
        input_size=network.input_size,
        output_size=network.output_size,
        continuous=not network.discrete,
    )
    loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'data_directories': info['episode_directories'],
    }))
    loader.load_dataset()
    done_count = sum(d != 0 for d in loader.get_dataset().done)
    self.assertEqual(done_count, num_runs)
def test_line_world_augmentation(self):
    """Smoke test: clean a line-world dataset with the full augmentation pipeline.

    Ends with a plot of the resulting dataset; there is no numeric assertion —
    the test passes as long as cleaning and loading complete without error.
    """
    # white image with a dark vertical line around column 40
    line_image = np.ones((100, 100, 3))
    line_image[:, 40:43, 0:2] = 0
    dataset_info = generate_random_dataset_in_raw_data(
        output_dir=self.output_dir,
        num_runs=20,
        input_size=(100, 100, 3),
        output_size=(1, ),
        continuous=True,
        fixed_input_value=line_image,
        store_hdf5=False)
    cleaning_config = DataCleaningConfig().create(config_dict={
        'output_path': self.output_dir,
        'data_loader_config': {
            'data_directories': dataset_info['episode_directories'],
            'input_size': (1, 64, 64)
        },
        'training_validation_split': 0.7,
        'remove_first_n_timestamps': 5,
        'binary_maps_as_target': True,
        'invert_binary_maps': True,
        'augment_background_noise': 0.1,
        'augment_background_textured': 0.9,
        'texture_directory': 'textured_dataset',
        'augment_empty_images': 0.1
    })
    DataCleaner(config=cleaning_config).clean()
    loader = DataLoader(config=DataLoaderConfig().create(config_dict={
        'output_path': self.output_dir,
        'hdf5_files': glob(f'{self.output_dir}/train*.hdf5')
    }))
    loader.load_dataset()
    loader.get_dataset().plot()
def test_local_hdf5_file(self):
    """End-to-end check that a condor job with save_locally=True copies all hdf5
    files referenced (recursively) in the experiment config to the job's local
    home before running.

    NOTE(review): this test submits a real condor job and blocks until it
    finishes, so it requires a condor scheduler to be reachable.
    """
    # create fake hdf5 files
    info_0 = generate_random_dataset_in_raw_data(os.path.join(
        self.output_dir, 'fake_data_0'),
                                                 num_runs=2,
                                                 store_hdf5=True)
    info_1 = generate_random_dataset_in_raw_data(os.path.join(
        self.output_dir, 'fake_data_1'),
                                                 num_runs=3,
                                                 store_hdf5=True)
    # create experiment config using hdf5 files; the hdf5_files lists are nested
    # at different depths (and one is empty) to exercise the recursive lookup
    os.makedirs(os.path.join(self.output_dir, 'experiment_output'),
                exist_ok=True)
    experiment_config = {
        'output_path': os.path.join(self.output_dir, 'experiment_output'),
        'fake_key_a': [1, 2, 3],
        'fake_key_b': {
            'fake_key_b_0': 'ok',
            'fake_key_b_1': {
                'hdf5_files': [
                    os.path.join(self.output_dir, 'fake_data_0',
                                 'train.hdf5'),
                    os.path.join(self.output_dir, 'fake_data_1',
                                 'train.hdf5')
                ]
            }
        },
        'fake_key_c': {
            'hdf5_files': []
        },
        'fake_key_d': {
            'hdf5_files': [
                os.path.join(self.output_dir, 'fake_data_0',
                             'validation.hdf5'),
                os.path.join(self.output_dir, 'fake_data_1',
                             'validation.hdf5')
            ]
        }
    }
    # record the byte size of each original file (via `stat`) so the copies made
    # by the job can be compared against them later
    original_sizes = [
        int(subprocess.getoutput("stat --format %s " + v)) for v in [
            os.path.join(self.output_dir, 'fake_data_0', 'train.hdf5'),
            os.path.join(self.output_dir, 'fake_data_1', 'train.hdf5'),
            os.path.join(self.output_dir, 'fake_data_0', 'validation.hdf5'),
            os.path.join(self.output_dir, 'fake_data_1', 'validation.hdf5')
        ]
    ]
    print(f'original_sizes: {original_sizes}')
    with open(os.path.join(self.output_dir, 'experiment_config.yml'),
              'w') as f:
        yaml.dump(experiment_config, f)
    # create and submit condor job using hdf5 files but save them locally
    job_config = {
        'config_file': os.path.join(self.output_dir,
                                    'experiment_config.yml'),
        'output_path': self.output_dir,
        'command':
        'python src/condor/test/dummy_python_script_check_hdf5.py',
        'wall_time_s': 60,
        'save_locally': True
    }
    condor_job = CondorJob(config=CondorJobConfig().create(
        config_dict=job_config))
    condor_job.write_job_file()
    condor_job.write_executable_file()
    condor_job.submit()
    wait_for_job_to_finish(condor_job.log_file)
    # a FINISHED* marker file ending in '0' signals a zero exit code
    self.assertTrue(
        glob(os.path.join(condor_job.output_dir,
                          'FINISHED*'))[0].endswith('0'))
    # check jobs output file to control hdf5 files were loaded locally
    # compare file sizes to ensure same hdf5 files were copied
    with open(os.path.join(condor_job.output_dir, 'job.output'), 'r') as f:
        output_lines = f.readlines()
    # the dummy script prints one 'HDF5_FILE <path> <size>' line per file it sees
    hdf5_files = [
        l.split(' ')[1] for l in output_lines if l.startswith('HDF5_FILE')
    ]
    hdf5_file_sizes = [
        int(l.split(' ')[2]) for l in output_lines
        if l.startswith('HDF5_FILE')
    ]
    print(f'hdf5_files: {hdf5_files}')
    print(f'hdf5_file_sizes: {hdf5_file_sizes}')
    self.assertEqual(len(hdf5_file_sizes), len(original_sizes))
    # every file the job used must live under the job's local home directory
    for f in hdf5_files:
        self.assertTrue(f.startswith(condor_job.local_home))
    # copied files must match the originals byte-for-byte in size
    for js, rs in zip(hdf5_file_sizes, original_sizes):
        self.assertEqual(js, rs)