def test_eval_preparation(self):
    """Check that a zero-epoch run leaves the evaluation artefacts behind.

    A ``ModelTrainer`` is started with ``epochs=0``. Afterwards the first
    sub-directory of ``self.eval_output_path`` must contain an
    ``rtm-predictions`` directory and a ``run_model_eval.sh`` file, and the
    third-to-last whitespace-separated token on the last line of that file
    must be the path of that sub-directory.

    The trainer's writer is flushed and closed even when an assertion
    fails, so a failing run does not leave the writer open.
    """
    dl = DataloaderImages((149, 117), ignore_useless_states=False)
    st = ModelTrainer(
        lambda: DeconvModelEfficient(),
        self.test_src_dir,
        self.eval_output_path,
        load_datasets_path=self.test_split_dir,
        cache_path=None,
        batch_size=2,
        train_print_frequency=10,
        epochs=0,
        num_workers=4,
        num_validation_samples=2,
        num_test_samples=self.num_test_samples,
        data_processing_function=dl.get_sensordata_and_flowfront,
        data_gather_function=get_filelist_within_folder_blacklisted,
        classification_evaluator_function=lambda summary_writer:
        SensorToFlowfrontEvaluator(summary_writer=summary_writer),
        data_root=test_resources.test_src_dir,
    )
    st.start_training()
    try:
        dirs = [e for e in self.eval_output_path.iterdir() if e.is_dir()]
        # Fail with a readable message instead of an IndexError below.
        self.assertTrue(
            dirs, f"no output directory created in {self.eval_output_path}")
        code_dir = dirs[0] / 'rtm-predictions'
        slurm_script = dirs[0] / 'run_model_eval.sh'
        self.assertTrue(code_dir.is_dir())
        self.assertTrue(slurm_script.is_file())
        with open(slurm_script) as f:
            lines = f.read().splitlines()
        self.assertTrue(lines, f"{slurm_script} is empty")
        tokens = lines[-1].split()
        self.assertGreaterEqual(
            len(tokens), 3,
            f"unexpected last line in {slurm_script}: {lines[-1]!r}")
        # The run directory is expected as the third-to-last token.
        self.assertEqual(dirs[0], Path(tokens[-3]))
    finally:
        # Always release the writer, also when an assertion above failed.
        st.writer.flush()
        st.writer.close()
def test_training(self):
    """Train for two epochs and inspect the ``output.log`` that is written.

    The log of the first directory below ``self.training_save_path`` must
    contain ``'Mean Loss on Eval'`` once per epoch, and the step numbers
    extracted from the ``'Duration of step'`` entries must be unique.
    """
    num_epochs = 2
    dl = DataloaderImages((149, 117), ignore_useless_states=False)

    def build_evaluator(summary_writer):
        return SensorToFlowfrontEvaluator(summary_writer=summary_writer)

    trainer_options = dict(
        load_datasets_path=self.test_split_dir,
        cache_path=None,
        batch_size=16,
        train_print_frequency=10,
        epochs=num_epochs,
        num_workers=4,
        num_validation_samples=2,
        num_test_samples=self.num_test_samples,
        data_processing_function=dl.get_sensordata_and_flowfront,
        data_gather_function=get_filelist_within_folder_blacklisted,
        loss_criterion=torch.nn.BCELoss(),
        classification_evaluator_function=build_evaluator,
        data_root=test_resources.test_src_dir,
    )
    st = ModelTrainer(
        lambda: DeconvModelEfficient(),
        self.test_src_dir,
        self.training_save_path,
        **trainer_options,
    )
    st.start_training()

    run_dirs = [
        entry for entry in self.training_save_path.iterdir()
        if entry.is_dir()
    ]
    with open(run_dirs[0] / 'output.log') as log_file:
        log_text = log_file.read()

    eval_entries = re.findall('Mean Loss on Eval', log_text)
    self.assertEqual(num_epochs, len(eval_entries))

    # Duplicate step numbers would show up as a smaller set than list.
    step_numbers = []
    for step_entry in re.findall(r'Duration of step.+\d:', log_text):
        first_number = re.findall(r'\d+', step_entry)[0]
        step_numbers.append(int(first_number))
    self.assertEqual(len(set(step_numbers)), len(step_numbers))
def test_training_load_optimizer(self):
    """Train with ``optimizer_path=self.checkpoint`` and check optimizer state.

    After training, the optimizer of the trainer must hold at least one
    state entry.
    """
    dl = DataloaderImages((149, 117), ignore_useless_states=False)

    def build_evaluator(summary_writer):
        return SensorToFlowfrontEvaluator(summary_writer=summary_writer)

    trainer_options = dict(
        load_datasets_path=self.test_split_dir,
        cache_path=None,
        batch_size=16,
        train_print_frequency=10,
        epochs=self.expected_num_epochs_during_training,
        num_workers=4,
        num_validation_samples=2,
        num_test_samples=self.num_test_samples,
        data_processing_function=dl.get_sensordata_and_flowfront,
        data_gather_function=get_filelist_within_folder_blacklisted,
        loss_criterion=torch.nn.BCELoss(),
        optimizer_path=self.checkpoint,
        classification_evaluator_function=build_evaluator,
        data_root=test_resources.test_src_dir,
    )
    st = ModelTrainer(
        lambda: DeconvModelEfficient(),
        self.test_src_dir,
        self.training_save_path,
        **trainer_options,
    )
    st.start_training()

    # Optimizer has now more than 0 states, therefore was loaded.
    # NOTE(review): training steps may populate the optimizer state on their
    # own, so this may also pass without a loaded checkpoint - confirm.
    num_states = len(st.optimizer.state.keys())
    self.assertGreater(num_states, 0)
def test_save_load_training(self):
    """Run the same two-epoch training twice into ``self.training_save_path``.

    Both runs use identical settings; the test passes when neither run
    raises. There are no further assertions.
    """
    num_epochs = 2

    def build_evaluator(summary_writer):
        return SensorToFlowfrontEvaluator(summary_writer=summary_writer)

    # Two consecutive, identically configured runs against the same paths.
    for _ in range(2):
        dl = DataloaderImages((149, 117), ignore_useless_states=False)
        trainer_options = dict(
            load_datasets_path=self.test_split_dir,
            cache_path=None,
            batch_size=16,
            train_print_frequency=10,
            epochs=num_epochs,
            num_workers=4,
            num_validation_samples=2,
            num_test_samples=self.num_test_samples,
            data_processing_function=dl.get_sensordata_and_flowfront,
            data_gather_function=get_filelist_within_folder_blacklisted,
            loss_criterion=torch.nn.BCELoss(),
            classification_evaluator_function=build_evaluator,
            data_root=test_resources.test_src_dir,
        )
        st = ModelTrainer(
            lambda: DeconvModelEfficient(),
            self.test_src_dir,
            self.training_save_path,
            **trainer_options,
        )
        st.start_training()
def test_eval(self):
    """Run inference on the test set from ``self.checkpoint`` and verify it.

    Checks two things in ``<eval_output_path>/eval_on_test_set``:

    * the first ``Eval: <float>`` value in ``test_output.log``, rounded to
      four decimals, equals ``self.expected_loss``;
    * the number of ``*.jpg`` files below ``images`` equals
      ``self.expected_num_frames``.
    """
    eval_dir = self.eval_output_path / "eval_on_test_set"

    def build_evaluator(summary_writer):
        # Shared by the trainer and the explicit inference call below.
        return SensorToFlowfrontEvaluator(
            eval_dir, skip_images=False, summary_writer=summary_writer)

    dl = DataloaderImages((149, 117), ignore_useless_states=False)
    st = ModelTrainer(
        lambda: DeconvModelEfficient(),
        self.test_src_dir,
        self.eval_output_path,
        load_datasets_path=self.test_split_dir,
        cache_path=None,
        batch_size=2,
        train_print_frequency=10,
        epochs=self.expected_num_epochs_during_training,
        num_workers=10,
        num_validation_samples=2,
        num_test_samples=self.num_test_samples,
        data_processing_function=dl.get_sensordata_and_flowfront,
        data_gather_function=get_filelist_within_folder_blacklisted,
        loss_criterion=torch.nn.BCELoss(),
        classification_evaluator_function=build_evaluator,
        data_root=test_resources.test_src_dir,
    )
    st.inference_on_test_set(
        self.eval_output_path,
        self.checkpoint,
        classification_evaluator_function=build_evaluator,
    )

    with open(eval_dir / "test_output.log") as f:
        content = f.read()
    # Capture the number directly; the dot is escaped so that it only
    # matches a literal decimal point.
    match = re.search(r'Eval: (\d+\.\d+)', content)
    self.assertIsNotNone(
        match, "no 'Eval: <loss>' entry found in test_output.log")
    loss = float(match.group(1))
    self.assertEqual(np.round(loss, 4), self.expected_loss)

    img_path = eval_dir / 'images'
    list_all_imgs = list(img_path.glob('**/*.jpg'))
    self.assertEqual(len(list_all_imgs), self.expected_num_frames)
data_source_paths=r.get_data_paths_base_0(), save_path=r.save_path if args.demo is None else Path(args.demo), load_datasets_path=r.datasets_dryspots, cache_path=r.cache_path, batch_size=128, train_print_frequency=100, epochs=1000, num_workers=75, num_validation_samples=131072, num_test_samples=1048576, data_processing_function=dl.get_sensordata_and_flowfront, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), optimizer_function=lambda params: torch.optim.AdamW(params, lr=0.0001), classification_evaluator_function=lambda summary_writer: SensorToFlowfrontEvaluator(summary_writer=summary_writer), demo_path=args.demo, run_eval_step_before_training=True, resize_label_to=img_size if args.demo is not None else (0, 0)) if not args.run_eval: m.start_training() else: m.inference_on_test_set( Path(args.eval), Path(args.checkpoint_path), lambda summary_writer: SensorToFlowfrontEvaluator( Path(args.eval) / "eval_on_test_set", skip_images=False, sensors_shape=(10, 8), print_n_images=5000),
# Evaluate the checkpoint r.chkp_S1140_to_ff_0_basepr on the test set and
# write the results into an "advanced_eval" directory next to the checkpoint.
checkpoint_p = r.chkp_S1140_to_ff_0_basepr
adv_output_dir = checkpoint_p.parent / "advanced_eval"

trainer_options = {
    "data_source_paths": r.get_data_paths_base_0(),
    "save_path": r.save_path,
    "load_datasets_path": r.datasets_dryspots,
    "cache_path": r.cache_path,
    "batch_size": 2048,
    "train_print_frequency": 10,
    "epochs": 1000,
    "num_workers": 75,
    "num_validation_samples": 131072,
    "num_test_samples": 1048576,
    "data_processing_function": dl.get_sensordata_and_flowfront,
    "data_gather_function": get_filelist_within_folder_blacklisted,
    "loss_criterion": torch.nn.MSELoss(),
    "optimizer_function":
        lambda params: torch.optim.AdamW(params, lr=0.0001),
    "classification_evaluator_function":
        lambda summary_writer:
        SensorToFlowfrontEvaluator(summary_writer=summary_writer),
}
m = ModelTrainer(lambda: DeconvModelEfficient(), **trainer_options)

adv_output_dir.mkdir(exist_ok=True)
# NOTE(review): the evaluator built here receives ``summary_writer`` but does
# not forward it - confirm whether that is intended.
m.inference_on_test_set(
    output_path=adv_output_dir,
    checkpoint_path=checkpoint_p,
    classification_evaluator_function=lambda summary_writer:
    SensorToFlowfrontEvaluator(
        adv_output_dir,
        skip_images=False,
        print_n_images=5000,
    ),
)
if args.demo is not None else False), data_source_paths=r.get_data_paths_base_0(), save_path=r.save_path if args.demo is None else Path(args.demo), dataset_split_path=r.dataset_split, cache_path=r.cache_path, batch_size=128, train_print_frequency=100, epochs=1000, num_workers=75, num_validation_samples=131072, num_test_samples=1048576, data_processing_function=dl.get_sensordata_and_flowfront, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), optimizer_function=lambda params: torch.optim.AdamW(params, lr=0.001), classification_evaluator_function=lambda: SensorToFlowfrontEvaluator(), demo_path=args.demo, run_eval_step_before_training=True, resize_label_to=img_size if args.demo is not None else (0, 0)) if not args.run_eval: m.start_training() else: m.inference_on_test_set( Path(args.eval), Path(args.checkpoint_path), lambda: SensorToFlowfrontEvaluator(Path(args.eval) / "eval_on_test_set", skip_images=False, sensors_shape=(5, 4), print_n_images=5000),
dataset_paths, r.save_path, dataset_split_path=r.dataset_split, cache_path=r.cache_path, batch_size=batch_size, epochs=10, num_workers=num_workers, num_validation_samples=num_val, num_test_samples=num_test, data_root=r.data_root_every_step, data_processing_function=dl.get_sensor_to_perm_map, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), classification_evaluator_function=lambda: SensorToFlowfrontEvaluator( skip_images=False, ignore_inp=True, save_path=Path("/cfs/home/s/c/schroeni/Images" "/SensorFiber/")), dummy_epoch=False) if not args.eval: m.start_training() else: m.inference_on_test_set( Path(args.eval), Path(args.checkpoint_path), lambda: SensorToFlowfrontEvaluator( save_path=Path(args.eval) / "eval_on_test_set", skip_images=False, ignore_inp=True, ),
freeze_nlayers=9, checkpoint_path=checkpoint_p, round_at=.8), data_source_paths=r.get_data_paths_base_0(), save_path=r.save_path, dataset_split_path=r.dataset_split, cache_path=r.cache_path, batch_size=2048, train_print_frequency=100, epochs=1000, num_workers=75, num_validation_samples=131072, num_test_samples=1048576, data_processing_function=dl.get_sensordata_and_flowfront, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), optimizer_function=lambda params: torch.optim.AdamW(params, lr=0.0001), classification_evaluator_function=lambda: SensorToFlowfrontEvaluator(), ) output_path = r.chkp_S80_to_ff2.parent m.inference_on_test_set( output_path / "eval_on_test_set_rounded.5", r.chkp_S80_to_ff2, lambda: SensorToFlowfrontEvaluator(output_path / "eval_on_test_set_rounded.5", skip_images=False, sensors_shape=(10, 8), print_n_images=5000), )
dataset_paths, save_path, cache_path=None, batch_size=batch_size, epochs=150, num_workers=num_workers, num_validation_samples=num_val, num_test_samples=num_test, data_processing_function=dl.get_flowfront_to_perm_map, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), data_root=data_root, demo_path=data_folder, classification_evaluator_function=lambda: SensorToFlowfrontEvaluator( skip_images=False, ignore_inp=False, sensors_shape=(143, 111), save_path=save_path), dummy_epoch=False) if not eval: m.start_training() else: m.inference_on_test_set( Path(save_path) / Path(mode), Path(model_chkpts) / ("checkpoint" + mode + ".pth"), lambda: SensorToFlowfrontEvaluator(save_path=Path(save_path) / Path(mode) / "eval_on_test_set", skip_images=False, ignore_inp=True, sensors_shape=(143, 111)),
lambda: DeconvModelEfficient(), data_source_paths=r.get_data_paths_base_0(), save_path=r.save_path if args.demo is None else Path(args.demo), load_datasets_path=r.datasets_dryspots, cache_path=r.cache_path, batch_size=2048, train_print_frequency=100, epochs=1000, num_workers=75, num_validation_samples=131072, num_test_samples=1048576, data_processing_function=dl.get_sensordata_and_flowfront, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), optimizer_function=lambda params: torch.optim.AdamW(params, lr=1e-4), classification_evaluator_function=lambda summary_writer: SensorToFlowfrontEvaluator(summary_writer=summary_writer), demo_path=args.demo, # run_eval_step_before_training=True ) if not args.run_eval: m.start_training() else: m.inference_on_test_set( Path(args.eval), Path(args.checkpoint_path), lambda summary_writer: SensorToFlowfrontEvaluator( Path(args.eval) / "eval_on_test_set", skip_images=False), )
# Train a DeconvModelEfficient, or - when args.run_eval is set - evaluate the
# checkpoint given by args.checkpoint_path on the test set instead.
dl = DataloaderImages(image_size=(149, 117))

trainer_options = {
    "data_source_paths": r.get_data_paths_base_0(),
    # Results go to the demo directory whenever one was given.
    "save_path": Path(args.demo) if args.demo is not None else r.save_path,
    "dataset_split_path": r.dataset_split,
    "cache_path": r.cache_path,
    "batch_size": 2048,
    "train_print_frequency": 100,
    "epochs": 1000,
    "num_workers": 75,
    "num_validation_samples": 131072,
    "num_test_samples": 1048576,
    "data_processing_function": dl.get_sensordata_and_flowfront,
    "data_gather_function": get_filelist_within_folder_blacklisted,
    "loss_criterion": torch.nn.MSELoss(),
    "optimizer_function":
        lambda params: torch.optim.AdamW(params, lr=1e-4),
    "classification_evaluator_function":
        lambda: SensorToFlowfrontEvaluator(),
    "demo_path": args.demo,
    # run_eval_step_before_training=True
}
m = ModelTrainer(lambda: DeconvModelEfficient(), **trainer_options)

if args.run_eval:
    m.inference_on_test_set(
        Path(args.eval),
        Path(args.checkpoint_path),
        lambda: SensorToFlowfrontEvaluator(
            Path(args.eval) / "eval_on_test_set",
            skip_images=False,
        ),
    )
else:
    m.start_training()
lambda: OptimusPrime_c2D(batch_size), dataset_paths, r.save_path, cache_path=r.cache_path, batch_size=batch_size, epochs=150, num_workers=num_workers, num_validation_samples=num_val, num_test_samples=num_test, data_processing_function=dl.get_flowfront_to_perm_map, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), data_root=data_root, classification_evaluator_function=lambda: SensorToFlowfrontEvaluator( skip_images=False, ignore_inp=False, sensors_shape=(143, 111), save_path=img_save_path), dummy_epoch=False) if not args.eval: m.start_training() else: m.inference_on_test_set( Path(args.eval), Path(args.checkpoint_path), lambda summary_writer: SensorToFlowfrontEvaluator( save_path=Path(args.eval) / "eval_on_test_set", skip_images=False, ignore_inp=True, sensors_shape=(143, 111)),
if __name__ == "__main__":
    # Producing data only.
    #   torch_datasets_chunk_size = 300 000: each chunk = ca. 22 GB
    #   torch_datasets_chunk_size = 75 000: each chunk = ca. 5.5 GB
    Utils.custom_mlflow.logging = False

    dl = DataloaderImages(image_size=(149, 117))
    trainer_options = {
        "data_source_paths": r.get_data_paths_base_0(),
        "save_path": r.save_path,
        "dataset_split_path": r.dataset_split,
        "cache_path": r.cache_path,
        "batch_size": 2048,
        "num_workers": 1,
        "num_validation_samples": 131072,
        "num_test_samples": 1048576,
        "data_processing_function": dl.get_sensordata_and_flowfront,
        "data_gather_function": get_filelist_within_folder_blacklisted,
        "classification_evaluator_function":
            lambda: SensorToFlowfrontEvaluator(),
        "produce_torch_datasets_only": True,
        # Samples are visited in order via a SequentialSampler.
        "sampler":
            lambda data_source: torch.utils.data.SequentialSampler(
                data_source=data_source),
        "torch_datasets_chunk_size": 75000,
    }
    m = ModelTrainer(lambda: DeconvModelEfficient(), **trainer_options)
    m.start_training()
if __name__ == "__main__":
    # Evaluate the checkpoint r.chkp_S1140_to_ff_0_basepr on the test set;
    # results are written to "advanced_eval" next to the checkpoint.
    dl = DataloaderImages((149, 117))
    checkpoint_p = r.chkp_S1140_to_ff_0_basepr
    adv_output_dir = checkpoint_p.parent / "advanced_eval"

    trainer_options = {
        "data_source_paths": r.get_data_paths_base_0(),
        "save_path": r.save_path,
        "dataset_split_path": r.dataset_split,
        "cache_path": r.cache_path,
        "batch_size": 2048,
        "train_print_frequency": 10,
        "epochs": 1000,
        "num_workers": 75,
        "num_validation_samples": 131072,
        "num_test_samples": 1048576,
        "data_processing_function": dl.get_sensordata_and_flowfront,
        "data_gather_function": get_filelist_within_folder_blacklisted,
        "loss_criterion": torch.nn.MSELoss(),
        "optimizer_function":
            lambda params: torch.optim.AdamW(params, lr=0.0001),
        "classification_evaluator_function":
            lambda: SensorToFlowfrontEvaluator(),
    }
    m = ModelTrainer(lambda: DeconvModelEfficient(), **trainer_options)

    adv_output_dir.mkdir(exist_ok=True)
    m.inference_on_test_set(
        output_path=adv_output_dir,
        checkpoint_path=checkpoint_p,
        classification_evaluator_function=lambda: SensorToFlowfrontEvaluator(
            adv_output_dir,
            skip_images=False,
            print_n_images=5000,
        ),
    )
# Train a Bumblebee2 model, or - when args.eval is set - evaluate the
# checkpoint given by args.checkpoint_path on the test set instead.
trainer_options = {
    "cache_path": r.cache_path,
    "batch_size": batch_size,
    "epochs": 150,
    "num_workers": num_workers,
    "num_validation_samples": num_val,
    "num_test_samples": num_test,
    "data_processing_function": dl.get_flowfront_to_flowfront,
    "data_gather_function": get_filelist_within_folder_blacklisted,
    "loss_criterion": torch.nn.MSELoss(),
    "data_root": data_root,
    # The evaluator factory used by the trainer itself yields None.
    "classification_evaluator_function": lambda: None,
    "dummy_epoch": False,
}
m = ModelTrainer(
    lambda: Bumblebee2(),
    dataset_paths,
    r.save_path,
    **trainer_options,
)

if args.eval:
    m.inference_on_test_set(
        Path(args.eval),
        Path(args.checkpoint_path),
        lambda summary_writer: SensorToFlowfrontEvaluator(
            summary_writer=summary_writer,
            save_path=Path(args.eval) / "eval_on_test_set",
            skip_images=False,
            ignore_inp=True,
        ),
    )
else:
    m.start_training()
round_at=.8), data_source_paths=r.get_data_paths_base_0(), save_path=r.save_path, load_datasets_path=r.datasets_dryspots, cache_path=r.cache_path, batch_size=2048, train_print_frequency=100, epochs=1000, num_workers=75, num_validation_samples=131072, num_test_samples=1048576, data_processing_function=dl.get_sensordata_and_flowfront, data_gather_function=get_filelist_within_folder_blacklisted, loss_criterion=torch.nn.MSELoss(), optimizer_function=lambda params: torch.optim.AdamW(params, lr=0.0001), classification_evaluator_function=lambda summary_writer: SensorToFlowfrontEvaluator(summary_writer=summary_writer), ) output_path = r.chkp_S80_to_ff2.parent m.inference_on_test_set( output_path / "eval_on_test_set_rounded.5", r.chkp_S80_to_ff2, lambda summary_writer: SensorToFlowfrontEvaluator( output_path / "eval_on_test_set_rounded.5", skip_images=False, sensors_shape=(10, 8), print_n_images=5000 ), )