def train_epoch(epoch, model: BaseModel, dataset, device: str, tracker: BaseTracker, checkpoint: ModelCheckpoint, log):
    model.train()
    tracker.reset("train")
    train_loader = dataset.train_dataloader()

    iter_data_time = time.time()
    with Ctq(train_loader) as tq_train_loader:
        for i, data in enumerate(tq_train_loader):
            data = data.to(device)  # This takes time
            model.set_input(data)
            t_data = time.time() - iter_data_time

            iter_start_time = time.time()
            model.optimize_parameters(dataset.batch_size)

            if i % 10 == 0:
                tracker.track(model)

            tq_train_loader.set_postfix(
                **tracker.get_metrics(),
                data_loading=float(t_data),
                iteration=float(time.time() - iter_start_time),
                color=COLORS.TRAIN_COLOR,
            )
            iter_data_time = time.time()

    metrics = tracker.publish()
    checkpoint.save_best_models_under_current_metrics(model, metrics)
    log.info("Learning rate = %f" % model.learning_rate)
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.checkpoint_dir, cfg.model_name, cfg.weight_name, strict=True)

    # Create model and datasets
    dataset = instantiate_dataset(checkpoint.data_config)
    model = checkpoint.create_model(dataset, weight_name=cfg.weight_name)
    log.info(model)
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.batch_size,
        cfg.shuffle,
        cfg.num_workers,
        cfg.precompute_multi_scale,
    )
    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    tracker: BaseTracker = dataset.get_tracker(model, dataset, False, False)

    # Run training / evaluation
    run(cfg, model, dataset, device, tracker, checkpoint)
def eval_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    early_break: bool,
):
    model.eval()
    tracker.reset("val")
    visualizer.reset(epoch, "val")
    loader = dataset.val_dataloader()
    with Ctq(loader) as tq_val_loader:
        for data in tq_val_loader:
            data = data.to(device)
            with torch.no_grad():
                model.set_input(data)
                model.forward()
                tracker.track(model)

            tq_val_loader.set_postfix(**tracker.get_metrics(), color=COLORS.VAL_COLOR)

            if visualizer.is_active:
                visualizer.save_visuals(model.get_current_visuals())

            if early_break:
                break

    metrics = tracker.publish(epoch)
    tracker.print_summary()
    checkpoint.save_best_models_under_current_metrics(model, metrics)
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.checkpoint_dir, cfg.model_name, cfg.weight_name, strict=True)

    # Setup the dataset config
    # Generic config
    train_dataset_cls = get_dataset_class(checkpoint.data_config)
    setattr(checkpoint.data_config, "class", train_dataset_cls.FORWARD_CLASS)
    setattr(checkpoint.data_config, "dataroot", cfg.input_path)

    # Dataset specific configs
    if cfg.data:
        for key, value in cfg.data.items():
            checkpoint.data_config.update(key, value)

    # Create dataset and model
    dataset = instantiate_dataset(checkpoint.data_config)
    model = checkpoint.create_model(dataset, weight_name=cfg.weight_name)
    log.info(model)
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.batch_size,
        cfg.shuffle,
        cfg.num_workers,
        False,
    )
    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    # Run inference
    if not os.path.exists(cfg.output_path):
        os.makedirs(cfg.output_path)
    run(model, dataset, device, cfg.output_path)
def test_best_metric(self):
    self.run_path = os.path.join(DIR, "checkpt")
    if not os.path.exists(self.run_path):
        os.makedirs(self.run_path)

    model_checkpoint = ModelCheckpoint(self.run_path, self.model_name, "test", run_config=self.config, resume=False)
    model = MockModel()
    optimal_state = model.state.item()

    # Best "acc" so far, reported at the selection stage ("test")
    mock_metrics = {"current_metrics": {"acc": 12}, "stage": "test", "epoch": 10}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    # A worse "acc" at the selection stage must not overwrite the best weights
    model.state[0] = 2
    mock_metrics = {"current_metrics": {"acc": 0}, "stage": "test", "epoch": 11}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    # Higher "acc" values at the "train" stage must not affect the best weights either
    mock_metrics = {"current_metrics": {"acc": 10}, "stage": "train", "epoch": 11}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    mock_metrics = {"current_metrics": {"acc": 15}, "stage": "train", "epoch": 11}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    ckp = torch.load(os.path.join(self.run_path, self.model_name + ".pt"))
    self.assertEqual(ckp["models"]["best_acc"]["state"].item(), optimal_state)
    self.assertEqual(ckp["models"]["latest"]["state"].item(), model.state.item())
def test_model_ckpt_using_pointnet2ms(self):
    # Create a checkpoint
    name = "model"
    self.run_path = os.path.join(DIR, "checkpt")
    print(self.run_path)
    if not os.path.exists(self.run_path):
        os.makedirs(self.run_path)

    model_checkpoint = ModelCheckpoint(self.run_path, name, "test", run_config=self.config, resume=False)
    dataset = MockDatasetGeometric(5)
    model = instantiate_model(self.config, dataset)
    model.set_input(dataset[0], "cpu")
    model.instantiate_optimizers(self.config)

    mock_metrics = {"current_metrics": {"acc": 12}, "stage": "test", "epoch": 10}
    model_checkpoint.save_best_models_under_current_metrics(model, mock_metrics)

    # Load checkpoint and initialize model
    model_checkpoint = ModelCheckpoint(self.run_path, name, "test", self.config, resume=True)
    model2 = model_checkpoint.create_model(dataset, weight_name="acc")

    self.assertEqual(str(model.optimizer.__class__.__name__), str(model2.optimizer.__class__.__name__))
    self.assertEqual(model.optimizer.defaults, model2.optimizer.defaults)
    self.assertEqual(model.schedulers["lr_scheduler"].state_dict(), model2.schedulers["lr_scheduler"].state_dict())
    self.assertEqual(model.schedulers["bn_scheduler"].state_dict(), model2.schedulers["bn_scheduler"].state_dict())

    shutil.rmtree(self.run_path)
    remove(os.path.join(ROOT, "{}.pt".format(name)))
    remove(os.path.join(DIR, "{}.pt".format(name)))
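# A minimal sketch of how the checkpoint tests above could be executed, assuming they
# are defined as methods of a unittest.TestCase subclass and that `unittest` is
# imported at module level (both are assumptions, not shown in the original snippets):
if __name__ == "__main__":
    unittest.main()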
def test_epoch(model: BaseModel, dataset, device, tracker: BaseTracker, checkpoint: ModelCheckpoint, log):
    model.eval()
    tracker.reset("test")
    loader = dataset.test_dataloader()
    with Ctq(loader) as tq_test_loader:
        for data in tq_test_loader:
            data = data.to(device)
            with torch.no_grad():
                model.set_input(data)
                model.forward()
                tracker.track(model)

            tq_test_loader.set_postfix(**tracker.get_metrics(), color=COLORS.TEST_COLOR)

    metrics = tracker.publish()
    tracker.print_summary()
    checkpoint.save_best_models_under_current_metrics(model, metrics)
def train_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device: str,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    early_break: bool,
):
    model.train()
    tracker.reset("train")
    visualizer.reset(epoch, "train")
    train_loader = dataset.train_dataloader

    iter_data_time = time.time()
    with Ctq(train_loader) as tq_train_loader:
        for i, data in enumerate(tq_train_loader):
            model.set_input(data, device)
            t_data = time.time() - iter_data_time

            iter_start_time = time.time()
            model.optimize_parameters(epoch, dataset.batch_size)
            if i % 10 == 0:
                tracker.track(model)

            tq_train_loader.set_postfix(
                **tracker.get_metrics(),
                data_loading=float(t_data),
                iteration=float(time.time() - iter_start_time),
                color=COLORS.TRAIN_COLOR,
            )

            if visualizer.is_active:
                visualizer.save_visuals(model.get_current_visuals())

            iter_data_time = time.time()

            if early_break:
                break

    metrics = tracker.publish(epoch)
    checkpoint.save_best_models_under_current_metrics(model, metrics)
    log.info("Learning rate = %f" % model.learning_rate)
def main(cfg):
    OmegaConf.set_struct(cfg, False)

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.enable_cudnn

    # Checkpoint
    checkpoint = ModelCheckpoint(cfg.checkpoint_dir, cfg.model_name, cfg.weight_name, strict=True)

    # Create dataset and model
    dataset = instantiate_dataset(cfg.data)
    model = checkpoint.create_model(dataset, weight_name=cfg.weight_name)
    log.info(model)
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))
    log.info(dataset)

    model.eval()
    if cfg.enable_dropout:
        model.enable_dropout_in_eval()
    model = model.to(device)

    # Run feature extraction
    output_path = os.path.join(cfg.checkpoint_dir, cfg.data.name, "features")
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)
    run(model, dataset, device, output_path, cfg)
def test_epoch(
    epoch: int,
    model: BaseModel,
    dataset,
    device,
    tracker: BaseTracker,
    checkpoint: ModelCheckpoint,
    visualizer: Visualizer,
    early_break: bool,
):
    model.eval()

    loaders = dataset.test_dataloaders()
    for idx, loader in enumerate(loaders):
        stage_name = dataset.get_test_dataset_name(idx)
        tracker.reset(stage_name)
        visualizer.reset(epoch, stage_name)
        with Ctq(loader) as tq_test_loader:
            for data in tq_test_loader:
                data = data.to(device)
                with torch.no_grad():
                    model.set_input(data)
                    model.forward()
                    tracker.track(model)

                tq_test_loader.set_postfix(**tracker.get_metrics(), color=COLORS.TEST_COLOR)

                if visualizer.is_active:
                    visualizer.save_visuals(model.get_current_visuals())

                if early_break:
                    break

        metrics = tracker.publish(epoch)
        tracker.print_summary()
        checkpoint.save_best_models_under_current_metrics(model, metrics)
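# A hypothetical sketch of a run() driver tying the epoch functions above together.
# This is not the original implementation: checkpoint.start_epoch, cfg.debugging.early_break
# and the has_val_loader / has_test_loaders attributes are assumptions made for illustration.
def run(cfg, model, dataset, device, tracker, checkpoint, visualizer):
    for epoch in range(checkpoint.start_epoch, cfg.training.epochs):
        log.info("EPOCH %i / %i", epoch, cfg.training.epochs)
        train_epoch(epoch, model, dataset, device, tracker, checkpoint, visualizer, cfg.debugging.early_break)
        if dataset.has_val_loader:
            eval_epoch(epoch, model, dataset, device, tracker, checkpoint, visualizer, cfg.debugging.early_break)
        if dataset.has_test_loaders:
            test_epoch(epoch, model, dataset, device, tracker, checkpoint, visualizer, cfg.debugging.early_break)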
def main(cfg):
    OmegaConf.set_struct(cfg, False)  # This allows getattr and hasattr methods to function correctly
    if cfg.pretty_print:
        print(cfg.pretty())

    # Get device
    device = torch.device("cuda" if (torch.cuda.is_available() and cfg.training.cuda) else "cpu")
    log.info("DEVICE : {}".format(device))

    # Enable CUDNN BACKEND
    torch.backends.cudnn.enabled = cfg.training.enable_cudnn

    # Start Wandb if public
    launch_wandb(cfg, cfg.wandb.public and cfg.wandb.log)

    # Checkpoint
    checkpoint = ModelCheckpoint(
        cfg.training.checkpoint_dir,
        cfg.model_name,
        cfg.training.weight_name,
        run_config=cfg,
        resume=bool(cfg.training.checkpoint_dir),
    )

    # Create model and datasets
    if not checkpoint.is_empty:
        dataset = instantiate_dataset(checkpoint.data_config)
        model = checkpoint.create_model(dataset, weight_name=cfg.training.weight_name)
    else:
        dataset = instantiate_dataset(cfg.data)
        model = instantiate_model(cfg, dataset)
        model.instantiate_optimizers(cfg)
    log.info(model)
    model.log_optimizers()
    log.info("Model size = %i", sum(param.numel() for param in model.parameters() if param.requires_grad))

    # Set dataloaders
    dataset.create_dataloaders(
        model,
        cfg.training.batch_size,
        cfg.training.shuffle,
        cfg.training.num_workers,
        cfg.training.precompute_multi_scale,
    )
    log.info(dataset)

    # Choose selection stage
    selection_stage = getattr(cfg, "selection_stage", "")
    checkpoint.selection_stage = dataset.resolve_saving_stage(selection_stage)
    tracker: BaseTracker = dataset.get_tracker(model, dataset, cfg.wandb.log, cfg.tensorboard.log)

    launch_wandb(cfg, not cfg.wandb.public and cfg.wandb.log)

    # Run training / evaluation
    model = model.to(device)
    visualizer = Visualizer(cfg.visualization, dataset.num_batches, dataset.batch_size, os.getcwd())
    run(cfg, model, dataset, device, tracker, checkpoint, visualizer)
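# A minimal, hypothetical entry point for the training script above, assuming the
# configuration is composed with Hydra. The config_path / config_name values and the
# wrapper name run_cli are assumptions, not taken from the original code.
import hydra


@hydra.main(config_path="conf", config_name="config")
def run_cli(cfg):
    main(cfg)


if __name__ == "__main__":
    run_cli()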