def test_model(trainer: ModelTrainer, x: torch.Tensor, y: torch.Tensor,
               opts: ExperimentOptions) -> float:
    """
    Evaluate the model's accuracy with a test solver and a test step size.

    :param trainer: model trainer
    :param x: batch data
    :param y: batch labels
    :param opts: experiment options used for training the Neural ODE
    :return: test solver accuracy
    """
    train_step_size = trainer.model.feature_ex_block.options['step_size']
    train_solver = trainer.model.feature_ex_block.solver
    test_solver, test_step_size = find_test_model(solver=train_solver,
                                                  step_size=train_step_size)
    # Temporarily swap in the test solver and its step size.
    trainer.model.feature_ex_block.options['step_size'] = test_step_size
    trainer.model.feature_ex_block.solver = test_solver
    with torch.no_grad():
        logits = trainer.forward_one_step(x)
        acc = calculate_accuracy(y=y, logits=logits,
                                 batch_size=opts.batch_size,
                                 num_classes=opts.num_classes)
    # Restore the training configuration.
    trainer.model.feature_ex_block.options['step_size'] = train_step_size
    trainer.model.feature_ex_block.solver = train_solver
    return acc
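# Illustration (hypothetical pairing; find_test_model is defined elsewhere in
# the repo): for a model trained with a fixed-step 'euler' solver at step size
# h, the returned test configuration could be a higher-order solver such as
# 'rk4' at step size h / 2, so that the evaluation run is more accurate than
# the training run.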
def test_model(trainer: ModelTrainer, x: torch.Tensor, y: torch.Tensor,
               opts: ExperimentOptions, train_solver_nfe: int) -> float:
    """
    Evaluate the model's accuracy with a test solver and a test tolerance.

    :param trainer: model trainer
    :param x: batch data
    :param y: batch labels
    :param opts: experiment options used for training the Neural ODE
    :param train_solver_nfe: number of function evaluations the train solver
        needed for the forward pass
    :return: test solver accuracy
    """
    train_tol = trainer.model.feature_ex_block.tol
    train_solver = trainer.model.feature_ex_block.solver
    test_solver, test_tol = find_test_model(solver=train_solver, tol=train_tol)
    while True:
        trainer.model.feature_ex_block.tol = test_tol
        trainer.model.feature_ex_block.solver = test_solver
        trainer.model.feature_ex_block.nfe = 0
        with torch.no_grad():
            logits = trainer.forward_one_step(x)
            acc = calculate_accuracy(y=y, logits=logits,
                                     batch_size=opts.batch_size,
                                     num_classes=opts.num_classes)
        test_solver_nfe = trainer.model.feature_ex_block.nfe
        trainer.model.feature_ex_block.nfe = 0
        if test_solver_nfe > train_solver_nfe:
            break
        # If the test solver takes no more steps than the train solver (which
        # can happen when the same solver is used for training and testing),
        # decrease the test tolerance further.
        test_tol = test_tol / 10
    trainer.model.feature_ex_block.tol = train_tol
    trainer.model.feature_ex_block.solver = train_solver
    return acc
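# Illustration of the loop above (hypothetical numbers): if training uses an
# adaptive solver at tol = 1e-3 and find_test_model returns the same solver,
# the test tolerance is lowered by a factor of 10 per round (1e-4, 1e-5, ...)
# until the test run needs strictly more function evaluations than the train
# run, i.e. until the reference solution is actually more accurate.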
def load_model(path: str, model_iter: int, use_gpu: bool = True
               ) -> Tuple[ModelTrainer, DataLoader, ExperimentOptions]:
    """Load a trained model checkpoint together with its experiment options
    and a dataloader for the test split."""
    opts_folder = 'options'
    file = 'opts.pkl'
    file_path = os.path.join(path, opts_folder, file)
    with open(file_path, "rb") as input_file:
        opts = pickle.load(input_file)
    opts.fixed_step_solver = True
    if not use_gpu:
        opts.use_gpu = False
    check_folder = 'checkpoints'
    test_opts = copy.deepcopy(opts)
    test_opts.split = 'test'
    test_dataloader = data.create_dataloader.create_dataloader(test_opts)
    file = f'model_iter_{model_iter}.pth'
    file_path = os.path.join(path, check_folder, file)
    trainer = ModelTrainer(opts)
    if use_gpu:
        state_dict = torch.load(file_path)
    else:
        state_dict = torch.load(file_path, map_location=torch.device('cpu'))
    trainer.model.load_state_dict(state_dict['model_state'])
    return trainer, test_dataloader, opts
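# A minimal usage sketch (the helper below is hypothetical and not part of the
# original code; path and checkpoint iteration are made-up examples):
def _demo_load_and_evaluate() -> None:
    # Load a checkpoint trained with a fixed-step solver and evaluate one
    # batch of the test split with the test solver configuration.
    trainer, test_dataloader, opts = load_model(
        path="experiments/example_run", model_iter=2000, use_gpu=False)
    x, y = next(iter(test_dataloader))
    acc = test_model(trainer=trainer, x=x, y=y, opts=opts)
    print(f"test solver accuracy: {acc:.3f}")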
def auto_train(self):
    self.get_pretrain_info()
    self.load_model()
    os.makedirs(self.model_save_folder, exist_ok=True)
    self.data_loader = dataset.DataLoader_Auto(self.data_src, label_dict,
                                               self.batch_size, self.size)
    # MT = trainer.ModelTrainer(train_type, self.silent_model, self.size)
    # MT.train_with_test(self.data_loader.dataloaders_dict, self.criterion,
    #                    self.optimizer_ft, self.epoch, self.is_inception,
    #                    self.model_save_path, self.log_save_path)
    ModelTrainer.train_sport_model(self.sport_model,
                                   self.data_loader.dataloaders_dict,
                                   self.criterion, self.optimizer_ft,
                                   self.epoch, self.is_inception,
                                   self.model_save_path, self.log_save_path)
    print("train model done, save model to %s" %
          os.path.join(self.model_save_path, self.model_str))
    self.record()
def adapt_tol(trainer: ModelTrainer, train_solver_acc: float,
              train_solver_nfe_dict: Dict[str, torch.Tensor],
              current_iter: int, x: torch.Tensor, y: torch.Tensor,
              opts: ExperimentOptions):
    """
    Adapt the tolerance used for training as described in Algorithm 3 in the
    paper. If the tolerance is too large to guarantee continuous dynamics, the
    tolerance used for training is decreased. Otherwise the tolerance is
    increased to achieve minimal training time.

    :param trainer: model trainer
    :param train_solver_acc: accuracy reached by the train solver
    :param train_solver_nfe_dict: log of forward/backward function evaluations
        per training iteration
    :param current_iter: current training iteration
    :param x: batch data
    :param y: batch labels
    :param opts: experiment options used for training the Neural ODE
    """
    # Forward NFE recorded for the current training iteration.
    train_solver_nfe = int(
        train_solver_nfe_dict["nfe_f"][current_iter].detach())
    threshold = opts.threshold
    test_solver_acc = test_model(trainer=trainer, x=x, y=y, opts=opts,
                                 train_solver_nfe=train_solver_nfe)
    dif = np.abs(test_solver_acc - train_solver_acc)
    if dif > threshold:
        # Accuracy gap too large: decrease the tolerance.
        tol = trainer.model.feature_ex_block.tol
        new_tol = 0.5 * tol
        trainer.model.feature_ex_block.tol = new_tol
    else:
        # Tentatively increase the tolerance and keep the increase only if
        # the accuracy does not drift.
        tol = trainer.model.feature_ex_block.tol
        new_tol = 1.1 * tol
        trainer.model.feature_ex_block.tol = new_tol
        logits = trainer.forward_one_step(x)
        acc = calculate_accuracy(y=y, logits=logits,
                                 batch_size=opts.batch_size,
                                 num_classes=opts.num_classes)
        dif = np.abs(test_solver_acc - acc)
        if dif > threshold:
            new_tol = tol
        elif (trainer.model.feature_ex_block.nfe ==
              train_solver_nfe_dict['nfe_f'][current_iter - 5:current_iter]
              ).all() and current_iter > 4:
            # Do not change the tolerance if the number of steps stays
            # constant.
            new_tol = tol
        trainer.model.feature_ex_block.tol = new_tol
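# Numerical illustration of the update above (hypothetical values): with
# tol = 1e-3 and an accuracy gap above the threshold, the tolerance is halved
# to 5e-4. With a gap below the threshold, tol = 1.1e-3 is trialled and kept
# only if the accuracy under the increased tolerance stays within the
# threshold and the step count has not been constant over the last 5
# iterations.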
def adapt_step_size(trainer: ModelTrainer, train_solver_acc: float,
                    x: torch.Tensor, y: torch.Tensor,
                    opts: ExperimentOptions):
    """
    Adapt the step size used for training as described in Algorithm 2 in the
    paper. If the step size is too large to guarantee continuous dynamics, the
    step size used for training is decreased. Otherwise the step size is
    increased to achieve minimal training time.

    :param trainer: model trainer
    :param train_solver_acc: accuracy reached by the train solver
    :param x: batch data
    :param y: batch labels
    :param opts: experiment options used for training the Neural ODE
    """
    threshold = opts.threshold
    max_steps = opts.max_steps
    test_solver_acc = test_model(trainer=trainer, x=x, y=y, opts=opts)
    dif = np.abs(test_solver_acc - train_solver_acc)
    if dif > threshold:
        # Accuracy gap too large: decrease the step size, but never beyond
        # max_steps solver steps.
        step_size = trainer.model.feature_ex_block.options['step_size']
        if 1 / step_size * 2 > max_steps:
            if int(1 / step_size) == max_steps:
                print("WARNING: Cannot decrease step size further!")
                new_step_size = step_size
            else:
                # Move halfway between the current number of steps and
                # max_steps.
                new_step_size = 2 * step_size / (max_steps * step_size + 1)
        else:
            new_step_size = 0.5 * step_size
        trainer.model.feature_ex_block.options['step_size'] = new_step_size
    else:
        # Tentatively increase the step size (capped at 1.0) and keep the
        # increase only if the accuracy does not drift.
        step_size = trainer.model.feature_ex_block.options['step_size']
        new_step_size = 1.1 * step_size
        if new_step_size > 1.0:
            new_step_size = 1.0
        trainer.model.feature_ex_block.options['step_size'] = new_step_size
        logits = trainer.forward_one_step(x)
        acc = calculate_accuracy(y=y, logits=logits,
                                 batch_size=opts.batch_size,
                                 num_classes=opts.num_classes)
        dif = np.abs(test_solver_acc - acc)
        if dif > threshold:
            new_step_size = step_size
        trainer.model.feature_ex_block.options['step_size'] = new_step_size
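# Worked example of the capped halving above (illustrative numbers): with
# step_size = 1/60 and max_steps = 100, plain halving would require 120 steps,
# so instead
#     new_step_size = 2 * (1/60) / (100 * (1/60) + 1) = 1/80,
# i.e. 80 steps, the midpoint between the current 60 steps and the cap of 100.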
def main():
    f = open(RESULTS_FILE, "a")
    f.write("Results from " + str(datetime.datetime.now()) + "\n")

    # Load config
    config = Config(config_default)

    # Load data
    load_data = CifarDataLoader(config)
    train_data = load_data.get_train_data()
    validation_data = load_data.get_test_data()

    optimizers = ['adam', 'adagrad', 'sgd']
    # optimizers = ['adam']

    # Loop over multiple optimizers
    # Without dropout
    for optimizer in optimizers:
        # Set optimizer
        config_default['optimizer'] = optimizer
        # Load config
        config = Config(config_default)

        # Create model
        temp = ConvNet(config)
        model = temp.get_model()  # without dropout

        # Train model
        trainer = ModelTrainer(model, train_data, validation_data,
                               config)  # without dropout
        trainer.train()

        # Save trained model
        model_name = 'cnn_' + optimizer + '.h5'  # without dropout
        save_model = os.path.join(SAVE_DIR, model_name)
        trainer.save(save_model)

        # Print the results
        print("optimizer: ", optimizer)
        print("Without dropout")
        print("loss: ", trainer.loss)
        print("validation loss: ", trainer.val_loss)

        f.write("optimizer: " + optimizer + "\n\n")
        f.write("Without dropout \n")
        f.write("loss: " + str(trainer.loss) + "\n")
        f.write("validation loss: " + str(trainer.val_loss) + "\n")
        f.write("\n")

    # Loop over multiple optimizers
    # With dropout
    for optimizer in optimizers:
        # Set optimizer
        config_default['optimizer'] = optimizer
        # Load config
        config = Config(config_default)

        # Create model
        temp = ConvNetDropout(config)
        model_do = temp.get_model()  # with dropout

        # Train model
        trainer_do = ModelTrainer(model_do, train_data, validation_data,
                                  config)  # with dropout
        trainer_do.train()

        # Save trained model
        model_do_name = 'cnn_dropout' + optimizer + '.h5'  # with dropout
        save_model_do = os.path.join(SAVE_DIR, model_do_name)
        trainer_do.save(save_model_do)

        # Print the results
        print("optimizer: ", optimizer)
        print("With dropout")
        print("loss: ", trainer_do.loss)
        print("validation loss: ", trainer_do.val_loss)

        f.write("optimizer: " + optimizer + "\n\n")
        f.write("With dropout \n")
        f.write("loss: " + str(trainer_do.loss) + "\n")
        f.write("validation loss: " + str(trainer_do.val_loss) + "\n")
        f.write("\n")

    f.write("\n\n\n")
    f.close()
class TrainModel:

    def __init__(self):
        parser = argparse.ArgumentParser(
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
        parser = initialize(parser)
        opts, unknown = parser.parse_known_args()
        self.opts = ExperimentOptions(opts)
        self.data_generator = self._get_data_generator()
        self.test_dataloader = self._get_test_dataloader()
        self.train_acc = None
        self.test_acc = None
        self.nfe_f = None
        self.nfe_b = None
        self.loss = None
        self.acc_log = {
            "train": torch.empty(self.opts.niter),
            "test": torch.empty(self.opts.niter),
        }
        self.loss_log = torch.empty(self.opts.niter)
        self.nfe_log = {
            "nfe_f": torch.empty(self.opts.niter),
            "nfe_b": torch.empty(self.opts.niter),
        }
        self.trainer = ModelTrainer(self.opts)
        if self.opts.use_gpu:
            self.trainer.model.cuda()
        # By default the device is cpu
        self.device = torch.device("cpu")
        if self.opts.use_gpu:
            self.device = torch.device("cuda:" + str(self.opts.gpu_ids[0]))
        # Initialize the summary writer
        if self.opts.use_tensorboard:
            self.writer = SummaryWriter(log_dir=self.opts.tensorboard_dir)

    def run(self):
        torch.cuda.empty_cache()
        # Set random seed
        torch.manual_seed(self.opts.random_seed)
        loss_function = torch.nn.CrossEntropyLoss().to(self.device)
        # Time input for the ODE
        t = torch.as_tensor([0.0, 1.0]).to(self.device)
        print("Starting training....")
        if self.opts.use_adaption_algo:
            self._initialize_adaption_algo()
        for current_iter in range(self.opts.niter):
            self._iterate_one_training_step(current_iter, loss_function, t)
        if self.opts.evaluate_with_dif_solver:
            results = evaluate_with_dif_solver(
                trainer=self.trainer,
                test_dataloader=self.test_dataloader,
                opts=self.opts,
                device=self.device,
            )
            torch.save(
                results,
                os.path.join(
                    self.opts.experiment_dir,
                    f"eval_with_dif_solver_iter_{self.opts.niter - 1}.pt",
                ),
            )
        plot_results(self.opts)

    def _get_data_generator(self) -> Generator:
        # Load the dataset
        dataloader = data.create_dataloader.create_dataloader(self.opts)
        print("\n{} dataloader of size {} was created\n".format(
            self.opts.dataset.upper(), len(dataloader)))
        # Wrap pytorch's dataloader in a generator function
        return inf_generator(dataloader)

    def _get_test_dataloader(self) -> DataLoader:
        test_opts = copy.deepcopy(self.opts)
        test_opts.split = "test"
        return data.create_dataloader.create_dataloader(test_opts)

    def _initialize_adaption_algo(self):
        x, _ = next(self.data_generator)
        x = x.to(self.device)
        if self.opts.fixed_step_solver:
            step_size = find_initial_step_size(
                mymodel=self.trainer.model,
                batch_data=x,
                order=return_order(self.opts.solver),
            )
            self.trainer.model.feature_ex_block.options["step_size"] = step_size
        else:
            tol = self.opts.initial_tol
            self.trainer.model.feature_ex_block.tol = tol

    def _iterate_one_training_step(self, current_iter: int,
                                   loss_function: _Loss, t: torch.Tensor):
        self.trainer.model.train()
        self.trainer.optimizer.zero_grad()
        self.trainer.model.feature_ex_block.nfe = 0
        x, y = next(self.data_generator)
        x = x.to(self.device)
        y = y.to(self.device)
        logits = self.trainer.forward_one_step(x, t)
        self.loss = loss_function(logits, y)
        self.nfe_f = self.trainer.model.feature_ex_block.nfe
        self.trainer.model.feature_ex_block.nfe = 0
        self.loss.backward()
        self.nfe_b = self.trainer.model.feature_ex_block.nfe
        self.trainer.model.feature_ex_block.nfe = 0
        self.train_acc = calculate_accuracy(y=y, logits=logits,
                                            batch_size=self.opts.batch_size,
                                            num_classes=self.opts.num_classes)
        if self.opts.evaluate_test_acc:
            with torch.no_grad():
                self.trainer.model.eval()
                self.test_acc = evaluate_model(self.trainer.model,
                                               self.test_dataloader,
                                               self.opts, self.device)
        self._save_current_state(current_iter)
        if self.opts.use_adaption_algo:
            self._apply_step_adaption_algo(current_iter, self.train_acc, x, y)
        if self.opts.use_tensorboard:
            self._create_tensorboard_logs(current_iter)
        self._print_training_info(current_iter)
        self.trainer.optimizer.step()
        torch.cuda.empty_cache()

    def _save_current_state(self, current_iter: int):
        self.nfe_log["nfe_f"][current_iter] = self.nfe_f
        self.nfe_log["nfe_b"][current_iter] = self.nfe_b
        self.loss_log[current_iter] = self.loss.cpu().detach()
        self.acc_log["train"][current_iter] = self.train_acc
        if self.opts.evaluate_test_acc:
            self.acc_log["test"][current_iter] = self.test_acc
        torch.save(self.loss_log,
                   os.path.join(self.opts.experiment_dir, "loss_log.pt"))
        torch.save(self.acc_log,
                   os.path.join(self.opts.experiment_dir, "acc_log.pt"))
        torch.save(self.nfe_log,
                   os.path.join(self.opts.experiment_dir, "nfe_log.pt"))
        # Save the current model
        if (current_iter + 1) % self.opts.model_checkpoint_freq == 0 or (
                current_iter + 1) == self.opts.niter:
            self.trainer.checkpoint_model_state(current_iter,
                                                self.opts.checkpoints_dir)

    def _create_tensorboard_logs(self, current_iter: int):
        self.writer.add_scalar("ACC/train", self.train_acc, current_iter + 1)
        self.writer.add_scalar("NFE/forward", self.nfe_f, current_iter + 1)
        self.writer.add_scalar("NFE/backward", self.nfe_b, current_iter + 1)

    def _print_training_info(self, current_iter: int):
        print_str = ("Iter {} \t NFE-F {:.2f} \t NFE-B {:.2f}"
                     "\t Train Acc {:.3f}%")
        print_vars = (current_iter + 1, self.nfe_f, self.nfe_b,
                      self.train_acc)
        if self.test_acc is not None:
            print_str = print_str + "\t Test Acc {:.3f}%"
            print_vars = print_vars + (self.test_acc,)
        print(
            print_str.format(*print_vars),
            file=open(os.path.join(self.opts.experiment_dir, "output.txt"),
                      "a"),
        )

    def _apply_step_adaption_algo(self, current_iter: int, train_acc: float,
                                  x: torch.Tensor, y: torch.Tensor):
        if (current_iter + 1) % self.opts.adaption_interval == 0:
            if self.opts.fixed_step_solver:
                adapt_step_size(
                    trainer=self.trainer,
                    train_solver_acc=train_acc,
                    x=x,
                    y=y,
                    opts=self.opts,
                )
            else:
                adapt_tol(
                    trainer=self.trainer,
                    train_solver_acc=train_acc,
                    x=x,
                    y=y,
                    opts=self.opts,
                    train_solver_nfe_dict=self.nfe_log,
                    current_iter=current_iter,
                )
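# Entry-point sketch (an assumption about how the class is meant to be used;
# the original script may expose a different entry point):
if __name__ == "__main__":
    TrainModel().run()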