def fit(self, dl_train: DataLoader, dl_test: DataLoader,
        num_epochs, checkpoints: str = None,
        early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension.
    :param early_stopping: Whether to stop training early if there is no
        test accuracy improvement for this number of consecutive epochs.
        ``None`` or ``0`` disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to ``train_epoch`` and
        ``test_epoch``. A ``verbose`` entry here overrides the per-epoch
        verbosity derived from ``print_every``.
    :return: A FitResult object containing train and test losses per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Only report progress on every `print_every`-th epoch and on the
        # last one.
        verbose = epoch % print_every == 0 or epoch == num_epochs - 1
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Caller-supplied kwargs take precedence over the derived verbosity.
        epoch_kw = {'verbose': verbose, **kw}

        # `average` is a helper defined outside this block; it reduces the
        # per-batch losses of one epoch to a single value.
        losses, acc = self.train_epoch(dl_train, **epoch_kw)
        train_loss.append(average(losses))
        train_acc.append(acc)

        losses, acc = self.test_epoch(dl_test, **epoch_kw)
        test_loss.append(average(losses))
        test_acc.append(acc)

        # Count the epoch only once both passes have completed.
        actual_num_epochs += 1

        if best_acc is None or acc > best_acc:
            best_acc = acc
            # An improvement restarts the patience window.
            epochs_without_improvement = 0
            if checkpoints is not None:
                # NOTE(review): relies on the model object exposing a
                # `save` method; plain torch.nn.Module does not -- confirm.
                self.model.save(checkpoints)
        else:
            epochs_without_improvement += 1
            # Stop only after `early_stopping` consecutive epochs without
            # improvement, not on the first one.
            if (early_stopping
                    and epochs_without_improvement >= early_stopping):
                break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def load_experiment(filename):
    """
    Reads a saved experiment from a JSON file.

    The file is expected to hold an object with a ``'config'`` entry and a
    ``'results'`` entry; the latter is expanded as keyword arguments into
    a ``FitResult``.

    :param filename: Path of the JSON file to read.
    :return: A tuple ``(config, fit_res)``.
    """
    with open(filename, 'r') as fp:
        payload = json.load(fp)

    return payload['config'], FitResult(**payload['results'])
def fit(self, dl_train: DataLoader, dl_test: DataLoader,
        num_epochs, checkpoints: str = None,
        early_stopping: int = None,
        print_every=1, post_epoch_fn=None, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test loss improves. Should be a string containing a filename
        without extension; ``.pt`` is appended.
    :param early_stopping: Whether to stop training early if there is no
        test loss improvement for this number of epochs. ``None`` or ``0``
        disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param post_epoch_fn: A function to call after each epoch completes.
        It is also called for the epoch that triggers early stopping.
    :param kw: Extra keyword arguments forwarded to ``train_epoch`` and
        ``test_epoch``. A ``verbose`` entry here overrides the per-epoch
        verbosity derived from ``print_every``.
    :return: A FitResult object containing train and test losses per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0
    low_loss = None
    # The test loss must drop by at least this much to count as progress.
    min_delta = 0.05

    checkpoint_filename = None
    if checkpoints is not None:
        checkpoint_filename = f'{checkpoints}.pt'
        # parents=True so a nested checkpoint directory can be created.
        Path(os.path.dirname(checkpoint_filename)).mkdir(parents=True,
                                                         exist_ok=True)
        if os.path.isfile(checkpoint_filename):
            print(f'*** Loading checkpoint file {checkpoint_filename}')
            saved_state = torch.load(checkpoint_filename,
                                     map_location=self.device)
            best_acc = saved_state.get('best_acc', best_acc)
            epochs_without_improvement = \
                saved_state.get('ewi', epochs_without_improvement)
            self.model.load_state_dict(saved_state['model_state'])

    for epoch in range(num_epochs):
        verbose = epoch % print_every == 0 or epoch == num_epochs - 1
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Caller-supplied kwargs take precedence over the derived verbosity.
        epoch_kw = {'verbose': verbose, **kw}

        train_result = self.train_epoch(dl_train, **epoch_kw)
        train_loss.extend(train_result.losses)
        train_acc.append(train_result.accuracy)

        test_result = self.test_epoch(dl_test, **epoch_kw)
        test_loss.extend(test_result.losses)
        test_acc.append(test_result.accuracy)

        actual_num_epochs += 1

        # NOTE(review): `test_loss` accumulates every value in
        # `test_result.losses`, so `test_loss[-1]` is the last of those
        # values rather than an epoch average -- confirm this is intended.
        current_loss = test_loss[-1]

        if low_loss is None:
            # First epoch of this call: it becomes the baseline. The
            # no-improvement counter (possibly restored from a checkpoint)
            # is left untouched.
            low_loss = current_loss
            best_acc = test_acc[-1]
            save_checkpoint = True
        elif current_loss + min_delta <= low_loss:
            # Loss dropped by at least `min_delta`: real improvement.
            # Written with `<=` so a NaN loss never counts as improvement.
            low_loss = current_loss
            best_acc = test_acc[-1]
            epochs_without_improvement = 0
            save_checkpoint = True
        else:
            epochs_without_improvement += 1
            save_checkpoint = False

        # Save model checkpoint if requested
        if save_checkpoint and checkpoint_filename is not None:
            saved_state = dict(best_acc=best_acc,
                               ewi=epochs_without_improvement,
                               model_state=self.model.state_dict())
            torch.save(saved_state, checkpoint_filename)
            print(f'*** Saved checkpoint {checkpoint_filename} '
                  f'at epoch {epoch+1}')

        if post_epoch_fn:
            post_epoch_fn(epoch, train_result, test_result, verbose)

        # Checked last so the final epoch still gets its callback. The
        # truthiness guard avoids comparing an int against None.
        if early_stopping and epochs_without_improvement >= early_stopping:
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader,
        num_epochs, checkpoints: str = None,
        early_stopping: int = None,
        print_every=1, post_epoch_fn=None, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension; ``.pt`` is appended.
    :param early_stopping: Whether to stop training early if there is no
        test accuracy improvement for this number of epochs. ``None`` or
        a non-positive value disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param post_epoch_fn: A function to call after each epoch completes.
        It is also called for the epoch that triggers early stopping.
    :param kw: Extra keyword arguments forwarded to ``train_epoch`` and
        ``test_epoch``. A ``verbose`` entry here overrides the per-epoch
        verbosity derived from ``print_every``.
    :return: A FitResult object containing train and test losses per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    checkpoint_filename = None
    if checkpoints is not None:
        checkpoint_filename = f'{checkpoints}.pt'
        Path(os.path.dirname(checkpoint_filename)).mkdir(exist_ok=True)
        if os.path.isfile(checkpoint_filename):
            print(f'*** Loading checkpoint file {checkpoint_filename}')
            saved_state = torch.load(checkpoint_filename,
                                     map_location=self.device)
            best_acc = saved_state.get('best_acc', best_acc)
            epochs_without_improvement = \
                saved_state.get('ewi', epochs_without_improvement)
            self.model.load_state_dict(saved_state['model_state'])

    for epoch in range(num_epochs):
        verbose = epoch % print_every == 0 or epoch == num_epochs - 1
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Forward the caller's kwargs (previously dropped) together with
        # the derived verbosity; explicit caller values win.
        epoch_kw = {'verbose': verbose, **kw}

        epoch_train_loss, epoch_train_acc = \
            self.train_epoch(dl_train, **epoch_kw)
        train_loss += epoch_train_loss
        train_acc.append(epoch_train_acc)

        epoch_test_loss, epoch_test_acc = \
            self.test_epoch(dl_test, **epoch_kw)
        test_loss += epoch_test_loss
        test_acc.append(epoch_test_acc)

        actual_num_epochs += 1

        train_result = EpochResult(losses=epoch_train_loss,
                                   accuracy=epoch_train_acc)
        test_result = EpochResult(losses=epoch_test_loss,
                                  accuracy=epoch_test_acc)

        # Checkpoint only when the test accuracy improved, as documented,
        # so the file always holds the best model seen so far.
        save_checkpoint = best_acc is None or best_acc < epoch_test_acc
        if save_checkpoint:
            best_acc = epoch_test_acc
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Save model checkpoint if requested
        if save_checkpoint and checkpoint_filename is not None:
            saved_state = dict(best_acc=best_acc,
                               ewi=epochs_without_improvement,
                               model_state=self.model.state_dict())
            torch.save(saved_state, checkpoint_filename)
            print(f'*** Saved checkpoint {checkpoint_filename} '
                  f'at epoch {epoch+1}')

        if post_epoch_fn:
            post_epoch_fn(epoch, train_result, test_result, verbose)

        # Checked last so the final epoch still gets its callback.
        if (early_stopping is not None and early_stopping > 0
                and epochs_without_improvement >= early_stopping):
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader,
        num_epochs, checkpoints: str = None,
        early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension. Accepted for interface compatibility;
        this implementation does not write checkpoints.
    :param early_stopping: Whether to stop training early if there is no
        test loss improvement for this number of epochs. ``None`` or ``0``
        disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments; only ``max_batches`` is read and
        forwarded to ``train_epoch`` / ``test_epoch``.
    :return: A FitResult object containing train and test losses per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    best_loss = None
    epochs_without_improvement = 0

    # None when the caller did not supply a batch limit.
    batches = kw.get("max_batches")

    for epoch in range(num_epochs):
        verbose = epoch % print_every == 0 or epoch == num_epochs - 1
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        actual_num_epochs += 1
        train_res = self.train_epoch(dl_train, verbose=verbose,
                                     max_batches=batches)
        test_res = self.test_epoch(dl_test, verbose=verbose,
                                   max_batches=batches)

        # Store one mean loss per epoch.
        train_loss.append(sum(train_res.losses) / len(train_res.losses))
        train_acc.append(train_res.accuracy)
        test_loss.append(sum(test_res.losses) / len(test_res.losses))
        test_acc.append(test_res.accuracy)

        best_acc = max(best_acc if best_acc is not None else 0,
                       test_res.accuracy)

        # Compare against the best loss seen so far, not just the previous
        # epoch; otherwise an oscillating loss keeps resetting the counter
        # and training never stops early.
        epoch_loss = test_loss[-1]
        if best_loss is None or epoch_loss < best_loss:
            best_loss = epoch_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if (early_stopping
                    and epochs_without_improvement >= early_stopping):
                break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def run_experiment(run_name, out_dir='./results', seed=None,
                   # Training params
                   bs_train=128, bs_test=None, batches=100, epochs=100,
                   early_stopping=3, checkpoints=None, lr=1e-3, reg=1e-3,
                   # Model params
                   filters_per_layer=[64], layers_per_block=2, pool_every=2,
                   hidden_dims=[1024], ycn=False,
                   **kw):
    """
    Execute a single run of experiment 1 with a single configuration.

    :param run_name: The name of the run and output file to create.
    :param out_dir: Where to write the output to.
    :param seed: Seed for torch's RNG. A random one is drawn when ``None``;
        an explicit ``0`` is honoured.
    :param bs_train: Training batch size.
    :param bs_test: Test batch size; defaults to ``bs_train // 4``
        (at least 1).
    :param batches: Passed as ``max_batches`` to every train/test epoch.
    :param epochs: Maximal number of epochs to run.
    :param early_stopping: Stop after this many consecutive epochs without
        an improvement of the mean test loss. ``None`` or ``0`` disables
        early stopping.
    :param checkpoints: If not ``None``, path to which the model's
        ``state_dict`` is saved after every epoch.
    :param lr: Learning rate for the SGD optimizer.
    :param reg: Weight decay for the SGD optimizer.
    :param filters_per_layer: Filter counts; each entry is repeated
        ``layers_per_block`` times to build the model's ``filters`` list.
    :param layers_per_block: Repetitions of each ``filters_per_layer`` entry.
    :param pool_every: Forwarded to the model constructor.
    :param hidden_dims: Forwarded to the model constructor.
    :param ycn: Use ``models.YourCodeNet`` instead of
        ``models.ConvClassifier``.
    :param kw: Extra keyword arguments forwarded to the trainer's
        ``train_epoch`` / ``test_epoch``.
    """
    # `is None` rather than truthiness so that seed=0 is respected.
    if seed is None:
        seed = random.randint(0, 2**31)
    torch.manual_seed(seed)
    if not bs_test:
        bs_test = max([bs_train // 4, 1])
    # Snapshot of the resolved arguments; must be taken before any other
    # local variable is created so that only the configuration is saved.
    cfg = locals()

    tf = torchvision.transforms.ToTensor()
    ds_train = CIFAR10(root=DATA_DIR, download=True, train=True, transform=tf)
    ds_test = CIFAR10(root=DATA_DIR, download=True, train=False, transform=tf)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Select model class (experiment 1 or 2)
    model_cls = models.ConvClassifier if not ycn else models.YourCodeNet

    # Repeat every filter count `layers_per_block` times.
    filters_per_block = [filters
                         for filters in filters_per_layer
                         for _ in range(layers_per_block)]

    model = model_cls((3, 32, 32), 10, filters=filters_per_block,
                      pool_every=pool_every, hidden_dims=hidden_dims)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                weight_decay=reg)
    dl_train = torch.utils.data.DataLoader(ds_train, bs_train, shuffle=True)
    dl_test = torch.utils.data.DataLoader(ds_test, bs_test, shuffle=True)
    trainer = training.TorchTrainer(model, loss_fn, optimizer, device)

    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    best_loss = None
    epochs_without_improvement = 0

    for epoch in range(epochs):
        epoch_res_train = trainer.train_epoch(dl_train, max_batches=batches,
                                              **kw)
        epoch_res_test = trainer.test_epoch(dl_test, max_batches=batches,
                                            **kw)

        # Store one mean loss per epoch.
        train_losses_mean = (sum(epoch_res_train.losses)
                             / len(epoch_res_train.losses))
        test_losses_mean = (sum(epoch_res_test.losses)
                            / len(epoch_res_test.losses))

        actual_num_epochs += 1
        train_loss.append(train_losses_mean)
        train_acc.append(epoch_res_train.accuracy)
        test_loss.append(test_losses_mean)
        test_acc.append(epoch_res_test.accuracy)

        if checkpoints is not None:
            torch.save(model.state_dict(), checkpoints)

        # Track consecutive epochs without a new best mean test loss.
        # The counter was previously never updated, which made the
        # early-stopping check dead code for any positive patience.
        if best_loss is None or test_losses_mean < best_loss:
            best_loss = test_losses_mean
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if early_stopping and epochs_without_improvement >= early_stopping:
            break

    fit_res = FitResult(actual_num_epochs,
                        train_loss, train_acc, test_loss, test_acc)

    save_experiment(run_name, out_dir, cfg, fit_res)
def fit(self, dl_train: DataLoader, dl_test: DataLoader,
        num_epochs, checkpoints: str = None,
        early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. The string is used as-is as the path
        of the file the model object is written to.
    :param early_stopping: Whether to stop training early if there is no
        test accuracy improvement for this number of epochs. ``None`` or
        ``0`` disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to ``train_epoch`` and
        ``test_epoch``. A ``verbose`` entry here overrides the per-epoch
        verbosity derived from ``print_every``.
    :return: A FitResult object containing train and test losses per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        verbose = epoch % print_every == 0 or epoch == num_epochs - 1
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Forward the caller's kwargs (previously dropped) together with
        # the derived verbosity; explicit caller values win.
        epoch_kw = {'verbose': verbose, **kw}

        # Train for one epoch and record the results.
        train_res = self.train_epoch(dl_train=dl_train, **epoch_kw)
        train_loss += train_res.losses
        train_acc.append(train_res.accuracy)

        # Evaluate the model and record the results.
        test_res = self.test_epoch(dl_test=dl_test, **epoch_kw)
        test_loss += test_res.losses
        test_acc.append(test_res.accuracy)

        # The epoch ran and its results are recorded, so count it.
        actual_num_epochs += 1

        # NaN is the only value that is not equal to itself.
        if test_loss[-1] != test_loss[-1]:
            print("Loss is NaN.\nBreaking training.")
            break

        improved = best_acc is None or test_res.accuracy > best_acc
        if improved:
            best_acc = test_res.accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Checkpoint only when the test accuracy improved.
        if checkpoints is not None and improved:
            # Context manager closes the file even if torch.save raises.
            with open(checkpoints, 'wb') as checkpoint_file:
                torch.save(obj=self.model, f=checkpoint_file,
                           pickle_protocol=3)

        if early_stopping and epochs_without_improvement >= early_stopping:
            # No improvement in the last `early_stopping` epochs.
            print("Reached the Early Stop condition.\n"
                  "Stopping the training.")
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)