def load_experiment(filename):
    """
    Reads a saved experiment from a JSON file and rebuilds its fit results.

    :param filename: Path of the JSON file to read. Its top-level 'results'
        entry must be a mapping whose keys are accepted by FitResult.
    :return: The FitResult built from the stored 'results' entry.
    """
    with open(filename, 'r') as json_file:
        payload = json.load(json_file)

    return FitResult(**payload['results'])
def load_experiment(filename):
    """
    Reads a saved experiment from a JSON file.

    :param filename: Path of the JSON file to read. It must contain a
        top-level "config" entry and a "results" mapping whose keys are
        accepted by FitResult.
    :return: A tuple (config, fit_res) holding the stored configuration and
        the FitResult rebuilt from the stored results.
    """
    with open(filename, "r") as json_file:
        payload = json.load(json_file)

    stored_results = FitResult(**payload["results"])
    return payload["config"], stored_results
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Scaffold of the multi-epoch training loop; the per-epoch work is not
    implemented yet, so the first iteration raises NotImplementedError.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension.
    :param early_stopping: Whether to stop training early if there is no
        test loss improvement for this number of epochs.
    :param print_every: Print progress every this number of epochs.
    :return: A FitResult object containing train and test losses per epoch.
    :raises NotImplementedError: On the first epoch, until the loop body is
        filled in.
    """
    # Per-epoch metric histories, to be filled by the implementation.
    train_loss, train_acc = [], []
    test_loss, test_acc = [], []

    # Bookkeeping reserved for the implementation below.
    actual_num_epochs = 0
    best_acc = None
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Verbose on every print_every-th epoch and on the last one;
        # pass this to train/test_epoch.
        verbose = (epoch % print_every == 0) or (epoch == num_epochs - 1)
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # TODO: Train & evaluate for one epoch
        #  - Use the train/test_epoch methods.
        #  - Save losses and accuracies in the lists above.
        #  - Implement early stopping.
        #  - Optional: Implement checkpoints with torch.save(), writing to
        #    the file specified by the checkpoints argument.
        raise NotImplementedError()

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def _metrics_to_python(values):
    """Return `values` as a list of plain Python numbers.

    Elements exposing `.item()` (e.g. single-element tensors) are unwrapped;
    anything else is passed through unchanged.
    """
    return [v.item() if hasattr(v, 'item') else v for v in values]


def run_experiment(run_name, out_dir='./results', seed=None, device=None,
                   # Training params
                   bs_train=128, bs_test=None, batches=100, epochs=100,
                   early_stopping=3, checkpoints=None, lr=1e-3, reg=1e-3,
                   # Model params
                   filters_per_layer=[64], layers_per_block=2, pool_every=2,
                   hidden_dims=[1024], model_type='cnn',
                   **kw):
    """
    Executes a single run of a Part3 experiment with a single configuration.

    Trains a model selected from MODEL_TYPES on CIFAR-10 with cross-entropy
    loss and Adam, then stores the configuration and the fit results through
    save_experiment.

    :param run_name: Name of the run, forwarded to save_experiment.
    :param out_dir: Output directory, forwarded to save_experiment.
    :param seed: Seed for torch. When None, a random seed is drawn.
    :param device: Torch device; when not given, 'cuda' is used if available,
        otherwise 'cpu'.
    :param bs_train: Training batch size.
    :param bs_test: Test batch size; defaults to max(bs_train // 4, 1).
    :param batches: Forwarded to the trainer as `max_batches`.
    :param epochs: Maximal number of epochs.
    :param early_stopping: Forwarded to the trainer's `fit`.
    :param checkpoints: Checkpoint filename forwarded to the trainer's `fit`;
        None disables checkpointing.
    :param lr: Learning rate of the optimizer.
    :param reg: Weight decay of the optimizer.
    :param filters_per_layer: Number of filters per block; each entry is
        repeated `layers_per_block` times. The default list is never mutated.
    :param layers_per_block: Number of layers that share a filter count.
    :param pool_every: Forwarded to the model constructor.
    :param hidden_dims: Forwarded to the model constructor. The default list
        is never mutated.
    :param model_type: Key into MODEL_TYPES.
    :param kw: Extra parameters; recorded in the saved configuration only.
    :raises ValueError: If `model_type` is not a key of MODEL_TYPES.
    """
    # `is None` rather than truthiness so that an explicit seed of 0 is kept.
    if seed is None:
        seed = random.randint(0, 2 ** 31)
    torch.manual_seed(seed)
    if not bs_test:
        bs_test = max([bs_train // 4, 1])

    # Snapshot of the run parameters. Copying decouples the saved config from
    # the live frame dictionary, so locals created below (datasets, model,
    # ...) can never leak into it.
    cfg = dict(locals())

    # Validate before the (potentially slow) dataset download.
    if model_type not in MODEL_TYPES:
        raise ValueError(f"Unknown model type: {model_type}")
    model_cls = MODEL_TYPES[model_type]

    tf = torchvision.transforms.ToTensor()
    ds_train = CIFAR10(root=DATA_DIR, download=True, train=True, transform=tf)
    ds_test = CIFAR10(root=DATA_DIR, download=True, train=False, transform=tf)

    if not device:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The input size is taken from the first training sample.
    x0, _ = ds_train[0]
    in_size = x0.shape
    num_classes = 10

    # Repeat every filter count once per layer of its block.
    filters = [n_filters
               for n_filters in filters_per_layer
               for _ in range(layers_per_block)]

    model = model_cls(in_size=in_size, out_classes=num_classes,
                      channels=filters, pool_every=pool_every,
                      hidden_dims=hidden_dims).to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=reg)
    trainer = training.TorchTrainer(model, loss_fn, optimizer, device)

    dl_train = torch.utils.data.DataLoader(ds_train, bs_train, shuffle=False)
    dl_test = torch.utils.data.DataLoader(ds_test, bs_test, shuffle=False)

    # Forward the caller's `checkpoints` value instead of a fixed path, so
    # independent runs never load or overwrite each other's checkpoint.
    fit_tmp = trainer.fit(dl_train, dl_test, num_epochs=epochs,
                          checkpoints=checkpoints,
                          early_stopping=early_stopping,
                          max_batches=batches)

    # Store plain numbers in the result, whatever the trainer returned.
    fit_res = FitResult(num_epochs=fit_tmp.num_epochs,
                        train_loss=_metrics_to_python(fit_tmp.train_loss),
                        train_acc=_metrics_to_python(fit_tmp.train_acc),
                        test_loss=_metrics_to_python(fit_tmp.test_loss),
                        test_acc=_metrics_to_python(fit_tmp.test_acc))

    save_experiment(run_name, out_dir, cfg, fit_res)
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, post_epoch_fn=None, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension. An existing '<checkpoints>.pt' file is
        loaded before training starts.
    :param early_stopping: Whether to stop training early if the mean test
        loss got worse than in the preceding epoch for this number of
        consecutive epochs.
    :param print_every: Print progress every this number of epochs.
    :param post_epoch_fn: A function to call after each epoch completes, as
        post_epoch_fn(epoch, train_result, test_result, verbose).
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run and the
        mean train/test loss and accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    checkpoint_filename = None
    if checkpoints is not None:
        checkpoint_filename = f'{checkpoints}.pt'
        Path(os.path.dirname(checkpoint_filename)).mkdir(exist_ok=True)
        if os.path.isfile(checkpoint_filename):
            print(f'*** Loading checkpoint file {checkpoint_filename}')
            saved_state = torch.load(checkpoint_filename,
                                     map_location=self.device)
            best_acc = saved_state.get('best_acc', best_acc)
            epochs_without_improvement = \
                saved_state.get('ewi', epochs_without_improvement)
            self.model.load_state_dict(saved_state['model_state'])

    for epoch in range(num_epochs):
        save_checkpoint = False
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Forward the computed verbosity; an explicit `verbose` supplied by
        # the caller through **kw still takes precedence.
        # Assumes train_epoch/test_epoch accept a `verbose` keyword, as the
        # scaffold comment above indicates - TODO confirm.
        epoch_kw = {'verbose': verbose, **kw}

        # Train. Index 0 of an epoch result is used as the per-batch losses
        # and index 1 as the accuracy.
        train_result = self.train_epoch(dl_train, **epoch_kw)
        train_acc.append(train_result[1])
        train_loss.append(
            torch.tensor(train_result[0]).sum().item() / len(train_result[0]))

        # Test
        test_result = self.test_epoch(dl_test, **epoch_kw)
        curr_test_accuracy = test_result[1]
        test_acc.append(curr_test_accuracy)
        curr_test_loss = \
            torch.tensor(test_result[0]).sum().item() / len(test_result[0])
        test_loss.append(curr_test_loss)

        actual_num_epochs += 1

        # Early-stopping counter: compare with the PREVIOUS epoch's loss.
        # test_loss[-1] is the value appended just above, so the preceding
        # epoch lives at index -2.
        if len(test_loss) > 1 and test_loss[-2] < curr_test_loss:
            epochs_without_improvement += 1
        else:
            epochs_without_improvement = 0

        # Checkpoint only when the test accuracy improves, so the file
        # always holds the best model seen so far.
        if best_acc is None or best_acc < curr_test_accuracy:
            best_acc = curr_test_accuracy
            save_checkpoint = True

        # Save model checkpoint if requested
        if save_checkpoint and checkpoint_filename is not None:
            saved_state = dict(best_acc=best_acc,
                               ewi=epochs_without_improvement,
                               model_state=self.model.state_dict())
            torch.save(saved_state, checkpoint_filename)
            print(f'*** Saved checkpoint {checkpoint_filename} '
                  f'at epoch {epoch+1}')

        if post_epoch_fn:
            post_epoch_fn(epoch, train_result, test_result, verbose)

        # Stop last, so the final epoch is still checkpointed and reported.
        if early_stopping is not None and \
                epochs_without_improvement >= early_stopping:
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: When not None, the whole model is written with
        torch.save() to this path every time the test set accuracy improves.
    :param early_stopping: Whether to stop training early if the mean test
        loss did not drop below its best value for this number of epochs.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run and the
        mean train/test loss and accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0
    # Early stopping follows the test loss, as documented above.
    min_loss = None

    for epoch in range(num_epochs):
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        train_res = self.train_epoch(dl_train, verbose=verbose, **kw)
        train_loss.append(sum(train_res.losses) / len(train_res.losses))
        train_acc.append(train_res.accuracy)

        test_res = self.test_epoch(dl_test, verbose=verbose, **kw)
        # Computed once and reused by the early-stopping check below.
        epoch_test_loss = sum(test_res.losses) / len(test_res.losses)
        test_loss.append(epoch_test_loss)
        test_acc.append(test_res.accuracy)

        # Count the epoch as soon as its metrics are recorded, so the
        # returned FitResult reports how many epochs actually ran.
        actual_num_epochs += 1

        # Checkpoint on every test-accuracy improvement.
        if best_acc is None or test_res.accuracy > best_acc:
            best_acc = test_res.accuracy
            if checkpoints is not None:
                torch.save(self.model, checkpoints)

        if early_stopping is not None:
            if min_loss is None or epoch_test_loss < min_loss:
                min_loss = epoch_test_loss
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
                if epochs_without_improvement >= early_stopping:
                    break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    After every training epoch the learning-rate scheduler is stepped. The
    test set is evaluated only while `self.autoencoder_training` is None.

    NOTE(review): this block was reconstructed from a flattened source; the
    nesting of the statements that follow the
    `if self.autoencoder_training is None:` line is an assumption and
    should be confirmed against the original file.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Not used by this implementation.
    :param early_stopping: Stop once the epochs-without-improvement counter
        equals this number; None disables early stopping.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run and the
        train and test losses and accuracies per epoch.
    """
    # NOTE(review): `dl` and `best_acc` are assigned but never read.
    dl = dl_train
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []
    best_acc = None
    epochs_without_improvement = 0
    # NOTE(review): the scheduler is dereferenced unconditionally here,
    # while the loop below guards it with `if self.scheduler:` - a missing
    # scheduler would fail on this line.
    current_learning_rate = self.scheduler.get_lr()
    for epoch in range(num_epochs):
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f'--- EPOCH {epoch + 1}/{num_epochs} ---', verbose)
        # Report a learning-rate change made by the scheduler since the
        # last check.
        if current_learning_rate != self.scheduler.get_lr():
            current_learning_rate = self.scheduler.get_lr()
            self._print(
                f' learning rate has been changed to {current_learning_rate}',
                verbose)
        actual_num_epochs += 1
        # The epoch index is passed positionally to train_epoch/test_epoch.
        train = self.train_epoch(dl_train, epoch, verbose=verbose, **kw)
        # One mean loss per epoch.
        train_loss += [torch.mean(torch.stack(train.losses)).item()]
        # A placeholder accuracy of 0 is recorded while autoencoder_training
        # is truthy.
        if self.autoencoder_training:
            train_acc += [0]
        else:
            train_acc += [train.accuracy.item()]
        if self.scheduler:
            self.scheduler.step()
        if self.autoencoder_training is None:
            test = self.test_epoch(dl_test, epoch, verbose=verbose, **kw)
            test_loss += [torch.mean(torch.stack(test.losses)).item()]
            # NOTE(review): under the assumed nesting this condition is
            # always false, because autoencoder_training is None here.
            if self.autoencoder_training:
                test_acc += [0]
            else:
                test_acc += [test.accuracy.item()]
            # Improvement means: lower test loss than the previous epoch.
            # NOTE(review): the first epoch has nothing to compare with and
            # is therefore counted as "no improvement".
            if epoch >= 1 and test_loss[-1] < test_loss[-2]:
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1
            if early_stopping is not None and early_stopping == epochs_without_improvement:
                break
    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(
    self,
    dl_train: DataLoader,
    dl_test: DataLoader,
    num_epochs,
    checkpoints: str = None,
    early_stopping: int = None,
    print_every=1,
    post_epoch_fn=None,
    **kw,
) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy improves. Should be a string containing a
        filename without extension. An existing '<checkpoints>.pt' file is
        loaded before training starts.
    :param early_stopping: Whether to stop training early if the test
        accuracy did not exceed its best value for this number of epochs.
    :param print_every: Print progress every this number of epochs.
    :param post_epoch_fn: A function to call after each epoch completes, as
        post_epoch_fn(epoch, train_result, test_result, verbose). It is not
        called for the epoch that triggers early stopping.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run, the
        per-batch train/test losses of all epochs and the train/test
        accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    checkpoint_filename = None
    if checkpoints is not None:
        checkpoint_filename = f"{checkpoints}.pt"
        Path(os.path.dirname(checkpoint_filename)).mkdir(exist_ok=True)
        if os.path.isfile(checkpoint_filename):
            print(f"*** Loading checkpoint file {checkpoint_filename}")
            saved_state = torch.load(checkpoint_filename,
                                     map_location=self.device)
            best_acc = saved_state.get("best_acc", best_acc)
            epochs_without_improvement = saved_state.get(
                "ewi", epochs_without_improvement)
            self.model.load_state_dict(saved_state["model_state"])

    for epoch in range(num_epochs):
        save_checkpoint = False
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f"--- EPOCH {epoch+1}/{num_epochs} ---", verbose)

        self.optimizer.zero_grad()
        train_result = self.train_epoch(dl_train, verbose=verbose, **kw)
        test_result = self.test_epoch(dl_test, verbose=verbose, **kw)
        actual_num_epochs += 1

        # Losses are accumulated per batch, accuracies per epoch.
        train_loss.extend(train_result.losses)
        train_acc.append(train_result.accuracy)
        test_loss.extend(test_result.losses)
        test_acc.append(test_result.accuracy)

        # Track the best accuracy and the number of epochs since it was
        # reached. Both values are persisted in (and restored from) the
        # checkpoint, so they must reflect the real training state.
        if best_acc is None or test_result.accuracy > best_acc:
            best_acc = test_result.accuracy
            epochs_without_improvement = 0
            save_checkpoint = True
        else:
            epochs_without_improvement += 1

        # An improving epoch resets the counter, so stopping here can never
        # skip a pending checkpoint.
        if early_stopping and epochs_without_improvement >= early_stopping:
            break

        # Save model checkpoint if requested
        if save_checkpoint and checkpoint_filename is not None:
            saved_state = dict(
                best_acc=best_acc,
                ewi=epochs_without_improvement,
                model_state=self.model.state_dict(),
            )
            torch.save(saved_state, checkpoint_filename)
            print(f"*** Saved checkpoint {checkpoint_filename} "
                  f"at epoch {epoch+1}")

        if post_epoch_fn:
            post_epoch_fn(epoch, train_result, test_result, verbose)

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Accepted for interface compatibility; this
        implementation does not write checkpoints.
    :param early_stopping: Whether to stop training early if the test
        accuracy stayed below its best value for this number of
        consecutive epochs.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run, the
        per-batch train/test losses of all epochs and the train/test
        accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    # Best test accuracy so far; starting at 0.0 lets the first epoch set it.
    best_acc = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        actual_num_epochs += 1

        # Train first and evaluate afterwards, so the test metrics recorded
        # for an epoch describe the model produced by that epoch.
        train_res = self.train_epoch(dl_train, verbose=verbose, **kw)
        train_loss.extend(train_res.losses)
        train_acc.append(train_res.accuracy)

        test_res = self.test_epoch(dl_test, verbose=verbose, **kw)
        curr_test_accuracy = test_res.accuracy
        test_loss.extend(test_res.losses)
        test_acc.append(curr_test_accuracy)

        # Early stopping on test accuracy; an equal accuracy resets the
        # counter.
        if curr_test_accuracy < best_acc:
            epochs_without_improvement += 1
            if early_stopping and \
                    epochs_without_improvement >= early_stopping:
                break
        else:
            best_acc = curr_test_accuracy
            epochs_without_improvement = 0

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader = None,
        num_epochs=100, checkpoints: str = None, early_stopping: int = 25,
        start_epoch: int = 0, print_every=1, post_epoch_fn=None,
        never_print=False, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set. When None, no evaluation,
        checkpointing or early stopping takes place.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Whether to save model to file every time the
        test set accuracy reaches a new best. Should be a string containing
        a filename without extension. An existing '<checkpoints>.pt' file is
        loaded before training starts.
    :param early_stopping: Whether to stop training early if the test
        accuracy stayed below its best value for this number of epochs.
    :param start_epoch: Index of the first epoch; epochs are numbered
        start_epoch .. start_epoch + num_epochs - 1.
    :param print_every: Print progress every this number of epochs.
    :param post_epoch_fn: A function to call after each epoch completes, as
        post_epoch_fn(epoch, train_result, test_result, verbose).
        test_result is None when dl_test is None.
    :param never_print: When True, verbose output is disabled for all epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run and the
        mean train/test loss and accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0

    checkpoint_filename = None
    if checkpoints is not None:
        checkpoint_filename = f'{checkpoints}.pt'
        Path(os.path.dirname(checkpoint_filename)).mkdir(exist_ok=True)
        if os.path.isfile(checkpoint_filename):
            print(f'*** Loading checkpoint file {checkpoint_filename}')
            saved_state = torch.load(checkpoint_filename,
                                     map_location=self.device)
            best_acc = saved_state.get('best_acc', best_acc)
            epochs_without_improvement = \
                saved_state.get('ewi', epochs_without_improvement)
            self.model.load_state_dict(saved_state['model_state'])

    # One past the last epoch index, taking the start offset into account.
    end_epoch = start_epoch + num_epochs

    for epoch in range(start_epoch, end_epoch):
        save_checkpoint = False
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == end_epoch - 1:
            if not never_print:
                verbose = True
        self._print(f'--- EPOCH {epoch+1}/{end_epoch} ---', verbose)

        kw['verbose'] = verbose

        res = self.train_epoch(dl_train, **kw)
        train_loss.append(torch.mean(torch.tensor(res.losses)).item())
        train_acc.append(res.accuracy)
        train_result = EpochResult(train_loss[-1], train_acc[-1])
        actual_num_epochs += 1

        # Stays None without a test set, so post_epoch_fn can still be
        # called.
        test_result = None
        if dl_test is not None:
            res = self.test_epoch(dl_test, **kw)
            test_loss.append(torch.mean(torch.tensor(res.losses)).item())
            test_acc.append(res.accuracy)
            test_result = EpochResult(test_loss[-1], test_acc[-1])

            # Matching or beating the best accuracy counts as improvement.
            if best_acc is None or res.accuracy >= best_acc:
                best_acc = res.accuracy
                epochs_without_improvement = 0
                save_checkpoint = True
            else:
                epochs_without_improvement += 1

        if self.scheduler is not None:
            self.scheduler.step()

        # An improving epoch resets the counter, so stopping here can never
        # skip a pending checkpoint.
        if early_stopping:
            if epochs_without_improvement >= early_stopping:
                break

        # Save model checkpoint if requested. The state is stored as a dict,
        # which is the format the loading code above expects.
        if save_checkpoint and checkpoint_filename is not None:
            saved_state = dict(best_acc=best_acc,
                               ewi=epochs_without_improvement,
                               model_state=self.model.state_dict())
            torch.save(saved_state, checkpoint_filename)
            print(f'*** Saved checkpoint {checkpoint_filename} '
                  f'at epoch {epoch+1}')

        if post_epoch_fn:
            post_epoch_fn(epoch, train_result, test_result, verbose)

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(
    self,
    dl_train: DataLoader,
    dl_test: DataLoader,
    num_epochs,
    checkpoints: str = None,
    early_stopping: int = None,
    print_every=1,
    **kw,
) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: Accepted for interface compatibility; this
        implementation does not write checkpoints.
    :param early_stopping: Whether to stop training early if the mean test
        loss did not drop below its best value for this number of epochs.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run, the
        per-batch train/test losses of all epochs and the train/test
        accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    epochs_without_improvement = 0
    best_test_loss = None

    for epoch in range(num_epochs):
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f"--- EPOCH {epoch+1}/{num_epochs} ---", verbose)

        # Forward the computed verbosity; an explicit `verbose` supplied by
        # the caller through **kw still takes precedence.
        # Assumes train_epoch/test_epoch accept a `verbose` keyword, as the
        # scaffold comment above indicates - TODO confirm.
        epoch_kw = {"verbose": verbose, **kw}

        actual_num_epochs += 1
        train_losses, train_accuracy = self.train_epoch(dl_train, **epoch_kw)
        test_losses, test_accuracy = self.test_epoch(dl_test, **epoch_kw)

        train_loss.extend(train_losses)
        train_acc.append(train_accuracy)
        test_loss.extend(test_losses)
        test_acc.append(test_accuracy)

        # Without any test batch there is no loss to compare, so the
        # early-stopping state is left untouched for this epoch.
        if not test_losses:
            continue

        # The per-batch losses may be tensors or plain numbers.
        if isinstance(test_losses[0], torch.Tensor):
            current_test_loss = torch.mean(torch.stack(test_losses)).item()
        else:
            current_test_loss = sum(test_losses) / len(test_losses)

        if best_test_loss is None or current_test_loss < best_test_loss:
            best_test_loss = current_test_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if early_stopping is not None and \
                epochs_without_improvement >= early_stopping:
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)
def fit(self, dl_train: DataLoader, dl_test: DataLoader, num_epochs,
        checkpoints: str = None, early_stopping: int = None,
        print_every=1, **kw) -> FitResult:
    """
    Trains the model for multiple epochs with a given training set,
    and calculates validation loss over a given validation set.

    :param dl_train: Dataloader for the training set.
    :param dl_test: Dataloader for the test set. It is only evaluated,
        never trained on.
    :param num_epochs: Number of epochs to train for.
    :param checkpoints: When given, the whole model is written with
        torch.save() to this path every time the test set accuracy improves.
    :param early_stopping: Whether to stop training early if the mean test
        loss did not drop below that of the preceding epoch for this number
        of consecutive epochs.
    :param print_every: Print progress every this number of epochs.
    :param kw: Extra keyword arguments forwarded to train_epoch/test_epoch.
    :return: A FitResult object containing the number of epochs run, the
        per-batch train/test losses of all epochs (as Python numbers) and
        the train/test accuracy per epoch.
    """
    actual_num_epochs = 0
    train_loss, train_acc, test_loss, test_acc = [], [], [], []

    best_acc = None
    epochs_without_improvement = 0
    # Mean test loss of the preceding epoch; None before the first epoch.
    prev_epoch_test_loss = None

    for epoch in range(num_epochs):
        verbose = False  # pass this to train/test_epoch.
        if epoch % print_every == 0 or epoch == num_epochs - 1:
            verbose = True
        self._print(f'--- EPOCH {epoch+1}/{num_epochs} ---', verbose)

        # Forward the computed verbosity; an explicit `verbose` supplied by
        # the caller through **kw still takes precedence.
        # Assumes train_epoch/test_epoch accept a `verbose` keyword, as the
        # scaffold comment above indicates - TODO confirm.
        epoch_kw = {'verbose': verbose, **kw}

        train_res = self.train_epoch(dl_train, **epoch_kw)
        # The test set goes through test_epoch: running train_epoch on it
        # would fit the model to the evaluation data.
        test_res = self.test_epoch(dl_test, **epoch_kw)
        actual_num_epochs += 1

        train_acc.append(train_res.accuracy)
        test_acc.append(test_res.accuracy)

        # Per-batch losses are stored as plain Python numbers.
        train_loss.extend(
            [batch_loss.item() for batch_loss in train_res.losses])
        epoch_test_losses = \
            [batch_loss.item() for batch_loss in test_res.losses]
        test_loss.extend(epoch_test_losses)

        # Early-stopping counter: this epoch's mean test loss against the
        # preceding epoch's. The first epoch has nothing to compare with.
        epoch_test_loss = sum(epoch_test_losses) / len(epoch_test_losses)
        if prev_epoch_test_loss is not None and \
                prev_epoch_test_loss <= epoch_test_loss:
            epochs_without_improvement += 1
        else:
            epochs_without_improvement = 0
        prev_epoch_test_loss = epoch_test_loss

        # Checkpoint on every test-accuracy improvement.
        if best_acc is None or best_acc < test_res.accuracy:
            best_acc = test_res.accuracy
            if checkpoints:
                torch.save(self.model, checkpoints)

        if early_stopping and epochs_without_improvement >= early_stopping:
            break

    return FitResult(actual_num_epochs,
                     train_loss, train_acc, test_loss, test_acc)