def clf_fit(net: nn.Module, crit: nn.Module, opti: torch.optim.Optimizer, tloader, vloader, **kwargs):
    """ Train a classification network and checkpoint the weights with the lowest validation loss. """
    epochs = kwargs['epochs']
    lr = kwargs['lr']
    lr_step = kwargs['lr_step']
    lr_decay = kwargs['lr_decay']
    # use kwargs.get so a missing key does not raise, and treat an explicit seed of 0 as valid
    seed = kwargs.get('seed') if kwargs.get('seed') is not None else np.random.randint(100)
    bloss = float('inf')
    torch.manual_seed(seed)
    np.random.seed(seed)
    print('[INFO] Setting torch seed to {}'.format(seed))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tlist = []
    vlist = []
    for e in range(1, epochs + 1):
        # lr_step may be a fixed interval (int) or an explicit list of epochs
        if lr_step is not None and isinstance(lr_step, int) and e % lr_step == 0:
            lr = adjust_lr(opti, lr, lr_decay)
        if lr_step is not None and isinstance(lr_step, list) and e in lr_step:
            lr = adjust_lr(opti, lr, lr_decay)
        tacc, tloss = clf_train(net, tloader, opti, crit, topk=kwargs['topk'])
        vacc, vloss = clf_test(net, vloader, crit, topk=kwargs['topk'])
        tlist.append((tacc, tloss))
        vlist.append((vacc, vloss))
        if vloss < bloss:
            bloss = vloss
            torch.save({
                'net': net.state_dict(),
                'opti': opti.state_dict()
            }, 'best_net-{}-{:.2f}.pth'.format(e, vacc[0]))
        # TODO: tloss and vloss need a recheck.
        print('Epoch: {}/{} - Train Loss: {:.3f} - Train Acc@1: {:.3f}'
              ' - Train Acc@5: {:.3f} - Val Loss: {:.3f} - Val Acc@1: {:.3f}'
              ' - Val Acc@5: {:.3f}'.format(e, epochs, tloss, tacc[0], tacc[1],
                                            vloss, vacc[0], vacc[1]))
        # Do not call net.cpu() here: that would silently move the model off the GPU mid-training.
        torch.save({
            'net': net.state_dict(),
            'opti': opti.state_dict()
        }, 'net-{}-{:.2f}.pth'.format(e, vacc[0]))
    return tlist, vlist
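# clf_fit calls adjust_lr(), which is not defined in this snippet. A minimal sketch of what it is
# assumed to do (scale the learning rate by the decay factor on every parameter group and return
# the new value); the project's real helper may differ.
def adjust_lr(opti, lr, lr_decay):
    new_lr = lr * lr_decay
    for param_group in opti.param_groups:
        param_group['lr'] = new_lr
    print('[INFO] Adjusting learning rate to {}'.format(new_lr))
    return new_lr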
def model_save(model: torch.nn.Module, encoder_optimizer: torch.optim, decoder_optimizer: torch.optim,
               loss, latent_dim, ckpt_dir):
    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'loss': loss,
            'latent_dim': latent_dim,
            'model': model
        }, ckpt_dir)
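# Hedged counterpart to model_save: a minimal sketch of restoring the checkpoint written above.
# load_model_checkpoint is a hypothetical helper name; it assumes the same dictionary keys used
# in model_save and that the optimizers are already constructed.
def load_model_checkpoint(ckpt_dir, model, encoder_optimizer, decoder_optimizer):
    checkpoint = torch.load(ckpt_dir, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    encoder_optimizer.load_state_dict(checkpoint['encoder_optimizer_state_dict'])
    decoder_optimizer.load_state_dict(checkpoint['decoder_optimizer_state_dict'])
    return checkpoint['loss'], checkpoint['latent_dim']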
def train_on_dataset( train_dataset: Dataset, val_dataset, model: Tree2Seq, criterion: nn.modules.loss, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, clip_norm: int, logger: AbstractLogger, start_batch_id: int = 0, log_step: int = -1, eval_step: int = -1, save_step: int = -1 ): train_epoch_info = LearningInfo() batch_iterator_pb = tqdm(range(start_batch_id, len(train_dataset)), total=len(train_dataset)) batch_iterator_pb.update(start_batch_id) batch_iterator_pb.refresh() for batch_id in batch_iterator_pb: graph, labels = train_dataset[batch_id] batch_info = train_on_batch(model, criterion, optimizer, scheduler, graph, labels, clip_norm) train_epoch_info.accumulate_info(batch_info) if is_step_match(batch_id, log_step): logger.log(train_epoch_info.get_state_dict(), batch_id, is_train=True) train_epoch_info = LearningInfo() if is_step_match(batch_id, save_step): train_dump = { 'state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'scheduler_state_dict': scheduler.state_dict(), 'batch_id': batch_id } logger.save_model(f'batch_{batch_id}.pt', train_dump) if is_step_match(batch_id, eval_step): eval_info = evaluate_on_dataset(val_dataset, model, criterion) logger.log(eval_info.get_state_dict(), batch_id, is_train=False) if train_epoch_info.batch_processed > 0: logger.log(train_epoch_info.get_state_dict(), len(train_dataset) - 1, is_train=True)
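# train_on_dataset relies on is_step_match(), which is not shown in this snippet. A plausible
# minimal implementation, assuming a step value of -1 disables the corresponding logging,
# saving, or evaluation; the original helper may use a slightly different convention.
def is_step_match(current_step: int, template: int) -> bool:
    return template > 0 and (current_step + 1) % template == 0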
def save_checkpoint(self, model: torch.nn.Module, optimizer: torch.optim,
                    is_best: bool, save_state: bool = True):
    if save_state:
        state = {'model_state_dict': model.state_dict(),
                 'optimizer_state_dict': optimizer.state_dict()}
        torch.save(state, self.state_dir)
    if is_best:
        torch.save(model.state_dict(), self.model_dir)
def train(self, model: torchvision.models, criterion: torch.nn, optimizer: torch.optim, train_dataset: ImageFoldersDataset, test_dataset: ImageFoldersDataset, n_epochs: int = 25, batch_size: int = 32, shuffle: bool = True, *args, **kwargs): # TODO(lukasz): add scheduler for learning rate metrics = defaultdict(list) best_score_test = 0. for epoch in range(n_epochs): model.train() running_loss = 0. for data_idx, data in enumerate( train_dataset.loader( batch_size=batch_size, shuffle=shuffle # TODO(lukasz): add sampler for imbalanced dataset )): inputs, labels = data inputs = inputs.to(self.device) labels = labels.to(self.device) optimizer.zero_grad() model = model.to(self.device) outputs = model(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() running_loss += loss.item() # TODO(lukasz): add as argument if data_idx % 100 == 0: msg = '[%d, %5d] loss: %.3f' print(msg % (epoch + 1, data_idx + 1, running_loss / 100)) running_loss = 0. score_train = self.score(model, train_dataset) score_test = self.score(model, test_dataset) metrics['score_train'].append(score_train) metrics['score_test'].append(score_test) msg = '[%d] train score: %.3f, test score: %.3f' print(msg % (epoch + 1, score_train, score_test)) # save model (make sure that Google Colab do not destroy your results) if score_test > best_score_test: torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict() }, self.save_experiment) best_score_test = score_test self.metrics = metrics return self
def save_ckpt(model: nn.Module, optimizer: torch.optim, checkpoint_path: str) -> None:
    """ Save model and optimizer checkpoint to continue training """
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, checkpoint_path)
    print("Saved model and optimizer state to {}".format(checkpoint_path))
def save_checkpoint(_net: torch.nn.Module, _optimizer: torch.optim, _epoch, _best_acc, _ckpt_path):
    checkpoint = {
        'net': _net.state_dict(),
        'optimizer': _optimizer.state_dict(),
        'epoch': _epoch,
        'best_acc': _best_acc
    }
    torch.save(checkpoint, _ckpt_path)
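# A minimal sketch of the matching load side for the checkpoint above (the loader is not part of
# the original snippet); the key names simply mirror those used in save_checkpoint.
def load_checkpoint(_net: torch.nn.Module, _optimizer, _ckpt_path, map_location='cpu'):
    checkpoint = torch.load(_ckpt_path, map_location=map_location)
    _net.load_state_dict(checkpoint['net'])
    _optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'], checkpoint['best_acc']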
def checkpoint_save(epoch: int, nn_model: model, nn_optimizer: torch.optim, training_loss: list,
                    validation_loss: list, model_name: str, locations: dict, args):
    """ Save model checkpoints """
    checkpoint_name = model_name.replace('.tar', '_chkepo_{0}.tar'.format(str(epoch).zfill(3)))
    torch.save({'epoch': epoch,
                'model_state_dict': nn_model.state_dict(),
                'optimizer_state_dict': nn_optimizer.state_dict(),
                'training_loss': training_loss,
                'validation_loss': validation_loss,
                'arguments': args},
               locations['model_loc'] + '/' + checkpoint_name)
def saveCheckpoint(checkpoint_path, model: nn.Module, optimizer: optim,
                   scheduler: optim.lr_scheduler.MultiStepLR, epoch, feature=None):
    state = {
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
        'scheduler': scheduler.state_dict()
    }
    if feature:
        state['feature'] = feature.state_dict()
    torch.save(state, checkpoint_path)
    return
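# Sketch of a matching loadCheckpoint (not in the original source). It assumes the state keys
# written by saveCheckpoint above and that 'feature' is only present when one was saved.
def loadCheckpoint(checkpoint_path, model, optimizer, scheduler, feature=None):
    state = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(state['state_dict'])
    optimizer.load_state_dict(state['optimizer'])
    scheduler.load_state_dict(state['scheduler'])
    if feature is not None and 'feature' in state:
        feature.load_state_dict(state['feature'])
    return state['epoch']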
def train_and_evaluate(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader,
                       optimizer: optim.Optimizer, args) -> None:
    logger.info('begin training and evaluation')
    best_test_R2 = float('inf')
    train_len = len(train_loader)
    loss_summary = np.zeros((train_len * args.num_epochs))
    R2_summary = np.zeros(args.num_epochs)
    for epoch in range(args.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, args.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, train_loader, test_loader, args, epoch)
        test_metrics = evaluate(model, test_loader, args, epoch)
        R2_summary[epoch] = test_metrics['R2']
        # NOTE: this treats a *lower* value as better (initialised to +inf, compared with <=);
        # for a true R^2 score, higher is usually better, so double-check the metric's direction.
        is_best = R2_summary[epoch] <= best_test_R2
        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=epoch,
            is_best=is_best,
            checkpoint=args.model_dir)
        if is_best:
            logger.info('- Found new best R2')
            best_test_R2 = R2_summary[epoch]
            best_json_path = os.path.join(args.model_dir, 'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)
        logger.info('Current Best R2 is: %.5f' % best_test_R2)
        utils.plot_all_epoch(R2_summary[:epoch + 1], args.dataset + '_ND', args.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', args.plot_dir)
        last_json_path = os.path.join(args.model_dir, 'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
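# Several loops in this file call utils.save_checkpoint, which is not defined here. A rough
# sketch of one common implementation (write the state to a per-epoch file and copy it to
# 'best.pth.tar' when is_best is set); the project's real utils module may behave differently.
import os
import shutil

def save_checkpoint_sketch(state, epoch, is_best, checkpoint):
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'epoch_{}.pth.tar'.format(epoch))
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))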
def find_lr(dataloader, model, optimizer: torch.optim.Optimizer, criterion, device, num_steps,
            lr_min: float = 1e-7, lr_max: float = 10, beta: float = 0.98):
    model.to(device)
    # note: state_dict().copy() is a shallow copy; a deep copy would be safer if parameters
    # are modified in place before they are restored below
    optim_dict = optimizer.state_dict().copy()
    optimizer.param_groups[0]['lr'] = lr_min
    # num_steps = len(dataloader) - 1
    scheduler = LrSchedulerFinder(optimizer, lr_min, lr_max, num_steps)
    model_dict = model.state_dict().copy()
    losses = list()
    lrs = list()
    avg_loss = 0
    best_loss = 0
    for idx_batch, (data, label) in tqdm(enumerate(dataloader, 1), total=num_steps):
        if idx_batch == num_steps:
            break
        y, kl = model(data.to(device))
        loss = criterion(y, label.to(device), kl, 0)
        if np.isnan(loss.item()):
            # stop the sweep once the loss diverges to NaN
            break
        avg_loss = beta * avg_loss + (1 - beta) * loss.item()
        smooth_loss = avg_loss / (1 - beta ** idx_batch)
        if idx_batch > 1 and smooth_loss > 4 * best_loss:
            break
        if smooth_loss < best_loss or idx_batch == 1:
            best_loss = smooth_loss
        losses.append(smooth_loss)
        lrs.append(scheduler.get_lr()[0])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
    model.load_state_dict(model_dict)
    optimizer.load_state_dict(optim_dict)
    return np.array(lrs), np.array(losses)
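# find_lr uses LrSchedulerFinder, which is not defined in this snippet. Below is a minimal
# sketch of what such a scheduler could look like (a geometric sweep of the learning rate from
# lr_min to lr_max over num_steps steps); the original class may be implemented differently,
# and the name LrSchedulerFinderSketch is hypothetical.
class LrSchedulerFinderSketch:
    def __init__(self, optimizer, lr_min, lr_max, num_steps):
        self.optimizer = optimizer
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.num_steps = max(num_steps, 1)
        self.step_count = 0

    def get_lr(self):
        # geometric interpolation between lr_min and lr_max
        progress = self.step_count / self.num_steps
        return [self.lr_min * (self.lr_max / self.lr_min) ** progress
                for _ in self.optimizer.param_groups]

    def step(self):
        self.step_count += 1
        for lr, group in zip(self.get_lr(), self.optimizer.param_groups):
            group['lr'] = lr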
def save_model(model_directory: str, trained_model: models, class_to_idx: dict, optimizer: optim,
               arch: str, epochs=4, model_name: str = 'checkpoint.pth'):
    """
    Saves model to directory
    :param model_directory: a path where the model should be saved
    :param trained_model: the model to be saved
    :param class_to_idx: Dict with items (class_name, class_index)
    :param optimizer: the optimizer that has been used in training
    :param arch: name of the torchvision architecture used for the model backbone
    :param epochs: number of epochs used in training; could be used later
    :param model_name: checkpoint name
    :return:
    """
    # check for save directory
    if not os.path.isdir(model_directory):
        print(f'Directory {model_directory} does not exist. Creating...')
        os.makedirs(model_directory)
    trained_model.class_to_idx = class_to_idx
    model_state = {
        'epoch': epochs,
        'state_dict': trained_model.state_dict(),
        'optimizer_dict': optimizer.state_dict(),
        'classifier': trained_model.classifier,
        'class_to_idx': trained_model.class_to_idx,
        'arch': arch
    }
    save_location = f'{model_directory}/{model_name}'
    print(f"Saving checkpoint to {save_location}")
    torch.save(model_state, save_location)
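# A hedged sketch of reloading the checkpoint written by save_model. Rebuilding the backbone
# from 'arch' via torchvision and re-attaching the stored classifier is an assumption about how
# the original project restores models; adapt it to the actual architecture handling.
def load_model(checkpoint_path: str):
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model = getattr(models, checkpoint['arch'])(pretrained=True)
    model.classifier = checkpoint['classifier']
    model.load_state_dict(checkpoint['state_dict'])
    model.class_to_idx = checkpoint['class_to_idx']
    return model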
def train_and_evaluate(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader, optimizer: optim, loss_fn, params: utils.Params, restore_file: str = None) -> None: '''Train the model and evaluate every epoch. Args: model: (torch.nn.Module) the Deep AR model train_loader: load train data and labels test_loader: load test data and labels optimizer: (torch.optim) optimizer for parameters of model loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch params: (Params) hyperparameters restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) ''' # reload weights from restore_file if specified if restore_file is not None: restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar') logger.info('Restoring parameters from {}'.format(restore_path)) utils.load_checkpoint(restore_path, model, optimizer) logger.info('begin training and evaluation') best_test_ND = float('inf') train_len = len(train_loader) ND_summary = np.zeros(params.num_epochs) loss_summary = np.zeros((train_len * params.num_epochs)) for epoch in range(params.num_epochs): logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs)) loss_summary[epoch * train_len:(epoch + 1) * train_len] = train( model, optimizer, loss_fn, train_loader, test_loader, params, epoch) test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling) ND_summary[epoch] = test_metrics['ND'] is_best = ND_summary[epoch] <= best_test_ND # Save weights utils.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict() }, epoch=epoch, is_best=is_best, checkpoint=params.model_dir) if is_best: logger.info('- Found new best ND') best_test_ND = ND_summary[epoch] best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json') utils.save_dict_to_json(test_metrics, best_json_path) logger.info('Current Best ND is: %.5f' % best_test_ND) utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir) utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir) last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json') utils.save_dict_to_json(test_metrics, last_json_path) if args.save_best: f = open('./param_search.txt', 'w') f.write('-----------\n') list_of_params = args.search_params.split(',') print_params = '' for param in list_of_params: param_value = getattr(params, param) print_params += f'{param}: {param_value:.2f}' print_params = print_params[:-1] f.write(print_params + '\n') f.write('Best ND: ' + str(best_test_ND) + '\n') logger.info(print_params) logger.info(f'Best ND: {best_test_ND}') f.close() utils.plot_all_epoch(ND_summary, print_params + '_ND', location=params.plot_dir) utils.plot_all_epoch(loss_summary, print_params + '_loss', location=params.plot_dir)
def _train_helper(self, model: torchvision.models.resnet.ResNet, dataloaders: Dict[str, torch.utils.data.DataLoader], dataset_sizes: Dict[str, int], loss_fn, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, start_epoch: int, writer: IO) -> None: """ Function for learning ResNet. Args: model: ResNet model for learning. dataloaders: Dataloaders for IO pipeline. dataset_sizes: Sizes of the learning and validation dataset. loss_fn: Metric used for calculating loss. optimizer: Optimizer to use for gradient descent. scheduler: Scheduler to use for learning rate decay. start_epoch: Starting epoch for learning. writer: Writer to write logging information. """ learning_init_time = time.time() # Initialize all the tensors to be used in learning and validation. # Do this outside the loop since it will be written over entirely at each # epoch and doesn't need to be reallocated each time. train_all_labels = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() train_all_predicts = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() val_all_labels = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() val_all_predicts = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() early_stopper = EarlyStopper(patience=self._early_stopping_patience, mode=EarlyStopper.Mode.MAX) if self._resume_checkpoint and self._last_val_acc: best_val_acc = self._last_val_acc else: best_val_acc = 0. # Train for specified number of epochs. for epoch in range(start_epoch, self._num_epochs): epoch_init_time = time.time() # Training phase. model.train(mode=True) train_running_loss = 0.0 train_running_corrects = 0 # Train over all learning data. for idx, (train_inputs, true_labels) in enumerate(dataloaders["train"]): train_patches = train_inputs["patch"].to(device=self._device) train_x_coord = train_inputs["x_coord"].to(device=self._device) train_y_coord = train_inputs["y_coord"].to(device=self._device) true_labels = true_labels.to(device=self._device) optimizer.zero_grad() # Forward and backpropagation. with torch.set_grad_enabled(mode=True): train_logits = model(train_patches, train_x_coord, train_y_coord).squeeze(dim=1) train_loss = loss_fn(logits=train_logits, target=true_labels) train_loss.backward() optimizer.step() # Update learning diagnostics. train_running_loss += train_loss.item() * train_patches.size(0) pred_labels = self._extract_pred_labels(train_logits) train_running_corrects += torch.sum( pred_labels == true_labels.data, dtype=torch.double) start = idx * self._batch_size end = start + self._batch_size train_all_labels[start:end] = true_labels.detach().cpu() train_all_predicts[start:end] = pred_labels.detach().cpu() self._calculate_confusion_matrix( all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(), classes=self._classes, num_classes=self._num_classes) # Store learning diagnostics. train_loss = train_running_loss / dataset_sizes["train"] train_acc = train_running_corrects / dataset_sizes["train"] if torch.cuda.is_available(): torch.cuda.empty_cache() # Validation phase. model.train(mode=False) val_running_loss = 0.0 val_running_corrects = 0 # Feed forward over all the validation data. for idx, (val_inputs, val_labels) in enumerate(dataloaders["val"]): val_patches = val_inputs["patch"].to(device=self._device) val_x_coord = val_inputs["x_coord"].to(device=self._device) val_y_coord = val_inputs["y_coord"].to(device=self._device) val_labels = val_labels.to(device=self._device) # Feed forward. 
with torch.set_grad_enabled(mode=False): val_logits = model(val_patches, val_x_coord, val_y_coord).squeeze(dim=1) val_loss = loss_fn(logits=val_logits, target=val_labels) # Update validation diagnostics. val_running_loss += val_loss.item() * val_patches.size(0) pred_labels = self._extract_pred_labels(val_logits) val_running_corrects += torch.sum( pred_labels == val_labels.data, dtype=torch.double) start = idx * self._batch_size end = start + self._batch_size val_all_labels[start:end] = val_labels.detach().cpu() val_all_predicts[start:end] = pred_labels.detach().cpu() self._calculate_confusion_matrix( all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(), classes=self._classes, num_classes=self._num_classes) # Store validation diagnostics. val_loss = val_running_loss / dataset_sizes["val"] val_acc = val_running_corrects / dataset_sizes["val"] if torch.cuda.is_available(): torch.cuda.empty_cache() scheduler.step() current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Remaining things related to learning. if val_acc > best_val_acc: best_val_acc = val_acc best_model_ckpt_path = self._checkpoints_folder.joinpath( f"resnet{self._num_layers}_e{epoch}_va{val_acc:.5f}.pt") # Confirm the output directory exists. best_model_ckpt_path.parent.mkdir(parents=True, exist_ok=True) # Save the model as a state dictionary. torch.save(obj={ "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "epoch": epoch + 1 }, f=str(best_model_ckpt_path)) self._clean_ckpt_folder(best_model_ckpt_path) writer.write(f"{epoch},{train_loss:.4f}," f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n") # Print the diagnostics for each epoch. logging.info( f"Epoch {epoch} " f"with lr {current_lr:.15f}: " f"{self._format_time_period(epoch_init_time, time.time())} " f"t_loss: {train_loss:.4f} " f"t_acc: {train_acc:.4f} " f"v_loss: {val_loss:.4f} " f"v_acc: {val_acc:.4f}\n") early_stopper.update(val_acc) if early_stopper.is_stopping(): logging.info("Early stopping") break # Print learning information at the end. logging.info( f"\nlearning complete in " f"{self._format_time_period(learning_init_time, time.time())}")
def save_checkpoint( path: str, model: TEDD1104, optimizer_name: str, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, running_loss: float, total_batches: int, total_training_examples: int, acc_dev: float, epoch: int, fp16: bool, scaler: Optional[GradScaler], ) -> None: """ Save a checkpoint that allows to continue training the model in the future Input: - path: path where the model is going to be saved - model: TEDD1104 model to save - optimizer_name: Name of the optimizer used for training: SGD or Adam - optimizer: Optimizer used for training - acc_dev: Accuracy of the model in the development set - epoch: Num of epoch used to train the model - fp16: If the model uses FP16 - scaler: If the model uses FP16, the scaler used for training Output: """ dict_hyperparams: dict = { "sequence_size": model.sequence_size, "resnet": model.resnet, "pretrained_resnet": model.pretrained_resnet, "embedded_size": model.embedded_size, "hidden_size": model.hidden_size, "num_layers_lstm": model.num_layers_lstm, "bidirectional_lstm": model.bidirectional_lstm, "layers_out": model.layers_out, "dropout_cnn": model.dropout_cnn, "dropout_cnn_out": model.dropout_cnn_out, "dropout_lstm": model.dropout_lstm, "dropout_lstm_out": model.dropout_lstm_out, "fp16": fp16, } checkpoint = { "hyper_params": dict_hyperparams, "model": model.state_dict(), "optimizer_name": optimizer_name, "optimizer": optimizer.state_dict(), "scheduler": scheduler.state_dict(), "running_loss": running_loss, "total_batches": total_batches, "total_training_examples": total_training_examples, "acc_dev": acc_dev, "epoch": epoch, "scaler": None if not fp16 else scaler.state_dict(), } torch.save(checkpoint, path)
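# Hedged sketch of restoring the checkpoint written above into an already-constructed TEDD1104
# model (the loader is not part of the original snippet). load_checkpoint_sketch is a
# hypothetical name; the scaler is only restored when the checkpoint was written with fp16.
def load_checkpoint_sketch(path: str, model: TEDD1104, optimizer, scheduler, scaler=None):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    if checkpoint['hyper_params']['fp16'] and scaler is not None and checkpoint['scaler'] is not None:
        scaler.load_state_dict(checkpoint['scaler'])
    return checkpoint['epoch'], checkpoint['acc_dev'], checkpoint['total_batches'], checkpoint['total_training_examples']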
def train_and_evaluate(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader,
                       val_loader: DataLoader, optimizer: optim.Optimizer, loss_fn,
                       params: utils.Params, restore_file: str = None) -> None:
    # the '-> None' arrow is only an annotation documenting that this function returns None
    '''
    Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        val_loader: load validation data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    print('Begin training')
    print(model)
    train_len = len(train_loader)
    loss_summary = np.zeros((train_len * params.num_epochs))
    early_stopping = EarlyStopping(patience=5, verbose=True)
    for epoch in range(params.num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        # train
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params, epoch)
        print(f"train_loss: {np.mean(loss_summary[epoch * train_len:(epoch + 1) * train_len])}")
        # evaluate
        val_metrics, tmp_mu, tmp_sigma = evaluate(model, loss_fn, val_loader, params,
                                                  sample=params.sampling)
        # test_metrics = evaluate(model, loss_fn, test_loader, params, sample=params.sampling)
        # early stop
        early_stopping(val_metrics['test_loss'], model)
        if early_stopping.early_stop:
            print("Early stopping")
            # save weights
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optim_dict': optimizer.state_dict()
                },
                filepath=params.model_dir)
            break
def train_model(model: nn.Module, data_loaders: Dict[str, DataLoader], loss_func: callable,
                optimizer: optim.Optimizer, model_folder: str, tensorboard_folder: str, args, **kwargs):
    num_epochs = args.epochs
    phases = ['train', 'val', 'test']
    writer = SummaryWriter(tensorboard_folder)
    since = time.perf_counter()  # time.clock() was removed in Python 3.8
    # save_dict, best_rmse = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 100000
    save_dict, best_pcc = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 0
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.2, patience=5,
                                                     threshold=1e-3, min_lr=1e-6)
    try:
        for epoch in range(num_epochs):
            running_loss = {phase: 0.0 for phase in phases}
            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                steps, predictions, targets = 0, list(), list()
                tqdm_loader = tqdm(enumerate(data_loaders[phase]))
                for step, (features, truth_data) in tqdm_loader:
                    features = to_var(features, args.device)
                    truth_data = to_var(truth_data, args.device)
                    with torch.set_grad_enabled(phase == 'train'):
                        if args.lossinside:
                            loss, outputs = model(features, truth_data, args, loss_func=loss_func)
                        else:
                            outputs = model(features, args)
                            loss = loss_func(truth=truth_data, predict=outputs)
                            # loss = loss_func(outputs, truth_data)
                        if phase == 'train':
                            if torch.isnan(loss):
                                print("=============LOSS NAN============")
                                print(features)
                                print(truth_data)
                                print(outputs)
                            else:
                                optimizer.zero_grad()
                                loss.backward()
                                optimizer.step()
                    targets.append(truth_data.cpu().numpy())
                    with torch.no_grad():
                        predictions.append(outputs.cpu().detach().numpy())
                    # use loss.item() so the running loss does not keep the computation graph alive
                    running_loss[phase] += loss.item() * truth_data.size(0)
                    steps += truth_data.size(0)
                    tqdm_loader.set_description(
                        f'{phase} epoch: {epoch}, {phase} loss: {running_loss[phase] / steps}')
                    # For the issue that the CPU memory increases while training. DO NOT know why, but it works.
                    torch.cuda.empty_cache()
                # metrics
                predictions = np.concatenate(predictions)
                targets = np.concatenate(targets)
                scores = calculate_metrics(predictions.reshape(predictions.shape[0], -1),
                                           targets.reshape(targets.shape[0], -1),
                                           args, plot=epoch % 5 == 0, **kwargs)
                writer.add_scalars(f'score/{phase}', scores, global_step=epoch)
                with open(model_folder + "/output.txt", "a") as f:
                    f.write(f'{phase} epoch: {epoch}, {phase} loss: {running_loss[phase] / steps}\n')
                    f.write(str(scores))
                    f.write('\n')
                    f.write(str(time.time()))
                    f.write("\n\n")
                print(scores)
                # if phase == 'val' and scores['RMSE'] < best_rmse:
                if phase == 'val' and scores['pearr'] > best_pcc:
                    best_pcc = scores['pearr']
                    # best_rmse = scores['RMSE']
                    save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                     epoch=epoch,
                                     optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))
            scheduler.step(running_loss['train'])
            writer.add_scalars('Loss',
                               {f'{phase} loss': running_loss[phase] / len(data_loaders[phase].dataset)
                                for phase in phases},
                               global_step=epoch)
    finally:
        time_elapsed = time.perf_counter() - since
        print(f"cost {time_elapsed} seconds")
        save_model(f"{model_folder}/best_model.pkl", **save_dict)
        save_model(f"{model_folder}/final_model.pkl",
                   **{'model_state_dict': copy.deepcopy(model.state_dict()),
                      'epoch': num_epochs,
                      'optimizer_state_dict': copy.deepcopy(optimizer.state_dict())})
def train_helper(model: torchvision.models.resnet.ResNet, dataloaders: Dict[str, torch.utils.data.DataLoader], dataset_sizes: Dict[str, int], criterion: torch.nn.modules.loss, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, num_epochs: int, log_writer: IO, train_order_writer: IO, device: torch.device, batch_size: int, checkpoints_folder: Path, num_layers: int, classes: List[str], minibatch_counter, num_classes: int) -> None: since = time.time() global_minibatch_counter = minibatch_counter # Initialize all the tensors to be used in training and validation. # Do this outside the loop since it will be written over entirely at each # epoch and doesn't need to be reallocated each time. train_all_labels = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() train_all_predicts = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() val_all_labels = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() val_all_predicts = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() for epoch in range(1, num_epochs + 1): model.train(mode=True) # Training phase. train_running_loss, train_running_corrects, epoch_minibatch_counter = 0.0, 0, 0 for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]): train_inputs = inputs.to(device=device) train_labels = labels.to(device=device) optimizer.zero_grad() # Forward and backpropagation. with torch.set_grad_enabled(mode=True): train_outputs = model(train_inputs) __, train_preds = torch.max(train_outputs, dim=1) train_loss = criterion(input=train_outputs, target=train_labels) train_loss.backward() optimizer.step() # Update training diagnostics. train_running_loss += train_loss.item() * train_inputs.size(0) train_running_corrects += torch.sum( train_preds == train_labels.data, dtype=torch.double) this_batch_size = train_labels.detach().cpu().shape[0] start = idx * batch_size end = start + this_batch_size train_all_labels[start:end] = train_labels.detach().cpu() train_all_predicts[start:end] = train_preds.detach().cpu() global_minibatch_counter += 1 epoch_minibatch_counter += 1 # Calculate training diagnostics calculate_confusion_matrix(all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(), classes=classes, num_classes=num_classes) train_loss = train_running_loss / (epoch_minibatch_counter * batch_size) train_acc = train_running_corrects / (epoch_minibatch_counter * batch_size) # Validation phase. model.train(mode=False) val_running_loss = 0.0 val_running_corrects = 0 # Feed forward over all the validation data. for idx, (val_inputs, val_labels, paths) in enumerate(dataloaders["val"]): val_inputs = val_inputs.to(device=device) val_labels = val_labels.to(device=device) # Feed forward. with torch.set_grad_enabled(mode=False): val_outputs = model(val_inputs) _, val_preds = torch.max(val_outputs, dim=1) val_loss = criterion(input=val_outputs, target=val_labels) # Update validation diagnostics. 
val_running_loss += val_loss.item() * val_inputs.size(0) val_running_corrects += torch.sum(val_preds == val_labels.data, dtype=torch.double) this_batch_size = val_labels.detach().cpu().shape[0] start = idx * batch_size end = start + this_batch_size val_all_labels[start:end] = val_labels.detach().cpu() val_all_predicts[start:end] = val_preds.detach().cpu() # Calculate validation diagnostics calculate_confusion_matrix(all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(), classes=classes, num_classes=num_classes) val_loss = val_running_loss / dataset_sizes["val"] val_acc = val_running_corrects / dataset_sizes["val"] if torch.cuda.is_available(): torch.cuda.empty_cache() # Remaining things related to training. epoch_output_path = checkpoints_folder.joinpath( f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt" ) epoch_output_path.parent.mkdir(parents=True, exist_ok=True) # Save the model as a state dictionary. torch.save(obj={ "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "epoch": epoch + 1 }, f=str(epoch_output_path)) log_writer.write( f"{epoch},{global_minibatch_counter},{train_loss:.4f},{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n" ) current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print the diagnostics for each epoch. print(f"Epoch {epoch} with " f"mb {global_minibatch_counter} " f"lr {current_lr:.15f}: " f"t_loss: {train_loss:.4f} " f"t_acc: {train_acc:.4f} " f"v_loss: {val_loss:.4f} " f"v_acc: {val_acc:.4f}\n") scheduler.step() current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print training information at the end. print(f"\ntraining complete in " f"{(time.time() - since) // 60:.2f} minutes") return epoch_output_path, global_minibatch_counter
def train_helper(model: torchvision.models.resnet.ResNet, dataloaders: Dict[str, torch.utils.data.DataLoader], dataset_sizes: Dict[str, int], criterion: torch.nn.modules.loss, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, num_epochs: int, writer: IO, train_order_writer: IO, device: torch.device, start_epoch: int, batch_size: int, save_interval: int, checkpoints_folder: Path, num_layers: int, classes: List[str], num_classes: int) -> None: """ Function for training ResNet. Args: model: ResNet model for training. dataloaders: Dataloaders for IO pipeline. dataset_sizes: Sizes of the training and validation dataset. criterion: Metric used for calculating loss. optimizer: Optimizer to use for gradient descent. scheduler: Scheduler to use for learning rate decay. start_epoch: Starting epoch for training. writer: Writer to write logging information. train_order_writer: Writer to write the order of training examples. device: Device to use for running model. num_epochs: Total number of epochs to train for. batch_size: Mini-batch size to use for training. save_interval: Number of epochs between saving checkpoints. checkpoints_folder: Directory to save model checkpoints to. num_layers: Number of layers to use in the ResNet model from [18, 34, 50, 101, 152]. classes: Names of the classes in the dataset. num_classes: Number of classes in the dataset. """ since = time.time() # Initialize all the tensors to be used in training and validation. # Do this outside the loop since it will be written over entirely at each # epoch and doesn't need to be reallocated each time. train_all_labels = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() train_all_predicts = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() val_all_labels = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() val_all_predicts = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() global_minibatch_counter = 0 # Train for specified number of epochs. for epoch in range(start_epoch, num_epochs): # Training phase. model.train(mode=True) train_running_loss = 0.0 train_running_corrects = 0 epoch_minibatch_counter = 0 # Train over all training data. for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]): train_inputs = inputs.to(device=device) train_labels = labels.to(device=device) optimizer.zero_grad() # Forward and backpropagation. with torch.set_grad_enabled(mode=True): train_outputs = model(train_inputs) __, train_preds = torch.max(train_outputs, dim=1) train_loss = criterion(input=train_outputs, target=train_labels) train_loss.backward() optimizer.step() # Update training diagnostics. train_running_loss += train_loss.item() * train_inputs.size(0) train_running_corrects += torch.sum( train_preds == train_labels.data, dtype=torch.double) start = idx * batch_size end = start + batch_size train_all_labels[start:end] = train_labels.detach().cpu() train_all_predicts[start:end] = train_preds.detach().cpu() global_minibatch_counter += 1 epoch_minibatch_counter += 1 # for path in paths: #write the order that the model was trained in # train_order_writer.write("/".join(path.split("/")[-2:]) + "\n") if global_minibatch_counter % 10 == 0 or global_minibatch_counter == 5: calculate_confusion_matrix( all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(), classes=classes, num_classes=num_classes) # Store training diagnostics. 
train_loss = train_running_loss / (epoch_minibatch_counter * batch_size) train_acc = train_running_corrects / (epoch_minibatch_counter * batch_size) # Validation phase. model.train(mode=False) val_running_loss = 0.0 val_running_corrects = 0 # Feed forward over all the validation data. for idx, (val_inputs, val_labels, paths) in enumerate(dataloaders["val"]): val_inputs = val_inputs.to(device=device) val_labels = val_labels.to(device=device) # Feed forward. with torch.set_grad_enabled(mode=False): val_outputs = model(val_inputs) _, val_preds = torch.max(val_outputs, dim=1) val_loss = criterion(input=val_outputs, target=val_labels) # Update validation diagnostics. val_running_loss += val_loss.item() * val_inputs.size(0) val_running_corrects += torch.sum( val_preds == val_labels.data, dtype=torch.double) start = idx * batch_size end = start + batch_size val_all_labels[start:end] = val_labels.detach().cpu() val_all_predicts[start:end] = val_preds.detach().cpu() calculate_confusion_matrix( all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(), classes=classes, num_classes=num_classes) # Store validation diagnostics. val_loss = val_running_loss / dataset_sizes["val"] val_acc = val_running_corrects / dataset_sizes["val"] if torch.cuda.is_available(): torch.cuda.empty_cache() # Remaining things related to training. if global_minibatch_counter % 10 == 0 or global_minibatch_counter == 5: epoch_output_path = checkpoints_folder.joinpath( f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt" ) # Confirm the output directory exists. epoch_output_path.parent.mkdir(parents=True, exist_ok=True) # Save the model as a state dictionary. torch.save(obj={ "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "epoch": epoch + 1 }, f=str(epoch_output_path)) writer.write( f"{epoch},{global_minibatch_counter},{train_loss:.4f}," f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n") current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print the diagnostics for each epoch. print(f"Epoch {epoch} with " f"mb {global_minibatch_counter} " f"lr {current_lr:.15f}: " f"t_loss: {train_loss:.4f} " f"t_acc: {train_acc:.4f} " f"v_loss: {val_loss:.4f} " f"v_acc: {val_acc:.4f}\n") scheduler.step() current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print training information at the end. print(f"\ntraining complete in " f"{(time.time() - since) // 60:.2f} minutes")
def train_and_evaluate2(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader,
                        optimizer: optim.Optimizer, params: utils.Params, loss_fn=None,
                        restore_file=None, args=None, idx=None):
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
    logger.info('begin training and evaluation')
    best_test_ND = float('inf')
    # File to save first results
    out_file = os.path.join(os.path.join('experiments', args.model_name), 'train_results.csv')
    if not os.path.isfile(out_file):
        of_connection = open(out_file, 'w')
        writer = csv.writer(of_connection)
        # Write the headers to the file
        writer.writerow(['iteration', 'epoch', 'test_metric', 'train_loss'])
        of_connection.close()
    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))
    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=5, verbose=True, delta=0.0001, folder=params.model_dir)
    for epoch in range(params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params, args.sampling, epoch)
        test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling)
        # equality comparisons against float('nan') are never true; use np.isnan to catch a NaN metric
        if np.isnan(test_metrics['rou50']):
            test_metrics['rou50'] = 100
        ND_summary[epoch] = test_metrics['rou50']
        is_best = ND_summary[epoch] <= best_test_ND
        # Save weights
        utils.save_checkpoint(
            {
                'epoch': 0,  # epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=0,  # to prevent extra model savings
            is_best=is_best,
            checkpoint=params.model_dir)
        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)
        logger.info('Current Best loss is: %.5f' % best_test_ND)
        # if args.plot_figure:
        #     utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        #     utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)
        last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
        # Write to the csv file ('a' means append)
        of_connection = open(out_file, 'a')
        writer = csv.writer(of_connection)
        writer.writerow([idx, epoch + 1, test_metrics, loss_summary[-1]])  # loss_summary[0]??
        of_connection.close()
        logger.info('Loss_summary: %s' % loss_summary[epoch * train_len:(epoch + 1) * train_len])
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        logger.info('test_metrics[rou50]: %.5f ' % test_metrics['rou50'])
        early_stopping(test_metrics['rou50'], model)
        if early_stopping.early_stop:
            logger.info('Early stopping')
            break
    with open(best_json_path) as json_file:
        best_metrics = json.load(json_file)
    return best_metrics, test_metrics
def train_helper_with_gradients(model: torchvision.models.resnet.ResNet, dataloaders: Dict[str, torch.utils.data.DataLoader], dataset_sizes: Dict[str, int], criterion: torch.nn.modules.loss, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, num_epochs: int, writer: IO, train_order_writer: IO, device: torch.device, start_epoch: int, batch_size: int, save_interval: int, checkpoints_folder: Path, num_layers: int, classes: List[str], num_classes: int) -> None: since = time.time() # Initialize all the tensors to be used in training and validation. # Do this outside the loop since it will be written over entirely at each # epoch and doesn't need to be reallocated each time. train_all_labels = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() train_all_predicts = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() val_all_labels = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() val_all_predicts = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() global_minibatch_counter = 0 mag_writer = open("mags_resnet18_imagenet.csv", "w") mag_writer.write( "image_name,train_loss,layers_-1,layer_0,layer_60,layer_1,layer_20,layer_40,layer_59,conf,correct\n" ) # Train for specified number of epochs. for epoch in range(start_epoch, num_epochs): # Training phase. model.train(mode=True) train_running_loss = 0.0 train_running_corrects = 0 epoch_minibatch_counter = 0 # Train over all training data. for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]): train_inputs = inputs.to(device=device) train_labels = labels.to(device=device) optimizer.zero_grad() # Forward and backpropagation. with torch.set_grad_enabled(mode=True): train_outputs = model(train_inputs) confs, train_preds = torch.max(train_outputs, dim=1) train_loss = criterion(input=train_outputs, target=train_labels) train_loss.backward(retain_graph=True) optimizer.step() batch_grads = torch.autograd.grad(train_loss, model.parameters(), retain_graph=True) # print(len(batch_grads)) # for batch_grad in batch_grads: # print(batch_grad.size()) train_loss_npy = float(train_loss.detach().cpu().numpy()) layer_num_to_mag = get_grad_magnitude(model) image_name = get_image_name(paths[0]) conf = float(confs.detach().cpu().numpy()) train_pred = int(train_preds.detach().cpu().numpy()[0]) gt_label = int(train_labels.detach().cpu().numpy()[0]) correct = 0 if train_pred == gt_label: correct = 1 output_line = f"{image_name},{train_loss_npy:.4f},{layer_num_to_mag[-1]:.4f},{layer_num_to_mag[0]:.4f},{layer_num_to_mag[60]:.4f},{layer_num_to_mag[1]:.4f},{layer_num_to_mag[20]:.4f},{layer_num_to_mag[40]:.4f},{layer_num_to_mag[59]:.4f},{conf:.4f},{correct}\n" mag_writer.write(output_line) print(idx, output_line) # print(idx, image_name, train_loss_npy, conf, train_pred, gt_label) # Update training diagnostics. train_running_loss += train_loss.item() * train_inputs.size(0) train_running_corrects += torch.sum( train_preds == train_labels.data, dtype=torch.double) start = idx * batch_size end = start + batch_size train_all_labels[start:end] = train_labels.detach().cpu() train_all_predicts[start:end] = train_preds.detach().cpu() global_minibatch_counter += 1 epoch_minibatch_counter += 1 if global_minibatch_counter % 1000 == 0: calculate_confusion_matrix( all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(), classes=classes, num_classes=num_classes) # Store training diagnostics. 
train_loss = train_running_loss / (epoch_minibatch_counter * batch_size) train_acc = train_running_corrects / (epoch_minibatch_counter * batch_size) # Validation phase. model.train(mode=False) val_running_loss = 0.0 val_running_corrects = 0 # Feed forward over all the validation data. for idx, (val_inputs, val_labels, paths) in enumerate(dataloaders["val"]): val_inputs = val_inputs.to(device=device) val_labels = val_labels.to(device=device) # Feed forward. with torch.set_grad_enabled(mode=False): val_outputs = model(val_inputs) _, val_preds = torch.max(val_outputs, dim=1) val_loss = criterion(input=val_outputs, target=val_labels) # Update validation diagnostics. val_running_loss += val_loss.item() * val_inputs.size(0) val_running_corrects += torch.sum( val_preds == val_labels.data, dtype=torch.double) start = idx * batch_size end = start + batch_size val_all_labels[start:end] = val_labels.detach().cpu() val_all_predicts[start:end] = val_preds.detach().cpu() calculate_confusion_matrix( all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(), classes=classes, num_classes=num_classes) # Store validation diagnostics. val_loss = val_running_loss / dataset_sizes["val"] val_acc = val_running_corrects / dataset_sizes["val"] if torch.cuda.is_available(): torch.cuda.empty_cache() # Remaining things related to training. if global_minibatch_counter % 200000 == 0 or global_minibatch_counter == 5: epoch_output_path = checkpoints_folder.joinpath( f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt" ) # Confirm the output directory exists. epoch_output_path.parent.mkdir(parents=True, exist_ok=True) # Save the model as a state dictionary. torch.save(obj={ "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "epoch": epoch + 1 }, f=str(epoch_output_path)) writer.write( f"{epoch},{global_minibatch_counter},{train_loss:.4f}," f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n") current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print the diagnostics for each epoch. print(f"Epoch {epoch} with " f"mb {global_minibatch_counter} " f"lr {current_lr:.15f}: " f"t_loss: {train_loss:.4f} " f"t_acc: {train_acc:.4f} " f"v_loss: {val_loss:.4f} " f"v_acc: {val_acc:.4f}\n") scheduler.step() current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print training information at the end. print(f"\ntraining complete in " f"{(time.time() - since) // 60:.2f} minutes")
def train_and_evaluate(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader, optimizer: optim, loss_fn, params: utils.Params, restore_file: str = None) -> None: '''Train the model and evaluate every epoch. Args: model: (torch.nn.Module) the Deep AR model train_loader: load train data and labels test_loader: load test data and labels optimizer: (torch.optim) optimizer for parameters of model loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch params: (Params) hyperparameters restore_file: (string) optional- name of file to restore from (without its extension .pth.tar) ''' # reload weights from restore_file if specified restore_epoch = 0 if restore_file is not None: restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar') logger.info('Restoring parameters from {}'.format(restore_path)) utils.load_checkpoint(restore_path, model, optimizer) restore_epoch = int(restore_file[-2:].replace('_',''))+1 logger.info('Restoring epoch: {}'.format(restore_epoch)) logger.info('Begin training and evaluation') # initialize the early_stopping object early_stopping = EarlyStopping(patience=25, verbose=True, delta=0.0001, folder=params.model_dir) if os.path.exists(os.path.join(params.model_dir, 'metrics_test_best_weights.json')): with open(os.path.join(params.model_dir, 'metrics_test_best_weights.json')) as json_file: best_test_ND = json.load(json_file)['ND'] early_stopping.best_score = best_test_ND else: best_test_ND = float('inf') early_stopping.best_score = best_test_ND train_len = len(train_loader) ND_summary = np.zeros(params.num_epochs) loss_summary = np.zeros((train_len * params.num_epochs)) for epoch in range(restore_epoch, params.num_epochs): logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs)) loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(model, optimizer, loss_fn, train_loader, test_loader, params, epoch) test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling) # if test_metrics['ND'] == float('nan'): # test_metrics['ND'] = 1000 # print('NAN ') # elif test_metrics['ND'] == np.nan: # print('NAN ') # test_metrics['ND'] = 1000 ND_summary[epoch] = test_metrics['ND'] ##################################'ND' is_best = ND_summary[epoch] <= best_test_ND # Save weights utils.save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optim_dict': optimizer.state_dict()}, epoch=epoch, is_best=is_best, checkpoint=params.model_dir) if is_best: logger.info('- Found new best ND') ############# 'ND' best_test_ND = ND_summary[epoch] best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json') utils.save_dict_to_json(test_metrics, best_json_path) logger.info('Current Best ND is: %.5f' % best_test_ND) ## 'ND' utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir) utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir) last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json') utils.save_dict_to_json(test_metrics, last_json_path) # early_stopping needs the validation loss to check if it has decresed, # and if it has, it will make a checkpoint of the current model logger.info('ND : %.5f ' % test_metrics['ND']) early_stopping(test_metrics['ND'], model) if early_stopping.early_stop: logger.info('Early stopping') break # # load the last checkpoint with the best model # model.load_state_dict(torch.load('checkpoint.pt')) if args.save_best: f = 
open('./param_search.txt', 'w') f.write('-----------\n') list_of_params = args.search_params.split(',') print_params = '' for param in list_of_params: param_value = getattr(params, param) print_params += f'{param}: {param_value:.2f}' print_params = print_params[:-1] f.write(print_params + '\n') f.write('Best ND: ' + str(best_test_ND) + '\n') logger.info(print_params) logger.info(f'Best ND: {best_test_ND}') f.close() utils.plot_all_epoch(ND_summary, print_params + '_ND', location=params.plot_dir) utils.plot_all_epoch(loss_summary, print_params + '_loss', location=params.plot_dir)
def save_checkpoint( path: str, model: TEDD1104, optimizer_name: str, optimizer: torch.optim, acc_dev: float, epoch: int, fp16: bool, opt_level: str = None, ) -> None: """ Save a checkpoint that allows to continue training the model in the future Input: - path: path where the model is going to be saved - model: TEDD1104 model to save - optimizer_name: Name of the optimizer used for training: SGD or Adam - optimizer: Optimizer used for training - acc_dev: Accuracy of the model in the development set - epoch: Num of epoch used to train the model - amp: If the model uses FP16, Nvidia Apex AMP - amp_opt_level: If the model uses FP16, the AMP opt_level Output: """ if fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) dict_hyperparams: dict = { "sequence_size": model.sequence_size, "resnet": model.resnet, "pretrained_resnet": model.pretrained_resnet, "embedded_size": model.embedded_size, "hidden_size": model.hidden_size, "num_layers_lstm": model.num_layers_lstm, "bidirectional_lstm": model.bidirectional_lstm, "layers_out": model.layers_out, "dropout_cnn": model.dropout_cnn, "dropout_cnn_out": model.dropout_cnn_out, "dropout_lstm": model.dropout_lstm, "dropout_lstm_out": model.dropout_lstm_out, "fp16": fp16, "amp_opt_level": opt_level, } checkpoint = { "hyper_params": dict_hyperparams, "model": model.state_dict(), "optimizer_name": optimizer_name, "optimizer": optimizer.state_dict(), "acc_dev": acc_dev, "epoch": epoch, "amp": None if not fp16 else amp.state_dict(), "opt_level": opt_level, } torch.save(checkpoint, path)
def train_smartgrad_helper(model: torchvision.models.resnet.ResNet, dataloaders: Dict[str, torch.utils.data.DataLoader], dataset_sizes: Dict[str, int], criterion: torch.nn.modules.loss, optimizer: torch.optim, scheduler: torch.optim.lr_scheduler, num_epochs: int, log_writer: IO, train_order_writer: IO, device: torch.device, train_batch_size: int, val_batch_size: int, fake_minibatch_size: int, annealling_factor: float, save_mb_interval: int, val_mb_interval: int, checkpoints_folder: Path, num_layers: int, classes: List[str], num_classes: int) -> None: grad_layers = list(range(1, 21)) since = time.time() global_minibatch_counter = 0 # Initialize all the tensors to be used in training and validation. # Do this outside the loop since it will be written over entirely at each # epoch and doesn't need to be reallocated each time. train_all_labels = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() train_all_predicts = torch.empty(size=(dataset_sizes["train"], ), dtype=torch.long).cpu() val_all_labels = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() val_all_predicts = torch.empty(size=(dataset_sizes["val"], ), dtype=torch.long).cpu() for epoch in range(1, num_epochs+1): model.train(mode=False) # Training phase. train_running_loss, train_running_corrects, epoch_minibatch_counter = 0.0, 0, 0 idx_to_gt = {} for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]): train_inputs = inputs.to(device=device) train_labels = labels.to(device=device) optimizer.zero_grad() # Forward and backpropagation. with torch.set_grad_enabled(mode=True): train_outputs = model(train_inputs) __, train_preds = torch.max(train_outputs, dim=1) train_loss = criterion(input=train_outputs, target=train_labels) train_loss.backward(retain_graph=True) gt_label = int(train_labels.detach().cpu().numpy()[0]) idx_to_gt[idx] = gt_label ######################## #### important code #### ######################## #clear the memory fake_minibatch_idx = idx % fake_minibatch_size fake_minibatch_num = int(idx / fake_minibatch_size) if fake_minibatch_idx == 0: minibatch_grad_dict = {}; gc.collect() #get the per-example gradient magnitude and add to minibatch_grad_dict grad_as_dict, grad_flattened = model_to_grad_as_dict_and_flatten(model, grad_layers) minibatch_grad_dict[idx] = (grad_as_dict, grad_flattened) #every batch, calculate the best ones if fake_minibatch_idx == fake_minibatch_size - 1: idx_to_weight_batch = get_idx_to_weight(minibatch_grad_dict, annealling_factor, idx_to_gt) print(idx_to_weight_batch) ########################## # print("\n...............................updating......................................" + str(idx)) for layer_num, param in enumerate(model.parameters()): # if layer_num in [0]:#grad_layers: new_grad = get_new_layer_grad(layer_num, idx_to_weight_batch, minibatch_grad_dict) assert param.grad.detach().cpu().numpy().shape == new_grad.detach().cpu().numpy().shape param.grad = new_grad # check_model_weights(idx, model) optimizer.step() # check_model_weights(idx, model) # print("................................done........................................." + str(idx) + '\n\n\n\n') ########################## # Update training diagnostics. 
train_running_loss += train_loss.item() * train_inputs.size(0) train_running_corrects += torch.sum(train_preds == train_labels.data, dtype=torch.double) start = idx * train_batch_size end = start + train_batch_size train_all_labels[start:end] = train_labels.detach().cpu() train_all_predicts[start:end] = train_preds.detach().cpu() global_minibatch_counter += 1 epoch_minibatch_counter += 1 # Write the path of training order if it exists if train_order_writer: for path in paths: #write the order that the model was trained in train_order_writer.write("/".join(path.split("/")[-2:]) + "\n") # Validate the model if global_minibatch_counter % val_mb_interval == 0 or global_minibatch_counter == 1: # Calculate training diagnostics calculate_confusion_matrix( all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(), classes=classes, num_classes=num_classes) train_loss = train_running_loss / (epoch_minibatch_counter * train_batch_size) train_acc = train_running_corrects / (epoch_minibatch_counter * train_batch_size) # Validation phase. model.train(mode=False) val_running_loss = 0.0 val_running_corrects = 0 # Feed forward over all the validation data. for idx, (val_inputs, val_labels, paths) in enumerate(dataloaders["val"]): val_inputs = val_inputs.to(device=device) val_labels = val_labels.to(device=device) # Feed forward. with torch.set_grad_enabled(mode=False): val_outputs = model(val_inputs) _, val_preds = torch.max(val_outputs, dim=1) val_loss = criterion(input=val_outputs, target=val_labels) # Update validation diagnostics. val_running_loss += val_loss.item() * val_inputs.size(0) val_running_corrects += torch.sum(val_preds == val_labels.data, dtype=torch.double) start = idx * val_batch_size end = start + val_batch_size val_all_labels[start:end] = val_labels.detach().cpu() val_all_predicts[start:end] = val_preds.detach().cpu() # Calculate validation diagnostics calculate_confusion_matrix( all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(), classes=classes, num_classes=num_classes) val_loss = val_running_loss / dataset_sizes["val"] val_acc = val_running_corrects / dataset_sizes["val"] if torch.cuda.is_available(): torch.cuda.empty_cache() # Remaining things related to training. if global_minibatch_counter % save_mb_interval == 0 or global_minibatch_counter == 1: epoch_output_path = checkpoints_folder.joinpath(f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt") epoch_output_path.parent.mkdir(parents=True, exist_ok=True) # Save the model as a state dictionary. torch.save(obj={ "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "scheduler_state_dict": scheduler.state_dict(), "epoch": epoch + 1 }, f=str(epoch_output_path)) log_writer.write(f"{epoch},{global_minibatch_counter},{train_loss:.4f},{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n") current_lr = None for group in optimizer.param_groups: current_lr = group["lr"] # Print the diagnostics for each epoch. print(f"Epoch {epoch} with " f"mb {global_minibatch_counter} " f"lr {current_lr:.15f}: " f"t_loss: {train_loss:.4f} " f"t_acc: {train_acc:.4f} " f"v_loss: {val_loss:.4f} " f"v_acc: {val_acc:.4f}\n") scheduler.step() current_lr = None for group in optimizer.param_groups: current_lr = group["lr"]