Example #1
def clf_fit(net: nn.Module, crit: nn.Module, opti: torch.optim, tloader,
            vloader, **kwargs):
    """
  This function is used to train the classification networks.
  """
    epochs = kwargs['epochs']
    lr = kwargs['lr']
    lr_step = kwargs['lr_step']
    lr_decay = kwargs['lr_decay']
    seed = kwargs['seed'] if kwargs['seed'] is not None else np.random.randint(100)

    bloss = float('inf')

    torch.manual_seed(seed)
    np.random.seed(seed)
    print('[INFO] Setting torch seed to {}'.format(seed))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tlist = []
    vlist = []

    for e in range(1, epochs + 1):
        if isinstance(lr_step, int) and e % lr_step == 0:
            lr = adjust_lr(opti, lr, lr_decay)
        elif isinstance(lr_step, list) and e in lr_step:
            lr = adjust_lr(opti, lr, lr_decay)

        tacc, tloss = clf_train(net, tloader, opti, crit, topk=kwargs['topk'])
        vacc, vloss = clf_test(net, vloader, crit, topk=kwargs['topk'])

        tlist.append((tacc, tloss))
        vlist.append((vacc, vloss))

        if vloss < bloss:
            bloss = vloss
            torch.save({
                'net': net.state_dict(),
                'opti': opti.state_dict()
            }, 'best_net-{}-{:.2f}.pth'.format(e, vacc[0]))

        # TODO: the tloss and vloss values need a recheck.
        print('Epoch: {}/{} - Train Loss: {:.3f} - Train Acc@1: {:.3f}'
              ' - Train Acc@5: {:.3f} - Val Loss: {:.3f} - Val Acc@1: {:.3f}'
              ' - Val Acc@5: {:.3f}'.format(e, epochs, tloss, tacc[0], tacc[1],
                                            vloss, vacc[0], vacc[1]))

        # Save the per-epoch checkpoint; calling net.cpu() here would silently
        # move the model off the GPU for the remaining epochs.
        torch.save({
            'net': net.state_dict(),
            'opti': opti.state_dict()
        }, 'net-{}-{:.2f}.pth'.format(e, vacc[0]))

    return tlist, vlist
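
The clf_fit example above calls an adjust_lr helper that is not part of the snippet. A minimal sketch of what such a helper might look like is given below; the multiplicative decay rule is an assumption, not the original implementation.

def adjust_lr(opti, lr, lr_decay):
    # Assumed behaviour: decay the learning rate and push it into the optimizer.
    new_lr = lr * lr_decay
    for param_group in opti.param_groups:
        param_group['lr'] = new_lr
    return new_lr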
Example #2
def model_save(model: torch.nn.Module, encoder_optimizer: torch.optim,
               decoder_optimizer: torch.optim, loss, latent_dim, ckpt_dir):

    torch.save(
        {
            'model_state_dict': model.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'loss': loss,
            'latent_dim': latent_dim,
            'model': model
        }, ckpt_dir)
Example #3
def train_on_dataset(
        train_dataset: Dataset, val_dataset, model: Tree2Seq, criterion: nn.modules.loss, optimizer: torch.optim,
        scheduler: torch.optim.lr_scheduler, clip_norm: int, logger: AbstractLogger, start_batch_id: int = 0,
        log_step: int = -1, eval_step: int = -1, save_step: int = -1
):
    train_epoch_info = LearningInfo()

    batch_iterator_pb = tqdm(range(start_batch_id, len(train_dataset)), total=len(train_dataset))
    batch_iterator_pb.update(start_batch_id)
    batch_iterator_pb.refresh()

    for batch_id in batch_iterator_pb:
        graph, labels = train_dataset[batch_id]
        batch_info = train_on_batch(model, criterion, optimizer, scheduler, graph, labels, clip_norm)
        train_epoch_info.accumulate_info(batch_info)

        if is_step_match(batch_id, log_step):
            logger.log(train_epoch_info.get_state_dict(), batch_id, is_train=True)
            train_epoch_info = LearningInfo()

        if is_step_match(batch_id, save_step):
            train_dump = {
                'state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'batch_id': batch_id
            }
            logger.save_model(f'batch_{batch_id}.pt', train_dump)

        if is_step_match(batch_id, eval_step):
            eval_info = evaluate_on_dataset(val_dataset, model, criterion)
            logger.log(eval_info.get_state_dict(), batch_id, is_train=False)

    if train_epoch_info.batch_processed > 0:
        logger.log(train_epoch_info.get_state_dict(), len(train_dataset) - 1, is_train=True)
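
train_on_dataset relies on an is_step_match helper that is not included in the snippet. A plausible minimal version is sketched below; treating a non-positive step as "disabled" and matching on the 1-based batch count are assumptions.

def is_step_match(batch_id: int, step: int) -> bool:
    # Assumed semantics: a step of -1 (the default above) disables the action.
    return step > 0 and (batch_id + 1) % step == 0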
Example #4
    def save_checkpoint(self, model: torch.nn.Module, optimizer: torch.optim,
                        is_best: bool, save_state: bool = True):
        if save_state:
            state = {'model_state_dict': model.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict()}
            torch.save(state, self.state_dir)

        if is_best:
            torch.save(model.state_dict(), self.model_dir)
Example #5
 def train(self,
           model: torchvision.models,
           criterion: torch.nn,
           optimizer: torch.optim,
           train_dataset: ImageFoldersDataset,
           test_dataset: ImageFoldersDataset,
           n_epochs: int = 25,
           batch_size: int = 32,
           shuffle: bool = True,
           *args,
           **kwargs):
     # TODO(lukasz): add scheduler for learning rate
     metrics = defaultdict(list)
     best_score_test = 0.
     for epoch in range(n_epochs):
         model.train()
         running_loss = 0.
         for data_idx, data in enumerate(
                 train_dataset.loader(
                     batch_size=batch_size,
                     shuffle=shuffle
                     # TODO(lukasz): add sampler for imbalanced dataset
                 )):
             inputs, labels = data
             inputs = inputs.to(self.device)
             labels = labels.to(self.device)
             optimizer.zero_grad()
             model = model.to(self.device)
             outputs = model(inputs)
             loss = criterion(outputs, labels)
             loss.backward()
             optimizer.step()
             running_loss += loss.item()
             # TODO(lukasz): add as argument
             if data_idx % 100 == 0:
                 msg = '[%d, %5d] loss: %.3f'
                 print(msg % (epoch + 1, data_idx + 1, running_loss / 100))
                 running_loss = 0.
         score_train = self.score(model, train_dataset)
         score_test = self.score(model, test_dataset)
         metrics['score_train'].append(score_train)
         metrics['score_test'].append(score_test)
         msg = '[%d] train score: %.3f, test score: %.3f'
         print(msg % (epoch + 1, score_train, score_test))
         # save model (make sure that Google Colab does not destroy your results)
         if score_test > best_score_test:
             torch.save(
                 {
                     'epoch': epoch,
                     'model_state_dict': model.state_dict(),
                     'optimizer_state_dict': optimizer.state_dict()
                 }, self.save_experiment)
             best_score_test = score_test
     self.metrics = metrics
     return self
Example #6
def save_ckpt(model: nn.Module, optimizer: torch.optim,
              checkpoint_path: str) -> None:
    """
    Save a model and optimizer checkpoint so training can be continued later
    """
    torch.save(
        {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, checkpoint_path)
    print("Saved model and optimizer state to {}".format(checkpoint_path))
Example #7
def save_checkpoint(_net: torch.nn.Module, _optimizer: torch.optim, _epoch,
                    _best_acc, _ckpt_path):

    checkpoint = {
        'net': _net.state_dict(),
        'optimizer': _optimizer.state_dict(),
        'epoch': _epoch,
        'best_acc': _best_acc
    }

    torch.save(checkpoint, _ckpt_path)
Example #8
def checkpoint_save(epoch: int, nn_model: model, nn_optimizer: torch.optim, training_loss: list, validation_loss: list, model_name: str, locations: dict, args):
    """
    Save model checkpoints
    """
    checkpoint_name = model_name.replace('.tar','_chkepo_{0}.tar'.format(str(epoch).zfill(3)))
    torch.save({'epoch':epoch,
                'model_state_dict':nn_model.state_dict(),
                'optimizer_state_dict':nn_optimizer.state_dict(),
                'training_loss':training_loss,
                'validation_loss':validation_loss,
                'arguments':args},
                locations['model_loc']+'/'+checkpoint_name)
Example #9
def saveCheckpoint(checkpoint_path, model: nn.Module, optimizer: optim, scheduler: optim.lr_scheduler.MultiStepLR, epoch, feature=None):
    state = {
        'state_dict': model.state_dict(),
        'optimizer' : optimizer.state_dict(),
        'epoch': epoch,
        'scheduler': scheduler.state_dict()
    }

    if feature:
        state['feature'] = feature.state_dict()

    torch.save(state, checkpoint_path)
Example #10
def train_and_evaluate(model: nn.Module, train_loader: DataLoader,
                       test_loader: DataLoader, optimizer: optim,
                       args) -> None:
    logger.info('begin training and evaluation')
    best_test_R2 = float('inf')
    train_len = len(train_loader)
    loss_summary = np.zeros((train_len * args.num_epochs))
    R2_summary = np.zeros(args.num_epochs)
    for epoch in range(args.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, args.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, train_loader, test_loader, args, epoch)
        test_metrics = evaluate(model, test_loader, args, epoch)
        R2_summary[epoch] = test_metrics['R2']
        is_best = R2_summary[epoch] <= best_test_R2

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=epoch,
            is_best=is_best,
            checkpoint=args.model_dir)

        if is_best:
            logger.info('- Found new best R2')
            best_test_R2 = R2_summary[epoch]
            best_json_path = os.path.join(args.model_dir,
                                          'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best R2 is: %.5f' % best_test_R2)

        utils.plot_all_epoch(R2_summary[:epoch + 1], args.dataset + '_ND',
                             args.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len],
                             args.dataset + '_loss', args.plot_dir)

        last_json_path = os.path.join(args.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
Example #11
 def find_lr(dataloader,
             model,
             optimizer: torch.optim,
             criterion,
             device,
             num_steps,
             lr_min: float = 1e-7,
             lr_max: float = 10,
             beta: float = 0.98):
     model.to(device)
     # A shallow .copy() still references the live tensors, so deep-copy instead
     # (assumes `import copy`) so the restore at the end rolls back training.
     optim_dict = copy.deepcopy(optimizer.state_dict())
     optimizer.param_groups[0]['lr'] = lr_min
     #     num_steps = len(dataloader) - 1
     scheduler = LrSchedulerFinder(optimizer, lr_min, lr_max, num_steps)
     model_dict = copy.deepcopy(model.state_dict())
     losses = list()
     lrs = list()
     avg_loss = 0
     best_loss = 0
     for idx_batch, (data, label) in tqdm(enumerate(dataloader, 1),
                                          total=num_steps):
         print("here")
         if idx_batch == num_steps:
             break
         y, kl = model(data.to(device))
         loss = criterion(y, label, kl, 0)
         if np.isnan(loss.item()):
             print(loss.item())
         avg_loss = beta * avg_loss + (1 - beta) * loss.item()
         smooth_loss = avg_loss / (1 - beta**idx_batch)
         if idx_batch > 1 and smooth_loss > 4 * best_loss:
             break
         if smooth_loss < best_loss or idx_batch == 1:
             best_loss = smooth_loss
         losses.append(smooth_loss)
         lrs.append(scheduler.get_lr()[0])
         optimizer.zero_grad()
         loss.backward()
         optimizer.step()
         scheduler.step()
     model.load_state_dict(model_dict)
     optimizer.load_state_dict(optim_dict)
     return np.array(lrs), np.array(losses)
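
find_lr depends on an LrSchedulerFinder class that is not defined in the snippet. The sketch below assumes the usual exponential learning-rate range test, sweeping from lr_min to lr_max over num_steps steps; the real implementation may differ.

from torch.optim.lr_scheduler import _LRScheduler

class LrSchedulerFinder(_LRScheduler):
    # Assumed behaviour: exponentially interpolate the learning rate between
    # lr_min and lr_max as steps advance.
    def __init__(self, optimizer, lr_min, lr_max, num_steps):
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.num_steps = max(num_steps, 1)
        super().__init__(optimizer)

    def get_lr(self):
        progress = self.last_epoch / self.num_steps
        lr = self.lr_min * (self.lr_max / self.lr_min) ** progress
        return [lr for _ in self.optimizer.param_groups]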
Example #12
def save_model(model_directory: str,
               trained_model: models,
               class_to_idx: dict,
               optimizer: optim,
               arch: str,
               epochs=4,
               model_name: str = 'checkpoint.pth'):
    """
    Saves model to directory
    :param model_directory: a path where the model should be saved
    :param trained_model: the model to be saved
    :param class_to_idx: Dict with items (class_name, class_index)
    :param optimizer: the optimizer that has been used in training
    :param arch: name of the model architecture used for the pretrained network
    :param epochs: number of epochs used in the training; may be useful when resuming later
    :param model_name: checkpoint name
    :return:
    """

    # check for save directory
    if not os.path.isdir(model_directory):
        print(f'Directory {model_directory} does not exist. Creating...')
        os.makedirs(model_directory)

    trained_model.class_to_idx = class_to_idx

    model_state = {
        'epoch': epochs,
        'state_dict': trained_model.state_dict(),
        'optimizer_dict': optimizer.state_dict(),
        'classifier': trained_model.classifier,
        'class_to_idx': trained_model.class_to_idx,
        'arch': arch
    }

    save_location = f'{model_directory}/{model_name}'
    print(f"Saving checkpoint to {save_location}")

    torch.save(model_state, save_location)
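
A hedged sketch of how a checkpoint written by save_model could be reloaded. The function name, the use of getattr over torchvision.models, and the map_location choice are assumptions about the surrounding code.

from torchvision import models

def load_model(checkpoint_path: str):
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    # Rebuild the backbone from the stored architecture name, then restore the
    # custom classifier and the trained weights.
    model = getattr(models, checkpoint['arch'])(pretrained=True)
    model.classifier = checkpoint['classifier']
    model.load_state_dict(checkpoint['state_dict'])
    model.class_to_idx = checkpoint['class_to_idx']
    return model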
Example #13
def train_and_evaluate(model: nn.Module,
                       train_loader: DataLoader,
                       test_loader: DataLoader,
                       optimizer: optim,
                       loss_fn,
                       params: utils.Params,
                       restore_file: str = None) -> None:
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir,
                                    restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
    logger.info('begin training and evaluation')
    best_test_ND = float('inf')
    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))
    for epoch in range(params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params,
            epoch)
        test_metrics = evaluate(model,
                                loss_fn,
                                test_loader,
                                params,
                                epoch,
                                sample=args.sampling)
        ND_summary[epoch] = test_metrics['ND']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=epoch,
            is_best=is_best,
            checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir,
                                          'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best ND is: %.5f' % best_test_ND)

        utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND',
                             params.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len],
                             args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)

    if args.save_best:
        f = open('./param_search.txt', 'w')
        f.write('-----------\n')
        list_of_params = args.search_params.split(',')
        print_params = ', '.join(
            f'{param}: {getattr(params, param):.2f}' for param in list_of_params)
        f.write(print_params + '\n')
        f.write('Best ND: ' + str(best_test_ND) + '\n')
        logger.info(print_params)
        logger.info(f'Best ND: {best_test_ND}')
        f.close()
        utils.plot_all_epoch(ND_summary,
                             print_params + '_ND',
                             location=params.plot_dir)
        utils.plot_all_epoch(loss_summary,
                             print_params + '_loss',
                             location=params.plot_dir)
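
Several of the examples above call utils.save_dict_to_json. A small sketch of such a helper is shown here for context; casting every value to float before dumping is an assumption about the metrics it receives.

import json

def save_dict_to_json(d: dict, json_path: str) -> None:
    # Assumed behaviour: write the metrics dict as pretty-printed JSON.
    with open(json_path, 'w') as f:
        json.dump({k: float(v) for k, v in d.items()}, f, indent=4)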
Example #14
    def _train_helper(self, model: torchvision.models.resnet.ResNet,
                      dataloaders: Dict[str, torch.utils.data.DataLoader],
                      dataset_sizes: Dict[str, int], loss_fn,
                      optimizer: torch.optim,
                      scheduler: torch.optim.lr_scheduler, start_epoch: int,
                      writer: IO) -> None:
        """
        Function for learning ResNet.

        Args:
            model: ResNet model for learning.
            dataloaders: Dataloaders for IO pipeline.
            dataset_sizes: Sizes of the learning and validation dataset.
            loss_fn: Metric used for calculating loss.
            optimizer: Optimizer to use for gradient descent.
            scheduler: Scheduler to use for learning rate decay.
            start_epoch: Starting epoch for learning.
            writer: Writer to write logging information.
        """
        learning_init_time = time.time()

        # Initialize all the tensors to be used in learning and validation.
        # Do this outside the loop since it will be written over entirely at each
        # epoch and doesn't need to be reallocated each time.
        train_all_labels = torch.empty(size=(dataset_sizes["train"], ),
                                       dtype=torch.long).cpu()
        train_all_predicts = torch.empty(size=(dataset_sizes["train"], ),
                                         dtype=torch.long).cpu()
        val_all_labels = torch.empty(size=(dataset_sizes["val"], ),
                                     dtype=torch.long).cpu()
        val_all_predicts = torch.empty(size=(dataset_sizes["val"], ),
                                       dtype=torch.long).cpu()
        early_stopper = EarlyStopper(patience=self._early_stopping_patience,
                                     mode=EarlyStopper.Mode.MAX)

        if self._resume_checkpoint and self._last_val_acc:
            best_val_acc = self._last_val_acc
        else:
            best_val_acc = 0.

        # Train for specified number of epochs.
        for epoch in range(start_epoch, self._num_epochs):
            epoch_init_time = time.time()

            # Training phase.
            model.train(mode=True)

            train_running_loss = 0.0
            train_running_corrects = 0

            # Train over all learning data.
            for idx, (train_inputs,
                      true_labels) in enumerate(dataloaders["train"]):
                train_patches = train_inputs["patch"].to(device=self._device)
                train_x_coord = train_inputs["x_coord"].to(device=self._device)
                train_y_coord = train_inputs["y_coord"].to(device=self._device)
                true_labels = true_labels.to(device=self._device)
                optimizer.zero_grad()

                # Forward and backpropagation.
                with torch.set_grad_enabled(mode=True):
                    train_logits = model(train_patches, train_x_coord,
                                         train_y_coord).squeeze(dim=1)
                    train_loss = loss_fn(logits=train_logits,
                                         target=true_labels)
                    train_loss.backward()
                    optimizer.step()

                # Update learning diagnostics.
                train_running_loss += train_loss.item() * train_patches.size(0)
                pred_labels = self._extract_pred_labels(train_logits)
                train_running_corrects += torch.sum(
                    pred_labels == true_labels.data, dtype=torch.double)

                start = idx * self._batch_size
                end = start + self._batch_size

                train_all_labels[start:end] = true_labels.detach().cpu()
                train_all_predicts[start:end] = pred_labels.detach().cpu()

            self._calculate_confusion_matrix(
                all_labels=train_all_labels.numpy(),
                all_predicts=train_all_predicts.numpy(),
                classes=self._classes,
                num_classes=self._num_classes)

            # Store learning diagnostics.
            train_loss = train_running_loss / dataset_sizes["train"]
            train_acc = train_running_corrects / dataset_sizes["train"]

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Validation phase.
            model.train(mode=False)

            val_running_loss = 0.0
            val_running_corrects = 0

            # Feed forward over all the validation data.
            for idx, (val_inputs, val_labels) in enumerate(dataloaders["val"]):
                val_patches = val_inputs["patch"].to(device=self._device)
                val_x_coord = val_inputs["x_coord"].to(device=self._device)
                val_y_coord = val_inputs["y_coord"].to(device=self._device)
                val_labels = val_labels.to(device=self._device)

                # Feed forward.
                with torch.set_grad_enabled(mode=False):
                    val_logits = model(val_patches, val_x_coord,
                                       val_y_coord).squeeze(dim=1)
                    val_loss = loss_fn(logits=val_logits, target=val_labels)

                # Update validation diagnostics.
                val_running_loss += val_loss.item() * val_patches.size(0)
                pred_labels = self._extract_pred_labels(val_logits)
                val_running_corrects += torch.sum(
                    pred_labels == val_labels.data, dtype=torch.double)

                start = idx * self._batch_size
                end = start + self._batch_size

                val_all_labels[start:end] = val_labels.detach().cpu()
                val_all_predicts[start:end] = pred_labels.detach().cpu()

            self._calculate_confusion_matrix(
                all_labels=val_all_labels.numpy(),
                all_predicts=val_all_predicts.numpy(),
                classes=self._classes,
                num_classes=self._num_classes)

            # Store validation diagnostics.
            val_loss = val_running_loss / dataset_sizes["val"]
            val_acc = val_running_corrects / dataset_sizes["val"]

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            scheduler.step()

            current_lr = None
            for group in optimizer.param_groups:
                current_lr = group["lr"]

            # Remaining things related to learning.
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model_ckpt_path = self._checkpoints_folder.joinpath(
                    f"resnet{self._num_layers}_e{epoch}_va{val_acc:.5f}.pt")

                # Confirm the output directory exists.
                best_model_ckpt_path.parent.mkdir(parents=True, exist_ok=True)

                # Save the model as a state dictionary.
                torch.save(obj={
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "epoch": epoch + 1
                },
                           f=str(best_model_ckpt_path))

                self._clean_ckpt_folder(best_model_ckpt_path)

            writer.write(f"{epoch},{train_loss:.4f},"
                         f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n")

            # Print the diagnostics for each epoch.
            logging.info(
                f"Epoch {epoch} "
                f"with lr {current_lr:.15f}: "
                f"{self._format_time_period(epoch_init_time, time.time())} "
                f"t_loss: {train_loss:.4f} "
                f"t_acc: {train_acc:.4f} "
                f"v_loss: {val_loss:.4f} "
                f"v_acc: {val_acc:.4f}\n")

            early_stopper.update(val_acc)
            if early_stopper.is_stopping():
                logging.info("Early stopping")
                break

        # Print learning information at the end.
        logging.info(
            f"\nlearning complete in "
            f"{self._format_time_period(learning_init_time, time.time())}")
Example #15
def save_checkpoint(
    path: str,
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    scheduler: torch.optim.lr_scheduler,
    running_loss: float,
    total_batches: int,
    total_training_examples: int,
    acc_dev: float,
    epoch: int,
    fp16: bool,
    scaler: Optional[GradScaler],
) -> None:
    """
    Save a checkpoint that allows training of the model to be continued in the future.

    Input:
     - path: path where the model is going to be saved
     - model: TEDD1104 model to save
     - optimizer_name: name of the optimizer used for training: SGD or Adam
     - optimizer: optimizer used for training
     - scheduler: learning rate scheduler used for training
     - running_loss: running loss at the time the checkpoint is saved
     - total_batches: total number of batches seen so far
     - total_training_examples: total number of training examples seen so far
     - acc_dev: accuracy of the model in the development set
     - epoch: number of epochs used to train the model
     - fp16: whether the model uses FP16
     - scaler: if the model uses FP16, the scaler used for training

    Output:
     - None
    """

    dict_hyperparams: dict = {
        "sequence_size": model.sequence_size,
        "resnet": model.resnet,
        "pretrained_resnet": model.pretrained_resnet,
        "embedded_size": model.embedded_size,
        "hidden_size": model.hidden_size,
        "num_layers_lstm": model.num_layers_lstm,
        "bidirectional_lstm": model.bidirectional_lstm,
        "layers_out": model.layers_out,
        "dropout_cnn": model.dropout_cnn,
        "dropout_cnn_out": model.dropout_cnn_out,
        "dropout_lstm": model.dropout_lstm,
        "dropout_lstm_out": model.dropout_lstm_out,
        "fp16": fp16,
    }

    checkpoint = {
        "hyper_params": dict_hyperparams,
        "model": model.state_dict(),
        "optimizer_name": optimizer_name,
        "optimizer": optimizer.state_dict(),
        "scheduler": scheduler.state_dict(),
        "running_loss": running_loss,
        "total_batches": total_batches,
        "total_training_examples": total_training_examples,
        "acc_dev": acc_dev,
        "epoch": epoch,
        "scaler": None if not fp16 else scaler.state_dict(),
    }

    torch.save(checkpoint, path)
Example #16
def train_and_evaluate(model: nn.Module,
                       train_loader: DataLoader,
                       test_loader: DataLoader,
                       val_loader: DataLoader,
                       optimizer: optim,
                       loss_fn,
                       params: utils.Params,
                       restore_file: str = None) -> None:  # the arrow only documents that the function returns None
    '''
    Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''

    print('Begin training')
    print(model)
    train_len = len(train_loader)

    loss_summary = np.zeros((train_len * params.num_epochs))
    early_stopping = EarlyStopping(patience=5, verbose=True)

    for epoch in range(params.num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, params.num_epochs))

        # train
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params,
            epoch)
        print(
            f"train_loss: {np.mean(loss_summary[epoch * train_len:(epoch + 1) * train_len])}"
        )

        # evaluate
        val_metrics, tmp_mu, tmp_sigma = evaluate(model,
                                                  loss_fn,
                                                  val_loader,
                                                  params,
                                                  sample=params.sampling)
        # test_metrics = evaluate(model, loss_fn, test_loader, params, sample=params.sampling)

        # early stop
        early_stopping(val_metrics['test_loss'], model)
        if early_stopping.early_stop:

            print("Early stopping")

            # save weights
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optim_dict': optimizer.state_dict()
                },
                filepath=params.model_dir)

            break
Example #17
def train_model(model: nn.Module, data_loaders: Dict[str, DataLoader],
                loss_func: callable, optimizer: optim,
                model_folder: str, tensorboard_folder: str,
                args, **kwargs):
    num_epochs = args.epochs
    phases = ['train', 'val', 'test']

    writer = SummaryWriter(tensorboard_folder)

    since = time.perf_counter()  # time.clock() was removed in Python 3.8

    # save_dict, best_rmse = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 100000
    save_dict, best_pcc = {'model_state_dict': copy.deepcopy(model.state_dict()), 'epoch': 0}, 0

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.2, patience=5, threshold=1e-3, min_lr=1e-6)

    try:
        for epoch in range(num_epochs):
            running_loss = {phase: 0.0 for phase in phases}
            for phase in phases:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()

                steps, predictions, targets = 0, list(), list()
                tqdm_loader = tqdm(enumerate(data_loaders[phase]))
                for step, (features, truth_data) in tqdm_loader:
                    features = to_var(features, args.device)
                    truth_data = to_var(truth_data, args.device)
                    with torch.set_grad_enabled(phase == 'train'):
                        if args.lossinside:
                            loss, outputs = model(features, truth_data, args, loss_func=loss_func)
                        else:
                            outputs = model(features, args)
                            loss = loss_func(truth=truth_data, predict=outputs)
                        # loss = loss_func(outputs, truth_data)

                        if phase == 'train':
                            if torch.isnan(loss):
                                print("=============LOSS NAN============")
                                print(features)
                                print(truth_data)
                                print(outputs)
                            else:
                                optimizer.zero_grad()
                                loss.backward()
                                optimizer.step()

                    targets.append(truth_data.cpu().numpy())
                    with torch.no_grad():
                        predictions.append(outputs.cpu().detach().numpy())

                    running_loss[phase] += loss.item() * truth_data.size(0)  # .item() avoids retaining the graph
                    steps += truth_data.size(0)

                    tqdm_loader.set_description(
                        f'{phase} epoch: {epoch}, {phase} loss: {running_loss[phase] / steps}')

                    # For the issue that the CPU memory increases while training. DO NOT know why, but it works.
                    torch.cuda.empty_cache()
                # performance metrics
                predictions = np.concatenate(predictions)
                targets = np.concatenate(targets)
                # print(2)
                # print(predictions[:3, :3])
                # print(targets[:3, :3])
                scores = calculate_metrics(predictions.reshape(predictions.shape[0], -1),
                                           targets.reshape(targets.shape[0], -1), args, plot=epoch % 5 == 0, **kwargs)
                # print(3)
                writer.add_scalars(f'score/{phase}', scores, global_step=epoch)
                with open(model_folder+"/output.txt", "a") as f:
                    f.write(f'{phase} epoch: {epoch}, {phase} loss: {running_loss[phase] / steps}\n')
                    f.write(str(scores))
                    f.write('\n')
                    f.write(str(time.time()))
                    f.write("\n\n")
                print(scores)
                # if phase == 'val' and scores['RMSE'] < best_rmse:
                if phase == 'val' and scores['pearr'] > best_pcc:
                    best_pcc = scores['pearr']
                    # best_rmse = scores['RMSE']
                    save_dict.update(model_state_dict=copy.deepcopy(model.state_dict()),
                                     epoch=epoch,
                                     optimizer_state_dict=copy.deepcopy(optimizer.state_dict()))

            scheduler.step(running_loss['train'])

            writer.add_scalars('Loss', {
                f'{phase} loss': running_loss[phase] / len(data_loaders[phase].dataset) for phase in phases},
                               global_step=epoch)
    finally:
        time_elapsed = time.perf_counter() - since
        print(f"cost {time_elapsed} seconds")

        save_model(f"{model_folder}/best_model.pkl", **save_dict)
        save_model(f"{model_folder}/final_model.pkl",
                   **{'model_state_dict': copy.deepcopy(model.state_dict()),
                      'epoch': num_epochs,
                      'optimizer_state_dict': copy.deepcopy(optimizer.state_dict())})
Example #18
def train_helper(model: torchvision.models.resnet.ResNet,
                 dataloaders: Dict[str, torch.utils.data.DataLoader],
                 dataset_sizes: Dict[str,
                                     int], criterion: torch.nn.modules.loss,
                 optimizer: torch.optim, scheduler: torch.optim.lr_scheduler,
                 num_epochs: int, log_writer: IO, train_order_writer: IO,
                 device: torch.device, batch_size: int,
                 checkpoints_folder: Path, num_layers: int, classes: List[str],
                 minibatch_counter, num_classes: int):

    since = time.time()
    global_minibatch_counter = minibatch_counter
    # Initialize all the tensors to be used in training and validation.
    # Do this outside the loop since it will be written over entirely at each
    # epoch and doesn't need to be reallocated each time.
    train_all_labels = torch.empty(size=(dataset_sizes["train"], ),
                                   dtype=torch.long).cpu()
    train_all_predicts = torch.empty(size=(dataset_sizes["train"], ),
                                     dtype=torch.long).cpu()
    val_all_labels = torch.empty(size=(dataset_sizes["val"], ),
                                 dtype=torch.long).cpu()
    val_all_predicts = torch.empty(size=(dataset_sizes["val"], ),
                                   dtype=torch.long).cpu()

    for epoch in range(1, num_epochs + 1):

        model.train(mode=True)  # Training phase.
        train_running_loss, train_running_corrects, epoch_minibatch_counter = 0.0, 0, 0

        for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]):
            train_inputs = inputs.to(device=device)
            train_labels = labels.to(device=device)
            optimizer.zero_grad()

            # Forward and backpropagation.
            with torch.set_grad_enabled(mode=True):
                train_outputs = model(train_inputs)
                __, train_preds = torch.max(train_outputs, dim=1)
                train_loss = criterion(input=train_outputs,
                                       target=train_labels)
                train_loss.backward()
                optimizer.step()

            # Update training diagnostics.
            train_running_loss += train_loss.item() * train_inputs.size(0)
            train_running_corrects += torch.sum(
                train_preds == train_labels.data, dtype=torch.double)

            this_batch_size = train_labels.detach().cpu().shape[0]
            start = idx * batch_size
            end = start + this_batch_size
            train_all_labels[start:end] = train_labels.detach().cpu()
            train_all_predicts[start:end] = train_preds.detach().cpu()

            global_minibatch_counter += 1
            epoch_minibatch_counter += 1

        # Calculate training diagnostics
        calculate_confusion_matrix(all_labels=train_all_labels.numpy(),
                                   all_predicts=train_all_predicts.numpy(),
                                   classes=classes,
                                   num_classes=num_classes)
        train_loss = train_running_loss / (epoch_minibatch_counter *
                                           batch_size)
        train_acc = train_running_corrects / (epoch_minibatch_counter *
                                              batch_size)

        # Validation phase.
        model.train(mode=False)
        val_running_loss = 0.0
        val_running_corrects = 0

        # Feed forward over all the validation data.
        for idx, (val_inputs, val_labels,
                  paths) in enumerate(dataloaders["val"]):
            val_inputs = val_inputs.to(device=device)
            val_labels = val_labels.to(device=device)

            # Feed forward.
            with torch.set_grad_enabled(mode=False):
                val_outputs = model(val_inputs)
                _, val_preds = torch.max(val_outputs, dim=1)
                val_loss = criterion(input=val_outputs, target=val_labels)

            # Update validation diagnostics.
            val_running_loss += val_loss.item() * val_inputs.size(0)
            val_running_corrects += torch.sum(val_preds == val_labels.data,
                                              dtype=torch.double)

            this_batch_size = val_labels.detach().cpu().shape[0]
            start = idx * batch_size
            end = start + this_batch_size
            val_all_labels[start:end] = val_labels.detach().cpu()
            val_all_predicts[start:end] = val_preds.detach().cpu()

        # Calculate validation diagnostics
        calculate_confusion_matrix(all_labels=val_all_labels.numpy(),
                                   all_predicts=val_all_predicts.numpy(),
                                   classes=classes,
                                   num_classes=num_classes)
        val_loss = val_running_loss / dataset_sizes["val"]
        val_acc = val_running_corrects / dataset_sizes["val"]

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Remaining things related to training.

        epoch_output_path = checkpoints_folder.joinpath(
            f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt"
        )
        epoch_output_path.parent.mkdir(parents=True, exist_ok=True)

        # Save the model as a state dictionary.
        torch.save(obj={
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "scheduler_state_dict": scheduler.state_dict(),
            "epoch": epoch + 1
        },
                   f=str(epoch_output_path))

        log_writer.write(
            f"{epoch},{global_minibatch_counter},{train_loss:.4f},{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n"
        )

        current_lr = None
        for group in optimizer.param_groups:
            current_lr = group["lr"]

        # Print the diagnostics for each epoch.
        print(f"Epoch {epoch} with "
              f"mb {global_minibatch_counter} "
              f"lr {current_lr:.15f}: "
              f"t_loss: {train_loss:.4f} "
              f"t_acc: {train_acc:.4f} "
              f"v_loss: {val_loss:.4f} "
              f"v_acc: {val_acc:.4f}\n")

        scheduler.step()

        current_lr = None
        for group in optimizer.param_groups:
            current_lr = group["lr"]

    # Print training information at the end.
    print(f"\ntraining complete in "
          f"{(time.time() - since) // 60:.2f} minutes")

    return epoch_output_path, global_minibatch_counter
Example #19
def train_helper(model: torchvision.models.resnet.ResNet,
                 dataloaders: Dict[str, torch.utils.data.DataLoader],
                 dataset_sizes: Dict[str,
                                     int], criterion: torch.nn.modules.loss,
                 optimizer: torch.optim, scheduler: torch.optim.lr_scheduler,
                 num_epochs: int, writer: IO, train_order_writer: IO,
                 device: torch.device, start_epoch: int, batch_size: int,
                 save_interval: int, checkpoints_folder: Path, num_layers: int,
                 classes: List[str], num_classes: int) -> None:
    """
    Function for training ResNet.
    Args:
        model: ResNet model for training.
        dataloaders: Dataloaders for IO pipeline.
        dataset_sizes: Sizes of the training and validation dataset.
        criterion: Metric used for calculating loss.
        optimizer: Optimizer to use for gradient descent.
        scheduler: Scheduler to use for learning rate decay.
        start_epoch: Starting epoch for training.
        writer: Writer to write logging information.
        train_order_writer: Writer to write the order of training examples.
        device: Device to use for running model.
        num_epochs: Total number of epochs to train for.
        batch_size: Mini-batch size to use for training.
        save_interval: Number of epochs between saving checkpoints.
        checkpoints_folder: Directory to save model checkpoints to.
        num_layers: Number of layers to use in the ResNet model from [18, 34, 50, 101, 152].
        classes: Names of the classes in the dataset.
        num_classes: Number of classes in the dataset.
    """
    since = time.time()

    # Initialize all the tensors to be used in training and validation.
    # Do this outside the loop since it will be written over entirely at each
    # epoch and doesn't need to be reallocated each time.
    train_all_labels = torch.empty(size=(dataset_sizes["train"], ),
                                   dtype=torch.long).cpu()
    train_all_predicts = torch.empty(size=(dataset_sizes["train"], ),
                                     dtype=torch.long).cpu()
    val_all_labels = torch.empty(size=(dataset_sizes["val"], ),
                                 dtype=torch.long).cpu()
    val_all_predicts = torch.empty(size=(dataset_sizes["val"], ),
                                   dtype=torch.long).cpu()

    global_minibatch_counter = 0

    # Train for specified number of epochs.
    for epoch in range(start_epoch, num_epochs):

        # Training phase.
        model.train(mode=True)

        train_running_loss = 0.0
        train_running_corrects = 0
        epoch_minibatch_counter = 0

        # Train over all training data.
        for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]):

            train_inputs = inputs.to(device=device)
            train_labels = labels.to(device=device)
            optimizer.zero_grad()

            # Forward and backpropagation.
            with torch.set_grad_enabled(mode=True):
                train_outputs = model(train_inputs)
                __, train_preds = torch.max(train_outputs, dim=1)
                train_loss = criterion(input=train_outputs,
                                       target=train_labels)
                train_loss.backward()
                optimizer.step()

            # Update training diagnostics.
            train_running_loss += train_loss.item() * train_inputs.size(0)
            train_running_corrects += torch.sum(
                train_preds == train_labels.data, dtype=torch.double)

            start = idx * batch_size
            end = start + batch_size

            train_all_labels[start:end] = train_labels.detach().cpu()
            train_all_predicts[start:end] = train_preds.detach().cpu()

            global_minibatch_counter += 1
            epoch_minibatch_counter += 1

            # for path in paths: #write the order that the model was trained in
            #     train_order_writer.write("/".join(path.split("/")[-2:]) + "\n")

            if global_minibatch_counter % 10 == 0 or global_minibatch_counter == 5:

                calculate_confusion_matrix(
                    all_labels=train_all_labels.numpy(),
                    all_predicts=train_all_predicts.numpy(),
                    classes=classes,
                    num_classes=num_classes)

                # Store training diagnostics.
                train_loss = train_running_loss / (epoch_minibatch_counter *
                                                   batch_size)
                train_acc = train_running_corrects / (epoch_minibatch_counter *
                                                      batch_size)

                # Validation phase.
                model.train(mode=False)

                val_running_loss = 0.0
                val_running_corrects = 0

                # Feed forward over all the validation data.
                for idx, (val_inputs, val_labels,
                          paths) in enumerate(dataloaders["val"]):
                    val_inputs = val_inputs.to(device=device)
                    val_labels = val_labels.to(device=device)

                    # Feed forward.
                    with torch.set_grad_enabled(mode=False):
                        val_outputs = model(val_inputs)
                        _, val_preds = torch.max(val_outputs, dim=1)
                        val_loss = criterion(input=val_outputs,
                                             target=val_labels)

                    # Update validation diagnostics.
                    val_running_loss += val_loss.item() * val_inputs.size(0)
                    val_running_corrects += torch.sum(
                        val_preds == val_labels.data, dtype=torch.double)

                    start = idx * batch_size
                    end = start + batch_size

                    val_all_labels[start:end] = val_labels.detach().cpu()
                    val_all_predicts[start:end] = val_preds.detach().cpu()

                calculate_confusion_matrix(
                    all_labels=val_all_labels.numpy(),
                    all_predicts=val_all_predicts.numpy(),
                    classes=classes,
                    num_classes=num_classes)

                # Store validation diagnostics.
                val_loss = val_running_loss / dataset_sizes["val"]
                val_acc = val_running_corrects / dataset_sizes["val"]

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Remaining things related to training.
                if global_minibatch_counter % 10 == 0 or global_minibatch_counter == 5:
                    epoch_output_path = checkpoints_folder.joinpath(
                        f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt"
                    )

                    # Confirm the output directory exists.
                    epoch_output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Save the model as a state dictionary.
                    torch.save(obj={
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "epoch": epoch + 1
                    },
                               f=str(epoch_output_path))

                writer.write(
                    f"{epoch},{global_minibatch_counter},{train_loss:.4f},"
                    f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n")

                current_lr = None
                for group in optimizer.param_groups:
                    current_lr = group["lr"]

                # Print the diagnostics for each epoch.
                print(f"Epoch {epoch} with "
                      f"mb {global_minibatch_counter} "
                      f"lr {current_lr:.15f}: "
                      f"t_loss: {train_loss:.4f} "
                      f"t_acc: {train_acc:.4f} "
                      f"v_loss: {val_loss:.4f} "
                      f"v_acc: {val_acc:.4f}\n")

        scheduler.step()

        current_lr = None
        for group in optimizer.param_groups:
            current_lr = group["lr"]

    # Print training information at the end.
    print(f"\ntraining complete in "
          f"{(time.time() - since) // 60:.2f} minutes")
Example #20
def train_and_evaluate2(model: nn.Module, train_loader: DataLoader,
                        test_loader: DataLoader, optimizer: optim,
                        params: utils.Params, loss_fn: None,
                        restore_file: None, args: None, idx: None):
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir,
                                    restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
    logger.info('begin training and evaluation')
    best_test_ND = float('inf')

    # File to save first results
    out_file = os.path.join(os.path.join('experiments', args.model_name),
                            'train_results.csv')
    if not os.path.isfile(out_file):
        of_connection = open(out_file, 'w')
        writer = csv.writer(of_connection)
        # Write the headers to the file
        writer.writerow(['iteration', 'epoch', 'test_metric', 'train_loss'])
        of_connection.close()

    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=5,
                                   verbose=True,
                                   delta=0.0001,
                                   folder=params.model_dir)

    for epoch in range(params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(
            model, optimizer, loss_fn, train_loader, test_loader, params,
            args.sampling, epoch)
        test_metrics = evaluate(model,
                                loss_fn,
                                test_loader,
                                params,
                                epoch,
                                sample=args.sampling)
        # Comparisons against NaN are always False, so test with np.isnan instead.
        if np.isnan(test_metrics['rou50']):
            test_metrics['rou50'] = 100

        ND_summary[epoch] = test_metrics['rou50']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': 0,  #epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            epoch=0,  # to prevent extra model savings
            is_best=is_best,
            checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir,
                                          'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best loss is: %.5f' % best_test_ND)

        #if args.plot_figure:
        #    utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        #    utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir,
                                      'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
        # Write to the csv file ('a' means append)
        of_connection = open(out_file, 'a')
        writer = csv.writer(of_connection)
        writer.writerow([idx, epoch + 1, test_metrics,
                         loss_summary[-1]])  #loss_summary[0]??
        of_connection.close()
        logger.info('Loss_summary: %s' %
                    loss_summary[epoch * train_len:(epoch + 1) * train_len])

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        logger.info('test_metrics[rou50]: %.5f ' % test_metrics['rou50'])
        early_stopping(test_metrics['rou50'], model)

        if early_stopping.early_stop:
            logger.info('Early stopping')
            break

    with open(best_json_path) as json_file:
        best_metrics = json.load(json_file)
    return best_metrics, test_metrics
Example #21
def train_helper_with_gradients(model: torchvision.models.resnet.ResNet,
                                dataloaders: Dict[str,
                                                  torch.utils.data.DataLoader],
                                dataset_sizes: Dict[str, int],
                                criterion: torch.nn.modules.loss,
                                optimizer: torch.optim,
                                scheduler: torch.optim.lr_scheduler,
                                num_epochs: int, writer: IO,
                                train_order_writer: IO, device: torch.device,
                                start_epoch: int, batch_size: int,
                                save_interval: int, checkpoints_folder: Path,
                                num_layers: int, classes: List[str],
                                num_classes: int) -> None:
    since = time.time()

    # Initialize all the tensors to be used in training and validation.
    # Do this outside the loop since it will be written over entirely at each
    # epoch and doesn't need to be reallocated each time.
    train_all_labels = torch.empty(size=(dataset_sizes["train"], ),
                                   dtype=torch.long).cpu()
    train_all_predicts = torch.empty(size=(dataset_sizes["train"], ),
                                     dtype=torch.long).cpu()
    val_all_labels = torch.empty(size=(dataset_sizes["val"], ),
                                 dtype=torch.long).cpu()
    val_all_predicts = torch.empty(size=(dataset_sizes["val"], ),
                                   dtype=torch.long).cpu()

    global_minibatch_counter = 0

    mag_writer = open("mags_resnet18_imagenet.csv", "w")
    mag_writer.write(
        "image_name,train_loss,layers_-1,layer_0,layer_60,layer_1,layer_20,layer_40,layer_59,conf,correct\n"
    )

    # Train for specified number of epochs.
    for epoch in range(start_epoch, num_epochs):

        # Training phase.
        model.train(mode=True)

        train_running_loss = 0.0
        train_running_corrects = 0
        epoch_minibatch_counter = 0

        # Train over all training data.
        for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]):
            train_inputs = inputs.to(device=device)
            train_labels = labels.to(device=device)
            optimizer.zero_grad()

            # Forward and backpropagation.
            with torch.set_grad_enabled(mode=True):
                train_outputs = model(train_inputs)
                confs, train_preds = torch.max(train_outputs, dim=1)
                train_loss = criterion(input=train_outputs,
                                       target=train_labels)
                train_loss.backward(retain_graph=True)
                optimizer.step()

                batch_grads = torch.autograd.grad(train_loss,
                                                  model.parameters(),
                                                  retain_graph=True)
                # print(len(batch_grads))
                # for batch_grad in batch_grads:
                #     print(batch_grad.size())

                train_loss_npy = float(train_loss.detach().cpu().numpy())
                layer_num_to_mag = get_grad_magnitude(model)
                image_name = get_image_name(paths[0])
                conf = float(confs.detach().cpu().numpy())
                train_pred = int(train_preds.detach().cpu().numpy()[0])
                gt_label = int(train_labels.detach().cpu().numpy()[0])
                correct = 0
                if train_pred == gt_label:
                    correct = 1

                output_line = f"{image_name},{train_loss_npy:.4f},{layer_num_to_mag[-1]:.4f},{layer_num_to_mag[0]:.4f},{layer_num_to_mag[60]:.4f},{layer_num_to_mag[1]:.4f},{layer_num_to_mag[20]:.4f},{layer_num_to_mag[40]:.4f},{layer_num_to_mag[59]:.4f},{conf:.4f},{correct}\n"
                mag_writer.write(output_line)
                print(idx, output_line)
                # print(idx, image_name, train_loss_npy, conf, train_pred, gt_label)

            # Update training diagnostics.
            train_running_loss += train_loss.item() * train_inputs.size(0)
            train_running_corrects += torch.sum(
                train_preds == train_labels.data, dtype=torch.double)

            start = idx * batch_size
            end = start + batch_size

            train_all_labels[start:end] = train_labels.detach().cpu()
            train_all_predicts[start:end] = train_preds.detach().cpu()

            global_minibatch_counter += 1
            epoch_minibatch_counter += 1

            if global_minibatch_counter % 1000 == 0:

                calculate_confusion_matrix(
                    all_labels=train_all_labels.numpy(),
                    all_predicts=train_all_predicts.numpy(),
                    classes=classes,
                    num_classes=num_classes)

                # Store training diagnostics.
                train_loss = train_running_loss / (epoch_minibatch_counter *
                                                   batch_size)
                train_acc = train_running_corrects / (epoch_minibatch_counter *
                                                      batch_size)

                # Validation phase.
                model.train(mode=False)

                val_running_loss = 0.0
                val_running_corrects = 0

                # Feed forward over all the validation data.
                for idx, (val_inputs, val_labels,
                          paths) in enumerate(dataloaders["val"]):
                    val_inputs = val_inputs.to(device=device)
                    val_labels = val_labels.to(device=device)

                    # Feed forward.
                    with torch.set_grad_enabled(mode=False):
                        val_outputs = model(val_inputs)
                        _, val_preds = torch.max(val_outputs, dim=1)
                        val_loss = criterion(input=val_outputs,
                                             target=val_labels)

                    # Update validation diagnostics.
                    val_running_loss += val_loss.item() * val_inputs.size(0)
                    val_running_corrects += torch.sum(
                        val_preds == val_labels.data, dtype=torch.double)

                    start = idx * batch_size
                    end = start + batch_size

                    val_all_labels[start:end] = val_labels.detach().cpu()
                    val_all_predicts[start:end] = val_preds.detach().cpu()

                calculate_confusion_matrix(
                    all_labels=val_all_labels.numpy(),
                    all_predicts=val_all_predicts.numpy(),
                    classes=classes,
                    num_classes=num_classes)

                # Store validation diagnostics.
                val_loss = val_running_loss / dataset_sizes["val"]
                val_acc = val_running_corrects / dataset_sizes["val"]

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

                # Remaining things related to training.
                if global_minibatch_counter % 200000 == 0 or global_minibatch_counter == 5:
                    epoch_output_path = checkpoints_folder.joinpath(
                        f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt"
                    )

                    # Confirm the output directory exists.
                    epoch_output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Save the model as a state dictionary.
                    torch.save(obj={
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "epoch": epoch + 1
                    },
                               f=str(epoch_output_path))

                writer.write(
                    f"{epoch},{global_minibatch_counter},{train_loss:.4f},"
                    f"{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n")

                current_lr = None
                for group in optimizer.param_groups:
                    current_lr = group["lr"]

                # Print the diagnostics for each epoch.
                print(f"Epoch {epoch} with "
                      f"mb {global_minibatch_counter} "
                      f"lr {current_lr:.15f}: "
                      f"t_loss: {train_loss:.4f} "
                      f"t_acc: {train_acc:.4f} "
                      f"v_loss: {val_loss:.4f} "
                      f"v_acc: {val_acc:.4f}\n")

        scheduler.step()

        current_lr = None
        for group in optimizer.param_groups:
            current_lr = group["lr"]

    # Print training information at the end.
    print(f"\ntraining complete in "
          f"{(time.time() - since) // 60:.2f} minutes")
Exemple #22
0
def train_and_evaluate(model: nn.Module,
                       train_loader: DataLoader,
                       test_loader: DataLoader,
                       optimizer: optim, loss_fn,
                       params: utils.Params,
                       restore_file: str = None) -> None:
    '''Train the model and evaluate every epoch.
    Args:
        model: (torch.nn.Module) the Deep AR model
        train_loader: load train data and labels
        test_loader: load test data and labels
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes outputs and labels per timestep, and then computes the loss for the batch
        params: (Params) hyperparameters
        restore_file: (string) optional, name of the file to restore from (without its extension .pth.tar)
    '''
    # reload weights from restore_file if specified
    restore_epoch = 0
    if restore_file is not None:
        restore_path = os.path.join(params.model_dir, restore_file + '.pth.tar')
        logger.info('Restoring parameters from {}'.format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        restore_epoch = int(restore_file[-2:].replace('_',''))+1
    logger.info('Restoring epoch: {}'.format(restore_epoch))
    logger.info('Begin training and evaluation')
    
    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=25, verbose=True, delta=0.0001, folder=params.model_dir)
    
    if os.path.exists(os.path.join(params.model_dir, 'metrics_test_best_weights.json')):
        with open(os.path.join(params.model_dir, 'metrics_test_best_weights.json')) as json_file:
            best_test_ND = json.load(json_file)['ND']
            early_stopping.best_score = best_test_ND
    else:
        best_test_ND = float('inf')
        early_stopping.best_score = best_test_ND
    
    train_len = len(train_loader)
    ND_summary = np.zeros(params.num_epochs)
    loss_summary = np.zeros((train_len * params.num_epochs))
    
    for epoch in range(restore_epoch, params.num_epochs):
        logger.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))
        loss_summary[epoch * train_len:(epoch + 1) * train_len] = train(model, optimizer, loss_fn, train_loader,
                                                                        test_loader, params, epoch)
        test_metrics = evaluate(model, loss_fn, test_loader, params, epoch, sample=args.sampling)
#         if test_metrics['ND'] == float('nan'):
#             test_metrics['ND'] = 1000
#             print('NAN ')

#         elif test_metrics['ND'] == np.nan:
#             print('NAN ')
#             test_metrics['ND'] = 1000
        
        ND_summary[epoch] = test_metrics['ND']
        is_best = ND_summary[epoch] <= best_test_ND

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              epoch=epoch,
                              is_best=is_best,
                              checkpoint=params.model_dir)

        if is_best:
            logger.info('- Found new best ND')
            best_test_ND = ND_summary[epoch]
            best_json_path = os.path.join(params.model_dir, 'metrics_test_best_weights.json')
            utils.save_dict_to_json(test_metrics, best_json_path)

        logger.info('Current Best ND is: %.5f' % best_test_ND)

        utils.plot_all_epoch(ND_summary[:epoch + 1], args.dataset + '_ND', params.plot_dir)
        utils.plot_all_epoch(loss_summary[:(epoch + 1) * train_len], args.dataset + '_loss', params.plot_dir)

        last_json_path = os.path.join(params.model_dir, 'metrics_test_last_weights.json')
        utils.save_dict_to_json(test_metrics, last_json_path)
        
        
        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        logger.info('ND : %.5f ' % test_metrics['ND'])
        early_stopping(test_metrics['ND'], model)
        
        if early_stopping.early_stop:
            logger.info('Early stopping')
            break
        
#     # load the last checkpoint with the best model
#     model.load_state_dict(torch.load('checkpoint.pt'))

    if args.save_best:
        f = open('./param_search.txt', 'w')
        f.write('-----------\n')
        list_of_params = args.search_params.split(',')
        print_params = ''
        for param in list_of_params:
            param_value = getattr(params, param)
            print_params += f'{param}: {param_value:.2f},'
        print_params = print_params[:-1]  # drop the trailing comma
        f.write(print_params + '\n')
        f.write('Best ND: ' + str(best_test_ND) + '\n')
        logger.info(print_params)
        logger.info(f'Best ND: {best_test_ND}')
        f.close()
        utils.plot_all_epoch(ND_summary, print_params + '_ND', location=params.plot_dir)
        utils.plot_all_epoch(loss_summary, print_params + '_loss', location=params.plot_dir)
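# The utils.save_checkpoint and utils.load_checkpoint helpers used above are
# not part of this excerpt. A minimal sketch consistent with how they are
# called here (a dict holding 'epoch', 'state_dict' and 'optim_dict', an
# 'epoch_XX.pth.tar' naming scheme implied by the restore_file parsing, and a
# best copy when is_best is True); the real helpers may differ.
import os
import shutil

import torch


def save_checkpoint(state, epoch, is_best, checkpoint):
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, f'epoch_{epoch:02d}.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))


def load_checkpoint(restore_path, model, optimizer=None):
    checkpoint = torch.load(restore_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint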
def save_checkpoint(
    path: str,
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    acc_dev: float,
    epoch: int,
    fp16: bool,
    opt_level: str = None,
) -> None:

    """
    Save a checkpoint that allows continuing to train the model in the future

    Input:
     - path: path where the model is going to be saved
     - model: TEDD1104 model to save
     - optimizer_name: Name of the optimizer used for training: SGD or Adam
     - optimizer: Optimizer used for training
     - acc_dev: Accuracy of the model in the development set
     - epoch: Num of epoch used to train the model
     - fp16: True if the model was trained with FP16 (Nvidia Apex AMP)
     - opt_level: If the model uses FP16, the AMP opt_level

    Output:
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    dict_hyperparams: dict = {
        "sequence_size": model.sequence_size,
        "resnet": model.resnet,
        "pretrained_resnet": model.pretrained_resnet,
        "embedded_size": model.embedded_size,
        "hidden_size": model.hidden_size,
        "num_layers_lstm": model.num_layers_lstm,
        "bidirectional_lstm": model.bidirectional_lstm,
        "layers_out": model.layers_out,
        "dropout_cnn": model.dropout_cnn,
        "dropout_cnn_out": model.dropout_cnn_out,
        "dropout_lstm": model.dropout_lstm,
        "dropout_lstm_out": model.dropout_lstm_out,
        "fp16": fp16,
        "amp_opt_level": opt_level,
    }

    checkpoint = {
        "hyper_params": dict_hyperparams,
        "model": model.state_dict(),
        "optimizer_name": optimizer_name,
        "optimizer": optimizer.state_dict(),
        "acc_dev": acc_dev,
        "epoch": epoch,
        "amp": None if not fp16 else amp.state_dict(),
        "opt_level": opt_level,
    }

    torch.save(checkpoint, path)
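# A rough sketch (hypothetical helper, not defined elsewhere in this document)
# of how a checkpoint written by save_checkpoint above might be read back to
# resume training. It relies only on the keys saved in the dict above; the
# Apex amp handling assumes amp has already been initialized for this model
# and optimizer.
import torch


def load_checkpoint_for_training(path: str, model, optimizer):
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    if checkpoint['hyper_params']['fp16'] and checkpoint['amp'] is not None:
        from apex import amp
        amp.load_state_dict(checkpoint['amp'])
    return checkpoint['epoch'], checkpoint['acc_dev']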
def train_smartgrad_helper(model: torchvision.models.resnet.ResNet,
                 dataloaders: Dict[str, torch.utils.data.DataLoader],
                 dataset_sizes: Dict[str, int],
                 criterion: torch.nn.modules.loss, 
                 optimizer: torch.optim,
                 scheduler: torch.optim.lr_scheduler, 
                 num_epochs: int,
                 log_writer: IO, 
                 train_order_writer: IO, 
                 device: torch.device, 
                 train_batch_size: int,
                 val_batch_size: int,
                 fake_minibatch_size: int, 
                 annealling_factor: float,
                 save_mb_interval: int, 
                 val_mb_interval: int,
                 checkpoints_folder: Path,
                 num_layers: int, 
                 classes: List[str],
                 num_classes: int) -> None:

    grad_layers = list(range(1, 21))

    since = time.time()
    global_minibatch_counter = 0
    # Initialize all the tensors to be used in training and validation.
    # Do this outside the loop since it will be written over entirely at each
    # epoch and doesn't need to be reallocated each time.
    train_all_labels = torch.empty(size=(dataset_sizes["train"], ),
                                   dtype=torch.long).cpu()
    train_all_predicts = torch.empty(size=(dataset_sizes["train"], ),
                                     dtype=torch.long).cpu()
    val_all_labels = torch.empty(size=(dataset_sizes["val"], ),
                                 dtype=torch.long).cpu()
    val_all_predicts = torch.empty(size=(dataset_sizes["val"], ),
                                   dtype=torch.long).cpu()

    for epoch in range(1, num_epochs+1):

        model.train(mode=False) # Training phase.
        train_running_loss, train_running_corrects, epoch_minibatch_counter = 0.0, 0, 0
        idx_to_gt = {}
        
        for idx, (inputs, labels, paths) in enumerate(dataloaders["train"]):
            train_inputs = inputs.to(device=device)
            train_labels = labels.to(device=device)
            optimizer.zero_grad()

            # Forward and backpropagation.
            with torch.set_grad_enabled(mode=True):
                train_outputs = model(train_inputs)
                __, train_preds = torch.max(train_outputs, dim=1)
                train_loss = criterion(input=train_outputs, target=train_labels)
                train_loss.backward(retain_graph=True)

                gt_label = int(train_labels.detach().cpu().numpy()[0])
                idx_to_gt[idx] = gt_label

                ########################
                #### important code ####
                ########################
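                # Each loader step contributes one example's gradient; examples
                # are grouped into a "fake" minibatch of fake_minibatch_size.
                # Per-example gradients are stashed in minibatch_grad_dict, and
                # once the group is full they are re-weighted via
                # get_idx_to_weight and written back into param.grad before a
                # single optimizer.step().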

                #clear the memory
                fake_minibatch_idx = idx % fake_minibatch_size
                fake_minibatch_num = int(idx / fake_minibatch_size)
                if fake_minibatch_idx == 0:
                    minibatch_grad_dict = {}
                    gc.collect()
                
                #get the per-example gradient magnitude and add to minibatch_grad_dict
                grad_as_dict, grad_flattened = model_to_grad_as_dict_and_flatten(model, grad_layers)
                minibatch_grad_dict[idx] = (grad_as_dict, grad_flattened)

                #once the fake minibatch is full, compute the per-example weights
                if fake_minibatch_idx == fake_minibatch_size - 1:
                    idx_to_weight_batch = get_idx_to_weight(minibatch_grad_dict, annealling_factor, idx_to_gt)
                    print(idx_to_weight_batch)

                    ##########################
                    # print("\n...............................updating......................................" + str(idx))
                    for layer_num, param in enumerate(model.parameters()):
                        # if layer_num in [0]:#grad_layers:
                        new_grad = get_new_layer_grad(layer_num, idx_to_weight_batch, minibatch_grad_dict)
                        assert param.grad.detach().cpu().numpy().shape == new_grad.detach().cpu().numpy().shape
                        param.grad = new_grad
                            # check_model_weights(idx, model)
                    optimizer.step()
                    # check_model_weights(idx, model)
                    # print("................................done........................................." + str(idx) + '\n\n\n\n')
                    ##########################

            # Update training diagnostics.
            train_running_loss += train_loss.item() * train_inputs.size(0)
            train_running_corrects += torch.sum(train_preds == train_labels.data, dtype=torch.double)

            start = idx * train_batch_size
            end = start + train_batch_size
            train_all_labels[start:end] = train_labels.detach().cpu()
            train_all_predicts[start:end] = train_preds.detach().cpu()

            global_minibatch_counter += 1
            epoch_minibatch_counter += 1

            # Write the path of training order if it exists
            if train_order_writer:
                for path in paths: #write the order that the model was trained in
                    train_order_writer.write("/".join(path.split("/")[-2:]) + "\n")

            # Validate the model
            if global_minibatch_counter % val_mb_interval == 0 or global_minibatch_counter == 1:

                # Calculate training diagnostics
                calculate_confusion_matrix( all_labels=train_all_labels.numpy(), all_predicts=train_all_predicts.numpy(),
                                            classes=classes, num_classes=num_classes)
                train_loss = train_running_loss / (epoch_minibatch_counter * train_batch_size)
                train_acc = train_running_corrects / (epoch_minibatch_counter * train_batch_size)

                # Validation phase.
                model.train(mode=False)
                val_running_loss = 0.0
                val_running_corrects = 0

                # Feed forward over all the validation data.
                for idx, (val_inputs, val_labels, paths) in enumerate(dataloaders["val"]):
                    val_inputs = val_inputs.to(device=device)
                    val_labels = val_labels.to(device=device)

                    # Feed forward.
                    with torch.set_grad_enabled(mode=False):
                        val_outputs = model(val_inputs)
                        _, val_preds = torch.max(val_outputs, dim=1)
                        val_loss = criterion(input=val_outputs, target=val_labels)

                    # Update validation diagnostics.
                    val_running_loss += val_loss.item() * val_inputs.size(0)
                    val_running_corrects += torch.sum(val_preds == val_labels.data,
                                                    dtype=torch.double)

                    start = idx * val_batch_size
                    end = start + val_batch_size
                    val_all_labels[start:end] = val_labels.detach().cpu()
                    val_all_predicts[start:end] = val_preds.detach().cpu()

                # Calculate validation diagnostics
                calculate_confusion_matrix( all_labels=val_all_labels.numpy(), all_predicts=val_all_predicts.numpy(),
                                            classes=classes, num_classes=num_classes)
                val_loss = val_running_loss / dataset_sizes["val"]
                val_acc = val_running_corrects / dataset_sizes["val"]

                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    

                # Remaining things related to training.
                if global_minibatch_counter % save_mb_interval == 0 or global_minibatch_counter == 1:

                    epoch_output_path = checkpoints_folder.joinpath(f"resnet{num_layers}_e{epoch}_mb{global_minibatch_counter}_va{val_acc:.5f}.pt")
                    epoch_output_path.parent.mkdir(parents=True, exist_ok=True)

                    # Save the model as a state dictionary.
                    torch.save(obj={
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        "scheduler_state_dict": scheduler.state_dict(),
                        "epoch": epoch + 1
                    }, f=str(epoch_output_path))

                log_writer.write(f"{epoch},{global_minibatch_counter},{train_loss:.4f},{train_acc:.4f},{val_loss:.4f},{val_acc:.4f}\n")

                current_lr = None
                for group in optimizer.param_groups:
                    current_lr = group["lr"]

                # Print the diagnostics for each epoch.
                print(f"Epoch {epoch} with "
                    f"mb {global_minibatch_counter} "
                    f"lr {current_lr:.15f}: "
                    f"t_loss: {train_loss:.4f} "
                    f"t_acc: {train_acc:.4f} "
                    f"v_loss: {val_loss:.4f} "
                    f"v_acc: {val_acc:.4f}\n")

        scheduler.step()

        current_lr = None
        for group in optimizer.param_groups:
            current_lr = group["lr"]