Example #1
    def __init__(self,
                 model,
                 model_dir=opt.model_dir,
                 mode=CONST.TRAIN,
                 resume=opt.resume,
                 lr=opt.lr):
        """ Initialize Trainer

        Args:
            model: (MusicRNN) Model
            model_dir: (str) Path to the saved model directory
            mode: (str) Train or Test
            resume: (str) Path to pretrained model
            lr: (float) Learning rate

        """
        self.logger = logging.getLogger('trainer')
        self.logger.setLevel(opt.logging_level)
        self.computing_device = self._set_cuda()

        self.model = model.to(self.computing_device)
        self.logger.debug("Model on CUDA? {}".format(
            next(self.model.parameters()).is_cuda))
        self.model_dir = model_dir
        if not os.path.isdir(self.model_dir):
            os.makedirs(self.model_dir)
        self.start_epoch = 0
        self.best_err = np.inf

        self.optimizer = self._get_optimizer(lr=lr)

        self.criterion = nn.MSELoss()

        # meter
        self.meter = {CONST.TRAIN: get_meter(), CONST.VAL: get_meter()}

        if mode == CONST.TRAIN:
            freezing_layers(model)

        if resume:
            fn = os.path.join(self.model_dir, 'model_best.pth.tar')
            self.load_checkpoint(fn)
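The constructor above calls a `_set_cuda()` helper that the snippet does not include. A minimal sketch of what such a device selector could look like; the method name comes from the example, but the body is an assumption rather than the project's implementation:

import logging

import torch

class Trainer:
    # Hypothetical sketch: only the device-selection helper is shown,
    # not the full class from the example above.
    def __init__(self):
        self.logger = logging.getLogger('trainer')

    def _set_cuda(self):
        """Prefer the GPU when one is available, otherwise use the CPU."""
        if torch.cuda.is_available():
            self.logger.debug('CUDA available; using GPU')
            return torch.device('cuda')
        self.logger.debug('CUDA unavailable; using CPU')
        return torch.device('cpu')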
Example #2
def train_and_evaluate(opt, logger=None, tb_logger=None):
    """ Train and evaluate a model

    The behavior of `train_and_evaluate()` breaks down into two parts.
    Part 1 gets the dataloaders, model, and trainer used to conduct the
    training/evaluation. Parts 2.A and 2.B then handle training and
    evaluating, respectively.

    Given the mode, train_and_evaluate can take two actions:

    1) mode == TRAIN ---> action: train_and_validate
    2) mode == VAL   ---> action: evaluate the model on the full validation/test set


    Args:
        opt (Config): A state dictionary holding preset parameters
        logger (Logger): Logging instance
        tb_logger (SummaryWriter): Tensorboard logging instance

    Returns:
        None

    """

    #TODO implement Early Stopping
    #TODO implement test code

    logger = logger if logger else logging.getLogger('train-and-evaluate')
    logger.setLevel(opt.logging_level)

    # Read in dataset
    # check the path for the data loader to make sure it is loading the right data set
    data_loader = {
        mode: get_dataloader(data_dir=opt.data_dir,
                             batch_size=opt.batch_size,
                             mode=mode)
        for mode in [CONST.TRAIN, CONST.VAL]
    }
    # Create model
    model = HABClassifier(arch=opt.arch,
                          pretrained=opt.pretrained,
                          num_classes=opt.class_num)

    # Initialize the Trainer, which sets up the loss, optimizer, and any pretrained weights
    trainer = Trainer(model=model,
                      model_dir=opt.model_dir,
                      mode=opt.mode,
                      resume=opt.resume,
                      lr=opt.lr,
                      class_count=data_loader[CONST.TRAIN].dataset.data[
                          CONST.LBL].value_counts())

    #==== BEGIN OPTION 1: TRAINING ====#
    # Train and validate model if set to TRAINING
    # When training, we do both training and validation within the loop.
    # When set to the validation mode, this will run a full evaluation
    # and produce more summarized evaluation results. This is the default condition
    # if the mode is not training.
    if opt.mode == CONST.TRAIN:
        best_err = trainer.best_err
        Logger.section_break('Valid (Epoch {})'.format(trainer.start_epoch))
        err, acc, _, metrics_test = evaluate(trainer.model,
                                             trainer,
                                             data_loader[CONST.VAL],
                                             0,
                                             opt.batch_size,
                                             logger,
                                             tb_logger,
                                             max_iters=None)
        metrics_best = metrics_test

        eps_meter = get_meter(
            meters=['train_loss', 'val_loss', 'train_acc', 'val_acc'])

        for ii, epoch in enumerate(
                range(trainer.start_epoch, trainer.start_epoch + opt.epochs)):

            # Train for one epoch
            Logger.section_break('Train (Epoch {})'.format(epoch))
            train_loss, train_acc = train(trainer.model, trainer,
                                          data_loader[CONST.TRAIN], epoch,
                                          logger, tb_logger, opt.batch_size,
                                          opt.print_freq)
            eps_meter['train_loss'].update(train_loss)
            eps_meter['train_acc'].update(train_acc)

            # Evaluate on validation set
            Logger.section_break('Valid (Epoch {})'.format(epoch))
            err, acc, _, metrics_test = evaluate(trainer.model,
                                                 trainer,
                                                 data_loader[CONST.VAL],
                                                 epoch,
                                                 opt.batch_size,
                                                 logger,
                                                 tb_logger,
                                                 max_iters=None)
            eps_meter['val_loss'].update(err)
            eps_meter['val_acc'].update(acc)

            # Remember best error and save checkpoint
            is_best = err < best_err
            best_err = min(err, best_err)
            state = trainer.generate_state_dict(epoch=epoch, best_err=best_err)

            if epoch % opt.save_freq == 0:
                trainer.save_checkpoint(
                    state,
                    is_best=False,
                    filename='checkpoint-{}_{:0.4f}.pth.tar'.format(
                        epoch, acc))
            if is_best:
                metrics_best = metrics_test
                trainer.save_checkpoint(state,
                                        is_best=is_best,
                                        filename='model_best.pth.tar')

        # ==== END OPTION 1: TRAINING LOOP ====#
        # Generate evaluation plots
        opt.train_acc = max(eps_meter['train_acc'].data)
        opt.test_acc = max(eps_meter['val_acc'].data)
        # plot loss over epochs
        vis_training(eps_meter['train_loss'].data,
                     eps_meter['val_loss'].data,
                     loss=True)
        # plot accuracy over epochs
        vis_training(eps_meter['train_acc'].data,
                     eps_meter['val_acc'].data,
                     loss=False)

        # plot best confusion matrix
        plt.figure()
        metrics_best.compute_cm(plot=True)

    #==== BEGIN OPTION 2: EVALUATION ====#
    # EVALUATE the model if set to evaluation mode
    # A more comprehensive evaluation report is written to eval.log
    elif opt.mode == CONST.VAL:
        err, acc, run_time, metrics = evaluate(
            model=trainer.model,
            trainer=trainer,
            data_loader=data_loader[CONST.VAL],
            logger=logger,
            tb_logger=tb_logger)

        Logger.section_break('EVAL COMPLETED')
        model_parameters = filter(lambda p: p.requires_grad,
                                  trainer.model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        metrics.print_eval(params, run_time, err, acc, metrics.results_dir)

        cm, mca = metrics.compute_cm(plot=True)
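These examples lean on `get_meter()` and meters exposing `val`, `avg`, and `data` attributes (the docstrings point at `utils/eval_utils.py`). A minimal sketch consistent with how the meters are used above, offered as an assumption about their shape rather than the actual utility:

class AverageMeter:
    """Tracks the latest value, a running average, and the update history."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0
        self.data = []  # per-update history, e.g. for vis_training()

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        self.data.append(val)


def get_meter(meters=('loss', 'acc')):
    # returns named meters, matching calls such as
    # get_meter(meters=['batch_time', 'loss', 'acc'])
    return {name: AverageMeter() for name in meters}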
Example #3
def evaluate(model,
             trainer,
             data_loader,
             epoch=0,
             batch_size=opt.batch_size,
             logger=None,
             tb_logger=None,
             max_iters=None):
    """ Evaluate model

    Similar in structure to `train()`, this function includes the same
    bookkeeping features and wrapper items. The differences are that
    evaluation only runs up to `max_iters` batches when that argument is
    specified, and that it initializes an `EvalMetrics` instance.

    The latter is currently used to save predictions and ground truths to
    compute the confusion matrix.

    Args:
        model: Classification model
        trainer (Trainer): Training wrapper
        data_loader (torch.data.Dataloader): Generator data loading instance
        epoch (int): Current epoch
        logger (Logger): Logger. Used to display/log metrics
        tb_logger (SummaryWriter): Tensorboard Logger
        batch_size (int): Batch size
        max_iters (int): Max iterations

    Returns:
        float: Loss average
        float: Accuracy average
        AverageMeter: Batch time meter
        EvalMetrics: Evaluation wrapper to compute CMs

    """
    criterion = trainer.criterion

    # Initialize meter and metrics
    meter = get_meter(meters=['batch_time', 'loss', 'acc'])
    predictions, gtruth, ids = [], [], []
    classes = data_loader.dataset.classes
    metrics = EvalMetrics(classes, predictions, gtruth, ids, trainer.model_dir)

    # Switch to evaluate mode
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            # process batch items: images, labels
            img = to_cuda(batch[CONST.IMG], trainer.computing_device)
            target = to_cuda(batch[CONST.LBL],
                             trainer.computing_device,
                             label=True)
            img_id = batch[CONST.ID]

            # compute output
            end = time.time()
            logits = model(img)
            loss = criterion(logits, target)
            acc = accuracy(logits, target)
            batch_size = batch[CONST.LBL].size(0)

            # update metrics
            meter['acc'].update(acc, batch_size)
            meter['loss'].update(loss.item(), batch_size)

            # update metrics2
            metrics.update(logits, target, img_id)

            # measure elapsed time
            meter['batch_time'].update(time.time() - end, batch_size)

            if i % opt.print_freq == 0:
                log = ('EVAL [{:02d}][{:2d}/{:2d}] TIME {:10} ACC {:10} '
                       'LOSS {:10}').format(
                           epoch, i, len(data_loader),
                           '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['batch_time']),
                           '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['acc']),
                           '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['loss']))
                logger.info(log)

                if tb_logger is not None:
                    tb_logger.add_scalar('test/loss', meter['loss'].val, epoch)
                    tb_logger.add_scalar('test/accuracy', meter['acc'].val,
                                         epoch)

            if max_iters is not None and i >= max_iters:
                break

        # Print last eval
        log = ('EVAL [{:02d}][{:2d}/{:2d}] TIME {:10} ACC {:10} '
               'LOSS {:10}').format(
                   epoch, i, len(data_loader),
                   '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['batch_time']),
                   '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['acc']),
                   '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['loss']))
        logger.info(log)

        if tb_logger is not None:
            tb_logger.add_scalar('test-epoch/loss', meter['loss'].avg, epoch)
            tb_logger.add_scalar('test-epoch/accuracy', meter['acc'].avg,
                                 epoch)

    return meter['loss'].avg, meter['acc'].avg, meter['batch_time'], metrics
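Both `evaluate()` and `train()` call an `accuracy()` helper that is not shown here. A plausible top-1 version, assuming `logits` has shape (batch, classes) and `target` holds integer labels; the project's helper may differ (e.g. return a tensor or support top-k):

import torch

def accuracy(logits, target):
    # percentage of samples whose argmax prediction matches the label
    preds = torch.argmax(logits, dim=1)
    correct = (preds == target).sum().item()
    return 100.0 * correct / target.size(0)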
Example #4
def train(model,
          trainer,
          train_loader,
          epoch,
          logger,
          tb_logger,
          batch_size=opt.batch_size,
          print_freq=opt.print_freq):
    """ Train the model

    Beyond the typical training loop, `train()` incorporates other useful
    bookkeeping features and wrapper functions. This includes keeping track
    of accuracy, loss, and batch time, as well as using the optimizer and
    loss function wrapped by the `trainer`. Be sure to reference `trainer.py`
    or `utils/eval_utils.py` if extra detail is needed.

    Args:
        model: Classification model
        trainer (Trainer): Training wrapper
        train_loader (torch.data.Dataloader): Generator data loading instance
        epoch (int): Current epoch
        logger (Logger): Logger. Used to display/log metrics
        tb_logger (SummaryWriter): Tensorboard Logger
        batch_size (int): Batch size
        print_freq (int): Print frequency

    Returns:
        float: Loss average
        float: Accuracy average

    """
    criterion = trainer.criterion
    optimizer = trainer.optimizer

    # Initialize meter to bookkeep the following parameters
    meter = get_meter(meters=['batch_time', 'data_time', 'loss', 'acc'])

    # Switch to training mode
    model.train(True)

    end = time.time()
    for i, batch in enumerate(train_loader):
        # process batch items: images, labels
        img = to_cuda(batch[CONST.IMG], trainer.computing_device)
        target = to_cuda(batch[CONST.LBL],
                         trainer.computing_device,
                         label=True)
        img_id = batch[CONST.ID]  # not used during training

        # measure data loading time
        meter['data_time'].update(time.time() - end)

        # compute output
        end = time.time()
        logits = model(img)
        loss = criterion(logits, target)
        acc = accuracy(logits, target)

        # update metrics (use .item() so the meter does not retain the graph)
        meter['acc'].update(acc, batch_size)
        meter['loss'].update(loss.item(), batch_size)

        # compute gradients (the optimizer step is applied after logging below)
        optimizer.zero_grad()
        loss.backward()

        if i % print_freq == 0:
            log = ('TRAIN [{:02d}][{:2d}/{:2d}] TIME {:10} DATA {:10} '
                   'ACC {:10} LOSS {:10}').format(
                       epoch, i, len(train_loader),
                       '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['batch_time']),
                       '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['data_time']),
                       '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['acc']),
                       '{t.val:.3f} ({t.avg:.3f})'.format(t=meter['loss']))
            logger.info(log)

            tb_logger.add_scalar('train/loss', meter['loss'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar('train/accuracy', meter['acc'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar('data_time', meter['data_time'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar(
                'compute_time',
                meter['batch_time'].val - meter['data_time'].val,
                epoch * len(train_loader) + i)

        optimizer.step()

        # measure elapsed time
        meter['batch_time'].update(time.time() - end)
        end = time.time()

    tb_logger.add_scalar('train-epoch/loss', meter['loss'].avg, epoch)
    tb_logger.add_scalar('train-epoch/accuracy', meter['acc'].avg, epoch)

    return meter['loss'].avg, meter['acc'].avg
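`to_cuda()` is another helper these snippets assume. A minimal sketch matching the call sites above, where `label=True` marks integer class targets; the casting policy is a guess, not the project's code:

import torch

def to_cuda(tensor, computing_device, label=False):
    # Move a batch tensor onto the training device; labels are cast to
    # long so they are valid targets for nn.CrossEntropyLoss.
    if label:
        tensor = tensor.long()
    return tensor.to(computing_device)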
Example #5
def train_and_evaluate(opt, logger=None):
    """ Train and evaluate a model

    The behavior of `train_and_evaluate()` breaks down into two parts.
    Part 1 gets the dataloaders, model, and trainer used to conduct the
    training/evaluation. Parts 2.A and 2.B then handle training and
    evaluating, respectively.

    Given the mode, train_and_evaluate can take two actions:

    1) mode == TRAIN ---> action: train_and_validate
    2) mode == VAL   ---> action: evaluate the model on the full validation/test set


    Args:
        opt (Config): A state dictionary holding preset parameters
        logger (Logger): Logging instance

    Returns:
        None

    """

    #TODO implement Early Stopping
    #TODO implement test code
    
    logger = logger if logger else logging.getLogger('train-and-evaluate')
    logger.setLevel(opt.logging_level)

    # Read in dataset
    # check the path for the data loader to make sure it is loading the right data set
    data_loader = {mode: get_dataloader(data_dir=opt.data_dir,
                                        batch_size=opt.batch_size,
                                        mode=mode) for mode in [CONST.TRAIN, CONST.VAL]}
    # Create model
    model = MODEL(arch=opt.arch, pretrained=opt.pretrained, num_classes=2)
    # Initialize Trainer for initializing losses, optimizers, loading weights, etc
    trainer = Trainer(model=model, model_dir=opt.model_dir, mode=opt.mode,
                      resume=opt.resume, lr=opt.lr)

    #==== TRAINING ====#
    # Train and validate model if set to TRAINING
    # When training, we do both training and validation within the loop.
    # When set to the validation mode, this will run a full evaluation
    # and produce more summarized evaluation results. This is the default condition
    # if the mode is not training.
    if opt.mode == CONST.TRAIN:
        best_err = trainer.best_err
        Logger.section_break('Valid (Epoch {})'.format(trainer.start_epoch))
        err, acc, _ = evaluate(trainer.model, trainer, data_loader[CONST.VAL],
                               0, opt.batch_size, logger)

        eps_meter = get_meter(meters=['train_loss', 'val_loss', 'train_acc', 'val_acc'])
        best_err = min(best_err, err)
        
        for ii, epoch in enumerate(range(trainer.start_epoch,
                                         trainer.start_epoch+opt.epochs)):

            # Train for one epoch
            Logger.section_break('Train (Epoch {})'.format(epoch))
            train_loss, train_acc = train(trainer.model, trainer, data_loader[CONST.TRAIN], 
                                          epoch, logger, opt.batch_size, opt.print_freq)
            eps_meter['train_loss'].update(train_loss)
            eps_meter['train_acc'].update(train_acc)
            
            # Evaluate on validation set
            Logger.section_break('Valid (Epoch {})'.format(epoch))
            err, acc, _ = evaluate(trainer.model, trainer, data_loader[CONST.VAL],
                                   epoch, opt.batch_size, logger)
            eps_meter['val_loss'].update(err)
            eps_meter['val_acc'].update(acc)
                
            # Remember best error and save checkpoint
            is_best = err < best_err
            best_err = min(err, best_err)
            state = trainer.generate_state_dict(epoch=epoch, best_err=best_err)
            
            if is_best:
                trainer.save_checkpoint(state, is_best=is_best,
                                        filename='model_best.pth.tar')

        # ==== END: TRAINING LOOP ====#
        if len(eps_meter['train_loss'].data) > 0:
            # plot loss over epochs
            vis_training(eps_meter['train_loss'].data, eps_meter['val_loss'].data, loss=True)
            # plot accuracy over epochs
            vis_training(eps_meter['train_acc'].data, eps_meter['val_acc'].data, loss=False)
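The training loops depend on `trainer.generate_state_dict()` and `trainer.save_checkpoint()`. A common shape for this pair, sketched under the assumption that a checkpoint is a serialized dict; the project's versions may store additional fields:

import os
import shutil

import torch

def generate_state_dict(model, optimizer, epoch, best_err):
    # bundle everything needed to resume training later
    return {'epoch': epoch,
            'best_err': best_err,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()}

def save_checkpoint(state, is_best, model_dir, filename='checkpoint.pth.tar'):
    path = os.path.join(model_dir, filename)
    torch.save(state, path)
    if is_best:
        # keep the best weights under the name load_checkpoint() expects
        shutil.copyfile(path, os.path.join(model_dir, 'model_best.pth.tar'))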
Example #6
    def __init__(self, model, model_dir=opt.model_dir, mode=CONST.TRAIN,
                 resume=opt.resume, lr=opt.lr, class_count=None):
        """ Initialize Trainer

        Args:
            model: (MusicRNN) Model
            model_dir: (str) Path to the saved model directory
            mode: (str) Train or Test
            resume: (str) Path to pretrained model
            lr: (float) Learning rate
            class_count: (pandas.Series) Per-class sample counts for weighted loss

        """
        self.logger = logging.getLogger('trainer')
        self.logger.setLevel(opt.logging_level)
        self.computing_device = self._set_cuda()

        self.model = model.to(self.computing_device)
        self.logger.debug("Model on CUDA? {}".format(next(self.model.parameters()).is_cuda))
        self.model_dir = model_dir
        if not os.path.isdir(self.model_dir):
            os.makedirs(self.model_dir)
        self.start_epoch = 0
        self.best_err = np.inf

        self.optimizer = self._get_optimizer(lr=lr)
        
        # Defaults to CrossEntropyLoss
        #TODO set interactive mode for setting the losses
        if opt.mode == CONST.TRAIN:
            self.logger.debug(class_count)
            if opt.interactive:
                # normalize the answer to a bool so that 'n' is not truthy
                weighted_y_n = input(
                    'Do you want to use weighted loss? (y/n)\n'
                ).strip().lower() == 'y'
            else:
                weighted_y_n = opt.weighted_loss

            if weighted_y_n:
                # order the class counts by label index
                weight = np.array([count for _, count in
                                   sorted(zip(class_count.keys().tolist(),
                                              class_count.tolist()))])
                self.logger.info('Class count is: ' + str(weight))
                weight = weight / weight.sum()
                self.logger.info('Class weights are: ' + str(weight))
                # reverse so that rarer classes receive larger weights
                weight = np.flip(weight).tolist()
                self.logger.info('Weighted loss will be: ' + str(weight))
                weight = torch.FloatTensor(weight).to(self.computing_device)
                self.criterion = nn.CrossEntropyLoss(weight=weight)
            else:
                self.criterion = nn.CrossEntropyLoss()
        else:
            self.criterion = nn.CrossEntropyLoss()

        # meter
        self.meter = {CONST.TRAIN: get_meter(), CONST.VAL: get_meter()}

        if resume or mode == CONST.VAL:
            sql = model_sql()
            fn = os.path.join(self.model_dir, 'model_best.pth.tar')
            sql.close()
            self.load_checkpoint(fn)

        if mode == CONST.TRAIN:
            freezing_layers(model)

        if mode == CONST.DEPLOY:
            fn = os.path.join(self.model_dir, 'model_best.pth.tar')
            self.load_checkpoint(fn)
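To see what the weighted-loss branch computes, here is a toy run with made-up counts. Note that reversing the normalized frequencies with `np.flip` only behaves like inverse-frequency weighting when the counts are monotone in label order, which is an implicit assumption of the code above:

import numpy as np
import pandas as pd

# made-up label -> frequency counts (label 2 is the rarest)
class_count = pd.Series([100, 30, 10], index=[0, 1, 2])

# same steps as the constructor above
weight = np.array([count for _, count in
                   sorted(zip(class_count.keys().tolist(),
                              class_count.tolist()))])  # [100, 30, 10]
weight = weight / weight.sum()       # [0.714, 0.214, 0.071]
weight = np.flip(weight).tolist()    # rarest class now gets the largest weight
print(weight)                        # [0.0714..., 0.2142..., 0.7142...]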