def __init__(self, model, model_dir=opt.model_dir, mode=CONST.TRAIN,
             resume=opt.resume, lr=opt.lr):
    """Initialize Trainer

    Args:
        model: (MusicRNN) Model
        model_dir: (str) Path to the saved model directory
        mode: (str) Train or test mode
        resume: (str) Path to a pretrained model
    """
    self.logger = logging.getLogger('trainer')
    self.logger.setLevel(opt.logging_level)

    self.computing_device = self._set_cuda()
    self.model = model.to(self.computing_device)
    self.logger.debug("Model on CUDA? {}".format(
        next(self.model.parameters()).is_cuda))

    self.model_dir = model_dir
    if not os.path.isdir(self.model_dir):
        os.makedirs(self.model_dir)

    self.start_epoch = 0
    self.best_err = np.inf
    self.optimizer = self._get_optimizer(lr=lr)
    self.criterion = nn.MSELoss()

    # Meters for bookkeeping train/validation statistics
    self.meter = {CONST.TRAIN: get_meter(), CONST.VAL: get_meter()}

    if mode == CONST.TRAIN:
        freezing_layers(model)

    # Use the `resume` argument rather than re-reading opt (the original
    # checked `opt.resume`, leaving the parameter unused)
    if resume:
        fn = os.path.join(self.model_dir, 'model_best.pth.tar')
        self.load_checkpoint(fn)
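# `get_meter()` and its meters are assumed helpers, not defined in this
# module. A minimal sketch of what they might look like, consistent with the
# .val/.avg/.data/.update() usage throughout this file (an assumption, not
# the project's actual implementation):
class AverageMeter(object):
    """Tracks the latest value, running average, and history of a metric."""

    def __init__(self):
        self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0
        self.data = []  # per-update history, used for the epoch-level plots

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        self.data.append(val)


def get_meter(meters=('loss', 'acc')):
    # Return one meter per requested metric name
    return {name: AverageMeter() for name in meters}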
def train_and_evaluate(opt, logger=None, tb_logger=None):
    """Train and evaluate a model

    `train_and_evaluate()` breaks down into two parts. Part 1 builds the
    dataloaders, model, and trainer used for training/evaluation. Parts 2.A
    and 2.B run the training or evaluation, respectively. Given the mode,
    train_and_evaluate takes one of two actions:

    1) mode == TRAIN ---> action: train and validate
    2) mode == VAL   ---> action: evaluate the model on the full
                          validation/test set

    Args:
        opt (Config): A state dictionary holding preset parameters
        logger (Logger): Logging instance
        tb_logger (SummaryWriter): Tensorboard logging instance

    Returns:
        None
    """
    #TODO implement early stopping
    #TODO implement test code
    logger = logger if logger else logging.getLogger('train-and-evaluate')
    logger.setLevel(opt.logging_level)

    # Read in the dataset. Check the data loader path to make sure the
    # right dataset is being loaded.
    data_loader = {
        mode: get_dataloader(data_dir=opt.data_dir,
                             batch_size=opt.batch_size,
                             mode=mode)
        for mode in [CONST.TRAIN, CONST.VAL]
    }

    # Create the model
    model = HABClassifier(arch=opt.arch, pretrained=opt.pretrained,
                          num_classes=opt.class_num)

    # Initialize the Trainer, which sets up the loss and optimizer and
    # loads pretrained weights when resuming
    trainer = Trainer(model=model, model_dir=opt.model_dir, mode=opt.mode,
                      resume=opt.resume, lr=opt.lr,
                      class_count=data_loader[CONST.TRAIN].dataset.data[
                          CONST.LBL].value_counts())

    #==== BEGIN OPTION 1: TRAINING ====#
    # Train and validate the model if set to TRAIN mode. In this mode both
    # training and validation run inside the epoch loop. In validation mode
    # (the default when the mode is not TRAIN), a full evaluation is run
    # instead and produces more summarized results.
    if opt.mode == CONST.TRAIN:
        best_err = trainer.best_err
        Logger.section_break('Valid (Epoch {})'.format(trainer.start_epoch))
        err, acc, _, metrics_test = evaluate(trainer.model, trainer,
                                             data_loader[CONST.VAL], 0,
                                             opt.batch_size, logger,
                                             tb_logger, max_iters=None)
        metrics_best = metrics_test
        eps_meter = get_meter(
            meters=['train_loss', 'val_loss', 'train_acc', 'val_acc'])

        for ii, epoch in enumerate(
                range(trainer.start_epoch, trainer.start_epoch + opt.epochs)):
            # Train for one epoch
            Logger.section_break('Train (Epoch {})'.format(epoch))
            train_loss, train_acc = train(trainer.model, trainer,
                                          data_loader[CONST.TRAIN], epoch,
                                          logger, tb_logger, opt.batch_size,
                                          opt.print_freq)
            eps_meter['train_loss'].update(train_loss)
            eps_meter['train_acc'].update(train_acc)

            # Evaluate on the validation set
            Logger.section_break('Valid (Epoch {})'.format(epoch))
            err, acc, _, metrics_test = evaluate(trainer.model, trainer,
                                                 data_loader[CONST.VAL],
                                                 epoch, opt.batch_size,
                                                 logger, tb_logger,
                                                 max_iters=None)
            eps_meter['val_loss'].update(err)
            eps_meter['val_acc'].update(acc)

            # Remember the best error and save a checkpoint
            is_best = err < best_err
            best_err = min(err, best_err)
            state = trainer.generate_state_dict(epoch=epoch,
                                                best_err=best_err)

            if epoch % opt.save_freq == 0:
                trainer.save_checkpoint(
                    state, is_best=False,
                    filename='checkpoint-{}_{:0.4f}.pth.tar'.format(
                        epoch, acc))
            if is_best:
                metrics_best = metrics_test
                trainer.save_checkpoint(state, is_best=is_best,
                                        filename='model_best.pth.tar')
        # ==== END OPTION 1: TRAINING LOOP ====#

        # Generate evaluation plots
        opt.train_acc = max(eps_meter['train_acc'].data)
        opt.test_acc = max(eps_meter['val_acc'].data)
        # Plot loss over epochs
        vis_training(eps_meter['train_loss'].data,
                     eps_meter['val_loss'].data, loss=True)
        # Plot accuracy over epochs
        vis_training(eps_meter['train_acc'].data,
                     eps_meter['val_acc'].data, loss=False)
        # Plot the best confusion matrix
        plt.figure()
        metrics_best.compute_cm(plot=True)

    #==== BEGIN OPTION 2: EVALUATION ====#
    # Evaluate the model if set to VAL mode. A more comprehensive report of
    # the evaluation is written to eval.log.
    elif opt.mode == CONST.VAL:
        err, acc, run_time, metrics = evaluate(
            model=trainer.model, trainer=trainer,
            data_loader=data_loader[CONST.VAL], logger=logger,
            tb_logger=tb_logger)
        Logger.section_break('EVAL COMPLETED')
        model_parameters = filter(lambda p: p.requires_grad,
                                  trainer.model.parameters())
        params = sum([np.prod(p.size()) for p in model_parameters])
        metrics.print_eval(params, run_time, err, acc, metrics.results_dir)
        cm, mca = metrics.compute_cm(plot=True)
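# A hypothetical entry point showing how `train_and_evaluate()` might be
# invoked; the SummaryWriter import and log directory are illustrative,
# not taken from this project:
if __name__ == '__main__':
    from torch.utils.tensorboard import SummaryWriter
    tb_logger = SummaryWriter(os.path.join(opt.model_dir, 'tensorboard'))
    train_and_evaluate(opt, logger=logging.getLogger('main'),
                       tb_logger=tb_logger)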
def evaluate(model, trainer, data_loader, epoch=0, batch_size=opt.batch_size,
             logger=None, tb_logger=None, max_iters=None):
    """Evaluate the model

    Similar in structure to `train()`, including the bookkeeping features
    and wrapper items. The differences are that evaluation runs only up to
    `max_iters` if it is specified, and that an `EvalMetrics` instance is
    initialized, which is currently used to save predictions and ground
    truths for computing the confusion matrix.

    Args:
        model: Classification model
        trainer (Trainer): Training wrapper
        data_loader (torch.data.Dataloader): Generator data loading instance
        epoch (int): Current epoch
        batch_size (int): Batch size
        logger (Logger): Logger. Used to display/log metrics
        tb_logger (SummaryWriter): Tensorboard logger
        max_iters (int): Max iterations

    Returns:
        float: Loss average
        float: Accuracy average
        AverageMeter: Run time meter
        EvalMetrics: Evaluation wrapper to compute CMs
    """
    criterion = trainer.criterion

    # Initialize meter and metrics
    meter = get_meter(meters=['batch_time', 'loss', 'acc'])
    predictions, gtruth, ids = [], [], []
    classes = data_loader.dataset.classes
    metrics = EvalMetrics(classes, predictions, gtruth, ids,
                          trainer.model_dir)

    # Switch to evaluate mode
    model.eval()

    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            # Process batch items: images, labels
            img = to_cuda(batch[CONST.IMG], trainer.computing_device)
            target = to_cuda(batch[CONST.LBL], trainer.computing_device,
                             label=True)
            id = batch[CONST.ID]

            # Compute output
            end = time.time()
            logits = model(img)
            loss = criterion(logits, target)
            acc = accuracy(logits, target)
            batch_size = list(batch[CONST.LBL].shape)[0]

            # Update meters
            meter['acc'].update(acc, batch_size)
            meter['loss'].update(loss, batch_size)

            # Update evaluation metrics (predictions/ground truths)
            metrics.update(logits, target, id)

            # Measure elapsed time
            meter['batch_time'].update(time.time() - end, batch_size)

            if i % opt.print_freq == 0:
                log = 'EVAL [{:02d}][{:2d}/{:2d}] TIME {:10} ACC {:10} ' \
                      'LOSS {:10}'.format(
                          epoch, i, len(data_loader),
                          "{t.val:.3f} ({t.avg:.3f})".format(
                              t=meter['batch_time']),
                          "{t.val:.3f} ({t.avg:.3f})".format(t=meter['acc']),
                          "{t.val:.3f} ({t.avg:.3f})".format(t=meter['loss']))
                logger.info(log)
                if tb_logger is not None:
                    tb_logger.add_scalar('test/loss', meter['loss'].val,
                                         epoch)
                    tb_logger.add_scalar('test/accuracy', meter['acc'].val,
                                         epoch)

            if max_iters is not None and i >= max_iters:
                break

    # Print the final eval log
    log = 'EVAL [{:02d}][{:2d}/{:2d}] TIME {:10} ACC {:10} LOSS {:10}'.format(
        epoch, i, len(data_loader),
        "{t.val:.3f} ({t.avg:.3f})".format(t=meter['batch_time']),
        "{t.val:.3f} ({t.avg:.3f})".format(t=meter['acc']),
        "{t.val:.3f} ({t.avg:.3f})".format(t=meter['loss']))
    logger.info(log)
    if tb_logger is not None:
        tb_logger.add_scalar('test-epoch/loss', meter['loss'].avg, epoch)
        tb_logger.add_scalar('test-epoch/accuracy', meter['acc'].avg, epoch)

    return meter['loss'].avg, meter['acc'].avg, meter['batch_time'], metrics
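# `accuracy()` is an assumed helper. A sketch returning top-1 accuracy as a
# percentage, consistent with how its result is logged above (an assumption,
# not the project's actual implementation):
def accuracy(logits, target):
    with torch.no_grad():
        pred = logits.argmax(dim=1)            # predicted class per sample
        correct = pred.eq(target).sum().item()
        return 100.0 * correct / target.size(0)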
def train(model, trainer, train_loader, epoch, logger, tb_logger,
          batch_size=opt.batch_size, print_freq=opt.print_freq):
    """Train the model for one epoch

    Outside of the typical training loop, `train()` incorporates useful
    bookkeeping features and wrapper functions: tracking accuracy, loss,
    and batch time, and wrapping the optimizer and loss function via the
    `trainer`. See `trainer.py` or `utils/eval_utils.py` for extra detail.

    Args:
        model: Classification model
        trainer (Trainer): Training wrapper
        train_loader (torch.data.Dataloader): Generator data loading instance
        epoch (int): Current epoch
        logger (Logger): Logger. Used to display/log metrics
        tb_logger (SummaryWriter): Tensorboard logger
        batch_size (int): Batch size
        print_freq (int): Print frequency

    Returns:
        float: Loss average
        float: Accuracy average
    """
    criterion = trainer.criterion
    optimizer = trainer.optimizer

    # Initialize meter to bookkeep the following parameters
    meter = get_meter(meters=['batch_time', 'data_time', 'loss', 'acc'])

    # Switch to training mode
    model.train(True)

    end = time.time()
    for i, batch in enumerate(train_loader):
        # Process batch items: images, labels
        img = to_cuda(batch[CONST.IMG], trainer.computing_device)
        target = to_cuda(batch[CONST.LBL], trainer.computing_device,
                         label=True)
        id = batch[CONST.ID]

        # Measure data loading time
        meter['data_time'].update(time.time() - end)

        # Compute output
        end = time.time()
        logits = model(img)
        loss = criterion(logits, target)
        acc = accuracy(logits, target)

        # Update meters
        meter['acc'].update(acc, batch_size)
        meter['loss'].update(loss, batch_size)

        # Compute gradients and take an optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % print_freq == 0:
            log = 'TRAIN [{:02d}][{:2d}/{:2d}] TIME {:10} DATA {:10} ' \
                  'ACC {:10} LOSS {:10}'.format(
                      epoch, i, len(train_loader),
                      "{t.val:.3f} ({t.avg:.3f})".format(
                          t=meter['batch_time']),
                      "{t.val:.3f} ({t.avg:.3f})".format(
                          t=meter['data_time']),
                      "{t.val:.3f} ({t.avg:.3f})".format(t=meter['acc']),
                      "{t.val:.3f} ({t.avg:.3f})".format(t=meter['loss']))
            logger.info(log)

            tb_logger.add_scalar('train/loss', meter['loss'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar('train/accuracy', meter['acc'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar('data_time', meter['data_time'].val,
                                 epoch * len(train_loader) + i)
            tb_logger.add_scalar(
                'compute_time',
                meter['batch_time'].val - meter['data_time'].val,
                epoch * len(train_loader) + i)

        # Measure elapsed time
        meter['batch_time'].update(time.time() - end)
        end = time.time()

    tb_logger.add_scalar('train-epoch/loss', meter['loss'].avg, epoch)
    tb_logger.add_scalar('train-epoch/accuracy', meter['acc'].avg, epoch)

    return meter['loss'].avg, meter['acc'].avg
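# `to_cuda()` is an assumed helper. A sketch that moves a batch tensor to
# the computing device, casting labels to long as CrossEntropyLoss expects
# (an assumption, not the project's actual implementation):
def to_cuda(tensor, computing_device, label=False):
    tensor = tensor.to(computing_device)
    return tensor.long() if label else tensor.float()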
def train_and_evaluate(opt, logger=None):
    """Train and evaluate a model

    `train_and_evaluate()` breaks down into two parts. Part 1 builds the
    dataloaders, model, and trainer used for training/evaluation. Parts 2.A
    and 2.B run the training or evaluation, respectively. Given the mode,
    train_and_evaluate takes one of two actions:

    1) mode == TRAIN ---> action: train and validate
    2) mode == VAL ---> action: evaluate the model on the full
                        validation/test set

    Args:
        opt (Config): A state dictionary holding preset parameters
        logger (Logger): Logging instance

    Returns:
        None
    """
    #TODO implement early stopping
    #TODO implement test code
    logger = logger if logger else logging.getLogger('train-and-evaluate')
    logger.setLevel(opt.logging_level)

    # Read in the dataset. Check the data loader path to make sure the
    # right dataset is being loaded.
    data_loader = {mode: get_dataloader(data_dir=opt.data_dir,
                                        batch_size=opt.batch_size,
                                        mode=mode)
                   for mode in [CONST.TRAIN, CONST.VAL]}

    # Create the model
    model = MODEL(arch=opt.arch, pretrained=opt.pretrained, num_classes=2)

    # Initialize the Trainer, which sets up the loss and optimizer and
    # loads pretrained weights when resuming
    trainer = Trainer(model=model, model_dir=opt.model_dir, mode=opt.mode,
                      resume=opt.resume, lr=opt.lr)

    #==== TRAINING ====#
    # Train and validate the model if set to TRAIN mode. In this mode both
    # training and validation run inside the epoch loop. In validation mode
    # (the default when the mode is not TRAIN), a full evaluation is run
    # instead and produces more summarized results.
    if opt.mode == CONST.TRAIN:
        best_err = trainer.best_err
        Logger.section_break('Valid (Epoch {})'.format(trainer.start_epoch))
        err, acc, _ = evaluate(trainer.model, trainer,
                               data_loader[CONST.VAL], 0, opt.batch_size,
                               logger)
        eps_meter = get_meter(meters=['train_loss', 'val_loss',
                                      'train_acc', 'val_acc'])
        best_err = min(best_err, err)

        for ii, epoch in enumerate(range(trainer.start_epoch,
                                         trainer.start_epoch + opt.epochs)):
            # Train for one epoch
            Logger.section_break('Train (Epoch {})'.format(epoch))
            train_loss, train_acc = train(trainer.model, trainer,
                                          data_loader[CONST.TRAIN], epoch,
                                          logger, opt.batch_size,
                                          opt.print_freq)
            eps_meter['train_loss'].update(train_loss)
            eps_meter['train_acc'].update(train_acc)

            # Evaluate on the validation set
            Logger.section_break('Valid (Epoch {})'.format(epoch))
            err, acc, _ = evaluate(trainer.model, trainer,
                                   data_loader[CONST.VAL], epoch,
                                   opt.batch_size, logger)
            eps_meter['val_loss'].update(err)
            eps_meter['val_acc'].update(acc)

            # Remember the best error and save a checkpoint
            is_best = err < best_err
            best_err = min(err, best_err)
            state = trainer.generate_state_dict(epoch=epoch,
                                                best_err=best_err)
            if is_best:
                trainer.save_checkpoint(state, is_best=is_best,
                                        filename='model_best.pth.tar')
        # ==== END: TRAINING LOOP ====#

        if len(eps_meter['train_loss'].data) > 0:
            # Plot loss over epochs
            vis_training(eps_meter['train_loss'].data,
                         eps_meter['val_loss'].data, loss=True)
            # Plot accuracy over epochs
            vis_training(eps_meter['train_acc'].data,
                         eps_meter['val_acc'].data, loss=False)
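# `vis_training()` is an assumed plotting helper. A minimal matplotlib sketch
# of the train/val curves it appears to draw; the output filename is
# illustrative:
def vis_training(train_data, val_data, loss=True):
    metric = 'loss' if loss else 'accuracy'
    plt.figure()
    plt.plot(train_data, label='train')
    plt.plot(val_data, label='val')
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend()
    plt.savefig('{}_curve.png'.format(metric))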
def __init__(self, model, model_dir=opt.model_dir, mode=CONST.TRAIN,
             resume=opt.resume, lr=opt.lr, class_count=None):
    """Initialize Trainer

    Args:
        model: (MusicRNN) Model
        model_dir: (str) Path to the saved model directory
        mode: (str) Train or test mode
        resume: (str) Path to a pretrained model
        class_count: (pd.Series) Per-class sample counts, used to build
            a weighted loss
    """
    self.logger = logging.getLogger('trainer')
    self.logger.setLevel(opt.logging_level)

    self.computing_device = self._set_cuda()
    self.model = model.to(self.computing_device)
    self.logger.debug("Model on CUDA? {}".format(
        next(self.model.parameters()).is_cuda))

    self.model_dir = model_dir
    if not os.path.isdir(self.model_dir):
        os.makedirs(self.model_dir)

    self.start_epoch = 0
    self.best_err = np.inf
    self.optimizer = self._get_optimizer(lr=lr)

    # Defaults to CrossEntropyLoss, optionally weighted by class frequency
    #TODO set interactive mode for setting the losses
    if opt.mode == CONST.TRAIN:
        self.logger.debug(class_count)
        if opt.interactive:
            weighted_y_n = input('Do you want to use weighted loss? (y/n)\n')
        else:
            weighted_y_n = opt.weighted_loss

        # Accept either a truthy config flag or a 'y' answer. The original
        # check `weighted_y_n or weighted_y_n == 'y'` was also true for 'n',
        # since any non-empty string is truthy.
        if weighted_y_n in (True, 'y'):
            # Order the counts by class index before weighting
            weight = np.array([x for _, x in
                               sorted(zip(class_count.keys().tolist(),
                                          class_count.tolist()))])
            self.logger.info('Class_count is: ' + str(weight))
            weight = weight / sum(weight)
            self.logger.info('Classes Weights are: ' + str(weight))
            weight = np.flip(weight).tolist()
            self.logger.info('Weighted Loss will be: ' + str(weight))
            # Move to the computing device instead of assuming CUDA
            weight = torch.FloatTensor(weight).to(self.computing_device)
            self.criterion = nn.CrossEntropyLoss(weight=weight)
        else:
            self.criterion = nn.CrossEntropyLoss()
    else:
        self.criterion = nn.CrossEntropyLoss()

    # Meters for bookkeeping train/validation statistics
    self.meter = {CONST.TRAIN: get_meter(), CONST.VAL: get_meter()}

    if resume or mode == CONST.VAL:
        sql = model_sql()
        fn = os.path.join(opt.model_dir, 'model_best.pth.tar')
        sql.close()
        self.load_checkpoint(fn)

    if mode == CONST.TRAIN:
        freezing_layers(model)

    if mode == CONST.DEPLOY:
        fn = os.path.join(opt.model_dir, 'model_best.pth.tar')
        self.load_checkpoint(fn)
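# Worked example of the weighting scheme above, with hypothetical counts
# {class 0: 100 samples, class 1: 300 samples}:
counts = np.array([100, 300])        # counts sorted by class index
weights = counts / counts.sum()      # -> [0.25, 0.75]
weights = np.flip(weights).tolist()  # -> [0.75, 0.25]: minority class up-weighted
criterion = nn.CrossEntropyLoss(weight=torch.FloatTensor(weights))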