Example #1
import os

import torch

from dataloader import make_data_loader
from modeling.modeling import Modeling
from utils.loss import Loss
from utils.metrics import Evaluator
from utils.optimizer import Optimizer
from utils.saver import Saver
from utils.summaries import TensorboardSummary
# pycolor (console-color helper) is project-specific; its import path may differ.


class Trainer(object):
    def __init__(self, batch_size=32, optimizer_name="Adam", lr=1e-3, weight_decay=1e-5,
                 epochs=200, model_name="model01", gpu_ids=None, resume=None, tqdm=None):
        """
        args:
            batch_size = (int) Batch size for training and validation.
            optimizer_name = (string) Name of the optimizer (e.g. "Adam").
            lr = (float) Learning rate of the optimizer.
            weight_decay = (float) Weight decay of the optimizer.
            epochs = (int) Number of training epochs.
            model_name = (string) Name of the training model; also used as the folder name.
            gpu_ids = (list) List of GPU ids (e.g. gpu_ids=[0, 1]). If None, the CPU is used.
            resume = (dict) Resume settings (resume = {"checkpoint_path": PATH_of_checkpoint, "fine_tuning": True or False}).
                     If None, training starts from scratch.
            tqdm = (tqdm object) Progress-bar class or callable. Pass your tqdm here.
                   If None, no progress bar is shown.
        """
        # Set params
        self.batch_size = batch_size
        self.epochs = epochs
        self.start_epoch = 0
        self.use_cuda = (gpu_ids is not None) and torch.cuda.is_available()
        self.tqdm = tqdm
        self.use_tqdm = tqdm is not None
        # ------------------------- #
        # Define utilities. (No need to change.)
        """
        These are project modules; you usually do not need to change them.

        Saver: Saves model weights. / <utils.saver.Saver()>
        TensorboardSummary: Writes TensorBoard files. / <utils.summaries.TensorboardSummary()>
        Evaluator: Computes metrics (e.g. accuracy). / <utils.metrics.Evaluator()>
        """
        ## ***Define Saver***
        self.saver = Saver(model_name, lr, epochs)
        self.saver.save_experiment_config()
        
        ## ***Define Tensorboard Summary***
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        
        # ------------------------- #
        # Define training components. (You have to change these!)
        """
        These are the important settings for training; you have to change them.

        make_data_loader: Creates the <Dataloader>s. / <dataloader.__init__>
        Modeling: Defines your model. / <modeling.modeling.Modeling()>
        Evaluator: Defines the evaluator. / <utils.metrics.Evaluator()>
        Optimizer: Defines the optimizer. / <utils.optimizer.Optimizer()>
        Loss: Defines the loss function. / <utils.loss.Loss()>
        """
        ## ***Define Dataloader***
        self.train_loader, self.val_loader, self.test_loader, self.num_classes = make_data_loader(batch_size)
        
        ## ***Define Your Model***
        self.model = Modeling(self.num_classes)
        
        ## ***Define Evaluator***
        self.evaluator = Evaluator(self.num_classes)
        
        ## ***Define Optimizer***
        self.optimizer = Optimizer(self.model.parameters(), optimizer_name=optimizer_name, lr=lr, weight_decay=weight_decay)
        
        ## ***Define Loss***
        self.criterion = Loss()
        
        # ------------------------- #
        # Some settings
        """
        You do not have to touch the code below.

        Using cuda: Enables CUDA if requested.
        Resuming checkpoint: Resumes training from a checkpoint if requested.
        """
        ## ***Using cuda***
        if self.use_cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=gpu_ids).cuda()

        ## ***Resuming checkpoint***
        """You can ignore bellow code."""
        self.best_pred = 0.0
        if resume is not None:
            if not os.path.isfile(resume["checkpoint_path"]):
                raise RuntimeError("=> no checkpoint found at '{}'" .format(resume["checkpoint_path"]))
            checkpoint = torch.load(resume["checkpoint_path"])
            self.start_epoch = checkpoint['epoch']
            if self.use_cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if resume["fine_tuning"]:
                # When fine-tuning, restore the optimizer state but restart the epoch counter.
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.start_epoch = 0
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(resume["checkpoint_path"], checkpoint['epoch']))
            
    def _run_epoch(self, epoch, mode="train", leave_progress=True, use_optuna=False):
        """
        run training or validation 1 epoch.
        You don't have to change almost of this method.
        
        args:
            epoch = (int) How many epochs this time.
            mode = {"train" or "val"}
            leave_progress = {True or False} Can choose whether leave progress bar or not.
            use_optuna = {True or False} Can choose whether use optuna or not.
        
        Change point (if you need):
        - Evaluation: You can change metrics of monitoring.
        - writer.add_scalar: You can change metrics to be saved in tensorboard.
        """
        # ------------------------- #
        leave_progress = leave_progress and not use_optuna
        # Initializing
        epoch_loss = 0.0
        ## Set model mode & tqdm (progress bar; it wraps the dataloader)
        assert (mode=="train") or (mode=="val"), "argument 'mode' must be 'train' or 'val', not {}.".format(mode)
        if mode=="train":
            data_loader = self.tqdm(self.train_loader, leave=leave_progress) if self.use_tqdm else self.train_loader
            self.model.train()
            num_dataset = len(self.train_loader)
        elif mode=="val":
            data_loader = self.tqdm(self.val_loader, leave=leave_progress) if self.use_tqdm else self.val_loader
            self.model.eval()
            num_dataset = len(self.val_loader)
        ## Reset confusion matrix of evaluator
        self.evaluator.reset()
        
        # ------------------------- #
        # Run 1 epoch
        for i, sample in enumerate(data_loader):
            ## ***Get Input data***
            inputs, target = sample["input"], sample["label"]
            if self.use_cuda:
                inputs, target = inputs.cuda(), target.cuda()
                
            ## ***Calculate Loss <Train>***
            if mode=="train":
                self.optimizer.zero_grad()
                output = self.model(inputs)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()
            ## ***Calculate Loss <Validation>***
            elif mode=="val":
                with torch.no_grad():
                    output = self.model(inputs)
                loss = self.criterion(output, target)
            epoch_loss += loss.item()
            ## ***Report results***
            if self.use_tqdm:
                data_loader.set_description('{} loss: {:.3f}'.format(mode, epoch_loss / (i + 1)))
            ## ***Add batch results into evaluator***
            target = target.cpu().numpy()
            output = torch.argmax(output, dim=1).cpu().numpy()
            self.evaluator.add_batch(target, output)
            
        ## **********Evaluate Score**********
        """You can add new metrics! <utils.metrics.Evaluator()>"""
        Acc = self.evaluator.Accuracy()
        
        if not use_optuna:
            ## ***Save eval into Tensorboard***
            self.writer.add_scalar('{}/loss_epoch'.format(mode), epoch_loss / (i + 1), epoch)
            self.writer.add_scalar('{}/Acc'.format(mode), Acc, epoch)
            print('Total {} loss: {:.3f}'.format(mode, epoch_loss / num_dataset))
            print("{0} Acc:{1:.2f}".format(mode, Acc))
        
        # Return score to watch. (update checkpoint or optuna's objective)
        return Acc
    
    def run(self, leave_progress=True, use_optuna=False):
        """
        Run all epochs of training and validation.
        """
        epoch_iter = range(self.start_epoch, self.epochs)
        if self.use_tqdm:
            epoch_iter = self.tqdm(epoch_iter)
        for epoch in epoch_iter:
            print(pycolor.GREEN + "[Epoch: {}]".format(epoch) + pycolor.END)
            
            ## ***Train***
            print(pycolor.YELLOW+"Training:"+pycolor.END)
            self._run_epoch(epoch, mode="train", leave_progress=leave_progress, use_optuna=use_optuna)
            ## ***Validation***
            print(pycolor.YELLOW+"Validation:"+pycolor.END)
            score = self._run_epoch(epoch, mode="val", leave_progress=leave_progress, use_optuna=use_optuna)
            print("---------------------")
            if score > self.best_pred:
                print("model improve best score from {:.4f} to {:.4f}.".format(self.best_pred, score))
                self.best_pred = score
                self.saver.save_checkpoint({
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                })
        self.writer.close()
        return self.best_pred
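
A minimal usage sketch for this Trainer (hypothetical values; assumes the project modules above are importable and that make_data_loader yields dict batches with "input"/"label" keys):

from tqdm import tqdm

trainer = Trainer(
    batch_size=32,
    optimizer_name="Adam",
    lr=1e-3,
    weight_decay=1e-5,
    epochs=200,
    model_name="model01",
    gpu_ids=[0],   # or None to run on the CPU
    resume=None,   # or {"checkpoint_path": ..., "fine_tuning": False}
    tqdm=tqdm,     # pass the tqdm class itself; None disables the bar
)
best_acc = trainer.run(leave_progress=True, use_optuna=False)
print("Best validation accuracy: {:.4f}".format(best_acc))
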
Example #2
import os
import time

import numpy as np
import torch

# Project modules (import paths vary by repo): BaseContainer, Saver,
# TensorboardSummary, Evaluator, to_cuda, logger, set_logger_path.


class Trainer(BaseContainer):
    def __init__(self):
        super().__init__()
        now_time = time.strftime('%Y-%m-%d-%H-%M', time.localtime())
        logger_path = os.path.join(
            self.args.training.save_dir,
            self.args.dataset.dataset_train,
            self.args.models.model_warpper,
            self.args.training.experiment_id,
            '%s.log' % now_time
        )
        set_logger_path(logger_path)
        logger.info(self.args)

        # Define Saver
        self.saver = Saver(self.args)

        # Define Tensorboard Summary
        self.summary = TensorboardSummary()
        self.writer = self.summary.create_summary(self.saver.experiment_dir, self.args.models)


        self.init_training_container()
        self.batchsize = self.args.training.batchsize
        self.reset_batchsize()
        self.evaluator = Evaluator()
        self.best = 0.0

        # show parameters to be trained
        logger.debug('\nTraining params:')
        for p in self.model.named_parameters():
            if p[1].requires_grad:
                logger.debug(p[0])
        logger.debug('\n')

        # Log the training schedule
        logger.info('Starting iteration: %d' % self.start_it)
        logger.info('Total iterations: %d' % self.args.training.max_iter)

    # main function for training
    def training(self):
        self.model.train()

        num_img_tr = len(self.train_loader)
        logger.info('\nTraining')

        max_iter = self.args.training.max_iter
        it = self.start_it

        # support multiple optimizers, but only one 
        # optimizer is used here, i.e., names = ['match']
        names = self.args.training.optimizer.keys()

        while it < max_iter:
            for samples in self.train_loader:
                samples = to_cuda(samples)

                # validation
                val_iter = self.args.training.get('val_iter', -1)
                if val_iter > 0 and it % val_iter == 0 and it >= self.args.training.get('start_eval_it', 15000):
                    self.validation(it, 'val')
                    self.model.train()

                if it % 100 == 0:
                    logger.info('\n===> Iteration  %d/%d' % (it, max_iter))
    
                # update class weights
                weight_update_iter = self.args.training.get('weight_update_iter', -1)
                if it >= 500 and weight_update_iter > 0 and it % weight_update_iter == 0:
                    self.model.update_hard()
                    logger.info('\nUpdate hard ID: %.3f'%self.model.center.ratio)
                    self.writer.add_scalar('train/data_ratio', self.model.center.ratio, it)

                for name in names:
                    self.optimizer[name].zero_grad()
                    outputs = self.model(samples, type=name)
                    losses = self.criterion(outputs, name)
                    loss = losses['loss']
                    loss.backward()
                    self.optimizer[name].step()

                    # log training loss
                    if it % 100 == 0:
                        loss_log_str = '=>%s   loss: %.4f'%(name, loss.item())
                        for loss_name in losses.keys():
                            if loss_name != 'loss':
                                loss_log_str += '    %s: %.4f'%(loss_name, losses[loss_name])
                                self.writer.add_scalar('train/%s_iter'%loss_name, losses[loss_name], it)
                        logger.info(loss_log_str)
                        self.writer.add_scalar('train/total_loss_iter_%s'%name, loss.item(), it)

                    # adjust learning rate
                    lr_decay_iter = self.args.training.optimizer[name].get('lr_decay_iter', None)
                    if lr_decay_iter is not None:
                        for i in range(len(lr_decay_iter)):
                            if it == lr_decay_iter[i]:
                                lr = self.args.training.optimizer[name].lr * (self.args.training.optimizer[name].lr_decay ** (i+1))
                                logger.info('\nReduce lr to %.6f\n'%(lr))
                                for param_group in self.optimizer[name].param_groups:
                                    param_group["lr"] = lr 
                                break

                it += 1

                # save model and optimizer
                if it % self.args.training.save_iter == 0 or it == max_iter or it == 1:
                    logger.info('\nSaving checkpoint ......')
                    optimizer_to_save = dict()
                    for i in self.optimizer.keys():
                        optimizer_to_save[i] = self.optimizer[i].state_dict()
                    self.saver.save_checkpoint({
                        'start_it': it,
                        'stage': self.stage,
                        'state_dict': self.model.state_dict(),
                        'optimizer': optimizer_to_save,
                    }, filename='ckp_%06d.pth.tar'%it)
                    logger.info('Done.')

    # main function for validation
    def validation(self, it, split):
        logger.info('\nEvaluating %s...'%split)
        self.evaluator.reset()
        self.model.eval()

        data_loader = self.val_loader if split == 'val' else self.test_loader
        num_img_tr = len(data_loader)
        dist_pos = []
        dist_neg = []
        total_loss = []
        name = list(self.args.training.optimizer.keys())[0]
        for i, samples in enumerate(data_loader):
            samples = to_cuda(samples)

            with torch.no_grad():
                outputs = self.model(samples, type=name, is_triple=True)
                dist_pos.append(outputs[-1]['dist_pos'].mean().item())
                dist_neg.append(outputs[-1]['dist_neg'].mean().item())

            self.evaluator.add_batch(outputs[-1]['pred'], outputs[0]['target'])

        self.writer.add_scalar('%s/dist_pos'%split, np.array(dist_pos).mean(), it)
        self.writer.add_scalar('%s/dist_neg'%split, np.array(dist_neg).mean(), it)

        acc = self.evaluator.Accuracy()
        self.writer.add_scalar('%s/acc'%split, acc, it)
        if split == 'val':
            logger.info('=====> [Iteration: %d]    %s/acc=%.4f    previous best=%.4f' % (it, split, acc, self.best))
        else:
            logger.info('=====> [Iteration: %d]    %s/acc=%.4f' % (it, split, acc))

        # if split == 'val':
        #     self.validation(it, 'test')

        if split == 'val' and acc > self.best:
            self.best = acc
            logger.info('\nSaving checkpoint ......')
            optimizer_to_save = dict()
            for i in self.optimizer.keys():
                optimizer_to_save[i] = self.optimizer[i].state_dict()
            self.saver.save_checkpoint({
                'start_it': it,
                'stage': self.stage,
                'state_dict': self.model.state_dict(),
                'optimizer': optimizer_to_save,
            }, filename='best.pth.tar')
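
The learning-rate schedule in training() above is a plain step decay: when the current iteration hits the i-th milestone in lr_decay_iter, the base lr is multiplied by lr_decay ** (i + 1). A standalone sketch of the same rule (the function name is mine, not the repo's):

def step_decay_lr(base_lr, lr_decay, lr_decay_iter, it):
    """Return the lr to apply at iteration `it`, or None if unchanged."""
    for i, milestone in enumerate(lr_decay_iter):
        if it == milestone:
            return base_lr * (lr_decay ** (i + 1))
    return None

# e.g. base lr 0.1 decayed by 0.1 at 30k and 60k iterations:
assert step_decay_lr(0.1, 0.1, [30000, 60000], 30000) == 0.1 * 0.1
assert step_decay_lr(0.1, 0.1, [30000, 60000], 60000) == 0.1 * 0.1 ** 2
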
Example #3
import os

import torch
import torch.nn as nn
from tqdm import tqdm

# Project modules (import paths vary by repo): Saver, TensorboardSummary,
# make_data_loader, Model, patch_replication_callback.


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)

        # Define weight
        self.temporal_weight = args.temporal_weight
        self.spatial_weight = args.spatial_weight

        # Define network
        temporal_model = Model(name='vgg16_bn', num_classes=101,
                               is_flow=True).get_model()
        spatial_model = Model(name='vgg16_bn', num_classes=101,
                              is_flow=False).get_model()

        # Define Optimizer
        #optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
        temporal_optimizer = torch.optim.Adam(temporal_model.parameters(),
                                              lr=args.temporal_lr)
        spatial_optimizer = torch.optim.Adam(spatial_model.parameters(),
                                             lr=args.spatial_lr)

        # Define Criterion
        self.temporal_criterion = nn.BCELoss().cuda()
        self.spatial_criterion = nn.BCELoss().cuda()

        self.temporal_model, self.temporal_optimizer = temporal_model, temporal_optimizer
        self.spatial_model, self.spatial_optimizer = spatial_model, spatial_optimizer

        # Define Evaluator
        self.top1_eval = Evaluator(self.nclass)

        # Using cuda
        if args.cuda:
            self.temporal_model = torch.nn.DataParallel(
                self.temporal_model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.temporal_model)
            self.temporal_model = self.temporal_model.cuda()

            self.spatial_model = torch.nn.DataParallel(
                self.spatial_model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.spatial_model)
            self.spatial_model = self.spatial_model.cuda()

        # Resuming checkpoint
        self.best_accuracy = 0.0
        '''
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError(
                    "=> no checkpoint found at '{}'" .format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
                #self.model.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])

            #self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_accuracy = checkpoint['best_accuracy']
            print("=> loaded checkpoint '{}' (epoch {}), best prediction {}"
                  .format(args.resume, checkpoint['epoch'], self.best_accuracy))
        '''

    def training(self, epoch):
        train_loss = 0.0
        self.temporal_model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            rgbs, flows, targets = sample['rgb'], sample['flow'], sample['label']
            targets = targets.view(-1, 1).float()
            if self.args.cuda:
                rgbs, flows, targets = rgbs.cuda(), flows.cuda(), targets.cuda()

            self.temporal_optimizer.zero_grad()
            self.spatial_optimizer.zero_grad()

            temporal_output = self.temporal_model(flows)
            spatial_output = self.spatial_model(rgbs)

            temporal_loss = self.temporal_criterion(temporal_output, targets)
            spatial_loss = self.spatial_criterion(spatial_output, targets)

            temporal_loss.backward()
            spatial_loss.backward()

            self.temporal_optimizer.step()
            self.spatial_optimizer.step()

            train_loss += temporal_loss.item()
            train_loss += spatial_loss.item()

            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_temporal_loss_iter',
                                   temporal_loss.item(),
                                   i + num_img_tr * epoch)
            self.writer.add_scalar('train/total_spatial_loss_iter',
                                   spatial_loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            #if i % (num_img_tr // 10) == 0:
            #    global_step = i + num_img_tr * epoch
            #    self.summary.visualize_image(self.writer, images, targets.squeeze(1).cpu().numpy(), output.squeeze(1).data.cpu().numpy(), global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + rgbs.data.shape[0]))
        print('Loss: %.3f' % train_loss)

    def validation(self, epoch):
        self.temporal_model.eval()
        self.spatial_model.eval()

        self.top1_eval.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            rgbs, flows, targets = sample['rgb'], sample['flow'], sample['label']
            targets = targets.view(-1, 1).float()
            if self.args.cuda:
                rgbs, flows, targets = rgbs.cuda(), flows.cuda(), targets.cuda()
            with torch.no_grad():
                temporal_output = self.temporal_model(flows)
                spatial_output = self.spatial_model(rgbs)

            temporal_loss = self.temporal_criterion(temporal_output, targets)
            spatial_loss = self.spatial_criterion(spatial_output, targets)

            test_loss += temporal_loss.item()
            test_loss += spatial_loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))

            pred = (temporal_output.data.cpu().numpy() * self.temporal_weight +
                    spatial_output.data.cpu().numpy() * self.spatial_weight)
            targets = targets.cpu().numpy()
            # Add batch sample into evaluator
            self.top1_eval.add_batch(targets, pred)

        # Fast test during the training
        top1_acc = self.top1_eval.Accuracy()

        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/Acc', top1_acc, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + rgbs.data.shape[0]))
        print("Top1: acc:{}, best accuracy:{}".format(top1_acc,
                                                      self.best_accuracy))
        print("Sensitivity:{}, Specificity:{}".format(
            self.top1_eval.Sensitivity(), self.top1_eval.Specificity()))
        print("Confusion Maxtrix:\n{}".format(
            self.top1_eval.Confusion_Matrix()))
        print('Loss: %.3f' % test_loss)

        if top1_acc > self.best_accuracy:
            is_best = True
            self.best_accuracy = top1_acc
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'temporal_state_dict':
                    self.temporal_model.module.state_dict(),
                    'temporal_optimizer': self.temporal_optimizer.state_dict(),
                    'spatial_state_dict':
                    self.spatial_model.module.state_dict(),
                    'spatial_optimizer': self.spatial_optimizer.state_dict(),
                    'best_accuracy': self.best_accuracy,
                    'sensitivity': self.top1_eval.Sensitivity(),
                    'specificity': self.top1_eval.Specificity(),
                }, is_best)
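
A minimal driver for this two-stream trainer (hypothetical args; the field names are the ones the class reads, but Saver and make_data_loader may require more):

import argparse

args = argparse.Namespace(
    workers=4, batch_size=16, cuda=torch.cuda.is_available(), gpu_ids=[0],
    temporal_lr=1e-4, spatial_lr=1e-4, temporal_weight=0.5, spatial_weight=0.5,
    resume=None,
)
trainer = Trainer(args)
for epoch in range(50):
    trainer.training(epoch)
    trainer.validation(epoch)
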
Example #4
from collections import OrderedDict

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

# Project modules (import paths vary by repo): conf, pycolor,
# make_data_loader, Modeling, Evaluator.


class Predictor(object):
    def __init__(self, PATH):
        # Define Dataloader
        # word_vector = gensim.models.KeyedVectors.load_word2vec_format(conf.word_vector_dir+'model.vec', binary=False)
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            16)

        print(pycolor.CYAN + "  Define Model." + pycolor.END)
        # Define network (****Change****)
        model = Modeling(embedding_dim=conf.embedding_dim,
                         c_out=conf.num_class,
                         c_hidden=conf.hidden_channel,
                         hidden_layer=conf.hidden_layer)

        model_state = torch.load(PATH)
        state_dict = model_state["state_dict"]
        print("epoch: {}".format(model_state["epoch"]))

        # Build a new OrderedDict without the `module.` prefix that
        # DataParallel adds to every key when the checkpoint is saved.
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        # load params
        model.load_state_dict(new_state_dict)

        # Define Criterion
        self.criterion = nn.CrossEntropyLoss(reduction="none")

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Using cuda (this snippet always runs on the GPU)
        #model = torch.nn.DataParallel(model, device_ids=[0])
        model = model.cuda()

        self.model = model
        self.predicts = []
        self.answers = []

    def predict(self):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.test_loader, desc='\r')
        print()
        print(pycolor.YELLOW + "Test:" + pycolor.END)
        test_loss = 0.0
        self.predicts = []
        self.answers = []
        for i, sample in enumerate(tbar):
            question, target = sample['question'], sample['label']
            question, target = question.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(question)
            loss = self.criterion(output, target)
            test_loss += loss.sum().item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            # Compute Metrics
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

            # Record ranked predictions (class ids sorted by descending score) and answers
            self.predicts += list(np.argsort(pred)[:, ::-1])
            self.answers += list(target)

        # Final metrics over the test set
        self.Acc = self.evaluator.Accuracy()
        self.Top3Acc = self.evaluator.TopN_Accuracy()
        self.MRR = self.evaluator.MRR()
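
For reference, a sketch of turning the collected rankings into a mean reciprocal rank, plus a minimal driver (the checkpoint path is hypothetical; the project's Evaluator.MRR() may compute this differently):

def mean_reciprocal_rank(predicts, answers):
    """predicts: list of arrays of class ids sorted by descending score;
    answers: list of true class ids."""
    total = 0.0
    for ranking, truth in zip(predicts, answers):
        rank = int(np.where(ranking == truth)[0][0]) + 1  # 1-based rank of the truth
        total += 1.0 / rank
    return total / len(answers)

predictor = Predictor("run/model01/checkpoint.pth.tar")  # hypothetical path
predictor.predict()
print("Acc: {:.4f}  Top3Acc: {:.4f}  MRR: {:.4f}".format(
    predictor.Acc, predictor.Top3Acc, predictor.MRR))
print("MRR (recomputed): {:.4f}".format(
    mean_reciprocal_rank(predictor.predicts, predictor.answers)))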