import os

import torch

# Project modules, per the paths given in the docstring below.
from dataloader import make_data_loader
from modeling.modeling import Modeling
from utils.saver import Saver
from utils.summaries import TensorboardSummary
from utils.metrics import Evaluator
from utils.optimizer import Optimizer
from utils.loss import Loss
# NOTE: pycolor (ANSI color constants) is assumed to be provided by the project.


class Trainer(object):
    def __init__(self, batch_size=32, optimizer_name="Adam", lr=1e-3, weight_decay=1e-5,
                 epochs=200, model_name="model01", gpu_ids=None, resume=None, tqdm=None):
        """
        args:
            batch_size = (int) batch size for training and validation
            lr = (float) learning rate of the optimizer
            weight_decay = (float) weight decay of the optimizer
            epochs = (int) number of training epochs
            model_name = (string) name of the training model; also used as the folder name
            gpu_ids = (list) list of GPU ids (e.g. gpu_ids=[0, 1]). Uses the CPU if None.
            resume = (dict) resume settings
                     (resume = {"checkpoint_path": PATH_of_checkpoint, "fine_tuning": True or False}).
                     Trains from scratch if None.
            tqdm = (tqdm object) progress-bar object. Pass your tqdm; no progress bar is shown if None.
        """
        # Set params
        self.batch_size = batch_size
        self.epochs = epochs
        self.start_epoch = 0
        # torch.cuda.is_available must be *called*; without the parentheses the
        # expression is always truthy and CPU-only machines would try to use cuda.
        self.use_cuda = (gpu_ids is not None) and torch.cuda.is_available()
        self.tqdm = tqdm
        self.use_tqdm = tqdm is not None

        # ------------------------- #
        # Define utils. (No need to change.)
        """
        These are project modules. You should not have to change them.
            Saver: Saves model weights. / <utils.saver.Saver()>
            TensorboardSummary: Writes the tensorboard file. / <utils.summaries.TensorboardSummary()>
            Evaluator: Calculates metrics (e.g. accuracy). / <utils.metrics.Evaluator()>
        """
        ## ***Define Saver***
        self.saver = Saver(model_name, lr, epochs)
        self.saver.save_experiment_config()

        ## ***Define Tensorboard Summary***
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # ------------------------- #
        # Define training components. (You have to change these!)
        """
        These are the important settings for training. You have to change them.
            make_data_loader: Creates the <DataLoader>s. / <dataloader.__init__>
            Modeling: Define your model here. / <modeling.modeling.Modeling()>
            Evaluator: Define your evaluator here. / <utils.metrics.Evaluator()>
            Optimizer: Define your optimizer here. / <utils.optimizer.Optimizer()>
            Loss: Define your loss function here. / <utils.loss.Loss()>
        """
        ## ***Define Dataloader***
        self.train_loader, self.val_loader, self.test_loader, self.num_classes = make_data_loader(batch_size)

        ## ***Define Your Model***
        self.model = Modeling(self.num_classes)

        ## ***Define Evaluator***
        self.evaluator = Evaluator(self.num_classes)

        ## ***Define Optimizer***
        self.optimizer = Optimizer(self.model.parameters(), optimizer_name=optimizer_name,
                                   lr=lr, weight_decay=weight_decay)

        ## ***Define Loss***
        self.criterion = Loss()

        # ------------------------- #
        # Some settings
        """
        You don't have to touch the code below.
            Using cuda: Enables cuda if you want it.
            Resuming checkpoint: Lets you resume training if you want to.
        """
        ## ***Using cuda***
        if self.use_cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=gpu_ids).cuda()

        ## ***Resuming checkpoint***
        """You can ignore the code below."""
        self.best_pred = 0.0
        if resume is not None:
            if not os.path.isfile(resume["checkpoint_path"]):
                raise RuntimeError("=> no checkpoint found at '{}'".format(resume["checkpoint_path"]))
            checkpoint = torch.load(resume["checkpoint_path"])
            self.start_epoch = checkpoint['epoch']
            if self.use_cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if resume["fine_tuning"]:
                # Resume the optimizer state and restart epoch counting when fine-tuning.
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.start_epoch = 0
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(resume["checkpoint_path"], checkpoint['epoch']))

    def _run_epoch(self, epoch, mode="train", leave_progress=True, use_optuna=False):
        """
        Run one epoch of training or validation. You should rarely need to change this method.
        args:
            epoch = (int) current epoch index
            mode = {"train" or "val"}
            leave_progress = {True or False} whether to leave the progress bar after the epoch
            use_optuna = {True or False} whether optuna is used
        Change points (if you need them):
            - Evaluation: you can change the monitored metrics.
            - writer.add_scalar: you can change the metrics saved to tensorboard.
        """
        # ------------------------- #
        leave_progress = leave_progress and not use_optuna
        # Initializing
        epoch_loss = 0.0
        ## Set model mode & tqdm (the progress bar wraps the dataloader)
        assert mode in ("train", "val"), "argument 'mode' must be 'train' or 'val', not {}.".format(mode)
        if mode == "train":
            data_loader = self.tqdm(self.train_loader, leave=leave_progress) if self.use_tqdm else self.train_loader
            self.model.train()
            num_dataset = len(self.train_loader)
        elif mode == "val":
            data_loader = self.tqdm(self.val_loader, leave=leave_progress) if self.use_tqdm else self.val_loader
            self.model.eval()
            num_dataset = len(self.val_loader)
        ## Reset the evaluator's confusion matrix
        self.evaluator.reset()

        # ------------------------- #
        # Run 1 epoch
        for i, sample in enumerate(data_loader):
            ## ***Get input data***
            inputs, target = sample["input"], sample["label"]
            if self.use_cuda:
                inputs, target = inputs.cuda(), target.cuda()

            ## ***Calculate loss <train>***
            if mode == "train":
                self.optimizer.zero_grad()
                output = self.model(inputs)
                loss = self.criterion(output, target)
                loss.backward()
                self.optimizer.step()
            ## ***Calculate loss <validation>***
            elif mode == "val":
                with torch.no_grad():
                    output = self.model(inputs)
                    loss = self.criterion(output, target)
            epoch_loss += loss.item()

            ## ***Report results***
            if self.use_tqdm:
                data_loader.set_description('{} loss: {:.3f}'.format(mode, epoch_loss / (i + 1)))
            ## ***Add batch results to the evaluator***
            target = target.cpu().numpy()
            output = torch.argmax(output, dim=1).data.cpu().numpy()
            self.evaluator.add_batch(target, output)

        ## **********Evaluate score**********
        """You can add new metrics! <utils.metrics.Evaluator()>"""
        Acc = self.evaluator.Accuracy()

        if not use_optuna:
            ## ***Save eval to Tensorboard***
            self.writer.add_scalar('{}/loss_epoch'.format(mode), epoch_loss / (i + 1), epoch)
            self.writer.add_scalar('{}/Acc'.format(mode), Acc, epoch)

        print('Total {} loss: {:.3f}'.format(mode, epoch_loss / num_dataset))
        print("{0} Acc: {1:.2f}".format(mode, Acc))
        # Return the monitored score (used to update the checkpoint, or as optuna's objective).
        return Acc

    def run(self, leave_progress=True, use_optuna=False):
        """
        Run all epochs of training and validation.
        """
        # Honor the injected tqdm object instead of assuming a module-level import;
        # the constructor promises that tqdm=None disables the progress bar.
        epoch_iter = self.tqdm(range(self.start_epoch, self.epochs)) if self.use_tqdm \
            else range(self.start_epoch, self.epochs)
        for epoch in epoch_iter:
            print(pycolor.GREEN + "[Epoch: {}]".format(epoch) + pycolor.END)

            ## ***Train***
            print(pycolor.YELLOW + "Training:" + pycolor.END)
            self._run_epoch(epoch, mode="train", leave_progress=leave_progress, use_optuna=use_optuna)
            ## ***Validation***
            print(pycolor.YELLOW + "Validation:" + pycolor.END)
            score = self._run_epoch(epoch, mode="val", leave_progress=leave_progress, use_optuna=use_optuna)
            print("---------------------")
            if score > self.best_pred:
                print("model improved best score from {:.4f} to {:.4f}.".format(self.best_pred, score))
                self.best_pred = score
                self.saver.save_checkpoint({
                    'epoch': epoch + 1,
                    # Unwrap DataParallel so the saved keys match what resume loads.
                    'state_dict': self.model.module.state_dict() if self.use_cuda else self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                })
        self.writer.close()
        return self.best_pred
# Assumed project imports (their paths are not shown in the original source):
# BaseContainer, Saver, TensorboardSummary, Evaluator, logger, set_logger_path,
# and to_cuda, plus the standard os / time / numpy / torch modules.


class Trainer(BaseContainer):
    def __init__(self):
        super().__init__()
        now_time = time.strftime('%Y-%m-%d-%H-%M', time.localtime(time.time()))
        logger_path = os.path.join(
            self.args.training.save_dir,
            self.args.dataset.dataset_train,
            self.args.models.model_warpper,
            self.args.training.experiment_id,
            '%s.log' % now_time)
        set_logger_path(logger_path)
        logger.info(self.args)

        # Define Saver
        self.saver = Saver(self.args)

        # Define Tensorboard Summary
        self.summary = TensorboardSummary()
        self.writer = self.summary.create_summary(self.saver.experiment_dir, self.args.models)

        self.init_training_container()
        self.batchsize = self.args.training.batchsize
        self.reset_batchsize()

        self.evaluator = Evaluator()
        self.best = 0.0

        # Show the parameters to be trained.
        logger.debug('\nTraining params:')
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.debug(name)
        logger.debug('\n')

        # Clear start epoch if fine-tuning
        logger.info('Starting iteration: %d' % self.start_it)
        logger.info('Total iterations: %d' % self.args.training.max_iter)

    # Main function for training.
    def training(self):
        self.model.train()
        logger.info('\nTraining')
        max_iter = self.args.training.max_iter
        it = self.start_it
        # Multiple optimizers are supported, but only one is used here,
        # i.e. names = ['match'].
        names = self.args.training.optimizer.keys()
        while it < max_iter:
            for samples in self.train_loader:
                samples = to_cuda(samples)

                # validation
                val_iter = self.args.training.get('val_iter', -1)
                if val_iter > 0 and it % val_iter == 0 and it >= self.args.training.get('start_eval_it', 15000):
                    self.validation(it, 'val')
                    self.model.train()

                if it % 100 == 0:
                    logger.info('\n===> Iteration %d/%d' % (it, max_iter))

                # update class weights
                weight_update_iter = self.args.training.get('weight_update_iter', -1)
                if it >= 500 and weight_update_iter > 0 and it % weight_update_iter == 0:
                    self.model.update_hard()
                    logger.info('\nUpdate hard ID: %.3f' % self.model.center.ratio)
                    self.writer.add_scalar('train/data_ratio', self.model.center.ratio, it)

                for name in names:
                    self.optimizer[name].zero_grad()
                    outputs = self.model(samples, type=name)
                    losses = self.criterion(outputs, name)
                    loss = losses['loss']
                    loss.backward()
                    self.optimizer[name].step()

                    # log training loss
                    if it % 100 == 0:
                        loss_log_str = '=>%s loss: %.4f' % (name, loss.item())
                        for loss_name in losses.keys():
                            if loss_name != 'loss':
                                loss_log_str += ' %s: %.4f' % (loss_name, losses[loss_name])
                                self.writer.add_scalar('train/%s_iter' % loss_name, losses[loss_name], it)
                        logger.info(loss_log_str)
                        self.writer.add_scalar('train/total_loss_iter_%s' % name, loss.item(), it)

                    # adjust learning rate
                    lr_decay_iter = self.args.training.optimizer[name].get('lr_decay_iter', None)
                    if lr_decay_iter is not None:
                        for i in range(len(lr_decay_iter)):
                            if it == lr_decay_iter[i]:
                                lr = self.args.training.optimizer[name].lr * \
                                    (self.args.training.optimizer[name].lr_decay ** (i + 1))
                                logger.info('\nReduce lr to %.6f\n' % lr)
                                for param_group in self.optimizer[name].param_groups:
                                    param_group["lr"] = lr
                                break

                it += 1

                # save model and optimizer
                if it % self.args.training.save_iter == 0 or it == max_iter or it == 1:
                    logger.info('\nSaving checkpoint ......')
                    optimizer_to_save = dict()
                    for key in self.optimizer.keys():
                        optimizer_to_save[key] = self.optimizer[key].state_dict()
                    self.saver.save_checkpoint({
                        'start_it': it,
                        'stage': self.stage,
                        'state_dict': self.model.state_dict(),
                        'optimizer': optimizer_to_save,
                    }, filename='ckp_%06d.pth.tar' % it)
                    logger.info('Done.')

                if it >= max_iter:
                    # Stop mid-pass; otherwise the while condition is only
                    # checked after a full sweep of the dataloader.
                    break

    # Main function for validation.
    def validation(self, it, split):
        logger.info('\nEvaluating %s...' % split)
        self.evaluator.reset()
        self.model.eval()
        data_loader = self.val_loader if split == 'val' else self.test_loader
        dist_pos = []
        dist_neg = []
        name = list(self.args.training.optimizer.keys())[0]
        for i, samples in enumerate(data_loader):
            samples = to_cuda(samples)
            with torch.no_grad():
                outputs = self.model(samples, type=name, is_triple=True)
            dist_pos.append(outputs[-1]['dist_pos'].mean().item())
            dist_neg.append(outputs[-1]['dist_neg'].mean().item())
            self.evaluator.add_batch(outputs[-1]['pred'], outputs[0]['target'])

        self.writer.add_scalar('%s/dist_pos' % split, np.array(dist_pos).mean(), it)
        self.writer.add_scalar('%s/dist_neg' % split, np.array(dist_neg).mean(), it)
        acc = self.evaluator.Accuracy()
        self.writer.add_scalar('%s/acc' % split, acc, it)
        if split == 'val':
            logger.info('=====>[Iteration: %d %s/acc=%.4f previous best=%.4f' % (it, split, acc, self.best))
        else:
            logger.info('=====>[Iteration: %d %s/acc=%.4f' % (it, split, acc))

        # if split == 'val':
        #     self.validation(it, 'test')

        if split == 'val' and acc > self.best:
            self.best = acc
            logger.info('\nSaving checkpoint ......')
            optimizer_to_save = dict()
            for key in self.optimizer.keys():
                optimizer_to_save[key] = self.optimizer[key].state_dict()
            self.saver.save_checkpoint({
                'start_it': it,
                'stage': self.stage,
                'state_dict': self.model.state_dict(),
                'optimizer': optimizer_to_save,
            }, filename='best.pth.tar')
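# --- Illustrative sketch: the step lr-decay rule used above ---
# At the i-th milestone in lr_decay_iter, training() sets the learning rate to
# base_lr * lr_decay ** (i + 1). A self-contained version of that rule, with
# hypothetical numbers, to make the schedule explicit:
def lr_at_iteration(it, base_lr, lr_decay, lr_decay_iter):
    """Return the learning rate in effect at iteration `it` under the step schedule."""
    lr = base_lr
    for i, milestone in enumerate(lr_decay_iter):
        if it >= milestone:
            lr = base_lr * lr_decay ** (i + 1)
    return lr

# e.g. with base_lr=1e-3, lr_decay=0.1, and milestones (30000, 45000):
# lr_at_iteration(10000, 1e-3, 0.1, (30000, 45000)) -> 1e-3
# lr_at_iteration(40000, 1e-3, 0.1, (30000, 45000)) -> 1e-4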
# Assumed imports: torch, torch.nn as nn, numpy as np, tqdm, plus the project
# modules Saver, TensorboardSummary, make_data_loader, Model, Evaluator, and
# patch_replication_callback (synchronized-batchnorm helper).


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define the fusion weights for the two streams
        self.temporal_weight = args.temporal_weight
        self.spatial_weight = args.spatial_weight

        # Define network
        temporal_model = Model(name='vgg16_bn', num_classes=101, is_flow=True).get_model()
        spatial_model = Model(name='vgg16_bn', num_classes=101, is_flow=False).get_model()

        # Define Optimizer
        # An SGD alternative:
        # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
        #                             momentum=args.momentum, weight_decay=args.weight_decay)
        temporal_optimizer = torch.optim.Adam(temporal_model.parameters(), lr=args.temporal_lr)
        spatial_optimizer = torch.optim.Adam(spatial_model.parameters(), lr=args.spatial_lr)

        # Define Criterion
        self.temporal_criterion = nn.BCELoss().cuda()
        self.spatial_criterion = nn.BCELoss().cuda()
        self.temporal_model, self.temporal_optimizer = temporal_model, temporal_optimizer
        self.spatial_model, self.spatial_optimizer = spatial_model, spatial_optimizer

        # Define Evaluator
        self.top1_eval = Evaluator(self.nclass)

        # Using cuda
        if args.cuda:
            self.temporal_model = torch.nn.DataParallel(self.temporal_model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.temporal_model)
            self.temporal_model = self.temporal_model.cuda()
            self.spatial_model = torch.nn.DataParallel(self.spatial_model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.spatial_model)
            self.spatial_model = self.spatial_model.cuda()

        # Resuming checkpoint
        self.best_accuracy = 0.0
        # NOTE: the resume logic below was disabled by the author; it still refers
        # to a single self.model rather than the two streams defined above.
        '''
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            # self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_accuracy = checkpoint['best_accuracy']
            print("=> loaded checkpoint '{}' (epoch {}), best prediction {}"
                  .format(args.resume, checkpoint['epoch'], self.best_accuracy))
        '''

    def training(self, epoch):
        train_loss = 0.0
        self.temporal_model.train()
        self.spatial_model.train()  # the spatial stream was previously left in eval mode
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            rgbs, flows, targets = sample['rgb'], sample['flow'], sample['label']
            targets = targets.view(-1, 1).float()
            if self.args.cuda:
                rgbs, flows, targets = rgbs.cuda(), flows.cuda(), targets.cuda()
            self.temporal_optimizer.zero_grad()
            self.spatial_optimizer.zero_grad()
            temporal_output = self.temporal_model(flows)
            spatial_output = self.spatial_model(rgbs)
            temporal_loss = self.temporal_criterion(temporal_output, targets)
            spatial_loss = self.spatial_criterion(spatial_output, targets)
            temporal_loss.backward()
            spatial_loss.backward()
            self.temporal_optimizer.step()
            self.spatial_optimizer.step()
            train_loss += temporal_loss.item()
            train_loss += spatial_loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_temporal_loss_iter', temporal_loss.item(), i + num_img_tr * epoch)
            self.writer.add_scalar('train/total_spatial_loss_iter', spatial_loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            # if i % (num_img_tr // 10) == 0:
            #     global_step = i + num_img_tr * epoch
            #     self.summary.visualize_image(self.writer, images, targets.squeeze(1).cpu().numpy(),
            #                                  output.squeeze(1).data.cpu().numpy(), global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + rgbs.data.shape[0]))
        print('Loss: %.3f' % train_loss)

    def validation(self, epoch):
        self.temporal_model.eval()
        self.spatial_model.eval()
        self.top1_eval.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            rgbs, flows, targets = sample['rgb'], sample['flow'], sample['label']
            targets = targets.view(-1, 1).float()
            if self.args.cuda:
                rgbs, flows, targets = rgbs.cuda(), flows.cuda(), targets.cuda()
            with torch.no_grad():
                temporal_output = self.temporal_model(flows)
                spatial_output = self.spatial_model(rgbs)
            temporal_loss = self.temporal_criterion(temporal_output, targets)
            spatial_loss = self.spatial_criterion(spatial_output, targets)
            test_loss += temporal_loss.item()
            test_loss += spatial_loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            # Late fusion: weighted average of the two streams' predictions.
            pred = (temporal_output.data.cpu().numpy() * self.temporal_weight
                    + spatial_output.data.cpu().numpy() * self.spatial_weight)
            targets = targets.cpu().numpy()
            # Add batch sample into evaluator
            self.top1_eval.add_batch(targets, pred)

        # Fast test during training
        top1_acc = self.top1_eval.Accuracy()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/Acc', top1_acc, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + rgbs.data.shape[0]))
        print("Top1: acc: {}, best accuracy: {}".format(top1_acc, self.best_accuracy))
        print("Sensitivity: {}, Specificity: {}".format(self.top1_eval.Sensitivity(), self.top1_eval.Specificity()))
        print("Confusion Matrix:\n{}".format(self.top1_eval.Confusion_Matrix()))
        print('Loss: %.3f' % test_loss)

        if top1_acc > self.best_accuracy:
            is_best = True
            self.best_accuracy = top1_acc
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'temporal_state_dict': self.temporal_model.module.state_dict(),
                'temporal_optimizer': self.temporal_optimizer.state_dict(),
                'spatial_state_dict': self.spatial_model.module.state_dict(),
                'spatial_optimizer': self.spatial_optimizer.state_dict(),
                'best_accuracy': self.best_accuracy,
                'sensitivity': self.top1_eval.Sensitivity(),
                'specificity': self.top1_eval.Specificity(),
            }, is_best)
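# --- Illustrative sketch: weighted late fusion of the two streams ---
# validation() above fuses the per-stream scores as
#     pred = temporal_weight * temporal_output + spatial_weight * spatial_output
# A minimal, self-contained NumPy version of that step, with hypothetical weights:
import numpy as np

def fuse_two_stream(temporal_scores, spatial_scores, temporal_weight=0.5, spatial_weight=0.5):
    """Weighted average of per-class scores from the flow (temporal) and RGB (spatial) streams."""
    return temporal_scores * temporal_weight + spatial_scores * spatial_weight

# e.g. fuse_two_stream(np.array([[0.8]]), np.array([[0.4]])) -> array([[0.6]])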
# Assumed imports: torch, torch.nn as nn, numpy as np, tqdm, plus the project
# modules make_data_loader, Modeling, Evaluator, conf, and pycolor.


class Predictor(object):
    def __init__(self, PATH):
        # Define Dataloader
        # word_vector = gensim.models.KeyedVectors.load_word2vec_format(
        #     conf.word_vector_dir + 'model.vec', binary=False)
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(16)
        print(pycolor.CYAN + " Define Model." + pycolor.END)

        # Define network (****Change****)
        model = Modeling(embedding_dim=conf.embedding_dim,
                         c_out=conf.num_class,
                         c_hidden=conf.hidden_channel,
                         hidden_layer=conf.hidden_layer)
        model_state = torch.load(PATH)
        state_dict = model_state["state_dict"]
        print("epoch: {}".format(model_state["epoch"]))

        # Create a new OrderedDict without the `module.` prefix that
        # DataParallel adds to every key at save time.
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        # load params
        model.load_state_dict(new_state_dict)

        # Define Criterion
        self.criterion = nn.CrossEntropyLoss(reduction="none")
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Using cuda
        # model = torch.nn.DataParallel(model, device_ids=[0])
        model = model.cuda()
        self.model = model
        self.predicts = []
        self.answers = []

    def predict(self):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.test_loader, desc='\r')
        print()
        print(pycolor.YELLOW + "Test:" + pycolor.END)
        test_loss = 0.0
        self.predicts = []
        self.answers = []
        for i, sample in enumerate(tbar):
            question, target = sample['question'], sample['label']
            question, target = question.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(question)
                loss = self.criterion(output, target)
            test_loss += loss.sum().item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))

            # Compute metrics
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
            # Store ranked predictions: class indices sorted by descending score.
            self.predicts += list(np.argsort(pred)[:, ::-1])
            self.answers += list(target)

        # Fast test during training
        self.Acc = self.evaluator.Accuracy()
        self.Top3Acc = self.evaluator.TopN_Accuracy()
        self.MRR = self.evaluator.MRR()
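# --- Usage sketch (illustrative; not part of the original file) ---
# Running the Predictor above on a saved checkpoint. The checkpoint path is
# hypothetical, and the project modules noted above are assumed importable.
if __name__ == "__main__":
    predictor = Predictor("run/model01/checkpoint.pth.tar")  # hypothetical path
    predictor.predict()
    print("Acc: {:.4f}, Top3Acc: {:.4f}, MRR: {:.4f}".format(
        predictor.Acc, predictor.Top3Acc, predictor.MRR))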