class Trainer(OptunaMlFlow):
    """Fine-tune an EfficientNet on CIFAR-10 under an Optuna search with MLflow logging.

    Each Optuna trial calls :meth:`trial_process`, which builds a fresh model,
    optimizer and data loaders from the suggested hyper-parameters, trains for
    ``args.epochs`` epochs and returns ``1.0 - best validation accuracy`` as
    the value to minimise.

    NOTE(review): ``start_run``, ``log_trial`` and ``end_run`` are not defined
    in this class; presumably they are inherited from ``OptunaMlFlow`` --
    confirm against the base class.
    """

    def __init__(self, args, search_space):
        """Store the run configuration and set up seeding, AMP scaler and evaluator.

        Args:
            args: Namespace-like configuration object. This class reads, among
                others, ``seed``, ``nclass``, ``cuda``, ``batch_size``,
                ``workers``, ``backbone``, ``epochs`` and the search ranges.
            search_space: Forwarded unchanged to the base-class constructor.
        """
        super().__init__(args, search_space)
        self.args = args
        self.best_pred = 0.0

        # Seed Python's `random` module for reproducibility.
        random.seed(self.args.seed)
        # Seed NumPy for reproducibility.
        np.random.seed(self.args.seed)
        # Seed PyTorch and force deterministic cuDNN behaviour.
        torch.manual_seed(self.args.seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        self.scaler = GradScaler()
        self.evaluator = Evaluator(self.args.nclass)

    def save_checkpoint(self, state, filename='checkpoint.pth.tar'):
        """Save *state* to *filename* and log that file as an MLflow artifact."""
        torch.save(state, filename)
        mlflow.log_artifact(filename)

    def training(self, epoch):
        """Run one training epoch with automatic mixed precision.

        Logs the per-iteration loss and the summed epoch loss to MLflow. The
        loop stops early once the batch index is greater than
        ``args.proc_batch_count``. When ``args.no_val`` is set, the model
        weights are checkpointed at the end of the epoch.

        Args:
            epoch: Zero-based epoch index, used as the MLflow step.
        """
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)

        for i, sample in enumerate(tbar):
            image, target = sample
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()

            self.optimizer.zero_grad()

            # Forward pass and loss run under autocast; backward and the
            # optimizer step go through the gradient scaler.
            with autocast():
                output = self.model(image)
                loss = self.criterion(output, target)
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()

            # Read the scalar once; every .item() call copies the value to host.
            loss_value = loss.item()
            train_loss += loss_value
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            mlflow.log_metric('train/total_loss_iter', loss_value,
                              i + num_img_tr * epoch)

            if i > self.args.proc_batch_count:
                break

        mlflow.log_metric('train/total_loss_epoch', train_loss, epoch)
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # Without validation there is no "best" model: save every epoch.
            self.save_checkpoint(self.model.state_dict())

    def validating(self, epoch):
        """Evaluate on the validation loader and track the best accuracy.

        Accumulates predictions in ``self.evaluator``, checkpoints the model
        whenever the accuracy exceeds ``self.best_pred`` and logs loss and
        accuracy metrics to MLflow. The loop stops early once the batch index
        is greater than ``args.proc_batch_count``.

        Args:
            epoch: Zero-based epoch index, used as the MLflow step.

        Returns:
            The best accuracy seen so far in the current trial.
        """
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0

        for i, sample in enumerate(tbar):
            image, target = sample
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()

            with autocast(), torch.no_grad():
                output = self.model(image)
                # Cast back to float32 before computing the loss.
                output = output.float()
                loss = self.criterion(output, target)

            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))

            pred = torch.argmax(output.data, dim=1).cpu().detach().numpy()
            target = target.cpu().detach().numpy()
            # Add this batch to the evaluator's running statistics.
            self.evaluator.add_batch(target, pred)

            if i > self.args.proc_batch_count:
                break

        # Fast test during the training.
        Acc = self.evaluator.Pixel_Accuracy()
        print('Validation:')

        if Acc > self.best_pred:
            self.best_pred = Acc
            self.save_checkpoint(self.model.state_dict())

        mlflow.log_metric('val/total_loss_epoch', test_loss, epoch)
        mlflow.log_metric("val/best_Acc", self.best_pred, epoch)
        mlflow.log_metric('val/Acc', Acc, epoch)

        return self.best_pred

    def trial_process(self, trial, optimizer, learning_rate, horizontal_flip,
                      horizontal_shift_ratio, vertical_shift_ratio,
                      random_erasing):
        """Train and evaluate one hyper-parameter combination.

        Args:
            trial: The Optuna trial being evaluated.
            optimizer: Optimizer name, either ``'SGD'`` or ``'Adam'``.
            learning_rate: Learning rate passed to the optimizer.
            horizontal_flip: Truthy/falsy string (parsed with ``strtobool``)
                enabling random horizontal flips for training.
            horizontal_shift_ratio: Maximum horizontal translation fraction
                for ``RandomAffine``.
            vertical_shift_ratio: Maximum vertical translation fraction for
                ``RandomAffine``.
            random_erasing: Truthy/falsy string (parsed with ``strtobool``)
                enabling ``RandomErasing`` for training.

        Returns:
            ``1.0 - best validation accuracy`` for this trial. If no
            validation epoch ran, the best accuracy is ``0.0`` and the result
            is ``1.0``.

        Raises:
            ValueError: If *optimizer* is not a supported optimizer name.
        """
        # Reject unknown optimizers before a run is started, instead of
        # silently reusing the optimizer left over from a previous trial.
        if optimizer not in ('SGD', 'Adam'):
            raise ValueError(
                "Unsupported optimizer: {!r} (expected 'SGD' or 'Adam')".format(
                    optimizer))

        self.best_pred = 0.0
        self.start_run(trial)
        # Log the per-trial information to MLflow.
        self.log_trial(trial)

        self.model = EfficientNet.from_pretrained(self.args.backbone)
        # Unfreeze model weights.
        for param in self.model.parameters():
            param.requires_grad = True
        # Replace the classifier head so it outputs args.nclass logits.
        num_ftrs = self.model._fc.in_features
        self.model._fc = nn.Linear(num_ftrs, self.args.nclass)

        if self.args.smry_viz:
            # Debug mode: dump the model graph and summary, then exit.
            from torchinfo import summary
            from torchviz import make_dot
            dummy_image = torch.zeros((2, 3, 32, 32))
            dummy_output = self.model(dummy_image)
            make_dot(dummy_output,
                     params=dict(self.model.named_parameters())).render(
                         "torchviz", format="png")
            summary(self.model, (1, 3, 32, 32))
            import sys
            sys.exit()

        if self.args.cuda:
            self.model = self.model.to('cuda')

        if optimizer == 'SGD':
            self.optimizer = optim.SGD(self.model.parameters(),
                                       lr=learning_rate)
        else:
            self.optimizer = optim.Adam(self.model.parameters(),
                                        lr=learning_rate)

        # Deterministic preprocessing shared by training and validation.
        base_steps = [
            T.ToTensor(),
            T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]

        # Random augmentation is applied to the training data only.
        train_steps = list(base_steps)
        if strtobool(horizontal_flip) == 1:
            train_steps.append(T.RandomHorizontalFlip(p=0.5))
        train_steps.append(
            T.RandomAffine(0, translate=(horizontal_shift_ratio,
                                         vertical_shift_ratio)))
        if strtobool(random_erasing) == 1:
            train_steps.append(T.RandomErasing())

        train_transform = T.Compose(train_steps)
        # Validation must not be randomly augmented, otherwise the accuracy
        # used to score the trial is noisy.
        val_transform = T.Compose(base_steps)

        train_set = torchvision.datasets.CIFAR10(
            root='./data', train=True, download=True,
            transform=train_transform)
        self.train_loader = torch.utils.data.DataLoader(
            train_set, batch_size=self.args.batch_size, shuffle=True,
            num_workers=self.args.workers)

        val_set = torchvision.datasets.CIFAR10(
            root='./data', train=False, download=True,
            transform=val_transform)
        self.val_loader = torch.utils.data.DataLoader(
            val_set, batch_size=self.args.batch_size, shuffle=False,
            num_workers=self.args.workers)

        self.criterion = nn.CrossEntropyLoss()

        for epoch in range(self.args.start_epoch, self.args.epochs):
            self.training(epoch)
            run_validation = (
                not self.args.no_val
                and epoch % self.args.eval_interval
                == (self.args.eval_interval - 1))
            if run_validation:
                self.validating(epoch)

        self.end_run()

        # Score by the best accuracy. self.best_pred equals the value
        # validating() returns and is always defined, even when no
        # validation epoch ran.
        return 1.0 - self.best_pred

    def objective_no_grid(self, trial):
        """Objective function for random and TPE search.

        Samples each hyper-parameter from the ranges/choices in ``self.args``
        and evaluates the combination with :meth:`trial_process`.

        Available suggest calls, for reference when adding parameters:
        ``suggest_categorical`` (categorical), ``suggest_int`` (integer),
        ``suggest_uniform`` (uniform), ``suggest_loguniform`` (log-uniform)
        and ``suggest_discrete_uniform`` (discrete uniform).

        Args:
            trial: The Optuna trial to sample from.

        Returns:
            The value returned by :meth:`trial_process`.
        """
        optimizer = trial.suggest_categorical('optimizer', self.args.optimizer)
        learning_rate = trial.suggest_loguniform(
            'learning_rate',
            self.args.learning_rate[0], self.args.learning_rate[1])
        horizontal_flip = trial.suggest_categorical(
            'horizontal_flip', self.args.horizontal_flip)
        horizontal_shift_ratio = trial.suggest_uniform(
            'horizontal_shift_ratio',
            self.args.horizontal_shift_ratio[0],
            self.args.horizontal_shift_ratio[1])
        vertical_shift_ratio = trial.suggest_uniform(
            'vertical_shift_ratio',
            self.args.vertical_shift_ratio[0],
            self.args.vertical_shift_ratio[1])
        random_erasing = trial.suggest_categorical(
            'random_erasing', self.args.random_erasing)

        # Evaluate this single parameter combination.
        return self.trial_process(trial, optimizer, learning_rate,
                                  horizontal_flip, horizontal_shift_ratio,
                                  vertical_shift_ratio, random_erasing)

    def objective_grid(self, trial):
        """Objective function for fixed parameters and grid search.

        As a rule, every parameter is specified through
        ``trial.suggest_categorical()``.

        Args:
            trial: The Optuna trial to sample from.

        Returns:
            The value returned by :meth:`trial_process`.
        """
        optimizer = trial.suggest_categorical('optimizer', self.args.optimizer)
        learning_rate = trial.suggest_categorical(
            'learning_rate', self.args.learning_rate)
        horizontal_flip = trial.suggest_categorical(
            'horizontal_flip', self.args.horizontal_flip)
        horizontal_shift_ratio = trial.suggest_categorical(
            'horizontal_shift_ratio', self.args.horizontal_shift_ratio)
        vertical_shift_ratio = trial.suggest_categorical(
            'vertical_shift_ratio', self.args.vertical_shift_ratio)
        random_erasing = trial.suggest_categorical(
            'random_erasing', self.args.random_erasing)

        # Evaluate this single parameter combination.
        return self.trial_process(trial, optimizer, learning_rate,
                                  horizontal_flip, horizontal_shift_ratio,
                                  vertical_shift_ratio, random_erasing)
class Trainer(object):
    """Train and validate a Deeplab segmentation model with TensorBoard logging.

    The constructor builds the data loaders, model, SGD optimizer, loss,
    evaluator and LR scheduler, and optionally resumes from a checkpoint.
    """

    @staticmethod
    def _unwrap(model):
        """Return ``model.module`` if *model* has that attribute, else *model*.

        Lets checkpoint code work both with a bare model and with one wrapped
        in a container that exposes the real network as ``.module`` (such as
        ``torch.nn.DataParallel``).
        """
        return getattr(model, 'module', model)

    def __init__(self, args):
        """Build every training component from *args*.

        Args:
            args: Namespace-like configuration. This method reads ``logdir``,
                ``workers``, ``batch_size``, ``lr``, ``momentum``,
                ``weight_decay``, ``nesterov``, ``resume``, ``cuda`` and
                ``ft``, and writes ``start_epoch`` when resuming or
                fine-tuning.

        Raises:
            RuntimeError: If ``args.resume`` is set but is not an existing file.
        """
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(args.logdir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        dltrain = DLDataset('trainval', "./data/pascal_voc_seg/tfrecord/")
        dlval = DLDataset('val', "./data/pascal_voc_seg/tfrecord/")
        self.train_loader = DataLoader(dltrain,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       **kwargs)
        self.val_loader = DataLoader(dlval,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     **kwargs)

        # Define network; the second parameter group trains at 10x the base
        # learning rate.
        model = Deeplab()
        train_params = [
            {'params': model.get_1x_lr_params(), 'lr': args.lr},
            {'params': model.get_10x_lr_params(), 'lr': args.lr * 10},
        ]

        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)

        # Define Criterion; targets equal to 255 are ignored by the loss.
        self.criterion = nn.CrossEntropyLoss(ignore_index=255).cuda()
        self.model, self.optimizer = model, optimizer

        # Define Evaluator (21 classes -- presumably PASCAL VOC including
        # background; confirm against the dataset).
        self.evaluator = Evaluator(21)

        # Define lr scheduler
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer=optimizer)

        # NOTE(review): the model and criterion are moved to CUDA
        # unconditionally, while the batch loops honour args.cuda -- confirm
        # that CPU-only runs are not expected.
        self.model = self.model.cuda()

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            # SECURITY: torch.load unpickles the file; only resume from
            # checkpoints obtained from a trusted source.
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # The model is not wrapped in DataParallel here, so load through
            # _unwrap instead of assuming a `.module` attribute exists.
            self._unwrap(self.model).load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one training epoch and log the losses to TensorBoard.

        Every 10th batch is passed to ``summary.visualize_image``. After the
        batch loop the plateau scheduler is stepped with the summed epoch
        loss. When ``args.no_val`` is set, a checkpoint is saved at the end
        of the epoch.

        Args:
            epoch: Zero-based epoch index.
        """
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)

        for i, (image, target) in enumerate(tbar):
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()

            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target.long())
            loss.backward()
            self.optimizer.step()

            # Read the scalar once; every .item() call copies the value to host.
            loss_value = loss.item()
            train_loss += loss_value
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            global_step = i + num_img_tr * epoch
            self.writer.add_scalar('train/total_loss_iter', loss_value,
                                   global_step)

            # Show inference results every 10 batches.
            if i % 10 == 0:
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)

        # Step the plateau scheduler once per epoch on the epoch loss;
        # stepping per batch on a running sum would never see an improvement.
        self.scheduler.step(train_loss)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # Without validation there is no "best" model: save every epoch.
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self._unwrap(self.model).state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the validation loader and checkpoint on a new best mIoU.

        Logs the summed loss, pixel accuracy, per-class accuracy, mIoU and
        frequency-weighted IoU to TensorBoard and prints them.

        Args:
            epoch: Zero-based epoch index.
        """
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0

        for i, (image, target) in enumerate(tbar):
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target.long())
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))

            # Reduce the class axis (axis 1) to predicted labels.
            pred = np.argmax(output.data.cpu().numpy(), axis=1)
            target = target.cpu().numpy()
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()

        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)

        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self._unwrap(self.model).state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)