def train(self, fix_net_weights=False):
    """Run the architecture-search training loop.

    Each epoch alternates, per iteration, a weight-update step on a train
    batch and an architecture-parameter update step on a validation batch
    (GDAS-style single-path sampling). After every epoch the super network
    is decoded, logged, and a checkpoint is saved when due or when the
    monitored validation metric improves.

    Args:
        fix_net_weights: debug switch — replaces the train loader with dummy
            (0, 0) pairs so the whole per-iteration body is skipped.
    """
    # NOTE(review): config is assumed to provide valid_batch_size; drop_last
    # is ignored — confirm against RunConfig.
    data_loader = self.run_manager.run_config.train_loader
    iter_per_epoch = len(data_loader)
    total_iteration = iter_per_epoch * self.run_manager.run_config.epochs
    # Schedule of how often/many times arch parameters update per iteration.
    self.update_scheduler = self.arch_search_config.get_update_schedule(
        iter_per_epoch)
    if fix_net_weights:  # used to debug
        data_loader = [(0, 0)] * iter_per_epoch
        print('Train Phase close for debug')

    # pay attention here, total_epochs include warmup epochs
    epoch_time = AverageMeter()
    end_epoch = time.time()
    for epoch in range(self.start_epoch, self.run_manager.run_config.epochs):
        self.logger.log(
            '\n' + '-' * 30 + 'Train Epoch: {}'.format(epoch + 1) + '-' * 30 + '\n',
            mode='search')
        self.run_manager.scheduler.step(epoch)
        train_lr = self.run_manager.scheduler.get_lr()
        arch_lr = self.arch_optimizer.param_groups[0]['lr']
        # Linearly anneal the Gumbel-softmax temperature from tau_max to tau_min.
        self.net.set_tau(
            self.arch_search_config.tau_max -
            (self.arch_search_config.tau_max - self.arch_search_config.tau_min)
            * (epoch) / (self.run_manager.run_config.epochs))
        tau = self.net.get_tau()

        # Per-epoch running meters: train (weight step) and valid (arch step).
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        accs = AverageMeter()
        mious = AverageMeter()
        fscores = AverageMeter()
        valid_losses = AverageMeter()
        valid_accs = AverageMeter()
        valid_mious = AverageMeter()
        valid_fscores = AverageMeter()

        self.net.train()
        epoch_str = 'epoch[{:03d}/{:03d}]'.format(
            epoch + 1, self.run_manager.run_config.epochs)
        time_left = epoch_time.average * (
            self.run_manager.run_config.epochs - epoch)
        common_log = '[*Train-Search* the {:}] Left={:} WLR={:} ALR={:} tau={:}'\
            .format(epoch_str,
                    str(timedelta(seconds=time_left)) if epoch != 0 else None,
                    train_lr, arch_lr, tau)
        self.logger.log(common_log, 'search')
        end = time.time()

        for i, (datas, targets) in enumerate(data_loader):
            if not fix_net_weights:
                if torch.cuda.is_available():
                    datas = datas.to(self.run_manager.device, non_blocking=True)
                    targets = targets.to(self.run_manager.device, non_blocking=True)
                else:
                    raise ValueError('do not support cpu version')
                data_time.update(time.time() - end)

                # ---- weight update on a train batch ----
                logits = self.net.single_path_forward(datas)  # super network gdas forward
                ce_loss = self.run_manager.criterion(logits, targets)
                # entropy term intentionally disabled (None); see add_regularization_loss
                loss = self.run_manager.add_regularization_loss(epoch, ce_loss, None)

                # metrics and update
                evaluator = Evaluator(self.run_manager.run_config.nb_classes)
                evaluator.add_batch(targets, logits)
                acc = evaluator.Pixel_Accuracy()
                miou = evaluator.Mean_Intersection_over_Union()
                fscore = evaluator.Fx_Score()
                losses.update(loss.data.item(), datas.size(0))
                accs.update(acc.item(), datas.size(0))
                mious.update(miou.item(), datas.size(0))
                fscores.update(fscore.item(), datas.size(0))

                self.net.zero_grad()
                loss.backward()
                self.run_manager.optimizer.step()

                # ---- architecture update on a validation batch ----
                # NOTE(review): valid_next_batch presumably cycles the valid
                # loader endlessly — confirm in RunConfig.
                valid_datas, valid_targets = self.run_manager.run_config.valid_next_batch
                if torch.cuda.is_available():
                    valid_datas = valid_datas.to(self.run_manager.device, non_blocking=True)
                    valid_targets = valid_targets.to(self.run_manager.device, non_blocking=True)
                else:
                    raise ValueError('do not support cpu version')

                logits = self.net.single_path_forward(valid_datas)
                ce_loss = self.run_manager.criterion(logits, valid_targets)
                loss = self.run_manager.add_regularization_loss(epoch, ce_loss, None)

                # metrics and update
                valid_evaluator = Evaluator(self.run_manager.run_config.nb_classes)
                valid_evaluator.add_batch(valid_targets, logits)
                acc = valid_evaluator.Pixel_Accuracy()
                miou = valid_evaluator.Mean_Intersection_over_Union()
                fscore = valid_evaluator.Fx_Score()
                # FIX: the valid meters were previously weighted with
                # datas.size(0) (the train batch size); use the actual
                # validation batch size so the running averages are correct
                # whenever train and valid batch sizes differ.
                valid_losses.update(loss.data.item(), valid_datas.size(0))
                valid_accs.update(acc.item(), valid_datas.size(0))
                valid_mious.update(miou.item(), valid_datas.size(0))
                valid_fscores.update(fscore.item(), valid_datas.size(0))

                self.net.zero_grad()
                loss.backward()  # release computational graph
                self.arch_optimizer.step()

                # batch_time covers one train step plus one arch step.
                batch_time.update(time.time() - end)
                end = time.time()

                # train_print_freq == sample_arch_freq
                if (i + 1) % self.run_manager.run_config.train_print_freq == 0 \
                        or (i + 1) == iter_per_epoch:
                    Wstr = '|*Search*|' + time_string() + '[{:}][iter{:03d}/{:03d}]'.format(
                        epoch_str, i + 1, iter_per_epoch)
                    Tstr = '|Time | {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})'.format(
                        batch_time=batch_time, data_time=data_time)
                    Bstr = '|Base | [Loss {loss.val:.3f} ({loss.avg:.3f}) Accuracy {acc.val:.2f} ({acc.avg:.2f}) MIoU {miou.val:.2f} ({miou.avg:.2f}) F {fscore.val:.2f} ({fscore.avg:.2f})]'.format(
                        loss=losses, acc=accs, miou=mious, fscore=fscores)
                    Astr = '|Arch | [Loss {loss.val:.3f} ({loss.avg:.3f}) Accuracy {acc.val:.2f} ({acc.avg:.2f}) MIoU {miou.val:.2f} ({miou.avg:.2f}) F {fscore.val:.2f} ({fscore.avg:.2f})]'.format(
                        loss=valid_losses, acc=valid_accs, miou=valid_mious, fscore=valid_fscores)
                    self.logger.log(Wstr + '\n' + Tstr + '\n' + Bstr + '\n' + Astr,
                                    mode='search')

        # Per-epoch entropy of the current (re-sampled) single path, for visdom.
        _, network_index = self.net.get_network_arch_hardwts()  # set self.hardwts again
        _, aspp_index = self.net.get_aspp_hardwts_index()
        single_path = self.net.sample_single_path(
            self.run_manager.run_config.nb_layers, aspp_index, network_index)
        cell_arch_entropy, network_arch_entropy, total_entropy = \
            self.net.calculate_entropy(single_path)

        # update visdom
        if self.vis is not None:
            self.vis.visdom_update(epoch, 'loss', [losses.average, valid_losses.average])
            self.vis.visdom_update(epoch, 'accuracy', [accs.average, valid_accs.average])
            self.vis.visdom_update(epoch, 'miou', [mious.average, valid_mious.average])
            self.vis.visdom_update(epoch, 'f1score', [fscores.average, valid_fscores.average])
            self.vis.visdom_update(epoch, 'cell_entropy', [cell_arch_entropy])
            self.vis.visdom_update(epoch, 'network_entropy', [network_arch_entropy])
            self.vis.visdom_update(epoch, 'entropy', [total_entropy])

        # update epoch_time
        epoch_time.update(time.time() - end_epoch)
        end_epoch = time.time()

        epoch_str = '{:03d}/{:03d}'.format(epoch + 1, self.run_manager.run_config.epochs)
        log = '[{:}] train :: loss={:.2f} accuracy={:.2f} miou={:.2f} f1score={:.2f}\n' \
              '[{:}] valid :: loss={:.2f} accuracy={:.2f} miou={:.2f} f1score={:.2f}\n'.format(
                  epoch_str, losses.average, accs.average, mious.average, fscores.average,
                  epoch_str, valid_losses.average, valid_accs.average, valid_mious.average,
                  valid_fscores.average)
        self.logger.log(log, mode='search')

        # ---- decode the super network and log the resulting architecture ----
        self.logger.log('<<<---------->>> Super Network decoding <<<---------->>> ',
                        mode='search')
        actual_path, cell_genotypes = self.net.network_cell_arch_decode()
        new_genotypes = []
        for _index, genotype in cell_genotypes:
            xlist = []
            print(_index, genotype)
            for edge_genotype in genotype:
                for (node_str, select_index) in edge_genotype:
                    # map candidate index back to its operation name
                    xlist.append((node_str,
                                  self.run_manager.run_config.conv_candidates[select_index]))
            new_genotypes.append((_index, xlist))
        log_str = 'The {:} decode network:\n' \
                  'actual_path = {:}\n' \
                  'genotype:'.format(epoch_str, actual_path)
        for _index, genotype in new_genotypes:
            log_str += 'index: {:} arch: {:}\n'.format(_index, genotype)
        self.logger.log(log_str, mode='network_space', display=False)

        # ---- checkpointing: track best valid metric, save when due ----
        val_monitor_metric = get_monitor_metric(
            self.run_manager.monitor_metric, valid_losses.average,
            valid_accs.average, valid_mious.average, valid_fscores.average)
        is_best = self.run_manager.best_monitor < val_monitor_metric
        self.run_manager.best_monitor = max(self.run_manager.best_monitor,
                                            val_monitor_metric)
        if (epoch + 1) % self.run_manager.run_config.save_ckpt_freq == 0 or (
                epoch + 1) == self.run_manager.run_config.epochs or is_best:
            checkpoint = {
                'state_dict': self.net.state_dict(),
                'weight_optimizer': self.run_manager.optimizer.state_dict(),
                'weight_scheduler': self.run_manager.scheduler.state_dict(),
                'arch_optimizer': self.arch_optimizer.state_dict(),
                'best_monitor': (self.run_manager.monitor_metric,
                                 self.run_manager.best_monitor),
                'warmup': False,
                'start_epochs': epoch + 1,
            }
            checkpoint_arch = {
                'actual_path': actual_path,
                'cell_genotypes': cell_genotypes,
            }
            filename = self.logger.path(mode='search', is_best=is_best)
            filename_arch = self.logger.path(mode='arch', is_best=is_best)
            save_checkpoint(checkpoint, filename, self.logger, mode='search')
            save_checkpoint(checkpoint_arch, filename_arch, self.logger, mode='arch')
class Trainer(object):
    """Adversarial domain-adaptation trainer: a DeepLab segmentation network
    (generator) trained against an FCDiscriminator on source/target batches."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # init D
        # NOTE(review): discriminator class count is hard-coded to 19
        # (Cityscapes-like) and may disagree with self.nclass — confirm.
        model_D = FCDiscriminator(num_classes=19)
        # Backbone params at base lr, head params at 10x lr.
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        optimizer_D = torch.optim.Adam(model_D.parameters(),
                                       lr=1e-4,
                                       betas=(0.9, 0.99))
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            # NOTE(review): Windows-style path separator; breaks on POSIX — verify.
            classes_weights_path = 'dataloders\\datasets\\' + args.dataset + '_classes_weights.npy'
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # BCE-with-logits drives both the adversarial (G) and discriminator (D) losses.
        self.bce_loss = torch.nn.BCEWithLogitsLoss()
        self.model, self.optimizer = model, optimizer
        self.model_D, self.optimizer_D = model_D, optimizer_D
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            self.model_D = torch.nn.DataParallel(self.model_D,
                                                 device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            patch_replication_callback(self.model_D)
            self.model = self.model.cuda()
            self.model_D = self.model_D.cuda()
        # Resuming checkpoint (generator only; D is not restored here).
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """One adversarial training epoch: per batch, update G on the source
        seg loss plus an adversarial loss on target, then update D to tell
        source outputs from target outputs."""
        # labels for adversarial training
        source_label = 0
        target_label = 1
        loss_seg_value = 0.0
        loss_adv_target_value = 0.0
        loss_D_value = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            src_image, src_label, tgt_image = sample['src_image'], sample[
                'src_label'], sample['tgt_image']
            if self.args.cuda:
                src_image, src_label, tgt_image = src_image.cuda(
                ), src_label.cuda(), tgt_image.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            self.scheduler(self.optimizer_D, i, epoch, self.best_pred)
            self.optimizer_D.zero_grad()
            ## train G
            # don't accumulate grads in D
            for param in self.model_D.parameters():
                param.requires_grad = False
            # train with source
            src_output = self.model(src_image)
            loss_seg = self.criterion(src_output, src_label)
            loss_seg.backward()
            loss_seg_value += loss_seg.item()
            # train with target: fool D into labeling target outputs as source.
            tgt_output = self.model(tgt_image)
            # NOTE(review): softmax over dim=0 normalizes across the batch,
            # not the class channel (dim=1) — looks like a bug; confirm intent.
            D_out = self.model_D(F.softmax(tgt_output, dim=0))
            loss_adv_target = self.bce_loss(
                D_out,
                Variable(
                    torch.FloatTensor(
                        D_out.data.size()).fill_(source_label)).cuda())
            loss_adv_target.backward()
            loss_adv_target_value += loss_adv_target.item()
            ## train D
            # bring back requires_grad
            for param in self.model_D.parameters():
                param.requires_grad = True
            # train with source (detached so G receives no gradient)
            src_output = src_output.detach()
            D_out = self.model_D(F.softmax(src_output, dim=0))
            loss_D = self.bce_loss(
                D_out,
                Variable(
                    torch.FloatTensor(
                        D_out.data.size()).fill_(source_label)).cuda())
            loss_D.backward()
            loss_D_value += loss_D.item()
            # train with target (detached), labeled as target for D
            tgt_output = tgt_output.detach()
            D_out = self.model_D(F.softmax(tgt_output, dim=0))
            loss_D = self.bce_loss(
                D_out,
                Variable(
                    torch.FloatTensor(
                        D_out.data.size()).fill_(target_label)).cuda())
            loss_D.backward()
            loss_D_value += loss_D.item()
            self.optimizer.step()
            self.optimizer_D.step()
            tbar.set_description(
                'Seg_loss: %.3f d_loss: %.3f d_inv_loss: %.3f' %
                (loss_seg_value / (i + 1), loss_adv_target_value /
                 (i + 1), loss_D_value / (i + 1)))
            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                image = torch.cat([src_image, tgt_image], dim=0)
                output = torch.cat([src_output, tgt_output], dim=0)
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, src_label, output,
                                             global_step)
        # Epoch-level scalars (sums over the epoch, not averages).
        self.writer.add_scalar('train/Seg_loss', loss_seg_value, epoch)
        self.writer.add_scalar('train/d_loss', loss_adv_target_value, epoch)
        self.writer.add_scalar('train/d_inv_loss', loss_D_value, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' %
              (loss_seg_value + loss_adv_target_value + loss_D_value))
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the validation set and save a checkpoint when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        # This Evaluator returns (mean, per-class) — per-class IoU discarded here.
        mIoU, _ = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
class Trainer(object):
    """Segmentation trainer around `generate_net`, with per-iteration train
    metrics (inner evaluator reset every 10 iters) and epoch-level metrics."""

    def __init__(self, args):
        self.args = args
        # Pick the BatchNorm implementation used by generate_net.
        if self.args.sync_bn:
            self.args.batchnorm_function = SynchronizedBatchNorm2d
        else:
            self.args.batchnorm_function = torch.nn.BatchNorm2d
        print(self.args)
        # Define Saver
        self.saver = Saver(self.args)
        # Define Tensorboard Summary
        self.summary = TensorboardSummary()
        self.writer = self.summary.create_summary(self.saver.experiment_dir)
        # Define Dataloader
        # NOTE(review): num_workers is set from args.gpus — presumably one
        # worker per GPU; confirm this is intended rather than args.workers.
        kwargs = {'num_workers': self.args.gpus, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader = make_data_loader(
            self.args, **kwargs)
        self.nclass = self.args.num_classes
        # Define network
        model = generate_net(self.args)
        # Conv weights at base lr with decay; conv biases at 2x lr, no decay.
        train_params = [{
            'params': model.get_conv_weight_params(),
            'lr': self.args.lr,
            'weight_decay': self.args.weight_decay
        }, {
            'params': model.get_conv_bias_params(),
            'lr': self.args.lr * 2,
            'weight_decay': 0
        }]
        # Define Optimizer
        # NOTE(review): if optim_method matches neither branch, `optimizer`
        # stays unbound and the assignment below raises NameError — verify
        # upstream validation of optim_method.
        if self.args.optim_method == 'sgd':
            optimizer = torch.optim.SGD(train_params,
                                        momentum=self.args.momentum,
                                        lr=self.args.lr,
                                        weight_decay=0,
                                        nesterov=self.args.nesterov)
        elif self.args.optim_method == 'adagrad':
            optimizer = torch.optim.Adagrad(
                train_params,
                lr=self.args.lr,
                weight_decay=self.args.weight_decay)
        else:
            pass
        # Define Criterion
        # whether to use class balanced weights
        if self.args.use_balanced_weights:
            classes_weights_path = os.path.join(self.args.save_dir,
                                                self.args.dataset,
                                                'classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(self.args.save_dir,
                                                  self.args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32)).type(
                torch.FloatTensor)
        else:
            weight = None
        # NOTE(review): `args` here (not self.args) — same object, but
        # inconsistent with the rest of this method.
        self.criterion = SegmentationLosses(
            weight=weight,
            cuda=self.args.cuda,
            foreloss_weight=args.foreloss_weight,
            seloss_weight=args.seloss_weight).build_loss(
                mode=self.args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator (epoch-level) and a second one reset every 10 iters.
        self.evaluator = Evaluator(self.nclass)
        self.evaluator_inner = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(self.args.lr_scheduler, self.args.lr,
                                      self.args.epochs,
                                      len(self.train_loader))
        self.model = self.model.cuda()
        # Resuming checkpoint
        self.args.start_epoch = 0
        self.best_pred = 0.0
        if self.args.resume is not None:
            optimizer, start_epoch, best_pred = load_pretrained_mode(
                self.model, checkpoint_path=self.args.resume)
            # Restore optimizer/progress only when not fine-tuning.
            if not self.args.ft and optimizer is not None:
                self.optimizer.load_state_dict(optimizer)
                self.args.start_epoch = start_epoch
                self.best_pred = best_pred
        # Using cuda
        if self.args.cuda:
            self.model = torch.nn.DataParallel(self.model)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()

    def training(self, epoch):
        """One training epoch; prints running metrics every 10 iterations and
        epoch-level metrics at the end."""
        train_loss = 0.0
        self.model.train()
        num_img_tr = len(self.train_loader)
        self.evaluator.reset()
        self.evaluator_inner.reset()
        print('Training')
        start_time = time.time()
        for i, sample in enumerate(self.train_loader):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            current_lr = self.scheduler(self.optimizer, i, epoch,
                                        self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            # criterion returns (loss, possibly-transformed output)
            loss, output = self.criterion(output, target)
            pred = output.data.clone()
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            pred = pred.data.cpu().numpy()
            target_array = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            self.evaluator_inner.add_batch(target_array, pred)
            self.evaluator.add_batch(target_array, pred)
            # Rolling 10-iteration window metrics, then reset the inner evaluator.
            if i % 10 == 0:
                Acc_train = self.evaluator_inner.Pixel_Accuracy()
                Acc_class_train = self.evaluator_inner.Pixel_Accuracy_Class()
                mIoU_train, IoU_train = self.evaluator_inner.Mean_Intersection_over_Union(
                )
                FWIoU_train = self.evaluator_inner.Frequency_Weighted_Intersection_over_Union(
                )
                print(
                    '\n===>Iteration %d/%d learning_rate: %.6f metric:' %
                    (i, num_img_tr, current_lr))
                print(
                    '=>Train loss: %.4f acc: %.4f m_acc: %.4f miou: %.4f fwiou: %.4f'
                    % (loss.item(), Acc_train, Acc_class_train, mIoU_train,
                       FWIoU_train))
                print("IoU per class: ", IoU_train)
                self.evaluator_inner.reset()
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            if num_img_tr > 10:
                if i % (num_img_tr // 10) == 0:
                    global_step = i + num_img_tr * epoch
                    self.summary.visualize_image(self.writer,
                                                 self.args.dataset, image,
                                                 target, output, global_step)
            else:
                # Fewer than 10 batches: visualize every iteration.
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        Acc_train_epoch = self.evaluator.Pixel_Accuracy()
        Acc_class_train_epoch = self.evaluator.Pixel_Accuracy_Class()
        mIoU_train_epoch, IoU_train_epoch = self.evaluator.Mean_Intersection_over_Union(
        )
        FWIoU_train_epoch = self.evaluator.Frequency_Weighted_Intersection_over_Union(
        )
        stop_time = time.time()
        self.writer.add_scalar('train/total_loss_epoch',
                               train_loss / num_img_tr, epoch)
        print(
            '=====>[Epoch: %d, numImages: %5d time_consuming: %d]' %
            (epoch, num_img_tr * self.args.batch_size,
             stop_time - start_time))
        print(
            "Loss: %.3f Acc: %.4f, Acc_class: %.4f, mIoU: %.4f, fwIoU: %.4f\n\n"
            % (train_loss / (num_img_tr), Acc_train_epoch,
               Acc_class_train_epoch, mIoU_train_epoch, FWIoU_train_epoch))
        print("IoU per class: ", IoU_train_epoch)

    def validation(self, epoch):
        """Evaluate on the validation set; always saves a checkpoint, flagged
        best when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        test_loss = 0.0
        print('\nValidation')
        num_img_tr = len(self.val_loader)
        start_time = time.time()
        for i, sample in enumerate(self.val_loader):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss, output = self.criterion(output, target)
            test_loss += loss.item()
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        stop_time = time.time()
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU, IoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch',
                               test_loss / num_img_tr, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print(
            '=====>[Epoch: %d, numImages: %5d previous best=%.4f time_consuming: %d]'
            % (epoch, num_img_tr * self.args.gpus, self.best_pred,
               (stop_time - start_time)))
        print(
            "Loss: %.3f Acc: %.4f, Acc_class: %.4f, mIoU: %.4f, fwIoU: %.4f\n\n"
            % (test_loss / (num_img_tr), Acc, Acc_class, mIoU, FWIoU))
        print("IoU per class: ", IoU)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
        else:
            is_best = False
        self.saver.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': new_pred,
            }, is_best)
class Trainer(object):
    """DeepLab trainer with Lovász-style `L.xloss`, checkpoint key surgery on
    resume, and a validation pass that computes a global F1 over binarized
    predictions resized back to original image sizes."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # Backbone at base lr, head at 10x lr.
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
                                    weight_decay=args.weight_decay, nesterov=args.nesterov)
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'"
                                   .format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                #print(model.state_dict().keys())
                # Key surgery: rename high_level_features.{4,5,6}.* keys to
                # their low_level_features counterparts, and re-initialize
                # decoder layers whose shapes differ from the checkpoint by
                # copying the freshly constructed model's weights.
                pretrained_dict = {k: v for k, v in checkpoint['state_dict'].items()}
                del_list = []
                add_list = []
                for key in pretrained_dict.keys():
                    if key.split('.')[1] == 'high_level_features' and (key.split('.')[2] == '4' or key.split('.')[2] == '5' or key.split('.')[2] == '6'):
                        #pretrained_dict[key.replace('high','low')] = pretrained_dict[key]
                        add_list.append(key)
                        del_list.append(key)
                for key in add_list:
                    pretrained_dict[key.replace('high','low')] = pretrained_dict[key]
                for key in del_list:
                    del pretrained_dict[key]
                pretrained_dict['decoder.conv1.weight'] = model.state_dict()['decoder.conv1.weight']
                # pretrained_dict['decoder.bn1.weight'] = model.state_dict()['decoder.bn1.weight']
                # pretrained_dict['decoder.bn1.bias'] = model.state_dict()['decoder.bn1.bias']
                # pretrained_dict['decoder.bn1.running_mean'] = model.state_dict()['decoder.bn1.running_mean']
                # pretrained_dict['decoder.bn1.running_var'] = model.state_dict()['decoder.bn1.running_var']
                # pretrained_dict['decoder.last_conv.0.weight'] = model.state_dict()['decoder.last_conv.0.weight']
                pretrained_dict['decoder.last_conv.8.weight'] = model.state_dict()['decoder.last_conv.8.weight']
                pretrained_dict['decoder.last_conv.8.bias'] = model.state_dict()['decoder.last_conv.8.bias']
                self.model.module.load_state_dict(pretrained_dict)
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """One training epoch using L.xloss (label 255 ignored)."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            #print('target:',target.shape)
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            # model returns (logits, secondary output `img`); img unused here
            output, img = self.model(image)
            #loss = self.criterion(output, target)
            loss = L.xloss(output, target.long(), ignore=255)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        """Validation pass: resizes each prediction/target back to its original
        (h, w), binarizes (any non-zero class -> 1), accumulates a global
        confusion matrix via bincount, and reports F1 alongside the usual
        evaluator metrics. Saves a checkpoint when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        num_img_tr = len(self.train_loader)
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        F1 = 0.0
        index = 0
        # Global binary confusion counts: FF/FT/TF/TT == TN/FP/FN/TP.
        FF = FT = TF = TT = 0
        for i, sample in enumerate(tbar):
            image, target = sample[0]['image'], sample[0]['label']
            w = sample[1]
            h = sample[2]
            name = sample[3]
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output, img = self.model(image)
            #loss = self.criterion(output, target)
            loss = L.xloss(output, target.long(), ignore=255)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            #pred = output.data.cpu().numpy()
            # predictions come from the secondary output `img`, not `output`
            pred = img.data.cpu().numpy()
            #summary
            global_step = i + num_img_tr * epoch
            self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step)
            # NOTE(review): results of these two calls are discarded — no-ops.
            h.numpy().tolist()
            w.numpy().tolist()
            index += len(h)
            if target.size()[0] == 1:
                target = target.cpu().numpy().astype(np.uint8)
            else:
                target = target.cpu().numpy().squeeze().astype(np.uint8)
            pred = np.argmax(pred, axis=1)
            # NOTE(review): this inner loop reuses/shadows the outer loop
            # variable `i`; the later numImages print therefore reflects the
            # inner index — verify intent.
            for i in range(len(h)):
                target_ = target[i]
                pred_ = pred[i]
                tar_img = Image.fromarray(target_)
                pre_img = Image.fromarray(pred_.squeeze().astype(np.uint8))
                tar_img = Resize((h[i], w[i]), interpolation=2)(tar_img)
                pred_ = Resize((h[i], w[i]), interpolation=2)(pre_img)
                target_ = np.array(tar_img)
                pred_ = np.array(pred_)
                # Binarize: any non-background class counts as foreground.
                pred_[pred_ != 0] = 1
                target_[target_ != 0] = 1
                pred_ = pred_.astype(int)
                target_ = target_.astype(int)
                # target*2 + pred encodes the 4 confusion cells as 0..3.
                ff, ft, tf, tt = np.bincount((target_ * 2 + pred_).reshape(-1), minlength=4)
                #print(ff,ft,tf,tt)
                FF += ff
                FT += ft
                TF += tf
                TT += tt
                # F1 score
                #F1 += self.evaluator.F1_score(target_, pred_)
                # Add batch sample into evaluator
                self.evaluator.add_batch(target_, pred_)
            # image_np = image[0].cpu().numpy()
            # image_np = np.array((image_np*128+128).transpose((1,2,0)),dtype=np.uint8)
            # self.writer.add_image('Input', image_np)
        # NOTE(review): divides by zero if there are no true/predicted
        # positives (TT+FT==0 or TT+TF==0) — confirm inputs guarantee both.
        R = TT / float(TT + FT)
        P = TT / float(TT + TF)
        F1 = (2 * R * P) / (R + P)
        #Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        # Combined selection metric: mean of F1 and mIoU.
        desire = (F1 + mIoU) * 0.5
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        self.writer.add_scalar('val/F1_score', F1, epoch)
        self.writer.add_scalar('val/desire', desire, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}, F1_score: {}, desire: {}".format(Acc, Acc_class, mIoU, FWIoU, F1, desire))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
class Trainer(object):
    """End-to-end training/validation driver for aerial segmentation models.

    Builds dataset loaders, model, loss, optimizer and LR scheduler from
    ``args``, optionally restores a checkpoint (resume or finetune), and
    schedules the epoch loop in :meth:`run`.
    """

    def __init__(self, args):
        self.args = args
        self.mode = args.mode
        self.epochs = args.epochs
        self.dataset = args.dataset
        self.data_path = args.data_path
        self.train_crop_size = args.train_crop_size
        self.eval_crop_size = args.eval_crop_size
        self.stride = args.stride
        self.batch_size = args.train_batch_size

        # Data: random crops for training, full images (batch=1) for eval.
        self.train_data = AerialDataset(crop_size=self.train_crop_size,
                                        dataset=self.dataset,
                                        data_path=self.data_path,
                                        mode='train')
        self.train_loader = DataLoader(self.train_data,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       num_workers=2)
        self.eval_data = AerialDataset(dataset=self.dataset,
                                       data_path=self.data_path,
                                       mode='val')
        self.eval_loader = DataLoader(self.eval_data,
                                      batch_size=1,
                                      shuffle=False,
                                      num_workers=2)

        # epoch_repeat: number of crops needed to tile one full source image,
        # so one "epoch" covers roughly the whole image area once.
        if self.dataset == 'Potsdam':
            self.num_of_class = 6
            self.epoch_repeat = get_test_times(6000, 6000,
                                               self.train_crop_size,
                                               self.train_crop_size)
        elif self.dataset == 'UDD5':
            self.num_of_class = 5
            self.epoch_repeat = get_test_times(4000, 3000,
                                               self.train_crop_size,
                                               self.train_crop_size)
        elif self.dataset == 'UDD6':
            self.num_of_class = 6
            self.epoch_repeat = get_test_times(4000, 3000,
                                               self.train_crop_size,
                                               self.train_crop_size)
        else:
            raise NotImplementedError

        # Model selection via dispatch table instead of an if/elif ladder.
        model_factory = {
            'FCN': lambda: models.FCN8(num_classes=self.num_of_class),
            'DeepLabV3+': lambda: models.DeepLab(num_classes=self.num_of_class,
                                                 backbone='resnet'),
            'GCN': lambda: models.GCN(num_classes=self.num_of_class),
            'UNet': lambda: models.UNet(num_classes=self.num_of_class),
            'ENet': lambda: models.ENet(num_classes=self.num_of_class),
            'D-LinkNet': lambda: models.DinkNet34(num_classes=self.num_of_class),
        }
        if args.model not in model_factory:
            raise NotImplementedError
        self.model = model_factory[args.model]()

        loss_factory = {
            'CE': CrossEntropyLoss2d,
            'LS': LovaszSoftmax,
            'F': FocalLoss,
            'CE+D': CE_DiceLoss,
        }
        if args.loss not in loss_factory:
            raise NotImplementedError
        self.criterion = loss_factory[args.loss]()

        self.schedule_mode = args.schedule_mode
        self.optimizer = opt.AdamW(self.model.parameters(), lr=args.lr)
        if self.schedule_mode == 'step':
            self.scheduler = opt.lr_scheduler.StepLR(self.optimizer,
                                                     step_size=30, gamma=0.1)
        elif self.schedule_mode == 'miou' or self.schedule_mode == 'acc':
            # Plateau scheduler keyed on the metric passed in run().
            self.scheduler = opt.lr_scheduler.ReduceLROnPlateau(
                self.optimizer, mode='max', patience=10, factor=0.1)
        elif self.schedule_mode == 'poly':
            iters_per_epoch = len(self.train_loader)
            self.scheduler = Poly(self.optimizer, num_epochs=args.epochs,
                                  iters_per_epoch=iters_per_epoch)
        else:
            raise NotImplementedError

        self.evaluator = Evaluator(self.num_of_class)
        self.model = nn.DataParallel(self.model)

        self.cuda = args.cuda
        if self.cuda:
            self.model = self.model.cuda()

        self.resume = args.resume
        self.finetune = args.finetune
        # Resume and finetune are mutually exclusive entry points.
        assert not (self.resume is not None and self.finetune is not None)
        if self.resume is not None:
            # Full resume: restore model, optimizer and scheduler state.
            print("Loading existing model...")
            if self.cuda:
                checkpoint = torch.load(args.resume)
            else:
                checkpoint = torch.load(args.resume, map_location='cpu')
            self.model.load_state_dict(checkpoint['parameters'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.scheduler.load_state_dict(checkpoint['scheduler'])
            self.start_epoch = checkpoint['epoch'] + 1  # start from next epoch
        elif self.finetune is not None:
            # Finetune: restore weights only; optimizer/scheduler start fresh.
            print("Loading existing model...")
            if self.cuda:
                checkpoint = torch.load(args.finetune)
            else:
                checkpoint = torch.load(args.finetune, map_location='cpu')
            self.model.load_state_dict(checkpoint['parameters'])
            self.start_epoch = checkpoint['epoch'] + 1
        else:
            self.start_epoch = 1

        if self.mode == 'train':
            self.writer = SummaryWriter(comment='-' + self.dataset + '_'
                                        + self.model.__class__.__name__
                                        + '_' + args.loss)
        self.init_eval = args.init_eval

    # Note: self.start_epoch and self.epochs are only used in run() to
    # schedule training & validation.
    def run(self):
        """Main loop: optional initial eval, then train + validate per epoch."""
        if self.init_eval:  # init with an evaluation
            init_test_epoch = self.start_epoch - 1
            Acc, _, mIoU, _ = self.validate(init_test_epoch, save=True)
            self.writer.add_scalar('eval/Acc', Acc, init_test_epoch)
            self.writer.add_scalar('eval/mIoU', mIoU, init_test_epoch)
            self.writer.flush()
        end_epoch = self.start_epoch + self.epochs
        for epoch in range(self.start_epoch, end_epoch):
            loss = self.train(epoch)
            self.writer.add_scalar(
                'train/lr',
                self.optimizer.state_dict()['param_groups'][0]['lr'], epoch)
            self.writer.add_scalar('train/loss', loss, epoch)
            self.writer.flush()
            saved_dict = {
                'model': self.model.__class__.__name__,
                'epoch': epoch,
                'dataset': self.dataset,
                'parameters': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'scheduler': self.scheduler.state_dict()
            }
            torch.save(
                saved_dict,
                f'./{self.model.__class__.__name__}_{self.dataset}_epoch{epoch}.pth.tar'
            )
            Acc, _, mIoU, _ = self.validate(epoch, save=True)
            self.writer.add_scalar('eval/Acc', Acc, epoch)
            self.writer.add_scalar('eval/mIoU', mIoU, epoch)
            self.writer.flush()
            # Step the scheduler with the metric it was configured for.
            if self.schedule_mode == 'step' or self.schedule_mode == 'poly':
                self.scheduler.step()
            elif self.schedule_mode == 'miou':
                self.scheduler.step(mIoU)
            elif self.schedule_mode == 'acc':
                self.scheduler.step(Acc)
            else:
                raise NotImplementedError
        self.writer.close()

    def train(self, epoch):
        """Train one epoch, repeating the crop loader ``epoch_repeat`` times.

        Returns the accumulated (summed) loss over all batches.
        """
        self.model.train()
        print(f"----------epoch {epoch}----------")
        print("lr:", self.optimizer.state_dict()['param_groups'][0]['lr'])
        total_loss = 0
        num_of_batches = len(self.train_loader) * self.epoch_repeat
        # BUG FIX: was `range(100)`, contradicting num_of_batches above
        # (computed with self.epoch_repeat); repeat the loader as the
        # progress counter assumes.
        for itr in range(self.epoch_repeat):
            for i, [img, gt] in enumerate(self.train_loader):
                print(
                    f"epoch: {epoch} batch: {i+1+itr*len(self.train_loader)}/{num_of_batches}"
                )
                print("img:", img.shape)
                print("gt:", gt.shape)
                self.optimizer.zero_grad()
                if self.cuda:
                    img, gt = img.cuda(), gt.cuda()
                pred = self.model(img)
                print("pred:", pred.shape)
                loss = self.criterion(pred, gt.long())
                print("loss:", loss)
                total_loss += loss.data
                loss.backward()
                self.optimizer.step()
        return total_loss

    def validate(self, epoch, save):
        """Sliding-window evaluation over full validation images.

        Returns (Acc, Acc_class, mIoU, FWIoU). When ``save`` is true the
        predicted color masks are written to ``epoch<epoch>/``.
        """
        self.model.eval()
        print(f"----------validate epoch {epoch}----------")
        # BUG FIX: the existence check used "epoch_<n>" while mkdir and the
        # save path used "epoch<n>", so a stale directory crashed os.mkdir
        # with FileExistsError; use one consistent name.
        save_dir = "epoch" + str(epoch)
        if save and not os.path.exists(save_dir):
            os.mkdir(save_dir)
        num_of_imgs = len(self.eval_loader)
        for idx, sample in enumerate(self.eval_loader):  # renamed from `i` to avoid shadowing by the patch loop below
            img_name, gt_name = sample['img'][0], sample['gt'][0]
            print(f"{idx+1}/{num_of_imgs}:")
            img = Image.open(img_name).convert('RGB')
            gt = np.array(Image.open(gt_name))
            times, points = self.get_pointset(img)
            print(f'{times} tests will be carried out on {img_name}...')
            W, H = img.size  # TODO: check numpy & PIL dimensions
            label_map = np.zeros([H, W], dtype=np.uint8)
            # score_map not necessarily to be uint8 but uint8 gets better result...
            score_map = np.zeros([H, W], dtype=np.uint8)
            tbar = tqdm(points)
            for i, j in tbar:
                tbar.set_description(f"{i},{j}")
                label_map, score_map = self.test_patch(i, j, img, label_map,
                                                       score_map)
            # finish a large image
            self.evaluator.add_batch(label_map, gt)
            if save:
                mask = ret2mask(label_map, dataset=self.dataset)
                png_name = os.path.join(
                    save_dir,
                    os.path.basename(img_name).split('.')[0] + '.png')
                Image.fromarray(mask).save(png_name)
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print("Acc:", Acc)
        print("Acc_class:", Acc_class)
        print("mIoU:", mIoU)
        print("FWIoU:", FWIoU)
        self.evaluator.reset()
        return Acc, Acc_class, mIoU, FWIoU

    def test_patch(self, i, j, img, label_map, score_map):
        """Classify one crop at (i, j) and merge it into the running maps,
        keeping the higher-scored label wherever crops overlap."""
        tr = EvaluationTransform(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        cropped = img.crop(
            (i, j, i + self.eval_crop_size, j + self.eval_crop_size))
        cropped = tr(cropped).unsqueeze(0)
        if self.cuda:
            cropped = cropped.cuda()
        out = self.model(cropped)
        # out = torch.nn.functional.softmax(out, dim=1)
        ret = torch.max(out.squeeze(), dim=0)
        score = ret[0].data.detach().cpu().numpy()
        label = ret[1].data.detach().cpu().numpy()
        # numpy array's shape is [H,W] while PIL.Image is [W,H]
        score_temp = score_map[j:j + self.eval_crop_size,
                               i:i + self.eval_crop_size]
        label_temp = label_map[j:j + self.eval_crop_size,
                               i:i + self.eval_crop_size]
        index = score > score_temp
        score_temp[index] = score[index]
        label_temp[index] = label[index]
        label_map[j:j + self.eval_crop_size,
                  i:i + self.eval_crop_size] = label_temp
        score_map[j:j + self.eval_crop_size,
                  i:i + self.eval_crop_size] = score_temp
        return label_map, score_map

    def get_pointset(self, img):
        """Enumerate top-left corners of sliding eval windows over ``img``,
        clamping the last row/column so crops never run off the edge.

        Returns (count, pointset); count is cross-checked against
        get_test_times().
        """
        W, H = img.size
        pointset = []
        count = 0
        i = 0
        while i < W:
            break_flag_i = False
            if i + self.eval_crop_size >= W:
                i = W - self.eval_crop_size
                break_flag_i = True
            j = 0
            while j < H:
                break_flag_j = False
                if j + self.eval_crop_size >= H:
                    j = H - self.eval_crop_size
                    break_flag_j = True
                count += 1
                pointset.append((i, j))
                if break_flag_j:
                    break
                j += self.stride
            if break_flag_i:
                break
            i += self.stride
        value = get_test_times(W, H, self.eval_crop_size, self.stride)
        assert count == value, f'count={count} while get_test_times returns {value}'
        return count, pointset
class Tester(object):
    """Evaluation-only harness: loads a DeepLab checkpoint and computes
    metrics (plus periodic batch visualizations) on the PASCAL VOC val split.
    """

    def __init__(self, args):
        self.args = args
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        val_set = pascal.VOCSegmentation(args, split='val')
        self.nclass = val_set.NUM_CLASSES
        self.val_loader = DataLoader(val_set,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     **kwargs)
        # Define network
        self.model = DeepLab(num_classes=self.nclass,
                             backbone=args.backbone,
                             output_stride=args.out_stride,
                             sync_bn=args.sync_bn,
                             freeze_bn=args.freeze_bn)
        self.criterion = SegmentationLosses(
            weight=None, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Using cuda
        if args.cuda:
            print('device_ids', self.args.gpu_ids)
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # DataParallel wraps the model, so weights live under .module.
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

    def visualization(self):
        """Run the model over the val set: accumulate loss and confusion
        matrix, save ~10 sample batches to disk, and print final metrics."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        num_img_val = len(self.val_loader)
        # BUG FIX: `num_img_val // 10` is 0 when there are fewer than 10
        # batches, making the modulo below raise ZeroDivisionError; clamp
        # the stride to at least 1.
        save_stride = max(num_img_val // 10, 1)
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            # Save images, predictions, targets into disk
            if i % save_stride == 0:
                self.save_batch_images(image, output, target, i)
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
            # if i == 0:
            #     break
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        # mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        mIoU = self.evaluator.All_Mean_Intersection_over_Union()
        print('Validation:')
        print("Acc:{}, Acc_class:{}, fwIoU: {}".format(Acc, Acc_class, FWIoU))
        # NOTE(review): assumes All_Mean_Intersection_over_Union returns at
        # least 4 per-class entries — confirm against the Evaluator class.
        print("mIoU:{:.4f} {:.4f} {:.4f} {:.4f}".format(
            mIoU[0], mIoU[1], mIoU[2], mIoU[3]))
        print('Loss: %.3f' % test_loss)

    def save_batch_images(self, imgs, preds, targets, batch_index):
        """Write image/prediction/target grids into a `visualization/`
        directory next to the loaded checkpoint."""
        (filepath, _) = os.path.split(self.args.resume)
        save_path = os.path.join(filepath, 'visualization')
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        grid_image = make_grid(imgs.clone().detach().cpu(), 8, normalize=True)
        save_image(
            grid_image,
            os.path.join(save_path,
                         'batch_{:0>4}-img.jpg'.format(batch_index)))
        grid_image = make_grid(decode_seg_map_sequence(
            torch.max(preds, 1)[1].detach().cpu().numpy(),
            dataset=self.args.dataset),
                               8,
                               normalize=False,
                               range=(0, 255))
        save_image(
            grid_image,
            os.path.join(save_path,
                         'batch_{:0>4}-pred.png'.format(batch_index)))
        grid_image = make_grid(decode_seg_map_sequence(
            torch.squeeze(targets, 1).detach().cpu().numpy(),
            dataset=self.args.dataset),
                               8,
                               normalize=False,
                               range=(0, 255))
        save_image(
            grid_image,
            os.path.join(save_path,
                         'batch_{:0>4}-target.png'.format(batch_index)))
class Trainer(object):
    """Trainer for EDCNet on RGB + event-camera data.

    Optimizes a joint objective: segmentation loss on the RGB head plus a
    weighted auxiliary loss on the event head. Randomly-initialized layers
    get 10x the learning rate / weight decay of fine-tuned (pretrained) ones.
    """

    def __init__(self, args):
        self.args = args
        # Experiment bookkeeping: config dump, tensorboard writer, logger.
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        self.logger = self.saver.create_logger()

        kwargs = {'num_workers': args.workers, 'pin_memory': False}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

        self.model = EDCNet(args.rgb_dim, args.event_dim, num_classes=self.nclass, use_bn=True)
        # Fresh layers train 10x faster than pretrained (fine-tune) layers.
        train_params = [{'params': self.model.random_init_params(),
                         'lr': 10*args.lr, 'weight_decay': 10*args.weight_decay},
                        {'params': self.model.fine_tune_params(),
                         'lr': args.lr, 'weight_decay': args.weight_decay}]
        self.optimizer = torch.optim.Adam(train_params, lr=args.lr, weight_decay=args.weight_decay)

        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)  # fix SyncBN under DataParallel
            self.model = self.model.to(self.args.device)

        # Optional class-balanced weights, loaded from (or cached to) disk.
        if args.use_balanced_weights:
            # db_root_dir may return a list of roots; use the first one.
            root_dir = Path.db_root_dir(args.dataset)[0] if isinstance(Path.db_root_dir(args.dataset), list) else Path.db_root_dir(args.dataset)
            classes_weights_path = os.path.join(root_dir, args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass, classes_weights_path)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Auxiliary loss for the event-prediction head.
        self.criterion_event = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode='event')

        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader), warmup_epochs=5)
        self.evaluator = Evaluator(self.nclass, self.logger)
        self.saver.save_model_summary(self.model)
        self.best_pred = 0.0

        # Optionally resume from checkpoint (weights; optimizer too unless
        # fine-tuning, in which case the epoch counter is also reset).
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cuda:0')
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one training epoch; logs per-iter and per-epoch losses."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            target = sample['label']
            image = sample['image']
            event = sample['event']
            if self.args.cuda:
                target = target.to(self.args.device)
                image = image.to(self.args.device)
                event = event.to(self.args.device)
            # Scheduler is called per-iteration and mutates optimizer LR.
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            # Model returns (segmentation logits, event-head output).
            output, output_event = self.model(image)
            loss = self.criterion(output, target)
            loss_event = self.criterion_event(output_event, event)
            # NOTE(review): event loss weighted 0.1 here but 20 in
            # validation() — confirm the asymmetry is intentional.
            loss += (loss_event * 0.1)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)
        self.writer.add_scalar('train/total_loss_epoch', train_loss/num_img_tr, epoch)
        self.logger.info('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + target.data.shape[0]))
        self.logger.info('Loss: %.3f' % (train_loss/num_img_tr))
        if self.args.no_val:
            # Without validation there is no metric to compare: checkpoint
            # every epoch, never marked best.
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        """Evaluate on the val set; saves a checkpoint when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        num_img_val = len(self.val_loader)
        # NOTE(review): val loader items are unpacked as (sample, _) while
        # the train loader yields `sample` directly — confirm the val
        # dataset really returns pairs.
        for i, (sample, _) in enumerate(tbar):
            target = sample['label']
            image = sample['image']
            event = sample['event']
            if self.args.cuda:
                target = target.to(self.args.device)
                image = image.to(self.args.device)
                event = event.to(self.args.device)
            with torch.no_grad():
                output, output_event = self.model(image)
            loss = self.criterion(output, target)
            loss_event = self.criterion_event(output_event, event)
            loss += (loss_event * 20)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            self.evaluator.add_batch(target, pred)
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss/len(self.val_loader), epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        self.logger.info('Validation:')
        self.logger.info('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + target.data.shape[0]))
        self.logger.info("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        self.logger.info('Loss: %.3f' % (test_loss/num_img_val))
        new_pred = mIoU
        if new_pred > self.best_pred:
            # New best mIoU: persist as the best checkpoint.
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
class Trainer:
    """Knowledge-distillation trainer: a student (generator) learns from a
    frozen teacher via segmentation loss plus optional pixel-wise (pi) and
    pair-wise (pa) distillation losses.
    """

    def __init__(self, args, student, teacher, train_set, val_set, test_set,
                 class_weights, saver, writer):
        self.args = args
        self.saver = saver
        self.saver.save_experiment_config()  # save cfgs
        self.writer = writer
        self.num_classes = train_set.num_classes
        # dataloaders
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_dataloader = DataLoader(train_set,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)
        self.val_dataloader = DataLoader(val_set,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         **kwargs)
        self.test_dataloader = DataLoader(test_set,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          **kwargs)
        self.dataset_size = {
            'train': len(train_set),
            'val': len(val_set),
            'test': len(test_set)
        }
        print('dataset size:', self.dataset_size)
        # Speed up training by capping iterations per epoch; better than
        # truncating the dataset at load time.
        self.iters_per_epoch = args.iters_per_epoch if args.iters_per_epoch else len(
            self.train_dataloader)
        self.device = torch.device(f'cuda:{args.gpu_ids}')
        # todo: consider placing student and teacher on 2 separate devices?
        self.student = student.to(self.device)
        # Teacher stays in eval mode; it only generates training targets.
        self.teacher = teacher.to(self.device).eval()
        # student is generator
        self.G_optimizer = torch.optim.SGD([{
            'params': filter(lambda p: p.requires_grad, self.student.parameters()),
            'initial_lr': args.lr_g
        }], args.lr_g, momentum=args.momentum, weight_decay=args.weight_decay)
        self.G_lr_scheduler = LR_Scheduler(
            mode=args.lr_scheduler,
            base_lr=args.lr_g,
            num_epochs=args.epochs,
            iters_per_epoch=self.iters_per_epoch)
        # todo: discriminator
        # self.D_solver = optim.SGD([{
        #     'params': filter(lambda p: p.requires_grad, D_model.parameters()),
        #     'initial_lr': args.lr_d
        # }], args.lr_d, momentum=args.momentum, weight_decay=args.weight_decay)
        # loss
        if args.use_balanced_weights:
            weight = torch.from_numpy(class_weights.astype(np.float32)).to(
                self.device)
        else:
            weight = None
        # base segmentation loss
        self.criterion = SegmentationLosses(mode=args.loss_type,
                                            weight=weight,
                                            ignore_index=constants.BG_INDEX)
        self.criterion_pi = PixelWise_Loss(weight=weight,
                                           ignore_index=constants.BG_INDEX)
        self.criterion_pa = PairWise_Loss()
        # evaluator
        self.evaluator = Evaluator(self.num_classes)
        self.best_epoch = 0
        self.best_mIoU = 0.0
        self.best_pixelAcc = 0.0

    def training(self, epoch, prefix='Train', evaluation=False):
        """One training epoch of the student; optionally also evaluates on
        the training batches (when ``evaluation`` is true)."""
        self.student.train()
        if evaluation:
            self.evaluator.reset()
        train_losses = AverageMeter()
        segment_losses = AverageMeter()
        pi_losses, pa_losses = AverageMeter(), AverageMeter()
        # Cap the number of iterations; indices start at 0.
        tbar = tqdm(self.train_dataloader, desc='\r',
                    total=self.iters_per_epoch)
        if self.writer:
            self.writer.add_scalar(f'{prefix}/learning_rate',
                                   get_learning_rate(self.G_optimizer), epoch)
        for i, sample in enumerate(tbar):
            image, target = sample['img'], sample['target']
            image, target = image.to(self.device), target.to(self.device)
            # adjust lr
            self.G_lr_scheduler(self.G_optimizer, i, epoch)
            # forward; teacher runs without gradients.
            with torch.no_grad():
                preds_T = self.teacher(image)  # [res, res1, res2, cx1, cx2]
            preds_S = self.student(image)
            # segmentation loss
            G_loss = self.criterion(preds_S[:3], target)  # multiple output loss
            segment_losses.update(G_loss.item())
            # distillation losses
            if self.args.pi:
                # pixel wise loss on the first 3 (resolution) outputs
                loss = self.args.lambda_pi * self.criterion_pi(
                    preds_S[:3], preds_T[:3])
                G_loss += loss
                pi_losses.update(loss.item())
            if self.args.pa:
                # pairwise loss on the remaining (context) outputs
                loss = self.args.lambda_pa * self.criterion_pa(
                    preds_S[3:], preds_T[3:])
                G_loss += loss
                pa_losses.update(loss.item())
            self.G_optimizer.zero_grad()
            G_loss.backward()
            self.G_optimizer.step()
            train_losses.update(G_loss.item())
            tbar.set_description(
                'Epoch {}, Train loss: {:.3} = seg {:.3f} + pi {:.3f} + pa {:.10f}'
                .format(epoch, train_losses.avg, segment_losses.avg,
                        pi_losses.avg, pa_losses.avg))
            if evaluation:
                output = F.interpolate(preds_S[0],
                                       size=(target.size(1), target.size(2)),
                                       mode='bilinear',
                                       align_corners=True)
                pred = torch.argmax(output, dim=1)
                self.evaluator.add_batch(target.cpu().numpy(),
                                         pred.cpu().numpy())  # B,H,W
            # Even with `total` set on tqdm, we still must break manually.
            if i == self.iters_per_epoch - 1:
                break
        if self.writer:
            self.writer.add_scalars(
                f'{prefix}/loss', {
                    'train': train_losses.avg,
                    'segment': segment_losses.avg,
                    'pi': pi_losses.avg,
                    'pa': pa_losses.avg
                }, epoch)
        if evaluation:
            Acc = self.evaluator.Pixel_Accuracy()
            mIoU = self.evaluator.Mean_Intersection_over_Union()
            print('Epoch: {}, Acc_pixel:{:.3f}, mIoU:{:.3f}'.format(
                epoch, Acc, mIoU))
            self.writer.add_scalars(
                f'{prefix}/IoU', {
                    'mIoU': mIoU,
                    # 'mDice': mDice,
                }, epoch)
            self.writer.add_scalars(
                f'{prefix}/Acc', {
                    'acc_pixel': Acc,
                    # 'acc_class': Acc_class
                }, epoch)

    @torch.no_grad()
    def validation(self, epoch, test=False):
        """Evaluate the student on the val (or test) set.

        Saves a best-mIoU checkpoint when validating (not when testing).
        Returns (mIoU, Acc).
        """
        self.student.eval()
        self.evaluator.reset()  # reset confusion matrix
        if test:
            tbar = tqdm(self.test_dataloader, desc='\r')
            prefix = 'Test'
        else:
            tbar = tqdm(self.val_dataloader, desc='\r')
            prefix = 'Valid'
        # loss
        segment_losses = AverageMeter()
        for i, sample in enumerate(tbar):
            image, target = sample['img'], sample['target']
            image, target = image.to(self.device), target.to(self.device)
            output = self.student(image)[0]  # take the first (main) output
            segment_loss = self.criterion(output, target)
            segment_losses.update(segment_loss.item())
            tbar.set_description(f'{prefix} loss: %.4f' % segment_losses.avg)
            output = F.interpolate(output,
                                   size=(target.size()[1:]),
                                   mode='bilinear',
                                   align_corners=True)
            pred = torch.argmax(output, dim=1)  # pred
            # eval: add batch result
            self.evaluator.add_batch(target.cpu().numpy(),
                                     pred.cpu().numpy())  # B,H,W
        Acc = self.evaluator.Pixel_Accuracy()
        # Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        # mDice = self.evaluator.Mean_Dice()
        print('Epoch: {}, Acc_pixel: {:.4f}, mIoU: {:.4f}'.format(
            epoch, Acc, mIoU))
        if self.writer:
            self.writer.add_scalar(f'{prefix}/loss', segment_losses.avg, epoch)
            self.writer.add_scalars(
                f'{prefix}/IoU', {
                    'mIoU': mIoU,
                    # 'mDice': mDice,
                }, epoch)
            self.writer.add_scalars(
                f'{prefix}/Acc', {
                    'acc_pixel': Acc,
                    # 'acc_class': Acc_class
                }, epoch)
        if not test:
            if mIoU > self.best_mIoU:
                print('saving model...')
                self.best_mIoU = mIoU
                self.best_pixelAcc = Acc
                self.best_epoch = epoch
                state = {
                    'epoch': self.best_epoch,
                    # keep same structure so test-time loading matches?
                    'state_dict': self.student.state_dict(),
                    'optimizer': self.G_optimizer.state_dict(),
                    'best_mIoU': self.best_mIoU,
                    'best_pixelAcc': self.best_pixelAcc
                }
                self.saver.save_checkpoint(state)
                print('save model at epoch', epoch)
        return mIoU, Acc

    def load_best_checkpoint(self):
        """Restore the student's best saved weights; returns that epoch."""
        checkpoint = self.saver.load_checkpoint()
        self.student.load_state_dict(checkpoint['state_dict'])
        # self.G_optimizer.load_state_dict(checkpoint['optimizer'])
        print(f'=> loaded checkpoint - epoch {checkpoint["epoch"]}')
        return checkpoint["epoch"]
class Trainer(object):
    """DeepLab trainer with an optional DenseCRF regularization loss for
    weakly-/scribble-supervised segmentation, and optional dense-CRF
    post-processing at validation time."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # Backbone params train at base LR, head params at 10x (DeepLab recipe).
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer

        if args.densecrfloss > 0:
            self.densecrflosslayer = DenseCRFLoss(weight=args.densecrfloss,
                                                  sigma_rgb=args.sigma_rgb,
                                                  sigma_xy=args.sigma_xy,
                                                  scale_factor=args.rloss_scale)
            print(self.densecrflosslayer)

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'"
                                   .format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """One training epoch: CE loss plus (optionally) the DenseCRF loss."""
        train_loss = 0.0
        train_celoss = 0.0
        train_crfloss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # BUG FIX: `num_img_tr // 10` is 0 for fewer than 10 batches, which
        # made the visualization modulo raise ZeroDivisionError; clamp >= 1.
        vis_stride = max(num_img_tr // 10, 1)
        softmax = nn.Softmax(dim=1)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            # Pixels labeled 255 are those unlabeled pixels. Padded region
            # are labeled 254. See function RandomScaleCrop in
            # dataloaders/custom_transforms.py for the detail in data
            # preprocessing.
            croppings = (target != 254).float()
            target[target == 254] = 255
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            celoss = self.criterion(output, target)
            if self.args.densecrfloss == 0:
                loss = celoss
            else:
                # DenseCRF loss operates on probabilities over the
                # un-normalized RGB image, restricted to the real crop area.
                probs = softmax(output)
                denormalized_image = denormalizeimage(
                    sample['image'],
                    mean=(0.485, 0.456, 0.406),
                    std=(0.229, 0.224, 0.225))
                densecrfloss = self.densecrflosslayer(denormalized_image,
                                                      probs, croppings)
                if self.args.cuda:
                    densecrfloss = densecrfloss.cuda()
                loss = celoss + densecrfloss
                train_crfloss += densecrfloss.item()
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            train_celoss += celoss.item()
            tbar.set_description(
                'Train loss: %.3f = CE loss %.3f + CRF loss: %.3f' %
                (train_loss / (i + 1), train_celoss / (i + 1),
                 train_crfloss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            if i % vis_stride == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        # if self.args.no_val:
        if self.args.save_interval:
            # save checkpoint every interval epoch
            is_best = False
            if (epoch + 1) % self.args.save_interval == 0:
                self.saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': self.model.module.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        'best_pred': self.best_pred,
                    },
                    is_best,
                    filename='checkpoint_epoch_{}.pth.tar'.format(
                        str(epoch + 1)))

    def validation(self, epoch):
        """Validate; optionally refine predictions with dense CRF, and save
        a best checkpoint when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            target[target == 254] = 255
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            # BUG FIX: the softmax result was computed and then immediately
            # overwritten with the raw logits, so the dense-CRF post-process
            # received unnormalized scores. Keep the probabilities. (argmax
            # is invariant under softmax, so the non-post-processed labels
            # are unchanged.)
            pred = F.softmax(output, dim=1).data.cpu().numpy()
            if self.args.post_process:
                pool = mp.Pool(mp.cpu_count())
                image = image.data.cpu().numpy().astype(np.uint8).transpose(
                    0, 2, 3, 1)
                pred = pool.map(dense_crf_wrapper, zip(image, pred))
                pool.close()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
class EfficientUnetModel(LightningModule):
    """PyTorch-Lightning wrapper around EfficientUnet-B4 for semantic segmentation.

    Dataloaders, optional class-balancing weights and the metric evaluator are
    built in ``__init__`` from the ``hparams`` namespace.  Validation metrics
    are accumulated in ``self.evaluator`` across ``validation_step`` calls and
    flushed (and reset) in ``validation_epoch_end``.
    """

    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams
        kwargs = {'num_workers': hparams.workers, 'pin_memory': True}
        # make_data_loader returns (train, val, test, num_classes).
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(hparams, **kwargs)
        self.num_img_tr = len(self.train_loader)
        self.pretrained_net = get_efficientunet_b4(out_channels=self.nclass,
                                                   concat_input=True,
                                                   pretrained=True)
        if hparams.use_balanced_weights:
            parameters_dir = "/work/scratch/lei/MyProject/t_chucai/models_and_parameters/parameters/classes_weights"
            classes_weights_path = os.path.join(
                parameters_dir, hparams.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                # No precomputed file: derive the weights from the train set.
                weight = calculate_weigths_labels(hparams.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            self.weight = torch.from_numpy(weight.astype(np.float32))
        else:
            self.weight = None
        self.evaluator = Evaluator(self.nclass)

    def _ce_weight(self):
        """Return the class-weight tensor on the current device, or None.

        BUGFIX: the original called ``self.weight.to(self.device)``
        unconditionally, which raised AttributeError whenever
        ``use_balanced_weights`` was off (``self.weight`` is None then).
        """
        return self.weight.to(self.device) if self.weight is not None else None

    def forward(self, X):
        return self.pretrained_net(X)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

    def train_dataloader(self):
        return self.train_loader

    def val_dataloader(self):
        return self.val_loader

    def training_step(self, batch, batch_idx):
        images = batch["image"]
        masks = batch["label"]
        outputs = self(images)
        # 255 marks unlabeled pixels and is excluded from the loss.
        loss = F.cross_entropy(outputs,
                               masks.long(),
                               weight=self._ce_weight(),
                               ignore_index=255)
        tensorboard_logs = {'loss/train': loss}
        output = OrderedDict({
            "loss": loss,
            "progress_bar": tensorboard_logs,
            "log": tensorboard_logs
        })
        # Visualize roughly 10 batches per epoch.
        # BUGFIX: max(..., 1) prevents ZeroDivisionError when the epoch has
        # fewer than 10 batches.
        vis_interval = max(self.num_img_tr // 10, 1)
        if batch_idx % vis_interval == 0:
            global_step = batch_idx + self.num_img_tr * self.current_epoch
            self.visualize_image(self.hparams.dataset, images, masks, outputs,
                                 global_step)
        return output

    def validation_step(self, batch, batch_idx):
        images = batch["image"]
        masks = batch["label"]
        outputs = self(images)
        loss = F.cross_entropy(outputs,
                               masks.long(),
                               weight=self._ce_weight(),
                               ignore_index=255)
        pred = outputs.data.cpu().numpy()
        pred = np.argmax(pred, axis=1)
        masks = masks.cpu().numpy()
        self.evaluator.add_batch(masks, pred)
        return {"loss/val": loss}

    def validation_epoch_end(self, outputs):
        tensorboard_logs = {}
        tensorboard_logs["loss/val"] = torch.tensor(
            [output["loss/val"] for output in outputs]).mean()
        tensorboard_logs["val/Acc"] = self.evaluator.Pixel_Accuracy()
        tensorboard_logs["val/Acc_class"] = \
            self.evaluator.Pixel_Accuracy_Class()
        tensorboard_logs["val/mIoU"] = \
            self.evaluator.Mean_Intersection_over_Union()
        tensorboard_logs["val/fwIoU"] = \
            self.evaluator.Frequency_Weighted_Intersection_over_Union()
        # The evaluator accumulated over the whole epoch; reset for the next.
        self.evaluator.reset()
        return {
            "progress_bar": tensorboard_logs,
            "log": tensorboard_logs,
            "loss/val": tensorboard_logs["loss/val"],
            "val/Acc": tensorboard_logs["val/Acc"],
            "val/Acc_class": tensorboard_logs["val/Acc_class"],
            "val/mIoU": tensorboard_logs["val/mIoU"],
            "val/fwIoU": tensorboard_logs["val/fwIoU"],
        }

    def visualize_image(self, dataset, image, target, output, global_step):
        """Log the first 3 images, predictions and ground truths to TensorBoard."""
        grid_image = make_grid(image[:3].clone().cpu().data, 3, normalize=True)
        self.logger.experiment.add_image('Image', grid_image, global_step)
        grid_image = make_grid(decode_seg_map_sequence(
            torch.max(output[:3], 1)[1].detach().cpu().numpy(),
            dataset=dataset), 3, normalize=False, range=(0, 255))
        self.logger.experiment.add_image('Predicted label', grid_image,
                                         global_step)
        grid_image = make_grid(decode_seg_map_sequence(
            torch.squeeze(target[:3], 1).detach().cpu().numpy(),
            dataset=dataset), 3, normalize=False, range=(0, 255))
        self.logger.experiment.add_image('Groundtruth label', grid_image,
                                         global_step)
#========================== compute loss ===================== with torch.no_grad(): output = model(images) loss = criterion(output, targets) test_loss += loss.item() print('Test loss: %.3f' % (test_loss / (iter_num + 1))) pred = output.data.cpu().numpy() targets = targets.cpu().numpy() pred = np.argmax(pred, axis=1) # Add batch sample into evaluator evaluator.add_batch(targets, pred) # Fast test during the training Acc = evaluator.Pixel_Accuracy() Acc_class = evaluator.Pixel_Accuracy_Class() mIoU = evaluator.Mean_Intersection_over_Union() FWIoU = evaluator.Frequency_Weighted_Intersection_over_Union() writer.add_scalar('val/total_loss_epoch', test_loss, epoch) writer.add_scalar('val/mIoU', mIoU, epoch) writer.add_scalar('val/Acc', Acc, epoch) writer.add_scalar('val/Acc_class', Acc_class, epoch) writer.add_scalar('val/fwIoU', FWIoU, epoch) print('Validation:') print('[Epoch: %d, numImages: %5d]' % (epoch, iter_num * par.batch_size + images.data.shape[0])) print("Acc:{:.5}, Acc_class:{:.5}, mIoU:{:.5}, fwIoU: {:.5}".format( Acc, Acc_class, mIoU, FWIoU)) print('Loss: %.3f' % test_loss)
class Trainer(object):
    """Weakly-supervised DeepLab trainer for the DenseCRF-loss toy experiment.

    Besides the usual train/validate loop it records per-batch diagnostics
    (train mIoU against the full ground truth, mean/median prediction entropy)
    into list attributes so they can be inspected after training.
    """

    def __init__(self, args):
        self.args = args
        # Experiment bookkeeping: saver + tensorboard writer.
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Dataloaders.
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)
        # Network.
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # Toy-experiment setting: force BatchNorm to use batch statistics in
        # BOTH train and eval mode (running stats are never consulted), and
        # disable dropout entirely.
        for child in model.modules():
            if type(child) == nn.BatchNorm2d:
                child.track_running_stats = False
            if type(child) == nn.Dropout:
                child.p = 0
        # Backbone at base lr, head at 10x lr.
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Optional class-balanced weights for the segmentation criterion.
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # DenseCRF regularizer, only when its weight is positive.
        if args.densecrfloss > 0:
            self.densecrflosslayer = DenseCRFLoss(
                weight=args.densecrfloss,
                sigma_rgb=args.sigma_rgb,
                sigma_xy=args.sigma_xy,
                scale_factor=args.rloss_scale)
            print(self.densecrflosslayer)
        self.evaluator = Evaluator(self.nclass)
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resume from checkpoint if requested.
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Fine-tuning restarts the epoch counter.
        if args.ft:
            args.start_epoch = 0
        # Per-epoch / per-batch diagnostic histories.
        self.trainLoss = []
        self.miou = []
        self.mean_entropy = []
        self.mid_entropy = []
        self.celoss = []
        self.crfloss = []

    def training(self, epoch, args):
        """Run one training epoch; records losses and diagnostics."""
        train_loss = 0.0
        train_celoss = 0.0
        train_crfloss = 0.0
        self.model.train()
        progress_bar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)  # number of batches
        softmax = nn.Softmax(dim=1)
        for batch_idx, batch in enumerate(progress_bar):
            image, target, gt = batch['image'], batch['label'], batch[
                'groundtruth']
            # Padded pixels are labeled 254, unlabeled pixels 255 (see
            # RandomScaleCrop in dataloaders/custom_transforms.py).
            croppings = (target != 254).float()
            if self.args.cuda:
                croppings = croppings.cuda()
            target[target == 254] = 255
            # gt feeds the affinity matrix; it must have no unsure regions.
            gt[gt == 255] = 0
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, batch_idx, epoch, self.best_pred)
            self.optimizer.zero_grad()
            outputT = self.model(image)
            # Training-time mIoU against the full ground truth (diagnostic).
            output_miou = outputT.clone().detach().cpu().numpy()
            output_miou = np.argmax(output_miou, axis=1)
            gt_miou = gt.clone().numpy()
            self.evaluator.reset()
            self.evaluator.add_batch(gt_miou, output_miou)
            batch_miou = self.evaluator.Mean_Intersection_over_Union()
            self.miou.append(batch_miou)
            ce_loss = self.criterion(outputT, target)
            if self.args.densecrfloss == 0:
                loss = ce_loss
            else:
                T = 1.0
                output = outputT / T
                # Prediction-entropy diagnostics (first sample of the batch).
                logsoftmax = nn.LogSoftmax(dim=1)
                softmax = nn.Softmax(dim=1)
                log_probs_np = logsoftmax(output).cpu().detach().numpy()
                probs_np = softmax(output).cpu().detach().numpy()
                entropy = np.sum(-probs_np * log_probs_np, axis=1)
                self.mean_entropy.append(np.mean(entropy[0]).item())
                self.mid_entropy.append(np.median(entropy[0]).item())
                # One-hot-like tensor built from gt.
                # NOTE(review): batch indices 0/1 and classes 0/1 are
                # hard-coded — presumably assumes batch_size == 2 and a
                # two-class toy setup; confirm before reuse.
                gt_tensor = torch.zeros_like(output)
                gt_tensor[0, 0, gt[0, ...] == 0] = 1
                gt_tensor[1, 0, gt[0, ...] == 0] = 1
                gt_tensor[0, 1, gt[1, ...] == 1] = 1
                gt_tensor[1, 1, gt[1, ...] == 1] = 1
                gt_tensor = gt_tensor.cuda()
                tempreture = 1.0
                # Element-wise log S and log(1 - S), computed in a numerically
                # stable way via logsumexp over the complementary classes.
                logsoftmax = nn.LogSoftmax(dim=1)
                logS = logsoftmax(output)
                part2 = torch.logsumexp(output, dim=1, keepdim=True)
                part1 = torch.logsumexp(output[:, 1:, :, :],
                                        dim=1,
                                        keepdim=True)
                for cls in range(1, 20):
                    rest = torch.cat(
                        (output[:, :cls, :, :], output[:, cls + 1:, :, :]),
                        dim=1)
                    rest_lse = torch.logsumexp(rest, dim=1, keepdim=True)
                    part1 = torch.cat((part1, rest_lse), dim=1)
                part1 = torch.cat(
                    (part1,
                     torch.logsumexp(output[:, :20, :, :],
                                     dim=1,
                                     keepdim=True)),
                    dim=1)
                log1_S = part1 - part2
                # DenseCRF regularizer driven by the ground truth.
                crf_loss = self.densecrflosslayer(gt_tensor, logS, log1_S,
                                                  croppings)
                if self.args.cuda:
                    crf_loss = crf_loss.cuda()
                loss = ce_loss + crf_loss
                train_crfloss += crf_loss.item()
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            train_celoss += ce_loss.item()
            progress_bar.set_description(
                'Train loss: %.3f = CE loss %.3f + CRF loss: %.3f' %
                (train_loss / (batch_idx + 1), train_celoss / (batch_idx + 1),
                 train_crfloss / (batch_idx + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   batch_idx + num_img_tr * epoch)
            # Visualization disabled (was: batch_idx % (num_img_tr // 10) == 0).
            if False:
                global_step = batch_idx + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, outputT,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch,
               batch_idx * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        self.trainLoss.append(train_loss)
        self.celoss.append(train_celoss)
        self.crfloss.append(train_crfloss)
        if self.args.save_interval:
            # Save a (non-best) checkpoint every save_interval epochs.
            is_best = False
            if (epoch + 1) % self.args.save_interval == 0:
                self.saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': self.model.module.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        'best_pred': self.best_pred,
                    },
                    is_best,
                    filename='checkpoint_epoch_{}.pth.tar'.format(
                        str(epoch + 1)))

    def validation(self, epoch):
        """Evaluate on the validation set and checkpoint on a new best mIoU."""
        # eval() here only switches module modes; running stats still update
        # elsewhere but are deliberately unused in this toy setting.
        self.model.eval()
        self.evaluator.reset()
        progress_bar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for batch_idx, batch in enumerate(progress_bar):
            image, target = batch['image'], batch['label']
            # Ground-truth-affinity toy experiment only: fold 255 into class 0.
            target[target == 255] = 0
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            progress_bar.set_description('Test loss: %.3f' %
                                         (test_loss / (batch_idx + 1)))
            pred = np.argmax(output.data.cpu().numpy(), axis=1)
            target = target.cpu().numpy()
            self.evaluator.add_batch(target, pred)
        # Aggregate metrics over the whole validation set.
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
        return new_pred
class Trainer(object):
    """OCRNet trainer: two-head network, road-IoU-driven checkpointing."""

    def __init__(self, args):
        self.args = args
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)
        self.model = OCRNet(self.nclass)
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay,
                                         nesterov=args.nesterov)
        if args.use_balanced_weights:
            # Fixed two-class (background/road) balancing.
            weight = torch.tensor([0.2, 0.8], dtype=torch.float32)
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.evaluator = Evaluator(self.nclass)
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        if args.cuda:
            self.model = self.model.cuda()
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # BUGFIX: this class never wraps the model in DataParallel, so the
            # original `self.model.module.load_state_dict(...)` on the CUDA
            # path raised AttributeError.  _bare_model() handles both cases.
            self._bare_model().load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        if args.ft:
            args.start_epoch = 0

    def _bare_model(self):
        """Return the underlying model, unwrapping DataParallel if present."""
        return self.model.module if hasattr(self.model,
                                            'module') else self.model

    def training(self, epoch):
        """Run one training epoch on the first OCRNet head."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            # The model returns two outputs; the first one is trained here.
            output, _ = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
        print('[Epoch:{},num_images:{}]'.format(
            epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss:{}'.format(train_loss))
        # NOTE(review): `nu_val` looks like a typo for `no_val`; kept as-is
        # because the argument parser is outside this view — confirm there.
        if self.args.nu_val:
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    # BUGFIX: was self.model.module.state_dict(), which fails
                    # because the model is not wrapped in DataParallel.
                    'state_dict': self._bare_model().state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the val set; checkpoint when road IoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                # The second head is used for evaluation.
                _, output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            self.evaluator.add_batch(target, pred)
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        # This Evaluator variant returns (road_iou, mean_iou).
        road_iou, mIOU = self.evaluator.Mean_Intersection_over_Union()
        FWIOU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Validation:\n')
        print('[Epoch:{},num_image:{}]'.format(
            epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Acc:{},Acc_class:{},mIOU:{},road_iou:{},fwIOU:{}'.format(
            Acc, Acc_class, mIOU, road_iou, FWIOU))
        print('Loss:{}'.format(test_loss))
        new_pred = road_iou
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    # BUGFIX: same DataParallel unwrap as in training().
                    'state_dict': self._bare_model().state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
class Trainer(object):
    """Adversarial (pix2pix-style) segmentation trainer.

    Generator G is a DeepLab segmentation network whose softmax output,
    concatenated with the input image, is judged by a PatchGAN discriminator D
    against the one-hot ground truth.  G is trained with a weighted sum of the
    GAN loss and the segmentation cross-entropy.
    """

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)
        # Networks: G outputs per-class scores, softmax-ed before PatchGAN D.
        network_G = DeepLab(num_classes=self.nclass,
                            backbone=args.backbone,
                            output_stride=args.out_stride,
                            sync_bn=args.sync_bn,
                            freeze_bn=args.freeze_bn)
        softmax_layer = torch.nn.Softmax(dim=1)
        # D input channels: 3 image channels + 21 class channels = 24.
        network_D = networks.define_D(24,
                                      64,
                                      netD='basic',
                                      n_layers_D=3,
                                      norm='batch',
                                      init_type='normal',
                                      init_gain=0.02,
                                      gpu_ids=self.args.gpu_ids)
        train_params = [{
            'params': network_G.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': network_G.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        # Optimizers: SGD for G, Adam for D.
        optimizer_G = torch.optim.SGD(train_params,
                                      momentum=args.momentum,
                                      weight_decay=args.weight_decay,
                                      nesterov=args.nesterov)
        optimizer_D = torch.optim.Adam(network_D.parameters(),
                                       lr=0.0002,
                                       betas=(0.5, 0.999))
        # Optional class-balanced weights for the segmentation criterion.
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        # GAN criterion (device set manually) and segmentation criterion.
        self.criterionGAN = networks.GANLoss('vanilla').to(args.gpu_ids[0])
        self.criterionSeg = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.network_G, self.softmax_layer, self.network_D = \
            network_G, softmax_layer, network_D
        self.optimizer_G, self.optimizer_D = optimizer_G, optimizer_D
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda (only G is wrapped; D's devices are set by define_D).
        if args.cuda:
            self.network_G = torch.nn.DataParallel(
                self.network_G, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.network_G)
            self.network_G = self.network_G.cuda()
        # Checkpoint resuming is intentionally disabled in this trainer.
        self.best_pred = 0.0
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """One adversarial training epoch: alternate D and G updates per batch."""
        G_Seg_loss = 0.0
        G_GAN_loss = 0.0
        D_fake_loss = 0.0
        D_real_loss = 0.0
        # train mode so batch normalization uses batch statistics
        self.network_G.train()
        self.network_D.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer_G, i, epoch,
                           self.best_pred)  # tune learning rate
            # Prepare tensors for the pix2pix-style training step.
            output_score = self.network_G(
                image)  # score map for each class in pixels
            output = self.softmax_layer(output_score)  # label for each pixel
            target_one_hot = self.make_one_hot(
                target, C=21)  # one-hot coding to feed into PatchGAN
            fake_AB = torch.cat((image, output), 1)
            real_AB = torch.cat((image, target_one_hot), 1)
            # ---------------- Train the discriminator ---------------- #
            # freeze G, unfreeze D
            self.set_requires_grad(self.network_G, False)
            self.set_requires_grad(self.softmax_layer, False)
            self.set_requires_grad(self.network_D, True)
            self.optimizer_D.zero_grad()
            # fake input (detached so no gradients flow into G)
            pred_fake = self.network_D(fake_AB.detach())
            loss_D_fake = self.criterionGAN(pred_fake, False)
            # real input
            pred_real = self.network_D(real_AB)
            loss_D_real = self.criterionGAN(pred_real, True)
            loss_D = (loss_D_fake + loss_D_real) / (2.0 *
                                                    self.args.batch_size)
            loss_D.backward()
            self.optimizer_D.step()
            # ------------------ Train the generator ------------------ #
            # unfreeze G, freeze D
            self.set_requires_grad(self.network_G, True)
            self.set_requires_grad(self.softmax_layer, True)
            self.set_requires_grad(self.network_D, False)
            self.optimizer_G.zero_grad()
            # fake input should let D predict 1
            pred_fake = self.network_D(fake_AB)
            loss_G_GAN = self.criterionGAN(pred_fake, True)
            # Segmentation loss, scaled by lambda_Seg.
            loss_G_CE = self.criterionSeg(output_score,
                                          target) * self.args.lambda_Seg
            loss_G = loss_G_GAN * self.args.lambda_GAN / \
                self.args.batch_size + loss_G_CE
            loss_G.backward()
            self.optimizer_G.step()
            # Accumulate running losses for display.
            G_Seg_loss += loss_G_CE.item()
            G_GAN_loss += loss_G_GAN.item()
            D_fake_loss += loss_D_fake.item()
            D_real_loss += loss_D_real.item()
            tbar.set_description(
                'G_Seg_loss: %.3f G_GAN_los: %.3f D_fake_loss: %.3f D_real_loss: %.3f'
                % (G_Seg_loss / (i + 1), G_GAN_loss / (i + 1),
                   D_fake_loss / (i + 1), D_real_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss_G_CE.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', G_Seg_loss, epoch)
        print('Training:')
        print(' [Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print(' Train G_Seg_Loss: %.3f' % G_Seg_loss)
        # Per-epoch checkpointing is intentionally disabled; the best model is
        # saved in validation().

    def validation(self, epoch):
        """Evaluate G on the val set; save both networks on a new best mIoU."""
        self.network_G.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.network_G(image)
            loss = self.criterionSeg(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print(' [Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print(" Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print(' Test G_Seg_Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            # Only save a checkpoint for the best model.
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict_G': self.network_G.module.state_dict(),
                    'state_dict_D': self.network_D.state_dict(),
                    'optimizer_G': self.optimizer_G.state_dict(),
                    'optimizer_D': self.optimizer_D.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def set_requires_grad(self, nets, requires_grad=False):
        """Set requires_grad=False for all the networks to avoid unnecessary computations.

        Parameters:
            nets (network list)   -- a list of networks
            requires_grad (bool)  -- whether the networks require gradients or not
        """
        if not isinstance(nets, list):
            nets = [nets]
        for net in nets:
            if net is not None:
                for param in net.parameters():
                    param.requires_grad = requires_grad

    def make_one_hot(self, labels, C=21):
        """One-hot encode an (N, H, W) integer label map to (N, C, H, W).

        Ignore-label pixels (255) are mapped to class 0 for the encoding only.
        BUGFIX: the original mutated the caller's tensor in place
        (``labels[labels == 255] = 0.0``), so the segmentation loss computed on
        the same ``target`` afterwards silently treated every ignore pixel as
        class 0; we now work on a copy.
        """
        labels = labels.clone()
        labels[labels == 255] = 0.0
        labels = labels.unsqueeze(1)
        one_hot = torch.cuda.FloatTensor(labels.size(0), C, labels.size(2),
                                         labels.size(3),
                                         device=labels.device).zero_()
        target = one_hot.scatter_(1, labels.long(), 1.0)
        return target
class evaluation(object):
    """Evaluation/inference harness for a trained DeepLab model.

    Bundles dataloader construction, checkpoint restoration and a set of
    utilities: plain validation, test-set prediction dumps, full-resolution
    validation dumps, result-vs-label comparison and hard-example mining.
    """

    # Output directory for validation_save()/validation_resize().
    # Was previously defined only inside validation_save(), which made
    # validation_resize() crash with a NameError on the undefined `filedir`.
    # TODO(review): hardcoded Windows path — make configurable.
    VAL_RESULT_DIR = 'C:\\Users\\Shuang\\Desktop\\val_res'

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        (self.train_loader, self.train_hard_mining_loader, self.val_loader,
         self.val_save_loader, self.arg_loader, self.test_loader,
         self.val_loader_for_compare, self.nclass) = make_data_loader(
             args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=True,
                        freeze_bn=args.freeze_bn)
        # Define Criterion: optionally use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model = model
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    # evaluate the model on validation dataset
    def validation(self):
        """Evaluate the model on the validation set and print the metrics."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            self.evaluator.add_batch(target, pred)
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Validation:')
        print('[numImages: %5d]' %
              (i * self.args.batch_size + image.data.shape[0]))
        # print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print("Acc:", Acc)
        print("Acc_class:", Acc_class)
        print("mIoU:", mIoU)
        print("fwIoU:", FWIoU)
        print('Loss: %.3f' % test_loss)

    # save the segmentation of test datasets
    # change the target direction in pascal.py
    def test_save(self):
        """Predict the test set and save each label map to its image id path."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.test_loader, desc='\r')
        for i, sample in enumerate(tbar):
            image = sample[0]
            image_id = sample[1]
            if self.args.cuda:
                image = image.cuda()
            with torch.no_grad():
                output = self.model(image)
            prediction = output.data.max(1)[1].squeeze_(1).squeeze_(
                0).cpu().numpy()
            prediction = prediction.astype('uint8')
            im = PIL.Image.fromarray(prediction)
            im.save(image_id[0])

    def _restore_size(self, im, target):
        """Undo the 513x513 letterboxing of a prediction image.

        Crops away the padded border (the shorter side was scaled to keep the
        aspect ratio) and resizes back to the label's original (w, h).
        Shared by validation_save() and validation_resize().
        """
        h = target.shape[1]
        w = target.shape[2]
        ratio = 513. / np.max([w, h])
        if w < h:
            m = int(w * ratio)
            im = im.crop((0, 0, m, 513))
        else:
            m = int(h * ratio)
            im = im.crop((0, 0, 513, m))
        return im.resize((w, h), PIL.Image.BILINEAR)

    # save the segmentation of validation datasets in original size
    # need to change the direction here
    def validation_save(self):
        """Save validation predictions, restored to original size, as PNGs."""
        self.model.eval()
        self.evaluator.reset()
        filedir = self.VAL_RESULT_DIR
        tbar = tqdm(self.val_save_loader, desc='\r')
        for i, sample in enumerate(tbar):
            image, target, image_id = sample['image'], sample['label'], sample[
                'id']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            prediction = output.data.max(1)[1].squeeze_(1).squeeze_(
                0).cpu().numpy()
            im = PIL.Image.fromarray(prediction.astype('uint8'))
            im = self._restore_size(im, target)
            if not os.path.isdir(filedir):
                os.makedirs(filedir)
            im.save(os.path.join(filedir, image_id[0] + ".png"))

    def validation_resize(self):
        """Same as validation_save(): dump resized validation predictions.

        Bug fix: the original referenced a local `filedir` that was never
        defined in this method (it existed only in validation_save), raising
        NameError on the first batch; both methods now share VAL_RESULT_DIR.
        """
        self.model.eval()
        self.evaluator.reset()
        filedir = self.VAL_RESULT_DIR
        tbar = tqdm(self.val_save_loader, desc='\r')
        for i, sample in enumerate(tbar):
            image, target, image_id = sample['image'], sample['label'], sample[
                'id']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            prediction = output.data.max(1)[1].squeeze_(1).squeeze_(
                0).cpu().numpy()
            im = PIL.Image.fromarray(prediction.astype('uint8'))
            im = self._restore_size(im, target)
            if not os.path.isdir(filedir):
                os.makedirs(filedir)
            im.save(os.path.join(filedir, image_id[0] + ".png"))

    # calculate the MIoU of the result and label
    # need to change the direction in pascal.py
    def compare(self):
        """Score previously saved results against labels and print metrics."""
        tbar = tqdm(self.val_loader_for_compare, desc='\r')
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            # here `image` already holds a saved prediction, not raw pixels
            image = image.numpy().astype(np.int64)
            target = target.numpy().astype(np.float32)
            self.evaluator.add_batch(target, image)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Compare the result and label:')
        print('[numImages: %5d]' %
              (i * self.args.batch_size + image.data.shape[0]))
        # print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print("Acc:", Acc)
        print("Acc_class:", Acc_class)
        print("mIoU:", mIoU)
        print("fwIoU:", FWIoU)

    # hard mining and change the train list of next epoch
    def hard_mining(self):
        """Rank validation images by per-image IoU and write the 10 hardest
        image ids to the list file consumed by the next epoch.
        """
        iou_id = []
        tbar = tqdm(self.val_loader_for_compare, desc='\r')
        for i, sample in enumerate(tbar):
            image, target, image_id = sample['image'], sample['label'], sample[
                'id']
            image = image.numpy().astype(np.int64)
            target = target.numpy().astype(np.float32)
            self.evaluator.one_add_batch(target, image)
            IoU = float(self.evaluator.One_Intersection_over_Union())
            iou_id.append([IoU, image_id])
        iou_id.sort()  # ascending: hardest (lowest IoU) images first
        print(iou_id)
        # TODO(review): hardcoded Windows path — make configurable.
        filename = 'F:/pingan/VOCdevkit/VOC2012/ImageSets/Segmentation/arg1.txt'
        # open(..., 'w') creates the file if missing, so the previous
        # os.system('touch ...') pre-step was redundant; the context manager
        # also guarantees the handle is closed on any exit path.
        with open(filename, 'w') as f:
            for i in range(10):
                f.write(iou_id[i][1][0] + "\n")
class Trainer(object):
    """Training harness supporting several segmentation backbones
    (UNet variants, GSCNN, PSPNet, SegNet, HRNet(+OCR), DeepLabV3+).

    Fixes over the original:
    - ZeroDivisionError in training() when the loader has fewer than 10
      batches (``i % (num_img_tr // 10)``).
    - The per-epoch log file was opened every validation() call and only
      closed when ``epoch == 199`` — now handled with a context manager.
    - The 'unet' branch built a UNet_SNws and immediately discarded it.
    - The two identical HRNet config dicts are deduplicated in _hrnet_cfg().
    """

    @staticmethod
    def _hrnet_cfg():
        """HRNet stage configuration shared by 'hrnet' and 'hrnet+ocr'.

        EXTRA describes the model structure: 4 stages with their own
        module/branch/block/channel parameters.
        """
        return {
            'ALIGN_CORNERS': True,
            'EXTRA': {
                'FINAL_CONV_KERNEL': 1,
                'STAGE1': {
                    'NUM_MODULES': 1,   # repetitions of HighResolutionModule
                    'NUM_BRANCHES': 1,  # number of parallel branches
                    'BLOCK': 'BOTTLENECK',
                    'NUM_BLOCKS': 4,
                    'NUM_CHANNELS': 64,
                    'FUSE_METHOD': 'SUM'
                },
                'STAGE2': {
                    'NUM_MODULES': 1,
                    'NUM_BRANCHES': 2,
                    'BLOCK': 'BASIC',
                    'NUM_BLOCKS': [4, 4],
                    'NUM_CHANNELS': [48, 96],
                    'FUSE_METHOD': 'SUM'
                },
                'STAGE3': {
                    'NUM_MODULES': 4,
                    'NUM_BRANCHES': 3,
                    'BLOCK': 'BASIC',
                    'NUM_BLOCKS': [4, 4, 4],
                    'NUM_CHANNELS': [48, 96, 192],
                    'FUSE_METHOD': 'SUM'
                },
                'STAGE4': {
                    'NUM_MODULES': 3,
                    'NUM_BRANCHES': 4,
                    'BLOCK': 'BASIC',
                    'NUM_BLOCKS': [4, 4, 4, 4],
                    'NUM_CHANNELS': [48, 96, 192, 384],
                    'FUSE_METHOD': 'SUM'
                }
            }
        }

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary (visualisation via tensorboardX)
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        # kwargs = {'num_workers': args.workers, 'pin_memory': True}
        kwargs = {'num_workers': 0, 'pin_memory': True}
        self.train_loader, self.val_loader, self.nclass = make_data_loader(
            args, **kwargs)
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        # Define Criterion
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        # Select model and optimizer by name
        if args.model_name == 'unet':
            # NOTE: the original also instantiated UNet_SNws here and
            # immediately overwrote it with UNet_bn; the dead build is removed.
            model = UNet_bn(args.n_channels, args.n_filters,
                            args.n_class).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'unet3+':
            model = UNet3p_SNws(args.n_channels, args.n_filters, args.n_class,
                                args.using_movavg, args.using_bn).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'unet3+_aspp':
            model = UNet3p_res_edge_aspp_SNws(args.n_channels, args.n_filters,
                                              args.n_class, args.using_movavg,
                                              args.using_bn).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'unet3+_ocr':
            model = UNet3p_res_ocr_SNws(args.n_channels, args.n_filters,
                                        args.n_class, args.using_movavg,
                                        args.using_bn).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'gscnn':
            model = GSCNN(args.n_channels, args.n_filters, args.n_class,
                          args.using_movavg, args.using_bn).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'pspnet':
            model = PSPNet(args.n_channels, args.n_filters,
                           args.n_class).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'segnet':
            model = Segnet(args.n_channels, args.n_filters,
                           args.n_class).cuda()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'hrnet':
            model = HighResolutionNet(args.n_channels, args.n_filters,
                                      args.n_class, self._hrnet_cfg()).cuda()
            # model.init_weights()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'hrnet+ocr':
            # model = HighResolutionNet_OCR(args.n_channels, args.n_filters, args.n_class, MODEL).cuda()
            model = HighResolutionNet_OCR_SNws(args.n_channels, args.n_filters,
                                               args.n_class,
                                               self._hrnet_cfg()).cuda()
            # model.init_weights()
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=args.lr,
                                          weight_decay=args.weight_decay)
        elif args.model_name == 'deeplabv3+':
            # Define network
            model = DeepLab(num_classes=self.nclass,
                            backbone=args.backbone,
                            output_stride=args.out_stride,
                            sync_bn=args.sync_bn,
                            freeze_bn=args.freeze_bn)
            backbone = model.backbone
            # backbone.conv1 = nn.Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
            print('change the input channels', backbone.conv1)
            # 10x learning rate on the decoder head
            train_params = [{
                'params': model.get_1x_lr_params(),
                'lr': args.lr
            }, {
                'params': model.get_10x_lr_params(),
                'lr': args.lr * 10
            }]
            optimizer = torch.optim.AdamW(train_params,
                                          weight_decay=args.weight_decay)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            print(self.args.gpu_ids)
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one training epoch and optionally checkpoint (no_val mode)."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch.
            # max(..., 1) guards against ZeroDivisionError when the loader
            # has fewer than 10 batches.
            if i % max(num_img_tr // 10, 1) == 0:
                global_step = i + num_img_tr * epoch
                # self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % (train_loss / len(tbar)))
        # print('Loss: %.3f' % (train_loss / i))
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Validate, log metrics to tensorboard and a text log, and
        checkpoint when the frequency-weighted IoU improves."""
        self.model.eval()
        self.evaluator.reset()  # zero the confusion matrix
        tbar = tqdm(self.val_loader, desc='\r')
        val_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            # image, target = sample[0], sample[1]
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            val_loss += loss.item()
            tbar.set_description('Val loss: %.3f' % (val_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)  # hard labels over the class axis
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        # frequency-weighted intersection over union
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', val_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % (val_loss / len(tbar)))
        new_pred = FWIoU  # mIoU
        # Text log. 'w' on epoch 0 truncates the previous run (replacing the
        # old seek(0)/truncate() dance); the context manager closes the file
        # every epoch instead of only at the hardcoded epoch 199.
        # TODO(review): hardcoded log directory — make configurable.
        logfile = os.path.join('/home/wzj/mine_cloud_14/', 'log.txt')
        with open(logfile, 'w' if epoch == 0 else 'a') as log_file:
            if epoch == 0:
                log_file.write(self.args.model_name + '\n')
            log_file.write('Epoch: %d, ' % (epoch + 1))
            best_so_far = self.best_pred if new_pred < self.best_pred else new_pred
            log_file.write(
                'Acc: {}, Acc_class: {}, mIoU: {}, fwIoU: {}, best_fwIoU: {}, '
                .format(Acc, Acc_class, mIoU, FWIoU, best_so_far))
            log_file.write('Loss: %.3f\n' % (val_loss / len(tbar)))
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
class Trainer(object):
    """DeepLab trainer for 3-class segmentation with an optional NIR input
    channel, plus gradient-embedding extraction and single-image inspection.

    Fix over the original: ``i % (num_img_tr // 10)`` in training() raised
    ZeroDivisionError whenever the loader had fewer than 10 batches.
    """

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        # kwargs = {'num_workers': args.workers, 'pin_memory': True}
        kwargs = {'num_workers': 0, 'pin_memory': True}
        # RGB plus optional near-infrared channel
        input_channels = 4 if args.nir else 3
        self.train_loader, self.val_loader, self.test_loader, self.nclass = \
            make_data_loader(args, **kwargs)
        # Define network (fixed 3 output classes)
        model = DeepLab(num_classes=3,
                        backbone=args.backbone,
                        in_channels=input_channels,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # 10x learning rate on the decoder head
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Define Criterion: whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
                # manual override of the pre-computed class weights
                weight[1] = 4
                weight[2] = 2
                weight[0] = 1
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one training epoch; visualize samples and optionally
        checkpoint every epoch in no_val mode."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch.
            # max(..., 1) guards against ZeroDivisionError when the loader
            # has fewer than 10 batches.
            if i % max(num_img_tr // 10, 1) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Validate; checkpoint when mIoU improves on the best so far."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def calculate_scores(self):
        """Collect per-sample gradients of backbone.layer4[2].conv2 over the
        validation set and save them as an embedding matrix to disk.

        NOTE(review): this runs backward() and optimizer.step() on a model in
        eval() mode, so it mutates the weights while scoring — confirm this
        is intended before reusing.
        """
        train_loss = 0.0
        self.model.eval()
        loader = self.val_loader
        # one column per sample: flattened (out_ch * in_ch) gradient slice
        shape = self.model.module.backbone.layer4[2].conv2.weight.shape
        gradient_embeddings = np.zeros((shape[0] * shape[1], len(loader)))
        # gradient_embeddings = np.zeros((512*33*33, len(loader)))
        tbar = tqdm(loader)
        num_img_tr = len(loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            # take the (0, 0) spatial tap of the conv kernel gradient
            score = self.model.module.backbone.layer4[2].conv2.weight.grad
            score = score.cpu().data.numpy()[:, :, 0, 0].flatten()
            gradient_embeddings[:, i] = score
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr)
        with open('feature_maps/backbone_layer_4_conv2_gradients_trained.npy',
                  'wb') as f:
            print('[INFO] saving the gradients into hard disk')
            np.save(f, gradient_embeddings)
        self.writer.add_scalar('total_score', train_loss)

    def pred_single_image(self, path, counter):
        """Predict a single image, color-code the classes and write an
        image-vs-prediction strip to the samples directory.

        ReLU activations are captured through forward hooks for the
        ``Analysis`` helper. TODO(review): hooks are re-registered on every
        call and never removed — repeated calls accumulate hooks.
        """
        self.model.eval()
        img_path = path
        # label path kept for the (commented-out) ground-truth variant below
        lbl_path = os.path.join(
            os.path.split(os.path.split(path)[0])[0], 'lbl',
            os.path.split(path)[1])
        activations = collections.defaultdict(list)

        def save_activation(name, mod, input, output):
            activations[name].append(output.cpu())

        for name, m in self.model.named_modules():
            if type(m) == nn.ReLU:
                m.register_forward_hook(partial(save_activation, name))
        input = cv2.imread(path)
        input = cv2.resize(input, (513, 513), interpolation=cv2.INTER_CUBIC)
        image = Image.open(img_path).convert('RGB')  # width x height x 3
        # _tmp = np.array(Image.open(lbl_path), dtype=np.uint8)
        _tmp = np.array(Image.open(img_path), dtype=np.uint8)
        # remap palette values to class ids: 255->1, 0->0, 128->2
        _tmp[_tmp == 255] = 1
        _tmp[_tmp == 0] = 0
        _tmp[_tmp == 128] = 2
        _tmp = Image.fromarray(_tmp)
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        composed_transforms = transforms.Compose([
            tr.FixedResize(size=513),
            tr.Normalize(mean=mean, std=std),
            tr.ToTensor()
        ])
        sample = {'image': image, 'label': _tmp}
        sample = composed_transforms(sample)
        image, target = sample['image'], sample['label']
        image = torch.unsqueeze(image, dim=0)
        if self.args.cuda:
            image, target = image.cuda(), target.cuda()
        with torch.no_grad():
            output = self.model(image)
        # presumably renders/inspects the captured activations — side effect
        see = Analysis('module.decoder.last_conv.6', activations)
        pred = output.data.cpu().numpy()
        pred = np.argmax(pred, axis=1)
        pred = np.reshape(pred, (513, 513))
        prediction = pred
        # BGR color-coding for cv2: class 1 -> green, class 2 -> red
        rgb = np.zeros((prediction.shape[0], prediction.shape[1], 3))
        r = prediction.copy()
        g = prediction.copy()
        b = prediction.copy()
        g[g != 1] = 0
        g[g == 1] = 255
        r[r != 2] = 0
        r[r == 2] = 255
        b = np.zeros(b.shape)
        rgb[:, :, 0] = b
        rgb[:, :, 1] = g
        rgb[:, :, 2] = r
        # input image and colored prediction side by side
        prediction = np.append(input, rgb.astype(np.uint8), axis=1)
        # (an unused `result` strip and a duplicate cv2.line call were removed)
        cv2.line(rgb, (513, 0), (513, 1020), (255, 255, 255), thickness=1)
        cv2.imwrite(
            '/home/robot/git/pytorch-deeplab-xception/run/cropweed/deeplab-resnet/experiment_41/samples/synthetic_{}.png'
            .format(counter), prediction)
class Trainer(object):
    """Architecture-search trainer for AutoDeeplab.

    Alternates two optimizers: SGD over the network weights (fed by
    ``train_loaderA``) and Adam over the architecture parameters (fed by
    ``train_loaderB``), with optional apex mixed precision.
    """

    def __init__(self, args):
        self.args = args
        # Experiment bookkeeping: checkpoint/config saver + tensorboard writer.
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Mixed precision only when apex is importable AND requested.
        self.use_amp = bool(APEX_AVAILABLE and args.use_amp)
        self.opt_level = args.opt_level

        # Two train loaders: A drives the weight step, B the architecture step.
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True,
            'drop_last': True
        }
        self.train_loaderA, self.train_loaderB, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)

        # Optional class-balanced loss weights (computed once, cached on disk).
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                # BUGFIX: this class has no ``self.train_loader`` attribute
                # (the original line raised AttributeError here); the class
                # statistics are computed over loader A, the weight-training
                # split.
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loaderA,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)

        # Network plus its two optimizers (weights vs. architecture params).
        model = AutoDeeplab(num_classes=self.nclass,
                            num_layers=12,
                            criterion=self.criterion,
                            filter_multiplier=self.args.filter_multiplier,
                            block_multiplier=self.args.block_multiplier,
                            step=self.args.step)
        optimizer = torch.optim.SGD(model.weight_parameters(),
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        self.model, self.optimizer = model, optimizer
        self.architect_optimizer = torch.optim.Adam(
            self.model.arch_parameters(),
            lr=args.arch_lr,
            betas=(0.9, 0.999),
            weight_decay=args.arch_weight_decay)

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler,
                                      args.lr,
                                      args.epochs,
                                      len(self.train_loaderA),
                                      min_lr=args.min_lr)
        # TODO: Figure out if len(self.train_loader) should be devided by two ?
        # in other module as well

        # Using cuda (model must be on GPU *before* amp.initialize below).
        if args.cuda:
            self.model = self.model.cuda()

        # mixed precision
        if self.use_amp and args.cuda:
            keep_batchnorm_fp32 = True if self.opt_level in ('O2', 'O3') else None

            # fix for current pytorch version with opt_level 'O1'
            if self.opt_level == 'O1' and torch.__version__ < '1.3':
                for module in self.model.modules():
                    if isinstance(module,
                                  torch.nn.modules.batchnorm._BatchNorm):
                        # Hack to fix BN fprop without affine transformation
                        if module.weight is None:
                            module.weight = torch.nn.Parameter(
                                torch.ones(module.running_var.shape,
                                           dtype=module.running_var.dtype,
                                           device=module.running_var.device),
                                requires_grad=False)
                        if module.bias is None:
                            module.bias = torch.nn.Parameter(
                                torch.zeros(module.running_var.shape,
                                            dtype=module.running_var.dtype,
                                            device=module.running_var.device),
                                requires_grad=False)

            # Both optimizers are handed to amp so either loss can be scaled.
            self.model, [self.optimizer, self.architect_optimizer] = amp.initialize(
                self.model, [self.optimizer, self.architect_optimizer],
                opt_level=self.opt_level,
                keep_batchnorm_fp32=keep_batchnorm_fp32,
                loss_scale="dynamic")
            print('cuda finished')

        # Using data parallel
        if args.cuda and len(self.args.gpu_ids) > 1:
            if self.opt_level == 'O2' or self.opt_level == 'O3':
                print(
                    'currently cannot run with nn.DataParallel and optimization level',
                    self.opt_level)
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            print('training on multiple-GPUs')

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # if the weights are wrapped in module object we have to clean it
            if args.clean_module:
                # BUGFIX: the original loaded the raw ('module.'-prefixed)
                # state dict first, defeating the cleaning step; only the
                # cleaned dict is loaded now.
                state_dict = checkpoint['state_dict']
                new_state_dict = OrderedDict()
                for k, v in state_dict.items():
                    name = k[7:]  # remove 'module.' of dataparallel
                    new_state_dict[name] = v
                self.model.load_state_dict(new_state_dict)
            else:
                if (torch.cuda.device_count() > 1 or args.load_parallel):
                    self.model.module.load_state_dict(checkpoint['state_dict'])
                else:
                    self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Run one search epoch: a weight step per batch, plus an
        architecture step per batch once ``epoch >= args.alpha_epoch``."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loaderA)
        num_img_tr = len(self.train_loaderA)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            if self.use_amp:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self.optimizer.step()

            if epoch >= self.args.alpha_epoch:
                # Architecture step on a fresh batch from loader B.
                search = next(iter(self.train_loaderB))
                image_search, target_search = search['image'], search['label']
                if self.args.cuda:
                    image_search, target_search = image_search.cuda(
                    ), target_search.cuda()
                self.architect_optimizer.zero_grad()
                output_search = self.model(image_search)
                arch_loss = self.criterion(output_search, target_search)
                if self.use_amp:
                    with amp.scale_loss(
                            arch_loss,
                            self.architect_optimizer) as arch_scaled_loss:
                        arch_scaled_loss.backward()
                else:
                    arch_loss.backward()
                self.architect_optimizer.step()

            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            # Show 10 * 3 inference results each epoch.
            # BUGFIX: max(1, ...) avoids ZeroDivisionError when the loader
            # has fewer than 10 batches.
            if i % max(1, num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            if torch.cuda.device_count() > 1:
                state_dict = self.model.module.state_dict()
            else:
                state_dict = self.model.state_dict()
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': state_dict,
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the validation split; checkpoint on a new best mIoU."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            if torch.cuda.device_count() > 1:
                state_dict = self.model.module.state_dict()
            else:
                state_dict = self.model.state_dict()
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': state_dict,
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
class Trainer(object):
    """Supervised trainer for a MyDeepLab segmentation model.

    Builds dataloaders, model, Adam optimizer, a fixed class-weighted loss
    and an LR scheduler from ``args``, then exposes per-epoch ``training``
    and ``validation`` passes with tensorboard logging and checkpointing.
    """

    def __init__(self, args):
        self.args = args
        # Define Saver (writes checkpoints + experiment config to disk)
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)
        # Define network
        model = MyDeepLab(num_classes=self.nclass,
                          backbone=args.backbone,
                          output_stride=args.out_stride,
                          freeze_bn=args.freeze_bn)
        self.model = model
        # NOTE(review): train_params (per-group 1x/10x LRs) is built but never
        # used below -- the Adam optimizer is created over model.parameters()
        # instead. Confirm whether the grouped LRs were meant to apply.
        train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
                        {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # Define Optimizer
        #optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                            weight_decay=args.weight_decay, nesterov=args.nesterov)
        # adam
        # NOTE(review): no lr= is passed, so Adam starts at its default 1e-3
        # regardless of args.lr; presumably the LR_Scheduler call in
        # training() overwrites the param-group lr each iteration -- confirm.
        optimizer = torch.optim.Adam(params=self.model.parameters(),
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=0,
                                     amsgrad=False)
        # Hard-coded per-class loss weights: class 0 (background) weighted 1,
        # the remaining 7 classes weighted 10 (assumes nclass == 8 -- TODO
        # confirm against the dataset).
        weight = [1, 10, 10, 10, 10, 10, 10, 10]
        weight = torch.tensor(weight, dtype=torch.float)
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda,
            num_classes=self.nclass).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                      args.epochs, len(self.train_loader))
        # Using cuda: wrap in DataParallel before moving to GPU.
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            #patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'"
                                   .format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                # Under DataParallel the checkpoint loads into the inner module.
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0
        # Disabled experiment (kept verbatim): splice the final classifier
        # layers of a resumed checkpoint into a generic pretrained
        # deeplab-mobilenet model.
        '''
        # 获取当前模型各层的名称
        layer_name = list(self.model.state_dict().keys())
        #print(self.model.state_dict()[layer_name[3]])
        # 加载通用的预训练模型
        pretrained = './pretrained_model/deeplab-mobilenet.pth.tar'
        pre_ckpt = torch.load(pretrained)
        key_name = list(checkpoint['state_dict'].keys())  # 获取预训练模型各层的名称
        pre_ckpt['state_dict'][key_name[-2]] = checkpoint['state_dict'][key_name[-2]]  # 类别不同,最后两层单独赋值
        pre_ckpt['state_dict'][key_name[-1]] = checkpoint['state_dict'][key_name[-1]]
        self.model.module.load_state_dict(pre_ckpt['state_dict'])  # , strict=False)
        #print(self.model.state_dict()[layer_name[3]])
        print("加载预训练模型ok")
        '''

    def training(self, epoch):
        """Run one training epoch; logs per-iteration and per-epoch loss.

        When ``args.no_val`` is set, also saves a checkpoint at the end of
        the epoch.
        """
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            # Scheduler mutates the optimizer's param-group lr every iteration.
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            #import pdb
            #pdb.set_trace()
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            #if (i+1) % 50 == 0:
            #    print('Train loss: %.3f' % (loss.item() / (i + 1)))
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            # NOTE(review): num_img_tr // 10 raises ZeroDivisionError if the
            # loader yields fewer than 10 batches.
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                # global_step currently unused: image dump is disabled.
                #self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        filename = 'checkpoint_{}_{:.4f}.pth.tar'.format(epoch, train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            # NOTE(review): self.model.module assumes the DataParallel wrap
            # (args.cuda True); on CPU this would raise AttributeError.
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best, filename=filename)

    def validation(self, epoch):
        """Evaluate on the validation split; saves a checkpoint whenever the
        epoch mIoU beats the best seen so far."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            #if (i+1) %20 == 0:
            #    print('Test loss: %.3f' % (loss / (i + 1)))
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            # NOTE(review): same .module assumption as in training().
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.module.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
class Predictor():
    """Runs a trained DeepLab checkpoint for test-set evaluation and
    single-image segmentation."""

    def __init__(self, config, checkpoint_path='./snapshots/checkpoint_best.pth.tar'):
        self.config = config
        self.checkpoint_path = checkpoint_path
        # Fixed label set; the list order defines the integer class ids.
        category_names = [
            "background", "short_sleeve_top", "long_sleeve_top",
            "short_sleeve_outwear", "long_sleeve_outwear", "vest", "sling",
            "shorts", "trousers", "skirt", "short_sleeve_dress",
            "long_sleeve_dress", "vest_dress", "sling_dress",
        ]
        self.categories_dict = {name: idx for idx, name in enumerate(category_names)}
        self.categories_dict_rev = {idx: name for name, idx in self.categories_dict.items()}
        self.model = self.load_model()
        self.train_loader, self.val_loader, self.test_loader, self.nclass = initialize_data_loader(config)
        self.num_classes = self.config['network']['num_classes']
        self.evaluator = Evaluator(self.num_classes)
        self.criterion = SegmentationLosses(
            weight=None,
            cuda=self.config['network']['use_cuda'],
        ).build_loss(mode=self.config['training']['loss_type'])

    def load_model(self):
        """Build the DeepLab network, wrap it in DataParallel and restore
        the weights stored at ``self.checkpoint_path``."""
        net_cfg = self.config['network']
        model = DeepLab(num_classes=net_cfg['num_classes'],
                        backbone=net_cfg['backbone'],
                        output_stride=self.config['image']['out_stride'],
                        sync_bn=False,
                        freeze_bn=True)
        if net_cfg['use_cuda']:
            checkpoint = torch.load(self.checkpoint_path)
        else:
            # Remap GPU-saved tensors onto the CPU.
            checkpoint = torch.load(self.checkpoint_path,
                                    map_location={'cuda:0': 'cpu'})
        model = torch.nn.DataParallel(model)
        model.load_state_dict(checkpoint['state_dict'])
        return model

    def inference_on_test_set(self):
        """Evaluate the model over the held-out loader and print the
        confusion-matrix metrics plus the accumulated loss."""
        print("inference on test set")
        self.model.eval()
        self.evaluator.reset()
        progress = tqdm(self.val_loader, desc='\r')
        running_loss = 0.0
        for step, batch in enumerate(progress):
            image, target = batch['image'], batch['label']
            if self.config['network']['use_cuda']:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            running_loss += self.criterion(output, target).item()
            progress.set_description('Test loss: %.3f' % (running_loss / (step + 1)))
            predicted = np.argmax(output.data.cpu().numpy(), axis=1)
            # Accumulate this batch into the confusion matrix.
            self.evaluator.add_batch(target.cpu().numpy(), predicted)

        accuracy = self.evaluator.Pixel_Accuracy()
        accuracy_per_class = self.evaluator.Pixel_Accuracy_Class()
        mean_iou = self.evaluator.Mean_Intersection_over_Union()
        fw_iou = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print("Accuracy:{}, Accuracy per class:{}, mean IoU:{}, frequency weighted IoU: {}".format(
            accuracy, accuracy_per_class, mean_iou, fw_iou))
        print('Loss: %.3f' % running_loss)

    def segment_image(self, filename):
        """Segment one RGB image file.

        Returns a tuple of the denormalized input image (H x W x 3, scaled
        to 0-255) and the per-pixel class-id map.
        """
        rgb = Image.open(filename).convert('RGB')
        # The preprocess pipeline expects a label entry; reuse the image.
        sample = DeepFashionSegmentation.preprocess({'image': rgb, 'label': rgb},
                                                    crop_size=513)
        tensor = sample['image'].unsqueeze(0)  # add the batch dimension
        with torch.no_grad():
            logits = self.model(tensor)
        restored = denormalize_image(
            np.transpose(tensor.squeeze(0).numpy(), (1, 2, 0)))
        restored *= 255.
        class_map = np.argmax(logits.squeeze(0).cpu().numpy(), axis=0)
        return restored, class_map
class Trainer:
    """Minimal segmentation trainer: epoch loop, periodic evaluation,
    visualizer logging, and whole-model snapshots."""

    def __init__(self, network, train_dataloader, eval_dataloader, criterion,
                 optimizer, visualizer, experiment_name, config):
        self.config = config
        self.network = network
        self.train_dataloader = train_dataloader
        self.eval_dataloader = eval_dataloader
        self.criterion = criterion
        self.optimizer = optimizer
        self.visualizer = visualizer
        self.experiment_name = experiment_name
        self.evaluator = Evaluator(config['n_classes'])

    def train_epoch(self, epoch):
        """One pass over the training loader; every ``config['print_loss']``
        iterations the windowed mean loss is printed and pushed to the
        visualizer, then the window restarts."""
        window_losses = []
        for idx, (inputs, labels) in enumerate(self.train_dataloader):
            self.network.train()
            if self.config['use_cuda']:
                inputs, labels = inputs.cuda().float(), labels.cuda().float()
            else:
                inputs, labels = inputs.float(), labels.float()
            self.optimizer.zero_grad()
            predictions = self.network(inputs)
            loss = self.criterion(predictions, labels)
            loss.backward()
            self.optimizer.step()
            window_losses.append(loss.item())
            if idx % self.config['print_loss'] == 0:
                mean_loss = np.mean(np.array(window_losses))
                self.visualizer.update_statistics(
                    idx + len(self.train_dataloader) * epoch,
                    loss1=mean_loss,
                    loss2=None)
                self.visualizer.update_images(predictions, labels)
                print(f'Training loss on iteration {idx} = {mean_loss}')
                window_losses = []

    def eval_net(self, epoch):
        """Average the criterion over the eval loader and report it to the
        visualizer (eval mode; gradients are not explicitly disabled here)."""
        self.network.eval()
        loss_sum = 0.0
        for _, (batch_in, batch_gt) in enumerate(self.eval_dataloader):
            if self.config['use_cuda']:
                batch_in, batch_gt = batch_in.cuda().float(), batch_gt.cuda().float()
            else:
                batch_in, batch_gt = batch_in.float(), batch_gt.float()
            outputs = self.network(batch_in)
            loss_sum += self.criterion(outputs, batch_gt).item()
        mean_eval_loss = loss_sum / len(self.eval_dataloader)
        self.visualizer.update_statistics(
            iteration=len(self.train_dataloader) * (epoch + 1),
            loss1=None,
            loss2=mean_eval_loss)
        # Show the last batch of the pass.
        self.visualizer.update_images(outputs, batch_gt, evaluation=True)
        print(f'### Evaluation loss on epoch {epoch} = {mean_eval_loss}')

    def train(self):
        """Full run: create the experiment directory, then train, evaluate
        and snapshot on the configured schedules."""
        try:
            os.mkdir(os.path.join(self.config['exp_path'], self.experiment_name))
        except FileExistsError:
            print("Director already exists! It will be overwritten!")
        for epoch in range(1, self.config['train_epochs'] + 1):
            print('Training on epoch ' + str(epoch))
            self.train_epoch(epoch)
            if epoch % self.config['eval_net_epoch'] == 0:
                self.validation(epoch)
            if epoch % self.config['save_net_epochs'] == 0:
                self.save_net_state(epoch)

    def save_net_state(self, epoch):
        """Serialize the whole network object into the experiment directory."""
        snapshot_path = os.path.join(self.config['exp_path'],
                                     self.experiment_name,
                                     'model_epoch_' + str(epoch) + '.pkl')
        torch.save(self.network, snapshot_path)

    def validation(self, epoch):
        """Evaluate on the eval loader with gradients disabled and print the
        pixel/IoU metrics from the confusion-matrix evaluator."""
        self.network.eval()
        self.evaluator.reset()
        progress = tqdm(self.eval_dataloader, desc='\r')
        batch_losses = []
        for _, (image, target) in enumerate(progress):
            if self.config['use_cuda']:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.network(image)
            loss = self.criterion(output, target)
            batch_losses.append(loss.item())
            progress.set_description('Test loss: %.3f' % loss.item())
            prediction = np.argmax(output.data.cpu().numpy(), axis=1)
            # Accumulate this batch into the confusion matrix.
            self.evaluator.add_batch(target.cpu().numpy(), prediction)
        mean_loss = np.mean(np.array(batch_losses))
        self.visualizer.update_statistics(
            iteration=len(self.train_dataloader) * (epoch + 1),
            loss1=None,
            loss2=mean_loss)
        # self.visualizer.update_images(output, target, evaluation=True)
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Validation:')
        print('[Epoch: %d]' % epoch)
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % mean_loss)
class Valuator(object):
    """Offline evaluation/visualization runner for a trained segmentation net.

    Restores a checkpointed model, runs it over the 'vis' split, saves
    side-by-side overlay images, and prints pixel accuracy and IoU metrics.
    """

    def __init__(self, args):
        self.args = args
        self.args.batchnorm_function = torch.nn.BatchNorm2d
        # Define Dataloader
        self.nclass = self.args.num_classes
        # Define network
        model = generate_net(self.args)
        self.model = model
        self.evaluator = Evaluator(self.nclass)
        self.criterion = SegmentationLosses(cuda=True).build_loss(mode='ce')
        # Using cuda
        if self.args.cuda:
            self.model = self.model.cuda()
        # Resuming checkpoint (extra return values are discarded).
        _, _, _ = load_pretrained_mode(self.model,
                                       checkpoint_path=self.args.resume)

    def visual(self):
        """Run inference over the vis split, save overlay images (subsampled
        to ~100 when the split is large) and print the final metrics."""
        self.model.eval()
        print('\nvisualizing')
        self.evaluator.reset()
        data_dir = self.args.data_dir
        data_list = os.path.join(data_dir, self.args.val_list)
        vis_set = GenDataset(self.args, data_list, split='vis')
        # batch_size=1: every sample is visualized individually.
        vis_loader = DataLoader(vis_set, batch_size=1, shuffle=False)
        num_img_tr = len(vis_loader)
        print('=====>[numImages: %5d]' % (num_img_tr))
        for i, sample in enumerate(vis_loader):
            # 'ori' is the original (un-normalized) image used for overlays.
            image, target, name, ori = sample['image'], sample[
                'label'], sample['name'], sample['ori']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            # Some networks return auxiliary outputs; keep the primary one.
            if isinstance(output, (tuple, list)):
                output = output[0]
            # Upsample logits to the original image resolution.
            # (assumes ori is N x H x W x C, so size()[1:3] is (H, W) --
            # TODO confirm against GenDataset)
            output = torch.nn.functional.interpolate(output,
                                                     size=ori.size()[1:3],
                                                     mode='bilinear',
                                                     align_corners=True)
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            ori = ori.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Save every image for small splits; subsample to ~100 otherwise.
            if num_img_tr > 100:
                if i % (num_img_tr // 100) == 0:
                    self.save_img(ori, target, pred, name)
            else:
                self.save_img(ori, target, pred, name)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU, IoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print("IoU per class: ", IoU)

    def save_img(self, images, labels, predictions, names):
        """Save a 2x2 composite per sample: prediction/label overlays on top,
        raw prediction/label color masks below.

        images: batch of original images (N x H x W x C);
        labels/predictions: integer masks decoded to RGB via
        decode_seg_map_sequence; names: per-sample output file stems.
        """
        save_dir = self.args.save_dir
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        num_image = len(labels)
        # Decode class-id masks to RGB and move channels last (N,H,W,C).
        labels = decode_seg_map_sequence(labels).cpu().numpy().transpose(
            0, 2, 3, 1)
        predictions = decode_seg_map_sequence(
            predictions).cpu().numpy().transpose(0, 2, 3, 1)
        for i in range(num_image):
            name = names[i]
            if not isinstance(name, str):
                name = str(name)
            save_name = os.path.join(save_dir, name + '.png')
            image = images[i, :, :, :]
            label_mask = labels[i, :, :, :]
            prediction = predictions[i, :, :, :]
            # Skip samples whose image/mask shapes disagree.
            if image.shape != label_mask.shape:
                print('error in %s' % name)
                continue
            # Alpha-blend masks over the original image.
            label_map = self.addImage(image.astype(dtype=np.uint8),
                                      label_mask.astype(dtype=np.uint8))
            pred_map = self.addImage(image.astype(dtype=np.uint8),
                                     prediction.astype(dtype=np.uint8))
            # 'img' is presumably the PIL Image module alias -- TODO confirm
            # against this file's imports.
            label = img.fromarray(label_map.astype(dtype=np.uint8),
                                  mode='RGB')
            pred = img.fromarray(pred_map.astype(dtype=np.uint8), mode='RGB')
            label_mask = img.fromarray(label_mask.astype(dtype=np.uint8),
                                       mode='RGB')
            pred_mask = img.fromarray(prediction.astype(dtype=np.uint8),
                                      mode='RGB')
            shape1 = label.size
            shape2 = pred.size
            assert (shape1 == shape2)
            # 2x2 grid with a 60px gutter between tiles.
            width = 2 * shape1[0] + 60
            height = 2 * shape1[1] + 60
            toImage = img.new('RGB', (width, height))
            toImage.paste(pred, (0, 0))
            toImage.paste(label, (shape1[0] + 60, 0))
            toImage.paste(pred_mask, (0, shape1[1] + 60))
            toImage.paste(label_mask, (shape1[0] + 60, shape1[1] + 60))
            toImage.save(save_name)

    def addImage(self, img1_path, img2_path):
        """Alpha-blend two equally-sized uint8 images via cv2.addWeighted
        (full weight on the first image, 0.7 on the second)."""
        alpha = 1
        beta = 0.7
        gamma = 0
        img_add = cv2.addWeighted(img1_path, alpha, img2_path, beta, gamma)
        return img_add
class Trainer:
    """Segmentation trainer with a configurable per-epoch iteration cap.

    Builds dataloaders from the given datasets, selects SGD (+LR scheduler)
    or Adam, supports multi-GPU DataParallel, and tracks/saves the best
    validation mIoU checkpoint.
    """

    def __init__(self, args, model, train_set, val_set, test_set,
                 class_weights, saver, writer):
        self.args = args
        self.saver = saver
        self.saver.save_experiment_config()  # save cfgs
        self.writer = writer
        self.num_classes = train_set.num_classes
        # dataloaders
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_dataloader = DataLoader(train_set,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)
        self.val_dataloader = DataLoader(val_set,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         **kwargs)
        self.test_dataloader = DataLoader(test_set,
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          **kwargs)
        self.dataset_size = {
            'train': len(train_set),
            'val': len(val_set),
            'test': len(test_set)
        }
        print('dataset size:', self.dataset_size)
        # Speed up training by capping the iterations per epoch; better than
        # truncating the dataset at sampling time.
        self.iters_per_epoch = args.iters_per_epoch if args.iters_per_epoch else len(self.train_dataloader)
        if args.optimizer == 'SGD':
            print('Using SGD')
            self.optimizer = torch.optim.SGD(model.parameters(),
                                             lr=args.lr,
                                             momentum=args.momentum,
                                             weight_decay=args.weight_decay,
                                             nesterov=args.nesterov)
            # LR scheduler is only used with SGD (see training()).
            self.lr_scheduler = LR_Scheduler(mode=args.lr_scheduler,
                                             base_lr=args.lr,
                                             lr_step=args.lr_step,
                                             num_epochs=args.epochs,
                                             warmup_epochs=args.warmup_epochs,
                                             iters_per_epoch=self.iters_per_epoch)
        elif args.optimizer == 'Adam':
            print('Using Adam')
            self.optimizer = torch.optim.Adam(model.parameters(),
                                              lr=args.lr,
                                              # amsgrad=True,
                                              weight_decay=args.weight_decay)
        else:
            raise NotImplementedError
        # NOTE(review): assumes args.gpu_ids is a string like '0' or '0,1';
        # for multi-GPU it is re-parsed into a list below -- confirm the
        # f-string device id is valid in that case.
        self.device = torch.device(f'cuda:{args.gpu_ids}')
        if len(args.gpu_ids) > 1:
            args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
            model = torch.nn.DataParallel(model, device_ids=args.gpu_ids)
            patch_replication_callback(model)
            print(args.gpu_ids)
        self.model = model.to(self.device)
        # loss
        if args.use_balanced_weights:
            weight = torch.from_numpy(class_weights.astype(np.float32)).to(self.device)
        else:
            weight = None
        self.criterion = SegmentationLosses(mode=args.loss_type,
                                            weight=weight,
                                            ignore_index=constants.BG_INDEX)
        # evaluator
        self.evaluator = Evaluator(self.num_classes)
        # best-checkpoint tracking
        self.best_epoch = 0
        self.best_mIoU = 0.0
        self.best_pixelAcc = 0.0

    def training(self, epoch, prefix='Train', evaluation=False):
        """Run one (possibly capped) training epoch.

        When ``evaluation`` is True, also accumulates train-set metrics into
        the evaluator and logs them under ``prefix``.
        """
        self.model.train()
        if evaluation:
            self.evaluator.reset()
        train_losses = AverageMeter()
        # Cap the number of iterations; i starts at 0.
        tbar = tqdm(self.train_dataloader, desc='\r',
                    total=self.iters_per_epoch)
        if self.writer:
            self.writer.add_scalar(f'{prefix}/learning_rate',
                                   get_learning_rate(self.optimizer), epoch)
        for i, sample in enumerate(tbar):
            image, target = sample['img'], sample['target']
            image, target = image.to(self.device), target.to(self.device)
            if self.args.optimizer == 'SGD':
                self.lr_scheduler(self.optimizer, i, epoch)  # each iteration
            output = self.model(image)
            loss = self.criterion(output, target)  # multiple output loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            train_losses.update(loss.item())
            tbar.set_description('Epoch {}, Train loss: {:.3f}'.format(
                epoch, train_losses.avg))
            if evaluation:
                # Use the last head's output, upsampled to label resolution.
                output = F.interpolate(output[-1],
                                       size=(target.size(1), target.size(2)),
                                       mode='bilinear',
                                       align_corners=True)
                pred = torch.argmax(output, dim=1)
                self.evaluator.add_batch(target.cpu().numpy(),
                                         pred.cpu().numpy())  # B,H,W
            # Even though tqdm is given a total, the loop must break manually.
            if i == self.iters_per_epoch - 1:
                break
        if self.writer:
            self.writer.add_scalar(f'{prefix}/loss', train_losses.val, epoch)
        if evaluation:
            Acc = self.evaluator.Pixel_Accuracy()
            mIoU = self.evaluator.Mean_Intersection_over_Union()
            print('Epoch: {}, Acc_pixel:{:.3f}, mIoU:{:.3f}'.format(
                epoch, Acc, mIoU))
            self.writer.add_scalars(f'{prefix}/IoU', {
                'mIoU': mIoU,
                # 'mDice': mDice,
            }, epoch)
            self.writer.add_scalars(f'{prefix}/Acc', {
                'acc_pixel': Acc,
                # 'acc_class': Acc_class
            }, epoch)

    @torch.no_grad()
    def validation(self, epoch, test=False):
        """Evaluate on the val (or test) split.

        Returns (mIoU, pixel accuracy); on the val split, also saves a
        checkpoint whenever mIoU improves on the best seen so far.
        """
        self.model.eval()
        self.evaluator.reset()  # reset confusion matrix
        if test:
            tbar = tqdm(self.test_dataloader, desc='\r')
            prefix = 'Test'
        else:
            tbar = tqdm(self.val_dataloader, desc='\r')
            prefix = 'Valid'
        # loss
        segment_losses = AverageMeter()
        for i, sample in enumerate(tbar):
            image, target = sample['img'], sample['target']
            image, target = image.to(self.device), target.to(self.device)
            output = self.model(image)[0]  # take the first output head
            segment_loss = self.criterion(output, target)
            segment_losses.update(segment_loss.item())
            tbar.set_description(f'{prefix} loss: %.4f' % segment_losses.avg)
            # Upsample logits to label resolution before taking argmax.
            output = F.interpolate(output,
                                   size=(target.size()[1:]),
                                   mode='bilinear',
                                   align_corners=True)
            pred = torch.argmax(output, dim=1)  # pred
            # eval: add batch result
            self.evaluator.add_batch(target.cpu().numpy(),
                                     pred.cpu().numpy())  # B,H,W
        Acc = self.evaluator.Pixel_Accuracy()
        # Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        # mDice = self.evaluator.Mean_Dice()
        print('Epoch: {}, Acc_pixel:{:.4f}, mIoU:{:.4f}'.format(
            epoch, Acc, mIoU))
        if self.writer:
            self.writer.add_scalar(f'{prefix}/loss', segment_losses.avg, epoch)
            self.writer.add_scalars(f'{prefix}/IoU', {
                'mIoU': mIoU,
                # 'mDice': mDice,
            }, epoch)
            self.writer.add_scalars(f'{prefix}/Acc', {
                'acc_pixel': Acc,
                # 'acc_class': Acc_class
            }, epoch)
        if not test:
            if mIoU > self.best_mIoU:
                print('saving model...')
                self.best_mIoU = mIoU
                self.best_pixelAcc = Acc
                self.best_epoch = epoch
                state = {
                    'epoch': self.best_epoch,
                    # Saved as-is (possibly DataParallel-wrapped) so that
                    # test-time loading keeps the same structure?
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_mIoU': self.best_mIoU,
                    'best_pixelAcc': self.best_pixelAcc
                }
                self.saver.save_checkpoint(state)
                print('save model at epoch', epoch)
        return mIoU, Acc

    def load_best_checkpoint(self):
        """Restore model and optimizer from the saver's best checkpoint and
        return the epoch it was saved at."""
        checkpoint = self.saver.load_checkpoint()
        self.model.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        print(f'=> loaded checkpoint - epoch {checkpoint["epoch"]}')
        return checkpoint["epoch"]
class Trainer(object):
    """DeepLab trainer variant: Adam optimizer, a 3-component criterion that
    also consumes boundary masks (b_mask / enlarged_b_mask), optional
    initializer checkpoint with a re-created decoder last layer, and
    mid-epoch checkpointing/validation every N iterations."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)
        # Define network
        #### if initializer
        if args.init is not None:
            # NOTE(review): hard-coded 21 classes so the initializer checkpoint
            # (presumably Pascal-VOC-pretrained) loads; the last layer is
            # replaced with an self.nclass head further below.
            model = DeepLab(num_classes=21,
                            backbone=args.backbone,
                            output_stride=args.out_stride,
                            sync_bn=args.sync_bn,
                            freeze_bn=args.freeze_bn)
        else:
            model = DeepLab(num_classes=self.nclass,
                            backbone=args.backbone,
                            output_stride=args.out_stride,
                            sync_bn=args.sync_bn,
                            freeze_bn=args.freeze_bn)
        # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
        #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
        # Both groups use the same LR here (the classic 10x decoder LR is the
        # commented-out variant above).
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr
        }]
        # Define Optimizer
        # optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
        #                             weight_decay=args.weight_decay, nesterov=args.nesterov)
        optimizer = torch.optim.Adam(train_params,
                                     lr=args.lr,
                                     weight_decay=args.weight_decay,
                                     amsgrad=True)
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        #self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
        #                              args.epochs, len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        #initializing network
        if args.init is not None:
            if not os.path.isfile(args.init):
                raise RuntimeError(
                    "=> no initializer checkpoint found at '{}'".format(
                        args.init))
            checkpoint = torch.load(args.init)
            #args.start_epoch = checkpoint['epoch']
            state_dict = checkpoint['state_dict']
            # del state_dict["decoder.last_conv.8.weight"]
            # del state_dict["decoder.last_conv.8.bias"]
            # strict=False: tolerate head-size mismatches from the 21-class init.
            if args.cuda:
                self.model.module.load_state_dict(state_dict, strict=False)
            else:
                self.model.load_state_dict(state_dict, strict=False)
            # if not args.ft:
            #     self.optimizer.load_state_dict(checkpoint['optimizer'])
            #     self.best_pred = checkpoint['best_pred']
            # Replace the classifier head with a fresh self.nclass-way conv.
            # NOTE(review): this new layer is created AFTER the optimizer, so
            # its parameters are not in any optimizer param group — confirm.
            self.model.module.decoder.last_layer = nn.Conv2d(256,
                                                             self.nclass,
                                                             kernel_size=1,
                                                             stride=1).cuda()
            print("=> loaded initializer '{}' (epoch {})".format(
                args.init, checkpoint['epoch']))
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            #args.start_epoch = checkpoint['epoch']
            ##state_dict = checkpoint['state_dict']
            ## del state_dict["decoder.last_conv.8.weight"]
            ## del state_dict["decoder.last_conv.8.bias"]
            if args.cuda:
                #self.model.module.load_state_dict(state_dict, strict=False)
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                #self.model.load_state_dict(state_dict, strict=False)
                self.model.load_state_dict(checkpoint['state_dict'])
            # if not args.ft:
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            #self.model.module.decoder.last_layer = nn.Conv2d(256, self.nclass, kernel_size=1, stride=1)
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Train one epoch with the 3-part boundary-aware loss; periodically
        visualizes, checkpoints (every 500 iters) and validates (every 1000)."""
        train_loss = 0.0
        train_loss1 = 0.0
        train_loss2 = 0.0
        train_loss3 = 0.0
        self.model.train()
        # trying to save a checkpoint and check if it exists...
        # import os
        # cur_path = os.path.dirname(os.path.abspath('.'))
        # print('saving mycheckpoint in:' + cur_path )
        # checkpoint_name = 'mycheckpoint.pth.tar'
        # save_path = cur_path + '/' + checkpoint_name
        # torch.save(self.model.module.state_dict(), save_path)
        # assert(os.path.isfile(save_path))
        # # torch.save(self.model.module.state_dict(), checkpoint_name)
        # # assert(os.path.isfile(cur_path + '/' + checkpoint_name))
        # print('checkpoint saved ok')
        # # checkpoint saved
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # import pdb; pdb.set_trace()
        # label_w = label_stats(self.train_loader, nimg=70) #** -args.norm_loss if args.norm_loss != 0 else None
        # import pdb; pdb.set_trace()
        for i, sample in enumerate(tbar):
            #print("i is:{}, index is:{}".format(i,sample['index']))
            #print("path is:{}".format(sample['path']))
            #image, target = sample['image'], sample['label']
            image, target, index, path, b_mask, enlarged_b_mask = sample[
                'image'], sample['label'], sample['index'], sample[
                    'path'], sample['b_mask'], sample['enlarged_b_mask']
            #print('sample for training index is :{} and path is:{}'.format(index,path))
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            #not using learning rate scheduler and apply a fixed learning rate
            #self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            #import pdb; pdb.set_trace()
            #loss = self.criterion(output, target, b_mask, enlarged_b_mask)
            # Criterion returns three component losses plus the combined total.
            loss1, loss2, loss3, loss = self.criterion(output, target, b_mask,
                                                       enlarged_b_mask)
            # criterion = nn.BCELoss()
            # loss = criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            train_loss1 += loss1.item()
            train_loss2 += loss2.item()
            train_loss3 += loss3.item()
            #import pdb; pdb.set_trace()
            tbar.set_description('Train loss_total: %.3f' % (train_loss /
                                                             (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # tbar.set_description('Train loss1: %.3f' % (train_loss1 / (i + 1)))
            self.writer.add_scalar('train/total_loss1_iter', loss1.item(),
                                   i + num_img_tr * epoch)
            # tbar.set_description('Train loss2: %.3f' % (train_loss2 / (i + 1)))
            self.writer.add_scalar('train/total_loss2_iter', loss2.item(),
                                   i + num_img_tr * epoch)
            # tbar.set_description('Train loss3: %.3f' % (train_loss3 / (i + 1)))
            self.writer.add_scalar('train/total_loss3_iter', loss3.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            # NOTE(review): num_img_tr // 10000 is 0 (ZeroDivisionError) whenever
            # the train loader has fewer than 10000 batches — guard this.
            if i % (num_img_tr // 10000) == 0:  #for the whole dataset
                #if i % (num_img_tr // 10) == 0: # for debugging
                global_step = i + num_img_tr * epoch
                #self.summary.visualize_image(self.writer, self.args.dataset, image, target, output, global_step)
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, b_mask,
                                             enlarged_b_mask, global_step)
            # Save the model after each 500 iterations
            if i % 500 == 0:  #for the whole dataset
                #if i % 5 == 0: # for debugging
                is_best = False
                self.saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'state_dict': self.model.module.state_dict(),
                        'optimizer': self.optimizer.state_dict(),
                        'best_pred': self.best_pred,
                    }, is_best)
            # perform the validation after each 1000 iterations
            # NOTE(review): this passes the ITERATION index as "epoch" to
            # validation(), and validation() calls self.model.eval() without
            # this loop ever switching back to train() — confirm intent.
            if i % 1000 == 0:  #for the whole dataset
                #if i % 15 == 0 : # for debugging
                self.validation(i)
                #self.validation(i + num_img_tr * epoch)
            ## garbage collection
            pass
            #del image, target, index, path, b_mask, enlarged_b_mask, output
            #gc.collect()
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        self.writer.add_scalar('train/total_loss1_epoch', train_loss1, epoch)
        self.writer.add_scalar('train/total_loss2_epoch', train_loss2, epoch)
        self.writer.add_scalar('train/total_loss3_epoch', train_loss3, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            #print('saving checkpoint')
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the validation loader with the 3-part loss; logs
        Acc/mIoU/FWIoU and checkpoints when mIoU beats the best so far."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        test_loss1 = 0.0
        test_loss2 = 0.0
        test_loss3 = 0.0
        for i, sample in enumerate(tbar):
            #image, target = sample['image'], sample['label']
            #image, target, index, path = sample['image'], sample['label'], sample['index'], sample['path']
            image, target, index, path, b_mask, enlarged_b_mask = sample[
                'image'], sample['label'], sample['index'], sample[
                    'path'], sample['b_mask'], sample['enlarged_b_mask']
            #print('sample for testing index is :{} and path is:{}'.format(index,path))
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            #loss = self.criterion(output, target)
            #loss = self.criterion(output, target, b_mask, enlarged_b_mask)
            loss1, loss2, loss3, loss = self.criterion(output, target, b_mask,
                                                       enlarged_b_mask)
            test_loss += loss.item()
            test_loss1 += loss1.item()
            test_loss2 += loss2.item()
            test_loss3 += loss3.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # target is also argmax-ed — presumably labels are one-hot along
            # axis 1 here; verify against the dataset's target encoding.
            target = np.argmax(target, axis=1)
            #pred = np.argmax(pred, axis=1)
            #import pdb; pdb.set_trace()
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/total_loss1_epoch', test_loss1, epoch)
        self.writer.add_scalar('val/total_loss2_epoch', test_loss2, epoch)
        self.writer.add_scalar('val/total_loss3_epoch', test_loss3, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
class MyTrainer(object):
    """Fine-tuning trainer for DeepLabV3+ on the fashion_clothes dataset
    (7 classes). Supports freezing the backbone and three fine-tune modes
    ('decoder', 'last_layer', 'all'); drives training via train_loop()."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        # NOTE(review): train_set/val_set exist only for this dataset; any
        # other args.dataset value leaves them (and self.nclass) unbound.
        if (args.dataset == "fashion_clothes"):
            train_set = fashion.FashionDataset(
                args, Path.db_root_dir("fashion_clothes"), mode='train')
            val_set = fashion.FashionDataset(
                args, Path.db_root_dir("fashion_clothes"), mode='test')
            self.nclass = train_set.nclass
            print("Train size {}, val size {}".format(len(train_set),
                                                      len(val_set)))
            self.train_loader = DataLoader(dataset=train_set,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           **kwargs)
            self.val_loader = DataLoader(dataset=val_set,
                                         batch_size=args.batch_size,
                                         shuffle=False,
                                         **kwargs)
            self.test_loader = None
        assert self.nclass == 7
        self.best_pred = 0.0
        # NOTE(review): 'model' is only defined for args.model == 'deeplabv3+'.
        if args.model == 'deeplabv3+':
            model = DeepLab(backbone=args.backbone,
                            output_stride=args.out_stride,
                            sync_bn=args.sync_bn,
                            freeze_bn=args.freeze_bn)
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            if args.cuda:
                checkpoint = torch.load(args.resume)
            else:
                checkpoint = torch.load(args.resume, map_location='cpu')
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        #Freeze the backbone
        if args.freeze_backbone:
            set_parameter_requires_grad(model.backbone, False)
        ######NEW DECODER######
        #Different type of FT
        # NOTE(review): 'train_params' stays unbound for any other ft_type.
        if args.ft_type == 'decoder':
            # Replace the whole decoder; only it (10x group included) trains.
            set_parameter_requires_grad(model, False)
            model.decoder = build_decoder(self.nclass, 'resnet',
                                          nn.BatchNorm2d)
            train_params = [{
                'params': model.get_1x_lr_params(),
                'lr': args.lr
            }, {
                'params': model.get_10x_lr_params(),
                'lr': args.lr * 10
            }]
        elif args.ft_type == 'last_layer':
            # Freeze everything, re-init only the final 1x1 classifier conv.
            set_parameter_requires_grad(model, False)
            model.decoder.last_conv[8] = nn.Conv2d(in_channels=256,
                                                   out_channels=self.nclass,
                                                   kernel_size=1)
            model.decoder.last_conv[8].reset_parameters()
            train_params = [{
                'params': model.get_1x_lr_params(),
                'lr': args.lr
            }, {
                'params': model.get_10x_lr_params(),
                'lr': args.lr * 10
            }]
        if args.ft_type == 'all':
            #Reset last layer, to generate output we want
            model.decoder.last_conv[8] = nn.Conv2d(in_channels=256,
                                                   out_channels=self.nclass,
                                                   kernel_size=1)
            model.decoder.last_conv[8].reset_parameters()
            train_params = [{
                'params': model.get_1x_lr_params(),
                'lr': args.lr
            }, {
                'params': model.get_10x_lr_params(),
                'lr': args.lr * 10
            }]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Define Criterion
        # whether to use class balanced weights
        if args.use_balanced_weights:
            weight = calculate_weigths_labels(args.dataset, self.train_loader,
                                              self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
            print("weight is {}".format(weight))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # NOTE(review): this unconditionally clobbers the start_epoch restored
        # from the resume checkpoint above — confirm that is intended.
        args.start_epoch = 0

    def training(self, epoch):
        """Train one epoch; logs per-iteration loss and visualizes ~10 batches."""
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            # Show 10 * 3 inference results each epoch
            # NOTE(review): ZeroDivisionError when num_img_tr < 10 — guard.
            if i % (num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the val loader; checkpoint when mIoU improves."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def visulize_validation(self):
        """Run over the val loader once, writing prediction/GT visualizations
        to TensorBoard, and print final metrics."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            #current_index_val_set
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            #we have image, target, output on GPU
            #j, index of image in batch
            self.summary.visualize_pregt(self.writer, self.args.dataset, image,
                                         target, output, i)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Visualizing:')
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Final Validation:')
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

    def output_validation(self):
        """Run over the val loader once, saving predictions to disk via the
        summary helper, and print final metrics."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            #current_index_val_set
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            #we have image, target, output on GPU
            #j, index of image in batch
            #image save
            self.summary.save_pred(self.args.dataset, output, i)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Visualizing:')
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        print('Final Validation:')
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

    def train_loop(self):
        """Full training driver: train each epoch, validate on the configured
        interval, and always finish with a visualization pass (even on Ctrl-C)."""
        try:
            for epoch in range(self.args.start_epoch, self.args.epochs):
                self.training(epoch)
                if not self.args.no_val and epoch % self.args.eval_interval == (
                        self.args.eval_interval - 1):
                    self.validation(epoch)
        except KeyboardInterrupt:
            print('Early Stopping')
        finally:
            self.visulize_validation()
            self.writer.close()
class Trainer(object):
    """Standard DeepLab trainer: SGD with poly-style LR_Scheduler, optional
    class-balanced loss weights, DataParallel + sync-BN patching on CUDA,
    and checkpoint resume. validation() tracks the best mIoU."""

    def __init__(self, args):
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(
            args, **kwargs)
        # Define network
        model = DeepLab(num_classes=self.nclass,
                        backbone=args.backbone,
                        output_stride=args.out_stride,
                        sync_bn=args.sync_bn,
                        freeze_bn=args.freeze_bn)
        # Backbone trains at 1x LR, decoder/ASPP at 10x (standard DeepLab recipe).
        train_params = [{
            'params': model.get_1x_lr_params(),
            'lr': args.lr
        }, {
            'params': model.get_10x_lr_params(),
            'lr': args.lr * 10
        }]
        # Define Optimizer
        optimizer = torch.optim.SGD(train_params,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay,
                                    nesterov=args.nesterov)
        # Define Criterion
        # whether to use class balanced weights (loaded from cache when present)
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(
                Path.db_root_dir(args.dataset),
                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                weight = calculate_weigths_labels(args.dataset,
                                                  self.train_loader,
                                                  self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(
            weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
        self.model, self.optimizer = model, optimizer
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader))
        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model,
                                               device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(
                    args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.cuda:
                self.model.module.load_state_dict(checkpoint['state_dict'])
            else:
                self.model.load_state_dict(checkpoint['state_dict'])
            # Fine-tuning starts fresh: keep weights, drop optimizer/best state.
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
                self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        """Train one epoch; logs per-iteration loss and visualizes ~10 batches
        spread across the epoch.

        Args:
            epoch: zero-based epoch index (drives LR schedule and logging).
        """
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader)
        num_img_tr = len(self.train_loader)
        # Show 10 * 3 inference results each epoch.
        # Fix: guard against ZeroDivisionError when the loader has fewer than
        # 10 batches — visualize every batch in that case.
        viz_interval = max(num_img_tr // 10, 1)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(),
                                   i + num_img_tr * epoch)
            if i % viz_interval == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output,
                                             global_step)
        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)
        if self.args.no_val:
            # save checkpoint every epoch (no validation to pick a best model)
            is_best = False
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)

    def validation(self, epoch):
        """Evaluate on the val loader; logs Acc/mIoU/FWIoU and saves a 'best'
        checkpoint whenever mIoU improves on self.best_pred."""
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)
        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' %
              (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(
            Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)
        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': self.model.module.state_dict(),
                    'optimizer': self.optimizer.state_dict(),
                    'best_pred': self.best_pred,
                }, is_best)
def warm_up(self, warmup_epochs):
    """Warm up the super-network weights before architecture search.

    Trains only weight parameters with SGD + cosine LR for `warmup_epochs`
    epochs (no architecture-parameter updates, no entropy regularization),
    logging metrics per batch and checkpointing at save_ckpt_freq.

    Args:
        warmup_epochs: total warmup epochs; <= 0 skips warmup entirely.
    """
    if warmup_epochs <= 0:
        self.logger.log('=> warmup close', mode='warm')
        #print('\twarmup close')
        return
    # set optimizer and scheduler in warm_up phase
    lr_max = self.arch_search_config.warmup_lr
    data_loader = self.run_manager.run_config.train_loader
    scheduler_params = self.run_manager.run_config.optimizer_config[
        'scheduler_params']
    optimizer_params = self.run_manager.run_config.optimizer_config[
        'optimizer_params']
    momentum, nesterov, weight_decay = optimizer_params[
        'momentum'], optimizer_params['nesterov'], optimizer_params[
            'weight_decay']
    eta_min = scheduler_params['eta_min']
    # Dedicated warmup optimizer over weight (not architecture) parameters.
    optimizer_warmup = torch.optim.SGD(self.net.weight_parameters(),
                                       lr_max,
                                       momentum,
                                       weight_decay=weight_decay,
                                       nesterov=nesterov)
    # set initial_learning_rate in weight_optimizer
    #for param_groups in self.run_manager.optimizer.param_groups:
    #    param_groups['lr'] = lr_max
    lr_scheduler_warmup = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer_warmup, warmup_epochs, eta_min)
    iter_per_epoch = len(data_loader)
    total_iteration = warmup_epochs * iter_per_epoch
    self.logger.log('=> warmup begin', mode='warm')
    epoch_time = AverageMeter()
    end_epoch = time.time()
    # self.warmup_epoch allows resuming a partially completed warmup.
    for epoch in range(self.warmup_epoch, warmup_epochs):
        self.logger.log('\n' + '-' * 30 +
                        'Warmup Epoch: {}'.format(epoch + 1) + '-' * 30 +
                        '\n',
                        mode='warm')
        lr_scheduler_warmup.step(epoch)
        warmup_lr = lr_scheduler_warmup.get_lr()
        self.net.train()
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        accs = AverageMeter()
        mious = AverageMeter()
        fscores = AverageMeter()
        epoch_str = 'epoch[{:03d}/{:03d}]'.format(epoch + 1, warmup_epochs)
        time_left = epoch_time.average * (warmup_epochs - epoch)
        common_log = '[Warmup the {:}] Left={:} LR={:}'.format(
            epoch_str,
            str(timedelta(seconds=time_left)) if epoch != 0 else None,
            warmup_lr)
        self.logger.log(common_log, mode='warm')
        end = time.time()
        # single_path init TODO: does not perform sampling in origin update
        #_, network_index = self.net.get_network_arch_hardwts_with_constraint()
        #_, aspp_index = self.net.get_aspp_hardwts_index()
        #single_path = self.net.sample_single_path(self.run_manager.run_config.nb_layers, aspp_index, network_index)
        for i, (datas, targets) in enumerate(data_loader):
            #print(i)
            #print(self.net.single_path)
            #if i == 59: # used for debug
            #    break
            if torch.cuda.is_available():
                datas = datas.to(self.run_manager.device, non_blocking=True)
                targets = targets.to(self.run_manager.device,
                                     non_blocking=True)
            else:
                raise ValueError('do not support cpu version')
            data_time.update(time.time() - end)
            # TODO: update one architecture sufficiently
            # 1. get hardwts and index
            # 2. sample single_path, and set single_path
            # 3. get arch_sample_frequency
            # 4. update single_path per '{:}'.format(sample_arch_frequency) frequency
            #if (i+1) % self.arch_search_config.sample_arch_frequency == 0: # TODO: update per iteration
            #_, network_index = self.net.get_network_arch_hardwts()
            #_, aspp_index = self.net.get_aspp_hardwts_index()
            #single_path = self.net.sample_single_path(self.run_manager.run_config.nb_layers, aspp_index, network_index)
            logits = self.net.single_path_forward(datas)
            # TODO: don't add entropy reg in warmup_phase
            ce_loss = self.run_manager.criterion(logits, targets)
            #entropy_reg = self.net.calculate_entropy(single_path)
            # None => no entropy regularization term during warmup.
            loss = self.run_manager.add_regularization_loss(
                epoch, ce_loss, None)
            # measure metrics and update
            evaluator = Evaluator(self.run_manager.run_config.nb_classes)
            evaluator.add_batch(targets, logits)
            acc = evaluator.Pixel_Accuracy()
            miou = evaluator.Mean_Intersection_over_Union()
            fscore = evaluator.Fx_Score()
            losses.update(loss.data.item(), datas.size(0))
            accs.update(acc, datas.size(0))
            mious.update(miou, datas.size(0))
            fscores.update(fscore, datas.size(0))
            self.net.zero_grad()
            loss.backward()
            # NOTE(review): optimizer_warmup (whose LR the cosine scheduler
            # anneals) is never stepped — the run_manager's optimizer is
            # stepped instead, so the warmup LR schedule may have no effect.
            # Confirm which optimizer is supposed to drive warmup updates.
            self.run_manager.optimizer.step()
            batch_time.update(time.time() - end)
            end = time.time()
            if (
                    i + 1
            ) % self.run_manager.run_config.train_print_freq == 0 or i + 1 == iter_per_epoch:
                Wstr = '|*WARM-UP*|' + time_string(
                ) + '[{:}][iter{:03d}/{:03d}]'.format(epoch_str, i + 1,
                                                      iter_per_epoch)
                Tstr = '|Time | [{batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})]'.format(
                    batch_time=batch_time, data_time=data_time)
                Bstr = '|Base | [Loss {loss.val:.3f} ({loss.avg:.3f}) Accuracy {acc.val:.2f} ({acc.avg:.2f}) MIoU {miou.val:.2f} ({miou.avg:.2f}) F {fscore.val:.2f} ({fscore.avg:.2f})]'\
                    .format(loss=losses, acc=accs, miou=mious, fscore=fscores)
                self.logger.log(Wstr + '\n' + Tstr + '\n' + Bstr, 'warm')
        #torch.cuda.empty_cache()
        epoch_time.update(time.time() - end_epoch)
        end_epoch = time.time()
        log = '[{:}] warm :: loss={:.2f} accuracy={:.2f} miou={:.2f} f1score={:.2f}\n'.format(
            epoch_str, losses.average, accs.average, mious.average,
            fscores.average)
        self.vis.visdom_update(epoch, 'warmup_loss', [losses.average])
        self.vis.visdom_update(epoch, 'warmup_miou', [mious.average])
        self.logger.log(log, mode='warm')
        '''
        # TODO: wheter perform validation after each epoch in warmup phase ?
        valid_loss, valid_acc, valid_miou, valid_fscore = self.validate()
        valid_log = 'Warmup Valid\t[{0}/{1}]\tLoss\t{2:.6f}\tAcc\t{3:6.4f}\tMIoU\t{4:6.4f}\tF\t{5:6.4f}'\
            .format(epoch+1, warmup_epochs, valid_loss, valid_acc, valid_miou, valid_fscore)
        #'\tflops\t{6:}M\tparams{7:}M'\
        valid_log += 'Train\t[{0}/{1}]\tLoss\t{2:.6f}\tAcc\t{3:6.4f}\tMIoU\t{4:6.4f}\tFscore\t{5:6.4f}'
        self.run_manager.write_log(valid_log, 'valid')
        '''
        # continue warmup phrase
        self.warmup = epoch + 1 < warmup_epochs
        self.warmup_epoch = self.warmup_epoch + 1
        #self.start_epoch = self.warmup_epoch
        # To save checkpoint in warmup phase at specific frequency.
        if (epoch + 1) % self.run_manager.run_config.save_ckpt_freq == 0 or (
                epoch + 1) == warmup_epochs:
            state_dict = self.net.state_dict()
            # rm architecture parameters because, in warm_up phase, arch_parameters are not updated.
            #for key in list(state_dict.keys()):
            #    if 'cell_arch_parameters' in key or 'network_arch_parameters' in key or 'aspp_arch_parameters' in key:
            #        state_dict.pop(key)
            checkpoint = {
                'state_dict': state_dict,
                'weight_optimizer': self.run_manager.optimizer.state_dict(),
                # NOTE(review): stores the OPTIMIZER state under
                # 'weight_scheduler' (duplicate of the line above) — this
                # should presumably be the LR scheduler's state_dict.
                'weight_scheduler': self.run_manager.optimizer.state_dict(),
                'warmup': self.warmup,
                'warmup_epoch': epoch + 1,
            }
            filename = self.logger.path(mode='warm', is_best=False)
            save_path = save_checkpoint(checkpoint,
                                        filename,
                                        self.logger,
                                        mode='warm')