def validate(net, path, image_size, data_loader, batch_size=100, device='cuda:0'):
    if 'cuda' in device:
        net = torch.nn.DataParallel(net).to(device)
    else:
        net = net.to(device)

    data_loader.dataset.transform = transforms.Compose([
        transforms.Resize(int(math.ceil(image_size / 0.875))),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        ),
    ])

    cudnn.benchmark = True
    criterion = nn.CrossEntropyLoss().to(device)

    net.eval()
    net = net.to(device)
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    with torch.no_grad():
        with tqdm(total=len(data_loader), desc='Validate') as t:
            for i, (images, labels) in enumerate(data_loader):
                images, labels = images.to(device), labels.to(device)
                # compute output
                output = net(images)
                # # ? pc: handle abnormal labels
                # labels = labels - 1
                # labels[labels < 0] = 0
                # print('-' * 20)
                # print('MIN: %d | MAX: %d' % (min(labels), max(labels)))
                # loss = criterion(output, labels)
                # measure accuracy and record loss
                acc1, acc5 = accuracy(output, labels, topk=(1, 5))
                # losses.update(loss.item(), images.size(0))
                top1.update(acc1[0].item(), images.size(0))
                top5.update(acc5[0].item(), images.size(0))
                t.set_postfix({
                    'loss': losses.avg,
                    'top1': top1.avg,
                    'top5': top5.avg,
                    'img_size': images.size(2),
                })
                t.update(1)

    print('Results: loss=%.5f,\t top1=%.1f,\t top5=%.1f' %
          (losses.avg, top1.avg, top5.avg))
    return top1.avg
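# The validation loops in this file rely on `AverageMeter` and `accuracy` helpers that are
# not defined here. A minimal sketch of what they are assumed to look like, based only on
# how they are called above (.update()/.avg, and a list of top-k accuracy tensors):
class AverageMeter:
    """Tracks a running average of a scalar (loss, accuracy, timing)."""

    def __init__(self):
        self.val, self.sum, self.count, self.avg = 0, 0, 0, 0

    def reset(self):
        self.val, self.sum, self.count, self.avg = 0, 0, 0, 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent, returned as a list of one-element tensors."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res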
def validate(self, epoch=0, is_test=False, run_str='', net=None,
             data_loader=None, no_logs=False, train_mode=False):
    if net is None:
        net = self.net
    if not isinstance(net, nn.DataParallel):
        net = nn.DataParallel(net)

    if data_loader is None:
        data_loader = self.run_config.test_loader if is_test else self.run_config.valid_loader

    if train_mode:
        net.train()
    else:
        net.eval()

    losses = AverageMeter()
    metric_dict = self.get_metric_dict()

    with torch.no_grad():
        with tqdm(total=len(data_loader),
                  desc='Validate Epoch #{} {}'.format(epoch + 1, run_str),
                  disable=no_logs) as t:
            for i, (images, labels) in enumerate(data_loader):
                images, labels = images.to(self.device), labels.to(self.device)
                # compute output
                output = net(images)
                # print(f'actual size: {images.shape[-1]}')
                loss = self.test_criterion(output, labels)
                # measure accuracy and record loss
                self.update_metric(metric_dict, output, labels)
                losses.update(loss.item(), images.size(0))
                t.set_postfix({
                    'loss': losses.avg,
                    **self.get_metric_vals(metric_dict, return_dict=True),
                    'img_size': images.size(2),
                })
                t.update(1)
    return losses.avg, self.get_metric_vals(metric_dict)
def eval_one_epoch(self, args, epoch, warmup_epochs=0, warmup_lr=0):
    # switch to train mode
    self.net.train()
    MyRandomResizedCrop.EPOCH = epoch  # required by elastic resolution

    # nBatch = len(self.run_config.train_loader)
    losses = AverageMeter()
    metric_dict = self.get_metric_dict()
    data_time = AverageMeter()

    # with tqdm(total=nBatch, desc='{} Train Epoch #{}'.format(self.run_config.dataset, epoch + 1)) as t:
    end = time.time()
    for i, (images, labels) in enumerate(self.run_config.valid_loader):
        # for i, (images, labels) in enumerate(self.run_config.train_loader):
        # if i >= 2:
        #     break
        MyRandomResizedCrop.BATCH = i
        data_time.update(time.time() - end)

        images, labels = images.to(self.device), labels.to(self.device)
        target = labels

        # compute output
        output = self.net(images)
        loss = self.train_criterion(output, labels)
        loss_type = 'ce'

        # compute gradient and do SGD step
        # self.model.zero_grad()  # or self.optimizer.zero_grad()
        loss.backward()
        # self.get_grads()
        # self.optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), images.size(0))
        self.update_metric(metric_dict, output, target)

        # t.set_postfix({
        #     'loss': losses.avg,
        #     **self.get_metric_vals(metric_dict, return_dict=True),
        #     'img_size': images.size(2),
        #     'loss_type': loss_type,
        #     'data_time': data_time.avg,
        # })
        # t.update(1)
        end = time.time()
    # self.get_grads()
    return losses.avg, self.get_metric_vals(metric_dict)
def validate(self, epoch=0, is_test=True, run_str='', net=None, data_loader=None, no_logs=False):
    if net is None:
        net = self.net
    if not isinstance(net, nn.DataParallel):
        net = nn.DataParallel(net)

    if data_loader is None:
        if is_test:
            data_loader = self.run_config.test_loader
        else:
            data_loader = self.run_config.valid_loader

    net.eval()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    with torch.no_grad():
        with tqdm(total=len(data_loader),
                  desc='Validate Epoch #{} {}'.format(epoch + 1, run_str),
                  disable=no_logs) as t:
            for i, (images, labels) in enumerate(data_loader):
                images, labels = images.to(self.device), labels.to(self.device)
                # compute output
                output = net(images)
                loss = self.test_criterion(output, labels)
                # measure accuracy and record loss
                acc1, acc5 = accuracy(output, labels, topk=(1, 5))
                losses.update(loss.item(), images.size(0))
                top1.update(acc1[0].item(), images.size(0))
                top5.update(acc5[0].item(), images.size(0))
                t.set_postfix({
                    'loss': losses.avg,
                    'top1': top1.avg,
                    'top5': top5.avg,
                    'img_size': images.size(2),
                })
                t.update(1)
    return losses.avg, top1.avg, top5.avg
criterion = nn.CrossEntropyLoss().cuda()

net.eval()
losses = AverageMeter()
top1 = AverageMeter()
top5 = AverageMeter()

with torch.no_grad():
    with tqdm(total=len(data_loader), desc='Validate') as t:
        for i, (images, labels) in enumerate(data_loader):
            images, labels = images.cuda(), labels.cuda()
            # compute output
            output = net(images)
            loss = criterion(output, labels)
            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, labels, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0].item(), images.size(0))
            top5.update(acc5[0].item(), images.size(0))
            t.set_postfix({
                'loss': losses.avg,
                'top1': top1.avg,
                'top5': top5.avg,
                'img_size': images.size(2),
            })
            t.update(1)

print('Test OFA specialized net <%s> with image size %d:' % (args.net, image_size))
print('Results: loss=%.5f,\t top1=%.1f,\t top5=%.1f' % (losses.avg, top1.avg, top5.avg))
def train_one_epoch(run_manager, args, epoch, warmup_epochs=0, warmup_lr=0):
    dynamic_net = run_manager.net

    # switch to train mode
    dynamic_net.train()
    run_manager.run_config.train_loader.sampler.set_epoch(epoch)
    MyRandomResizedCrop.EPOCH = epoch

    nBatch = len(run_manager.run_config.train_loader)

    data_time = AverageMeter()
    losses = DistributedMetric('train_loss')
    top1 = DistributedMetric('train_top1')
    top5 = DistributedMetric('train_top5')

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1),
              disable=not run_manager.is_root) as t:
        end = time.time()
        for i, (images, labels) in enumerate(run_manager.run_config.train_loader):
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = run_manager.run_config.warmup_adjust_learning_rate(
                    run_manager.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = run_manager.run_config.adjust_learning_rate(
                    run_manager.optimizer, epoch - warmup_epochs, i, nBatch)

            images, labels = images.cuda(), labels.cuda()
            target = labels

            # soft target
            if args.kd_ratio > 0:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    soft_label = F.softmax(soft_logits, dim=1)

            # clear gradients
            run_manager.optimizer.zero_grad()
            loss_of_subnets, acc1_of_subnets, acc5_of_subnets = [], [], []
            # compute output
            subnet_str = ''
            for _ in range(args.dynamic_batch_size):
                # set random seed before sampling
                if args.independent_distributed_sampling:
                    subnet_seed = os.getpid() + time.time()
                else:
                    subnet_seed = int('%d%.3d%.3d' % (epoch * nBatch + i, _, 0))
                random.seed(subnet_seed)
                subnet_settings = dynamic_net.sample_active_subnet()
                subnet_str += '%d: ' % _ + ','.join([
                    '%s_%s' % (key, '%.1f' % subset_mean(val, 0) if isinstance(val, list) else val)
                    for key, val in subnet_settings.items()
                ]) + ' || '

                output = run_manager.net(images)
                if args.kd_ratio == 0:
                    loss = run_manager.train_criterion(output, labels)
                    loss_type = 'ce'
                else:
                    if args.kd_type == 'ce':
                        kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                    else:
                        kd_loss = F.mse_loss(output, soft_logits)
                    loss = args.kd_ratio * kd_loss + run_manager.train_criterion(output, labels)
                    loss = loss * (2 / (args.kd_ratio + 1))
                    loss_type = '%.1fkd-%s & ce' % (args.kd_ratio, args.kd_type)

                # measure accuracy and record loss
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                loss_of_subnets.append(loss)
                acc1_of_subnets.append(acc1[0])
                acc5_of_subnets.append(acc5[0])

                loss.backward()
            run_manager.optimizer.step()

            losses.update(list_mean(loss_of_subnets), images.size(0))
            top1.update(list_mean(acc1_of_subnets), images.size(0))
            top5.update(list_mean(acc5_of_subnets), images.size(0))

            t.set_postfix({
                'loss': losses.avg.item(),
                'top1': top1.avg.item(),
                'top5': top5.avg.item(),
                'R': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'seed': str(subnet_seed),
                'str': subnet_str,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg.item(), top1.avg.item(), top5.avg.item()
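# The knowledge-distillation branches above call `cross_entropy_loss_with_soft_target`,
# which is not defined in this file. A minimal sketch of the assumed implementation:
# cross-entropy between the student logits and the teacher's softened distribution.
def cross_entropy_loss_with_soft_target(pred, soft_target):
    logsoftmax = nn.LogSoftmax(dim=1)
    return torch.mean(torch.sum(-soft_target * logsoftmax(pred), dim=1))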
def train_one_epoch(run_manager, args, epoch, warmup_epochs=0, warmup_lr=0):
    dynamic_net = run_manager.net
    if isinstance(dynamic_net, nn.DataParallel):
        dynamic_net = dynamic_net.module

    # switch to train mode
    dynamic_net.train()
    # run_manager.run_config.train_loader.sampler.set_epoch(epoch)
    # MyRandomResizedCrop.EPOCH = epoch

    #################### Code for freezing BN; uncomment when overfitting to a single architecture.
    # for m in dynamic_net.modules():
    #     if isinstance(m, nn.BatchNorm2d):
    #         ########## use running mean/var
    #         m.eval()
    #         ########## freeze BN weight/bias
    #         # m.weight.requires_grad = False
    #         # m.bias.requires_grad = False

    nBatch = len(run_manager.run_config.train_loader)

    data_time = AverageMeter()
    # losses = DistributedMetric('train_loss')
    # top1 = DistributedMetric('train_top1')
    # top5 = DistributedMetric('train_top5')
    losses = AverageMeter()
    psnr_averagemeter = AverageMeter()

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1)) as t:
        end = time.time()
        for i, mini_batch in enumerate(run_manager.run_config.train_loader):
            images = mini_batch['image']
            #################### Choose between 2x and 4x downsampling here.
            x2_down_images = mini_batch['2x_down_image']
            x4_down_images = mini_batch['4x_down_image']
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = run_manager.run_config.warmup_adjust_learning_rate(
                    run_manager.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = run_manager.run_config.adjust_learning_rate(
                    run_manager.optimizer, epoch - warmup_epochs, i, nBatch)

            images = images.cuda()
            #################### Choose between 2x and 4x downsampling here.
            x2_down_images = x2_down_images.cuda()
            x4_down_images = x4_down_images.cuda()
            target = images

            # soft target
            if args.kd_ratio > 0:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    soft_label = F.softmax(soft_logits, dim=1)

            # clear gradients
            run_manager.optimizer.zero_grad()
            loss_of_subnets, psnr_of_subnets = [], []
            # compute output
            subnet_str = ''
            for _ in range(args.dynamic_batch_size):
                # set random seed before sampling
                if args.independent_distributed_sampling:
                    subnet_seed = os.getpid() + time.time()
                else:
                    subnet_seed = int('%d%.3d%.3d' % (epoch * nBatch + i, _, 0))
                random.seed(subnet_seed)

                #################### Switch between random sampling and structured sampling by toggling
                #################### the comments below; edit here for single-architecture overfitting.
                subnet_settings = dynamic_net.sample_active_subnet()
                # dynamic_net.set_active_subnet(ks=7, e=3, d=2, pixel_d=1)
                subnet_str += '%d: ' % _ + ','.join([
                    '%s_%s' % (key, '%.1f' % subset_mean(val, 0) if isinstance(val, list) else val)
                    for key, val in subnet_settings.items()
                ]) + ' || '

                #################### Choose between 2x and 4x downsampling here.
                # output = run_manager.net(images)
                if subnet_settings['pixel_d'][0] == 1:
                    output = run_manager.net(x2_down_images)
                elif subnet_settings['pixel_d'][0] == 2:
                    output = run_manager.net(x4_down_images)

                if args.kd_ratio == 0:
                    loss = run_manager.train_criterion(output, images)
                    loss_type = 'mse'
                else:
                    if args.kd_type == 'ce':
                        kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                    else:
                        kd_loss = F.mse_loss(output, soft_logits)
                    # the SR target is the HR image, not class labels
                    loss = args.kd_ratio * kd_loss + run_manager.train_criterion(output, images)
                    loss = loss * (2 / (args.kd_ratio + 1))
                    loss_type = '%.1fkd-%s & mse' % (args.kd_ratio, args.kd_type)

                # measure accuracy and record loss
                # acc1, acc5 = accuracy(output, target, topk=(1, 5))
                psnr_current = psnr(rgb2y(tensor2img_np(output)), rgb2y(tensor2img_np(images)))
                loss_of_subnets.append(loss)
                # acc1_of_subnets.append(acc1[0])
                # acc5_of_subnets.append(acc5[0])
                psnr_of_subnets.append(psnr_current)

                loss.backward()
            run_manager.optimizer.step()

            losses.update(list_mean(loss_of_subnets), images.size(0))
            # top1.update(list_mean(acc1_of_subnets), images.size(0))
            # top5.update(list_mean(acc5_of_subnets), images.size(0))
            psnr_averagemeter.update(list_mean(psnr_of_subnets), images.size(0))

            t.set_postfix({
                'loss': losses.avg.item(),
                # 'top1': top1.avg.item(),
                # 'top5': top5.avg.item(),
                'psnr': psnr_averagemeter.avg,
                'R': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'seed': str(subnet_seed),
                'str': subnet_str,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg.item(), psnr_averagemeter.avg
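# The SR loops score outputs with `psnr(rgb2y(tensor2img_np(...)), ...)`; these helpers are
# not shown here. A rough sketch of the assumed behaviour (tensor -> uint8 HWC image,
# RGB -> Y channel of the BT.601 YCbCr transform, then peak signal-to-noise ratio):
import numpy as np


def tensor2img_np(tensor):
    # NCHW float tensor in [0, 1] -> HWC uint8 numpy image (first item of the batch)
    img = tensor.detach().cpu().numpy()[0].transpose(1, 2, 0)
    return (np.clip(img, 0, 1) * 255.0).round().astype(np.uint8)


def rgb2y(img):
    # luminance channel of the ITU-R BT.601 YCbCr transform, input in [0, 255]
    img = img.astype(np.float64)
    return 16.0 + (65.481 * img[..., 0] + 128.553 * img[..., 1] + 24.966 * img[..., 2]) / 255.0


def psnr(img1, img2):
    mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
    return float('inf') if mse == 0 else 20 * np.log10(255.0 / np.sqrt(mse))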
                 verbose=False)
gpu_ava_delay.reset()
cpu_ava_delay.reset()

with get_engine(onnxpath) as engine, engine.create_execution_context() as context:
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    # Set host input to the image. The common.do_inference function will copy the
    # input to the GPU before executing.
    for i, (images, labels) in enumerate(data_loader):
        images, labels = images.numpy(), labels.numpy()
        inputs[0].host = images.astype(np.float32)
        t1 = time.time()
        trt_outputs = common.do_inference_v2(
            context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
        t2 = time.time()
        print((t2 - t1) * 1000)
        if i > 5:
            gpu_delay = (t2 - t1) * 1000
            gpu_ava_delay.update(gpu_delay)

csv_array = [
    net_config,
    round(gpu_ava_delay.avg, 4),
    round(cpu_ava_delay.avg, 4)
]
csv_writer.writerow(csv_array)
csv_f.flush()
def train_one_epoch(self, args, epoch, warmup_epochs=0, warmup_lr=0):
    # switch to train mode
    self.net.train()
    MyRandomResizedCrop.EPOCH = epoch  # required by elastic resolution

    nBatch = len(self.run_config.train_loader)
    losses = AverageMeter()
    metric_dict = self.get_metric_dict()
    data_time = AverageMeter()

    with tqdm(total=nBatch,
              desc='{} Train Epoch #{}'.format(self.run_config.dataset, epoch + 1)) as t:
        end = time.time()
        for i, (images, labels) in enumerate(self.run_config.train_loader):
            MyRandomResizedCrop.BATCH = i
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)

            images, labels = images.to(self.device), labels.to(self.device)
            target = labels

            if isinstance(self.run_config.mixup_alpha, float):
                # transform data
                lam = random.betavariate(self.run_config.mixup_alpha, self.run_config.mixup_alpha)
                images = mix_images(images, lam)
                labels = mix_labels(
                    labels, lam, self.run_config.data_provider.n_classes,
                    self.run_config.label_smoothing)

            # soft target
            if args.teacher_model is not None:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    soft_label = F.softmax(soft_logits, dim=1)

            # compute output
            output = self.net(images)
            loss = self.train_criterion(output, labels)

            if args.teacher_model is None:
                loss_type = 'ce'
            else:
                if args.kd_type == 'ce':
                    kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                else:
                    kd_loss = F.mse_loss(output, soft_logits)
                loss = args.kd_ratio * kd_loss + loss
                loss_type = '%.1fkd+ce' % args.kd_ratio

            # compute gradient and do SGD step
            self.net.zero_grad()  # or self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # measure accuracy and record loss
            losses.update(loss.item(), images.size(0))
            self.update_metric(metric_dict, output, target)

            t.set_postfix({
                'loss': losses.avg,
                **self.get_metric_vals(metric_dict, return_dict=True),
                'img_size': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg, self.get_metric_vals(metric_dict)
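# `mix_images` / `mix_labels` implement mixup with label smoothing; they are not defined in
# this file. A minimal sketch of the assumed behaviour (mix each sample with the reversed
# batch and return soft label vectors, so the train criterion must accept soft targets):
def mix_images(images, lam):
    flipped = torch.flip(images, dims=[0])  # pair each sample with the reversed batch
    return lam * images + (1 - lam) * flipped


def mix_labels(target, lam, n_classes, label_smoothing=0.1):
    onehot = torch.zeros(target.size(0), n_classes, device=target.device)
    onehot.scatter_(1, target.view(-1, 1), 1)
    soft = onehot * (1 - label_smoothing) + label_smoothing / n_classes
    flipped = torch.flip(soft, dims=[0])
    return lam * soft + (1 - lam) * flipped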
def train_one_epoch(self, args, epoch, warmup_epochs=0, warmup_lr=0):
    # switch to train mode
    self.net.train()
    MyRandomResizedCrop.EPOCH = epoch  # required by elastic resolution

    nBatch = len(self.run_config.train_loader)
    losses = AverageMeter()
    metric_dict = self.get_metric_dict()
    data_time = AverageMeter()

    with tqdm(total=nBatch,
              desc='{} Train Epoch #{}'.format(self.run_config.dataset, epoch + 1)) as t:
        end = time.time()
        para_loader = pl.ParallelLoader(self.run_config.train_loader, [self.device])
        para_loader = para_loader.per_device_loader(self.device)
        for i, (images, labels) in enumerate(para_loader):
            MyRandomResizedCrop.BATCH = i
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)
            new_lr *= xm.xrt_world_size()

            target = labels
            if isinstance(self.run_config.mixup_alpha, float):
                # transform data
                lam = random.betavariate(self.run_config.mixup_alpha, self.run_config.mixup_alpha)
                images = mix_images(images, lam)
                labels = mix_labels(
                    labels, lam, self.run_config.data_provider.n_classes,
                    self.run_config.label_smoothing)

            images = images.to(self.device)
            labels = labels.to(self.device)

            # compute output
            output = self.net(images)
            loss = self.train_criterion(output, labels)
            # if args.teacher_model is None:
            loss_type = 'ce'

            # compute gradient and do SGD step
            self.net.zero_grad()  # or self.optimizer.zero_grad()
            loss.backward()
            # self.optimizer.step()
            xm.optimizer_step(self.optimizer)

            # measure accuracy and record loss
            losses.update(loss.item(), images.size(0))
            self.update_metric(metric_dict, output, target)

            t.set_postfix({
                'loss': losses.avg,
                **self.get_metric_vals(metric_dict, return_dict=True),
                'img_size': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg, self.get_metric_vals(metric_dict)
def train_one_epoch(self, args, epoch, warmup_epochs=5, warmup_lr=0):
    self.net.train()
    self.run_config.train_loader.sampler.set_epoch(epoch)
    MyRandomResizedCrop.EPOCH = epoch

    nBatch = len(self.run_config.train_loader)

    losses = DistributedMetric('train_loss')
    top1 = DistributedMetric('train_top1')
    top5 = DistributedMetric('train_top5')
    data_time = AverageMeter()

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1),
              disable=not self.is_root) as t:
        end = time.time()
        for i, (images, labels) in enumerate(self.run_config.train_loader):
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)

            images, labels = images.cuda(), labels.cuda()
            target = labels

            # soft target
            if args.teacher_model is not None:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    soft_label = F.softmax(soft_logits, dim=1)

            # compute output
            output = self.net(images)

            if args.teacher_model is None:
                loss = self.train_criterion(output, labels)
                loss_type = 'ce'
            else:
                if args.kd_type == 'ce':
                    kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                else:
                    kd_loss = F.mse_loss(output, soft_logits)
                loss = args.kd_ratio * kd_loss + self.train_criterion(output, labels)
                loss_type = '%.1fkd-%s & ce' % (args.kd_ratio, args.kd_type)

            # update
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss, images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            t.set_postfix({
                'loss': losses.avg.item(),
                'top1': top1.avg.item(),
                'top5': top5.avg.item(),
                'img_size': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg.item(), top1.avg.item(), top5.avg.item()
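# `DistributedMetric` (train_loss/top1/top5 above) averages a metric across workers; it is
# not defined in this file. A rough sketch assuming Horovod, which parts of this code base
# appear to use; the actual implementation may differ.
import horovod.torch as hvd


class DistributedMetric:
    def __init__(self, name):
        self.name = name
        self.sum = torch.zeros(1)[0]
        self.count = torch.zeros(1)[0]

    def update(self, val, delta_n=1):
        # allreduce the batch-weighted value so .avg agrees across workers
        val = val * delta_n
        self.sum += hvd.allreduce(val.detach().cpu(), name=self.name)
        self.count += delta_n

    @property
    def avg(self):
        return self.sum / self.count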
def train_one_epoch(self, args, epoch, warmup_epochs=0, warmup_lr=0):
    # switch to train mode
    self.net.train()

    nBatch = len(self.run_config.train_loader)

    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    data_time = AverageMeter()

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1)) as t:
        end = time.time()
        for i, (images, labels) in enumerate(self.run_config.train_loader):
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)

            images, labels = images.to(self.device), labels.to(self.device)
            target = labels

            # soft target
            if args.teacher_model is not None:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    soft_label = F.softmax(soft_logits, dim=1)

            # compute output
            if isinstance(self.network, torchvision.models.Inception3):
                output, aux_outputs = self.net(images)
                loss1 = self.train_criterion(output, labels)
                loss2 = self.train_criterion(aux_outputs, labels)
                loss = loss1 + 0.4 * loss2
            else:
                output = self.net(images)
                loss = self.train_criterion(output, labels)

            if args.teacher_model is None:
                loss_type = 'ce'
            else:
                if args.kd_type == 'ce':
                    kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                else:
                    kd_loss = F.mse_loss(output, soft_logits)
                loss = args.kd_ratio * kd_loss + loss
                loss_type = '%.1fkd-%s & ce' % (args.kd_ratio, args.kd_type)

            # compute gradient and do SGD step
            self.net.zero_grad()  # or self.optimizer.zero_grad()
            if self.mix_prec is not None:
                from apex import amp
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self.optimizer.step()

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0].item(), images.size(0))
            top5.update(acc5[0].item(), images.size(0))

            t.set_postfix({
                'loss': losses.avg,
                'top1': top1.avg,
                'top5': top5.avg,
                'img_size': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg, top1.avg, top5.avg
def train_one_epoch(self, args, epoch, warmup_epochs=5, warmup_lr=0):
    self.net.train()
    # self.run_config.train_loader.sampler.set_epoch(epoch)  # required by distributed sampler
    MyRandomResizedCrop.EPOCH = epoch  # required by elastic resolution

    nBatch = len(self.run_config.train_loader)

    # losses = DistributedMetric('train_loss')
    losses = AverageMeter()
    metric_dict = self.get_metric_dict()
    data_time = AverageMeter()

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1),
              disable=not self.is_root) as t:
        end = time.time()
        for i, (images, labels) in enumerate(self.run_config.train_loader):
            MyRandomResizedCrop.BATCH = i
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)

            # images, labels = images.cuda(), labels.cuda()
            target = labels

            if isinstance(self.run_config.mixup_alpha, float):
                # transform data
                random.seed(int('%d%.3d' % (i, epoch)))
                lam = random.betavariate(self.run_config.mixup_alpha, self.run_config.mixup_alpha)
                images = mix_images(images, lam)
                labels = mix_labels(
                    labels, lam, self.run_config.data_provider.n_classes,
                    self.run_config.label_smoothing)

            # soft target
            # if args.teacher_model is not None:
            #     args.teacher_model.train()
            #     with torch.no_grad():
            #         soft_logits = args.teacher_model(images).detach()
            #         soft_label = F.softmax(soft_logits, dim=1)

            # compute output
            output = self.net(images)

            # if args.teacher_model is None:
            loss = self.train_criterion(output, labels)
            loss_type = 'ce'
            # else:
            #     if args.kd_type == 'ce':
            #         kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
            #     else:
            #         kd_loss = F.mse_loss(output, soft_logits)
            #     loss = args.kd_ratio * kd_loss + self.train_criterion(output, labels)
            #     loss_type = '%.1fkd+ce' % args.kd_ratio

            # update
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # measure accuracy and record loss
            # losses.update(loss, images.size(0))
            # self.update_metric(metric_dict, output, target)
            # TODO: update_metric relies on horovod; either replace it or stop using this method.

            # t.set_postfix({
            #     'loss': losses.avg.item(),
            #     **self.get_metric_vals(metric_dict, return_dict=True),
            #     'img_size': images.size(2),
            #     'lr': new_lr,
            #     'loss_type': loss_type,
            #     'data_time': data_time.avg,
            # })
            # t.update(1)
            # end = time.time()
            break

    # return losses.avg.item()  # , self.get_metric_vals(metric_dict)
    return losses.avg  # , self.get_metric_vals(metric_dict)
def train_one_epoch(self, args, epoch, warmup_epochs=0, warmup_lr=0):
    # switch to train mode
    self.net.train()

    #################### Code for freezing BN; uncomment the weight/bias lines when overfitting.
    for m in self.net.modules():
        if isinstance(m, nn.BatchNorm2d):
            ########## use running mean/var
            m.eval()
            ########## freeze BN weight/bias
            # m.weight.requires_grad = False
            # m.bias.requires_grad = False

    nBatch = len(self.run_config.train_loader)

    losses = AverageMeter()
    # top1 = AverageMeter()
    # top5 = AverageMeter()
    psnr_averagemeter = AverageMeter()
    data_time = AverageMeter()

    with tqdm(total=nBatch, desc='Train Epoch #{}'.format(epoch + 1)) as t:
        end = time.time()
        for i, mini_batch in enumerate(self.run_config.train_loader):
            images = mini_batch['image']
            #################### Choose between 2x and 4x downsampling here.
            x2_down_images = mini_batch['2x_down_image']
            # x4_down_images = mini_batch['4x_down_image']
            data_time.update(time.time() - end)

            if epoch < warmup_epochs:
                new_lr = self.run_config.warmup_adjust_learning_rate(
                    self.optimizer, warmup_epochs * nBatch, nBatch, epoch, i, warmup_lr,
                )
            else:
                new_lr = self.run_config.adjust_learning_rate(
                    self.optimizer, epoch - warmup_epochs, i, nBatch)

            images = images.to(self.device)
            #################### Choose between 2x and 4x downsampling here.
            x2_down_images = x2_down_images.to(self.device)
            # x4_down_images = x4_down_images.to(self.device)
            target = images

            # soft target
            if args.teacher_model is not None:
                args.teacher_model.train()
                with torch.no_grad():
                    soft_logits = args.teacher_model(images).detach()
                    # soft_label = F.softmax(soft_logits, dim=1)

            # compute output
            if isinstance(self.network, torchvision.models.Inception3):
                # leftover from the classification variant; for SR the target is the HR image
                output, aux_outputs = self.net(images)
                loss1 = self.train_criterion(output, images)
                loss2 = self.train_criterion(aux_outputs, images)
                loss = loss1 + 0.4 * loss2
            else:
                #################### Choose between 2x and 4x downsampling here.
                output = self.net(x2_down_images)
                # output = self.net(x4_down_images)
                loss = self.train_criterion(output, images)

            if args.teacher_model is None:
                loss_type = 'mse'
            else:
                if args.kd_type == 'ce':
                    kd_loss = cross_entropy_loss_with_soft_target(output, soft_label)
                else:
                    kd_loss = F.mse_loss(output, soft_logits)
                loss = args.kd_ratio * kd_loss + loss
                loss_type = '%.1fkd-%s & mse' % (args.kd_ratio, args.kd_type)

            # compute gradient and do SGD step
            self.net.zero_grad()  # or self.optimizer.zero_grad()
            if self.mix_prec is not None:
                from apex import amp
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            self.optimizer.step()

            # measure accuracy and record loss
            # acc1, acc5 = accuracy(output, target, topk=(1, 5))
            psnr_current = psnr(rgb2y(tensor2img_np(output)), rgb2y(tensor2img_np(images)))
            losses.update(loss.item(), images.size(0))
            # top1.update(acc1[0].item(), images.size(0))
            # top5.update(acc5[0].item(), images.size(0))
            psnr_averagemeter.update(psnr_current, images.size(0))

            t.set_postfix({
                'loss': losses.avg,
                # 'top1': top1.avg,
                # 'top5': top5.avg,
                'psnr': psnr_averagemeter.avg,
                'img_size': images.size(2),
                'lr': new_lr,
                'loss_type': loss_type,
                'data_time': data_time.avg,
            })
            t.update(1)
            end = time.time()
    return losses.avg, psnr_averagemeter.avg
def validate(self, epoch=0, is_test=True, run_str='', net=None, data_loader=None,
             no_logs=False, tensorboard_logging=False):
    if tensorboard_logging:
        from tensorboardX import SummaryWriter
        ################## For tensorboardX; remove if per-frame logging for sequential video is not needed.
        # The log directory can be changed as needed.
        writer = SummaryWriter('./runs/sr_teacher_bn_mse_bolt')

    if net is None:
        net = self.net
    if not isinstance(net, nn.DataParallel):
        net = nn.DataParallel(net)

    if data_loader is None:
        if is_test:
            data_loader = self.run_config.test_loader
        else:
            data_loader = self.run_config.valid_loader

    net.eval()
    losses = AverageMeter()
    # top1 = AverageMeter()
    # top5 = AverageMeter()
    psnr_averagemeter = AverageMeter()

    with torch.no_grad():
        with tqdm(total=len(data_loader),
                  desc='Validate Epoch #{} {}'.format(epoch + 1, run_str),
                  disable=no_logs) as t:
            for i, mini_batch in enumerate(data_loader):
                images = mini_batch['image']
                #################### Choose between 2x and 4x downsampling here.
                x2_down_images = mini_batch['2x_down_image']
                # x4_down_images = mini_batch['4x_down_image']

                images = images.to(self.device)
                #################### Choose between 2x and 4x downsampling here.
                x2_down_images = x2_down_images.to(self.device)
                # x4_down_images = x4_down_images.to(self.device)

                # compute output
                #################### Choose between 2x and 4x downsampling here.
                output = net(x2_down_images)
                # output = net(x4_down_images)
                loss = self.test_criterion(output, images)

                # measure accuracy and record loss
                psnr_current = psnr(rgb2y(tensor2img_np(output)),
                                    rgb2y(tensor2img_np(images)))  # HR comparison
                # import PIL                                        # LR comparison
                # import torchvision.transforms as transforms       # LR comparison
                # output = output.cpu().data[0, :, :, :]            # LR comparison
                # output = transforms.ToPILImage()(output)          # LR comparison
                # output = output.resize((int(output.size[0] / 2), int(output.size[1] / 2)),
                #                        resample=PIL.Image.BICUBIC)  # LR comparison
                # output.save('zssr.png')                           # LR comparison, for validating bicubic down
                # output = transforms.ToTensor()(output)            # LR comparison
                # psnr_current = psnr(rgb2y(tensor2img_np(output)),
                #                     rgb2y(tensor2img_np(x2_down_images)))  # LR comparison

                if tensorboard_logging:
                    ################## For tensorboardX; remove if per-frame logging for sequential video is not needed.
                    writer.add_scalars('metric', {'psnr': psnr_current}, i)

                losses.update(loss.item(), images.size(0))
                # top1.update(acc1[0].item(), images.size(0))
                # top5.update(acc5[0].item(), images.size(0))
                psnr_averagemeter.update(psnr_current, images.size(0))

                t.set_postfix({
                    'loss': losses.avg,
                    # 'top1': top1.avg,
                    # 'top5': top5.avg,
                    'psnr': psnr_averagemeter.avg,
                    'img_size': images.size(2),
                })
                t.update(1)

    if tensorboard_logging:
        #################### For tensorboardX; remove if per-frame logging for sequential video is not needed.
        writer.close()

    return losses.avg, psnr_averagemeter.avg