def run_train(opt, training_data_loader, validation_data_loader):
    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)
    log_file = os.path.join(opt.checkpoint_dir, 'vgg_log.csv')

    print('[Initialize networks for training]')
    net = VGG(opt)
    L2_criterion = nn.MSELoss()
    print(net)

    if opt.resume:
        opt.start_epoch, net = load_model(opt, opt.checkpoint_dir)
    else:
        # start a fresh CSV log when not resuming
        with open(log_file, mode='w') as f:
            f.write('epoch,train_loss,train_acc,valid_loss,valid_acc\n')

    print('===> Setting GPU')
    print('CUDA Available', torch.cuda.is_available())
    if opt.use_cuda and torch.cuda.is_available():
        opt.use_cuda = True
        opt.device = 'cuda'
    else:
        opt.use_cuda = False
        opt.device = 'cpu'

    if torch.cuda.device_count() > 1 and opt.multi_gpu:
        print("Use %d GPUs" % torch.cuda.device_count())
        net = nn.DataParallel(net)

    if opt.use_cuda:
        net = net.to(opt.device)
        L2_criterion = L2_criterion.to(opt.device)

    print("===> Setting Optimizer")
    optimizer = torch.optim.Adam(net.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))

    for epoch in range(opt.start_epoch, opt.n_epochs):
        opt.epoch_num = epoch
        train_loss, train_acc = train(opt, net, optimizer, training_data_loader,
                                      loss_criterion=L2_criterion)
        valid_loss, valid_acc = evaluate(opt, net, validation_data_loader,
                                         loss_criterion=L2_criterion)

        # append one CSV row per epoch
        with open(log_file, mode='a') as f:
            f.write("%d,%08f,%08f,%08f,%08f\n" %
                    (epoch, train_loss, train_acc, valid_loss, valid_acc))

        save_checkpoint(opt, net, epoch, valid_loss)
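# ---------------------------------------------------------------------------
# Hypothetical sketch, not part of the original source: run_train above expects
# a load_model(opt, checkpoint_dir) helper that returns (start_epoch, net). One
# minimal version, assuming checkpoints are torch.save'd dicts with 'epoch' and
# 'state_dict' keys and that VGG(opt) is the same model class constructed in
# run_train, could look like this.
import glob
import os
import torch

def load_model(opt, checkpoint_dir):
    net = VGG(opt)
    ckpts = sorted(glob.glob(os.path.join(checkpoint_dir, '*.pth')))
    if not ckpts:
        # nothing to resume from: start at epoch 1 with fresh weights
        return 1, net
    state = torch.load(ckpts[-1], map_location='cpu')
    net.load_state_dict(state['state_dict'])
    return state['epoch'] + 1, net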
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, i.e. the CUDA device id on this machine
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size

    # rescale base lr with the total batch size (linear scaling rule)
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(1, args.batch_size_total // 256))

    # set random seed, make sure all random subgraphs generated will be the same
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')
    logger.info(f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
                f"num_nodes {args.num_nodes}, gpus per node {ngpus_per_node}, "
                f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  # local rank
    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    # load dataset; train_sampler is a distributed sampler
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, '
                f'local rank {args.gpu}, global rank {args.rank}, '
                f'world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion, args,
                                 soft_criterion=soft_criterion, lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
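# ---------------------------------------------------------------------------
# Hypothetical launcher sketch, not part of the original source: a worker with
# the signature main_worker(gpu, ngpus_per_node, args) is typically started via
# torch.multiprocessing.spawn, one process per local GPU. The args fields
# (num_nodes, machine_rank, dist_url, ...) are assumed to be populated by an
# argument/config parser elsewhere in the repo.
import torch
import torch.multiprocessing as mp

def launch_training(args):
    ngpus_per_node = torch.cuda.device_count()
    # total number of processes across all machines
    args.world_size = ngpus_per_node * args.num_nodes
    # spawn passes the local process index as the first argument (the `gpu` id)
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))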
                              avg_relative_error / nnq.num_params, epoch)

        # Make an overall histogram of the weights
        all_weights = [
            p.detach().cpu().numpy().flatten() for p in nnq.param_list
        ]
        for w, name in zip(all_weights, nnq.param_names):
            logger.histo_summary(name, w, epoch, bins=20)
        all_weights = np.concatenate(all_weights)
        logger.histo_summary("params", all_weights, epoch, bins=50)

        # Saving and testing
        if epoch % configs.get('save_freq', int(1e6)) == 0:
            saver.save_checkpoint(model_list, log_dir, epoch,
                                  optimizer=optimizer, lr_scheduler=lr_scheduler)

        # PUT ANY TESTING HERE (the kind that happens every epoch)
        for model in model_list:
            model.eval()

    # Save a final checkpoint
    saver.save_checkpoint(model_list, log_dir, configs["num_epochs"],
                          optimizer=optimizer, lr_scheduler=lr_scheduler)

    if args.eval:
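# ---------------------------------------------------------------------------
# Hypothetical sketch, not part of the original source: `logger.histo_summary`
# above is an undefined helper that appears to write weight histograms to
# TensorBoard. An equivalent thin wrapper around torch.utils.tensorboard could
# look like this (the class name and exact signature are assumptions).
import numpy as np
from torch.utils.tensorboard import SummaryWriter

class HistoLogger:
    def __init__(self, log_dir):
        self.writer = SummaryWriter(log_dir=log_dir)

    def histo_summary(self, tag, values, step, bins=20):
        # values: a flat array of parameter values; bins is forwarded to np.histogram
        self.writer.add_histogram(tag, np.asarray(values), global_step=step, bins=bins)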
def run_train(opt, training_data_loader):
    # check gpu setting with opt arguments
    opt = set_gpu(opt)

    print('Initialize networks for training')
    net = set_model(opt)
    print(net)

    if opt.use_cuda:
        net = net.to(opt.device)

    print("Setting Optimizer")
    if opt.optimizer == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2),
                               eps=1e-8, weight_decay=0)
        print("===> Use Adam optimizer")
    else:
        raise ValueError("Specify optimizer correctly (adam)")

    if opt.resume:
        opt.start_epoch, net, optimizer = load_model(opt, net, optimizer=optimizer)
    else:
        set_checkpoint_dir(opt)

    if opt.multi_gpu:
        net = nn.DataParallel(net)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    log_file = os.path.join(opt.checkpoint_dir, opt.model + "_log.csv")
    opt_file = os.path.join(opt.checkpoint_dir, opt.model + "_opt.txt")

    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5, mode='min')
    # scheduler = StepLR(optimizer, step_size=50, gamma=0.5)

    # Create the log file when training starts from scratch
    if opt.start_epoch == 1:
        with open(log_file, mode='w') as f:
            f.write("epoch,train_loss,train_psnr,valid_loss,valid_psnr\n")
        save_config(opt)

    data_loader = {
        'train': training_data_loader,
    }
    modes = ['train', 'valid']

    l2_criterion = nn.MSELoss()
    l1_criterion = nn.L1Loss()
    if opt.use_cuda:
        l2_criterion = l2_criterion.to(opt.device)
        l1_criterion = l1_criterion.to(opt.device)

    if opt.content_loss == 'l2':
        content_loss_criterion = l2_criterion
    elif opt.content_loss == 'l1':
        content_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify content loss correctly (l1, l2)")

    if opt.style_loss == 'l2':
        style_loss_criterion = l2_criterion
    elif opt.style_loss == 'l1':
        style_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify style loss correctly (l1, l2)")

    if opt.ll_loss == 'l2':
        ll_loss_criterion = l2_criterion
    elif opt.ll_loss == 'l1':
        ll_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify LL loss correctly (l1, l2)")

    nc = opt.n_channels

    # fixed shuffled index sequence (passed to run_valid below)
    np.random.seed(1024)
    sq = np.arange(1024)
    np.random.shuffle(sq)

    for epoch in range(opt.start_epoch, opt.n_epochs):
        opt.epoch_num = epoch
        for phase in modes:
            if phase == 'train':
                total_loss = 0.0
                total_psnr = 0.0
                total_iteration = 0
                net.train()

                mode = "Training"
                print("*** %s ***" % mode)
                start_time = time.time()

                for iteration, batch in enumerate(data_loader[phase], 1):
                    # (_, x), (_, target) = batch[0], batch[1]
                    x, target = batch[0], batch[1]
                    x_img, target_img = batch[3], batch[4]
                    lr_approx = batch[5]

                    if opt.use_cuda:
                        x = x.to(opt.device)
                        target = target.to(opt.device)

                    optimizer.zero_grad()
                    # epoch_loss = 0.
                    with torch.set_grad_enabled(phase == 'train'):
                        out = net(x)

                        # norm_target = normalize_coeffs(target, ch_min=opt.ch_min, ch_max=opt.ch_max)
                        std_target = standarize_coeffs(target, ch_mean=opt.ch_mean, ch_std=opt.ch_std)
                        # norm_out = normalize_coeffs(out, ch_min=opt.ch_min, ch_max=opt.ch_max)
                        std_out = standarize_coeffs(out, ch_mean=opt.ch_mean, ch_std=opt.ch_std)

                        # split standardized coefficients into the LL band and the high-frequency bands
                        ll_target = std_target[:, 0:nc, :, :]
                        ll_out = std_out[:, 0:nc, :, :]
                        high_target = std_target[:, nc:, :, :]
                        high_out = std_out[:, nc:, :, :]

                        # log_channel_loss(std_out, std_target, content_loss_criterion)

                        ll_content_loss = content_loss_criterion(ll_target, ll_out)
                        ll_style_loss = 0

                        # content_loss = content_loss_criterion(norm_target, norm_out)
                        high_content_loss = content_loss_criterion(high_target, high_out)
                        high_style_loss = 0

                        ll_loss = ll_content_loss + ll_style_loss
                        high_loss = high_content_loss + high_style_loss
                        epoch_loss = opt.ll_weight * ll_loss + (1 - opt.ll_weight) * high_loss

                        # L1 loss for wavelet coefficients
                        l1_loss = 0

                        total_loss += epoch_loss.item()

                        epoch_loss.backward()
                        optimizer.step()

                        mse_loss = l2_criterion(out, target)
                        psnr = 10 * math.log10(1 / mse_loss.item())
                        total_psnr += psnr

                        print("High Content Loss: {:5f}, High Style Loss: {:5f}, "
                              "LL Content Loss: {:5f}, LL Style Loss: {:5f}".format(
                                  high_content_loss, high_style_loss,
                                  ll_content_loss, ll_style_loss))
                        print("{} {:4f}s => Epoch[{}/{}]({}/{}): Epoch Loss: {:5f} "
                              "High Loss: {:5f} LL Loss: {:5f} L1 Loss: {:5f} PSNR: {:5f}".format(
                                  mode, time.time() - start_time, opt.epoch_num, opt.n_epochs,
                                  iteration, len(data_loader[phase]), epoch_loss.item(),
                                  high_loss.item(), ll_loss.item(), l1_loss, psnr))

                    total_iteration = iteration

                # average the accumulated loss / PSNR over the epoch
                total_loss = total_loss / total_iteration
                total_psnr = total_psnr / total_iteration
                train_loss = total_loss
                train_psnr = total_psnr
            else:
                net.eval()
                mode = "Validation"
                print("*** %s ***" % mode)
                valid_loss, valid_psnr = run_valid(opt, net, content_loss_criterion, sq)
                scheduler.step(valid_loss)

        # append one CSV row per epoch and save a checkpoint
        with open(log_file, mode='a') as f:
            f.write("%d,%08f,%08f,%08f,%08f\n" %
                    (epoch, train_loss, train_psnr, valid_loss, valid_psnr))

        save_checkpoint(opt, net, optimizer, epoch, valid_loss)
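# ---------------------------------------------------------------------------
# Hypothetical sketch, not part of the original source: `standarize_coeffs` is
# called above but not defined here. Given that it receives per-channel
# ch_mean / ch_std values from opt, a plausible implementation is an ordinary
# per-channel standardization of the (N, C, H, W) wavelet coefficients.
import torch

def standarize_coeffs(coeffs, ch_mean, ch_std, eps=1e-8):
    # broadcast the per-channel statistics over the batch and spatial dimensions
    mean = torch.as_tensor(ch_mean, dtype=coeffs.dtype, device=coeffs.device).view(1, -1, 1, 1)
    std = torch.as_tensor(ch_std, dtype=coeffs.dtype, device=coeffs.device).view(1, -1, 1, 1)
    return (coeffs - mean) / (std + eps)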
    best_loss = 1000.0
    writer = SummaryWriter(log_dir=opt.log_dir)

    for epoch in range(opt.n_epochs):
        opt.epoch_num = epoch

        train_loss = trainer(opt, net, optimizer, train_data_loader,
                             loss_criterion=loss_criterion)
        valid_loss = evaluator(opt, net, valid_data_loader,
                               loss_criterion=loss_criterion)

        writer.add_scalar('WCELoss/train', train_loss, epoch)
        writer.add_scalar('WCELoss/valid', valid_loss, epoch)

        if not opt.save_best:
            save_checkpoint(opt, net, epoch, valid_loss, schedular)

        if opt.save_best:
            if valid_loss < best_loss:
                best_loss = valid_loss
                best_model_wts = copy.deepcopy(net.state_dict())

    # Chaesong: once the training loop finishes, save the model with the best validation loss
    if opt.save_best:
        save_checkpoint(opt, best_model_wts, epoch, valid_loss, schedular)

    writer.close()
        writer.add_scalar('Loss/F1-score', f1_score, epoch + 1)

        # if val_loss >= lower_loss:
        #     no_optimize += 1
        # else:
        #     no_optimize = 0

        scheduler.step(val_loss)
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        writer.add_scalar('Loss/learning_rate', lr, epoch + 1)

        if f1_score > best_f1:
            best_f1 = f1_score
            filename = []
            filename.append(
                os.path.join(args.checkpoints,
                             'net-epoch-%s-%s.pth' % (epoch + 1, round(best_f1, 4))))
            filename.append(os.path.join(args.checkpoints, 'model_best.pth'))
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': net.state_dict(),
                }, True, filename)

        # if no_optimize > args.early_stopping:
        #     print("Early Stopping...")
        #     break

    print("Training Done...")
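# ---------------------------------------------------------------------------
# Hypothetical sketch, not part of the original source: the snippet above calls
# save_checkpoint(state, is_best, filename) with `filename` holding two paths
# (a per-epoch file and 'model_best.pth'). A common implementation of that
# pattern saves the state dict and copies it when it is the current best.
import shutil
import torch

def save_checkpoint(state, is_best, filename):
    # filename[0]: per-epoch checkpoint path, filename[1]: best-model path
    torch.save(state, filename[0])
    if is_best:
        shutil.copyfile(filename[0], filename[1])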