Example #1
def run_train(opt, training_data_loader, validation_data_loader):
    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)

    log_file = os.path.join(opt.checkpoint_dir, 'vgg_log.csv')

    print('[Initialize networks for training]')

    net = VGG(opt)
    L2_criterion = nn.MSELoss()
    print(net)

    if opt.resume:
        opt.start_epoch, net = load_model(opt, opt.checkpoint_dir)
    else:
        with open(log_file, mode='w') as f:
            f.write('epoch,train_loss,train_acc,valid_loss,valid_acc\n')

    print('===> Setting GPU')
    print('CUDA Available', torch.cuda.is_available())

    if opt.use_cuda and torch.cuda.is_available():
        opt.use_cuda = True
        opt.device = 'cuda'
    else:
        opt.use_cuda = False
        opt.device = 'cpu'

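    # Wrap the model in DataParallel when multiple GPUs are available and multi_gpu is set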
    if torch.cuda.device_count() > 1 and opt.multi_gpu:
        print("Use " + str(torch.cuda.device_count()) + " GPUs")
        net = nn.DataParallel(net)

    if opt.use_cuda:
        net = net.to(opt.device)
        L2_criterion = L2_criterion.to(opt.device)

    print("===> Setting Optimizer")
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=opt.lr,
                                 betas=(opt.b1, opt.b2))

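    # Train and validate each epoch, append the metrics to the CSV log, and save a checkpoint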
    for epoch in range(opt.start_epoch, opt.n_epochs):
        opt.epoch_num = epoch
        train_loss, train_acc = train(opt,
                                      net,
                                      optimizer,
                                      training_data_loader,
                                      loss_criterion=L2_criterion)
        valid_loss, valid_acc = evaluate(opt,
                                         net,
                                         validation_data_loader,
                                         loss_criterion=L2_criterion)

        with open(log_file, mode='a') as f:
            f.write("%d,%08f,%08f,%08f,%08f\n" %
                    (epoch, train_loss, train_acc, valid_loss, valid_acc))
        save_checkpoint(opt, net, epoch, valid_loss)
Example #2
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu  # local rank, local machine cuda id
    args.local_rank = args.gpu
    args.batch_size = args.batch_size_per_gpu
    args.batch_size_total = args.batch_size * args.world_size
    #rescale base lr
    args.lr_scheduler.base_lr = args.lr_scheduler.base_lr * (max(
        1, args.batch_size_total // 256))

    # set the random seed so every process generates the same random subgraphs
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.gpu is not None:
        torch.cuda.manual_seed(args.seed)

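    # Compute the global rank and initialize the distributed process group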
    global_rank = args.gpu + args.machine_rank * ngpus_per_node
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=global_rank)

    # Setup logging format.
    logging.setup_logging(args.logging_save_path, 'w')

    logger.info(f"Use GPU: {args.gpu}, machine rank {args.machine_rank}, "
                f"num_nodes {args.num_nodes}, gpu per node {ngpus_per_node}, "
                f"world size {args.world_size}")

    # synchronize is needed here to prevent a possible timeout after calling
    # init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    args.rank = comm.get_rank()  # global rank
    args.local_rank = args.gpu
    torch.cuda.set_device(args.gpu)

    # build model
    logger.info("=> creating model '{}'".format(args.arch))
    model = models.model_factory.create_model(args)
    model.cuda(args.gpu)

    # use sync batchnorm
    if getattr(args, 'sync_bn', False):
        model.apply(lambda m: setattr(m, 'need_sync', True))

    model = comm.get_parallel_model(model, args.gpu)  #local rank

    logger.info(model)

    criterion = loss_ops.CrossEntropyLossSmooth(args.label_smoothing).cuda(
        args.gpu)
    soft_criterion = loss_ops.AdaptiveLossSoft(args.alpha_min, args.alpha_max,
                                               args.iw_clip).cuda(args.gpu)

    if not getattr(args, 'inplace_distill', True):
        soft_criterion = None

    ## load dataset, train_sampler: distributed
    train_loader, val_loader, train_sampler = build_data_loader(args)
    args.n_iters_per_epoch = len(train_loader)

    logger.info(f'building optimizer and lr scheduler, local rank {args.gpu}, '
                f'global rank {args.rank}, world_size {args.world_size}')
    optimizer = build_optimizer(args, model)
    lr_scheduler = build_lr_scheduler(args, optimizer)

    # optionally resume from a checkpoint
    if args.resume:
        saver.load_checkpoints(args, model, optimizer, lr_scheduler, logger)

    logger.info(args)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        args.curr_epoch = epoch
        logger.info('Training lr {}'.format(lr_scheduler.get_lr()[0]))

        # train for one epoch
        acc1, acc5 = train_epoch(epoch, model, train_loader, optimizer, criterion,
                                 args, soft_criterion=soft_criterion,
                                 lr_scheduler=lr_scheduler)

        if comm.is_master_process() or args.distributed:
            # validate supernet model
            validate(train_loader, val_loader, model, criterion, args)

        if comm.is_master_process():
            # save checkpoints
            saver.save_checkpoint(
                args.checkpoint_save_path,
                model,
                optimizer,
                lr_scheduler,
                args,
                epoch,
            )
Example #3
                                  avg_relative_error / nnq.num_params, epoch)

            # Make an overall histogram of the weights
            all_weights = [
                p.detach().cpu().numpy().flatten() for p in nnq.param_list
            ]
            for w, name in zip(all_weights, nnq.param_names):
                logger.histo_summary(name, w, epoch, bins=20)
            all_weights = np.concatenate(all_weights)
            logger.histo_summary("params", all_weights, epoch, bins=50)

            # Saving and testing
            if epoch % configs.get('save_freq', int(1e6)) == 0:
                saver.save_checkpoint(model_list,
                                      log_dir,
                                      epoch,
                                      optimizer=optimizer,
                                      lr_scheduler=lr_scheduler)

            # PUT ANY TESTING HERE (the kind that happens every epoch)
            for model in model_list:
                model.eval()

        # Save a final checkpoint
        saver.save_checkpoint(model_list,
                              log_dir,
                              configs["num_epochs"],
                              optimizer=optimizer,
                              lr_scheduler=lr_scheduler)

    if args.eval:
Example #4
def run_train(opt, training_data_loader):
    # check gpu setting with opt arguments
    opt = set_gpu(opt)

    print('Initialize networks for training')
    net = set_model(opt)
    print(net)

    if opt.use_cuda:
        net = net.to(opt.device)

    print("Setting Optimizer")
    if opt.optimizer == 'adam':
        optimizer = optim.Adam(net.parameters(),
                               lr=opt.lr,
                               betas=(opt.b1, opt.b2),
                               eps=1e-8,
                               weight_decay=0)
        print("===> Use Adam optimizer")
    else:
        raise ValueError("Specify optimizer correctly (adam)")

    if opt.resume:
        opt.start_epoch, net, optimizer = load_model(opt,
                                                     net,
                                                     optimizer=optimizer)
    else:
        set_checkpoint_dir(opt)

    if opt.multi_gpu:
        net = nn.DataParallel(net)

    if not os.path.exists(opt.checkpoint_dir):
        os.makedirs(opt.checkpoint_dir)
    log_file = os.path.join(opt.checkpoint_dir, opt.model + "_log.csv")
    opt_file = os.path.join(opt.checkpoint_dir, opt.model + "_opt.txt")

    scheduler = ReduceLROnPlateau(optimizer,
                                  factor=0.5,
                                  patience=5,
                                  mode='min')
    # scheduler = StepLR(optimizer, step_size=50, gamma=0.5)

    # Create log file when training start
    if opt.start_epoch == 1:
        with open(log_file, mode='w') as f:
            f.write("epoch,train_loss,train_psnr,valid_loss,valid_psnr\n")
        save_config(opt)

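    # Only the training loader is used here; the 'valid' phase calls run_valid() instead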
    data_loader = {
        'train': training_data_loader,
    }
    modes = ['train', 'valid']

    l2_criterion = nn.MSELoss()
    l1_criterion = nn.L1Loss()
    if opt.use_cuda:
        l2_criterion = l2_criterion.to(opt.device)
        l1_criterion = l1_criterion.to(opt.device)

    if opt.content_loss == 'l2':
        content_loss_criterion = l2_criterion
    elif opt.content_loss == 'l1':
        content_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify content loss correctly (l1, l2)")

    if opt.style_loss == 'l2':
        style_loss_criterion = l2_criterion
    elif opt.style_loss == 'l1':
        style_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify style loss correctly (l1, l2)")

    if opt.ll_loss == 'l2':
        ll_loss_criterion = l2_criterion
    elif opt.ll_loss == 'l1':
        ll_loss_criterion = l1_criterion
    else:
        raise ValueError("Specify ll loss correctly (l1, l2)")

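    # Fixed random permutation of 1024 indices; passed to run_valid() during validation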
    nc = opt.n_channels
    np.random.seed(1024)
    sq = np.arange(1024)
    np.random.shuffle(sq)

    for epoch in range(opt.start_epoch, opt.n_epochs):
        opt.epoch_num = epoch
        for phase in modes:
            if phase == 'train':
                total_loss = 0.0
                total_psnr = 0.0
                total_iteration = 0

                net.train()

                mode = "Training"
                print("*** %s ***" % mode)
                start_time = time.time()

                for iteration, batch in enumerate(data_loader[phase], 1):
                    # (_, x), (_, target) = batch[0], batch[1]
                    x, target = batch[0], batch[1]
                    x_img, target_img = batch[3], batch[4]
                    lr_approx = batch[5]

                    if opt.use_cuda:
                        x = x.to(opt.device)
                        target = target.to(opt.device)

                    optimizer.zero_grad()

                    # epoch_loss = 0.
                    with torch.set_grad_enabled(phase == 'train'):
                        out = net(x)

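                        # Standardize output and target coefficients per channel before computing the losses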
                        # norm_target = normalize_coeffs(target, ch_min=opt.ch_min, ch_max=opt.ch_max)
                        std_target = standarize_coeffs(target,
                                                       ch_mean=opt.ch_mean,
                                                       ch_std=opt.ch_std)
                        # norm_out = normalize_coeffs(out, ch_min=opt.ch_min, ch_max=opt.ch_max)
                        std_out = standarize_coeffs(out,
                                                    ch_mean=opt.ch_mean,
                                                    ch_std=opt.ch_std)

                        ll_target = std_target[:, 0:nc, :, :]
                        ll_out = std_out[:, 0:nc, :, :]
                        high_target = std_target[:, nc:, :, :]
                        high_out = std_out[:, nc:, :, :]

                        # log_channel_loss(std_out, std_target, content_loss_criterion)
                        ll_content_loss = content_loss_criterion(
                            ll_target, ll_out)
                        ll_style_loss = 0
                        # content_loss = content_loss_criterion(norm_target, norm_out)
                        high_content_loss = content_loss_criterion(
                            high_target, high_out)
                        high_style_loss = 0

                        ll_loss = ll_content_loss + ll_style_loss
                        high_loss = high_content_loss + high_style_loss
                        epoch_loss = opt.ll_weight * ll_loss + (
                            1 - opt.ll_weight) * high_loss

                        # L1 loss for wavelet coefficients
                        l1_loss = 0

                        total_loss += epoch_loss.item()

                        epoch_loss.backward()
                        optimizer.step()

                    mse_loss = l2_criterion(out, target)
                    psnr = 10 * math.log10(1 / mse_loss.item())
                    total_psnr += psnr

                    print(
                        "High Content Loss: {:5f}, High Style Loss: {:5f}, LL Content Loss: {:5f}, LL Style Loss:{:5f}"
                        .format(high_content_loss, high_style_loss,
                                ll_content_loss, ll_style_loss))
                    print(
                        "{} {:4f}s => Epoch[{}/{}]({}/{}): Epoch Loss: {:5f} High Loss: {:5f} LL Loss: {:5f} L1 Loss: {:5f} PSNR: {:5f}"
                        .format(mode,
                                time.time() - start_time,
                                opt.epoch_num, opt.n_epochs, iteration,
                                len(data_loader[phase]), epoch_loss.item(),
                                high_loss.item(), ll_loss.item(), l1_loss,
                                psnr))

                    total_iteration = iteration

                total_loss = total_loss / total_iteration
                total_psnr = total_psnr / total_iteration

                train_loss = total_loss
                train_psnr = total_psnr

            else:
                net.eval()
                mode = "Validation"
                print("*** %s ***" % mode)
                valid_loss, valid_psnr = run_valid(opt, net,
                                                   content_loss_criterion, sq)
                scheduler.step(valid_loss)

        with open(log_file, mode='a') as f:
            f.write("%d,%08f,%08f,%08f,%08f\n" %
                    (epoch, train_loss, train_psnr, valid_loss, valid_psnr))

        save_checkpoint(opt, net, optimizer, epoch, valid_loss)
Example #5
    best_loss = 1000.0

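    # Log train/valid loss to TensorBoard for every epoch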
    writer = SummaryWriter(log_dir=opt.log_dir)
    for epoch in range(opt.n_epochs):
        opt.epoch_num = epoch
        train_loss = trainer(opt,
                             net,
                             optimizer,
                             train_data_loader,
                             loss_criterion=loss_criterion)
        valid_loss = evaluator(opt,
                               net,
                               valid_data_loader,
                               loss_criterion=loss_criterion)

        writer.add_scalar('WCELoss/train', train_loss, epoch)
        writer.add_scalar('WCELoss/valid', valid_loss, epoch)

        if not opt.save_best:
            save_checkpoint(opt, net, epoch, valid_loss, schedular)
        elif valid_loss < best_loss:
            # 채송: track the weights with the best valid loss and save them after the main loop
            best_loss = valid_loss
            best_model_wts = copy.deepcopy(net.state_dict())

    if opt.save_best:
        save_checkpoint(opt, best_model_wts, epoch, valid_loss, schedular)

    writer.close()
Example #6
        writer.add_scalar('Loss/F1-score', f1_score, epoch + 1)
        # if val_loss >= lower_loss:
        #     no_optimize += 1
        # else:
        #     no_optimize = 0
        scheduler.step(val_loss)

        lr = optimizer.state_dict()['param_groups'][0]['lr']
        writer.add_scalar('Loss/learning_rate', lr, epoch + 1)

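        # Save a checkpoint whenever the validation F1 score improves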
        if f1_score > best_f1:
            best_f1 = f1_score

            filename = []
            filename.append(
                os.path.join(
                    args.checkpoints,
                    'net-epoch-%s-%s.pth' % (epoch + 1, round(best_f1, 4))))
            filename.append(os.path.join(args.checkpoints, 'model_best.pth'))
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': net.state_dict(),
                }, True, filename)

        # if no_optimize > args.early_stopping:
        #     print("Early Stopping...")
        #     break

    print("Training Done...")