Example #1
def save(self, current_epoch, best=False):
    if best:
        save_checkpoint(self.config,
                        self.kwargs['config_name'],
                        self.model,
                        current_epoch,
                        self.loss_basic.val,
                        self.optimizer,
                        self.logger,
                        self.kwargs['time_stamp'],
                        self.accuarcy,
                        flag='best',
                        verbose=(self.kwargs['cae_type'] + '#' +
                                 self.verbose))
        self.result_path = save_model(self.config,
                                      self.kwargs['config_name'],
                                      self.model,
                                      self.logger,
                                      self.kwargs['time_stamp'],
                                      self.accuarcy,
                                      verbose=(self.kwargs['cae_type'] +
                                               '#' + self.verbose))
    else:
        save_checkpoint(self.config,
                        self.kwargs['config_name'],
                        self.model,
                        current_epoch,
                        self.loss_basic.val,
                        self.optimizer,
                        self.logger,
                        self.kwargs['time_stamp'],
                        self.accuarcy,
                        verbose=(self.kwargs['cae_type'] + '#' +
                                 self.verbose))

def main():

    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED
    gpus = list(config.GPUS)

    dataset_type = get_dataset(config)
    train_data = dataset_type(config, is_train=True)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU *
                              len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU *
                            len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss
    criterion = torch.nn.MSELoss(size_average=True).cuda()

    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
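        # note: since PyTorch 1.1, lr_scheduler.step() is expected after the optimizer
        # updates; stepping it at the top of the epoch, as here, skips the first
        # value of the learning rate schedule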
        lr_scheduler.step()

        function.train(config, train_loader, model, criterion, optimizer,
                       epoch, writer_dict)

        # evaluate
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
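
The resume branch in this example expects utils.save_checkpoint to maintain a latest.pth symlink in the output directory, which is why it probes os.path.islink before loading. Below is a minimal sketch of such a helper, assuming the checkpoint dict layout used above (state_dict, epoch, best_nme, optimizer); the function name and symlink convention are illustrative, not the repository's actual implementation.

import os

import torch


def save_checkpoint_with_latest(state, predictions, is_best, output_dir,
                                filename='checkpoint.pth'):
    """Write the checkpoint, refresh a 'latest.pth' symlink, and keep a best copy."""
    ckpt_path = os.path.join(output_dir, filename)
    torch.save(state, ckpt_path)

    # keep 'latest.pth' pointing at the most recent checkpoint,
    # matching the os.path.islink() check in the resume branch above
    latest = os.path.join(output_dir, 'latest.pth')
    if os.path.islink(latest) or os.path.exists(latest):
        os.remove(latest)
    os.symlink(os.path.abspath(ckpt_path), latest)

    if is_best:
        torch.save(state, os.path.join(output_dir, 'model_best.pth'))
        torch.save(predictions, os.path.join(output_dir, 'best_predictions.pth'))
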
Example #3
def main(cfg):
    util.init_random_seed(cfg.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.gpu_devices
    use_gpu = torch.cuda.is_available()
    if cfg.use_cpu:
        use_gpu = False

    cfg.ckpt_dir = "test" if cfg.evaluate else cfg.ckpt_dir
    cfg.save_dir = osp.join(cfg.save_dir, cfg.ckpt_dir)
    util.mkdir_if_missing(cfg.save_dir)

    if not cfg.evaluate:
        sys.stdout = util.Logger(osp.join(cfg.save_dir, 'log_train.txt'))
    else:
        sys.stdout = util.Logger(osp.join(cfg.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(cfg))

    if use_gpu:
        print("Currently using GPU {}".format(cfg.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(cfg.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    # --------------------------------------------------------------------------------------------
    print("* Initializing dataset {}".format(cfg.dataset))
    dataset = getdata.init_dataset(name=cfg.dataset)
    cfg.num_train_pids = dataset.num_train_pids

    transform_train = T.Compose([
        T.Random2DTranslation(cfg.height, cfg.width),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    transform_test = T.Compose([
        T.Resize((cfg.height, cfg.width)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    pin_memory = True if use_gpu else False

    trainloader = DataLoader(
        # train_set,
        VideoDataset(dataset.train, seq_len=cfg.seq_len, sample=cfg.train_sample_method, transform=transform_train),
        sampler=RandomIdentitySampler(dataset.train, batch_size=cfg.train_batch, num_instances=cfg.num_instances),
        batch_size=cfg.train_batch, num_workers=cfg.workers,
        pin_memory=pin_memory, drop_last=True,
    )

    queryloader = DataLoader(
        # query_set,
        VideoDataset(dataset.query, seq_len=cfg.seq_len, sample=cfg.test_sample_method, transform=transform_test),
        batch_size=cfg.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False,
    )

    galleryloader = DataLoader(
        # gallery_set,
        VideoDataset(dataset.gallery, seq_len=cfg.seq_len, sample=cfg.test_sample_method, transform=transform_test),
        batch_size=cfg.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False,
    )

    # --------------------------------------------------------------------------------------------
    # Initialize model, optimizer, and scheduler
    print("* Initializing model: {}".format(cfg.arch))
    model = get_model(cfg)
    print("Model size: {:.2f}M".format(sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion_xent = CrossEntropyLabelSmooth(num_classes=dataset.num_train_pids, use_gpu=use_gpu)
    criterion_htri = TripletLoss(margin=cfg.margin)

    optimizer = torch.optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
    assert cfg.stepsize > 0
    scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.stepsize, gamma=cfg.gamma, last_epoch=-1)
    if cfg.warmup_epoch > 0:
        scheduler = WarmUpLR(optimizer, scheduler, cfg.warmup_epoch, len(trainloader))

    # --------------------------------------------------------------------------------------------
    # optionally resume from a checkpoint
    best_rank1 = -np.inf
    start_epoch = cfg.start_epoch
    if cfg.resume:
        checkpoint = torch.load(cfg.resume)
        start_epoch = checkpoint['epoch'] + 1
        best_rank1 = checkpoint['rank1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler = lr_scheduler.StepLR(optimizer, step_size=cfg.stepsize, gamma=cfg.gamma,
                                        last_epoch=checkpoint['epoch'])
        print("loaded checkpoint '{}' (epoch {})".format(cfg.resume, checkpoint['epoch']))
        del checkpoint

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    if cfg.evaluate:
        print("* Evaluating")
        with torch.no_grad():
            evaluate(model, queryloader, galleryloader, cfg.pool, use_gpu)
        return

    if cfg.arch == '3d':
        torch.backends.cudnn.benchmark = False

    # --------------------------------------------------------------------------------------------
    print("\n* Start training")
    start_time = time.time()
    for epoch in range(start_epoch, cfg.max_epoch):
        epoch_start_time = time.time()
        update_lr(scheduler, epoch, n_iter=None)
        train_one_epoch(
            model,
            epoch,
            optimizer,
            scheduler,
            trainloader,
            cfg.warmup_epoch,
            criterion_xent,
            criterion_htri,
            use_gpu
        )

        lr_msg = 'used lr: '
        for item in [pg['lr'] for pg in optimizer.param_groups]:
            lr_msg += '%.0E ' % (item)
        print('* end of epoch {}/{}, time taken: {:.0f} sec, {}'.format(
            epoch, cfg.max_epoch, time.time() - epoch_start_time, lr_msg))

        # scheduler.step(epoch + 1)  # setting lr for next epoch, self.last_epoch==epoch+1

        if epoch in cfg.eval_steps or (epoch + 1) == cfg.max_epoch:
            print("* evaluate")
            with torch.no_grad():
                rank1 = eval(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best: best_rank1 = rank1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            util.save_checkpoint({
                'rank1': rank1,
                'epoch': epoch,
                'state_dict': state_dict,
                'optimizer': optimizer.state_dict(),
            }, is_best, osp.join(cfg.save_dir, 'latest.pth'))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    print("Checkpoints are saved to {}".format(cfg.save_dir))
    print("Finished. Total elapsed time (h:m:s): {}".format(elapsed))
Example #4
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.'+config.MODEL.NAME+'.get_seg_net')(
        config, is_train=True
    )

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
        'vis_global_steps': 0,
    }

    # dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
    #                          3,
    #                          config.MODEL.IMAGE_SIZE[1],
    #                          config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    optimizer = get_optimizer(config, model)

    # Data loading code
    if 'xception' in config.MODEL.NAME:
        # Xception uses different mean std for input image
        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                         std=[0.5, 0.5, 0.5])
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    train_augs = aug.Compose([aug.RandomScale(0.5, 2.0),
                              aug.RandomHorizontallyFlip(0.5),
                              aug.RandomSizedCrop(config.MODEL.IMAGE_SIZE)])

    test_augs = None

    train_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=train_augs
    )
    valid_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
        augmentations=test_augs
    )

    # define loss function (criterion) and optimizer
    criterion = CrossEntropy2D(ignore_index=255, weight=train_dataset.class_weights).cuda()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
        drop_last=True if len(gpus) > 2 else False  # DataParallel cannot handle an empty batch on any of the GPUs
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    if config.TRAIN.LR_SCHEDULER == 'multistep':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
        )
    elif config.TRAIN.LR_SCHEDULER == 'poly':
        max_iter = config.TRAIN.END_EPOCH * len(train_loader)
        lr_scheduler = PolynomialLR(optimizer, max_iter=max_iter, decay_iter=1)
    elif config.TRAIN.LR_SCHEDULER == 'none':
        lr_scheduler = None
    else:
        raise ValueError('Scheduler {} not supported'.format(config.TRAIN.LR_SCHEDULER))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        if config.TRAIN.LR_SCHEDULER == 'multistep':
            lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, lr_scheduler, epoch,
              final_output_dir, tb_log_dir, writer_dict)


        if (epoch + 1) % config.TRAIN.EVAL_INTERVAL == 0:
            if not config.MODEL.LEARN_GAMMA:
                if float(lr_scheduler.last_epoch) / (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO) <= 1:
                    gamma = (config.TRAIN.NE_GAMMA_U - config.TRAIN.NE_GAMMA_L) * \
                            (1 - float(lr_scheduler.last_epoch) / (lr_scheduler.max_iter * config.TRAIN.NE_ITER_RATIO) ) ** \
                            config.TRAIN.NE_GAMMA_EXP + config.TRAIN.NE_GAMMA_L
                else:
                    gamma = config.TRAIN.NE_GAMMA_L
            else:
                gamma = None

            # evaluate on validation set
            perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                      criterion, final_output_dir, tb_log_dir,
                                      writer_dict, gamma=gamma)

            if perf_indicator > best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)
        else:
            perf_indicator = 0.0

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
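
PolynomialLR used above is not part of torch.optim; it is a custom "poly" decay scheduler common in segmentation codebases. Below is a minimal sketch with the same constructor arguments (max_iter, decay_iter) and the max_iter / last_epoch attributes the gamma computation above relies on, assuming the usual power of 0.9.

from torch.optim.lr_scheduler import _LRScheduler


class PolynomialLR(_LRScheduler):
    """Poly decay sketch: lr = base_lr * (1 - iter / max_iter) ** power."""

    def __init__(self, optimizer, max_iter, decay_iter=1, power=0.9, last_epoch=-1):
        self.max_iter = max_iter
        self.decay_iter = decay_iter
        self.power = power
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # hold the lr between decay steps and once max_iter has been reached
        if self.last_epoch % self.decay_iter != 0 or self.last_epoch >= self.max_iter:
            return [group['lr'] for group in self.optimizer.param_groups]
        factor = (1.0 - self.last_epoch / float(self.max_iter)) ** self.power
        return [base_lr * factor for base_lr in self.base_lrs]
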
Example #5
def main():
    args = parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.gpu)
    reset_config(config, args)
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')
    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('{}.get_pose_net'.format(args.model))(config, is_train=True)
    model.eval()
    params = count_parameters_in_MB(model)
    logger.info("Params = %.2fMB" % params)
    mult_adds = comp_multadds(model,
                              input_size=(3, config.MODEL.IMAGE_SIZE[1],
                                          config.MODEL.IMAGE_SIZE[0]))
    logger.info("Mult-Adds = %.2fMB" % mult_adds)
    model.train()
    model = model.cuda()

    # copy model file
    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    optimizer = get_optimizer(config, model)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)
        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

        lr_scheduler.step()  # step the MultiStepLR once per epoch

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)

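count_parameters_in_MB and comp_multadds above are utilities from this repository. The parameter count is easy to reproduce; a minimal stand-in is shown below, reporting millions of trainable parameters, which is what the "%.2fMB" print above appears to mean (compare Example #3's model-size print).

from torch import nn


def count_parameters_in_millions(model: nn.Module) -> float:
    """Number of trainable parameters, in millions."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6


# usage with a placeholder model
model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.ReLU(), nn.Conv2d(16, 32, 3))
print("Params = %.2fM" % count_parameters_in_millions(model))
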
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg,
                                                               is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input))

    logger.info(get_model_summary(model, dump_input))

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        cfg.TRAIN.LR_STEP,
                                                        cfg.TRAIN.LR_FACTOR,
                                                        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #7
def do_train(train_loader, val_loader, model, indicator_dict, cfg, writer_dict,
             final_output_dir, log_dir, visualize):
    batch_time = AverageMeter()
    data_time = AverageMeter()

    end = time.time()
    for i, current_data in enumerate(
            train_loader, start=indicator_dict['current_iteration']):
        data_time.update(time.time() - end)

        if i > indicator_dict['total_iteration']:
            return

        # validation
        if indicator_dict[
                'current_iteration'] % cfg.VAL.EVALUATION_FREQUENCY == 0:
            indicator_dict['current_performance'] = do_validate(
                val_loader, model, cfg, visualize, writer_dict,
                final_output_dir)
            indicator_dict['is_best'] = False
            if indicator_dict['current_performance'] < indicator_dict[
                    'best_performance']:
                indicator_dict['best_performance'] = indicator_dict[
                    'current_performance']
                indicator_dict['is_best'] = True

            # save checkpoint
            output_dictionary = {
                'indicator_dict': indicator_dict,
                'writer_dict_train_global_steps':
                writer_dict['train_global_steps'],
                'writer_dict_val_global_steps':
                writer_dict['val_global_steps'],
                'tb_log_dir': log_dir
            }

            if hasattr(model, 'generator'):
                output_dictionary['generator'] = model.generator.state_dict()
                output_dictionary[
                    'optimizer_generator'] = model.optimizer_generator.state_dict(
                    )

            if hasattr(model, 'discriminator'):
                output_dictionary[
                    'discriminator'] = model.discriminator.state_dict()
                output_dictionary[
                    'optimizer_discriminator'] = model.optimizer_discriminator.state_dict(
                    )

            save_checkpoint(output_dictionary, indicator_dict,
                            final_output_dir)
            model.train()

        # train
        model.set_dataset(current_data)
        model.optimize_parameters()

        # visualize
        if indicator_dict[
                'current_iteration'] % cfg.TRAIN.DISPLAY_FREQUENCY == 0 and cfg.IS_VISUALIZE:
            visualize(model, indicator_dict['current_iteration'],
                      os.path.join(final_output_dir, "train"),
                      cfg.TRAIN.DISPLAY_FREQUENCY)

        # update learning rate
        for current_scheduler in model.schedulers:
            current_scheduler.step()

        batch_time.update(time.time() - end)
        end = time.time()
        model.record_information(i,
                                 len(train_loader),
                                 batch_time,
                                 data_time,
                                 indicator_dict,
                                 writer_dict,
                                 phase='train')
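
The batch_time and data_time trackers above follow the ubiquitous AverageMeter pattern from the official PyTorch ImageNet example; a minimal version is given below for reference, assuming the .val/.avg attributes that model.record_information presumably reads.

class AverageMeter:
    """Tracks the latest value, running sum, count, and average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
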
Example #8
def main():
    # set all the configurations
    args = parse_args()
    update_config(cfg, args)

    # set the logger, tb_log_dir means tensorboard logdir
    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # build the model
    model = get_net(cfg)
    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # Data loading
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, True, transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, False, transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    # define loss function (criterion) and optimizer
    criterion = get_loss(cfg).cuda()
    optimizer = get_optimizer(cfg, model)

    # load checkpoint model
    best_perf = 0.0
    best_model = False
    last_epoch = -1
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        cfg.TRAIN.LR_STEP,
                                                        cfg.TRAIN.LR_FACTOR,
                                                        last_epoch=last_epoch)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # training
    for epoch in range(begin_epoch + 1, cfg.TRAIN.END_EPOCH + 1):
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              writer_dict)

        lr_scheduler.step()

        # evaluate on validation set
        if epoch % cfg.TRAIN.VAL_FREQ == 0 or epoch == cfg.TRAIN.END_EPOCH:
            perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                      criterion, final_output_dir, tb_log_dir,
                                      writer_dict)

            if perf_indicator >= best_perf:
                best_perf = perf_indicator
                best_model = True
            else:
                best_model = False

            # save checkpoint model and best model
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint(
                {
                    'epoch': epoch,
                    'model': cfg.MODEL.NAME,
                    'state_dict': model.state_dict(),
                    'best_state_dict': model.module.state_dict(),
                    'perf': perf_indicator,
                    'optimizer': optimizer.state_dict(),
                }, best_model, final_output_dir)

    # save final model
    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #9
def main():
    final_output_dir = 'output'
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(config, 'train')

    logger.info(pprint.pformat(config))

    # CuDNN
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # HRNet Model
    mv_hrnet = get_pose_net(config, is_train=True)
    #pose_hrnet = get_pose_net(config, is_train=True)  # Pose estimation model
    #pose_hrnet.load_state_dict(torch.load(config.NETWORK.PRETRAINED), strict=False)  # Pretrained weight loading
    #mv_hrnet = get_multiview_pose_net(pose_hrnet, config)  # Multiview adopting
    #depth_hrnet = get_pose_net(config, is_train=True)  # 2.5D depth prediction model

    # Multi GPUs Setting
    gpus = [int(i) for i in config.GPUS.split(',')]
    mv_hrnet = torch.nn.DataParallel(mv_hrnet, device_ids=gpus).cuda()
    logger.info('=> init data parallel model')

    # Loss
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    logger.info('=> init criterion')

    # Optimizer
    optimizer = get_optimizer(config, mv_hrnet)
    logger.info('=> init {} optimizer'.format(config.TRAIN.OPTIMIZER))

    # Loading checkpoint
    start_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        start_epoch, mv_hrnet, optimizer = load_checkpoint(
            mv_hrnet, optimizer, final_output_dir)

    # Scheduler
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)
    logger.info('=> init scheduler')

    # Summary
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # Data loader
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    logger.info('=> loading train dataset')
    train_dataset = H36MDataset(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([transforms.ToTensor(), normalize]))
    #train_dataset = MultiViewH36M(config, config.DATASET.TRAIN_SUBSET, True, transforms.Compose([transforms.ToTensor(), normalize]))
    logger.info('=> loading validation dataset')
    valid_dataset = H36MDataset(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([transforms.ToTensor(), normalize]))

    logger.info('=> loading train dataloader')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)

    logger.info('=> loading valid dataloader')
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    # Training loop
    best_perf = 0.0
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # Trainer
        train(config, train_loader, mv_hrnet, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # Performance indicator
        perf_indicator = validate(config, valid_loader, valid_dataset,
                                  mv_hrnet, criterion, final_output_dir,
                                  tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': mv_hrnet.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    # End
    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(mv_hrnet.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #10
    def training(self, epoch):
        if self.sw is None:
            self.sw = SummaryWriter(logdir=str(self.args.logs_path),
                                    flush_secs=5)

        tbar = tqdm(self.train_data, file=self.tqdm_out, ncols=100)
        self.metric.reset()
        train_loss = 0.0

        iter_per_epoch = len(self.train_data)

        for i, (data, target) in enumerate(tbar):
            global_step = iter_per_epoch * epoch + i
            data = split_and_load(data,
                                  ctx_list=self.args.ctx,
                                  batch_axis=0,
                                  even_split=False)
            target = split_and_load(target,
                                    ctx_list=self.args.ctx,
                                    batch_axis=0,
                                    even_split=False)
            with autograd.record(True):
                if self.with_depth:
                    outputs = [self.net(*X) for X in data]
                else:
                    outputs = [self.net(X) for X in data]

                losses = [
                    self.criterion(*X, Y) for X, Y in zip(outputs, target)
                ]

                autograd.backward(losses)

            self.optimizer.step(self.args.batch_size)

            batch_loss = sum(loss.asnumpy()[0]
                             for loss in losses) / len(losses)
            train_loss += batch_loss

            if self.image_dump_interval > 0 and global_step % self.image_dump_interval == 0:
                image_blob = data[0][0][0] if self.with_depth else data[0][0]
                image = self.denormalizator(image_blob.as_in_context(
                    mx.cpu(0))).asnumpy() * 255

                gt_mask = target[0][0].asnumpy() + self.trainset.pred_offset
                predicted_mask = mx.nd.squeeze(
                    mx.nd.argmax(outputs[0][0][0],
                                 0)).asnumpy() + self.trainset.pred_offset

                gt_mask = visualize_mask(gt_mask.astype(np.int32),
                                         self.trainset.NUM_CLASS + 1)
                predicted_mask = visualize_mask(
                    predicted_mask.astype(np.int32),
                    self.trainset.NUM_CLASS + 1)

                image = image.transpose((1, 2, 0))
                if gt_mask.shape[:2] == image.shape[:2]:
                    result = np.hstack(
                        (image, gt_mask, predicted_mask)).transpose(
                            (2, 0, 1)).astype(np.uint8)
                    self.sw.add_image('Images/input_image',
                                      result,
                                      global_step=global_step)
                else:
                    self.sw.add_image('Images/input_image',
                                      image.transpose(
                                          (2, 0, 1)).astype(np.uint8),
                                      global_step=global_step)
                    result = np.hstack((gt_mask, predicted_mask)).transpose(
                        (2, 0, 1)).astype(np.uint8)
                    self.sw.add_image('Images/predicted',
                                      result,
                                      global_step=global_step)

            self.sw.add_scalar(tag='Loss/ce',
                               value={
                                   'batch': batch_loss,
                                   'epoch_avg': train_loss / (i + 1)
                               },
                               global_step=global_step)
            self.sw.add_scalar(tag='learning_rate',
                               value=self.lr_scheduler.learning_rate,
                               global_step=global_step)

            if hasattr(self.criterion, 'k_sum'):
                self.sw.add_scalar(tag='nfl_mult',
                                   value=self.criterion.k_sum,
                                   global_step=global_step)

            tbar.set_description(
                f'Epoch {epoch}, training loss {train_loss/(i+1):.3f}')
            mx.nd.waitall()
            self.net.hybridize()

        save_checkpoint(self.net, self.args, epoch=None)
Example #11
def main():
    args = parse_args()
    reset_config(config, args)
    set_cudnn(config)
    seed = config.RANDOM_SEED
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU.strip()
    gpus = list(range(len(config.GPU.strip().split(','))))

    logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg)
    summary_writer = SummaryWriter(log_dir=tb_log_dir)

    this_dir = osp.dirname(__file__)
    # backup the source code and the yaml config
    if args.cfg:
        shutil.copy(args.cfg, osp.join(final_output_dir,
                                       osp.basename(args.cfg)))
    if not osp.exists(osp.join(final_output_dir, "lib")):
        shutil.copytree(osp.join(this_dir, "../lib/"),
                        osp.join(final_output_dir, "lib"))
    for k, v in config.items():
        logger.info(f"{k}: {v}")

    # conditional import
    if config.TRAIN.FINETUNE_ROTATER:
        from lib.core.function3 import train, validate, evaluate
    elif config.TRAIN.USE_CYCLE:
        from lib.core.function2 import train, validate, evaluate
    else:
        from lib.core.function1 import train, validate, evaluate

    # build model
    logger.info('start building model.')
    if len(gpus) > 1:
        pose_model = torch.nn.DataParallel(get_pose_model(config)).cuda(
            gpus[0])
        discriminator = torch.nn.DataParallel(get_discriminator(config)).cuda(
            gpus[0])
        temp_discriminator = torch.nn.DataParallel(
            get_discriminator(config)).cuda(gpus[0])
    else:
        pose_model = get_pose_model(config).cuda()
        discriminator = get_discriminator(config, is_temp=False).cuda()
        temp_discriminator = get_discriminator(config, is_temp=True).cuda()
    optimizer_g = get_optimizer(config, pose_model, is_dis=False)
    optimizer_d = get_optimizer(config, discriminator, is_dis=True)
    optimizer_d_temp = get_optimizer(config,
                                     temp_discriminator,
                                     is_dis=True,
                                     is_temp=True)
    step_size, gamma = config.TRAIN.SCHEDULER_STEP_SIZE, config.TRAIN.SCHEDULER_GAMMA
    scheduler_g = lr_scheduler.StepLR(optimizer_g,
                                      step_size=step_size,
                                      gamma=gamma)
    scheduler_d = lr_scheduler.StepLR(optimizer_d,
                                      step_size=step_size,
                                      gamma=gamma)
    scheduler_temp = lr_scheduler.StepLR(optimizer_d_temp,
                                         step_size=step_size,
                                         gamma=gamma)
    logger.info('finished building model.')
    # print out the model arch
    if config.TRAIN.PRETRAIN_LIFTER:
        print("Load pretrained lifter...")
        state_dict = torch.load(
            config.TRAIN.LIFTER_PRETRAIN_PATH)['pose_model_state_dict']
        # state_dict = {k[7:]:v for k, v in state_dict.items()}
        pose_model.load_state_dict(state_dict, strict=False)

    if config.DATA.DATASET_NAME == 'surreal':
        loader_func = surreal
    else:
        loader_func = h36m if config.DATA.DATASET_NAME == "h36m" else mpiinf
    dataset_train = loader_func(config, is_train=True)
    dataset_test = loader_func(config, is_train=False)

    train_loader = DataLoader(dataset=dataset_train,
                              batch_size=config.BATCH_SIZE,
                              shuffle=True,
                              drop_last=False,
                              pin_memory=True,
                              num_workers=config.NUM_WORKERS)
    test_loader = DataLoader(dataset=dataset_test,
                             batch_size=config.BATCH_SIZE,
                             shuffle=False,
                             drop_last=False,
                             pin_memory=True,
                             num_workers=config.NUM_WORKERS)

    if args.eval:
        prefix = config.DATA.DATASET_NAME
        # for mode in ['train', 'valid']:
        for mode in ['valid']:
            is_train = True if mode == 'train' else False
            v3d_to_ours = [
                3, 2, 1, 4, 5, 6, 16, 15, 14, 11, 12, 13, 8, 0, 7, 9, 10
            ] if prefix == "h36m" else np.arange(config.DATA.NUM_JOINTS)
            mpi2h36m = [
                10, 9, 8, 11, 12, 13, 4, 3, 2, 5, 6, 7, 1, 14, 15, 16, 0
            ]
            if prefix == 'surreal':
                indices = np.arange(config.DATA.NUM_JOINTS)
            else:
                indices = v3d_to_ours if prefix == "h36m" else mpi2h36m
            mode = "train" if is_train else "valid"
            read_name = f"../data/{prefix}_{mode}_pred3.h5"
            # read_name = f"../../unsupervised_mesh/data/h36m_{mode}_pred_3d_mesh.h5"
            save_name = f"../data/{prefix}_{mode}_pred_3d.h5"
            if args.eval_suffix is not None:
                save_name = save_name[:-3] + "_" + args.eval_suffix + ".h5"

            # eval mode, load the pretrained model and generate the 3d prediction of all 3ds
            if not config.TRAIN.PRETRAIN_LIFTER:
                raise Warning(
                    "You are not using a pretrained model... maybe you should specify the --pretrain flag"
                )
            dataloader = DataLoader(dataset_train if mode == "train" else dataset_test, batch_size=config.BATCH_SIZE, \
                shuffle=False, drop_last=False, pin_memory=True, num_workers=config.NUM_WORKERS)
            all_out_data = evaluate(dataloader,
                                    pose_model,
                                    config,
                                    is_train=(mode == "train"))
            p1_mpjpe, p2_mpjpe = all_out_data['p1_mpjpe'], all_out_data[
                'p2_mpjpe']
            # read out imagenames
            print("Reading imagenames and joints 2d...")
            fin = h5py.File(read_name, "r")
            fout = h5py.File(save_name, "w")
            imagenames = fin['imagename'][:].copy()
            joints_2d_gt = np.array(fin['joint_2d_gt'])
            fout['imagename'] = imagenames
            fout['joint_2d_gt'] = joints_2d_gt[:, indices]
            fout['joint_3d_gt'] = all_out_data['joint_3d_gt']
            fout['joint_3d_pre'] = all_out_data['joint_3d_pre']
            possible_same_keys = [
                'shape', 'pose', 'original_joint_2d_gt', 'joint_2d_pre',
                'seqlen'
            ]

            for key in possible_same_keys:
                if key in fin.keys():
                    if 'joint' in key:
                        fout[key] = np.array(fin[key])[:, indices]
                    else:
                        fout[key] = np.array(fin[key])
            if 'seqname' in fin.keys():
                fout['seqname'] = fin['seqname'][:].copy()

            if 'auc' in all_out_data.keys():
                fout['auc'] = all_out_data['auc']
                fout['pckh5'] = all_out_data['pckh5']
                fout['auc_p2'] = all_out_data['auc_p2']
                fout['pckh5_p2'] = all_out_data['pckh5_p2']
            if 'scales' in all_out_data.keys():
                fout['scale_pre'] = all_out_data['scales']
            if 'scale_mids' in all_out_data.keys():
                fout['scale_mid_pre'] = all_out_data['scale_mids']

            fin.close()
            fout.close()
            print(
                "Evaluation on the {} set finished. P1 Mpjpe: {:.3f}, P2 Mpjpe: {:.3f}, saved to {}"
                .format("training" if is_train else "test", p1_mpjpe, p2_mpjpe,
                        save_name))
            if prefix == "mpi":
                print("[email protected]: {:.3f}, AUC: {:.3f}".format(
                    all_out_data['pckh5'], all_out_data['auc']))
                print("P2: [email protected]: {:.3f}, AUC: {:.3f}".format(
                    all_out_data['pckh5_p2'], all_out_data['auc_p2']))
        # uncomment this if you need to plot images
        # print("Rendering sequences...")
        # subprocess.call(f'python render.py --seq_num 10 --in_filename ../data/{prefix}_valid_pred_3d.h5 --save_dir ../vis', shell=True)
        return

    # preparation for visualization & perseq optimization(optional)
    if config.USE_GT:
        # note that the gt here is not the gt above (config.USE_GT)
        train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales.pkl"
        valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales.pkl"
    else:
        train_path = f"../data/{config.DATA.DATASET_NAME}_train_scales_pre.pkl"
        valid_path = f"../data/{config.DATA.DATASET_NAME}_valid_scales_pre.pkl"

    train_scale_mids_gt = load_pickle(train_path)['scale_mid'] if osp.exists(
        train_path) else None
    valid_scale_mids_gt = load_pickle(valid_path)['scale_mid'] if osp.exists(
        valid_path) else None
    train_seqnames, valid_seqnames = dataset_train.get_seqnames(
    ), dataset_test.get_seqnames()
    best_p1_mpjpe = best_p2_mpjpe = cur_p1_mpjpe = 10000.0
    best_auc_val = best_pckh5 = 0.0
    best_auc_val_p2 = best_pckh5_p2 = 0.0

    for epoch in range(config.TRAIN.NUM_EPOCHS):
        scheduler_d.step()
        scheduler_g.step()
        scheduler_temp.step()
        # scheduler_s.step()
        avg_d_loss, avg_g_loss, avg_t_loss, train_scale_mids_pre = train(
            train_loader,
            pose_model,
            discriminator,
            temp_discriminator,
            optimizer_g,
            optimizer_d,
            optimizer_d_temp,
            epoch,
            config,
            summary_writer=summary_writer,
            print_interval=config.PRINT_INTERVAL)
        logger.info(
            "***** Epoch: {}, Avg G Loss: {:.3f}, Avg D Loss: {:.3f} Avg T Loss: {:.3f} *****"
            .format(epoch, avg_g_loss, avg_d_loss, avg_t_loss))
        p1_mpjpe, p2_mpjpe, vis_image, valid_scale_mids_pre, extra_dict = validate(
            test_loader, pose_model, epoch, config)
        logger.info(
            "Epoch: {}, P1 Mpjpe/Best P1: {:.3f}/{:.3f}, P2 Mpjpe/Best P2/Cur P1: {:.3f}/{:.3f}/{:.3f}"
            .format(epoch, p1_mpjpe, best_p1_mpjpe, p2_mpjpe, best_p2_mpjpe,
                    cur_p1_mpjpe))
        if p2_mpjpe < best_p2_mpjpe:
            best_p2_mpjpe = p2_mpjpe
            cur_p1_mpjpe = p1_mpjpe
            is_best = True
        else:
            is_best = False

        if p1_mpjpe < best_p1_mpjpe:
            best_p1_mpjpe = p1_mpjpe

        if extra_dict is not None:
            auc_val, pckh5 = extra_dict['auc'], extra_dict['pckh5']
            auc_val_p2, pckh5_p2 = extra_dict['auc_p2'], extra_dict['pckh5_p2']
            if auc_val_p2 > best_auc_val_p2:
                best_auc_val_p2 = auc_val_p2
                best_pckh5_p2 = pckh5_p2
                is_best = True
            else:
                is_best = False

            if auc_val > best_auc_val:
                best_auc_val = auc_val
                best_pckh5 = pckh5
            logger.info(
                "PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})"
                .format(pckh5, best_pckh5, auc_val, best_auc_val))
            logger.info(
                "P2: PCKh@0.5(Best): {:.3f}({:.3f}), AUC value(Best): {:.3f}({:.3f})"
                .format(pckh5_p2, best_pckh5_p2, auc_val_p2, best_auc_val_p2))

        save_checkpoint(
            {
                "epoch": epoch,
                "auc": best_auc_val,
                "pckh5": best_pckh5,
                "auc_p2": best_auc_val_p2,
                "pckh5_p2": best_pckh5_p2,
                "p1_mpjpe": p1_mpjpe,
                "p2_mpjpe": p2_mpjpe,
                "pose_model_state_dict": pose_model.state_dict(),
                "discriminator_state_dict": discriminator.state_dict(),
                "temp_discriminator_state_dict":
                temp_discriminator.state_dict(),
                "optimizer_d": optimizer_d.state_dict(),
                "optimizer_g": optimizer_g.state_dict(),
                "optimizer_d_temp": optimizer_d_temp.state_dict()
            }, is_best, final_output_dir)
        summary_writer.add_scalar("p1_mpjpe_3d_test/epoch", p1_mpjpe, epoch)
        summary_writer.add_scalar("p2_mpjpe_3d_test/epoch", p2_mpjpe, epoch)
        summary_writer.add_image("test_joints/epoch", vis_image, epoch)
        if extra_dict is not None:
            summary_writer.add_scalar("PCKh0.5/epoch", pckh5, epoch)
            summary_writer.add_scalar("AUC/epoch", auc_val, epoch)

        if train_scale_mids_gt is not None and train_scale_mids_pre is not None and len(
                train_scale_mids_pre) > 0:
            num_seq = config.VIS.SCALE_MID_NUM_SEQ
            vis_image_scale_mid1 = plot_scalemid_dist(
                train_scale_mids_pre, train_scale_mids_gt.tolist())
            vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type(
                torch.float32).permute(2, 0, 1) / 255
            vis_image_scale_mid2 = plot_scalemid_seq_dist(
                train_scale_mids_pre,
                train_scale_mids_gt.tolist(),
                train_seqnames,
                num_seq=num_seq)
            vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type(
                torch.float32).permute(2, 0, 1) / 255
            summary_writer.add_image("train_scalemid_distribution/epoch",
                                     vis_image_scale_mid1, epoch)
            summary_writer.add_image("train_scalemid_seq_distribution/epoch",
                                     vis_image_scale_mid2, epoch)
        if valid_scale_mids_gt is not None and valid_scale_mids_pre is not None and len(
                valid_scale_mids_pre) > 0:
            num_seq = config.VIS.SCALE_MID_NUM_SEQ
            vis_image_scale_mid1 = plot_scalemid_dist(
                valid_scale_mids_pre, valid_scale_mids_gt.tolist())
            vis_image_scale_mid1 = torch.from_numpy(vis_image_scale_mid1).type(
                torch.float32).permute(2, 0, 1) / 255
            vis_image_scale_mid2 = plot_scalemid_seq_dist(
                valid_scale_mids_pre,
                valid_scale_mids_gt.tolist(),
                valid_seqnames,
                num_seq=num_seq)
            vis_image_scale_mid2 = torch.from_numpy(vis_image_scale_mid2).type(
                torch.float32).permute(2, 0, 1) / 255
            summary_writer.add_image("valid_scalemid_distribution/epoch",
                                     vis_image_scale_mid1, epoch)
            summary_writer.add_image("valid_scalemid_seq_distribution/epoch",
                                     vis_image_scale_mid2, epoch)

    summary_writer.close()
Beispiel #12
def main():

    args = parse_args()

    # set logger and dir
    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.experiment_name, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # set cudnn
    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED

    # currently only single-GPU training is supported; TODO: add multi-GPU support
    # set up model, criterion and optimizer
    model = models.get_face_alignment_net(config)
    model = model.cuda(config.GPUS[0])
    criterion = torch.nn.MSELoss(reduction='mean').cuda(config.GPUS[0])  # size_average is deprecated
    # criterion = AdaptiveWingLoss()
    optimizer = utils.get_optimizer(config, model)

    # get dataset
    dataset_type = get_dataset(config)

    # get dataloader
    train_loader = DataLoader(dataset=dataset_type(config, is_train=True),
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_loader = DataLoader(dataset=dataset_type(config, is_train=False),
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU,
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # set lr_scheduler
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    # set training writer
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # initialize best_nme, then optionally resume training state
    best_nme = 10000
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'latest.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")

    # start training
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):

        lr_scheduler.step()

        # training
        function.train(config, train_loader, model, criterion, optimizer,
                       epoch, writer_dict)

        # evaluating
        nme, predictions = function.validate(config, val_loader, model,
                                             criterion, epoch, writer_dict)

        # saving
        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Beispiel #13
def main():
    best_perf = 0.0

    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir = create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = models.pose3d_resnet.get_pose_net(config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)

    shutil.copy2(args.cfg, final_output_dir)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    loss_fn = eval('loss.' + config.LOSS.FN)
    criterion = loss_fn(num_joints=config.MODEL.NUM_JOINTS,
                        norm=config.LOSS.NORM).cuda()

    # define training, validation and evaluation routines
    train = train_integral
    validate = validate_integral
    evaluate = eval_integral

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Resume from a trained model
    if config.MODEL.RESUME != '':
        checkpoint = torch.load(config.MODEL.RESUME)
        if 'epoch' in checkpoint.keys():
            config.TRAIN.BEGIN_EPOCH = checkpoint['epoch']
            best_perf = checkpoint['perf']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))
        else:
            model.load_state_dict(checkpoint)
            logger.info('=> resume from pretrained model {}'.format(
                config.MODEL.RESUME))

    # Choose the dataset, either Human3.6M or mpii
    ds = eval('dataset.' + config.DATASET.DATASET)

    # Data loading code
    train_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TRAIN_SET,
                       is_train=True)
    valid_dataset = ds(cfg=config,
                       root=config.DATASET.ROOT,
                       image_set=config.DATASET.TEST_SET,
                       is_train=False)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        preds_in_patch_with_score = validate(valid_loader, model)
        acc = evaluate(epoch,
                       preds_in_patch_with_score,
                       valid_loader,
                       final_output_dir,
                       debug=config.DEBUG.DEBUG)

        perf_indicator = 500. - acc if config.DATASET.DATASET in ('h36m', 'mpii_3dhp', 'jta') else acc

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
Beispiel #14
def main():
    logger.info("Logger is set - training start")

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

    if config.distributed:
        config.gpu = config.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(config.gpu)
        # distributed init
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             world_size=config.world_size,
                                             rank=config.local_rank)

        config.world_size = torch.distributed.get_world_size()

        config.total_batch_size = config.world_size * config.batch_size
    else:
        config.total_batch_size = config.batch_size

    loaders, samplers = get_search_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers

    net_crit = nn.CrossEntropyLoss().cuda()
    controller = CDARTSController(config,
                                  net_crit,
                                  n_nodes=4,
                                  stem_multiplier=config.stem_multiplier)
    if config.param_pool_path is not None:
        param_pool = torch.load(config.param_pool_path, map_location='cpu')
        controller.load_state_dict(param_pool, strict=False)

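    # optionally resume the search state (controller weights, progress counters, optimizers and schedulers)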
    resume_state = None
    if config.resume:
        resume_state = torch.load(config.resume_path, map_location='cpu')

    sta_layer_idx = 0
    if config.resume:
        controller.load_state_dict(resume_state['controller'])
        sta_layer_idx = resume_state['sta_layer_idx']

    controller = controller.cuda()
    if config.sync_bn:
        if config.use_apex:
            controller = apex.parallel.convert_syncbn_model(controller)
        else:
            controller = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                controller)

    if config.use_apex:
        controller = DDP(controller, delay_allreduce=True)
    else:
        controller = DDP(controller, device_ids=[config.gpu])

    # warm up model_search
    layer_idx = 0
    if config.ensemble_param:
        w_optim = torch.optim.SGD(
            [{
                "params": controller.module.feature_extractor.parameters()
            }, {
                "params":
                controller.module.super_layers[layer_idx].parameters(),
                'lr': config.w_lr
            }, {
                "params":
                controller.module.super_layers[layer_idx + 1:].parameters()
            }, {
                "params": controller.module.fc_super.parameters()
            }, {
                "params": controller.module.distill_aux_head1.parameters()
            }, {
                "params": controller.module.distill_aux_head2.parameters()
            }, {
                "params": controller.module.ensemble_param
            }, {
                "params":
                controller.module.nas_layers[:layer_idx].parameters()
            }],
            lr=config.w_lr,
            momentum=config.w_momentum,
            weight_decay=config.w_weight_decay)
    else:
        w_optim = torch.optim.SGD(
            [{
                "params": controller.module.feature_extractor.parameters()
            }, {
                "params":
                controller.module.super_layers[layer_idx].parameters(),
                'lr': config.w_lr
            }, {
                "params":
                controller.module.super_layers[layer_idx + 1:].parameters()
            }, {
                "params": controller.module.fc_super.parameters()
            }, {
                "params": controller.module.distill_aux_head1.parameters()
            }, {
                "params": controller.module.distill_aux_head2.parameters()
            }, {
                "params":
                controller.module.nas_layers[:layer_idx].parameters()
            }],
            lr=config.w_lr,
            momentum=config.w_momentum,
            weight_decay=config.w_weight_decay)

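    # iterate the search stage over layers; only the first stage runs when config.one_stage is set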
    for layer_idx in range(sta_layer_idx, config.layer_num):
        if config.one_stage:
            if layer_idx > 0:
                break

        # clean arch params in model_search
        if config.clean_arch:
            controller.module.init_arch_params(layer_idx)

        # search training loop
        best_top1 = 0.
        best_genotypes = []
        best_connects = []
        sta_search_iter, sta_search_epoch = 0, 0
        is_best = True
        if (layer_idx == sta_layer_idx) and (resume_state is not None):
            sta_search_iter = resume_state['sta_search_iter']
            sta_search_epoch = resume_state['sta_search_epoch']
            best_top1 = resume_state['best_top1']
            best_genotypes = resume_state['best_genotypes']
            best_connects = resume_state['best_connects']
        else:
            # init model main
            if config.gumbel_sample:
                genotype, connect = controller.module.generate_genotype_gumbel(
                    0)
            else:
                genotype, connect = controller.module.generate_genotype(0)
            for i in range(config.layer_num):
                best_genotypes.append(genotype)
                best_connects.append(connect)

        for i in range(config.layer_num):
            controller.module.genotypes[i] = best_genotypes[i]
            controller.module.connects[i] = best_connects[i]

        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            w_optim,
            config.search_iter * config.search_iter_epochs,
            eta_min=config.w_lr_min)
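        # placeholders; the actual retrain scheduler, arch optimizer and sub-model optimizer are rebuilt each search iteration below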
        lr_scheduler_retrain = nn.ModuleList()
        alpha_optim = nn.ModuleList()
        optimizer = nn.ModuleList()
        sub_epoch = 0

        for search_iter in range(sta_search_iter, config.search_iter):
            if search_iter < config.pretrain_epochs:
                if config.local_rank == 0:
                    logger.info("####### Super model warmup #######")
                train_sampler.set_epoch(search_iter)
                retrain_warmup(train_loader, controller, w_optim, layer_idx,
                               search_iter, writer, logger, True,
                               config.pretrain_epochs, config)
                #lr_scheduler.step()
            else:
                # build new controller
                for i, genotype in enumerate(best_genotypes):
                    controller.module.build_nas_layers(i, genotype,
                                                       config.same_structure)

                controller_b = copy.deepcopy(controller.module)
                del controller
                controller = controller_b.cuda()
                controller.fix_pre_layers(layer_idx)

                #if search_iter > config.regular_ratio * config.search_iter:
                #    config.regular = False

                # sync params from super layer pool
                for i in range(layer_idx, config.layer_num):
                    controller.copy_params_from_super_layer(i)

                if config.sync_bn:
                    if config.use_apex:
                        controller = apex.parallel.convert_syncbn_model(
                            controller)
                    else:
                        controller = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                            controller)

                if config.use_apex:
                    controller = DDP(controller, delay_allreduce=True)
                else:
                    controller = DDP(controller, device_ids=[config.gpu])

                # weights optimizer
                if config.ensemble_param:
                    w_optim = torch.optim.SGD([{
                        "params":
                        controller.module.feature_extractor.parameters()
                    }, {
                        "params":
                        controller.module.super_layers[layer_idx].parameters(),
                        'lr':
                        config.w_lr
                    }, {
                        "params":
                        controller.module.super_layers[layer_idx +
                                                       1:].parameters()
                    }, {
                        "params":
                        controller.module.fc_super.parameters()
                    }, {
                        "params":
                        controller.module.distill_aux_head1.parameters()
                    }, {
                        "params":
                        controller.module.distill_aux_head2.parameters()
                    }, {
                        "params":
                        controller.module.ensemble_param
                    }, {
                        "params":
                        controller.module.nas_layers[:layer_idx].parameters()
                    }],
                                              lr=config.w_lr,
                                              momentum=config.w_momentum,
                                              weight_decay=config.
                                              w_weight_decay)
                else:
                    w_optim = torch.optim.SGD([{
                        "params":
                        controller.module.feature_extractor.parameters()
                    }, {
                        "params":
                        controller.module.super_layers[layer_idx].parameters(),
                        'lr':
                        config.w_lr
                    }, {
                        "params":
                        controller.module.super_layers[layer_idx +
                                                       1:].parameters()
                    }, {
                        "params":
                        controller.module.fc_super.parameters()
                    }, {
                        "params":
                        controller.module.distill_aux_head1.parameters()
                    }, {
                        "params":
                        controller.module.distill_aux_head2.parameters()
                    }, {
                        "params":
                        controller.module.nas_layers[:layer_idx].parameters()
                    }],
                                              lr=config.w_lr,
                                              momentum=config.w_momentum,
                                              weight_decay=config.
                                              w_weight_decay)
                # arch_params optimizer
                if config.repeat_cell:
                    alpha_optim = torch.optim.Adam(
                        controller.module.super_layers_arch[0].parameters(),
                        config.alpha_lr,
                        betas=(0.5, 0.999),
                        weight_decay=config.alpha_weight_decay)
                else:
                    alpha_optim = torch.optim.Adam(
                        controller.module.super_layers_arch[layer_idx:].
                        parameters(),
                        config.alpha_lr,
                        betas=(0.5, 0.999),
                        weight_decay=config.alpha_weight_decay)

                if config.ensemble_param:
                    optimizer = torch.optim.SGD(
                        [{
                            "params":
                            controller.module.feature_extractor.parameters()
                        }, {
                            "params":
                            controller.module.nas_layers.parameters(),
                            'lr':
                            config.nasnet_lr *
                            0.1 if config.param_pool_path else config.nasnet_lr
                        }, {
                            "params": controller.module.ensemble_param
                        }, {
                            "params":
                            controller.module.distill_aux_head1.parameters()
                        }, {
                            "params":
                            controller.module.distill_aux_head2.parameters()
                        }, {
                            "params": controller.module.fc_nas.parameters()
                        }],
                        lr=config.nasnet_lr,
                        momentum=config.w_momentum,
                        weight_decay=config.w_weight_decay)
                else:
                    optimizer = torch.optim.SGD(
                        [{
                            "params":
                            controller.module.feature_extractor.parameters()
                        }, {
                            "params":
                            controller.module.nas_layers.parameters(),
                            'lr':
                            config.nasnet_lr *
                            0.1 if config.param_pool_path else config.nasnet_lr
                        }, {
                            "params":
                            controller.module.distill_aux_head1.parameters()
                        }, {
                            "params":
                            controller.module.distill_aux_head2.parameters()
                        }, {
                            "params": controller.module.fc_nas.parameters()
                        }],
                        lr=config.nasnet_lr,
                        momentum=config.w_momentum,
                        weight_decay=config.w_weight_decay)

                lr_scheduler_retrain = torch.optim.lr_scheduler.CosineAnnealingLR(
                    optimizer,
                    config.search_iter_epochs,
                    eta_min=config.w_lr_min)
                lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
                    w_optim,
                    config.search_iter * config.search_iter_epochs,
                    eta_min=config.w_lr_min)

                if (layer_idx
                        == sta_layer_idx) and (resume_state is not None) and (
                            resume_state['sta_search_epoch'] >
                            config.pretrain_epochs):
                    w_optim.load_state_dict(resume_state['w_optim'])
                    alpha_optim.load_state_dict(resume_state['alpha_optim'])
                    lr_scheduler.load_state_dict(resume_state['lr_scheduler'])
                    lr_scheduler_retrain.load_state_dict(
                        resume_state['lr_scheduler_retrain'])
                else:
                    # lr_scheduler
                    pass
                    #for i in range(search_iter * config.search_iter_epochs):
                    #    lr_scheduler.step()

                # warmup model main
                if config.local_rank == 0:
                    logger.info("####### Sub model warmup #######")
                for warmup_epoch in range(config.nasnet_warmup):
                    valid_sampler.set_epoch(warmup_epoch)
                    retrain_warmup(valid_loader, controller, optimizer,
                                   layer_idx, warmup_epoch, writer, logger,
                                   False, config.nasnet_warmup, config)

                best_top1 = 0.
                sub_epoch = 0

                for sub_epoch in range(sta_search_epoch,
                                       config.search_iter_epochs):

                    lr_search = lr_scheduler.get_lr()[0]
                    lr_main = lr_scheduler_retrain.get_lr()[0]

                    search_epoch = search_iter * config.search_iter_epochs + sub_epoch

                    # reset iterators
                    train_sampler.set_epoch(search_epoch)
                    valid_sampler.set_epoch(search_epoch)

                    # training
                    search(train_loader, valid_loader, controller, optimizer,
                           w_optim, alpha_optim, layer_idx, search_epoch,
                           writer, logger, config)

                    # validation
                    step_num = len(valid_loader)
                    cur_step = (search_epoch + 1) * step_num
                    top1 = 1.

                    genotypes = []
                    connects = []

                    if config.gumbel_sample:
                        genotype, connect = controller.module.generate_genotype_gumbel(
                            0)
                    else:
                        genotype, connect = controller.module.generate_genotype(
                            0)

                    for i in range(config.layer_num):
                        genotypes.append(genotype)
                        connects.append(connect)

                    if config.local_rank == 0:
                        # for i in range(config.layer_num - layer_idx):
                        # logger.info ("Stage: {} Layer: {}".format(layer_idx, i+layer_idx+1))
                        logger.info("Genotypes: ")
                        # controller.module.print_arch_params(logger, i+layer_idx)
                        controller.module.print_arch_params(logger, 0)

                    for i in range(config.layer_num - layer_idx):
                        if config.local_rank == 0:
                            # genotype
                            genotype = genotypes[i]
                            logger.info(
                                "Stage: {} Layer: {} genotype = {}".format(
                                    layer_idx, i + layer_idx + 1, genotype))
                            # genotype as a image
                            plot_path = os.path.join(
                                config.plot_path,
                                "Stage_{}_Layer_{}_EP_{:02d}".format(
                                    layer_idx, layer_idx + i + 1,
                                    search_epoch + 1))
                            caption = "Stage_{}_Layer_{}_Epoch_{}".format(
                                layer_idx, layer_idx + i + 1, search_epoch + 1)
                            plot(genotype.normal, plot_path + "-normal",
                                 caption)
                            plot(genotype.reduce, plot_path + "-reduce",
                                 caption)

                    # sync params to super layer pool
                    for i in range(layer_idx, config.layer_num):
                        controller.module.copy_params_from_nas_layer(i)

                    # save
                    best_top1 = top1
                    best_genotypes = genotypes
                    best_connects = connects

                    for i in range(config.layer_num):
                        controller.module.genotypes[i] = best_genotypes[i]
                        controller.module.connects[i] = best_connects[i]

                    #lr_scheduler.step()
                    #lr_scheduler_retrain.step()

            if config.local_rank == 0:
                utils.save_checkpoint(controller.module, config.path, is_best)
                torch.save(
                    {
                        'controller': controller.module.state_dict(),
                        'sta_layer_idx': layer_idx,
                        'w_optim': w_optim.state_dict(),
                        'alpha_optim': alpha_optim.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'sta_search_iter': search_iter,
                        'sta_search_epoch': sub_epoch + 1,
                        'best_top1': best_top1,
                        'best_genotypes': best_genotypes,
                        'best_connects': best_connects,
                        'lr_scheduler_retrain':
                        lr_scheduler_retrain.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }, os.path.join(config.path, 'search_resume.pth.tar'))

            torch.cuda.empty_cache()
            sta_search_epoch = 0

        # clean
        del w_optim
        del alpha_optim
        del optimizer
        torch.cuda.empty_cache()
        config.pretrain_epochs = max(
            config.pretrain_epochs - config.pretrain_decay, 0)

    # genotype as a image
    for i in range(config.layer_num):
        genotype, connect = controller.module.generate_genotype(i)
        controller.module.genotypes[i] = genotype
        controller.module.connects[i] = connect

    if config.local_rank == 0:
        for layer_idx, genotype in controller.module.genotypes.items():
            logger.info("layer_idx : {}".format(layer_idx + 1))
            logger.info("genotype = {}".format(genotype))

            plot_path = os.path.join(
                config.plot_path,
                "Final_Layer_{}_genotype".format(layer_idx + 1))
            caption = "Layer_{}".format(layer_idx + 1)
            plot(genotype.normal, plot_path + "-normal", caption)
            plot(genotype.reduce, plot_path + "-reduce", caption)

    # save dict as json
    if config.local_rank == 0:
        for layer_idx, genotype in controller.module.genotypes.items():
            controller.module.genotypes[layer_idx] = str(genotype)

        js = json.dumps(controller.module.genotypes)
        with open('genotypes.json', 'w') as f:
            f.write(js)
Beispiel #15
def main():

    args = parse_args()

    logger, final_output_dir, tb_log_dir = \
        utils.create_logger(config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    cudnn.deterministic = config.CUDNN.DETERMINISTIC
    cudnn.enabled = config.CUDNN.ENABLED
    gpus = list(config.GPUS)

    dataset_type = get_dataset(config)
    train_data = dataset_type(config, split="train")
    train_loader = DataLoader(dataset=train_data,
                              batch_size=config.TRAIN.BATCH_SIZE_PER_GPU *
                              len(gpus),
                              shuffle=config.TRAIN.SHUFFLE,
                              num_workers=config.WORKERS,
                              pin_memory=config.PIN_MEMORY)

    val_data = dataset_type(config, split="valid")
    val_loader = DataLoader(dataset=val_data,
                            batch_size=config.TEST.BATCH_SIZE_PER_GPU *
                            len(gpus),
                            shuffle=False,
                            num_workers=config.WORKERS,
                            pin_memory=config.PIN_MEMORY)

    # config.MODEL.NUM_JOINTS = train_data.get_num_points()
    model = models.get_face_alignment_net(config)

    # copy model files
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    model = nn.DataParallel(model, device_ids=gpus).cuda()

    # loss
    criterion = torch.nn.MSELoss(reduction='mean').cuda()  # size_average is deprecated
    optimizer = utils.get_optimizer(config, model)

    best_nme = 100
    last_epoch = config.TRAIN.BEGIN_EPOCH

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'final.pth')
        if os.path.islink(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_nme = checkpoint['best_nme']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
        else:
            print("=> no checkpoint found")
    loss = []

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        losses, diff = function.train(config, train_loader, model, criterion,
                                      optimizer, epoch, writer_dict)
        loss.append(losses)
        lr_scheduler.step()

        np.save(
            os.path.join(final_output_dir, "train_diff@epoch{}".format(epoch)),
            diff)

        # evaluate
        nme, predictions, diff = function.validate(config, val_loader, model,
                                                   criterion, epoch,
                                                   writer_dict)

        np.save(
            os.path.join(final_output_dir, "valid_diff@epoch{}".format(epoch)),
            diff)

        is_best = nme < best_nme
        best_nme = min(nme, best_nme)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        print("best:", is_best)
        utils.save_checkpoint(
            {
                "state_dict": model,
                "epoch": epoch + 1,
                "best_nme": best_nme,
                "optimizer": optimizer.state_dict(),
            }, predictions, is_best, final_output_dir,
            'checkpoint_{}.pth'.format(epoch))
        if is_best:
            for i in range(len(predictions)):
                afile = val_data.annotation_files[i]
                new_afile = '{}.{}.txt'.format(
                    afile,
                    os.path.basename(args.cfg).split('.')[0])
                with open(new_afile, 'wt') as f:
                    pts = predictions[i].cpu().numpy()
                    for j in range(len(pts)):
                        f.write("{},{}\n".format(
                            pts[j][1] / val_data.factor[1],
                            pts[j][0] / val_data.factor[0]))

    pd.DataFrame(data=loss).to_csv('loss2.csv')
    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Beispiel #16
def main():
    logger.info("Logger is set - training start")

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

    if config.distributed:
        config.gpu = config.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(config.gpu)
        # distributed init
        torch.distributed.init_process_group(backend='nccl',
                                             init_method=config.dist_url,
                                             world_size=config.world_size,
                                             rank=config.local_rank)

        config.world_size = torch.distributed.get_world_size()

        config.total_batch_size = config.world_size * config.batch_size
    else:
        config.total_batch_size = config.batch_size

    loaders, samplers = get_augment_datasets(config)
    train_loader, valid_loader = loaders
    train_sampler, valid_sampler = samplers

    net_crit = nn.CrossEntropyLoss().cuda()
    controller = CDARTSController(config,
                                  net_crit,
                                  n_nodes=4,
                                  stem_multiplier=config.stem_multiplier)

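    # load the searched cell genotypes from the cell-file JSON (e.g. the genotypes.json written by the search script)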
    with open(config.cell_file, 'r') as f:
        js = f.read()
    r_dict = json.loads(js)
    if config.local_rank == 0:
        logger.info(r_dict)
    genotypes_dict = {}
    for layer_idx, genotype in r_dict.items():
        genotypes_dict[int(layer_idx)] = gt.from_str(genotype)

    controller.build_augment_model(controller.init_channel, genotypes_dict)
    resume_state = None
    if config.resume:
        resume_state = torch.load(config.resume_path, map_location='cpu')
        controller.model_main.load_state_dict(resume_state['model_main'])

    controller.model_main = controller.model_main.cuda()
    param_size = utils.param_size(controller.model_main)
    logger.info("param size = %fMB", param_size)

    # change training hyper parameters according to cell type
    if 'cifar' in config.dataset:
        if param_size < 3.0:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.2
        elif param_size > 3.0 and param_size < 3.5:
            config.weight_decay = 3e-4
            config.drop_path_prob = 0.3
        else:
            config.weight_decay = 5e-4
            config.drop_path_prob = 0.3

    if config.local_rank == 0:
        logger.info("Current weight decay: {}".format(config.weight_decay))
        logger.info("Current drop path prob: {}".format(config.drop_path_prob))

    controller.model_main = apex.parallel.convert_syncbn_model(
        controller.model_main)
    # weights optimizer
    optimizer = torch.optim.SGD(controller.model_main.parameters(),
                                lr=config.lr,
                                momentum=config.momentum,
                                weight_decay=config.weight_decay)
    # optimizer = torch.optim.SGD(controller.model_main.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay, nesterov=True)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, config.epochs)

    if config.use_amp:
        controller.model_main, optimizer = amp.initialize(
            controller.model_main, optimizer, opt_level=config.opt_level)

    if config.distributed:
        controller.model_main = DDP(controller.model_main,
                                    delay_allreduce=True)

    best_top1 = 0.
    best_top5 = 0.
    sta_epoch = 0
    # training loop
    if config.resume:
        optimizer.load_state_dict(resume_state['optimizer'])
        lr_scheduler.load_state_dict(resume_state['lr_scheduler'])
        best_top1 = resume_state['best_top1']
        best_top5 = resume_state['best_top5']
        sta_epoch = resume_state['sta_epoch']

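    # ImageNet epochs at which an extra snapshot checkpoint is written (see the save block below)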
    epoch_pool = [220, 230, 235, 240, 245]
    for epoch in range(sta_epoch, config.epochs):
        # reset iterators
        train_sampler.set_epoch(epoch)
        valid_sampler.set_epoch(epoch)
        current_lr = lr_scheduler.get_lr()[0]
        # current_lr = utils.adjust_lr(optimizer, epoch, config)

        if config.local_rank == 0:
            logger.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < config.warmup_epochs and config.total_batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            if config.local_rank == 0:
                logger.info('Warming-up Epoch: %d, LR: %e', epoch,
                            current_lr * (epoch + 1) / 5.0)

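        # linearly ramp the drop-path probability towards config.drop_path_prob over training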
        drop_prob = config.drop_path_prob * epoch / config.epochs
        controller.model_main.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, controller.model_main, optimizer, epoch, writer,
              logger, config)

        # validation
        cur_step = (epoch + 1) * len(train_loader)
        top1, top5 = validate(valid_loader, controller.model_main, epoch,
                              cur_step, writer, logger, config)

        if 'cifar' in config.dataset:
            lr_scheduler.step()
        elif 'imagenet' in config.dataset:
            lr_scheduler.step()
            # current_lr = utils.adjust_lr(optimizer, epoch, config)
        else:
            raise Exception('Lr error!')

        # save
        if best_top1 < top1:
            best_top1 = top1
            best_top5 = top5
            is_best = True
        else:
            is_best = False

        # save
        if config.local_rank == 0:
            if ('imagenet'
                    in config.dataset) and ((epoch + 1) in epoch_pool) and (
                        not config.resume) and (config.local_rank == 0):
                torch.save(
                    {
                        "model_main":
                        controller.model_main.module.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "best_top1": best_top1,
                        "best_top5": best_top5,
                        "sta_epoch": epoch + 1
                    },
                    os.path.join(config.path,
                                 "epoch_{}.pth.tar".format(epoch + 1)))
                utils.save_checkpoint(
                    controller.model_main.module.state_dict(), config.path,
                    is_best)

            torch.save(
                {
                    "model_main": controller.model_main.module.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "lr_scheduler": lr_scheduler.state_dict(),
                    "best_top1": best_top1,
                    "best_top5": best_top5,
                    "sta_epoch": epoch + 1
                }, os.path.join(config.path, "retrain_resume.pth.tar"))
            utils.save_checkpoint(controller.model_main.module.state_dict(),
                                  config.path, is_best)

    if config.local_rank == 0:
        logger.info("Final best Prec@1 = {:.4%}, Prec@5 = {:.4%}".format(
            best_top1, best_top5))