Example #1
def main(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = '3'
    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.output_shape, cfg.num_class, pretrained=False)
    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion1 = torch.nn.MSELoss().cuda()  # for global loss
    criterion2 = torch.nn.MSELoss(reduction='none').cuda()  # for refine loss; 'reduce=False' is deprecated
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)
    
    if args.resume:
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            logger = Logger(join(args.checkpoint, 'log.txt'))
            logger.set_names(['Epoch', 'LR', 'Train Loss'])
    else:        
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    # numel * 4 bytes per float32 parameter, reported in MB
    print('    Total params: %.2fMB' % (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        MscocoMulti(cfg),
        batch_size=cfg.batch_size*args.num_gpus, shuffle=True,
        num_workers=args.workers, pin_memory=True) 

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch, cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr)) 

        # train for one epoch
        train_loss = train(train_loader, model, [criterion1, criterion2], optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer' : optimizer.state_dict(),
        }, checkpoint=args.checkpoint)

    logger.close()
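These listings lean on a few helpers that are never shown. Below is a minimal sketch of mkdir_p and of the adjust_learning_rate called above, assuming cfg.lr_dec_epoch is a collection of milestone epochs (the decay rule in the original repo may differ):

import errno
import os

def mkdir_p(path):
    # like `mkdir -p`: create the directory, ignoring "already exists"
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

def adjust_learning_rate(optimizer, epoch, dec_epochs, gamma):
    # multiply the learning rate by `gamma` at each milestone epoch
    if epoch in dec_epochs:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= gamma
    # return the current rate so the caller can log it
    return optimizer.param_groups[0]['lr']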
Example #2
def train_model(model, dataloaders, optimizer, scheduler, num_epochs=1):
    since = time.time()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger = Logger(join(opt.output, 'log.txt'))
    best_loss = 1e5  # large sentinel; any real validation loss will beat it

    criterion = Criterion()
    data_dict = dict()
    for epoch in range(num_epochs):
        # scheduler.step() is called at the end of the epoch, below
        lr = scheduler.get_last_lr()[-1]
        print(f'Epoch: {epoch+1}/{num_epochs} LR: {lr:.3E}')
        data_dict['Epoch'] = epoch
        data_dict['LR'] = lr

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            # meters for timing and loss
            batch_time  = AverageMeter()
            data_time   = AverageMeter()
            loss_meter  = AverageMeter()

            end = time.time()
            bar_name = 'Training' if phase == 'train' else 'Testing '
            num_batch = len(dataloaders[phase])
            bar = Bar(bar_name, max=num_batch)
            # Iterate over data.
            for i, (inputs, targets) in enumerate(dataloaders[phase]):
                # measure data loading time
                data_time.update(time.time() - end)

                # move data to GPU
                inputs  = inputs.to(device).float()
                targets = targets.to(device).float()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss, _ = criterion.eval(outputs, targets)

                # measure accuracy and record loss
                loss_meter.update(loss.item(), inputs.shape[0])
                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                # plot progress
                bar.suffix  = f'({i+1:04d}/{num_batch:04d}) Data: {data_time.val:.6f}s | Batch: {batch_time.val:.3f}s | Total: {bar.elapsed_td:} | ETA: {bar.eta_td:} | Loss: {loss_meter.avg:.4f}'
                bar.next()
            bar.finish()
            data_dict[f'{phase} Loss'] = loss_meter.avg

            # save the most recent weights after each training phase
            if phase == 'train':
                save_model(model, join(opt.output, 'last.pth'))
            else:
                is_best = data_dict['val Loss'] < best_loss
                if is_best:
                    best_loss = data_dict['val Loss']
                    save_model(model, join(opt.output, 'best.pth'))
        # step the LR scheduler once per epoch, after the optimizer updates
        scheduler.step()
        # update the log
        logger.update(data_dict)
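train_model also depends on an AverageMeter for its running loss and timing statistics; a standard implementation consistent with the .update(), .val, and .avg usage above:

class AverageMeter:
    # tracks the latest value and the running average of a metric
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count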
Example #3
def main():
    args = parse_args()
    update_config(cfg_hrnet, args)

    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    # dynamically resolve the constructor, e.g. models.<name>.get_pose_net
    model = eval('models.' + cfg_hrnet.MODEL.NAME + '.get_pose_net')(
        cfg_hrnet, is_train=True)
    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    # show net
    args.channels = 3
    args.height = cfg.data_shape[0]
    args.width = cfg.data_shape[1]
    #net_vision(model, args)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.MSELoss(reduction='mean').cuda()

    # AdaBound is used here in place of torch.optim.Adam
    optimizer = AdaBound(model.parameters(),
                         lr=cfg.lr,
                         weight_decay=cfg.weight_decay)

    if args.resume:
        if isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
            logger = Logger(join(args.checkpoint, 'log.txt'))
            logger.set_names(['Epoch', 'LR', 'Train Loss'])
    else:
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    print('    Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        #MscocoMulti(cfg),
        KPloader(cfg),
        batch_size=cfg.batch_size * len(args.gpus))
    #, shuffle=True,
    #num_workers=args.workers, pin_memory=True)

    #for i, (img, targets, valid) in enumerate(train_loader):
    #    print(i, img, targets, valid)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch,
                                  cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss = train(train_loader, model, criterion, optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

    logger.close()
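save_model is never defined in these listings; a plausible sketch that matches how it is called here (a state dict plus a checkpoint directory), with the filename being an assumption:

import os
import torch

def save_model(state, checkpoint='checkpoint', filename='checkpoint.pth.tar'):
    # serialize the training state into the checkpoint directory
    torch.save(state, os.path.join(checkpoint, filename))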
Example #4
def main(args):
    # import pdb; pdb.set_trace()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    print(device)

    writer = SummaryWriter(cfg.tensorboard_path)
    # create checkpoint dir
    counter = 0
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.output_shape,
                                        cfg.num_class,
                                        pretrained=True)

    model = torch.nn.DataParallel(model).to(device)
    # model = model.to(device)

    # define loss function (criterion) and optimizer
    criterion_bce = torch.nn.BCELoss().to(device)
    criterion_abs = torch.nn.L1Loss().to(device)
    # criterion_abs = offset_loss().to(device)
    # criterion1 = torch.nn.MSELoss().to(device) # for Global loss
    # criterion2 = torch.nn.MSELoss(reduce=False).to(device) # for refine loss
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=cfg.lr,
                                 weight_decay=cfg.weight_decay)

    if args.resume:
        print(args.resume)
        checkpoint_file_resume = os.path.join(args.checkpoint,
                                              args.resume + '.pth.tar')
        if isfile(checkpoint_file_resume):
            print("=> loading checkpoint '{}'".format(checkpoint_file_resume))
            checkpoint = torch.load(checkpoint_file_resume)
            pretrained_dict = checkpoint['state_dict']
            model.load_state_dict(pretrained_dict)
            args.start_epoch = checkpoint['epoch']
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                checkpoint_file_resume, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
        else:
            print("=> no checkpoint found at '{}'".format(
                checkpoint_file_resume))
            logger = Logger(join(args.checkpoint, 'log.txt'))
            logger.set_names(['Epoch', 'LR', 'Train Loss'])
    else:
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    cudnn.benchmark = True
    print('    Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(MscocoMulti_double_only(cfg),
                                               batch_size=cfg.batch_size * args.num_gpus,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch,
                                  cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss, counter = train(train_loader, model,
                                    [criterion_abs, criterion_bce], writer,
                                    counter, optimizer, device)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        save_model(
            {
                'epoch': epoch + 1,
                'info': cfg.info,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            },
            checkpoint=args.checkpoint)

    writer.export_scalars_to_json("./test.json")
    writer.close()

    logger.close()
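The Logger in Examples #1, #3, #4, and #5 is only exercised through set_names, append, and close (Example #2's variant instead takes a dict via update). A minimal text-file version consistent with that usage, assuming the original may also support titles and plotting:

class Logger:
    # minimal append-only log file matching the calls in these examples
    def __init__(self, fpath, title=None, resume=False):
        self.file = open(fpath, 'a' if resume else 'w')

    def set_names(self, names):
        self.file.write('\t'.join(names) + '\n')
        self.file.flush()

    def append(self, values):
        self.file.write('\t'.join(str(v) for v in values) + '\n')
        self.file.flush()

    def close(self):
        self.file.close()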
Example #5
def main(args):
    """
    Main training loop for training a stacked hourglass model on MPII dataset.
    :param args: Command line arguments.
    """
    global best_acc

    # create checkpoint dir
    if not isdir(args.checkpoint_dir):
        mkdir_p(args.checkpoint_dir)

    # create model
    print("==> creating model '{}', stacks={}, blocks={}".format(
        args.arch, args.stacks, args.blocks))
    model = HourglassNet(num_stacks=args.stacks,
                         num_blocks=args.blocks,
                         num_classes=args.num_classes,
                         batch_norm_momentum=args.batch_norm_momentum,
                         use_layer_norm=args.use_layer_norm,
                         width=256,
                         height=256)
    joint_visibility_model = JointVisibilityNet(hourglass_stacks=args.stacks)

    # scale weights
    if args.scale_weight_factor != 1.0:
        model.scale_weights_(args.scale_weight_factor)

    # setup horovod and model for parallel execution
    if args.use_horovod:
        hvd.init()
        torch.cuda.set_device(hvd.local_rank())
        args.lr *= hvd.size()
        model.cuda()
    else:
        model = model.cuda()
        if args.predict_joint_visibility:
            joint_visibility_model = joint_visibility_model.cuda()

    # define loss function (criterion) and optimizer
    criterion = torch.nn.MSELoss(reduction='mean').cuda()  # 'size_average' is deprecated
    joint_visibility_criterion = (torch.nn.BCEWithLogitsLoss()
                                  if args.predict_joint_visibility else None)
    params = [{'params': model.parameters(), 'lr': args.lr}]
    if args.predict_joint_visibility:
        # include the visibility head's parameters so it actually trains
        params.append({
            'params': joint_visibility_model.parameters(),
            'lr': args.lr
        })
    if not args.use_amsprop:
        optimizer = torch.optim.RMSprop(params,
                                        lr=args.lr,
                                        momentum=args.momentum,
                                        weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.Adam(params,
                                     lr=args.lr,
                                     weight_decay=args.weight_decay,
                                     amsgrad=True)
    if args.use_horovod:
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

    # Create a tensorboard writer
    writer = SummaryWriter(log_dir="%s/hourglass_mpii_%s_tb_log" %
                           (args.tb_dir, args.exp))

    # optionally resume from a checkpoint
    title = 'mpii-' + args.arch
    if args.load:
        if isfile(args.load):
            print("=> loading checkpoint '{}'".format(args.load))
            checkpoint = torch.load(args.load)

            # remove old usage of data parallel (used to be wrapped around model) # TODO: remove this when no old models used this
            state_dict = {}
            for key in checkpoint['state_dict']:
                new_key = key[len("module."):] if key.startswith(
                    "module.") else key
                state_dict[new_key] = checkpoint['state_dict'][key]

            # restore state
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(state_dict)
            if args.predict_joint_visibility:
                joint_visibility_model.load_state_dict(
                    checkpoint['joint_visibility_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.load, checkpoint['epoch']))
            logger = Logger(join(args.checkpoint_dir, 'log.txt'),
                            title=title,
                            resume=True)
        else:
            raise Exception("=> no checkpoint found at '{}'".format(args.load))
    else:
        logger = Logger(join(args.checkpoint_dir, 'log.txt'), title=title)
        logger.set_names(
            ['Epoch', 'LR', 'Train Loss', 'Val Loss', 'Train Acc', 'Val Acc'])

    cudnn.benchmark = True
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    # Data loading code
    train_dataset, train_loader, val_loader = _make_torch_data_loaders(args)

    if args.evaluate:
        print('\nEvaluation only')
        loss, acc, predictions = validate(val_loader, model, criterion,
                                          args.num_classes, args.debug,
                                          args.flip)
        save_pred(predictions, checkpoint=args.checkpoint_dir)
        return

    lr = args.lr
    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, lr, args.schedule,
                                  args.gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # decay sigma
        if args.sigma_decay > 0:
            train_loader.dataset.sigma *= args.sigma_decay
            val_loader.dataset.sigma *= args.sigma_decay

        # train for one epoch
        train_loss, train_acc, joint_visibility_loss, joint_visibility_acc = train(
            train_loader,
            model=model,
            joint_visibility_model=joint_visibility_model,
            criterion=criterion,
            num_joints=args.num_classes,
            joint_visibility_criterion=joint_visibility_criterion,
            optimizer=optimizer,
            epoch=epoch,
            writer=writer,
            lr=lr,
            debug=args.debug,
            flip=args.flip,
            remove_intermediate_supervision=args.remove_intermediate_supervision,
            tb_freq=args.tb_log_freq,
            no_grad_clipping=args.no_grad_clipping,
            grad_clip=args.grad_clip,
            use_horovod=args.use_horovod,
            predict_joint_visibility=args.predict_joint_visibility,
            predict_joint_loss_coeff=args.joint_visibility_loss_coeff)

        # evaluate on validation set
        valid_loss, valid_acc_PCK, valid_acc_PCKh, valid_acc_PCKh_per_joint, valid_joint_visibility_loss, valid_joint_visibility_acc, predictions = validate(
            val_loader, model, joint_visibility_model, criterion,
            joint_visibility_criterion, args.num_classes, args.debug,
            args.flip, args.use_horovod, args.use_train_mode_to_eval,
            args.predict_joint_visibility)

        # append logger file, and write to tensorboard summaries
        writer.add_scalars('data/epoch/losses_wrt_epochs', {
            'train_loss': train_loss,
            'test_loss': valid_loss
        }, epoch)
        writer.add_scalar('data/epoch/train_accuracy_PCK', train_acc, epoch)
        writer.add_scalar('data/epoch/test_accuracy_PCK', valid_acc_PCK, epoch)
        writer.add_scalar('data/epoch/test_accuracy_PCKh', valid_acc_PCKh,
                          epoch)
        for key in valid_acc_PCKh_per_joint:
            writer.add_scalar(
                'per_joint_data/epoch/test_accuracy_PCKh_%s' % key,
                valid_acc_PCKh_per_joint[key], epoch)
        logger.append(
            [epoch + 1, lr, train_loss, valid_loss, train_acc, valid_acc_PCK])
        if args.predict_joint_visibility:
            writer.add_scalars(
                'joint_visibility/epoch/loss', {
                    'train': joint_visibility_loss,
                    'test': valid_joint_visibility_loss
                }, epoch)
            writer.add_scalars(
                'joint_visibility/epoch/acc', {
                    'train': joint_visibility_acc,
                    'test': valid_joint_visibility_acc
                }, epoch)

        # remember best acc and save checkpoint
        model_specific_checkpoint_dir = "%s/hourglass_mpii_%s" % (
            args.checkpoint_dir, args.exp)
        if not isdir(model_specific_checkpoint_dir):
            mkdir_p(model_specific_checkpoint_dir)

        is_best = valid_acc_PCK > best_acc
        best_acc = max(valid_acc_PCK, best_acc)
        mean, stddev = train_dataset.get_mean_stddev()
        checkpoint = {
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
            'mean': mean,
            'stddev': stddev,
        }
        if args.predict_joint_visibility:
            checkpoint['joint_visibility_state_dict'] = joint_visibility_model.state_dict()
        save_checkpoint(checkpoint,
                        predictions,
                        is_best,
                        checkpoint=model_specific_checkpoint_dir)

    logger.close()
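Example #5 calls adjust_learning_rate with a different signature than the earlier scripts (it passes the current lr and a milestone schedule); a sketch consistent with that call:

def adjust_learning_rate(optimizer, epoch, lr, schedule, gamma):
    # decay `lr` by `gamma` whenever `epoch` hits a milestone in `schedule`
    if epoch in schedule:
        lr *= gamma
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
    return lr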
Example #6
def main():
    args = parse_args()

    # create checkpoint dir
    if not isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # create model
    model = network.__dict__[cfg.model](cfg.channel_settings,
                                        cfg.output_shape,
                                        cfg.num_class,
                                        pretrained=True)

    # show net
    args.channels = 3
    args.height = cfg.data_shape[0]
    args.width = cfg.data_shape[1]
    #net_vision(model, args)

    # optionally resume from a checkpoint
    if args.resume and isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        lr = checkpoint['lr']
        print("=> loaded checkpoint '{}' (epoch {})".format(
            args.resume, checkpoint['epoch']))
        logger = Logger(join(args.checkpoint, 'log.txt'), resume=True)
    else:
        if args.resume:
            print("=> no checkpoint found at '{}'".format(args.resume))
        lr = cfg.lr
        logger = Logger(join(args.checkpoint, 'log.txt'))
        logger.set_names(['Epoch', 'LR', 'Train Loss'])

    # define loss function (criterion) and optimizer
    criterion1 = torch.nn.MSELoss().cuda()  # for global loss
    criterion2 = torch.nn.MSELoss(reduction='none').cuda()  # for refine loss; 'reduce=False' is deprecated

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()

    cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    print('    Total params: %.2fMB' %
          (sum(p.numel() for p in model.parameters()) / (1024 * 1024) * 4))

    train_loader = torch.utils.data.DataLoader(
        #MscocoMulti(cfg),
        KPloader(cfg),
        batch_size=cfg.batch_size * len(args.gpus))
    #, shuffle=True,
    #num_workers=args.workers, pin_memory=True)

    # AdaBound is used here in place of torch.optim.Adam
    optimizer = AdaBound(model.parameters(),
                         lr=lr,
                         weight_decay=cfg.weight_decay)

    for epoch in range(args.start_epoch, args.epochs):
        lr = adjust_learning_rate(optimizer, epoch, cfg.lr_dec_epoch,
                                  cfg.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        # train for one epoch
        train_loss = train(train_loader, model, [criterion1, criterion2],
                           optimizer)
        print('train_loss: ', train_loss)

        # append logger file
        logger.append([epoch + 1, lr, train_loss])

        #save_model({
        #    'epoch': epoch + 1,
        #    'state_dict': model.state_dict(),
        #    'optimizer' : optimizer.state_dict(),
        #}, checkpoint=args.checkpoint)

        # unwrap DataParallel and move weights to the CPU before serializing
        state_dict = model.module.state_dict()
        for key in state_dict.keys():
            state_dict[key] = state_dict[key].cpu()
        save_path = os.path.join(args.checkpoint,
                                 "epoch" + str(epoch + 1) + "checkpoint.ckpt")
        torch.save({
            'epoch': epoch + 1,
            'state_dict': state_dict,
            'lr': lr,
        }, save_path)
        print("=> Saved model to:", save_path)

    logger.close()
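Because the checkpoint above stores model.module.state_dict() (CPU tensors, no 'module.' prefix), it can be reloaded straight into a bare, non-DataParallel network; a sketch, with the epoch-1 filename spelled out as an assumption:

checkpoint = torch.load(os.path.join(args.checkpoint, 'epoch1checkpoint.ckpt'),
                        map_location='cpu')
model = network.__dict__[cfg.model](cfg.channel_settings, cfg.output_shape,
                                    cfg.num_class, pretrained=False)
model.load_state_dict(checkpoint['state_dict'])
start_epoch = checkpoint['epoch']  # resume the epoch counter from the checkpoint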