Example #1
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

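    # resolve the model constructor dynamically from the config name (e.g. models.pose_resnet.get_pose_net)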
    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)
    if args.model:
        logger.info('=> loading model from {}'.format(args.model))
        checkpoint = torch.load(args.model)
        #from collections import OrderedDict
        #new_state_dict = OrderedDict()
        #for k,v in checkpoint.items():
        #    k='haha'+k
        #    new_state_dict[k]=v
        #model.load_state_dict(new_state_dict)
        model.load_state_dict(checkpoint)
    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

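    # run a dummy batch through add_graph so TensorBoard records the network graph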
    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

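    # DataParallel splits each batch across the GPUs, so the per-step batch size is scaled by len(gpus)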
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    print(len(train_loader))
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)
    print(len(valid_loader))

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(config,
              train_loader,
              model,
              criterion,
              optimizer,
              epoch,
              final_output_dir,
              tb_log_dir,
              writer_dict,
              data_flag='posetrack')

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False
        logger.info('epoch {}: current model is best so far: {}'.format(
            epoch + 1, best_model))

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
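
All of the examples on this page rely on helpers such as parse_args() and reset_config() that are not shown. A minimal, hypothetical sketch of the argument parsing they assume (only options actually accessed above, e.g. args.cfg and args.model, are included; everything else is an assumption):

import argparse

def parse_args():
    # hypothetical sketch: --cfg and --model mirror the attributes used above (args.cfg, args.model)
    parser = argparse.ArgumentParser(description='Train keypoint estimation network')
    parser.add_argument('--cfg', required=True, type=str,
                        help='path to the experiment yaml config')
    parser.add_argument('--model', default='', type=str,
                        help='optional checkpoint to initialize the model from')
    return parser.parse_args()
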
Example #2
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    # print code version info
    repo = Repo('')
    repo_git = repo.git
    working_tree_diff_head = repo_git.diff('HEAD')
    this_commit_hash = repo.commit()
    cur_branches = repo_git.branch('--list')
    logger.info('Current Code Version is {}'.format(this_commit_hash))
    logger.info('Current Branch Info :\n{}'.format(cur_branches))
    logger.info(
        'Working Tree diff with HEAD: \n{}'.format(working_tree_diff_head))

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    backbone_model = eval('models.' + config.BACKBONE_MODEL + '.get_pose_net')(
        config, is_train=True)
    model = models.multiview_pose_net.get_multiview_pose_net(
        backbone_model, config)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # dump_input = torch.rand(
    #     (config.TRAIN.BATCH_SIZE, 3,  # config.NETWORK.NUM_JOINTS,
    #      config.NETWORK.IMAGE_SIZE[1], config.NETWORK.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, dump_input)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    # criterion_fuse = JointsMSELoss(use_target_weight=True).cuda()

    optimizer = get_optimizer(config, model)
    start_epoch = config.TRAIN.BEGIN_EPOCH
    ckpt_perf = 0.0  # default best perf; avoids a NameError below when not resuming
    if config.TRAIN.RESUME:
        start_epoch, model, optimizer, ckpt_perf = load_checkpoint(
            model, optimizer, final_output_dir)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.TRAIN_DATASET)(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        collate_fn=totalcapture_collate,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        collate_fn=totalcapture_collate,
        pin_memory=True)

    best_perf = ckpt_perf
    best_epoch = -1
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()
        extra_param = dict()
        # extra_param['loss2'] = criterion_fuse
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, writer_dict, **extra_param)

        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, writer_dict,
                                  **extra_param)

        logger.info(
            '=> perf indicator at epoch {} is {}. old best is {} '.format(
                epoch, perf_indicator, best_perf))

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info(
                '====> found new best model at end of epoch {} (epochs start from 0)'.
                format(epoch))
        else:
            best_model = False
        logger.info(
            'epoch of best validation results is {}'.format(best_epoch))

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

        # save final state at every epoch
        final_model_state_file = os.path.join(
            final_output_dir, 'final_state_ep{}.pth.tar'.format(epoch))
        logger.info(
            'saving final model state to {}'.format(final_model_state_file))
        torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #3
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)
    logger.info(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)
    model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        """metabatch: 
        args: 1.dataset_num 2.batchsize 3.total_epoch"""
        #####################   METABATCH  #####################################
        dataset_num = len(train_dataset)
        batch_size = config.TRAIN.BATCH_SIZE
        total_epoch = config.TRAIN.END_EPOCH
        logger.info('dataset_size={}, batchsize = {} ,total_epoch = {}'.format(
            dataset_num, batch_size, total_epoch))
        SEU_YS = MetaData_Container(dataset_num, batch_size, total_epoch)
        #########################################################################
        train(config, SEU_YS, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        SEU_YS.Output_CSV_Table()  # write the table to a CSV file and print it every epoch

        #########################################################################
        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
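
Every example builds its training loss with JointsMSELoss(use_target_weight=...). A sketch of such a loss, assuming (N, K, H, W) heatmaps and a (N, K, 1) target_weight as in the Simple Baselines code; details such as loss scaling may differ from the actual implementation:

import torch
import torch.nn as nn

class JointsMSELoss(nn.Module):
    # mean-squared error between predicted and target heatmaps,
    # optionally weighted per joint by target_weight
    def __init__(self, use_target_weight):
        super().__init__()
        self.criterion = nn.MSELoss()
        self.use_target_weight = use_target_weight

    def forward(self, output, target, target_weight):
        batch_size, num_joints = output.size(0), output.size(1)
        heatmaps_pred = output.reshape(batch_size, num_joints, -1)
        heatmaps_gt = target.reshape(batch_size, num_joints, -1)
        loss = 0
        for j in range(num_joints):
            pred = heatmaps_pred[:, j]
            gt = heatmaps_gt[:, j]
            if self.use_target_weight:
                loss += self.criterion(pred * target_weight[:, j],
                                       gt * target_weight[:, j])
            else:
                loss += self.criterion(pred, gt)
        return loss / num_joints
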
Example #4
def validate(config,
             val_loader,
             val_dataset,
             model,
             criterion,
             output_dir,
             tb_log_dir,
             writer_dict=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

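    # preallocate buffers that collect every prediction and box for the dataset-level evaluation below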
    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input, target, target_weight, meta) in enumerate(val_loader):
            # compute output
            input = input.cuda()
            output = model(input)
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                output_flipped = model(input_flipped)
                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]
                    # output_flipped[:, :, :, 0] = 0

                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)

            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())

            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            score = meta['score'].numpy()

            preds, maxvals = get_final_preds(config,
                                             output.clone().cpu().numpy(), c,
                                             s)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
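            # the person scale is stored in units of 200 pixels, so s * 200 approximates the box size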
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            image_path.extend(meta['image'])
            if config.DATASET.DATASET == 'posetrack':
                filenames.extend(meta['filename'])
                imgnums.extend(meta['imgnum'].numpy())

            idx += num_images

            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                    'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                        i, len(val_loader), batch_time=batch_time,
                        loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
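                # heatmaps are predicted at 1/4 of the input resolution, hence pred * 4 when drawing debug images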
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix, target_weight)

        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)

        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value), global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

        return perf_indicator
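
validate() above also depends on small utilities from the surrounding codebase (AverageMeter, accuracy, flip_back, get_final_preds). For reference, a minimal AverageMeter along the lines of the one used here:

class AverageMeter(object):
    """Tracks the most recent value and a running average (used for loss, accuracy and timing above)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count != 0 else 0
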
Example #5
def validate(config,
             val_loader,
             val_dataset,
             model,
             criterion,
             output_dir,
             tb_log_dir,
             writer_dict=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    all_boxes = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input_, target, target_weight, meta) in enumerate(val_loader):
            # compute output
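            # debugging shortcut: skip every sample except one specific image id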
            if meta['image_id'] != 10003420000:
                continue
            root = config.DATASET.ROOT
            file_name = index_to_path(root, meta['image_id'][0].item())
            data_numpy = cv2.imread(
                file_name, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
            c_dt = meta['center'][0].numpy()
            s_dt = meta['scale'][0].numpy()
            r = 0
            trans = get_affine_transform(c_dt, s_dt, r,
                                         config.MODEL.IMAGE_SIZE)
            input = cv2.warpAffine(data_numpy,
                                   trans, (int(config.MODEL.IMAGE_SIZE[0]),
                                           int(config.MODEL.IMAGE_SIZE[1])),
                                   flags=cv2.INTER_LINEAR)

            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
            transform = transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ])

            input = transform(input)
            # print(type(input))
            # print(input.shape)

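            # wrap the single re-cropped image into a batch of size 1 with shape (1, 3, H, W)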
            new_input = np.zeros(
                [1, 3, config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0]])
            new_input[0, :, :, :] = input[:, :, :]
            input = torch.from_numpy(new_input).float().cuda()  # the model runs on GPU (see the flip branch below), so the input must be on GPU too

            output = model(input)
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                output_flipped = model(input_flipped)
                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]
                    # output_flipped[:, :, :, 0] = 0

                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)

            loss = criterion(output, target, target_weight)

            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())

            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            score = meta['score'].numpy()

            c_d = meta['center'].numpy()
            s_d = meta['scale'].numpy()

            preds, maxvals = get_final_preds(config,
                                             output.clone().cpu().numpy(), c_d,
                                             s_d)

            print('id--{},\nkpts:\n{}'.format(meta['image_id'], preds[0]))
            # time.sleep(10)

            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts
            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score
            image_path.extend(meta['image'])
            # if config.DATASET.DATASET == 'posetrack':
            #     filenames.extend(meta['filename'])
            #     imgnums.extend(meta['imgnum'].numpy())

            idx += num_images

            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix)

        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)

        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)
            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value), global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)
            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
Example #6
def main():
    torch.set_printoptions(precision=2, sci_mode=False, linewidth=300)

    args = parse_args()
    reset_config(config, args)
    run_phase = args.runMode  # train or test

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, run_phase)

    model_file = 'final_state_ep{}.pth.tar'.format(args.modelFile)
    # print code version info
    try:
        repo = Repo('')
        repo_git = repo.git
        working_tree_diff_head = repo_git.diff('HEAD')
        this_commit_hash = repo.commit()
        cur_branches = repo_git.branch('--list')
        logger.info('Current Code Version is {}'.format(this_commit_hash))
        logger.info('Current Branch Info :\n{}'.format(cur_branches))
        logger.info(
            'Working Tree diff with HEAD: \n{}'.format(working_tree_diff_head))
    except Exception:
        logger.info('Git repo not initialized')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    backbone_model = eval('models.' + config.BACKBONE_MODEL + '.get_pose_net')(
        config, is_train=True)
    model = models.adafuse_network.get_multiview_pose_net(
        backbone_model, config)

    writer_dict = {
        'writer': SummaryWriter(tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # load pretrained backbone
    # Note this backbone is already trained on current dataset
    pretrained_backbone_file = Path(
        config.DATA_DIR) / config.NETWORK.PRETRAINED
    if os.path.exists(pretrained_backbone_file):
        model.load_state_dict(torch.load(pretrained_backbone_file),
                              strict=False)

    if args.evaluate:
        run_phase = 'test'
        model_file_path = config.NETWORK.ADAFUSE
        model.load_state_dict(torch.load(model_file_path), strict=True)
        logger.info(
            '=> loading model from {} for evaluating'.format(model_file_path))
    elif run_phase == 'test':
        model_state_file = os.path.join(final_output_dir, model_file)
        logger.info('=> loading model from {}'.format(model_state_file))
        model.load_state_dict(torch.load(model_state_file), strict=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    criterion_mpjpe = JointMPJPELoss().cuda()

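    # freeze the backbone and optimize only the view-weighting sub-network parameters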
    view_weight_params = []
    for name, param in model.named_parameters():
        if 'view_weight_net' in name:
            param.requires_grad = True
            view_weight_params.append(param)
        else:
            param.requires_grad = False
    optimizer = torch.optim.Adam(params=view_weight_params, lr=config.TRAIN.LR)

    start_epoch = config.TRAIN.BEGIN_EPOCH
    ckpt_perf = 0.0  # default best perf; avoids a NameError below when not resuming
    if run_phase == 'train' and config.TRAIN.RESUME:
        start_epoch, model, optimizer, ckpt_perf = load_checkpoint(
            model, optimizer, final_output_dir)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    if run_phase == 'train':
        train_dataset = eval('dataset.' + config.DATASET.TRAIN_DATASET)(
            config, config.DATASET.TRAIN_SUBSET, True,
            transforms.Compose([
                transforms.ToTensor(),
                normalize,
            ]))
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            collate_fn=adafuse_collate,
            pin_memory=True)

    valid_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        collate_fn=adafuse_collate,
        pin_memory=True)

    if run_phase == 'train':
        best_perf = ckpt_perf
        best_epoch = -1
        best_model = False
    perf_indicator = 0
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()
        extra_param = dict()
        extra_param['loss_mpjpe'] = criterion_mpjpe

        if run_phase == 'train':
            params = {
                'config': config,
                'dataset': train_dataset,
                'loader': train_loader,
                'model': model,
                'criterion_mse': criterion,
                'criterion_mpjpe': criterion_mpjpe,
                'final_output_dir': final_output_dir,
                'tb_writer': writer_dict,
                'optimizer': optimizer,
                'epoch': epoch,
                'is_train': True,
                'save_heatmaps': False,
            }
            # train
            run_model(**params)

            # save checkpoint and model before validation
            if divmod(epoch + 1, 1)[1] == 0:  # save checkpoint every x epoch
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'model': get_model_name(config),
                        'state_dict': model.module.state_dict(),
                        'perf': perf_indicator,
                        'optimizer': optimizer.state_dict(),
                    },
                    False,
                    final_output_dir,
                    filename='checkpoint_ep{}.pth.tar'.format(epoch))

            # save final state at every epoch
            final_model_state_file = os.path.join(
                final_output_dir, 'final_state_ep{}.pth.tar'.format(epoch))
            logger.info('saving final model state to {}'.format(
                final_model_state_file))
            torch.save(model.module.state_dict(), final_model_state_file)

        valid_params = {
            'config': config,
            'dataset': valid_dataset,
            'loader': valid_loader,
            'model': model,
            'criterion_mse': criterion,
            'criterion_mpjpe': criterion_mpjpe,
            'final_output_dir': final_output_dir,
            'tb_writer': writer_dict,
            'optimizer': optimizer,
            'epoch': epoch,
            'is_train': False,
            'save_heatmaps': False,
        }
        perf_indicator = run_model(**valid_params)

        if run_phase == 'test':
            break  # if run mode is test, only run test one time is enough

        logger.info(
            '=> perf indicator at epoch {} is {}. old best is {} '.format(
                epoch, perf_indicator, best_perf))

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info(
                '====> found new best model at end of epoch {} (epochs start from 0)'.
                format(epoch))
        else:
            best_model = False
        logger.info(
            'epoch of best validation results is {}'.format(best_epoch))

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
    # --- End all epoch
    writer_dict['writer'].close()
Example #7
def main():
    args = parse_args()
    reset_config(config, args)  # args carries the settings from the cfg file (learning rate, batch size, GPU setup)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    # logger.info(pprint.pformat(args))
    # logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        # config, is_train=True
        config,
        is_train=False)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(logdir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))

    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()  # use multiple GPUs

    # if torch.cuda.device_count() > 1:
    #     model = torch.nn.DataParallel(model, device_ids=[0,1])

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(  # dataset.mpii
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        train_set=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))  # data augmentation?

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),  # with 1 GPU, the batch size is 32
        # batch_size=config.TRAIN.BATCH_SIZE * 2,  # with 1 GPU, the batch size is 32
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH,
                       config.TRAIN.END_EPOCH):  # train for 20 epochs
        lr_scheduler.step()
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir)
        # , writer_dict)

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        if epoch % 1 == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': get_model_name(config),
                    'state_dict': model.state_dict(),
                    #'perf': perf_indicator,
                    'optimizer': optimizer.state_dict(),
                },
                best_model,
                final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
Example #8
def train_normale(DANN, args, config):
    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    ################# ADDED #######################
    criterion_dann = nn.CrossEntropyLoss()
    ########################################

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    ################# ADDED #######################
    if DANN == True:
        painting_dataset = eval('dataset.painting')(
            config,
            config.DATASET.ROOT,
            config.DATASET.PAINTING_SET,
            False,
            transforms.Compose([
                #transforms.Resize(256),
                #transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))

        train_no_aug_dataset = eval('dataset.painting')(
            config,
            config.DATASET.ROOT,
            config.DATASET.TRAIN_NO_AUG_SET,
            False,
            transforms.Compose([
                #transforms.Resize(256),
                #transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ]))

##################################

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)
    ################# ADDED #######################
    if DANN == True:
        painting_loader = torch.utils.data.DataLoader(
            painting_dataset,
            #valid_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            pin_memory=True)
        #painting_loader=copy.deepcopy(valid_loader)

        train_no_aug_loader = torch.utils.data.DataLoader(
            train_no_aug_dataset,
            #valid_dataset,
            batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
            shuffle=config.TRAIN.SHUFFLE,
            num_workers=config.WORKERS,
            pin_memory=True)

##################################

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):

        # train for one epoch
        ################# ADDED #######################
        if DANN == True:
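            # alpha presumably scales the domain-adversarial (DANN) loss term inside train_dann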
            alpha = 0.03
            print(f"Alpha = {alpha}")
            model = train_dann(config, train_loader, train_no_aug_loader,
                               painting_loader, model, criterion, optimizer,
                               epoch, final_output_dir, tb_log_dir,
                               writer_dict, alpha, criterion_dann)
        else:
            train(config, train_loader, model, criterion, optimizer, epoch,
                  final_output_dir, tb_log_dir, writer_dict)
        ##################################

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)
        lr_scheduler.step()

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #9
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')
    
    time_stamp = tb_log_dir.split('_')[-1]
    new_folder = os.path.join(final_output_dir, time_stamp)
    os.makedirs(new_folder)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.'+config.MODEL.NAME+'.get_pose_net_second_deconv')(
        config, is_train=True
    ).cuda()
    second_deconv = eval('models.'+config.MODEL.NAME+'.get_second_deconv')(
        config#, pretrained='output/coco/pose_resnet_50/256x192_d256x3_adam_lr1e-3/2021-02-15-03-49/model_best.pth.tar'
    ).cuda()

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    # dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
    #                          3,
    #                          config.MODEL.IMAGE_SIZE[1],
    #                          config.MODEL.IMAGE_SIZE[0]))
    # writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    # model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT, use_gain_loss=False
    ).cuda()

    # optimizer = get_optimizer(config, model)
    second_deconv_optimizer = get_optimizer(config, second_deconv)

    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
    #     optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
    # )
    second_deconv_lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        second_deconv_optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
    )

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )
    valid_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE*len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # lr_scheduler.step()
        second_deconv_lr_scheduler.step()
        
        # train for one epoch
        train(config, train_loader, model, second_deconv, criterion, second_deconv_optimizer, epoch,
                new_folder, tb_log_dir, writer_dict)
        
        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model, second_deconv,  
                                    criterion, new_folder, tb_log_dir,
                                    writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(new_folder))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': second_deconv.state_dict(),
            'perf': perf_indicator,
            'optimizer': second_deconv_optimizer.state_dict(),
        }, best_model, new_folder)

    final_model_state_file = os.path.join(new_folder,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.state_dict(), final_model_state_file)  # model is not wrapped in DataParallel here, so there is no .module
    writer_dict['writer'].close()
Example #10
def main():
    args = parse_args()
    reset_config(
        config, args)  # config has edict type and is imported from core.config

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    print('final_output_dir: ', final_output_dir)

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    best_perf = 0.0
    best_model = False
    best_epoch = 0

    # for continue training from checkpoint
    if os.path.isfile(config.MODEL.CHECKPOINT):
        logger.info('=> load model from the checkpoint: {}'.format(
            config.MODEL.CHECKPOINT))
        ckp = torch.load(config.MODEL.CHECKPOINT)
        model.load_state_dict(ckp['state_dict'])
        writer_dict['train_global_steps'] = ckp['epoch']
        writer_dict['valid_global_steps'] = writer_dict['train_global_steps']
        config.TRAIN.BEGIN_EPOCH = writer_dict['train_global_steps']
        best_perf = ckp['best_perf']
        best_epoch = ckp['best_epoch']
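        # shift the LR milestones so they are relative to the resumed epoch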
        config.TRAIN.LR_STEP = [
            lr - config.TRAIN.BEGIN_EPOCH for lr in config.TRAIN.LR_STEP
        ]
        logger.info('begin_epoch: {}'.format(config.TRAIN.BEGIN_EPOCH))
        logger.info('train_global_steps: {}'.format(
            writer_dict['train_global_steps']))
        logger.info('valid_global_steps: {}'.format(
            writer_dict['valid_global_steps']))
        logger.info('best_perf: {}'.format(best_perf))
        logger.info('best_epoch: {}'.format(best_epoch))
        logger.info('lr_step: {}'.format(config.TRAIN.LR_STEP))

    for step in config.TRAIN.LR_STEP:
        if step < 0:
            logger.info(
                'ERROR: lr_step entries must be non-negative, but lr_step is {}'
                .format(config.TRAIN.LR_STEP))
            exit()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        lr_scheduler.step()

        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
            best_epoch = epoch
            logger.info('=> best model, epoch: {}, perf: {}'.format(
                epoch, perf_indicator))
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'best_perf': best_perf,
                'best_epoch': best_epoch,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #11
def main():
    args = parse_args()

    cf.args = args

    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    tb_log_dir = pathjoin(
        dirname(tb_log_dir), 'w%s,m%s,rs%s,t%s_' %
        (args.pointMaxW, args.probMargin, ''.join(map(str, args.rs)), args.t) +
        basename(tb_log_dir))

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    #writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    #    criterion = JointsMSELoss(
    #        use_target_weight=config.LOSS.USE_TARGET_WEIGHT
    #    ).cuda()

    if config.TRAIN.CRITERION == 'msssm_mean':
        criterion = MultiScaleSpatialSoftmax(log_freq=60 * 10,
                                             cyc_rs=args.rs,
                                             poolings=['avg', 'max'][:],
                                             pointMaxW=args.pointMaxW,
                                             probMargin=args.probMargin,
                                             temper=args.t)
        # p[1, 4, 10]* m[0, .5, .8]
#        criterion = MultiScaleSpatialSoftMax( poolings=['avg', 'max'], pointMaxW=1)
#        criterion = MultiScaleSpatialSoftMax(cyc_rs=[8, 4, 2, ], pointMaxW=1)
    elif config.TRAIN.CRITERION == 'ssm_mean':
        criterion = SpatialSoftmax()


#    criterion = torch.nn.DataParallel(criterion, device_ids=gpus).cuda()

#    cf.debugPoinMax = 30
    cf.debugPoinMax = False

    if cf.debugPoinMax:
        criterion = MultiScaleSpatialSoftmax(
            log_freq=30,
            cyc_rs=[],
            poolings=['avg', 'max'][:],
            pointMaxW=args.pointMaxW,
        )

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()
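        # note: stepping the LR scheduler at the start of the epoch follows the
        # pre-1.1 PyTorch convention; newer releases expect scheduler.step()
        # to be called after optimizer.step(), i.e. at the end of the epoch.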

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
    print(args)
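The MultiScaleSpatialSoftmax and SpatialSoftmax criteria used above are defined elsewhere in this repository and are not shown here. As a rough illustration of the underlying soft-argmax idea (softmax over each heatmap, expected coordinate, distance to the target keypoint), here is a minimal single-scale sketch; the class name and arguments are hypothetical, and the multi-scale pooling, pointMaxW, probMargin and temper options are not reproduced.

import torch
import torch.nn as nn
import torch.nn.functional as F

class SoftArgmaxLoss(nn.Module):
    """Hypothetical sketch: soft-argmax over heatmaps plus an L1 coordinate loss."""

    def __init__(self, temperature=1.0):
        super().__init__()
        self.temperature = temperature

    def forward(self, heatmaps, target_coords):
        # heatmaps: [N, K, H, W]; target_coords: [N, K, 2] as (x, y) in heatmap pixels
        n, k, h, w = heatmaps.shape
        probs = F.softmax(heatmaps.reshape(n, k, -1) / self.temperature, dim=-1)
        probs = probs.reshape(n, k, h, w)
        xs = torch.arange(w, dtype=probs.dtype, device=probs.device)
        ys = torch.arange(h, dtype=probs.dtype, device=probs.device)
        expected_x = (probs.sum(dim=2) * xs).sum(dim=-1)  # marginal over rows -> x
        expected_y = (probs.sum(dim=3) * ys).sum(dim=-1)  # marginal over cols -> y
        pred = torch.stack([expected_x, expected_y], dim=-1)  # [N, K, 2]
        return F.l1_loss(pred, target_coords)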
Exemple #12
0
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    oneDriveLogger = OneDriveLogger() if args.useOneDrive else None

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED
    '''
    model = MobileNet_()
    model.load_state_dict(torch.load(args.resume))
    model.heatmap = nn.Conv2d(1024, 16, 1, bias=False)
    model.offset = nn.Conv2d(1024, 16*2, 1, bias=False)
    model.offset.weight.data = torch.from_numpy(np.zeros_like(model.offset.weight.data)) 
    model.heatmap.weight.data = torch.from_numpy(np.zeros_like(model.heatmap.weight.data)) 
    lastestname = os.path.join(final_output_dir, 'renew')
    torch.save(model.state_dict(), lastestname + '.model')
    '''

    if config.MODEL.NAME == "MobileNet16_":
        model = MobileNet16_()
    elif config.MODEL.NAME == "MnasNet16_":
        model = MnasNet_()
    elif config.MODEL.NAME == "MobileNet162_":
        model = MobileNet162_()
    else:
        model = eval(config.MODEL.NAME)()

    optimizer_state_dict = None
    if args.resume:
        '''
        checkpoint = torch.load(args.resume)
        state_dict = checkpoint['state_dict']
        # create new OrderedDict that does not contain `module.`
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:] # remove `module.`
            new_state_dict[name] = v
        # load params
        model.load_state_dict(new_state_dict)
        optimizer_state_dict = checkpoint['optimizer']
        '''

        checkpoint = torch.load(args.resume)
        state_dict = checkpoint['state_dict']
        model.load_state_dict(state_dict)
        optimizer_state_dict = checkpoint['optimizer']

        #model.load_state_dict(torch.load(args.resume))
        '''
        optimizer = get_optimizer(config, model)
        
        for p in model.model.parameters():
            p.requires_grad = False
        
        heatmap_data = model.heatmap.weight.data 
        model.heatmap = nn.Conv2d(1024, 16, 1, bias=False)
        model.offset = nn.Conv2d(1024, 16*2, 1, bias=False)
        model.heatmap.weight.data = heatmap_data
        model.offset.weight.data = torch.from_numpy(np.zeros_like(model.offset.weight.data)) 
        '''

        #model.model2 = None
        '''
        def conv_dw(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),
    
                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True),
            )

        model.model1_1_2 = conv_dw(256, 256, 1)
        model.model1_7_2 = conv_dw(512, 512, 1)
        '''
    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, config.MODEL.NUM_JOINTS,
         config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0]))
    #writer_dict['writer'].add_graph(model, (dump_input, ))

    gpus = [int(i) for i in config.GPUS.split(',')]
    #model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
    model.cuda()

    # define loss function (criterion) and optimizer

    criterion = JointsMSELossCoco(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT,
        heatmap_size=config.MODEL.EXTRA.HEATMAP_SIZE[0]).cuda()

    optimizer = get_optimizer(config, model)

    if optimizer_state_dict is not None:
        optimizer.load_state_dict(optimizer_state_dict)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        transforms.Compose([
            transforms.ToTensor(),
            #normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        False,
        transforms.Compose([
            transforms.ToTensor(),
            #normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict, oneDriveLogger,
              args.useOffset)

        filename = os.path.join(final_output_dir,
                                'epoch-{0}'.format(epoch + 1))
        torch.save(model.state_dict(), filename + '.model')
        lastestname = os.path.join(final_output_dir, 'lastest')
        torch.save(model.state_dict(), lastestname + '.model')
        if args.useOneDrive:
            torch.save(model.state_dict(),
                       'C:/Users/aoyag/OneDrive/pytorch/lastest.model')

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict, oneDriveLogger, args.useOffset)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    # the model is not wrapped in DataParallel in this example, so save its
    # state dict directly (model.module would raise an AttributeError here)
    torch.save(model.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
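The commented-out resume code in the example above strips the module. prefix that torch.nn.DataParallel prepends to parameter names. A small helper covering both the wrapped and unwrapped cases, as a sketch (the function name is hypothetical; the state_dict/optimizer keys match the save_checkpoint payload used in these examples):

from collections import OrderedDict

import torch

def load_resume_checkpoint(model, path):
    """Sketch: load a checkpoint saved from a possibly DataParallel-wrapped model."""
    checkpoint = torch.load(path, map_location='cpu')
    state_dict = checkpoint.get('state_dict', checkpoint)
    cleaned = OrderedDict()
    for k, v in state_dict.items():
        # drop the 'module.' prefix added by torch.nn.DataParallel, if present
        name = k[len('module.'):] if k.startswith('module.') else k
        cleaned[name] = v
    model.load_state_dict(cleaned)
    return checkpoint.get('optimizer')  # optimizer state dict if saved, else None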
Exemple #13
0
def validate(config,
             loader,
             dataset,
             model_dict,
             criterion_dict,
             output_dir,
             writer_dict,  # None
             rank):
    # only rank 0 process will enter this function
    device = torch.device('cuda', rank)
    for model in model_dict.values():
        model.eval()
    batch_time = AverageMeter()
    losses = AverageMeter()
    avg_acc = AverageMeter()

    nsamples = len(dataset) * 4
    is_aggre = config.NETWORK.AGGRE
    njoints = config.NETWORK.NUM_JOINTS
    height = int(config.NETWORK.HEATMAP_SIZE[0])
    width = int(config.NETWORK.HEATMAP_SIZE[1])
    all_preds = np.zeros((nsamples, njoints, 3), dtype=np.float32)
    all_heatmaps = np.zeros(
        (nsamples, njoints, height, width), dtype=np.float32)

    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input, target, weight, meta) in enumerate(loader):

            input = [view.to(device, non_blocking=False) for view in input]
            raw_features, aggre_features, _, _ = model_dict['base_model'](input)

            if is_aggre and config.TEST.FUSE_OUTPUT:
                output = fuse_routing(raw_features, aggre_features, is_aggre, meta)
            else:
                output = raw_features

            if config.TEST.FLIP_TEST:
                # only support MPII flip
                # input : a list of [N, 3, H, W]
                input_flipped = [torch.flip(view, dims=[3]).to(device, non_blocking=False) for view in input]
                raw_features_flipped, aggre_features_flipped, _, _ = model_dict['base_model'](input_flipped)

                if is_aggre and config.TEST.FUSE_OUTPUT:
                    output_flipped = fuse_routing(raw_features_flipped, aggre_features_flipped, is_aggre, meta)
                else:
                    output_flipped = raw_features_flipped
                output_flipped = flip_back_th(output_flipped, dataset.flip_pairs)

                if config.TEST.SHIFT_HEATMAP:
                    # in-place shift
                    for view in output_flipped:
                        view[:, :, :, 1:] = view.clone()[:, :, :, 0:-1]
                output = [(view + view_flipped)*0.5 for view, view_flipped in zip(output, output_flipped)]

            loss = 0
            target_cuda = []
            weight_cuda = []
            # loss on single view, with ground truth heat maps
            for t, w, r in zip(target, weight, raw_features):
                t = t.to(device, non_blocking=False)
                w = w.to(device, non_blocking=False)
                target_cuda.append(t)
                weight_cuda.append(w)
                loss += criterion_dict['mse_weights'](r, t, w)

            # loss on multivew h36m, consistent loss
            if is_aggre:
                if config.LOSS.USE_CONSISTENT_LOSS:
                    raw_h36m, agg_h36m = select_out_h36m(raw_features, aggre_features, meta)
                    assert len(raw_h36m[0]) == len(agg_h36m[0])
                    if len(raw_h36m[0]) != 0:
                        raw_h36m, agg_h36m = torch.cat(raw_h36m, dim=0), torch.cat(agg_h36m, dim=0)
                        loss += criterion_dict['mse'](raw_h36m, agg_h36m)

                if config.DATASET.PSEUDO_LABEL_PATH:
                    # mse loss on output, with pseudo heat maps
                    for t, w, o in zip(target_cuda, weight_cuda, output):
                        cal_loss = criterion_dict['mse_weights'](o, t, w)
                        loss += cal_loss * config.LOSS.MSE_LOSS_WEIGHT

            nimgs = len(input) * input[0].size(0)
            losses.update(loss.item(), nimgs)

            nviews = len(output)
            acc = [None] * nviews
            cnt = [None] * nviews
            pre = [None] * nviews
            for j in range(nviews):
                _, acc[j], cnt[j], pre[j] = accuracy(
                    output[j].detach().cpu().numpy(),
                    target_cuda[j].detach().cpu().numpy())
            acc = np.mean(acc)
            cnt = np.mean(cnt)
            avg_acc.update(acc, cnt)

            batch_time.update(time.time() - end)
            end = time.time()

            preds = np.zeros((nimgs, njoints, 3), dtype=np.float32)
            heatmaps = np.zeros(
                (nimgs, njoints, height, width), dtype=np.float32)
            for k, o, m in zip(range(nviews), output, meta):
                pred, maxval = get_final_preds(config,
                                               o.clone().cpu().numpy(),
                                               m['center'].numpy(),
                                               m['scale'].numpy())
                pred = pred[:, :, 0:2]
                pred = np.concatenate((pred, maxval), axis=2)
                preds[k::nviews] = pred
                heatmaps[k::nviews] = o.clone().cpu().numpy()

            all_preds[idx:idx + nimgs] = preds
            all_heatmaps[idx:idx + nimgs] = heatmaps
            idx += nimgs

            if i % config.PRINT_FREQ == 0 and rank == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(loader), batch_time=batch_time,
                          loss=losses, acc=avg_acc)
                logger.info(msg)

                for k in range(len(input)):
                    view_name = 'view_{}'.format(k + 1)
                    prefix = '{}_{}_{:08}'.format(
                        os.path.join(output_dir, 'validation'), view_name, i)
                    save_debug_images(config, input[k], meta[k], target_cuda[k],
                                      pre[k] * 4, output[k], prefix)

        perf_indicator = 1000
        if rank == 0:
            # save heatmaps and joint locations
            u2a = dataset.u2a_mapping
            u2a = {k: v for k, v in u2a.items() if v != '*'}
            sorted_u2a = sorted(u2a.items(), key=lambda x: x[0])
            u = np.array([mapping[0] for mapping in sorted_u2a])

            file_name = os.path.join(output_dir, 'heatmaps_locations_%s_%s.h5' % (dataset.subset, dataset.dataset_type))
            file = h5py.File(file_name, 'w')
            file['heatmaps'] = all_heatmaps[:, u, :, :]
            file['locations'] = all_preds[:, u, :]
            file['joint_names_order'] = u  # names order in union(mpii) dataset
            file.close()

            name_value, perf_indicator = dataset.evaluate(all_preds[:, u, :], output_dir if config.DEBUG.SAVE_ALL_PREDS else None)
            names = name_value.keys()
            values = name_value.values()
            num_values = len(name_value)
            _, full_arch_name = get_model_name(config)
            logger.info('| Arch ' +
                        ' '.join(['| {}'.format(name) for name in names]) + ' |')
            logger.info('|---' * (num_values + 1) + '|')
            logger.info('| ' + full_arch_name + ' ' +
                        ' '.join(['| {:.3f}'.format(value) for value in values]) +
                        ' |')

    return perf_indicator
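flip_back_th above mirrors the heatmaps predicted from horizontally flipped inputs back to the original orientation and swaps the symmetric joint channels. A tensor-based sketch of that operation, assuming flip_pairs is a list of (left, right) joint index pairs as in the MPII/COCO datasets; the function name here is hypothetical:

import torch

def flip_back_heatmaps(heatmaps, flip_pairs):
    """Sketch: undo a horizontal flip on heatmaps and swap symmetric joint channels."""
    # heatmaps: [N, K, H, W] predicted from horizontally flipped images
    heatmaps = torch.flip(heatmaps, dims=[3])  # mirror back along the width axis
    for left, right in flip_pairs:
        tmp = heatmaps[:, left].clone()
        heatmaps[:, left] = heatmaps[:, right]
        heatmaps[:, right] = tmp
    return heatmaps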
Exemple #14
0
def validate(config,
             val_loader,
             val_dataset,
             model,
             criterion,
             pointcri,
             anglecri,
             output_dir,
             tb_log_dir,
             writer_dict=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()
    #lossAngle = AverageMeter()
    lossPoint = AverageMeter()
    lossScore = AverageMeter()
    accPearson = AverageMeter()
    accMAE = AverageMeter()

    # switch to evaluate mode
    model.eval()

    num_samples = len(val_dataset)
    #all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 2), dtype=np.float32)        #ori landmark model
    all_preds = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                         dtype=np.float32)
    all_preds_point = np.zeros((num_samples, config.MODEL.NUM_JOINTS, 3),
                               dtype=np.float32)
    #all_boxes = np.zeros((num_samples, 6))
    all_boxes = np.zeros((num_samples, 22))
    all_boxes_point = np.zeros((num_samples, 6))
    image_path = []
    filenames = []
    imgnums = []
    idx = 0
    with torch.no_grad():
        end = time.time()
        for i, (input, target, target_weight, meta,
                points) in enumerate(val_loader):
            # compute output
            outputs = model(input)

            # both branches were identical, so keep the raw outputs as-is
            # (for a list of intermediate heatmaps, outputs[-1] would pick the last stage)
            output = outputs
            # output = output[0]
            #import pdb
            #pdb.set_trace()
            if config.TEST.FLIP_TEST:
                # this part is ugly, because pytorch has not supported negative index
                # input_flipped = model(input[:, :, :, ::-1])
                input_flipped = np.flip(input.cpu().numpy(), 3).copy()
                input_flipped = torch.from_numpy(input_flipped).cuda()
                outputs_flipped = model(input_flipped)

                if isinstance(outputs_flipped, list):
                    output_flipped = outputs_flipped[0]
                else:
                    output_flipped = outputs_flipped

                output_flipped = flip_back(output_flipped.cpu().numpy(),
                                           val_dataset.flip_pairs)
                output_flipped = torch.from_numpy(output_flipped.copy()).cuda()

                # feature is not aligned, shift flipped heatmap for higher accuracy
                if config.TEST.SHIFT_HEATMAP:
                    output_flipped[:, :, :, 1:] = \
                        output_flipped.clone()[:, :, :, 0:-1]

                output = (output + output_flipped) * 0.5

            target = target.cuda(non_blocking=True)
            target_weight = target_weight.cuda(non_blocking=True)
            points = points.cuda(non_blocking=True)
            input_w = config.MODEL.IMAGE_SIZE[0]
            input_h = config.MODEL.IMAGE_SIZE[1]
            scoreloss = criterion(output, target, target_weight)
            #pointloss = pointcri(output, points, input_w, input_h)
            aa = 1
            #loss = 1*angleloss + 0.1*pointloss + 0*scoreloss
            #loss = (1-aa)*pointloss + aa*scoreloss
            loss = scoreloss
            num_images = input.size(0)
            # measure accuracy and record loss
            losses.update(loss.item(), num_images)
            #lossPoint.update(pointloss.item(), input.size(0))
            lossScore.update(scoreloss.item(), input.size(0))
            _, avg_acc, cnt, pred = accuracy(output.cpu().numpy(),
                                             target.cpu().numpy())

            acc.update(avg_acc, cnt)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            c = meta['center'].numpy()
            s = meta['scale'].numpy()
            r = meta['rotation'].numpy()
            score = meta['score'].numpy()
            w_rate = meta['w_rate']
            h_rate = meta['h_rate']
            box_list = meta['box_list'].numpy()
            id = meta['id'].numpy()
            joints_vis = meta['joints_vis'][:, :,
                                            0].numpy()  #shape = [num_joints]

            scoremap_height = output.shape[2]
            scoremap_width = output.shape[3]
            preds, maxvals = get_final_preds(config,
                                             output.clone().cpu().numpy(), c,
                                             s)
            all_preds[idx:idx + num_images, :, 0:2] = preds[:, :, 0:2]
            all_preds[idx:idx + num_images, :, 2:3] = maxvals
            # double check this all_boxes parts

            all_boxes[idx:idx + num_images, 0:2] = c[:, 0:2]
            all_boxes[idx:idx + num_images, 2:4] = s[:, 0:2]
            all_boxes[idx:idx + num_images, 4] = np.prod(s * 200, 1)
            all_boxes[idx:idx + num_images, 5] = score

            all_boxes[idx:idx + num_images, 6:9] = box_list[:, 0:3]
            all_boxes[idx:idx + num_images, 9] = id
            all_boxes[idx:idx + num_images, 10:22] = joints_vis[:, 0:12]

            image_path.extend(meta['image'])
            #import pdb
            #pdb.set_trace()
            idx += num_images
            if i % config.PRINT_FREQ == 0:
                msg = 'Test: [{0}/{1}]\t' \
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' \
                      'lossScore {scoreloss.val:.5f} ({scoreloss.avg:.5f})\t' \
                      'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                      'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time,
                          scoreloss=lossScore, loss=losses, acc=acc)
                logger.info(msg)

                prefix = '{}_{}'.format(os.path.join(output_dir, 'val'), i)
                save_debug_images(config, input, meta, target, pred * 4,
                                  output, prefix)
        #import pdb
        #pdb.set_trace()
        name_values, perf_indicator = val_dataset.evaluate(
            config, all_preds, output_dir, all_boxes, image_path, filenames,
            imgnums)
        _, full_arch_name = get_model_name(config)
        if isinstance(name_values, list):
            for name_value in name_values:
                _print_name_value(name_value, full_arch_name)
        else:
            _print_name_value(name_values, full_arch_name)

        if writer_dict:
            writer = writer_dict['writer']
            global_steps = writer_dict['valid_global_steps']
            writer.add_scalar('valid_loss', losses.avg, global_steps)
            writer.add_scalar('valid_acc', acc.avg, global_steps)

            if isinstance(name_values, list):
                for name_value in name_values:
                    writer.add_scalars('valid', dict(name_value), global_steps)
            else:
                writer.add_scalars('valid', dict(name_values), global_steps)

            writer_dict['valid_global_steps'] = global_steps + 1

    return perf_indicator
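The AverageMeter objects used throughout these snippets follow the standard PyTorch ImageNet-example pattern of tracking the latest value and a running average, consistent with the update(val, n), .val and .avg usage above. A minimal sketch:

class AverageMeter(object):
    """Tracks the current value and the running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        # val is the batch statistic, n the number of samples it covers
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0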
Exemple #15
0
def main():
    args = parse_args()
    reset_config(config, args)

    if args.prevModelDir and args.modelDir:
        # copy pre models for philly
        copy_prev_models(args.prevModelDir, args.modelDir)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.'+config.MODEL.NAME+'.get_pose_net')(
        config, is_train=True
    )

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand((config.TRAIN.BATCH_SIZE,
                             3,
                             config.MODEL.IMAGE_SIZE[1],
                             config.MODEL.IMAGE_SIZE[0]))
    #writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)
        
    #logger.info(get_model_summary(model, dump_input))
    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
    train_layer = [
        'module.final_sigmoid', 'module.dropout',
        'module.final_linear.weight', 'module.final_linear.bias',
        'module.output_conv_final.weight',
        'module.final_conv.weight', 'module.final_conv.bias',
    ]
    #train_layer = ['module.final_layer_one.weight', 'module.final_layer_one.bias',
    #               'module.final_linear.weight', 'module.final_linear.bias',
    #               'module.out_linear.weight', 'module.out_linear.bias',
    #               'module.output_conv_final.weight']

    #for k,v in model.named_parameters():
        #if k not in train_layer:
            #v.requires_grad=False
    
    #import pdb
    #pdb.set_trace()
    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT
    ).cuda()
    anglecri = AnglesLoss(use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()
    pointcri = PointsLoss(use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    normalize = transforms.Normalize(mean=[0.43, 0.43, 0.43], std=[0.223, 0.223, 0.223]) #for SpineWeb CLAHE
    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR
    )
    train_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )
    valid_dataset = eval('dataset.'+config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE*len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True
    )

    best_perf = 0.0
    best_model = True
    last_epoch = -1
    optimizer = get_optimizer(config, model)
    begin_epoch = config.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(
        final_output_dir, 'checkpoint.pth'
    )

    if config.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
        last_epoch=last_epoch
    )

    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(config, train_loader, model, criterion, pointcri, anglecri,
              optimizer, epoch, final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(
            config, valid_loader, valid_dataset, model, criterion, pointcri,
            anglecri, final_output_dir, tb_log_dir, writer_dict
        )

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)
        if best_model:
            best_model_state_file = os.path.join(final_output_dir,
                                                 'best_state.pth.tar')
            logger.info('saving best model state to {}'.format(
                best_model_state_file))
            torch.save(model.module.state_dict(), best_model_state_file)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
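save_checkpoint in these examples persists the state dictionary assembled above and keeps a separate copy of the best-performing model. A sketch of that behaviour, assuming the conventional checkpoint.pth.tar and model_best.pth.tar file names (this last example resumes from checkpoint.pth, so the exact names depend on the repository's own utility):

import os
import shutil

import torch

def save_checkpoint(states, is_best, output_dir, filename='checkpoint.pth.tar'):
    """Sketch: write the rolling checkpoint and keep a copy of the best model."""
    checkpoint_path = os.path.join(output_dir, filename)
    torch.save(states, checkpoint_path)
    if is_best:
        # duplicate the current checkpoint as the best-so-far snapshot
        shutil.copyfile(checkpoint_path,
                        os.path.join(output_dir, 'model_best.pth.tar'))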