    def test(self,
             task,
             seed,
             iteration_num,
             render,
             load_iter=None,
             debug=False):
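        """Evaluate on `task` for the given number of iterations (episodes),
        adapting the learned model `self.theta` online from the last M
        transitions of the current rollout before each planning step."""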
        set_seed(seed)
        iteration_num = int(iteration_num)

        gt.reset()
        gt.set_def_unique(False)
        start_iter = 0

        params = self.logger.load_params(load_iter)
        start_iter = self._set_params(params)
        self.theta.train()
        extra_data = self.logger.load_extra_data()
        self._set_extra_data(extra_data)

        rollout = []
        state = task.reset()
        self.controller.set_task(task)

        for i in gt.timed_for(range(start_iter, iteration_num),
                              save_itrs=True):

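            # roll out one full episode on the task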
            t = 0
            done = False
            reward_sum = 0
            state = task.reset()
            while not done:
                past_traj = [r[-self.M:] for r in rollout]

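                # adapt the model with a few gradient steps on the last M transitions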
                if past_traj != []:
                    for _ in range(self.adaptation_update_num):
                        loss = self._compute_adaptation_loss(
                            self.theta, past_traj)
                        zero_grad(self.theta.parameters())
                        self._meta_update(loss)

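                # plan an action with the (adapted) model and step the environment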
                action = self.controller.plan(self.theta, state, None, debug)
                next_state, reward, done, _ = task.step(action)
                reward_sum += reward

                if render:
                    task.render()

                if action.shape == ():
                    action = [action]

                rollout = _aggregate_rollout(rollout, state, action,
                                             next_state)
                state = next_state
                t += 1

                if done:
                    rollout = []
                    state = task.reset()

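            # episode finished: report the episodic return and trajectory length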
            print('Iteration:', i, 'Reward:', reward_sum, 'Traj len:', t)
Example #2
from models.base_block_depth import FeatClassifier, BaseClassifier
from models.resnet18_inception_depth_4 import resnet18_inception_depth_4
from models.resnet18_self_attention_depth_34 import resnet18_self_attention_depth_34
from models.resnet18_self_attention_depth_34_version2 import resnet18_self_attention_depth_34_version2
from models.resnet18_inception_depth_4_wrap import resnet18_inception_depth_4_wrap
from models.ours import ours
from models.resnet_depth import resnet_depth
from models.resnet_attention import resnet_attention 
from models.resnet18_self_mutual_attention import resnet18_self_mutual_attention
'''
from batch_engine import valid_trainer, batch_trainer
from models.base_block import FeatClassifier, BaseClassifier
from models.resnet18_depth import resnet18_depth
from dataset.AttrDataset_depth import AttrDataset, get_transform
'''
set_seed(605)


def main(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.device
    print('load the model from: ' + args.save_path)
    exp_dir = os.path.join(args.save_path, args.dataset, args.dataset, 'img_model/ckpt_max.pth')
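    # build image transforms and the validation split for attribute evaluation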
    train_tsfm, valid_tsfm = get_transform(args)
    #pdb.set_trace()
    valid_set = AttrDataset(args=args, split=args.valid_split, transform=valid_tsfm)

    valid_loader = DataLoader(
        dataset=valid_set,
        batch_size=args.batchsize,
        shuffle=False,
        num_workers=4,
    )
Example #3
def main():
    assert torch.cuda.is_available(), 'need gpu to train network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from train_config import config
    log_dir = os.path.join(args.work_dir, 'log')
    checkpoint_dir = os.path.join(args.work_dir, 'checkpoints')
    resume_model = os.path.join(checkpoint_dir, 'latest.pth')

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    global logger
    logger = get_logger('train', log_dir)

    set_seed(config.seed)

    local_rank = args.local_rank
    # start init process
    if config.distributed:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        torch.cuda.set_device(local_rank)

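    # seed each dataloader worker deterministically (based on base seed, local rank and worker count)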
    init_fn = functools.partial(worker_seed_init_fn,
                                num_workers=config.num_workers,
                                local_rank=local_rank,
                                seed=config.seed)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        config.train_dataset, shuffle=True) if config.distributed else None
    collater = SegmentationCollater()
    train_loader = DataLoader(config.train_dataset,
                              batch_size=config.batch_size,
                              shuffle=(train_sampler is None),
                              pin_memory=True,
                              num_workers=config.num_workers,
                              collate_fn=collater.next,
                              sampler=train_sampler,
                              worker_init_fn=init_fn)
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        config.val_dataset, shuffle=False) if config.distributed else None
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers,
                            collate_fn=collater.next,
                            sampler=val_sampler)

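    # log every scalar config entry (model/criterion/dataset objects are skipped), on the main process only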
    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in [
                    'model', 'criterion', 'decoder', 'train_dataset',
                    'val_dataset'
            ]:
                log_info = f'{key}: {value}'
                logger.info(log_info) if (
                    config.distributed
                    and local_rank == 0) or not config.distributed else None

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    logger.info(log_info) if (config.distributed and local_rank
                              == 0) or not config.distributed else None

    model = config.model.cuda()
    criterion = config.criterion.cuda()
    decoder = config.decoder.cuda()

    # parameters need to be updated by the optimizer
    # buffers don't need to be updated by the optimizer
    for name, param in model.named_parameters():
        log_info = f'name: {name}, grad: {param.requires_grad}'
        logger.info(log_info) if (config.distributed and local_rank
                                  == 0) or not config.distributed else None

    for name, buffer in model.named_buffers():
        log_info = f'name: {name}, grad: {buffer.requires_grad}'
        logger.info(log_info) if (config.distributed and local_rank
                                  == 0) or not config.distributed else None

    optimizer = build_optimizer(config, model)
    scheduler = build_scheduler(config, optimizer)
    model = build_training_mode(config, model, optimizer)

    start_epoch = 1
    # automatically resume model for training if checkpoint model exist
    if os.path.exists(resume_model):
        checkpoint = torch.load(resume_model, map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        saved_epoch = checkpoint['epoch']
        start_epoch += saved_epoch
        test_loss, lr = checkpoint['test_loss'], checkpoint['lr']

        log_info = f'resuming model from {resume_model}. resume_epoch: {saved_epoch}, test_loss: {test_loss:.4f}, lr: {lr:.6f}'
        logger.info(log_info) if (config.distributed and local_rank
                                  == 0) or not config.distributed else None

    # calculate training time
    start_time = time.time()
    best_test_loss = 100000000.

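    # train epoch by epoch, evaluate at the configured epochs, and keep 'latest' and 'best' checkpoints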
    for epoch in range(start_epoch, config.epochs + 1):
        torch.cuda.empty_cache()
        train_sampler.set_epoch(epoch) if config.distributed else None
        loss = train_segmentation(train_loader, model, criterion, optimizer,
                                  scheduler, epoch, logger, config)
        log_info = f'train: epoch {epoch:0>3d}, total_loss: {loss:.4f}'
        logger.info(log_info) if (config.distributed and local_rank
                                  == 0) or not config.distributed else None

        test_loss = None
        if epoch in config.eval_epoch or epoch == config.epochs:
            test_loss = compute_segmentation_test_loss(val_loader, model,
                                                       criterion)
            log_info = f'eval: epoch: {epoch:0>3d}, test_loss: {test_loss:.4f}'
            logger.info(log_info) if (config.distributed and local_rank
                                      == 0) or not config.distributed else None

        if (config.distributed and local_rank == 0) or not config.distributed:
            # save best test loss model and each epoch checkpoint
            if test_loss and test_loss < best_test_loss:
                torch.save(model.module.state_dict(),
                           os.path.join(checkpoint_dir, 'best.pth'))
                best_test_loss = test_loss

            torch.save(
                {
                    'epoch': epoch,
                    'test_loss': best_test_loss,
                    'lr': scheduler.get_lr()[0],
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, os.path.join(checkpoint_dir, 'latest.pth'))

    if (config.distributed and local_rank == 0) or not config.distributed:
        if os.path.exists(os.path.join(checkpoint_dir, 'best.pth')):
            os.rename(
                os.path.join(checkpoint_dir, 'best.pth'),
                os.path.join(
                    checkpoint_dir,
                    f'{config.network}-epoch{epoch}-best_test_loss{best_test_loss:.3f}.pth'
                ))

    training_time = (time.time() - start_time) / 3600
    flops, params = compute_flops_and_params(config, model)
    log_info = f'train done. model: {config.network}, flops: {flops}, params: {params}, training time: {training_time:.3f} hours, best_test_loss: {best_test_loss:.3f}'
    logger.info(log_info) if (config.distributed and local_rank
                              == 0) or not config.distributed else None
Example #4
def main(config):
    os.environ['CUDA_VISIBLE_DEVICES'] = config.GPU

    if not config.EVAL_MODE:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(config.OUTPUT, 'log_test.txt'))
    print("==========\nConfig:{}\n==========".format(config))
    print("Currently using GPU {}".format(config.GPU))
    # Set random seed
    set_seed(config.SEED)

    # Build dataloader
    trainloader, queryloader, galleryloader, num_classes = build_dataloader(
        config)
    # Build model
    model, classifier = build_model(config, num_classes)
    # Build classification and pairwise loss
    criterion_cla, criterion_pair = build_losses(config)
    # Build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    if config.TRAIN.OPTIMIZER.NAME == 'adam':
        optimizer = optim.Adam(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'adamw':
        optimizer = optim.AdamW(
            parameters,
            lr=config.TRAIN.OPTIMIZER.LR,
            weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY)
    elif config.TRAIN.OPTIMIZER.NAME == 'sgd':
        optimizer = optim.SGD(parameters,
                              lr=config.TRAIN.OPTIMIZER.LR,
                              momentum=0.9,
                              weight_decay=config.TRAIN.OPTIMIZER.WEIGHT_DECAY,
                              nesterov=True)
    else:
        raise KeyError("Unknown optimizer: {}".format(
            config.TRAIN.OPTIMIZER.NAME))
    # Build lr_scheduler
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=config.TRAIN.LR_SCHEDULER.STEPSIZE,
        gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE)

    start_epoch = config.TRAIN.START_EPOCH
    if config.MODEL.RESUME:
        print("Loading checkpoint from '{}'".format(config.MODEL.RESUME))
        checkpoint = torch.load(config.MODEL.RESUME)
        model.load_state_dict(checkpoint['state_dict'])
        start_epoch = checkpoint['epoch']

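    # wrap model and classifier for single-node multi-GPU execution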
    model = nn.DataParallel(model).cuda()
    classifier = nn.DataParallel(classifier).cuda()

    if config.EVAL_MODE:
        print("Evaluate only")
        test(model, queryloader, galleryloader)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")
    for epoch in range(start_epoch, config.TRAIN.MAX_EPOCH):
        start_train_time = time.time()
        train(epoch, model, classifier, criterion_cla, criterion_pair,
              optimizer, trainloader)
        train_time += round(time.time() - start_train_time)

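        # evaluate at the configured interval (and at the final epoch), saving a checkpoint and tracking the best Rank-1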
        if (((epoch + 1) > config.TEST.START_EVAL and config.TEST.EVAL_STEP > 0
             and (epoch + 1) % config.TEST.EVAL_STEP == 0)
                or (epoch + 1) == config.TRAIN.MAX_EPOCH):
            print("==> Test")
            rank1 = test(model, queryloader, galleryloader)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            state_dict = model.module.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(config.OUTPUT,
                         'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))
        scheduler.step()

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(
        best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print(
        "Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".
        format(elapsed, train_time))
Example #5
def main():
    assert torch.cuda.is_available(), 'need gpu to evaluate network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from test_config import config
    log_dir = os.path.join(args.work_dir, 'log')

    set_seed(config.seed)

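    # evaluation dataloader: fixed order, custom collate for segmentation batches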
    collater = SegmentationCollater()
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers,
                            collate_fn=collater.next)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    logger = get_logger('test', log_dir)

    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in [
                    'model', 'criterion', 'decoder', 'train_dataset',
                    'val_dataset'
            ]:
                log_info = f'{key}: {value}'
                logger.info(log_info)

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    logger.info(log_info)

    model = config.model
    decoder = config.decoder

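    # load trained weights onto CPU first; the model is moved to GPU below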
    if config.trained_model_path:
        saved_model = torch.load(os.path.join(BASE_DIR,
                                              config.trained_model_path),
                                 map_location=torch.device('cpu'))
        model.load_state_dict(saved_model)

    flops, params = compute_flops_and_params(config, model)
    log_info = f'model: {config.network}, flops: {flops}, params: {params}'
    logger.info(log_info)

    model = model.cuda()
    decoder = decoder.cuda()
    model = nn.DataParallel(model)

    result_dict = validate_segmentation(config.val_dataset, val_loader, model,
                                        decoder, config)
    log_info = 'eval_result: '
    if result_dict:
        for key, value in result_dict.items():
            log_info += f'{key}: {value}, '
    else:
        log_info += 'no target detected in testset images!'
    logger.info(log_info)

    return
Example #6
def main():
    assert torch.cuda.is_available(), 'need gpu to evaluate network!'
    torch.cuda.empty_cache()

    args = parse_args()
    sys.path.append(args.work_dir)
    from test_config import config
    log_dir = os.path.join(args.work_dir, 'log')

    set_seed(config.seed)

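    # initialize the NCCL process group when evaluating in distributed mode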
    local_rank = args.local_rank
    # start init process
    if config.distributed:
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        torch.cuda.set_device(local_rank)

    val_sampler = torch.utils.data.distributed.DistributedSampler(
        config.val_dataset, shuffle=False) if config.distributed else None
    val_loader = DataLoader(config.val_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=False,
                            num_workers=config.num_workers,
                            sampler=val_sampler)

    if (config.distributed and local_rank == 0) or not config.distributed:
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

    global logger
    logger = get_logger('test', log_dir)

    for key, value in config.__dict__.items():
        if not key.startswith('__'):
            if key not in ['model', 'criterion']:
                log_info = f'{key}: {value}'
                logger.info(log_info) if (
                    config.distributed
                    and local_rank == 0) or not config.distributed else None

    gpus_type, gpus_num = torch.cuda.get_device_name(), torch.cuda.device_count()
    log_info = f'gpus_type: {gpus_type}, gpus_num: {gpus_num}'
    logger.info(log_info) if (config.distributed and local_rank
                              == 0) or not config.distributed else None

    model = config.model
    criterion = config.criterion

    if config.trained_model_path:
        saved_model = torch.load(os.path.join(BASE_DIR,
                                              config.trained_model_path),
                                 map_location=torch.device('cpu'))
        model.load_state_dict(saved_model)

    flops, params = compute_flops_and_params(config, model)
    log_info = f'model: {config.network}, flops: {flops}, params: {params}'
    logger.info(log_info) if (config.distributed and local_rank
                              == 0) or not config.distributed else None

    model = model.cuda()
    criterion = criterion.cuda()
    if config.distributed:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank)
    else:
        model = nn.DataParallel(model)

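    # run validation and report top-1/top-5 accuracy, loss, and per-image load/inference time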
    top1, top5, loss, per_image_load_time, per_image_inference_time = validate_classification(
        val_loader, model, criterion, config)
    log_info = f'top1: {top1:.3f}%, top5: {top5:.3f}%, loss: {loss:.4f}, per_image_load_time: {per_image_load_time:.3f}ms, per_image_inference_time: {per_image_inference_time:.3f}ms'
    logger.info(log_info) if (config.distributed and local_rank
                              == 0) or not config.distributed else None

    return
Example #7
    if args.tb_path == '':
        args.tb_path = args.save
    writer = SummaryWriter(args.tb_path)

    utils.set_logging(args.save)

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    cudnn.benchmark = True
    cudnn.enabled = True

    if config.train_params.use_seed:
        utils.set_seed(config.train_params.seed)

    logging.info("args = %s", args)
    logging.info('Training with config:')
    logging.info(pprint.pformat(config))

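    # rebuild the derived network from the saved net_config and report its parameter count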
    config.net_config, net_type = utils.load_net_config(
        os.path.join(args.load_path, 'net_config'))
    derivedNetwork = getattr(model_derived, '%s_Net' % net_type.upper())
    model = derivedNetwork(config.net_config, config=config)

    model.eval()
    if hasattr(model, 'net_config'):
        logging.info("Network Structure: \n" +
                     '|\n'.join(map(str, model.net_config)))
    params = utils.count_parameters_in_MB(model)