Example 1
def run_experiment(alg_name, map_id, specs_id, num_times, steps, r_good,
                   multi_ag, show_print, render):
    # configuration of testing params
    testing_params = TestingParameters()

    # configuration of learning params
    training_params = TrainingParameters()

    # Setting up the experiment
    tester = Tester(training_params, testing_params, map_id, specs_id,
                    multi_ag)

    # Setting up the curriculum learner
    curriculum = CurriculumLearner(tester.specs,
                                   total_steps=steps,
                                   r_good=r_good)

    # Setting up the saver
    saver = Saver(alg_name, tester, curriculum)

    # Baseline 1 (Decentralized LTL DQN)
    if alg_name == "i-dqn-l":
        i_dqn_l.run_experiments(tester, curriculum, saver, num_times,
                                show_print, render)
    # Otherwise fall back to i_lpopl
    else:
        i_lpopl.run_experiments(tester, curriculum, saver, num_times,
                                show_print, render)
Example 2
    def __init__(self):
        super().__init__()
        now_time = time.strftime('%Y-%m-%d-%H-%M', time.localtime(time.time()))
        logger_path = os.path.join(
            self.args.training.save_dir,
            self.args.dataset.dataset_train,
            self.args.models.model_warpper,
            self.args.training.experiment_id,
            '%s.log' % now_time
        )
        set_logger_path(logger_path)
        logger.info(self.args)

        # Define Saver
        self.saver = Saver(self.args)

        # Define Tensorboard Summary
        self.summary = TensorboardSummary()
        self.writer = self.summary.create_summary(self.saver.experiment_dir, self.args.models)


        self.init_training_container()
        self.batchsize = self.args.training.batchsize
        self.reset_batchsize()
        self.evaluator = Evaluator()
        self.best = 0.0

        # show parameters to be trained
        logger.debug('\nTraining params:')
        for p in self.model.named_parameters():
            if p[1].requires_grad:
                logger.debug(p[0])
        logger.debug('\n')

        # Clear start epoch if fine-tuning
        logger.info('Starting iteration: %d' % self.start_it)
        logger.info('Total iterations: %d' % self.args.training.max_iter)
Example 3
def main():

    args = parse_args()
    args.pretrain = False

    root_path = 'exps/exp_{}'.format(args.exp)

    if not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate

    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root,
                                               args.train_list)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = VNet(args.n_channels, args.n_classes).cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)

    model = torch.nn.DataParallel(model)

    model.train()

    if args.resume is None:
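        # initialize from the pretrained checkpoint at args.load_path, keeping only keys present in the current model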
        assert os.path.exists(args.load_path)
        state_dict = model.state_dict()
        print("Loading weights...")
        pretrain_state_dict = torch.load(args.load_path,
                                         map_location="cpu")['state_dict']

        for k in list(pretrain_state_dict.keys()):
            if k not in state_dict:
                del pretrain_state_dict[k]
        model.load_state_dict(pretrain_state_dict)
        print("Loaded weights")
    else:
        print("Resuming from {}".format(args.resume))
        checkpoint = torch.load(args.resume, map_location="cpu")

        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        model.load_state_dict(checkpoint['state_dict'])

    logger = Logger(root_path)
    saver = Saver(root_path)

    for epoch in range(args.start_epoch, args.epochs):
        train(model, train_loader, optimizer, logger, args, epoch)
        validate(model, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)
Example 4
def main():

    args = parse_args()
    if args.turnon < 0:
        args.pretrain = True
    else:
        args.pretrain = False
    print("Using GPU: {}".format(args.local_rank))
    root_path = 'exps/exp_{}'.format(args.exp)
    if args.local_rank == 0 and not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args.dataset,
                                               args.data_root,
                                               args.train_list,
                                               sampling=args.sampling)
    args.world_size = len(args.gpu.split(","))
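    # multi-GPU run: bind this process to its GPU, initialize the NCCL process group, and shard the training set with a DistributedSampler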
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=len(args.gpu.split(",")),
            rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               sampler=train_sampler,
                                               num_workers=args.num_workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = VNet(args.n_channels, args.n_classes, input_size=64,
                 pretrain=True).cuda(args.local_rank)
    model_ema = VNet(args.n_channels,
                     args.n_classes,
                     input_size=64,
                     pretrain=True).cuda(args.local_rank)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)
    if args.world_size > 1:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
        model_ema = DDP(model_ema,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank,
                        find_unused_parameters=True)

    model.train()
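    # start the EMA model from an exact copy of the online model's weights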
    model_ema.load_state_dict(model.state_dict())
    print("Loaded weights")

    logger = Logger(root_path)
    saver = Saver(root_path, save_freq=args.save_freq)
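    # pick the MoCo-style contrast module that matches the chosen sampling strategy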
    if args.sampling == 'default':
        contrast = RGBMoCo(128, K=4096,
                           T=args.temperature).cuda(args.local_rank)
    elif args.sampling == 'layerwise':
        contrast = RGBMoCoNew(128, K=4096,
                              T=args.temperature).cuda(args.local_rank)
    else:
        raise ValueError("unsupported sampling method")
    criterion = torch.nn.CrossEntropyLoss()

    flag = False
    for epoch in range(args.start_epoch, args.epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        train(model, model_ema, train_loader, optimizer, logger, saver, args,
              epoch, contrast, criterion)
        validate(model_ema, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)
Example 5
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           pin_memory=True,
                                           drop_last=True)

val_loader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size=1,
                                         num_workers=args.num_workers,
                                         pin_memory=True)
model = VNet(args.n_channels, args.n_classes, input_size=64,
             pretrain=True).cuda()
model_ema = VNet(args.n_channels, args.n_classes, input_size=64,
                 pretrain=True).cuda()

optimizer = torch.optim.SGD(model.parameters(),
                            lr=args.lr,
                            momentum=0.9,
                            weight_decay=0.0005)
model = torch.nn.DataParallel(model)
model_ema = torch.nn.DataParallel(model_ema)
model_ema.load_state_dict(model.state_dict())
print("Model Initialized")
logger = Logger(root_path)
saver = Saver(root_path, save_freq=args.save_freq)
if args.sampling == 'default':
    contrast = RGBMoCo(128, K=4096, T=args.temperature).cuda()
else:
    raise ValueError("unsupported sampling method")
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(args.start_epoch, args.epochs):
    pretrain_jigsaw(model, model_ema, train_loader, optimizer, logger, saver,
                    args, epoch, contrast, criterion)
    adjust_learning_rate(args, optimizer, epoch)
Example 6
def main():

    args = parse_args()
    args.pretrain = True
    print("Using GPU: {}".format(args.local_rank))

    root_path = 'exps/exp_{}'.format(args.exp)

    if not os.path.exists(root_path) and args.local_rank == 0:
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate
    batch_size = 1
    max_iterations = 40000

    cell_size = 96  # size of volume we crop patch from
    patch_size = 64
    puzzle_config = 3  # 2 or 3 for 2X2X2 or 3X3X3 puzzle
    puzzle_num = puzzle_config**3
    feature_len = 256
    iter_num = 0
    sr_feature_size = 32
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args)
    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=len(args.gpu.split(",")),
            rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               sampler=train_sampler,
                                               num_workers=args.num_workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = VNet(args.n_channels, args.n_classes, input_size=64,
                 pretrain=True).cuda(args.local_rank)
    model_ema = VNet(args.n_channels,
                     args.n_classes,
                     input_size=64,
                     pretrain=True).cuda(args.local_rank)
    assert os.path.exists(args.load_path)

    state_dict = model.state_dict()
    print("Loading weights...")
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)

    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)
    if args.world_size > 1:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
        model_ema = DDP(model_ema,
                        device_ids=[args.local_rank],
                        output_device=args.local_rank,
                        find_unused_parameters=True)

    model.train()

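    # load the same pretrained checkpoint into both the online model and the EMA model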
    pretrain_state_dict = torch.load(args.load_path,
                                     map_location="cpu")['state_dict']

    model.load_state_dict(pretrain_state_dict)
    model_ema.load_state_dict(pretrain_state_dict)
    print("Loaded weights")

    logger = Logger(root_path)
    saver = Saver(root_path, save_freq=args.save_freq)
    contrast = RGBMoCo(128, K=512).cuda(args.local_rank)
    criterion = torch.nn.CrossEntropyLoss()
    for epoch in range(args.start_epoch, args.epochs):
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        adapt(model, model_ema, train_loader, optimizer, logger, saver, args,
              epoch, contrast, criterion)
        adjust_learning_rate(args, optimizer, epoch)
Example 7
class Trainer(BaseContainer):
    def __init__(self):
        super().__init__()
        now_time = time.strftime('%Y-%m-%d-%H-%M', time.localtime(time.time()))
        logger_path = os.path.join(
            self.args.training.save_dir,
            self.args.dataset.dataset_train,
            self.args.models.model_warpper,
            self.args.training.experiment_id,
            '%s.log' % now_time
        )
        set_logger_path(logger_path)
        logger.info(self.args)

        # Define Saver
        self.saver = Saver(self.args)

        # Define Tensorboard Summary
        self.summary = TensorboardSummary()
        self.writer = self.summary.create_summary(self.saver.experiment_dir, self.args.models)


        self.init_training_container()
        self.batchsize = self.args.training.batchsize
        self.reset_batchsize()
        self.evaluator = Evaluator()
        self.best = 0.0

        # show parameters to be trained
        logger.debug('\nTraining params:')
        for p in self.model.named_parameters():
            if p[1].requires_grad:
                logger.debug(p[0])
        logger.debug('\n')

        # Clear start epoch if fine-tuning
        logger.info('Starting iteration: %d' % self.start_it)
        logger.info('Total iterations: %d' % self.args.training.max_iter)

    # main function for training
    def training(self):
        self.model.train()

        num_img_tr = len(self.train_loader)
        logger.info('\nTraining')

        max_iter = self.args.training.max_iter
        it = self.start_it

        # support multiple optimizers, but only one 
        # optimizer is used here, i.e., names = ['match']
        names = self.args.training.optimizer.keys()

        while it < max_iter:
            for samples in self.train_loader:
                samples = to_cuda(samples)

                # validation
                val_iter = self.args.training.get('val_iter', -1)
                if val_iter > 0 and it % val_iter == 0 and it >= self.args.training.get('start_eval_it', 15000):
                    self.validation(it, 'val')
                    self.model.train()

                if it % 100 == 0:
                    logger.info('\n===> Iteration  %d/%d' % (it, max_iter))
    
                # update class weights
                weight_update_iter = self.args.training.get('weight_update_iter', -1)
                if it >= 500 and weight_update_iter > 0 and it % weight_update_iter == 0:
                    self.model.update_hard()
                    logger.info('\nUpdate hard ID: %.3f'%self.model.center.ratio)
                    self.writer.add_scalar('train/data_ratio', self.model.center.ratio, it)

                # one forward/backward/update step per configured optimizer
                for name in names:
                    self.optimizer[name].zero_grad()
                    outputs = self.model(samples, type=name)
                    losses = self.criterion(outputs, name)
                    loss = losses['loss']
                    loss.backward()
                    self.optimizer[name].step()

                    # log training loss
                    if it % 100 == 0:
                        loss_log_str = '=>%s   loss: %.4f'%(name, loss.item())
                        for loss_name in losses.keys():
                            if loss_name != 'loss':
                                loss_log_str += '    %s: %.4f'%(loss_name, losses[loss_name])
                                self.writer.add_scalar('train/%s_iter'%loss_name, losses[loss_name], it)
                        logger.info(loss_log_str)
                        self.writer.add_scalar('train/total_loss_iter_%s'%name, loss.item(), it)

                    # adjust learning rate
                    lr_decay_iter = self.args.training.optimizer[name].get('lr_decay_iter', None)
                    if lr_decay_iter is not None:
                        for i in range(len(lr_decay_iter)):
                            if it == lr_decay_iter[i]:
                                lr = self.args.training.optimizer[name].lr * (self.args.training.optimizer[name].lr_decay ** (i+1))
                                logger.info('\nReduce lr to %.6f\n'%(lr))
                                for param_group in self.optimizer[name].param_groups:
                                    param_group["lr"] = lr 
                                break

                it += 1

                # save model and optimizer
                if it % self.args.training.save_iter == 0 or it == max_iter or it == 1:
                    logger.info('\nSaving checkpoint ......')
                    optimizer_to_save = dict()
                    for i in self.optimizer.keys():
                        optimizer_to_save[i] = self.optimizer[i].state_dict()
                    self.saver.save_checkpoint({
                        'start_it': it,
                        'stage': self.stage,
                        'state_dict': self.model.state_dict(),
                        'optimizer': optimizer_to_save,
                    }, filename='ckp_%06d.pth.tar'%it)
                    logger.info('Done.')

    # main function for validation
    def validation(self, it, split):
        logger.info('\nEvaluating %s...'%split)
        self.evaluator.reset()
        self.model.eval()

        data_loader = self.val_loader if split == 'val' else self.test_loader
        num_img_tr = len(data_loader)
        dist_pos = []
        dist_neg = []
        total_loss = []
        name = list(self.args.training.optimizer.keys())[0]
        for i, samples in enumerate(data_loader):
            samples = to_cuda(samples)

            with torch.no_grad():
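                # record the mean positive/negative pair distances for this batch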
                outputs = self.model(samples, type=name, is_triple=True)
                dist_pos.append(outputs[-1]['dist_pos'].mean().item())
                dist_neg.append(outputs[-1]['dist_neg'].mean().item())

            self.evaluator.add_batch(outputs[-1]['pred'], outputs[0]['target'])

        self.writer.add_scalar('%s/dist_pos'%split, np.array(dist_pos).mean(), it)
        self.writer.add_scalar('%s/dist_neg'%split, np.array(dist_neg).mean(), it)

        acc = self.evaluator.Accuracy()
        self.writer.add_scalar('%s/acc'%split, acc, it)
        if split == 'val':
            logger.info('=====>[Iteration: %d    %s/acc=%.4f    previous best=%.4f'%(it, split, acc, self.best))
        else:
            logger.info('=====>[Iteration: %d    %s/acc=%.4f'%(it, split, acc))

        # if split == 'val':
        #     self.validation(it, 'test')

        if split == 'val' and acc > self.best:
            self.best = acc
            logger.info('\nSaving checkpoint ......')
            optimizer_to_save = dict()
            for i in self.optimizer.keys():
                optimizer_to_save[i] = self.optimizer[i].state_dict()
            self.saver.save_checkpoint({
                'start_it': it,
                'stage': self.stage,
                'state_dict': self.model.state_dict(),
                'optimizer': optimizer_to_save,
            }, filename='best.pth.tar')
Example 8
def main():

    args = parse_args()
    args.pretrain = False
    print("Using GPU: {}".format(args.local_rank))
    root_path = 'exps/exp_{}'.format(args.exp)
    if args.local_rank == 0 and not os.path.exists(root_path):
        os.mkdir(root_path)
        os.mkdir(os.path.join(root_path, "log"))
        os.mkdir(os.path.join(root_path, "model"))

    base_lr = args.lr  # base learning rate
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    train_dataset, val_dataset = build_dataset(args.dataset, args.data_root,
                                               args.train_list)
    args.world_size = len(args.gpu.split(","))
    if args.world_size > 1:
        os.environ['MASTER_PORT'] = args.port
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl')
        device = torch.device('cuda:{}'.format(args.local_rank))
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=len(args.gpu.split(",")),
            rank=args.local_rank)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               sampler=train_sampler,
                                               num_workers=args.num_workers,
                                               pin_memory=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = VNet(args.n_channels, args.n_classes).cuda(args.local_rank)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=0.9,
                                weight_decay=0.0005)
    #scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7)
    if args.world_size > 1:
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)

    model.train()
    print("Loaded weights")

    logger = Logger(root_path)
    saver = Saver(root_path)

    for epoch in range(args.start_epoch, args.epochs):
        train(model, train_loader, optimizer, logger, args, epoch)
        validate(model, val_loader, optimizer, logger, saver, args, epoch)
        adjust_learning_rate(args, optimizer, epoch)