Example #1
    def commit_insert(self):
        if self.keys is None:
            self.keys = Parameter(self.keys_to_be_inserted)
            self.values = Parameter(self.values_to_be_inserted)
        elif self.keys_to_be_inserted is not None:
            keys = torch.cat([self.keys.detach(), self.keys_to_be_inserted], 0)
            self.keys = Parameter(keys)
            values = [self.values.detach(), self.values_to_be_inserted]
            values = torch.cat(values, 0)
            self.values = Parameter(values)

        # Move most recently used key-value pairs to the back
        if len(self.move_to_back) != 0:
            unmoved_ids = list(set(range(len(self.keys))) - self.move_to_back)
            moved_ids = list(self.move_to_back)
            unmoved_keys = self.keys.detach()[unmoved_ids]
            moved_keys = self.keys.detach()[moved_ids]
            self.keys = Parameter(torch.cat([unmoved_keys, moved_keys], 0))
            unmoved_values = self.values.detach()[unmoved_ids]
            moved_values = self.values.detach()[moved_ids]
            self.values = Parameter(torch.cat([unmoved_values, moved_values], 0))
            self.move_to_back = set()

        if len(self.keys) > self.max_memory:
            # Expel oldest key to maintain total memory
            for key in self.keys[:-self.max_memory]:
                del self.key_cache[tuple(key.detach().cpu().numpy())]
            self.keys = Parameter(self.keys[-self.max_memory:].detach())
            self.values = Parameter(self.values[-self.max_memory:].detach())
        self.keys_to_be_inserted = None
        self.values_to_be_inserted = None
        params = [self.keys, self.values]
        self.optimizer = get_optimizer(self.opt_name, params, self.lr)
        self.kdtree.build_index(self.keys.detach().cpu().numpy())
        self.stale_index = False
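The snippets on this page only show call sites of get_optimizer; the helper itself is not included. A minimal sketch of what a dispatcher with the (name, params, lr) signature used above might look like, built on torch.optim (the supported names and defaults here are assumptions, not the original project's code):

import torch.optim as optim

def get_optimizer(name, params, lr):
    # Hypothetical helper matching the call sites above; extend as needed.
    name = name.lower()
    if name == 'sgd':
        return optim.SGD(params, lr=lr)
    if name == 'adam':
        return optim.Adam(params, lr=lr)
    if name == 'rmsprop':
        return optim.RMSprop(params, lr=lr)
    raise ValueError('Unknown optimizer: {}'.format(name))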
Example #2
 def update(self, value, index):
     """
     Set self.values[index] = value
     """
     values = self.values.detach()
     values[index] = value[0].detach()
     self.values = Parameter(values)
     params = [self.keys, self.values]
     self.optimizer = get_optimizer(self.opt_name, params, self.lr)
Example #3
def run(cfg):
    '''Load save path'''
    cfg.log_string('Data save path: %s' % (cfg.save_path))
    checkpoint = CheckpointIO(cfg)
    '''Load device'''
    cfg.log_string('Loading device settings.')
    device = load_device(cfg)
    '''Load data'''
    cfg.log_string('Loading dataset.')
    train_loader = get_dataloader(cfg.config, mode='train')
    test_loader = get_dataloader(cfg.config, mode='test')
    '''Load net'''
    cfg.log_string('Loading model.')
    net = get_model(cfg.config, device=device)
    if isinstance(net, list):
        checkpoint.register_modules(voxnet=net[0])
        checkpoint.register_modules(refnet=net[1])
    else:
        checkpoint.register_modules(voxnet=net)

    cfg.log_string('Loading loss function.')
    loss_func = get_loss(cfg.config, device)
    '''Load optimizer'''
    cfg.log_string('Loading optimizer.')
    optimizer = get_optimizer(config=cfg.config, net=net)
    if isinstance(net, list):
        checkpoint.register_modules(voxopt=optimizer[0])
        checkpoint.register_modules(refopt=optimizer[1])
    else:
        checkpoint.register_modules(voxopt=optimizer)
    '''Load scheduler'''
    cfg.log_string('Loading optimizer scheduler.')
    scheduler = load_scheduler(config=cfg.config, optimizer=optimizer)
    if isinstance(net, list):
        checkpoint.register_modules(voxsch=scheduler[0])
        checkpoint.register_modules(refsch=scheduler[1])
    else:
        checkpoint.register_modules(voxsch=scheduler)
    '''Load trainer'''
    cfg.log_string('Loading trainer.')
    trainer = get_trainer(cfg.config)
    '''Start to train'''
    cfg.log_string('Start to train.')
    #cfg.log_string('Total number of parameters in {0:s}: {1:d}.'.format(cfg.config['method'], sum(p.numel() for p in net.parameters())))

    trainer(cfg,
            net,
            loss_func,
            optimizer,
            scheduler,
            train_loader=train_loader,
            test_loader=test_loader,
            device=device,
            checkpoint=checkpoint)

    cfg.log_string('Training finished.')
Example #4
 def __init__(self, env, args, device='cpu'):
     """
     Instantiate an NEC Agent
     ----------
     env: gym.Env
         gym environment to train on
     args: args class from argparser
         args are from from train.py: see train.py for help with each arg
     device: string
         'cpu' or 'cuda:0' depending on use_cuda flag from train.py
     """
     self.environment_type = args.environment_type
     self.env = env
     self.device = device
     # Hyperparameters
     self.epsilon = args.initial_epsilon
     self.final_epsilon = args.final_epsilon
     self.epsilon_decay = args.epsilon_decay
     self.gamma = args.gamma
     self.N = args.N
     # Transition queue and replay memory
     self.transition_queue = []
     self.replay_every = args.replay_every
     self.replay_buffer_size = args.replay_buffer_size
     self.replay_memory = ReplayMemory(self.replay_buffer_size)
     # CNN for state embedding network
     self.frames_to_stack = args.frames_to_stack
     self.embedding_size = args.embedding_size
     self.in_height = args.in_height
     self.in_width = args.in_width
     self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                    self.in_height, self.in_width).to(self.device)
     # Differentiable Neural Dictionary (DND): one for each action
     self.kernel = inverse_distance
     self.num_neighbors = args.num_neighbors
     self.max_memory = args.max_memory
     self.lr = args.lr
     self.dnd_list = []
     for i in range(env.action_space.n):
         self.dnd_list.append(
             DND(self.kernel, self.num_neighbors, self.max_memory,
                 args.optimizer, self.lr))
     # Optimizer for state embedding CNN
     self.q_lr = args.q_lr
     self.batch_size = args.batch_size
     self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                    self.lr)
Example #5
def create_experiment(config):
    """Creates an experiment based on config."""

    device = torch.device(config.device)
    logging.info("using {}".format(config.device))

    experiment = Experiment(config.name, config.save_dir)
    experiment.register_config(config)

    logger = None
    if config.use_tflogger:
        logger = Logger(config.tflog_dir)
        experiment.register_logger(logger)

    torch.manual_seed(config.rseed)

    model = NRU(device,
                config.input_size,
                config.output_size,
                num_layers=config.num_layers,
                layer_size=config.layer_size,
                output_activation="linear",
                layer_norm=config.layer_norm,
                use_relu=config.use_relu,
                memory_size=config.memory_size,
                k=config.k).to(device)
    experiment.register_model(model)

    data_iterator = get_data_iterator(config)
    experiment.register_data_iterator(data_iterator)

    optimizer = get_optimizer(model.parameters(), config)
    model.register_optimizer(optimizer)

    tr = MyContainer()
    tr.updates_done = 0
    tr.epochs_done = 0
    tr.ce = {}
    tr.ce["train"] = []
    tr.accuracy = {}
    tr.accuracy["valid"] = []
    tr.accuracy["test"] = []
    tr.grad_norm = []

    experiment.register_train_statistics(tr)

    return experiment, model, data_iterator, tr, logger, device
Example #6
    def _train(self):
        print(f"\n({self.experim_name}) training...\n")
        model = get_model(self.args).to(self.device)
        optimizer = get_optimizer(self.args, model)
        lr_scheduler = get_lr_scheduler(self.args,
                                        optimizer=optimizer,
                                        iters_per_epoch=len(self.dataloader))

        for e in range(1, 1 + self.n_epochs):
            model, optimizer, lr_scheduler = self._train_epoch(
                e, model, optimizer, lr_scheduler)
            self._val(e, model)

            if self.debug:
                break

        self.best_miou = -1.0
        return model
Example #7
def train(model, dataloader, device, optimizer_name, loss_name, lr, verbose):
    optimizer_object = get_optimizer(optimizer_name)
    optimizer = optimizer_object(model.parameters(), lr=lr)

    loss_fn = get_loss(loss_name)

    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        bs = len(targets)
        classes = torch.zeros((bs, 10))
        for i in range(bs):
            classes[i][targets[i]] = 1
        classes = classes.to(device)

        outputs = model(inputs)
        loss = loss_fn()(outputs, classes)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs.data, 1)

        running_loss += loss.item()
        running_corrects += torch.sum(preds == targets.data)

    loss = running_loss / 60000
    acc = running_corrects.data.item() / 60000
    if verbose:
        print(f'Training results: Loss: {loss:.4f} Acc: {acc:.4f}')

    return acc
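The manual one-hot loop in this example (and in the next one) can be expressed with torch.nn.functional.one_hot; a small equivalent sketch, not the original code, using made-up labels:

import torch
import torch.nn.functional as F

targets = torch.tensor([3, 1, 7])                     # example labels
classes = F.one_hot(targets, num_classes=10).float()  # shape (batch, 10), same result as the loop above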
Example #8
def train(model, dataloader, device, optimizer_name, loss_name, lr):
    optimizer_object = get_optimizer(optimizer_name)
    optimizer = optimizer_object(model.parameters(), lr=lr)

    loss_fn = get_loss(loss_name)

    model.train()

    running_loss = 0.0
    running_corrects = 0

    for inputs, targets in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        bs = len(targets)
        classes = torch.zeros((bs, 10))
        for i in range(bs):
            classes[i][targets[i]] = 1
        classes = classes.to(device)

        outputs = model(inputs)
        loss = loss_fn()(outputs,
                         classes)  # LeCun et al. used Maximum Log-Likelihood

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, preds = torch.max(outputs.data, 1)
        # statistics
        running_loss += loss.item()
        running_corrects += torch.sum(preds == targets.data)

    loss = running_loss / 60000
    acc = running_corrects.data.item() / 60000
    print('Training results: Loss: {:.4f} Acc: {:.4f}'.format(loss, acc))

    return acc
Example #9
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    backbone_model = eval('models.' + config.BACKBONE_MODEL + '.get_pose_net')(
        config, is_train=True)

    model = eval('models.' + config.MODEL + '.get_multiview_pose_net')(
        backbone_model, config)
    print(model)

    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../../lib/models', config.MODEL + '.py'),
        final_output_dir)
    shutil.copy2(args.cfg, final_output_dir)
    logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }


    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)
    start_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        start_epoch, model, optimizer = load_checkpoint(model, optimizer,
                                                        final_output_dir)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + config.DATASET.TRAIN_DATASET)(
        config, config.DATASET.TRAIN_SUBSET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.TEST_DATASET)(
        config, config.DATASET.TEST_SUBSET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False
    for epoch in range(start_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, writer_dict)

        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint({
            'epoch': epoch + 1,
            'model': get_model_name(config),
            'state_dict': model.module.state_dict(),
            'perf': perf_indicator,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info('saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
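Several of these training loops (here and in Examples #10, #17 and #18) call lr_scheduler.step() at the top of the epoch, before any optimizer updates. Since PyTorch 1.1 the recommended ordering is to step the scheduler after the epoch's optimizer steps; a minimal self-contained sketch with made-up model and milestones:

import torch.nn as nn
import torch.optim as optim

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)

for epoch in range(90):
    optimizer.step()   # stand-in for the per-batch updates of a real epoch
    scheduler.step()   # step once per epoch, after the optimizer updates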
Example #10
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model_p, model_d = eval('models.' + cfg.MODEL.NAME +
                            '.get_adaptive_pose_net')(cfg, is_train=True)

    if cfg.TRAIN.CHECKPOINT:
        logger.info('=> loading model from {}'.format(cfg.TRAIN.CHECKPOINT))
        model_p.load_state_dict(torch.load(cfg.TRAIN.CHECKPOINT))
    else:
        model_state_file = os.path.join(final_output_dir, 'checkpoint.pth')
        logger.info('=> loading model from {}'.format(model_state_file))
        model_p.load_state_dict(torch.load(model_state_file))

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'pre_train_global_steps': 0,
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model_p, (dump_input, ), verbose=False)

    logger.info(get_model_summary(model_p, dump_input))

    model_p = torch.nn.DataParallel(model_p, device_ids=cfg.GPUS).cuda()
    model_d = torch.nn.DataParallel(model_d, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer for pose_net
    criterion_p = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer_p = get_optimizer(cfg, model_p)

    # define loss function (criterion) and optimizer for domain
    criterion_d = torch.nn.BCEWithLogitsLoss().cuda()
    optimizer_d = get_optimizer(cfg, model_d)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_pre_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_PRE_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_pre_loader = torch.utils.data.DataLoader(
        train_pre_dataset,
        batch_size=cfg.TRAIN.PRE_BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    syn_labels = train_dataset._load_syrip_syn_annotations()
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=BalancedBatchSampler(train_dataset, syn_labels),
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    '''
    train_loader = torch.utils.data.DataLoader(   
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY
    )
    '''

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model_p.load_state_dict(checkpoint['state_dict'])

        optimizer_p.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    # freeze some layers
    idx = 0
    print('Parametersssssssssssssss')
    for param in model_p.parameters():

        if idx <= 108:  #fix 108 for stage 2 + bottleneck  or fix 483 for stage 3 + stage 2+ bottleneck
            param.requires_grad = False
            #print(param.data.shape)
        idx = idx + 1

    lr_scheduler_p = torch.optim.lr_scheduler.MultiStepLR(
        optimizer_p,
        cfg.TRAIN.LR_STEP,
        cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    lr_scheduler_d = torch.optim.lr_scheduler.MultiStepLR(
        optimizer_d, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR)

    epoch_D = cfg.TRAIN.PRE_EPOCH
    losses_D_list = []
    acces_D_list = []
    acc_num_total = 0
    num = 0
    losses_d = AverageMeter()

    # Pretrained Stage
    print('Pretrained Stage:')
    print('Start to train Domain Classifier-------')
    for epoch_d in range(epoch_D):  # epoch
        model_d.train()
        model_p.train()

        for i, (input, target, target_weight,
                meta) in enumerate(train_pre_loader):  # iteration
            # compute output for pose_net
            feature_outputs, outputs = model_p(input)
            #print(feature_outputs.size())
            # compute for domain classifier
            domain_logits = model_d(feature_outputs.detach())
            domain_label = (meta['synthetic'].unsqueeze(-1) *
                            1.0).cuda(non_blocking=True)
            # print(domain_label)

            loss_d = criterion_d(domain_logits, domain_label)
            loss_d.backward(retain_graph=True)
            optimizer_d.step()

            # compute accuracy of classifier
            acc_num = 0
            for j in range(len(domain_label)):
                if (domain_logits[j] > 0 and domain_label[j] == 1.0) or (
                        domain_logits[j] < 0 and domain_label[j] == 0.0):
                    acc_num += 1
                    acc_num_total += 1
                num += 1
            acc_d = acc_num * 1.0 / input.size(0)
            acces_D_list.append(acc_d)

            optimizer_d.zero_grad()
            losses_d.update(loss_d.item(), input.size(0))

            if i % cfg.PRINT_FREQ == 0:
                msg = 'Epoch: [{0}][{1}/{2}]\t' \
                      'Accuracy_d: {3} ({4})\t' \
                      'Loss_d: {loss_d.val:.5f} ({loss_d.avg:.5f})'.format(
                          epoch_d, i, len(train_pre_loader), acc_d, acc_num_total * 1.0 / num, loss_d = losses_d)
                logger.info(msg)

                writer = writer_dict['writer']
                pre_global_steps = writer_dict['pre_train_global_steps']
                writer.add_scalar('pre_train_loss_D', losses_d.val,
                                  pre_global_steps)
                writer.add_scalar('pre_train_acc_D', acc_d, pre_global_steps)
                writer_dict['pre_train_global_steps'] = pre_global_steps + 1

            losses_D_list.append(losses_d.val)

    print('Training Stage (Step I and II):')
    losses_P_list = []
    acces_P_list = []
    losses_p = AverageMeter()
    acces_p = AverageMeter()
    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler_p.step()

        # train for one epoch
        losses_P_list, losses_D_list, acces_P_list, acces_D_list = train_adaptive(
            cfg, train_loader, model_p, model_d, criterion_p, criterion_d,
            optimizer_p, optimizer_d, epoch, final_output_dir, tb_log_dir,
            writer_dict, losses_P_list, losses_D_list, acces_P_list,
            acces_D_list, acc_num_total, num, losses_p, acces_p, losses_d)

        # evaluate on validation set
        perf_indicator = validate_adaptive(cfg, valid_loader, valid_dataset,
                                           model_p, criterion_p,
                                           final_output_dir, tb_log_dir,
                                           writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model_p.state_dict(),
                'best_state_dict': model_p.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer_p.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model_p.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

    np.save('./losses_D.npy', np.array(losses_D_list))  # Adversarial-D
    np.save('./losses_P.npy', np.array(losses_P_list))  # P
    np.save('./acces_P.npy', np.array(acces_P_list))  # P
    np.save('./acces_D.npy', np.array(acces_D_list))  # D
Example #11
def train(use_cuda: bool, n_epochs: int, validate_every: int,
          use_dropout: bool, partitions: Partitions, optimizer_name: str,
          lr: float, wd: float, momentum: bool):
    logger = logging.getLogger('logger')

    no_test = True
    model_path = "./model_output/pairwise/model_{0}"

    partitions.generate_partitions(PairPartition, no_test=no_test)
    training_data = Balanced(partitions.train)

    if validate_every > 0:
        balanced_validation = Balanced(partitions.val)
        training_pairs = AllPairs(partitions.train)
        search_length = training_pairs.n_references
        validation_pairs = AllPairs(partitions.val)
        testing_pairs = AllPairs(partitions.test) if not no_test else None
    else:
        balanced_validation = None
        training_pairs = None
        validation_pairs = None
        testing_pairs = None
        search_length = None

    # get a siamese network, see Siamese class for architecture
    siamese = Siamese(dropout=use_dropout)
    siamese = initialize_weights(siamese, use_cuda)

    if use_cuda:
        siamese = siamese.cuda()

    criterion = BCELoss()
    optimizer = get_optimizer(siamese, optimizer_name, lr, wd, momentum)

    try:
        logger.info("Training network with pairwise loss...")
        progress = TrainingProgress()
        models = training.train_siamese_network(siamese, training_data,
                                                criterion, optimizer, n_epochs,
                                                use_cuda)
        for epoch, (model, training_batch_losses) in enumerate(models):
            utils.network.save_model(model, model_path.format(epoch))

            training_loss = training_batch_losses.mean()
            if validate_every != 0 and epoch % validate_every == 0:
                validation_batch_losses = inference.siamese_loss(
                    model, balanced_validation, criterion, use_cuda)
                validation_loss = validation_batch_losses.mean()

                training_mrr, training_rank = inference.mean_reciprocal_ranks(
                    model, training_pairs, use_cuda)
                val_mrr, val_rank = inference.mean_reciprocal_ranks(
                    model, validation_pairs, use_cuda)

                progress.add_mrr(train=training_mrr, val=val_mrr)
                progress.add_rank(train=training_rank, val=val_rank)
                progress.add_loss(train=training_loss, val=validation_loss)
            else:
                progress.add_mrr(train=np.nan, val=np.nan)
                progress.add_rank(train=np.nan, val=np.nan)
                progress.add_loss(train=training_loss, val=np.nan)

            progress.graph("Siamese", search_length)

        # load weights from best model if we validated throughout
        if validate_every > 0:
            siamese = siamese.train()
            utils.network.load_model(
                siamese, model_path.format(np.argmax(progress.val_mrr)))

        # otherwise just save most recent model
        utils.network.save_model(siamese, model_path.format('best'))
        utils.network.save_model(
            siamese,
            './output/{0}/pairwise'.format(utilities.get_trial_number()))

        if not no_test:
            logger.info(
                "Results from best model generated during training, evaluated on test data:"
            )
            rrs = inference.reciprocal_ranks(siamese, testing_pairs, use_cuda)
            utilities.log_final_stats(rrs)

        progress.pearson(log=True)
        progress.save("./output/{0}/pairwise.pickle".format(
            utilities.get_trial_number()))
        return siamese
    except Exception as e:
        utils.network.save_model(siamese, model_path.format('crash_backup'))
        logger.critical("Exception occurred while training: {0}".format(
            str(e)))
        logger.critical(traceback.format_exc())
        sys.exit()
Example #12
# Load weights into the decoder
decoder = copy_weights(hfvae, decoder)

SECOND_STAGE = False
if SECOND_STAGE:
    z_train = encoder.predict(x_train)[0][0]

    latent_dim = np.prod(z_train.shape[1:])
    z_train = np.reshape(z_train, (-1, latent_dim))

    second_vae, second_encoder, second_decoder = two_stage.get_second_stage(
        latent_dim)

    # Compile model
    optimizer = utils.get_optimizer(z_train.shape[0] // batch_size,
                                    initial_lr=1e-3)
    second_vae.compile(optimizer=optimizer, loss=None, metrics=[utils.cos_sim])

    second_vae.fit(z_train, None, batch_size=batch_size, epochs=epochs)
    second_vae.save_weights('saved_weights/secondstage_NVAE_' + data + '.h5')

GMM = True
if GMM:
    from sklearn.mixture import GaussianMixture

    # we may only work on z_mean of the innermost layer
    z_train = encoder.predict(x_train)[0][0]
    # print("latent dim = ", z_train.shape[1])

    z_density = GaussianMixture(n_components=10, max_iter=100)
    z_density.fit(z_train)
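The fitted GaussianMixture above models the encoder's latent distribution; a short sketch of how it could be used to draw latents and decode them with the decoder defined earlier (the sample count is arbitrary and this usage is an assumption, not part of the original snippet):

# Hypothetical usage of the fitted density, assuming z_density and decoder from above.
z_samples, _ = z_density.sample(16)      # draw 16 latent vectors from the GMM
x_samples = decoder.predict(z_samples)   # decode them back to data space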
Example #13
def main_per_worker(process_index, ngpus_per_node, args):
    update_config(cfg, args)
    
    # torch seed
    torch.cuda.manual_seed(random.random())

    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    #proc_rank
    proc_rank = args.rank * ngpus_per_node + process_index

    #create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)
    
    model = get_model(cfg, cfg.MODEL.FILE, cfg.MODEL.NAME) 

    emb = InceptionResnetV1(pretrained='vggface2', classify=False)
    assert cfg.MODEL.APPEARANCE.WEIGHTS != ''
    load_eval_model(cfg.MODEL.APPEARANCE.WEIGHTS, emb)

    # TODO change based on the paper
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)

    transform = FacenetInferenceTransform(size=(cfg.TRAIN.INPUT_MIN, cfg.TRAIN.INPUT_MAX))
    train_dataset = TrackletpairDataset(cfg.DATASET.ROOT, transform=transform, is_train=True)
    eval_dataset = TrackletpairDataset(cfg.DATASET.ROOT, transform=transform, is_train=False)

    # distribution
    if args.distributed:
        logger.info(
            f'Init process group: dist_url: {args.dist_url},  '
            f'world_size: {args.world_size}, '
            f'machine: {args.rank}, '
            f'rank:{proc_rank}'
        )
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=proc_rank
        )
        torch.cuda.set_device(process_index)
        model.cuda()
        emb.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index]
        )
        emb = torch.nn.parallel.DistributedDataParallel(
            emb, device_ids=[process_index]
        )
        train_sampler = BalancedBatchSampler(
            train_dataset
        )
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU

    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        emb = torch.nn.DataParallel(emb).cuda()
        train_sampler = BalancedBatchSampler(
            train_dataset
        )
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        drop_last=False,
        collate_fn=tracklet_pair_collect,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        sampler=train_sampler
    )

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=tracklet_pair_collect,
        num_workers=cfg.WORKERS
    )
    
        
    criterion = nn.CrossEntropyLoss()

    Trainer = trackletpairConnectTrainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        'acc',
        last_iter,
        proc_rank,
        pre_ap_model=emb,
    )

    while True:
        Trainer.train(train_loader, eval_loader)

    # eval
    Trainer.evaluate(eval_loader)
Example #14
args = parser.parse_args()
with open(args.config) as f:
    config = yaml.safe_load(f)
config['config_file'] = args.config.replace('/', '.').split('.')[-2]

seed = config['seed']
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

n_epochs = config['optimization']['n_epochs']

if not args.disable_cuda and torch.cuda.is_available():
    device = torch.device('cuda:{}'.format(args.gpu))
else:
    device = torch.device('cpu')

logger = Logger(config)
model = get_model(config['model'])
optim = get_optimizer(model.parameters(), config['optimization'])
train_loader, valid_loader, test_loader = get_data(config['data'])

## Train
for i in range(n_epochs):
    for data, label in train_loader:
        break
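The loop above fetches a single batch and stops; a minimal sketch of what a full training step with the objects defined in this snippet could look like (the cross-entropy objective and the .to(device) moves are assumptions about this project):

import torch.nn.functional as F

for i in range(n_epochs):
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)
        output = model(data)
        loss = F.cross_entropy(output, label)  # assumed classification objective
        optim.zero_grad()
        loss.backward()
        optim.step()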


Example #15
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model_builder = importlib.import_module("models." +
                                            cfg.MODEL.NAME).get_fovea_net
    model = model_builder(cfg, is_train=True)

    # xiaofeng add for load parameter
    if cfg.TEST.MODEL_FILE:
        logger.info('=> loading model from {}'.format(cfg.TEST.MODEL_FILE))
        model.load_state_dict(torch.load(cfg.TEST.MODEL_FILE), strict=False)

    # copy model file -- xiaofeng comment it
    # this_dir = os.path.dirname(__file__)
    # shutil.copy2(os.path.join(this_dir, '../models', cfg.MODEL.NAME + '.py'), final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # define loss function (criterion) and optimizer
    criterion = HybridLoss(roi_weight=cfg.LOSS.ROI_WEIGHT,
                           regress_weight=cfg.LOSS.REGRESS_WEIGHT,
                           use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT,
                           hrnet_only=cfg.TRAIN.HRNET_ONLY).cuda()

    # Data loading code
    # normalize = transforms.Normalize(
    #     mean=[0.134, 0.207, 0.330], std=[0.127, 0.160, 0.239]
    # )
    # train_dataset = importlib.import_module('dataset.'+cfg.DATASET.DATASET).Dataset(
    #     cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
    #     transforms.Compose([
    #         transforms.ToTensor(),
    #         normalize,
    #     ])
    # )
    # valid_dataset = importlib.import_module('dataset.'+cfg.DATASET.DATASET).Dataset(
    #     cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
    #     transforms.Compose([
    #         transforms.ToTensor(),
    #         normalize,
    #     ])
    # )
    #
    # train_loader = torch.utils.data.DataLoader(
    #     train_dataset,
    #     batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
    #     shuffle=cfg.TRAIN.SHUFFLE,
    #     num_workers=cfg.WORKERS,
    #     pin_memory=cfg.PIN_MEMORY
    # )
    # valid_loader = torch.utils.data.DataLoader(
    #     valid_dataset,
    #     batch_size=cfg.TEST.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
    #     shuffle=False,
    #     num_workers=cfg.WORKERS,
    #     pin_memory=cfg.PIN_MEMORY
    # )

    db_trains = []
    db_vals = []
    final_full_test = cfg.TRAIN.FULL_DATA
    normalize_1 = transforms.Normalize(mean=[0.282, 0.168, 0.084],
                                       std=[0.189, 0.110, 0.062])
    train_dataset_1 = importlib.import_module('dataset.' +
                                              cfg.DATASET.DATASET).Dataset(
                                                  cfg, cfg.DATASET.ROOT,
                                                  cfg.DATASET.TRAIN_SET_1,
                                                  True,
                                                  transforms.Compose([
                                                      transforms.ToTensor(),
                                                      normalize_1,
                                                  ]))
    db_trains.append(train_dataset_1)

    normalize_2 = transforms.Normalize(mean=[0.409, 0.270, 0.215],
                                       std=[0.288, 0.203, 0.160])
    train_dataset_2 = importlib.import_module('dataset.' +
                                              cfg.DATASET.DATASET).Dataset(
                                                  cfg, cfg.DATASET.ROOT,
                                                  cfg.DATASET.TRAIN_SET_2,
                                                  True,
                                                  transforms.Compose([
                                                      transforms.ToTensor(),
                                                      normalize_2,
                                                  ]))
    db_trains.append(train_dataset_2)

    if final_full_test is True:
        normalize_3 = transforms.Normalize(mean=[0.404, 0.271, 0.222],
                                           std=[0.284, 0.202, 0.163])
        train_dataset_3 = importlib.import_module(
            'dataset.' + cfg.DATASET.DATASET).Dataset(
                cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, True,
                transforms.Compose([
                    transforms.ToTensor(),
                    normalize_3,
                ]))
        db_trains.append(train_dataset_3)

    train_dataset = ConcatDataset(db_trains)
    logger.info("Combined Dataset: Total {} images".format(len(train_dataset)))

    train_batch_size = cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=train_batch_size,
                                               shuffle=cfg.TRAIN.SHUFFLE,
                                               num_workers=cfg.WORKERS,
                                               pin_memory=cfg.PIN_MEMORY)

    normalize = transforms.Normalize(mean=[0.404, 0.271, 0.222],
                                     std=[0.284, 0.202, 0.163])
    val_dataset_1 = importlib.import_module('dataset.' +
                                            cfg.DATASET.DATASET).Dataset(
                                                cfg, cfg.DATASET.ROOT,
                                                cfg.DATASET.TEST_SET, False,
                                                transforms.Compose([
                                                    transforms.ToTensor(),
                                                    normalize,
                                                ]))
    db_vals.append(val_dataset_1)

    if final_full_test is True:
        normalize_1 = transforms.Normalize(mean=[0.282, 0.168, 0.084],
                                           std=[0.189, 0.110, 0.062])
        val_dataset_2 = importlib.import_module('dataset.' +
                                                cfg.DATASET.DATASET).Dataset(
                                                    cfg, cfg.DATASET.ROOT,
                                                    cfg.DATASET.TRAIN_SET_1,
                                                    False,
                                                    transforms.Compose([
                                                        transforms.ToTensor(),
                                                        normalize_1,
                                                    ]))
        db_vals.append(val_dataset_2)

        normalize_2 = transforms.Normalize(mean=[0.409, 0.270, 0.215],
                                           std=[0.288, 0.203, 0.160])
        val_dataset_3 = importlib.import_module('dataset.' +
                                                cfg.DATASET.DATASET).Dataset(
                                                    cfg, cfg.DATASET.ROOT,
                                                    cfg.DATASET.TRAIN_SET_2,
                                                    False,
                                                    transforms.Compose([
                                                        transforms.ToTensor(),
                                                        normalize_2,
                                                    ]))
        db_vals.append(val_dataset_3)

    valid_dataset = ConcatDataset(db_vals)

    logger.info("Val Dataset: Total {} images".format(len(valid_dataset)))

    test_batch_size = cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=test_batch_size,
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    logger.info("Train len: {}, batch_size: {}; Test len: {}, batch_size: {}" \
                .format(len(train_loader), train_batch_size, len(valid_loader), test_batch_size))

    best_metric = 1e6
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH

    if cfg.TEST.MODEL_FILE:
        checkpoint_file = cfg.TEST.MODEL_FILE
    else:
        checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        # begin_epoch = checkpoint['epoch']
        begin_epoch = 0  # xiaofeng change it
        best_metric = checkpoint['metric']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    if cfg.TRAIN.LR_EXP:
        # lr = lr * gamma ** epoch
        lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                              cfg.TRAIN.GAMMA1,
                                                              last_epoch=-1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            cfg.TRAIN.LR_STEP,
            cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        start_time = timer()

        lr_scheduler.step()

        # evaluate on validation set
        # lr_metric, hr_metric, final_metric = validate(
        #     cfg, valid_loader, valid_dataset, model, criterion,
        #     final_output_dir, tb_log_dir, writer_dict, db_vals
        # )
        # print("validation before training spent time:")
        # timer(start_time)  # timing ends here for "start_time" variable

        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        print("epoch %d train spent time:" % (epoch))
        train_time = timer(
            start_time)  # timing ends here for "start_time" variable

        # if epoch >= int(cfg.TRAIN.END_EPOCH/10):
        # evaluate on validation set
        lr_metric, hr_metric, final_metric = validate(
            cfg, valid_loader, valid_dataset, model, criterion,
            final_output_dir, tb_log_dir, writer_dict, db_vals)

        print("validation spent time:")
        val_time = timer(
            train_time)  # timing ends here for "start_time" variable

        min_metric = min(lr_metric, hr_metric, final_metric)
        if min_metric <= best_metric:
            best_metric = min_metric
            best_model = True
            logger.info('=> epoch [{}] best model result: {}'.format(
                epoch, best_metric))
        else:
            best_model = False

        # xiaofeng changed it
        if best_model is True:
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            # transfer the model to CPU before saving to fix unstable bug:
            # github.com/pytorch/pytorch/issues/10577

            model = model.cpu()
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': cfg.MODEL.NAME,
                    'state_dict': model.state_dict(),
                    'best_state_dict': model.module.state_dict(),
                    'metric': final_metric,
                    'optimizer': optimizer.state_dict(),
                }, best_model, final_output_dir)
            model = model.cuda()

            print("saving spent time:")
            end_time = timer(
                val_time)  # timing ends here for "start_time" variable
        elif (epoch % 60 == 0) and (epoch != 0):
            logger.info('=> saving epoch {} checkpoint to {}'.format(
                epoch, final_output_dir))
            # transfer the model to CPU before saving to fix unstable bug:
            # github.com/pytorch/pytorch/issues/10577

            time_str = time.strftime('%Y-%m-%d-%H-%M')
            if cfg.TRAIN.HRNET_ONLY:
                checkpoint_filename = 'checkpoint_HRNET_epoch%d_%s.pth' % (
                    epoch, time_str)
            else:
                checkpoint_filename = 'checkpoint_Hybrid_epoch%d_%s.pth' % (
                    epoch, time_str)
            model = model.cpu()
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'model': cfg.MODEL.NAME,
                    'state_dict': model.state_dict(),
                    'best_state_dict': model.module.state_dict(),
                    'metric': final_metric,
                    'optimizer': optimizer.state_dict(),
                }, best_model, final_output_dir, checkpoint_filename)
            model = model.cuda()

    # xiaofeng change
    time_str = time.strftime('%Y-%m-%d-%H-%M')
    if cfg.TRAIN.HRNET_ONLY:
        model_name = 'final_state_HRNET_%s.pth' % (time_str)
    else:
        model_name = 'final_state_Hybrid_%s.pth' % (time_str)

    final_model_state_file = os.path.join(final_output_dir, model_name)
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()

    # save a final checkpoint
    model = model.cpu()
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'model': cfg.MODEL.NAME,
            'state_dict': model.state_dict(),
            'best_state_dict': model.module.state_dict(),
            'metric': final_metric,
            'optimizer': optimizer.state_dict(),
        }, best_model, final_output_dir, "checkpoint_final_state.pth")
Example #16
    def build_model(self, eval=False):
        if self.train_mode:
            with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
                self.learning_rate = tf.constant(self.hparams.learning_rate)
                #self.learning_rate = self._get_learning_rate_warmup(self.hparams)
                #self.learning_rate = self._get_learning_rate_decay()

                opt = utils.get_optimizer(self.hparams, self.learning_rate)

                tower_grads = []
                losses = []
                controller = "/cpu:0"
                self._train_models = []
                for i, id in enumerate(gpu_utils.get_available_gpus()):
                    name = 'tower_%d' % i
                    with tf.device(gpu_utils.assign_to_device(id, controller)), tf.name_scope(name):
                        model = self.Model()
                        model(
                            self.hparams,
                            tf.estimator.ModeKeys.TRAIN,
                            self._batched_input_train)
                        loss = model.loss
                        with tf.name_scope("compute_gradients"):
                            grad_and_vars = opt.compute_gradients(
                                loss,
                                var_list=model.trainable_variables(),
                                colocate_gradients_with_ops=self.hparams.colocate_gradients_with_ops)
                            vars = [var for _, var in grad_and_vars]
                            grads, _, _ = model_utils.gradient_clip([grad for grad, var in grad_and_vars], max_gradient_norm=MAX_GRADIENT_NORM)
                            tower_grads.append(zip(grads, vars))
                        losses.append(loss)
                    outer_scope.reuse_variables()
                self._train_models.append(model)
                self._train_model = model
                self.params = model.trainable_variables()

            with tf.name_scope("apply_gradients"), tf.device(controller):
                average_grads = []
                
                for grad_and_vars in zip(*tower_grads):
                    grads = [g for g, _ in grad_and_vars]
                    for g, v in grad_and_vars:
                        print(g, v)
                    grad = tf.reduce_mean(grads, 0)
                    v = grad_and_vars[0][1]
                    grad_and_var = (grad, v)
                    average_grads.append(grad_and_var)
                self.update = opt.apply_gradients(average_grads, self._global_step)
                self.loss = tf.reduce_mean(losses)

            self._summary = tf.summary.merge([
                tf.summary.scalar('train_loss', self.loss),
                tf.summary.scalar("learning_rate", self.learning_rate),
            ])

        # init dev model
        if self.hparams.dev_data is not None:
            with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                self.dev_model = self.Model()
                self.hparams.batch_size = self.hparams.eval_batch_size
                self.dev_model(
                    self.hparams,
                    tf.estimator.ModeKeys.EVAL,
                    self._batched_input_dev)

        if eval or self.eval_mode:
            with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                self.test_model = self.Model()
                self.hparams.batch_size = self.hparams.eval_batch_size
                self.test_model(
                    self.hparams,
                    tf.estimator.ModeKeys.EVAL,
                    self._batched_input_test)
                self._eval_summary = tf.no_op()

        self.print_logs()
Example #17
def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    args = parse_args()
    print('out')
    print(args)

    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_pose_net')(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', config.MODEL.NAME + '.py'),
        final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (config.TRAIN.BATCH_SIZE, 3, config.MODEL.IMAGE_SIZE[1],
         config.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(',')]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + config.DATASET.DATASET)(
        config, config.DATASET.ROOT, config.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    best_perf = 0.0
    best_model = False

    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        #print("model check!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        #for i,p in enumerate(model.parameters()):
        #    print(p.requires_grad)

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, valid_dataset, model,
                                  criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': get_model_name(config),
                'state_dict': model.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
Example #18
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    t_checkpoints = cfg.KD.TEACHER  # note: set this in the student config file
    train_type = cfg.KD.TRAIN_TYPE  # note: set this in the student config file
    train_type = get_train_type(train_type, t_checkpoints)
    logger.info('=> train type is {} '.format(train_type))

    if train_type == 'FPD':
        cfg_name = 'student_' + os.path.basename(args.cfg).split('.')[0]
    else:
        cfg_name = os.path.basename(args.cfg).split('.')[0]
    save_yaml_file(cfg_name, cfg, final_output_dir)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(cfg,
                                                               is_train=True)

    # fpd method, default NORMAL
    if train_type == 'FPD':
        tcfg = cfg.clone()
        tcfg.defrost()
        tcfg.merge_from_file(args.tcfg)
        tcfg.freeze()
        tcfg_name = 'teacher_' + os.path.basename(args.tcfg).split('.')[0]
        save_yaml_file(tcfg_name, tcfg, final_output_dir)
        # teacher model
        tmodel = eval('models.' + tcfg.MODEL.NAME + '.get_pose_net')(
            tcfg, is_train=False)

        load_checkpoint(t_checkpoints,
                        tmodel,
                        strict=True,
                        model_info='teacher_' + tcfg.MODEL.NAME)

        tmodel = torch.nn.DataParallel(tmodel, device_ids=cfg.GPUS).cuda()
        # define kd_pose loss function (criterion) and optimizer
        kd_pose_criterion = JointsMSELoss(
            use_target_weight=tcfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ))

    logger.info(get_model_summary(model, dump_input))

    if cfg.TRAIN.CHECKPOINT:
        load_checkpoint(cfg.TRAIN.CHECKPOINT,
                        model,
                        strict=True,
                        model_info='student_' + cfg.MODEL.NAME)
    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()

    # you can choose or replace the pose_loss and kd_pose_loss types, including MSE, KL, OHKM losses, etc.
    # define pose loss function (criterion) and optimizer
    pose_criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY)

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        cfg.TRAIN.LR_STEP,
                                                        cfg.TRAIN.LR_FACTOR,
                                                        last_epoch=last_epoch)

    # evaluate on validation set (the teacher model only exists in FPD mode)
    if train_type == 'FPD':
        validate(cfg, valid_loader, valid_dataset, tmodel, pose_criterion,
                 final_output_dir, tb_log_dir, writer_dict)
    validate(cfg, valid_loader, valid_dataset, model, pose_criterion,
             final_output_dir, tb_log_dir, writer_dict)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # fpd method, default NORMAL
        if train_type == 'FPD':
            # train for one epoch
            fpd_train(cfg, train_loader, model, tmodel, pose_criterion,
                      kd_pose_criterion, optimizer, epoch, final_output_dir,
                      tb_log_dir, writer_dict)
        else:
            # train for one epoch
            train(cfg, train_loader, model, pose_criterion, optimizer, epoch,
                  final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  pose_criterion, final_output_dir, tb_log_dir,
                                  writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
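# In FPD mode the student is trained against both the ground-truth pose loss and a
# distillation loss computed from the frozen teacher. A hedged sketch of what a single
# fpd_train step might combine (fpd_step and alpha are hypothetical names, assuming both
# criterions follow the JointsMSELoss signature used above):
import torch

def fpd_step(student, teacher, pose_criterion, kd_pose_criterion,
             optimizer, images, target, target_weight, alpha=0.5):
    with torch.no_grad():
        teacher_heatmaps = teacher(images)  # soft targets from the frozen teacher
    student_heatmaps = student(images)

    pose_loss = pose_criterion(student_heatmaps, target, target_weight)
    kd_loss = kd_pose_criterion(student_heatmaps, teacher_heatmaps, target_weight)

    loss = (1 - alpha) * pose_loss + alpha * kd_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()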
def main_per_worker(process_index, ngpus_per_node, args):
    update_config(cfg, args)
    
    # torch seed (random.random() returns a float in [0, 1), which truncates to 0; use an integer seed)
    torch.cuda.manual_seed(random.randint(0, 2**31 - 1))

    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    #proc_rank
    proc_rank = args.rank * ngpus_per_node + process_index

    #create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)
    
    model = InceptionResnetV1(pretrained='vggface2', classify=False, path=[cfg.MODEL.FEATURE_PATH, cfg.MODEL.LOGITS_PATH])
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)

    train_transform = FacenetTransform(size=(cfg.TRAIN.INPUT_MIN, cfg.TRAIN.INPUT_MAX))
    train_dataset = FacenetTripletDataset(cfg.DATASET.ROOT, transform=train_transform, is_train=True)

    eval_transform = FacenetTransform(size=cfg.TEST.TEST_SIZE)
    eval_dataset = FacenetTripletDataset(cfg.DATASET.ROOT, transform=eval_transform, is_train=False)

    # distribution
    if args.distributed:
        logger.info(
            f'Init process group: dist_url: {args.dist_url},  '
            f'world_size: {args.world_size}, '
            f'machine: {args.rank}, '
            f'rank:{proc_rank}'
        )
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=proc_rank
        )
        torch.cuda.set_device(process_index)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index]
        )
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset
        )
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU

    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        train_sampler = None
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node
    
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        drop_last=True,
        collate_fn=facenet_triplet_collect,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        sampler=train_sampler
    )

    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=facenet_triplet_collect,
        num_workers=cfg.WORKERS
    )
    
    criterion = triplet_loss

    Trainer = get_trainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        last_iter,
        proc_rank,
    )

    while True:
        Trainer.train(train_loader, eval_loader)

    # eval
    Trainer.evaluate(eval_loader)
Beispiel #20
0
def trainIters(args):
    epoch_resume = 0
    model_dir = os.path.join('../models/',
                             args.model_name + '_prev_inference_mask')

    if args.resume:
        # will resume training the model with name args.model_name
        encoder_dict, decoder_dict, enc_opt_dict, dec_opt_dict, load_args = load_checkpoint(
            args.model_name, args.use_gpu)

        epoch_resume = load_args.epoch_resume
        encoder = FeatureExtractor(load_args)
        decoder = RSISMask(load_args)
        encoder_dict, decoder_dict = check_parallel(encoder_dict, decoder_dict)
        encoder.load_state_dict(encoder_dict)
        decoder.load_state_dict(decoder_dict)

        args = load_args

    elif args.transfer:
        # load model from args and replace last fc layer
        encoder_dict, decoder_dict, _, _, load_args = load_checkpoint(
            args.transfer_from, args.use_gpu)
        encoder = FeatureExtractor(load_args)
        decoder = RSISMask(args)
        encoder_dict, decoder_dict = check_parallel(encoder_dict, decoder_dict)
        encoder.load_state_dict(encoder_dict)
        decoder.load_state_dict(decoder_dict)

    else:
        encoder = FeatureExtractor(args)
        decoder = RSISMask(args)

    # model checkpoints will be saved here
    make_dir(model_dir)

    # save parameters for future use
    pickle.dump(args, open(os.path.join(model_dir, 'args.pkl'), 'wb'))

    encoder_params = get_base_params(args, encoder)
    skip_params = get_skip_params(encoder)
    decoder_params = list(decoder.parameters()) + list(skip_params)
    dec_opt = get_optimizer(args.optim, args.lr, decoder_params,
                            args.weight_decay)
    enc_opt = get_optimizer(args.optim_cnn, args.lr_cnn, encoder_params,
                            args.weight_decay_cnn)

    if args.resume:
        enc_opt.load_state_dict(enc_opt_dict)
        dec_opt.load_state_dict(dec_opt_dict)
        from collections import defaultdict
        dec_opt.state = defaultdict(dict, dec_opt.state)

    if not args.log_term:
        print("Training logs will be saved to:",
              os.path.join(model_dir, 'train.log'))
        sys.stdout = open(os.path.join(model_dir, 'train.log'), 'w')
        sys.stderr = open(os.path.join(model_dir, 'train.err'), 'w')

    print(args)

    # objective function for mask
    mask_siou = softIoULoss()

    if args.use_gpu:
        encoder.cuda()
        decoder.cuda()
        mask_siou.cuda()

    crits = mask_siou
    optims = [enc_opt, dec_opt]
    if args.use_gpu:
        torch.cuda.synchronize()
    start = time.time()

    # vars for early stopping
    best_val_loss = args.best_val_loss
    acc_patience = 0
    mt_val = -1

    # keep track of the number of batches in each epoch for continuity when plotting curves
    loaders = init_dataloaders(args)
    num_batches = {'train': 0, 'val': 0}
    #area_range = [[0 ** 2, 1e5 ** 2], [0 ** 2, 20 ** 2], [20 ** 2, 59 ** 2], [59 ** 2, 1e5 ** 2]]
    area_range = [[0**2, 1e5**2], [0**2, 30**2], [30**2, 90**2],
                  [90**2, 1e5**2]]  # for resolution (287, 950)
    resolution = 0

    for e in range(args.max_epoch):
        print("Epoch", e + epoch_resume)
        # store losses in lists to display average since beginning
        epoch_losses = {
            'train': {
                'total': [],
                'iou': []
            },
            'val': {
                'total': [],
                'iou': []
            }
        }
        # total mean for epoch will be saved here to display at the end
        total_losses = {'total': [], 'iou': []}

        # check if it's time to do some changes here
        if e + epoch_resume >= args.finetune_after and not args.update_encoder and args.finetune_after != -1:
            print("Starting to update encoder")
            args.update_encoder = True
            acc_patience = 0
            mt_val = -1

        if args.loss_penalization:
            if e < 10:
                resolution = area_range[2]
            else:
                resolution = area_range[0]

        # we validate after each epoch
        for split in ['train', 'val']:
            if args.dataset == 'davis2017' or args.dataset == 'youtube' or args.dataset == 'kittimots':
                loaders[split].dataset.set_epoch(e)
                for batch_idx, (inputs, targets, seq_name,
                                starting_frame) in enumerate(loaders[split]):
                    # send batch to GPU

                    prev_hidden_temporal_list = None
                    loss = None
                    last_frame = False
                    max_ii = min(len(inputs), args.length_clip)

                    for ii in range(max_ii):
                        # If we are on the last frame of a clip, we backpropagate the loss back to the beginning of the clip.
                        if ii == max_ii - 1:
                            last_frame = True

                        # x: input images (N consecutive frames from M different sequences)
                        # y_mask: ground truth annotations (some of them are zeros to have a fixed length in number of object instances)
                        # sw_mask: this mask indicates which masks from y_mask are valid
                        x, y_mask, sw_mask = batch_to_var(
                            args, inputs[ii], targets[ii])

                        if ii == 0:
                            prev_mask = y_mask

                        # From one frame to the following frame the prev_hidden_temporal_list is updated.
                        loss, losses, outs, hidden_temporal_list = runIter(
                            args, encoder, decoder, x, y_mask, sw_mask,
                            resolution, crits, optims, split, loss,
                            prev_hidden_temporal_list, prev_mask, last_frame)

                        # The hidden temporal state from time instant ii is saved to be used when processing the next time instant ii+1.
                        if not args.only_spatial:
                            prev_hidden_temporal_list = hidden_temporal_list

                        prev_mask = outs

                    # store loss values in dictionary separately
                    epoch_losses[split]['total'].append(losses[0])
                    epoch_losses[split]['iou'].append(losses[1])

                    # print after some iterations
                    if (batch_idx + 1) % args.print_every == 0:

                        mt = np.mean(epoch_losses[split]['total'])
                        mi = np.mean(epoch_losses[split]['iou'])

                        te = time.time() - start
                        print("iter %d:\ttotal:%.4f\tiou:%.4f\ttime:%.4f" %
                              (batch_idx, mt, mi, te))
                        if args.use_gpu:
                            torch.cuda.synchronize()
                        start = time.time()

            num_batches[split] = batch_idx + 1
            # compute mean val losses within epoch

            if split == 'val' and args.smooth_curves:
                if mt_val == -1:
                    mt = np.mean(epoch_losses[split]['total'])
                else:
                    mt = 0.9 * mt_val + 0.1 * np.mean(
                        epoch_losses[split]['total'])
                mt_val = mt

            else:
                mt = np.mean(epoch_losses[split]['total'])

            mi = np.mean(epoch_losses[split]['iou'])

            # save train and val losses for the epoch
            total_losses['iou'].append(mi)
            total_losses['total'].append(mt)

            args.epoch_resume = e + epoch_resume

            print("Epoch %d:\ttotal:%.4f\tiou:%.4f\t(%s)" % (e, mt, mi, split))

        if mt < (best_val_loss - args.min_delta):
            print("Saving checkpoint.")
            best_val_loss = mt
            args.best_val_loss = best_val_loss
            # saves model, params, and optimizers
            save_checkpoint_prev_inference_mask(args, encoder, decoder,
                                                enc_opt, dec_opt)
            acc_patience = 0
        else:
            acc_patience += 1

        if acc_patience > args.patience and not args.update_encoder and args.finetune_after != -1:
            print("Starting to update encoder")
            acc_patience = 0
            args.update_encoder = True
            best_val_loss = 1000  # reset because adding a loss term will increase the total value
            mt_val = -1
            encoder_dict, decoder_dict, enc_opt_dict, dec_opt_dict, _ = load_checkpoint(
                args.model_name, args.use_gpu)
            encoder.load_state_dict(encoder_dict)
            decoder.load_state_dict(decoder_dict)
            enc_opt.load_state_dict(enc_opt_dict)
            dec_opt.load_state_dict(dec_opt_dict)

        # early stopping after N epochs without improvement
        if acc_patience > args.patience_stop:
            break
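The smooth_curves branch above keeps an exponential moving average of the validation loss (0.9 of the previous value, 0.1 of the new epoch mean) so that early stopping reacts to the trend rather than to a single noisy epoch. The same smoothing as a standalone sketch:

def smooth(prev, current, momentum=0.9):
    # exponential moving average; the first call just returns the raw value
    if prev is None:
        return current
    return momentum * prev + (1.0 - momentum) * current

# usage: mt_val = smooth(None if mt_val == -1 else mt_val, epoch_val_loss)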
Beispiel #21
0
def main():
    args = parse_args()

    if args.dist_url == "env://" and args.world_size == -1:
        args.world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_cls_net')(config)

    dump_input = torch.rand(
        (1, 3, config.MODEL.IMAGE_SIZE[1], config.MODEL.IMAGE_SIZE[0]))
    logger.info(get_model_summary(model, dump_input))

    # copy model file
    # this_dir = os.path.dirname(__file__)
    # models_dst_dir = os.path.join(final_output_dir, 'models')
    # if os.path.exists(models_dst_dir):
    #     shutil.rmtree(models_dst_dir)
    # shutil.copytree(os.path.join(this_dir, '../lib/models'), models_dst_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    gpus = list(config.GPUS)
    '''
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
    '''
    # Change DP to DDP
    torch.cuda.set_device(args.local_rank)
    model = model.to(args.local_rank)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[args.local_rank], output_device=args.local_rank)

    # define loss function (criterion) and optimizer
    criterion = torch.nn.CrossEntropyLoss().cuda()

    optimizer = get_optimizer(config, model)

    best_perf = 0.0
    best_model = False
    last_epoch = config.TRAIN.BEGIN_EPOCH
    if config.TRAIN.RESUME:
        model_state_file = os.path.join(final_output_dir, 'checkpoint.pth.tar')
        if os.path.isfile(model_state_file):
            checkpoint = torch.load(model_state_file)
            last_epoch = checkpoint['epoch']
            best_perf = checkpoint['perf']
            model.module.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            logger.info("=> loaded checkpoint (epoch {})".format(
                checkpoint['epoch']))
            best_model = True

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    # Data loading code
    traindir = os.path.join(config.DATASET.ROOT, config.DATASET.TRAIN_SET)
    valdir = os.path.join(config.DATASET.ROOT, config.DATASET.TEST_SET)

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    '''
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(config.MODEL.IMAGE_SIZE[0]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    )
    '''
    # Change to TSV dataset instance
    train_dataset = TSVInstance(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(config.MODEL.IMAGE_SIZE[0]),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    # DDP requires DistributedSampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=(train_sampler is None),
        num_workers=config.WORKERS,
        pin_memory=True,
        sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        TSVInstance(
            valdir,
            transforms.Compose([
                transforms.Resize(int(config.MODEL.IMAGE_SIZE[0] / 0.875)),
                transforms.CenterCrop(config.MODEL.IMAGE_SIZE[0]),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True)

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()
        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        # evaluate on validation set
        perf_indicator = validate(config, valid_loader, model, criterion,
                                  final_output_dir, tb_log_dir, writer_dict)

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': config.MODEL.NAME,
                'state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            },
            best_model,
            final_output_dir,
            filename='checkpoint.pth.tar')

    final_model_state_file = os.path.join(final_output_dir,
                                          'final_state.pth.tar')
    logger.info(
        'saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
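When a DistributedSampler is used, as in the DDP example above, PyTorch expects train_sampler.set_epoch(epoch) to be called at the start of every epoch so that each process reshuffles its shard differently; the loop above omits that call. A minimal sketch of the epoch loop with it added:

for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
    # reshuffle the distributed shards differently for each epoch
    train_sampler.set_epoch(epoch)
    train(config, train_loader, model, criterion, optimizer, epoch,
          final_output_dir, tb_log_dir, writer_dict)
    lr_scheduler.step()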
Beispiel #22
0
from trainer.Trainer import Trainer
from torch.utils.tensorboard import SummaryWriter
from models.loss import PixWiseBCELoss
from datasets.PixWiseDataset import PixWiseDataset
from utils.utils import read_cfg, get_optimizer, build_network, get_device

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

cfg = read_cfg(cfg_file='config/densenet_161_adam_lr1e-3.yaml')

device = get_device(cfg)

network = build_network(cfg)

optimizer = get_optimizer(cfg, network)

loss = PixWiseBCELoss(beta=cfg['train']['loss']['beta'])

writer = SummaryWriter(cfg['log_dir'])

dump_input = torch.randn(1, 3, 224, 224)

writer.add_graph(network, (dump_input, ))

# Without the Resize transform, images have different sizes, which causes an error
train_transform = transforms.Compose([
    transforms.Resize(cfg['model']['image_size']),
    transforms.RandomRotation(cfg['dataset']['augmentation']['rotation']),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
Beispiel #23
0
def main():
    args = parse_args()
    reset_config(config, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        config, args.cfg, "train")

    logger.info(pprint.pformat(args))
    logger.info(pprint.pformat(config))

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval("models." + config.MODEL.NAME + ".get_pose_net")(
        config, is_train=True)

    # copy model file
    this_dir = os.path.dirname(__file__)
    shutil.copy2(
        os.path.join(this_dir, "../lib/models", config.MODEL.NAME + ".py"),
        final_output_dir,
    )

    writer_dict = {
        "writer": SummaryWriter(log_dir=tb_log_dir),
        "train_global_steps": 0,
        "valid_global_steps": 0,
    }

    dump_input = torch.rand((
        config.TRAIN.BATCH_SIZE,
        3,
        config.MODEL.IMAGE_SIZE[1],
        config.MODEL.IMAGE_SIZE[0],
    ))
    writer_dict["writer"].add_graph(model, (dump_input, ), verbose=False)

    gpus = [int(i) for i in config.GPUS.split(",")]
    model = torch.nn.DataParallel(model, device_ids=gpus).cuda()

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=config.LOSS.USE_TARGET_WEIGHT).cuda()

    optimizer = get_optimizer(config, model)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR)

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_dataset = eval("dataset." + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TRAIN_SET,
        True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
    )
    valid_dataset = eval("dataset." + config.DATASET.DATASET)(
        config,
        config.DATASET.ROOT,
        config.DATASET.TEST_SET,
        False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]),
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE * len(gpus),
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE * len(gpus),
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=True,
    )

    best_perf = 0.0
    best_model = False
    for epoch in range(config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(
            config,
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            final_output_dir,
            tb_log_dir,
            writer_dict,
        )

        # evaluate on validation set
        perf_indicator = validate(
            config,
            valid_loader,
            valid_dataset,
            model,
            criterion,
            final_output_dir,
            tb_log_dir,
            writer_dict,
        )

        if perf_indicator > best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info("=> saving checkpoint to {}".format(final_output_dir))
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "model": get_model_name(config),
                "state_dict": model.state_dict(),
                "perf": perf_indicator,
                "optimizer": optimizer.state_dict(),
            },
            best_model,
            final_output_dir,
        )

    final_model_state_file = os.path.join(final_output_dir,
                                          "final_state.pth.tar")
    logger.info(
        "saving final model state to {}".format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict["writer"].close()
Beispiel #24
0
def main_worker(gpu, ngpus_per_node, args, final_output_dir, tb_log_dir):

    args.gpu = gpu
    args.rank = args.rank * ngpus_per_node + gpu
    print('Init process group: dist_url: {}, world_size: {}, rank: {}'.format(cfg.DIST_URL, args.world_size, args.rank))
    dist.init_process_group(backend=cfg.DIST_BACKEND, init_method=cfg.DIST_URL, world_size=args.world_size, rank=args.rank)

    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(cfg, is_train=True)
    logger.info(get_model_summary(model, torch.zeros(1, 3, *cfg.MODEL.IMAGE_SIZE)))

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'), final_output_dir)

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (cfg.MULTIPROCESSING_DISTRIBUTED and args.rank % ngpus_per_node == 0):
        dump_input = torch.rand((1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
        writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    if cfg.MODEL.SYNC_BN:
        model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
    
    torch.cuda.set_device(args.gpu)
    model.cuda(args.gpu)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])

    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda(args.gpu)

    # Data loading code
    train_dataset = eval('dataset.'+cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    )
    valid_dataset = eval('dataset.'+cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])
    )
    
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
        shuffle=(train_sampler is None),
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY,
        sampler=train_sampler
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU*len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        
        # train for one epoch
        train(cfg, train_loader, model, criterion, optimizer, epoch,
              final_output_dir, tb_log_dir, writer_dict)
        # In PyTorch 1.1.0 and later, you should call `lr_scheduler.step()` after `optimizer.step()`.
        lr_scheduler.step()

        # evaluate on validation set
        perf_indicator = validate(
            args, cfg, valid_loader, valid_dataset, model, criterion,
            final_output_dir, tb_log_dir, writer_dict
        )

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED
                and args.rank == 0
        ):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )

    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
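The example just above steps the scheduler after the training pass, following the PyTorch >= 1.1.0 convention that lr_scheduler.step() should come after optimizer.step(); most of the other examples still call it at the top of the epoch, which effectively skips the first learning rate. A stripped-down schematic of the recommended ordering (model, criterion and the loaders are placeholders):

for epoch in range(begin_epoch, end_epoch):
    for images, target in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(images), target)
        loss.backward()
        optimizer.step()   # update the weights first
    lr_scheduler.step()    # then advance the learning-rate schedule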
Beispiel #25
0
def main():
    args = parse_args()

    # cudnn related setting
    cudnn.benchmark = config.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = config.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = config.CUDNN.ENABLED

    model = eval('models.' + config.MODEL.NAME + '.get_nnb')(config)

    writer_dict = {
        'writer': SummaryWriter(log_dir='./output/facexray'),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    gpus = list(config.GPUS)
    model = torch.nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    criterion = Loss()

    optimizer = get_optimizer(config, model)

    last_epoch = config.TRAIN.BEGIN_EPOCH

    if isinstance(config.TRAIN.LR_STEP, list):
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
            last_epoch - 1)
    else:
        lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                       config.TRAIN.LR_STEP,
                                                       config.TRAIN.LR_FACTOR,
                                                       last_epoch - 1)

    # Data loading code
    # list_name is not separately specified in the .yaml file
    # the transform does not yet handle other input sizes; inputs should be [256, 256, 3]
    train_dataset = eval('dataset.' + config.DATASET.DATASET + '.' +
                         config.DATASET.DATASET)(
                             config.DATASET.ROOT, config.DATASET.TRAIN_SET,
                             None, transforms.Compose([transforms.ToTensor()]))

    valid_dataset = eval('dataset.' + config.DATASET.DATASET + '.' +
                         config.DATASET.DATASET)(config.DATASET.ROOT,
                                                 config.DATASET.TEST_SET, None,
                                                 transforms.Compose(
                                                     [transforms.ToTensor()]))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN.BATCH_SIZE_PER_GPU,
        shuffle=config.TRAIN.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.TEST.BATCH_SIZE_PER_GPU,
        shuffle=False,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY)

    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # freeze the original hrnet layer parameters for the first 50000 iterations, then train all parameters
        if epoch == 150000:
            for k, v in model.named_parameters():
                v.requires_grad = True

        # train for one epoch
        train(config, train_loader, model, criterion, optimizer, epoch,
              writer_dict)
        # evaluate on validation set
        validate(config, valid_loader, model, criterion, writer_dict)

    torch.save(model.module.state_dict(), './output/BI_dataset/faceXray.pth')
    writer_dict['writer'].close()
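The example above unfreezes parameters by flipping requires_grad after a fixed number of epochs. When some parameters start out frozen, the optimizer is usually built only over the trainable ones (and rebuilt, or given all parameters up front, once layers are unfrozen). A hedged sketch of such a helper (hypothetical, not the get_optimizer imported here):

import torch.optim as optim

def build_trainable_optimizer(model, lr=1e-3, momentum=0.9, weight_decay=1e-4):
    # only parameters with requires_grad=True receive updates
    trainable = [p for p in model.parameters() if p.requires_grad]
    return optim.SGD(trainable, lr=lr, momentum=momentum, weight_decay=weight_decay)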
Beispiel #26
0
    model = get_model(cfg, [l1_cls_num, l2_cls_num], device, logger)
    if cfg.TRAIN_STAGE == 2:
        last_stage_weight_path = os.path.join(model_dir, 'best_model_stage1.pth')
        load_weight(model, last_stage_weight_path)
        model.module.freeze_backbone()
        model.module.freeze_classifer(0)
    elif cfg.TRAIN_STAGE == 1:
        last_stage_weight_path = os.path.join(args.pretrained_path)
        load_weight(model, last_stage_weight_path)
        model.module.freeze_backbone()
        model.module.freeze_classifer(1)

    # load_pretrained_weight(model, args.pretrained_path)
    combiner = Combiner(cfg, device)
    optimizer = get_optimizer(cfg, model)
    scheduler = get_scheduler(cfg, optimizer)
    # ----- END MODEL BUILDER -----

    trainLoader = DataLoader(
        train_set,
        batch_size=cfg.TRAIN.BATCH_SIZE,
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.TRAIN.NUM_WORKERS,
        pin_memory=cfg.PIN_MEMORY,
        drop_last=True
    )

    validLoader = DataLoader(
        valid_set,
        batch_size=cfg.TEST.BATCH_SIZE,
Beispiel #27
0
def main_worker(
        gpu, ngpus_per_node, args, final_output_dir, tb_log_dir
):
    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    if cfg.FP16.ENABLED:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    if cfg.FP16.STATIC_LOSS_SCALE != 1.0:
        if not cfg.FP16.ENABLED:
            print("Warning:  if --fp16 is not used, static_loss_scale will be ignored.")

    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if cfg.MULTIPROCESSING_DISTRIBUTED:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        print('Init process group: dist_url: {}, world_size: {}, rank: {}'.
              format(args.dist_url, args.world_size, args.rank))
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=args.rank
        )

    update_config(cfg, args)

    # setup logger
    logger, _ = setup_logger(final_output_dir, args.rank, 'train')

    model = eval('models.'+cfg.MODEL.NAME+'.get_pose_net')(
        cfg, is_train=True
    )

    # copy model file
    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        this_dir = os.path.dirname(__file__)
        shutil.copy2(
            os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
            final_output_dir
        )

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    if not cfg.MULTIPROCESSING_DISTRIBUTED or (
            cfg.MULTIPROCESSING_DISTRIBUTED
            and args.rank % ngpus_per_node == 0
    ):
        dump_input = torch.rand(
            (1, 3, cfg.DATASET.INPUT_SIZE, cfg.DATASET.INPUT_SIZE)
        )
        writer_dict['writer'].add_graph(model, (dump_input, ))
        # logger.info(get_model_summary(model, dump_input, verbose=cfg.VERBOSE))

    if cfg.FP16.ENABLED:
        model = network_to_half(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            # args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu]
            )
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    loss_factory = MultiLossFactory(cfg).cuda()

    # Data loading code
    train_loader = make_dataloader(
        cfg, is_train=True, distributed=args.distributed
    )
    logger.info(train_loader.dataset)

    best_perf = -1
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)

    if cfg.FP16.ENABLED:
        optimizer = FP16_Optimizer(
            optimizer,
            static_loss_scale=cfg.FP16.STATIC_LOSS_SCALE,
            dynamic_loss_scale=cfg.FP16.DYNAMIC_LOSS_SCALE
        )

    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(
        final_output_dir, 'checkpoint.pth.tar')
    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    if cfg.FP16.ENABLED:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer.optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )
    else:
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, cfg.TRAIN.LR_STEP, cfg.TRAIN.LR_FACTOR,
            last_epoch=last_epoch
        )

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train one epoch
        do_train(cfg, model, train_loader, loss_factory, optimizer, epoch,
                 final_output_dir, tb_log_dir, writer_dict, fp16=cfg.FP16.ENABLED)

        perf_indicator = epoch
        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        if not cfg.MULTIPROCESSING_DISTRIBUTED or (
                cfg.MULTIPROCESSING_DISTRIBUTED
                and args.rank == 0
        ):
            logger.info('=> saving checkpoint to {}'.format(final_output_dir))
            save_checkpoint({
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(
        final_output_dir, 'final_state{}.pth.tar'.format(gpu)
    )

    logger.info('saving final model state to {}'.format(
        final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
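The FP16_Optimizer wrapper above applies loss scaling so that small fp16 gradients do not underflow, and keeps fp32 master weights internally. A conceptual sketch of static loss scaling only (model, criterion and the loader are placeholders; this is a simplification of what the wrapper does):

loss_scale = 128.0  # e.g. the value of cfg.FP16.STATIC_LOSS_SCALE

for images, targets in train_loader:
    optimizer.zero_grad()
    loss = criterion(model(images), targets)
    (loss * loss_scale).backward()      # scale the loss so fp16 gradients stay representable
    for p in model.parameters():
        if p.grad is not None:
            p.grad.div_(loss_scale)     # unscale gradients before the update
    optimizer.step()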
Beispiel #28
0
    def build_optimizer(self):
        for name, module in self.named_modules():
            optim_name = name.replace('net', 'optimizer')
            setattr(self, optim_name, utils.get_optimizer(self.opt, module))
def main():
    args = parse_args()
    update_config(cfg, args)

    logger, final_output_dir, tb_log_dir = create_logger(
        cfg, args.cfg, 'train')

    logger.info(pprint.pformat(args))
    logger.info(cfg)

    # cudnn related setting
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    # speeds up training while avoiding the nondeterminism of benchmark mode
    torch.backends.cudnn.deterministic = cfg.CUDNN.DETERMINISTIC
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    model = eval('models.' + cfg.MODEL.NAME + '.get_pose_net')(
        cfg, is_train=True)  # eval() evaluates a string expression and returns its value

    # copy model file
    this_dir = os.path.dirname(__file__)  # current file's directory
    shutil.copy2(
        os.path.join(this_dir, '../lib/models', cfg.MODEL.NAME + '.py'),
        final_output_dir)
    # logger.info(pprint.pformat(model))

    writer_dict = {
        'writer': SummaryWriter(log_dir=tb_log_dir),
        'train_global_steps': 0,
        'valid_global_steps': 0,
    }

    dump_input = torch.rand(
        (1, 3, cfg.MODEL.IMAGE_SIZE[1], cfg.MODEL.IMAGE_SIZE[0]))
    writer_dict['writer'].add_graph(model, (dump_input, ))

    logger.info(get_model_summary(model, dump_input))  # log the model summary

    model = torch.nn.DataParallel(model, device_ids=cfg.GPUS).cuda()
    #model = torch.nn.DataParallel(model, device_ids=[0]).cuda()
    # multi-GPU training
    # define loss function (criterion) and optimizer
    criterion = JointsMSELoss(
        use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()
    regress_loss = RegLoss(use_target_weight=cfg.LOSS.USE_TARGET_WEIGHT).cuda()
    # Data loading code
    normalize = transforms.Normalize(
        # normalize with the ImageNet mean and standard deviation
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])
    train_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TRAIN_SET, True,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))
    valid_dataset = eval('dataset.' + cfg.DATASET.DATASET)(
        cfg, cfg.DATASET.ROOT, cfg.DATASET.TEST_SET, False,
        transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ]))  # image preprocessing

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=cfg.TRAIN.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=cfg.TRAIN.SHUFFLE,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=cfg.TEST.BATCH_SIZE_PER_GPU * len(cfg.GPUS),
        shuffle=False,
        num_workers=cfg.WORKERS,
        pin_memory=cfg.PIN_MEMORY,
    )

    best_perf = 0.0
    best_model = False
    last_epoch = -1
    optimizer = get_optimizer(cfg, model)
    begin_epoch = cfg.TRAIN.BEGIN_EPOCH
    checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth')

    if cfg.AUTO_RESUME and os.path.exists(checkpoint_file):
        logger.info("=> loading checkpoint '{}'".format(checkpoint_file))
        checkpoint = torch.load(checkpoint_file)
        begin_epoch = checkpoint['epoch']
        best_perf = checkpoint['perf']
        last_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])

        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file, checkpoint['epoch']))

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        cfg.TRAIN.LR_STEP,
                                                        cfg.TRAIN.LR_FACTOR,
                                                        last_epoch=last_epoch)

    for epoch in range(begin_epoch, cfg.TRAIN.END_EPOCH):
        lr_scheduler.step()

        # train for one epoch
        train(cfg, train_loader, model, criterion, regress_loss, optimizer,
              epoch, final_output_dir, tb_log_dir, writer_dict)

        # evaluate on validation set
        perf_indicator = validate(cfg, valid_loader, valid_dataset, model,
                                  criterion, regress_loss, final_output_dir,
                                  tb_log_dir, writer_dict)

        if perf_indicator >= best_perf:
            best_perf = perf_indicator
            best_model = True
        else:
            best_model = False

        logger.info('=> saving checkpoint to {}'.format(final_output_dir))
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': cfg.MODEL.NAME,
                'state_dict': model.state_dict(),
                'best_state_dict': model.module.state_dict(),
                'perf': perf_indicator,
                'optimizer': optimizer.state_dict(),
            }, best_model, final_output_dir)

    final_model_state_file = os.path.join(final_output_dir, 'final_state.pth')
    logger.info(
        '=> saving final model state to {}'.format(final_model_state_file))
    torch.save(model.module.state_dict(), final_model_state_file)
    writer_dict['writer'].close()
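This last variant adds a RegLoss term on top of the heatmap criterion; its definition is not shown here. A hedged sketch of one plausible form, a coordinate-regression loss with per-joint target weights (the actual RegLoss may differ):

import torch
import torch.nn as nn

class RegLoss(nn.Module):
    """Hypothetical coordinate-regression loss with per-joint weights."""

    def __init__(self, use_target_weight=True):
        super().__init__()
        self.use_target_weight = use_target_weight

    def forward(self, pred_coords, gt_coords, target_weight):
        # pred_coords, gt_coords: (batch, num_joints, 2); target_weight: (batch, num_joints, 1)
        diff = torch.abs(pred_coords - gt_coords)
        if self.use_target_weight:
            diff = diff * target_weight
        return diff.mean()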
Beispiel #30
0
        shuffle=config.TEST.SHUFFLE,
        num_workers=config.WORKERS,
        pin_memory=config.PIN_MEMORY,
    )
    # get device
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(config.GPUID))
    else:
        device = torch.device("cpu:0")

    model = crnn.get_crnn(config)
    model = model.to(device)
    model_info(model)
    print(model)

    optimizer = get_optimizer(config, model)
    last_epoch = config.TRAIN.BEGIN_EPOCH
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, config.TRAIN.LR_STEP, config.TRAIN.LR_FACTOR,
        last_epoch - 1)
    if config.ATTENTION.ENABLE:
        criterion = torch.nn.NLLLoss()
    else:
        criterion = torch.nn.CTCLoss()
    # training
    best_acc = 0.0
    for epoch in range(last_epoch, config.TRAIN.END_EPOCH):
        model.train()
        for i, (inp, idx) in enumerate(train_loader):
            # forward pass and loss computation
            inp = inp.to(device)