Ejemplo n.º 1
0
def main(cfg, _log):
    """Train an optical-flow model with all non-"pyramid" layers frozen.

    Builds train/valid datasets and loaders, clamps the configured epoch
    and validation sizes to what is actually available, constructs the
    model/loss/trainer, freezes every parameter whose name does not
    contain "pyramid", then runs training.

    Args:
        cfg: configuration object (reads .seed, .train.*, .model, .loss,
             .trainer, .save_root).
        _log: logger used for progress messages.
    """
    init_seed(cfg.seed)

    _log.info("=> fetching img pairs.")
    train_set, valid_set = get_dataset(cfg)

    _log.info('{} samples found, {} train samples and {} test samples '.format(
        len(valid_set) + len(train_set), len(train_set), len(valid_set)))

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=cfg.train.batch_size,
                                               num_workers=cfg.train.workers,
                                               pin_memory=True,
                                               shuffle=True)

    max_test_batch = 4
    # isinstance (not an exact type() comparison) so ConcatDataset
    # subclasses take the per-sub-dataset path as well.
    if isinstance(valid_set, torch.utils.data.ConcatDataset):
        # One loader per sub-dataset; validation size is their total length.
        valid_loader = [
            torch.utils.data.DataLoader(s,
                                        batch_size=min(max_test_batch,
                                                       cfg.train.batch_size),
                                        num_workers=min(4, cfg.train.workers),
                                        pin_memory=True,
                                        shuffle=False)
            for s in valid_set.datasets
        ]
        valid_size = sum(len(loader) for loader in valid_loader)
    else:
        valid_loader = torch.utils.data.DataLoader(
            valid_set,
            batch_size=min(max_test_batch, cfg.train.batch_size),
            num_workers=min(4, cfg.train.workers),
            pin_memory=True,
            shuffle=False)
        valid_size = len(valid_loader)

    # 0 means "use everything"; afterwards clamp any configured value to
    # what the loaders can actually provide.
    if cfg.train.epoch_size == 0:
        cfg.train.epoch_size = len(train_loader)
    if cfg.train.valid_size == 0:
        cfg.train.valid_size = valid_size
    cfg.train.epoch_size = min(cfg.train.epoch_size, len(train_loader))
    cfg.train.valid_size = min(cfg.train.valid_size, valid_size)

    model = get_model(cfg.model)
    loss = get_loss(cfg.loss)
    trainer = get_trainer(cfg.trainer)(train_loader, valid_loader, model, loss,
                                       _log, cfg.save_root, cfg.train)

    # Freeze everything except the "pyramid" layers, which stay trainable.
    for name, param in model.named_parameters():
        if "pyramid" not in name:
            param.requires_grad = False
        else:
            _log.info('trainable: {} (requires_grad={})'.format(
                name, param.requires_grad))

    epoch, weights = load_checkpoint('checkpoints/Sintel/pwclite_ar.tar')
    # NOTE(review): the loaded weights are never applied to the model — a
    # model.load_state_dict(...) call looks missing here; confirm intent.

    trainer.model = model
    trainer.train()
Ejemplo n.º 2
0
def main(cfg, _log):
    """Train a model end-to-end: build datasets/loaders, clamp the epoch
    and validation sizes to the available data, then run the trainer.

    Args:
        cfg: configuration object (reads .seed, .train.*, .model, .loss,
             .trainer, .save_root).
        _log: logger used for progress messages.
    """
    init_seed(cfg.seed)

    _log.info("=> fetching img pairs.")
    train_set, valid_set = get_dataset(cfg)

    _log.info('{} samples found, {} train samples and {} test samples '.format(
        len(valid_set) + len(train_set), len(train_set), len(valid_set)))

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=cfg.train.batch_size,
                                               num_workers=cfg.train.workers,
                                               pin_memory=True,
                                               shuffle=True)

    max_test_batch = 4
    # isinstance (not an exact type() comparison) so ConcatDataset
    # subclasses take the per-sub-dataset path as well.
    if isinstance(valid_set, torch.utils.data.ConcatDataset):
        # One loader per sub-dataset; validation size is their total length.
        valid_loader = [
            torch.utils.data.DataLoader(s,
                                        batch_size=min(max_test_batch,
                                                       cfg.train.batch_size),
                                        num_workers=min(4, cfg.train.workers),
                                        pin_memory=True,
                                        shuffle=False)
            for s in valid_set.datasets
        ]
        valid_size = sum(len(loader) for loader in valid_loader)
    else:
        valid_loader = torch.utils.data.DataLoader(
            valid_set,
            batch_size=min(max_test_batch, cfg.train.batch_size),
            num_workers=min(4, cfg.train.workers),
            pin_memory=True,
            shuffle=False)
        valid_size = len(valid_loader)

    # 0 means "use everything"; afterwards clamp any configured value to
    # what the loaders can actually provide.
    if cfg.train.epoch_size == 0:
        cfg.train.epoch_size = len(train_loader)
    if cfg.train.valid_size == 0:
        cfg.train.valid_size = valid_size
    cfg.train.epoch_size = min(cfg.train.epoch_size, len(train_loader))
    cfg.train.valid_size = min(cfg.train.valid_size, valid_size)

    model = get_model(cfg.model)
    loss = get_loss(cfg.loss)
    trainer = get_trainer(cfg.trainer)(train_loader, valid_loader, model, loss,
                                       _log, cfg.save_root, cfg.train)

    trainer.train()
Ejemplo n.º 3
0
def worker(id, cfg, shared):
    """Per-process training worker.

    Sets up a per-run logger, optionally joins a distributed process
    group, builds model/loss/trainer, then trains or evaluates. Teardown
    of the process group runs even if the run raises.
    """
    # Logger named after the current timestamp so each run gets its own file.
    timestamp = datetime.datetime.now().strftime("%y%m%d%H%M%S")
    _log = init_logger(log_dir=cfg.save_root, filename=timestamp[6:] + '.log')
    is_rank_zero = id == 0
    if is_rank_zero:
        _log.info(id, '=> will save everything to {}'.format(cfg.save_root))

    # Dump the full configuration once, from rank 0 only.
    if is_rank_zero:
        _log.info(id, '=> configurations \n ' + pprint.pformat(cfg))

    # Distributed setup: NCCL when GPUs are in use, Gloo otherwise.
    if cfg.mp.enabled:
        backend = "nccl" if cfg.train.n_gpu > 0 else "gloo"
        dist.init_process_group(backend=backend,
                                init_method="env://",
                                world_size=cfg.mp.workers,
                                rank=id)

    # Model, loss, and trainer for this worker.
    model = get_model(cfg, id)
    loss = get_loss(cfg, id)
    trainer = get_trainer(cfg)(id, model, loss, _log, cfg.save_root, cfg,
                               shared)

    # Run; swallow failures (after printing them) so teardown still happens.
    try:
        run = trainer.eval if cfg.eval else trainer.train
        run()
    except Exception:
        import traceback
        traceback.print_exc()

    # Leave the process group cleanly.
    if cfg.mp.enabled:
        dist.destroy_process_group()
Ejemplo n.º 4
0
 def __init__(self,
              n_class,
              batch_size,
              instances,
              embedding_size=128,
              pretrained=True,
              device="cuda:0"):
     """Build the NDfdml model components.

     Args:
         n_class: declared number of classes. NOTE(review): currently
             unused — self.n_class is recomputed as batch_size //
             instances below; confirm intent.
         batch_size: total samples per batch; must be divisible by
             instances.
         instances: samples per class within a batch.
         embedding_size: output embedding dimensionality.
         pretrained: whether the backbone loads pretrained weights.
         device: torch device spec for the layers (new keyword; defaults
             to the previously hard-coded "cuda:0", so existing callers
             are unaffected).

     Raises:
         ValueError: if batch_size is not a multiple of instances.
     """
     super(NDfdml, self).__init__()
     # Explicit exception instead of assert, which is stripped under -O.
     if batch_size % instances != 0:
         raise ValueError('batch_size must be divisible by instances')
     device = torch.device(device)
     self.batch_size = batch_size
     self.embedding_size = embedding_size
     self.instances = instances
     self.n_class = batch_size // instances
     self.googlelayer = get_feature(pretrained).to(device)
     self.embedding_layer = get_embedding(
         dim=1000, embedding_size=embedding_size).to(device)
     self.dataset_metricloss = get_loss('Triplet')
     self.loss_fn = nd_loss.weight_nd_Loss(self.batch_size, self.instances)
Ejemplo n.º 5
0
    def train(self):
        """Run the metric-learning training loop for self.iteration steps.

        Iterates epochs over self.data_loaders['train'], optionally
        refreshing class centers for center-mining sampling every
        self.update_epoch epochs, and prints a loss/triplet summary every
        self.show_iter iterations plus a timing breakdown at the end.

        Returns:
            The trained model (self.model).
        """
        since = time.time()
        start = time.time()

        # NOTE(review): scheduler.step() runs exactly once, before any
        # optimizer.step() — confirm this is intended (it is not the
        # usual per-epoch placement).
        self.scheduler.step()
        self.model.train()

        running_iter = 0
        running_loss = 0.0
        running_count = 0
        running_epoch = 0

        pred_time = 0.0
        opt_time = 0.0
        load_time = 0.0

        # The loss does not change between batches; hoist its construction
        # out of the hot loop (was rebuilt via get_loss every iteration).
        loss_fn = get_loss(self.method)

        print('Start training')
        while running_iter < self.iteration:

            # Periodically refresh class centers for center-mining sampling.
            if self.cm and running_epoch % self.update_epoch == 0:
                embeddings, labels, spend = self.feed_embeddings('mean')
                centers = class_centers(embeddings, labels)
                self.cm_sampler.update_centers(centers, running_epoch)
                start += spend  # exclude embedding time from the ETA clock

            # Train an epoch
            t0 = time.time()
            for sample in self.data_loaders['train']:
                # ">=" (was ">") so exactly self.iteration steps run
                # instead of one extra when the limit is hit mid-epoch.
                if running_iter >= self.iteration:
                    break
                inputs = sample['image'].to(self.device)
                labels = sample['label'].to(self.device)
                self.optimizer.zero_grad()
                with torch.set_grad_enabled(True):
                    t1 = time.time()
                    outputs = self.model(inputs)
                    t2 = time.time()
                    loss, count = loss_fn(outputs, labels)

                    loss.backward()
                    t3 = time.time()
                    self.optimizer.step()

                load_time += t1 - t0
                pred_time += t2 - t1
                opt_time += t3 - t2

                running_loss += loss.item() / count
                running_count += count
                if (running_iter + 1) % self.show_iter == 0:
                    print(
                        'Iteration {}/{} Loss {:.4f} Triplets: {:.0f} Spending {:.0f}s'
                        .format(running_iter + 1, self.iteration,
                                running_loss / self.show_iter,
                                running_count / self.show_iter,
                                time.time() - start))
                    running_loss = 0.0
                    running_count = 0
                    start = time.time()

                running_iter += 1
                # Restart the load timer here: t0 was previously set only
                # once per epoch, so load_time accumulated against a stale
                # timestamp and overcounted data-loading time.
                t0 = time.time()
            running_epoch += 1

        time_elapsed = time.time() - since
        print('pred:{:.0f} opt:{:.0f} load:{:.0f}'.format(
            pred_time, opt_time, load_time))
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        return self.model