Example #1
    def optimize_epoch(self, optimizer, loader, epoch, validation=False):
        logger.warning(f"Starting epoch {epoch}, validation: {validation} " + "="*30)

        loss_value = util.AverageMeter()
        # housekeeping
        self.model.train()
        if self.lr_schedule(epoch + 1) != self.lr_schedule(epoch):
            files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                                      optimizer, self.L, epoch, lowest=False, save_str='pre-lr-drop')
        lr = self.lr_schedule(epoch)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        XE = torch.nn.CrossEntropyLoss()
        for iter, (data, label, selected) in tqdm(enumerate(loader), desc="epoch={}/{}".format(epoch, args.epochs)):
            now = time.time()
            niter = epoch * len(loader) + iter

            if niter*args.batch_size >= self.optimize_times[-1]:
                ############ optimize labels #########################################
                self.model.headcount = 1
                logger.warning('Optimization starting')
                with torch.no_grad():
                    _ = self.optimize_times.pop()
                    self.optimize_labels(niter)
            data = data.to(self.dev)
            mass = data.size(0)
            final = self.model(data)
            #################### train CNN ####################################################
            if self.hc == 1:
                loss = XE(final, self.L[0, selected])
            else:
                loss = torch.mean(torch.stack([XE(final[h],
                                                  self.L[h, selected]) for h in range(self.hc)]))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_value.update(loss.item(), mass)
            data = 0

            # some logging stuff ##############################################################
            if iter % args.log_iter == 0:
                if self.writer:
                    self.writer.add_scalar('lr', self.lr_schedule(epoch), niter)

                    logger.info("{} Loss: {:.3f}".format(niter, loss.item()))
                    logger.info("{} Freq: {:.2f}".format(niter, mass / (time.time() - now)))
                    self.writer.add_scalar('Loss', loss.item(), niter)
                    if iter > 0:
                        self.writer.add_scalar('Freq(Hz)', mass / (time.time() - now), niter)


        # end of epoch logging ################################################################
        if self.writer and (epoch % args.log_intv == 0):
            util.write_conv(self.writer, self.model, epoch=epoch)

        files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                                  optimizer,  self.L, epoch, lowest=False)

        return {'loss': loss_value.avg}
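
The multi-head branch in optimize_epoch averages one cross-entropy term per head. A minimal standalone sketch of that pattern with dummy tensors (the head count, batch size and class count here are hypothetical):

import torch

# Hypothetical sizes: 2 heads, batch of 4, 10 pseudo-classes per head.
hc, batch, ncl = 2, 4, 10
XE = torch.nn.CrossEntropyLoss()

# `final` stands in for the per-head model outputs, `L` for self.L[h, selected].
final = [torch.randn(batch, ncl, requires_grad=True) for _ in range(hc)]
L = torch.randint(0, ncl, (hc, batch))

# Average the per-head cross-entropy terms, as in the hc > 1 branch above.
loss = torch.mean(torch.stack([XE(final[h], L[h]) for h in range(hc)]))
loss.backward()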
Example #2
    def optimize(self):
        """Perform full optimization."""
        first_epoch = 0
        self.model = self.model.to(self.dev)
        N = len(self.pseudo_loader.dataset)
        # optimization times (spread exponentially), can also just be linear in practice (i.e. every n-th epoch)
        self.optimize_times = [(self.num_epochs+2)*N] + \
                              ((self.num_epochs+1.01)*N*(np.linspace(0, 1, args.nopts)**2)[::-1]).tolist()

        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, self.model.parameters()),
                                    weight_decay=self.weight_decay,
                                    momentum=self.momentum,
                                    lr=self.lr)

        if self.checkpoint_dir is not None and self.resume:
            self.L, first_epoch = files.load_checkpoint_all(self.checkpoint_dir, self.model, optimizer)
            logger.warning('found first epoch to be {}'.format(first_epoch))
            include = [(qq/N >= first_epoch) for qq in self.optimize_times]
            self.optimize_times = (np.array(self.optimize_times)[include]).tolist()
        logger.warning('We will optimize L at epochs: {}'.format([np.round(1.0 * t / N, 2) for t in self.optimize_times]))

        if first_epoch == 0:
            # initiate labels as shuffled.
            self.L = np.zeros((self.hc, N), dtype=np.int32)
            for nh in range(self.hc):
                for _i in range(N):
                    self.L[nh, _i] = _i % self.outs[nh]
                self.L[nh] = np.random.permutation(self.L[nh])
            self.L = torch.LongTensor(self.L).to(self.dev)

        # Perform optimization ###############################################################
        lowest_loss = 1e9
        epoch = first_epoch
        while epoch < (self.num_epochs+1):
            m = self.optimize_epoch(optimizer, self.train_loader, epoch,
                                    validation=False)
            if m['loss'] < lowest_loss:
                lowest_loss = m['loss']
                files.save_checkpoint_all(self.checkpoint_dir, self.model, args.arch,
                                          optimizer, self.L, epoch, lowest=True)
            epoch += 1
        logger.info(f"optimization completed. Saving model to {os.path.join(self.checkpoint_dir,'model_final.pth.tar')}")
        torch.save(self.model, os.path.join(self.checkpoint_dir, 'model_final.pth.tar'))
        return self.model
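
The schedule built above spaces the pseudo-label optimizations quadratically, so the updates are frequent early in training and become sparser toward the end; as the comment in optimize() notes, a linear spacing (every n-th epoch) also works. A small sketch comparing the two spacings under hypothetical values for the epoch count, dataset size and number of optimizations:

import numpy as np

num_epochs, N, nopts = 100, 50000, 10  # hypothetical values

# Quadratic spacing, as in optimize(): thresholds cluster near the start of training.
quadratic = ((num_epochs + 1.01) * N * (np.linspace(0, 1, nopts) ** 2)[::-1]).tolist()
# Linear alternative mentioned in the comment: evenly spaced updates.
linear = ((num_epochs + 1.01) * N * np.linspace(0, 1, nopts)[::-1]).tolist()

# Shown in "epoch units", mirroring the logger.warning call above.
print([round(t / N, 2) for t in quadratic])
print([round(t / N, 2) for t in linear])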
Example #3
    def eval_k_s(K_, sigma_):
        total = 0
        top1 = 0.
        top5 = 0.

        with torch.no_grad():
            retrieval_one_hot = torch.zeros(K_, C)  # .cuda()
            for batch_idx, (inputs, targets, _) in enumerate(testloader):
                targets = targets  # .cuda(async=True) # or without async for py3.7
                inputs = inputs.cuda()
                batchSize = inputs.size(0)
                features = net(inputs)
                if use_pca:
                    features = pca.transform(features.cpu().numpy())
                    features = torch.Tensor(features).cuda()
                features = normalize(features).cpu()

                dist = torch.mm(features, trainFeatures)

                yd, yi = dist.topk(K_, dim=1, largest=True, sorted=True)
                candidates = trainLabels.view(1, -1).expand(batchSize, -1)
                retrieval = torch.gather(candidates, 1, yi)

                retrieval_one_hot.resize_(batchSize * K_, C).zero_()
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                yd_transform = yd.clone().div_(sigma_).exp_()
                probs = torch.sum(
                    torch.mul(retrieval_one_hot.view(batchSize, -1, C),
                              yd_transform.view(batchSize, -1, 1)), 1)
                _, predictions = probs.sort(1, True)

                # Find which predictions match the target
                correct = predictions.eq(targets.data.view(-1, 1))

                top1 = top1 + correct.narrow(1, 0, 1).sum().item()
                top5 = top5 + correct.narrow(1, 0, 5).sum().item()

                total += targets.size(0)

        logger.warning(f"{K_}-NN,s={sigma_}: TOP1: {top1 * 100. / total}")
        return top1 / total
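
The heart of eval_k_s is the exponentially weighted class vote over the K_ retrieved neighbours. A self-contained sketch of that voting step on dummy tensors (batch size, neighbour count, class count and sigma are hypothetical):

import torch

batchSize, K_, C, sigma_ = 2, 3, 5, 0.1  # hypothetical sizes

yd = torch.rand(batchSize, K_)                    # similarities of the K_ nearest neighbours
retrieval = torch.randint(0, C, (batchSize, K_))  # labels of those neighbours

# One-hot encode the neighbour labels, weight each vote by exp(similarity / sigma), sum per class.
retrieval_one_hot = torch.zeros(batchSize * K_, C)
retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
yd_transform = yd.clone().div_(sigma_).exp_()
probs = torch.sum(retrieval_one_hot.view(batchSize, -1, C) * yd_transform.view(batchSize, -1, 1), 1)
_, predictions = probs.sort(1, True)  # predictions[:, 0] is the top-1 class per sample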
Example #4
    def eval_epoch(self, d_loader):
        self.model.eval()

        eval_dict = {}
        total_loss = 0.0
        count = 1.0
        for i, data in tqdm.tqdm(
                enumerate(d_loader, 0),
                total=len(d_loader),
                leave=False,
                desc='val'):
            self.optimizer.zero_grad()

            _, loss, eval_res = self.model_fn(self.model, data, eval=True)

            total_loss += loss.item()
            count += 1
            for k, v in eval_res.items():
                if v is not None:
                    eval_dict[k] = eval_dict.get(k, []) + [v]

        logger.warning("evaluation loss={}, result={}".format(total_loss / count, eval_dict))
        return total_loss / count, eval_dict
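
eval_epoch returns the averaged loss together with per-batch metric lists keyed by name; callers reduce those lists with np.mean (as the train loop in Example #11 does for 'acc' and 'loss'). A small sketch of that contract with made-up numbers:

import numpy as np

# What eval_epoch might return for a three-batch loader (values are hypothetical).
val_loss = 0.42
eval_dict = {'acc': [0.81, 0.79, 0.83], 'loss': [0.45, 0.40, 0.41]}

mean_acc = np.mean(eval_dict['acc'])
mean_loss = np.mean(eval_dict['loss'])
print("val_loss={:.3f}, acc={:.3f}, loss={:.3f}".format(val_loss, mean_acc, mean_loss))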
Example #5
def model_summary(model_list):
    if not isinstance(model_list, list):
        model_list = [model_list]

    from functools import reduce
    from operator import mul

    for model in model_list:
        data = []
        trainable_param_num = 0
        all_param_num = 0
        for key, value in model.named_parameters():
            data.append([
                key,
                list(value.size()), value.requires_grad, value.dtype,
                value.device, value.is_leaf,
                str(value.grad_fn)
            ])
            _num = reduce(mul, list(value.size()), 1)
            all_param_num += _num
            if value.requires_grad:
                trainable_param_num += _num
        table = tabulate(data,
                         headers=[
                             "name", "shape", "requires_grad", "dtype",
                             "device", "is_leaf", "grad_fn"
                         ])
        logger.warning(
            " Arg Parameters: #param={}, #param(trainable) = {}".format(
                all_param_num, trainable_param_num))
        logger.info(colored(
            "Model Summary",
            "cyan",
        ))
        logger.info("\n\n" + table)

        logger.info(model)

    # Note: summaries are logged for every model, but the counts of the last one are returned.
    return all_param_num, trainable_param_num
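
A quick usage sketch for model_summary on a toy module (the model here is hypothetical; tabulate, termcolor's colored and the logger are assumed to be available at module level, as the function above expects):

import torch.nn as nn

# Hypothetical toy model to summarize; a list of models is also accepted.
toy = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))
all_param_num, trainable_param_num = model_summary(toy)
print(all_param_num, trainable_param_num)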
Example #6
def load_checkpoint(model=None, optimizer=None, filename='checkpoint'):
    filename = "{}.pth.tar".format(filename)
    
    if os.path.isfile(filename):
        print("==> Loading from checkpoint '{}'".format(filename))
        checkpoint = torch.load(filename)
        epoch = checkpoint['epoch']
        it = checkpoint.get('it', 0.0)
        best_prec = checkpoint['best_prec']
        logger.warning("checkpoint it:{}, best_prec:{}".format(it,best_prec))
        if model is not None and checkpoint['model_state'] is not None:
            logger.warning("load model_state")
            model.load_state_dict(checkpoint['model_state'])
        if optimizer is not None and checkpoint['optimizer_state'] is not None:
            logger.warning("load optimizer_state")
            optimizer.load_state_dict(checkpoint['optimizer_state'])


        print("==> Done")
        return it, epoch, best_prec
    else:
        logger.warning("==> Checkpoint '{}' not found".format(filename))
        raise FileNotFoundError(filename)
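
load_checkpoint expects the file to contain a dict with the keys read above: 'epoch', 'it', 'best_prec', 'model_state' and 'optimizer_state'. A hedged sketch of a matching save helper (this function is illustrative, not necessarily the project's own save routine), followed by how the loader would be called:

import torch

def save_checkpoint_sketch(model, optimizer, epoch, it, best_prec, filename='checkpoint'):
    # Mirror the keys that load_checkpoint reads back.
    state = {
        'epoch': epoch,
        'it': it,
        'best_prec': best_prec,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
    }
    torch.save(state, "{}.pth.tar".format(filename))

# Later, to resume:
# it, epoch, best_prec = load_checkpoint(model, optimizer, filename='checkpoint')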
Example #7
                              pytorchgo_args.get_args().epochs))
            #optimizer_summary(optimizer)

    cpu_prototype = model.prototype_N2K.detach().cpu().numpy()
    return cpu_prototype


optimizer_summary(optimizer)
model_summary(model)

pytorchgo_args.get_args().step = 0
for epoch in range(start_epoch, start_epoch + args.epochs):
    if args.debug and epoch >= 2: break
    prototype = train(epoch)
    feature_return_switch(model, True)
    logger.warning(logger.get_logger_dir())
    logger.warning("doing KNN evaluation.")
    acc = kNN(model, trainloader, testloader, K=10, sigma=0.1, dim=knn_dim)
    logger.warning("finish KNN evaluation.")
    feature_return_switch(model, False)
    if acc > best_acc:
        logger.info('get better result, saving..')
        state = {
            'net': model.state_dict(),
            'acc': acc,
            'epoch': epoch,
            'opt': optimizer.state_dict(),
            'prototype': prototype,
        }
        torch.save(state, os.path.join(logger.get_logger_dir(),
                                       'best_ckpt.t7'))
Example #8
def kNN(net, trainloader, testloader, K, sigma=0.1, dim=128, use_pca=False):
    net.eval()
    # this part is ugly but made to be backwards-compatible. there was a change in cifar dataset's structure.
    if hasattr(trainloader.dataset, 'imgs'):
        trainLabels = torch.LongTensor(
            [y for (p, y) in trainloader.dataset.imgs])  # .cuda()
    elif hasattr(trainloader.dataset, 'indices'):
        trainLabels = torch.LongTensor([
            k for path, k in trainloader.dataset.dataset.dt.imgs
        ])[trainloader.dataset.indices]
    elif hasattr(trainloader.dataset, 'train_labels'):
        trainLabels = torch.LongTensor(
            trainloader.dataset.train_labels)  # .cuda()
    elif hasattr(trainloader.dataset, 'dt'):
        if hasattr(trainloader.dataset.dt, 'targets'):
            trainLabels = torch.LongTensor(
                trainloader.dataset.dt.targets)  # .cuda()
        else:  #  hasattr(trainloader.dataset.dt, 'imgs'):
            trainLabels = torch.LongTensor(
                [k for path, k in trainloader.dataset.dt.imgs])  # .cuda()
    else:
        trainLabels = torch.LongTensor(trainloader.dataset.targets)  # .cuda()
    C = trainLabels.max() + 1

    if hasattr(trainloader.dataset, 'transform'):
        transform_bak = trainloader.dataset.transform
        trainloader.dataset.transform = testloader.dataset.transform
    elif hasattr(trainloader.dataset.dataset.dt, 'transform'):
        transform_bak = trainloader.dataset.dataset.dt.transform
        trainloader.dataset.dataset.dt.transform = testloader.dataset.dt.transform
    else:
        transform_bak = trainloader.dataset.dt.transform
        trainloader.dataset.dt.transform = testloader.dataset.dt.transform

    temploader = torch.utils.data.DataLoader(trainloader.dataset,
                                             batch_size=64,
                                             num_workers=1)
    if hasattr(trainloader.dataset, 'indices'):
        LEN = len(trainloader.dataset.indices)
    else:
        LEN = len(trainloader.dataset)

    trainFeatures = torch.zeros((dim, LEN))  # , device='cuda:0')
    normalize = Normalize()
    for batch_idx, (inputs, targets, _) in enumerate(temploader):
        if pytorchgo_args.get_args().debug and batch_idx > 1: break
        batchSize = inputs.size(0)
        inputs = inputs.cuda()
        features = net(inputs)
        if not use_pca:
            features = normalize(features)
        trainFeatures[:, batch_idx * batchSize:batch_idx * batchSize +
                      batchSize] = features.data.t().cpu()
    if hasattr(temploader.dataset, 'imgs'):
        trainLabels = torch.LongTensor(
            temploader.dataset.train_labels)  # .cuda()
    elif hasattr(temploader.dataset, 'indices'):
        trainLabels = torch.LongTensor([
            k for path, k in temploader.dataset.dataset.dt.imgs
        ])[temploader.dataset.indices]
    elif hasattr(temploader.dataset, 'train_labels'):
        trainLabels = torch.LongTensor(
            temploader.dataset.train_labels)  # .cuda()
    elif hasattr(temploader.dataset, 'targets'):
        trainLabels = torch.LongTensor(temploader.dataset.targets)  # .cuda()
    elif hasattr(temploader.dataset.dt, 'imgs'):
        trainLabels = torch.LongTensor(
            [k for path, k in temploader.dataset.dt.imgs])  #.cuda()
    elif hasattr(temploader.dataset.dt, 'targets'):
        trainLabels = torch.LongTensor(temploader.dataset.dt.targets)  #.cuda()
    else:
        trainLabels = torch.LongTensor(temploader.dataset.labels)  #.cuda()
    trainLabels = trainLabels.cpu()
    if hasattr(trainloader.dataset, 'transform'):
        trainloader.dataset.transform = transform_bak
    elif hasattr(trainloader.dataset, 'indices'):
        trainloader.dataset.dataset.dt.transform = transform_bak
    else:
        trainloader.dataset.dt.transform = transform_bak

    if use_pca:
        comps = 128
        logger.warning('doing PCA with {} components'.format(comps))
        from sklearn.decomposition import PCA
        pca = PCA(n_components=comps, whiten=False)
        trainFeatures = pca.fit_transform(trainFeatures.numpy().T)
        trainFeatures = torch.Tensor(trainFeatures)
        trainFeatures = normalize(trainFeatures).t()
        logger.warning('..done')

    def eval_k_s(K_, sigma_):
        total = 0
        top1 = 0.
        top5 = 0.

        with torch.no_grad():
            retrieval_one_hot = torch.zeros(K_, C)  # .cuda()
            for batch_idx, (inputs, targets, _) in enumerate(testloader):
                targets = targets  # .cuda(async=True) # or without async for py3.7
                inputs = inputs.cuda()
                batchSize = inputs.size(0)
                features = net(inputs)
                if use_pca:
                    features = pca.transform(features.cpu().numpy())
                    features = torch.Tensor(features).cuda()
                features = normalize(features).cpu()

                dist = torch.mm(features, trainFeatures)

                yd, yi = dist.topk(K_, dim=1, largest=True, sorted=True)
                candidates = trainLabels.view(1, -1).expand(batchSize, -1)
                retrieval = torch.gather(candidates, 1, yi)

                retrieval_one_hot.resize_(batchSize * K_, C).zero_()
                retrieval_one_hot.scatter_(1, retrieval.view(-1, 1), 1)
                yd_transform = yd.clone().div_(sigma_).exp_()
                probs = torch.sum(
                    torch.mul(retrieval_one_hot.view(batchSize, -1, C),
                              yd_transform.view(batchSize, -1, 1)), 1)
                _, predictions = probs.sort(1, True)

                # Find which predictions match the target
                correct = predictions.eq(targets.data.view(-1, 1))

                top1 = top1 + correct.narrow(1, 0, 1).sum().item()
                top5 = top5 + correct.narrow(1, 0, 5).sum().item()

                total += targets.size(0)

        logger.warning(f"{K_}-NN,s={sigma_}: TOP1: {top1 * 100. / total}")
        return top1 / total

    if isinstance(K, list):
        res = []
        for K_ in K:
            for sigma_ in sigma:
                res.append(eval_k_s(K_, sigma_))
        return res
    else:
        res = eval_k_s(K, sigma)
        return res
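
kNN temporarily swaps the training set's augmentation transform for the (deterministic) test transform while it fills the feature memory bank, then restores it afterwards. A minimal sketch of that swap pattern in isolation (the datasets here are hypothetical stand-ins):

import torchvision.transforms as T
from torchvision.datasets import FakeData

# Stand-ins for trainloader.dataset and testloader.dataset.
train_set = FakeData(size=8, transform=T.Compose([T.RandomCrop(24), T.ToTensor()]))
test_set = FakeData(size=8, transform=T.ToTensor())

transform_bak = train_set.transform        # back up the training augmentations
train_set.transform = test_set.transform   # extract features without augmentation
# ... build trainFeatures from train_set here ...
train_set.transform = transform_bak        # restore the original pipeline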
Example #9
    try:
        args.device = [int(item) for item in args.device.split(',')]
    except AttributeError:
        args.device = [int(args.device)]
    args.modeldevice = args.device
    util.setup_runtime(seed=42, cuda_dev_id=list(np.unique(args.modeldevice + args.device)))
    logger.info(args)
    logger.info(name)
    time.sleep(5)

    writer = SummaryWriter('./runs/%s'%name)
    writer.add_text('args', " \n".join(['%s %s' % (arg, getattr(args, arg)) for arg in vars(args)]))

    # Setup model and train_loader
    model, train_loader = return_model_loader(args)
    logger.warning("dataset len={}".format(len(train_loader.dataset)))
    model.to('cuda:0')
    if torch.cuda.device_count() > 1:
        logger.info("Let's use", len(args.modeldevice), "GPUs for the model")
        if len(args.modeldevice) == 1:
            logger.warning('single GPU model')
        else:
            model.features = nn.DataParallel(model.features,
                                             device_ids=list(range(len(args.modeldevice))))
    # Setup optimizer
    o = Optimizer(m=model, hc=args.hc, ncl=args.ncl, t_loader=train_loader,
                  n_epochs=args.epochs, lr=args.lr, weight_decay=10**args.wd,
                  ckpt_dir=os.path.join(args.exp, 'checkpoints'))
    o.writer = writer
    # Optimize
    o.optimize()
Example #10
        lr_clip / args.lr,
    )
    bn_lbmd = lambda it: max(
        args.bn_momentum
        * args.bnm_decay ** (int(it * args.batch_size / args.decay_step)),
        bnm_clip,
    )

    # default value
    it = -1  # for the initialize value of `LambdaLR` and `BNMomentumScheduler`
    best_loss = 1e10
    start_epoch = 1

    # load status from checkpoint
    if args.checkpoint is not None:
        logger.warning("loading checkpoint weight file")
        checkpoint_status = pt_utils.load_checkpoint(
            model, optimizer, filename=args.checkpoint
        )
        if checkpoint_status is not None:
            it, start_epoch, best_loss = checkpoint_status

    lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lambda=lr_lbmd, last_epoch=it)
    bnm_scheduler = pt_utils.BNMomentumScheduler(
        model, bn_lambda=bn_lbmd, last_epoch=it
    )
    it = max(it, 0)  # for the initialize value of `trainer.train`
    if args.pointmixup:
        model_fn = model_fn_decorator_mix(cross_entropy_with_probs, nn.CrossEntropyLoss(), num_class=num_class)
    else:
        model_fn = model_fn_decorator(nn.CrossEntropyLoss())
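
lr_lbmd (truncated at the top of this snippet) and bn_lbmd are per-iteration step-decay lambdas clipped from below, and LambdaLR is resumed with last_epoch=it from the checkpoint. A generic sketch of that scheduler setup with made-up hyper-parameters (the decay values and the tiny model are assumptions, not the project's defaults):

import torch
import torch.optim.lr_scheduler as lr_sched

# Hypothetical hyper-parameters.
base_lr, lr_decay, decay_step, batch_size, lr_clip = 1e-2, 0.7, 2e5, 32, 1e-5

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=base_lr)

# Step decay per iteration, clipped from below -- the same shape as bn_lbmd above.
lr_lbmd = lambda it: max(lr_decay ** (int(it * batch_size / decay_step)), lr_clip / base_lr)
lr_scheduler = lr_sched.LambdaLR(optimizer, lr_lambda=lr_lbmd, last_epoch=-1)

for it in range(3):
    optimizer.step()
    lr_scheduler.step()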
Example #11
    def train(self,
              start_it,
              start_epoch,
              n_epochs,
              train_loader,
              test_loader=None,
              best_loss=0.0,
              writer=None):
        r"""
           Call to begin training the model

        Parameters
        ----------
        start_epoch : int
            Epoch to start at
        n_epochs : int
            Number of epochs to train for
        test_loader : torch.utils.data.DataLoader
            DataLoader of the test_data
        train_loader : torch.utils.data.DataLoader
            DataLoader of training data
        best_loss : float
            Testing loss of the best model
        """

        eval_frequency = (self.eval_frequency
                          if self.eval_frequency > 0 else len(train_loader))

        it = start_it
        best_acc = -1
        with tqdm.trange(start_epoch, n_epochs + 1, desc='epochs') as tbar, \
                tqdm.tqdm(total=eval_frequency, leave=False, desc='train') as pbar:

            for epoch in tbar:
                for batch in train_loader:
                    idx_minor = None
                    mixrates = None
                    strategy = None
                    manilayer_batch = 0  # used for every batch when mixup is not applied
                    if self.n_strategies > 0:
                        strategy_idx = np.random.randint(self.n_strategies)
                        strategy = self.strategies[strategy_idx]
                        if self.manimixup:
                            manilayer_batch = np.random.randint(self.manilayer_all)
                        else:
                            manilayer_batch = 0

                        B, N, C = batch[0].shape
                        idx_minor = torch.randperm(B)

                        mixrates = (0.5 - np.abs(np.random.beta(self.alpha, self.alpha, B) - 0.5))
                        label_main = batch[1]
                        label_minor = batch[1][idx_minor]

                        label = torch.zeros(B, self.n_class)
                        for i in range(B):
                            if label_main[i] == label_minor[i]: # same label
                                label[i][label_main[i]] = 1.0
                            else:
                                label[i][label_main[i]] = 1 - mixrates[i]
                                label[i][label_minor[i]] = mixrates[i]
                        batch[1] = label

                    res = self._train_it(it, batch, idx_minor, mixrates, strategy, manilayer_batch)
                    it += 1

                    pbar.update()
                    pbar.set_postfix(dict(total_it=it))
                    tbar.refresh()

                    if (it % eval_frequency) == 0:
                        pbar.close()

                        if test_loader is not None:
                            val_loss, res = self.eval_epoch(test_loader)

                            if writer is not None:
                                writer.add_scalar('{}/valacc'.format(self.savename), np.mean(res['acc']), epoch)
                                writer.add_scalar('{}/valloss'.format(self.savename), np.mean(res['loss']), epoch)

                            #is_best = val_loss < best_loss
                            best_loss = min(best_loss, val_loss)
                            is_best = np.mean(res['acc']) > best_acc
                            if is_best:
                                best_acc = np.mean(res['acc'])

                            logger.warning("eval_loss={}, eval_acc={}, eval_best_acc={}".format(np.mean(res['loss']), np.mean(res['acc']), best_acc))
                            save_checkpoint(
                                checkpoint_state(self.model, self.optimizer,
                                                 val_loss, epoch, it),
                                is_best,
                                filename=self.checkpoint_name,
                                bestname=self.best_name)

                        pbar = tqdm.tqdm(
                            total=eval_frequency, leave=False, desc='train')
                        pbar.set_postfix(dict(total_it=it))
        return best_loss
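
The point-mixup branch above builds soft labels by splitting each sample's probability mass between its own class and the class of a randomly paired sample, with the mix rate drawn from a Beta distribution and folded into [0, 0.5]. A standalone sketch of that label construction (batch size, class count and alpha are hypothetical):

import numpy as np
import torch

B, n_class, alpha = 4, 10, 1.0  # hypothetical values

labels = torch.randint(0, n_class, (B,))
idx_minor = torch.randperm(B)
# Fold Beta samples into [0, 0.5] so the original sample keeps the larger share.
mixrates = 0.5 - np.abs(np.random.beta(alpha, alpha, B) - 0.5)

label_main, label_minor = labels, labels[idx_minor]
label = torch.zeros(B, n_class)
for i in range(B):
    if label_main[i] == label_minor[i]:  # same class: keep a hard label
        label[i][label_main[i]] = 1.0
    else:
        label[i][label_main[i]] = 1 - mixrates[i]
        label[i][label_minor[i]] = mixrates[i]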
Example #12

    testset = CIFAR100Instance(root=args.datadir, train=False, download=True,
                              transform=transform_test)
    testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2)


logger.info('==> Building model..') ##########################################
numc = [args.ncl] * args.hc
model = models.__dict__[args.arch](num_classes=numc,return_features=False)
knn_dim = 4096

N = len(trainloader.dataset)
optimize_times = ((args.epochs + 1.0001)*N*(np.linspace(0, 1, args.nopts))[::-1]).tolist()
optimize_times = [(args.epochs +10)*N] + optimize_times
logger.warning('We will optimize L at epochs: {}'.format([np.round(1.0*t/N, 2) for t in optimize_times]))

# init selflabels randomly
if args.hc == 1:
    selflabels = np.zeros(N, dtype=np.int32)
    for qq in range(N):
        selflabels[qq] = qq % args.ncl
    selflabels = np.random.permutation(selflabels)
    selflabels = torch.LongTensor(selflabels).cuda()
else:
    selflabels = np.zeros((args.hc, N), dtype=np.int32)
    for nh in range(args.hc):
        for _i in range(N):
            selflabels[nh, _i] = _i % numc[nh]
        selflabels[nh] = np.random.permutation(selflabels[nh])
    selflabels = torch.LongTensor(selflabels).cuda()
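
The initialization above assigns pseudo-labels round-robin over the classes and then shuffles, so every class starts with an (almost) equal share of the N samples. A quick check of that property with hypothetical sizes:

import numpy as np
import torch

N, ncl = 1000, 10  # hypothetical dataset size and number of pseudo-classes

selflabels = np.zeros(N, dtype=np.int32)
for qq in range(N):
    selflabels[qq] = qq % ncl
selflabels = np.random.permutation(selflabels)
selflabels = torch.LongTensor(selflabels)

# Every pseudo-class holds exactly N / ncl samples (here 100); only the order is random.
print(torch.bincount(selflabels, minlength=ncl))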