Example #1
    def after_train(self):
        #self.old_labels = list(set(self.old_labels + self.new_labels))
        self.old_labels += self.new_labels
        self.new_labels_zombie = copy.deepcopy(self.new_labels)
        self.new_labels.clear()
        self.task_seen += 1
        if self.params.trick['review_trick'] and hasattr(self, 'buffer'):
            self.model.train()
            mem_x = self.buffer.buffer_img[:self.buffer.current_index]
            mem_y = self.buffer.buffer_label[:self.buffer.current_index]
            # criterion = torch.nn.CrossEntropyLoss(reduction='mean')
            if mem_x.size(0) > 0:
                rv_dataset = TensorDataset(mem_x, mem_y)
                rv_loader = DataLoader(rv_dataset, batch_size=self.batch, shuffle=True, num_workers=0,
                                       drop_last=True)
                for ep in range(1):
                    for i, batch_data in enumerate(rv_loader):
                        # batch update
                        batch_x, batch_y = batch_data
                        batch_x = maybe_cuda(batch_x, self.cuda)
                        batch_y = maybe_cuda(batch_y, self.cuda)
                        logits = self.model.forward(batch_x)
                        loss = self.criterion(logits, batch_y)
                        self.opt.zero_grad()
                        loss.backward()
                        params = [p for p in self.model.parameters() if p.requires_grad]
                        grad = [p.grad.clone()/10. for p in params]
                        for g, p in zip(grad, params):
                            p.grad.data.copy_(g)
                        self.opt.step()

        if self.params.trick['kd_trick'] or self.params.agent == 'LWF':
            self.kd_manager.update_teacher(self.model)
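A minimal, self-contained sketch (toy model, not the agent above) of the gradient down-scaling applied in the review trick: gradients from the memory pass are divided by 10 before the optimizer step.

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
loss = model(torch.randn(8, 4)).pow(2).mean()
opt.zero_grad()
loss.backward()
for p in model.parameters():
    if p.grad is not None:
        p.grad.div_(10.0)   # same effect as copying grad.clone() / 10. back into p.grad
opt.step()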
Example #2
    def learn(self, x, y):
        x, y = maybe_cuda(x), maybe_cuda(y)

        if MODELS_NDPM_NDPM_SEND_TO_STM_ALWAYS:
            self.stm_x.extend(torch.unbind(x.cpu()))
            self.stm_y.extend(torch.unbind(y.cpu()))
        else:
            # Determine the destination of each data point
            nll = self.experts[-1].collect_nll(x, y)  # [B, 1+K]
            nl_prior = self.prior.nl_prior()  # [1+K]
            nl_joint = nll + nl_prior.unsqueeze(0).expand(nll.size(0),
                                                          -1)  # [B, 1+K]

            # Save to short-term memory
            destination = maybe_cuda(torch.argmin(nl_joint, dim=1))  # [B]
            to_stm = destination == 0  # [B]
            self.stm_x.extend(torch.unbind(x[to_stm].cpu()))
            self.stm_y.extend(torch.unbind(y[to_stm].cpu()))

            # Train expert
            with torch.no_grad():
                min_joint = nl_joint.min(dim=1)[0].view(-1, 1)
                to_expert = torch.exp(-nl_joint + min_joint)  # [B, 1+K]
                to_expert[:, 0] = 0.  # [B, 1+K]
                to_expert = \
                    to_expert / (to_expert.sum(dim=1).view(-1, 1) + 1e-7)

            # Compute losses per expert
            nll_for_train = nll * (1. - to_stm.float()).unsqueeze(1)  # [B,1+K]
            losses = (nll_for_train * to_expert).sum(0)  # [1+K]

            # Record expert usage
            expert_usage = to_expert.sum(dim=0)  # [K+1]
            self.prior.record_usage(expert_usage)

            # Do lr_decay implicitly
            if MODELS_NDPM_NDPM_IMPLICIT_LR_DECAY:
                losses = losses \
                         * self.params.stm_capacity / (self.prior.counts + 1e-8)
            loss = losses.sum()

            if loss.requires_grad:
                update_threshold = 0
                for k, usage in enumerate(expert_usage):
                    if usage > update_threshold:
                        self.experts[k].zero_grad()
                loss.backward()
                for k, usage in enumerate(expert_usage):
                    if usage > update_threshold:
                        self.experts[k].clip_grad()
                        self.experts[k].optimizer_step()
                        self.experts[k].lr_scheduler_step()

        # Sleep
        if len(self.stm_x) >= self.stm_capacity:
            dream_dataset = TensorDataset(torch.stack(self.stm_x),
                                          torch.stack(self.stm_y))
            self.sleep(dream_dataset)
            self.stm_x = []
            self.stm_y = []
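A small sketch of the routing rule used above, with random numbers standing in for the real per-expert NLLs: each sample is assigned to the expert that minimizes the negative log joint, and index 0 routes it to short-term memory.

import torch

nll = torch.rand(4, 3)                    # [B, 1+K] hypothetical per-expert NLLs
nl_prior = torch.rand(3)                  # [1+K] negative log prior
nl_joint = nll + nl_prior.unsqueeze(0)    # broadcasts over the batch
destination = nl_joint.argmin(dim=1)      # expert with the smallest negative log joint
to_stm = destination == 0                 # destination 0 means "keep in short-term memory"
print(destination, to_stm)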
Example #3
    def train_learner(self, x_train, y_train):
        # set up loader
        train_dataset = dataset_transform(
            x_train, y_train, transform=transforms_match[self.data])
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=self.batch,
                                       shuffle=True,
                                       num_workers=0,
                                       drop_last=True)
        # setup tracker
        losses_batch = AverageMeter()
        acc_batch = AverageMeter()

        self.model.train()

        for ep in range(self.epoch):
            for i, batch_data in enumerate(train_loader):
                # batch update
                batch_x, batch_y = batch_data
                batch_x = maybe_cuda(batch_x, self.cuda)
                batch_y = maybe_cuda(batch_y, self.cuda)
                self.model.learn(batch_x, batch_y)
                if self.params.verbose:
                    print('\r[Step {:4}] STM: {:5}/{} | #Expert: {}'.format(
                        i, len(self.model.stm_x), self.params.stm_capacity,
                        len(self.model.experts) - 1),
                          end='')
        print()
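Example #4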
 def __init__(self, params):
     super().__init__()
     # number of gradient vectors used to estimate new-sample similarity (line 5 in Alg. 2)
     self.mem_strength = params.gss_mem_strength
     self.gss_batch_size = params.gss_batch_size
     self.buffer_score = maybe_cuda(
         torch.FloatTensor(params.mem_size).fill_(0))
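Example #5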
def deep_features(model, eval_x, n_eval, cand_x, n_cand):
    """
        Compute deep features of evaluation and candidate data.
            Args:
                model (object): neural network.
                eval_x (tensor): evaluation data tensor.
                n_eval (int): number of evaluation data.
                cand_x (tensor): candidate data tensor.
                n_cand (int): number of candidate data.
            Returns:
                eval_df (tensor): deep features of evaluation data.
                cand_df (tensor): deep features of candidate data.
    """
    # Get deep features
    if cand_x is None:
        num = n_eval
        total_x = eval_x
    else:
        num = n_eval + n_cand
        total_x = torch.cat((eval_x, cand_x), 0)

    # compute deep features with mini-batches
    total_x = maybe_cuda(total_x)
    deep_features_ = mini_batch_deep_features(model, total_x, num)

    eval_df = deep_features_[0:n_eval]
    cand_df = deep_features_[n_eval:]
    return eval_df, cand_df
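A toy sketch of the concat-then-split pattern above, with a dummy transform standing in for mini_batch_deep_features: evaluation and candidate tensors go through the model together, and the resulting feature matrix is split back by count.

import torch

n_eval, n_cand = 3, 2
eval_x, cand_x = torch.randn(n_eval, 8), torch.randn(n_cand, 8)
total_x = torch.cat((eval_x, cand_x), 0)   # one forward pass for both sets
deep_features_ = total_x * 2.0             # stand-in for mini_batch_deep_features(model, ...)
eval_df, cand_df = deep_features_[:n_eval], deep_features_[n_eval:]
print(eval_df.shape, cand_df.shape)        # torch.Size([3, 8]) torch.Size([2, 8])

Example #6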
    def nll(self, x, y, step=None):
        x, y = maybe_cuda(x), maybe_cuda(y)
        log_softmax = self.forward(x)
        loss_pred = self.ce_loss(log_softmax, y)

        # Classifier chilling
        chilled_log_softmax = F.log_softmax(log_softmax /
                                            self.params.classifier_chill,
                                            dim=1)
        chilled_loss_pred = self.ce_loss(chilled_log_softmax, y)

        # Value with chill & gradient without chill
        loss_pred = loss_pred - loss_pred.detach() \
            + chilled_loss_pred.detach()

        return loss_pred
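A minimal sketch of the value-swap trick in the last lines above, on a toy scalar: the returned tensor takes the chilled loss's value while gradients still flow through the un-chilled loss.

import torch

w = torch.tensor(2.0, requires_grad=True)
loss = w ** 2                               # value 4.0, d(loss)/dw = 4.0
chilled = (loss / 10.0).detach()            # stand-in for the chilled loss, value 0.4
out = loss - loss.detach() + chilled        # forward value 0.4, backward path of `loss`
out.backward()
print(out.item(), w.grad.item())            # ~0.4 4.0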
Example #7
    def sample(cls,
               buffer_x,
               buffer_y,
               n_smp_cls,
               excl_indices=None,
               device="cpu"):
        """
            Take the same number of random samples from each class in the buffer.
                Args:
                    buffer_x (tensor): data buffer.
                    buffer_y (tensor): label buffer.
                    n_smp_cls (int): number of samples to take from each class.
                    excl_indices (set): indices of buffered instances to be excluded from sampling.
                    device (str): device for tensor allocation.
                Returns:
                    x (tensor): class balanced random sample data tensor.
                    y (tensor): class balanced random sample label tensor.
                    sample_ind (tensor): class balanced random sample index tensor.
        """
        if excl_indices is None:
            excl_indices = set()

        # Get indices for class balanced random samples
        # cls_ind_cache = class_index_tensor_list_cache(buffer_y, num_class, excl_indices, device=device)

        sample_ind = torch.tensor([], device=device, dtype=torch.long)

        # Use cache to retrieve indices belonging to each class in buffer
        for ind_set in cls.class_index_cache.values():
            if ind_set:
                # Exclude some indices
                valid_ind = ind_set - excl_indices
                # Auxiliary indices for permutation
                perm_ind = torch.randperm(len(valid_ind), device=device)
                # Apply permutation, and select indices
                ind = torch.tensor(list(valid_ind),
                                   device=device,
                                   dtype=torch.long)[perm_ind][:n_smp_cls]
                sample_ind = torch.cat((sample_ind, ind))

        x = buffer_x[sample_ind]
        y = buffer_y[sample_ind]

        x = maybe_cuda(x)
        y = maybe_cuda(y)

        return x, y, sample_ind
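A self-contained sketch of the class-balanced sampling idea, with a plain label tensor in place of the class index cache: permute each class's indices and keep at most n_smp_cls of them.

import torch

buffer_y = torch.tensor([0, 1, 0, 2, 1, 0, 2, 2, 1, 0])   # hypothetical label buffer
n_smp_cls = 2
sample_ind = torch.tensor([], dtype=torch.long)
for c in buffer_y.unique():
    cls_ind = (buffer_y == c).nonzero(as_tuple=True)[0]    # indices of class c
    perm = torch.randperm(cls_ind.numel())                 # random permutation
    sample_ind = torch.cat((sample_ind, cls_ind[perm][:n_smp_cls]))
print(sample_ind)   # two random indices per class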
Example #8
    def collect_nll(self, x, y=None, step=None):
        """Collect NLL values

        Returns:
            loss_vae: Tensor of shape [B, 1+K]
        """
        x = maybe_cuda(x)

        # Dummy VAE
        dummy_nll = self.experts[0].g.nll(x, y, step)

        # Encode
        z_means, z_log_vars, features = self.encode(x, collect=True)

        # Decode
        loss_vaes = [dummy_nll]
        vaes = [expert.g for expert in self.experts[1:]] + [self]
        x_logits = []
        for z_mean, z_log_var, vae in zip(z_means, z_log_vars, vaes):
            z = self.reparameterize(z_mean, z_log_var,
                                    MODELS_NDPM_VAE_Z_SAMPLES)
            if MODELS_NDPM_VAE_PRECURSOR_CONDITIONED_DECODER:
                x_logit = vae.decode(z, as_logit=True)
                x_logits.append(x_logit)
                continue
            x_mean = vae.decode(z)
            x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                                 *x.shape[1:])
            x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli' else
                         self.log_var.view(1, 1, -1, 1, 1))
            loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
            loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                                         -1)
            loss_recon = loss_recon.sum(2).mean(1)
            loss_kl = self.gaussian_kl(z_mean, z_log_var)
            loss_vae = loss_recon + loss_kl

            loss_vaes.append(loss_vae)

        x_logits = list(
            accumulate(x_logits, func=(lambda x, y: x.detach() + y)))
        for x_logit in x_logits:
            x_mean = torch.sigmoid(x_logit)
            x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                                 *x.shape[1:])
            x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli' else
                         self.log_var.view(1, 1, -1, 1, 1))
            loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
            loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                                         -1)
            loss_recon = loss_recon.sum(2).mean(1)
            loss_kl = self.gaussian_kl(z_mean, z_log_var)
            loss_vae = loss_recon + loss_kl
            loss_vaes.append(loss_vae)

        return torch.stack(loss_vaes, dim=1)
Example #9
    def __init__(self, params, experts=()):
        super().__init__()
        self.id = len(experts)
        self.experts = experts

        self.g = maybe_cuda(CnnSharingVae(params, experts))
        self.d = maybe_cuda(ResNetSharingClassifier(
            params, experts)) if not MODELS_NDPM_NDPM_DISABLE_D else None

        # keep the randomly initialized g if this expert is a placeholder
        if self.id == 0:
            self.eval()
            for p in self.g.parameters():
                p.requires_grad = False

        # keep the randomly initialized d if this expert is a placeholder
        if self.id == 0 and self.d is not None:
            for p in self.d.parameters():
                p.requires_grad = False
Example #10
    def train_learner(self, x_train, y_train):
        self.before_train(x_train, y_train)

        # set up loader
        train_dataset = dataset_transform(x_train, y_train, transform=transforms_match[self.data])
        train_loader = data.DataLoader(train_dataset, batch_size=self.batch, shuffle=True, num_workers=0,
                                       drop_last=True)

        # set up model
        self.model = self.model.train()

        # setup tracker
        losses_batch = AverageMeter()
        acc_batch = AverageMeter()

        for ep in range(self.epoch):
            for i, batch_data in enumerate(train_loader):
                # batch update
                batch_x, batch_y = batch_data
                batch_x = maybe_cuda(batch_x, self.cuda)
                batch_y = maybe_cuda(batch_y, self.cuda)

                logits = self.forward(batch_x)
                loss_old = self.kd_manager.get_kd_loss(logits, batch_x)
                loss_new = self.criterion(logits, batch_y)
                loss = 1/(self.task_seen + 1) * loss_new + (1 - 1/(self.task_seen + 1)) * loss_old
                _, pred_label = torch.max(logits, 1)
                correct_cnt = (pred_label == batch_y).sum().item() / batch_y.size(0)
                # update tracker
                acc_batch.update(correct_cnt, batch_y.size(0))
                losses_batch.update(loss, batch_y.size(0))
                # backward
                self.opt.zero_grad()
                loss.backward()
                self.opt.step()

                if i % 100 == 1 and self.verbose:
                    print(
                        '==>>> it: {}, avg. loss: {:.6f}, '
                        'running train acc: {:.3f}'
                            .format(i, losses_batch.avg(), acc_batch.avg())
                    )
        self.after_train()
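A quick worked illustration of how the interpolation weight in the loss above evolves with task_seen: the new-task term gets 1/(task_seen + 1) and the distillation term the remainder, so old knowledge dominates as tasks accumulate.

for task_seen in range(4):
    w_new = 1 / (task_seen + 1)             # weight of the cross-entropy on new data
    print(task_seen, round(w_new, 3), round(1 - w_new, 3))
# 0 1.0 0.0 | 1 0.5 0.5 | 2 0.333 0.667 | 3 0.25 0.75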
Example #11
 def update_representation(self, train_loader):
     updated_idx = []
     for ep in range(self.epoch):
         for i, train_data in enumerate(train_loader):
             # batch update
             train_x, train_y = train_data
             train_x = maybe_cuda(train_x, self.cuda)
             train_y = maybe_cuda(train_y, self.cuda)
             train_y_copy = train_y.clone()
             for k, y in enumerate(train_y_copy):
                 train_y_copy[k] = len(
                     self.old_labels) + self.new_labels.index(y)
             all_cls_num = len(self.new_labels) + len(self.old_labels)
             target_labels = utils.ohe_label(
                 train_y_copy, all_cls_num,
                 device=train_y_copy.device).float()
             if self.prev_model is not None:
                 mem_x, mem_y = random_retrieve(self.buffer,
                                                self.batch,
                                                excl_indices=updated_idx)
                 mem_x = maybe_cuda(mem_x, self.cuda)
                 batch_x = torch.cat([train_x, mem_x])
                 target_labels = torch.cat(
                     [target_labels,
                      torch.zeros_like(target_labels)])
             else:
                 batch_x = train_x
             logits = self.forward(batch_x)
             self.opt.zero_grad()
             if self.prev_model is not None:
                 with torch.no_grad():
                     q = torch.sigmoid(self.prev_model.forward(batch_x))
                 for k, y in enumerate(self.old_labels):
                     target_labels[:, k] = q[:, k]
             loss = F.binary_cross_entropy_with_logits(
                 logits[:, :all_cls_num], target_labels,
                 reduction='none').sum(dim=1).mean()
             loss.backward()
             self.opt.step()
             updated_idx += self.buffer.update(train_x, train_y)
Example #12
    def train_learner(self, x_train, y_train):
        self.before_train(x_train, y_train)
        # set up loader
        train_dataset = dataset_transform(
            x_train, y_train, transform=transforms_match[self.data])
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=self.batch,
                                       shuffle=True,
                                       num_workers=0,
                                       drop_last=True)

        for i, batch_data in enumerate(train_loader):
            # batch update
            batch_x, batch_y = batch_data
            batch_x = maybe_cuda(batch_x, self.cuda)
            batch_y = maybe_cuda(batch_y, self.cuda)
            # update mem
            for j in range(len(batch_x)):
                self.greedy_balancing_update(batch_x[j], batch_y[j].item())
        #self.early_stopping.reset()
        self.train_mem()
        self.after_train()
Example #13
 def __init__(self, params, experts):
     super().__init__(params, experts)
     x_c, x_h, x_w = input_size_match[params.data]
     bernoulli = MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli'
     if bernoulli:
         self.log_var_param = None
     elif MODELS_NDPM_VAE_LEARN_X_LOG_VAR:
         self.log_var_param = nn.Parameter(torch.ones([x_c]) *
                                           MODELS_NDPM_VAE_X_LOG_VAR_PARAM,
                                           requires_grad=True)
     else:
         self.log_var_param = (maybe_cuda(torch.ones([x_c])) *
                               MODELS_NDPM_VAE_X_LOG_VAR_PARAM)
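Example #14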
    def forward(self, x, collect=False):
        x = maybe_cuda(x)

        # First component
        if len(self.precursors) == 0:
            h1 = self.layer0(x)
            h2 = self.layer1(h1)
            h3 = self.layer2(h2)
            h4 = self.layer3(h3)
            h5 = self.layer4(h4)
            h5 = F.avg_pool2d(h5, h5.size(2)).view(h5.size(0), -1)
            pred = self.predict(h5)

            if collect:
                return [pred], [
                    h1.detach(),
                    h2.detach(),
                    h3.detach(),
                    h4.detach(),
                    h5.detach()
                ]
            else:
                return pred

        # Second or later component
        preds, features = self.precursors[-1](x, collect=True)
        h1 = self.layer0(x)
        h1_cat = torch.cat([features[0], h1], dim=1)
        h2 = self.layer1(h1_cat)
        h2_cat = torch.cat([features[1], h2], dim=1)
        h3 = self.layer2(h2_cat)
        h3_cat = torch.cat([features[2], h3], dim=1)
        h4 = self.layer3(h3_cat)
        h4_cat = torch.cat([features[3], h4], dim=1)
        h5 = self.layer4(h4_cat)
        h5 = F.avg_pool2d(h5, h5.size(2)).view(h5.size(0), -1)
        h5_cat = torch.cat([features[4], h5], dim=1)
        pred = self.predict(h5_cat)

        if collect:
            preds.append(pred)
            return preds, [
                h1_cat.detach(),
                h2_cat.detach(),
                h3_cat.detach(),
                h4_cat.detach(),
                h5_cat.detach(),
            ]
        else:
            return pred
Example #15
def get_grad_vector(pp, grad_dims):
    """
        Gather the gradients into one flat vector.
    """
    grads = maybe_cuda(torch.Tensor(sum(grad_dims)))
    grads.fill_(0.0)
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            en = sum(grad_dims[:cnt + 1])
            grads[beg:en].copy_(param.grad.data.view(-1))
        cnt += 1
    return grads
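A usage sketch of get_grad_vector's flattening logic on a toy model: grad_dims holds each parameter's element count, and every gradient is copied into its slice of one flat vector.

import torch
import torch.nn as nn

model = nn.Linear(4, 3)                                   # hypothetical model
model(torch.randn(2, 4)).sum().backward()

grad_dims = [p.numel() for p in model.parameters()]       # [12, 3]
grads = torch.zeros(sum(grad_dims))
cnt = 0
for param in model.parameters():
    if param.grad is not None:
        beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
        en = sum(grad_dims[:cnt + 1])
        grads[beg:en].copy_(param.grad.view(-1))
    cnt += 1
print(grads.shape)   # torch.Size([15])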
Example #16
    def __init__(self, model, params):
        super().__init__()
        self.params = params
        self.model = model
        self.cuda = self.params.cuda
        self.current_index = 0
        self.n_seen_so_far = 0

        # define buffer
        buffer_size = params.mem_size
        print('buffer has %d slots' % buffer_size)
        input_size = input_size_match[params.data]
        buffer_img = maybe_cuda(
            torch.FloatTensor(buffer_size, *input_size).fill_(0))
        buffer_label = maybe_cuda(torch.LongTensor(buffer_size).fill_(0))

        # registering as buffer allows us to save the object using `torch.save`
        self.register_buffer('buffer_img', buffer_img)
        self.register_buffer('buffer_label', buffer_label)

        # define update and retrieve method
        self.update_method = name_match.update_methods[params.update](params)
        self.retrieve_method = name_match.retrieve_methods[params.retrieve](
            params)
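A small sketch of why the image and label stores are registered as buffers: they appear in the module's state_dict, so torch.save / torch.load and .to(device) handle them automatically (TinyBuffer is a hypothetical miniature of the buffer above).

import torch
import torch.nn as nn

class TinyBuffer(nn.Module):
    def __init__(self, size=4):
        super().__init__()
        self.register_buffer('buffer_img', torch.zeros(size, 3, 2, 2))
        self.register_buffer('buffer_label', torch.zeros(size, dtype=torch.long))

buf = TinyBuffer()
print(list(buf.state_dict().keys()))         # ['buffer_img', 'buffer_label']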
Example #17
    def train_mem(self):
        mem_x = []
        mem_y = []
        for i in self.mem_img.keys():
            mem_x += self.mem_img[i]
            mem_y += [i] * self.mem_c[i]

        mem_x = torch.stack(mem_x)
        mem_y = torch.LongTensor(mem_y)
        self.model = setup_architecture(self.params)
        self.model = maybe_cuda(self.model, self.cuda)
        opt = setup_opt(self.params.optimizer, self.model,
                        self.params.learning_rate, self.params.weight_decay)
        #scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(opt, T_0=1, T_mult=2, eta_min=self.params.minlr)

        #loss = math.inf
        for i in range(self.params.mem_epoch):
            idx = np.random.permutation(len(mem_x)).tolist()
            mem_x = maybe_cuda(mem_x[idx], self.cuda)
            mem_y = maybe_cuda(mem_y[idx], self.cuda)
            self.model = self.model.train()
            batch_size = self.params.batch
            #scheduler.step()
            #if opt.param_groups[0]['lr'] == self.params.learning_rate:
            #    if self.early_stopping.step(-loss):
            #        return
            for j in range(len(mem_y) // batch_size):
                opt.zero_grad()
                logits = self.model.forward(mem_x[batch_size * j:batch_size *
                                                  (j + 1)])
                loss = self.criterion(
                    logits, mem_y[batch_size * j:batch_size * (j + 1)])
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.params.clip)
                opt.step()
Example #18
    def nll(self, x, y=None, step=None):
        x = maybe_cuda(x)
        z_mean, z_log_var = self.encode(x)
        z = self.reparameterize(z_mean, z_log_var, MODELS_NDPM_VAE_Z_SAMPLES)
        x_mean = self.decode(z)
        x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                             *x.shape[1:])
        x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli' else
                     self.log_var.view(1, 1, -1, 1, 1))
        loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
        loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES, -1)
        loss_recon = loss_recon.sum(2).mean(1)
        loss_kl = self.gaussian_kl(z_mean, z_log_var)
        loss_vae = loss_recon + loss_kl

        return loss_vae
Example #19
    def forward(self, x):
        with torch.no_grad():
            if len(self.experts) == 1:
                raise RuntimeError('There\'s no expert to run on the input')
            x = maybe_cuda(x)
            log_evid = -self.experts[-1].g.collect_nll(x)  # [B, 1+K]
            log_evid = log_evid[:, 1:].unsqueeze(2)  # [B, K, 1]
            log_prior = -self.prior.nl_prior()[1:]  # [K]
            log_prior -= torch.logsumexp(log_prior, dim=0)
            log_prior = log_prior.unsqueeze(0).unsqueeze(2)  # [1, K, 1]
            log_joint = log_prior + log_evid  # [B, K, 1]
            if not MODELS_NDPM_NDPM_DISABLE_D:
                log_pred = self.experts[-1].d.collect_forward(x)  # [B, 1+K, C]
                log_pred = log_pred[:, 1:, :]  # [B, K, C]
                log_joint = log_joint + log_pred  # [B, K, C]

            log_joint = log_joint.logsumexp(dim=1).squeeze()  # [B,] or [B, C]
            return log_joint
Example #20
    def record_usage(self, usage, index=None):
        """Record expert usage

        Args:
            usage: Tensor of shape [K+1] if index is None else scalar
            index: expert index
        """
        if index is None:
            self.log_counts = torch.logsumexp(torch.stack(
                [self.log_counts, usage.log()], dim=1),
                                              dim=1)
        else:
            self.log_counts[index] = torch.logsumexp(torch.stack([
                self.log_counts[index],
                maybe_cuda(torch.tensor(usage)).float().log()
            ],
                                                                 dim=0),
                                                     dim=0)
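A tiny sketch of the update above: keeping counts in log space means "count += usage" becomes a logsumexp of the two logarithms, since log(a + b) = logsumexp(log a, log b).

import torch

log_counts = torch.tensor(100.0).log()       # running count of 100, stored as a log
usage = torch.tensor(7.0)                    # new usage to add
log_counts = torch.logsumexp(torch.stack([log_counts, usage.log()]), dim=0)
print(log_counts.exp())                      # tensor(107.)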
Example #21
def multiple_run(params):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(params.data, params.cl_type, params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))
    accuracy_list = []
    for run in range(params.num_runs):
        tmp_acc = []
        run_start = time.time()
        data_continuum.new_run()
        model = setup_architecture(params)
        model = maybe_cuda(model, params.cuda)
        opt = setup_opt(params.optimizer, model, params.learning_rate,
                        params.weight_decay)
        agent = agents[params.agent](model, opt, params)

        # prepare val data loader
        test_loaders = setup_test_loader(data_continuum.test_data(), params)
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            print("-----------run {} training batch {}-------------".format(
                run, i))
            print('size: {}, {}'.format(x_train.shape, y_train.shape))
            agent.train_learner(x_train, y_train)
            acc_array = agent.evaluate(test_loaders)
            tmp_acc.append(acc_array)
        run_end = time.time()
        print(
            "-----------run {}-----------avg_end_acc {}-----------train time {}"
            .format(run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))
    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
        accuracy_list)
    end = time.time()
    print('----------- Total {} run: {}s -----------'.format(
        params.num_runs, end - start))
    print(
        '----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
        .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
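Example #22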
 def get_each_batch_sample_sim(self, buffer, grad_dims, mem_grads, batch_x,
                               batch_y):
     """
     Args:
         buffer: memory buffer
         grad_dims: gradient dimensions
         mem_grads: gradient from memory subsets
         batch_x: batch images
         batch_y: batch labels
     Returns: score of each sample from current batch
     """
     cosine_sim = maybe_cuda(torch.zeros(batch_x.size(0)))
     for i, (x, y) in enumerate(zip(batch_x, batch_y)):
         buffer.model.zero_grad()
         ptloss = F.cross_entropy(buffer.model.forward(x.unsqueeze(0)),
                                  y.unsqueeze(0))
         ptloss.backward()
         # compute the new sample's gradient and score it by cosine similarity with the memory grads
         this_grad = get_grad_vector(buffer.model.parameters,
                                     grad_dims).unsqueeze(0)
         cosine_sim[i] = max(cosine_similarity(mem_grads, this_grad))
     return cosine_sim
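A stripped-down sketch of the per-sample score computed above: the candidate's flattened gradient is compared with every stored memory gradient and the maximum cosine similarity is kept (random tensors stand in for real gradients).

import torch
import torch.nn.functional as F

mem_grads = torch.randn(4, 10)      # flattened gradients from 4 memory subsets
this_grad = torch.randn(1, 10)      # flattened gradient of the new sample
score = F.cosine_similarity(mem_grads, this_grad).max()   # GSS score for this sample
print(score)

Example #23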
 def get_rand_mem_grads(self, buffer, grad_dims):
     """
     Args:
         buffer: memory buffer
         grad_dims: gradient dimensions
     Returns: gradient from memory subsets
     """
     gss_batch_size = min(self.gss_batch_size, buffer.current_index)
     num_mem_subs = min(self.mem_strength,
                        buffer.current_index // gss_batch_size)
     mem_grads = maybe_cuda(
         torch.zeros(num_mem_subs, sum(grad_dims), dtype=torch.float32))
     shuffled_inds = torch.randperm(buffer.current_index)
     for i in range(num_mem_subs):
         random_batch_inds = shuffled_inds[
             i * gss_batch_size:(i + 1) * gss_batch_size]
         batch_x = buffer.buffer_img[random_batch_inds]
         batch_y = buffer.buffer_label[random_batch_inds]
         buffer.model.zero_grad()
         loss = F.cross_entropy(buffer.model.forward(batch_x), batch_y)
         loss.backward()
         mem_grads[i].data.copy_(
             get_grad_vector(buffer.model.parameters, grad_dims))
     return mem_grads
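Example #24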
def tune_hyper(tune_data, tune_test_loaders, default_params, tune_params):
    param_grid_list = list(ParameterGrid(tune_params))
    print(len(param_grid_list))
    tune_accs = []
    tune_fgt = []
    for param_set in param_grid_list:
        final_params = vars(default_params)
        print(param_set)
        final_params.update(param_set)
        final_params = SimpleNamespace(**final_params)
        accuracy_list = []
        for run in range(final_params.num_runs_val):
            tmp_acc = []
            model = setup_architecture(final_params)
            model = maybe_cuda(model, final_params.cuda)
            opt = setup_opt(final_params.optimizer, model,
                            final_params.learning_rate,
                            final_params.weight_decay)
            agent = agents[final_params.agent](model, opt, final_params)
            for i, (x_train, y_train, labels) in enumerate(tune_data):
                print("-----------tune run {} task {}-------------".format(
                    run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(tune_test_loaders)
                tmp_acc.append(acc_array)
            print("-----------tune run {}-----------avg_end_acc {}-----------".
                  format(run, np.mean(tmp_acc[-1])))
            accuracy_list.append(np.array(tmp_acc))
        accuracy_list = np.array(accuracy_list)
        avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
            accuracy_list)
        tune_accs.append(avg_end_acc[0])
        tune_fgt.append(avg_end_fgt[0])
    best_tune = param_grid_list[tune_accs.index(max(tune_accs))]
    return best_tune
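A short sketch of the ParameterGrid expansion that tune_hyper iterates over, using hypothetical hyperparameter names: each combination comes back as one dict that is merged into the default parameters.

from sklearn.model_selection import ParameterGrid

tune_params = {'learning_rate': [0.1, 0.01], 'mem_size': [1000, 5000]}
for param_set in ParameterGrid(tune_params):
    print(param_set)   # e.g. {'learning_rate': 0.1, 'mem_size': 1000}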
Example #25
def multiple_run_tune(defaul_params, tune_params, save_path):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(defaul_params.data, defaul_params.cl_type,
                               defaul_params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))

    # set up the results-storing table
    table_path = load_yaml('config/global.yml', key='path')['tables']
    metric_list = ['Avg_End_Acc'] + ['Avg_End_Fgt'] + ['Time'] + [
        "Batch" + str(i)
        for i in range(defaul_params.num_val, data_continuum.task_nums)
    ]
    param_list = list(tune_params.keys()) + metric_list
    table_columns = ['Run'] + param_list
    table_path = table_path + defaul_params.data
    os.makedirs(table_path, exist_ok=True)
    if not save_path:
        save_path = defaul_params.model_name + '_' + defaul_params.data_name + '.csv'
    df = pd.DataFrame(columns=table_columns)
    # store list
    accuracy_list = []
    params_keep = []
    for run in range(defaul_params.num_runs):
        tmp_acc = []
        tune_data = []
        run_start = time.time()
        data_continuum.new_run()
        # prepare val data loader
        test_loaders = setup_test_loader(data_continuum.test_data(),
                                         defaul_params)
        tune_test_loaders = test_loaders[:defaul_params.num_val]
        test_loaders = test_loaders[defaul_params.num_val:]
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < defaul_params.num_val:
                # collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == defaul_params.num_val:
                    # tune
                    best_params = tune_hyper(tune_data, tune_test_loaders,
                                             defaul_params, tune_params)
                    params_keep.append(best_params)
                    final_params = vars(defaul_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    # set up
                    print('Tuning is done. Best hyper parameter set is {}'.
                          format(best_params))
                    model = setup_architecture(final_params)
                    model = maybe_cuda(model, final_params.cuda)
                    opt = setup_opt(final_params.optimizer, model,
                                    final_params.learning_rate,
                                    final_params.weight_decay)
                    agent = agents[final_params.agent](model, opt,
                                                       final_params)
                    print('Training Start')
            else:
                print("----------run {} training batch {}-------------".format(
                    run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(test_loaders)
                tmp_acc.append(acc_array)

        run_end = time.time()
        print(
            "-----------run {}-----------avg_end_acc {}-----------train time {}"
            .format(run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))

        # store this run's result
        result_dict = {'Run': run}
        result_dict.update(best_params)
        end_task_acc = tmp_acc[-1]
        for i in range(data_continuum.task_nums - defaul_params.num_val):
            result_dict["Batch" +
                        str(i + defaul_params.num_val)] = end_task_acc[i]
        result_dict['Avg_End_Acc'] = np.mean(tmp_acc[-1])
        result_dict['Avg_End_Fgt'] = single_run_avg_end_fgt(np.array(tmp_acc))
        result_dict['Time'] = run_end - run_start
        df = df.append(result_dict, ignore_index=True)
        save_dataframe_csv(df, table_path, save_path)
    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
        accuracy_list)
    end = time.time()
    final_result = {'Run': 'Final Result'}
    final_result['Avg_End_Acc'] = avg_end_acc
    final_result['Avg_End_Fgt'] = avg_end_fgt
    final_result['Time'] = end - start
    df = df.append(final_result, ignore_index=True)
    save_dataframe_csv(df, table_path, save_path)
    print('----------- Total {} run: {}s -----------'.format(
        defaul_params.num_runs, end - start))
    print(
        '----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
        .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
Example #26
def single_tune_train_val(data_continuum, default_params, tune_params,
                          params_keep, tmp_acc, run):
    tune_data = []
    # prepare val data loader
    test_loaders_full = setup_test_loader(data_continuum.test_data(),
                                          default_params)
    tune_test_loaders = test_loaders_full[:default_params.num_val]
    if default_params.online:
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == default_params.num_val:
                    # tune
                    best_params = tune_hyper(
                        tune_data,
                        tune_test_loaders,
                        default_params,
                        tune_params,
                    )
                    params_keep.append(best_params)
                    final_params = vars(default_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    print('Tuning is done. Best hyper parameter set is {}'.
                          format(best_params))
                    break

        data_continuum.reset_run()
        # set up
        model = setup_architecture(final_params)
        model = maybe_cuda(model, final_params.cuda)
        opt = setup_opt(final_params.optimizer, model,
                        final_params.learning_rate, final_params.weight_decay)
        agent = agents[final_params.agent](model, opt, final_params)
        print('Training Start')
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            print("----------run {} training batch {}-------------".format(
                run, i))
            print('size: {}, {}'.format(x_train.shape, y_train.shape))
            agent.train_learner(x_train, y_train)
            acc_array = agent.evaluate(test_loaders_full)
            tmp_acc.append(acc_array)
    else:
        x_train_offline = []
        y_train_offline = []
        x_tune_offline = []
        y_tune_offline = []
        labels_offline = []
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # collect tuning data
                x_tune_offline.append(x_train)
                y_tune_offline.append(y_train)
                labels_offline.append(labels)
            x_train_offline.append(x_train)
            y_train_offline.append(y_train)
        tune_data = [(np.concatenate(x_tune_offline, axis=0),
                      np.concatenate(y_tune_offline, axis=0), labels_offline)]
        best_params = tune_hyper(
            tune_data,
            tune_test_loaders,
            default_params,
            tune_params,
        )
        params_keep.append(best_params)
        final_params = vars(default_params)
        final_params.update(best_params)
        final_params = SimpleNamespace(**final_params)
        # set up
        print('Tuning is done. Best hyper parameter set is {}'.format(
            best_params))
        model = setup_architecture(final_params)
        model = maybe_cuda(model, final_params.cuda)
        opt = setup_opt(final_params.optimizer, model,
                        final_params.learning_rate, final_params.weight_decay)
        agent = agents[final_params.agent](model, opt, final_params)
        print('Training Start')
        x_train_offline = np.concatenate(x_train_offline, axis=0)
        y_train_offline = np.concatenate(y_train_offline, axis=0)
        print("----------run {} training-------------".format(run))
        print('size: {}, {}'.format(x_train_offline.shape,
                                    y_train_offline.shape))
        agent.train_learner(x_train_offline, y_train_offline)
        acc_array = agent.evaluate(test_loaders_full)
        tmp_acc.append(acc_array)
Example #27
 def sample(self, n=1):
     z = maybe_cuda(torch.randn(n, MODELS_NDPM_VAE_Z_DIM))
     x_mean = self.decode(z)
     return x_mean
Example #28
    def sleep(self, dream_dataset):
        print('\nGoing to sleep...')
        # Add new expert and optimizer
        expert = Expert(self.params, self.get_experts())
        self.experts.append(expert)
        self.prior.add_expert()

        stacked_stm_x = torch.stack(self.stm_x)
        stacked_stm_y = torch.stack(self.stm_y)
        indices = torch.randperm(stacked_stm_x.size(0))
        train_size = stacked_stm_x.size(
            0) - MODELS_NDPM_NDPM_SLEEP_SLEEP_VAL_SIZE
        dream_dataset = TensorDataset(stacked_stm_x[indices[:train_size]],
                                      stacked_stm_y[indices[:train_size]])

        # Prepare data iterator
        self.prior.record_usage(len(dream_dataset), index=-1)
        dream_iterator = iter(
            DataLoader(dream_dataset,
                       batch_size=MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE,
                       num_workers=MODELS_NDPM_NDPM_SLEEP_NUM_WORKERS,
                       sampler=RandomSampler(
                           dream_dataset,
                           replacement=True,
                           num_samples=(MODELS_NDPM_NDPM_SLEEP_STEP_G *
                                        MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE))))

        # Train generative component
        for step, (x, y) in enumerate(dream_iterator):
            step += 1
            x, y = maybe_cuda(x), maybe_cuda(y)
            g_loss = expert.g.nll(x, y, step=step)
            g_loss = (
                g_loss +
                MODELS_NDPM_NDPM_WEIGHT_DECAY * expert.g.weight_decay_loss())
            expert.g.zero_grad()
            g_loss.mean().backward()
            expert.g.clip_grad()
            expert.g.optimizer.step()

            if step % MODELS_NDPM_NDPM_SLEEP_SUMMARY_STEP == 0:
                print('\r   [Sleep-G %6d] loss: %5.1f' % (step, g_loss.mean()),
                      end='')
        print()

        dream_iterator = iter(
            DataLoader(dream_dataset,
                       batch_size=MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE,
                       num_workers=MODELS_NDPM_NDPM_SLEEP_NUM_WORKERS,
                       sampler=RandomSampler(
                           dream_dataset,
                           replacement=True,
                           num_samples=(MODELS_NDPM_NDPM_SLEEP_STEP_D *
                                        MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE))))

        # Train discriminative component
        if not MODELS_NDPM_NDPM_DISABLE_D:
            for step, (x, y) in enumerate(dream_iterator):
                step += 1
                x, y = maybe_cuda(x), maybe_cuda(y)
                d_loss = expert.d.nll(x, y, step=step)
                d_loss = (d_loss + MODELS_NDPM_NDPM_WEIGHT_DECAY *
                          expert.d.weight_decay_loss())
                expert.d.zero_grad()
                d_loss.mean().backward()
                expert.d.clip_grad()
                expert.d.optimizer.step()

                if step % MODELS_NDPM_NDPM_SLEEP_SUMMARY_STEP == 0:
                    print('\r   [Sleep-D %6d] loss: %5.1f' %
                          (step, d_loss.mean()),
                          end='')

        expert.lr_scheduler_step()
        expert.lr_scheduler_step()
        expert.eval()
        print()
Example #29
 def evaluate(self, test_loaders):
     self.model.eval()
     acc_array = np.zeros(len(test_loaders))
     if self.params.trick['nmc_trick'] or self.params.agent == 'ICARL':
         exemplar_means = {}
         cls_exemplar = {cls: [] for cls in self.old_labels}
         buffer_filled = self.buffer.current_index
         for x, y in zip(self.buffer.buffer_img[:buffer_filled], self.buffer.buffer_label[:buffer_filled]):
             cls_exemplar[y.item()].append(x)
         for cls, exemplar in cls_exemplar.items():
             features = []
             # Extract feature for each exemplar in p_y
             for ex in exemplar:
                 feature = self.model.features(ex.unsqueeze(0)).detach().clone()
                 feature = feature.squeeze()
                 feature.data = feature.data / feature.data.norm()  # Normalize
                 features.append(feature)
             features = torch.stack(features)
             mu_y = features.mean(0).squeeze()
             mu_y.data = mu_y.data / mu_y.data.norm()  # Normalize
             exemplar_means[cls] = mu_y
     with torch.no_grad():
         if self.params.error_analysis:
             error = 0
             no = 0
             nn = 0
             oo = 0
             on = 0
             new_class_score = AverageMeter()
             old_class_score = AverageMeter()
         for task, test_loader in enumerate(test_loaders):
             acc = AverageMeter()
             for i, (batch_x, batch_y) in enumerate(test_loader):
                 batch_x = maybe_cuda(batch_x, self.cuda)
                 batch_y = maybe_cuda(batch_y, self.cuda)
                 if self.params.trick['nmc_trick'] or self.params.agent == 'ICARL':
                     feature = self.model.features(batch_x)  # (batch_size, feature_size)
                     for j in range(feature.size(0)):  # Normalize
                         feature.data[j] = feature.data[j] / feature.data[j].norm()
                     feature = feature.unsqueeze(2)  # (batch_size, feature_size, 1)
                     means = torch.stack([exemplar_means[cls] for cls in self.old_labels])  # (n_classes, feature_size)
                     means = torch.stack([means] * batch_x.size(0))  # (batch_size, n_classes, feature_size)
                     means = means.transpose(1, 2)
                     feature = feature.expand_as(means)  # (batch_size, feature_size, n_classes)
                     dists = (feature - means).pow(2).sum(1).squeeze()  # (batch_size, n_classes)
                     _, preds = dists.min(1)
                     correct_cnt = (np.array(self.old_labels)[
                                        preds.tolist()] == batch_y.cpu().numpy()).sum().item() / batch_y.size(0)
                 else:
                     logits = self.model.forward(batch_x)
                     _, pred_label = torch.max(logits, 1)
                     correct_cnt = (pred_label == batch_y).sum().item()/batch_y.size(0)
                     if self.params.error_analysis:
                         if task < self.task_seen-1:
                             # old test
                             total = (pred_label != batch_y).sum().item()
                             wrong = pred_label[pred_label != batch_y]
                             error += total
                             on_tmp = sum([(wrong == i).sum().item() for i in self.new_labels_zombie])
                             oo += total - on_tmp
                             on += on_tmp
                             old_class_score.update(logits[:, list(set(self.old_labels) - set(self.new_labels_zombie))].mean().item(), batch_y.size(0))
                         elif task == self.task_seen -1:
                             # new test
                             total = (pred_label != batch_y).sum().item()
                             error += total
                             wrong = pred_label[pred_label != batch_y]
                             no_tmp = sum([(wrong == i).sum().item() for i in list(set(self.old_labels) - set(self.new_labels_zombie))])
                             no += no_tmp
                             nn += total - no_tmp
                             new_class_score.update(logits[:, self.new_labels_zombie].mean().item(), batch_y.size(0))
                         else:
                             pass
                 acc.update(correct_cnt, batch_y.size(0))
             acc_array[task] = acc.avg()
     print(acc_array)
     if self.params.error_analysis:
         self.error_list.append((no, nn, oo, on))
         self.new_class_score.append(new_class_score.avg())
         self.old_class_score.append(old_class_score.avg())
         print("no ratio: {}\non ratio: {}".format(no/(no+nn+0.1), on/(oo+on+0.1)))
         print(self.error_list)
         print(self.new_class_score)
         print(self.old_class_score)
         self.fc_norm_new.append(self.model.linear.weight[self.new_labels_zombie].mean().item())
         self.fc_norm_old.append(self.model.linear.weight[list(set(self.old_labels) - set(self.new_labels_zombie))].mean().item())
         self.bias_norm_new.append(self.model.linear.bias[self.new_labels_zombie].mean().item())
         self.bias_norm_old.append(self.model.linear.bias[list(set(self.old_labels) - set(self.new_labels_zombie))].mean().item())
         print(self.fc_norm_old)
         print(self.fc_norm_new)
         print(self.bias_norm_old)
         print(self.bias_norm_new)
     return acc_array
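A compact sketch of the nearest-mean-of-exemplars prediction used in the nmc_trick branch above: normalize features and class means, then pick the class whose mean is closest (random tensors as stand-ins for real features).

import torch
import torch.nn.functional as F

features = F.normalize(torch.randn(5, 8), dim=1)       # hypothetical L2-normalized batch features
class_means = F.normalize(torch.randn(3, 8), dim=1)    # hypothetical exemplar means, one per class
dists = torch.cdist(features, class_means)             # (batch, n_classes) Euclidean distances
preds = dists.argmin(dim=1)                            # nearest-class-mean prediction
print(preds)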
Example #30
 def weight_decay_loss(self):
     loss = maybe_cuda(torch.zeros([]))
     for param in self.parameters():
         loss += torch.norm(param) ** 2
     return loss