def after_train(self):
    # self.old_labels = list(set(self.old_labels + self.new_labels))
    self.old_labels += self.new_labels
    self.new_labels_zombie = copy.deepcopy(self.new_labels)
    self.new_labels.clear()
    self.task_seen += 1
    if self.params.trick['review_trick'] and hasattr(self, 'buffer'):
        self.model.train()
        mem_x = self.buffer.buffer_img[:self.buffer.current_index]
        mem_y = self.buffer.buffer_label[:self.buffer.current_index]
        # criterion = torch.nn.CrossEntropyLoss(reduction='mean')
        if mem_x.size(0) > 0:
            rv_dataset = TensorDataset(mem_x, mem_y)
            rv_loader = DataLoader(rv_dataset, batch_size=self.batch,
                                   shuffle=True, num_workers=0, drop_last=True)
            for ep in range(1):
                for i, batch_data in enumerate(rv_loader):
                    # batch update
                    batch_x, batch_y = batch_data
                    batch_x = maybe_cuda(batch_x, self.cuda)
                    batch_y = maybe_cuda(batch_y, self.cuda)
                    logits = self.model.forward(batch_x)
                    loss = self.criterion(logits, batch_y)
                    self.opt.zero_grad()
                    loss.backward()
                    # review pass uses a 10x smaller effective learning rate:
                    # scale all gradients by 1/10 before the optimizer step
                    params = [p for p in self.model.parameters()
                              if p.requires_grad]
                    grad = [p.grad.clone() / 10. for p in params]
                    for g, p in zip(grad, params):
                        p.grad.data.copy_(g)
                    self.opt.step()
    if self.params.trick['kd_trick'] or self.params.agent == 'LWF':
        self.kd_manager.update_teacher(self.model)
def learn(self, x, y):
    x, y = maybe_cuda(x), maybe_cuda(y)
    if MODELS_NDPM_NDPM_SEND_TO_STM_ALWAYS:
        self.stm_x.extend(torch.unbind(x.cpu()))
        self.stm_y.extend(torch.unbind(y.cpu()))
    else:
        # Determine the destination of each data point
        nll = self.experts[-1].collect_nll(x, y)  # [B, 1+K]
        nl_prior = self.prior.nl_prior()  # [1+K]
        nl_joint = nll + nl_prior.unsqueeze(0).expand(nll.size(0), -1)  # [B, 1+K]

        # Save to short-term memory
        destination = maybe_cuda(torch.argmin(nl_joint, dim=1))  # [B]
        to_stm = destination == 0  # [B]
        self.stm_x.extend(torch.unbind(x[to_stm].cpu()))
        self.stm_y.extend(torch.unbind(y[to_stm].cpu()))

        # Train expert
        with torch.no_grad():
            min_joint = nl_joint.min(dim=1)[0].view(-1, 1)
            to_expert = torch.exp(-nl_joint + min_joint)  # [B, 1+K]
            to_expert[:, 0] = 0.  # [B, 1+K]
            to_expert = \
                to_expert / (to_expert.sum(dim=1).view(-1, 1) + 1e-7)

        # Compute losses per expert
        nll_for_train = nll * (1. - to_stm.float()).unsqueeze(1)  # [B, 1+K]
        losses = (nll_for_train * to_expert).sum(0)  # [1+K]

        # Record expert usage
        expert_usage = to_expert.sum(dim=0)  # [1+K]
        self.prior.record_usage(expert_usage)

        # Do lr_decay implicitly
        if MODELS_NDPM_NDPM_IMPLICIT_LR_DECAY:
            losses = losses \
                * self.params.stm_capacity / (self.prior.counts + 1e-8)

        loss = losses.sum()
        if loss.requires_grad:
            update_threshold = 0
            for k, usage in enumerate(expert_usage):
                if usage > update_threshold:
                    self.experts[k].zero_grad()
            loss.backward()
            for k, usage in enumerate(expert_usage):
                if usage > update_threshold:
                    self.experts[k].clip_grad()
                    self.experts[k].optimizer_step()
                    self.experts[k].lr_scheduler_step()

    # Sleep
    if len(self.stm_x) >= self.stm_capacity:
        dream_dataset = TensorDataset(torch.stack(self.stm_x),
                                      torch.stack(self.stm_y))
        self.sleep(dream_dataset)
        self.stm_x = []
        self.stm_y = []
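# Editor's gloss (not original code): in `learn` above, a point is routed to
# short-term memory exactly when the dummy expert (index 0) attains the
# minimal negative log joint, i.e. when no existing expert explains it well.
# Otherwise the experts are trained with responsibilities `to_expert`, a
# softmax over -nl_joint with the dummy column zeroed out.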
def train_learner(self, x_train, y_train):
    # set up loader
    train_dataset = dataset_transform(
        x_train, y_train, transform=transforms_match[self.data])
    train_loader = data.DataLoader(train_dataset, batch_size=self.batch,
                                   shuffle=True, num_workers=0, drop_last=True)
    # setup tracker
    losses_batch = AverageMeter()
    acc_batch = AverageMeter()

    self.model.train()
    for ep in range(self.epoch):
        for i, batch_data in enumerate(train_loader):
            # batch update
            batch_x, batch_y = batch_data
            batch_x = maybe_cuda(batch_x, self.cuda)
            batch_y = maybe_cuda(batch_y, self.cuda)
            self.model.learn(batch_x, batch_y)
            if self.params.verbose:
                print('\r[Step {:4}] STM: {:5}/{} | #Expert: {}'.format(
                    i, len(self.model.stm_x), self.params.stm_capacity,
                    len(self.model.experts) - 1), end='')
    print()
def __init__(self, params):
    super().__init__()
    # number of gradient vectors used to estimate the similarity of new
    # samples (line 5 in Alg. 2)
    self.mem_strength = params.gss_mem_strength
    self.gss_batch_size = params.gss_batch_size
    self.buffer_score = maybe_cuda(
        torch.FloatTensor(params.mem_size).fill_(0))
def deep_features(model, eval_x, n_eval, cand_x, n_cand):
    """
    Compute deep features of evaluation and candidate data.
        Args:
            model (object): neural network.
            eval_x (tensor): evaluation data tensor.
            n_eval (int): number of evaluation data.
            cand_x (tensor): candidate data tensor.
            n_cand (int): number of candidate data.
        Returns
            eval_df (tensor): deep features of evaluation data.
            cand_df (tensor): deep features of candidate data.
    """
    # Get deep features
    if cand_x is None:
        num = n_eval
        total_x = eval_x
    else:
        num = n_eval + n_cand
        total_x = torch.cat((eval_x, cand_x), 0)

    # compute deep features with mini-batches
    total_x = maybe_cuda(total_x)
    deep_features_ = mini_batch_deep_features(model, total_x, num)

    eval_df = deep_features_[0:n_eval]
    cand_df = deep_features_[n_eval:]
    return eval_df, cand_df
def nll(self, x, y, step=None):
    x, y = maybe_cuda(x), maybe_cuda(y)
    log_softmax = self.forward(x)
    loss_pred = self.ce_loss(log_softmax, y)

    # Classifier chilling
    chilled_log_softmax = F.log_softmax(
        log_softmax / self.params.classifier_chill, dim=1)
    chilled_loss_pred = self.ce_loss(chilled_log_softmax, y)

    # Value with chill & gradient without chill
    loss_pred = loss_pred - loss_pred.detach() \
        + chilled_loss_pred.detach()
    return loss_pred
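# A minimal sketch (not from the repo) of the detach splice used in `nll`
# above: the returned tensor carries the chilled loss *value* but
# backpropagates the unchilled *gradient*, since `a - a.detach()` is
# numerically zero yet preserves a's gradient path.
def _demo_chill_splice():
    import torch
    a = torch.tensor(2.0, requires_grad=True)
    loss = a ** 2           # unchilled: value 4.0, d(loss)/da = 4.0
    chilled = 0.5 * a ** 2  # chilled:   value 2.0
    spliced = loss - loss.detach() + chilled.detach()
    spliced.backward()
    assert spliced.item() == 2.0  # value comes from the chilled loss
    assert a.grad.item() == 4.0   # gradient comes from the unchilled loss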
def sample(cls, buffer_x, buffer_y, n_smp_cls, excl_indices=None, device="cpu"):
    """
    Take the same number of random samples from each class in the buffer.
        Args:
            buffer_x (tensor): data buffer.
            buffer_y (tensor): label buffer.
            n_smp_cls (int): number of samples to take from each class.
            excl_indices (set): indices of buffered instances to be excluded
                from sampling.
            device (str): device for tensor allocation.
        Returns
            x (tensor): class-balanced random sample data tensor.
            y (tensor): class-balanced random sample label tensor.
            sample_ind (tensor): class-balanced random sample index tensor.
    """
    if excl_indices is None:
        excl_indices = set()

    # Get indices for class-balanced random samples
    # cls_ind_cache = class_index_tensor_list_cache(buffer_y, num_class,
    #                                               excl_indices, device=device)
    sample_ind = torch.tensor([], device=device, dtype=torch.long)

    # Use cache to retrieve indices belonging to each class in buffer
    for ind_set in cls.class_index_cache.values():
        if ind_set:
            # Exclude some indices
            valid_ind = ind_set - excl_indices
            # Auxiliary indices for permutation
            perm_ind = torch.randperm(len(valid_ind), device=device)
            # Apply permutation, and select indices
            ind = torch.tensor(list(valid_ind), device=device,
                               dtype=torch.long)[perm_ind][:n_smp_cls]
            sample_ind = torch.cat((sample_ind, ind))

    x = buffer_x[sample_ind]
    y = buffer_y[sample_ind]

    x = maybe_cuda(x)
    y = maybe_cuda(y)

    return x, y, sample_ind
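# Hedged usage sketch (illustrative; the enclosing class name is assumed,
# and `cls.class_index_cache` must already be populated):
#   x, y, ind = ClassBalancedRandomSampling.sample(buffer_x, buffer_y,
#                                                  n_smp_cls=5)
# returns up to 5 random instances per class currently in the cache,
# skipping anything listed in `excl_indices`.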
def collect_nll(self, x, y=None, step=None):
    """Collect NLL values

    Returns:
        loss_vae: Tensor of shape [B, 1+K]
    """
    x = maybe_cuda(x)
    # Dummy VAE
    dummy_nll = self.experts[0].g.nll(x, y, step)

    # Encode
    z_means, z_log_vars, features = self.encode(x, collect=True)

    # Decode
    loss_vaes = [dummy_nll]
    vaes = [expert.g for expert in self.experts[1:]] + [self]
    x_logits = []
    for z_mean, z_log_var, vae in zip(z_means, z_log_vars, vaes):
        z = self.reparameterize(z_mean, z_log_var, MODELS_NDPM_VAE_Z_SAMPLES)
        if MODELS_NDPM_VAE_PRECURSOR_CONDITIONED_DECODER:
            x_logit = vae.decode(z, as_logit=True)
            x_logits.append(x_logit)
            continue
        x_mean = vae.decode(z)
        x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                             *x.shape[1:])
        x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli'
                     else self.log_var.view(1, 1, -1, 1, 1))
        loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
        loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES, -1)
        loss_recon = loss_recon.sum(2).mean(1)
        loss_kl = self.gaussian_kl(z_mean, z_log_var)
        loss_vae = loss_recon + loss_kl
        loss_vaes.append(loss_vae)

    x_logits = list(
        accumulate(x_logits, func=(lambda x, y: x.detach() + y)))
    for x_logit in x_logits:
        x_mean = torch.sigmoid(x_logit)
        x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES,
                             *x.shape[1:])
        x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli'
                     else self.log_var.view(1, 1, -1, 1, 1))
        loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
        loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES, -1)
        loss_recon = loss_recon.sum(2).mean(1)
        loss_kl = self.gaussian_kl(z_mean, z_log_var)
        loss_vae = loss_recon + loss_kl
        loss_vaes.append(loss_vae)

    return torch.stack(loss_vaes, dim=1)
def __init__(self, params, experts=()):
    super().__init__()
    self.id = len(experts)
    self.experts = experts
    self.g = maybe_cuda(CnnSharingVae(params, experts))
    self.d = maybe_cuda(ResNetSharingClassifier(params, experts)) \
        if not MODELS_NDPM_NDPM_DISABLE_D else None

    # use a randomly initialized, frozen g if this expert is a placeholder
    if self.id == 0:
        self.eval()
        for p in self.g.parameters():
            p.requires_grad = False
    # use a randomly initialized, frozen d if this expert is a placeholder
    if self.id == 0 and self.d is not None:
        for p in self.d.parameters():
            p.requires_grad = False
def train_learner(self, x_train, y_train):
    self.before_train(x_train, y_train)
    # set up loader
    train_dataset = dataset_transform(
        x_train, y_train, transform=transforms_match[self.data])
    train_loader = data.DataLoader(train_dataset, batch_size=self.batch,
                                   shuffle=True, num_workers=0, drop_last=True)
    # set up model
    self.model = self.model.train()
    # setup tracker
    losses_batch = AverageMeter()
    acc_batch = AverageMeter()

    for ep in range(self.epoch):
        for i, batch_data in enumerate(train_loader):
            # batch update
            batch_x, batch_y = batch_data
            batch_x = maybe_cuda(batch_x, self.cuda)
            batch_y = maybe_cuda(batch_y, self.cuda)
            logits = self.forward(batch_x)
            loss_old = self.kd_manager.get_kd_loss(logits, batch_x)
            loss_new = self.criterion(logits, batch_y)
            # weight the new-task loss by 1/(t+1) and the distillation loss
            # by t/(t+1), where t = self.task_seen (tasks completed so far)
            loss = 1 / (self.task_seen + 1) * loss_new \
                + (1 - 1 / (self.task_seen + 1)) * loss_old
            _, pred_label = torch.max(logits, 1)
            correct_cnt = (pred_label == batch_y).sum().item() / batch_y.size(0)
            # update tracker
            acc_batch.update(correct_cnt, batch_y.size(0))
            losses_batch.update(loss, batch_y.size(0))
            # backward
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
            if i % 100 == 1 and self.verbose:
                print('==>>> it: {}, avg. loss: {:.6f}, '
                      'running train acc: {:.3f}'
                      .format(i, losses_batch.avg(), acc_batch.avg()))
    self.after_train()
def update_representation(self, train_loader):
    updated_idx = []
    for ep in range(self.epoch):
        for i, train_data in enumerate(train_loader):
            # batch update
            train_x, train_y = train_data
            train_x = maybe_cuda(train_x, self.cuda)
            train_y = maybe_cuda(train_y, self.cuda)
            # map new labels to contiguous indices after the old ones
            train_y_copy = train_y.clone()
            for k, y in enumerate(train_y_copy):
                train_y_copy[k] = len(self.old_labels) + self.new_labels.index(y)
            all_cls_num = len(self.new_labels) + len(self.old_labels)
            target_labels = utils.ohe_label(
                train_y_copy, all_cls_num,
                device=train_y_copy.device).float()
            if self.prev_model is not None:
                mem_x, mem_y = random_retrieve(self.buffer, self.batch,
                                               excl_indices=updated_idx)
                mem_x = maybe_cuda(mem_x, self.cuda)
                batch_x = torch.cat([train_x, mem_x])
                target_labels = torch.cat(
                    [target_labels, torch.zeros_like(target_labels)])
            else:
                batch_x = train_x
            logits = self.forward(batch_x)
            self.opt.zero_grad()
            if self.prev_model is not None:
                # distillation targets for old classes come from the
                # previous model's sigmoid outputs
                with torch.no_grad():
                    q = torch.sigmoid(self.prev_model.forward(batch_x))
                for k, y in enumerate(self.old_labels):
                    target_labels[:, k] = q[:, k]
            loss = F.binary_cross_entropy_with_logits(
                logits[:, :all_cls_num], target_labels,
                reduction='none').sum(dim=1).mean()
            loss.backward()
            self.opt.step()
            updated_idx += self.buffer.update(train_x, train_y)
def train_learner(self, x_train, y_train):
    self.before_train(x_train, y_train)
    # set up loader
    train_dataset = dataset_transform(
        x_train, y_train, transform=transforms_match[self.data])
    train_loader = data.DataLoader(train_dataset, batch_size=self.batch,
                                   shuffle=True, num_workers=0, drop_last=True)
    for i, batch_data in enumerate(train_loader):
        # batch update
        batch_x, batch_y = batch_data
        batch_x = maybe_cuda(batch_x, self.cuda)
        batch_y = maybe_cuda(batch_y, self.cuda)
        # update mem
        for j in range(len(batch_x)):
            self.greedy_balancing_update(batch_x[j], batch_y[j].item())
    # self.early_stopping.reset()
    self.train_mem()
    self.after_train()
def __init__(self, params, experts):
    super().__init__(params, experts)
    x_c, x_h, x_w = input_size_match[params.data]
    bernoulli = MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli'
    if bernoulli:
        self.log_var_param = None
    elif MODELS_NDPM_VAE_LEARN_X_LOG_VAR:
        self.log_var_param = nn.Parameter(
            torch.ones([x_c]) * MODELS_NDPM_VAE_X_LOG_VAR_PARAM,
            requires_grad=True)
    else:
        self.log_var_param = (maybe_cuda(torch.ones([x_c]))
                              * MODELS_NDPM_VAE_X_LOG_VAR_PARAM)
def forward(self, x, collect=False):
    x = maybe_cuda(x)
    # First component
    if len(self.precursors) == 0:
        h1 = self.layer0(x)
        h2 = self.layer1(h1)
        h3 = self.layer2(h2)
        h4 = self.layer3(h3)
        h5 = self.layer4(h4)
        h5 = F.avg_pool2d(h5, h5.size(2)).view(h5.size(0), -1)
        pred = self.predict(h5)
        if collect:
            return [pred], [
                h1.detach(), h2.detach(), h3.detach(),
                h4.detach(), h5.detach()
            ]
        else:
            return pred

    # Second or later component: concatenate precursor features laterally
    preds, features = self.precursors[-1](x, collect=True)
    h1 = self.layer0(x)
    h1_cat = torch.cat([features[0], h1], dim=1)
    h2 = self.layer1(h1_cat)
    h2_cat = torch.cat([features[1], h2], dim=1)
    h3 = self.layer2(h2_cat)
    h3_cat = torch.cat([features[2], h3], dim=1)
    h4 = self.layer3(h3_cat)
    h4_cat = torch.cat([features[3], h4], dim=1)
    h5 = self.layer4(h4_cat)
    h5 = F.avg_pool2d(h5, h5.size(2)).view(h5.size(0), -1)
    h5_cat = torch.cat([features[4], h5], dim=1)
    pred = self.predict(h5_cat)
    if collect:
        preds.append(pred)
        return preds, [
            h1_cat.detach(), h2_cat.detach(), h3_cat.detach(),
            h4_cat.detach(), h5_cat.detach(),
        ]
    else:
        return pred
def get_grad_vector(pp, grad_dims):
    """
    Gather the gradients into one flat vector.
    """
    grads = maybe_cuda(torch.Tensor(sum(grad_dims)))
    grads.fill_(0.0)
    cnt = 0
    for param in pp():
        if param.grad is not None:
            beg = 0 if cnt == 0 else sum(grad_dims[:cnt])
            en = sum(grad_dims[:cnt + 1])
            grads[beg:en].copy_(param.grad.data.view(-1))
        cnt += 1
    return grads
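# Hedged usage sketch (not from the repo): `grad_dims` is expected to hold
# the element count of each parameter tensor, in `model.parameters()` order,
# so the flat vector can be sliced back per parameter.
def _demo_flat_grad(model, loss):
    grad_dims = [p.data.numel() for p in model.parameters()]
    loss.backward()
    # note: the callable `model.parameters` is passed, not its result
    return get_grad_vector(model.parameters, grad_dims)  # [sum(grad_dims)]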
def __init__(self, model, params):
    super().__init__()
    self.params = params
    self.model = model
    self.cuda = self.params.cuda
    self.current_index = 0
    self.n_seen_so_far = 0

    # define buffer
    buffer_size = params.mem_size
    print('buffer has %d slots' % buffer_size)
    input_size = input_size_match[params.data]
    buffer_img = maybe_cuda(
        torch.FloatTensor(buffer_size, *input_size).fill_(0))
    buffer_label = maybe_cuda(torch.LongTensor(buffer_size).fill_(0))

    # registering as buffer allows us to save the object using `torch.save`
    self.register_buffer('buffer_img', buffer_img)
    self.register_buffer('buffer_label', buffer_label)

    # define update and retrieve method
    self.update_method = name_match.update_methods[params.update](params)
    self.retrieve_method = name_match.retrieve_methods[params.retrieve](params)
def train_mem(self):
    # assemble the class-balanced memory into tensors
    mem_x = []
    mem_y = []
    for i in self.mem_img.keys():
        mem_x += self.mem_img[i]
        mem_y += [i] * self.mem_c[i]
    mem_x = torch.stack(mem_x)
    mem_y = torch.LongTensor(mem_y)

    # train a freshly initialized model on the memory only
    self.model = setup_architecture(self.params)
    self.model = maybe_cuda(self.model, self.cuda)
    opt = setup_opt(self.params.optimizer, self.model,
                    self.params.learning_rate, self.params.weight_decay)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    #     opt, T_0=1, T_mult=2, eta_min=self.params.minlr)
    # loss = math.inf
    for i in range(self.params.mem_epoch):
        idx = np.random.permutation(len(mem_x)).tolist()
        mem_x = maybe_cuda(mem_x[idx], self.cuda)
        mem_y = maybe_cuda(mem_y[idx], self.cuda)
        self.model = self.model.train()
        batch_size = self.params.batch
        # scheduler.step()
        # if opt.param_groups[0]['lr'] == self.params.learning_rate:
        #     if self.early_stopping.step(-loss):
        #         return
        for j in range(len(mem_y) // batch_size):
            opt.zero_grad()
            logits = self.model.forward(
                mem_x[batch_size * j:batch_size * (j + 1)])
            loss = self.criterion(
                logits, mem_y[batch_size * j:batch_size * (j + 1)])
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.params.clip)
            opt.step()
def nll(self, x, y=None, step=None):
    x = maybe_cuda(x)
    z_mean, z_log_var = self.encode(x)
    z = self.reparameterize(z_mean, z_log_var, MODELS_NDPM_VAE_Z_SAMPLES)
    x_mean = self.decode(z)
    x_mean = x_mean.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES, *x.shape[1:])
    x_log_var = (None if MODELS_NDPM_VAE_RECON_LOSS == 'bernoulli'
                 else self.log_var.view(1, 1, -1, 1, 1))
    loss_recon = self.reconstruction_loss(x, x_mean, x_log_var)
    loss_recon = loss_recon.view(x.size(0), MODELS_NDPM_VAE_Z_SAMPLES, -1)
    loss_recon = loss_recon.sum(2).mean(1)
    loss_kl = self.gaussian_kl(z_mean, z_log_var)
    loss_vae = loss_recon + loss_kl
    return loss_vae
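# Editor's note: `loss_vae` above is the negative evidence lower bound,
#   -ELBO(x) = E_{q(z|x)}[-log p(x|z)] + KL(q(z|x) || p(z)),
# with the reconstruction term estimated from MODELS_NDPM_VAE_Z_SAMPLES
# Monte Carlo samples of z and the KL term computed in closed form.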
def forward(self, x):
    with torch.no_grad():
        if len(self.experts) == 1:
            raise RuntimeError('There\'s no expert to run on the input')
        x = maybe_cuda(x)
        log_evid = -self.experts[-1].g.collect_nll(x)  # [B, 1+K]
        log_evid = log_evid[:, 1:].unsqueeze(2)  # [B, K, 1]
        log_prior = -self.prior.nl_prior()[1:]  # [K]
        log_prior -= torch.logsumexp(log_prior, dim=0)
        log_prior = log_prior.unsqueeze(0).unsqueeze(2)  # [1, K, 1]
        log_joint = log_prior + log_evid  # [B, K, 1]
        if not MODELS_NDPM_NDPM_DISABLE_D:
            log_pred = self.experts[-1].d.collect_forward(x)  # [B, 1+K, C]
            log_pred = log_pred[:, 1:, :]  # [B, K, C]
            log_joint = log_joint + log_pred  # [B, K, C]
        log_joint = log_joint.logsumexp(dim=1).squeeze()  # [B] or [B, C]
        return log_joint
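# Editor's gloss (not original code): the value returned by `forward` is
#   log sum_k p(k) p(x|k) p(y|x,k)
# (or log sum_k p(k) p(x|k) when the discriminative component is disabled),
# i.e. the joint log-likelihood marginalized over the K real experts, with
# the dummy expert (index 0) sliced away.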
def record_usage(self, usage, index=None):
    """Record expert usage

    Args:
        usage: Tensor of shape [K+1] if index is None else a scalar
        index: expert index
    """
    if index is None:
        self.log_counts = torch.logsumexp(torch.stack(
            [self.log_counts, usage.log()], dim=1), dim=1)
    else:
        self.log_counts[index] = torch.logsumexp(torch.stack([
            self.log_counts[index],
            maybe_cuda(torch.tensor(usage)).float().log()
        ], dim=0), dim=0)
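# A minimal sketch (not from the repo) of the log-space counting used in
# `record_usage`: logsumexp of logs adds the underlying counts, i.e.
# logsumexp(log c, log u) == log(c + u).
def _demo_log_count_update():
    import torch
    counts = torch.tensor([3.0, 5.0])
    usage = torch.tensor([1.0, 2.0])
    log_counts = torch.logsumexp(
        torch.stack([counts.log(), usage.log()], dim=1), dim=1)
    assert torch.allclose(log_counts.exp(), counts + usage)  # [4., 7.]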
def multiple_run(params):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(params.data, params.cl_type, params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))

    accuracy_list = []
    for run in range(params.num_runs):
        tmp_acc = []
        run_start = time.time()
        data_continuum.new_run()
        model = setup_architecture(params)
        model = maybe_cuda(model, params.cuda)
        opt = setup_opt(params.optimizer, model, params.learning_rate,
                        params.weight_decay)
        agent = agents[params.agent](model, opt, params)

        # prepare test data loaders
        test_loaders = setup_test_loader(data_continuum.test_data(), params)
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            print("-----------run {} training batch {}-------------"
                  .format(run, i))
            print('size: {}, {}'.format(x_train.shape, y_train.shape))
            agent.train_learner(x_train, y_train)
            acc_array = agent.evaluate(test_loaders)
            tmp_acc.append(acc_array)
        run_end = time.time()
        print("-----------run {}-----------avg_end_acc {}-----------train time {}"
              .format(run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))

    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
        accuracy_list)
    end = time.time()
    print('----------- Total {} run: {}s -----------'.format(
        params.num_runs, end - start))
    print('----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
          .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
def get_each_batch_sample_sim(self, buffer, grad_dims, mem_grads, batch_x,
                              batch_y):
    """
    Args:
        buffer: memory buffer
        grad_dims: gradient dimensions
        mem_grads: gradients from memory subsets
        batch_x: batch images
        batch_y: batch labels
    Returns: score of each sample from the current batch
    """
    cosine_sim = maybe_cuda(torch.zeros(batch_x.size(0)))
    for i, (x, y) in enumerate(zip(batch_x, batch_y)):
        buffer.model.zero_grad()
        ptloss = F.cross_entropy(buffer.model.forward(x.unsqueeze(0)),
                                 y.unsqueeze(0))
        ptloss.backward()
        # compute the new sample's gradient and its maximal cosine
        # similarity with the memory gradients
        this_grad = get_grad_vector(buffer.model.parameters,
                                    grad_dims).unsqueeze(0)
        cosine_sim[i] = max(cosine_similarity(mem_grads, this_grad))
    return cosine_sim
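# Editor's gloss (not original code): in GSS-Greedy the score above is a
# candidate's maximal cosine similarity to gradients of random memory
# subsets; a lower score means a more diverse gradient direction, which
# makes the sample a better candidate to keep in the buffer.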
def get_rand_mem_grads(self, buffer, grad_dims):
    """
    Args:
        buffer: memory buffer
        grad_dims: gradient dimensions
    Returns: gradients from memory subsets
    """
    gss_batch_size = min(self.gss_batch_size, buffer.current_index)
    num_mem_subs = min(self.mem_strength,
                       buffer.current_index // gss_batch_size)
    mem_grads = maybe_cuda(
        torch.zeros(num_mem_subs, sum(grad_dims), dtype=torch.float32))
    shuffled_inds = torch.randperm(buffer.current_index)
    for i in range(num_mem_subs):
        random_batch_inds = shuffled_inds[
            i * gss_batch_size:i * gss_batch_size + gss_batch_size]
        batch_x = buffer.buffer_img[random_batch_inds]
        batch_y = buffer.buffer_label[random_batch_inds]
        buffer.model.zero_grad()
        loss = F.cross_entropy(buffer.model.forward(batch_x), batch_y)
        loss.backward()
        mem_grads[i].data.copy_(
            get_grad_vector(buffer.model.parameters, grad_dims))
    return mem_grads
def tune_hyper(tune_data, tune_test_loaders, default_params, tune_params):
    param_grid_list = list(ParameterGrid(tune_params))
    print(len(param_grid_list))
    tune_accs = []
    tune_fgt = []
    for param_set in param_grid_list:
        final_params = vars(default_params)
        print(param_set)
        final_params.update(param_set)
        final_params = SimpleNamespace(**final_params)
        accuracy_list = []
        for run in range(final_params.num_runs_val):
            tmp_acc = []
            model = setup_architecture(final_params)
            model = maybe_cuda(model, final_params.cuda)
            opt = setup_opt(final_params.optimizer, model,
                            final_params.learning_rate,
                            final_params.weight_decay)
            agent = agents[final_params.agent](model, opt, final_params)
            for i, (x_train, y_train, labels) in enumerate(tune_data):
                print("-----------tune run {} task {}-------------"
                      .format(run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(tune_test_loaders)
                tmp_acc.append(acc_array)
            print("-----------tune run {}-----------avg_end_acc {}-----------"
                  .format(run, np.mean(tmp_acc[-1])))
            accuracy_list.append(np.array(tmp_acc))
        accuracy_list = np.array(accuracy_list)
        avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = \
            compute_performance(accuracy_list)
        tune_accs.append(avg_end_acc[0])
        tune_fgt.append(avg_end_fgt[0])
    best_tune = param_grid_list[tune_accs.index(max(tune_accs))]
    return best_tune
def multiple_run_tune(default_params, tune_params, save_path):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(default_params.data, default_params.cl_type,
                               default_params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))

    # set up storing table
    table_path = load_yaml('config/global.yml', key='path')['tables']
    metric_list = ['Avg_End_Acc'] + ['Avg_End_Fgt'] + ['Time'] + [
        "Batch" + str(i)
        for i in range(default_params.num_val, data_continuum.task_nums)
    ]
    param_list = list(tune_params.keys()) + metric_list
    table_columns = ['Run'] + param_list
    table_path = table_path + default_params.data
    os.makedirs(table_path, exist_ok=True)
    if not save_path:
        save_path = default_params.model_name + '_' + default_params.data_name + '.csv'
    df = pd.DataFrame(columns=table_columns)

    # store list
    accuracy_list = []
    params_keep = []
    for run in range(default_params.num_runs):
        tmp_acc = []
        tune_data = []
        run_start = time.time()
        data_continuum.new_run()
        # prepare test loaders; the first num_val tasks are held out for tuning
        test_loaders = setup_test_loader(data_continuum.test_data(),
                                         default_params)
        tune_test_loaders = test_loaders[:default_params.num_val]
        test_loaders = test_loaders[default_params.num_val:]
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == default_params.num_val:
                    # tune
                    best_params = tune_hyper(tune_data, tune_test_loaders,
                                             default_params, tune_params)
                    params_keep.append(best_params)
                    final_params = vars(default_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    # set up
                    print('Tuning is done. Best hyper parameter set is {}'
                          .format(best_params))
                    model = setup_architecture(final_params)
                    model = maybe_cuda(model, final_params.cuda)
                    opt = setup_opt(final_params.optimizer, model,
                                    final_params.learning_rate,
                                    final_params.weight_decay)
                    agent = agents[final_params.agent](model, opt,
                                                       final_params)
                    print('Training Start')
            else:
                print("----------run {} training batch {}-------------"
                      .format(run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(test_loaders)
                tmp_acc.append(acc_array)
        run_end = time.time()
        print("-----------run {}-----------avg_end_acc {}-----------train time {}"
              .format(run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))

        # store result
        result_dict = {'Run': run}
        result_dict.update(best_params)
        end_task_acc = tmp_acc[-1]
        for i in range(data_continuum.task_nums - default_params.num_val):
            result_dict["Batch" + str(i + default_params.num_val)] = end_task_acc[i]
        result_dict['Avg_End_Acc'] = np.mean(tmp_acc[-1])
        result_dict['Avg_End_Fgt'] = single_run_avg_end_fgt(np.array(tmp_acc))
        result_dict['Time'] = run_end - run_start
        df = df.append(result_dict, ignore_index=True)
        save_dataframe_csv(df, table_path, save_path)

    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
        accuracy_list)
    end = time.time()
    final_result = {'Run': 'Final Result'}
    final_result['Avg_End_Acc'] = avg_end_acc
    final_result['Avg_End_Fgt'] = avg_end_fgt
    final_result['Time'] = end - start
    df = df.append(final_result, ignore_index=True)
    save_dataframe_csv(df, table_path, save_path)
    print('----------- Total {} run: {}s -----------'.format(
        default_params.num_runs, end - start))
    print('----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
          .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
def single_tune_train_val(data_continuum, default_params, tune_params,
                          params_keep, tmp_acc, run):
    tune_data = []
    # prepare test loaders; the first num_val tasks are used for tuning
    test_loaders_full = setup_test_loader(data_continuum.test_data(),
                                          default_params)
    tune_test_loaders = test_loaders_full[:default_params.num_val]
    if default_params.online:
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == default_params.num_val:
                    # tune
                    best_params = tune_hyper(tune_data, tune_test_loaders,
                                             default_params, tune_params)
                    params_keep.append(best_params)
                    final_params = vars(default_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    print('Tuning is done. Best hyper parameter set is {}'
                          .format(best_params))
                    break
        data_continuum.reset_run()
        # set up
        model = setup_architecture(final_params)
        model = maybe_cuda(model, final_params.cuda)
        opt = setup_opt(final_params.optimizer, model,
                        final_params.learning_rate,
                        final_params.weight_decay)
        agent = agents[final_params.agent](model, opt, final_params)
        print('Training Start')
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            print("----------run {} training batch {}-------------"
                  .format(run, i))
            print('size: {}, {}'.format(x_train.shape, y_train.shape))
            agent.train_learner(x_train, y_train)
            acc_array = agent.evaluate(test_loaders_full)
            tmp_acc.append(acc_array)
    else:
        x_train_offline = []
        y_train_offline = []
        x_tune_offline = []
        y_tune_offline = []
        labels_offline = []
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < default_params.num_val:
                # collect tuning data
                x_tune_offline.append(x_train)
                y_tune_offline.append(y_train)
                labels_offline.append(labels)
            x_train_offline.append(x_train)
            y_train_offline.append(y_train)
        tune_data = [(np.concatenate(x_tune_offline, axis=0),
                      np.concatenate(y_tune_offline, axis=0),
                      labels_offline)]
        best_params = tune_hyper(tune_data, tune_test_loaders,
                                 default_params, tune_params)
        params_keep.append(best_params)
        final_params = vars(default_params)
        final_params.update(best_params)
        final_params = SimpleNamespace(**final_params)
        # set up
        print('Tuning is done. Best hyper parameter set is {}'
              .format(best_params))
        model = setup_architecture(final_params)
        model = maybe_cuda(model, final_params.cuda)
        opt = setup_opt(final_params.optimizer, model,
                        final_params.learning_rate,
                        final_params.weight_decay)
        agent = agents[final_params.agent](model, opt, final_params)
        print('Training Start')
        x_train_offline = np.concatenate(x_train_offline, axis=0)
        y_train_offline = np.concatenate(y_train_offline, axis=0)
        print("----------run {} training-------------".format(run))
        print('size: {}, {}'.format(x_train_offline.shape,
                                    y_train_offline.shape))
        agent.train_learner(x_train_offline, y_train_offline)
        acc_array = agent.evaluate(test_loaders_full)
        tmp_acc.append(acc_array)
def sample(self, n=1):
    z = maybe_cuda(torch.randn(n, MODELS_NDPM_VAE_Z_DIM))
    x_mean = self.decode(z)
    return x_mean
def sleep(self, dream_dataset):
    print('\nGoing to sleep...')

    # Add a new expert and optimizer
    expert = Expert(self.params, self.get_experts())
    self.experts.append(expert)
    self.prior.add_expert()

    # Rebuild the dream dataset from the short-term memory, holding out a
    # validation split
    stacked_stm_x = torch.stack(self.stm_x)
    stacked_stm_y = torch.stack(self.stm_y)
    indices = torch.randperm(stacked_stm_x.size(0))
    train_size = stacked_stm_x.size(0) - MODELS_NDPM_NDPM_SLEEP_SLEEP_VAL_SIZE
    dream_dataset = TensorDataset(stacked_stm_x[indices[:train_size]],
                                  stacked_stm_y[indices[:train_size]])

    # Prepare data iterator
    self.prior.record_usage(len(dream_dataset), index=-1)
    dream_iterator = iter(
        DataLoader(dream_dataset,
                   batch_size=MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE,
                   num_workers=MODELS_NDPM_NDPM_SLEEP_NUM_WORKERS,
                   sampler=RandomSampler(
                       dream_dataset, replacement=True,
                       num_samples=(MODELS_NDPM_NDPM_SLEEP_STEP_G *
                                    MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE))))

    # Train generative component
    for step, (x, y) in enumerate(dream_iterator):
        step += 1
        x, y = maybe_cuda(x), maybe_cuda(y)
        g_loss = expert.g.nll(x, y, step=step)
        g_loss = (g_loss + MODELS_NDPM_NDPM_WEIGHT_DECAY *
                  expert.g.weight_decay_loss())
        expert.g.zero_grad()
        g_loss.mean().backward()
        expert.g.clip_grad()
        expert.g.optimizer.step()
        if step % MODELS_NDPM_NDPM_SLEEP_SUMMARY_STEP == 0:
            print('\r [Sleep-G %6d] loss: %5.1f' % (step, g_loss.mean()),
                  end='')
    print()

    dream_iterator = iter(
        DataLoader(dream_dataset,
                   batch_size=MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE,
                   num_workers=MODELS_NDPM_NDPM_SLEEP_NUM_WORKERS,
                   sampler=RandomSampler(
                       dream_dataset, replacement=True,
                       num_samples=(MODELS_NDPM_NDPM_SLEEP_STEP_D *
                                    MODELS_NDPM_NDPM_SLEEP_BATCH_SIZE))))

    # Train discriminative component
    if not MODELS_NDPM_NDPM_DISABLE_D:
        for step, (x, y) in enumerate(dream_iterator):
            step += 1
            x, y = maybe_cuda(x), maybe_cuda(y)
            d_loss = expert.d.nll(x, y, step=step)
            d_loss = (d_loss + MODELS_NDPM_NDPM_WEIGHT_DECAY *
                      expert.d.weight_decay_loss())
            expert.d.zero_grad()
            d_loss.mean().backward()
            expert.d.clip_grad()
            expert.d.optimizer.step()
            if step % MODELS_NDPM_NDPM_SLEEP_SUMMARY_STEP == 0:
                print('\r [Sleep-D %6d] loss: %5.1f' % (step, d_loss.mean()),
                      end='')
            expert.lr_scheduler_step()

    expert.lr_scheduler_step()
    expert.eval()
    print()
def evaluate(self, test_loaders):
    self.model.eval()
    acc_array = np.zeros(len(test_loaders))
    if self.params.trick['nmc_trick'] or self.params.agent == 'ICARL':
        # compute the normalized class mean of buffered exemplars per class
        exemplar_means = {}
        cls_exemplar = {cls: [] for cls in self.old_labels}
        buffer_filled = self.buffer.current_index
        for x, y in zip(self.buffer.buffer_img[:buffer_filled],
                        self.buffer.buffer_label[:buffer_filled]):
            cls_exemplar[y.item()].append(x)
        for cls, exemplar in cls_exemplar.items():
            features = []
            # Extract feature for each exemplar in p_y
            for ex in exemplar:
                feature = self.model.features(ex.unsqueeze(0)).detach().clone()
                feature = feature.squeeze()
                feature.data = feature.data / feature.data.norm()  # Normalize
                features.append(feature)
            features = torch.stack(features)
            mu_y = features.mean(0).squeeze()
            mu_y.data = mu_y.data / mu_y.data.norm()  # Normalize
            exemplar_means[cls] = mu_y

    with torch.no_grad():
        if self.params.error_analysis:
            error = 0
            no = 0
            nn = 0
            oo = 0
            on = 0
            new_class_score = AverageMeter()
            old_class_score = AverageMeter()
        for task, test_loader in enumerate(test_loaders):
            acc = AverageMeter()
            for i, (batch_x, batch_y) in enumerate(test_loader):
                batch_x = maybe_cuda(batch_x, self.cuda)
                batch_y = maybe_cuda(batch_y, self.cuda)
                if self.params.trick['nmc_trick'] or self.params.agent == 'ICARL':
                    feature = self.model.features(batch_x)  # (batch_size, feature_size)
                    for j in range(feature.size(0)):  # Normalize
                        feature.data[j] = feature.data[j] / feature.data[j].norm()
                    feature = feature.unsqueeze(2)  # (batch_size, feature_size, 1)
                    means = torch.stack(
                        [exemplar_means[cls] for cls in self.old_labels])  # (n_classes, feature_size)
                    means = torch.stack([means] * batch_x.size(0))  # (batch_size, n_classes, feature_size)
                    means = means.transpose(1, 2)
                    feature = feature.expand_as(means)  # (batch_size, feature_size, n_classes)
                    dists = (feature - means).pow(2).sum(1).squeeze()  # (batch_size, n_classes)
                    _, preds = dists.min(1)
                    correct_cnt = (np.array(self.old_labels)[preds.tolist()] ==
                                   batch_y.cpu().numpy()).sum().item() / batch_y.size(0)
                else:
                    logits = self.model.forward(batch_x)
                    _, pred_label = torch.max(logits, 1)
                    correct_cnt = (pred_label == batch_y).sum().item() / batch_y.size(0)
                    if self.params.error_analysis:
                        if task < self.task_seen - 1:
                            # old test
                            total = (pred_label != batch_y).sum().item()
                            wrong = pred_label[pred_label != batch_y]
                            error += total
                            on_tmp = sum([(wrong == i).sum().item()
                                          for i in self.new_labels_zombie])
                            oo += total - on_tmp
                            on += on_tmp
                            old_class_score.update(
                                logits[:, list(set(self.old_labels) -
                                               set(self.new_labels_zombie))].mean().item(),
                                batch_y.size(0))
                        elif task == self.task_seen - 1:
                            # new test
                            total = (pred_label != batch_y).sum().item()
                            error += total
                            wrong = pred_label[pred_label != batch_y]
                            no_tmp = sum([(wrong == i).sum().item()
                                          for i in list(set(self.old_labels) -
                                                        set(self.new_labels_zombie))])
                            no += no_tmp
                            nn += total - no_tmp
                            new_class_score.update(
                                logits[:, self.new_labels_zombie].mean().item(),
                                batch_y.size(0))
                        else:
                            pass
                acc.update(correct_cnt, batch_y.size(0))
            acc_array[task] = acc.avg()
    print(acc_array)

    if self.params.error_analysis:
        self.error_list.append((no, nn, oo, on))
        self.new_class_score.append(new_class_score.avg())
        self.old_class_score.append(old_class_score.avg())
        print("no ratio: {}\non ratio: {}".format(
            no / (no + nn + 0.1), on / (oo + on + 0.1)))
        print(self.error_list)
        print(self.new_class_score)
        print(self.old_class_score)
        self.fc_norm_new.append(
            self.model.linear.weight[self.new_labels_zombie].mean().item())
        self.fc_norm_old.append(
            self.model.linear.weight[list(set(self.old_labels) -
                                          set(self.new_labels_zombie))].mean().item())
        self.bias_norm_new.append(
            self.model.linear.bias[self.new_labels_zombie].mean().item())
        self.bias_norm_old.append(
            self.model.linear.bias[list(set(self.old_labels) -
                                        set(self.new_labels_zombie))].mean().item())
        print(self.fc_norm_old)
        print(self.fc_norm_new)
        print(self.bias_norm_old)
        print(self.bias_norm_new)
    return acc_array
def weight_decay_loss(self):
    loss = maybe_cuda(torch.zeros([]))
    for param in self.parameters():
        loss += torch.norm(param) ** 2
    return loss
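# Editor's gloss: `weight_decay_loss` is the squared L2 norm of all
# parameters, sum_i ||theta_i||_2^2; it is scaled by the weight-decay
# coefficient where it is added to the loss (see `sleep` above).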