def train(self,t,xtrain,ytrain,xvalid,yvalid): best_loss=np.inf best_model=utils.get_model(self.model) lr=self.lr patience=self.lr_patience self.optimizer=self._get_optimizer(lr) # Loop epochs for e in range(self.nepochs): # Train clock0=time.time() self.train_epoch(t,xtrain,ytrain) clock1=time.time() train_loss,train_acc=self.eval(t,xtrain,ytrain) clock2=time.time() print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format( e+1,1000*self.sbatch*(clock1-clock0)/xtrain.size(0),1000*self.sbatch*(clock2-clock1)/xtrain.size(0),train_loss,100*train_acc),end='') # Valid valid_loss,valid_acc=self.eval(t,xvalid,yvalid) print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss,100*valid_acc),end='') # Adapt lr if valid_loss<best_loss: best_loss=valid_loss best_model=utils.get_model(self.model) patience=self.lr_patience print(' *',end='') else: patience-=1 if patience<=0: lr/=self.lr_factor print(' lr={:.1e}'.format(lr),end='') if lr<self.lr_min: print() break patience=self.lr_patience self.optimizer=self._get_optimizer(lr) print() # Restore best utils.set_model_(self.model,best_model) # Model update if t==0: self.fisher=utils.fisher_matrix_diag(t,xtrain,ytrain,self.model,self.criterion) else: fisher_new=utils.fisher_matrix_diag(t,xtrain,ytrain,self.model,self.criterion) for (n,p),(_,p_old) in zip(self.model.named_parameters(),self.model_old.named_parameters()): p=fisher_new[n]*p+self.fisher[n]*p_old self.fisher[n]+=fisher_new[n] p/=(self.fisher[n]==0).float()+self.fisher[n] # Old model save self.model_old=deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) return
def post_train(self, t, xtrain, ytrain, xvalid, yvalid): # store the old model (and freeze it for gradients) self.model_old = deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) # Freeze the weights # NOTE: other option is to save models to disk and reload them after each training session (slower but more accurate?) # deep copy the values from the old fisher matrix (previous models) if t > 0: fisher_old = {} for n, _ in self.model.named_parameters(): fisher_old[n] = self.fisher[n].clone() # compute the fisher matrix for the current model # NOTE: shouldn't it be recomputed for all outputs? self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass) # combine the fisher matrices if t > 0: # Watch out! We do not want to keep t models (or fisher diagonals) in memory, therefore we have to merge fisher diagonals # NOTE: is that equivalent? for n, _ in self.model.named_parameters(): # count the old fisher matrix t times for the number of pervious tasks self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / ( t + 1) # Checked: it is better than the other option #self.fisher[n]=0.5*(self.fisher[n]+fisher_old[n]) return
def train(self,t,xtrain,ytrain,xvalid,yvalid): best_loss=np.inf best_model=utils.get_model(self.model) lr=self.lr patience=self.lr_patience self.optimizer=self._get_optimizer(lr) # Loop epochs for e in range(self.nepochs): # Train clock0=time.time() self.train_epoch(t,xtrain,ytrain) clock1=time.time() train_loss,train_acc=self.eval(t,xtrain,ytrain) clock2=time.time() print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format( e+1,1000*self.sbatch*(clock1-clock0)/xtrain.size(0),1000*self.sbatch*(clock2-clock1)/xtrain.size(0),train_loss,100*train_acc),end='') # Valid valid_loss,valid_acc=self.eval(t,xvalid,yvalid) print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss,100*valid_acc),end='') # Adapt lr if valid_loss<best_loss: best_loss=valid_loss best_model=utils.get_model(self.model) patience=self.lr_patience print(' *',end='') else: patience-=1 if patience<=0: lr/=self.lr_factor print(' lr={:.1e}'.format(lr),end='') if lr<self.lr_min: print() break patience=self.lr_patience self.optimizer=self._get_optimizer(lr) print() # Restore best utils.set_model_(self.model,best_model) # Update old self.model_old=deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) # Freeze the weights # Fisher ops if t>0: fisher_old={} for n,_ in self.model.named_parameters(): fisher_old[n]=self.fisher[n].clone() self.fisher=utils.fisher_matrix_diag(t,xtrain,ytrain,self.model,self.criterion) if t>0: # Watch out! We do not want to keep t models (or fisher diagonals) in memory, therefore we have to merge fisher diagonals for n,_ in self.model.named_parameters(): self.fisher[n]=(self.fisher[n]+fisher_old[n]*t)/(t+1) # Checked: it is better than the other option #self.fisher[n]=0.5*(self.fisher[n]+fisher_old[n]) torch.save(self.model.state_dict(),'pretrain_ewc.pth') return
def post_train(self, t, xtrain, ytrain, xvalid, yvalid): # Model update if t == 0: self.fisher = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass) else: fisher_new = utils.fisher_matrix_diag(t, xtrain, ytrain, self.model, self._fw_pass) for (n, p), (_, p_old) in zip(self.model.named_parameters(), self.model_old.named_parameters()): p = fisher_new[n] * p + self.fisher[n] * p_old self.fisher[n] += fisher_new[n] p /= (self.fisher[n] == 0).float() + self.fisher[n] # Old model save self.model_old = deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) return
def train(self, t, xtrain, ytrain, xvalid, yvalid, data, input_size, taskcla): best_loss = np.inf best_model = utils.get_model(self.model) lr = self.lr patience = self.lr_patience self.optimizer = self._get_optimizer(lr) # Loop epochs for e in range(self.nepochs): # Train clock0=time.time() self.train_epoch(t,xtrain,ytrain) clock1=time.time() train_loss,train_acc=self.eval(t,xtrain,ytrain) clock2=time.time() print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format( e+1,1000*self.sbatch*(clock1-clock0)/xtrain.size(0),1000*self.sbatch*(clock2-clock1)/xtrain.size(0),train_loss,100*train_acc),end='') # Valid valid_loss,valid_acc=self.eval(t,xvalid,yvalid) print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss,100*valid_acc),end='') #save log for current task & old tasks at every epoch self.logger.add(epoch=(t*self.nepochs)+e, task_num=t+1, valid_loss=valid_loss, valid_acc=valid_acc) for task in range(t): xvalid_t=data[task]['valid']['x'].cuda() yvalid_t=data[task]['valid']['y'].cuda() valid_loss_t,valid_acc_t=self.eval(task,xvalid_t,yvalid_t) self.logger.add(epoch=(t*self.nepochs)+e, task_num=task+1, valid_loss=valid_loss_t, valid_acc=valid_acc_t) # Adapt lr if valid_loss < best_loss: best_loss = valid_loss best_model = utils.get_model(self.model) patience = self.lr_patience print(' *', end='') else: patience -= 1 if patience <= 0: lr /= self.lr_factor print(' lr={:.1e}'.format(lr), end='') if lr < self.lr_min: print() break patience = self.lr_patience self.optimizer = self._get_optimizer(lr) print() # Restore best utils.set_model_(self.model, best_model) self.logger.save() # Update old self.model_old = Net(input_size, taskcla).cuda() self.model_old.load_state_dict(self.model.state_dict()) self.model_old.eval() utils.freeze_model(self.model_old) # Freeze the weights # Fisher ops if t>0: fisher_old={} for n,_ in self.model.named_parameters(): fisher_old[n]=self.fisher[n].clone() self.fisher=utils.fisher_matrix_diag(t,xtrain,ytrain,self.model,self.criterion) if t>0: # Watch out! We do not want to keep t models (or fisher diagonals) in memory, therefore we have to merge fisher diagonals for n,_ in self.model.named_parameters(): self.fisher[n]=(self.fisher[n]+fisher_old[n]*t)/(t+1) # Checked: it is better than the other option #self.fisher[n]=0.5*(self.fisher[n]+fisher_old[n]) return
def train(self, t, train_data, valid_data, device='cuda'): self.writer.add_text( "ModelSize/Task_{}".format(t), "model size = {}".format(utils.get_model_size(self.model))) best_loss = np.inf best_model = utils.get_model(self.model) lr = self.lr # 1 define the optimizer and scheduler self.optimizer = self._get_optimizer(lr) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=self.lr_patience, # factor=self.lr_factor, threshold=0.001) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( self.optimizer, self.epochs) # 2 define the dataloader train_loader = torch.utils.data.DataLoader(train_data, batch_size=self.batch, shuffle=True, num_workers=4, pin_memory=True) valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=self.batch, shuffle=False, num_workers=4, pin_memory=True) # 3 training the model for e in range(self.epochs): # 3.1 train self.train_epoch(t, train_loader, device=device) # 3.2 compute training loss train_loss, train_acc = self.eval(t, train_loader, mode='train', device=device) # 3.3 compute valid loss valid_loss, valid_acc = self.eval(t, valid_loader, mode='train', device=device) # 3.4 logging print( '| Epoch {:3d} | Train: loss={:.3f}, acc={:5.1f}% | Valid: loss={:.3f}, acc={:5.1f}% |' .format(e, train_loss, 100 * train_acc, valid_loss, 100 * valid_acc)) self.writer.add_scalars('Train_Loss/Task: {}'.format(t), { 'train_loss': train_loss, 'valid_loss': valid_loss }, global_step=e) self.writer.add_scalars('Train_Accuracy/Task: {}'.format(t), { 'train_acc': train_acc * 100, 'valid_acc': valid_acc * 100 }, global_step=e) # 3.5 Adapt learning rate scheduler.step() # 3.6 update the best model if valid_loss < best_loss: best_loss = valid_loss best_model = utils.get_model(self.model) # 4 Restore best model utils.set_model_(self.model, best_model) # Update old self.model_old = deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) # Freeze the weights # Fisher ops if t > 0: fisher_old = {} for n, _ in self.model.named_parameters(): fisher_old[n] = self.fisher[n].clone() self.fisher = utils.fisher_matrix_diag(t, train_loader, self.model, self.criterion, device, self.batch) if t > 0: # Watch out! We do not want to keep t models (or fisher diagonals) in memory, # therefore we have to merge fisher diagonals for n, _ in self.model.named_parameters(): self.fisher[n] = (self.fisher[n] + fisher_old[n] * t) / (t + 1) return
def trainsi(self,t,xtrain,ytrain,xvalid,yvalid): best_loss=np.inf best_model=utils.get_model(self.model) lr=self.lr patience=self.lr_patience self.optimizer=self._get_optimizer(lr) # Loop epochs for e in range(self.nepochs): # Train clock0=time.time() self.train_epoch(t,xtrain,ytrain) clock1=time.time() train_loss,train_acc=self.eval(t,xtrain,ytrain) clock2=time.time() print('| Epoch {:3d}, time={:5.1f}ms/{:5.1f}ms | Train: loss={:.3f}, acc={:5.1f}% |'.format( e+1,1000*self.sbatch*(clock1-clock0)/xtrain.size(0),1000*self.sbatch*(clock2-clock1)/xtrain.size(0),train_loss,100*train_acc),end='') # Valid valid_loss,valid_acc=self.eval(t,xvalid,yvalid) print(' Valid: loss={:.3f}, acc={:5.1f}% |'.format(valid_loss,100*valid_acc),end='') # Adapt lr if valid_loss<best_loss: best_loss=valid_loss best_model=utils.get_model(self.model) patience=self.lr_patience print(' *',end='') else: patience-=1 if patience<=0: lr/=self.lr_factor print(' lr={:.1e}'.format(lr),end='') if lr<self.lr_min: print() break patience=self.lr_patience self.optimizer=self._get_optimizer(lr) print() # After each task is complete, call update_big_omega and reset_small_omega # Reset_small_omega also makes a backup of the final weights, used as hook in the auxiliary loss self.big_omega_var = self.update_big_omega(self.model.named_parameters(), self.previous_weights_mu_minus_1, self.small_omega_var) for i, (name, var) in enumerate(self.model.named_parameters()): self.previous_weights_mu_minus_1[name] = var.data self.small_omega_var[name] = 0.0 # Restore best utils.set_model_(self.model,best_model) # Update old self.model_old=deepcopy(self.model) self.model_old.eval() utils.freeze_model(self.model_old) # Freeze the weights # Fisher ops if t>0: fisher_old={} for n,_ in self.model.named_parameters(): fisher_old[n]=self.fisher[n].clone() self.fisher=utils.fisher_matrix_diag(t,xtrain,ytrain,self.model,self.criterion) if t>0: # Watch out! We do not want to keep t models (or fisher diagonals) in memory, therefore we have to merge fisher diagonals for n,_ in self.model.named_parameters(): self.fisher[n]=(self.fisher[n]+fisher_old[n]*t)/(t+1) # Checked: it is better than the other option #self.fisher[n]=0.5*(self.fisher[n]+fisher_old[n]) return