def dataloader_hv_product(self, v):
    device = self.device
    num_data = 0  # count the number of data points in the dataloader

    THv = [torch.zeros(p.size()).to(device) for p in self.params]  # accumulate result
    for inputs, targets in self.data:
        self.model.zero_grad()
        tmp_num_data = inputs.size(0)
        outputs = self.model(inputs.to(device))
        loss = self.criterion(outputs, targets.to(device))
        loss.backward(create_graph=True)
        params, gradsH = get_params_grad(self.model)
        self.model.zero_grad()
        Hv = torch.autograd.grad(gradsH,
                                 params,
                                 grad_outputs=v,
                                 only_inputs=True,
                                 retain_graph=False)
        # weight each batch's Hessian-vector product by its batch size
        THv = [
            THv1 + Hv1 * float(tmp_num_data) + 0.
            for THv1, Hv1 in zip(THv, Hv)
        ]
        num_data += float(tmp_num_data)

    THv = [THv1 / float(num_data) for THv1 in THv]
    eigenvalue = group_product(THv, v).cpu().item()
    return eigenvalue, THv
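# `dataloader_hv_product` averages per-batch Hessian-vector products,
# weighted by batch size, so the result is the Hessian-vector product over
# the whole dataset; the returned "eigenvalue" is the Rayleigh quotient
# v^T H v for the (unit-norm) input vector v. Below is a minimal sketch of
# the two helpers it relies on -- these are assumptions consistent with how
# they are called above, not necessarily this repository's exact
# implementations:

import torch


def get_params_grad(model):
    """Collect the trainable parameters and their gradients (as populated
    by a backward pass with create_graph=True). Sketch of the assumed
    helper."""
    params = []
    grads = []
    for param in model.parameters():
        if not param.requires_grad:
            continue
        params.append(param)
        # substitute a zero when a parameter has no gradient yet
        grads.append(0. if param.grad is None else param.grad + 0.)
    return params, grads


def group_product(xs, ys):
    """Inner product of two lists of tensors viewed as one flattened
    vector: sum_i <x_i, y_i>."""
    return sum(torch.sum(x * y) for x, y in zip(xs, ys))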
def __init__(self,
             model,
             criterion,
             data=None,
             dataloader=None,
             cuda=True,
             data_save_dir="data",
             record_data=False):
    """
    model: the model that needs Hessian information
    criterion: the loss function
    data: a single batch of data, including inputs and the corresponding labels
    dataloader: a data loader yielding batches of data
    """
    # make sure we pass exactly one of a single batch or a dataloader
    assert (data is not None and dataloader is None) or \
           (data is None and dataloader is not None)

    self.model = model.eval()  # make sure the model is in evaluation mode
    self.criterion = criterion

    if data is not None:
        self.data = data
        self.full_dataset = False
    else:
        self.data = dataloader
        self.full_dataset = True

    if cuda:
        self.device = 'cuda'
    else:
        self.device = 'cpu'

    # check whether the data save folder has been created
    self.record_data = record_data
    self.data_save_dir = data_save_dir + "/"
    if record_data and not os.path.exists(data_save_dir):
        try:
            os.mkdir(data_save_dir)
        except OSError:
            print("Could not create data save directory")

    # pre-processing for the single-batch case to simplify the computation
    if not self.full_dataset:
        self.inputs, self.targets = self.data
        if self.device == 'cuda':
            self.inputs, self.targets = self.inputs.cuda(), self.targets.cuda()

        # for a single batch we can compute the gradients once and re-use them
        outputs = self.model(self.inputs)
        loss = self.criterion(outputs, self.targets)
        loss.backward(create_graph=True)

    # extract the parameters and gradients from the model
    params, gradsH = get_params_grad(self.model)
    self.params = params
    self.gradsH = gradsH  # gradient used for Hessian computation
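# A brief usage sketch for the single-batch path. The class name `hessian`
# and the toy model/criterion below are illustrative assumptions, not
# names confirmed by this file:
#
#   import torch
#   import torch.nn as nn
#
#   model = nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 2))
#   criterion = nn.CrossEntropyLoss()
#   inputs = torch.randn(8, 10)
#   targets = torch.randint(0, 2, (8,))
#
#   # single-batch mode: __init__ runs one backward pass with
#   # create_graph=True, so the stored gradients can be re-used for
#   # repeated Hessian-vector products
#   hess = hessian(model, criterion, data=(inputs, targets), cuda=False)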
def __init__(self,
             model,
             criterion,
             data=None,
             dataloader=None,
             cuda=True,
             drop=None):
    """
    model: the model that needs Hessian information
    criterion: the loss function
    data: a single batch of data, including inputs and the corresponding labels
    dataloader: a data loader yielding batches of data
    """
    # make sure we pass exactly one of a single batch or a dataloader
    assert (data is not None and dataloader is None) or \
           (data is None and dataloader is not None)

    self.model = model.eval()  # make sure the model is in evaluation mode
    self.criterion = criterion

    if data is not None:
        self.data = data
        self.full_dataset = False
    else:
        self.data = dataloader
        self.full_dataset = True

    if cuda:
        self.device = 'cuda'
    else:
        self.device = 'cpu'

    # pre-processing for the single-batch case to simplify the computation
    if not self.full_dataset:
        self.inputs, self.targets = self.data
        if self.device == 'cuda':
            self.inputs, self.targets = self.inputs.cuda(), self.targets.cuda()

        # for a single batch we can compute the gradients once and re-use them
        outputs = self.model(self.inputs)
        loss = self.criterion(outputs, self.targets)
        loss.backward(create_graph=True)

    # extract the parameter names, parameters, and gradients from the model
    names, params, gradsH = get_params_grad(self.model)
    self.params = params
    self.names = names
    self.gradsH = gradsH  # gradient used for Hessian computation
    self.drop = drop
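# This variant expects `get_params_grad` to also return parameter names
# (note the three-value unpacking above), and additionally stores a `drop`
# handle. A minimal sketch of such a names-returning helper -- an
# assumption mirroring the two-value version used elsewhere:

def get_params_grad(model):
    """Return names, parameters, and gradients of all trainable parameters.
    Sketch of the assumed three-value helper."""
    names, params, grads = [], [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        names.append(name)
        params.append(param)
        grads.append(0. if param.grad is None else param.grad + 0.)
    return names, params, grads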
def train_hessian(args,
                  trainer,
                  task,
                  epoch_itr,
                  sample_iter=1,
                  maxIter=500,
                  tol=1e-4,
                  top_n=1,
                  ignore_grad=False):
    """Estimate the top-n Hessian eigenvalues and eigenvectors over one
    epoch's data via power iteration."""
    # Update parameters every N batches
    update_freq = (args.update_freq[epoch_itr.epoch - 1]
                   if epoch_itr.epoch <= len(args.update_freq)
                   else args.update_freq[-1])

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf

    # collect a fixed subset of batches over which the Hessian is estimated
    max_iters = 10
    samples_hessian = []
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        if i > max_iters:
            break
        samples = [trainer._prepare_sample(sample) for sample in samples]
        samples_hessian.extend(samples)

    eigenvalues = []
    eigenvectors = []
    computed_dim = 0
    params, gradsH = get_params_grad(trainer.model)

    while computed_dim < top_n:
        eigenvalue = None
        v = [torch.randn(p.size()).cuda() for p in params]  # random initial vector
        v = normalization(v)

        for i in range(maxIter):
            trainer.model.zero_grad()
            # project out previously found eigenvectors (deflation)
            v = orthnormal(v, eigenvectors)
            loss, sample_size, logging_output, gradsH, tmp_eigenvalue, Hv = \
                trainer.task.train_step_hessian(samples_hessian,
                                                trainer.model,
                                                trainer.criterion,
                                                trainer.optimizer,
                                                ignore_grad,
                                                v=v)
            v = normalization(Hv)
            if eigenvalue is None:
                eigenvalue = tmp_eigenvalue
            elif abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) + 1e-6) < tol:
                break
            else:
                eigenvalue = tmp_eigenvalue

        eigenvalues.append(eigenvalue)
        eigenvectors.append(v)
        computed_dim += 1

    return eigenvalues, eigenvectors
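# `train_hessian` is power iteration with deflation: repeatedly apply the
# Hessian to a random unit vector until the Rayleigh quotient converges,
# then orthogonalize the next starting vector against the eigenvectors
# already found to recover the next-largest eigenpair. Minimal sketches of
# the `normalization` and `orthnormal` helpers it depends on --
# assumptions consistent with their use above, not confirmed
# implementations:

import torch


def normalization(v):
    """Scale a list of tensors so the concatenated vector has unit L2 norm
    (sketch of the assumed helper)."""
    s = torch.sqrt(sum(torch.sum(x * x) for x in v)).item()
    return [x / (s + 1e-6) for x in v]


def orthnormal(w, v_list):
    """One Gram-Schmidt pass: make w orthogonal to every vector in v_list,
    then normalize. This deflation step steers power iteration toward the
    next eigenvector."""
    for v in v_list:
        coeff = sum(torch.sum(wi * vi) for wi, vi in zip(w, v))
        w = [wi - coeff * vi for wi, vi in zip(w, v)]
    return normalization(w)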