def prune_by_percentile(percent,
                        resample=False,
                        reinit=False,
                        total_params=1,
                        hessian_aware=False,
                        criterion=None,
                        dataloader=None,
                        cuda=True,
                        **kwargs):
    global step
    global mask
    global model

    traces = {}
    print("Start Hessian Computing")
    if hessian_aware:
        model.eval()
        hessian_comp = hessian(model, criterion, data=dataloader, cuda=cuda)
        # Estimate the Hessian trace of every large weight tensor, then flag
        # each layer by whether its mean trace is above the mean over layers.
        for name, param in model.named_parameters():
            param_frac = param.numel() / total_params
            if 'weight' in name and param_frac > 0.01:
                traces[name] = hessian_comp.trace(name)
                print("Trace for layer:{} is {}".format(name, traces[name]))
        traces_thres = np.mean(np.array(list(traces.values())))
        for name, trace in traces.items():
            traces[name] = np.mean(trace) >= traces_thres

    # Calculate percentile value
    step = 0
    for name, param in model.named_parameters():
        # We do not prune bias term
        if 'weight' in name:
            # each layer
            param_frac = param.numel() / total_params
            if param_frac > 0.01:
                tensor = param.data.cpu().numpy()
                # flattened array of nonzero values
                alive = tensor[np.nonzero(tensor)]
                if hessian_aware:
                    # High-trace layers are pruned at half the rate;
                    # low-trace layers at double the rate, capped at 30%.
                    if traces[name]:
                        hess_percent = percent / 2
                    else:
                        hess_percent = min(percent * 2, 30)
                    print("Hessian-aware pruning percent for layer:{} is {}".
                          format(name, hess_percent))
                    percentile_value = np.percentile(abs(alive), hess_percent)
                else:
                    percentile_value = np.percentile(abs(alive), percent)

                # Convert tensors to numpy and calculate the new mask
                weight_dev = param.device
                new_mask = np.where(
                    abs(tensor) < percentile_value, 0, mask[step])

                # Apply new weight and mask
                param.data = torch.from_numpy(tensor * new_mask).to(
                    weight_dev)  # zero out pruned weights
                mask[step] = new_mask
                step += 1
    step = 0
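# Minimal usage sketch (an assumption, not from the source): the function above
# mutates the module-level globals `model`, `mask`, and `step`, so a caller is
# expected to initialize them first. `train_loader` is a hypothetical DataLoader;
# the mask layout mirrors how `mask[step]` is indexed per large weight tensor.
total_params = sum(p.numel() for p in model.parameters())
mask = [np.ones_like(p.data.cpu().numpy())
        for n, p in model.named_parameters()
        if 'weight' in n and p.numel() / total_params > 0.01]
step = 0
prune_by_percentile(10,  # prune 10% of the surviving weights per layer
                    total_params=total_params,
                    hessian_aware=True,
                    criterion=torch.nn.CrossEntropyLoss(),
                    dataloader=next(iter(train_loader)),  # one (inputs, targets) batch
                    cuda=True)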
def get_hessian(model, dataset_name, batch_size):
    """
    Creates a PyHessian object that can be used to compute the top-n
    eigenvalues, the eigenvalue density, or the trace of the Hessian
    of the model.

    Args:
        model: nn.Module
        dataset_name: name of the dataset used to train the model
        batch_size: size of the batch used to create the PyHessian object
    Returns:
        model_hessian: PyHessian object
        inputs: inputs used to create model_hessian
        targets: targets used to create model_hessian
        criterion: loss criterion
    """
    # Work with only one (big) batch
    train_loader, _ = utils.load_data("/home/app/datasets",
                                      batch_size=batch_size,
                                      dataset=dataset_name)
    inputs, targets = next(iter(train_loader))
    criterion = torch.nn.CrossEntropyLoss()
    model_hessian = hessian(model,
                            criterion,
                            data=(inputs, targets),
                            cuda=False)
    return model_hessian, inputs, targets, criterion
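# Usage sketch (assumed, not from the source): the returned PyHessian object
# exposes eigenvalues(), trace(), and density(), matching how it is queried
# elsewhere in this file.
model_hessian, inputs, targets, criterion = get_hessian(model, "cifar10",
                                                        batch_size=512)
top_eigenvalues, top_eigenvectors = model_hessian.eigenvalues(top_n=1)
trace = model_hessian.trace()  # list of Hutchinson trace estimates
density_eigen, density_weight = model_hessian.density()
print("lambda_max = {:.4f}, mean trace = {:.4f}".format(top_eigenvalues[-1],
                                                        np.mean(trace)))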
def get_models_trace(models,
                     data_loader,
                     criterion,
                     full_dataset=False,
                     verbose=False,
                     device=None):
    trace_dict = {}

    # Collect either a single batch or the full dataset for the estimator.
    hessian_dataloader = []
    for i, (inputs, labels) in enumerate(data_loader):
        hessian_dataloader.append((inputs, labels))
        if not full_dataset:
            break

    # get trace for each model
    for k, m in models.items():
        if verbose:
            print(k)
        a = time.time()
        ts = []
        if device is not None:
            m = m.to(device)
            is_gpu = True
        else:
            is_gpu = False

        if full_dataset:
            trace = hessian(m,
                            criterion,
                            dataloader=hessian_dataloader,
                            cuda=is_gpu).trace()
        else:
            trace = hessian(m,
                            criterion,
                            data=hessian_dataloader[0],
                            cuda=is_gpu).trace()
        trace_dict[k] = trace
    return trace_dict
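# Hypothetical call (model names and loader are assumptions): compare the
# Hessian trace of several checkpoints on the same single batch.
models = {"sgd": model_sgd, "adam": model_adam}
trace_dict = get_models_trace(models,
                              data_loader=train_loader,
                              criterion=torch.nn.CrossEntropyLoss(),
                              full_dataset=False,
                              verbose=True,
                              device="cuda:0")
for name, trace in trace_dict.items():
    print("{}: mean trace estimate = {:.4f}".format(name, np.mean(trace)))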
def GetHessianEig(path, hessian_dataloader):
    criterion = nn.CrossEntropyLoss()  # label loss

    # get model
    model = MLP()
    if args.cuda:
        model = model.cuda()
        model = torch.nn.DataParallel(model)
    check_point = torch.load(path)
    model.load_state_dict(check_point.state_dict())

    ######################################################
    # Begin the computation
    ######################################################

    # turn model to eval mode
    model.eval()
    if batch_num == 1:
        hessian_comp = hessian(model,
                               criterion,
                               data=hessian_dataloader,
                               cuda=args.cuda)
    else:
        hessian_comp = hessian(model,
                               criterion,
                               dataloader=hessian_dataloader,
                               cuda=args.cuda)
    print('********** finish data loading and begin Hessian computation **********')
    top_eigenvalues, _ = hessian_comp.eigenvalues(top_n=3)
    return top_eigenvalues
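# Hypothetical invocation (checkpoint path and batch are assumptions): with
# batch_num == 1, hessian_dataloader is a single (inputs, targets) pair.
hessian_dataloader = next(iter(train_loader))
top_eigenvalues = GetHessianEig("checkpoints/mlp_final.pt", hessian_dataloader)
print("Top-3 eigenvalues:", top_eigenvalues)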
# Get model checkpoint, get saving folder
###################
if args.resume == '':
    raise Exception("please choose the trained model")
model.load_state_dict(torch.load(args.resume))

######################################################
# Begin the computation
######################################################

# turn model to eval mode
model.eval()
if batch_num == 1:
    hessian_comp = hessian(model,
                           criterion,
                           data=hessian_dataloader,
                           cuda=args.cuda,
                           record_data=True)
else:
    hessian_comp = hessian(model,
                           criterion,
                           dataloader=hessian_dataloader,
                           cuda=args.cuda,
                           record_data=True)
print('********** finish data loading and begin Hessian computation **********')

print("Computing eigenvalues...")
top_eigenvalues, _ = hessian_comp.eigenvalues(top_n=20, debug=True)
print("Computing trace...")
# for k, v in state_dict.items():
#     name = k[7:]
#     new_state_dict[name] = v
model.load_state_dict(torch.load(args.resume + '/' + files[file_index]))

######################################################
# Begin the computation
######################################################

# turn model to eval mode
model.eval()
if batch_num == 1:
    hessian_comp = hessian(model,
                           criterion,
                           data=hessian_dataloader,
                           cuda=args.cuda)
else:
    hessian_comp = hessian(model,
                           criterion,
                           dataloader=hessian_dataloader,
                           cuda=args.cuda)
print('********** finish data loading and begin Hessian computation **********')

print('now is for the whole model')
top_eigenvalues, _ = hessian_comp.eigenvalues()
trace = hessian_comp.trace()
density_eigen, density_weight = hessian_comp.density()
###################
# Get model checkpoint, get saving folder
###################
if args.resume == '':
    raise Exception("please choose the trained model")
model.load_state_dict(torch.load(args.resume))

######################################################
# Begin the computation
######################################################

# turn model to eval mode
model.eval()
if batch_num == 1:
    hessian_comp = hessian(model,
                           criterion,
                           data=hessian_dataloader,
                           cuda=args.cuda)
else:
    hessian_comp = hessian(model,
                           criterion,
                           dataloader=hessian_dataloader,
                           cuda=args.cuda)
print('********** finish data loading and begin Hessian computation **********')
top_eigenvalues, _ = hessian_comp.eigenvalues()
trace = hessian_comp.trace()
density_eigen, density_weight = hessian_comp.density()
print('\n***Top Eigenvalues: ', top_eigenvalues)
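# A minimal sketch (assumes matplotlib; not from the source) for turning the
# stochastic Lanczos quadrature output of hessian_comp.density() above into an
# eigenvalue spectral density plot, via the usual Gaussian smoothing of the
# Ritz values, similar to PyHessian's own density_plot utility.
import matplotlib.pyplot as plt

eigs = np.array(density_eigen).flatten()
weights = np.array(density_weight).flatten()
grid = np.linspace(eigs.min() - 1.0, eigs.max() + 1.0, 10000)
sigma = 0.01 * max(1.0, eigs.max() - eigs.min())
# smoothed density: weighted sum of Gaussians centered at the Ritz values,
# averaged over the Hutchinson probe runs
density = np.array([
    np.sum(weights * np.exp(-(x - eigs)**2 / (2 * sigma**2)))
    for x in grid
]) / (np.sqrt(2 * np.pi) * sigma * len(density_eigen))
plt.semilogy(grid, density + 1e-7)  # small offset keeps the log scale finite
plt.xlabel('Eigenvalue')
plt.ylabel('Density (log scale)')
plt.savefig('esd.png')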
def _hard_training_step(nets,
                        nets_weights,
                        net_optimizers,
                        net_data_loaders,
                        criterion,
                        weight_type,
                        var_noise=None,
                        curr_step=0,
                        writer=None,
                        device=None,
                        hard_train_eps=None):
    """Does an update step on all networks and computes the weights.

    To do a random walk instead, set the learning rate of net_optimizer to
    zero and set var_noise to the desired noise level."""
    taking_step = True
    steps_taken = 0
    mean_loss = 0
    curr_loss = float("inf")
    # Hard training to a loss threshold is only supported for a single network.
    assert not (hard_train_eps and (len(nets) > 1))

    continue_training = True

    for idx_net in range(len(nets)):
        # get net and optimizer
        net = nets[idx_net]
        optimizer = net_optimizers[idx_net]

        # get the inputs; data is a list of [inputs, labels]
        try:
            data = next(net_data_loaders[idx_net])
        except StopIteration:
            taking_step = False
            break
        inputs, labels = data
        if device is not None:
            inputs, labels = inputs.to(device).type(
                torch.cuda.FloatTensor), labels.to(device).type(
                    torch.cuda.LongTensor)

        while continue_training:
            # Compute gradients for input.
            inputs.requires_grad = True

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward(retain_graph=True)
            optimizer.step()

            if var_noise is not None:
                with torch.no_grad():
                    for param in net.parameters():
                        param.add_(torch.randn(param.size()) * var_noise)

            # update weights with the loss-gradient norm
            param_grads = get_grad_params_vec(net)
            curr_weight = torch.norm(param_grads)
            nets_weights[idx_net] += curr_weight

            # store metrics for each net
            if writer is not None:
                writer.add_scalar('Loss/train/net_{}'.format(idx_net), loss,
                                  curr_step)
                writer.add_scalar('Potential/curr/net_{}'.format(idx_net),
                                  curr_weight, curr_step)
                writer.add_scalar('Norm/net_{}'.format(idx_net),
                                  torch.norm(get_params_vec(net)), curr_step)

                if (curr_step % 50) == 0:
                    # Log the mean Hutchinson estimate of the Hessian trace.
                    is_gpu = device is not None
                    trace = np.mean(
                        hessian(net,
                                criterion,
                                data=(inputs, labels),
                                cuda=is_gpu).trace())
                    writer.add_scalar('Trace/net_{}'.format(idx_net), trace,
                                      curr_step)

            mean_loss += float(loss)
            steps_taken += 1
            curr_step += 1
            # keep looping on this batch until the loss threshold is reached
            continue_training = float(loss) > hard_train_eps

    assert taking_step or (idx_net == 0)
    return nets, nets_weights, steps_taken, mean_loss / max(steps_taken, 1)
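# Hedged example (setup names are assumptions): hard-train a single network on
# one batch until its loss drops below 0.05; the assert above restricts
# hard_train_eps to the single-network case.
nets, nets_weights, steps, avg_loss = _hard_training_step(
    [net], [0.0], [optimizer], [iter(train_loader)],
    torch.nn.CrossEntropyLoss(), weight_type="loss_gradient_weights",
    curr_step=0, writer=None, device=None, hard_train_eps=0.05)
print("took {} steps, final mean loss {:.4f}".format(steps, avg_loss))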
def training_step(nets,
                  nets_weights,
                  net_optimizers,
                  net_data_loaders,
                  criterion,
                  weight_type,
                  var_noise=None,
                  curr_step=0,
                  writer=None,
                  device=None):
    """Does an update step on all networks and computes the weights.

    To do a random walk instead, set the learning rate of net_optimizer to
    zero and set var_noise to the desired noise level."""
    taking_step = True
    steps_taken = 0
    mean_loss = 0

    for idx_net in range(len(nets)):
        # get net and optimizer
        net = nets[idx_net]
        optimizer = net_optimizers[idx_net]

        # get the inputs; data is a list of [inputs, labels]
        try:
            data = next(net_data_loaders[idx_net])
        except StopIteration:
            taking_step = False
            break
        inputs, labels = data
        if device is not None:
            inputs, labels = inputs.to(device).type(
                torch.cuda.FloatTensor), labels.to(device).type(
                    torch.cuda.LongTensor)

        # Compute gradients for input.
        inputs.requires_grad = True

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward(retain_graph=True)
        optimizer.step()

        if var_noise is not None:
            with torch.no_grad():
                for param in net.parameters():
                    noise = torch.randn(param.size()) * var_noise
                    if device is not None:
                        noise = noise.to(device)
                    param.add_(noise)

        # update weights
        if weight_type == "input_output_forbenius":
            # zero the parameter gradients
            optimizer.zero_grad()

            # get gradients of the output Frobenius norm w.r.t. the inputs
            output_forb = torch.norm(outputs)
            output_forb.backward()
            input_grads = inputs.grad

            curr_weight = weight_function_input_jacobian(input_grads)
            nets_weights[idx_net] += curr_weight
        elif weight_type == "loss_gradient_weights":
            param_grads = get_grad_params_vec(net)
            curr_weight = torch.norm(param_grads)
            nets_weights[idx_net] += curr_weight
        elif weight_type is None:
            curr_weight = 0
        else:
            raise NotImplementedError()

        # store metrics for each net
        if writer is not None:
            writer.add_scalar('Loss/train/net_{}'.format(idx_net), loss,
                              curr_step)
            writer.add_scalar('Potential/curr/net_{}'.format(idx_net),
                              curr_weight, curr_step)
            writer.add_scalar('Potential/total/net_{}'.format(idx_net),
                              nets_weights[idx_net], curr_step)
            writer.add_scalar('Norm/net_{}'.format(idx_net),
                              torch.norm(get_params_vec(net)), curr_step)

            if (curr_step % 50) == 0:
                # Log the mean Hutchinson estimate of the Hessian trace.
                is_gpu = device is not None
                trace = np.mean(
                    hessian(net,
                            criterion,
                            data=(inputs, labels),
                            cuda=is_gpu).trace())
                writer.add_scalar('Trace/net_{}'.format(idx_net), trace,
                                  curr_step)

        mean_loss += float(loss)

    assert taking_step or (idx_net == 0)
    return nets, nets_weights, 1 * taking_step, mean_loss / len(nets)
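# Assumed driver loop (epochs, train_loaders, optimizers, writer are
# hypothetical): training_step advances every network by one batch per call and
# returns 0 as its third value once any loader is exhausted, ending the epoch.
curr_step = 0
for epoch in range(epochs):
    iterators = [iter(dl) for dl in train_loaders]
    stepping = 1
    while stepping:
        nets, nets_weights, stepping, mean_loss = training_step(
            nets, nets_weights, optimizers, iterators, criterion,
            weight_type="loss_gradient_weights", curr_step=curr_step,
            writer=writer, device=device)
        curr_step += 1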