def calc_grad_z(model, train_loader, save_pth=False, gpu=-1, start=0):
    """Compute grad_z for every training sample, optionally saving each to disk.

    One grad_z is computed per training data sample.

    Arguments:
        model: pytorch model for which the gradients are computed
        train_loader: pytorch dataloader providing the training data
        save_pth: Path or str, directory in which to store the grad_z
            files; False (default) keeps the results in memory instead
        gpu: int, device id to use for GPU, -1 for CPU (default)
        start: int, index of the first training sample to process (default 0)

    Returns:
        grad_zs: list of torch tensors, the grad_z results (empty when
            they were written to disk instead)
        save_pth: Path where the grad_z files were saved, or False if
            they were not saved."""
    if isinstance(save_pth, str) and save_pth:
        save_pth = Path(save_pth)
    if not save_pth:
        logging.info("ATTENTION: Not saving grad_z files!")

    grad_zs = []
    dataset = train_loader.dataset
    collate = train_loader.collate_fn
    for idx in tqdm(range(start, len(dataset)), desc="Calc. grad_z"):
        sample, label = dataset[idx]
        sample = collate([sample])
        label = collate([label])
        grad_z_vec = grad_z(sample, label, model, gpu=gpu)
        if save_pth:
            # Move everything to CPU so the saved files load without a GPU.
            grad_z_vec = [g.cpu() for g in grad_z_vec]
            torch.save(grad_z_vec, save_pth.joinpath(f"{idx}.grad_z"))
        else:
            grad_zs.append(grad_z_vec)
    return grad_zs, save_pth
def calc_influence_single(model, train_loader, test_loader, test_id_num, gpu,
                          recursion_depth, r, s_test_vec=None,
                          time_logging=False):
    """Calculates the influences of all training data points on a single
    test dataset image.

    Arguments:
        model: pytorch model
        train_loader: DataLoader, loads the training dataset
        test_loader: DataLoader, loads the test dataset
        test_id_num: int, id of the test sample for which to calculate
            the influence function
        gpu: int, identifies the gpu id, -1 for cpu
        recursion_depth: int, number of recursions to perform during
            s_test calculation, increases accuracy. r*recursion_depth
            should equal the training dataset size.
        r: int, number of iterations of which to take the avg. of the
            h_estimate calculation; r*recursion_depth should equal the
            training dataset size.
        s_test_vec: list of torch tensor, contains s_test vectors. If
            None (default) it will be calculated here.
        time_logging: bool, if True log the wall time of each grad_z call.

    Returns:
        influences: list of float, influences of all training data
            samples for one test sample
        harmful: list of int, training sample indices sorted from most
            harmful to least harmful
        helpful: list of int, training sample indices sorted from most
            helpful to least helpful
        test_id_num: int, the number of the test dataset point the
            influence was calculated for"""
    # Compute the s_test vector only when the caller did not supply one.
    # An explicit `is None` check is required: truth-testing a supplied
    # tensor raises on ambiguous truthiness, and any falsy-but-valid
    # value would wrongly trigger recomputation.
    if s_test_vec is None:
        z_test, t_test = test_loader.dataset[test_id_num]
        z_test = test_loader.collate_fn([z_test])
        t_test = test_loader.collate_fn([t_test])
        s_test_vec = calc_s_test_single(model, z_test, t_test, train_loader,
                                        gpu, recursion_depth=recursion_depth,
                                        r=r)

    # Influence of each training point: -grad_z . s_test / n
    train_dataset_size = len(train_loader.dataset)
    influences = []
    for i in range(train_dataset_size):
        z, t = train_loader.dataset[i]
        z = train_loader.collate_fn([z])
        t = train_loader.collate_fn([t])
        if time_logging:
            time_a = datetime.datetime.now()
        grad_z_vec = grad_z(z, t, model, gpu=gpu)
        if time_logging:
            time_b = datetime.datetime.now()
            time_delta = time_b - time_a
            logging.info(f"Time for grad_z iter:"
                         f" {time_delta.total_seconds() * 1000}")
        # .item() yields a Python float so `influences` matches the
        # documented "list of float" contract; the previous `.data`
        # access kept 0-d tensors alive (and on GPU) unnecessarily.
        # NOTE: flagged as a potential bottleneck (~17% of execution
        # time in profiling).
        tmp_influence = -sum(
            torch.sum(k * j).item()
            for k, j in zip(grad_z_vec, s_test_vec)
        ) / train_dataset_size
        influences.append(tmp_influence)
        display_progress("Calc. influence function: ", i, train_dataset_size)

    # Ascending sort of influence values: most harmful (most negative
    # influence) first; reversed order gives most helpful first.
    harmful = np.argsort(influences)
    helpful = harmful[::-1]

    return influences, harmful.tolist(), helpful.tolist(), test_id_num