def load_grad_z(grad_z_dir=Path("./grad_z/"), train_dataset_size=-1):
    """Loads all grad_z data required to calculate the influence function
    and returns it.

    Arguments:
        grad_z_dir: Path, folder containing files storing the grad_z values
        train_dataset_size: int, number of total samples in dataset;
            -1 indicates to use all available grad_z files

    Returns:
        grad_z_vecs: list of torch tensors, contains the grad_z tensors"""
    if isinstance(grad_z_dir, str):
        grad_z_dir = Path(grad_z_dir)

    grad_z_vecs = []
    logging.info(f"Loading grad_z from: {grad_z_dir} ...")
    # Path.glob returns a generator, so materialize it before counting.
    available_grad_z_files = len(list(grad_z_dir.glob("*.grad_z")))
    if available_grad_z_files != train_dataset_size:
        logging.warning("Load Influence Data: number of grad_z files mismatches"
                        " the dataset size")
        if train_dataset_size == -1:
            train_dataset_size = available_grad_z_files
    for i in range(train_dataset_size):
        grad_z_vecs.append(torch.load(grad_z_dir / f"{i}.grad_z"))
        display_progress("grad_z files loaded: ", i, train_dataset_size)

    return grad_z_vecs
def load_s_test(s_test_dir=Path("./s_test/"), s_test_id=0, r_sample_size=10,
                train_dataset_size=-1):
    """Loads all s_test data required to calculate the influence function
    and returns a list of it.

    Arguments:
        s_test_dir: Path, folder containing files storing the s_test values
        s_test_id: int, number of the test data sample s_test
            was calculated for
        r_sample_size: int, number of s_tests precalculated
            per test dataset point
        train_dataset_size: int, number of total samples in dataset;
            -1 indicates to use all available grad_z files (currently unused)

    Returns:
        e_s_test: list of torch vectors, the s_test repetitions for test
            sample `s_test_id` averaged element-wise.
        s_test: list of torch vectors, contains all individual s_test
            repetitions. Can be huge."""
    if isinstance(s_test_dir, str):
        s_test_dir = Path(s_test_dir)

    s_test = []
    logging.info(f"Loading s_test from: {s_test_dir} ...")
    # Path.glob returns a generator, so materialize it before counting.
    num_s_test_files = len(list(s_test_dir.glob("*.s_test")))
    if num_s_test_files != r_sample_size:
        logging.warning("Load Influence Data: number of s_test sample files"
                        " mismatches the available samples")
    ########################
    # TODO: should prob. not hardcode the file name, use natsort+glob
    ########################
    for i in range(num_s_test_files):
        s_test.append(torch.load(s_test_dir / f"{s_test_id}_{i}.s_test"))
        display_progress("s_test files loaded: ", i, num_s_test_files)

    # Average the loaded s_test repetitions element-wise: start from the
    # first repetition, add the remaining ones, then divide by the count.
    e_s_test = s_test[0]
    for i in range(1, len(s_test)):
        e_s_test = [a + b for a, b in zip(e_s_test, s_test[i])]
    e_s_test = [a / len(s_test) for a in e_s_test]

    return e_s_test, s_test
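# Illustrative sketch (an assumption, not part of the original module): the
# two loaders above are typically combined by taking, for each training
# sample, the dot product of its grad_z vector with the averaged s_test
# vector, i.e. influence_i = -<grad_z_i, s_test> / n_train. The helper name
# `_example_influence_from_loaded_files` and the default directories are
# hypothetical and only mirror the loaders' defaults.
def _example_influence_from_loaded_files(grad_z_dir="./grad_z/",
                                         s_test_dir="./s_test/"):
    """Hypothetical helper: combine precomputed grad_z and s_test files
    into per-training-sample influence scores."""
    grad_z_vecs = load_grad_z(grad_z_dir)
    e_s_test, _ = load_s_test(s_test_dir)
    n_train = len(grad_z_vecs)
    influences = []
    for grad_z_vec in grad_z_vecs:
        # Dot product over all per-parameter tensors, scaled by -1/n_train.
        dot = sum(torch.sum(g * s).item()
                  for g, s in zip(grad_z_vec, e_s_test))
        influences.append(-dot / n_train)
    return influences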
def calc_grad_z(model, train_loader, save_pth=False, gpu=-1, start=0):
    """Calculates grad_z and can save the output to files. One grad_z
    should be computed for each training data sample.

    Arguments:
        model: pytorch model, for which grad_z should be calculated
        train_loader: pytorch dataloader, which can load the train data
        save_pth: Path, path where to save the grad_z files if desired.
            Omitting this argument will skip saving
        gpu: int, device id to use for GPU, -1 for CPU (default)
        start: int, index of the first training sample to use. default is 0

    Returns:
        grad_zs: list of torch tensors, contains the grad_z tensors
        save_pth: Path, path where grad_z files were saved to or
            False if they were not saved."""
    if save_pth and isinstance(save_pth, str):
        save_pth = Path(save_pth)
    if not save_pth:
        logging.info("ATTENTION: Not saving grad_z files!")

    grad_zs = []
    for i in range(start, len(train_loader.dataset)):
        z, t = train_loader.dataset[i]
        z = train_loader.collate_fn([z])
        t = train_loader.collate_fn([t])
        grad_z_vec = grad_z(z, t, model, gpu=gpu)
        if save_pth:
            grad_z_vec = [g.cpu() for g in grad_z_vec]
            torch.save(grad_z_vec, save_pth.joinpath(f"{i}.grad_z"))
        else:
            grad_zs.append(grad_z_vec)
        display_progress("Calc. grad_z: ", i - start,
                         len(train_loader.dataset) - start)

    return grad_zs, save_pth
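# Sketch of what a `grad_z`-style function typically computes; the real
# `grad_z` used above is defined elsewhere in this repository, so treat the
# function below, its name, and the cross-entropy loss as assumptions made
# for illustration only: the gradient of the loss on a single training
# sample with respect to all trainable model parameters.
def _example_grad_z(z, t, model,
                    loss_fn=torch.nn.functional.cross_entropy):
    """Hypothetical stand-in returning a list of per-parameter gradients."""
    model.eval()
    y = model(z)
    loss = loss_fn(y, t)
    params = [p for p in model.parameters() if p.requires_grad]
    # One gradient tensor per trainable parameter, in parameter order.
    return list(torch.autograd.grad(loss, params))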
def calc_img_wise(config, model, train_loader, test_loader):
    """Calculates the influence function one test point at a time.
    Calculates the `s_test` and `grad_z` values on the fly and discards
    them afterwards.

    Arguments:
        config: dict, contains the configuration from cli params"""
    influences_meta = copy.deepcopy(config)
    test_sample_num = config["test_sample_num"]
    test_start_index = config["test_start_index"]
    outdir = Path(config["outdir"])

    # If calculating the influence for a subset of the whole dataset,
    # calculate it evenly for the same number of samples from all classes.
    # `test_start_index` is `False` when it hasn't been set by the user. It
    # can also be set to `0`.
    sample_list = []
    if test_sample_num and test_start_index is not False:
        test_dataset_iter_len = test_sample_num * config["num_classes"]
        _, sample_list = get_dataset_sample_ids(test_sample_num, test_loader,
                                                config["num_classes"],
                                                test_start_index)
    else:
        test_dataset_iter_len = len(test_loader.dataset)

    # Set up logging and save the metadata conf file
    logging.info(f"Running on: {test_sample_num} images per class.")
    logging.info(f"Starting at img number: {test_start_index} per class.")
    influences_meta["test_sample_index_list"] = sample_list
    influences_meta_fn = (f"influences_results_meta_{test_start_index}-"
                          f"{test_sample_num}.json")
    influences_meta_path = outdir.joinpath(influences_meta_fn)
    save_json(influences_meta, influences_meta_path)

    influences = {}
    # Main loop for calculating the influence function one test sample per
    # iteration.
    for j in range(test_dataset_iter_len):
        # If we calculate evenly per class, choose the test img indices
        # from the sample_list instead
        if test_sample_num and test_start_index is not False:
            if j >= len(sample_list):
                logging.warning(
                    "ERROR: the test sample id is out of index of the"
                    " defined test set. Jumping to next test sample.")
                continue
            i = sample_list[j]
        else:
            i = j

        start_time = time.time()
        influence, harmful, helpful, _ = calc_influence_single(
            model, train_loader, test_loader, test_id_num=i,
            gpu=config["gpu"],
            recursion_depth=config["recursion_depth"],
            r=config["r_averaging"],
        )
        end_time = time.time()

        # Note: `influences` (the results dict built here) is distinct from
        # the per-sample `influence` list returned above.
        influences[str(i)] = {}
        _, label = test_loader.dataset[i]
        influences[str(i)]["label"] = label
        influences[str(i)]["num_in_dataset"] = j
        influences[str(i)]["time_calc_influence_s"] = end_time - start_time
        infl = [x.cpu().numpy().tolist() for x in influence]
        influences[str(i)]["influence"] = infl
        influences[str(i)]["harmful"] = harmful[:500]
        influences[str(i)]["helpful"] = helpful[:500]

        tmp_influences_path = outdir.joinpath(f"influence_results_tmp_"
                                              f"{test_start_index}_"
                                              f"{test_sample_num}"
                                              f"_last-i_{i}.json")
        save_json(influences, tmp_influences_path)
        display_progress("Test samples processed: ", j, test_dataset_iter_len)

        logging.info("The results for this run are:")
        logging.info("Influences: ")
        logging.info(influence[:3])
        logging.info("Most harmful img IDs: ")
        logging.info(harmful[:3])
        logging.info("Most helpful img IDs: ")
        logging.info(helpful[:3])

    influences_path = outdir.joinpath(f"influence_results_{test_start_index}_"
                                      f"{test_sample_num}.json")
    save_json(influences, influences_path)
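# Example of the `config` keys that `calc_img_wise` actually reads. The key
# names are taken from the function body above; the values are placeholder
# assumptions chosen only to illustrate the expected types, not defaults
# defined by this module.
_EXAMPLE_IMG_WISE_CONFIG = {
    "outdir": "outdir",        # directory where result JSON files are written
    "test_sample_num": 1,      # test samples per class; falsy = whole test set
    "test_start_index": 0,     # first test sample index; False = unset
    "num_classes": 10,         # number of classes in the dataset
    "gpu": -1,                 # device id, -1 for CPU
    "recursion_depth": 5000,   # recursion depth used for s_test
    "r_averaging": 1,          # number of s_test repetitions to average
}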
def calc_s_test(model, test_loader, train_loader, save=False, gpu=-1,
                damp=0.01, scale=25, recursion_depth=5000, r=1, start=0):
    """Calculates s_test for the whole test dataset taking into account all
    training data images.

    Arguments:
        model: pytorch model, for which s_test should be calculated
        test_loader: pytorch dataloader, which can load the test data
        train_loader: pytorch dataloader, which can load the train data
        save: Path, path where to save the s_test files if desired.
            Omitting this argument will skip saving
        gpu: int, device id to use for GPU, -1 for CPU (default)
        damp: float, influence function damping factor
        scale: float, influence calculation scaling factor
        recursion_depth: int, number of recursions to perform during s_test
            calculation, increases accuracy. r*recursion_depth should equal
            the training dataset size.
        r: int, number of iterations of which to take the avg.
            of the h_estimate calculation; r*recursion_depth should equal
            the training dataset size.
        start: int, index of the first test sample to use. default is 0

    Returns:
        s_tests: list of torch vectors, contains all s_test vectors for the
            whole test dataset. Can be huge.
        save: Path, path to the folder where the s_test files were saved to
            or False if they were not saved."""
    if save and not isinstance(save, Path):
        save = Path(save)
    if not save:
        logging.info("ATTENTION: not saving s_test files.")

    s_tests = []
    for i in range(start, len(test_loader.dataset)):
        z_test, t_test = test_loader.dataset[i]
        z_test = test_loader.collate_fn([z_test])
        t_test = test_loader.collate_fn([t_test])

        s_test_vec = s_test_sample(model, z_test, t_test, train_loader,
                                   gpu, damp, scale, recursion_depth, r)

        if save:
            s_test_vec = [s.cpu() for s in s_test_vec]
            torch.save(
                s_test_vec,
                save.joinpath(f"{i}_recdep{recursion_depth}_r{r}.s_test"))
        else:
            s_tests.append(s_test_vec)
        display_progress("Calc. z_test (s_test): ", i - start,
                         len(test_loader.dataset) - start)

    return s_tests, save
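# Illustrative usage sketch (an assumption, not part of the original module):
# precompute and save grad_z and s_test files with the helpers above so they
# can be reused later instead of being recomputed. The helper name and the
# output directories are hypothetical; they mirror the loaders' defaults.
# Note that `calc_s_test` names its files `{i}_recdep..._r....s_test` while
# `load_s_test` expects `{s_test_id}_{i}.s_test` (see the TODO there), so the
# s_test files may need renaming before loading; grad_z files (`{i}.grad_z`)
# match `load_grad_z` directly.
def _example_precompute(model, train_loader, test_loader,
                        s_test_outdir="./s_test/", grad_z_outdir="./grad_z/"):
    """Hypothetical driver: write all s_test and grad_z files to disk."""
    s_test_outdir = Path(s_test_outdir)
    grad_z_outdir = Path(grad_z_outdir)
    s_test_outdir.mkdir(parents=True, exist_ok=True)
    grad_z_outdir.mkdir(parents=True, exist_ok=True)
    # One s_test file per test sample, one grad_z file per training sample.
    calc_s_test(model, test_loader, train_loader, save=s_test_outdir)
    calc_grad_z(model, train_loader, save_pth=grad_z_outdir)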