def margins(self):
    result = dict()
    model = self.get_model()
    model.load('initial')
    l2_reg = self.config['l2_reg']
    if model.num_classes == 2:
        print("Model is binary, we can compute margins.")
        with benchmark("Computing margins"):
            indiv_margin = model.get_indiv_margin(self.datasets.train)
        # Weight a handful of training points and compute the total margin gradient.
        s = np.zeros(self.datasets.train.num_examples)
        some_indices = [35, 6, 1, 8, 42]
        s[some_indices] = 1
        with benchmark("Computing margin gradients"):
            grad_margin = model.get_total_grad_margin(
                self.datasets.train, s, l2_reg=l2_reg)
        result['indiv_margin'] = indiv_margin
        result['grad_margin'] = grad_margin
    return result
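# Hedged aside (illustrative only, not part of the pipeline): for a binary
# linear model, a common definition of the margin of an example (x, y) with
# y in {-1, +1} is y * (w . x + b); get_indiv_margin is assumed to compute
# something along these lines. A minimal NumPy sketch:
import numpy as np

def indiv_margins(X, y_pm1, w, b):
    # X: (n, d) features; y_pm1: (n,) labels in {-1, +1}; w: (d,); b: scalar
    return y_pm1 * (X @ w + b)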
def training(self):
    res = dict()
    ds = self.get_dataset()
    model = self.get_model()
    res['l2_reg'] = l2_reg = ds.train.num_examples * 1e-3
    with benchmark("Training original model"):
        model.fit(ds.train, l2_reg=l2_reg)
        model.print_model_eval(ds, l2_reg=l2_reg)
        model.save('initial')
    res['train_losses'] = model.get_indiv_loss(ds.train)
    res['train_margins'] = model.get_indiv_margin(ds.train)
    res['train_accuracy'] = model.get_accuracy(ds.train)
    res['test_losses'] = model.get_indiv_loss(ds.test)
    res['test_margins'] = model.get_indiv_margin(ds.test)
    res['test_accuracy'] = model.get_accuracy(ds.test)
    with benchmark("Computing gradients"):
        res['train_grad_losses'] = model.get_indiv_grad_loss(ds.train)
        res['train_grad_margins'] = model.get_indiv_grad_margin(ds.train)
        res['test_grad_losses'] = model.get_indiv_grad_loss(ds.test)
        res['test_grad_margins'] = model.get_indiv_grad_margin(ds.test)
    res['hessian'] = model.get_hessian(ds.train, l2_reg=l2_reg)
    return res
def initial_training(self):
    model = self.get_model()
    l2_reg = self.R['cv_l2_reg']
    res = dict()
    with benchmark("Training original model"):
        model.fit(self.train, l2_reg=l2_reg,
                  sample_weights=self.sample_weights[0])
        model.print_model_eval(self.datasets,
                               train_sample_weights=self.sample_weights[0],
                               test_sample_weights=self.sample_weights[2])
        model.save('initial')
    res['initial_train_losses'] = model.get_indiv_loss(self.train)
    res['initial_test_losses'] = model.get_indiv_loss(self.test)
    res['initial_nonfires_losses'] = model.get_indiv_loss(self.nonfires)
    if self.num_classes == 2:
        res['initial_train_margins'] = model.get_indiv_margin(self.train)
        res['initial_test_margins'] = model.get_indiv_margin(self.test)
        res['initial_nonfires_margins'] = model.get_indiv_margin(self.nonfires)
        print('F1 test score: {}'.format(f1_score(
            self.test.labels,
            (model.get_predictions(self.test.x)[:, 1] > 0.5).astype(int),
            sample_weight=self.sample_weights[2])))
    with benchmark("Computing gradients"):
        res['train_grad_loss'] = model.get_indiv_grad_loss(self.train)
    return res
def save_benchmark(self):
    if self.args.benchmark_iters > self.train_step * self.num_env:
        return None
    if self.terminal:
        self.train_thread[self.cpu_rew_thread].input_queue.put(
            ("save_benchmark", 0, (self.exp_name, self.exp_itr)))
        # Block until the reward thread confirms the benchmark was saved.
        # Queue.get() blocks, so we just loop until we see the sentinel;
        # note string comparison uses ==, not identity.
        bench_status = None
        while bench_status != "bench_saved":
            bench_status = self.train_thread[
                self.cpu_rew_thread].output_queue.get()
        base_dir = os.path.join('./exp_data', self.exp_name, self.exp_itr,
                                self.args.benchmark_dir)
        file_name = os.path.join(base_dir, self.exp_name + '_attn.pkl')
        with open(file_name, 'wb') as fp:
            pickle.dump(self.attn_benchmark, fp)
        print("Saving memory")
        file_name = os.path.join(base_dir, self.exp_name + '_mem.pkl')
        with open(file_name, 'wb') as fp:
            pickle.dump(self.mem_benchmark, fp)
        benchmark(self.args)
        return True
    else:
        return None
def compute_infls(infl_name, initial_vals, subset_vals):
    datasets = [self.train, self.test, self.nonfires]
    weights = [self.sample_weights[0], self.sample_weights[2],
               np.ones(self.nonfires.num_examples)]
    dataset_names = ['train', 'test', 'nonfires']
    class_names = ['class_{}'.format(i) for i in range(self.num_classes)]

    for ds, ds_name, weight, initial_val, subset_val in zip(
            datasets, dataset_names, weights, initial_vals, subset_vals):
        # all
        name = 'all_{}_{}'.format(ds_name, infl_name)
        with benchmark('Computing {}'.format(name)):
            infl = np.einsum('ai,i->a', subset_val - initial_val, weight)
            res[name] = infl

        # class-specific
        for i, class_name in enumerate(class_names):
            class_inds = np.where(ds.labels == i)[0]
            name = '{}_{}_{}'.format(class_name, ds_name, infl_name)
            with benchmark('Computing {}'.format(name)):
                infl = np.einsum('ai,i->a',
                                 subset_val[:, class_inds] - initial_val[class_inds],
                                 weight[class_inds])
                res[name] = infl

    # test/nonfires genres
    for ds, ds_name, weight, initial_val, subset_val, genre_names, genre_inds in zip(
            datasets[1:], dataset_names[1:], weights[1:],
            initial_vals[1:], subset_vals[1:],
            [test_genres, nonfires_genres],
            [test_genre_inds, nonfires_genre_inds]):
        for genre_name, genre_ind in zip(genre_names, genre_inds):
            name = '{}_{}_{}'.format(genre_name, ds_name, infl_name)
            with benchmark('Computing {}'.format(name)):
                infl = np.einsum('ai,i->a',
                                 subset_val[:, genre_ind] - initial_val[genre_ind],
                                 weight[genre_ind])
                res[name] = infl

    # For a few specific points
    specific_names = ['fixed_test', 'fixed_nonfires']
    for name, fixed_inds, initial_val, subset_val in zip(
            specific_names, [fixed_test, fixed_nonfires],
            initial_vals[1:], subset_vals[1:]):
        res_name = '{}_{}'.format(name, infl_name)
        for ind in fixed_inds:
            with benchmark('Computing {} {}'.format(name, ind)):
                infl = subset_val[:, ind] - initial_val[ind]
                add(res, res_name, infl)
        res[res_name] = np.transpose(res[res_name])

    # self influence
    for subset_val, remove_indices in zip(subset_vals[0], subset_indices):
        infl = np.dot(subset_val[remove_indices] - initial_vals[0][remove_indices],
                      weights[0][remove_indices])
        add(res, 'self_{}'.format(infl_name), infl)
    res['self_{}'.format(infl_name)] = np.transpose(
        res['self_{}'.format(infl_name)])
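# Hedged aside: np.einsum('ai,i->a', D, w) above is just a sample-weighted sum
# over examples, equivalent to the matrix-vector product D @ w. Toy check:
import numpy as np

D = np.arange(6, dtype=float).reshape(2, 3)  # (num_subsets, num_examples) deltas
w = np.array([0.5, 1.0, 2.0])                # per-example sample weights
assert np.allclose(np.einsum('ai,i->a', D, w), D @ w)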
def write_bench(exp_name, exp_itr, args, agent_info):
    benchmark_dir = os.path.join('./exp_data', exp_name, exp_itr,
                                 args.benchmark_dir)
    if not os.path.exists(benchmark_dir):
        os.mkdir(benchmark_dir)
    file_name = os.path.join(benchmark_dir, exp_name + '.pkl')
    print('Finished benchmarking, now saving...')
    with open(file_name, 'wb') as fp:
        pickle.dump(agent_info[:-1], fp)
    benchmark(args)
    return
def initial_training(self):
    model = self.get_model()
    res = dict()
    l2_reg = self.R['cv_l2_reg']
    with benchmark("Training original model"):
        model.fit(self.train, l2_reg=l2_reg,
                  sample_weights=self.sample_weights[0])
        model.print_model_eval(self.datasets,
                               train_sample_weights=self.sample_weights[0],
                               test_sample_weights=self.sample_weights[2],
                               l2_reg=l2_reg)
        model.save('initial')
    res['initial_train_losses'] = model.get_indiv_loss(self.train)
    res['initial_train_accuracy'] = model.get_accuracy(self.train)
    res['initial_test_losses'] = model.get_indiv_loss(self.test)
    res['initial_test_accuracy'] = model.get_accuracy(self.test)
    res['initial_nonfires_losses'] = model.get_indiv_loss(self.nonfires)
    res['initial_nonfires_accuracy'] = model.get_accuracy(self.nonfires)
    if self.num_classes == 2:
        res['initial_train_margins'] = model.get_indiv_margin(self.train)
        res['initial_test_margins'] = model.get_indiv_margin(self.test)
        res['initial_nonfires_margins'] = model.get_indiv_margin(self.nonfires)
    return res
def compare_sklearn(self):
    model = self.get_model()
    model.load('initial')
    l2_reg = self.config['l2_reg']
    with benchmark("Copying params to sklearn"):
        C = 1.0 / l2_reg
        multi_class = ("ovr" if self.model_config['arch']['num_classes'] == 2
                       else "multinomial")
        fit_intercept = self.model_config['arch']['fit_intercept']
        sklearn_model = sklearn.linear_model.LogisticRegression(
            C=C,
            tol=1e-8,
            fit_intercept=fit_intercept,
            solver='lbfgs',
            multi_class=multi_class,
            warm_start=False,
            max_iter=2048)
        sklearn_model.intercept_ = 0
        model.copy_params_to_sklearn_model(sklearn_model)
    preds = model.get_predictions(self.datasets.train.x)
    preds_sk = sklearn_model.predict_proba(self.datasets.train.x)
    print("l2 between predictions: {}".format(np.linalg.norm(preds - preds_sk)))
    result = dict()
    result['preds'] = preds
    result['preds_sk'] = preds_sk
    return result
def hess(self):
    result = dict()
    model = self.get_model()
    model.load('initial')
    l2_reg = self.config['l2_reg']
    with benchmark("Computing hessian"):
        result['hessian_reg'] = model.get_hessian(self.datasets.train,
                                                  l2_reg=l2_reg,
                                                  **self.eval_args)
    # Only eigendecompose small Hessians; this is O(d^3) in the param dimension.
    if result['hessian_reg'].shape[0] < 800:
        with benchmark("Finding hessian eigenvalues"):
            result['eigs'] = eigs = np.linalg.eigvalsh(result['hessian_reg'])
            print("Hessian eigenvalue range:", np.min(eigs), np.max(eigs))
    return result
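# Hedged aside: eigvalsh is the right tool here because the (regularized)
# Hessian is symmetric. If get_hessian adds l2_reg * I (an assumption about
# its contract, not confirmed by this file), every eigenvalue is shifted up
# by l2_reg, so min(eigs) >= l2_reg indicates the unregularized Hessian is
# PSD. Toy check on a symmetric PSD matrix:
import numpy as np

A = np.array([[2.0, 1.0], [1.0, 2.0]])  # eigenvalues 1 and 3
l2_reg = 0.1
eigs = np.linalg.eigvalsh(A + l2_reg * np.eye(2))
assert np.min(eigs) >= l2_reg  # holds because A itself is PSD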
def fixed_test_influence(self):
    model = self.get_model()
    model.load('initial')
    l2_reg = self.R['cv_l2_reg']
    res = dict()
    hessian = self.R['hessian']
    inverse_hvp_args = {
        'hessian_reg': hessian,
        'dataset': self.train,
        'l2_reg': l2_reg,
        'verbose': False,
        'verbose_cg': True,
        'inverse_vp_method': self.config['inverse_vp_method'],
    }
    fixed_test = self.R['fixed_test']
    fixed_test_grad_loss = []
    fixed_test_pred_infl = []
    fixed_test_pred_margin_infl = []
    for test_idx in fixed_test:
        single_test_point = self.test.subset([test_idx])

        with benchmark('Scalar infl for all training points on test_idx {}.'.format(test_idx)):
            test_grad_loss = model.get_indiv_grad_loss(single_test_point).reshape(-1, 1)
            test_grad_loss_H_inv = model.get_inverse_hvp(test_grad_loss,
                                                         **inverse_hvp_args).reshape(-1)
            pred_infl = np.dot(self.R['train_grad_loss'], test_grad_loss_H_inv)
            fixed_test_grad_loss.append(test_grad_loss)
            fixed_test_pred_infl.append(pred_infl)

        if self.num_classes == 2:
            with benchmark('Scalar margin infl for all training points on test_idx {}.'.format(test_idx)):
                test_grad_margin = model.get_indiv_grad_margin(single_test_point).reshape(-1, 1)
                test_grad_margin_H_inv = model.get_inverse_hvp(test_grad_margin,
                                                               **inverse_hvp_args).reshape(-1)
                pred_margin_infl = np.dot(self.R['train_grad_loss'], test_grad_margin_H_inv)
                fixed_test_pred_margin_infl.append(pred_margin_infl)

    res['fixed_test_pred_infl'] = np.array(fixed_test_pred_infl)
    if self.num_classes == 2:
        res['fixed_test_pred_margin_infl'] = np.array(fixed_test_pred_margin_infl)
    return res
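# Hedged aside: the loop above follows the standard influence-function recipe
# (Koh & Liang, 2017): for a test point z_test, solve H x = grad L(z_test)
# once, then the predicted influence of every training point is its loss
# gradient dotted with x (up to sign convention). A dense-Hessian sketch;
# predicted_influences is a hypothetical name, not part of the model's API:
import numpy as np

def predicted_influences(train_grads, test_grad, hessian_reg):
    # train_grads: (num_train, d); test_grad: (d,); hessian_reg: (d, d) SPD
    x = np.linalg.solve(hessian_reg, test_grad)  # H^{-1} grad L(z_test)
    return train_grads @ x                       # shape (num_train,)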
def compute_grad_loss(self):
    model = self.get_model()
    model.load('initial')
    result = dict()
    with benchmark("Computing gradients individually"):
        result['indiv_grad_loss'] = model.get_indiv_grad_loss(
            self.datasets.train, method='from_total_grad')
    with benchmark("Computing gradients batched"):
        result['batch_indiv_grad_loss'] = model.get_indiv_grad_loss(
            self.datasets.train, method='batched')
    grad_loss_1 = result['indiv_grad_loss']
    grad_loss_2 = result['batch_indiv_grad_loss']
    print("l2 between gradients: {}".format(
        np.linalg.norm(grad_loss_1 - grad_loss_2)))
    return result
def hessian(self):
    model = self.get_model()
    model.load('initial')
    l2_reg = self.R['cv_l2_reg']
    res = dict()
    with benchmark("Computing hessian"):
        res['hessian'] = model.get_hessian(
            self.train, l2_reg=l2_reg,
            sample_weights=self.sample_weights[0])
    return res
def initial_training(self):
    model = self.get_model()
    l2_reg = self.R['cv_l2_reg']
    res = dict()
    with benchmark("Training original model"):
        model.fit(self.train, l2_reg=l2_reg)
        model.print_model_eval(self.datasets, l2_reg=l2_reg)
        model.save('initial')
    res['initial_train_losses'] = model.get_indiv_loss(self.train)
    res['initial_train_accuracy'] = model.get_accuracy(self.train)
    res['initial_test_losses'] = model.get_indiv_loss(self.test)
    res['initial_test_accuracy'] = model.get_accuracy(self.test)
    if self.num_classes == 2:
        res['initial_train_margins'] = model.get_indiv_margin(self.train)
        res['initial_test_margins'] = model.get_indiv_margin(self.test)
    with benchmark("Computing gradients"):
        res['train_grad_loss'] = model.get_indiv_grad_loss(self.train)
    return res
def hessian(self):
    model = self.get_model()
    model.load('initial')
    l2_reg = self.R['cv_l2_reg']
    res = dict()
    if self.config['inverse_hvp_method'] == 'explicit':
        with benchmark("Computing hessian"):
            res['hessian'] = model.get_hessian(self.train, l2_reg=l2_reg)
    elif self.config['inverse_hvp_method'] == 'cg':
        print("Not computing explicit hessian.")
        res['hessian'] = None
    return res
def train_model(self):
    results = dict()
    model = self.get_model()
    l2_reg = self.config['l2_reg']
    with benchmark("Training the model"):
        model.fit(self.datasets.train, l2_reg=l2_reg)
        model.print_model_eval(self.datasets, l2_reg=l2_reg)
    with benchmark("Computing losses"):
        results['train_loss'] = model.get_total_loss(self.datasets.train, l2_reg=l2_reg)
        results['indiv_train_loss'] = model.get_indiv_loss(self.datasets.train)
        results['test_loss'] = model.get_total_loss(self.datasets.test, l2_reg=l2_reg)
        results['indiv_test_loss'] = model.get_indiv_loss(self.datasets.test)
    with benchmark("Saving model"):
        model.save('initial')
    return results
def cross_validation(self):
    model = self.get_model()
    res = dict()
    reg_min, reg_max, reg_samples = self.config['normalized_cross_validation_range']
    reg_min *= self.num_train
    reg_max *= self.num_train
    num_folds = self.config['cross_validation_folds']
    regs = np.logspace(np.log10(reg_min), np.log10(reg_max), reg_samples)
    cv_errors = np.zeros_like(regs)
    cv_accs = np.zeros_like(regs)
    # Contiguous, nearly-equal folds via ceiling division.
    fold_size = (self.num_train + num_folds - 1) // num_folds
    folds = [(k * fold_size, min((k + 1) * fold_size, self.num_train))
             for k in range(num_folds)]
    for i, reg in enumerate(regs):
        with benchmark("Evaluating CV error for reg={}".format(reg)):
            cv_error = 0.0
            cv_acc = 0.0
            for k, fold in enumerate(folds):
                fold_begin, fold_end = fold
                train_indices = np.concatenate(
                    (np.arange(0, fold_begin),
                     np.arange(fold_end, self.num_train)))
                val_indices = np.arange(fold_begin, fold_end)
                model.fit(self.train.subset(train_indices), l2_reg=reg)
                fold_loss = model.get_total_loss(self.train.subset(val_indices), l2_reg=0)
                acc = model.get_accuracy(self.train.subset(val_indices))
                cv_error += fold_loss
                cv_acc += acc
                print('Acc: {}, loss: {}'.format(acc, fold_loss))
            cv_errors[i] = cv_error
            cv_accs[i] = cv_acc / num_folds
        print('Cross-validation acc {}, error {} for reg={}.'.format(
            cv_accs[i], cv_errors[i], reg))
    best_i = np.argmax(cv_accs)
    best_reg = regs[best_i]
    print('Cross-validation errors: {}'.format(cv_errors))
    print('Cross-validation accs: {}'.format(cv_accs))
    print('Selecting weight_decay {}, with acc {}, error {}.'.format(
        best_reg, cv_accs[best_i], cv_errors[best_i]))
    res['cv_regs'] = regs
    res['cv_errors'] = cv_errors
    res['cv_accs'] = cv_accs
    res['cv_l2_reg'] = best_reg
    return res
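# Hedged aside: the fold bookkeeping above makes num_folds contiguous,
# nearly-equal blocks via ceiling division; the last block absorbs the
# remainder. Toy check with 10 points and 3 folds:
num_train, num_folds = 10, 3
fold_size = (num_train + num_folds - 1) // num_folds  # ceil(10 / 3) = 4
folds = [(k * fold_size, min((k + 1) * fold_size, num_train))
         for k in range(num_folds)]
assert folds == [(0, 4), (4, 8), (8, 10)]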
def hvp(self):
    model = self.get_model()
    model.load('initial')
    result = dict()
    indiv_grad_loss = self.results['compute_grad_loss']['indiv_grad_loss']
    some_indices = [1, 6, 2, 4, 3]
    vectors = indiv_grad_loss[some_indices, :].T
    hessian = self.results['hess']['hessian_reg']
    with benchmark("Inverse HVP"):
        result['inverse_hvp'] = model.get_inverse_vp(
            hessian, vectors, **self.eval_args)
    return result
def pick_subsets(self):
    tagged_subsets = []
    with benchmark("Same features subsets"):
        same_features_subsets = self.get_same_features_subsets(
            self.train.x, self.train.labels)
        tagged_subsets += [('same_features', s) for s in same_features_subsets]
    subset_tags = [tag for tag, subset in tagged_subsets]
    subset_indices = [subset for tag, subset in tagged_subsets]
    subset_sizes = np.unique([len(subset) for tag, subset in tagged_subsets])
    return {'subset_tags': subset_tags, 'subset_indices': subset_indices}
def pick_subsets(self):
    rng = np.random.RandomState(self.config['subset_seed'])
    tagged_subsets = []

    if self.subset_choice_type == "types":
        subset_sizes = np.ones(self.num_subsets).astype(int) * self.subset_size
    elif self.subset_choice_type == "range":
        subset_sizes = np.linspace(self.subset_min_size,
                                   self.subset_max_size,
                                   self.num_subsets).astype(int)

    with benchmark("Random subsets"):
        random_subsets = self.get_random_subsets(rng, subset_sizes)
        tagged_subsets += [('random', s) for s in random_subsets]

    with benchmark("Same class subsets"):
        same_class_subsets = self.get_same_class_subsets(
            rng, self.train.labels, subset_sizes)
        same_class_subset_labels = [self.train.labels[s[0]]
                                    for s in same_class_subsets]
        tagged_subsets += [('random_same_class-{}'.format(label), s)
                           for s, label in zip(same_class_subsets,
                                               same_class_subset_labels)]

    with benchmark("Scalar infl tail subsets"):
        # 1) pick x*N out of the top 1.5 * 0.025 * N where x in (0.0025 - 0.025)
        # 2) pick x*N out of the top 1.5 * 0.1 * N where x in (0.0025 - 0.1)
        # 3) pick x*N out of the top 1.5 * 0.25 * N where x in (0.0025 - 0.25)
        size_1, size_2, size_3, size_4 = (int(self.num_train * x)
                                          for x in (0.0025, 0.025, 0.1, 0.25))
        subsets_per_phase = self.num_subsets // 3
        subset_size_phases = [
            np.linspace(size_1, size_2, subsets_per_phase).astype(int),
            np.linspace(size_1, size_3, subsets_per_phase).astype(int),
            np.linspace(size_1, size_4,
                        self.num_subsets - 2 * subsets_per_phase).astype(int),
        ]
        for pred_infl, test_idx in zip(self.R['fixed_test_pred_infl'],
                                       self.R['fixed_test']):
            for phase, subset_sizes in enumerate(subset_size_phases, 1):
                neg_tail_subsets, pos_tail_subsets = self.get_scalar_infl_tails(
                    rng, pred_infl, subset_sizes)
                tagged_subsets += [('neg_tail_test-{}-{}'.format(phase, test_idx), s)
                                   for s in neg_tail_subsets]
                tagged_subsets += [('pos_tail_test-{}-{}'.format(phase, test_idx), s)
                                   for s in pos_tail_subsets]
            print('Found scalar infl tail subsets for test idx {}.'.format(test_idx))

    with benchmark("Same features subsets"):
        same_features_subsets = self.get_subsets_by_clustering(
            rng, self.train.x, subset_sizes)
        tagged_subsets += [('same_features', s) for s in same_features_subsets]

    with benchmark("Same gradient subsets"):
        same_grad_subsets = self.get_subsets_by_clustering(
            rng, self.R['train_grad_loss'], subset_sizes)
        tagged_subsets += [('same_grad', s) for s in same_grad_subsets]

    with benchmark("Same feature subsets by windowing"):
        feature_window_subsets = self.get_subsets_by_projection(
            rng, self.train.x, subset_sizes)
        tagged_subsets += [('feature_window', s) for s in feature_window_subsets]

    subset_tags = [tag for tag, subset in tagged_subsets]
    subset_indices = [subset for tag, subset in tagged_subsets]
    return {
        'subset_tags': subset_tags,
        'subset_indices': subset_indices,
    }
def get_subsets_by_clustering(self, rng, X, subset_sizes):
    cluster_indices = []
    for n_clusters in (None, 4, 8, 16, 32, 64, 128):
        with benchmark("Clustering with k={}".format(n_clusters)):
            clusters = self.get_clusters(X, n_clusters=n_clusters)
            print("Cluster sizes:", [len(cluster) for cluster in clusters])
            cluster_indices.extend(clusters)
    cluster_sizes = np.array([len(indices) for indices in cluster_indices])

    subsets = []
    for i, subset_size in enumerate(subset_sizes):
        # Only sample from clusters large enough to supply the subset.
        valid_clusters = np.nonzero(cluster_sizes >= subset_size)[0]
        if len(valid_clusters) == 0:
            continue
        cluster_idx = rng.choice(valid_clusters)
        subset = rng.choice(cluster_indices[cluster_idx], subset_size,
                            replace=False)
        subsets.append(subset)
    return np.array(subsets)
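# Hedged aside: get_clusters is project-specific and not shown here. One
# plausible implementation of "clusters as index lists" uses sklearn's KMeans;
# this sketch is an assumption about its behavior, for illustration only:
import numpy as np
from sklearn.cluster import KMeans

def clusters_as_index_lists(X, n_clusters, seed=0):
    # Assign each row of X to a cluster, then group row indices by cluster id.
    labels = KMeans(n_clusters=n_clusters, random_state=seed).fit_predict(X)
    return [np.where(labels == k)[0] for k in range(n_clusters)]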
def pick_subsets(self):
    tagged_subsets = []
    res = dict()
    with benchmark("Turker ID subsets"):
        names, inds = self.get_turker_subsets()
        for name, ind in zip(names, inds):
            tagged_subsets.append(('same_turker-{}'.format(name), ind))
    subset_tags = [tag for tag, subset in tagged_subsets]
    subset_indices = [subset for tag, subset in tagged_subsets]
    res['test_genres'], res['test_genre_inds'] = self.get_genre_subsets(split=2)
    res['nonfires_genres'], res['nonfires_genre_inds'] = \
        self.get_nonfires_genre_subsets()
    res['subset_tags'], res['subset_indices'] = subset_tags, subset_indices
    return res
def retrain_model(self):
    model = self.get_model()
    model.load('initial')
    l2_reg = self.config['l2_reg']

    print("Sanity check: reloading the model gives same train and test losses")
    indiv_train_loss = model.get_indiv_loss(self.datasets.train)
    indiv_test_loss = model.get_indiv_loss(self.datasets.test)
    print("train loss l2 diff: {}, test loss l2 diff: {}".format(
        np.linalg.norm(indiv_train_loss - self.results['train_model']['indiv_train_loss']),
        np.linalg.norm(indiv_test_loss - self.results['train_model']['indiv_test_loss'])))

    print("Sanity check: warm fit is fast")
    with benchmark("Performing warm fit"):
        model.warm_fit(self.datasets.train, l2_reg=l2_reg)
        model.print_model_eval(self.datasets, l2_reg=l2_reg)

    print("Sanity check: force-recreate the model and load it")
    del self.model
    model = self.get_model()
    model.load('initial')

    print("Sanity check: losses should still be the same")
    indiv_train_loss = model.get_indiv_loss(self.datasets.train)
    indiv_test_loss = model.get_indiv_loss(self.datasets.test)
    print("train loss l2 diff: {}, test loss l2 diff: {}".format(
        np.linalg.norm(indiv_train_loss - self.results['train_model']['indiv_train_loss']),
        np.linalg.norm(indiv_test_loss - self.results['train_model']['indiv_test_loss'])))
    return {}
def all_and_fixed_test_and_nonfire_influence(self):
    model = self.get_model()
    model.load('initial')
    res = dict()
    res['nonfires_predictions'] = model.get_predictions(self.nonfires.x)
    hessian = self.R['hessian']

    def compute_test_like_infl(points, grad_fn, **kwargs):
        test_grad = grad_fn(points, **kwargs).reshape(-1, 1)
        test_grad_H_inv = model.get_inverse_vp(hessian, test_grad).reshape(-1)
        pred_infl = np.dot(self.R['train_grad_loss'], test_grad_H_inv)
        return pred_infl

    fixed_test = self.R['fixed_test']
    fixed_test_pred_infl = []
    fixed_test_pred_margin_infl = []
    for test_idx in fixed_test:
        single_test_point = self.test.subset([test_idx])
        with benchmark('Scalar infl for all training points on test_idx {}.'.format(test_idx)):
            fixed_test_pred_infl.append(compute_test_like_infl(
                single_test_point, model.get_indiv_grad_loss))
        if self.num_classes == 2:
            with benchmark('Scalar margin infl for all training points on test_idx {}.'.format(test_idx)):
                fixed_test_pred_margin_infl.append(compute_test_like_infl(
                    single_test_point, model.get_indiv_grad_margin))

    # Compute influence on the entire test set
    with benchmark('Scalar infl for all training points on entire test set.'):
        res['all_test_pred_infl'] = np.array(compute_test_like_infl(
            self.test, model.get_total_grad_loss,
            sample_weights=self.sample_weights[2]))
    if self.num_classes == 2:
        with benchmark('Scalar margin infl for all training points on entire test set.'):
            res['all_test_pred_margin_infl'] = np.array(compute_test_like_infl(
                self.test, model.get_total_grad_margin,
                sample_weights=self.sample_weights[2]))

    # Compute influence on the positive and negative parts of the test set
    pos_inds = np.where(self.test.labels == 1)[0]
    neg_inds = np.where(self.test.labels == 0)[0]
    with benchmark('Scalar infl for all training points on positive test set.'):
        res['pos_test_pred_infl'] = np.array(compute_test_like_infl(
            self.test.subset(pos_inds), model.get_total_grad_loss,
            sample_weights=self.sample_weights[2][pos_inds]))
    if self.num_classes == 2:
        with benchmark('Scalar margin infl for all training points on positive test set.'):
            res['pos_test_pred_margin_infl'] = np.array(compute_test_like_infl(
                self.test.subset(pos_inds), model.get_total_grad_margin,
                sample_weights=self.sample_weights[2][pos_inds]))
    with benchmark('Scalar infl for all training points on negative test set.'):
        res['neg_test_pred_infl'] = np.array(compute_test_like_infl(
            self.test.subset(neg_inds), model.get_total_grad_loss,
            sample_weights=self.sample_weights[2][neg_inds]))
    if self.num_classes == 2:
        with benchmark('Scalar margin infl for all training points on negative test set.'):
            res['neg_test_pred_margin_infl'] = np.array(compute_test_like_infl(
                self.test.subset(neg_inds), model.get_total_grad_margin,
                sample_weights=self.sample_weights[2][neg_inds]))

    # Compute influence on the entire train set
    with benchmark('Scalar infl for all training points on entire training set.'):
        res['all_train_pred_infl'] = np.array(compute_test_like_infl(
            self.train, model.get_total_grad_loss,
            sample_weights=self.sample_weights[0]))
    if self.num_classes == 2:
        with benchmark('Scalar margin infl for all training points on entire training set.'):
            res['all_train_pred_margin_infl'] = np.array(compute_test_like_infl(
                self.train, model.get_total_grad_margin,
                sample_weights=self.sample_weights[0]))

    # Do this for the nonfires
    nonfires_pred_infl = []
    nonfires_pred_margin_infl = []
    for idx in range(self.nonfires.x.shape[0]):
        single_point = self.nonfires.subset([idx])
        with benchmark('Scalar infl for all training points on nonfire idx {}.'.format(idx)):
            nonfires_pred_infl.append(compute_test_like_infl(
                single_point, model.get_indiv_grad_loss))
        if self.num_classes == 2:
            with benchmark('Scalar margin infl for all training points on nonfire idx {}.'.format(idx)):
                nonfires_pred_margin_infl.append(compute_test_like_infl(
                    single_point, model.get_indiv_grad_margin))

    res['fixed_test_pred_infl'] = np.array(fixed_test_pred_infl)
    if self.num_classes == 2:
        res['fixed_test_pred_margin_infl'] = np.array(fixed_test_pred_margin_infl)
    res['nonfires_pred_infl'] = np.array(nonfires_pred_infl)
    if self.num_classes == 2:
        res['nonfires_pred_margin_infl'] = np.array(nonfires_pred_margin_infl)
    return res
def compute_infls(infl_name, infl_total_fn, infl_indiv_fn):
    for i, remove_indices in enumerate(
            subset_indices[subset_start:subset_end], subset_start):
        print('Computing influences on model for subset {} out of {} (tag={})'
              .format(i, len(subset_indices), subset_tags[i]))
        tag = subset_tags[i]
        inds = remove_indices

        with benchmark('Computing {} for subset {}'.format(infl_name, tag)):
            grad = infl_total_fn(self.train.subset(inds),
                                 sample_weights=self.sample_weights[0][inds],
                                 l2_reg=0)
            add(res, 'subset_train_grad_for_{}'.format(infl_name), grad)

        # For big parts of the datasets
        datasets = [self.train, self.test, self.nonfires]
        weights = [self.sample_weights[0], self.sample_weights[2],
                   np.ones(self.nonfires.num_examples)]
        dataset_names = ['train', 'test', 'nonfires']
        class_names = ['class_{}'.format(c) for c in range(self.num_classes)]
        for ds, ds_name, weight in zip(datasets, dataset_names, weights):
            # all
            name = 'all_{}_{}'.format(ds_name, infl_name)
            with benchmark('Computing {}'.format(name)):
                infl = compute_test_like_infl(ds, infl_total_fn, grad,
                                              sample_weights=weight)
                add(res, name, infl)

            # class-specific (use c, not i, to avoid shadowing the subset index)
            for c, class_name in enumerate(class_names):
                class_inds = np.where(ds.labels == c)[0]
                name = '{}_{}_{}'.format(class_name, ds_name, infl_name)
                with benchmark('Computing {}'.format(name)):
                    infl = compute_test_like_infl(
                        ds.subset(class_inds), infl_total_fn, grad,
                        sample_weights=weight[class_inds])
                    add(res, name, infl)

        # test/nonfires genres
        for ds, ds_name, weight, genre_names, genre_inds in zip(
                datasets[1:], dataset_names[1:], weights[1:],
                [test_genres, nonfires_genres],
                [test_genre_inds, nonfires_genre_inds]):
            for genre_name, genre_ind in zip(genre_names, genre_inds):
                name = '{}_{}_{}'.format(genre_name, ds_name, infl_name)
                with benchmark('Computing {}'.format(name)):
                    infl = compute_test_like_infl(
                        ds.subset(genre_ind), infl_total_fn, grad,
                        sample_weights=weight[genre_ind])
                    add(res, name, infl)

        # For a few specific points
        specific_names = ['fixed_test', 'fixed_nonfires']
        for name, fixed_inds, ds in zip(specific_names,
                                        [fixed_test, fixed_nonfires],
                                        [self.test, self.nonfires]):
            data = []
            for ind in fixed_inds:
                with benchmark('Computing {} {}'.format(name, ind)):
                    data.append(compute_test_like_infl(
                        ds.subset([ind]), infl_indiv_fn, grad))
            add(res, '{}_{}'.format(name, infl_name), data)
def retrain_and_newton_batch(self, subset_start, subset_end):
    res = dict()

    self.load_phases([0, 1, 2], verbose=False)
    ds = self.get_dataset()
    model = self.get_model()
    model.load('initial')
    initial_params = model.get_params_flat()
    l2_reg = self.R['l2_reg']
    hessian = self.R['hessian']

    subsets = self.R['subset_indices'][subset_start:subset_end]
    num_subsets = len(subsets)
    train_grad_losses = self.R['train_grad_losses']
    subset_grad_losses = np.array([
        np.sum(train_grad_losses[subset, :], axis=0) for subset in subsets
    ])

    start_time = time.time()
    with benchmark('Computing first-order predicted parameters for subsets {}-{}'.format(
            subset_start, subset_end)):
        inverse_hvp_args = {
            'hessian_reg': hessian,
            'verbose': False,
            'inverse_hvp_method': 'explicit',
            'inverse_vp_method': 'cholesky',
        }
        res['subset_pred_dparam'] = model.get_inverse_hvp(
            subset_grad_losses.T, **inverse_hvp_args).T

    with benchmark('Computing Newton predicted parameters for subsets {}-{}'.format(
            subset_start, subset_end)):
        newton_pred_dparam = np.zeros((num_subsets, model.params_dim))
        for i, subset in enumerate(subsets):
            # Newton step inverts the Hessian with the removed subset's
            # contribution subtracted out.
            hessian_w = model.get_hessian(ds.train.subset(subset),
                                          l2_reg=0, verbose=False)
            inverse_hvp_args = {
                'hessian_reg': hessian - hessian_w,
                'verbose': False,
                'inverse_hvp_method': 'explicit',
                'inverse_vp_method': 'cholesky',
            }
            subset_grad_loss = subset_grad_losses[i, :].reshape(-1, 1)
            pred_dparam = model.get_inverse_hvp(
                subset_grad_loss, **inverse_hvp_args).reshape(-1)
            newton_pred_dparam[i, :] = pred_dparam
        res['subset_newton_pred_dparam'] = newton_pred_dparam

    with benchmark('Computing actual parameters for subsets {}-{}'.format(
            subset_start, subset_end)):
        actl_dparam = np.zeros((num_subsets, model.params_dim))
        for i, subset in enumerate(subsets):
            # Zero out the removed subset's sample weights and warm-start refit.
            s = np.ones(ds.train.num_examples)
            s[subset] = 0
            model.warm_fit(ds.train, s, l2_reg=l2_reg)
            model.save('subset_{}'.format(i + subset_start))
            actl_dparam[i, :] = model.get_params_flat() - initial_params
        res['subset_dparam'] = actl_dparam

    end_time = time.time()
    time_per_subset = (end_time - start_time) / num_subsets
    remaining_time = (len(self.R['subset_indices']) - subset_end) * time_per_subset
    print('Each retraining and iHVP takes {} s, {} s remaining'.format(
        time_per_subset, remaining_time))

    return res
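# Hedged aside: the two approximations above differ only in which matrix they
# invert. With g = summed loss gradient of the removed subset, H = the full
# regularized Hessian, and H_w = the removed subset's (unregularized) Hessian,
# the code computes, following its own sign convention:
#   first-order: dparam ~  H^{-1} g
#   Newton:      dparam ~ (H - H_w)^{-1} g
# A dense sketch; pred_dparams is a hypothetical name, not the model's API:
import numpy as np

def pred_dparams(g, H, H_w):
    first_order = np.linalg.solve(H, g)
    newton = np.linalg.solve(H - H_w, g)
    return first_order, newton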
def try_regs(self):
    model = self.get_model()
    res = dict()
    reg_min, reg_max, reg_samples = self.config['initial_reg_range']
    reg_min *= self.num_train
    reg_max *= self.num_train
    regs = np.logspace(np.log10(reg_min), np.log10(reg_max), reg_samples)
    accs = [np.zeros_like(regs) for i in range(3)]
    coefs = [None for i in range(len(regs))]
    intercepts = np.zeros_like(regs)
    nonzeros = np.zeros_like(regs)
    f1s = [np.zeros_like(regs) for i in range(3)]
    for i, reg in enumerate(regs):
        with benchmark("Training model for reg={}".format(reg)):
            model.set_params(C=1.0 / reg)
            model.fit(self.train.x, self.train.labels,
                      sample_weight=self.sample_weights[0])
            coefs[i] = model.coef_[0]
            print('Coefs: {}'.format(coefs[i]))
            intercepts[i] = model.intercept_ if self.fit_intercept else 0
            print('Intercept: {}'.format(intercepts[i]))
            nonzeros[i] = np.sum(coefs[i] != 0) + (intercepts[i] != 0)
            print('Nonzeros {}/{}'.format(nonzeros[i], coefs[i].size + 1))
            accs[0][i] = model.score(self.train.x, self.train.labels,
                                     sample_weight=self.sample_weights[0])
            accs[1][i] = model.score(self.validation.x, self.validation.labels,
                                     sample_weight=self.sample_weights[1])
            accs[2][i] = model.score(self.test.x, self.test.labels,
                                     sample_weight=self.sample_weights[2])
            print('Train acc: {}'.format(accs[0][i]))
            print('Validation acc: {}'.format(accs[1][i]))
            print('Test acc: {}'.format(accs[2][i]))
            y_preds = [None for j in range(3)]
            y_preds[0] = model.predict(self.train.x)
            y_preds[1] = model.predict(self.validation.x)
            y_preds[2] = model.predict(self.test.x)
            f1s[0][i] = f1_score(self.train.labels, y_preds[0],
                                 sample_weight=self.sample_weights[0])
            f1s[1][i] = f1_score(self.validation.labels, y_preds[1],
                                 sample_weight=self.sample_weights[1])
            f1s[2][i] = f1_score(self.test.labels, y_preds[2],
                                 sample_weight=self.sample_weights[2])
            print('Train f1: {}'.format(f1s[0][i]))
            print('Validation f1: {}'.format(f1s[1][i]))
            print('Test f1: {}'.format(f1s[2][i]))
    # Select the regularization with the best validation F1.
    best_i = np.argmax(f1s[1])
    print('Using reg {}'.format(regs[best_i]))
    res['best_i'] = best_i
    res['feature_inds'] = np.where(coefs[best_i] != 0)[0]
    res['regs'] = regs
    res['accs'] = np.array(accs)
    res['coefs'] = np.array(coefs)
    res['intercepts'] = intercepts
    res['nonzeros'] = nonzeros
    res['f1s'] = np.array(f1s)
    return res
def cross_validation(self):
    model = self.get_model()
    res = dict()
    reg_min, reg_max, reg_samples = self.config['normalized_cross_validation_range']
    reg_min *= self.num_train
    reg_max *= self.num_train
    num_folds = self.config['cross_validation_folds']
    regs = np.logspace(np.log10(reg_min), np.log10(reg_max), reg_samples)
    cv_errors = np.zeros_like(regs)
    cv_accs = np.zeros_like(regs)
    cv_f1s = np.zeros_like(regs)
    fold_size = (self.num_train + num_folds - 1) // num_folds
    folds = [(k * fold_size, min((k + 1) * fold_size, self.num_train))
             for k in range(num_folds)]
    for i, reg in enumerate(regs):
        with benchmark("Evaluating CV error for reg={}".format(reg)):
            cv_error = 0.0
            cv_acc = 0.0
            cv_f1 = 0.0
            for k, fold in enumerate(folds):
                print('Beginning fold {}'.format(k))
                fold_begin, fold_end = fold
                train_indices = np.concatenate(
                    (np.arange(0, fold_begin),
                     np.arange(fold_end, self.num_train)))
                val_indices = np.arange(fold_begin, fold_end)
                print('Fitting model.')
                model.fit(self.train.subset(train_indices),
                          l2_reg=reg,
                          sample_weights=self.sample_weights[0][train_indices])
                fold_loss = model.get_total_loss(
                    self.train.subset(val_indices),
                    reg=False,
                    sample_weights=self.sample_weights[0][val_indices])
                acc = model.get_accuracy(self.train.subset(val_indices))
                cv_error += fold_loss
                cv_acc += acc
                # Note: this F1 is evaluated on the full test set, not the
                # held-out validation fold.
                score = f1_score(
                    self.test.labels,
                    (model.get_predictions(self.test.x)[:, 1] > 0.5).astype(int),
                    sample_weight=self.sample_weights[2])
                cv_f1 += score
                print('F1: {}, Acc: {}, loss: {}'.format(score, acc, fold_loss))
            cv_errors[i] = cv_error
            cv_accs[i] = cv_acc / num_folds
            cv_f1s[i] = cv_f1 / num_folds
        print('Cross-validation f1 {}, acc {}, error {} for reg={}.'.format(
            cv_f1s[i], cv_accs[i], cv_errors[i], reg))
    best_i = np.argmax(cv_f1s)
    best_reg = regs[best_i]
    print('Cross-validation errors: {}'.format(cv_errors))
    print('Cross-validation accs: {}'.format(cv_accs))
    print('Cross-validation F1s: {}'.format(cv_f1s))
    print('Selecting weight_decay {}, with f1 {}, acc {}, error {}.'.format(
        best_reg, cv_f1s[best_i], cv_accs[best_i], cv_errors[best_i]))
    res['cv_regs'] = regs
    res['cv_errors'] = cv_errors
    res['cv_accs'] = cv_accs
    res['cv_l2_reg'] = best_reg
    res['cv_f1s'] = cv_f1s
    return res
def compute_test_influence(self, subset_start, subset_end, ds_test):
    res = dict()

    model = self.get_model()
    model.load('initial')
    initial_params = model.get_params_flat()

    actl_dparam = self.R['subset_dparam'][subset_start:subset_end, :]
    pred_dparam = self.R['subset_pred_dparam'][subset_start:subset_end, :]
    newton_pred_dparam = self.R['subset_newton_pred_dparam'][subset_start:subset_end, :]

    test_losses = model.get_indiv_loss(ds_test, verbose=False)
    test_margins = model.get_indiv_margin(ds_test, verbose=False)

    subsets = self.R['subset_indices'][subset_start:subset_end]
    num_subsets = len(subsets)

    with benchmark('Computing actual parameters and influence for subsets {}-{}'.format(
            subset_start, subset_end)):
        subset_test_actl_infl = np.zeros((num_subsets, ds_test.num_examples))
        subset_test_actl_margin_infl = np.zeros((num_subsets, ds_test.num_examples))
        for i, subset in enumerate(subsets):
            actl_param = initial_params + actl_dparam[i, :]
            model.set_params_flat(actl_param)
            actl_losses = model.get_indiv_loss(ds_test, verbose=False)
            actl_margins = model.get_indiv_margin(ds_test, verbose=False)
            subset_test_actl_infl[i, :] = actl_losses - test_losses
            subset_test_actl_margin_infl[i, :] = actl_margins - test_margins
        res['subset_test_actl_infl'] = subset_test_actl_infl
        res['subset_test_actl_margin_infl'] = subset_test_actl_margin_infl

    with benchmark('Computing influence approximates for subsets {}-{}'.format(
            subset_start, subset_end)):
        subset_test_pparam_infl = np.zeros((num_subsets, ds_test.num_examples))
        subset_test_pparam_margin_infl = np.zeros((num_subsets, ds_test.num_examples))
        subset_test_nparam_infl = np.zeros((num_subsets, ds_test.num_examples))
        subset_test_nparam_margin_infl = np.zeros((num_subsets, ds_test.num_examples))
        for i, subset in enumerate(subsets):
            pparam = initial_params + pred_dparam[i, :]
            nparam = initial_params + newton_pred_dparam[i, :]
            model.set_params_flat(pparam)
            pparam_losses = model.get_indiv_loss(ds_test, verbose=False)
            pparam_margins = model.get_indiv_margin(ds_test, verbose=False)
            model.set_params_flat(nparam)
            nparam_losses = model.get_indiv_loss(ds_test, verbose=False)
            nparam_margins = model.get_indiv_margin(ds_test, verbose=False)
            subset_test_pparam_infl[i, :] = pparam_losses - test_losses
            subset_test_pparam_margin_infl[i, :] = pparam_margins - test_margins
            subset_test_nparam_infl[i, :] = nparam_losses - test_losses
            subset_test_nparam_margin_infl[i, :] = nparam_margins - test_margins
        res['subset_test_pparam_infl'] = subset_test_pparam_infl
        res['subset_test_pparam_margin_infl'] = subset_test_pparam_margin_infl
        res['subset_test_nparam_infl'] = subset_test_nparam_infl
        res['subset_test_nparam_margin_infl'] = subset_test_nparam_margin_infl

    return res
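# Hedged aside: a natural follow-up (not part of this function) is to check
# how well the predicted influences track the actual ones, e.g. an overall
# correlation between the two matrices; infl_correlation is a hypothetical
# helper shown for illustration:
import numpy as np

def infl_correlation(actl, pred):
    # actl, pred: (num_subsets, num_test) influence matrices
    return np.corrcoef(actl.ravel(), pred.ravel())[0, 1]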