Esempio n. 1
0
 def __init__(self, config, out_dir=None):
     """Initialise the experiment and register its distributable tasks.

     Args:
         config: Experiment configuration; ``dataset_id`` is read from it
             directly and ``master_only`` is read through ``self.config``.
         out_dir: Forwarded unchanged to the parent initialiser.
     """
     super(Counterexamples, self).__init__(config, out_dir)
     self.dataset_id = config['dataset_id']

     # The queue keeps its state under <base_dir>/tasks.
     queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                       master_only=self.config['master_only'])
     self.task_queue = queue

     # Each task is registered under the name of the method that runs it.
     queue.define_task('retrain_and_newton_batch',
                       self.retrain_and_newton_batch)
     queue.define_task('compute_cex_test_infl_batch',
                       self.compute_cex_test_infl_batch)
class TestDistribute(Experiment):
    """
    Test the TaskQueue

    Runs two toy workloads through a TaskQueue: per-number primality
    checks and batches of seeded random vectors.
    """
    def __init__(self, config, out_dir=None):
        """Create the task queue under ``<base_dir>/tasks`` and register tasks.

        Args:
            config: Experiment configuration, forwarded to the parent class.
            out_dir: Forwarded unchanged to the parent initialiser.
        """
        super(TestDistribute, self).__init__(config, out_dir)
        task_dir = os.path.join(self.base_dir, 'tasks')
        self.task_queue = TaskQueue(task_dir)
        self.task_queue.define_task('is_prime', self.is_prime)
        self.task_queue.define_task('random_vector', self.random_vector)

    experiment_id = "test_distribute"

    @property
    def run_id(self):
        """Fixed run identifier for this test experiment."""
        return "test"

    @phase(0)
    def initialize(self):
        """Return the upper bound (inclusive) used by ``count_primes``."""
        return {'max_prime': 200}

    def is_prime(self, n):
        """Return True if ``n`` is a prime number, using trial division.

        Args:
            n: Integer to test.

        Returns:
            bool: False for every ``n < 2`` (0, 1 and negatives are not
            prime), otherwise whether ``n`` has no divisor in ``[2, sqrt(n)]``.
        """
        # Without this guard the loop below is empty for n < 2 and the
        # function would wrongly report 1 (and 0, negatives) as prime.
        if n < 2:
            return False
        for i in range(2, n):
            # No divisor found up to sqrt(n): n is prime.
            if i * i > n:
                return True
            if n % i == 0:
                return False
        return True

    @phase(1)
    def count_primes(self):
        """Count the primes in ``[1, max_prime]`` via the task queue.

        Returns:
            dict: ``{'num_primes': <count>}``; the count is the sum of the
            boolean results returned for each number.
        """
        results = self.task_queue.execute(
            'is_prime', [(n, ) for n in range(1, self.R['max_prime'] + 1)])
        return {'num_primes': sum(results)}

    def random_vector(self, seed_start, seed_end):
        """Draw one 30-d normal vector and one normal scalar per seed.

        Args:
            seed_start: First seed (inclusive).
            seed_end: Last seed (exclusive).

        Returns:
            dict: ``'vector'`` holds an array with one row per seed and
            ``'scalar'`` an array with one entry per seed.
        """
        vectors, scalars = [], []
        for seed in range(seed_start, seed_end):
            # A fresh RandomState per seed keeps each draw reproducible.
            rng = np.random.RandomState(seed)
            vector = rng.normal(0, 1, (30, ))
            scalar = rng.normal(0, 1)
            vectors.append(vector)
            scalars.append(scalar)

        res = dict()
        res['vector'] = np.array(vectors)
        res['scalar'] = np.array(scalars)
        # Fixed one-second pause per batch (presumably to make the task
        # take a noticeable amount of time - confirm intent).
        import time
        time.sleep(1)
        return res

    @phase(2)
    def make_random_vectors(self):
        """Generate random vectors for 200 seeds in batches of 4 seeds.

        Returns:
            The value of ``task_queue.collate_results`` applied to the
            per-batch results.
        """
        num_seeds = 200
        seeds_per_batch = 4
        results = self.task_queue.execute(
            'random_vector', [(i, min(i + seeds_per_batch, num_seeds))
                              for i in range(0, num_seeds, seeds_per_batch)])
        # Last queue-backed phase of this experiment.
        self.task_queue.notify_exit()

        return self.task_queue.collate_results(results)
    def __init__(self, config, out_dir=None):
        """Load the dataset, build the model configuration and register tasks.

        Args:
            config: Experiment configuration, forwarded to the parent class
                and afterwards read through ``self.config``.
            out_dir: Forwarded unchanged to the parent initialiser.
        """
        super(SubsetInfluenceLogreg, self).__init__(config, out_dir)
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
        self.train = self.datasets.train
        self.test = self.datasets.test
        self.validation = self.datasets.validation

        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.datasets.train)
        arch = model_config['arch']
        arch['fit_intercept'] = True

        # Heuristic batch sizes that should avoid OOM: a gradient costs
        # about param_count entries per example, a Hessian param_count**2.
        param_count = arch['input_dim'] * arch['num_classes']
        model_config['grad_batch_size'] = max(
            1, self.config['max_memory'] // param_count)
        model_config['hessian_batch_size'] = max(
            1, self.config['max_memory'] // (param_count * param_count))

        # Method used for computing the inverse HVP
        model_config['inverse_hvp_method'] = self.config['inverse_hvp_method']

        self.model_dir = os.path.join(self.base_dir, 'models')
        self.model_config = model_config

        # Convenience member variables
        self.dataset_id = self.config['dataset_config']['dataset_id']
        self.num_train = self.datasets.train.num_examples
        self.num_classes = self.model_config['arch']['num_classes']
        self.num_subsets = self.config['num_subsets']
        # subset_choice_type is not set in this method; it is expected to be
        # available on the instance already.
        if self.subset_choice_type == "types":
            self.subset_size = int(
                self.num_train * self.config['subset_rel_size'])
        elif self.subset_choice_type == "range":
            self.subset_min_size = int(
                self.num_train * self.config['subset_min_rel_size'])
            self.subset_max_size = int(
                self.num_train * self.config['subset_max_rel_size'])

        self.task_queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                                    master_only=self.config['master_only'])
        # Every task is registered under the name of the method that runs it.
        for task_name in ('retrain_subsets', 'self_pred_infl', 'newton_batch'):
            self.task_queue.define_task(task_name, getattr(self, task_name))
    def __init__(self, config, out_dir=None):
        """Load data and weights, configure the model and register tasks.

        Args:
            config: Experiment configuration, forwarded to the parent class
                and afterwards read through ``self.config``.
            out_dir: Forwarded unchanged to the parent initialiser.
        """
        super(CreditAssignment, self).__init__(config, out_dir)
        dataset_config = self.config['dataset_config']
        self.datasets = ds.loader.load_dataset(**dataset_config)
        self.dataset_id = dataset_config['dataset_id']
        self.data_dir = dataset_config['data_dir']

        self.train = self.datasets.train
        print("Shape of training set: {}".format(self.train.x.shape))
        self.test = self.datasets.test
        self.validation = self.datasets.validation

        # Per-example weights, indexed as [train, validation, test]; when
        # weighting is disabled every example gets weight one.
        if self.config['sample_weights']:
            self.sample_weights = ds.loader.load_supplemental_info(
                self.dataset_id + '_weights', data_dir=self.data_dir)
        else:
            self.sample_weights = [
                np.ones(split.x.shape[0])
                for split in (self.train, self.validation, self.test)
            ]

        self.num_train = self.train.num_examples

        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.train)
        model_config['arch']['fit_intercept'] = True

        # Heuristic for the largest batch sizes that avoid OOM; an explicit
        # non-None value in the config takes precedence over the heuristic.
        param_count = (model_config['arch']['input_dim'] *
                       model_config['arch']['num_classes'])
        batch_costs = (('grad_batch_size', param_count),
                       ('hessian_batch_size', param_count * param_count))
        for key, cost in batch_costs:
            if key not in self.config or self.config[key] is None:
                model_config[key] = max(1, self.config['max_memory'] // cost)
            else:
                model_config[key] = self.config[key]

        self.model_dir = os.path.join(self.base_dir, 'models')
        self.model_config = model_config

        # Convenience member variables
        self.num_classes = self.model_config['arch']['num_classes']
        self.nonfires = ds.loader.load_supplemental_info(
            self.dataset_id + '_nonfires', data_dir=self.data_dir)

        def print_class_balance(split, name):
            # Print the fraction of examples carrying each label.
            print("Dataset {}:".format(name))
            fractions = np.bincount(split.labels) / split.labels.shape[0]
            for label, fraction in enumerate(fractions):
                print("Class {} is {} of the dataset.".format(label, fraction))

        for split, split_name in ((self.train, 'train'),
                                  (self.test, 'test'),
                                  (self.nonfires, 'nonfires')):
            print_class_balance(split, split_name)

        queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                          master_only=self.config['master_only'])
        self.task_queue = queue
        queue.define_task(
            'compute_all_and_fixed_test_and_nonfire_influence',
            self.compute_all_and_fixed_test_and_nonfire_influence)
        queue.define_task('retrain_subsets', self.retrain_subsets)
        queue.define_task('self_pred_infl', self.self_pred_infl)
class CreditAssignment(Experiment):
    def __init__(self, config, out_dir=None):
        """Load data and weights, configure the model and register tasks.

        Args:
            config: Experiment configuration, forwarded to the parent class
                and afterwards read through ``self.config``.
            out_dir: Forwarded unchanged to the parent initialiser.
        """
        super(CreditAssignment, self).__init__(config, out_dir)
        dataset_config = self.config['dataset_config']
        self.datasets = ds.loader.load_dataset(**dataset_config)
        self.dataset_id = dataset_config['dataset_id']
        self.data_dir = dataset_config['data_dir']

        self.train = self.datasets.train
        print("Shape of training set: {}".format(self.train.x.shape))
        self.test = self.datasets.test
        self.validation = self.datasets.validation

        # Per-example weights, indexed as [train, validation, test]; when
        # weighting is disabled every example gets weight one.
        if self.config['sample_weights']:
            self.sample_weights = ds.loader.load_supplemental_info(
                self.dataset_id + '_weights', data_dir=self.data_dir)
        else:
            self.sample_weights = [
                np.ones(split.x.shape[0])
                for split in (self.train, self.validation, self.test)
            ]

        self.num_train = self.train.num_examples

        model_config = LogisticRegression.default_config()
        model_config['arch'] = LogisticRegression.infer_arch(self.train)
        model_config['arch']['fit_intercept'] = True

        # Heuristic for the largest batch sizes that avoid OOM; an explicit
        # non-None value in the config takes precedence over the heuristic.
        param_count = (model_config['arch']['input_dim'] *
                       model_config['arch']['num_classes'])
        batch_costs = (('grad_batch_size', param_count),
                       ('hessian_batch_size', param_count * param_count))
        for key, cost in batch_costs:
            if key not in self.config or self.config[key] is None:
                model_config[key] = max(1, self.config['max_memory'] // cost)
            else:
                model_config[key] = self.config[key]

        self.model_dir = os.path.join(self.base_dir, 'models')
        self.model_config = model_config

        # Convenience member variables
        self.num_classes = self.model_config['arch']['num_classes']
        self.nonfires = ds.loader.load_supplemental_info(
            self.dataset_id + '_nonfires', data_dir=self.data_dir)

        def print_class_balance(split, name):
            # Print the fraction of examples carrying each label.
            print("Dataset {}:".format(name))
            fractions = np.bincount(split.labels) / split.labels.shape[0]
            for label, fraction in enumerate(fractions):
                print("Class {} is {} of the dataset.".format(label, fraction))

        for split, split_name in ((self.train, 'train'),
                                  (self.test, 'test'),
                                  (self.nonfires, 'nonfires')):
            print_class_balance(split, split_name)

        queue = TaskQueue(os.path.join(self.base_dir, 'tasks'),
                          master_only=self.config['master_only'])
        self.task_queue = queue
        queue.define_task(
            'compute_all_and_fixed_test_and_nonfire_influence',
            self.compute_all_and_fixed_test_and_nonfire_influence)
        queue.define_task('retrain_subsets', self.retrain_subsets)
        queue.define_task('self_pred_infl', self.self_pred_infl)

    experiment_id = "credit_assignment"

    @property
    def run_id(self):
        return "{}_sample_weights-{}".format(self.dataset_id,
                                             self.config['sample_weights'])

    def get_model(self):
        if not hasattr(self, 'model'):
            self.model = LogisticRegression(self.model_config, self.model_dir)
        return self.model

    @phase(0)
    def cross_validation(self):
        """Select the L2 regularisation strength by k-fold cross-validation.

        Candidate strengths are log-spaced over the configured normalised
        range scaled by the number of training examples. The candidate with
        the highest mean fold accuracy is selected.

        Returns:
            dict: ``cv_regs`` (candidates), ``cv_errors`` (validation loss
            summed over folds), ``cv_accs`` (accuracy averaged over folds)
            and ``cv_l2_reg`` (the selected strength).
        """
        model = self.get_model()

        reg_min, reg_max, reg_samples = self.config[
            'normalized_cross_validation_range']
        # The configured range is per example; scale it to the dataset size.
        reg_min = reg_min * self.num_train
        reg_max = reg_max * self.num_train

        num_folds = self.config['cross_validation_folds']

        regs = np.logspace(np.log10(reg_min), np.log10(reg_max), reg_samples)
        cv_errors = np.zeros_like(regs)
        cv_accs = np.zeros_like(regs)

        # Ceiling division: contiguous folds, the last one may be shorter.
        fold_size = (self.num_train + num_folds - 1) // num_folds
        folds = []
        for k in range(num_folds):
            begin = k * fold_size
            folds.append((begin, min(begin + fold_size, self.num_train)))

        for reg_idx, reg in enumerate(regs):
            with benchmark("Evaluating CV error for reg={}".format(reg)):
                total_loss = 0.0
                total_acc = 0.0
                for fold_idx, (fold_begin, fold_end) in enumerate(folds):
                    print('Beginning fold {}'.format(fold_idx))
                    # Train on everything outside [fold_begin, fold_end).
                    before = np.arange(0, fold_begin)
                    after = np.arange(fold_end, self.num_train)
                    train_indices = np.concatenate((before, after))
                    val_indices = np.arange(fold_begin, fold_end)

                    print('Fitting model.')
                    model.fit(
                        self.train.subset(train_indices),
                        l2_reg=reg,
                        sample_weights=self.sample_weights[0][train_indices])
                    fold_loss = model.get_total_loss(
                        self.train.subset(val_indices),
                        reg=False,
                        sample_weights=self.sample_weights[0][val_indices])
                    fold_acc = model.get_accuracy(
                        self.train.subset(val_indices))
                    total_loss += fold_loss
                    total_acc += fold_acc
                    print('Acc: {}, loss: {}'.format(fold_acc, fold_loss))

            cv_errors[reg_idx] = total_loss
            cv_accs[reg_idx] = total_acc / num_folds
            print('Cross-validation acc {}, error {} for reg={}.'.format(
                cv_accs[reg_idx], cv_errors[reg_idx], reg))

        best_idx = np.argmax(cv_accs)
        best_reg = regs[best_idx]
        print('Cross-validation errors: {}'.format(cv_errors))
        print('Cross-validation accs: {}'.format(cv_accs))
        print('Selecting weight_decay {}, with acc {}, error {}.'.format(
            best_reg, cv_accs[best_idx], cv_errors[best_idx]))

        return {
            'cv_regs': regs,
            'cv_errors': cv_errors,
            'cv_accs': cv_accs,
            'cv_l2_reg': best_reg,
        }

    @phase(1)
    def initial_training(self):
        """Fit the model on the full training set and record baselines.

        The fitted model is saved under the name ``'initial'``.

        Returns:
            dict: per-example losses and accuracy for the train, test and
            nonfires sets; per-example margins as well when there are
            exactly two classes.
        """
        model = self.get_model()
        l2_reg = self.R['cv_l2_reg']

        with benchmark("Training original model"):
            model.fit(self.train,
                      l2_reg=l2_reg,
                      sample_weights=self.sample_weights[0])
            model.print_model_eval(self.datasets,
                                   train_sample_weights=self.sample_weights[0],
                                   test_sample_weights=self.sample_weights[2],
                                   l2_reg=l2_reg)
            model.save('initial')

        res = {
            'initial_train_losses': model.get_indiv_loss(self.train),
            'initial_train_accuracy': model.get_accuracy(self.train),
            'initial_test_losses': model.get_indiv_loss(self.test),
            'initial_test_accuracy': model.get_accuracy(self.test),
            'initial_nonfires_losses': model.get_indiv_loss(self.nonfires),
            'initial_nonfires_accuracy': model.get_accuracy(self.nonfires),
        }

        # Margins are only recorded for binary problems.
        if self.num_classes == 2:
            res.update({
                'initial_train_margins': model.get_indiv_margin(self.train),
                'initial_test_margins': model.get_indiv_margin(self.test),
                'initial_nonfires_margins':
                model.get_indiv_margin(self.nonfires),
            })

        return res

    @phase(2)
    def pick_test_points(self):
        """Choose the individual test (and nonfires) points to track.

        For ``mnli`` both index lists are hard-coded. For any other dataset
        the three highest-loss test points plus three test points drawn at
        random from the rest are used, and no ``fixed_nonfires`` entry is
        produced.

        Returns:
            dict: ``fixed_test`` and, for ``mnli`` only, ``fixed_nonfires``.
        """
        res = dict()
        is_mnli = self.dataset_id == "mnli"

        # Freeze each set after the first run
        if is_mnli:
            fixed_test = [8487, 3448, 3156, 1127, 4218, 6907]
        else:
            by_loss = np.argsort(self.R['initial_test_losses'])
            worst = by_loss[-3:]  # the 3 highest-loss points
            sampled = np.random.choice(
                by_loss[:-3], 3, replace=False)  # 3 random other points
            fixed_test = list(worst) + list(sampled)

        if is_mnli:
            res['fixed_nonfires'] = [1722, 2734, 9467, 7378, 9448, 2838]
            print("Fixed nonfires points: {}".format(res['fixed_nonfires']))

        print("Fixed test points: {}".format(fixed_test))
        res['fixed_test'] = fixed_test
        return res

    @phase(3)
    def hessian(self):
        """Compute the explicit Hessian if the configured method needs it.

        Returns:
            dict: ``{'hessian': H}`` for the ``'explicit'`` method,
            ``{'hessian': None}`` for ``'cg'``, and an empty dict for any
            other configured method.
        """
        method = self.config['inverse_hvp_method']
        res = dict()

        if method == 'explicit':
            model = self.get_model()
            model.load('initial')
            l2_reg = self.R['cv_l2_reg']
            with benchmark("Computing hessian"):
                res['hessian'] = model.get_hessian(
                    self.train,
                    l2_reg=l2_reg,
                    sample_weights=self.sample_weights[0])
        elif method == 'cg':
            print("Not computing explicit hessian.")
            res['hessian'] = None

        return res

    def get_turker_subsets(self):
        if self.dataset_id in ['mnli']:
            from datasets.loader import load_supplemental_info
            turk_IDs = load_supplemental_info(self.dataset_id + '_ids',
                                              data_dir=self.data_dir)
            subsets = []
            uniq, inds = np.unique(turk_IDs[0], return_inverse=True)
            for i in range(uniq.shape[0]):
                subsets.append(np.where(inds == i)[0])
            return uniq, subsets
        return [], []

    def get_genre_subsets(self, split=0):
        if self.dataset_id in ['mnli']:
            from datasets.loader import load_supplemental_info
            genre_IDs = load_supplemental_info(self.dataset_id + '_genres',
                                               data_dir=self.data_dir)
            subsets = []
            uniq, inds = np.unique(genre_IDs[split], return_inverse=True)
            for i in range(uniq.shape[0]):
                subsets.append(np.where(inds == i)[0])
            return uniq, subsets
        return [], []

    def get_nonfires_genre_subsets(self):
        if self.dataset_id in ['mnli']:
            from datasets.loader import load_supplemental_info
            genre_IDs = load_supplemental_info(self.dataset_id +
                                               '_nonfires_genres',
                                               data_dir=self.data_dir)
            subsets = []
            uniq, inds = np.unique(genre_IDs, return_inverse=True)
            for i in range(uniq.shape[0]):
                subsets.append(np.where(inds == i)[0])
            return uniq, subsets
        return [], []

    @phase(4)
    def pick_subsets(self):
        """Collect the tagged training subsets and the genre groupings.

        Returns:
            dict: ``test_genres`` / ``test_genre_inds`` (genre grouping of
            split 2), ``nonfires_genres`` / ``nonfires_genre_inds``, and the
            parallel lists ``subset_tags`` / ``subset_indices`` with one
            ``same_turker-<id>`` entry per turker subset.
        """
        res = dict()

        with benchmark("Turker ID subsets"):
            turker_names, turker_inds = self.get_turker_subsets()
            pairs = list(zip(turker_names, turker_inds))

        subset_tags = ['same_turker-{}'.format(name) for name, _ in pairs]
        subset_indices = [indices for _, indices in pairs]

        test_genres, test_genre_inds = self.get_genre_subsets(split=2)
        res['test_genres'] = test_genres
        res['test_genre_inds'] = test_genre_inds

        nonfires_genres, nonfires_genre_inds = \
            self.get_nonfires_genre_subsets()
        res['nonfires_genres'] = nonfires_genres
        res['nonfires_genre_inds'] = nonfires_genre_inds

        res['subset_tags'] = subset_tags
        res['subset_indices'] = subset_indices

        return res

    def compute_all_and_fixed_test_and_nonfire_influence(
            self, subset_start, subset_end):
        """Compute predicted influences for one batch of training subsets.

        For every subset in ``subset_indices[subset_start:subset_end]`` the
        summed training gradient of the subset is combined with
        inverse-Hessian-vector products of gradients taken over: each whole
        dataset (train, test, nonfires), each class within those datasets,
        each genre of the test and nonfires sets, and each fixed test and
        nonfires point. Loss-based influences are always computed;
        margin-based ones only when there are exactly two classes.

        Args:
            subset_start: Index of the first subset to process (inclusive).
            subset_end: Index one past the last subset to process.

        Returns:
            dict: every collected entry converted to a numpy array, plus
            ``nonfires_predictions``.

        NOTE(review): ``self.R['fixed_nonfires']`` is only produced by
        ``pick_test_points`` for the ``mnli`` dataset; whether other
        datasets can reach this method should be confirmed.
        """
        print("Computing pred infl for subsets {}-{}".format(
            subset_start, subset_end))
        self.load_phases([1, 2, 3, 4])

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']
        res = dict()

        res['nonfires_predictions'] = model.get_predictions(self.nonfires.x)
        hessian = self.R['hessian']
        inverse_hvp_args = {
            'hessian_reg': hessian,
            'dataset': self.train,
            'l2_reg': l2_reg,
            'verbose': False,
            'verbose_cg': True,
            'inverse_vp_method': self.config['inverse_vp_method'],
            'inverse_hvp_method': self.config['inverse_hvp_method'],
        }

        def compute_test_like_infl(points, grad_fn, train_grad, **kwargs):
            # Dot the subset's training gradient with H^-1 applied to the
            # gradient of the evaluation points.
            test_grad = grad_fn(points, **kwargs).reshape(-1, 1)
            test_grad_H_inv = model.get_inverse_hvp(
                test_grad, **inverse_hvp_args).reshape(-1)
            pred_infl = np.dot(train_grad, test_grad_H_inv)
            return np.array(pred_infl)

        fixed_test = self.R['fixed_test']
        fixed_nonfires = self.R['fixed_nonfires']

        subset_indices = self.R['subset_indices']
        subset_tags = self.R['subset_tags']

        test_genres, test_genre_inds = self.R['test_genres'], self.R[
            'test_genre_inds']
        nonfires_genres, nonfires_genre_inds = self.R[
            'nonfires_genres'], self.R['nonfires_genre_inds']

        def compute_infls(infl_name, infl_total_fn, infl_indiv_fn):
            # Results are accumulated into ``res`` through the ``add``
            # helper (presumably appending under the given key - confirm
            # against its definition).
            for subset_idx, remove_indices in enumerate(
                    subset_indices[subset_start:subset_end], subset_start):
                print(
                    'Computing influences on model for subset {} out of {} (tag={})'
                    .format(subset_idx, len(subset_indices),
                            subset_tags[subset_idx]))
                tag = subset_tags[subset_idx]
                inds = remove_indices
                with benchmark('Computing {} for subset {}'.format(
                        infl_name, tag)):
                    grad = infl_total_fn(
                        self.train.subset(inds),
                        sample_weights=self.sample_weights[0][inds],
                        l2_reg=0)
                    add(res, 'subset_train_grad_for_{}'.format(infl_name),
                        grad)

                    # For big parts of the datasets
                    datasets = [self.train, self.test, self.nonfires]
                    # Nonfires examples carry no stored weights; use ones.
                    weights = [self.sample_weights[0], self.sample_weights[2],
                               np.ones(self.nonfires.num_examples)]
                    dataset_names = ['train', 'test', 'nonfires']
                    class_names = [
                        'class_{}'.format(c) for c in range(self.num_classes)
                    ]
                    for dset, ds_name, weight in zip(datasets, dataset_names,
                                                     weights):
                        # all
                        name = 'all_{}_{}'.format(ds_name, infl_name)
                        with benchmark('Computing {}'.format(name)):
                            infl = compute_test_like_infl(
                                dset, infl_total_fn, grad,
                                sample_weights=weight)
                            add(res, name, infl)

                        # class-specific
                        for class_idx, class_name in enumerate(class_names):
                            class_inds = np.where(
                                dset.labels == class_idx)[0]
                            name = '{}_{}_{}'.format(class_name, ds_name,
                                                     infl_name)
                            with benchmark('Computing {}'.format(name)):
                                infl = compute_test_like_infl(
                                    dset.subset(class_inds), infl_total_fn,
                                    grad, sample_weights=weight[class_inds])
                                add(res, name, infl)

                    # test/nonfires genres
                    for dset, ds_name, weight, genre_names, genre_inds in zip(
                            datasets[1:], dataset_names[1:], weights[1:],
                            [test_genres, nonfires_genres],
                            [test_genre_inds, nonfires_genre_inds]):
                        for genre_name, genre_ind in zip(
                                genre_names, genre_inds):
                            name = '{}_{}_{}'.format(genre_name, ds_name,
                                                     infl_name)
                            with benchmark('Computing {}'.format(name)):
                                infl = compute_test_like_infl(
                                    dset.subset(genre_ind), infl_total_fn,
                                    grad, sample_weights=weight[genre_ind])
                                add(res, name, infl)

                    # For a few specific points
                    specific_names = ['fixed_test', 'fixed_nonfires']
                    for name, fixed_inds, dset in zip(
                            specific_names, [fixed_test, fixed_nonfires],
                            [self.test, self.nonfires]):
                        data = []
                        for ind in fixed_inds:
                            with benchmark('Computing {} {}'.format(name,
                                                                    ind)):
                                data.append(
                                    compute_test_like_infl(
                                        dset.subset([ind]), infl_indiv_fn,
                                        grad))
                        add(res, '{}_{}'.format(name, infl_name), data)

        compute_infls('pred_infl', model.get_total_grad_loss,
                      model.get_indiv_grad_loss)
        if self.num_classes == 2:
            # Was ``mode.get_total_grad_margin``: an undefined name that
            # raised NameError on every binary-classification run.
            compute_infls('pred_margin_infl', model.get_total_grad_margin,
                          model.get_indiv_grad_margin)

        for key, val in res.items():
            res[key] = np.array(val)

        return res

    @phase(5)
    def all_and_fixed_test_and_nonfire_influence(self):
        """Run the influence computation over all subsets in batches of 24.

        Returns:
            The value of ``task_queue.collate_results`` applied to the
            per-batch results.
        """
        num_subsets = len(self.R['subset_indices'])
        subsets_per_batch = 24

        # Half-open [start, end) ranges; the last batch may be shorter.
        batches = []
        for start in range(0, num_subsets, subsets_per_batch):
            end = min(start + subsets_per_batch, num_subsets)
            batches.append((start, end))

        results = self.task_queue.execute(
            'compute_all_and_fixed_test_and_nonfire_influence', batches)
        return self.task_queue.collate_results(results)

    def retrain_subsets(self, subset_start, subset_end):
        """Retrain the model with each subset of a batch removed.

        For every subset in ``subset_indices[subset_start:subset_end]`` the
        model is warm-fitted on the remaining training examples, saved as
        ``subset_<i>``, and evaluated on the full train, test and nonfires
        sets.

        Args:
            subset_start: Index of the first subset to process (inclusive).
            subset_end: Index one past the last subset to process.

        Returns:
            dict: stacked per-example losses (``subset_train_losses``,
            ``subset_test_losses``, ``subset_nonfires_losses``) and, when
            there are exactly two classes, the matching ``*_margins``.
        """
        print("Retraining subsets {}-{}".format(subset_start, subset_end))
        self.load_phases([1, 4])

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']
        res = dict()

        subset_tags, subset_indices = self.R['subset_tags'], self.R[
            'subset_indices']

        start_time = time.time()
        train_losses, test_losses, nonfires_losses = [], [], []
        train_margins, test_margins, nonfires_margins = [], [], []
        for i, remove_indices in enumerate(
                subset_indices[subset_start:subset_end], subset_start):
            print('Retraining model for subset {} out of {} (tag={})'.format(
                i, len(subset_indices), subset_tags[i]))
            # Build the removal set once: membership tests against the raw
            # index sequence are linear, which made this quadratic.
            removed = set(np.asarray(remove_indices).ravel().tolist())
            inds = [j for j in range(self.num_train) if j not in removed]

            model.warm_fit(self.train.subset(inds), l2_reg=l2_reg)
            model.save('subset_{}'.format(i))
            train_losses.append(model.get_indiv_loss(self.train))
            test_losses.append(model.get_indiv_loss(self.test))
            nonfires_losses.append(model.get_indiv_loss(self.nonfires))
            if model.num_classes == 2:
                train_margins.append(model.get_indiv_margin(self.train))
                test_margins.append(model.get_indiv_margin(self.test))
                nonfires_margins.append(model.get_indiv_margin(self.nonfires))

        cur_time = time.time()
        time_per_retrain = (cur_time - start_time) / (subset_end -
                                                      subset_start)
        # Estimate covers only the subsets after this batch.
        remaining_time = time_per_retrain * (len(subset_indices) - subset_end)
        print('Each retraining takes {} s, {} s remaining'.format(
            time_per_retrain, remaining_time))

        res['subset_train_losses'] = np.array(train_losses)
        res['subset_test_losses'] = np.array(test_losses)
        res['subset_nonfires_losses'] = np.array(nonfires_losses)

        if self.num_classes == 2:
            res['subset_train_margins'] = np.array(train_margins)
            res['subset_test_margins'] = np.array(test_margins)
            res['subset_nonfires_margins'] = np.array(nonfires_margins)

        return res

    @phase(6)
    def retrain(self):
        """Run ``retrain_subsets`` over all subsets in batches of 24.

        Returns:
            The value of ``task_queue.collate_results`` applied to the
            per-batch results.
        """
        num_subsets = len(self.R['subset_indices'])
        subsets_per_batch = 24

        # Half-open [start, end) ranges; the last batch may be shorter.
        batches = []
        for start in range(0, num_subsets, subsets_per_batch):
            end = min(start + subsets_per_batch, num_subsets)
            batches.append((start, end))

        results = self.task_queue.execute('retrain_subsets', batches)
        return self.task_queue.collate_results(results)

    def self_pred_infl(self, subset_start, subset_end):
        """Compute predicted self-influences for one batch of subsets.

        For each subset, the stored summed training gradient ``g`` is used
        to compute ``H^-1 g`` and the self-influence ``g . H^-1 g``; for
        binary models the margin gradient is dotted with the same
        ``H^-1 g``.

        Args:
            subset_start: Index of the first subset to process (inclusive).
            subset_end: Index one past the last subset to process.

        Returns:
            dict: ``subset_pred_dparam``, ``self_pred_infl`` and, when there
            are exactly two classes, ``self_pred_margin_infl``.
        """
        self.load_phases([1, 3, 4, 5])

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']

        subset_tags = self.R['subset_tags']
        subset_indices = self.R['subset_indices']

        inverse_hvp_args = {
            'hessian_reg': self.R['hessian'],
            'dataset': self.train,
            'l2_reg': l2_reg,
            'verbose': False,
            'verbose_cg': True,
            'inverse_vp_method': self.config['inverse_vp_method'],
        }

        subset_train_grads = self.R['subset_train_grad_for_pred_infl']
        if self.num_classes == 2:
            subset_train_margin_grads = self.R[
                'subset_train_grad_for_pred_margin_infl']

        start_time = time.time()
        dparams = []
        loss_infls = []
        margin_infls = []
        batch = subset_indices[subset_start:subset_end]
        # Only the absolute subset index is needed here; the gradients were
        # precomputed, so the subset's members are not read.
        for idx, _ in enumerate(batch, subset_start):
            print('Computing self-influences for subset {} out of {} (tag={})'.
                  format(idx, len(subset_indices), subset_tags[idx]))
            grad_loss = subset_train_grads[idx]
            column = grad_loss.reshape(1, -1).T
            h_inv_grad = model.get_inverse_hvp(
                column, **inverse_hvp_args).reshape(-1)
            dparams.append(h_inv_grad)
            loss_infls.append(np.dot(grad_loss, h_inv_grad))

            if model.num_classes == 2:
                grad_margin = subset_train_margin_grads[idx]
                margin_infls.append(np.dot(grad_margin, h_inv_grad))

        elapsed = time.time() - start_time
        time_per_vp = elapsed / (subset_end - subset_start)
        # Estimate covers only the subsets after this batch.
        remaining_time = time_per_vp * (len(subset_indices) - subset_end)
        print('Each self-influence calculation takes {} s, {} s remaining'.
              format(time_per_vp, remaining_time))

        res = dict()
        res['subset_pred_dparam'] = np.array(dparams)
        res['self_pred_infl'] = np.array(loss_infls)
        if self.num_classes == 2:
            res['self_pred_margin_infl'] = np.array(margin_infls)

        return res

    @phase(7)
    def compute_self_pred_infl(self):
        """Run ``self_pred_infl`` over all subsets in batches of 24.

        Returns:
            The value of ``task_queue.collate_results`` applied to the
            per-batch results.
        """
        num_subsets = len(self.R['subset_indices'])
        subsets_per_batch = 24

        # Half-open [start, end) ranges; the last batch may be shorter.
        batches = []
        for start in range(0, num_subsets, subsets_per_batch):
            end = min(start + subsets_per_batch, num_subsets)
            batches.append((start, end))

        results = self.task_queue.execute('self_pred_infl', batches)
        return self.task_queue.collate_results(results)

    @phase(8)
    def compute_actl_infl(self):
        """
        Turn stored per-example values into per-subset influence numbers.

        For each quantity (losses, and margins when there are two classes) the
        influence of a subset is the change `subset value - initial value`,
        read from self.R, aggregated over several groups of examples: whole
        datasets, per-class, per-genre, a few fixed points, and the removed
        subset itself.

        :returns: dict mapping '<group>_<dataset>_<infl_name>' style keys to
                  numpy arrays
        """
        # Signal the task queue that this phase needs no more distributed work
        # (NOTE(review): inferred from the method name; confirm in TaskQueue).
        self.task_queue.notify_exit()
        res = dict()

        subset_tags, subset_indices = self.R['subset_tags'], self.R[
            'subset_indices']
        fixed_test, fixed_nonfires = self.R['fixed_test'], self.R[
            'fixed_nonfires']
        test_genres, test_genre_inds = self.R['test_genres'], self.R[
            'test_genre_inds']
        nonfires_genres, nonfires_genre_inds = self.R[
            'nonfires_genres'], self.R['nonfires_genre_inds']

        def compute_infls(infl_name, initial_vals, subset_vals):
            """
            Fill `res` with influences for one quantity.

            :param infl_name: suffix used in every key written to `res`
            :param initial_vals: [train, test, nonfires] values of the
                                 initial model, indexed by example
            :param subset_vals: [train, test, nonfires] values after each
                                subset's retraining, indexed [subset, example]
            """
            datasets = [self.train, self.test, self.nonfires]
            # NOTE(review): sample_weights[0] is paired with train and
            # sample_weights[2] with test; nonfires uses unit weights.
            # Confirm the index convention of self.sample_weights.
            weights = [self.sample_weights[0], self.sample_weights[2],\
                    np.ones(self.nonfires.num_examples)]
            dataset_names = ['train', 'test', 'nonfires']
            class_names = [
                'class_{}'.format(i) for i in range(self.num_classes)
            ]
            for ds, ds_name, weight, initial_val, subset_val in\
                    zip(datasets, dataset_names, weights, initial_vals, subset_vals):
                # all
                # Weighted sum over examples i of the change, one entry per
                # subset a.
                name = 'all_{}_{}'.format(ds_name, infl_name)
                with benchmark('Computing {}'.format(name)):
                    infl = np.einsum('ai,i->a', subset_val - initial_val,
                                     weight)
                    res[name] = infl

                # class-specific
                # Same weighted sum, restricted to the examples whose label
                # equals the class index.
                for i, class_name in enumerate(class_names):
                    class_inds = np.where(ds.labels == i)[0]
                    name = '{}_{}_{}'.format(class_name, ds_name, infl_name)
                    with benchmark('Computing {}'.format(name)):
                        infl = np.einsum('ai,i->a', subset_val[:,class_inds]-initial_val[class_inds],\
                                weight[class_inds])
                        res[name] = infl

            # test/nonfires genres
            # The [1:] slices drop the train entry so that only test and
            # nonfires are paired with their genre names and index lists.
            for ds, ds_name, weight, initial_val, subset_val, genre_names, genre_inds in\
                    zip(datasets[1:], dataset_names[1:], weights[1:],\
                        initial_vals[1:], subset_vals[1:],\
                        [test_genres, nonfires_genres], [test_genre_inds, nonfires_genre_inds]):
                for genre_name, genre_ind in zip(genre_names, genre_inds):
                    name = '{}_{}_{}'.format(genre_name, ds_name, infl_name)
                    with benchmark('Computing {}'.format(name)):
                        infl = np.einsum('ai,i->a', subset_val[:,genre_ind]-initial_val[genre_ind],\
                                weight[genre_ind])
                        res[name] = infl

            # For a few specific points
            # Unweighted change at each fixed example; `add` is defined
            # outside this block (presumably it appends to a list stored
            # under res[res_name]; confirm).
            specific_names = ['fixed_test', 'fixed_nonfires']
            for name, fixed_inds, initial_val, subset_val in\
                    zip(specific_names, [fixed_test, fixed_nonfires],\
                    initial_vals[1:], subset_vals[1:]):
                res_name = '{}_{}'.format(name, infl_name)
                for ind in fixed_inds:
                    with benchmark('Computing {} {}'.format(name, ind)):
                        infl = subset_val[:, ind] - initial_val[ind]
                        add(res, res_name, infl)
                # Entries were collected per fixed point; transpose the
                # collected values before storing them.
                res[res_name] = np.transpose(res[res_name])

            # self influence
            # For each subset, the weighted change on the very training
            # examples that the subset lists (its own row of subset_vals[0]).
            for subset_val, remove_indices in zip(subset_vals[0],
                                                  subset_indices):
                infl = np.dot(subset_val[remove_indices] - initial_vals[0][remove_indices],\
                        weights[0][remove_indices])
                add(res, 'self_{}'.format(infl_name), infl)
            res['self_{}'.format(infl_name)] = np.transpose(
                res['self_{}'.format(infl_name)])

        dataset_names = ['train', 'test', 'nonfires']
        initial_losses = [
            self.R['initial_{}_losses'.format(ds_name)]
            for ds_name in dataset_names
        ]
        subset_losses = [
            self.R['subset_{}_losses'.format(ds_name)]
            for ds_name in dataset_names
        ]
        compute_infls('actl_infl', initial_losses, subset_losses)
        # Margin influences are only computed for two-class problems
        if self.num_classes == 2:
            initial_margins = [
                self.R['initial_{}_margins'.format(ds_name)]
                for ds_name in dataset_names
            ]
            subset_margins = [
                self.R['subset_{}_margins'.format(ds_name)]
                for ds_name in dataset_names
            ]
            compute_infls('actl_margin_infl', initial_margins, subset_margins)

        # Normalize every stored value to a numpy array before returning
        for key, val in res.items():
            res[key] = np.array(val)

        return res

    def get_simple_subset_tags(self):
        """
        Return the subset tags with every 'same_turker' variant collapsed.

        Any tag containing the substring 'same_turker' is replaced by the bare
        string 'same_turker'; every other tag is passed through unchanged.

        :returns: a list with one simplified tag per entry of
                  self.R['subset_tags'], in the same order
        """
        # A list is returned rather than a lazy map: plot_influence passes the
        # tags to more than one plot, and a Python 3 map object would already
        # be exhausted when the second plot iterates over it.
        return [
            'same_turker' if 'same_turker' in tag else tag
            for tag in self.R['subset_tags']
        ]

    def get_subtitle(self):
        """Return the plot subtitle: the dataset id rendered as a string."""
        return '{}'.format(self.dataset_id)

    def plot_influence(self,
                       title,
                       figname,
                       actl_loss,
                       pred_loss,
                       actl_margin=None,
                       pred_margin=None,
                       verbose=True):
        """
        Save correlation plots of actual vs. predicted group influence.

        A loss plot is always written to '<figname>_loss.png' in
        self.plot_dir. For two-class problems a margin plot is also written
        to '<figname>_margin.png', provided both margin arrays were supplied.

        :param title: human-readable name of the influence target, used in
                      the plot titles and the progress message
        :param figname: file name stem for the saved figures
        :param actl_loss: actual influence on the loss, one value per subset
        :param pred_loss: predicted influence on the loss, one value per subset
        :param actl_margin: optional actual influence on the margin
        :param pred_margin: optional predicted influence on the margin
        :param verbose: whether to print a message once plotting is done
        """
        # Materialize the tags once: they label up to two plots, and a lazy
        # iterator would be empty by the time the second plot is drawn.
        subset_tags = list(self.get_simple_subset_tags())
        subtitle = self.get_subtitle()

        def save_correlation_plot(actl, pred, plot_title, suffix):
            # One square figure per plot, closed right after saving so that
            # figures do not accumulate across many calls.
            fig, ax = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
            plot_influence_correlation(ax[0][0],
                                       actl,
                                       pred,
                                       label=subset_tags,
                                       title=plot_title,
                                       subtitle=subtitle)
            fig.savefig(os.path.join(self.plot_dir, figname + suffix),
                        bbox_inches='tight')
            plt.close(fig)

        save_correlation_plot(actl_loss, pred_loss,
                              'Group influence on ' + title, '_loss.png')

        # The margin arguments are optional; skip the margin plot instead of
        # handing None to the plotting helper when they were not provided.
        has_margins = actl_margin is not None and pred_margin is not None
        if self.num_classes == 2 and has_margins:
            save_correlation_plot(actl_margin, pred_margin,
                                  'Group margin influence on ' + title,
                                  '_margin.png')

        if verbose:
            print('Finished plotting {} influence.'.format(title))

    @phase(9)
    def plot_all(self):
        """
        Produce every influence plot of the experiment.

        Plots are written for: self-influence, each fixed test point, each
        fixed nonfire point, each test/nonfires genre, and the 'all' and
        per-class groups of the train, test and nonfires sets. Only loss
        influences are passed on; no margin arrays are supplied here.

        :returns: an empty dict (this phase stores no results)
        """
        print('Will save plots to {}'.format(self.plot_dir))

        self.plot_influence('self', 'self-influence',
                            self.R['self_actl_infl'],
                            self.R['self_pred_infl'])

        # Fixed test points are labelled by their dataset index
        for col, test_idx in enumerate(self.R['fixed_test']):
            plot_title = 'fixed test {}'.format(test_idx)
            plot_file = 'fixed-test-{}'.format(test_idx)
            self.plot_influence(plot_title, plot_file,
                                self.R['fixed_test_actl_infl'][:, col],
                                self.R['fixed_test_pred_infl'][:, col])

        # Fixed nonfire points are labelled by their position in the list,
        # not by their dataset index
        for col, _ in enumerate(self.R['fixed_nonfires']):
            plot_title = 'fixed nonfire {}'.format(col)
            plot_file = 'fixed-nonfires-{}'.format(col)
            self.plot_influence(plot_title, plot_file,
                                self.R['fixed_nonfires_actl_infl'][:, col],
                                self.R['fixed_nonfires_pred_infl'][:, col])

        # Genre groups: one plot per entry of the genre index list
        for pos, _ in enumerate(self.R['test_genre_inds']):
            genre = self.R['test_genres'][pos]
            self.plot_influence('test genre {}'.format(genre),
                                'test-genre-{}'.format(genre),
                                self.R['{}_test_actl_infl'.format(genre)],
                                self.R['{}_test_pred_infl'.format(genre)])
        for pos, _ in enumerate(self.R['nonfires_genre_inds']):
            genre = self.R['nonfires_genres'][pos]
            self.plot_influence('nonfire genre {}'.format(genre),
                                'nonfires-genre-{}'.format(genre),
                                self.R['{}_nonfires_actl_infl'.format(genre)],
                                self.R['{}_nonfires_pred_infl'.format(genre)])

        # Whole-dataset and per-class groups for each of the three datasets
        group_prefixes = ['all']
        for class_idx in range(self.num_classes):
            group_prefixes.append('class_{}'.format(class_idx))

        for group in group_prefixes:
            for ds_name in ['train', 'test', 'nonfires']:
                actl = self.R['{}_{}_actl_infl'.format(group, ds_name)]
                pred = self.R['{}_{}_pred_infl'.format(group, ds_name)]
                self.plot_influence('{} {} set'.format(group, ds_name),
                                    '{}-{}'.format(group, ds_name),
                                    actl, pred)

        return dict()
 def __init__(self, config, out_dir=None):
     """Initialize the experiment and register its two queue tasks."""
     super(TestDistribute, self).__init__(config, out_dir)
     queue_dir = os.path.join(self.base_dir, 'tasks')
     self.task_queue = TaskQueue(queue_dir)
     handlers = (('is_prime', self.is_prime),
                 ('random_vector', self.random_vector))
     for task_name, handler in handlers:
         self.task_queue.define_task(task_name, handler)
class SubsetInfluenceLogreg(Experiment):
    """
    Compute various types of influence on subsets of the dataset
    """
    def __init__(self, config, out_dir=None):
        """
        Load the dataset, prepare the model configuration and register tasks.

        :param config: experiment configuration; this constructor reads
                       'dataset_config', 'max_memory', 'inverse_hvp_method',
                       'num_subsets', 'master_only' and the subset size keys
                       that match the subset choice type
        :param out_dir: forwarded unchanged to the base class constructor
        """
        super(SubsetInfluenceLogreg, self).__init__(config, out_dir)

        # Dataset splits
        self.datasets = ds.loader.load_dataset(**self.config['dataset_config'])
        self.train = self.datasets.train
        self.test = self.datasets.test
        self.validation = self.datasets.validation

        # Model configuration, with the architecture inferred from the data
        model_config = LogisticRegression.default_config()
        arch = LogisticRegression.infer_arch(self.datasets.train)
        model_config['arch'] = arch
        model_config['arch']['fit_intercept'] = True

        # Heuristic for determining maximum batch evaluation sizes without OOM
        num_params = model_config['arch']['input_dim'] * model_config['arch']['num_classes']
        memory_budget = self.config['max_memory']
        model_config['grad_batch_size'] = max(1, memory_budget // num_params)
        model_config['hessian_batch_size'] = max(
            1, self.config['max_memory'] // (num_params * num_params))

        # Set the method for computing inverse HVP
        model_config['inverse_hvp_method'] = self.config['inverse_hvp_method']

        self.model_dir = os.path.join(self.base_dir, 'models')
        self.model_config = model_config

        # Convenience member variables
        self.dataset_id = self.config['dataset_config']['dataset_id']
        self.num_train = self.datasets.train.num_examples
        self.num_classes = self.model_config['arch']['num_classes']
        self.num_subsets = self.config['num_subsets']

        # Relative sizes from the config are converted to example counts
        choice_type = self.subset_choice_type
        if choice_type == "types":
            self.subset_size = int(self.num_train * self.config['subset_rel_size'])
        elif choice_type == "range":
            self.subset_min_size = int(self.num_train * self.config['subset_min_rel_size'])
            self.subset_max_size = int(self.num_train * self.config['subset_max_rel_size'])

        # Tasks that can be executed by workers through the task queue
        tasks_dir = os.path.join(self.base_dir, 'tasks')
        self.task_queue = TaskQueue(tasks_dir, master_only=self.config['master_only'])
        for task_name, handler in (('retrain_subsets', self.retrain_subsets),
                                   ('self_pred_infl', self.self_pred_infl),
                                   ('newton_batch', self.newton_batch)):
            self.task_queue.define_task(task_name, handler)

    experiment_id = "ss_logreg"

    @property
    def subset_choice_type(self):
        """How subset sizes are chosen; 'types' when the config omits it."""
        choice_type = self.config.get('subset_choice_type', 'types')
        return choice_type

    @property
    def run_id(self):
        """
        Build the identifier of this run from the configuration.

        The id combines the dataset id, the inverse-HVP method, the subset
        seed, the subset size setting(s) and the number of subsets. When the
        config has a non-None 'tag', it is appended after an underscore.

        :returns: the run id string
        :raises ValueError: if the subset choice type is neither 'types' nor
                            'range'
        """
        choice_type = self.subset_choice_type
        if choice_type == "types":
            size_part = "size-{}".format(self.config['subset_rel_size'])
        elif choice_type == "range":
            size_part = "sizes-{}-{}".format(
                self.config['subset_min_rel_size'],
                self.config['subset_max_rel_size'])
        else:
            # Fail with a clear message instead of hitting an unbound local
            raise ValueError(
                "Unknown subset_choice_type: {!r}".format(choice_type))

        run_id = "{}_ihvp-{}_seed-{}_{}_num-{}".format(
            self.config['dataset_config']['dataset_id'],
            self.config['inverse_hvp_method'],
            self.config['subset_seed'],
            size_part,
            self.config['num_subsets'])

        tag = self.config.get('tag', None)
        if tag is not None:
            run_id = "{}_{}".format(run_id, tag)
        return run_id

    def get_model(self):
        """
        Return the experiment's model, creating it on first use.

        The model is built from self.model_config and self.model_dir with a
        RandomState seeded with 2, and then cached on the instance.
        """
        try:
            return self.model
        except AttributeError:
            self.model = LogisticRegression(self.model_config, self.model_dir, random_state=np.random.RandomState(2))
        return self.model

    @phase(0)
    def cross_validation(self):
        """
        Select the L2 regularization strength by k-fold cross-validation.

        Candidate strengths are log-spaced between the configured normalized
        bounds scaled by the number of training examples. Folds are contiguous
        index ranges of the training set. The candidate with the highest mean
        fold accuracy is selected.

        :returns: dict with 'cv_regs', 'cv_errors' (summed fold losses),
                  'cv_accs' (mean fold accuracies) and 'cv_l2_reg'
        """
        model = self.get_model()

        norm_min, norm_max, reg_samples = self.config['normalized_cross_validation_range']
        reg_min = norm_min * self.num_train
        reg_max = norm_max * self.num_train
        num_folds = self.config['cross_validation_folds']

        regs = np.logspace(np.log10(reg_min), np.log10(reg_max), reg_samples)
        cv_errors = np.zeros_like(regs)
        cv_accs = np.zeros_like(regs)

        # Ceiling division so that num_folds folds cover every example
        fold_size = (self.num_train + num_folds - 1) // num_folds
        folds = []
        for k in range(num_folds):
            folds.append((k * fold_size,
                          min((k + 1) * fold_size, self.num_train)))

        for reg_idx, reg in enumerate(regs):
            with benchmark("Evaluating CV error for reg={}".format(reg)):
                total_loss = 0.0
                total_acc = 0.0
                for fold_begin, fold_end in folds:
                    # Train on everything outside [fold_begin, fold_end)
                    before = np.arange(0, fold_begin)
                    after = np.arange(fold_end, self.num_train)
                    train_indices = np.concatenate((before, after))
                    val_indices = np.arange(fold_begin, fold_end)

                    model.fit(self.train.subset(train_indices), l2_reg=reg)
                    fold_loss = model.get_total_loss(self.train.subset(val_indices), l2_reg=0)
                    acc = model.get_accuracy(self.train.subset(val_indices))
                    total_loss += fold_loss
                    total_acc += acc
                    print('Acc: {}, loss: {}'.format(acc, fold_loss))

            cv_errors[reg_idx] = total_loss
            cv_accs[reg_idx] = total_acc / num_folds
            print('Cross-validation acc {}, error {} for reg={}.'.format(cv_accs[reg_idx], cv_errors[reg_idx], reg))

        best_idx = np.argmax(cv_accs)
        best_reg = regs[best_idx]
        print('Cross-validation errors: {}'.format(cv_errors))
        print('Cross-validation accs: {}'.format(cv_accs))
        print('Selecting weight_decay {}, with acc {}, error {}.'.format(
            best_reg, cv_accs[best_idx], cv_errors[best_idx]))

        return {
            'cv_regs': regs,
            'cv_errors': cv_errors,
            'cv_accs': cv_accs,
            'cv_l2_reg': best_reg,
        }

    @phase(1)
    def initial_training(self):
        """
        Fit the model on the full training set and record its baseline.

        The fitted model is saved under the name 'initial'. Per-example
        losses and accuracies are stored for the train and test sets,
        per-example margins as well for two-class problems, and finally the
        per-example training gradients of the loss.

        :returns: dict of the recorded 'initial_*' values plus
                  'train_grad_loss'
        """
        model = self.get_model()
        l2_reg = self.R['cv_l2_reg']

        with benchmark("Training original model"):
            model.fit(self.train, l2_reg=l2_reg)
            model.print_model_eval(self.datasets, l2_reg=l2_reg)
            model.save('initial')

        splits = (('train', self.train), ('test', self.test))

        res = dict()
        for split_name, split in splits:
            res['initial_{}_losses'.format(split_name)] = model.get_indiv_loss(split)
            res['initial_{}_accuracy'.format(split_name)] = model.get_accuracy(split)

        # Margins are only defined for the binary case
        if self.num_classes == 2:
            for split_name, split in splits:
                res['initial_{}_margins'.format(split_name)] = model.get_indiv_margin(split)

        with benchmark("Computing gradients"):
            res['train_grad_loss'] = model.get_indiv_grad_loss(self.train)

        return res

    @phase(2)
    def pick_test_points(self):
        """
        Choose the six test points whose influence is studied in detail.

        Known datasets use a frozen list of indices. For any other dataset
        the three highest-loss test points are combined with three further
        points drawn at random (global numpy RNG) from the remaining ones.

        :returns: dict with the single key 'fixed_test'
        """
        # Freeze each set after the first run
        frozen_points = {
            "hospital": [2267, 54826, 66678, 41567, 485, 25286],
            "spam": [92, 441, 593, 275, 267, 415],
            "mnist_small": [6172, 2044, 2293, 5305, 324, 3761],
            "mnist": [9009, 1790, 2293, 5844, 8977, 9433],
            "dogfish": [300, 339, 222, 520, 323, 182],
            "animals": [684, 850, 1492, 2380, 1539, 1267],
            "cifar10": [3629, 1019, 5259, 1082, 4237, 6811],
        }

        dataset_id = self.config['dataset_config']['dataset_id']
        if dataset_id in frozen_points:
            fixed_test = frozen_points[dataset_id]
        else:
            loss_order = np.argsort(self.R['initial_test_losses'])
            # Pick 3 high loss points
            worst = loss_order[-3:]
            # Pick 3 random points
            sampled = np.random.choice(loss_order[:-3], 3, replace=False)
            fixed_test = list(worst) + list(sampled)

        print("Fixed test points: {}".format(fixed_test))
        return {'fixed_test': fixed_test}

    @phase(3)
    def hessian(self):
        """
        Compute the training Hessian of the initial model when required.

        With the 'explicit' inverse-HVP method the Hessian is computed and
        stored; with 'cg' None is stored instead. Any other method leaves the
        result dict empty.

        :returns: dict that holds 'hessian' for the two known methods
        """
        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']

        method = self.config['inverse_hvp_method']
        res = dict()
        if method == 'cg':
            print("Not computing explicit hessian.")
            res['hessian'] = None
        elif method == 'explicit':
            with benchmark("Computing hessian"):
                res['hessian'] = model.get_hessian(self.train, l2_reg=l2_reg)

        return res

    @phase(4)
    def fixed_test_influence(self):
        """
        Predict the influence of every training point on each fixed test point.

        For each fixed test point the gradient of its loss (and, for two-class
        problems, of its margin) is multiplied by the inverse Hessian and then
        dotted with the stored per-example training loss gradients.

        :returns: dict with 'fixed_test_pred_infl' and, for two classes,
                  'fixed_test_pred_margin_infl'; one row per fixed test point
        """
        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']
        hessian = self.R['hessian']

        inverse_hvp_args = dict(
            hessian_reg=hessian,
            dataset=self.train,
            l2_reg=l2_reg,
            verbose=False,
            verbose_cg=True,
            inverse_vp_method=self.config['inverse_vp_method'],
        )

        def influence_on_train(test_grad):
            # Column-vector in, flat vector out of the inverse HVP
            column = test_grad.reshape(-1, 1)
            h_inv_grad = model.get_inverse_hvp(column, **inverse_hvp_args).reshape(-1)
            return np.dot(self.R['train_grad_loss'], h_inv_grad)

        loss_infls = []
        margin_infls = []
        for test_idx in self.R['fixed_test']:
            point = self.test.subset([test_idx])

            with benchmark('Scalar infl for all training points on test_idx {}.'.format(test_idx)):
                loss_infls.append(
                    influence_on_train(model.get_indiv_grad_loss(point)))

            if self.num_classes == 2:
                with benchmark('Scalar margin infl for all training points on test_idx {}.'.format(test_idx)):
                    margin_infls.append(
                        influence_on_train(model.get_indiv_grad_margin(point)))

        res = {'fixed_test_pred_infl': np.array(loss_infls)}
        if self.num_classes == 2:
            res['fixed_test_pred_margin_infl'] = np.array(margin_infls)

        return res

    def get_random_subsets(self, rng, subset_sizes):
        """
        Draw one uniformly random subset of training indices per size.

        :param rng: random generator providing choice(), e.g. a RandomState
        :param subset_sizes: iterable of subset sizes; the sizes may differ
        :returns: a 2-D array with one row per subset when all sizes are
                  equal, otherwise a 1-D object array of index arrays
        """
        subsets = [rng.choice(self.num_train, subset_size, replace=False)
                   for subset_size in subset_sizes]

        # Recent NumPy releases refuse to build an array from ragged input,
        # so subsets of differing sizes are packed into an object array.
        if len(set(len(subset) for subset in subsets)) > 1:
            packed = np.empty(len(subsets), dtype=object)
            for i, subset in enumerate(subsets):
                packed[i] = subset
            return packed
        return np.array(subsets)

    def get_same_class_subsets(self, rng, labels, subset_sizes):
        """
        Draw subsets whose members all share one label.

        For each requested size a label with at least that many examples is
        picked at random, and the subset is sampled without replacement from
        the examples carrying that label. Sizes that no label can satisfy are
        skipped, so fewer subsets than sizes may be returned.

        :param rng: random generator providing choice(), e.g. a RandomState
        :param labels: array with the label of every example
        :param subset_sizes: iterable of subset sizes; the sizes may differ
        :returns: a 2-D array with one row per subset when all subsets have
                  the same size, otherwise a 1-D object array of index arrays
        """
        label_vals, label_counts = np.unique(labels, return_counts=True)
        label_indices = [np.nonzero(labels == label_val)[0]
                         for label_val in label_vals]

        subsets = []
        for subset_size in subset_sizes:
            valid_label_indices = np.nonzero(label_counts >= subset_size)[0]
            if len(valid_label_indices) == 0:
                continue
            valid_label_idx = rng.choice(valid_label_indices)
            subsets.append(rng.choice(label_indices[valid_label_idx],
                                      subset_size, replace=False))

        # Recent NumPy releases refuse to build an array from ragged input,
        # so subsets of differing sizes are packed into an object array.
        if len(set(len(subset) for subset in subsets)) > 1:
            packed = np.empty(len(subsets), dtype=object)
            for i, subset in enumerate(subsets):
                packed[i] = subset
            return packed
        return np.array(subsets)

    def get_scalar_infl_tails(self, rng, pred_infl, subset_sizes):
        """
        Sample subsets from the two tails of a predicted-influence ranking.

        Examples are ranked by pred_infl. A window of 1.5 times the largest
        requested size is taken at each end of the ranking, and for every
        requested size one subset is sampled without replacement from the
        low end and one from the high end.

        :param rng: random generator providing choice(), e.g. a RandomState
        :param pred_infl: predicted influence of every training example
        :param subset_sizes: non-empty sequence of subset sizes
        :returns: (neg_subsets, pos_subsets); each is a 2-D array when all
                  sizes are equal, otherwise a 1-D object array of index
                  arrays
        """
        window = int(1.5 * np.max(subset_sizes))
        assert window <= self.num_train

        def pack(subsets):
            # Recent NumPy releases refuse to build an array from ragged
            # input, so differing sizes are packed into an object array.
            if len(set(len(subset) for subset in subsets)) > 1:
                packed = np.empty(len(subsets), dtype=object)
                for i, subset in enumerate(subsets):
                    packed[i] = subset
                return packed
            return np.array(subsets)

        scalar_infl_indices = np.argsort(pred_infl).reshape(-1)
        neg_tail = scalar_infl_indices[:window]
        pos_tail = scalar_infl_indices[-window:]

        pos_subsets, neg_subsets = [], []
        for subset_size in subset_sizes:
            neg_subsets.append(rng.choice(neg_tail, subset_size, replace=False))
            pos_subsets.append(rng.choice(pos_tail, subset_size, replace=False))
        return pack(neg_subsets), pack(pos_subsets)

    def get_clusters(self, X, n_clusters=None):
        """
        Group a set of points into clusters and report each cluster's members.

        :param X: An (N, D) tensor representing N points in D dimensions
        :param n_clusters: The number of clusters for KMeans. When None,
                           hierarchical clustering is used instead and the
                           number of clusters is determined automatically.
        :returns: cluster_indices, a list with one array of point indices
                  per cluster
        """
        if n_clusters is not None:
            kmeans = KMeans(n_clusters=n_clusters)
            kmeans.fit(X)
            cluster_labels = kmeans.labels_
        else:
            cluster_labels = hcluster.fclusterdata(X, 1)
            print("Hierarchical clustering returned {} clusters".format(len(set(cluster_labels))))

        cluster_indices = []
        for label in set(cluster_labels):
            members = np.nonzero(cluster_labels == label)[0]
            cluster_indices.append(members)
        return cluster_indices

    def get_subsets_by_clustering(self, rng, X, subset_sizes):
        """
        Draw subsets whose members all belong to one cluster of X.

        X is clustered several times (hierarchically, then KMeans with 4 to
        128 clusters) and all resulting clusters are pooled. For each
        requested size a pooled cluster with at least that many points is
        picked at random and the subset is sampled from it without
        replacement. Sizes that no cluster can satisfy are skipped.

        :param rng: random generator providing choice(), e.g. a RandomState
        :param X: An (N, D) tensor representing N points in D dimensions
        :param subset_sizes: iterable of subset sizes; the sizes may differ
        :returns: a 2-D array with one row per subset when all subsets have
                  the same size, otherwise a 1-D object array of index arrays
        """
        cluster_indices = []
        for n_clusters in (None, 4, 8, 16, 32, 64, 128):
            with benchmark("Clustering with k={}".format(n_clusters)):
                clusters = self.get_clusters(X, n_clusters=n_clusters)
                print("Cluster sizes:", [len(cluster) for cluster in clusters])
                cluster_indices.extend(clusters)

        cluster_sizes = np.array([len(indices) for indices in cluster_indices])

        subsets = []
        for subset_size in subset_sizes:
            valid_clusters = np.nonzero(cluster_sizes >= subset_size)[0]
            if len(valid_clusters) == 0:
                continue

            cluster_idx = rng.choice(valid_clusters)
            subsets.append(rng.choice(cluster_indices[cluster_idx],
                                      subset_size, replace=False))

        # Recent NumPy releases refuse to build an array from ragged input,
        # so subsets of differing sizes are packed into an object array.
        if len(set(len(subset) for subset in subsets)) > 1:
            packed = np.empty(len(subsets), dtype=object)
            for i, subset in enumerate(subsets):
                packed[i] = subset
            return packed
        return np.array(subsets)

    def get_subsets_by_projection(self, rng, X, subset_sizes):
        """
        Draw subsets that are contiguous windows along one random feature.

        For every requested size a feature column is picked at random, the
        examples are ordered by their value in that column, and a window of
        the requested size around a random position of that ordering is
        returned.

        :param rng: random generator providing choice(), e.g. a RandomState
        :param X: An (N, D) tensor representing N points in D dimensions
        :param subset_sizes: iterable of subset sizes
        :returns: a list with one array of example indices per size
        """
        subsets = []
        for subset_size in subset_sizes:
            feature = rng.choice(X.shape[1])
            column = np.array(list(X[:, feature])).reshape(-1)
            order = np.argsort(column)
            print(order.shape)

            center = rng.choice(X.shape[0])
            # Clamp the window at both ends while keeping its full size
            window_start = max(center - subset_size // 2, 0)
            window_end = min(window_start + subset_size, self.num_train)
            window_start = window_end - subset_size
            subsets.append(order[window_start:window_end])
        print(subsets)
        return subsets

    @phase(5)
    def pick_subsets(self):
        """
        Select all training subsets to be studied, each with a tag.

        Subsets are drawn, in this order, as: random subsets, same-class
        subsets, tails of the predicted influence on each fixed test point
        (in three size phases), clusters of the features, clusters of the
        training gradients, and windows along a single feature.

        :returns: dict with 'subset_tags' and 'subset_indices', two parallel
                  lists
        :raises ValueError: if the subset choice type is neither 'types' nor
                            'range'
        """
        rng = np.random.RandomState(self.config['subset_seed'])

        tagged_subsets = []
        # The builtin int replaces the np.int alias, which recent NumPy
        # releases no longer provide.
        if self.subset_choice_type == "types":
            subset_sizes = np.ones(self.num_subsets).astype(int) * self.subset_size
        elif self.subset_choice_type == "range":
            subset_sizes = np.linspace(self.subset_min_size,
                                       self.subset_max_size,
                                       self.num_subsets).astype(int)
        else:
            raise ValueError("Unknown subset_choice_type: {!r}".format(
                self.subset_choice_type))

        with benchmark("Random subsets"):
            random_subsets = self.get_random_subsets(rng, subset_sizes)
            tagged_subsets += [('random', s) for s in random_subsets]

        with benchmark("Same class subsets"):
            same_class_subsets = self.get_same_class_subsets(rng, self.train.labels, subset_sizes)
            # Every member shares one label, so the first member's label
            # identifies the subset.
            same_class_subset_labels = [self.train.labels[s[0]] for s in same_class_subsets]
            tagged_subsets += [('random_same_class-{}'.format(label), s)
                               for s, label in zip(same_class_subsets, same_class_subset_labels)]

        with benchmark("Scalar infl tail subsets"):
            # 1) pick x*N out of the top 1.5 * 0.025 * N where x in (0.0025 - 0.025)
            # 2) pick x*N out of the top 1.5 * 0.1 * N where x in (0.0025 - 0.1)
            # 3) pick x*N out of the top 1.5 * 0.25 * N where x in (0.0025 - 0.25)
            size_1, size_2, size_3, size_4 = [int(self.num_train * x)
                                              for x in (0.0025, 0.025, 0.1, 0.25)]
            subsets_per_phase = self.num_subsets // 3
            subset_size_phases = [
                np.linspace(size_1, size_2, subsets_per_phase).astype(int),
                np.linspace(size_1, size_3, subsets_per_phase).astype(int),
                np.linspace(size_1, size_4, self.num_subsets - 2 * subsets_per_phase).astype(int),
            ]
            for pred_infl, test_idx in zip(self.R['fixed_test_pred_infl'], self.R['fixed_test']):
                # Dedicated loop names: reusing `subset_sizes` here would
                # overwrite the sizes that the later subset types rely on.
                for phase_idx, phase_sizes in enumerate(subset_size_phases, 1):
                    neg_tail_subsets, pos_tail_subsets = self.get_scalar_infl_tails(rng, pred_infl, phase_sizes)
                    tagged_subsets += [('neg_tail_test-{}-{}'.format(phase_idx, test_idx), s) for s in neg_tail_subsets]
                    tagged_subsets += [('pos_tail_test-{}-{}'.format(phase_idx, test_idx), s) for s in pos_tail_subsets]
                print('Found scalar infl tail subsets for test idx {}.'.format(test_idx))

        with benchmark("Same features subsets"):
            same_features_subsets = self.get_subsets_by_clustering(rng, self.train.x, subset_sizes)
            tagged_subsets += [('same_features', s) for s in same_features_subsets]

        with benchmark("Same gradient subsets"):
            same_grad_subsets = self.get_subsets_by_clustering(rng, self.R['train_grad_loss'], subset_sizes)
            tagged_subsets += [('same_grad', s) for s in same_grad_subsets]

        with benchmark("Same feature subsets by windowing"):
            feature_window_subsets = self.get_subsets_by_projection(rng, self.train.x, subset_sizes)
            tagged_subsets += [('feature_window', s) for s in feature_window_subsets]

        subset_tags = [tag for tag, _ in tagged_subsets]
        subset_indices = [subset for _, subset in tagged_subsets]

        return {'subset_tags': subset_tags, 'subset_indices': subset_indices}

    def retrain_subsets(self, subset_start, subset_end):
        """
        Retrain the model with each subset in [subset_start, subset_end) removed.

        Every retraining warm-starts from the saved 'initial' model with the
        subset's examples given weight zero. The retrained model is saved as
        'subset_<i>' and its per-example losses (and margins for two classes)
        on the train and test sets are recorded.

        :param subset_start: index of the first subset to retrain on
        :param subset_end: index one past the last subset to retrain on
        :returns: dict of 'subset_*_losses' and, for two classes,
                  'subset_*_margins' arrays with one row per subset
        """
        print("Retraining subsets {}-{}".format(subset_start, subset_end))
        # Workers might need to reload results
        self.load_phases([0, 5], verbose=False)

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']

        subset_tags = self.R['subset_tags']
        subset_indices = self.R['subset_indices']
        num_subsets = len(subset_indices)

        train_losses, test_losses = [], []
        train_margins, test_margins = [], []
        start_time = time.time()
        batch = subset_indices[subset_start:subset_end]
        for i, remove_indices in enumerate(batch, subset_start):
            print('Retraining model for subset {} out of {} (tag={})'.format(i, num_subsets, subset_tags[i]))

            # Weight 1 keeps an example, weight 0 removes it
            sample_weights = np.ones(self.num_train)
            sample_weights[remove_indices] = 0

            model.warm_fit(self.train, sample_weights, l2_reg=l2_reg)
            model.save('subset_{}'.format(i))
            train_losses.append(model.get_indiv_loss(self.train, verbose=False))
            test_losses.append(model.get_indiv_loss(self.test, verbose=False))
            if model.num_classes == 2:
                train_margins.append(model.get_indiv_margin(self.train, verbose=False))
                test_margins.append(model.get_indiv_margin(self.test, verbose=False))

            # Progress estimate based on the average time so far
            done = (i + 1) - subset_start
            time_per_retrain = (time.time() - start_time) / done
            remaining_time = time_per_retrain * (num_subsets - (i + 1))
            print('Each retraining takes {} s, {} s remaining'.format(time_per_retrain, remaining_time))

        res = {
            'subset_train_losses': np.array(train_losses),
            'subset_test_losses': np.array(test_losses),
        }
        if self.num_classes == 2:
            res['subset_train_margins'] = np.array(train_margins)
            res['subset_test_margins'] = np.array(test_margins)

        return res

    @phase(6)
    def retrain(self):
        """
        Distribute the subset retraining over the task queue.

        The subsets are cut into consecutive [start, end) ranges of at most
        32 subsets, each submitted as one 'retrain_subsets' task; the
        per-task results are merged by the task queue's collate_results.
        """
        batch_size = 32
        total = len(self.R['subset_indices'])

        batches = []
        for start in range(0, total, batch_size):
            # The last batch may be shorter than batch_size
            batches.append((start, min(start + batch_size, total)))

        results = self.task_queue.execute('retrain_subsets', batches)
        return self.task_queue.collate_results(results)

    def self_pred_infl(self, subset_start, subset_end):
        """
        Predict the self-influence of each subset in [subset_start, subset_end).

        For every subset the summed training loss gradient of its members is
        multiplied by the inverse Hessian of the 'initial' model. That product
        is stored as the predicted parameter change, and its dot product with
        the summed gradient as the predicted self-influence. For two classes
        the same product is also dotted with the subset's total margin
        gradient.

        :param subset_start: index of the first subset to process
        :param subset_end: index one past the last subset to process
        :returns: dict with 'subset_pred_dparam', 'subset_self_pred_infl' and,
                  for two classes, 'subset_self_pred_margin_infl'
        """
        self.load_phases([0, 1, 3, 5], verbose=False)

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']

        subset_tags = self.R['subset_tags']
        subset_indices = self.R['subset_indices']
        num_subsets = len(subset_indices)

        hessian = self.R['hessian']
        inverse_hvp_args = dict(
            hessian_reg=hessian,
            dataset=self.train,
            l2_reg=l2_reg,
            verbose=False,
            verbose_cg=True,
            inverse_vp_method=self.config['inverse_vp_method'],
        )
        train_grad_loss = self.R['train_grad_loss']

        # It is important that the influence gets calculated before the model is retrained,
        # so that the parameters are the original parameters
        pred_dparams = []
        loss_infls = []
        margin_infls = []
        start_time = time.time()
        batch = subset_indices[subset_start:subset_end]
        for i, remove_indices in enumerate(batch, subset_start):
            print('Computing self-influences for subset {} out of {} (tag={})'.format(i, num_subsets, subset_tags[i]))

            grad_loss = np.sum(train_grad_loss[remove_indices, :], axis=0)
            H_inv_grad_loss = model.get_inverse_hvp(grad_loss.reshape(-1, 1), **inverse_hvp_args).reshape(-1)
            pred_dparams.append(H_inv_grad_loss)
            loss_infls.append(np.dot(grad_loss, H_inv_grad_loss))

            if model.num_classes == 2:
                grad_margin = model.get_total_grad_margin(self.train.subset(remove_indices))
                margin_infls.append(np.dot(grad_margin, H_inv_grad_loss))

            # Progress estimate based on the average time so far
            done = (i + 1) - subset_start
            time_per_retrain = (time.time() - start_time) / done
            remaining_time = time_per_retrain * (num_subsets - (i + 1))
            print('Each self-influence calculation takes {} s, {} s remaining'.format(time_per_retrain, remaining_time))

        res = {
            'subset_pred_dparam': np.array(pred_dparams),
            'subset_self_pred_infl': np.array(loss_infls),
        }
        if self.num_classes == 2:
            res['subset_self_pred_margin_infl'] = np.array(margin_infls)

        return res

    @phase(7)
    def compute_self_pred_infl(self):
        """
        Distribute the predicted self-influence computation over the task queue.

        The subsets are cut into consecutive [start, end) ranges of at most
        32 subsets, each submitted as one 'self_pred_infl' task; the per-task
        results are merged by the task queue's collate_results.
        """
        batch_size = 32
        total = len(self.R['subset_indices'])

        batches = []
        for start in range(0, total, batch_size):
            # The last batch may be shorter than batch_size
            batches.append((start, min(start + batch_size, total)))

        results = self.task_queue.execute('self_pred_infl', batches)
        return self.task_queue.collate_results(results)

    @phase(8)
    def compute_actl_infl(self):
        """
        Collate actual and predicted subset influences from stored results.

        For the losses, and for the margins when there are two classes, three
        arrays are produced: the actual change at the fixed test points after
        each subset's retraining, the predicted change at the fixed test
        points (per-example predictions summed over the subset), and the
        actual change summed over the subset's own training examples.

        :returns: dict of 'subset_fixed_test_actl_*', 'subset_fixed_test_pred_*'
                  and 'subset_self_actl_*' arrays
        """
        res = dict()

        subset_tags, subset_indices = self.R['subset_tags'], self.R['subset_indices']

        # Helper to collate fixed test infl and subset self infl on a quantity q
        def collate(fixed_test, fixed_test_pred_infl_q,
                    initial_train_q, initial_test_q,
                    subset_train_q, subset_test_q):
            fixed_actl = subset_test_q[:, fixed_test] - initial_test_q[fixed_test]

            fixed_pred = []
            for remove_indices in subset_indices:
                summed = np.sum(fixed_test_pred_infl_q[:, remove_indices], axis=1)
                fixed_pred.append(summed.reshape(-1))

            self_actl = []
            for i, remove_indices in enumerate(subset_indices):
                after = np.sum(subset_train_q[i][remove_indices])
                before = np.sum(initial_train_q[remove_indices])
                self_actl.append(after - before)

            return fixed_actl, np.array(fixed_pred), np.array(self_actl)

        # Each job pairs the three output keys with the six input keys
        jobs = [
            # Influences on loss
            (("subset_fixed_test_actl_infl",
              "subset_fixed_test_pred_infl",
              "subset_self_actl_infl"),
             ("fixed_test", "fixed_test_pred_infl",
              "initial_train_losses", "initial_test_losses",
              "subset_train_losses", "subset_test_losses")),
        ]
        if self.num_classes == 2:
            # Influences on margin
            jobs.append(
                (("subset_fixed_test_actl_margin_infl",
                  "subset_fixed_test_pred_margin_infl",
                  "subset_self_actl_margin_infl"),
                 ("fixed_test", "fixed_test_pred_margin_infl",
                  "initial_train_margins", "initial_test_margins",
                  "subset_train_margins", "subset_test_margins")))

        for out_keys, in_keys in jobs:
            values = collate(*[self.R[key] for key in in_keys])
            for out_key, value in zip(out_keys, values):
                res[out_key] = value

        return res

    def newton_batch(self, subset_start, subset_end):
        """
        Compute Newton influence approximations for a batch of subsets.

        Handles the subsets with index in [subset_start, subset_end). For each
        one it solves for H^{-1} g, where g is the summed training gradient of
        the removed points and H is the Hessian with those points removed,
        using either an explicit Hessian ('explicit') or conjugate-gradient
        style inverse-HVP ('cg'), as selected by config['inverse_hvp_method'].

        Returns a dict with one row per processed subset:
          - 'subset_newton_dparam': the H^{-1} g vectors
          - 'subset_self_newton_infl', 'subset_fixed_test_newton_infl'
          - the '*_margin_infl' counterparts when self.num_classes == 2
          - 'subset_hessian_spectrum' for the explicit method unless
            config['skip_hessian_spectrum'] is set

        Raises:
            ValueError: if config['inverse_hvp_method'] is neither 'explicit'
                nor 'cg' (previously this surfaced as a NameError, or silently
                reused the previous subset's solution).
        """
        self.load_phases([0, 1, 2, 3, 5], verbose=False)

        model = self.get_model()
        model.load('initial')
        l2_reg = self.R['cv_l2_reg']
        res = dict()

        # The Newton approximation is obtained by evaluating
        # -g(|w|, theta_0)^T H(s+w, theta_0)^{-1} g(w, theta_0)
        # where w is the difference in weights. Since we already have the full
        # hessian H_reg(s), we can compute H(w) (with no regularization) and
        # use it to update H_reg(s+w) = H_reg(s) + H(w) instead.

        subset_tags, subset_indices = self.R['subset_tags'], self.R['subset_indices']

        hessian = self.R['hessian']
        train_grad_loss = self.R['train_grad_loss']
        inverse_hvp_method = self.config['inverse_hvp_method']

        # Use a single source of truth for the binary check so the margin
        # gradients computed here and the margin influences computed in the
        # loop are always guarded by the same condition.
        is_binary = self.num_classes == 2

        test_grad_loss = model.get_indiv_grad_loss(self.test.subset(self.R['fixed_test']))
        if is_binary:
            test_grad_margin = model.get_indiv_grad_margin(self.test.subset(self.R['fixed_test']))

        # It is important that the gradients get calculated on the original model
        # so that the parameters are the original parameters
        start_time = time.time()
        subset_newton_dparam = []
        self_newton_infls = []
        self_newton_margin_infls = []
        fixed_test_newton_infls = []
        fixed_test_newton_margin_infls = []
        subset_hessian_spectrum = []
        for i, remove_indices in enumerate(subset_indices[subset_start:subset_end], subset_start):
            print('Computing Newton influences for subset {} out of {} (tag={})'.format(i, len(subset_indices), subset_tags[i]))

            grad_loss = np.sum(train_grad_loss[remove_indices, :], axis=0).reshape(-1, 1)
            if inverse_hvp_method == 'explicit':
                # Weights of -1 make hessian_w the (negated) Hessian contribution
                # of the removed points, so hessian + hessian_w drops them.
                hessian_w = model.get_hessian(self.train.subset(remove_indices),
                                              -np.ones(len(remove_indices)), l2_reg=0, verbose=False)
                H_inv_grad_loss = model.get_inverse_vp(hessian + hessian_w, grad_loss,
                                                       inverse_vp_method=self.config['inverse_vp_method']).reshape(-1)

                if not self.config['skip_hessian_spectrum']:
                    H_inv_H_w = model.get_inverse_vp(hessian, hessian_w,
                                                     inverse_vp_method=self.config['inverse_vp_method'])
                    hessian_spectrum = scipy.linalg.eigvals(H_inv_H_w)
                    subset_hessian_spectrum.append(hessian_spectrum)
            elif inverse_hvp_method == 'cg':
                # Zero sample weight on the removed points excludes them from
                # the Hessian used by the inverse-HVP solve.
                sample_weights = np.ones(self.num_train)
                sample_weights[remove_indices] = 0
                inverse_hvp_args = {
                    'dataset': self.train,
                    'sample_weights': sample_weights,
                    'l2_reg': l2_reg,
                    'verbose': False,
                    'verbose_cg': True,
                }
                H_inv_grad_loss = model.get_inverse_hvp(grad_loss, **inverse_hvp_args).reshape(-1)
            else:
                raise ValueError(
                    "Unknown inverse_hvp_method: {!r} (expected 'explicit' or 'cg')".format(
                        inverse_hvp_method))

            self_newton_infl = np.dot(grad_loss.reshape(-1), H_inv_grad_loss)
            subset_newton_dparam.append(H_inv_grad_loss)
            self_newton_infls.append(self_newton_infl)

            fixed_test_newton_infl = np.dot(test_grad_loss, H_inv_grad_loss)
            fixed_test_newton_infls.append(fixed_test_newton_infl)

            if is_binary:
                # Indicator vector selecting the removed training points
                s = np.zeros(self.num_train)
                s[remove_indices] = 1
                grad_margin = model.get_total_grad_margin(self.train, s)
                self_newton_margin_infl = np.dot(grad_margin, H_inv_grad_loss)
                self_newton_margin_infls.append(self_newton_margin_infl)
                fixed_test_newton_margin_infl = np.dot(test_grad_margin, H_inv_grad_loss)
                fixed_test_newton_margin_infls.append(fixed_test_newton_margin_infl)

            cur_time = time.time()
            time_per_retrain = (cur_time - start_time) / ((i + 1) - subset_start)
            remaining_time = time_per_retrain * (len(subset_indices) - (i + 1))
            print('Each Newton influence calculation takes {} s, {} s remaining'.format(time_per_retrain, remaining_time))

        res['subset_newton_dparam'] = np.array(subset_newton_dparam)
        res['subset_self_newton_infl'] = np.array(self_newton_infls)
        res['subset_fixed_test_newton_infl'] = np.array(fixed_test_newton_infls)
        if is_binary:
            res['subset_self_newton_margin_infl'] = np.array(self_newton_margin_infls)
            res['subset_fixed_test_newton_margin_infl'] = np.array(fixed_test_newton_margin_infls)

        if inverse_hvp_method == 'explicit' and not self.config['skip_hessian_spectrum']:
            res['subset_hessian_spectrum'] = np.array(subset_hessian_spectrum)

        return res

    @phase(9)
    def newton(self):
        """
        Distribute the Newton influence computation over the task queue.

        Returns an empty dict when config['skip_newton'] is set. Otherwise the
        subsets are split into consecutive [start, end) ranges of at most 32
        subsets, each handled by one 'newton_batch' task, and the per-batch
        results are merged with the queue's collate_results.
        """
        if self.config['skip_newton']:
            return dict()

        batch_size = 32
        total = len(self.R['subset_indices'])
        batch_args = [(lo, min(lo + batch_size, total))
                      for lo in range(0, total, batch_size)]

        batch_results = self.task_queue.execute('newton_batch', batch_args)
        return self.task_queue.collate_results(batch_results)

    @phase(10)
    def fixed_test_newton(self):
        """
        No-op phase that contributes no results.

        Its computation was merged into the newton phase to avoid duplicating
        work, so this always returns an empty dict.
        """
        return {}

    @phase(11)
    def param_changes(self):
        """
        Measure how each saved subset model differs from the initial model.

        For every subset i, loads the checkpoint 'subset_{i}' and records:
          - 'subset_dparam': flat parameters minus the initial flat parameters
          - 'subset_train_accuracy' / 'subset_test_accuracy': accuracy of that
            model on self.train / self.test

        Returns a dict of numpy arrays with one row/entry per subset.
        """
        # The later phases do not need task workers
        self.task_queue.notify_exit()

        model = self.get_model()
        model.load('initial')
        initial_param = model.get_params_flat()

        # Only the number of subsets matters here: checkpoints are addressed
        # by subset position, not by the removed indices themselves.
        num_subsets = len(self.R['subset_indices'])

        subset_dparam = []
        subset_train_acc, subset_test_acc = [], []
        for i in range(num_subsets):
            model.load('subset_{}'.format(i))
            subset_dparam.append(model.get_params_flat() - initial_param)
            subset_train_acc.append(model.get_accuracy(self.train))
            subset_test_acc.append(model.get_accuracy(self.test))

        res = dict()
        res['subset_dparam'] = np.array(subset_dparam)
        res['subset_train_accuracy'] = np.array(subset_train_acc)
        res['subset_test_accuracy'] = np.array(subset_test_acc)
        return res

    @phase(12)
    def param_change_norms(self):
        """
        Compute norms of subset gradients and of parameter changes.

        Returns an empty dict when config['skip_param_change_norms'] is set.
        Otherwise returns:
          - 'subset_grad_loss_l2_norm': l2 norm of the summed training
            gradient of each subset
          - '<type>_l2_norm' and '<type>_hessian_norm' for each available
            parameter-change type among 'subset_dparam', 'subset_pred_dparam'
            and 'subset_newton_dparam'

        A parameter-change type that is missing from self.R is skipped. This
        happens for 'subset_newton_dparam' when the newton phase was skipped
        (it then returns an empty dict), which previously raised a KeyError.
        """
        if self.config['skip_param_change_norms']:
            return dict()

        res = dict()
        model = self.get_model()
        l2_reg = self.R['cv_l2_reg']
        model.load('initial')

        # Compute l2 norm of gradient
        train_grad_loss = self.R['train_grad_loss']
        res['subset_grad_loss_l2_norm'] = np.array([
            np.linalg.norm(np.sum(train_grad_loss[remove_indices, :], axis=0))
            for remove_indices in self.R['subset_indices']])

        # Compute l2 norms and norms under the Hessian metric of parameter changes
        hessian = self.R['hessian']
        use_explicit_hessian = self.config['inverse_hvp_method'] == 'explicit'
        for dparam_type in ('subset_dparam', 'subset_pred_dparam', 'subset_newton_dparam'):
            if dparam_type not in self.R:
                continue

            dparam = self.R[dparam_type]
            res[dparam_type + '_l2_norm'] = np.linalg.norm(dparam, axis=1)
            if use_explicit_hessian:
                hvp = np.dot(dparam, hessian)
            else:
                # NOTE(review): dparam is passed transposed and the result is
                # used as-is, whereas z_norms transposes get_inverse_hvp's
                # output back. Confirm get_hvp returns one row per subset.
                hvp = model.get_hvp(dparam.T, self.train, l2_reg=l2_reg)
            # sqrt(d^T H d) per subset
            res[dparam_type + '_hessian_norm'] = np.sqrt(np.sum(dparam * hvp, axis=1))

        return res

    @phase(13)
    def z_norms(self):
        """
        Compute the z vectors of the training set and two norms of each.

        Skipped (empty dict) when config['skip_z_norms'] is set or the problem
        is not binary. Otherwise returns:
          - 'zs': the vectors returned by model.get_zs on the training set
          - 'z_norms': their l2 norms
          - 'z_hessian_norms': sqrt(z^T H^{-1} z) for each z, using the
            model's inverse-HVP routine
        """
        if self.config['skip_z_norms'] or self.num_classes != 2:
            return dict()

        model = self.get_model()
        l2_reg = self.R['cv_l2_reg']
        model.load('initial')

        solver_kwargs = dict(
            hessian_reg=self.R['hessian'],
            dataset=self.train,
            l2_reg=l2_reg,
            verbose=False,
            verbose_cg=True,
            inverse_vp_method=self.config['inverse_vp_method'],
        )

        # z_i = sqrt(sigma''_i) x_i so that H = ZZ^T
        zs = model.get_zs(self.train)
        # The solver takes the vectors transposed; transpose back afterwards.
        zs_ihvp = model.get_inverse_hvp(zs.T, **solver_kwargs).T

        res = dict()
        res['zs'] = zs
        res['z_norms'] = np.linalg.norm(zs, axis=1)
        res['z_hessian_norms'] = np.sqrt(np.sum(zs * zs_ihvp, axis=1))
        return res

    @phase(14)
    def compute_pparam_infl(self):
        """
        Evaluate influences at the predicted parameters.

        For each subset, the model parameters are set to the initial
        parameters plus a predicted parameter change, and the resulting change
        in the fixed test points' loss (and margin, for binary problems) and in
        the subset's own total loss is recorded.

        This is done for the first-order prediction ('pparam', from
        'subset_pred_dparam') and, unless config['skip_newton'] is set, for
        the Newton prediction ('nparam', from 'subset_newton_dparam').
        """
        model = self.get_model()
        res = dict()

        model.load('initial')
        theta_initial = model.get_params_flat()

        subset_tags = self.R['subset_tags']
        subset_indices = self.R['subset_indices']

        fixed_test = self.R['fixed_test']
        fixed_test_ds = self.test.subset(fixed_test)

        initial_fixed_test_loss = self.R['initial_test_losses'][fixed_test]
        is_binary = self.num_classes == 2
        if is_binary:
            initial_fixed_test_margin = self.R['initial_test_margins'][fixed_test]

        def compute_pparam_influences(pred_dparam, pparam_type='pparam'):
            # Change in loss/margin when moving to the predicted parameters
            test_infl, self_infl, test_margin_infl = [], [], []
            for k, removed in enumerate(subset_indices):
                removed_ds = self.train.subset(removed)
                model.set_params_flat(theta_initial + pred_dparam[k, :])

                test_loss = model.get_indiv_loss(fixed_test_ds, verbose=False)
                self_loss = model.get_total_loss(removed_ds, l2_reg=0, verbose=False)
                self_loss_initial = np.sum(self.R['initial_train_losses'][removed])
                test_infl.append(test_loss - initial_fixed_test_loss)
                self_infl.append(self_loss - self_loss_initial)

                if is_binary:
                    test_margin = model.get_indiv_margin(fixed_test_ds, verbose=False)
                    test_margin_infl.append(test_margin - initial_fixed_test_margin)

            res['subset_fixed_test_{}_infl'.format(pparam_type)] = np.array(test_infl)
            res['subset_self_{}_infl'.format(pparam_type)] = np.array(self_infl)
            if is_binary:
                res['subset_fixed_test_{}_margin_infl'.format(pparam_type)] = np.array(test_margin_infl)

        compute_pparam_influences(self.R['subset_pred_dparam'], 'pparam')
        if not self.config['skip_newton']:
            compute_pparam_influences(self.R['subset_newton_dparam'], 'nparam')

        return res

    def plot_z_norms(self, save_and_close=False):
        """
        Plot the distribution of the stored z-norms.

        Does nothing when 'z_norms' is not present in the results. With
        save_and_close=True the figure is written to 'z-norms.png' in the plot
        directory and then closed.
        """
        if 'z_norms' in self.R:
            figure, axes = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
            plot_distribution(axes[0][0], self.R['z_norms'],
                              title='Z-norms', xlabel='Z-norm',
                              subtitle=self.get_subtitle())
            if save_and_close:
                target = os.path.join(self.plot_dir, 'z-norms.png')
                figure.savefig(target, bbox_inches='tight')
                plt.close(figure)

    def get_simple_subset_tags(self):
        """
        Return the subset tags reduced to the part before the first '-'.

        Tags without a '-' are returned unchanged. The result is a list (not a
        lazy `map` object) so callers can iterate it more than once or take
        its length; on Python 3 a `map` would be exhausted after one pass.
        """
        # str.split always yields at least one element, so element 0 is the
        # whole tag when no '-' is present.
        return [tag.split('-')[0] for tag in self.R['subset_tags']]

    def get_subtitle(self):
        """
        Build the plot subtitle describing the dataset and subset settings.

        Returns:
            A string with the dataset id, the number of subsets and either the
            single relative size ("types") or the min-max relative size range
            ("range"), depending on self.subset_choice_type.

        Raises:
            ValueError: if self.subset_choice_type is neither "types" nor
                "range" (previously an UnboundLocalError).
        """
        if self.subset_choice_type == "types":
            return '{}, {} subsets per type, proportion {}'.format(
                self.dataset_id, self.num_subsets, self.config['subset_rel_size'])

        if self.subset_choice_type == "range":
            return '{}, {} subsets per type, proportion {}-{}'.format(
                self.dataset_id, self.num_subsets,
                self.config['subset_min_rel_size'],
                self.config['subset_max_rel_size'])

        raise ValueError(
            'Unknown subset_choice_type: {!r}'.format(self.subset_choice_type))

    def plot_group_influence(self,
                             influence_type, # 'self' or 'fixed-test-{:test_idx}'
                             quantity, # 'loss' or 'margin'
                             x, y,
                             x_approx_type, y_approx_type, # 'actl', 'pred', 'newton', 'pparam', 'nparam'
                             save_and_close=False):
        """
        Scatter one influence estimate against another, one point per subset.

        A first figure is always drawn. If the value ranges of x and y are
        degenerate or differ by more than a factor of 10, a second figure is
        drawn with equal=False and saved with an '_imbalanced' suffix.

        Args:
            influence_type: 'self' or 'fixed-test-<test_idx>'.
            quantity: name of the plotted quantity, appended to the title.
            x, y: influence values to compare.
            x_approx_type, y_approx_type: one of 'actl', 'pred', 'newton',
                'pparam', 'nparam'; selects the axis labels.
            save_and_close: when True, save each figure into self.plot_dir and
                close it.

        Raises:
            ValueError: if influence_type has neither recognized prefix
                (previously an UnboundLocalError).
            KeyError: if an approximation type is not recognized.
        """
        # Materialize the tags: they may be handed to the plotting helper
        # twice, and a lazy iterator would be empty the second time.
        subset_tags = list(self.get_simple_subset_tags())

        if influence_type.startswith('self'):
            title = 'Group self-influence on '
        elif influence_type.startswith('fixed-test-'):
            test_idx = influence_type.rsplit('-',  1)[-1]
            title = "Group influence on test pt {}'s ".format(test_idx)
        else:
            raise ValueError(
                'Unknown influence_type: {!r}'.format(influence_type))
        title += quantity

        approx_type_to_label = { 'actl': 'Actual influence',
                                 'pred': 'First-order influence',
                                 'newton': 'Newton influence',
                                 'pparam': 'Predicted parameter influence',
                                 'nparam': 'Newton predicted parameter influence' }
        xlabel = approx_type_to_label[x_approx_type]
        ylabel = approx_type_to_label[y_approx_type]

        def draw(filename, **extra_kwargs):
            # Draw one correlation figure and optionally save/close it.
            fig, ax = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
            plot_influence_correlation(ax[0][0], x, y,
                                       label=subset_tags,
                                       title=title,
                                       subtitle=self.get_subtitle(),
                                       xlabel=xlabel,
                                       ylabel=ylabel,
                                       **extra_kwargs)
            if save_and_close:
                fig.savefig(os.path.join(self.plot_dir, filename), bbox_inches='tight')
                plt.close(fig)

        draw('{}_{}_{}-{}.png'.format(
            influence_type, quantity, x_approx_type, y_approx_type))

        # The zero-range checks come first so the ratios below are only
        # evaluated when both ranges are safely non-zero.
        range_x, range_y = np.max(x) - np.min(x), np.max(y) - np.min(y)
        imbalanced = (np.abs(range_x) < 1e-9 or np.abs(range_y) < 1e-9 or
                      range_x / range_y < 1e-1 or range_y / range_x < 1e-1)
        if imbalanced:
            draw('{}_{}_{}-{}_imbalanced.png'.format(
                     influence_type, quantity, x_approx_type, y_approx_type),
                 equal=False)

    def plot_self_influence(self, save_and_close=False):
        """
        Plot actual vs. first-order predicted group self-influence.

        Returns without plotting when either the actual or the predicted
        self-influence on the loss is missing from the results. The margin
        comparison is added for binary problems.
        """
        for required_key in ('subset_self_actl_infl', 'subset_self_pred_infl'):
            if required_key not in self.R:
                return

        def compare(quantity, actl_key, pred_key):
            self.plot_group_influence('self', quantity,
                                      self.R[actl_key],
                                      self.R[pred_key],
                                      'actl', 'pred',
                                      save_and_close=save_and_close)

        compare('loss', 'subset_self_actl_infl', 'subset_self_pred_infl')
        if self.num_classes == 2:
            compare('margin',
                    'subset_self_actl_margin_infl',
                    'subset_self_pred_margin_infl')

    def plot_fixed_test_influence(self, save_and_close=False):
        """
        Plot actual vs. first-order predicted group influence per test point.

        Returns without plotting when either the actual or the predicted
        fixed-test influence on the loss is missing from the results. For each
        fixed test point (column i of the stored arrays) the loss comparison
        is plotted, plus the margin comparison for binary problems.
        """
        if 'subset_fixed_test_actl_infl' not in self.R: return
        if 'subset_fixed_test_pred_infl' not in self.R: return

        # The subset tags are looked up by plot_group_influence itself, so
        # nothing needs to be fetched here.
        for i, test_idx in enumerate(self.R['fixed_test']):
            influence_type = 'fixed-test-{}'.format(test_idx)
            self.plot_group_influence(influence_type, 'loss',
                                      self.R['subset_fixed_test_actl_infl'][:, i],
                                      self.R['subset_fixed_test_pred_infl'][:, i],
                                      'actl', 'pred',
                                      save_and_close=save_and_close)

            if self.num_classes == 2:
                self.plot_group_influence(influence_type, 'margin',
                                          self.R['subset_fixed_test_actl_margin_infl'][:, i],
                                          self.R['subset_fixed_test_pred_margin_infl'][:, i],
                                          'actl', 'pred',
                                          save_and_close=save_and_close)

    def plot_newton_influence(self, save_and_close=False):
        """
        Plot Newton influence against actual and first-order influence.

        Returns without plotting when the Newton self-influence or the Newton
        fixed-test influence is missing from the results. Each comparison
        yields two figures: actual vs. Newton and first-order vs. Newton.
        Self-influence is compared on the loss; each fixed test point is
        compared on the loss and, for binary problems, on the margin.
        """
        needed = ('subset_self_newton_infl', 'subset_fixed_test_newton_infl')
        for key in needed:
            if key not in self.R:
                return

        def compare_newton(influence_type, quantity, actl, pred, newton):
            for baseline, baseline_type in ((actl, 'actl'), (pred, 'pred')):
                self.plot_group_influence(influence_type, quantity,
                                          baseline, newton,
                                          baseline_type, 'newton',
                                          save_and_close=save_and_close)

        compare_newton('self', 'loss',
                       self.R['subset_self_actl_infl'],
                       self.R['subset_self_pred_infl'],
                       self.R['subset_self_newton_infl'])

        for col, test_idx in enumerate(self.R['fixed_test']):
            influence_type = 'fixed-test-{}'.format(test_idx)
            compare_newton(influence_type, 'loss',
                           self.R['subset_fixed_test_actl_infl'][:, col],
                           self.R['subset_fixed_test_pred_infl'][:, col],
                           self.R['subset_fixed_test_newton_infl'][:, col])

            if self.num_classes == 2:
                compare_newton(influence_type, 'margin',
                               self.R['subset_fixed_test_actl_margin_infl'][:, col],
                               self.R['subset_fixed_test_pred_margin_infl'][:, col],
                               self.R['subset_fixed_test_newton_margin_infl'][:, col])

    def plot_pparam_influence(self, save_and_close=False):
        """
        Plot actual influence against predicted-parameter influences.

        Handles both the 'pparam' and 'nparam' variants; a variant is skipped
        when its self-influence or fixed-test influence is missing from the
        results. For each variant the self-influence on the loss is plotted,
        then each fixed test point's loss and, for binary problems, margin.
        """
        for pparam_type in ('pparam', 'nparam'):
            key_self = 'subset_self_' + pparam_type + '_infl'
            key_test = 'subset_fixed_test_' + pparam_type + '_infl'
            key_test_margin = 'subset_fixed_test_' + pparam_type + '_margin_infl'
            if key_self not in self.R or key_test not in self.R:
                continue

            self.plot_group_influence('self', 'loss',
                                      self.R['subset_self_actl_infl'],
                                      self.R[key_self],
                                      'actl', pparam_type,
                                      save_and_close=save_and_close)

            for col, test_idx in enumerate(self.R['fixed_test']):
                influence_type = 'fixed-test-{}'.format(test_idx)
                self.plot_group_influence(influence_type, 'loss',
                                          self.R['subset_fixed_test_actl_infl'][:, col],
                                          self.R[key_test][:, col],
                                          'actl', pparam_type,
                                          save_and_close=save_and_close)

                if self.num_classes != 2:
                    continue
                self.plot_group_influence(influence_type, 'margin',
                                          self.R['subset_fixed_test_actl_margin_infl'][:, col],
                                          self.R[key_test_margin][:, col],
                                          'actl', pparam_type,
                                          save_and_close=save_and_close)

    def plot_subset_sizes(self, save_and_close=False):
        """
        Plot several quantities against subset size.

        Only applies when self.subset_choice_type is "range". Draws the
        predicted group self-influence, the predicted group influence on each
        fixed test point, and, if both accuracy arrays are present in the
        results, the train and test accuracy. With save_and_close=True each
        figure is saved into self.plot_dir and closed.
        """
        if self.subset_choice_type != "range": return

        def finish(figure, filename):
            # Optionally persist and release a finished figure.
            if save_and_close:
                figure.savefig(os.path.join(self.plot_dir, filename),
                               bbox_inches='tight')
                plt.close(figure)

        # Predicted group self-influence
        figure, axes = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
        plot_against_subset_size(axes[0][0],
                                 self.R['subset_tags'],
                                 self.R['subset_indices'],
                                 self.R['subset_self_pred_infl'],
                                 title='Group self-influence',
                                 ylabel='Self-influence',
                                 subtitle=self.get_subtitle())
        finish(figure, 'sizes_self_loss.png')

        # Predicted group influence on each fixed test point
        for col, test_idx in enumerate(self.R['fixed_test']):
            figure, axes = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
            plot_against_subset_size(axes[0][0],
                                     self.R['subset_tags'],
                                     self.R['subset_indices'],
                                     self.R['subset_fixed_test_pred_infl'][:, col],
                                     title='Group influence on test pt {}'.format(test_idx),
                                     subtitle=self.get_subtitle())
            finish(figure, 'sizes_fixed-test-{}_loss.png'.format(test_idx))

        # Accuracy plots need both accuracy arrays
        if 'subset_train_accuracy' not in self.R: return
        if 'subset_test_accuracy' not in self.R: return

        figure, axes = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
        plot_against_subset_size(axes[0][0],
                                 self.R['subset_tags'],
                                 self.R['subset_indices'],
                                 self.R['subset_train_accuracy'],
                                 title='Train accuracy by subset size',
                                 ylabel='Train accuracy',
                                 subtitle=self.get_subtitle())
        finish(figure, 'sizes_train-accuracy.png')

        figure, axes = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
        plot_against_subset_size(axes[0][0],
                                 self.R['subset_tags'],
                                 self.R['subset_indices'],
                                 self.R['subset_test_accuracy'],
                                 title='Test accuracy by subset size',
                                 ylabel='Test accuracy',
                                 subtitle=self.get_subtitle())
        finish(figure, 'sizes_test-accuracy.png')

    def plot_subset_hessian(self, save_and_close=False):
        """
        Plot the distribution of the largest eigenvalue magnitude per subset.

        Does nothing when 'subset_hessian_spectrum' is missing from the
        results. For every subset the maximum absolute value over its stored
        spectrum is taken. With save_and_close=True the figure is saved as
        'subset-hessian-max-eig.png' in self.plot_dir and closed.
        """
        if 'subset_hessian_spectrum' not in self.R: return

        fig, ax = plt.subplots(1, 1, figsize=(8, 8), squeeze=False)
        # np.abs turns (possibly complex) eigenvalues into magnitudes
        max_eigenvalue = np.max(np.abs(self.R['subset_hessian_spectrum']), axis=1)
        # Raw string: "\l" is an invalid escape sequence in a normal literal.
        # The resulting text is unchanged.
        plot_distribution(ax[0][0],
                          max_eigenvalue,
                          title=r"Maximum eigenvalue of $H_{\lambda}(s)^{-1} H(w)$",
                          subtitle=self.get_subtitle(),
                          xlabel='Eigenvalue')
        if save_and_close:
            fig.savefig(os.path.join(self.plot_dir, 'subset-hessian-max-eig.png'),
                        bbox_inches='tight')
            plt.close(fig)

    def plot_all(self, save_and_close=False):
        """
        Run every plotting method of this experiment in a fixed order.

        save_and_close is forwarded positionally to each plotting method.
        """
        plot_methods = (
            'plot_self_influence',
            'plot_fixed_test_influence',
            'plot_newton_influence',
            'plot_pparam_influence',
            'plot_z_norms',
            'plot_subset_sizes',
            'plot_subset_hessian',
        )
        for method_name in plot_methods:
            getattr(self, method_name)(save_and_close)
# Example no. 8
# 0
class Counterexamples(Experiment):
    """
    Synthesize toy datasets and find counterexamples to possible
    properties of influence approximations
    """
    def __init__(self, config, out_dir=None):
        """
        Set up the experiment and register its distributable tasks.

        Stores config['dataset_id'], creates a TaskQueue rooted at the 'tasks'
        directory under self.base_dir (honoring config['master_only']), and
        registers the two batch tasks under their method names.
        """
        super(Counterexamples, self).__init__(config, out_dir)
        self.dataset_id = config['dataset_id']

        self.task_queue = TaskQueue(
            os.path.join(self.base_dir, 'tasks'),
            master_only=self.config['master_only'])

        # Each task is registered under the name of the method implementing it.
        for task_name in ('retrain_and_newton_batch',
                          'compute_cex_test_infl_batch'):
            self.task_queue.define_task(task_name, getattr(self, task_name))

    # Class-level identifier string for this experiment; how the base class
    # consumes it is not visible in this part of the file.
    experiment_id = "counterexamples"

    @property
    def run_id(self):
        """Run identifier: the dataset id rendered as a string."""
        template = "{}"
        return template.format(self.dataset_id)

    @phase(0)
    def generate_datasets(self):
        """
        Synthesize all toy datasets and return them in one result dict.

        Every dataset is stored under '<name>_train_X', '<name>_train_Y',
        '<name>_test_X' and '<name>_test_Y' for the names gauss, fixed,
        repeats, gauss2, gauss3, gauss4, ortho2 and gauss5, plus the extra
        keys 'repeats_N_unique' and 'ortho2_ids'.

        All randomness comes from a single RandomState seeded with
        config['seed'], so the generated data depends on the exact order of
        the draws below; reordering or removing any draw changes every
        dataset generated after it.
        """
        res = dict()

        rng = np.random.RandomState(self.config['seed'])

        # Separated Gaussian mixture
        def generate_gaussian_mixture(N_per_class, D, axis):
            # Unit-variance Gaussians centered at +axis/2 (label 0) and
            # -axis/2 (label 1), N_per_class points each, jointly shuffled.
            X_pos = rng.normal(0, 1, size=(N_per_class, D)) + axis / 2
            X_neg = rng.normal(0, 1, size=(N_per_class, D)) - axis / 2
            X = np.vstack([X_pos, X_neg])
            Y = np.hstack([np.zeros(N_per_class), np.ones(N_per_class)])
            indices = np.arange(Y.shape[0])
            rng.shuffle(indices)
            return X[indices, :], Y[indices]

        N_per_class, D = 20, 5
        separator = rng.normal(0, 1, size=(D, ))
        # Random direction rescaled to norm 1: the distance between class means
        separator = separator / np.linalg.norm(separator) * 1
        res['gauss_train_X'], res['gauss_train_Y'] = generate_gaussian_mixture(
            N_per_class, D, separator)
        res['gauss_test_X'], res['gauss_test_Y'] = generate_gaussian_mixture(
            N_per_class, D, separator)

        # Fixed dataset
        # Three copies of each axis-aligned point with its own label, plus two
        # in-between points carrying the opposite label of the nearer axis.
        X_fixed = np.array([[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
        Y_fixed = np.array([0, 0, 0, 1, 1, 1])
        X_confuse = np.array([[0.75, 0.25], [0.25, 0.75]])
        Y_confuse = np.array([1, 0])
        X = np.vstack([X_fixed, X_confuse])
        Y = np.hstack([Y_fixed, Y_confuse])
        indices = np.arange(Y.shape[0])
        rng.shuffle(indices)
        X, Y = X[indices, :], Y[indices]

        # Train and test share the same arrays
        res['fixed_train_X'], res['fixed_train_Y'] = X, Y
        res['fixed_test_X'], res['fixed_test_Y'] = X, Y

        # Repeats dataset
        N_random, N_unique, D = 40, 20, 20
        # Build N_unique mutually orthogonal unit vectors, one row at a time
        X_unique = rng.normal(0, 1, (1, D))
        X_unique /= np.linalg.norm(X_unique)
        while X_unique.shape[0] < N_unique:
            X_new = rng.normal(0, 1, (1, D))
            new_rank = np.linalg.matrix_rank(np.vstack([X_unique, X_new]))
            # Reject candidates that do not increase the rank
            if new_rank == X_unique.shape[0]: continue
            # Remove the component lying in the span of the existing rows
            X_new -= np.dot(np.dot(X_unique.T, X_unique), X_new.T).T
            # Reject candidates that are numerically inside that span
            if np.linalg.norm(X_new) < 1e-3: continue
            X_new /= np.linalg.norm(X_new)
            X_unique = np.vstack([X_unique, X_new])
        # NOTE: `axis` is not used afterwards, but this draw advances the RNG
        # stream; removing it would change all data generated below.
        axis = rng.normal(0, 1, (D, ))
        Y_unique = rng.randint(0, 2, (N_unique, ))

        # The i-th unique point (0-based) appears i + 1 times with its label
        X, Y = np.zeros((0, D)), np.zeros(0)
        for i in range(N_unique):
            X = np.vstack(
                [X, np.repeat(X_unique[np.newaxis, i, :], i + 1, axis=0)])
            Y = np.hstack([Y, np.repeat(Y_unique[i], i + 1)])

        # Append small-scale (std 0.1) random points with random labels
        X_random = rng.normal(0, 0.1, (N_random, D))
        Y_random = rng.randint(0, 2, (N_random, ))
        X = np.vstack([X, X_random])
        Y = np.hstack([Y, Y_random])

        # Train and test share the same arrays
        res['repeats_train_X'], res['repeats_train_Y'] = X, Y
        res['repeats_test_X'], res['repeats_test_Y'] = X, Y
        res['repeats_N_unique'] = N_unique

        # Separated Gaussian mixture, high dimension
        N_per_class, D = 20, 10
        separator = rng.normal(0, 1, size=(D, ))
        # Scale 0 makes the separator the zero vector, so both classes are
        # drawn from the same distribution here.
        separator = separator / np.linalg.norm(separator) * 0
        res['gauss2_train_X'], res[
            'gauss2_train_Y'] = generate_gaussian_mixture(
                N_per_class, D, separator)
        res['gauss2_test_X'], res['gauss2_test_Y'] = generate_gaussian_mixture(
            N_per_class, D, separator)

        # 2N < D
        N_per_class, D = 40, 100
        separator = rng.normal(0, 1, size=(D, ))
        separator = separator / np.linalg.norm(separator) * 0.1
        res['gauss3_train_X'], res[
            'gauss3_train_Y'] = generate_gaussian_mixture(
                N_per_class, D, separator)
        res['gauss3_test_X'], res['gauss3_test_Y'] = generate_gaussian_mixture(
            N_per_class, D, separator)

        # N = D
        N_per_class, D = 60, 60
        separator = rng.normal(0, 1, size=(D, ))
        separator = separator / np.linalg.norm(separator) * 0.5
        res['gauss4_train_X'], res[
            'gauss4_train_Y'] = generate_gaussian_mixture(
                N_per_class, D, separator)
        res['gauss4_test_X'], res['gauss4_test_Y'] = generate_gaussian_mixture(
            N_per_class, D, separator)

        # Orthogonal with different distances from origin
        D = 2
        X = np.zeros((0, D))
        Y = np.zeros((0, ))
        Xa = np.eye(D)
        unique_ids = []
        for i in range(D):
            # Per-axis settings, indexed by the axis number i
            repeats_r = (40, 40)[i]
            repeats_s = (20, 20)[i]
            r = (0.25 * 0.3, 1)[i]
            s = (0.5 * 0.3, 0.1)[i]

            # On axis i: repeats_r copies at distance r (label 0) followed by
            # repeats_s copies at distance s (label 1)
            X = np.vstack([
                X,
                np.repeat(Xa[i, :][np.newaxis, :] * r, repeats_r, axis=0),
                np.repeat(Xa[i, :][np.newaxis, :] * s, repeats_s, axis=0)
            ])
            Y = np.hstack([Y, np.full(repeats_r, 0)])
            Y = np.hstack([Y, np.full(repeats_s, 1)])
            # Ids 2*i and 2*i + 1 mark the two groups of identical points
            unique_ids.extend([2 * i] * repeats_r)
            unique_ids.extend([2 * i + 1] * repeats_s)
        res['ortho2_train_X'], res['ortho2_train_Y'] = X, Y
        res['ortho2_test_X'], res['ortho2_test_Y'] = X, Y
        res['ortho2_ids'] = np.array(unique_ids)

        # Gaussian on one plane and orthogonal elsewhere
        N_random, N_per_class, D = 20, 40, 10
        separator = rng.normal(0, 1, size=(D, ))
        separator = separator / np.linalg.norm(separator) * 1
        X, Y = generate_gaussian_mixture(N_per_class, D, separator)
        # Pad the mixture with an extra all-zero coordinate
        X = np.hstack([X, np.full((X.shape[0], 1), 0)])
        # Random points that also use the extra coordinate, with random labels
        X_random = rng.normal(0, 1, (N_random, D + 1))
        Y_random = rng.randint(0, 2, (N_random, ))
        X = np.vstack([X, X_random])
        Y = np.hstack([Y, Y_random])
        res['gauss5_train_X'], res['gauss5_train_Y'] = X, Y
        res['gauss5_test_X'], res['gauss5_test_Y'] = X, Y

        return res

    def get_dataset(self, dataset_id=None):
        """
        Return the train/test datasets for dataset_id, building them on demand.

        Args:
            dataset_id: name of the dataset; defaults to self.dataset_id.

        Returns:
            A base.Datasets with train and test DataSet objects built from the
            '<dataset_id>_{train,test}_{X,Y}' entries of self.R, and no
            validation set. Results are cached per dataset id in
            self.datasets.

        Raises:
            ValueError: if any of the four arrays for dataset_id is missing
                from self.R. The message names the requested dataset.
        """
        dataset_id = dataset_id if dataset_id is not None else self.dataset_id
        if not hasattr(self, 'datasets'):
            self.datasets = dict()
        if dataset_id not in self.datasets:
            ds_keys = [
                '{}_{}'.format(dataset_id, key)
                for key in ('train_X', 'train_Y', 'test_X', 'test_Y')
            ]
            if any(ds_key not in self.R for ds_key in ds_keys):
                raise ValueError(
                    'Dataset {} has not been generated'.format(dataset_id))
            train_X, train_Y, test_X, test_Y = [
                self.R[ds_key] for ds_key in ds_keys
            ]
            train = DataSet(train_X, train_Y)
            test = DataSet(test_X, test_Y)
            self.datasets[dataset_id] = base.Datasets(train=train,
                                                      test=test,
                                                      validation=None)
        return self.datasets[dataset_id]

    def get_model(self, dataset_id=None):
        """Return the logistic-regression model, constructing it on first call.

        The model is stored on ``self.model``. Once that attribute exists it
        is returned unchanged and ``dataset_id`` is not consulted again.

        Args:
            dataset_id: Forwarded to ``get_dataset`` when the model has to be
                built; its training split is used to infer the architecture.
        """
        if hasattr(self, 'model'):
            return self.model

        train_split = self.get_dataset(dataset_id).train
        config = LogisticRegression.default_config()
        config['arch'] = LogisticRegression.infer_arch(train_split)
        self.model = LogisticRegression(
            config, os.path.join(self.base_dir, 'models'))
        return self.model

    @phase(1)
    def training(self):
        """Fit the base model on the full training split and record its stats.

        The fitted model is saved under the name ``'initial'``. The returned
        dict holds the L2 regularisation used (``num_examples * 1e-3``), the
        per-example losses, margins and their gradients plus the accuracy for
        both splits, and the training Hessian.
        """
        datasets = self.get_dataset()
        model = self.get_model()
        train, test = datasets.train, datasets.test

        l2_reg = train.num_examples * 1e-3
        out = {'l2_reg': l2_reg}

        with benchmark("Training original model"):
            model.fit(train, l2_reg=l2_reg)
            model.print_model_eval(datasets, l2_reg=l2_reg)
            model.save('initial')

        # Dict displays evaluate their values top to bottom, so the model is
        # queried in the order the keys are listed.
        out.update({
            'train_losses': model.get_indiv_loss(train),
            'train_margins': model.get_indiv_margin(train),
            'train_accuracy': model.get_accuracy(train),
            'test_losses': model.get_indiv_loss(test),
            'test_margins': model.get_indiv_margin(test),
            'test_accuracy': model.get_accuracy(test),
        })

        with benchmark("Computing gradients"):
            out.update({
                'train_grad_losses': model.get_indiv_grad_loss(train),
                'train_grad_margins': model.get_indiv_grad_margin(train),
                'test_grad_losses': model.get_indiv_grad_loss(test),
                'test_grad_margins': model.get_indiv_grad_margin(test),
            })

        out['hessian'] = model.get_hessian(train, l2_reg=l2_reg)

        return out

    @phase(2)
    def pick_subsets(self):
        """Enumerate the training-index subsets to remove, per dataset type.

        * ``"repeats"``: for each ``i`` below ``repeats_N_unique``, every
          prefix (sizes 1 to ``i + 1``) of the index run starting at
          ``i * (i + 1) // 2``.
        * ``"ortho2"``: for each distinct value in ``ortho2_ids``, every
          proper prefix of the indices carrying that value (the full group
          is never included).
        * anything else: all index combinations whose size lies in a
          dataset-specific ``[size_min, size_max]`` range.

        Returns:
            ``{'subset_indices': [...]}``.
        """
        datasets = self.get_dataset()
        name = self.dataset_id

        if name == "repeats":
            n_unique = self.R['repeats_N_unique']
            chosen = []
            for i in range(n_unique):
                first = i * (i + 1) // 2
                for size in range(1, (i + 1) + 1):
                    chosen.append(list(range(first, first + size)))
        elif name == "ortho2":
            chosen = []
            for group in np.unique(self.R['ortho2_ids']):
                members = np.nonzero(self.R['ortho2_ids'] == group)[0]
                for size in range(1, len(members)):
                    chosen.append(members[:size])
        else:
            if name in ("gauss2", "gauss3", "gauss4"):
                size_min, size_max = 2, 2
            elif name == "gauss":
                size_min, size_max = 1, 3
            elif name == "gauss5":
                size_min, size_max = 1, 1
            else:
                size_min, size_max = 2, 3
            chosen = []
            for r in range(size_min, size_max + 1):
                for combo in itertools.combinations(
                        range(datasets.train.num_examples), r):
                    chosen.append(list(combo))

        return {'subset_indices': chosen}

    def retrain_and_newton_batch(self, subset_start, subset_end):
        """Compute predicted and actual parameter changes for a subset batch.

        Handles ``self.R['subset_indices'][subset_start:subset_end]``. For
        each subset it produces three parameter deltas relative to the saved
        ``'initial'`` model:

        * ``subset_pred_dparam``: first-order prediction, solving against the
          full training Hessian from phase 1.
        * ``subset_newton_pred_dparam``: Newton prediction, solving against
          the full Hessian minus the subset's own (unregularised) Hessian.
        * ``subset_dparam``: actual change after a warm-started refit with
          the subset's examples given weight 0. Each refit model is saved as
          ``'subset_<global index>'``.

        Args:
            subset_start: Index of the first subset in the batch.
            subset_end: One past the index of the last subset in the batch.

        Returns:
            Dict with the three arrays above, one row per subset.
        """
        self.load_phases([0, 1, 2], verbose=False)

        datasets = self.get_dataset()
        model = self.get_model()
        model.load('initial')
        theta0 = model.get_params_flat()
        l2_reg = self.R['l2_reg']
        full_hessian = self.R['hessian']

        batch = self.R['subset_indices'][subset_start:subset_end]
        batch_size = len(batch)
        per_example_grads = self.R['train_grad_losses']
        # One row per subset: the summed loss gradient of its members.
        summed_grads = np.array(
            [np.sum(per_example_grads[idx, :], axis=0) for idx in batch])

        def solve(hessian_reg, rhs):
            # Every solve in this batch uses the same explicit/Cholesky setup.
            return model.get_inverse_hvp(rhs,
                                         hessian_reg=hessian_reg,
                                         verbose=False,
                                         inverse_hvp_method='explicit',
                                         inverse_vp_method='cholesky')

        out = dict()
        started = time.time()

        with benchmark(
                'Computing first-order predicted parameters for subsets {}-{}'.
                format(subset_start, subset_end)):
            out['subset_pred_dparam'] = solve(full_hessian, summed_grads.T).T

        with benchmark(
                'Computing Newton predicted parameters for subsets {}-{}'.
                format(subset_start, subset_end)):
            newton = np.zeros((batch_size, model.params_dim))
            for row, idx in enumerate(batch):
                removed_hessian = model.get_hessian(
                    datasets.train.subset(idx), l2_reg=0, verbose=False)
                rhs = summed_grads[row, :].reshape(-1, 1)
                newton[row, :] = solve(full_hessian - removed_hessian,
                                       rhs).reshape(-1)
            out['subset_newton_pred_dparam'] = newton

        with benchmark('Computing actual parameters for subsets {}-{}'.format(
                subset_start, subset_end)):
            actual = np.zeros((batch_size, model.params_dim))
            for row, idx in enumerate(batch):
                # Weight 1 keeps an example, weight 0 removes it.
                weights = np.ones(datasets.train.num_examples)
                weights[idx] = 0
                model.warm_fit(datasets.train, weights, l2_reg=l2_reg)
                model.save('subset_{}'.format(row + subset_start))
                actual[row, :] = model.get_params_flat() - theta0

            out['subset_dparam'] = actual

        elapsed = time.time() - started
        time_per_subset = elapsed / batch_size
        subsets_left = len(self.R['subset_indices']) - subset_end
        remaining_time = subsets_left * time_per_subset
        print('Each retraining and iHVP takes {} s, {} s remaining'.format(
            time_per_subset, remaining_time))

        return out

    @phase(3)
    def retrain_and_newton(self):
        """Run ``retrain_and_newton_batch`` over all subsets via the task queue.

        Subsets are split into consecutive batches of at most 64. The
        per-batch results are merged with ``task_queue.collate_results``.
        """
        batch_size = 64
        total = len(self.R['subset_indices'])

        batches = []
        for start in range(0, total, batch_size):
            batches.append((start, min(start + batch_size, total)))

        results = self.task_queue.execute('retrain_and_newton_batch',
                                          batches,
                                          force_refresh=True)
        return self.task_queue.collate_results(results)

    def compute_test_influence(self, subset_start, subset_end, ds_test):
        """Evaluate loss and margin changes on ``ds_test`` for a subset batch.

        For each subset in ``[subset_start, subset_end)`` the model parameters
        are set to the initial parameters plus (a) the actual, (b) the
        first-order predicted and (c) the Newton predicted parameter change.
        The per-example test loss and margin are then compared with those of
        the initial model.

        Args:
            subset_start: Index of the first subset in the batch.
            subset_end: One past the index of the last subset in the batch.
            ds_test: Dataset whose examples are evaluated.

        Returns:
            Dict of ``(num_subsets, ds_test.num_examples)`` arrays keyed
            ``subset_test_{actl,pparam,nparam}[_margin]_infl``.
        """
        model = self.get_model()
        model.load('initial')
        theta0 = model.get_params_flat()

        rows = slice(subset_start, subset_end)
        actl_dparam = self.R['subset_dparam'][rows, :]
        pred_dparam = self.R['subset_pred_dparam'][rows, :]
        newton_pred_dparam = self.R['subset_newton_pred_dparam'][rows, :]

        base_losses = model.get_indiv_loss(ds_test, verbose=False)
        base_margins = model.get_indiv_margin(ds_test, verbose=False)

        num_subsets = len(self.R['subset_indices'][rows])

        def blank():
            return np.zeros((num_subsets, ds_test.num_examples))

        def deltas_at(dparam):
            # Move the model to theta0 + dparam and report how each test
            # example's loss and margin differ from the initial model.
            model.set_params_flat(theta0 + dparam)
            losses = model.get_indiv_loss(ds_test, verbose=False)
            margins = model.get_indiv_margin(ds_test, verbose=False)
            return losses - base_losses, margins - base_margins

        res = dict()

        with benchmark(
                'Computing actual parameters and influence for subsets {}-{}'.
                format(subset_start, subset_end)):
            actl_infl = blank()
            actl_margin_infl = blank()
            for i in range(num_subsets):
                actl_infl[i, :], actl_margin_infl[i, :] = deltas_at(
                    actl_dparam[i, :])

            res['subset_test_actl_infl'] = actl_infl
            res['subset_test_actl_margin_infl'] = actl_margin_infl

        with benchmark(
                'Computing influence approximates for subsets {}-{}'.format(
                    subset_start, subset_end)):
            pparam_infl = blank()
            pparam_margin_infl = blank()
            nparam_infl = blank()
            nparam_margin_infl = blank()
            for i in range(num_subsets):
                pparam_infl[i, :], pparam_margin_infl[i, :] = deltas_at(
                    pred_dparam[i, :])
                nparam_infl[i, :], nparam_margin_infl[i, :] = deltas_at(
                    newton_pred_dparam[i, :])

            res['subset_test_pparam_infl'] = pparam_infl
            res['subset_test_pparam_margin_infl'] = pparam_margin_infl
            res['subset_test_nparam_infl'] = nparam_infl
            res['subset_test_nparam_margin_infl'] = nparam_margin_infl

        return res

    @phase(4)
    def find_adversarial_test(self):
        """Construct candidate test points where first-order and Newton disagree.

        Uses the per-subset first-order (``subset_pred_dparam``) and Newton
        (``subset_newton_pred_dparam``) parameter changes to build several
        families of test inputs, each recorded with a tag:

        * ``lstsq_uv_K-*``: least-squares solutions over the K subsets with
          the lowest cosine similarity between the two predictions.
        * ``cex_avg_K-*``: directions built from the mean predictions of the
          K lowest-cosine subsets.
        * ``cex_factor``: per-subset directions orthogonal to the first-order
          change, for the 50 subsets with the largest residual factor.
        * ``cex_random`` and ``cex_one``: random normal points and a single
          all-ones point.

        All random draws come from one ``RandomState`` seeded with
        ``self.config['seed']``, so the output depends on the order of the
        draws below.

        Returns:
            Dict with ``subset_cos_dparam``, the subset selections for each K,
            and the stacked ``cex_X``, ``cex_Y`` and ``cex_tags``.
        """
        res = dict()

        # NOTE(review): `ds` is never read below; the call may only matter for
        # its side effect of building/caching the dataset -- confirm.
        ds = self.get_dataset()
        model = self.get_model()
        model.load('initial')

        rng = np.random.RandomState(self.config['seed'])

        # Cosine similarity between the two predicted parameter changes of
        # each subset; 1e-4 in the denominator guards against zero norms.
        subset_pred_dparam = self.R['subset_pred_dparam']
        subset_newton_pred_dparam = self.R['subset_newton_pred_dparam']
        norm = np.linalg.norm(subset_pred_dparam, axis=1) * np.linalg.norm(
            subset_newton_pred_dparam, axis=1)
        subset_cos_dparam = np.sum(
            subset_pred_dparam * subset_newton_pred_dparam,
            axis=1) / (norm + 1e-4)
        res['subset_cos_dparam'] = subset_cos_dparam

        # For K subsets, find a distribution of test points such that
        # (pred_margin, newton_pred_margin) ~ gaussian
        D = self.R['subset_pred_dparam'].shape[1]
        cex_X = np.zeros((0, D))
        cex_Y = np.zeros((0, ))
        cex_tags = []
        # Ascending order: lowest cosine similarity first.
        cos_indices = np.argsort(subset_cos_dparam)
        for K in (D // 2, D, 2 * D, 3 * D):
            # Skip values of K that exceed the number of available subsets.
            if K > len(cos_indices): continue
            # Number of test points to try this method for
            N_test = 100

            # Subsets with a lower cosine similarity between dparams are easier to find
            # counterexample test points for
            easiest_subsets = cos_indices[:K]
            res['cex_lstsq_uv_K-{}_subsets'.format(K)] = easiest_subsets

            X = []
            # Rows 0..K-1 are the first-order changes, rows K..2K-1 the Newton ones.
            A = np.vstack([
                subset_pred_dparam[easiest_subsets, :],
                subset_newton_pred_dparam[easiest_subsets, :]
            ])
            for u, v in rng.normal(0, 1, (N_test, 2)):
                # Target: every first-order row maps to u, every Newton row to v.
                B = np.hstack([np.full(K, u), np.full(K, v)])
                x = np.linalg.lstsq(A, B, rcond=None)[0]
                x /= np.linalg.norm(x)
                X.append(x)
            X = np.array(X)
            Y = np.ones(X.shape[0])

            cex_X = np.vstack([cex_X, X])
            cex_Y = np.hstack([cex_Y, Y])
            cex_tags.extend(['lstsq_uv_K-{}'.format(K)] * N_test)

            # Same system, but with an independent gaussian target per row;
            # these solutions are not normalised.
            for y in rng.normal(0, 1, (N_test, A.shape[0])):
                x = np.linalg.lstsq(A, y, rcond=None)[0]
                cex_X = np.vstack([cex_X, x])
                cex_Y = np.hstack([cex_Y, 1])
                cex_tags.extend(['lstsq_uv_K-{}_gauss'.format(K)])

        # Pick the top K subsets and find a bad test point based on the average pred and newton dparams
        # NOTE(review): unlike the loop above there is no K > len(cos_indices)
        # guard here; the slice just truncates, so large K values can repeat
        # the same selection under different tags.
        for K in [1] + list(range(10, 201, 10)):
            easiest_subsets = cos_indices[:K]
            res['cex_avg_K-{}'.format(K)] = easiest_subsets

            a = np.mean(subset_pred_dparam[easiest_subsets, :], axis=0)
            b = np.mean(subset_newton_pred_dparam[easiest_subsets, :], axis=0)

            # Minimizing a^T x_test - C b^T x_test subject to norm(x_test) = 1
            # this is just x_test = normalize(-a + C b)
            for C in (1e-1, 1e-5, 1, 2, 10):
                x = -a + C * b
                x /= np.linalg.norm(x)
                cex_X = np.vstack([cex_X, x])
                cex_Y = np.hstack([cex_Y, 1])
                cex_tags.append('cex_avg_K-{}_C-{}'.format(K, C))

            # Find x_test such that a^T x_test = 0 and b^T x_test is big
            # (b with its component along a removed, then normalised).
            x = b - a * np.dot(a, b) / np.dot(a, a)
            x /= np.linalg.norm(x)
            cex_X = np.vstack([cex_X, x])
            cex_Y = np.hstack([cex_Y, 1])
            # NOTE(review): the template has one placeholder, so the extra `C`
            # argument (left over from the loop above) is ignored.
            cex_tags.append('cex_avg_K-{}_01'.format(K, C))

        # Sort subsets by b^T b - (a^T b)^2 / (a^T a) and use those to find a^T x = 0, b^T x = big
        aTa = np.sum(subset_pred_dparam**2, axis=1)
        aTb = np.sum(subset_pred_dparam * subset_newton_pred_dparam, axis=1)
        bTb = np.sum(subset_newton_pred_dparam**2, axis=1)
        factor = bTb - aTb**2 / aTa
        factor_indices = np.argsort(factor)
        # Reverse the ascending sort and keep the 50 largest factors.
        for s in factor_indices[::-1][:50]:
            a = subset_pred_dparam[s, :]
            b = subset_newton_pred_dparam[s, :]

            # Find x_test such that a^T x_test = 0 and b^T x_test is big
            x = b - a * np.dot(a, b) / np.dot(a, a)
            x /= np.linalg.norm(x)
            cex_X = np.vstack([cex_X, x])
            cex_Y = np.hstack([cex_Y, 1])
            cex_tags.append('cex_factor')

        # Pick random small points
        # (standard-normal coordinates, labels drawn from {0, 1})
        N_random = 50
        X = rng.normal(0, 1, (N_random, cex_X.shape[1]))
        Y = rng.randint(0, 2, (N_random, ))
        cex_X = np.vstack([cex_X, X])
        cex_Y = np.hstack([cex_Y, Y])
        cex_tags.extend(['cex_random'] * N_random)

        # A single all-ones point with label 1.
        cex_X = np.vstack([cex_X, np.full((1, cex_X.shape[1]), 1)])
        cex_Y = np.hstack([cex_Y, 1])
        cex_tags.extend(['cex_one'])

        res['cex_X'] = cex_X
        res['cex_Y'] = cex_Y
        res['cex_tags'] = cex_tags

        return res

    def compute_cex_test_infl_batch(self, subset_start, subset_end):
        """Evaluate a subset batch against the constructed counterexample set.

        Wraps ``compute_test_influence`` with a test set built from
        ``cex_X`` / ``cex_Y`` and prefixes every result key with ``cex_``.
        Also prints a per-subset timing and a remaining-time estimate.

        Args:
            subset_start: Index of the first subset in the batch.
            subset_end: One past the index of the last subset in the batch.
        """
        self.load_phases([0, 1, 2, 3, 4], verbose=False)

        started = time.time()

        cex_test = DataSet(self.R['cex_X'], self.R['cex_Y'])
        influences = self.compute_test_influence(subset_start, subset_end,
                                                 cex_test)
        res = {'cex_' + key: value for key, value in influences.items()}

        elapsed = time.time() - started
        time_per_subset = elapsed / (subset_end - subset_start)
        subsets_left = len(self.R['subset_indices']) - subset_end
        remaining_time = subsets_left * time_per_subset
        print('Each subset takes {} s, {} s remaining'.format(
            time_per_subset, remaining_time))

        return res

    @phase(5)
    def compute_test_infl(self):
        """Compute all influence estimates on the counterexample test set.

        Dispatches ``compute_cex_test_infl_batch`` over batches of at most
        256 subsets and collates the results. It then adds the linearised
        influences: the dot product of each predicted parameter change
        (first-order and Newton) with the test loss and margin gradients of
        the initial model.
        """
        batch_size = 256
        total = len(self.R['subset_indices'])
        batches = []
        for start in range(0, total, batch_size):
            batches.append((start, min(start + batch_size, total)))

        results = self.task_queue.execute('compute_cex_test_infl_batch',
                                          batches,
                                          force_refresh=True)
        res = self.task_queue.collate_results(results)

        cex_test = DataSet(self.R['cex_X'], self.R['cex_Y'])
        model = self.get_model()
        model.load('initial')
        grad_losses = model.get_indiv_grad_loss(cex_test, verbose=False)
        grad_margins = model.get_indiv_grad_margin(cex_test, verbose=False)

        pred_dparam = self.R['subset_pred_dparam']
        newton_pred_dparam = self.R['subset_newton_pred_dparam']

        linearised = (
            ('cex_subset_test_pred_infl', pred_dparam, grad_losses),
            ('cex_subset_test_pred_margin_infl', pred_dparam, grad_margins),
            ('cex_subset_test_newton_pred_infl', newton_pred_dparam,
             grad_losses),
            ('cex_subset_test_newton_pred_margin_infl', newton_pred_dparam,
             grad_margins),
        )
        for key, dparam, grads in linearised:
            res[key] = np.dot(dparam, grads.T)

        return res

    def plot_overestimates(self, save_and_close=False):
        """Scatter first-order against Newton margin influence, tagged by sign test.

        Every (subset, test point) pair is one point. Points selected by
        ``self.R['overestimates']`` get the "first-order > ..." tag, and for
        datasets other than ``"repeats"`` the tag also carries the subset size.

        Args:
            save_and_close: When true, write ``pred_over_newton.png`` into
                ``self.plot_dir`` and close the figure.
        """
        datasets = self.get_dataset()
        first_order = self.R['subset_pred_margin_infl'].reshape(-1)
        newton = self.R['subset_newton_pred_margin_infl'].reshape(-1)
        # Read but not plotted by this method.
        actl = self.R['subset_actl_margin_infl'].reshape(-1)
        over_mask = self.R['overestimates'].reshape(-1)

        below_tag = 'first-order < sign(first-order) * newton'
        above_tag = 'first-order > sign(first-order) * newton'
        tags = np.array([below_tag] * len(first_order))
        tags[over_mask] = above_tag

        if self.dataset_id != "repeats":
            sizes_per_subset = [
                len(subset) for subset in self.R['subset_indices']
            ]
            # Repeat each subset's size once per test example so it lines up
            # with the flattened influence arrays.
            sizes = np.repeat(sizes_per_subset,
                              datasets.test.num_examples).reshape(-1)
            tags = [
                '{} (size {})'.format(tag, size)
                for tag, size in zip(tags, sizes)
            ]

        plot_options = dict(
            label=tags,
            xlabel='First-order influence',
            ylabel='Newton influence',
            title=
            'Influence on margin, for all combinations of test points and subsets',
            subtitle=self.dataset_id,
            size=1)

        fig, ax = plt.subplots(1, 1, figsize=(12, 12))
        plot_influence_correlation(ax, first_order, newton, **plot_options)

        if not save_and_close:
            return
        fig.savefig(os.path.join(self.plot_dir, 'pred_over_newton.png'),
                    bbox_inches='tight')
        plt.close(fig)

    def plot_repeats(self, save_and_close=False):
        """Plot first-order against Newton influence for the ``"repeats"`` dataset.

        Produces two scatter plots of the same points: one coloured by the
        size of the removed subset and one by the id of the repeated point.
        Does nothing for any other dataset.

        Args:
            save_and_close: When true, write ``pred_over_newton_size.png``
                and ``pred_over_newton_id.png`` into ``self.plot_dir`` and
                close each figure.
        """
        if self.dataset_id != "repeats":
            return

        datasets = self.get_dataset()
        first_order = self.R['subset_pred_margin_infl'].reshape(-1)
        newton = self.R['subset_newton_pred_margin_infl'].reshape(-1)

        def draw(colors, point_size, cmap, norm, colorbar_label, filename):
            # One scatter with halved axis limits, a colour bar and an
            # optional save; both figures differ only in these arguments.
            fig, ax = plt.subplots(1, 1, figsize=(12, 12))
            plot_influence_correlation(
                ax,
                first_order,
                newton,
                colors=colors,
                xlabel='First-order influence',
                ylabel='Newton influence',
                title=
                'Influence on margin, for all combinations of test points and subsets',
                subtitle=self.dataset_id,
                size=point_size,
                balanced=True,
                equal=False)
            ax.set_xlim([x * 0.5 for x in ax.get_xlim()])
            ax.set_ylim([x * 0.5 for x in ax.get_ylim()])

            sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
            sm.set_array([])
            cbar = plt.colorbar(sm, ax=ax)
            cbar.ax.set_ylabel(colorbar_label, rotation=90)

            if save_and_close:
                fig.savefig(os.path.join(self.plot_dir, filename),
                            bbox_inches='tight')
                plt.close(fig)

        # Figure 1: colour encodes how many repeats were removed.
        sizes = np.array([len(subset) for subset in self.R['subset_indices']])
        size_norm = mpl.colors.Normalize(vmin=1, vmax=np.max(sizes))
        size_cmap = plt.get_cmap('plasma')
        color_by_size = np.repeat(size_cmap(size_norm(sizes)),
                                  datasets.test.num_examples,
                                  axis=0)
        draw(color_by_size, 1, size_cmap, size_norm,
             'number of repeats removed', 'pred_over_newton_size.png')

        # Figure 2: colour encodes which repeated point the subset belongs to.
        N_unique = self.R['repeats_N_unique']
        ids_per_subset = np.array(
            [i for i in range(N_unique) for _ in range(i + 1)])
        repeat_ids = np.repeat(ids_per_subset, datasets.test.num_examples)
        id_norm = mpl.colors.Normalize(vmin=0, vmax=np.max(repeat_ids))
        id_cmap = plt.get_cmap('rainbow', N_unique)
        color_by_id = id_cmap(repeat_ids)
        draw(color_by_id, 3, id_cmap, id_norm, 'repeated point id',
             'pred_over_newton_id.png')

    def plot_counterex_distribution(self, save_and_close=False):
        """Plot the per-subset fraction of test points flagged as overestimates.

        Nothing is drawn when the fractions are (numerically) all the same,
        i.e. their standard deviation is below ``1e-8``.

        Args:
            save_and_close: When true, write ``pred_over_newton_dist.png``
                into ``self.plot_dir`` and close the figure.
        """
        fractions = np.mean(self.R['overestimates'], axis=1)
        if np.std(fractions) < 1e-8:
            return

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        plot_options = dict(
            title='Distribution of counterexamples',
            xlabel=
            'Fraction of test points with first-order > sign(first-order) * newton',
            ylabel='Number of subsets',
            subtitle=self.dataset_id)
        plot_distribution(ax, fractions, **plot_options)

        if not save_and_close:
            return
        target = os.path.join(self.plot_dir, 'pred_over_newton_dist.png')
        fig.savefig(target, bbox_inches='tight')
        plt.close(fig)

    def plot_dist_infl(self, save_and_close=False):
        """Plot pairwise influence comparisons from the ``dist_subset_*`` results.

        Four scatter plots are produced: first-order against Newton and
        actual against first-order, each for margin and for loss influence.

        Args:
            save_and_close: When true, save each figure into
                ``self.plot_dir`` as ``dist_<x>-<y>_<infl|margin_infl>.png``
                and close it.
        """
        num_subsets = self.R['dist_subset_pred_margin_infl'].shape[0]

        pred_margin = self.R['dist_subset_pred_margin_infl']
        newton_margin = self.R['dist_subset_newton_pred_margin_infl']
        actl_margin = self.R['dist_subset_actl_margin_infl']

        pred = self.R['dist_subset_pred_infl']
        newton = self.R['dist_subset_newton_pred_infl']
        actl = self.R['dist_subset_actl_infl']

        axis_labels = {
            'pred': 'First-order influence',
            'newton': 'Newton influence',
            'actl': 'Actual influence'
        }

        def compare_influences(x, y, x_approx_type, y_approx_type, infl_type):
            # Draw one x-against-y scatter and optionally save it.
            fig, ax = plt.subplots(1, 1, figsize=(8, 8))
            plot_influence_correlation(
                ax,
                x.reshape(-1),
                y.reshape(-1),
                xlabel=axis_labels[x_approx_type],
                ylabel=axis_labels[y_approx_type],
                title=
                'Influence on {}, for {} subsets and a constructed test set'.
                format(infl_type, num_subsets),
                subtitle=self.dataset_id,
                size=3,
                equal=False)
            if not save_and_close:
                return
            suffix = "infl" if infl_type == "loss" else "margin_infl"
            filename = 'dist_{}-{}_{}.png'.format(x_approx_type,
                                                  y_approx_type, suffix)
            fig.savefig(os.path.join(self.plot_dir, filename),
                        bbox_inches='tight')
            plt.close(fig)

        comparisons = (
            (pred_margin, newton_margin, 'pred', 'newton', 'margin'),
            (actl_margin, pred_margin, 'actl', 'pred', 'margin'),
            (pred, newton, 'pred', 'newton', 'loss'),
            (actl, pred, 'actl', 'pred', 'loss'),
        )
        for arguments in comparisons:
            compare_influences(*arguments)

    def plot_all(self, save_and_close=False):
        """Run every plotting method of this experiment in a fixed order.

        Args:
            save_and_close: Passed through unchanged to each plotting method.
        """
        plotters = (
            'plot_overestimates',
            'plot_repeats',
            'plot_counterex_distribution',
            'plot_dist_infl',
        )
        for plotter in plotters:
            # Look each method up right before calling it, one at a time.
            getattr(self, plotter)(save_and_close)