Example #1
    def dataloader_hv_product(self, v):

        device = self.device
        num_data = 0  # count the number of data points in the dataloader

        THv = [torch.zeros(p.size()).to(device)
               for p in self.params]  # accumulate the result
        for inputs, targets in self.data:
            self.model.zero_grad()
            tmp_num_data = inputs.size(0)
            outputs = self.model(inputs.to(device))
            loss = self.criterion(outputs, targets.to(device))
            loss.backward(create_graph=True)
            params, gradsH = get_params_grad(self.model)
            self.model.zero_grad()
            Hv = torch.autograd.grad(gradsH,
                                     params,
                                     grad_outputs=v,
                                     only_inputs=True,
                                     retain_graph=False)
            THv = [
                THv1 + Hv1 * float(tmp_num_data)
                for THv1, Hv1 in zip(THv, Hv)
            ]
            num_data += float(tmp_num_data)

        THv = [THv1 / float(num_data) for THv1 in THv]
        eigenvalue = group_product(THv, v).cpu().item()
        return eigenvalue, THv
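This example relies on two helpers that are not shown, get_params_grad and group_product. A minimal sketch consistent with how they are used above (an assumption, not necessarily the original implementation):

import torch

def get_params_grad(model):
    # collect the trainable parameters and their current gradients
    params, grads = [], []
    for param in model.parameters():
        if not param.requires_grad:
            continue
        params.append(param)
        grads.append(0. if param.grad is None else param.grad + 0.)
    return params, grads

def group_product(xs, ys):
    # inner product between two lists of tensors: sum_i <x_i, y_i>
    return sum(torch.sum(x * y) for x, y in zip(xs, ys))

With these, dataloader_hv_product returns the Rayleigh quotient v^T H v together with the Hessian-vector product H v, accumulated batch by batch and weighted by batch size.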
Example #2
    def __init__(self,
                 model,
                 criterion,
                 data=None,
                 dataloader=None,
                 cuda=True,
                 data_save_dir="data",
                 record_data=False):
        """
        model: the model that needs Hessain information
        criterion: the loss function
        data: a single batch of data, including inputs and its corresponding labels
        dataloader: the data loader including bunch of batches of data
        """

        # make sure we are given either a single batch or a dataloader, but not both
        assert (data is not None and dataloader is None) or \
               (data is None and dataloader is not None)

        self.model = model.eval()  # make sure the model is in evaluation mode
        self.criterion = criterion

        if data is not None:
            self.data = data
            self.full_dataset = False
        else:
            self.data = dataloader
            self.full_dataset = True

        if cuda:
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        # create the data save directory if it does not already exist
        self.record_data = record_data
        self.data_save_dir = data_save_dir + "/"
        if record_data and not os.path.exists(data_save_dir):
            try:
                os.mkdir(data_save_dir)
            except OSError:
                print("Could not create data save directory")

        # pre-processing for the single-batch case to simplify the computation
        if not self.full_dataset:
            self.inputs, self.targets = self.data
            if self.device == 'cuda':
                self.inputs = self.inputs.cuda()
                self.targets = self.targets.cuda()

            # for a single batch we can compute the gradients once and re-use them
            outputs = self.model(self.inputs)
            loss = self.criterion(outputs, self.targets)
            loss.backward(create_graph=True)

        # extract the parameters and their gradients from the model
        params, gradsH = get_params_grad(self.model)
        self.params = params
        self.gradsH = gradsH  # gradient used for Hessian computation
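A hypothetical usage sketch, assuming the surrounding class is named hessian (the class name does not appear in the snippet) and using a toy model:

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
criterion = nn.CrossEntropyLoss()
inputs = torch.randn(32, 10)
targets = torch.randint(0, 2, (32,))

# single-batch mode: gradients are computed once in __init__ and re-used
hess = hessian(model, criterion, data=(inputs, targets), cuda=False)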
Example #3
    def __init__(self,
                 model,
                 criterion,
                 data=None,
                 dataloader=None,
                 cuda=True,
                 drop=None):
        """
        model: the model that needs Hessain information
        criterion: the loss function
        data: a single batch of data, including inputs and its corresponding labels
        dataloader: the data loader including bunch of batches of data
        """

        # make sure we are given either a single batch or a dataloader, but not both
        assert (data is not None and dataloader is None) or \
               (data is None and dataloader is not None)

        self.model = model.eval()  # make sure the model is in evaluation mode
        self.criterion = criterion

        if data is not None:
            self.data = data
            self.full_dataset = False
        else:
            self.data = dataloader
            self.full_dataset = True

        if cuda:
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        # pre-processing for the single-batch case to simplify the computation
        if not self.full_dataset:
            self.inputs, self.targets = self.data
            if self.device == 'cuda':
                self.inputs = self.inputs.cuda()
                self.targets = self.targets.cuda()

            # for a single batch we can compute the gradients once and re-use
            # them; this must also run on CPU, so it sits outside the CUDA branch
            outputs = self.model(self.inputs)
            loss = self.criterion(outputs, self.targets)
            loss.backward(create_graph=True)

        # extract the parameter names, parameters, and gradients from the model
        names, params, gradsH = get_params_grad(self.model)
        self.params = params
        self.names = names
        self.gradsH = gradsH  # gradient used for Hessian computation
        self.drop = drop
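Unlike the previous examples, this variant unpacks three values from get_params_grad, so it assumes a helper that also returns parameter names. A minimal sketch of that assumed signature:

def get_params_grad(model):
    # variant that also records parameter names, as this example expects
    names, params, grads = [], [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        names.append(name)
        params.append(param)
        grads.append(0. if param.grad is None else param.grad + 0.)
    return names, params, grads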
def train_hessian(args,
                  trainer,
                  task,
                  epoch_itr,
                  sample_iter=1,
                  maxIter=500,
                  tol=1e-4,
                  top_n=1,
                  ignore_grad=False):
    """Train the model for one epoch."""
    # Update parameters every N batches
    update_freq = args.update_freq[epoch_itr.epoch - 1] \
        if epoch_itr.epoch <= len(args.update_freq) else args.update_freq[-1]

    # Initialize data iterator
    itr = epoch_itr.next_epoch_itr(
        fix_batches_to_gpus=args.fix_batches_to_gpus,
        shuffle=(epoch_itr.epoch >= args.curriculum),
    )
    itr = iterators.GroupedIterator(itr, update_freq)
    progress = progress_bar.build_progress_bar(
        args,
        itr,
        epoch_itr.epoch,
        no_progress_bar='simple',
    )

    extra_meters = collections.defaultdict(lambda: AverageMeter())
    valid_subsets = args.valid_subset.split(',')
    max_update = args.max_update or math.inf
    max_iters = 10  # cap on the number of batches used for the Hessian estimate
    samples_hessian = []
    for i, samples in enumerate(progress, start=epoch_itr.iterations_in_epoch):
        if i > max_iters:
            break
        samples = [trainer._prepare_sample(sample) for sample in samples]
        samples_hessian.extend(samples)

    eigenvalues = []
    eigenvectors = []
    computed_dim = 0

    params, gradsH = get_params_grad(trainer.model)
    while computed_dim < top_n:
        eigenvalue = None
        # power iteration: start from a random vector shaped like the parameters
        v = [torch.randn(p.size()).cuda() for p in params]
        v = normalization(v)  # rescale to unit norm

        for i in range(maxIter):
            trainer.model.zero_grad()
            v = orthnormal(v, eigenvectors)
            loss, sample_size, logging_output, gradsH, tmp_eigenvalue, Hv = trainer.task.train_step_hessian(
                samples_hessian,
                trainer.model,
                trainer.criterion,
                trainer.optimizer,
                ignore_grad,
                v=v)
            v = normalization(Hv)
            if eigenvalue is None:
                eigenvalue = tmp_eigenvalue
            elif abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                     1e-6) < tol:
                # relative change in the estimate fell below tol: converged
                break
            else:
                eigenvalue = tmp_eigenvalue
        eigenvalues.append(eigenvalue)
        eigenvectors.append(v)
        computed_dim += 1
    return eigenvalues, eigenvectors
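train_hessian runs a power iteration: each pass applies the Hessian to v through train_step_hessian, renormalizes the result, and stops once the eigenvalue estimate stabilizes, while orthnormal projects out previously found eigenvectors so later passes of the outer loop converge to the next eigenvalue. The normalization and orthnormal helpers are not shown; a minimal sketch consistent with that usage, re-using group_product from the first sketch above:

import torch

def normalization(v):
    # rescale a list of tensors to unit global L2 norm
    s = torch.sqrt(group_product(v, v))
    return [vi / (s + 1e-6) for vi in v]

def orthnormal(w, v_list):
    # project out each previously found eigenvector, then renormalize
    for v in v_list:
        coeff = group_product(w, v)
        w = [wi - coeff * vi for wi, vi in zip(w, v)]
    return normalization(w)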