    def loglikelihood(self, res, alpha, scale):
        # Log-likelihood of the residuals `res` under the general robust
        # distribution: the negated sum of the per-residual loss plus the
        # log partition, log(scale) + log Z(alpha).
        assert (alpha.view(-1).size()[0] == 1
                or alpha.view(-1).size()[0] == len(res))
        scale = scale + 1e-5
        N = len(res)
        dist = distribution.Distribution()
        loss = general.lossfun(res, alpha, scale, approximate=False).sum()
        log_partition = torch.log(scale) + dist.log_base_partition_function(
            alpha)
        if alpha.view(-1).size()[0] == 1:
            log_partition = N * log_partition
        else:
            log_partition = log_partition.sum()
        nll = loss + log_partition
        return -nll.detach().numpy()
    def sample(self, alpha, c):
        alpha = torch.as_tensor(alpha)
        scale = torch.as_tensor(c)
        assert (alpha >= 0).all()
        assert (scale >= 0).all()
        float_dtype = alpha.dtype
        assert scale.dtype == float_dtype

        cauchy = torch.distributions.cauchy.Cauchy(0., np.sqrt(2.))
        uniform = torch.distributions.uniform.Uniform(0, 1)
        samples = torch.zeros_like(alpha)
        accepted = torch.zeros(alpha.shape).type(torch.bool)
        dist = distribution.Distribution()
        while not accepted.type(torch.uint8).all():
            # Draw N samples from a Cauchy, our proposal distribution.
            cauchy_sample = torch.reshape(
                cauchy.sample((np.prod(alpha.shape), )), alpha.shape)
            cauchy_sample = cauchy_sample.type(alpha.dtype)

            # Compute the likelihood of each sample under its target distribution.
            nll = dist.nllfun(cauchy_sample,
                              torch.as_tensor(alpha).to(cauchy_sample),
                              torch.tensor(1).to(cauchy_sample))

            # Bound the NLL. We don't use the approximate loss as it may cause
            # unpredictable behavior in the context of sampling.
            nll_bound = general.lossfun(
                cauchy_sample,
                torch.tensor(0., dtype=cauchy_sample.dtype),
                torch.tensor(1., dtype=cauchy_sample.dtype),
                approximate=False) + dist.log_base_partition_function(alpha)

            # Draw N samples from a uniform distribution, and use each uniform sample
            # to decide whether or not to accept each proposal sample.
            uniform_sample = torch.reshape(
                uniform.sample((np.prod(alpha.shape), )), alpha.shape)
            uniform_sample = uniform_sample.type(alpha.dtype)
            accept = uniform_sample <= torch.exp(nll_bound - nll)

            # If a sample is accepted, replace its element in `samples` with the
            # proposal sample, and set its bit in `accepted` to True.
            samples = torch.where(accept, cauchy_sample, samples)
            accepted = accepted | accept

        # Because our distribution is a location-scale family, we sample from
        # p(x | 0, \alpha, 1) and then scale each sample by `scale`. This must
        # happen once, after the rejection loop, so that previously accepted
        # samples are not rescaled on later iterations.
        samples *= scale
        return samples
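# --- Usage sketch (not part of the example above; illustrative only) ---
# A hedged, self-contained check of what the two methods compute, assuming the
# unqualified `distribution` / `general` names refer to the modules of the
# robust_loss_pytorch package, whose APIs the snippets match.
import torch
from robust_loss_pytorch import distribution, general  # assumed import path

# Reproduce `loglikelihood` from its definition: the negated sum of the
# per-residual robust loss plus the log partition, log(scale) + log Z(alpha).
res = torch.randn(100)              # residuals
alpha = torch.tensor([1.0])         # one shared shape parameter
scale = torch.tensor([0.5]) + 1e-5  # one shared scale, stabilized as above
dist = distribution.Distribution()
nll = (general.lossfun(res, alpha, scale, approximate=False)
       + torch.log(scale) + dist.log_base_partition_function(alpha))
print(float(-nll.sum()))            # the log-likelihood of `res`

# Under the same assumption, the package exposes this rejection sampler as
# Distribution.draw_samples, which uses the same Cauchy-proposal scheme as
# `sample` above. At alpha = 2 the distribution is Gaussian, so the empirical
# standard deviation should land near `scale`.
samples = dist.draw_samples(torch.full((10000,), 2.0), torch.full((10000,), 1.0))
print(samples.shape, float(samples.std()))  # torch.Size([10000]), ~1.0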
def train_locally_adaptive(model, alpha, scale, trX, trY,
                           learning_rate=0.01, epoch=500, verbose=True):
    params = list(model.parameters()) + list(alpha.parameters()) + list(scale.parameters())
    dist = distribution.Distribution()
    optimizer = torch.optim.Adam(params, lr=learning_rate, weight_decay=0.01)

    for e in tqdm(range(epoch)):
        y_hat = model(trX).view(-1)
        alphas = torch.exp(alpha(trX))
        scales = torch.exp(scale(trX)) + 1e-10  # Keep the scale strictly positive.
        loss = general.lossfun((y_hat - trY)[:, None], alpha=alphas, scale=scales, approximate=False)
        log_partition = torch.log(scales) + dist.log_base_partition_function(alphas)

        loss = (loss + log_partition).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if verbose and e % 100 == 0:
            print('{:<4}: loss={:.3f}'.format(e, loss.item()))
    return model, alpha, scale
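# --- Usage sketch (not part of the snippet above; illustrative only) ---
# Drives `train_locally_adaptive` on toy 1-D data. The three small MLPs below
# are assumptions for illustration: one predicts y, and one head each produces
# the per-sample log-alpha and log-scale that the function exponentiates.
# Assumes torch, tqdm, and the `general` / `distribution` modules used by the
# function are importable.
import torch

trX = torch.linspace(-1.0, 1.0, 256)[:, None]
trY = trX.squeeze() ** 3 + 0.1 * torch.randn(256)

def small_mlp():
    return torch.nn.Sequential(
        torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))

model, alpha, scale = small_mlp(), small_mlp(), small_mlp()
model, alpha, scale = train_locally_adaptive(
    model, alpha, scale, trX, trY, learning_rate=0.01, epoch=500, verbose=False)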
    def __init__(self,
                 num_dims,
                 float_dtype,
                 device,
                 alpha_lo=0.001,
                 alpha_hi=1.999,
                 alpha_init=None,
                 scale_lo=1e-5,
                 scale_init=1.0):
        """Sets up the loss function.

    Args:
      num_dims: The number of dimensions of the input to come.
      float_dtype: The floating point precision of the inputs to come.
      device: The device to run on (cpu, cuda, etc).
      alpha_lo: The lowest possible value for loss's alpha parameters, must be
        >= 0 and a scalar. Should probably be in (0, 2).
      alpha_hi: The highest possible value for loss's alpha parameters, must be
        >= alpha_lo and a scalar. Should probably be in (0, 2).
      alpha_init: The value that the loss's alpha parameters will be initialized
        to, must be in (`alpha_lo`, `alpha_hi`), unless `alpha_lo` == `alpha_hi`
        in which case this will be ignored. Defaults to (`alpha_lo` +
        `alpha_hi`) / 2
      scale_lo: The lowest possible value for the loss's scale parameters. Must
        be > 0 and a scalar. This value may have more of an effect than you
        think, as the loss is unbounded as scale approaches zero (say, at a
        delta function).
      scale_init: The initial value used for the loss's scale parameters. This
        also defines the zero-point of the latent representation of scales, so
        SGD may cause optimization to gravitate towards producing scales near
        this value.
    """
        super(AdaptiveLossFunction, self).__init__()

        if not np.isscalar(alpha_lo):
            raise ValueError(
                '`alpha_lo` must be a scalar, but is of type {}'.format(
                    type(alpha_lo)))
        if not np.isscalar(alpha_hi):
            raise ValueError(
                '`alpha_hi` must be a scalar, but is of type {}'.format(
                    type(alpha_hi)))
        if alpha_init is not None and not np.isscalar(alpha_init):
            raise ValueError(
                '`alpha_init` must be None or a scalar, but is of type {}'.
                format(type(alpha_init)))
        if not alpha_lo >= 0:
            raise ValueError(
                '`alpha_lo` must be >= 0, but is {}'.format(alpha_lo))
        if not alpha_hi >= alpha_lo:
            raise ValueError(
                '`alpha_hi` = {} must be >= `alpha_lo` = {}'.format(
                    alpha_hi, alpha_lo))
        if alpha_init is not None and alpha_lo != alpha_hi:
            if not (alpha_init > alpha_lo and alpha_init < alpha_hi):
                raise ValueError(
                    '`alpha_init` = {} must be in (`alpha_lo`, `alpha_hi`) = ({}, {})'
                    .format(alpha_init, alpha_lo, alpha_hi))
        if not np.isscalar(scale_lo):
            raise ValueError(
                '`scale_lo` must be a scalar, but is of type {}'.format(
                    type(scale_lo)))
        if not np.isscalar(scale_init):
            raise ValueError(
                '`scale_init` must be a scalar, but is of type {}'.format(
                    type(scale_init)))
        if not scale_lo > 0:
            raise ValueError(
                '`scale_lo` must be > 0, but is {}'.format(scale_lo))
        if not scale_init >= scale_lo:
            raise ValueError(
                '`scale_init` = {} must be >= `scale_lo` = {}'.format(
                    scale_init, scale_lo))

        self.num_dims = num_dims
        if float_dtype == np.float32:
            float_dtype = torch.float32
        if float_dtype == np.float64:
            float_dtype = torch.float64
        self.float_dtype = float_dtype
        self.device = device
        if isinstance(device, int) or\
           (isinstance(device, str) and 'cuda' in device):
            torch.cuda.set_device(self.device)

        self.distribution = distribution.Distribution()

        if alpha_lo == alpha_hi:
            # If the range of alphas is a single item, then we just fix `alpha` to be
            # a constant.
            self.fixed_alpha = torch.tensor(
                alpha_lo, dtype=self.float_dtype,
                device=self.device)[np.newaxis,
                                    np.newaxis].repeat(1, self.num_dims)
            self.alpha = lambda: self.fixed_alpha
        else:
            # Otherwise we construct a "latent" alpha variable and define `alpha`
            # as an affine function of a sigmoid on that latent variable, initialized
            # such that `alpha` starts off as `alpha_init`.
            if alpha_init is None:
                alpha_init = (alpha_lo + alpha_hi) / 2.
            latent_alpha_init = util.inv_affine_sigmoid(alpha_init,
                                                        lo=alpha_lo,
                                                        hi=alpha_hi)
            self.register_parameter(
                'latent_alpha',
                torch.nn.Parameter(latent_alpha_init.clone().detach().to(
                    dtype=self.float_dtype,
                    device=self.device)[np.newaxis,
                                        np.newaxis].repeat(1, self.num_dims),
                                   requires_grad=True))
            self.alpha = lambda: util.affine_sigmoid(
                self.latent_alpha, lo=alpha_lo, hi=alpha_hi)

        if scale_lo == scale_init:
            # If the difference between the minimum and initial scale is zero, then
            # we just fix `scale` to be a constant.
            self.fixed_scale = torch.tensor(
                scale_init, dtype=self.float_dtype,
                device=self.device)[np.newaxis,
                                    np.newaxis].repeat(1, self.num_dims)
            self.scale = lambda: self.fixed_scale
        else:
            # Otherwise we construct a "latent" scale variable and define `scale`
            # as an affine function of a softplus on that latent variable.
            self.register_parameter(
                'latent_scale',
                torch.nn.Parameter(torch.zeros(
                    (1, self.num_dims)).to(dtype=self.float_dtype,
                                           device=self.device),
                                   requires_grad=True))
            self.scale = lambda: util.affine_softplus(
                self.latent_scale, lo=scale_lo, ref=scale_init)
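# --- Usage sketch (not part of the snippet above; illustrative only) ---
# Assumes this __init__ belongs to an AdaptiveLossFunction(torch.nn.Module)
# subclass, as the super().__init__() and register_parameter calls suggest,
# and that the `distribution` / `util` modules it relies on are importable.
import numpy as np
import torch

adaptive = AdaptiveLossFunction(num_dims=4, float_dtype=np.float32, device='cpu')
print(adaptive.alpha())  # shape (1, 4); starts at (alpha_lo + alpha_hi) / 2 = 1.0
print(adaptive.scale())  # shape (1, 4); starts at scale_init = 1.0
# The latent alpha and scale are registered parameters, so they can be trained
# jointly with a model's weights:
optimizer = torch.optim.Adam(adaptive.parameters(), lr=1e-3)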
    def setUp(self):
        self._distribution = distribution.Distribution()
        super(TestDistribution, self).setUp()
        torch.manual_seed(0)
        np.random.seed(0)