def test_studentT_likelihood(df: float, loc: float, scale: float):
    dfs = torch.zeros((NUM_SAMPLES,)) + df
    locs = torch.zeros((NUM_SAMPLES,)) + loc
    scales = torch.zeros((NUM_SAMPLES,)) + scale

    distr = StudentT(df=dfs, loc=locs, scale=scales)
    samples = distr.sample()

    init_bias = [
        inv_softplus(df - 2),
        loc - START_TOL_MULTIPLE * TOL * loc,
        inv_softplus(scale - START_TOL_MULTIPLE * TOL * scale),
    ]

    df_hat, loc_hat, scale_hat = maximum_likelihood_estimate_sgd(
        StudentTOutput(),
        samples,
        init_biases=init_bias,
        num_epochs=15,
        learning_rate=1e-3,
    )

    assert np.abs(df_hat - df) < TOL * df, \
        f"df did not match: df = {df}, df_hat = {df_hat}"
    assert np.abs(loc_hat - loc) < TOL * loc, \
        f"loc did not match: loc = {loc}, loc_hat = {loc_hat}"
    assert np.abs(scale_hat - scale) < TOL * scale, \
        f"scale did not match: scale = {scale}, scale_hat = {scale_hat}"
def NIG_NLL(y: torch.Tensor,
            gamma: torch.Tensor,
            nu: torch.Tensor,
            alpha: torch.Tensor,
            beta: torch.Tensor,
            reduction='mean'):
    # The Normal-Inverse-Gamma predictive posterior is a Student-t:
    # y ~ St(gamma, beta * (1 + nu) / (nu * alpha), 2 * alpha),
    # where the second parameter is a variance, so take its square root
    # before passing it as torch's `scale`.
    student_var = beta * (1. + nu) / (nu * alpha)
    dist = StudentT(loc=gamma, scale=student_var.sqrt(), df=2 * alpha)
    nll = -1. * dist.log_prob(y)
    return reduce(nll, reduction=reduction)
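# Hypothetical usage sketch for NIG_NLL above, not part of the original code.
# The evidential-head tensors here are made up for illustration; in practice
# nu, alpha - 1, and beta come from softplus outputs so that nu, beta > 0 and
# alpha > 1. Assumes torch and the `reduce` helper used above are in scope.
def _demo_nig_nll():
    n = 32
    y = torch.randn(n)                                           # observed targets
    gamma = torch.randn(n)                                       # predicted mean
    nu = torch.nn.functional.softplus(torch.randn(n))            # > 0
    alpha = 1.0 + torch.nn.functional.softplus(torch.randn(n))   # > 1
    beta = torch.nn.functional.softplus(torch.randn(n))          # > 0
    return NIG_NLL(y, gamma, nu, alpha, beta)                    # scalar ('mean' reduction)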
def _reweight(self, N=100000):
    # Expected value: \mathbb{E}_{x ~ X} Ramp(|x|)
    if not hasattr(self, 'epv'):
        self.Hfunc = self.config.Hfunc
        # self.Hfunc = 'ramp'
        if self.real == 'Student':
            tdist = StudentT(df=self.config.r_df)
            x = tdist.sample((5000000, ))
        elif self.real == 'Gaussian':
            ndist = Normal(0, 1)
            x = ndist.sample((5000000, ))
        self.epv = self._HFunc(x, mode=self.Hfunc).mean().item()

    def sov_func(a, bs=1000):
        # Find a suitable factor `a` to match the expected value.
        r = AveMeter()
        for _ in range(N // bs):
            if self.config.use_ig:
                ub1 = torch.randn(bs, self.netGXi.input_dim // 2).to(device)
                ub2 = torch.randn(
                    bs,
                    self.netGXi.input_dim - self.netGXi.input_dim // 2).to(device)
                ub2.data.div_(torch.abs(ub2.data) + self.config.delta)
                ub = torch.cat([ub1, ub2], dim=1)
            else:
                ub = torch.randn(bs, self.netGXi.input_dim).to(device)
            with torch.no_grad():
                xib = self.netGXi(ub)
            zb = torch.randn(bs, self.dim).to(device)
            vu = (zb[:, 0].div_(zb.norm(2, dim=1)) + self.config.delta).to(device)
            r.update(
                self._HFunc(a * xib * vu, mode=self.Hfunc).mean().item(), bs)
        return r.avg - self.epv

    # if sov_func(1) > 0: down, up = 0, 3
    # elif sov_func(3) > 0: down, up = 0, 5
    # elif sov_func(10) > 0: down, up = 1, 12
    # elif sov_func(25) > 0: down, up = 8, 27
    # elif sov_func(75) > 0: down, up = 23, 77
    if sov_func(250) > 0:
        down, up = 0, 3000
    else:
        logger.info('Factor is larger than 250!')
        return 250
    factor = bisect(sov_func, down, up)
    print(factor)
    return factor
def sample(self,
           batch_size=16,
           num_ctx=None,
           max_num_points=50,
           x_range=(-2, 2),
           device='cpu'):
    batch = AttrDict()
    num_ctx = num_ctx or torch.randint(
        low=3, high=max_num_points - 3, size=[1]).item()
    num_tar = torch.randint(
        low=3, high=max_num_points - num_ctx, size=[1]).item()
    num_points = num_ctx + num_tar

    batch.x = x_range[0] + (x_range[1] - x_range[0]) \
        * torch.rand([batch_size, num_points, 1], device=device)
    batch.xc = batch.x[:, :num_ctx]
    batch.xt = batch.x[:, num_ctx:]

    # batch_size * num_points * num_points
    cov = self.kernel(batch.x)
    mean = torch.zeros(batch_size, num_points, device=device)

    batch.y = MultivariateNormal(mean, cov).rsample().unsqueeze(-1)
    batch.yc = batch.y[:, :num_ctx]
    batch.yt = batch.y[:, num_ctx:]

    if self.t_noise is not None:
        batch.y += self.t_noise * StudentT(2.1).rsample(batch.y.shape).to(device)
    return batch
def img_to_task(img,
                num_ctx=None,
                max_num_points=None,
                target_all=False,
                t_noise=None,
                device=None):
    B, C, H, W = img.shape
    num_pixels = H * W
    img = img.view(B, C, -1)

    if t_noise is not None:
        if t_noise == -1:
            t_noise = 0.09 * torch.rand(img.shape)
        img += t_noise * StudentT(2.1).rsample(img.shape)

    device = img.device if device is None else device

    batch = AttrDict()
    max_num_points = max_num_points or num_pixels
    num_ctx = num_ctx or \
        torch.randint(low=3, high=max_num_points - 3, size=[1]).item()
    num_tar = max_num_points - num_ctx if target_all else \
        torch.randint(low=3, high=max_num_points - num_ctx, size=[1]).item()
    num_points = num_ctx + num_tar

    idxs = torch.rand(B, num_pixels).argsort(-1)[..., :num_points].to(img.device)
    x1, x2 = idxs // W, idxs % W
    batch.x = torch.stack(
        [2 * x1.float() / (H - 1) - 1,
         2 * x2.float() / (W - 1) - 1], -1).to(device)
    batch.y = (torch.gather(img, -1, idxs.unsqueeze(-2).repeat(1, C, 1))
               .transpose(-2, -1) - 0.5).to(device)

    batch.xc = batch.x[:, :num_ctx]
    batch.xt = batch.x[:, num_ctx:]
    batch.yc = batch.y[:, :num_ctx]
    batch.yt = batch.y[:, num_ctx:]

    return batch
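# A minimal, hypothetical driver for img_to_task, not part of the original code
# (torch and AttrDict in scope as above); a synthetic batch of 8 RGB 32x32 images
# stands in for a real data loader:
def _demo_img_to_task():
    img = torch.rand(8, 3, 32, 32)                   # B, C, H, W in [0, 1]
    batch = img_to_task(img, max_num_points=200)
    # Pixel coordinates are rescaled to [-1, 1]; values are shifted by -0.5.
    assert batch.x.shape[-1] == 2 and batch.y.shape[-1] == 3
    return batch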
class EpsiSampler:
    def __init__(self, x, epsi_nu):
        self.x = x
        self.len = self.x.shape[0]
        self.epsi_nu = epsi_nu
        self.tdistribution = StudentT(self.epsi_nu)

    def epsisamp(self, epsi, tau, mu):
        # Assumes no covariance between epsilons; does not sample as a single block.
        # Newton-Raphson iterations to find the proposal density.
        mu_f, hf, hf_inv = self.epsi_nr(epsi, mu, tau)
        # Now propose with a multivariate t centered at the epsilon MLE, with
        # covariance matrix from the Hessian. Note that since the Hessian is
        # diagonal, we can just simulate from n univariate t's.
        epsi_p = mu_f + hf_inv.neg().sqrt() * self.tdistribution.sample(
            torch.Size([self.len, 1]))
        # epsi_p = torch.randn(mu_f, -hf_inv)
        arat = self.pratepsi(epsi, epsi_p, tau, mu) + \
            tqrat(epsi, epsi_p, mu_f, mu_f,
                  hf_inv.neg().sqrt(), hf_inv.neg().sqrt(), self.epsi_nu)
        ridx = torch.rand(self.len, 1).log() >= arat.clamp(max=0)
        ridx_float = ridx.type(torch.float32)
        epsi[~ridx] = epsi_p[~ridx]
        mrej = (1 - ridx_float).mean()
        return epsi, mrej

    # TODO: find out if .exp() legal here
    def pratepsi(self, epsi, epsi_p, tau, mu):
        pr = epsi_p * self.x / tau.sqrt() - (mu + epsi_p / tau.sqrt()).exp() - epsi_p ** 2 / 2 - \
            (epsi * self.x / tau.sqrt() - (mu + epsi / tau.sqrt()).exp() - epsi ** 2 / 2)
        return pr

    def epsi_nr(self, epsi, mu, tau):
        h, h_inv = 0, 0
        for i in range(1, 100):
            h, h_inv = self.hessepsi(epsi, tau, mu)
            # Newton-Raphson update
            grad = self.gradepsi(epsi, tau, mu)
            epsi = epsi - h_inv * grad
            # we've reached a local maximum
            if grad.norm() < 1e-6:
                break
        return epsi, h, h_inv

    @staticmethod
    def hessepsi(epsi, tau, mu):
        h = -(mu + epsi / tau.sqrt()).exp() / tau - 1
        h_inv = 1 / h
        return h, h_inv

    def gradepsi(self, epsi, tau, mu):
        gr = self.x / tau.sqrt() - (
            mu + epsi / torch.sqrt(tau)).exp() / tau.sqrt() - epsi
        return gr
def forward(self):
    self.precision_coeff = (self.belief + 1) / (
        self.belief * (self.df - self.dimensionality + 1))
    return StudentT(
        (self.df - self.dimensionality + 1).unsqueeze(-1),
        loc=self.loc,
        scale=(self.precision_coeff.unsqueeze(-1) / self.precision_diag).pow(0.5),
    )
def forward(self): """Returns predictive posterior distribution""" self.precision_coeff = (self.belief + 1) / (self.belief * (self.df - self.dimensionality + 1)) return StudentT( (self.df - self.dimensionality + 1).unsqueeze(-1), loc=self.loc, scale=(self.precision_coeff.unsqueeze(-1) / self.precision_diag).pow(0.5), )
def distribution(self,
                 distr_args,
                 scale: Optional[torch.Tensor] = None) -> Distribution:
    mix_logits, df, loc, dist_scale = distr_args
    distr = MixtureSameFamily(Categorical(logits=mix_logits),
                              StudentT(df, loc, dist_scale))
    if scale is None:
        return distr
    else:
        return TransformedDistribution(
            distr, [AffineTransform(loc=0, scale=scale)])
def distribution(self,
                 distr_args,
                 scale: Optional[torch.Tensor] = None) -> Distribution:
    # Name the unpacked component scale `dist_scale` so it does not shadow
    # the `scale` argument tested below.
    mix_logits, df, loc, dist_scale = distr_args
    comp_distr = StudentT(df, loc, dist_scale)
    if scale is None:
        return MixtureSameFamily(Categorical(logits=mix_logits), comp_distr)
    else:
        scaled_comp_distr = TransformedDistribution(
            comp_distr, [AffineTransform(loc=0, scale=scale)])
        return MixtureSameFamily(Categorical(logits=mix_logits),
                                 scaled_comp_distr)
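# Standalone sketch of the same construction, not part of the original code
# (names and shapes are hypothetical): a batch of 4 series, each modeled as a
# 3-component Student-t mixture. Assumes the torch.distributions imports above.
def _demo_studentt_mixture():
    mix_logits = torch.randn(4, 3)
    df = 2.0 + torch.rand(4, 3)        # df > 2 keeps the variance finite
    loc = torch.randn(4, 3)
    dist_scale = 0.1 + torch.rand(4, 3)
    mixture = MixtureSameFamily(Categorical(logits=mix_logits),
                                StudentT(df, loc, dist_scale))
    return mixture.sample()            # shape: (4,)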
def sample(self, bx, device='cuda:0'):
    # bx: 1 * num_points * 1
    # cov: 1 * num_points * num_points
    cov = self.kernel(bx)
    # The mean is created directly on `device`; a separate .cuda() call would
    # break CPU use.
    mean = torch.zeros(1, bx.shape[1], device=device)
    by = MultivariateNormal(mean, cov).rsample().unsqueeze(-1)
    if self.t_noise is not None:
        by += self.t_noise * StudentT(2.1).rsample(by.shape).to(device)
    return by
def test_smoothing(self):
    # We should be able to run the Kalman smoother over pretty much any
    # parameters without it blowing up.
    fll = FilteredLocalLevelModel(input_length=50)
    true_params = dict(γ=0., η=2., ρ=0.95, σ=1.5)
    algo_seed, data_seed = 123, 123
    torch.manual_seed(data_seed)
    y, z = fll.simulate(**true_params)
    for i in range(10):
        ζ = StudentT(df=4, loc=0, scale=10).sample((fll.d, ))
        sm = fll.kalman_smoother(y, ζ)
        for k in [
                "z_upd", "Σz_upd", "z_smooth", "Σz_smooth", "y_pred", "Σy_pred"
        ]:
            self.assertIsInstance(sm[k], torch.Tensor)
            # Tensor.any() works for any shape, unlike Python's built-in any().
            self.assertFalse(torch.isnan(sm[k]).any().item())
def test_log_prob(batch_shape, dim):
    loc = torch.randn(batch_shape + (dim, ))
    A = torch.randn(batch_shape + (dim, dim + dim))
    scale_tril = A.matmul(A.transpose(-2, -1)).cholesky()
    x = torch.randn(batch_shape + (dim, ))
    df = torch.randn(batch_shape).exp() + 2

    actual_log_prob = MultivariateStudentT(df, loc, scale_tril).log_prob(x)
    if dim == 1:
        expected_log_prob = StudentT(
            df.unsqueeze(-1), loc, scale_tril[..., 0]).log_prob(x).sum(-1)
        assert_equal(actual_log_prob, expected_log_prob)

    # Test the fact that
    # MVT(df, loc, scale)(x) = int MVN(loc, scale / m)(x) Gamma(df/2, df/2)(m) dm
    num_samples = 100000
    gamma_samples = Gamma(df / 2, df / 2).sample(sample_shape=(num_samples, ))
    mvn_scale_tril = scale_tril / gamma_samples.sqrt().unsqueeze(-1).unsqueeze(-1)
    mvn = MultivariateNormal(loc, scale_tril=mvn_scale_tril)
    expected_log_prob = mvn.log_prob(x).logsumexp(0) - math.log(num_samples)
    assert_equal(actual_log_prob, expected_log_prob, prec=0.01)
def student_parse_params(params, min_sigma=0, min_nu=3.0, multiple=False):
    """
    Take a Tensor (e.g. a neural network output) and return a
    torch.distributions.StudentT distribution. The distribution is
    component-wise independent, and its dimensionality depends on the input
    shape. The first third of the channels is the location (mu), the softplus
    of the second third is the scale (sigma), and the softplus of the last
    third is the degrees of freedom (nu), so there are no restrictions on the
    input tensor.

    min_sigma is the minimal value of sigma: if the softplus above is less
    than min_sigma, sigma is clipped from below to min_sigma; min_nu clips nu
    the same way. This regularization is required for numerical stability and
    may be considered a neural network architecture choice without any change
    to the probabilistic model.
    """
    if multiple:
        batch_size = params.shape[0]
        n = params.shape[1]
        d = params.shape[2]
        mu = params[:, :, :d // 3]
        sigma_params = params[:, :, d // 3:2 * d // 3]
        nu_params = params[:, :, 2 * d // 3:]
    else:
        n = params.shape[0]
        d = params.shape[1]
        mu = params[:, :d // 3]
        sigma_params = params[:, d // 3:2 * d // 3]
        nu_params = params[:, 2 * d // 3:]
    sigma = softplus(sigma_params).clamp(min=min_sigma)
    nu = softplus(nu_params).clamp(min=min_nu)
    distr = StudentT(nu, loc=mu, scale=sigma)
    return distr
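# Hypothetical check of the channel layout expected by student_parse_params,
# not part of the original code: with d = 9 channels, each third (3 channels)
# parameterizes mu, sigma, and nu respectively. Assumes torch and softplus are
# imported as in the snippet above.
def _demo_student_parse_params():
    params = torch.randn(16, 9)             # n = 16 rows, d = 9 channels
    distr = student_parse_params(params)    # StudentT with batch shape (16, 3)
    return distr.log_prob(torch.randn(16, 3))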
def entropy(self):
    simple_tst = StudentT_torch(self.df)
    H = self.coeff * torch.logdet(self.S) + self.d * simple_tst.entropy()
    return H