def main():
    sample_sizes = [250, 500, 1000, 2000, 10000]
    rhos = [0, 0.3, 0.6, 0.9]
    r = 2
    s = 2
    K = 50
    time_results = {rho: [{ssize: [] for ssize in sample_sizes}, None] for rho in rhos}
    all_results = []
    for rho in rhos:
        for sample_size in sample_sizes:
            cov = np.array([[1., rho], [rho, 1.]])
            dist = MultivariateNormal(mean=np.zeros(2), cov=cov)
            t_ci, t_nad, t_ml = [], [], []
            delta = lambda x: chi2.ppf(0.97, x**2 - 1)
            print(f"Timing samples {sample_size} for rho = {rho}")
            for k in range(K):
                xy_sample = dist.sample(sample_size)
                plane = Plane(xy_sample)

                # Adaptive algorithm
                t0_ad = time.time()
                ad = AdaptiveAlgorithm(xy_sample, delta, r, s).run()
                t_ci.append(time.time() - t0_ad)

                # Non-adaptive partition
                t0_nad = time.time()
                nad = NonAdaptivePartition(xy_sample, bins=[50, 50]).run()
                t_nad.append(time.time() - t0_nad)

                # Maximum-likelihood (plug-in) estimator
                t0_ml = time.time()
                ml = -np.log(1 - pearsonr(xy_sample[:, 0], xy_sample[:, 1])[0]**2) / 2
                t_ml.append(time.time() - t0_ml)

                all_results.append((ad, nad, ml))

            time_results[rho][0][sample_size] = [np.mean(t_ml), np.mean(t_ci), np.mean(t_nad)]
            print(f"Times: ML: {np.mean(t_ml)}, CI: {np.mean(t_ci)}, NAD: {np.mean(t_nad)}")

    generate_timing_table(time_results)
    print(len(all_results))
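# For reference, the "ML" quantity timed above is the Gaussian plug-in formula applied to the
# sample correlation, and the ground-truth MI used elsewhere in these scripts is the same formula
# applied to the true rho. A minimal standalone sketch (the helper names `gaussian_mi` and
# `ml_mi_estimate` are ours, not part of the scripts above):
import numpy as np
from scipy.stats import pearsonr


def gaussian_mi(rho):
    # Closed-form MI of a bivariate Gaussian with correlation rho: I(X; Y) = -0.5 * log(1 - rho^2)
    return -0.5 * np.log(1.0 - rho**2)


def ml_mi_estimate(xy_sample):
    # Plug-in ("ML") estimator: apply the closed form to the sample correlation coefficient.
    r_hat = pearsonr(xy_sample[:, 0], xy_sample[:, 1])[0]
    return -0.5 * np.log(1.0 - r_hat**2)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    rho = 0.6
    sample = rng.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]], size=2000)
    print(gaussian_mi(rho), ml_mi_estimate(sample))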
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
    if variational_inducing_covar is not None:
        raise NotImplementedError(
            "OrthogonallyDecoupledVariationalStrategy currently works with DeltaVariationalDistribution"
        )

    num_data = x.size(-2)
    full_output = self.model(torch.cat([x, inducing_points], dim=-2))
    full_mean = full_output.mean
    full_covar = full_output.lazy_covariance_matrix

    if self.training:
        induc_mean = full_mean[..., num_data:]
        induc_induc_covar = full_covar[..., num_data:, num_data:]
        self._memoize_cache["prior_distribution_memo"] = MultivariateNormal(induc_mean, induc_induc_covar)

    test_mean = full_mean[..., :num_data]
    data_induc_covar = full_covar[..., :num_data, num_data:]
    predictive_mean = (data_induc_covar @ inducing_values.unsqueeze(-1)).squeeze(-1).add(test_mean)
    predictive_covar = full_covar[..., :num_data, :num_data]

    # Return the distribution
    return MultivariateNormal(predictive_mean, predictive_covar)
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
    if variational_inducing_covar is None:
        raise RuntimeError(
            "GridInterpolationVariationalStrategy is only compatible with Gaussian variational "
            f"distributions. Got {self.variational_distribution.__class__.__name__}."
        )

    variational_distribution = self.variational_distribution

    # Get interpolations
    interp_indices, interp_values = self._compute_grid(x)

    # Compute test mean
    # Left multiply samples by interpolation matrix
    predictive_mean = left_interp(interp_indices, interp_values, inducing_values.unsqueeze(-1))
    predictive_mean = predictive_mean.squeeze(-1)

    # Compute test covar
    predictive_covar = InterpolatedLazyTensor(
        variational_distribution.lazy_covariance_matrix,
        interp_indices,
        interp_values,
        interp_indices,
        interp_values,
    )

    output = MultivariateNormal(predictive_mean, predictive_covar)
    return output
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
    # Compute full prior distribution
    full_inputs = torch.cat([inducing_points, x], dim=-2)
    full_output = self.model.forward(full_inputs)
    full_covar = full_output.lazy_covariance_matrix

    # Covariance terms
    num_induc = inducing_points.size(-2)
    test_mean = full_output.mean[..., num_induc:]
    induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter()
    induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
    data_data_covar = full_covar[..., num_induc:, num_induc:]

    # Compute interpolation terms
    # K_ZZ^{-1/2} K_ZX
    # K_ZZ^{-1/2} \mu_Z
    L = self._cholesky_factor(induc_induc_covar)
    if L.shape != induc_induc_covar.shape:
        # Aggressive caching can cause nasty shape incompatibilities when evaluating with different batch shapes
        del self._memoize_cache["cholesky_factor"]
        L = self._cholesky_factor(induc_induc_covar)
    interp_term = torch.triangular_solve(induc_data_covar.double(), L, upper=False)[0].to(full_inputs.dtype)

    # Compute the mean of q(f)
    # k_XZ K_ZZ^{-1/2} (m - K_ZZ^{-1/2} \mu_Z) + \mu_X
    predictive_mean = (
        torch.matmul(
            interp_term.transpose(-1, -2),
            (inducing_values - self.prior_distribution.mean).unsqueeze(-1),
        ).squeeze(-1)
        + test_mean
    )

    # Compute the covariance of q(f)
    # K_XX + k_XZ K_ZZ^{-1/2} (S - I) K_ZZ^{-1/2} k_ZX
    middle_term = self.prior_distribution.lazy_covariance_matrix.mul(-1)
    if variational_inducing_covar is not None:
        middle_term = SumLazyTensor(variational_inducing_covar, middle_term)

    if trace_mode.on():
        predictive_covar = (
            data_data_covar.add_jitter(1e-4).evaluate()
            + interp_term.transpose(-1, -2) @ middle_term.evaluate() @ interp_term
        )
    else:
        predictive_covar = SumLazyTensor(
            data_data_covar.add_jitter(1e-4),
            MatmulLazyTensor(interp_term.transpose(-1, -2), middle_term @ interp_term),
        )

    # Return the distribution
    return MultivariateNormal(predictive_mean, predictive_covar)
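# For context, a forward like the one above is normally reached through an ApproximateGP model that
# owns the variational strategy. A minimal sketch of that wiring, following the standard GPyTorch
# SVGP pattern (class and variable names here are illustrative, not taken from this repository):
import torch
import gpytorch


class SVGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        variational_distribution = gpytorch.variational.CholeskyVariationalDistribution(inducing_points.size(-2))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        # Prior p(f) evaluated jointly at the inducing points and the query points
        return gpytorch.distributions.MultivariateNormal(self.mean_module(x), self.covar_module(x))


# Calling the model routes through the variational strategy, which marginalizes out the inducing
# values and returns q(f | x).
model = SVGPModel(inducing_points=torch.randn(16, 1))
q_f = model(torch.linspace(0, 1, 10).unsqueeze(-1))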
def forward(self):
    # TODO: if we don't multiply self._variational_stddev by a mask of ones, Pyro models fail.
    # Not sure where this bug is occurring (in Pyro or PyTorch); throwing this in as a hotfix
    # for now. We should investigate later.
    mask = torch.ones_like(self._variational_stddev)
    variational_covar = DiagLazyTensor(self._variational_stddev.mul(mask).pow(2))
    return MultivariateNormal(self.variational_mean, variational_covar)
def forward(self, x1, x2, diag=False, **kwargs):
    covar = self._get_covariance(x1, x2)

    if self.training:
        if not torch.equal(x1, x2):
            raise RuntimeError("x1 should equal x2 in training mode")
        zero_mean = torch.zeros_like(x1.select(-1, 0))
        new_added_loss_term = InducingPointKernelAddedLossTerm(
            MultivariateNormal(zero_mean, self._covar_diag(x1)),
            MultivariateNormal(zero_mean, covar),
            self.likelihood,
        )
        self.update_added_loss_term("inducing_point_loss_term", new_added_loss_term)

    if diag:
        return covar.diag()
    else:
        return covar
def prior_distribution(self):
    """
    If desired, models can compare the input to forward to inducing_points and use a GridKernel for
    space efficiency. However, when using a default VariationalDistribution which has an O(m^2) space
    complexity anyways, we find that GridKernel is typically not worth it due to the moderate slow down
    of using FFTs.
    """
    out = super(AdditiveGridInterpolationVariationalStrategy, self).prior_distribution
    mean = out.mean.repeat(self.num_dim, 1)
    covar = out.lazy_covariance_matrix.repeat(self.num_dim, 1, 1)
    return MultivariateNormal(mean, covar)
def forward(self):
    chol_variational_covar = self.chol_variational_covar
    dtype = chol_variational_covar.dtype
    device = chol_variational_covar.device

    # First make sure the Cholesky factor is lower triangular
    lower_mask = torch.ones(self.chol_variational_covar.shape[-2:], dtype=dtype, device=device).tril(0)
    chol_variational_covar = chol_variational_covar.mul(lower_mask)

    # Now construct the actual matrix
    variational_covar = CholLazyTensor(chol_variational_covar)
    return MultivariateNormal(self.variational_mean, variational_covar)
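# The masking step above simply zeroes everything above the diagonal, so only the lower-triangular
# entries of the unconstrained parameter matrix contribute to the covariance. A tiny illustration
# with plain PyTorch (toy values, not part of the distribution class):
import torch

raw = torch.arange(1., 10.).reshape(3, 3)   # arbitrary unconstrained parameters
lower_mask = torch.ones(3, 3).tril(0)       # ones on and below the diagonal
chol = raw.mul(lower_mask)                  # lower-triangular factor (positive diagonal in this toy case)
covar = chol @ chol.transpose(-1, -2)       # implied positive semi-definite covariance L L^T
print(chol)
print(covar)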
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
    if x.ndimension() == 1:
        x = x.unsqueeze(-1)
    elif x.ndimension() != 2:
        raise RuntimeError("AdditiveGridInterpolationVariationalStrategy expects a 2d tensor.")

    num_data, num_dim = x.size()
    if num_dim != self.num_dim:
        raise RuntimeError("The number of dims should match the number specified.")

    output = super().forward(x, inducing_points, inducing_values, variational_inducing_covar)

    if self.sum_output:
        if variational_inducing_covar is not None:
            mean = output.mean.sum(0)
            covar = output.lazy_covariance_matrix.sum(-3)
            return MultivariateNormal(mean, covar)
        else:
            return Delta(output.mean.sum(0))
    else:
        return output
def forward(self, x):
    r"""
    The :func:`~gpytorch.variational.VariationalStrategy.forward` method determines how to marginalize out
    the inducing point function values. Specifically, forward defines how to transform a variational
    distribution over the inducing point values, :math:`q(u)`, into a variational distribution over the
    function values at specified locations x, :math:`q(f|x)`, by integrating :math:`\int p(f|x, u)q(u)du`

    :param torch.Tensor x: Locations x to get the variational posterior of the function values at.
    :rtype: ~gpytorch.distributions.MultivariateNormal
    :return: The distribution :math:`q(f|x)`
    """
    variational_dist = self.variational_distribution
    inducing_points = self.inducing_points
    if inducing_points.dim() < x.dim():
        inducing_points = inducing_points.expand(*x.shape[:-2], *inducing_points.shape[-2:])
    if len(variational_dist.batch_shape) < x.dim() - 2:
        variational_dist = variational_dist.expand(x.shape[:-2])

    # If our points equal the inducing points, we're done
    if torch.equal(x, inducing_points):
        # De-whiten the prior covar
        prior_covar = self.prior_distribution.lazy_covariance_matrix
        if isinstance(variational_dist.lazy_covariance_matrix, RootLazyTensor):
            predictive_covar = RootLazyTensor(
                prior_covar @ variational_dist.lazy_covariance_matrix.root.evaluate()
            )
        else:
            predictive_covar = MatmulLazyTensor(prior_covar @ variational_dist.covariance_matrix, prior_covar)

        # Cache some values for the KL divergence
        if self.training:
            self._mean_diff_inv_quad_memo, self._logdet_memo = prior_covar.inv_quad_logdet(
                (variational_dist.mean - self.prior_distribution.mean), logdet=True
            )

        return MultivariateNormal(variational_dist.mean, predictive_covar)

    # Otherwise, we have to marginalize
    else:
        num_induc = inducing_points.size(-2)
        full_inputs = torch.cat([inducing_points, x], dim=-2)
        full_output = self.model.forward(full_inputs)
        full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

        # Mean terms
        test_mean = full_mean[..., num_induc:]
        induc_mean = full_mean[..., :num_induc]
        mean_diff = (variational_dist.mean - induc_mean).unsqueeze(-1)

        # Covariance terms
        induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter()
        induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
        data_data_covar = full_covar[..., num_induc:, num_induc:]

        # If we're less than a certain size, we'll compute the Cholesky decomposition of induc_induc_covar
        cholesky = False
        if settings.fast_computations.log_prob.off() or (num_induc <= settings.max_cholesky_size.value()):
            induc_induc_covar = CholLazyTensor(induc_induc_covar.cholesky())
            cholesky = True

        # Cache the CG results
        # Do not use preconditioning for whitened VI, as it does not seem to improve performance.
        with settings.max_preconditioner_size(0):
            with torch.no_grad():
                eager_rhs = torch.cat([induc_data_covar, mean_diff], -1)
                solve, probe_vecs, probe_vec_norms, probe_vec_solves, tmats = CachedCGLazyTensor.precompute_terms(
                    induc_induc_covar,
                    eager_rhs.detach(),
                    logdet_terms=(not cholesky),
                    include_tmats=(not settings.skip_logdet_forward.on() and not cholesky),
                )
                eager_rhss = [eager_rhs.detach()]
                solves = [solve.detach()]
                if settings.skip_logdet_forward.on() and self.training:
                    eager_rhss.append(torch.cat([probe_vecs, eager_rhs], -1))
                    solves.append(torch.cat([probe_vec_solves, solve[..., :eager_rhs.size(-1)]], -1))
                elif not self.training:
                    eager_rhss.append(eager_rhs[..., :-1])
                    solves.append(solve[..., :-1])

            induc_induc_covar = CachedCGLazyTensor(
                induc_induc_covar,
                eager_rhss=eager_rhss,
                solves=solves,
                probe_vectors=probe_vecs,
                probe_vector_norms=probe_vec_norms,
                probe_vector_solves=probe_vec_solves,
                probe_vector_tmats=tmats,
            )

        # Compute some terms that will be necessary for the predictive covariance and KL divergence
        if self.training:
            interp_data_data_var_plus_mean_diff_inv_quad, logdet = induc_induc_covar.inv_quad_logdet(
                torch.cat([induc_data_covar, mean_diff], -1), logdet=True, reduce_inv_quad=False
            )
            interp_data_data_var = interp_data_data_var_plus_mean_diff_inv_quad[..., :-1]
            mean_diff_inv_quad = interp_data_data_var_plus_mean_diff_inv_quad[..., -1]

        # Compute predictive mean
        predictive_mean = torch.add(
            test_mean,
            induc_induc_covar.inv_matmul(mean_diff, left_tensor=induc_data_covar.transpose(-1, -2)).squeeze(-1),
        )

        # Compute the predictive covariance
        is_root_lt = isinstance(variational_dist.lazy_covariance_matrix, RootLazyTensor)
        is_repeated_root_lt = isinstance(
            variational_dist.lazy_covariance_matrix, BatchRepeatLazyTensor
        ) and isinstance(variational_dist.lazy_covariance_matrix.base_lazy_tensor, RootLazyTensor)
        if is_root_lt:
            predictive_covar = RootLazyTensor(
                induc_data_covar.transpose(-1, -2) @ variational_dist.lazy_covariance_matrix.root.evaluate()
            )
        elif is_repeated_root_lt:
            predictive_covar = RootLazyTensor(
                induc_data_covar.transpose(-1, -2)
                @ variational_dist.lazy_covariance_matrix.root_decomposition().root.evaluate()
            )
        else:
            predictive_covar = MatmulLazyTensor(
                induc_data_covar.transpose(-1, -2),
                variational_dist.lazy_covariance_matrix @ induc_data_covar,
            )

        if self.training:
            data_covariance = DiagLazyTensor(
                (data_data_covar.diag() - interp_data_data_var).clamp(0, math.inf)
            )
        else:
            neg_induc_data_data_covar = torch.matmul(
                induc_data_covar.transpose(-1, -2).mul(-1), induc_induc_covar.inv_matmul(induc_data_covar)
            )
            data_covariance = data_data_covar + neg_induc_data_data_covar
        predictive_covar = PsdSumLazyTensor(predictive_covar, data_covariance)

        # Save the logdet, mean_diff_inv_quad, prior distribution for the ELBO
        if self.training:
            self._memoize_cache["prior_distribution_memo"] = MultivariateNormal(induc_mean, induc_induc_covar)
            self._memoize_cache["logdet_memo"] = -logdet
            self._memoize_cache["mean_diff_inv_quad_memo"] = mean_diff_inv_quad

        return MultivariateNormal(predictive_mean, predictive_covar)
def prior_distribution(self):
    out = self.model.forward(self.inducing_points)
    res = MultivariateNormal(out.mean, out.lazy_covariance_matrix.add_jitter())
    return res
def main():
    sample_sizes = [250, 500, 1000, 2000, 10000]
    rhos = [0, 0.3, 0.6, 0.9]
    K = 20
    rs_2 = []
    rs_4 = []
    rs_5 = []
    rs_10 = []
    results = {rho: [{ssize: [] for ssize in sample_sizes}, None] for rho in rhos}
    for rho in rhos:
        rs_2_std = []
        rs_4_std = []
        rs_5_std = []
        rs_10_std = []
        real_mi = -np.log(1 - rho**2) / 2
        results[rho][1] = real_mi
        for sample_size in sample_sizes:
            cov = np.array([[1., rho], [rho, 1.]])
            dist = MultivariateNormal(mean=np.zeros(2), cov=cov)
            rs_2_l, rs_4_l, rs_5_l, rs_10_l = [], [], [], []
            delta = lambda x: chi2.ppf(0.97, x**2 - 1)
            for k in range(K):
                xy_sample = dist.sample(sample_size)

                # Adaptive algorithm with increasing partition orders r = s
                plane = Plane(xy_sample)
                rs_2_l.append(kl_estimate(plane, AdaptiveAlgorithm(xy_sample, delta, 2, 2).run()))
                plane = Plane(xy_sample)
                rs_4_l.append(kl_estimate(plane, AdaptiveAlgorithm(xy_sample, delta, 4, 4).run()))
                plane = Plane(xy_sample)
                rs_5_l.append(kl_estimate(plane, AdaptiveAlgorithm(xy_sample, delta, 5, 5).run()))
                plane = Plane(xy_sample)
                rs_10_l.append(kl_estimate(plane, AdaptiveAlgorithm(xy_sample, delta, 10, 10).run()))

            results[rho][0][sample_size] = [
                np.mean(rs_2_l), np.mean(rs_4_l), np.mean(rs_5_l), np.mean(rs_10_l)
            ]
            print("---------------------------------------------------------------------------------------------")
            print("rho: %.2f, Sample Size: %d, Real MI: %.4f" % (rho, sample_size, real_mi))
            print("r=s=2: %.4f, r=s=4: %.4f, r=s=5: %.4f, r=s=10: %.4f" %
                  (np.mean(rs_2_l), np.mean(rs_4_l), np.mean(rs_5_l), np.mean(rs_10_l)))
            rs_2_std.append(np.std(rs_2_l))
            rs_4_std.append(np.std(rs_4_l))
            rs_5_std.append(np.std(rs_5_l))
            rs_10_std.append(np.std(rs_10_l))
        rs_2.append(rs_2_std)
        rs_4.append(rs_4_std)
        rs_5.append(rs_5_std)
        rs_10.append(rs_10_std)

    generate_rs_table(results)

    all_std = [rs_2, rs_4, rs_5, rs_10]
    for i, label in enumerate(["r=s=2", "r=s=4", "r=s=5", "r=s=10"]):
        plt.figure()
        plt.semilogx(sample_sizes, all_std[i][0], '-o', label=r'$\rho$ = 0.0')
        plt.semilogx(sample_sizes, all_std[i][1], '-o', label=r'$\rho$ = 0.3')
        plt.semilogx(sample_sizes, all_std[i][2], '-o', label=r'$\rho$ = 0.6')
        plt.semilogx(sample_sizes, all_std[i][3], '-o', label=r'$\rho$ = 0.9')
        plt.xlabel(r'$\log_{10}$ of sample size')
        plt.ylabel(r"std($\hat{I}_{CI}^{%s}$)" % label)
        plt.title("Standard deviation of MI estimator $I_{CI}$ with $%s$" % label)
        plt.legend()
        plt.show()
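# The splitting criterion delta used in every run above is the 0.97 quantile of a chi-square
# distribution with x**2 - 1 degrees of freedom. Factoring the inline lambda into a named helper
# makes the intent explicit (a sketch; the name `chi2_threshold` is ours, not from these scripts):
from scipy.stats import chi2


def chi2_threshold(x, confidence=0.97):
    # `confidence` quantile of a chi-square distribution with x**2 - 1 degrees of freedom,
    # used by the adaptive partitioning algorithm to decide whether to keep splitting a cell.
    return chi2.ppf(confidence, x**2 - 1)


delta = chi2_threshold  # drop-in replacement for the inline lambda used above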
def marginal(self, function_dist: MultivariateNormal, *params: Any, **kwargs: Any) -> MultivariateNormal:
    mean, covar = function_dist.mean, function_dist.lazy_covariance_matrix
    noise_covar = self._shaped_noise_covar(mean.shape, *params, **kwargs)
    full_covar = covar + noise_covar
    return function_dist.__class__(mean, full_covar)
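# In the Gaussian case, marginal() turns q(f) = N(mean, K) into p(y) = N(mean, K + noise covariance).
# A short, self-contained usage sketch with the standard GPyTorch likelihood (the toy distribution
# below is illustrative; calling the likelihood on a MultivariateNormal dispatches to marginal()):
import torch
import gpytorch

likelihood = gpytorch.likelihoods.GaussianLikelihood()
# A toy latent-function distribution over 5 points with unit variance
q_f = gpytorch.distributions.MultivariateNormal(
    torch.zeros(5), gpytorch.lazy.DiagLazyTensor(torch.ones(5))
)
p_y = likelihood(q_f)
print(p_y.covariance_matrix.diag())  # each entry is 1 plus the current noise value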
def main():
    sample_sizes = [250, 500, 1000, 2000, 10000]
    rhos = [0, 0.3, 0.6, 0.9]
    r = 2
    s = 2
    K = 50
    ci_mi_all_std = []
    na_mi_all_std = []
    ml_mi_all_std = []
    results = {rho: [{ssize: [] for ssize in sample_sizes}, None] for rho in rhos}
    for rho in rhos:
        ci_mi_std = []
        na_mi_std = []
        ml_mi_std = []
        real_mi = -np.log(1 - rho**2) / 2
        results[rho][1] = real_mi
        for sample_size in sample_sizes:
            cov = np.array([[1., rho], [rho, 1.]])
            dist = MultivariateNormal(mean=np.zeros(2), cov=cov)
            ci_mi_l, ml_mi_l, na_mi_l = [], [], []
            delta = lambda x: chi2.ppf(0.97, x**2 - 1)
            for k in range(K):
                xy_sample = dist.sample(sample_size)
                plane = Plane(xy_sample)

                # Adaptive algorithm
                ci_mi_l.append(kl_estimate(plane, AdaptiveAlgorithm(xy_sample, delta, r, s).run()))
                # Non-adaptive partition
                na_mi_l.append(kl_estimate(plane, NonAdaptivePartition(xy_sample, bins=[50, 50]).run()))
                # Maximum-likelihood (plug-in) estimator
                ml_mi_l.append(-np.log(1 - pearsonr(xy_sample[:, 0], xy_sample[:, 1])[0]**2) / 2)

            results[rho][0][sample_size] = [np.mean(ml_mi_l), np.mean(ci_mi_l), np.mean(na_mi_l)]
            print("---------------------------------------------------------------------------------------------")
            print("rho: %.2f, Sample Size: %d, Real MI: %.4f" % (rho, sample_size, real_mi))
            print("Adaptive Partition MI: %.4f, NA Partition MI: %.4f, ML MI: %.4f" %
                  (np.mean(ci_mi_l), np.mean(na_mi_l), np.mean(ml_mi_l)))
            ci_mi_std.append(np.std(ci_mi_l))
            na_mi_std.append(np.std(na_mi_l))
            ml_mi_std.append(np.std(ml_mi_l))
        ci_mi_all_std.append(ci_mi_std)
        na_mi_all_std.append(na_mi_std)
        ml_mi_all_std.append(ml_mi_std)

    generate_table(results)

    all_std = [ci_mi_all_std, na_mi_all_std, ml_mi_all_std]
    for i, name in enumerate(["CI", "NA", "ML"]):
        plt.figure()
        plt.semilogx(sample_sizes, all_std[i][0], '-o', label=r'$\rho$ = 0.0')
        plt.semilogx(sample_sizes, all_std[i][1], '-o', label=r'$\rho$ = 0.3')
        plt.semilogx(sample_sizes, all_std[i][2], '-o', label=r'$\rho$ = 0.6')
        plt.semilogx(sample_sizes, all_std[i][3], '-o', label=r'$\rho$ = 0.9')
        plt.xlabel(r'$\log_{10}$ of sample size')
        plt.ylabel(r"std($\hat{I}_{%s}$)" % name)
        plt.title("Standard deviation of MI estimator $I_{%s}$" % name)
        plt.legend()
        plt.show()
def main():
    # Load image
    im = Image.open(image_file).convert('RGB')
    width, height = im.size

    # Convenience function to build image band-by-band from array data
    def image_from_array(dat):
        bands = [Image.new('L', (width, height)) for n in range(3)]
        for i in range(3):
            bands[i].putdata(dat[:, i])
        return Image.merge('RGB', bands)

    # Resize image
    width, height = int(width / image_rescale), int(height / image_rescale)
    im = im.resize((width, height))

    # Summary image
    summary = Image.new('RGB', (width * 2 + 40, height * 2 + 60), (255, 255, 255))
    draw = ImageDraw.Draw(summary)
    draw.text((5, height + 10), 'Original', fill=(0, 0, 0))
    draw.text((width + 25, height + 10), 'Noise V = %.2f, C = %.2f' % (noise_var, noise_cov), fill=(0, 0, 0))
    draw.text((5, 2 * height + 40), 'Blocked Gamma', fill=(0, 0, 0))
    draw.text((width + 25, 2 * height + 40), 'Dists', fill=(0, 0, 0))
    del draw
    summary.paste(im, (10, 10))

    # Flatten to emissions
    real_emissions = list(im.getdata())
    num_data = len(real_emissions)
    real_emissions = np.array(real_emissions)

    # Block emissions
    width_blocks = np.array_split(np.arange(width), block_splits)
    height_blocks = np.array_split(np.arange(height), block_splits)
    idx = np.arange(num_data)
    idx.resize((height, width))
    blocks = []
    for hb in height_blocks:
        for wb in width_blocks:
            block = [idx[h, w] for h in hb for w in wb]
            blocks.append(np.array(block))

    # Generate noise
    v, c = noise_var, noise_cov
    cov = [[v, c, c], [c, v, c], [c, c, v]]
    noise = np.random.multivariate_normal([0, 0, 0], cov, width * height)
    noisy_emissions = real_emissions + noise

    # Generate noisy image
    noisy = image_from_array(noisy_emissions)
    summary.paste(noisy, (30 + width, 10))

    # Use K-means to initialize components
    results = kmeans(noisy_emissions, num_comps)
    init_gamma = results['best']
    means = results['means']

    # Analyze color space
    if do_colormap:
        col = {'R': 0, 'G': 1, 'B': 2}
        plt.figure()
        for i, (d, c1, c2) in enumerate([(real_emissions, 'R', 'G'), (real_emissions, 'R', 'B'),
                                         (real_emissions, 'G', 'B'), (noisy_emissions, 'R', 'G'),
                                         (noisy_emissions, 'R', 'B'), (noisy_emissions, 'G', 'B')]):
            plt.subplot(2, 3, i + 1)
            plt.hexbin(d[:, col[c1]], d[:, col[c2]], gridsize=30, extent=(0, 255, 0, 255))
            plt.plot(means[:, col[c1]], means[:, col[c2]], '.k')
            plt.xlabel(c1)
            plt.ylabel(c2)
            plt.axis([-20, 275, -20, 275])
        plt.savefig('image_test_color_colormap.png')
        plt.show()

    # Do EM
    results = em(noisy_emissions, [MultivariateNormal() for n in range(num_comps)],
                 count_restart=count_restart, blocks=blocks, max_reps=100,
                 init_gamma=init_gamma, trace=True, pi_max=pi_max)
    dists = results['dists']
    dists_trace = results['dists_trace']
    pi = results['pi']
    print('Iterations: %(reps)d' % results)
    gamma = np.transpose(results['gamma'])
    means = np.array([d.mean() for d in dists])
    covs = np.array([d.cov() for d in dists])

    # Reconstruct with blocked gamma
    rec_blocked_gamma = np.array([np.average(means, weights=g, axis=0) for g in gamma])
    im_blocked_gamma = image_from_array(rec_blocked_gamma)
    summary.paste(im_blocked_gamma, (10, 40 + height))

    # Reconstruct from distributions alone
    pi_opt = pi_maximize(noisy_emissions, dists)
    phi = np.empty((num_data, num_comps))
    for c in range(num_comps):
        phi[:, c] = dists[c].density(noisy_emissions)
    phi = np.matrix(phi)
    for i, pi in enumerate(pi_opt):
        phi[:, i] *= pi
    gamma_dists = phi / np.sum(phi, axis=1)
    rec_dists = np.array(np.dot(gamma_dists, means))
    im_dists = image_from_array(rec_dists)
    summary.paste(im_dists, (30 + width, 40 + height))

    # Show summary image
    if show_summary:
        summary.show()
    summary.save('image_test_color_reconstruction.png')

    # Compare RMSE between reconstructions
    def rmse(x):
        return np.sqrt(np.mean((x - real_emissions)**2))

    print('Raw RMSE: %.1f' % rmse(noisy_emissions))
    print('Blocked Gamma RMSE: %.1f' % rmse(rec_blocked_gamma))
    print('Dists RMSE: %.1f' % rmse(rec_dists))

    # Visualize variance components
    if do_variance_viz:
        temp_files = []
        col = {'R': 0, 'G': 1, 'B': 2}
        fig = plt.figure()
        for i, (d, c1, c2) in enumerate([(real_emissions, 'R', 'G'), (real_emissions, 'R', 'B'),
                                         (real_emissions, 'G', 'B'), (noisy_emissions, 'R', 'G'),
                                         (noisy_emissions, 'R', 'B'), (noisy_emissions, 'G', 'B')]):
            ax = fig.add_subplot(2, 3, i + 1)
            plt.hexbin(d[:, col[c1]], d[:, col[c2]], gridsize=30, extent=(0, 255, 0, 255))
            plt.xlabel(c1)
            plt.ylabel(c2)
            plt.axis([-20, 275, -20, 275])
        for idx, dists in enumerate(dists_trace):
            ells = []
            for i, (d, c1, c2) in enumerate([(real_emissions, 'R', 'G'), (real_emissions, 'R', 'B'),
                                             (real_emissions, 'G', 'B'), (noisy_emissions, 'R', 'G'),
                                             (noisy_emissions, 'R', 'B'), (noisy_emissions, 'G', 'B')]):
                for dist in dists:
                    m, c = dist.mean(), dist.cov()
                    cm = (c[[col[c1], col[c2]]])[:, [col[c1], col[c2]]]
                    e, v = la.eigh(cm)
                    ell = Ellipse(xy=[m[col[c1]], m[col[c2]]],
                                  width=np.sqrt(e[0]),
                                  height=np.sqrt(e[1]),
                                  angle=(180.0 / np.pi) * np.arccos(v[0, 0]))
                    ells.append(ell)
                    ax = fig.add_subplot(2, 3, i + 1)
                    ax.add_artist(ell)
                    ell.set_clip_box(ax.bbox)
                    ell.set_alpha(0.9)
                    ell.set_facecolor(np.fmax(np.fmin(m / 255, 1), 0))
            file_name = 'tmp_%03d.png' % idx
            temp_files.append(file_name)
            plt.savefig(file_name, dpi=100)
            for ell in ells:
                ell.remove()
        command = ('mencoder', 'mf://tmp_*.png', '-mf', 'type=png:w=800:h=600:fps=5',
                   '-ovc', 'lavc', '-lavcopts', 'vcodec=mpeg4', '-oac', 'copy',
                   '-o', 'image_test_color_components.avi')
        os.spawnvp(os.P_WAIT, 'mencoder', command)
        for temp_file in temp_files:
            os.unlink(temp_file)

    # Find common variance components
    print('True noise:')
    print(cov)
    chols = [la.cholesky(c) for c in covs]
    chol_recon = np.zeros((3, 3))
    for i in range(3):
        for j in range(3):
            if j > i:
                continue
            chol_recon[i, j] = np.inf
            for chol in chols:
                if abs(chol[i, j]) < abs(chol_recon[i, j]):
                    chol_recon[i, j] = chol[i, j]
    cov_recon = np.dot(chol_recon, np.transpose(chol_recon))
    print('Reconstructed noise:')
    print(cov_recon)
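# The shared-noise heuristic at the end of main() takes, entry by entry, the smallest-magnitude
# lower-triangular Cholesky coefficient across the fitted components and rebuilds a covariance from
# it. A compact, vectorized restatement of that step on toy inputs (the helper name `common_cholesky`
# is ours, not part of the script above):
import numpy as np
import numpy.linalg as la


def common_cholesky(covs):
    # Entry-wise minimum-magnitude lower-triangular Cholesky factor across components.
    chols = np.stack([la.cholesky(c) for c in covs])        # shape (num_comps, d, d)
    pick = np.argmin(np.abs(chols), axis=0)                  # which component is smallest per entry
    chol = np.take_along_axis(chols, pick[None, ...], axis=0)[0]
    return np.tril(chol)


covs = [np.array([[2.0, 0.5, 0.0], [0.5, 1.5, 0.2], [0.0, 0.2, 1.0]]),
        np.array([[1.0, 0.3, 0.1], [0.3, 2.0, 0.4], [0.1, 0.4, 1.5]])]
chol_recon = common_cholesky(covs)
cov_recon = chol_recon @ chol_recon.T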
def prior_distribution(self):
    zeros = torch.zeros_like(self.variational_distribution.mean)
    ones = torch.ones_like(zeros)
    res = MultivariateNormal(zeros, DiagLazyTensor(ones))
    return res
def forward(self, x, inducing_points, inducing_values, variational_inducing_covar=None):
    # If our points equal the inducing points, we're done
    if torch.equal(x, inducing_points):
        if variational_inducing_covar is None:
            raise RuntimeError("Expected a variational_inducing_covar for a Gaussian variational distribution.")
        else:
            return MultivariateNormal(inducing_values, variational_inducing_covar)

    # Otherwise, we have to marginalize
    num_induc = inducing_points.size(-2)
    full_inputs = torch.cat([inducing_points, x], dim=-2)
    full_output = self.model.forward(full_inputs)
    full_mean, full_covar = full_output.mean, full_output.lazy_covariance_matrix

    # Mean terms
    test_mean = full_mean[..., num_induc:]
    induc_mean = full_mean[..., :num_induc]
    mean_diff = (inducing_values - induc_mean).unsqueeze(-1)

    # Covariance terms
    induc_induc_covar = full_covar[..., :num_induc, :num_induc].add_jitter()
    induc_data_covar = full_covar[..., :num_induc, num_induc:].evaluate()
    data_data_covar = full_covar[..., num_induc:, num_induc:]

    # If we're less than a certain size, we'll compute the Cholesky decomposition of induc_induc_covar
    cholesky = False
    if settings.fast_computations.log_prob.off() or (num_induc <= settings.max_cholesky_size.value()):
        induc_induc_covar = CholLazyTensor(self._cholesky_factor(induc_induc_covar))
        cholesky = True

    # If we are making predictions and don't need variances, we can do things very quickly.
    if not self.training and settings.skip_posterior_variances.on():
        if not hasattr(self, "_mean_cache"):
            # For now: run variational inference without a preconditioner
            # The preconditioner screws things up for some reason
            with settings.max_preconditioner_size(0):
                self._mean_cache = induc_induc_covar.inv_matmul(mean_diff).detach()
        predictive_mean = torch.add(
            test_mean, induc_data_covar.transpose(-2, -1).matmul(self._mean_cache).squeeze(-1)
        )
        predictive_covar = ZeroLazyTensor(test_mean.size(-1), test_mean.size(-1))
        return MultivariateNormal(predictive_mean, predictive_covar)

    # Expand everything to the right size
    shapes = [mean_diff.shape[:-1], induc_data_covar.shape[:-1], induc_induc_covar.shape[:-1]]
    if variational_inducing_covar is not None:
        root_variational_covar = variational_inducing_covar.root_decomposition().root.evaluate()
        shapes.append(root_variational_covar.shape[:-1])
    shape = _mul_broadcast_shape(*shapes)
    mean_diff = mean_diff.expand(*shape, mean_diff.size(-1))
    induc_data_covar = induc_data_covar.expand(*shape, induc_data_covar.size(-1))
    induc_induc_covar = induc_induc_covar.expand(*shape, induc_induc_covar.size(-1))
    if variational_inducing_covar is not None:
        root_variational_covar = root_variational_covar.expand(*shape, root_variational_covar.size(-1))

    # Cache the CG results
    # For now: run variational inference without a preconditioner
    # The preconditioner screws things up for some reason
    with settings.max_preconditioner_size(0):
        # Cache the CG results
        if variational_inducing_covar is None:
            left_tensors = mean_diff
        else:
            left_tensors = torch.cat([mean_diff, root_variational_covar], -1)

        with torch.no_grad():
            eager_rhs = torch.cat([left_tensors, induc_data_covar], -1)
            solve, probe_vecs, probe_vec_norms, probe_vec_solves, tmats = CachedCGLazyTensor.precompute_terms(
                induc_induc_covar,
                eager_rhs.detach(),
                logdet_terms=(not cholesky),
                include_tmats=(not settings.skip_logdet_forward.on() and not cholesky),
            )
            eager_rhss = [
                eager_rhs.detach(),
                eager_rhs[..., left_tensors.size(-1):].detach(),
                eager_rhs[..., :left_tensors.size(-1)].detach(),
            ]
            solves = [
                solve.detach(),
                solve[..., left_tensors.size(-1):].detach(),
                solve[..., :left_tensors.size(-1)].detach(),
            ]
            if settings.skip_logdet_forward.on():
                eager_rhss.append(torch.cat([probe_vecs, left_tensors], -1))
                solves.append(torch.cat([probe_vec_solves, solve[..., :left_tensors.size(-1)]], -1))

        induc_induc_covar = CachedCGLazyTensor(
            induc_induc_covar,
            eager_rhss=eager_rhss,
            solves=solves,
            probe_vectors=probe_vecs,
            probe_vector_norms=probe_vec_norms,
            probe_vector_solves=probe_vec_solves,
            probe_vector_tmats=tmats,
        )

    # Cache the kernel matrix with the cached CG calls
    if self.training:
        self._memoize_cache["prior_distribution_memo"] = MultivariateNormal(induc_mean, induc_induc_covar)

    # Compute predictive mean
    inv_products = induc_induc_covar.inv_matmul(induc_data_covar, left_tensors.transpose(-1, -2))
    predictive_mean = torch.add(test_mean, inv_products[..., 0, :])

    # Compute covariance
    if self.training:
        interp_data_data_var, _ = induc_induc_covar.inv_quad_logdet(
            induc_data_covar, logdet=False, reduce_inv_quad=False
        )
        data_covariance = DiagLazyTensor((data_data_covar.diag() - interp_data_data_var).clamp(0, math.inf))
    else:
        neg_induc_data_data_covar = torch.matmul(
            induc_data_covar.transpose(-1, -2).mul(-1), induc_induc_covar.inv_matmul(induc_data_covar)
        )
        data_covariance = data_data_covar + neg_induc_data_data_covar
    predictive_covar = PsdSumLazyTensor(
        RootLazyTensor(inv_products[..., 1:, :].transpose(-1, -2)), data_covariance
    )

    # Done!
    return MultivariateNormal(predictive_mean, predictive_covar)
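# The fast path above is only taken when posterior variances are explicitly skipped. A usage sketch
# of how that mode is typically enabled at prediction time (the `model` and `test_x` objects are
# assumed to be defined elsewhere, e.g. an ApproximateGP using this strategy and a tensor of test
# inputs):
import torch
import gpytorch

model.eval()
with torch.no_grad(), gpytorch.settings.skip_posterior_variances(True):
    # Only the predictive mean is computed; the returned covariance is a ZeroLazyTensor.
    preds = model(test_x)
    mean = preds.mean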