def get_dist(self, timesteps, samples=1, batch_size=1, fixed=False):
    """Sample a trajectory step by step and build the matching joint distribution.

    At every step the mean is ``self._e_c + self._phi * previous_sample``.
    The first step uses ``self._e_scale`` as its scale, every later step uses
    ``self._p_scale``. The "previous sample" of the first step is all zeros.

    Args:
        timesteps: Number of steps to roll out.
        samples: Size of the leading axis of the initial zero sample.
        batch_size: Size of the second axis of the initial zero sample.
        fixed: Accepted but not used by this implementation.

    Returns:
        ``(sample, dist)``: the per-step draws concatenated along axis 2, and
        a ``tfd.Independent`` (one reinterpreted batch dim) wrapping a
        multivariate normal built from the concatenated locs and scales.
    """

    def _make_mvn(mvn_loc, mvn_scale):
        # Full lower-triangular scale when off-diagonal terms are modelled.
        if self._offdiag:
            return tfd.MultivariateNormalTriL(loc=mvn_loc, scale_tril=mvn_scale)
        return tfd.MultivariateNormalDiag(loc=mvn_loc, scale_diag=mvn_scale)

    # Add a time dimension
    base_loc = tf.expand_dims(self._e_c, 0)
    first_scale = tf.expand_dims(self._e_scale, 0)
    later_scale = tf.expand_dims(self._p_scale, 0)

    # Zero "previous sample", tiled over the sample and batch axes.
    previous = tf.expand_dims(tf.expand_dims(tf.zeros_like(base_loc), 0), 0)
    previous = tf.tile(previous, [samples, batch_size, 1, 1])

    step_locs, step_scales, draws = [], [], []
    for step in range(timesteps):
        step_loc = base_loc + self._phi * previous
        step_scale = first_scale if step == 0 else later_scale
        step_locs.append(step_loc)
        step_scales.append(step_scale)
        previous = _make_mvn(step_loc, step_scale).sample()
        draws.append(previous)

    sample = tf.concat(draws, axis=2)
    loc = tf.concat(step_locs, axis=2)
    # NOTE(review): axis -2 is the time axis for the diagonal case; confirm
    # the resulting shape is the intended one when self._offdiag is set.
    scale = tf.concat(step_scales, axis=-2)

    dist = tfd.Independent(_make_mvn(loc, scale), reinterpreted_batch_ndims=1)
    return sample, dist
def generate_fa_data(n_sample, n_factor, n_item, ld, psi=None, rho=None, dtype=tf.float64):
    """Draw samples from a simple-structure factor-analysis model.

    Items are split evenly across factors (``n_item`` is rounded down to a
    multiple of ``n_factor``); each item loads on exactly one factor with
    loading ``ld``.

    Args:
        n_sample: Number of observations to draw.
        n_factor: Number of latent factors.
        n_item: Requested number of items.
        ld: Loading value shared by all non-zero loadings.
        psi: Uniqueness shared by all items. When ``None`` the uniqueness is
            ``1 - diag(loading @ cor @ loading^T)``.
        rho: Common correlation between factors. When ``None`` the factors
            are uncorrelated.
        dtype: TensorFlow dtype of all created tensors.

    Returns:
        The result of sampling ``n_sample`` draws from the implied
        zero-mean multivariate normal.
    """
    per_factor = n_item // n_factor
    # Equivalent to truncating only when n_item is not a multiple of n_factor.
    n_item = n_factor * per_factor

    pattern = np.zeros((n_item, n_factor))
    for factor in range(n_factor):
        pattern[factor * per_factor:(factor + 1) * per_factor, factor] = ld
    loading = tf.constant(pattern, dtype=dtype)

    if rho is not None:
        ones_col = tf.ones((n_factor, 1), dtype=dtype)
        cor = (rho * (ones_col @ tf.transpose(ones_col))
               + (1 - rho) * tf.eye(n_factor, dtype=dtype))
    else:
        cor = tf.eye(n_factor, dtype=dtype)

    # Covariance explained by the common factors.
    common = loading @ cor @ tf.transpose(loading)
    if psi is not None:
        uniqueness = psi * tf.ones((n_item, ), dtype=dtype)
    else:
        uniqueness = 1 - tf.linalg.diag_part(common)

    cov = common + tf.linalg.diag(uniqueness)
    sampler = tfd.MultivariateNormalTriL(
        loc=tf.zeros(n_item, dtype=dtype),
        scale_tril=tf.linalg.cholesky(cov))
    return sampler.sample(n_sample)
def generate_2pl_data(n_sample, n_factor, n_item, alpha, beta, rho, dtype = tf.float64):
    """Draw binary responses from a simple-structure two-parameter logistic model.

    Items are split evenly across factors (``n_item`` is rounded down to a
    multiple of ``n_factor``); each item loads on exactly one factor.

    Args:
        n_sample: Number of respondents to draw.
        n_factor: Number of latent factors.
        n_item: Requested number of items.
        alpha: Intercept shared by all items.
        beta: Loading shared by all non-zero entries of the loading matrix.
        rho: Common correlation between factors, or ``None`` for
            uncorrelated factors.
        dtype: TensorFlow dtype of all created tensors and of the samples.

    Returns:
        Bernoulli samples drawn with logits
        ``intercept + eta @ loading^T``.
    """
    if (n_item % n_factor) != 0:
        n_item = n_factor * (n_item // n_factor)
    item_per_factor = (n_item // n_factor)
    intercept = tf.fill((n_item,), value = tf.constant(alpha, dtype = dtype))
    loading = np.zeros((n_item, n_factor))
    for i in range(n_factor):
        for j in range(i * item_per_factor, (i + 1) * item_per_factor):
            # The original read an undefined name `ld` here, while the
            # `beta` parameter was never used.
            loading[j, i] = beta
    loading = tf.constant(loading, dtype = dtype)
    if rho is None:
        cor = tf.eye(n_factor, dtype = dtype)
    else:
        unit = tf.ones((n_factor, 1), dtype = dtype)
        identity = tf.eye(n_factor, dtype = dtype)
        cor = rho * (unit @ tf.transpose(unit)) + (1 - rho) * identity
    dist_eta = tfd.MultivariateNormalTriL(
        loc = tf.zeros(n_factor, dtype = dtype),
        scale_tril = tf.linalg.cholesky(cor))
    eta = dist_eta.sample(n_sample)
    logits = intercept + eta @ tf.transpose(loading)
    x = tfd.Bernoulli(logits=logits, dtype=dtype).sample()
    return x
def _build(self, inputs):
    """Build the output multivariate normal for `inputs`.

    Creates the mean/covariance layers, registers the contractive
    regularizer, stores the mean on ``self.mean`` and returns a
    ``tfd.MultivariateNormalTriL`` that also exposes a
    ``reconstruction_node()`` method returning its mean.
    """
    mean, covariance, _scale, chol = self.create_mean_n_cov_layers(inputs)

    self.set_contractive_regularizer(
        mean,
        covariance,
        self._contractive_regularizer_inputs,
        self._contractive_regularizer_tuple,
        self._contractive_collection_network_str)

    # Why pass the Cholesky factor when the covariance is already available:
    # the matrix is often ill conditioned, so after inverting the precision
    # matrix the covariance can become asymmetric through numerical error and
    # argument validation then fails. The Cholesky factor keeps it symmetric.
    output_distribution = tfd.MultivariateNormalTriL(
        loc=mean, scale_tril=chol, validate_args=True)

    # Reconstructions without sampling need some mean/median-like node.
    def reconstruction_node(self):
        return self.mean()

    output_distribution.reconstruction_node = types.MethodType(
        reconstruction_node, output_distribution)

    self.mean = mean
    return output_distribution
def _build(self, inputs):
    """Build the output multivariate normal for `inputs`.

    Creates the mean/covariance layers, registers the contractive
    regularizer, stores the mean on ``self.mean`` and returns a validated
    ``tfd.MultivariateNormalTriL`` that carries an extra
    ``reconstruction_node()`` method returning the distribution mean.
    """
    layers_out = self.create_mean_n_cov_layers(inputs)
    mean, covariance, _unused_scale, scale_tril = layers_out

    regularizer_args = (
        mean,
        covariance,
        self._contractive_regularizer_inputs,
        self._contractive_regularizer_tuple,
        self._contractive_collection_network_str,
    )
    self.set_contractive_regularizer(*regularizer_args)

    # A channel-flipped variant (MultivariateNormalTriLChannelFlipped) was
    # used here at some point; the plain TriL distribution is used instead.
    output_distribution = tfd.MultivariateNormalTriL(
        loc=mean, scale_tril=scale_tril, validate_args=True)

    # Gives callers a sampling-free reconstruction (the mean).
    def reconstruction_node(self):
        return self.mean()

    bound = types.MethodType(reconstruction_node, output_distribution)
    output_distribution.reconstruction_node = bound

    self.mean = mean
    return output_distribution
def generate_grm_data(n_sample, n_factor, n_item, nu, ld, rho, dtype=tf.float64):
    """Draw categorical responses from a simple-structure graded response model.

    Items are split evenly across factors (``n_item`` is rounded down to a
    multiple of ``n_factor``); each item loads on exactly one factor with
    loading ``ld`` and every item shares the same intercept vector ``nu``.

    Args:
        n_sample: Number of respondents to draw.
        n_factor: Number of latent factors.
        n_item: Requested number of items.
        nu: Sequence of intercepts; the number of categories is
            ``len(nu) + 1``.
        ld: Loading value shared by all non-zero loadings.
        rho: Common correlation between factors, or ``None`` for
            uncorrelated factors.
        dtype: TensorFlow dtype of all created tensors and of the samples.

    Returns:
        Categorical samples drawn from the probabilities produced by
        ``grm_irf``.
    """
    per_factor = n_item // n_factor
    # Equivalent to truncating only when n_item is not a multiple of n_factor.
    n_item = n_factor * per_factor
    n_category = len(nu) + 1

    # One copy of the intercept row per item.
    intercept = tf.tile(tf.constant([nu], dtype=dtype), multiples=[n_item, 1])

    pattern = np.zeros((n_item, n_factor))
    for factor in range(n_factor):
        pattern[factor * per_factor:(factor + 1) * per_factor, factor] = ld
    loading = tf.constant(pattern, dtype=dtype)

    if rho is not None:
        ones_col = tf.ones((n_factor, 1), dtype=dtype)
        cor = (rho * (ones_col @ tf.transpose(ones_col))
               + (1 - rho) * tf.eye(n_factor, dtype=dtype))
    else:
        cor = tf.eye(n_factor, dtype=dtype)

    factor_dist = tfd.MultivariateNormalTriL(
        loc=tf.zeros(n_factor, dtype=dtype),
        scale_tril=tf.linalg.cholesky(cor))
    eta = factor_dist.sample(n_sample)

    c, d = create_cd(n_category, dtype)
    probs = grm_irf(eta, intercept, loading, c, d)
    return tfd.Categorical(probs=probs, dtype=dtype).sample()
def fbar_prior(self, fbar, v, l2):
    """Sum the prior log-density of ``fbar[r, 0]`` over all replicates.

    Args:
        fbar: Indexable by ``[replicate, 0]``; each entry is scored under
            the prior.
        v: First argument forwarded to ``self.fbar_prior_params``.
        l2: Second argument forwarded to ``self.fbar_prior_params``.

    Returns:
        The accumulated log-probability (``0`` when there are no
        replicates).
    """
    m, K = self.fbar_prior_params(v, l2)
    # The prior does not depend on the replicate, so factorise K and build
    # the distribution once instead of once per loop iteration.
    prior = tfd.MultivariateNormalTriL(
        loc=m, scale_tril=tf.linalg.cholesky(K))
    prob = 0
    for r in range(self.num_replicates):
        prob += prior.log_prob(fbar[r, 0])
    return prob
def _init_distribution(conditions):
    """Build a ``tfd.MultivariateNormalTriL`` from a mean and covariance.

    Args:
        conditions: Mapping with the keys ``"loc"`` and
            ``"covariance_matrix"``.

    Returns:
        A multivariate normal whose ``scale_tril`` is the Cholesky factor of
        the given covariance matrix.

    Raises:
        ValueError: If TensorFlow reports an ``InvalidArgumentError`` while
            factorising the covariance matrix. The original error is kept as
            the exception's ``__cause__``.
    """
    loc, covariance_matrix = conditions["loc"], conditions["covariance_matrix"]
    try:
        chol_cov_matrix = tf.linalg.cholesky(covariance_matrix)
    except tf.errors.InvalidArgumentError as err:
        # Chain the cause so the underlying TensorFlow message is not lost.
        raise ValueError(
            "Cholesky decomposition failed! Check your `covariance_matrix`."
        ) from err
    return tfd.MultivariateNormalTriL(loc=loc, scale_tril=chol_cov_matrix)
def _build_cross_ent(self, weights, means, chol_covars, kernel_chol):
    """Construct the cross-entropy term.

    Args:
        weights: shape: (num_components)
        means: shape: (num_components, num_latents, num_inducing)
        chol_covars: shape: (num_components, num_latents, num_inducing[, num_inducing])
        kernel_chol: shape: (num_latents, num_inducing, num_inducing)

    Returns:
        Cross entropy as scalar
    """
    if not self.args['diag_post']:
        solved = util.cholesky_solve_br(kernel_chol, chol_covars)
        trace = tf.reduce_sum(
            input_tensor=util.mul_sum(solved, chol_covars), axis=-1)
    else:
        # TODO(karl): this is a bit inefficient since we're not making use of
        # the fact that chol_covars is diagonal. A solution most likely
        # involves a custom tf op.
        # shape of trace: (num_components, num_latents)
        dense_chol = tfl.diag(chol_covars)
        trace = tfl.trace(util.cholesky_solve_br(kernel_chol, dense_chol))

    # Log-density of the zero vector under N(means, kernel), per latent.
    prior = tfd.MultivariateNormalTriL(means, kernel_chol)
    log_density_at_zero = prior.log_prob([0.0])

    # Summed over the last axis so the result has the same shape as weights.
    per_component = tf.reduce_sum(
        input_tensor=log_density_at_zero - 0.5 * trace, axis=-1)

    # Weighted sum of the per-component values.
    return util.mul_sum(weights, per_component)
def get_pdf(param_vec, vehicle_type):
    """Build the mixture distribution described by `param_vec`.

    See https://ericmjl.github.io/blog/2019/5/29/reasoning-about-shapes-and-probability-distributions/
    for background on distribution shapes.

    Args:
        param_vec: Parameter vector, unpacked by ``slice_pvector``.
        vehicle_type: ``'other_vehicle'`` for a mixture of univariate
            normals, or ``'merge_vehicle'`` for a mixture of multivariate
            normals over (long, lat).

    Returns:
        A ``tfd.MixtureSameFamily`` distribution.

    Raises:
        ValueError: If `vehicle_type` is not one of the two supported values.
            (Previously this surfaced as an ``UnboundLocalError`` at the
            ``return`` statement.)
    """
    if vehicle_type == 'other_vehicle':
        # Unpack parameter vectors
        alpha, mus, sigmas = slice_pvector(param_vec, vehicle_type)
        return tfd.MixtureSameFamily(
            mixture_distribution=tfd.Categorical(probs=alpha),
            components_distribution=tfd.Normal(loc=mus, scale=sigmas))

    if vehicle_type == 'merge_vehicle':
        alphas, mus_long, sigmas_long, mus_lat, \
            sigmas_lat, rhos = slice_pvector(param_vec, vehicle_type)
        cov = get_CovMatrix(rhos, sigmas_long, sigmas_lat)
        # Pair the longitudinal and lateral means along a new last axis.
        mus = tf.stack([mus_long, mus_lat], axis=3, name='mus')
        return tfd.MixtureSameFamily(
            mixture_distribution=tfd.Categorical(probs=alphas),
            components_distribution=tfd.MultivariateNormalTriL(
                loc=mus,
                scale_tril=tf.linalg.cholesky(cov),
                name='MultivariateNormalTriL'))

    raise ValueError(
        "Unknown vehicle_type %r; expected 'other_vehicle' or "
        "'merge_vehicle'." % (vehicle_type,))
def get_prior_mu(beta0, m0, lambdas):
    """Return the normal prior over the means.

    The precision is ``lambdas * beta0``; it is inverted, symmetrised and
    Cholesky-factorised to parameterise a ``tfd.MultivariateNormalTriL``
    centred at ``m0``.
    """
    covariance = tf.linalg.inv(lambdas * beta0)
    # Numerical stability workaround: average with the transpose over the
    # last two axes so the matrix is symmetric before factorising.
    # The permutation assumes rank-4 input.
    swapped = tf.transpose(covariance, [0, 1, 3, 2])
    covariance = 0.5 * (covariance + swapped)
    return tfd.MultivariateNormalTriL(
        loc=m0, scale_tril=tf.linalg.cholesky(covariance))
def get_norm_log_probs(mus, lambdas, X):
    """Evaluate the log-density of every row of `X` under N(mus, lambdas^-1).

    Args:
        mus: Locations of the normal distribution(s).
        lambdas: Precision matrices; the transpose permutation used below
            assumes rank 4.
        X: Observations, iterated along their first axis.

    Returns:
        The per-observation log-probabilities stacked along axis 1.
    """
    covariance = tf.linalg.inv(lambdas)
    # Numerical stability workaround: symmetrise before the Cholesky.
    covariance = 0.5 * (covariance + tf.transpose(covariance, [0, 1, 3, 2]))
    normal = tfd.MultivariateNormalTriL(
        loc=mus, scale_tril=tf.linalg.cholesky(covariance))

    per_observation = []
    for n in range(X.shape[0]):
        per_observation.append(normal.log_prob(X[n]))
    return tf.stack(per_observation, 1)
def get_posterior_mu(beta, mu, lambdas):
    """Return the normal posterior over the means.

    The location is `mu` broadcast over the first axis of `lambdas`; the
    precision is `lambdas` scaled by `beta` along its second axis.
    """
    target_shape = lambdas.shape[0:1] + mu.shape
    locations = np.broadcast_to(mu, target_shape)

    # beta is indexed as [None, :, None, None], i.e. it scales the second
    # axis of the rank-4 precision tensor.
    scaled_precisions = lambdas * beta[None, :, None, None]
    covs = tf.linalg.inv(scaled_precisions)
    # The symmetrisation workaround is intentionally left disabled here:
    #! covs = 0.5*(covs + tf.transpose(covs, [0, 1, 3, 2])) # numerical stability workaround
    chol = tf.linalg.cholesky(covs)
    return tfd.MultivariateNormalTriL(loc=locations, scale_tril=chol)
def fbar_prior(self, fbar, param_0bar, param_1bar):
    """Sum the prior log-density of ``fbar[r, i]`` over replicates and TFs.

    Args:
        fbar: Indexable by ``[replicate, tf_index]``.
        param_0bar: First kernel parameter, forwarded to the selected kernel.
        param_1bar: Second kernel parameter, forwarded to the selected
            kernel.

    Returns:
        The accumulated log-probability (``0`` when there is nothing to
        sum).
    """
    m, K = self.kernel_selector()(param_0bar, param_1bar)
    # Small diagonal jitter keeps the Cholesky factorisation stable.
    jitter = tf.linalg.diag(1e-8 * tf.ones(self.N_p, dtype='float64'))
    # The prior for TF i is the same for every replicate, so factorise each
    # K[i] once rather than num_replicates times.
    priors = [
        tfd.MultivariateNormalTriL(
            loc=m, scale_tril=tf.linalg.cholesky(K[i] + jitter))
        for i in range(self.num_tfs)
    ]
    prob = 0
    for r in range(self.num_replicates):
        for i, prior in enumerate(priors):
            prob += prior.log_prob(fbar[r, i])
    return prob
def get_dist(self, timesteps, samples=1, batch_size=1, fixed=True):
    """Roll `self.cell` forward `timesteps` times and collect the result.

    On each step the previous (sample, state) pair is fed back into the cell
    (the cell's zero state is used for the first step). The cell returns a
    distribution for the step; it is sampled, and its ``loc`` and scale
    parameters are recorded. The recorded parameters are stacked along a new
    time axis (axis 2) to build one multivariate normal covering all steps.

    Based on
    https://github.com/tensorflow/probability/blob/698e0101aecf46c42858db7952ee3024e091c291/tensorflow_probability/examples/disentangled_vae.py#L887

    Args:
        timesteps: Number of times to sample from the cell.
        samples: Number of samples to draw from the latent distribution.
        batch_size: Number of sequences to sample.
        fixed: Whether to share the same random sample across all sequences
            in the batch. When true, sampling uses a batch of one and the
            result is broadcast to `batch_size`.

    Returns:
        ``(sample, dist)``: the stacked per-step samples, and a
        ``tfd.Independent`` (one reinterpreted batch dim) over the
        multivariate normal built from the stacked parameters.
    """
    offdiag = self.cell.offdiag
    sample_batch_size = 1 if fixed else batch_size
    sample, state = self.cell.zero_state([samples, sample_batch_size])

    # TODO: Check this for offdiag
    scale_key = "scale_tril" if offdiag else "scale_diag"

    step_locs, step_scales, step_samples = [], [], []
    for _ in range(timesteps):
        step_dist, state = self.cell(sample, state)
        sample = step_dist.sample()
        step_locs.append(step_dist.parameters["loc"])
        step_scales.append(step_dist.parameters[scale_key])
        step_samples.append(sample)

    sample = tf.stack(step_samples, axis=2)
    loc = tf.stack(step_locs, axis=2)
    scale = tf.stack(step_scales, axis=2)

    if fixed:
        # Tile along the batch axis by broadcasting against zeros.
        sample = sample + tf.zeros([batch_size, 1, 1])

    if offdiag:
        joint = tfd.MultivariateNormalTriL(loc=loc, scale_tril=scale)
    else:
        joint = tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale)
    return sample, tfd.Independent(joint, reinterpreted_batch_ndims=1)
def __call__(self):
    """Get the distribution object from the backend.

    Returns a torch ``MultivariateNormal`` when the backend is
    ``'pytorch'``; otherwise a TensorFlow Probability
    ``MultivariateNormalTriL`` built from the Cholesky factor of
    ``self['cov']``.
    """
    if get_backend() != 'pytorch':
        import tensorflow as tf
        from tensorflow_probability import distributions as tfd
        # TFP takes the lower-triangular factor rather than the covariance.
        scale_tril = tf.linalg.cholesky(self['cov'])
        return tfd.MultivariateNormalTriL(
            loc=self['loc'], scale_tril=scale_tril)

    import torch.distributions as tod
    return tod.multivariate_normal.MultivariateNormal(
        self['loc'], covariance_matrix=self['cov'])
def log_likelihood_tf(data):
    """Build a placeholder-based log-likelihood graph for `data`.

    Three float32 placeholders are created, shaped after `data`: the data
    itself, a mean vector of length ``data.shape[1]`` and a square Cholesky
    factor of that size.

    Returns:
        ``(log_prob, placeholders)`` where `log_prob` is the log-density of
        the data placeholder under the multivariate normal and
        `placeholders` maps ``'mean'``, ``'cov_chol'`` and ``'data'`` to the
        corresponding placeholders.
    """
    n_dims = data.shape[1]
    data_ph = tf.placeholder(dtype=tf.float32, shape=data.shape)
    mean_ph = tf.placeholder(dtype=tf.float32, shape=(n_dims, ))
    cov_chol_ph = tf.placeholder(dtype=tf.float32, shape=(n_dims, n_dims))

    normal = tpd.MultivariateNormalTriL(loc=mean_ph, scale_tril=cov_chol_ph)
    placeholders = {
        'mean': mean_ph,
        'cov_chol': cov_chol_ph,
        'data': data_ph,
    }
    return normal.log_prob(data_ph), placeholders
def normal_sampler_fn(seed):
    """Draw a sample from a zero-mean normal with the current kernel covariance.

    Reads the kernel parameters from the enclosing scope's `all_states`,
    evaluates the selected kernel on their logits and samples with `seed`.
    """
    p1, p2 = all_states[self.state_indices['kernel_params']]
    # Only the covariance from the kernel is used; the mean is replaced by
    # zeros below.
    _, K = self.kernel_selector()(logit(p1), logit(p2))
    zero_mean = tf.zeros(
        (self.num_replicates, self.num_tfs, self.N_p), dtype='float64')
    # NOTE(review): the covariance is repeated a hard-coded 3 times along a
    # new leading axis; presumably this matches the replicate count, verify.
    K = tf.stack([K] * 3, axis=0)
    jitter = tf.linalg.diag(1e-8 * tf.ones(self.N_p, dtype='float64'))
    proposal = tfd.MultivariateNormalTriL(
        loc=zero_mean, scale_tril=tf.linalg.cholesky(K + jitter))
    z = proposal.sample(seed=seed)
    # tf.print(z)
    return z
def qx(self):
    """Return the (lazily built, cached) distribution over X.

    A rank-1 ``Xchol`` is treated as a diagonal scale; otherwise it is used
    as a lower-triangular scale. ``X`` is flattened unless ``Xchol`` has
    rank 3.
    """
    if self._qx is not None:
        return self._qx

    chol_rank = self.Xchol.shape.ndims
    if chol_rank == 1:
        self._qx = tfd.MultivariateNormalDiag(
            loc=tf.reshape(self.X, [-1]), scale_diag=self.Xchol)
    else:
        if chol_rank == 3:
            loc = self.X
        else:
            loc = tf.reshape(self.X, [-1])
        self._qx = tfd.MultivariateNormalTriL(loc=loc, scale_tril=self.Xchol)
    return self._qx
def loss(y_est, y):
    """Return the mean negative log-likelihood of `y` given network output `y_est`.

    The first two columns of `y_est` are used as the mean of a multivariate
    normal; the remaining columns are packed into its lower-triangular scale
    with ``tfp.math.fill_triangular``.

    Args:
        y_est: Network output for a batch of observations.
        y: Expert actions for the same batch; cast to float32.

    Returns:
        Scalar loss value.
    """
    targets = tf.cast(y, dtype=tf.float32)
    ######### Your code starts here #########
    # Negative log-likelihood between y_est and y, using
    # tensorflow_probability.distributions (imported as tfd).
    mean = y_est[:, :2]
    scale_tril = tfp.math.fill_triangular(y_est[:, 2:])
    mvn = tfd.MultivariateNormalTriL(loc=mean, scale_tril=scale_tril)
    log_likelihood = mvn.log_prob(targets)
    return -tf.math.reduce_mean(log_likelihood)
def qx(self):
    """Return the (lazily built, cached) list of per-sequence distributions.

    For each of the ``self.n_seq`` sequences, a rank-1 ``Xchol[s]`` is
    treated as a diagonal scale; otherwise it is used as a lower-triangular
    scale. ``X[s]`` is flattened unless ``Xchol[s]`` has rank 3.
    """
    if self._qx is not None:
        return self._qx

    self._qx = []
    for s in range(self.n_seq):
        if self.Xchol[s].shape.ndims == 1:
            seq_dist = tfd.MultivariateNormalDiag(
                loc=tf.reshape(self.X[s], [-1]), scale_diag=self.Xchol[s])
        else:
            if self.Xchol[s].shape.ndims == 3:
                loc = self.X[s]
            else:
                loc = tf.reshape(self.X[s], [-1])
            seq_dist = tfd.MultivariateNormalTriL(
                loc=loc, scale_tril=self.Xchol[s])
        self._qx.append(seq_dist)
    return self._qx
def density(self, xi, t=0):
    """Return the product-of-Gaussians density over forces for state `xi`.

    `xi` is split column-wise at ``self._u_dim`` into a position part and a
    velocity part. Each expert policy is evaluated in its own transformed
    space, mapped back through the adjoint Jacobian, and the resulting
    Gaussians are multiplied (precisions add, precision-weighted means add).

    Args:
        xi: State, positions in the first ``self._u_dim`` columns and
            velocities in the rest.
        t: Time argument forwarded to every policy.

    Returns:
        A ``MultivariateNormalTriL`` with the combined mean and the Cholesky
        factor of the combined covariance.
    """
    pos, vel = xi[:, :self._u_dim], xi[:, self._u_dim:]
    experts = range(self.n_experts)

    task_pos = [f(pos) for f in self.fs]  # transform state
    jacobians = [j(pos) for j in self.js]  # get jacobians
    # velocities in transformed space
    task_vel = [jac.matvec(vel) for jac in jacobians]

    # "forces" in transformed space from the different policies
    policy_out = [self.pis[i](task_pos[i], task_vel[i], t) for i in experts]
    # separate locs and covs
    task_locs = [out[0] for out in policy_out]
    task_covs = [out[1] for out in policy_out]

    # "forces" in original space
    forces = [
        jacobians[i].matvec(task_locs[i], adjoint=True) for i in experts
    ]
    # covariances of the "forces" in original space
    force_covs = [
        matquad(jacobians[i], task_covs[i], adjoint=True) for i in experts
    ]
    # precisions with regularization
    force_precs = [
        tf.linalg.inv(cov + self._reg**2 * tf.eye(self.experts_size[i]))
        for i, cov in enumerate(force_covs)
    ]

    # compute product of Gaussian policies
    total_prec = tf.reduce_sum(force_precs, axis=0)
    covs = tf.linalg.inv(total_prec)
    weighted = [
        tf.linalg.LinearOperatorFullMatrix(force_precs[i]).matvec(forces[i])
        for i in experts
    ]
    locs = tf.linalg.LinearOperatorFullMatrix(covs).matvec(
        tf.reduce_sum(weighted, axis=0))
    return ds.MultivariateNormalTriL(locs, tf.linalg.cholesky(covs))
def __call__(self, x):
    """Encode `x` into a multivariate normal over time for each latent dim.

    The network output is split into a mean part and a part that
    parameterises a banded (main diagonal plus first superdiagonal) upper
    triangular precision factor. That factor is inverted with a triangular
    solve and transposed to obtain the lower-triangular scale.

    Returns:
        A ``tfd.MultivariateNormalTriL``.
    """
    mapped = self.net(x)
    # Static shapes are required: they drive the numpy index construction.
    batch_size = mapped.shape.as_list()[0]
    time_length = mapped.shape.as_list()[1]
    # Obtain mean and precision matrix components
    num_dim = len(mapped.shape.as_list())
    # Permutation that swaps the last two axes.
    perm = list(range(num_dim - 2)) + [num_dim - 1, num_dim - 2]
    mapped_transposed = tf.transpose(mapped, perm=perm)
    mapped_mean = mapped_transposed[:, :self.z_size]
    mapped_covar = mapped_transposed[:, self.z_size:]
    # tf.nn.sigmoid provides more stable performance on Physionet dataset
    if self.data_type == 'physionet':
        mapped_covar = tf.nn.sigmoid(mapped_covar)
    else:
        mapped_covar = tf.nn.softplus(mapped_covar)
    mapped_reshaped = tf.reshape(mapped_covar, [batch_size, self.z_size, 2*time_length])
    dense_shape = [batch_size, self.z_size, time_length, time_length]
    # Indices of the non-zero entries, one column per axis of dense_shape.
    # For every (batch, latent) pair there are 2*time_length-1 entries:
    # the diagonal (i, i) followed by the first superdiagonal (i, i+1).
    idxs_1 = np.repeat(np.arange(batch_size), self.z_size*(2*time_length-1))
    idxs_2 = np.tile(np.repeat(np.arange(self.z_size), (2*time_length-1)), batch_size)
    idxs_3 = np.tile(np.concatenate([np.arange(time_length), np.arange(time_length-1)]), batch_size*self.z_size)
    idxs_4 = np.tile(np.concatenate([np.arange(time_length), np.arange(1,time_length)]), batch_size*self.z_size)
    idxs_all = np.stack([idxs_1, idxs_2, idxs_3, idxs_4], axis=1)
    # ~10x times faster on CPU then on GPU
    with tf.device('/cpu:0'):
        # Obtain covariance matrix from precision one
        # The last of the 2*time_length values is dropped so that the count
        # matches the 2*time_length-1 indices built above.
        mapped_values = tf.reshape(mapped_reshaped[:, :, :-1], [-1])
        prec_sparse = tf.sparse.SparseTensor(indices=idxs_all, values=mapped_values, dense_shape=dense_shape)
        prec_sparse = tf.sparse.reorder(prec_sparse)
        # Adding to a dense zero tensor densifies the sparse tensor.
        prec_tril = tf.sparse_add(tf.zeros(prec_sparse.dense_shape, dtype=tf.float32), prec_sparse)
        eye = tf.eye(num_rows=prec_tril.shape.as_list()[-1], batch_shape=prec_tril.shape.as_list()[:-2])
        prec_tril = prec_tril + eye
        # Solving against the identity inverts the upper-triangular factor.
        cov_tril = tf.linalg.triangular_solve(matrix=prec_tril, rhs=eye, lower=False)
        # Replace any non-finite entries of the inverse with zeros.
        cov_tril = tf.where(tf.math.is_finite(cov_tril), cov_tril, tf.zeros_like(cov_tril))
    num_dim = len(cov_tril.shape)
    perm = list(range(num_dim - 2)) + [num_dim - 1, num_dim - 2]
    # Transpose the upper-triangular inverse into lower-triangular form.
    cov_tril_lower = tf.transpose(cov_tril, perm=perm)
    z_dist = tfd.MultivariateNormalTriL(loc=mapped_mean, scale_tril=cov_tril_lower)
    return z_dist
def get_dist(self, timesteps, samples=1, batch_size=1):
    """Repeat the stored loc and scale over time and wrap them in a distribution.

    Every timestep shares the same loc and scale, but sampling still yields
    different values per timestep.

    Args:
        timesteps: Number of times the stored parameters are tiled along a
            new leading axis.
        samples: First entry of the sample shape drawn from the result.
        batch_size: Second entry of the sample shape drawn from the result.

    Returns:
        ``(sample, dist)``: a draw of shape ``[samples, batch_size, ...]``
        from `dist`, and `dist` itself, a ``tfd.Independent`` (one
        reinterpreted batch dim) over a diagonal or lower-triangular
        multivariate normal depending on ``self._offdiag``.
    """
    loc = tf.tile(tf.expand_dims(self._loc, 0), [timesteps, 1])
    scale = tf.expand_dims(self._scale, 0)

    if not self._offdiag:
        scale = tf.tile(scale, [timesteps, 1])
        base = tfd.MultivariateNormalDiag(loc=loc, scale_diag=scale)
    else:
        scale = tf.tile(scale, [timesteps, 1, 1])
        base = tfd.MultivariateNormalTriL(loc=loc, scale_tril=scale)

    dist = tfd.Independent(base, reinterpreted_batch_ndims=1)
    draw = dist.sample([samples, batch_size])
    return draw, dist
def pdf_2D(z, density_name=''):
    """Evaluate one of the named test densities at the points `z`.

    Args:
        z: Points to evaluate, one per row; columns are the coordinates.
        density_name: Must be a member of ``AVAILABLE_2D_DISTRIBUTIONS``.
            The empty name yields the constant ``1``.

    Returns:
        The density values for the handled names (``''``, ``'banana'``,
        ``'circle'``, ``'eight_schools'``, ``'figure_eight'``); ``None`` for
        an allowed name that has no branch here.
    """
    assert density_name in AVAILABLE_2D_DISTRIBUTIONS, "Incorrect density name."

    if density_name == '':
        return 1

    if density_name == 'banana':
        z1, z2 = z[:, 0], z[:, 1]
        mu = np.array([0.5, 0.5], dtype='float32')
        cov = np.array([[0.06, 0.055], [0.055, 0.06]], dtype='float32')
        base = tfd.MultivariateNormalTriL(
            loc=mu, scale_tril=tf.linalg.cholesky(cov))
        # Bend the second coordinate, then score under the Gaussian.
        warped = tf.stack([z1, z1**2 + z2], axis=1)
        return base.prob(warped)

    if density_name == 'circle':
        z1, z2 = z[:, 0], z[:, 1]
        radius = (z1**2 + z2**2)**0.5
        left = math.exp(-0.2 * ((z1 - 2) / 0.8)**2)
        right = math.exp(-0.2 * ((z1 + 2) / 0.8)**2)
        energy = 0.5 * ((radius - 4) / 0.4)**2 - math.log(left + right)
        return math.exp(-energy)

    if density_name == 'eight_schools':
        y_i = 0
        sigma_i = 10
        thetas, mu, log_tau = z[:, 0], z[:, 1], z[:, 2]
        tau = math.exp(log_tau)
        likelihood = tfd.Normal(loc=thetas, scale=sigma_i)
        prior_theta = tfd.Normal(loc=mu, scale=tau)
        prior_mu = tfd.Normal(loc=0, scale=5)
        prior_tau = tfd.HalfCauchy(loc=0, scale=5)
        # The trailing exp(log_tau) factor multiplies the prior on tau.
        return likelihood.prob(y_i) * prior_theta.prob(thetas) * prior_mu.prob(
            mu) * prior_tau.prob(math.exp(log_tau)) * math.exp(log_tau)

    if density_name == 'figure_eight':
        mu1 = 1 * np.array([-1, -1], dtype='float32')
        mu2 = 1 * np.array([1, 1], dtype='float32')
        scale = 0.45 * np.array([1, 1], dtype='float32')
        pi = 0.5
        first = tfd.MultivariateNormalDiag(loc=mu1, scale_diag=scale)
        second = tfd.MultivariateNormalDiag(loc=mu2, scale_diag=scale)
        return (1 - pi) * first.prob(z) + pi * second.prob(z)
def _build(self, inputs):
    """Map `inputs` to a multivariate normal with a learned lower-triangular scale.

    Three linear layers produce the mean, the diagonal parameters and the
    strictly-lower-triangular parameters of the scale. The diagonal is made
    positive with a softplus plus ``self._minimal_covariance``. The scale is
    multiplied element-wise by a non-trainable calibration matrix that is
    initialised to ones.

    Returns:
        A ``tfd.MultivariateNormalTriL``.
    """
    inputs = tf.layers.flatten(inputs)
    size = self._output_size

    # Layers are created in this order: loc, diagonal, off-diagonal.
    self.dense_loc = snt.Linear(size, **self._extra_kwargs)
    self.dense_diag_params = snt.Linear(size, **self._extra_kwargs)
    n_off_diag = int(self._output_size * (self._output_size - 1) / 2)
    self.dense_out_of_diag_params = snt.Linear(n_off_diag, **self._extra_kwargs)

    loc = self.dense_loc(inputs)
    diag_raw = self.dense_diag_params(inputs)
    off_diag_raw = self.dense_out_of_diag_params(inputs)

    # Fill a smaller triangular matrix, then pad one row on top and one
    # column on the right so its entries sit below the main diagonal.
    below_diag = tf.contrib.distributions.fill_triangular(off_diag_raw)
    below_diag = tf.pad(below_diag, [[0, 0], [1, 0], [0, 1]])
    positive_diag = self._minimal_covariance + tf.nn.softplus(diag_raw)
    scale_tril = tf.linalg.set_diag(below_diag, positive_diag)

    self._calibration_tril_params = tf.get_variable(
        "calibration_tril_params",
        shape=(n_off_diag + self._output_size,),
        dtype=inputs.dtype,
        trainable=False,
        initializer=tf.initializers.constant(value=1.))
    self.calibration_tril = tf.contrib.distributions.fill_triangular(
        self._calibration_tril_params, name="calibration_tril")

    calibrated_scale = tf.multiply(self.calibration_tril, scale_tril)
    return tfd.MultivariateNormalTriL(loc=loc, scale_tril=calibrated_scale)
def density(self, xi, t=0):
    """Return the product-of-Gaussians density over velocities for state `xi`.

    Each expert policy is evaluated in its own transformed space; its
    regularised precision and precision-weighted mean are pulled back
    through the Jacobian and summed over experts.

    Args:
        xi: State passed to every transform and Jacobian function.
        t: Time argument forwarded to every policy.

    Returns:
        A ``MultivariateNormalTriL`` with the combined mean and the Cholesky
        factor of the combined covariance.
    """
    experts = range(self.n_experts)

    task_states = [f(xi) for f in self.fs]  # transform state
    jacobians = [j(xi) for j in self.js]  # get jacobians

    # "velocities" in transformed space from the different policies
    policy_out = [self.pis[i](task_states[i], t) for i in experts]
    # separate locs and covs
    task_locs = [out[0] for out in policy_out]
    task_covs = [out[1] for out in policy_out]

    # precisions with regularization
    task_precs = [
        tf.linalg.inv(
            task_covs[i] + self._reg**2 * tf.eye(self.experts_size[i]))
        for i in experts
    ]
    # J^T Lambda applied to each policy mean
    pulled_etas = [
        tf.linalg.LinearOperatorFullMatrix(
            jacobians[i].matmul(task_precs[i], adjoint=True)
        ).matvec(task_locs[i])
        for i in experts
    ]
    pulled_precs = [matquad(jacobians[i], task_precs[i]) for i in experts]

    # compute product of Gaussian policies
    total_prec = tf.reduce_sum(pulled_precs, axis=0)
    covs = tf.linalg.inv(total_prec)
    total_eta = tf.reduce_sum(pulled_etas, axis=0)
    locs = tf.linalg.LinearOperatorFullMatrix(covs).matvec(total_eta)
    return ds.MultivariateNormalTriL(locs, tf.linalg.cholesky(covs))
def _init_distribution(conditions, **kwargs):
    """Build a ``tfd.MultivariateNormalTriL`` from `conditions`.

    Args:
        conditions: Mapping with the keys ``"loc"`` and ``"scale_tril"``.
        **kwargs: Extra keyword arguments forwarded to the distribution
            constructor.

    Returns:
        The constructed distribution.
    """
    return tfd.MultivariateNormalTriL(
        loc=conditions["loc"],
        scale_tril=conditions["scale_tril"],
        **kwargs)
def _build_entropy(self, weights, means, chol_covars):
    """Construct the entropy term.

    Args:
        weights: shape: (num_components)
        means: shape: (num_components, num_latents, num_inducing)
        chol_covars: shape: (num_components, num_latents, num_inducing[, num_inducing])

    Returns:
        Entropy (scalar)
    """
    # NOTE: an earlier draft combined the latent Gaussians of each component
    # into one product Gaussian,
    #   Sigma_new = (sum_i Sigma_i^-1)^-1
    #   Mu_new    = Sigma_new * (sum_i Sigma_i^-1 * mu_i)
    # before this step. It was disabled and lived here as a bare string
    # literal, which has been removed.

    # First build a square matrix of normals.
    if self.args['diag_post']:
        # construct normal distributions for all combinations of components
        variational_dist = tfd.MultivariateNormalDiag(
            means,
            tf.sqrt(chol_covars[tf.newaxis, ...]
                    + chol_covars[:, tf.newaxis, ...]))
    else:
        if self.args['num_components'] == 1:
            # Use the fact that chol(S + S) = sqrt(2) * chol(S)
            chol_covars_sum = tf.sqrt(2.) * chol_covars[tf.newaxis, ...]
        else:
            # Here we use the original component_covar directly
            # TODO: Can we just stay in cholesky space somehow?
            component_covar = util.mat_square(chol_covars)
            chol_covars_sum = tfl.cholesky(
                component_covar[tf.newaxis, ...]
                + component_covar[:, tf.newaxis, ...])
        # The class MultivariateNormalTriL only accepts cholesky
        # decompositions of covariances
        variational_dist = tfd.MultivariateNormalTriL(
            means[tf.newaxis, ...], chol_covars_sum)

    # compute log probability of all means in all normal distributions
    # then sum over all latent functions
    # shape of log_normal_probs: (num_components, num_components)
    log_normal_probs = tf.reduce_sum(
        input_tensor=variational_dist.log_prob(means[:, tf.newaxis, ...]),
        axis=-1)

    # Now compute the entropy.
    # broadcast `weights` into dimension 1, then do `logsumexp` in that
    # dimension
    weighted_logsumexp_probs = tf.reduce_logsumexp(
        input_tensor=tfm.log(weights) + log_normal_probs, axis=1)

    # multiply with weights again and then sum over it all
    return -util.mul_sum(weights, weighted_logsumexp_probs)
def run(self, its=None, samples=100, threshold=0.001):
    """
    Run the VI optimisation.

    Each iteration computes the ELBO loss under a gradient tape and applies
    one optimiser step. When a likelihood model is present (`self.mulike` is
    not None) the loop alternates in blocks of 25 iterations between
    updating the latent-function parameters and the likelihood parameters.
    A KeyboardInterrupt stops the loop early and still returns the record.

    its: Number of iterations. Set its to None to automatically stop when
         the relative change between the median ELBO loss of the last 50
         iterations and the median of the 50 before that is below
         `threshold`, and the standard deviation of the last 50 is smaller
         than that of the 50 before. This check only starts after 100
         iterations.
    samples: Number of samples for the stochastic sampling of the gradient
    threshold: if its is None, this is the relative change between the two
         50-iteration medians below which the run may stop.
         Default: 0.001 (0.1%).

    Returns: numpy array of the recorded ELBO loss, one entry per iteration.
    """
    elbo_record = []
    it = 0
    print("Starting Run")
    try:
        while (its is None) or (it < its):
            it += 1
            with tf.GradientTape() as tape:
                qu = tfd.MultivariateNormalTriL(self.mu[:, 0], self.scale)
                samps = self.sm.get_samples(self.mu, self.scale, samples)
                # Even and odd slices of the last axis are transformed
                # separately, each against one column of Y.
                scaled = tf.concat([
                    self.transform_fn(samps[:, :, ::2], self.Y[:, 0:1], self.sideY),
                    self.transform_fn(samps[:, :, 1::2], self.Y[:, 1:2], self.sideY)
                ], 2)
                # Where ref is 1 the observed Y is used unchanged.
                scaled = (scaled * (1 - self.ref)) + (self.Y * self.ref)
                if self.mulike is not None:  #if we have non-stationary likelihood variance...
                    qulike = tfd.MultivariateNormalTriL(
                        self.mulike[:, 0], self.scalelike)
                    like = self.smlike.get_samples(self.mulike, self.scalelike, samples)
                    # Where ref is 1 the likelihood sample is replaced by -1000.
                    ell = tf.reduce_mean(
                        tf.reduce_sum(
                            self.likelihoodfn_nonstationary(
                                scaled[:, :, 0], scaled[:, :, 1],
                                like[:, :, 0] * (1 - self.ref[:, 0]) - 1000 * self.ref[:, 0],
                                like[:, :, 1] * (1 - self.ref[:, 1]) - 1000 * self.ref[:, 1]), 1))
                else:  #stationary likelihood variance
                    ell = tf.reduce_mean(
                        tf.reduce_sum(
                            self.likelihoodfn(scaled[:, :, 0], scaled[:, :, 1]), 1))
                # Negative expected log-likelihood plus KL to the prior.
                elbo_loss = -ell + tfd.kl_divergence(qu, self.pu)
                if self.likemodel == 'process':
                    assert self.mulike is not None
                    assert self.scalelike is not None
                    elbo_loss += tfd.kl_divergence(qulike, self.pulike)
                if self.likemodel == 'distribution':
                    assert self.mulike is not None
                    elbo_loss -= self.pulike.log_prob(self.mulike[:, 0])
            if it % 20 == 0:
                print("%d (ELBO=%0.4f)" % (it, elbo_loss))
            if (self.mulike is None) or (it % 50 < 25):  #optimise latent fns
                gradients = tape.gradient(elbo_loss, [self.mu, self.scale])
                self.optimizer.apply_gradients(
                    zip(gradients, [self.mu, self.scale]))
            else:  #this optimises the likelihood...
                if self.likemodel == 'distribution':
                    gradients = tape.gradient(elbo_loss, [self.mulike])
                    self.likeoptimizer.apply_gradients(
                        zip(gradients, [self.mulike]))
                if self.likemodel == 'process':
                    gradients = tape.gradient(
                        elbo_loss, [self.mulike, self.scalelike])
                    self.likeoptimizer.apply_gradients(
                        zip(gradients, [self.mulike, self.scalelike]))
            elbo_record.append(elbo_loss)
            if its is None:
                if it > 100:
                    oldm = np.median(elbo_record[-100:-50])
                    m = np.median(elbo_record[-50:])
                    if np.abs((oldm - m) / ((oldm + m) / 2)) < threshold:
                        #check that nothing weird's happened!
                        if np.std(elbo_record[-50:]) < np.std(
                                elbo_record[-100:-50]):
                            break
    except KeyboardInterrupt:
        pass
    return np.array(elbo_record)