def create_prior(K, a_p=1, b_p=1, a_gamma=1, b_gamma=1, m_loc=0, g_loc=0.1,
                 m_sigma=3, s_sigma=2, m_nu=0, s_nu=1, m_skew=0, g_skew=0.1,
                 dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        p=tfd.Beta(dtype(a_p), dtype(b_p)),
        gamma_C=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
        gamma_T=tfd.Gamma(dtype(a_gamma), dtype(b_gamma)),
        eta_C=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        eta_T=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        nu=tfd.Sample(tfd.LogNormal(dtype(m_nu), s_nu), sample_shape=K),
        sigma_sq=tfd.Sample(tfd.InverseGamma(dtype(m_sigma), dtype(s_sigma)),
                            sample_shape=K),
        loc=lambda sigma_sq: tfd.Independent(
            tfd.Normal(dtype(m_loc), g_loc * tf.sqrt(sigma_sq)),
            reinterpreted_batch_ndims=1),
        skew=lambda sigma_sq: tfd.Independent(
            tfd.Normal(dtype(m_skew), g_skew * tf.sqrt(sigma_sq)),
            reinterpreted_batch_ndims=1),
    ))
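A minimal usage sketch, assuming np, tf and tfd are the usual NumPy / TensorFlow / TensorFlow Probability aliases used throughout these snippets: the returned JointDistributionNamed samples a dict of parameters, with loc and skew drawn conditionally on sigma_sq.

prior = create_prior(K=5)        # K=5 is an arbitrary illustration
draw = prior.sample()            # dict with keys p, gamma_C, gamma_T, eta_C, eta_T, nu, sigma_sq, loc, skew
joint_lp = prior.log_prob(draw)  # scalar joint log-density of the sampled dict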
def create_models(self, data):
    self.models = []
    for i in range(self.num_outputs):
        kern = gpflow.kernels.SquaredExponential(
            lengthscales=tf.ones([data[0].shape[1]], dtype=gpflow.config.default_float()))
        # Priors have to be attached before the model gets compiled.
        kern.lengthscales.prior = tfd.Gamma(to_default_float(1.1), to_default_float(1 / 10.0))
        kern.variance.prior = tfd.Gamma(to_default_float(1.5), to_default_float(1 / 2.0))
        self.models.append(gpflow.models.GPR((data[0], data[1][:, i:i + 1]), kernel=kern))
        self.models[-1].likelihood.variance.prior = tfd.Gamma(to_default_float(1.2), to_default_float(1 / 0.05))
def logp(par):
    p = param
    p['beta1'] = par[0]
    p['gamma'] = par[1]
    beta_logp = tfd.Gamma(
        concentration=tf.constant(1., tf.float64),
        rate=tf.constant(1., tf.float64)).log_prob(p['beta1'])
    gamma_logp = tfd.Gamma(
        concentration=tf.constant(100., tf.float64),
        rate=tf.constant(400., tf.float64)).log_prob(p['gamma'])
    t, sim, solve = simulator.simulate(p, state_init)
    y_logp = covid19uk_logp(y_incr, sim, 0.1)
    logp = beta_logp + gamma_logp + tf.reduce_sum(y_logp)
    return logp
def empirical_Ey_and_Ey2_tf(a=3, ap=3, bp=1.0, c=3, cp=3, dp=1.0,
                            nsamples_latent=100, nsamples_latent1=1, nsamples_output=10,
                            K=25, N=1, M=1):
    """Returns E_prior[Y] and E_prior[Y^2] for a given set of hyperparameters.

    Parametrization as in: http://jakehofman.com/inprint/poisson_recs.pdf
    """
    if N != 1:
        warnings.warn("N!=1 will be ignored!")
    if M != 1:
        warnings.warn("M!=1 will be ignored!")

    # a, ap, bp, c, cp, dp = _ttf(a), _ttf(ap), _ttf(bp), _ttf(c), _ttf(cp), _ttf(dp)  # cast to tf
    ksi = tfd.Gamma(ap, ap / bp).sample(nsamples_latent)      # NL0
    theta = tfd.Gamma(a, ksi).sample((K, nsamples_latent1))   # K x NL1 x NL0
    eta = tfd.Gamma(cp, cp / dp).sample(nsamples_latent)
    beta = tfd.Gamma(c, eta).sample((K, nsamples_latent1))

    latent = tf.reduce_sum(theta * beta, 0)  # NL1 x NL0
    latent = tf.reshape(latent, [-1])        # NL1*NL0
    poisson = tfd.Poisson(rate=latent)

    # y_samples = np.random.poisson(latent, size=[nsamples_output, nsamples_latent*nsamples_latent1])  # NO x NL1*NL0
    y_samples = tf.stop_gradient(poisson.sample([nsamples_output]))
    y_probs = tf.exp(poisson.log_prob(y_samples))
    # y_probs1 = np.array([[tf.exp(tfd.Poisson(rate=latent[i]).log_prob(y_samples[j, i])).numpy()
    #                       for j in range(nsamples_output)]
    #                      for i in range(nsamples_latent * nsamples_latent1)]).T
    # assert (y_probs - y_probs1).numpy().max() < 1e-12

    total_prob = tf.reduce_sum(y_probs, 0)
    conditional_expectation = tf.reduce_sum(y_probs * y_samples, 0) / total_prob
    conditional_expectation_squared = tf.reduce_sum(y_probs * (y_samples**2), 0) / total_prob

    expectation = tf.reduce_mean(conditional_expectation)
    expectation_squared = tf.reduce_mean(conditional_expectation_squared)
    return expectation, expectation_squared
def create_model(n_C, n_T, K, neg_inf=-10, dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        p=tfd.Beta(dtype(1), dtype(1)),
        gamma_C=tfd.Gamma(dtype(3), dtype(3)),
        gamma_T=tfd.Gamma(dtype(3), dtype(3)),
        eta_C=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        eta_T=tfd.Dirichlet(tf.ones(K, dtype=dtype) / K),
        loc=tfd.Sample(tfd.Normal(dtype(0), dtype(1)), sample_shape=K),
        sigma_sq=tfd.Sample(tfd.InverseGamma(dtype(3), dtype(2)), sample_shape=K),
        y_C=lambda gamma_C, eta_C, loc, sigma_sq: mix(
            gamma_C, eta_C, loc, tf.sqrt(sigma_sq), dtype(neg_inf), n_C),
        y_T=lambda gamma_C, gamma_T, eta_C, eta_T, p, loc, sigma_sq: mix_T(
            gamma_C, gamma_T, eta_C, eta_T, p, loc, tf.sqrt(sigma_sq), dtype(neg_inf), n_T)))
def empirical_Ey_and_Ey2_tf_logscore(a=3, ap=3, bp=1.0, c=3, cp=3, dp=1.0,
                                     nsamples_latent=100, nsamples_latent1=1, nsamples_output=10,
                                     K=25, N=1, M=1):
    """Returns E_prior[Y] and E_prior[Y^2] for a given set of hyperparameters.

    Parametrization as in: http://jakehofman.com/inprint/poisson_recs.pdf
    Gradients are obtained with the log-score (score-function) derivative trick.
    """
    if N != 1:
        warnings.warn("N!=1 will be ignored!")
    if M != 1:
        warnings.warn("M!=1 will be ignored!")

    ksi = tfd.Gamma(ap, ap / bp).sample(nsamples_latent)      # NL0
    theta = tfd.Gamma(a, ksi).sample((K, nsamples_latent1))   # K x NL1 x NL0
    eta = tfd.Gamma(cp, cp / dp).sample(nsamples_latent)
    beta = tfd.Gamma(c, eta).sample((K, nsamples_latent1))

    latent = tf.reduce_sum(theta * beta, 0)  # NL1 x NL0
    latent = tf.reshape(latent, [-1])        # NL1*NL0
    poisson = tfd.Poisson(rate=latent)
    y_samples = poisson.sample([nsamples_output])

    conditional_expectation = tfp.monte_carlo.expectation(
        f=lambda x: x, samples=y_samples,
        log_prob=poisson.log_prob, use_reparameterization=False)
    conditional_expectation_squared = tfp.monte_carlo.expectation(
        f=lambda x: x * x, samples=y_samples,
        log_prob=poisson.log_prob, use_reparameterization=False)

    expectation = tf.reduce_mean(conditional_expectation)
    expectation_squared = tf.reduce_mean(conditional_expectation_squared)
    return expectation, expectation_squared
def __init__(self, a, theta, alpha, beta,
             validate_args=False, allow_nan_stats=True, name='Amoroso'):
    parameters = dict(locals())
    with tf.name_scope(name) as name:
        self._a = tensor_util.convert_nonref_to_tensor(a)
        self._theta = tensor_util.convert_nonref_to_tensor(theta)
        self._alpha = tensor_util.convert_nonref_to_tensor(alpha)
        self._beta = tensor_util.convert_nonref_to_tensor(beta)

        gamma = tfd.Gamma(alpha, 1.)
        chain = tfb.Invert(
            tfb.Chain([
                tfb.Exp(),
                tfb.Scale(beta),
                tfb.Shift(-tf.math.log(theta)),
                tfb.Log(),
                tfb.Shift(-a),
            ]))
        super().__init__(
            distribution=gamma,
            bijector=chain,
            validate_args=validate_args,
            parameters=parameters,
            name=name)
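A minimal usage sketch for the Amoroso class this constructor defines (parameter values are arbitrary; with a=0, theta=1 and beta=1 the bijector chain is the identity, so the distribution reduces to a plain Gamma(alpha, 1)).

dist = Amoroso(a=0., theta=1., alpha=2., beta=1.)
samples = dist.sample(1000)         # drawn by pushing Gamma samples through the inverted chain
log_probs = dist.log_prob(samples)  # densities include the bijector's Jacobian correction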
def create_dp_sb_gmm(nobs, K, dtype=np.float64):
    return tfd.JointDistributionNamed(dict(
        # Mixture means
        mu=tfd.Independent(tfd.Normal(np.zeros(K, dtype), 3),
                           reinterpreted_batch_ndims=1),
        # Mixture scales
        sigma=tfd.Independent(tfd.LogNormal(loc=np.full(K, -2, dtype), scale=0.5),
                              reinterpreted_batch_ndims=1),
        # Mixture weights (stick-breaking construction)
        alpha=tfd.Gamma(concentration=np.float64(1.0), rate=10.0),
        v=lambda alpha: tfd.Independent(
            # NOTE: Dave Moore suggests doing this instead, to ensure that a
            # batch dimension in alpha doesn't conflict with the other parameters.
            tfd.Beta(np.ones(K - 1, dtype), alpha[..., tf.newaxis]),
            reinterpreted_batch_ndims=1),
        # Observations (likelihood)
        obs=lambda mu, sigma, v: tfd.Sample(
            tfd.MixtureSameFamily(
                # The component assignment is marginalized over.
                mixture_distribution=tfd.Categorical(probs=stickbreak(v)),
                components_distribution=tfd.Normal(mu, sigma)),
            sample_shape=nobs)))
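The snippet relies on a stickbreak helper that is not shown here; a plausible implementation (an assumption, not taken from the original source) maps the K-1 Beta draws v to K mixture weights that sum to one.

def stickbreak(v):
    # v: (..., K-1) stick-breaking fractions; returns (..., K) mixture weights.
    one = tf.ones_like(v[..., :1])
    remaining = tf.concat([one, tf.math.cumprod(1 - v, axis=-1)], axis=-1)
    return tf.concat([v, one], axis=-1) * remaining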
def test_meta_distributions():
    N = 100
    sigma_tf = tfd.Gamma(np.asarray(1.), np.asarray(1.)).sample()
    epsilon_tf = tfd.Normal(np.zeros((N, 1)), sigma_tf).sample()
    beta_tf = tfd.Normal(np.zeros((2, 1)), 1).sample()
    X = np.vstack([np.random.randn(N), np.ones(N)]).T
    X_tf = tf.convert_to_tensor(X)

    Y_tf = tf.linalg.matmul(X_tf, beta_tf) + epsilon_tf
    Y_mt = mt(Y_tf)

    # Confirm that all `Operation`s are the same.
    assert_ops_equal(Y_mt, Y_tf)

    # Now, let's see if we can reconstruct it entirely from the meta objects.
    def _remove_obj(meta_obj):
        if (hasattr(meta_obj, '_obj')
                and not isinstance(meta_obj, TFlowMetaOpDef)):
            meta_obj._obj = None
        if hasattr(meta_obj, 'ancestors'):
            for a in meta_obj.ancestors or []:
                _remove_obj(a)

    _remove_obj(Y_mt)
    Y_mt_tf = Y_mt.reify()
    assert_ops_equal(Y_mt, Y_mt_tf)
def __call__(self):
    """Get the distribution object from the backend"""
    if get_backend() == 'pytorch':
        import torch.distributions as tod
        return tod.gamma.Gamma(self.concentration, self.rate)
    else:
        from tensorflow_probability import distributions as tfd
        return tfd.Gamma(self.concentration, self.rate)
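For reference, a tiny illustrative example of what the TensorFlow branch returns, assuming the wrapper was built with concentration=2.0 and rate=1.0 (values chosen here only for illustration).

from tensorflow_probability import distributions as tfd

dist = tfd.Gamma(2.0, 1.0)  # what __call__ returns under the TensorFlow backend
mean = dist.mean()          # concentration / rate = 2.0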
def logp(par):
    p = param
    p['beta1'] = par[0]
    p['beta3'] = par[1]
    p['gamma'] = par[2]
    p['I0'] = par[3]
    p['r'] = par[4]
    beta_logp = tfd.Gamma(
        concentration=tf.constant(1., dtype=DTYPE),
        rate=tf.constant(1., dtype=DTYPE)).log_prob(p['beta1'])
    beta3_logp = tfd.Gamma(
        concentration=tf.constant(200., dtype=DTYPE),
        rate=tf.constant(200., dtype=DTYPE)).log_prob(p['beta3'])
    gamma_logp = tfd.Gamma(
        concentration=tf.constant(100., dtype=DTYPE),
        rate=tf.constant(400., dtype=DTYPE)).log_prob(p['gamma'])
    I0_logp = tfd.Gamma(
        concentration=tf.constant(1.5, dtype=DTYPE),
        rate=tf.constant(0.05, dtype=DTYPE)).log_prob(p['I0'])
    r_logp = tfd.Gamma(
        concentration=tf.constant(0.1, dtype=DTYPE),
        rate=tf.constant(0.1, dtype=DTYPE)).log_prob(p['r'])
    state_init = simulator.create_initial_state(init_matrix=seeding * p['I0'])
    t, sim, solve = simulator.simulate(p, state_init)
    y_logp = covid19uk_logp(y_incr, sim, 0.1, p['r'])
    logp = (beta_logp + beta3_logp + gamma_logp + I0_logp + r_logp
            + tf.reduce_sum(y_logp))
    return logp
def sample_f(self):
    """Runs MCMC to sample posterior functions."""
    # Add priors to the hyperparameters.
    self.model.kernel.lengthscales.prior = tfd.Gamma(f64(1.0), f64(1.0))
    self.model.kernel.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
    self.model.likelihood.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
    if self.mean_function is not None:
        self.model.mean_function.A.prior = tfd.Normal(f64(0.0), f64(10.0))
        self.model.mean_function.b.prior = tfd.Normal(f64(0.0), f64(10.0))

    # Sample from the posterior using HMC (required to estimate epistemic uncertainty).
    num_burnin_steps = ci_niter(300)
    num_samples = ci_niter(self.num_samples)

    # Note that here we need model.trainable_parameters, not trainable_variables:
    # only parameters can have priors!
    self.hmc_helper = gpflow.optimizers.SamplingHelper(
        self.model.log_posterior_density, self.model.trainable_parameters)
    hmc = tfp.mcmc.HamiltonianMonteCarlo(
        target_log_prob_fn=self.hmc_helper.target_log_prob_fn,
        num_leapfrog_steps=10,
        step_size=0.01)
    adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
        hmc,
        num_adaptation_steps=10,
        target_accept_prob=f64(0.75),
        adaptation_rate=0.1)

    @tf.function
    def run_chain_fn():
        return tfp.mcmc.sample_chain(
            num_results=num_samples,
            num_burnin_steps=num_burnin_steps,
            current_state=self.hmc_helper.current_state,
            kernel=adaptive_hmc,
            trace_fn=lambda _, pkr: pkr.inner_results.is_accepted,
        )

    self.samples, traces = run_chain_fn()
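The helper above can also map the unconstrained HMC draws back to constrained hyperparameter values; a minimal sketch, assuming a GPflow version whose SamplingHelper exposes convert_to_constrained_values.

# Sketch: convert unconstrained HMC samples back to constrained parameter values;
# constrained_samples[i] then corresponds to self.model.trainable_parameters[i].
constrained_samples = self.hmc_helper.convert_to_constrained_values(self.samples)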
def create_models(self, X, Y):
    """
    Construct a separate GP model for every output/target dimension,
    i.e. for every Delta_{t, i}.

    :param X: Data points, state-action pairs. (num_steps, state_dim + control_dim)
    :param Y: Data points, state differences. (num_steps, state_dim)
    :return:
    """
    for i in range(self.num_outputs):
        kern = gpflow.kernels.RBF()
        kern.lengthscales.prior = tfd.Gamma(1.0, 10.0)
        kern.variance.prior = tfd.Gamma(1.5, 2.0)
        model = gpflow.models.GPR(
            data=(X, Y[:, i:i + 1]), kernel=kern, mean_function=None
        )
        model.likelihood.variance.assign(2e-6)
        gpflow.set_trainable(model.likelihood, False)
        self.models.append(model)
        self.optimizers.append(gpflow.optimizers.Scipy())
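A short illustrative follow-up (not part of the original snippet), assuming self.models and self.optimizers have been populated as above: each per-output GPR can then be fit by minimizing its training loss with the matching Scipy optimizer.

# Fit every per-output GPR by maximizing its log marginal likelihood.
for model, optimizer in zip(self.models, self.optimizers):
    optimizer.minimize(model.training_loss, model.trainable_variables)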
def __init__(self, Nc, Ng, Kc=0, Kg=0, effLen=None, intercept=None,
             intercept_mode='gene', sigma=None, tau_prior=[3, 27], name=None):
    self.Nc = Nc
    self.Ng = Ng
    self.Kc = Kc
    self.Kg = Kg
    self.effLen = effLen  # (Ng, 3 * 2)
    self.intercept_mode = intercept_mode

    self.Z_loc = tf.Variable(
        tf.random.normal([Nc, Ng]), name='Z_loc',
        constraint=lambda t: tf.clip_by_value(t, -9, 9))
    self.Z_std_log = tf.Variable(tf.random.normal([Nc, Ng]), name='Z_var')
    self.Wc_loc = tf.Variable(tf.random.normal([Kc, Ng]), name='Wc_loc')
    self.Wg_loc = tf.Variable(tf.random.normal([Nc, Kg]), name='Wg_loc')

    if intercept_mode.upper() == 'GENE':
        _intercept_shape = (1, Ng)
        _sigma_shape = (1, Ng)
    elif intercept_mode.upper() == 'CELL':
        _intercept_shape = (Nc, 1)
        _sigma_shape = (Nc, 1)
    else:
        # print("[BIRE2] Error: intercept_mode only supports gene or cell")
        _intercept_shape = (1, Ng)
        _sigma_shape = (1, Ng)

    if intercept is None:
        self.intercept = tf.Variable(
            tf.random.normal(_intercept_shape), name='bias',
            constraint=lambda t: tf.clip_by_value(t, -9, 9))
    else:
        _intercept = tf.ones(_intercept_shape) * intercept
        self.intercept = tf.constant(_intercept, name='bias')

    self.tau_a_log = tf.Variable(tf.ones(_sigma_shape), name='tau_a_log')
    self.tau_b_log = tf.Variable(tf.ones(_sigma_shape), name='tau_b_log')

    print(tau_prior)
    self.tauPrior = tfd.Gamma(tau_prior[0], tau_prior[1])
def empirical_Ey_and_Ey2_tf_logscore(ct=1.0, rt=1.0, cb=0.1, rb=0.1,
                                     nsamples_latent=100, nsamples_output=3,
                                     N=1, M=1, K=25):
    """Returns E_prior[Y] and E_prior[Y^2] for a given set of hyperparameters.

    The outputs are (tf) differentiable w.r.t. the hyperparameters.
    Gradients are obtained using the log-score (score-function) derivative trick.
    """
    if N != 1:
        warnings.warn("N!=1 will be ignored!")
    if M != 1:
        warnings.warn("M!=1 will be ignored!")

    theta = tfd.Gamma(ct, rt).sample((K, nsamples_latent))
    beta = tfd.Gamma(cb, rb).sample((K, nsamples_latent))
    latent = tf.reduce_sum(theta * beta, 0)
    poisson = tfd.Poisson(rate=latent)
    y_samples = poisson.sample([nsamples_output])

    conditional_expectation = tfp.monte_carlo.expectation(
        f=lambda x: x, samples=y_samples,
        log_prob=poisson.log_prob, use_reparameterization=False)
    conditional_expectation_squared = tfp.monte_carlo.expectation(
        f=lambda x: x * x, samples=y_samples,
        log_prob=poisson.log_prob, use_reparameterization=False)

    expectation = tf.reduce_mean(conditional_expectation)
    expectation_squared = tf.reduce_mean(conditional_expectation_squared)
    return expectation, expectation_squared
def empirical_Ey_and_Ey2_tf(ct=1.0, rt=1.0, cb=0.1, rb=0.1,
                            nsamples_latent=100, nsamples_output=3,
                            N=1, M=1, K=25):
    """Returns E_prior[Y] and E_prior[Y^2] for a given set of hyperparameters.

    The outputs are (tf) differentiable w.r.t. the hyperparameters.
    """
    if N != 1:
        warnings.warn("N!=1 will be ignored!")
    if M != 1:
        warnings.warn("M!=1 will be ignored!")

    # ct, rt, cb, rb = _make_tf(ct), _make_tf(rt), _make_tf(cb), _make_tf(rb)
    theta = tfd.Gamma(ct, rt).sample((K, nsamples_latent))
    beta = tfd.Gamma(cb, rb).sample((K, nsamples_latent))
    latent = tf.reduce_sum(theta * beta, 0)
    poisson = tfd.Poisson(rate=latent)

    # y_samples = np.random.poisson(latent, size=[nsamples_output, nsamples_latent])  # NO x NL
    y_samples = tf.stop_gradient(poisson.sample([nsamples_output]))
    y_probs = tf.exp(poisson.log_prob(y_samples))
    total_prob = tf.reduce_sum(y_probs, 0)
    conditional_expectation = tf.reduce_sum(y_probs * y_samples, 0) / total_prob
    conditional_expectation_squared = tf.reduce_sum(y_probs * (y_samples**2), 0) / total_prob

    expectation = tf.reduce_mean(conditional_expectation)
    expectation_squared = tf.reduce_mean(conditional_expectation_squared)
    return expectation, expectation_squared
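Because TFP's Gamma sampler provides implicit reparameterization gradients and the Poisson draws are wrapped in stop_gradient, the estimator above is differentiable with respect to ct, rt, cb and rb. A minimal sketch of taking hyperparameter gradients (the target value 5.0 is an arbitrary illustration):

ct = tf.Variable(1.0)
rt = tf.Variable(1.0)
with tf.GradientTape() as tape:
    ey, ey2 = empirical_Ey_and_Ey2_tf(ct=ct, rt=rt)
    loss = (ey - 5.0) ** 2  # e.g. match a desired prior mean of Y
grads = tape.gradient(loss, [ct, rt])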
def create_models(self, data):
    self.models = []
    for i in range(self.num_outputs):
        kernel = gpflow.kernels.SquaredExponential(
            lengthscales=tf.ones([data[0].shape[1]], dtype=float_type))
        transformed_lengthscales = Parameter(
            kernel.lengthscales, transform=positive(lower=1e-3))
        kernel.lengthscales = transformed_lengthscales
        kernel.lengthscales.prior = tfd.Gamma(f64(1.1), f64(1 / 10.0))
        if i == 0:
            self.models.append(FakeGPR((data[0], data[1][:, i:i + 1]), kernel))
        else:
            self.models.append(FakeGPR((data[0], data[1][:, i:i + 1]), kernel,
                                       self.models[-1].X))
def german_credit_model():
    x_numeric = tf.constant(numericals.astype(np.float32))
    x_categorical = [tf.one_hot(c, c.max() + 1) for c in categoricals]
    all_x = tf.concat([x_numeric] + x_categorical, 1)
    num_features = int(all_x.shape[1])

    overall_log_scale = ed.Normal(loc=0., scale=10., name='overall_log_scale')
    beta_log_scales = ed.TransformedDistribution(
        tfd.Gamma(0.5 * tf.ones([num_features]), 0.5),
        bijector=tfb.Invert(tfb.Exp()),
        name='beta_log_scales')
    beta = ed.Normal(loc=tf.zeros([num_features]),
                     scale=tf.exp(overall_log_scale + beta_log_scales),
                     name='beta')
    logits = tf.einsum('nd,md->mn', all_x, beta[tf.newaxis, :])
    return ed.Bernoulli(logits=logits, name='y')
def set_prior(self, mu_prior=None, sigma_prior=None, theta_prior=None):
    """Set prior distributions"""
    # Prior distributions for the means
    if mu_prior is None:
        self.mu_prior = tfd.Normal(tf.zeros((self.Nc, self.Nd)),
                                   tf.ones((self.Nc, self.Nd)))
    else:
        self.mu_prior = mu_prior

    # Prior distributions for the standard deviations
    if sigma_prior is None:
        self.sigma_prior = tfd.Gamma(2 * tf.ones((self.Nc, self.Nd)),
                                     2 * tf.ones((self.Nc, self.Nd)))
    else:
        self.sigma_prior = sigma_prior

    # Prior distribution for the component weights
    if theta_prior is None:
        self.theta_prior = tfd.Dirichlet(5 * tf.ones((self.Nc,)))
    else:
        self.theta_prior = theta_prior
def set_prior(self, mu_prior=None, sigma_prior=None, ident_prior=None):
    """Set prior distributions"""
    # Prior distributions for the means
    if mu_prior is None:
        self.mu_prior = tfd.Normal(tf.zeros((self.Nc, self.Nd)),
                                   tf.ones((self.Nc, self.Nd)))
    else:
        self.mu_prior = mu_prior

    # Prior distributions for the standard deviations
    if sigma_prior is None:
        self.sigma_prior = tfd.Gamma(2 * tf.ones((self.Nc, self.Nd)),
                                     2 * tf.ones((self.Nc, self.Nd)))
    else:
        self.sigma_prior = sigma_prior

    # Prior distribution for sample assignment
    if ident_prior is None:
        self.ident_prior = tfd.Multinomial(
            total_count=1, probs=tf.ones((self.Ns, self.Nc)) / self.Nc)
    else:
        self.ident_prior = ident_prior
def __repr__(self):
    component_mean = reprlib.repr(
        tfd.Gamma(concentration=.1, rate=.001).sample(
            [self.num_components, self.var_dim]))
    return str(component_mean)
def _base_dist(self, alpha: TensorLike, beta: TensorLike, *args, **kwargs):
    return tfd.Gamma(concentration=alpha, rate=beta, *args, **kwargs)
def sigma(self):
    """Variational posterior for the distribution variance"""
    return tfd.Gamma(self.alpha, self.beta)
def construct_model(self):
    with self.graph.as_default():
        self.sess.close()
        self.sess = tf.compat.v1.InteractiveSession()
        self.sess.as_default()

        self.x = tf.convert_to_tensor(self.rescaled_features, dtype=tf.float32)
        self.y = tf.convert_to_tensor(self.targets, dtype=tf.float32)

        # construct precisness
        self.tau_rescaling = np.zeros((self.num_obs, self.bnn_output_size))
        kernel_ranges = self.config.kernel_ranges
        for obs_index in range(self.num_obs):
            self.tau_rescaling[obs_index] += kernel_ranges
        self.tau_rescaling = self.tau_rescaling**2

        # construct weight and bias shapes
        activations = [tf.nn.tanh]
        weight_shapes, bias_shapes = [[self.feature_size, self._hidden_shape]], [[self._hidden_shape]]
        for _ in range(1, self._num_layers - 1):
            activations.append(tf.nn.tanh)
            weight_shapes.append([self._hidden_shape, self._hidden_shape])
            bias_shapes.append([self._hidden_shape])
        activations.append(lambda x: x)
        weight_shapes.append([self._hidden_shape, self.bnn_output_size])
        bias_shapes.append([self.bnn_output_size])

        # construct prior
        self.prior_layer_outputs = [self.x]
        self.priors = {}
        for layer_index in range(self._num_layers):
            weight_shape, bias_shape = weight_shapes[layer_index], bias_shapes[layer_index]
            activation = activations[layer_index]

            weight = tfd.Normal(loc=tf.zeros(weight_shape) + self._weight_loc,
                                scale=tf.zeros(weight_shape) + self._weight_scale)
            bias = tfd.Normal(loc=tf.zeros(bias_shape) + self._bias_loc,
                              scale=tf.zeros(bias_shape) + self._bias_scale)
            self.priors['weight_%d' % layer_index] = weight
            self.priors['bias_%d' % layer_index] = bias

            prior_layer_output = activation(tf.matmul(self.prior_layer_outputs[-1], weight.sample()) + bias.sample())
            self.prior_layer_outputs.append(prior_layer_output)

        self.prior_bnn_output = self.prior_layer_outputs[-1]
        self.prior_tau_normed = tfd.Gamma(
            self.num_obs**2 + tf.zeros((self.num_obs, self.bnn_output_size)),
            tf.ones((self.num_obs, self.bnn_output_size)))
        self.prior_tau = self.prior_tau_normed.sample() / self.tau_rescaling
        self.prior_scale = tfd.Deterministic(1. / tf.sqrt(self.prior_tau))

        # construct posterior
        self.post_layer_outputs = [self.x]
        self.posteriors = {}
        for layer_index in range(self._num_layers):
            weight_shape, bias_shape = weight_shapes[layer_index], bias_shapes[layer_index]
            activation = activations[layer_index]

            weight = tfd.Normal(loc=tf.Variable(tf.random.normal(weight_shape)),
                                scale=tf.nn.softplus(tf.Variable(tf.zeros(weight_shape))))
            bias = tfd.Normal(loc=tf.Variable(tf.random.normal(bias_shape)),
                              scale=tf.nn.softplus(tf.Variable(tf.zeros(bias_shape))))
            self.posteriors['weight_%d' % layer_index] = weight
            self.posteriors['bias_%d' % layer_index] = bias

            post_layer_output = activation(tf.matmul(self.post_layer_outputs[-1], weight.sample()) + bias.sample())
            self.post_layer_outputs.append(post_layer_output)

        self.post_bnn_output = self.post_layer_outputs[-1]
        self.post_tau_normed = tfd.Gamma(
            self.num_obs**2 + tf.Variable(tf.zeros((self.num_obs, self.bnn_output_size))),
            tf.nn.softplus(tf.Variable(tf.ones((self.num_obs, self.bnn_output_size)))))
        self.post_tau = self.post_tau_normed.sample() / self.tau_rescaling
        self.post_sqrt_tau = tf.sqrt(self.post_tau)
        self.post_scale = tfd.Deterministic(1. / self.post_sqrt_tau)

        # map bnn output to prediction
        post_kernels = {}
        targets_dict = {}
        inferences = []

        target_element_index = 0
        kernel_element_index = 0
        while kernel_element_index < len(self.config.kernel_names):
            kernel_type = self.config.kernel_types[kernel_element_index]
            kernel_size = self.config.kernel_sizes[kernel_element_index]

            feature_begin, feature_end = target_element_index, target_element_index + 1
            kernel_begin, kernel_end = kernel_element_index, kernel_element_index + kernel_size

            prior_relevant = self.prior_bnn_output[:, kernel_begin:kernel_end]
            post_relevant = self.post_bnn_output[:, kernel_begin:kernel_end]

            target = self.y[:, kernel_begin:kernel_end]
            lowers, uppers = self.config.kernel_lowers[kernel_begin:kernel_end], self.config.kernel_uppers[kernel_begin:kernel_end]

            prior_support = (uppers - lowers) * (1.2 * tf.nn.sigmoid(prior_relevant) - 0.1) + lowers
            post_support = (uppers - lowers) * (1.2 * tf.nn.sigmoid(post_relevant) - 0.1) + lowers

            prior_predict = tfd.Normal(prior_support, self.prior_scale[:, kernel_begin:kernel_end].sample())
            post_predict = tfd.Normal(post_support, self.post_scale[:, kernel_begin:kernel_end].sample())

            targets_dict[prior_predict] = target
            post_kernels['param_%d' % target_element_index] = {
                'loc': tfd.Deterministic(post_support),
                'sqrt_prec': tfd.Deterministic(self.post_sqrt_tau[:, kernel_begin:kernel_end]),
                'scale': tfd.Deterministic(self.post_scale[:, kernel_begin:kernel_end].sample())}

            inference = {'pred': post_predict, 'target': target}
            inferences.append(inference)

            target_element_index += 1
            kernel_element_index += kernel_size

        self.post_kernels = post_kernels
        self.targets_dict = targets_dict

        loss = 0.
        for inference in inferences:
            loss += -tf.reduce_sum(inference['pred'].log_prob(inference['target']))

        self.optimizer = tf.compat.v1.train.AdamOptimizer(self._learning_rate)
        self.train_op = self.optimizer.minimize(loss)

        tf.compat.v1.global_variables_initializer().run()
def tauDist(self):
    return tfd.Gamma(tf.exp(self.tau_a_log), tf.exp(self.tau_b_log))
def construct_model(self, learning_rate=None):
    if learning_rate is None:
        learning_rate = self.learning_rate

    with self.graph.as_default():
        self.sess.close()
        self.sess = tf.compat.v1.InteractiveSession()
        self.sess.as_default()

        self.x = tf.convert_to_tensor(self.rescaled_features, dtype=tf.float32)
        self.y = tf.convert_to_tensor(self.targets, dtype=tf.float32)

        # construct precisness
        self.tau_rescaling = np.zeros((self.num_obs, self.bnn_output_size))
        kernel_ranges = self.config.kernel_ranges
        for obs_index in range(self.num_obs):
            self.tau_rescaling[obs_index] += kernel_ranges
        self.tau_rescaling = self.tau_rescaling**2

        # construct weight and bias shapes
        activations = [tf.nn.tanh]
        weight_shapes, bias_shapes = [[self.feature_size, self.hidden_shape]], [[self.hidden_shape]]
        for _ in range(1, self.num_layers - 1):
            activations.append(tf.nn.tanh)
            weight_shapes.append([self.hidden_shape, self.hidden_shape])
            bias_shapes.append([self.hidden_shape])
        activations.append(lambda x: x)
        weight_shapes.append([self.hidden_shape, self.bnn_output_size])
        bias_shapes.append([self.bnn_output_size])

        # ---------------
        # construct prior
        # ---------------
        self.prior_layer_outputs = [self.x]
        self.priors = {}
        for layer_index in range(self.num_layers):
            weight_shape, bias_shape = weight_shapes[layer_index], bias_shapes[layer_index]
            activation = activations[layer_index]

            weight = tfd.Normal(loc=tf.zeros(weight_shape) + self.weight_loc,
                                scale=tf.zeros(weight_shape) + self.weight_scale)
            bias = tfd.Normal(loc=tf.zeros(bias_shape) + self.bias_loc,
                              scale=tf.zeros(bias_shape) + self.bias_scale)
            self.priors['weight_%d' % layer_index] = weight
            self.priors['bias_%d' % layer_index] = bias

            prior_layer_output = activation(tf.matmul(self.prior_layer_outputs[-1], weight.sample()) + bias.sample())
            self.prior_layer_outputs.append(prior_layer_output)

        self.prior_bnn_output = self.prior_layer_outputs[-1]
        # draw precisions from gamma distribution
        self.prior_tau_normed = tfd.Gamma(
            12 * (self.num_obs / self.frac_feas)**2 + tf.zeros((self.num_obs, self.bnn_output_size)),
            tf.ones((self.num_obs, self.bnn_output_size)),
        )
        self.prior_tau = self.prior_tau_normed.sample() / self.tau_rescaling
        self.prior_scale = tfd.Deterministic(1. / tf.sqrt(self.prior_tau))

        # -------------------
        # construct posterior
        # -------------------
        self.post_layer_outputs = [self.x]
        self.posteriors = {}
        for layer_index in range(self.num_layers):
            weight_shape, bias_shape = weight_shapes[layer_index], bias_shapes[layer_index]
            activation = activations[layer_index]

            weight = tfd.Normal(loc=tf.Variable(tf.random.normal(weight_shape)),
                                scale=tf.nn.softplus(tf.Variable(tf.zeros(weight_shape))))
            bias = tfd.Normal(loc=tf.Variable(tf.random.normal(bias_shape)),
                              scale=tf.nn.softplus(tf.Variable(tf.zeros(bias_shape))))
            self.posteriors['weight_%d' % layer_index] = weight
            self.posteriors['bias_%d' % layer_index] = bias

            post_layer_output = activation(tf.matmul(self.post_layer_outputs[-1], weight.sample()) + bias.sample())
            self.post_layer_outputs.append(post_layer_output)

        self.post_bnn_output = self.post_layer_outputs[-1]
        self.post_tau_normed = tfd.Gamma(
            12 * (self.num_obs / self.frac_feas)**2 + tf.Variable(tf.zeros((self.num_obs, self.bnn_output_size))),
            tf.nn.softplus(tf.Variable(tf.ones((self.num_obs, self.bnn_output_size)))),
        )
        self.post_tau = self.post_tau_normed.sample() / self.tau_rescaling
        self.post_sqrt_tau = tf.sqrt(self.post_tau)
        self.post_scale = tfd.Deterministic(1. / self.post_sqrt_tau)

        # map bnn output to prediction
        post_kernels = {}
        targets_dict = {}
        inferences = []

        target_element_index = 0
        kernel_element_index = 0
        while kernel_element_index < len(self.config.kernel_names):
            kernel_type = self.config.kernel_types[kernel_element_index]
            kernel_size = self.config.kernel_sizes[kernel_element_index]

            feature_begin, feature_end = target_element_index, target_element_index + 1
            kernel_begin, kernel_end = kernel_element_index, kernel_element_index + kernel_size

            prior_relevant = self.prior_bnn_output[:, kernel_begin:kernel_end]
            post_relevant = self.post_bnn_output[:, kernel_begin:kernel_end]

            if kernel_type == 'continuous':
                target = self.y[:, kernel_begin:kernel_end]
                lowers, uppers = self.config.kernel_lowers[kernel_begin:kernel_end], self.config.kernel_uppers[kernel_begin:kernel_end]

                prior_support = (uppers - lowers) * (1.2 * tf.nn.sigmoid(prior_relevant) - 0.1) + lowers
                post_support = (uppers - lowers) * (1.2 * tf.nn.sigmoid(post_relevant) - 0.1) + lowers

                prior_predict = tfd.Normal(prior_support, self.prior_scale[:, kernel_begin:kernel_end].sample())
                post_predict = tfd.Normal(post_support, self.post_scale[:, kernel_begin:kernel_end].sample())

                targets_dict[prior_predict] = target
                post_kernels['param_%d' % target_element_index] = {
                    'loc': tfd.Deterministic(post_support),
                    'sqrt_prec': tfd.Deterministic(self.post_sqrt_tau[:, kernel_begin:kernel_end]),
                    'scale': tfd.Deterministic(self.post_scale[:, kernel_begin:kernel_end].sample())}

                inference = {'pred': post_predict, 'target': target}
                inferences.append(inference)

            elif kernel_type in ['categorical', 'discrete']:
                target = tf.cast(self.y[:, kernel_begin:kernel_end], tf.int32)

                prior_temperature = 0.5 + 10.0 / (self.num_obs / self.frac_feas)
                # prior_temperature = 1.0
                post_temperature = prior_temperature

                prior_support = prior_relevant
                post_support = post_relevant

                prior_predict_relaxed = tfd.RelaxedOneHotCategorical(prior_temperature, prior_support)
                prior_predict = tfd.OneHotCategorical(probs=prior_predict_relaxed.sample())

                post_predict_relaxed = tfd.RelaxedOneHotCategorical(post_temperature, post_support)
                post_predict = tfd.OneHotCategorical(probs=post_predict_relaxed.sample())

                targets_dict[prior_predict] = target
                post_kernels['param_%d' % target_element_index] = {'probs': post_predict_relaxed}

                inference = {'pred': post_predict, 'target': target}
                inferences.append(inference)

                '''
                Temperature annealing schedule:
                    - temperature of 100 yields 1e-2 deviation from uniform
                    - temperature of 10 yields 1e-1 deviation from uniform
                    - temperature of 1 yields *almost* perfect agreement with expectation
                    - temperature of 0.1 yields perfect agreement with expectation
                '''

            else:
                GryffinUnknownSettingsError(f'did not understand kernel type: {kernel_type}')

            target_element_index += 1
            kernel_element_index += kernel_size

        self.post_kernels = post_kernels
        self.targets_dict = targets_dict

        self.loss = 0.
        for inference in inferences:
            self.loss += -tf.reduce_sum(inference['pred'].log_prob(inference['target']))

        self.optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
        self.train_op = self.optimizer.minimize(self.loss)

        tf.compat.v1.global_variables_initializer().run()
        validate_args=True, allow_nan_stats=False)
    },
    "gamma": {
        "parameters": {
            "concentration": {"support": [0, inf], "activation function": softplus},
            "rate": {"support": [0, inf], "activation function": softplus}
        },
        "class": lambda theta: tensorflow_distributions.Gamma(
            concentration=theta["concentration"],
            rate=theta["rate"])
    },
    "categorical": {
        "parameters": {
            "logits": {"support": [-inf, inf], "activation function": identity}
        },
        "class": lambda theta: tensorflow_distributions.Categorical(
            logits=theta["logits"]),
    },
    "bernoulli": {
        "parameters": {
            "logits": {
# Data used by the control model (pre-intervention)
xc = x[x <= ip]
yc = y[x <= ip]
xd = x[x > ip]
yd = y[x > ip]

# Data used by the (post-)intervention model
xi = xd[xd <= ip2]
yi = yd[xd <= ip2]
xe = xd[xd > ip2]
ye = yd[xd > ip2]

ks = [RBF(), RBF()]
ks[1].variance.prior = dist.Gamma(np.float64(20), np.float64(4.35))

m1 = None
for k in ks:
    m1 = GPMContainer(gf.utilities.deepcopy(k), [(x, y)], [])
    m2 = GPMContainer(gf.utilities.deepcopy(k), [(xc, yc), (xd, yd)], [ip])

    for name, m in zip(['c', 'd'], [m1, m2]):
        m.train()
        m.plot_regression()
        plt.show()
        print(f"{name} l: {m.log_posterior_density()}")

print(f"trainable parameters: {m1.trainable_parameters}")
print(f"log prior density: {m1.kernel.variance.log_prior_density()}")
def _init_distribution(conditions, **kwargs):
    concentration, rate = conditions["concentration"], conditions["rate"]
    return tfd.Gamma(concentration=concentration, rate=rate, **kwargs)
# %% [markdown]
# Secondly, we initialize the model to the maximum likelihood solution.

# %%
optimizer = gpflow.optimizers.Scipy()
optimizer.minimize(model.training_loss, model.trainable_variables)

print(f"log posterior density at optimum: {model.log_posterior_density()}")

# %% [markdown]
# Thirdly, we add priors to the hyperparameters.

# %%
# tfp.distributions dtype is inferred from parameters - so convert to 64-bit
model.kernel.lengthscales.prior = tfd.Gamma(f64(1.0), f64(1.0))
model.kernel.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
model.likelihood.variance.prior = tfd.Gamma(f64(1.0), f64(1.0))
model.mean_function.A.prior = tfd.Normal(f64(0.0), f64(10.0))
model.mean_function.b.prior = tfd.Normal(f64(0.0), f64(10.0))

gpflow.utilities.print_summary(model)

# %% [markdown]
# We now sample from the posterior using HMC.

# %%
num_burnin_steps = ci_niter(300)
num_samples = ci_niter(500)

# Note that here we need model.trainable_parameters, not trainable_variables:
# only parameters can have priors!
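A sketch of the sampling cell that typically follows, mirroring the HMC setup used in sample_f above (step size, leapfrog count and adaptation settings are illustrative, not prescribed by the original notebook).

# %%
hmc_helper = gpflow.optimizers.SamplingHelper(
    model.log_posterior_density, model.trainable_parameters)
hmc = tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn=hmc_helper.target_log_prob_fn,
    num_leapfrog_steps=10,
    step_size=0.01)
adaptive_hmc = tfp.mcmc.SimpleStepSizeAdaptation(
    hmc, num_adaptation_steps=10, target_accept_prob=f64(0.75), adaptation_rate=0.1)

samples, traces = tfp.mcmc.sample_chain(
    num_results=num_samples,
    num_burnin_steps=num_burnin_steps,
    current_state=hmc_helper.current_state,
    kernel=adaptive_hmc,
    trace_fn=lambda _, pkr: pkr.inner_results.is_accepted)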