def test_log_likelihood2(self):
    d = 100
    data = np.tile(np.eye(d)[None], [10, 1, 1])
    S = np.diag(np.exp(np.random.normal(size=d)))
    sigma = IW([T.to_float(S), d + 1])
    np.testing.assert_almost_equal(
        self.session.run(sigma.log_likelihood(T.to_float(data))),
        # scipy's invwishart stacks samples along the trailing axis, hence data.T.
        invwishart(scale=S, df=d + 1).logpdf(data.T),
        3)
def test_log_likelihood2(self):
    d = 100
    data = invwishart(scale=np.eye(d), df=d + 1).rvs(size=100)
    S = np.eye(d)
    sigma = IW([T.to_float(S), d + 1])
    np.testing.assert_almost_equal(
        self.session.run(sigma.log_likelihood(T.to_float(data))),
        invwishart(scale=S, df=d + 1).logpdf(data.T),
        -1)
def initialize_node(node, children):
    if isinstance(node, Gaussian):
        d = T.shape(node)
        return Gaussian([T.eye(d[-1], batch_shape=d[:-1]),
                         T.random_normal(d)])
    elif isinstance(node, IW):
        d = T.shape(node)
        return IW([(T.to_float(d[-1]) + 1) * T.eye(d[-1], batch_shape=d[:-2]),
                   T.to_float(d[-1]) + 1])
def initialize_objective(self):
    self.C, self.c = (
        T.variable(T.random_normal([self.ds, self.ds])),
        T.variable(T.random_normal([self.ds])),
    )
    if self.learn_stdev:
        self.stdev = T.variable(T.to_float(self.cost_stdev))
    else:
        self.stdev = T.to_float(self.cost_stdev)
def test_log_likelihood1(self):
    d = 2
    data = np.tile(np.eye(d)[None], [10, 1, 1])
    sigma = IW([T.eye(d), d + 1])
    np.testing.assert_almost_equal(
        self.session.run(sigma.log_likelihood(T.to_float(data))),
        invwishart(scale=np.eye(d), df=d + 1).logpdf(data.T),
        5)
def vmp(graph, data, max_iter=100, tol=1e-4):
    # Partition nodes into observed values and hidden nodes; each hidden node
    # gets an initial variational factor.
    q, visible = {}, {}
    for node in top_sort(graph)[::-1]:
        if node in data:
            visible[node] = T.to_float(data[node])
        else:
            q[node] = initialize_node(node, {})
    ordering = list(q.keys())
    params = [q[var].get_parameters('natural') for var in ordering]

    def cond(i, elbo, prev_elbo, q):
        return T.logical_and(i < max_iter, abs(elbo - prev_elbo) > tol)

    def step(i, elbo, prev_elbo, q):
        prev_elbo = elbo
        q_vars = {
            var: var.__class__(param, 'natural')
            for var, param in zip(ordering, q)
        }
        q, elbo = message_passing(q_vars, visible)
        return i + 1, elbo, prev_elbo, [
            q[var].get_parameters('natural') for var in ordering
        ]

    # Iterate message passing until the ELBO stops improving or the iteration
    # budget is exhausted.
    i, elbo, prev_elbo, q = T.while_loop(cond, step,
                                         [0, float('inf'), 0.0, params])
    return {
        var: var.__class__(param, 'natural')
        for var, param in zip(ordering, q)
    }, elbo
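# For context, the fixed point that message_passing above is assumed to
# iterate toward is the standard VMP update (Winn & Bishop, 2005): each hidden
# node's factor stays in the same exponential family as its prior, and its
# natural parameters are the expected natural parameters contributed by its
# parents plus the sum of messages from its children,
#
#     lambda_j = E_q[eta_j(parents_j)] + sum_{c in children(j)} E_q[m_{c -> j}],
#
# so each call to step() is one sweep of coordinate ascent on the ELBO.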
def test_log_z(self):
    d = 100
    S = np.eye(d)
    sigma = IW([T.to_float(S), d + 1])
    np.testing.assert_almost_equal(
        self.session.run(sigma.log_z()), self.log_z(S, d + 1), 3)
    np.testing.assert_almost_equal(
        self.session.run(sigma.log_z('natural')), self.log_z(S, d + 1), 3)
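# For reference, a minimal numpy sketch of the inverse-Wishart log-normalizer
# the test above compares against. This assumes the scipy.stats.invwishart
# convention of a scale matrix S and degrees of freedom nu; the test's own
# self.log_z helper may differ by parameterization or an additive constant.
import numpy as np
from scipy.special import multigammaln


def reference_iw_log_z(S, nu):
    # log Z(S, nu) = (nu * d / 2) log 2 + log Gamma_d(nu / 2) - (nu / 2) log|S|
    d = S.shape[-1]
    return (nu * d / 2. * np.log(2.) + multigammaln(nu / 2., d)
            - nu / 2. * np.linalg.slogdet(S)[1])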
def log_z(self, parameter_type='regular', stop_gradient=False):
    if parameter_type == 'regular':
        sigma, mu = self.get_parameters('regular', stop_gradient=stop_gradient)
        d = T.to_float(self.shape()[-1])
        hsi, hlds = Stats.HSI(sigma), Stats.HLDS(sigma)
        mmT = Stats.XXT(mu)
        return (-T.sum(hsi * mmT, [-1, -2]) - hlds
                + d / 2. * np.log(2 * np.pi))
    else:
        natparam = self.get_parameters('natural', stop_gradient=stop_gradient)
        d = T.to_float(self.shape()[-1])
        J, m = natparam[Stats.XXT], natparam[Stats.X]
        return (-0.25 * (m[..., None, :] @ T.matrix_inverse(J)
                         @ m[..., None])[..., 0, 0]
                - 0.5 * T.logdet(-2 * J)
                + d / 2. * np.log(2 * np.pi))
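# A minimal numpy sketch of the identity behind log_z above, assuming the
# natural parameters are J = -0.5 * inv(Sigma) (paired with x x^T) and
# m = inv(Sigma) @ mu (paired with x), and that Stats.HSI / Stats.HLDS denote
# -0.5 * inv(Sigma) and 0.5 * log|inv(Sigma)|. Under those assumptions both
# branches reduce to
#     log Z = 0.5 * mu^T inv(Sigma) mu + 0.5 * log|Sigma| + d/2 * log(2 pi).
import numpy as np


def gaussian_log_z_regular(sigma, mu):
    d = mu.shape[-1]
    si = np.linalg.inv(sigma)
    return (0.5 * mu @ si @ mu + 0.5 * np.linalg.slogdet(sigma)[1]
            + d / 2. * np.log(2 * np.pi))


def gaussian_log_z_natural(J, m):
    d = m.shape[-1]
    return (-0.25 * m @ np.linalg.inv(J) @ m
            - 0.5 * np.linalg.slogdet(-2 * J)[1]
            + d / 2. * np.log(2 * np.pi))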
def test_stats1(self):
    d = 2
    S = np.eye(d)
    sigma = IW([T.to_float(S), d + 1])
    stats = self.stats(S, d + 1)
    stats_ = self.session.run(sigma.expected_sufficient_statistics())
    for s in sigma.statistics():
        np.testing.assert_almost_equal(stats_[s], stats[s])
def initialize_objective(self):
    H, ds, da = self.horizon, self.ds, self.da
    if self.time_varying:
        # Matrix-normal inverse-Wishart prior on the dynamics; the prior mean
        # maps the state through the identity and ignores the action.
        A = T.concatenate(
            [T.eye(ds, batch_shape=[H - 1]), T.zeros([H - 1, ds, da])], -1)
        self.A_prior = stats.MNIW([
            2 * T.eye(ds, batch_shape=[H - 1]),
            A,
            T.eye(ds + da, batch_shape=[H - 1]),
            T.to_float(ds + 2) * T.ones([H - 1])
        ], parameter_type='regular')
        # Variational posterior initialized at the prior, with small noise on
        # the mean dynamics matrix.
        self.A_variational = stats.MNIW(list(
            map(
                T.variable,
                stats.MNIW.regular_to_natural([
                    2 * T.eye(ds, batch_shape=[H - 1]),
                    A + 1e-2 * T.random_normal([H - 1, ds, ds + da]),
                    T.eye(ds + da, batch_shape=[H - 1]),
                    T.to_float(ds + 2) * T.ones([H - 1])
                ]))), parameter_type='natural')
    else:
        A = T.concatenate([T.eye(ds), T.zeros([ds, da])], -1)
        self.A_prior = stats.MNIW(
            [2 * T.eye(ds), A, T.eye(ds + da), T.to_float(ds + 2)],
            parameter_type='regular')
        self.A_variational = stats.MNIW(list(
            map(
                T.variable,
                stats.MNIW.regular_to_natural([
                    2 * T.eye(ds),
                    A + 1e-2 * T.random_normal([ds, ds + da]),
                    T.eye(ds + da),
                    T.to_float(ds + 2)
                ]))), parameter_type='natural')
def kl_divergence(self, q_X, q_A, num_data):
    if (q_X, q_A) not in self.cache:
        if self.smooth:
            state_prior = stats.GaussianScaleDiag(
                [T.ones(self.ds), T.zeros(self.ds)])
            self.p_X = stats.LDS(
                (self.sufficient_statistics(), state_prior, None,
                 q_A.expected_value(), self.horizon), 'internal')
            # Local KL: per-trajectory KL between the state posterior and the
            # LDS prior induced by the current dynamics.
            local_kl = stats.kl_divergence(q_X, self.p_X)
            # Global KL: KL on the dynamics parameters, shared across the data.
            if self.time_varying:
                global_kl = T.sum(
                    stats.kl_divergence(self.A_variational, self.A_prior))
            else:
                global_kl = stats.kl_divergence(self.A_variational,
                                                self.A_prior)
            prior_kl = (T.mean(local_kl, axis=0)
                        + global_kl / T.to_float(num_data))
            A, Q = self.get_dynamics()
            model_stdev = T.sqrt(T.matrix_diag_part(Q))
            self.cache[(q_X, q_A)] = prior_kl, {
                'local-kl': local_kl,
                'global-kl': global_kl,
                'model-stdev': model_stdev,
            }
        else:
            # One-step predictive KL: compare q(x_{t+1}) with the dynamics'
            # forward prediction from q(x_t, a_t).
            q_Xt = q_X.__class__([
                q_X.get_parameters('regular')[0][:, :-1],
                q_X.get_parameters('regular')[1][:, :-1],
            ])
            q_At = q_A.__class__([
                q_A.get_parameters('regular')[0][:, :-1],
                q_A.get_parameters('regular')[1][:, :-1],
            ])
            p_Xt1 = self.forward(q_Xt, q_At)
            q_Xt1 = q_X.__class__([
                q_X.get_parameters('regular')[0][:, 1:],
                q_X.get_parameters('regular')[1][:, 1:],
            ])
            num_data = T.to_float(num_data)
            rmse = T.sqrt(
                T.sum(T.square(q_Xt1.get_parameters('regular')[1]
                               - p_Xt1.get_parameters('regular')[1]),
                      axis=-1))
            A, Q = self.get_dynamics()
            model_stdev = T.sqrt(T.matrix_diag_part(Q))
            local_kl = T.sum(stats.kl_divergence(q_Xt1, p_Xt1), axis=1)
            if self.time_varying:
                global_kl = T.sum(
                    stats.kl_divergence(self.A_variational, self.A_prior))
            else:
                global_kl = stats.kl_divergence(self.A_variational,
                                                self.A_prior)
            self.cache[(q_X, q_A)] = (
                T.mean(local_kl, axis=0) + global_kl / T.to_float(num_data),
                {
                    'rmse': rmse,
                    'model-stdev': model_stdev,
                    'local-kl': local_kl,
                    'global-kl': global_kl
                })
    return self.cache[(q_X, q_A)]
def kl_gradients(self, q_X, q_A, _, num_data):
    if self.smooth:
        # Pull the cross-time sufficient statistics out of the smoothed
        # posterior over states and the action distribution.
        ds = self.ds
        ess = q_X.expected_sufficient_statistics()
        yyT = ess[..., :-1, ds:2 * ds, ds:2 * ds]
        xxT = ess[..., :-1, :ds, :ds]
        yxT = ess[..., :-1, ds:2 * ds, :ds]
        aaT, a = stats.Gaussian.unpack(q_A.expected_sufficient_statistics())
        aaT, a = aaT[:, :-1], a[:, :-1]
        x = ess[..., :-1, -1, :ds]
        y = ess[..., :-1, -1, ds:2 * ds]
        xaT = T.outer(x, a)
        yaT = T.outer(y, a)
        xaxaT = T.concatenate([
            T.concatenate([xxT, xaT], -1),
            T.concatenate([T.matrix_transpose(xaT), aaT], -1),
        ], -2)
        batch_size = T.shape(ess)[0]
        num_batches = T.to_float(num_data) / T.to_float(batch_size)
        ess = [
            yyT,
            T.concatenate([yxT, yaT], -1),
            xaxaT,
            T.ones([batch_size, self.horizon - 1])
        ]
    else:
        q_Xt = q_X.__class__([
            q_X.get_parameters('regular')[0][:, :-1],
            q_X.get_parameters('regular')[1][:, :-1],
        ])
        q_At = q_A.__class__([
            q_A.get_parameters('regular')[0][:, :-1],
            q_A.get_parameters('regular')[1][:, :-1],
        ])
        q_Xt1 = q_X.__class__([
            q_X.get_parameters('regular')[0][:, 1:],
            q_X.get_parameters('regular')[1][:, 1:],
        ])
        (XtAt_XtAtT, XtAt), (Xt1_Xt1T, Xt1) = self.get_statistics(
            q_Xt, q_At, q_Xt1)
        batch_size = T.shape(XtAt)[0]
        num_batches = T.to_float(num_data) / T.to_float(batch_size)
        ess = [
            Xt1_Xt1T,
            T.einsum('nha,nhb->nhba', XtAt, Xt1),
            XtAt_XtAtT,
            T.ones([batch_size, self.horizon - 1])
        ]
    if self.time_varying:
        # Keep a separate statistic for every time step.
        ess = [T.sum(s, [0]) for s in ess]
    else:
        # Pool statistics over both batch and time.
        ess = [T.sum(s, [0, 1]) for s in ess]
    return [
        -(a + num_batches * b - c) / T.to_float(num_data)
        for a, b, c in zip(
            self.A_prior.get_parameters('natural'),
            ess,
            self.A_variational.get_parameters('natural'),
        )
    ]
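# The return value above appears to follow the standard SVI-style stochastic
# natural gradient for a conjugate global factor (Hoffman et al., 2013): with
# prior natural parameters eta0, current variational natural parameters lam,
# and expected sufficient statistics ess from a minibatch of B out of N
# trajectories, the ascent direction is eta0 + (N / B) * ess - lam, which the
# code returns negated and scaled by 1 / N. A minimal sketch with a
# hypothetical helper name:
def svi_natural_gradient(eta0, lam, ess, num_data, batch_size):
    return eta0 + (num_data / batch_size) * ess - lam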
                parameter_type='natural').get_parameters('regular')
pi_cmessage = q_pi.expected_sufficient_statistics()
x_tmessage = NIW.pack([
    T.outer(X, X),
    X,
    T.ones([batch_size]),
    T.ones([batch_size]),
])
x_stats = Gaussian.pack([
    T.outer(X, X),
    X,
])
theta_cmessage = q_theta.expected_sufficient_statistics()
num_batches = N / T.to_float(batch_size)
nat_scale = 10.0
parent_z = q_pi.expected_sufficient_statistics()[None]
new_z = T.einsum('iab,jab->ij', x_tmessage, theta_cmessage) + parent_z
q_z = Categorical(new_z - T.logsumexp(new_z, -1)[..., None],
                  parameter_type='natural')
p_z = Categorical(parent_z - T.logsumexp(parent_z, -1),
                  parameter_type='natural')
l_z = T.sum(kl_divergence(q_z, p_z))
z_pmessage = q_z.expected_sufficient_statistics()
pi_stats = T.sum(z_pmessage, 0)
parent_pi = p_pi.get_parameters('natural')
current_pi = q_pi.get_parameters('natural')
pi_gradient = nat_scale / N * (parent_pi + num_batches * pi_stats - current_pi)
data = generate_data(1000)
N = data.shape[0]
yt, yt1 = data[:, :-1], data[:, 1:]
yt, yt1 = yt.reshape([-1, D]), yt1.reshape([-1, D])

transition_net = Tanh(D, 500) >> Tanh(500) >> nn.Gaussian(D)
transition_net.initialize()
rec_net = Tanh(D, 500) >> Tanh(500) >> nn.Gaussian(D)
rec_net.initialize()

Yt = T.placeholder(T.floatx(), [None, D])
Yt1 = T.placeholder(T.floatx(), [None, D])
batch_size = T.shape(Yt)[0]
num_batches = N / T.to_float(batch_size)

Yt_message = Gaussian.pack([
    T.tile(T.eye(D)[None] * noise, [batch_size, 1, 1]),
    T.einsum('ab,ib->ia', T.eye(D) * noise, Yt)
])
Yt1_message = Gaussian.pack([
    T.tile(T.eye(D)[None] * noise, [batch_size, 1, 1]),
    T.einsum('ab,ib->ia', T.eye(D) * noise, Yt1)
])
transition = Gaussian(transition_net(Yt)).expected_value()

max_iter = 1000
tol = 1e-5
def make_variable(dist):
    return dist.__class__(
        T.variable(T.to_float(dist.get_parameters('natural'))),
        parameter_type='natural')
def make_variable(dist):
    return dist.__class__(
        T.variable(T.to_float(dist.get_parameters('natural'))),
        parameter_type='natural')


(X, Y) = generate_data(N, D, seed=3)
cf = LogisticRegression(fit_intercept=False)
cf.fit(X, Y)
coef_ = cf.coef_
score_ = cf.score(X, Y)

q_w = make_variable(
    Gaussian([T.to_float(np.eye(D))[None], T.to_float(np.zeros(D))[None]]))

x, y = T.matrix(), T.vector()
lr = 1e-4
batch_size = T.shape(x)[0]
num_batches = T.to_float(N / batch_size)

with T.initialization('xavier'):
    # stats_net = Relu(D + 1, 20) >> Relu(20) >> GaussianLayer(D)
    stats_net = GaussianLayer(D + 1, D)
net_out = stats_net(T.concat([x, y[..., None]], -1))
stats = T.sum(net_out.get_parameters('natural'), 0)[None]
natural_gradient = (p_w.get_parameters('natural') + num_batches * stats -