def map_fn(data):
    # Flatten all leading dimensions, apply the network to the flattened batch,
    # then restore the leading shape on the output distribution parameters.
    data_shape = T.shape(data)
    leading = data_shape[:-1]
    dim_in = data_shape[-1]
    flattened = T.reshape(data, [-1, dim_in])
    net_out = network(flattened)
    if isinstance(net_out, stats.GaussianScaleDiag):
        scale_diag, mu = net_out.get_parameters('regular')
        dim_out = T.shape(mu)[-1]
        return stats.GaussianScaleDiag([
            T.reshape(scale_diag, T.concatenate([leading, [dim_out]])),
            T.reshape(mu, T.concatenate([leading, [dim_out]])),
        ])
    elif isinstance(net_out, stats.Gaussian):
        sigma, mu = net_out.get_parameters('regular')
        dim_out = T.shape(mu)[-1]
        return stats.Gaussian([
            T.reshape(sigma, T.concatenate([leading, [dim_out, dim_out]])),
            T.reshape(mu, T.concatenate([leading, [dim_out]])),
        ])
    elif isinstance(net_out, stats.Bernoulli):
        params = net_out.get_parameters('natural')
        dim_out = T.shape(params)[-1]
        return stats.Bernoulli(
            T.reshape(params, T.concatenate([leading, [dim_out]])),
            'natural')
    else:
        raise NotImplementedError("Unimplemented distribution")
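
# A minimal numpy sketch of the reshape bookkeeping above: flatten all leading
# dimensions, apply a per-vector function, then restore the leading shape on the
# output. `toy_net` is an illustrative stand-in for `network`, not model code.
import numpy as np

def toy_net(flat):                        # maps [M, dim_in] -> [M, dim_out]
    return flat @ np.random.randn(flat.shape[-1], 4)

toy_data = np.random.randn(8, 10, 3)      # leading shape (8, 10), dim_in = 3
flat = toy_data.reshape(-1, toy_data.shape[-1])            # [80, 3]
out = toy_net(flat)                                        # [80, 4]
out = out.reshape(toy_data.shape[:-1] + (out.shape[-1],))  # [8, 10, 4]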
def initialize_node(node, children):
    # Default initialization for nodes of the graphical model.
    if isinstance(node, Gaussian):
        d = T.shape(node)
        # Identity covariance, random mean.
        return Gaussian([T.eye(d[-1], batch_shape=d[:-1]), T.random_normal(d)])
    elif isinstance(node, IW):
        d = T.shape(node)
        # Inverse-Wishart with scale (d + 1) * I and d + 1 degrees of freedom.
        return IW([(T.to_float(d[-1]) + 1) * T.eye(d[-1], batch_shape=d[:-2]),
                   T.to_float(d[-1]) + 1])
def _sample(self, num_samples):
    # Round-trip through natural parameters to obtain a well-formed (sigma, mu) pair.
    sigma, mu = self.natural_to_regular(
        self.regular_to_natural(self.get_parameters('regular')))
    L = T.cholesky(sigma)
    sample_shape = T.concat([[num_samples], T.shape(mu)], 0)
    noise = T.random_normal(sample_shape)
    # Tile the Cholesky factor along a new leading sample dimension.
    L = T.tile(L[None],
               T.concat([[num_samples], T.ones([T.rank(sigma)], dtype=np.int32)], 0))
    # Reparameterized samples: mu + L @ noise.
    return mu[None] + T.matmul(L, noise[..., None])[..., 0]
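
# A minimal numpy sketch of the reparameterized draw above: with L the Cholesky
# factor of sigma, samples are mu + L @ eps, eps ~ N(0, I). Sizes are illustrative.
import numpy as np

dim, num_samples_ = 3, 5
mu_ = np.random.randn(dim)
A_rand = np.random.randn(dim, dim)
sigma_ = A_rand @ A_rand.T + dim * np.eye(dim)   # a positive-definite covariance
L_chol = np.linalg.cholesky(sigma_)
eps = np.random.randn(num_samples_, dim)
samples = mu_[None] + eps @ L_chol.T             # row i equals mu + L @ eps_i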
def next_state(self, state, action, t):
    A, Q = self.get_dynamics()
    leading_dim = T.shape(state)[:-1]
    state_action = T.concatenate([state, action], -1)
    # x_{t+1} ~ N(A_t [x_t; u_t], Q_t)
    return stats.Gaussian([
        T.tile(Q[t][None], T.concatenate([leading_dim, [1, 1]])),
        T.einsum('ab,nb->na', A[t], state_action)
    ])
def forward(self, q_Xt, q_At):
    Xt, At = q_Xt.expected_value(), q_At.expected_value()
    batch_size = T.shape(Xt)[0]
    XAt = T.concatenate([Xt, At], -1)
    A, Q = self.get_dynamics()
    # Push the expected state-action through the linear dynamics.
    p_Xt1 = stats.Gaussian([
        T.tile(Q[None], [batch_size, 1, 1, 1]),
        T.einsum('nhs,hxs->nhx', XAt, A)
    ])
    return p_Xt1
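
# A minimal numpy sketch of the prediction above for a single time step: the
# next-state distribution is N(A @ [x; u], Q). Names (ds_, da_, A_, Q_) are
# illustrative stand-ins, not part of the model code.
import numpy as np

ds_, da_ = 3, 2
A_ = np.random.randn(ds_, ds_ + da_)          # dynamics matrix
Q_ = np.eye(ds_) * 0.1                        # process noise covariance
x_, u_ = np.random.randn(ds_), np.random.randn(da_)
mean_next = A_ @ np.concatenate([x_, u_])     # predicted mean of x_{t+1}
cov_next = Q_                                 # predicted covariance of x_{t+1}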
def __init__(self, sensor_models, calibration_model, lr=1e-4,
             batch_size=20, log_dir=None, **kwargs):
    self.graph = T.core.Graph()
    self.log_dir = log_dir
    with self.graph.as_default():
        self.calibration_model = calibration_model
        self.board_ids = list(sensor_models.keys())
        self.board_map = {b: i for i, b in enumerate(self.board_ids)}
        self.sensor_map = sensor_models
        self.sensor_models = [
            sensor_models[board_id] for board_id in self.board_ids
        ]
        self.architecture = pickle.dumps(
            [sensor_models, calibration_model])
        self.batch_size = batch_size
        self.lr = lr
        self.learning_rate = T.placeholder(T.floatx(), [])

        # Placeholders: raw sensor readings, environmental features, and board ids.
        self.sensors = T.placeholder(T.floatx(), [None, 3])
        self.env = T.placeholder(T.floatx(), [None, 3])
        self.board = T.placeholder(T.core.int32, [None])
        self.boards = T.transpose(
            T.pack([self.board, T.range(T.shape(self.board)[0])]))
        # Run every board-specific sensor model, then select for each example
        # the output of its own board.
        self.rep = T.gather_nd(
            T.pack([
                sensor_model(self.sensors)
                for sensor_model in self.sensor_models
            ]), self.boards)
        self.rep_ = T.placeholder(T.floatx(),
                                  [None, self.rep.get_shape()[-1]])
        rep_env = T.concat([self.rep, self.env], -1)
        rep_env_ = T.concat([self.rep_, self.env], -1)
        self.y_ = self.calibration_model(rep_env)
        self.y_rep = self.calibration_model(rep_env_)
        self.y = T.placeholder(T.floatx(), [None, 2])

        self.loss = T.mean((self.y - self.y_) ** 2)
        self.mae = T.mean(T.abs(self.y - self.y_))
        T.core.summary.scalar('MSE', self.loss)
        T.core.summary.scalar('MAE', self.mae)
        self.summary = T.core.summary.merge_all()
        self.train_op = T.core.train.AdamOptimizer(
            self.learning_rate).minimize(self.loss)
    self.session = T.interactive_session(graph=self.graph)
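
# A minimal numpy sketch of the per-board gathering above: stack every board
# model's output into [num_boards, batch, rep_dim], then pick, for example i,
# the row produced by its own board. Values below are illustrative only.
import numpy as np

num_boards, batch_, rep_dim = 3, 5, 4
all_reps = np.random.randn(num_boards, batch_, rep_dim)  # one output per board model
board_ids = np.array([0, 2, 1, 0, 2])                    # board id of each example
rep = all_reps[board_ids, np.arange(batch_)]             # shape [batch_, rep_dim]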
def kl_divergence(self, q_X, q_A, num_data):
    # KL between the state posterior and a standard-normal prior of matching shape.
    mu_shape = T.shape(q_X.get_parameters('regular')[1])
    p_X = stats.GaussianScaleDiag([T.ones(mu_shape), T.zeros(mu_shape)])
    return T.mean(T.sum(stats.kl_divergence(q_X, p_X), -1), 0), {}
D = 2
sigma = 0.5
sigma0 = 100
data = generate_data(N, D, K, sigma=sigma, sigma0=sigma0, seed=None)

# Conjugate priors: Dirichlet over mixture weights, NIW over component parameters.
p_pi = Dirichlet(T.constant(10.0 * np.ones([K], dtype=T.floatx())))
p_theta = NIW(
    list(
        map(lambda x: T.constant(np.array(x).astype(T.floatx())),
            [np.eye(D) * sigma, np.zeros(D), 1, D + 1])))
prior = (p_pi, p_theta)

np.random.seed(None)
X = T.placeholder(T.floatx(), [None, D])
batch_size = T.shape(X)[0]

# Variational factors, initialized broadly.
q_pi = make_variable(Dirichlet(np.ones([K], dtype=T.floatx())))
q_theta = make_variable(
    NIW(
        list(
            map(lambda x: np.array(x).astype(T.floatx()), [
                np.tile(np.eye(D)[None] * 100, [K, 1, 1]),
                np.random.multivariate_normal(
                    mean=np.zeros([D]), cov=np.eye(D) * 20, size=[K]),
                np.ones(K),
                np.ones(K) * (D + 1)
            ]))))
sigma, mu = Gaussian(
    q_theta.expected_sufficient_statistics(),
    parameter_type='natural').get_parameters('regular')
alpha = Categorical(q_pi.expected_sufficient_statistics(),
                    parameter_type='natural')
def posterior_dynamics(self, q_X, q_A, data_strength=1.0,
                       max_iter=200, tol=1e-3):
    if self.smooth:
        # MNIW prior over the dynamics (A, Q), tiled over time if not time-varying.
        if self.time_varying:
            prior_dyn = stats.MNIW(
                self.A_variational.get_parameters('natural'), 'natural')
        else:
            natparam = self.A_variational.get_parameters('natural')
            prior_dyn = stats.MNIW([
                T.tile(natparam[0][None], [self.horizon - 1, 1, 1]),
                T.tile(natparam[1][None], [self.horizon - 1, 1, 1]),
                T.tile(natparam[2][None], [self.horizon - 1, 1, 1]),
                T.tile(natparam[3][None], [self.horizon - 1]),
            ], 'natural')
        state_prior = stats.Gaussian([T.eye(self.ds), T.zeros(self.ds)])
        aaT, a = stats.Gaussian.unpack(
            q_A.expected_sufficient_statistics())
        aaT, a = aaT[:, :-1], a[:, :-1]
        ds, da = self.ds, self.da
        initial_dyn_natparam = prior_dyn.get_parameters('natural')
        initial_X_natparam = stats.LDS(
            (self.sufficient_statistics(), state_prior, q_X,
             q_A.expected_value(), self.horizon),
            'internal').get_parameters('natural')

        def em(i, q_dyn_natparam, q_X_natparam, _, curr_elbo):
            q_X_ = stats.LDS(q_X_natparam, 'natural')
            ess = q_X_.expected_sufficient_statistics()
            batch_size = T.shape(ess)[0]
            # Slice the packed LDS sufficient statistics into pairwise moments.
            yyT = ess[..., :-1, ds:2 * ds, ds:2 * ds]
            xxT = ess[..., :-1, :ds, :ds]
            yxT = ess[..., :-1, ds:2 * ds, :ds]
            x = ess[..., :-1, -1, :ds]
            y = ess[..., :-1, -1, ds:2 * ds]
            xaT = T.outer(x, a)
            yaT = T.outer(y, a)
            xaxaT = T.concatenate([
                T.concatenate([xxT, xaT], -1),
                T.concatenate([T.matrix_transpose(xaT), aaT], -1),
            ], -2)
            ess = [
                yyT,
                T.concatenate([yxT, yaT], -1),
                xaxaT,
                T.ones([batch_size, self.horizon - 1])
            ]
            # Conjugate update: scaled data statistics plus the prior natural parameters.
            q_dyn_natparam = [
                T.sum(a, [0]) * data_strength + b
                for a, b in zip(ess, initial_dyn_natparam)
            ]
            q_dyn_ = stats.MNIW(q_dyn_natparam, 'natural')
            q_stats = q_dyn_.expected_sufficient_statistics()
            p_X = stats.LDS((q_stats, state_prior, None,
                             q_A.expected_value(), self.horizon))
            q_X_ = stats.LDS((q_stats, state_prior, q_X,
                              q_A.expected_value(), self.horizon))
            elbo = (T.sum(stats.kl_divergence(q_X_, p_X))
                    + T.sum(stats.kl_divergence(q_dyn_, prior_dyn)))
            return (i + 1, q_dyn_.get_parameters('natural'),
                    q_X_.get_parameters('natural'), curr_elbo, elbo)

        def cond(i, _, __, prev_elbo, curr_elbo):
            with T.core.control_dependencies([T.core.print(curr_elbo)]):
                prev_elbo = T.core.identity(prev_elbo)
            return T.logical_and(
                T.abs(curr_elbo - prev_elbo) > tol, i < max_iter)

        result = T.while_loop(
            cond, em, [
                0, initial_dyn_natparam, initial_X_natparam,
                T.constant(-np.inf), T.constant(0.)
            ],
            back_prop=False)
        pd = stats.MNIW(result[1], 'natural')
        sigma, mu = pd.expected_value()
        q_X = stats.LDS(result[2], 'natural')
        return ((mu, sigma), pd.expected_sufficient_statistics()), (q_X, q_A)
    else:
        # Without smoothing, regress x_{t+1} on (x_t, a_t) directly from the
        # variational marginals.
        q_Xt = q_X.__class__([
            q_X.get_parameters('regular')[0][:, :-1],
            q_X.get_parameters('regular')[1][:, :-1],
        ])
        q_At = q_A.__class__([
            q_A.get_parameters('regular')[0][:, :-1],
            q_A.get_parameters('regular')[1][:, :-1],
        ])
        q_Xt1 = q_X.__class__([
            q_X.get_parameters('regular')[0][:, 1:],
            q_X.get_parameters('regular')[1][:, 1:],
        ])
        (XtAt_XtAtT, XtAt), (Xt1_Xt1T, Xt1) = self.get_statistics(
            q_Xt, q_At, q_Xt1)
        batch_size = T.shape(XtAt)[0]
        ess = [
            Xt1_Xt1T,
            T.einsum('nha,nhb->nhba', XtAt, Xt1),
            XtAt_XtAtT,
            T.ones([batch_size, self.horizon - 1])
        ]
        if self.time_varying:
            posterior = stats.MNIW([
                T.sum(a, [0]) * data_strength + b
                for a, b in zip(
                    ess, self.A_variational.get_parameters('natural'))
            ], 'natural')
        else:
            posterior = stats.MNIW([
                T.sum(a, [0]) * data_strength + b[None]
                for a, b in zip(
                    ess, self.A_variational.get_parameters('natural'))
            ], 'natural')
        Q, A = posterior.expected_value()
        return (A, Q), q_X
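
# A minimal numpy sketch of the conjugate statistics gathered above: for a
# linear-Gaussian regression of y = x_{t+1} on z = [x_t; a_t], the natural
# parameter update adds (sum y y^T, sum y z^T, sum z z^T, count) to the prior.
# Variable names here (ds_, da_, H_) are illustrative, not from the model code.
import numpy as np

ds_, da_, H_ = 3, 2, 10
xs = np.random.randn(H_, ds_)                 # states x_0 .. x_{H-1}
us = np.random.randn(H_, da_)                 # actions a_0 .. a_{H-1}
z_ = np.concatenate([xs[:-1], us[:-1]], -1)   # regressors [x_t; a_t]
y_ = xs[1:]                                   # targets x_{t+1}
stats_update = [
    y_.T @ y_,          # sum_t y_t y_t^T
    y_.T @ z_,          # sum_t y_t z_t^T
    z_.T @ z_,          # sum_t z_t z_t^T
    float(len(y_)),     # number of transitions
]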
def shape(self):
    return T.shape(Stats.X(self.m))
def shape(self):
    return T.shape(self.value)
data = generate_data(1000)
N = data.shape[0]
yt, yt1 = data[:, :-1], data[:, 1:]
yt, yt1 = yt.reshape([-1, D]), yt1.reshape([-1, D])

# Transition and recognition networks, each outputting a Gaussian over D dimensions.
transition_net = Tanh(D, 500) >> Tanh(500) >> nn.Gaussian(D)
transition_net.initialize()
rec_net = Tanh(D, 500) >> Tanh(500) >> nn.Gaussian(D)
rec_net.initialize()

Yt = T.placeholder(T.floatx(), [None, D])
Yt1 = T.placeholder(T.floatx(), [None, D])
batch_size = T.shape(Yt)[0]
num_batches = N / T.to_float(batch_size)

# Gaussian evidence messages built from the observations with isotropic scale `noise`.
Yt_message = Gaussian.pack([
    T.tile(T.eye(D)[None] * noise, [batch_size, 1, 1]),
    T.einsum('ab,ib->ia', T.eye(D) * noise, Yt)
])
Yt1_message = Gaussian.pack([
    T.tile(T.eye(D)[None] * noise, [batch_size, 1, 1]),
    T.einsum('ab,ib->ia', T.eye(D) * noise, Yt1)
])
transition = Gaussian(transition_net(Yt)).expected_value()
max_iter = 1000
def initialize(self):
    self.graph = T.core.Graph()
    with self.graph.as_default():
        # Instantiate the prior over latent dynamics and the cost model.
        prior_params = self.prior_params.copy()
        prior_type = prior_params.pop('prior_type')
        self.prior = PRIOR_MAP[prior_type](self.ds, self.da, self.horizon,
                                           **prior_params)
        cost_params = self.cost_params.copy()
        cost_type = cost_params.pop('cost_type')
        self.cost = COST_MAP[cost_type](self.ds, self.da, **cost_params)

        # Placeholders: observations, controls, costs, states, and actions.
        self.O = T.placeholder(T.floatx(), [None, None, self.do])
        self.U = T.placeholder(T.floatx(), [None, None, self.du])
        self.C = T.placeholder(T.floatx(), [None, None])
        self.S = T.placeholder(T.floatx(), [None, None, self.ds])
        self.A = T.placeholder(T.floatx(), [None, None, self.da])
        self.t = T.placeholder(T.int32, [])
        self.state, self.action = (
            T.placeholder(T.floatx(), [None, self.ds]),
            T.placeholder(T.floatx(), [None, self.da]))
        if self.prior.has_dynamics():
            self.next_state = self.prior.next_state(self.state, self.action,
                                                    self.t)
            self.prior_dynamics = self.prior.get_dynamics()

        self.num_data = T.scalar()
        self.beta = T.placeholder(T.floatx(), [])
        self.learning_rate = T.placeholder(T.floatx(), [])
        self.model_learning_rate = T.placeholder(T.floatx(), [])

        # Encode observations and controls into potentials over latent states/actions.
        self.S_potentials = util.map_network(self.state_encoder)(self.O)
        self.A_potentials = util.map_network(self.action_encoder)(self.U)

        if self.prior.is_dynamics_prior():
            self.data_strength = T.placeholder(T.floatx(), [])
            self.max_iter = T.placeholder(T.int32, [])
            posterior_dynamics, (encodings, actions) = \
                self.prior.posterior_dynamics(self.S_potentials,
                                              self.A_potentials,
                                              data_strength=self.data_strength,
                                              max_iter=self.max_iter)
            self.posterior_dynamics_ = posterior_dynamics, (
                encodings.expected_value(), actions.expected_value())

        if self.prior.is_filtering_prior():
            self.prior_dynamics_stats = self.prior.sufficient_statistics()
            self.dynamics_stats = (
                T.placeholder(T.floatx(), [None, self.ds, self.ds]),
                T.placeholder(T.floatx(), [None, self.ds, self.ds + self.da]),
                T.placeholder(T.floatx(), [None, self.ds + self.da, self.ds + self.da]),
                T.placeholder(T.floatx(), [None]),
            )
            S_natparam = self.S_potentials.get_parameters('natural')
            num_steps = T.shape(S_natparam)[1]
            # Pad the potentials out to the full planning horizon before filtering.
            self.padded_S = stats.Gaussian(T.core.pad(
                self.S_potentials.get_parameters('natural'),
                [[0, 0], [0, self.horizon - num_steps], [0, 0], [0, 0]]
            ), 'natural')
            self.padded_A = stats.GaussianScaleDiag([
                T.core.pad(self.A_potentials.get_parameters('regular')[0],
                           [[0, 0], [0, self.horizon - num_steps], [0, 0]]),
                T.core.pad(self.A_potentials.get_parameters('regular')[1],
                           [[0, 0], [0, self.horizon - num_steps], [0, 0]])
            ], 'regular')
            self.q_S_padded, self.q_A_padded = self.prior.encode(
                self.padded_S, self.padded_A,
                dynamics_stats=self.dynamics_stats
            )
            self.q_S_filter = self.q_S_padded.filter(max_steps=num_steps)
            self.q_A_filter = self.q_A_padded.__class__(
                self.q_A_padded.get_parameters('natural')[:, :num_steps],
                'natural')
            self.e_q_S_filter = self.q_S_filter.expected_value()
            self.e_q_A_filter = self.q_A_filter.expected_value()

        (self.q_S, self.q_A), self.prior_kl, self.kl_grads, self.info = \
            self.prior.posterior_kl_grads(
                self.S_potentials, self.A_potentials, self.num_data
            )

        self.q_S_sample = self.q_S.sample()[0]
        self.q_A_sample = self.q_A.sample()[0]
        self.q_O = util.map_network(self.state_decoder)(self.q_S_sample)
        self.q_U = util.map_network(self.action_decoder)(self.q_A_sample)
        self.q_O_sample = self.q_O.sample()[0]
        self.q_U_sample = self.q_U.sample()[0]
        self.q_O_ = util.map_network(self.state_decoder)(self.S)
        self.q_U_ = util.map_network(self.action_decoder)(self.A)
        self.q_O__sample = self.q_O_.sample()[0]
        self.q_U__sample = self.q_U_.sample()[0]

        self.cost_likelihood = self.cost.log_likelihood(self.q_S_sample, self.C)
        if self.cost.is_cost_function():
            self.evaluated_cost = self.cost.evaluate(self.S)
        self.log_likelihood = T.sum(self.q_O.log_likelihood(self.O), axis=1)

        # ELBO used for evaluation; the training objective scales the KL terms by beta.
        self.elbo = T.mean(self.log_likelihood + self.cost_likelihood
                           - self.prior_kl)
        train_elbo = T.mean(self.log_likelihood
                            + self.beta * (self.cost_likelihood - self.prior_kl))

        T.core.summary.scalar("encoder-stdev",
                              T.mean(self.S_potentials.get_parameters('regular')[0]))
        T.core.summary.scalar("log-likelihood", T.mean(self.log_likelihood))
        T.core.summary.scalar("cost-likelihood", T.mean(self.cost_likelihood))
        T.core.summary.scalar("prior-kl", T.mean(self.prior_kl))
        T.core.summary.scalar("beta", self.beta)
        T.core.summary.scalar("elbo", self.elbo)
        T.core.summary.scalar("beta-elbo", train_elbo)
        for k, v in self.info.items():
            T.core.summary.scalar(k, T.mean(v))
        self.summary = T.core.summary.merge_all()

        neural_params = (
            self.state_encoder.get_parameters()
            + self.state_decoder.get_parameters()
            + self.action_encoder.get_parameters()
            + self.action_decoder.get_parameters()
        )
        cost_params = self.cost.get_parameters()
        if len(neural_params) > 0:
            optimizer = T.core.train.AdamOptimizer(self.learning_rate)
            gradients, variables = zip(*optimizer.compute_gradients(
                -train_elbo, var_list=neural_params))
            # Clip gradients to a global norm of 5 before applying them.
            gradients, _ = T.core.clip_by_global_norm(gradients, 5.0)
            self.neural_op = optimizer.apply_gradients(zip(gradients, variables))
        else:
            self.neural_op = T.core.no_op()
        if len(cost_params) > 0:
            self.cost_op = T.core.train.AdamOptimizer(
                self.learning_rate).minimize(-self.elbo, var_list=cost_params)
        else:
            self.cost_op = T.core.no_op()
        if len(self.kl_grads) > 0:
            if self.prior.is_dynamics_prior():
                # opt = lambda x: T.core.train.MomentumOptimizer(x, 0.5)
                opt = lambda x: T.core.train.GradientDescentOptimizer(x)
            else:
                opt = T.core.train.AdamOptimizer
            self.dynamics_op = opt(self.model_learning_rate).apply_gradients([
                (b, a) for a, b in self.kl_grads
            ])
        else:
            self.dynamics_op = T.core.no_op()
        self.train_op = T.core.group(self.neural_op, self.dynamics_op,
                                     self.cost_op)
    self.session = T.interactive_session(graph=self.graph,
                                         allow_soft_placement=True,
                                         log_device_placement=False)
def activate(self, X):
    shape = T.shape(X)
    # Pack the NIW-conjugate sufficient statistics (X X^T, X, 1, 1) of each input.
    return stats.NIW.pack(
        [T.outer(X, X), X, T.ones(shape[:-1]), T.ones(shape[:-1])])
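
# A minimal numpy sketch of the statistics packed above for a single vector x:
# the NIW-conjugate sufficient statistics of a Gaussian observation are
# (x x^T, x, 1, 1). Values below are illustrative only.
import numpy as np

x_vec = np.random.randn(3)
suff_stats = (np.outer(x_vec, x_vec), x_vec, 1.0, 1.0)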
def kl_gradients(self, q_X, q_A, _, num_data):
    if self.smooth:
        ds = self.ds
        ess = q_X.expected_sufficient_statistics()
        # Pairwise moments of the smoothed state posterior.
        yyT = ess[..., :-1, ds:2 * ds, ds:2 * ds]
        xxT = ess[..., :-1, :ds, :ds]
        yxT = ess[..., :-1, ds:2 * ds, :ds]
        aaT, a = stats.Gaussian.unpack(
            q_A.expected_sufficient_statistics())
        aaT, a = aaT[:, :-1], a[:, :-1]
        x = ess[..., :-1, -1, :ds]
        y = ess[..., :-1, -1, ds:2 * ds]
        xaT = T.outer(x, a)
        yaT = T.outer(y, a)
        xaxaT = T.concatenate([
            T.concatenate([xxT, xaT], -1),
            T.concatenate([T.matrix_transpose(xaT), aaT], -1),
        ], -2)
        batch_size = T.shape(ess)[0]
        num_batches = T.to_float(num_data) / T.to_float(batch_size)
        ess = [
            yyT,
            T.concatenate([yxT, yaT], -1),
            xaxaT,
            T.ones([batch_size, self.horizon - 1])
        ]
    else:
        q_Xt = q_X.__class__([
            q_X.get_parameters('regular')[0][:, :-1],
            q_X.get_parameters('regular')[1][:, :-1],
        ])
        q_At = q_A.__class__([
            q_A.get_parameters('regular')[0][:, :-1],
            q_A.get_parameters('regular')[1][:, :-1],
        ])
        q_Xt1 = q_X.__class__([
            q_X.get_parameters('regular')[0][:, 1:],
            q_X.get_parameters('regular')[1][:, 1:],
        ])
        (XtAt_XtAtT, XtAt), (Xt1_Xt1T, Xt1) = self.get_statistics(
            q_Xt, q_At, q_Xt1)
        batch_size = T.shape(XtAt)[0]
        num_batches = T.to_float(num_data) / T.to_float(batch_size)
        ess = [
            Xt1_Xt1T,
            T.einsum('nha,nhb->nhba', XtAt, Xt1),
            XtAt_XtAtT,
            T.ones([batch_size, self.horizon - 1])
        ]
    if self.time_varying:
        ess = [
            T.sum(ess[0], [0]),
            T.sum(ess[1], [0]),
            T.sum(ess[2], [0]),
            T.sum(ess[3], [0]),
        ]
    else:
        ess = [
            T.sum(ess[0], [0, 1]),
            T.sum(ess[1], [0, 1]),
            T.sum(ess[2], [0, 1]),
            T.sum(ess[3], [0, 1]),
        ]
    # Natural gradient of the KL term: -(prior + scaled statistics - variational) / N.
    return [
        -(a + num_batches * b - c) / T.to_float(num_data)
        for a, b, c in zip(
            self.A_prior.get_parameters('natural'),
            ess,
            self.A_variational.get_parameters('natural'),
        )
    ]
def kl_divergence(self, q_X, q_A, num_data):
    # No prior KL term for this model; return zeros of the right batch shape.
    batch_size = T.shape(q_X.expected_value())[0]
    return T.zeros(batch_size), {}
def shape(self):
    return T.shape(self.get_parameters('natural')[Stats.LogX])
(X, Y) = generate_data(N, D, seed=3)

# Baseline: maximum-likelihood logistic regression for reference.
cf = LogisticRegression(fit_intercept=False)
cf.fit(X, Y)
coef_ = cf.coef_
score_ = cf.score(X, Y)

# Variational posterior over the weights, initialized at a standard normal.
q_w = make_variable(
    Gaussian([T.to_float(np.eye(D))[None],
              T.to_float(np.zeros(D))[None]]))
x, y = T.matrix(), T.vector()
lr = 1e-4
batch_size = T.shape(x)[0]
num_batches = T.to_float(N / batch_size)
with T.initialization('xavier'):
    # stats_net = Relu(D + 1, 20) >> Relu(20) >> GaussianLayer(D)
    stats_net = GaussianLayer(D + 1, D)
net_out = stats_net(T.concat([x, y[..., None]], -1))
stats = T.sum(net_out.get_parameters('natural'), 0)[None]
# Stochastic natural-gradient step on q_w (p_w is the prior, defined elsewhere).
natural_gradient = (p_w.get_parameters('natural') + num_batches * stats
                    - q_w.get_parameters('natural')) / N
next_w = Gaussian(q_w.get_parameters('natural') + lr * natural_gradient,
                  parameter_type='natural')
l_w = kl_divergence(q_w, p_w)[0]
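
# A minimal numpy sketch of the stochastic natural-gradient step above for a
# conjugate exponential-family posterior: scale the minibatch sufficient
# statistics up to the full dataset, add the prior, subtract the current
# variational natural parameters, and take a small step. Values are illustrative.
import numpy as np

N_, minibatch_, lr_ = 1000.0, 50.0, 1e-4
num_batches_ = N_ / minibatch_
prior_nat = np.zeros(4)                 # stand-in for p_w.get_parameters('natural')
q_nat = np.random.randn(4)              # stand-in for q_w.get_parameters('natural')
batch_stats = np.random.randn(4)        # summed statistics from the minibatch
nat_grad = (prior_nat + num_batches_ * batch_stats - q_nat) / N_
q_nat_next = q_nat + lr_ * nat_grad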