def __call__(
    self,
    states,
    actions,
    next_states,
    initial_omega=8,
    training_set_size=4000,
    actions_one_hot=None,
    sess=None,
    summary_writer=None,
):
    """
    Build the Gaussian transition-model graph and return its likelihoods.

    Two one-hidden-layer networks (10 units each) produce the mean and the
    variance of a diagonal Gaussian over the next state. The network input is
    either the ``X`` placeholder or, built in-graph, the states stacked once
    per action with the scaled action (``omega * actions``) appended and
    standardised with the ``Xmean``/``Xstd`` placeholders; ``train_or_test``
    chooses between the two through ``U.switch``.

    Besides the return value the call stores on ``self``: the placeholders,
    ``omega``, ``loss``/``valid_loss``, ``fitting_vars``, ``minimize_op``,
    ``log_prob``, ``prob``, ``means``, ``variances``, ``train_or_test`` and
    the two scalar summaries.

    :param states: Nxm matrix
    :param actions: Nx n_actions matrix of all possible actions; only
        columns 0 and 1 are read, so exactly two actions are assumed
    :param next_states: Nxm matrix containing the next states
    :param initial_omega: value of the initial omega
    :param training_set_size: stored on the instance; not used in this method
    :param actions_one_hot: multiplied element-wise with the per-action
        means/variances to keep the entry of the taken action.
        NOTE(review): the ``None`` default is handed to ``tf.multiply``
        unchanged - callers presumably always pass a tensor; confirm
    :param sess: stored on the instance; not used in this method
    :param summary_writer: stored on the instance; not used in this method
    :return: tuple ``(log_prob, prob)``, each with one column per action
    """
    self.sess = sess
    self.training_set_size = training_set_size
    self.summary_writer = summary_writer
    train_or_test = U.get_placeholder("train_or_test", tf.bool, ())

    # Normalisation statistics, supplied through placeholders.
    self.Xmean_ph = U.get_placeholder(
        name="Xmean", dtype=self.dtype, shape=(1, self.x_dim)
    )
    self.Ymean_ph = U.get_placeholder(
        name="Ymean", dtype=self.dtype, shape=(1, self.state_dim)
    )
    self.Xstd_ph = U.get_placeholder(
        name="Xstd", dtype=self.dtype, shape=(1, self.x_dim)
    )
    self.Ystd_ph = U.get_placeholder(
        name="Ystd", dtype=self.dtype, shape=(1, self.state_dim)
    )

    # Direct data inputs. They are not standardised inside the graph (the
    # original carried a disabled "- mean) / std" remark here), so they are
    # presumably fed already normalised - TODO confirm against the caller.
    self.X = U.get_placeholder(name="X", dtype=self.dtype, shape=(None, self.x_dim))
    self.Y = U.get_placeholder(
        name="Y", dtype=self.dtype, shape=(None, self.state_dim)
    )

    def gaussian_variable(var_name, var_shape):
        # All fitted parameters share the same N(0, 0.001) initialiser.
        return tf.get_variable(
            var_name,
            var_shape,
            initializer=tf.random_normal_initializer(0, 0.001, dtype=self.dtype),
            dtype=self.dtype,
        )

    with tf.variable_scope(self.name):
        # Scalar that scales the raw actions into forces.
        self.omega = tf.get_variable(
            dtype=self.dtype,
            name="omega",
            shape=(),
            initializer=tf.initializers.constant(initial_omega),
        )

        # Stack the force of action 0 on top of the force of action 1 so that
        # rows [0, N) belong to action 0 and rows [N, 2N) to action 1.
        scaled_actions = self.omega * actions
        stacked_forces = tf.concat(
            [tf.reshape(scaled_actions[:, k], (-1, 1)) for k in (0, 1)],
            axis=0,
        )
        batch_size = tf.shape(states)[0]

        # Duplicate the states once per action, append the force column and
        # standardise with the fed statistics.
        x_full = tf.concat([states, states], axis=0)
        x_full = tf.concat([x_full, stacked_forces], axis=1)
        x_full = (x_full - self.Xmean_ph) / self.Xstd_ph
        next_states_full = tf.concat([next_states, next_states], axis=0)
        next_states_full = (next_states_full - self.Ymean_ph) / self.Ystd_ph

        # Mean network: tanh hidden layer followed by a linear read-out with
        # one output neuron per state dimension.
        hidden_layer_size = 10
        b_hidden = gaussian_variable("b", [hidden_layer_size])
        w_hidden = gaussian_variable("W", [self.x_dim, hidden_layer_size])
        net_input = U.switch(train_or_test, self.X, x_full)
        pre_activation = tf.matmul(net_input, w_hidden)
        hidden_mean = tf.tanh(pre_activation + b_hidden)
        b_mean = gaussian_variable("b_out", [self.state_dim])
        w_mean = gaussian_variable("W_out", [hidden_layer_size, self.state_dim])
        means = tf.matmul(hidden_mean, w_mean) + b_mean

        # Variance network: sigmoid hidden layer, exp output keeps it positive.
        # (A disabled variant in the original replaced the last input column
        # by its absolute value before this network.)
        hidden_var = 10
        b_var_hidden = gaussian_variable("b_var", [hidden_var])
        w_var_hidden = gaussian_variable("W_var", [self.x_dim, hidden_var])
        hidden_variance = tf.nn.sigmoid(
            tf.matmul(net_input, w_var_hidden) + b_var_hidden
        )
        w_var_out = gaussian_variable("W_out_var", [hidden_var, self.state_dim])
        b_var_out = gaussian_variable("b_out_var", [self.state_dim])
        var = tf.exp(tf.matmul(hidden_variance, w_var_out) + b_var_out)
        std = tf.sqrt(var)

        pdf = Normal(means, std)
        target = U.switch(train_or_test, self.Y, next_states_full)
        # Independent dimensions: sum log-densities / multiply densities.
        log_prob = tf.reduce_sum(pdf.log_prob(target), axis=1, keepdims=True)
        prob = tf.reduce_prod(pdf.prob(target), axis=1, keepdims=True)

        # The loss is the negative log likelihood; the validation loss is the
        # same expression built as a separate op.
        self.loss = -tf.reduce_mean(log_prob)
        self.valid_loss = -tf.reduce_mean(log_prob)

        self.fitting_vars = [
            b_hidden,
            w_hidden,
            b_mean,
            w_mean,
            b_var_hidden,
            w_var_hidden,
            w_var_out,
            b_var_out,
        ]
        # Register the fitted parameters in the "fitting" collection.
        for fitted in self.fitting_vars:
            tf.add_to_collection("fitting", fitted)

        # omega is deliberately not in var_list: only the networks are fitted.
        opt = tf.train.AdamOptimizer()
        self.minimize_op = opt.minimize(self.loss, var_list=self.fitting_vars)

        # Un-stack the two action halves into two columns.
        log_prob_first = log_prob[0:batch_size, :]
        log_prob_second = log_prob[batch_size:, :]
        prob_first = prob[0:batch_size, :]
        prob_second = prob[batch_size:, :]
        self.log_prob = tf.concat([log_prob_first, log_prob_second], axis=1)
        self.prob = tf.concat([prob_first, prob_second], axis=1)

        def taken_action_column(stacked, dim_index):
            # Put the two action halves of one state dimension side by side
            # and keep the entry selected by actions_one_hot.
            first = tf.reshape(stacked[0:batch_size, dim_index], (-1, 1))
            second = tf.reshape(
                stacked[batch_size : 2 * batch_size, dim_index], (-1, 1)
            )
            per_action = tf.concat([first, second], axis=1)
            return tf.reduce_sum(
                tf.multiply(per_action, actions_one_hot), axis=1, keepdims=True
            )

        mean_columns = []
        variance_columns = []
        for dim_index in range(self.state_dim):
            mean_columns.append(taken_action_column(means, dim_index))
            variance_columns.append(taken_action_column(var, dim_index))
        self.means = tf.concat(mean_columns, axis=1)
        self.variances = tf.concat(variance_columns, axis=1)

        self.train_or_test = train_or_test
        self.loss_summary = tf.summary.scalar("Loss", self.loss)
        self.valid_loss_summary = tf.summary.scalar("ValidLoss", self.valid_loss)
    return self.log_prob, self.prob
def interpolate_gaussian(coords, inputs, dim, wrap=False, kernel_size=None,
                         kernel_step=None, stddev=2.0):
    """
    interpolate_gaussian - samples with coords from inputs, interpolating the
    results via a differentiable gaussian kernel.

    Every coordinate is expanded into a grid of neighbouring sampling
    positions (a ``kernel_size`` window, or the whole input when no kernel
    size is given), the input is sampled at each of them and the samples are
    averaged with normalised gaussian weights centred on the coordinate.

    :param coords shape: (N, dim, width, height, ...)
    :param inputs shape: (N, width, height, .. n_chan)
    :param dim - dimensionality of the data, e.g. 2 if inputs is a batch of
                 images
    :param wrap - whether to wrap, or otherwise clip during the interpolation.
                  Only wrapping is implemented, so the default (False) raises
    :param kernel_size - extent of the sampling window per dimension; when
                 falsy the window spans the whole input
    :param kernel_step - spacing between window taps; forced to 1 when either
                 kernel_step or kernel_size is falsy
    :param stddev - scale of the gaussian weighting
    :returns - the sampled result
    :shape (N, width, height, ..., n_chan), where width, height, ... come
           from the coords shape
    :raises NotImplementedError: when ``wrap`` is falsy or the Keras backend
           is not tensorflow
    """
    if not wrap:
        raise NotImplementedError(
            "Clipping is not supported for the gaussian kernel yet")
    if K.backend() != "tensorflow":
        raise NotImplementedError(
            "Theano backend is currently not supported for the gaussian kernel")

    inputs_shape = K.shape(inputs)
    inputs_shape_list = [inputs_shape[i] for i in range(dim + 2)]
    coords_shape = K.shape(coords)
    coords_shape_list = [coords_shape[i] for i in range(dim + 2)]
    inputs_dims = inputs_shape_list[1:-1]
    maxes = K.cast(inputs_shape[1:-1] - 1, "float32")
    coords_float = upscale(coords, maxes, dim)

    # Imported lazily so the module itself does not require tensorflow.
    import tensorflow as tf
    from tensorflow.contrib.distributions import Normal

    if not kernel_step or not kernel_size:
        kernel_step = 1

    # Number of taps tf.range(0, kernel_size, kernel_step) produces
    # (ceil division); loop-invariant, so computed once.
    if kernel_size:
        kernel_taps = kernel_size // kernel_step + (
            1 if kernel_size % kernel_step != 0 else 0)
    else:
        kernel_taps = None

    # tile the float coords, extending them for the application of the
    # gaussian aggregation later
    n_coord_axes = len(coords_shape_list)
    extended_coords = tf.reshape(coords_float, coords_shape_list + [1] * dim)
    if kernel_size:
        tile_multiples = [1] * n_coord_axes + [kernel_taps] * dim
    else:
        tile_multiples = [1] * n_coord_axes + inputs_dims
    extended_coords = tf.tile(extended_coords, tile_multiples)

    # center a gaussian at each of the unstandardized transformed coordinates
    coord_gaussians = Normal(loc=extended_coords, scale=stddev)

    # shape: (N, dim, width, height, ..., img_width, img_height, ...)
    for i in range(dim):
        # create ranges for each of the dimensions to "spread" the coords
        # across the image
        if kernel_size:
            m = kernel_taps
            limit = kernel_size
        else:
            m = inputs_dims[i]
            limit = inputs_dims[i]
        range_offset = tf.cast(
            tf.range(start=0, limit=limit, delta=kernel_step), "float32")
        # Cast before subtracting: `limit` is an int32 shape tensor when no
        # kernel size is given and cannot be combined with a float directly.
        range_offset -= (tf.cast(limit, "float32") - 1.0) / 2.0

        # reshape so that the offset is broadcast in all dimensions but the
        # one for the current dimension
        broadcast_shape = [1] * n_coord_axes + i * [1] + \
            [m] + (dim - i - 1) * [1]
        # shape: (1, 1, 1, 1, ..., img_width, img_height, ...)
        range_offset = tf.reshape(range_offset, broadcast_shape)

        # Build a `dim`-channel offset along axis 1 that is zero everywhere
        # except in channel i: i zero blocks before and dim-1-i after. (The
        # former `zero_pads[i + 1:]` dropped one block whenever i < dim - 1,
        # which let the offset leak into the other coordinate channels.)
        zero_pads = [tf.zeros_like(range_offset) for _ in range(dim - 1)]
        range_offset = tf.concat(
            zero_pads[:i] + [range_offset] + zero_pads[i:], axis=1)
        extended_coords += range_offset

    # now round and then sample
    sampling_coords = tf.floor(extended_coords)
    # double the dim as those coords are extended
    samples = sample(inputs, sampling_coords, dim=dim * 2, wrapped=True)

    # since the gaussians are isotropic, I have to reduce a product along the
    # dim-dimension first
    # TODO: this needs to be the meshgrid with image size, and not the scaled
    # up coords
    coord_gaussian_pdfs = coord_gaussians.prob(extended_coords)
    coord_gaussian_pdfs = tf.reduce_prod(coord_gaussian_pdfs, axis=1)
    # expand one broadcastable dimension for the image channels
    coord_gaussian_pdfs = tf.expand_dims(coord_gaussian_pdfs, -1)
    samples = samples * coord_gaussian_pdfs

    # normalize the samples so that the weighting does not change the pixel
    # intensities
    reduction_indices = list(range(dim + 1, 2 * dim + 1))
    norm_coeff = tf.reduce_sum(coord_gaussian_pdfs, keep_dims=True,
                               reduction_indices=reduction_indices)
    samples /= norm_coeff

    # reduce_sum along the img_width, img_height, ... etc. axes
    samples = tf.reduce_sum(samples, reduction_indices=reduction_indices)
    return samples
class DetDropoutFC(Layer):
    """X->Dropout->Linear->LayerNorm->ReLU->mean

    Fully connected layer that propagates a (mean, variance) pair through
    dropout, a linear map, an optional per-row normalisation and a ReLU
    analytically instead of sampling a dropout mask.
    """

    def __init__(self, keep_prob, input_dim, output_dim, placeholders,
                 sparse_inputs=False, norm=True, **kwargs):
        """Create the layer variables.

        :param keep_prob: dropout keep probability used in the moment formulas
        :param input_dim: number of input features
        :param output_dim: number of output features
        :param placeholders: accepted for interface compatibility; not read
        :param sparse_inputs: forwarded to ``dot`` as its ``sparse`` flag
        :param norm: when true, create ``offset``/``scale`` variables and
            normalise the linear output in ``_call``
        :param kwargs: forwarded to the ``Layer`` constructor
        """
        # TODO sparse inputs
        super(DetDropoutFC, self).__init__(**kwargs)

        self.sparse_inputs = sparse_inputs
        self.norm = norm
        self.keep_prob = keep_prob
        # Standard normal used for the ReLU moment computation.
        self.normal = Normal(0.0, 1.0)
        # (mean, variance) pairs recorded after every stage of every call.
        self.log_values = []

        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = glorot([input_dim, output_dim],
                                          name='weights')
            if norm:
                self.vars['offset'] = zeros([1, output_dim], name='offset')
                self.vars['scale'] = ones([1, output_dim], name='scale')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        """Propagate mean and variance through the layer.

        :param inputs: either a ``(mean, variance)`` tuple or a single tensor
            that is treated as a deterministic input
        :return: ``(mu, var)`` - mean and variance after the ReLU
        """
        # Dropout
        p = self.keep_prob
        if isinstance(inputs, tuple):
            mu, var = inputs
            mu2 = tf.square(mu)
            var = (var + mu2) / p - mu2
        else:
            # Deterministic input: all variance comes from the dropout mask.
            mu = inputs
            var = (1 - p) / p * tf.square(inputs)
        self.log_values.append((mu, var))

        # Linear
        weights = self.vars['weights']
        mu = dot(mu, weights, sparse=self.sparse_inputs)
        var = dot(var, tf.square(weights),
                  sparse=self.sparse_inputs) * 1.2  # TODO hack
        self.log_values.append((mu, var))

        # Norm
        if self.norm:
            norm_eps = 1e-10
            mean, variance = tf.nn.moments(mu, axes=[1], keep_dims=True)
            mu = tf.nn.batch_normalization(mu, mean, variance,
                                           self.vars['offset'],
                                           self.vars['scale'], norm_eps)
            # The mean was scaled by scale / sqrt(variance + eps), so the
            # variance scales by its square. Using the same eps keeps both
            # moments consistent and avoids dividing by an exact zero.
            var = var * (tf.square(self.vars['scale']) / (variance + norm_eps))
        self.log_values.append((mu, var))

        # ReLU: moments of max(0, x) for x ~ N(mu, var)
        sigma = tf.sqrt(var)
        alpha = -mu / sigma
        phi = self.normal.prob(alpha)
        Phi = self.normal.cdf(alpha)            # P(x <= 0)
        Z = self.normal.cdf(-alpha) + 1e-10     # P(x > 0), kept away from 0
        phiZ = phi / Z
        m = mu + sigma * phiZ                   # mean of x given x > 0
        mu = Z * m
        # var = Z * Phi * tf.square(m)  # TODO approximation
        # Variance of x given x > 0, clamped to stay positive.
        var = tf.nn.relu(var * (1 + alpha * phiZ - tf.square(phiZ))) + 1e-10
        # Var[max(0, x)] = Z * var_trunc + Z * (1 - Z) * m^2. The second term
        # needs the truncated mean m, not the already rescaled output mean
        # mu = Z * m, which would shrink it by an extra factor Z^2.
        var = Z * var + Z * Phi * tf.square(m)
        self.log_values.append((mu, var))
        return mu, var
def __init__(self, is_training, X, y):
    """Build the Bayesian LSTM regression graph.

    A stack of ``BayesianLSTM`` cells is unrolled over ``X``; the output of
    time step ``seq_length - 1`` is mapped to a scalar through ``w`` and
    ``b`` and scored against ``y`` under a unit-scale Normal. In training
    mode the KL terms collected under ``'KL_layers'`` are added and a
    gradient-descent training op with global-norm clipping is created.

    The method reads several names it does not define (``n_batch_train``,
    ``n_batch_test``, ``n_feature``, ``n_layer``, ``layers``,
    ``inference_mode``, ``batch_size``, ``seq_length``, ``bias``,
    ``max_grad_norm``) - presumably module-level configuration.

    :param is_training: selects the training batch count and whether the KL
        term and the optimiser are built
    :param X: input tensor handed to ``tf.nn.dynamic_rnn`` (batch-major)
    :param y: targets; flattened to one dimension before scoring
    """
    self._is_training = is_training
    self._rnn_params = None
    self._cell = None
    # NOTE(review): these attributes are set but the graph below uses the
    # outer `batch_size` / `seq_length` names - confirm they agree.
    self.batch_size = 200
    self.seq_length = 5
    self.X = X

    n_batch = n_batch_train if is_training else n_batch_test

    # Construct prior
    prior = ScaleMixturePrior()

    # Stack the LSTM cells; each layer consumes the previous layer's units.
    stacked_cells = []
    fan_in = n_feature
    for layer_idx in range(n_layer):
        fan_out = layers[layer_idx]
        cell = BayesianLSTM(fan_in, fan_out, prior, is_training,
                            inference_mode=inference_mode,
                            forget_bias=0.0,
                            name='bbb_lstm_{}'.format(layer_idx),
                            bias=True)
        stacked_cells.append(cell)
        fan_in = fan_out

    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(stacked_cells)
    # The zero state is only stored; dynamic_rnn below builds its own
    # initial state from `dtype`.
    self._initial_state = multi_rnn_cell.zero_state(batch_size, tf.float32)

    # 'outputs' is batch-major: [batch, time, units].
    # 'state' is an N-tuple where N is the number of LSTMCells containing a
    # tf.contrib.rnn.LSTMStateTuple for each cell
    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=X,
                                       time_major=False,
                                       dtype=tf.float32)

    # Output layer: weight then bias, noisy (Bayesian) or plain zero-init.
    # The weight shape hard-codes 50 inputs - assumes the last LSTM layer has
    # 50 units; TODO confirm against `layers`.
    rho_min_init, rho_max_init = prior.normal_init()
    if bias:
        w = get_noisy_weights((50, 1), 'w', prior, is_training,
                              rho_min_init, rho_max_init)
        b = get_noisy_weights((1), 'b', prior, is_training,
                              rho_min_init, rho_max_init)
    else:
        w = tf.get_variable('w', (50, 1), tf.float32,
                            tf.constant_initializer(0.))
        b = tf.get_variable('b', (1), tf.float32,
                            tf.constant_initializer(0.))

    last_step = outputs[:, seq_length-1, :]
    output = tf.reshape(tf.matmul(last_step, w) + b, [-1])
    y = tf.reshape(y, [-1])
    y_pred = Normal(output, 1.)
    print("Finish predicting y")

    # Negative log likelihood; the 1e-8 keeps the log finite.
    loss = - tf.log(y_pred.prob(y) + 1e-8)

    # Update the cost
    self._cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    # 1. For testing, no kl term, just loss
    self._kl_div = 0.
    if not is_training:
        return

    # 2. For training, compute kl scaled by 1./n_batch
    # Add up all prior's kl values
    kl_div = tf.add_n(tf.get_collection('KL_layers'), 'kl_divergence')

    # Compute ELBO
    kl_const = 1. / n_batch
    self._kl_div = kl_div * kl_const
    self._total_loss = self._cost + self._kl_div

    # Optimization: the learning rate is a non-trainable variable that is
    # changed by running `_lr_update` with `_new_lr` fed.
    self._lr = tf.Variable(0.0, trainable=False)

    # Update all weights with globally clipped gradients
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self._total_loss, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # Learning rate update
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    print("Finish building model")