def get_KL_divergence_Sample(shape, mu, sigma, prior, Z):
    """Single-sample estimate of the KL term between posterior and prior.

    Rather than the analytic KL divergence, this evaluates
    ``log q(W | theta) - log p(W)`` at the sampled weights ``Z``.

    - Posterior ``q``: independent Gaussians ``Normal(mu, sigma)``.
    - Prior ``p``: two zero-mean Gaussians with scales ``prior.sigma1`` and
      ``prior.sigma2``, mixed with weights ``prior.pi_mix`` and
      ``1 - prior.pi_mix``.

    Note that the per-element log-densities of each prior component are
    summed *before* the two components are combined with logsumexp, i.e. the
    mixture weights are applied to the joint density of the whole weight
    vector rather than element by element.

    Args:
        shape: Unused by this function; kept for interface compatibility.
        mu: Location of the variational posterior. Passed to ``Normal``
            unchanged, so it must broadcast against the flattened ``Z``.
        sigma: Scale of the variational posterior (same remark as ``mu``).
        prior: Object exposing ``sigma1``, ``sigma2`` and ``pi_mix``.
        Z: Sampled weights; flattened to a vector before use.

    Returns:
        Scalar tensor ``sum(log q(Z)) - log p(Z)``.
    """
    # All densities are evaluated on a flat vector of sampled weights.
    flat_weights = tf.reshape(Z, [-1])

    # Variational posterior and the two Gaussian components of the prior.
    posterior = Normal(mu, sigma)
    wide_component = Normal(0.0, prior.sigma1)
    narrow_component = Normal(0.0, prior.sigma2)

    # sum(log q(theta | mu, sigma))
    log_q = tf.reduce_sum(posterior.log_prob(flat_weights))

    # Weighted joint log-density of each prior component.
    weighted_1 = (tf.reduce_sum(wide_component.log_prob(flat_weights))
                  + tf.log(prior.pi_mix))
    weighted_2 = (tf.reduce_sum(narrow_component.log_prob(flat_weights))
                  + tf.log(1.0 - prior.pi_mix))
    log_p = tf.reduce_logsumexp([weighted_1, weighted_2])

    # The KL sample is the difference of the two log-densities.
    return log_q - log_p
def get_kl_divergence(shape, mu, sigma, prior, sample):
    """Evaluate ``log q(theta) - log p(theta)`` at one posterior sample.

    ``q`` is ``Normal(mu, sigma)``. ``p`` combines two zero-mean Gaussians
    (scales ``prior.sigma1`` / ``prior.sigma2``) with weights ``prior.pi_mix``
    and ``1 - prior.pi_mix``. Each component's log-density is summed over all
    elements of the sample before the components are merged via logsumexp.

    Args:
        shape: Unused here; retained so existing callers keep working.
        mu: The mu variable used when sampling (not reshaped here).
        sigma: The sigma variable used when sampling (not reshaped here).
        prior: The prior object providing ``sigma1``, ``sigma2``, ``pi_mix``.
        sample: The sample drawn from the posterior; flattened internally.

    Returns:
        Scalar tensor holding the sampled KL term.
    """
    theta = tf.reshape(sample, [-1])

    q_dist = Normal(mu, sigma)
    prior_components = (Normal(0.0, prior.sigma1), Normal(0.0, prior.sigma2))

    # Posterior term: sum(log q(theta | mu, sigma)).
    log_q = tf.reduce_sum(q_dist.log_prob(theta))

    # Prior term: log of the weighted sum of the two component densities.
    first_term = tf.reduce_sum(
        prior_components[0].log_prob(theta)) + tf.log(prior.pi_mix)
    second_term = tf.reduce_sum(
        prior_components[1].log_prob(theta)) + tf.log(1.0 - prior.pi_mix)
    log_p = tf.reduce_logsumexp([first_term, second_term])

    kl_sample = log_q - log_p
    return kl_sample
def _create_network(self):
    """Build the recognition/generator graph and the entropy estimates.

    Populates, among others: the latent statistics (``z_mean``, ``c_mean``,
    ``*_log_sigma_sq`` and their concatenations), the conditional-entropy
    estimates ``cond_ent_*``, the reparameterised sample ``z``, the
    reconstruction ``x_reconstr_mean``, and -- for latents drawn from a
    standard normal and pushed through generator then recognition network --
    the cross-entropy / entropy pairs for ``c``, ``z`` and their
    concatenation.
    """

    def mean_conditional_entropy(log_sigma_sq):
        # 0.5 * (log sigma^2 + 2.838), summed over latent dims and averaged
        # over the batch. 2.838 is numerically close to ln(2*pi*e).
        half = tf.constant(0.5)
        shifted = tf.add(log_sigma_sq, tf.constant(2.838))
        per_dim = tf.mul(half, shifted)
        return tf.reduce_mean(tf.reduce_sum(per_dim, reduction_indices=1))

    def cross_entropy_and_entropy(mean, log_sigma_sq, target, op_name, dims):
        # Negative mean log-density of `target` under N(mean, sigma), paired
        # with a constant of 1.4185 per dimension.
        gaussian = Normal(mu=mean, sigma=tf.sqrt(tf.exp(log_sigma_sq)))
        log_lik = tf.reduce_sum(gaussian.log_pdf(target, name=op_name),
                                reduction_indices=1)
        return tf.reduce_mean(- log_lik), tf.constant(1.4185 * dims)

    # Weights and biases for both sub-networks.
    params = self._initialize_weights(**self.network_architecture)

    # Recognition network: mean and log-variance of the latent Gaussians.
    recognised = self._recognition_network(params["weights_recog"],
                                           params["biases_recog"],
                                           self.x)
    (self.z_mean, self.c_mean,
     self.z_log_sigma_sq, self.c_log_sigma_sq) = recognised
    self.z_mean_concat = tf.concat(1, [self.z_mean, self.c_mean])
    self.z_log_sigma_sq_concat = tf.concat(
        1, [self.z_log_sigma_sq, self.c_log_sigma_sq])

    # Point estimates used for I(Z, X): H(latent | X) for the full code,
    # then for z and c separately.
    self.cond_ent_lat_given_x = mean_conditional_entropy(
        self.z_log_sigma_sq_concat)
    self.cond_ent_z_given_x = mean_conditional_entropy(self.z_log_sigma_sq)
    self.cond_ent_c_given_x = mean_conditional_entropy(self.c_log_sigma_sq)

    # One reparameterised sample: z = mu + sigma * epsilon.
    n_z = self.network_architecture["n_z"]
    n_c = self.network_architecture["n_c"]
    eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1,
                           dtype=tf.float32)
    sigma_concat = tf.sqrt(tf.exp(self.z_log_sigma_sq_concat))
    self.z = tf.add(self.z_mean_concat, tf.mul(sigma_concat, eps), name='z')

    # Generator gives the mean of the Bernoulli over the reconstruction.
    self.x_reconstr_mean = self._generator_network(params["weights_gener"],
                                                   params["biases_gener"],
                                                   z=self.z)

    # Second pass: latent drawn from N(0, 1), decoded, then re-encoded.
    eps = tf.random_normal((self.batch_size, n_z + n_c), 0, 1,
                           dtype=tf.float32)
    self.z_theta_concat = tf.add(0.0, tf.mul(1.0, eps), name='z_theta')
    self.z_theta = self.z_theta_concat[:, :n_z]
    self.c_theta = self.z_theta_concat[:, n_z:]
    self.x_prime = self._generator_network(params["weights_gener"],
                                           params["biases_gener"],
                                           z=self.z_theta_concat)
    re_encoded = self._recognition_network(params["weights_recog"],
                                           params["biases_recog"],
                                           self.x_prime)
    (self.z_prime_mean, self.c_prime_mean,
     self.z_prime_log_sigma_sq, self.c_prime_log_sigma_sq) = re_encoded
    self.z_prime_mean_concat = tf.concat(
        1, [self.z_prime_mean, self.c_prime_mean])
    self.z_prime_log_sigma_sq_concat = tf.concat(
        1, [self.z_prime_log_sigma_sq, self.c_prime_log_sigma_sq])

    # Cross-entropy for the code c.
    self.cross_entropy, self.entropy = cross_entropy_and_entropy(
        self.c_prime_mean, self.c_prime_log_sigma_sq,
        self.c_theta, 'xc_entropy', n_c)

    # Cross-entropy for the entire latent code.
    self.cross_entropy_concat, self.entropy_concat = \
        cross_entropy_and_entropy(
            self.z_prime_mean_concat, self.z_prime_log_sigma_sq_concat,
            self.z_theta_concat, 'x_entropy_concat', n_z + n_c)

    # Cross-entropy for the code z.
    self.cross_entropy_z, self.entropy_z = cross_entropy_and_entropy(
        self.z_prime_mean, self.z_prime_log_sigma_sq,
        self.z_theta, 'xz_entropy', n_z)
def __call__(self, session, trainX, trainY, testX, testY):
    """ Initialize the actual graph

    Builds the sampling ops, the model parameters, the loss, the
    cross-validation error, the summaries and the training op, then runs the
    global variable initializer in ``session``.

    Parameters
    ----------
    session : tf.Session
        Tensorflow session
    trainX : sparse array in coo format
        Training input OTU table, where rows are samples and columns are
        observations
    trainY : np.array
        Training output metabolite table
    testX : sparse array in coo format
        Test input OTU table, where rows are samples and columns are
        observations.  This is mainly for cross validation.
    testY : np.array
        Test output metabolite table.  This is mainly for cross validation.
    """
    self.session = session
    self.nnz = len(trainX.data)
    self.d1 = trainX.shape[1]
    self.d2 = trainY.shape[1]
    self.cv_size = len(testX.data)

    # keep the multinomial sampling on the cpu
    # https://github.com/tensorflow/tensorflow/issues/18058
    with tf.device('/cpu:0'):
        X_ph = tf.SparseTensor(
            indices=np.array([trainX.row, trainX.col]).T,
            values=trainX.data,
            dense_shape=trainX.shape)
        Y_ph = tf.constant(trainY, dtype=tf.float32)

        X_holdout = tf.SparseTensor(
            indices=np.array([testX.row, testX.col]).T,
            values=testX.data,
            dense_shape=testX.shape)
        Y_holdout = tf.constant(testY, dtype=tf.float32)

        total_count = tf.reduce_sum(Y_ph, axis=1)
        # Draw non-zero entries of X with probability proportional to
        # their stored value.
        batch_ids = tf.multinomial(
            tf.log(tf.reshape(X_ph.values, [1, -1])),
            self.batch_size)
        batch_ids = tf.squeeze(batch_ids)
        X_samples = tf.gather(X_ph.indices, 0, axis=1)
        X_obs = tf.gather(X_ph.indices, 1, axis=1)
        sample_ids = tf.gather(X_samples, batch_ids)

        Y_batch = tf.gather(Y_ph, sample_ids)
        X_batch = tf.gather(X_obs, batch_ids)

    with tf.device(self.device_name):
        self.qUmain = tf.Variable(
            tf.random_normal([self.d1, self.p]), name='qU')
        self.qUbias = tf.Variable(
            tf.random_normal([self.d1, 1]), name='qUbias')
        self.qVmain = tf.Variable(
            tf.random_normal([self.p, self.d2 - 1]), name='qV')
        self.qVbias = tf.Variable(
            tf.random_normal([1, self.d2 - 1]), name='qVbias')

        # The ones column/row pair each bias with a constant so that
        # qU @ qV = qVbias + qUbias + qUmain @ qVmain.
        qU = tf.concat(
            [tf.ones([self.d1, 1]), self.qUbias, self.qUmain], axis=1)
        qV = tf.concat(
            [self.qVbias, tf.ones([1, self.d2 - 1]), self.qVmain], axis=0)

        # regression coefficents distribution
        Umain = Normal(loc=tf.zeros([self.d1, self.p]) + self.u_mean,
                       scale=tf.ones([self.d1, self.p]) * self.u_scale,
                       name='U')
        Ubias = Normal(loc=tf.zeros([self.d1, 1]) + self.u_mean,
                       scale=tf.ones([self.d1, 1]) * self.u_scale,
                       name='biasU')

        Vmain = Normal(loc=tf.zeros([self.p, self.d2 - 1]) + self.v_mean,
                       scale=tf.ones([self.p, self.d2 - 1]) * self.v_scale,
                       name='V')
        Vbias = Normal(loc=tf.zeros([1, self.d2 - 1]) + self.v_mean,
                       scale=tf.ones([1, self.d2 - 1]) * self.v_scale,
                       name='biasV')

        du = tf.gather(qU, X_batch, axis=0, name='du')
        # A zero column is prepended to the logits (first category fixed).
        dv = tf.concat([tf.zeros([self.batch_size, 1]),
                        du @ qV], axis=1, name='dv')

        tc = tf.gather(total_count, sample_ids)
        Y = Multinomial(total_count=tc, logits=dv, name='Y')
        num_samples = trainX.shape[0]
        norm = num_samples / self.batch_size
        logprob_vmain = tf.reduce_sum(
            Vmain.log_prob(self.qVmain), name='logprob_vmain')
        logprob_vbias = tf.reduce_sum(
            Vbias.log_prob(self.qVbias), name='logprob_vbias')
        logprob_umain = tf.reduce_sum(
            Umain.log_prob(self.qUmain), name='logprob_umain')
        logprob_ubias = tf.reduce_sum(
            Ubias.log_prob(self.qUbias), name='logprob_ubias')
        logprob_y = tf.reduce_sum(Y.log_prob(Y_batch), name='logprob_y')
        # Negative of (rescaled batch likelihood + parameter priors).
        self.log_loss = -(
            logprob_y * norm +
            logprob_umain + logprob_ubias +
            logprob_vmain + logprob_vbias
        )

    # keep the multinomial sampling on the cpu
    # https://github.com/tensorflow/tensorflow/issues/18058
    with tf.device('/cpu:0'):
        # cross validation
        with tf.name_scope('accuracy'):
            cv_batch_ids = tf.multinomial(
                tf.log(tf.reshape(X_holdout.values, [1, -1])),
                self.cv_size)
            cv_batch_ids = tf.squeeze(cv_batch_ids)
            X_cv_samples = tf.gather(X_holdout.indices, 0, axis=1)
            X_cv = tf.gather(X_holdout.indices, 1, axis=1)
            cv_sample_ids = tf.gather(X_cv_samples, cv_batch_ids)

            Y_cvbatch = tf.gather(Y_holdout, cv_sample_ids)
            X_cvbatch = tf.gather(X_cv, cv_batch_ids)
            holdout_count = tf.reduce_sum(Y_cvbatch, axis=1)
            cv_du = tf.gather(qU, X_cvbatch, axis=0, name='cv_du')
            pred = tf.reshape(
                holdout_count, [-1, 1]) * tf.nn.softmax(
                    tf.concat([tf.zeros([self.cv_size, 1]),
                               cv_du @ qV], axis=1, name='pred')
                )

            # Mean absolute difference between predicted and held-out counts.
            self.cv = tf.reduce_mean(
                tf.squeeze(tf.abs(pred - Y_cvbatch))
            )

    # keep all summaries on the cpu
    with tf.device('/cpu:0'):
        tf.summary.scalar('logloss', self.log_loss)
        # The tag says "rmse", but self.cv is the mean absolute error above.
        tf.summary.scalar('cv_rmse', self.cv)
        tf.summary.histogram('qUmain', self.qUmain)
        tf.summary.histogram('qVmain', self.qVmain)
        tf.summary.histogram('qUbias', self.qUbias)
        tf.summary.histogram('qVbias', self.qVbias)
        self.merged = tf.summary.merge_all()

        self.writer = tf.summary.FileWriter(
            self.save_path, self.session.graph)

    with tf.device(self.device_name):
        with tf.name_scope('optimize'):
            optimizer = tf.train.AdamOptimizer(
                self.learning_rate, beta1=self.beta_1, beta2=self.beta_2)

            gradients, self.variables = zip(
                *optimizer.compute_gradients(self.log_loss))
            self.gradients, _ = tf.clip_by_global_norm(
                gradients, self.clipnorm)
            self.train = optimizer.apply_gradients(
                zip(self.gradients, self.variables))

    # Run the initializer in the session that was handed in instead of
    # relying on it also being registered as the default session.
    tf.global_variables_initializer().run(session=self.session)
def fit(self, data, epochs=1000, max_seconds=600, activation=tf.nn.elu,
        batch_norm_decay=0.9, learning_rate=1e-5, batch_sz=1024,
        adapt_lr=False, print_progress=True, show_fig=True):
    """Build the variational autoencoder graph and train it.

    Args:
        data: Mapping with ``'X_train_static_mins'`` (array of shape
            ``(N, D)``) and ``'X_train_time_0'`` (array of shape
            ``(T1, N, D1)``).
        epochs: Maximum number of passes over the training data.
        max_seconds: Training stops once this much wall-clock time has
            elapsed (checked after every batch).
        activation: Activation for the hidden fully connected layers.
        batch_norm_decay: ``decay`` passed to batch normalisation.
        learning_rate: Initial Adam learning rate.
        batch_sz: Mini-batch size.
        adapt_lr: If true, multiply the learning rate by 0.75 whenever the
            monitored cost exceeds the best cost so far by more than 1%.
            The cost is only monitored when ``print_progress`` is true.
        print_progress: Evaluate and print the cost every 100 batches.
        show_fig: Plot the recorded costs when training ends.

    Side effects:
        Sets the placeholders ``self.X``, ``self.X_time``, ``self.train``,
        ``self.rnn_keep_p_encode`` and ``self.rnn_keep_p_decode``, the
        ``posterior_predictive*`` tensors and ``self.init_op``; appends ``D1``
        to ``self.rnn_decoder_layer_sizes`` when an RNN decoder is
        configured; optionally saves a checkpoint and writes summaries.
    """
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # static features
    X = data['X_train_static_mins']
    N, D = X.shape
    self.X = tf.placeholder(tf.float32, shape=(None, D), name='X')

    # timeseries features
    X_time = data['X_train_time_0']
    T1, N1, D1 = X_time.shape
    assert N == N1
    self.X_time = tf.placeholder(tf.float32, shape=(T1, None, D1),
                                 name='X_time')

    self.train = tf.placeholder(tf.bool, shape=(), name='train')
    # Keep-probabilities of the recurrent cells. They default to the
    # configured dropout values, so graph runs that do not feed them behave
    # as before, while evaluation can feed 1.0 to switch dropout off.
    self.rnn_keep_p_encode = tf.placeholder_with_default(
        np.float32(self.rnn_encoder_dropout), shape=(),
        name='rnn_keep_p_encode')
    self.rnn_keep_p_decode = tf.placeholder_with_default(
        np.float32(self.rnn_decoder_dropout), shape=(),
        name='rnn_keep_p_decode')
    adp_learning_rate = tf.placeholder(tf.float32, shape=(),
                                       name='adp_learning_rate')

    he_init = variance_scaling_initializer()
    bn_params = {
        'is_training': self.train,
        'decay': batch_norm_decay,
        'updates_collections': None
    }
    latent_size = self.encoder_layer_sizes[-1]

    inputs = self.X
    with tf.variable_scope('static_encoder'):
        for layer_size, keep_p in zip(self.encoder_layer_sizes[:-1],
                                      self.encoder_dropout[:-1]):
            inputs = dropout(inputs, keep_p, is_training=self.train)
            inputs = fully_connected(inputs, layer_size,
                                     weights_initializer=he_init,
                                     activation_fn=activation,
                                     normalizer_fn=batch_norm,
                                     normalizer_params=bn_params)

    if self.rnn_encoder_layer_sizes:
        with tf.variable_scope('rnn_encoder'):
            # The cells read the keep-probability placeholder so that the
            # value fed at run time actually controls recurrent dropout.
            rnn_cell = MultiRNNCell([
                LayerNormBasicLSTMCell(
                    s, activation=tf.tanh,
                    dropout_keep_prob=self.rnn_keep_p_encode)
                for s in self.rnn_encoder_layer_sizes
            ])
            time_inputs, states = tf.nn.dynamic_rnn(rnn_cell, self.X_time,
                                                    swap_memory=True,
                                                    time_major=True,
                                                    dtype=tf.float32)
            # (T1, batch, units) -> (batch, T1 * units)
            time_inputs = tf.transpose(time_inputs, perm=(1, 0, 2))
            time_inputs = tf.reshape(
                time_inputs,
                shape=(-1, self.rnn_encoder_layer_sizes[-1] * T1))
            inputs = tf.concat([inputs, time_inputs], axis=1)

    with tf.variable_scope('latent_space'):
        inputs = dropout(inputs, self.encoder_dropout[-1],
                         is_training=self.train)
        loc = fully_connected(inputs, latent_size,
                              weights_initializer=he_init,
                              activation_fn=None,
                              normalizer_fn=batch_norm,
                              normalizer_params=bn_params)
        # softplus keeps the scale positive
        scale = fully_connected(inputs, latent_size,
                                weights_initializer=he_init,
                                activation_fn=tf.nn.softplus,
                                normalizer_fn=batch_norm,
                                normalizer_params=bn_params)
        standard_normal = Normal(
            loc=np.zeros(latent_size, dtype=np.float32),
            scale=np.ones(latent_size, dtype=np.float32))
        # Reparameterisation: latent = e * scale + loc
        e = standard_normal.sample(tf.shape(loc)[0])
        outputs = e * scale + loc

        static_output_size = self.decoder_layer_sizes[0]
        if self.rnn_decoder_layer_sizes:
            time_output_size = self.rnn_decoder_layer_sizes[0] * T1
            output_size = static_output_size + time_output_size
        else:
            output_size = static_output_size
        outputs = fully_connected(outputs, output_size,
                                  weights_initializer=he_init,
                                  activation_fn=activation,
                                  normalizer_fn=batch_norm,
                                  normalizer_params=bn_params)
        if self.rnn_decoder_layer_sizes:
            outputs, time_outputs = tf.split(
                outputs, [static_output_size, time_output_size], axis=1)

    with tf.variable_scope('static_decoder'):
        for layer_size, keep_p in zip(self.decoder_layer_sizes,
                                      self.decoder_dropout[:-1]):
            outputs = dropout(outputs, keep_p, is_training=self.train)
            outputs = fully_connected(outputs, layer_size,
                                      weights_initializer=he_init,
                                      activation_fn=activation,
                                      normalizer_fn=batch_norm,
                                      normalizer_params=bn_params)
        outputs = dropout(outputs, self.decoder_dropout[-1],
                          is_training=self.train)
        outputs = fully_connected(outputs, D,
                                  weights_initializer=he_init,
                                  activation_fn=None,
                                  normalizer_fn=batch_norm,
                                  normalizer_params=bn_params)

    X_hat = Bernoulli(logits=outputs)
    self.posterior_predictive = X_hat.sample()
    self.posterior_predictive_probs = tf.nn.sigmoid(outputs)

    if self.rnn_decoder_layer_sizes:
        with tf.variable_scope('rnn_decoder'):
            # NOTE: this mutates the instance attribute on every call.
            self.rnn_decoder_layer_sizes.append(D1)
            time_output_size = self.rnn_decoder_layer_sizes[0]
            time_outputs = tf.reshape(time_outputs,
                                      shape=(-1, T1, time_output_size))
            time_outputs = tf.transpose(time_outputs, perm=(1, 0, 2))
            rnn_cell = MultiRNNCell([
                LayerNormBasicLSTMCell(
                    s, activation=tf.tanh,
                    dropout_keep_prob=self.rnn_keep_p_decode)
                for s in self.rnn_decoder_layer_sizes
            ])
            time_outputs, states = tf.nn.dynamic_rnn(rnn_cell,
                                                     time_outputs,
                                                     swap_memory=True,
                                                     time_major=True,
                                                     dtype=tf.float32)
            time_outputs = tf.transpose(time_outputs, perm=(1, 0, 2))
            time_outputs = tf.reshape(time_outputs, shape=(-1, T1 * D1))
            X_hat_time = Bernoulli(logits=time_outputs)
            posterior_predictive_time = X_hat_time.sample()
            posterior_predictive_time = tf.reshape(
                posterior_predictive_time, shape=(-1, T1, D1))
            self.posterior_predictive_time = tf.transpose(
                posterior_predictive_time, perm=(1, 0, 2))
            self.posterior_predictive_probs_time = tf.nn.sigmoid(
                time_outputs)

    # KL(N(loc, scale) || N(0, 1)) per latent dimension, summed per example.
    kl_div = -tf.log(scale) + 0.5 * (scale**2 + loc**2) - 0.5
    kl_div = tf.reduce_sum(kl_div, axis=1)

    expected_log_likelihood = tf.reduce_sum(X_hat.log_prob(self.X), axis=1)
    X_time_trans = tf.transpose(self.X_time, perm=(1, 0, 2))
    X_time_reshape = tf.reshape(X_time_trans, shape=(-1, T1 * D1))

    # `elbo` holds the *negative* ELBO summed over the batch, i.e. the
    # quantity being minimised. The time-series term exists exactly when
    # the RNN decoder (which defines X_hat_time) was built.
    if self.rnn_decoder_layer_sizes:
        expected_log_likelihood_time = tf.reduce_sum(
            X_hat_time.log_prob(X_time_reshape), axis=1)
        elbo = -tf.reduce_sum(expected_log_likelihood +
                              expected_log_likelihood_time - kl_div)
    else:
        elbo = -tf.reduce_sum(expected_log_likelihood - kl_div)

    train_op = tf.train.AdamOptimizer(
        learning_rate=adp_learning_rate).minimize(elbo)

    tf.summary.scalar('elbo', elbo)
    if self.save_file:
        saver = tf.train.Saver()

    if self.tensorboard:
        for v in tf.trainable_variables():
            tf.summary.histogram(v.name, v)
        train_merge = tf.summary.merge_all()
        writer = tf.summary.FileWriter(self.tensorboard)

    self.init_op = tf.global_variables_initializer()
    n = 0
    n_batches = N // batch_sz
    costs = list()
    min_cost = np.inf

    t0 = dt.now()
    with tf.Session() as sess:
        sess.run(self.init_op)
        out_of_time = False
        for epoch in range(epochs):
            idxs = shuffle(range(N))
            X_train = X[idxs]
            X_train_time = X_time[:, idxs]

            for batch in range(n_batches):
                n += 1
                X_batch = X_train[batch * batch_sz:(batch + 1) * batch_sz]
                X_batch_time = X_train_time[
                    :, batch * batch_sz:(batch + 1) * batch_sz]

                sess.run(train_op, feed_dict={
                    self.X: X_batch,
                    self.X_time: X_batch_time,
                    self.rnn_keep_p_encode: self.rnn_encoder_dropout,
                    self.rnn_keep_p_decode: self.rnn_decoder_dropout,
                    self.train: True,
                    adp_learning_rate: learning_rate
                })

                if n % 100 == 0 and print_progress:
                    # Per-example cost on the full training set, with all
                    # dropout disabled.
                    cost = sess.run(elbo, feed_dict={
                        self.X: X,
                        self.X_time: X_time,
                        self.rnn_keep_p_encode: 1.0,
                        self.rnn_keep_p_decode: 1.0,
                        self.train: False
                    })
                    cost /= N
                    costs.append(cost)

                    if adapt_lr and epoch > 0:
                        if cost < min_cost:
                            min_cost = cost
                        elif cost > min_cost * 1.01:
                            learning_rate *= 0.75
                            if print_progress:
                                print('Updating Learning Rate',
                                      learning_rate)

                    print('Epoch:', epoch, 'Batch:', batch, 'Cost:', cost)

                    if self.tensorboard:
                        train_sum = sess.run(train_merge, feed_dict={
                            self.X: X,
                            self.X_time: X_time,
                            self.rnn_keep_p_encode: 1.0,
                            self.rnn_keep_p_decode: 1.0,
                            self.train: False
                        })
                        writer.add_summary(train_sum, n)

                seconds = (dt.now() - t0).seconds
                if seconds > max_seconds:
                    if print_progress:
                        print('Breaking after', seconds, 'seconds')
                    out_of_time = True
                    break

            # Leave the epoch loop as well once the time budget is spent.
            if out_of_time:
                break

        if self.save_file:
            saver.save(sess, self.save_file)

        if self.tensorboard:
            writer.add_graph(sess.graph)

    if show_fig:
        plt.plot(costs)
        plt.title('Costs and Scores')
        plt.show()
def __init__(self, is_training, X, y):
    """Build the Bayesian LSTM regression graph.

    Stacks ``n_layer`` ``BayesianLSTM`` cells, projects the output at the
    last time step to a scalar prediction and scores ``y`` under a unit-
    variance Normal centred on that prediction. In training mode the KL
    terms collected in ``'KL_layers'`` are added (scaled by ``1 / n_batch``)
    and an SGD train op with global-norm gradient clipping is created.

    Relies on names defined outside this method (e.g. ``n_layer``,
    ``layers``, ``n_feature``, ``batch_size``, ``seq_length``, ``bias``,
    ``inference_mode``, ``max_grad_norm``, ``n_batch_train``,
    ``n_batch_test``).

    Args:
        is_training: If false, only the data-fit cost is built.
        X: Input tensor, batch-major (``time_major=False``).
        y: Target tensor; flattened before use.
    """
    self._is_training = is_training
    self._rnn_params = None
    self._cell = None
    self.batch_size = 200
    self.seq_length = 5
    self.X = X

    n_batch = n_batch_train if is_training else n_batch_test

    # Construct prior
    prior = ScaleMixturePrior()

    # Stack the recurrent cells; each layer consumes the previous layer's
    # output size.
    rnn_layers = []
    n_unit_pre = n_feature
    for i in range(n_layer):
        rnn_layers.append(BayesianLSTM(n_unit_pre, layers[i], prior,
                                       is_training,
                                       inference_mode=inference_mode,
                                       forget_bias=0.0,
                                       name='bbb_lstm_{}'.format(i),
                                       bias=True))
        n_unit_pre = layers[i]

    multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
    self._initial_state = multi_rnn_cell.zero_state(batch_size, tf.float32)

    # 'outputs' is batch-major: [batch, time, units of the last cell].
    # 'state' holds the final state of every cell in the stack.
    outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                       inputs=X,
                                       time_major=False,
                                       dtype=tf.float32)

    # output layer
    # The projection must have as many rows as the last recurrent layer has
    # units, so derive it from the stack instead of hard-coding 50.
    n_out_units = n_unit_pre
    rho_min_init, rho_max_init = prior.normal_init()
    # add weight term
    if bias:
        w = get_noisy_weights((n_out_units, 1), 'w', prior, is_training,
                              rho_min_init, rho_max_init)
    else:
        w = tf.get_variable('w', (n_out_units, 1), tf.float32,
                            tf.constant_initializer(0.))
    # add bias term
    if bias:
        b = get_noisy_weights(
            (1), 'b', prior, is_training, rho_min_init, rho_max_init)
    else:
        b = tf.get_variable('b', (1), tf.float32,
                            tf.constant_initializer(0.))

    # Predict from the output at the last time step only.
    output = tf.reshape(
        tf.matmul(outputs[:, seq_length-1, :], w) + b, [-1])
    y = tf.reshape(y, [-1])

    y_pred = Normal(output, 1.)
    print("Finish predicting y")

    # Negative log-density of y; 1e-8 guards against log(0).
    loss = - tf.log(y_pred.prob(y) + 1e-8)

    # Update the cost
    self._cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    # 1. For testing, no kl term, just loss
    self._kl_div = 0.
    if not is_training:
        return

    # 2. For training, compute kl scaled by 1./n_batch
    # Add up all prior's kl values
    kl_div = tf.add_n(tf.get_collection('KL_layers'), 'kl_divergence')

    # Compute ELBO
    kl_const = 1. / n_batch
    self._kl_div = kl_div * kl_const
    self._total_loss = self._cost + self._kl_div

    # Optimization:
    # Learning rate (non-trainable; set through self._lr_update)
    self._lr = tf.Variable(0.0, trainable=False)

    # Update all weights with gradients
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self._total_loss, tvars), max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # Learning rate update
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    print("Finish building model")
def __init__(self, args, d, logdir):
    """Build the Bernoulli embedding model graph.

    After the base-class initialisation, ``self.n_minibatch`` is collapsed
    to its sum and the graph is built: a word-index placeholder, index masks
    selecting each target word and its ``cs`` context words, the embedding
    variables ``rho`` / ``alpha``, a Normal prior on them, negative samples
    drawn from ``self.unigram``, and the loss
    ``-(n_epochs * log_likelihood + log_prior)``.

    Args:
        args, d, logdir: Forwarded unchanged to the base-class constructor.
    """
    super(bern_emb_model, self).__init__(args, d, logdir)
    self.n_minibatch = self.n_minibatch.sum()

    # Half the context size. Floor division keeps this an integer on both
    # Python 2 and Python 3; a float here breaks the tile multiples and the
    # int32 index arithmetic below.
    half_cs = self.cs // 2

    with tf.name_scope('model'):
        # Data Placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index Masks
        with tf.name_scope('context_mask'):
            # Positions of the target words.
            self.p_mask = tf.cast(
                tf.range(half_cs, self.n_minibatch + half_cs), tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                        [self.n_minibatch, 1]), tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, self.n_minibatch), [1]),
                        [1, half_cs]), tf.int32)
            # Left context positions, then right context positions (the
            # target itself is skipped by the +1).
            self.ctx_mask = tf.concat(
                [rows + columns, rows + columns + half_cs + 1], 1)

        with tf.name_scope('embeddings'):
            self.rho = tf.Variable(self.rho_init, name='rho')
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)

            with tf.name_scope('priors'):
                prior = Normal(loc=0.0, scale=self.sig)
                if self.alpha_trainable:
                    self.log_prior = tf.reduce_sum(
                        prior.log_prob(self.rho) +
                        prior.log_prob(self.alpha))
                else:
                    self.log_prior = tf.reduce_sum(
                        prior.log_prob(self.rho))

        with tf.name_scope('natural_param'):
            # Taget and Context Indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples
            with tf.name_scope('negative_samples'):
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                    [self.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, self.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(
                    tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter: inner product of a word embedding with the
            # summed context vectors.
            ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
            self.n_eta = tf.reduce_sum(
                tf.multiply(
                    self.n_rho,
                    tf.tile(tf.expand_dims(ctx_sum, 1), [1, self.ns, 1])),
                -1)

        # Conditional likelihood: targets are ones, negative samples zeros.
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

        self.log_likelihood = self.ll_pos + self.ll_neg

        self.loss = -(self.n_epochs * self.log_likelihood + self.log_prior)
def loss(self, G_data, y_data, positive_batch, random_batch):
    """ Computes the loss.

    Creates the coefficient variables ``qgamma`` / ``qbeta``, then combines
    a term over the positive (non-zero) batch, a term over the random batch
    and Normal priors on the coefficients. The variables are stored on
    ``self.qgamma`` / ``self.qbeta`` and their stack on ``self.V``.

    Parameters
    ----------
    G_data : tf.Tensor
        Design matrix
    y_data : tf.SparseTensor
        Sparse tensor of counts
    positive_batch : tf.Tensor
        A Sparse tensor representing a batch of positive examples.
    random_batch : tf.Tensor
        A Sparse tensor representing a batch of random examples.

    Returns
    -------
    log_loss : tf.Tensor
        Tensor representing the negated sum of the scaled likelihood terms
        and the prior log-probabilities.
    """
    with tf.name_scope('loss'):
        gamma_mean, gamma_scale = self.gamma_mean, self.gamma_scale
        beta_mean, beta_scale = self.beta_mean, self.beta_scale
        N, D, p = self.block_size, self.D, self.p

        num_nonzero = tf.cast(tf.size(y_data.values, out_type=tf.int32),
                              dtype=tf.float32)

        # unpack sparse tensors
        pos_data = tf.cast(positive_batch.values, dtype=tf.float32)
        pos_row = tf.gather(positive_batch.indices, 0, axis=1)
        pos_col = tf.gather(positive_batch.indices, 1, axis=1)
        rand_row = tf.gather(random_batch.indices, 0, axis=1)
        rand_col = tf.gather(random_batch.indices, 1, axis=1)

        # tf.size yields an integer count; cast it for the float ratio
        # below, the same way num_nonzero is computed.
        num_sampled = tf.cast(tf.size(pos_row, out_type=tf.int32),
                              dtype=tf.float32)
        theta = tf.log(  # basically log total counts
            tf.cast(tf.sparse_reduce_sum(y_data, axis=1), dtype=tf.float32))

        # Regression coefficients
        qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
        qbeta = tf.Variable(tf.random_normal([p, D]), name='qbeta')

        # Intercept row stacked on the coefficients; G gets a matching
        # column of ones.
        self.V = tf.concat([qgamma, qbeta], axis=0, name='V')
        G = tf.concat(
            [tf.ones([G_data.shape[0], 1]), G_data],
            axis=1, name='G')

        with tf.name_scope('positive_log_prob'):
            # add bias terms for samples
            Gpos = tf.gather(G, pos_row, axis=0)
            Vpos = tf.transpose(
                tf.gather(self.V, pos_col, axis=1), name='Vprime')
            # sparse matrix multiplication for positive samples
            y_pred = tf.reduce_sum(
                tf.multiply(Gpos, Vpos), axis=1)

            theta_pos = tf.squeeze(tf.gather(theta, pos_row))
            pos_prob = tf.reduce_sum(
                tf.multiply(pos_data, y_pred + theta_pos))
            # Rescale the batch sum to all non-zero entries.
            sparse_scale = num_nonzero / num_sampled

        with tf.name_scope('coefficient_log_prob'):
            Grand = tf.gather(G, rand_row, axis=0)
            Vrand = tf.transpose(
                tf.gather(self.V, rand_col, axis=1), name='Vprime')
            # sparse matrix multiplication for random indices
            y_rand = tf.reduce_sum(
                tf.multiply(Grand, Vrand), axis=1)
            theta_rand = tf.squeeze(tf.gather(theta, rand_row))
            coef_prob = tf.reduce_sum(
                tf.exp(y_rand + theta_rand))
            # Rescale the random batch to the N * D entries of the block.
            coef_scale = N * D / self.num_neg_samples

        total_poisson = pos_prob * sparse_scale - coef_prob * coef_scale

        with tf.name_scope('priors'):
            # Normal priors (a.k.a. L2 regularization)
            # species intercepts
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficents distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            total_gamma = tf.reduce_sum(gamma.log_prob(qgamma))
            total_beta = tf.reduce_sum(beta.log_prob(qbeta))

        log_loss = - (total_gamma + total_beta + total_poisson)

    # save parameters to model
    self.qbeta = qbeta
    self.qgamma = qgamma
    return log_loss
def __init__(self, args, d, logdir):
    """Build the hierarchical Bernoulli embedding model graph.

    A global embedding ``rho`` and context vectors ``alpha`` are shared,
    while every state in ``d.states`` gets its own embedding ``geo_rho``
    that a tight Normal prior (scale ``sig / 100``) ties to ``rho``. For each
    state in ``self.states`` a placeholder and the positive / negative
    Bernoulli likelihood terms are created and accumulated. The loss is
    ``-(n_epochs * (ll_pos + ll_neg) + log_prior)``.

    Args:
        args, d, logdir: Forwarded to the base-class constructor; ``d`` also
            provides ``states``, ``L`` and ``unigram``.
    """
    super(hierarchical_bern_emb_model, self).__init__(args, d, logdir)

    # Half the context size. Floor division keeps the index ranges and tile
    # multiples integral on Python 3 as well as Python 2.
    half_cs = self.cs // 2

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)

            self.rho = tf.Variable(self.rho_init, name='rho',
                                   trainable=self.rho_trainable)

            # One embedding per state, initialised as a small perturbation
            # of the shared initial value.
            self.geo_rho = {}
            for state in d.states:
                self.geo_rho[state] = tf.Variable(
                    self.rho_init
                    + 0.001*tf.random_normal([d.L, self.K])/self.K,
                    name=state+'_rho')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            if self.alpha_trainable:
                self.log_prior = tf.reduce_sum(
                    prior.log_prob(self.rho) + prior.log_prob(self.alpha))
            else:
                self.log_prior = tf.reduce_sum(prior.log_prob(self.rho))

            # Keep each state's embedding close to the shared one.
            local_prior = Normal(loc=0.0, scale=self.sig/100.0)
            for state in d.states:
                self.log_prior += tf.reduce_sum(
                    local_prior.log_prob(self.rho - self.geo_rho[state]))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t, state in enumerate(self.states):
                # Index Masks
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                               [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])

                # Left context positions followed by right context
                # positions; the +1 skips the target word.
                ctx_mask = tf.concat(
                    [rows+columns, rows+columns + half_cs + 1], 1)

                # Data Placeholder
                self.placeholders[state] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Taget and Context Indices
                p_idx = tf.gather(self.placeholders[state], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[state], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)

                p_rho = tf.squeeze(tf.gather(self.geo_rho[state], p_idx))
                n_rho = tf.gather(self.geo_rho[state], n_idx)

                # Natural parameter
                # (self.p_eta / self.n_eta are overwritten for every state,
                # so after the loop they refer to the last one.)
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                self.p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                self.n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1),
                                [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[state] = Bernoulli(logits=self.p_eta)
                self.y_neg[state] = Bernoulli(logits=self.n_eta)

                self.ll_pos += tf.reduce_sum(
                    self.y_pos[state].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(
                    self.y_neg[state].log_prob(0.0))

        self.loss = - (self.n_epochs * (self.ll_pos + self.ll_neg)
                       + self.log_prior)
        self.init_eval_model()
def _multivariate_normal(self):
    """Return a ``Normal`` with zero location and unit scale.

    Both parameters are Python lists of ``self._latent_dim`` floats.
    """
    dim = self._latent_dim
    zero_loc = [0.] * dim
    unit_scale = [1.] * dim
    return Normal(zero_loc, unit_scale)
def step(self, time, inputs, input_latent_sample, states, use_inference,
         name=None):
    """Perform a decoding step.

    Args:
      time: scalar `int32`.
      inputs: A (structure of) input tensors. May be `None`, in which case
        no inference distribution is computed.
      input_latent_sample: Can override sampling of new latent. When given,
        neither the inference nor the prior cell is run.
      states: A (structure of) state tensors and TensorArrays, keyed by
        `'inference'`, `'prior'` and `'output'`.
      use_inference: If True the latent is sampled from the inference
        distribution, otherwise from the prior.
      name: Name scope for any created operations.

    Returns:
      `(output, inference_params, prior_params, cell_states, z_sample)`.
      The two parameter tensors hold means followed by log standard
      deviations along the last axis; they are `None` when
      `input_latent_sample` is given. `cell_states` only contains entries
      for the cells that were actually run.

    Raises:
      ValueError: If no previous input has been set.
    """
    cell_outputs, cell_states = dict(), dict()
    if self._prev_inputs is None:
        raise ValueError("Need previous input for VariationalDecoder!")

    with ops.name_scope(name, "VariationalDecoderStep",
                        (time, inputs, states)):
        if input_latent_sample is None:
            # predict inference distribution from current frame if any
            if inputs is not None:
                cell_outputs['inference'], cell_states['inference'] = \
                    self._cells['inference'](
                        self._maybe_encode_inputs(inputs),
                        states['inference'])
            else:
                cell_outputs['inference'], cell_states['inference'] = \
                    None, None

            # predict learned prior from previous frame
            if not self._fixed_prior:
                cell_outputs['prior'], cell_states['prior'] = \
                    self._cells['prior'](
                        self._maybe_encode_inputs(self._prev_inputs),
                        states['prior'])
            else:
                # Fixed prior: zero mean, log std-dev of log(1) = 0.
                means = tf.zeros([self._batch_size, self._sample_dim])
                log_std_dev = tf.log(
                    tf.constant(1.0,
                                shape=[self._batch_size, self._sample_dim]))
                cell_outputs['prior'] = tf.concat([means, log_std_dev],
                                                  axis=1)

            # sample from inference or prior distribution
            if use_inference:
                means = cell_outputs['inference'][..., :self._sample_dim]
                std_dev = tf.exp(
                    cell_outputs['inference'][..., self._sample_dim:])
            else:
                means = cell_outputs['prior'][..., :self._sample_dim]
                std_dev = tf.exp(
                    cell_outputs['prior'][..., self._sample_dim:])
            z_dists = Normal(loc=means, scale=std_dev)
            # Sample one value per distribution. Only the leading sample
            # axis is squeezed, so a batch or latent dimension of size 1
            # is preserved.
            z_sample = tf.squeeze(z_dists.sample([1]), axis=0)
            if (tf.flags.FLAGS.trajectory_space
                    and not tf.flags.FLAGS.trajectory_autoencoding):
                # Append one zero entry along the last axis.
                z_sample = tf.concat([
                    z_sample,
                    tf.zeros(z_sample.get_shape().as_list()[:-1] + [1],
                             dtype=tf.float32)
                ], axis=-1)
        else:
            z_sample = input_latent_sample
            cell_outputs['inference'] = None
            cell_outputs['prior'] = None

        # reconstruct output with LSTM and decoder
        if self._use_cdna_model:
            decoder_input = [
                self._prev_inputs, self._first_image, z_sample,
                self._is_training
            ]
        else:
            decoder_input = tf.concat((self._prev_inputs, z_sample),
                                      axis=-1)
        cell_outputs['output'], cell_states['output'] = \
            self._cells['output'](decoder_input, states['output'])
        if self._output_layer is not None:
            cell_outputs['output'] = self._output_layer(
                cell_outputs['output'])

    return cell_outputs['output'], cell_outputs['inference'], \
        cell_outputs['prior'], cell_states, z_sample
def build_nn4post(
        n_c, n_d, log_posterior_upto_const, init_var=None, base_graph=None,
        n_samples=100, r=1.0, beta=1.0, max_a_range=10, wall_slope=10,
        epsilon=1e-08, dtype='float32', name='nn4post'):
    r"""Add the name-scope `name` to the graph `base_graph`.

    This is the implementation of 'docs/main.pdf'.

    Args:
      n_c:
        `int`, as the number of categorical probabilities, i.e. the
        :math:`N_c` in the documentation.

      n_d:
        `int`, as the number of dimensions, i.e. the :math:`N_d` in the
        documentation.

      log_posterior_upto_const:
        Callable from tensor of the shape `[n_d]` to scalar, both with the
        same dtype as the `dtype` argument, as the logarithm of the posterior
        up to a constant.

      init_var:
        `dict` for setting the initial values of variables, optional. It has
        keys `'a'`, `'mu'`, and `'zeta'`, and values of numpy arrays or
        tensors of the shapes `[n_c]`, `[n_c, n_d]`, and `[n_c, n_d]`,
        respectively. All these values shall be the same dtype as the `dtype`
        argument.

      base_graph:
        An instance of `tf.Graph`, optional, as the graph that the scope for
        "nn4post" is added to. If `None`, use the graph returned from
        `tf.get_default_graph()`.

      n_samples:
        `int` or `tf.placeholder` with scalar shape and `int` dtype, as the
        number of samples in the Monte Carlo integrals, optional.

      r:
        `float` or `tf.placeholder` with scalar shape and `dtype` dtype, as
        the rescaling factor of `a`, optional.

      beta:
        `float` or `tf.placeholder` with scalar shape and `dtype` dtype, as
        the "smooth switcher" of the documentation, optional. Here it is the
        exponent of `c + epsilon` in the denominator that rescales the
        gradients.

      max_a_range:
        `float` or `tf.placeholder` with scalar shape and `dtype` dtype, as
        the bound of `max(a) - min(a)`, optional.

      wall_slope:
        `float` or `tf.placeholder` with scalar shape and `dtype` dtype, as
        the slope-parameter in the wall-function in the regularization of
        loss, which bounds the maximum value of the range of `a`, optional.

        NOTE:
          The only restriction on this parameter is that `wall_slope` shall
          be much greater than unity. But when the learning-rate of the
          optimizer is not small enough (as generally demanded in the early
          stage of training), an extremely great value of `wall_slope` will
          trigger `NaN`.

      epsilon:
        `float` or `tf.placeholder` with scalar shape and `dtype` dtype, as
        the :math:`\epsilon` in the documentation, optional. It is added to
        `c` before raising to the power `beta`.

      dtype:
        `str`, as the dtype of floats employed herein, like `float32`,
        `float64`, etc., optional.

      name:
        `str`, as the main name-scope.

    Returns:
      A tuple of two elements. The first is a `dict` of useful tensors (for
      convenience), with keys `'a'`, `'mu'`, `'zeta'`, `'c'`, and `'loss'`,
      plus `'r'`, `'beta'`, `'n_samples'` for those of these arguments that
      were passed as tensors. Every entry is also added to the graph
      collection of the same name. The second is a list of tuples of gradient
      and its associated variable, in the order `a`, `mu`, `zeta`, as the
      argument of the method `tf.train.Optimizer.apply_gradients()`.
    """
    graph = tf.get_default_graph() if base_graph is None else base_graph

    with graph.as_default():

        with tf.name_scope(name):

            with tf.name_scope('variables'):

                if init_var is None:
                    init_a = np.zeros([n_c], dtype=dtype)
                    if n_c == 1:
                        init_mu = np.random.normal(size=[n_c, n_d])
                    else:
                        # Because of the curse of dimensionality
                        init_mu = np.random.normal(size=[n_c, n_d]) \
                            * np.sqrt(n_d)
                    init_mu = init_mu.astype(dtype)
                    init_zeta = np.ones([n_c, n_d], dtype=dtype)
                else:
                    init_a = init_var['a']
                    init_mu = init_var['mu']
                    init_zeta = init_var['zeta']

                # shape: `[n_c]`
                a = tf.Variable(init_a, name='a')
                # shape: `[n_c, n_d]`
                mu = tf.Variable(init_mu, name='mu')
                # shape: `[n_c, n_d]`
                zeta = tf.Variable(init_zeta, name='zeta')

            with tf.name_scope('distributions'):

                with tf.name_scope('categorical'):

                    # For gauge fixing. C.f. "/docs/nn4post.tm", section
                    # "Gauge Fixing".
                    # shape: `[]`
                    a_mean = tf.reduce_mean(a, name='a_mean')

                    # Rescaling of `a`. C.f. "/docs/nn4post.tm", section
                    # "Re-scaling of a".
                    # shape: `[n_c]`
                    c = tf.nn.softmax(r * (a - a_mean), name='c')

                with tf.name_scope('standard_normal'):

                    # shape: `[n_c, n_d]`
                    sigma = tf.nn.softplus(zeta)

                    # shape: `[n_c, n_d]`
                    std_normal = Independent(
                        Normal(tf.zeros(mu.shape), tf.ones(sigma.shape))
                    )

            with tf.name_scope('loss'):

                with tf.name_scope('samples'):

                    # shape: `[n_samples, n_c, n_d]`
                    eta_samples = std_normal.sample(n_samples)

                with tf.name_scope('re_parameter'):

                    # shape: `[n_samples, n_c, n_d]`
                    theta_samples = eta_samples * sigma + mu

                    # shape: `[n_samples * n_c, n_d]`
                    flat_theta_samples = tf.reshape(theta_samples, [-1, n_d])

                with tf.name_scope('p_part'):

                    with tf.name_scope('expect_log_p'):

                        def log_p(thetas):
                            """Vectorize `log_posterior_upto_const`.

                            Args:
                              thetas: Tensor of the shape `[None, n_d]`

                            Returns:
                              Tensor of the shape `[None]`.
                            """
                            return tf.map_fn(log_posterior_upto_const, thetas)

                        # Expectation of :math:`\ln p`
                        # shape: `[n_c]`
                        expect_log_p = tf.reduce_mean(
                            tf.reshape(
                                # shape `[n_samples * n_c]`.
                                log_p(flat_theta_samples),
                                [n_samples, n_c]),
                            axis=0)

                    # shape: `[]`
                    loss_p_part = - tf.reduce_sum(c * expect_log_p)

                with tf.name_scope('q_part'):

                    with tf.name_scope('log_q'):

                        gaussian_mixture_log_prob = \
                            get_gaussian_mixture_log_prob(c, mu, sigma)

                        def log_q(thetas):
                            """Vectorize `gaussian_mixture_log_prob`.

                            Args:
                              thetas: Tensor of the shape `[None, n_d]`.

                            Returns:
                              Tensor of the shape `[None]`.
                            """
                            return tf.map_fn(gaussian_mixture_log_prob, thetas)

                    with tf.name_scope('expect_log_q'):

                        # Expectation of :math:`\ln q`
                        # shape: `[n_c]`
                        expect_log_q = tf.reduce_mean(
                            tf.reshape(
                                # shape: `[n_samples * n_c]`.
                                log_q(flat_theta_samples),
                                [n_samples, n_c]),
                            axis=0)

                    # shape: `[]`
                    loss_q_part = tf.reduce_sum(c * expect_log_q)

                with tf.name_scope('loss'):

                    with tf.name_scope('elbo'):
                        elbo = loss_p_part + loss_q_part

                    with tf.name_scope('regularization'):
                        # NOTE:
                        #   Get punished if the range of `a` exceeds
                        #   `max_a_range`. Multiplied by `elbo` for
                        #   automatically setting the order of punishment.

                        # shape: `[]`, and non-negative.
                        a_range = tf.reduce_max(a) - tf.reduce_min(a)
                        # Use "wall_function" for regularization.
                        wall = get_wall(max_a_range, wall_slope)
                        # shape: `[]`
                        regularization = elbo * wall(a_range)

                    # shape: `[]`
                    loss = elbo + regularization

            with tf.name_scope('gradients'):

                # Fixed order, so that both the graph construction and the
                # returned list are the same from run to run.
                variables = (a, mu, zeta)

                # C.f. "/docs/nn4post.tm", section "Frozen-out Problem".
                with tf.name_scope('bared_gradients'):
                    bared_grads = [
                        tf.gradients(loss, var_)[0] for var_ in variables
                    ]

                with tf.name_scope('keep_non_frozen_out'):
                    # Notice `tf.truediv` is not broadcastable
                    denominator = tf.pow(c + epsilon, beta)  # `[n_c]`
                    rescaled_grads = [
                        grad / denominator
                        if var_ is a  # `[n_c]`
                        else grad / tf.expand_dims(denominator, axis=1)  # `[n_c, n_d]`
                        for var_, grad in zip(variables, bared_grads)
                    ]

                # Re-arrange as a list of tuples
                grads_and_vars = list(zip(rescaled_grads, variables))

        # -- Collections
        collection = {
            'a': a,
            'mu': mu,
            'zeta': zeta,
            'c': c,
            'loss': loss,
        }
        if isinstance(r, tf.Tensor):
            collection['r'] = r
        if isinstance(beta, tf.Tensor):
            collection['beta'] = beta
        if isinstance(n_samples, tf.Tensor):
            collection['n_samples'] = n_samples
        # `key` rather than `name`, to avoid rebinding the `name` argument.
        for key, tensor in collection.items():
            graph.add_to_collection(key, tensor)

    return collection, grads_and_vars
def __init__(self, args, d, logdir):
    """Build the graph of the dynamic Bernoulli embedding model.

    Creates the context embeddings `alpha`, one target-embedding matrix
    `rho_t[t]` per time slice (plus the slice `-1`), the log prior, the
    per-slice data placeholders and Bernoulli likelihoods, and the loss.

    Attributes read here (`alpha_init`, `rho_init`, `T`, `L`, `K`, `sig`,
    `cs`, `ns`, `n_minibatch`, `unigram`, `n_epochs`, ...) are expected to be
    available once the base-class constructor has run.

    :param args: forwarded to the base-class constructor.
    :param d: forwarded to the base-class constructor.
    :param logdir: forwarded to the base-class constructor.
    """
    super(dynamic_bern_emb_model, self).__init__(args, d, logdir)

    half_cs = self.cs//2

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)

            self.rho_t = {}
            for t in range(-1, self.T):
                self.rho_t[t] = tf.Variable(
                    self.rho_init
                    + 0.001*tf.random_normal([self.L, self.K])/self.K,
                    name='rho_'+str(t))

        with tf.name_scope('priors'):
            global_prior = Normal(loc=0.0, scale=self.sig)
            local_prior = Normal(loc=0.0, scale=self.sig/100.0)

            self.log_prior = tf.reduce_sum(global_prior.log_prob(self.alpha))
            # Accumulate (not overwrite), so the prior on `alpha` is kept.
            self.log_prior += tf.reduce_sum(
                global_prior.log_prob(self.rho_t[-1]))
            # Random-walk prior: each slice stays close to the previous one.
            for t in range(self.T):
                self.log_prior += tf.reduce_sum(
                    local_prior.log_prob(self.rho_t[t] - self.rho_t[t-1]))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t in range(self.T):
                # Index Masks
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                               [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])
                # context = half_cs positions before and after each target
                ctx_mask = tf.concat(
                    [rows+columns, rows+columns + half_cs+1], 1)

                # Data Placeholder
                self.placeholders[t] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and Context Indices
                p_idx = tf.gather(self.placeholders[t], p_mask)
                ctx_idx = tf.squeeze(tf.gather(self.placeholders[t], ctx_mask))

                # Negative samples
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(self.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)

                p_rho = tf.squeeze(tf.gather(self.rho_t[t], p_idx))
                n_rho = tf.gather(self.rho_t[t], n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1), [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[t] = Bernoulli(logits=p_eta)
                self.y_neg[t] = Bernoulli(logits=n_eta)

                self.ll_pos += tf.reduce_sum(self.y_pos[t].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(self.y_neg[t].log_prob(0.0))

    self.loss = - (self.n_epochs * (self.ll_pos + self.ll_neg)
                   + self.log_prior)
class PropagatePrior(snt.AbstractModule):
    """Flexible RNN prior for propagation.

    This implementation treats all objects as independent.
    """

    def __init__(self, n_what, cell, prop_logit_bias, where_loc_bias=None):
        """Initialises the module.

        :param n_what: size of the 'what' latent variable.
        :param cell: RNN cell applied to the previous latents.
        :param prop_logit_bias: bias added to the propagation ('presence')
            logit.
        :param where_loc_bias: optional bias (4 values) added to the mean of
            the 'where' latent.
        """
        super(PropagatePrior, self).__init__()
        self._n_what = n_what
        self._cell = cell
        self._prop_logit_bias = prop_logit_bias
        self._where_loc_bias = where_loc_bias

    def _build(self, z_tm1, prior_rnn_hidden_state):
        """Applies the op.

        :param z_tm1: latent variables of the previous time-step; the first
            three entries are 'what', 'where' and 'presence'.
        :param prior_rnn_hidden_state: hidden state of the prior RNN.
        :return: `(prior_stats, prior_rnn_hidden_state)`, where `prior_stats`
            is the tuple `(where_loc, where_scale, what_loc, what_scale,
            prop_prob_logit)` accepted by `make_distribs`, and the second
            entry is the updated RNN state.
        """
        # latent variables from the step at time = t - 1
        what_tm1, where_tm1, presence_tm1 = z_tm1[:3]

        # the RNN input is the concatenation of latent 'what' and 'where'
        prior_rnn_inpt = tf.concat((what_tm1, where_tm1), -1)
        rnn = snt.BatchApply(self._cell)

        # run the RNN; its outputs go through a linear layer to give the
        # parameters of the propagation prior distributions
        outputs, prior_rnn_hidden_state = rnn(prior_rnn_inpt,
                                              prior_rnn_hidden_state)

        # loc and scale for 'where' (4) and 'what' (n_what), plus 1 logit
        n_outputs = 2 * (4 + self._n_what) + 1
        stats = snt.BatchApply(snt.Linear(n_outputs))(outputs)

        # first entry: logit of the Bernoulli prior over 'presence';
        # remaining entries: Gaussian parameters of 'where' and 'what'
        prop_prob_logit, stats = tf.split(stats, [1, n_outputs - 1], -1)
        prop_prob_logit += self._prop_logit_bias

        # Mask by the previous presence: where presence_tm1 == 1 the logit is
        # kept, where presence_tm1 == 0 it becomes -88, i.e. a propagation
        # probability of practically zero.
        prop_prob_logit = presence_tm1 * prop_prob_logit \
            + (presence_tm1 - 1.) * 88.

        # split into means (locs) and standard deviations (scales) of the
        # factorized Gaussians, then into their 'where' and 'what' parts
        locs, scales = tf.split(stats, 2, -1)
        prior_where_loc, prior_what_loc = tf.split(locs, [4, self._n_what], -1)
        prior_where_scale, prior_what_scale = tf.split(
            scales, [4, self._n_what], -1)

        # make sure the standard deviations are positive and bounded away
        # from zero
        prior_where_scale, prior_what_scale = (
            tf.nn.softplus(i) + 1e-2
            for i in (prior_where_scale, prior_what_scale))

        # optional bias for the mean of the 'where' latent
        if self._where_loc_bias is not None:
            bias = np.asarray(self._where_loc_bias).reshape((1, 4))
            prior_where_loc += bias

        prior_stats = (prior_where_loc, prior_where_scale,
                       prior_what_loc, prior_what_scale, prop_prob_logit)
        return prior_stats, prior_rnn_hidden_state

    def initial_state(self, batch_size, trainable=True, initializer=None):
        """Creates the initial state of the prior RNN.

        :param batch_size: forwarded to the cell's `initial_state`.
        :param trainable: forwarded to the cell's `initial_state`.
        :param initializer: optional initializer; a single (non-sequence)
            initializer is replicated to match the structure of the cell's
            state size.
        :return: the initial state returned by the cell.
        """
        # `collections.Sequence` no longer exists on Python >= 3.10.
        try:
            from collections.abc import Sequence
        except ImportError:  # Python 2
            from collections import Sequence

        if initializer is not None and not isinstance(initializer, Sequence):
            state_size = self._cell.state_size
            flat_state_size = nest.flatten(state_size)
            initializer = [initializer] * len(flat_state_size)
            initializer = nest.pack_sequence_as(state_size, initializer)

        init_state = self._cell.initial_state(
            batch_size, tf.float32, trainable=trainable,
            trainable_initializers=initializer)
        return init_state

    def make_distribs(self, prior_stats):
        """Converts parameters returned by `_build` into probability distributions.

        :param prior_stats: tuple `(prior_where_loc, prior_where_scale,
            prior_what_loc, prior_what_scale, prop_prob_logit)`.
        :return: `(what_prior, where_prior, prop_prior)`.
        """
        # Unpacked in the body: tuple parameters in the signature are
        # Python-2-only syntax.
        (prior_where_loc, prior_where_scale,
         prior_what_loc, prior_what_scale, prop_prob_logit) = prior_stats

        what_prior = Normal(prior_what_loc, prior_what_scale)
        where_prior = Normal(prior_where_loc, prior_where_scale)
        prop_prior = Bernoulli(logits=tf.squeeze(prop_prob_logit, -1))
        return what_prior, where_prior, prop_prior
def _build_ad_nn(self, tensor_io):
    """Build the recurrent critic and the Gaussian actor for this model.

    The critic runs the state through an LSTM and two `fc_selu` layers and
    predicts a scalar value. The actor starts from the (gradient-stopped)
    critic features and outputs means and standard deviations for the
    steering and acceleration actions, from which the policy is sampled.

    When not training, the output tensors are registered on `tensor_io` and
    None is returned. When training, the policy and value losses are built.

    :param tensor_io: `TensorIO` instance providing the input tensors and
        receiving the output tensors.
    :return: `(loss_policy, loss_value)` when training, otherwise None.
    """
    from drlutils.dataflow.tensor_io import TensorIO
    assert (isinstance(tensor_io, TensorIO))
    from drlutils.model.base import get_current_nn_context
    from tensorpack.tfutils.common import get_global_step_var
    global_step = get_global_step_var()
    nn_ctx = get_current_nn_context()
    is_training = nn_ctx.is_training

    state_in = tensor_io.getInputTensor('state')
    agent_ident_in = tensor_io.getInputTensor('agentIdent')
    seq_len_in = tensor_io.getInputTensor('sequenceLength')
    reset_rnn_in = tensor_io.getInputTensor('resetRNN')

    def _merge_leading_axes(tensor):
        # Fold the first two axes into one, keeping the static trailing shape.
        return tf.reshape(tensor, [-1] + tensor.get_shape().as_list()[2:])

    with tf.variable_scope('critic', reuse=nn_ctx.reuse) as vs:
        def _get_cell():
            return tf.nn.rnn_cell.BasicLSTMCell(256)

        cell = tf.nn.rnn_cell.MultiRNNCell([_get_cell()])
        rnn_outputs = self._buildRNN(
            state_in,
            cell,
            tensor_io.batchSize,
            i_agentIdent=agent_ident_in,
            i_sequenceLength=seq_len_in,
            i_resetRNN=reset_rnn_in,
        )
        feature_dim = rnn_outputs.get_shape().as_list()[-1]
        features = tf.reshape(rnn_outputs, [-1, feature_dim])

        from ad_cur.autodrive.model.selu import fc_selu
        for layer_idx in range(2):
            features = fc_selu(
                features,
                200,
                # Training uses sensor data only, so no key information may
                # be dropped.
                keep_prob=1.,
                is_training=is_training,
                name='fc-{}'.format(layer_idx))

        value = tf.layers.dense(features, 1, name='fc-value')
        value = tf.squeeze(value, [1], name="value")
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    with tf.variable_scope('actor', reuse=nn_ctx.reuse) as vs:
        # The actor does not back-propagate into the critic features.
        hidden = tf.stop_gradient(features)
        hidden = tf.layers.dense(hidden, 128, activation=tf.nn.relu6,
                                 name='fc-actor')

        mu_steering = 0.5 * tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)

        # NOTE: this helper is not called anywhere in this function.
        def saturating_sigmoid(x):
            """Saturating sigmoid: 1.2 * sigmoid(x) - 0.1 cut to [0, 1]."""
            with tf.name_scope("saturating_sigmoid", [x]):
                y = tf.sigmoid(x)
                return tf.minimum(1.0, tf.maximum(0.0, 1.2 * y - 0.1))

        sigma_steering_ = 0.1 * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 0.25 * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

        # Author's note: without sigma_beta, convergence is slow and
        # unstable. Guessed reasons: (1) as much exploration as possible
        # early in training keeps the network out of local optima; (2) a too
        # small sigma early on makes the normal log_prob too large, which
        # causes oversized gradient updates the network hardly recovers from.
        if nn_ctx.is_evaluating:
            sigma_beta_steering = tf.constant(1e-4)
            sigma_beta_accel = tf.constant(1e-4)
        else:
            sigma_beta_steering = tf.get_default_graph(
            ).get_tensor_by_name('actor/sigma_beta_steering:0')
            sigma_beta_accel = tf.get_default_graph().get_tensor_by_name(
                'actor/sigma_beta_accel:0')
            # The steering tensor looked up above is immediately replaced by
            # a constant; only the acceleration one is used.
            sigma_beta_steering = tf.constant(1e-4)

        sigma_steering = (sigma_steering_ + sigma_beta_steering)
        sigma_accel = (sigma_accel_ + sigma_beta_accel)
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)

        from tensorflow.contrib.distributions import Normal
        dists = Normal(mus, sigmas + 0.01)
        policy = tf.squeeze(dists.sample([1]), [0])
        # Clip the sampled action to within two sigmas of the mean.
        lower_bound = mus - 2 * sigmas
        upper_bound = mus + 2 * sigmas
        policy = tf.clip_by_value(policy, lower_bound, upper_bound)

        if is_training:
            self._addMovingSummary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
            )
        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    if not is_training:
        tensor_io.setOutputTensors(policy, value, mus, sigmas)
        return

    # --- policy loss: log-likelihood weighted by advantage, plus entropy ---
    i_actions = _merge_leading_axes(tensor_io.getInputTensor("action"))
    log_probs = dists.log_prob(i_actions)

    i_advantage = _merge_leading_axes(tensor_io.getInputTensor("advantage"))
    exp_v = log_probs * tf.expand_dims(i_advantage, -1)

    entropy = dists.entropy()
    entropy_beta = tf.get_variable(
        'entropy_beta',
        shape=[],
        initializer=tf.constant_initializer(0.01),
        trainable=False)
    exp_v = entropy_beta * entropy + exp_v
    loss_policy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1),
                                 name='loss/policy')

    # --- value loss: squared error to the future reward, plus L2 ---
    i_futurereward = _merge_leading_axes(
        tensor_io.getInputTensor("futurereward"))
    loss_value = tf.reduce_mean(0.5 * tf.square(value - i_futurereward))

    loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1),
                                  name='xentropy_loss')

    from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer
    loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4),
                                               self._weights_critic)
    loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg')
    loss_value += loss_l2_regularizer
    loss_value = tf.identity(loss_value, name='loss/value')

    # --- summaries ---
    self._addParamSummary([('.*', ['rms', 'absmax'])])
    pred_reward = tf.reduce_mean(value, name='predict_reward')
    import tensorpack.tfutils.symbolic_functions as symbf
    advantage = symbf.rms(i_advantage, name='rms_advantage')
    self._addMovingSummary(
        loss_policy,
        loss_value,
        loss_entropy,
        pred_reward,
        advantage,
        loss_l2_regularizer,
        tf.reduce_mean(policy[:, 0], name='actor/steering/mean'),
        tf.reduce_mean(policy[:, 1], name='actor/accel/mean'),
    )
    return loss_policy, loss_value
def gaussian_layer(x, in_dim, out_dim, scope, activation_fn=tf.nn.relu,
                   reuse=False, use_mean=False, store=False, use_stored=False,
                   prior_stddev=1.0, l2_const=0.0):
    """Single fully-connected layer whose weights have a Gaussian posterior.

    The weights follow a factorized Gaussian approximate posterior with
    trainable mean and standard deviation; the prior is a zero-mean Gaussian
    with standard deviation `prior_stddev`. The bias is a plain variable.

    Args:
      x: batch of input
      in_dim: input dimension
      out_dim: output dimension
      scope: tensorflow variable scope name
      activation_fn: activation function
      reuse: passed to `tf.variable_scope`
      use_mean: use the mean of approximate posterior, instead of sampling
      store: if True, the sampled weights are saved to the non-trainable
        variable `w_last` before being used
      use_stored: if True (and `store` is False), use the weights saved in
        `w_last` instead of a fresh sample
      prior_stddev: standard deviation of the zero-mean Gaussian prior
      l2_const: not used by this function

    Returns:
      output and kl of the weights for the layer. The kl is the closed form
      KL divergence between the weight posterior and the prior; it is `0.0`
      when `use_mean` is set.
    """
    prior_var = prior_stddev**2

    with tf.variable_scope(scope, reuse=reuse):
        w_mean = tf.get_variable('w_mean', shape=[in_dim, out_dim],
                                 initializer=xi())
        w_row = tf.get_variable('w_row', shape=[in_dim, out_dim],
                                initializer=ni(-3.0, 0.1))
        # softplus keeps the std-dev positive, eps keeps it away from zero
        w_stddev = tf.nn.softplus(w_row, name='w_std') + eps

        # Standard normal noise, parameterized by tensors instead of Python
        # lists holding in_dim * out_dim floats.
        w_dist = Normal(tf.zeros([in_dim, out_dim]),
                        tf.ones([in_dim, out_dim]))
        w_std_sample = tf.reshape(w_dist.sample(), [in_dim, out_dim],
                                  name='w_std_sample')

        # reparametrization: sample = mean + noise * std-dev
        w_sample = w_mean + w_std_sample * w_stddev

        b = tf.get_variable('b', shape=[out_dim], dtype=tf.float32,
                            initializer=xi())

        # to store the previous theta value
        w_last = tf.get_variable('w_last',
                                 initializer=tf.zeros([in_dim, out_dim]),
                                 trainable=False)

        if use_mean:
            out = activation_fn(tf.matmul(x, w_mean) + b, name='activation')
            return out, 0.0

        if store:
            store_op = tf.assign(w_last, w_sample)
            # make sure the sample is stored whenever the output is computed
            with tf.control_dependencies([store_op]):
                out = activation_fn(tf.matmul(x, w_sample) + b,
                                    name='activation')
        elif use_stored:
            out = activation_fn(tf.matmul(x, w_last) + b, name='activation')
        else:
            out = activation_fn(tf.matmul(x, w_sample) + b, name='activation')

        # KL( N(w_mean, w_stddev^2) || N(0, prior_var) ), summed over the
        # D weights. `w_stddev` already includes eps, so it is not added
        # again inside the log.
        D = in_dim * out_dim
        kl = tf.log(prior_stddev) * D - \
            tf.reduce_sum(tf.log(w_stddev)) + \
            0.5*(-D + (tf.reduce_sum(w_stddev**2) +
                       tf.reduce_sum(w_mean**2)) / prior_var)
        return out, kl
def __call__(self, session, trainX, trainY, testX, testY):
    """ Initialize the actual graph

    Builds the multinomial regression graph (prior, likelihood on a random
    minibatch, hold-out error, loss and training op), the summaries, and
    initializes all global variables in `session`.

    Parameters
    ----------
    session : tf.Session
        Tensorflow session
    trainX : np.array
        Input training design matrix.
    trainY : np.array
        Output training OTU table, where rows are samples and columns are
        observations.
    testX : np.array
        Input testing design matrix.
    testY : np.array
        Output testing OTU table, where rows are samples and columns are
        observations.
    """
    self.session = session
    self.N, self.p = trainX.shape
    self.D = trainY.shape[1]
    holdout_size = testX.shape[0]

    # The data is embedded in the graph as constants
    self.X_ph = tf.constant(trainX, dtype=tf.float32, name='G_ph')
    self.Y_ph = tf.constant(trainY, dtype=tf.float32, name='Y_ph')
    self.X_holdout = tf.constant(testX, dtype=tf.float32, name='G_holdout')
    self.Y_holdout = tf.constant(testY, dtype=tf.float32, name='Y_holdout')

    # Draw `batch_size` row indices uniformly at random (equal logits).
    batch_ids = tf.multinomial(tf.ones([1, self.N]), self.batch_size)
    # Only drop the leading axis, so a batch of size 1 stays a vector.
    sample_ids = tf.squeeze(batch_ids, axis=0)

    Y_batch = tf.gather(self.Y_ph, sample_ids, axis=0)
    X_batch = tf.gather(self.X_ph, sample_ids, axis=0)

    total_count = tf.reduce_sum(Y_batch, axis=1)
    holdout_count = tf.reduce_sum(self.Y_holdout, axis=1)

    # Point estimate of the regression coefficients
    self.qbeta = tf.Variable(tf.random_normal([self.p, self.D - 1]), name='qB')

    # prior distribution of the regression coefficients
    beta = Normal(loc=tf.zeros([self.p, self.D - 1]) + self.beta_mean,
                  scale=tf.ones([self.p, self.D - 1]) * self.beta_scale,
                  name='B')

    eta = tf.matmul(X_batch, self.qbeta, name='eta')

    # a zero column is prepended, so the first category is the reference
    phi = tf.nn.log_softmax(
        tf.concat([tf.zeros([self.batch_size, 1]), eta], axis=1), name='phi')

    Y = Multinomial(total_count=total_count, logits=phi, name='Y')

    # cross validation
    with tf.name_scope('accuracy'):
        pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
            tf.concat([
                tf.zeros([holdout_size, 1]),
                tf.matmul(self.X_holdout, self.qbeta)
            ], axis=1), name='phi')

        self.cv = tf.reduce_mean(tf.squeeze(tf.abs(pred - self.Y_holdout)))
        tf.summary.scalar('mean_absolute_error', self.cv)

    # Negative log joint. The minibatch likelihood is rescaled to the size of
    # the full data set; `float` guards against integer division.
    self.loss = -(tf.reduce_sum(beta.log_prob(self.qbeta)) +
                  tf.reduce_sum(Y.log_prob(Y_batch)) *
                  (float(self.N) / self.batch_size))

    optimizer = tf.train.AdamOptimizer(self.learning_rate,
                                       beta1=self.beta_1,
                                       beta2=self.beta_2)

    gradients, variables = zip(*optimizer.compute_gradients(self.loss))
    self.gradients, _ = tf.clip_by_global_norm(gradients, self.clipnorm)
    # Apply the clipped gradients; applying the raw ones would make
    # `clipnorm` have no effect.
    self.train = optimizer.apply_gradients(zip(self.gradients, variables))

    tf.summary.scalar('loss', self.loss)
    tf.summary.histogram('qbeta', self.qbeta)
    self.merged = tf.summary.merge_all()
    if self.save_path is not None:
        self.writer = tf.summary.FileWriter(self.save_path,
                                            self.session.graph)
    else:
        self.writer = None

    # Run in the given session rather than relying on a default session.
    self.session.run(tf.global_variables_initializer())
def interpolate_gaussian(coords, inputs, dim, wrap=False, kernel_size=None,
                         kernel_step=None, stddev=2.0):
    """
    interpolate_gaussian - samples with coords from inputs, interpolating the
        results via a differentiable gaussian kernel.

    :param coords shape: (N, dim, width, height, ...)
    :param inputs shape: (N, width, height, .. n_chan)
    :param dim - dimensionality of the data, e.g. 2 if inputs is a batch of
        images
    :param wrap - whether to wrap, or otherwise clip during the
        interpolation; only wrapping is implemented
    :param kernel_size - if given, the gaussian is evaluated on a window of
        this extent in each dimension; otherwise on the full extent of
        the inputs
    :param kernel_step - distance between neighbouring kernel positions;
        reset to 1 unless `kernel_size` is given as well
    :param stddev - standard deviation of the gaussian kernel
    :returns - the sampled result
    :shape (N, width, height, ..., n_chan), where width, height, ... come
        from the coords shape
    :raises NotImplementedError - if `wrap` is false or the Keras backend is
        not tensorflow
    """
    if not wrap:
        raise NotImplementedError(
            "Clipping is not supported for the gaussian kernel yet")
    if K.backend() != "tensorflow":
        raise NotImplementedError(
            "Theano backend is currently not supported for the gaussian kernel")

    inputs_shape = K.shape(inputs)
    inputs_shape_list = [inputs_shape[i] for i in range(dim + 2)]
    coords_shape = K.shape(coords)
    coords_shape_list = [coords_shape[i] for i in range(dim + 2)]
    inputs_dims = inputs_shape_list[1:-1]
    maxes = K.cast(inputs_shape[1:-1] - 1, "float32")
    coords_float = upscale(coords, maxes, dim)

    import tensorflow as tf
    from tensorflow.contrib.distributions import Normal

    if not kernel_step or not kernel_size:
        kernel_step = 1

    # Number of kernel positions and extent of the kernel per dimension.
    if kernel_size:
        # ceil(kernel_size / kernel_step)
        m = kernel_size // kernel_step + \
            (1 if kernel_size % kernel_step != 0 else 0)
        kernel_dims = [m] * dim
        kernel_limits = [kernel_size] * dim
    else:
        kernel_dims = inputs_dims
        kernel_limits = inputs_dims

    n_coord_axes = len(coords_shape_list)

    # tile the float coords, extending them for the application of the
    # gaussian aggregation later
    extended_coords = tf.reshape(coords_float, coords_shape_list + [1] * dim)
    extended_coords = tf.tile(extended_coords,
                              [1] * n_coord_axes + kernel_dims)

    # center a gaussian at each of the unstandardized transformed coordinates
    # shape: (N, dim, width, height, ..., img_width, img_height, ...)
    coord_gaussians = Normal(loc=extended_coords, scale=stddev)

    for i in range(dim):
        # create ranges for each of the dimensions to "spread" the coords
        # across the image, centered around zero
        limit = kernel_limits[i]
        range_offset = tf.cast(
            tf.range(start=0, limit=limit, delta=kernel_step), "float32")
        # cast first: `limit` may be an int32 tensor
        range_offset -= (tf.cast(limit, "float32") - 1.0) / 2.0

        # reshape so that the offset is broadcasted in all dimensions but the
        # one for the current dimension
        # shape: (1, 1, 1, 1, ..., img_width, img_height, ...)
        broadcast_shape = [1] * n_coord_axes + i * [1] + \
            [kernel_dims[i]] + (dim - i - 1) * [1]
        range_offset = tf.reshape(range_offset, broadcast_shape)

        # concatenate zeros for the rest of the dimensions, so that only the
        # i-th coordinate channel is shifted. There are dim - 1 zero pads:
        # i of them go before the offset and the remaining ones after it.
        zero_pads = [tf.zeros_like(range_offset) for _ in range(dim - 1)]
        range_offset = tf.concat(
            zero_pads[:i] + [range_offset] + zero_pads[i:], axis=1)

        extended_coords += range_offset

    # now round and then sample
    sampling_coords = tf.floor(extended_coords)
    # double the dim as those coords are extended
    samples = sample(inputs, sampling_coords, dim=dim * 2, wrapped=True)

    # since the gaussians are isotropic, I have to reduce a product along the
    # dim-dimension first
    # TODO: this needs to be the meshgrid with image size, and not the scaled
    # up coords
    coord_gaussian_pdfs = coord_gaussians.prob(extended_coords)
    coord_gaussian_pdfs = tf.reduce_prod(coord_gaussian_pdfs, axis=1)
    # expand one broadcastable dimension for the image channels
    coord_gaussian_pdfs = tf.expand_dims(coord_gaussian_pdfs, -1)
    samples = samples * coord_gaussian_pdfs

    # normalize the samples so that the weighting does not change the pixel
    # intensities
    reduction_indices = list(range(dim + 1, 2 * dim + 1))
    norm_coeff = tf.reduce_sum(coord_gaussian_pdfs, keep_dims=True,
                               reduction_indices=reduction_indices)
    samples /= norm_coeff

    # reduce_sum along the img_width, img_height, ... etc. axes
    samples = tf.reduce_sum(samples, reduction_indices=reduction_indices)
    return samples
model_log_gamma = tf.Variable(tf.zeros([])) model_lambda = tf.exp(model_log_lambda) model_gamma = tf.exp(model_log_gamma) model_w_1 = tf.Variable(tf.zeros([n_feats, n_hidden])) model_b_1 = tf.Variable(tf.zeros([n_hidden])) model_w_2 = tf.Variable(tf.zeros([n_hidden, 1])) model_b_2 = tf.Variable(tf.zeros([])) # Compute the prediction from the network. with tf.variable_scope("prediction"): pred = tf.matmul( tf.nn.relu(tf.matmul(model_X, model_w_1) + model_b_1), model_w_2 ) + model_b_2 # Likelihood function. with tf.variable_scope("likelihood"): log_l_dist = Normal(pred, tf.reciprocal(tf.sqrt(model_gamma))) log_l = tf.reduce_sum(log_l_dist.log_prob(model_y)) # Priors. with tf.variable_scope("priors"): prior_lambda = Gamma(alpha, beta) prior_gamma = Gamma(alpha, beta) prior_w_1 = Normal( tf.zeros([n_feats, n_hidden]), tf.reciprocal(tf.sqrt(model_lambda)) ) prior_b_1 = Normal( tf.zeros([n_hidden]), tf.reciprocal(tf.sqrt(model_lambda)) ) prior_w_2 = Normal( tf.zeros([n_hidden, 1]),
def __init__(self, args, d, logdir):
    """Build the graph of the amortized Bernoulli embedding model.

    Creates the shared embedding variables (`alpha`, `rho`), the per-state
    network parameters `phi`, one non-trainable `<state>_rho` variable per
    state, the log-prior, the positive/negative Bernoulli log-likelihoods
    and finally `self.loss`.

    :param args: forwarded unchanged to the parent constructor.
    :param d: data object; this method reads `d.states`, `d.T` and
        `d.unigram`.
    :param logdir: forwarded unchanged to the parent constructor.

    Attributes such as `self.K`, `self.H0`, `self.cs`, `self.ns`,
    `self.sig`, `self.n_minibatch`, `self.n_epochs` and `self.states` are
    read but not set here (presumably set by the parent constructor --
    TODO confirm).
    """
    super(amortized_bern_emb_model, self).__init__(args, d, logdir)

    # Number of context words on each side of the target. Floor division
    # keeps this an int on Python 3 (plain `/` would yield a float there,
    # which is not a valid tile multiple / index offset).
    half_cs = self.cs // 2

    with tf.name_scope('model'):
        with tf.name_scope('embeddings'):
            self.alpha = tf.Variable(self.alpha_init, name='alpha',
                                     trainable=self.alpha_trainable)
            self.rho = tf.Variable(self.rho_init, name='rho',
                                   trainable=self.rho_trainable)

            # phi is drawn uniformly from [-trunc, trunc].
            trunc = np.sqrt(6) / np.sqrt(self.K + self.H0)
            phi_init = np.random.uniform(
                -trunc, trunc,
                [self.n_states, 2 * self.K * self.H0]).astype('float32')
            self.phi = tf.Variable(phi_init, name='phi')

            # One non-trainable copy of the state-specific embeddings per
            # state; filled in by running `self.assign_ops`.
            self.geo_rho = {}
            for t, state in enumerate(d.states):
                self.geo_rho[state] = tf.Variable(
                    tf.random_normal(self.rho_init.shape),
                    trainable=False, name=state + '_rho')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            if self.alpha_trainable:
                # Each term is reduced to a scalar on its own before being
                # added. (Previously the alpha and phi sums were added
                # inside the reduce_sum over rho, so they were broadcast
                # and counted once per element of rho.)
                self.log_prior = (
                    tf.reduce_sum(prior.log_prob(self.rho))
                    + tf.reduce_sum(prior.log_prob(self.alpha))
                    + tf.reduce_sum(prior.log_prob(self.phi)))
            else:
                self.log_prior = (
                    tf.reduce_sum(prior.log_prob(self.rho))
                    + tf.reduce_sum(prior.log_prob(self.phi)))

            # Tighter prior (scale sig/100) on the difference between the
            # shared rho and each state's network output.
            local_prior = Normal(loc=0.0, scale=self.sig / 100.0)
            for t, state in enumerate(d.states):
                self.log_prior += tf.reduce_sum(local_prior.log_prob(
                    self.rho - neural_network(self.rho, self.phi, self.K,
                                              t, self.H0, self.resnet)))

        # Ops that copy each state's network output into its geo_rho.
        self.assign_ops = d.T * [0]
        for t, state in enumerate(d.states):
            self.assign_ops[t] = self.geo_rho[state].assign(
                neural_network(self.rho, self.phi, self.K, t, self.H0,
                               self.resnet))

        with tf.name_scope('likelihood'):
            self.placeholders = {}
            self.y_pos = {}
            self.y_neg = {}
            self.ll_pos = 0.0
            self.ll_neg = 0.0
            for t, state in enumerate(self.states):
                # Index masks: positions of the targets and, for each
                # target, of the half_cs words before and after it.
                p_mask = tf.range(half_cs, self.n_minibatch[t] + half_cs)
                rows = tf.tile(
                    tf.expand_dims(tf.range(0, half_cs), [0]),
                    [self.n_minibatch[t], 1])
                columns = tf.tile(
                    tf.expand_dims(tf.range(0, self.n_minibatch[t]), [1]),
                    [1, half_cs])
                ctx_mask = tf.concat(
                    [rows + columns, rows + columns + half_cs + 1], 1)

                # Data placeholder
                self.placeholders[state] = tf.placeholder(
                    tf.int32, shape=(self.n_minibatch[t] + self.cs))

                # Target and context indices
                p_idx = tf.gather(self.placeholders[state], p_mask)
                ctx_idx = tf.squeeze(
                    tf.gather(self.placeholders[state], ctx_mask))

                # Negative samples drawn from the unigram distribution
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [self.n_minibatch[t], 1])
                n_idx = tf.multinomial(unigram_logits, self.ns)

                # Context vectors
                ctx_alphas = tf.gather(self.alpha, ctx_idx)

                # TODO it would make more sense to gather first and
                # modulate then!
                rho_state = neural_network(self.rho, self.phi, self.K, t,
                                           self.H0, self.resnet)
                p_rho = tf.squeeze(tf.gather(rho_state, p_idx))
                n_rho = tf.gather(rho_state, n_idx)

                # Natural parameter
                ctx_sum = tf.reduce_sum(ctx_alphas, [1])
                p_eta = tf.expand_dims(
                    tf.reduce_sum(tf.multiply(p_rho, ctx_sum), -1), 1)
                n_eta = tf.reduce_sum(
                    tf.multiply(
                        n_rho,
                        tf.tile(tf.expand_dims(ctx_sum, 1),
                                [1, self.ns, 1])),
                    -1)

                # Conditional likelihood
                self.y_pos[state] = Bernoulli(logits=p_eta)
                self.y_neg[state] = Bernoulli(logits=n_eta)

                self.ll_pos += tf.reduce_sum(
                    self.y_pos[state].log_prob(1.0))
                self.ll_neg += tf.reduce_sum(
                    self.y_neg[state].log_prob(0.0))

        self.loss = -(self.n_epochs * (self.ll_pos + self.ll_neg)
                      + self.log_prior)

    self.init_eval_model()
class PropagatePrior(snt.AbstractModule):
    """Flexible RNN prior for propagation.

    This implementation treats all objects as independent.
    """

    def __init__(self, n_what, cell, prop_logit_bias, where_loc_bias=None):
        """Initialises the module.

        :param n_what: size of the "what" part of the predicted statistics;
            used to size and split the output of the linear layer.
        :param cell: recurrent core applied through `snt.BatchApply`; it
            must also expose `state_size` and `initial_state`.
        :param prop_logit_bias: value added to the propagation logit.
        :param where_loc_bias: optional 4 values added to the predicted
            "where" location; nothing is added when None.
        """
        super(PropagatePrior, self).__init__()
        self._n_what = n_what
        self._cell = cell
        self._prop_logit_bias = prop_logit_bias
        self._where_loc_bias = where_loc_bias

    def _build(self, z_tm1, prior_rnn_hidden_state):
        """Applies the op.

        :param z_tm1: sequence whose first three entries are the previous
            "what", "where" and presence tensors.
        :param prior_rnn_hidden_state: hidden state fed to the cell.
        :return: `(prior_stats, new_hidden_state)` where `prior_stats` is
            the tuple `(where_loc, where_scale, what_loc, what_scale,
            prop_prob_logit)` accepted by `make_distribs`.
        """
        what_tm1, where_tm1, presence_tm1 = z_tm1[:3]

        prior_rnn_inpt = tf.concat((what_tm1, where_tm1), -1)
        rnn = snt.BatchApply(self._cell)
        outputs, prior_rnn_hidden_state = rnn(prior_rnn_inpt,
                                              prior_rnn_hidden_state)

        # One propagation logit, plus a loc and a scale for each of the
        # 4 "where" and n_what "what" dimensions.
        n_outputs = 2 * (4 + self._n_what) + 1
        stats = snt.BatchApply(snt.Linear(n_outputs))(outputs)

        prop_prob_logit, stats = tf.split(stats, [1, n_outputs - 1], -1)
        prop_prob_logit += self._prop_logit_bias
        # Where presence is 1 the logit is kept; where it is 0 the logit
        # becomes -88 (assumes presence is binary -- TODO confirm).
        prop_prob_logit = (presence_tm1 * prop_prob_logit
                           + (presence_tm1 - 1.) * 88.)

        locs, scales = tf.split(stats, 2, -1)
        prior_where_loc, prior_what_loc = tf.split(
            locs, [4, self._n_what], -1)
        prior_where_scale, prior_what_scale = tf.split(
            scales, [4, self._n_what], -1)
        # softplus + 1e-2 keeps both scales strictly positive.
        prior_where_scale, prior_what_scale = (
            tf.nn.softplus(i) + 1e-2
            for i in (prior_where_scale, prior_what_scale))

        if self._where_loc_bias is not None:
            bias = np.asarray(self._where_loc_bias).reshape((1, 4))
            prior_where_loc += bias

        prior_stats = (prior_where_loc, prior_where_scale, prior_what_loc,
                       prior_what_scale, prop_prob_logit)
        return prior_stats, prior_rnn_hidden_state

    def initial_state(self, batch_size, trainable=True, initializer=None):
        """Returns the initial hidden state of the wrapped cell.

        :param batch_size: forwarded to `cell.initial_state`.
        :param trainable: forwarded to `cell.initial_state`.
        :param initializer: None, a sequence of initializers, or a single
            initializer; a single one is replicated for every entry of the
            cell's (flattened) `state_size`.
        :return: the value of `cell.initial_state(...)`.
        """
        # `collections.Sequence` was removed in Python 3.10; fall back to
        # it only on interpreters without `collections.abc` (Python 2).
        try:
            from collections.abc import Sequence
        except ImportError:
            from collections import Sequence

        if initializer is not None and not isinstance(initializer, Sequence):
            state_size = self._cell.state_size
            flat_state_size = nest.flatten(state_size)
            initializer = [initializer] * len(flat_state_size)
            initializer = nest.pack_sequence_as(state_size, initializer)

        init_state = self._cell.initial_state(
            batch_size, tf.float32, trainable=trainable,
            trainable_initializers=initializer)
        return init_state

    def make_distribs(self, prior_stats):
        """Converts parameters returned by `_build` into distributions.

        :param prior_stats: the 5-tuple `(prior_where_loc,
            prior_where_scale, prior_what_loc, prior_what_scale,
            prop_prob_logit)`. It is taken as one positional argument and
            unpacked here, because tuple parameters in the signature are a
            syntax error on Python 3.
        :return: `(what_prior, where_prior, prop_prior)`.
        """
        (prior_where_loc, prior_where_scale, prior_what_loc,
         prior_what_scale, prop_prob_logit) = prior_stats

        what_prior = Normal(prior_what_loc, prior_what_scale)
        where_prior = Normal(prior_where_loc, prior_where_scale)
        prop_prior = Bernoulli(logits=tf.squeeze(prop_prob_logit, -1))
        return what_prior, where_prior, prop_prior
distributions = { "gaussian": { "parameters": { "mu": { "support": [-inf, inf], "activation function": identity, "initial value": tf.zeros }, "log_sigma": { "support": [-3, 3], "activation function": identity, "initial value": tf.zeros } }, "class": lambda theta: Normal(loc=theta["mu"], scale=tf.exp(theta["log_sigma"])) }, "modified gaussian": { "parameters": { "mean": { "support": [-inf, inf], "activation function": identity, "initial value": tf.zeros }, "variance": { "support": [-3, 3], "activation function": softplus, "initial value": tf.ones } }, "class":
def __init__(self, d, K, sig, sess, logdir):
    """Build the Bernoulli embedding graph, its optimizer and summaries.

    :param d: data object; this method reads `d.cs`, `d.n_minibatch`,
        `d.L`, `d.unigram`, `d.ns` and `d.N`.
    :param K: number of columns of the `rho` and `alpha` matrices.
    :param sig: scale of the zero-mean Normal prior on `rho` and `alpha`.
    :param sess: TensorFlow session; used to run the variable initializer
        and to supply the graph for the summary writer.
    :param logdir: directory handed to the summary `FileWriter`.

    Side effects: initializes all global variables in `sess`, creates a
    `FileWriter` and a `Saver`, and registers `alpha` and `rho` with the
    embedding projector.
    """
    self.K = K
    self.sig = sig
    self.sess = sess
    self.logdir = logdir

    # Number of context words on each side of the target. Floor division
    # keeps this an int on Python 3; with `/` it becomes a float there,
    # which is rejected as a tile multiple and cannot be added to the
    # int32 index tensors below.
    half_cs = d.cs // 2

    with tf.name_scope('model'):
        # Data placeholder
        with tf.name_scope('input'):
            self.placeholders = tf.placeholder(tf.int32)
            self.words = self.placeholders

        # Index masks: positions of the targets and, for each target, of
        # the half_cs words before and after it.
        with tf.name_scope('context_mask'):
            self.p_mask = tf.cast(
                tf.range(half_cs, d.n_minibatch + half_cs), tf.int32)
            rows = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, half_cs), [0]),
                        [d.n_minibatch, 1]),
                tf.int32)
            columns = tf.cast(
                tf.tile(tf.expand_dims(tf.range(0, d.n_minibatch), [1]),
                        [1, half_cs]),
                tf.int32)
            self.ctx_mask = tf.concat(
                [rows + columns, rows + columns + half_cs + 1], 1)

        with tf.name_scope('embeddings'):
            # Embedding vectors
            self.rho = tf.Variable(
                tf.random_normal([d.L, self.K]) / self.K, name='rho')
            # Context vectors
            self.alpha = tf.Variable(
                tf.random_normal([d.L, self.K]) / self.K, name='alpha')

        with tf.name_scope('priors'):
            prior = Normal(loc=0.0, scale=self.sig)
            self.log_prior = tf.reduce_sum(
                prior.log_prob(self.rho) + prior.log_prob(self.alpha))

        with tf.name_scope('natural_param'):
            # Target and context indices
            with tf.name_scope('target_word'):
                self.p_idx = tf.gather(self.words, self.p_mask)
                self.p_rho = tf.squeeze(tf.gather(self.rho, self.p_idx))

            # Negative samples drawn from the unigram distribution
            with tf.name_scope('negative_samples'):
                unigram_logits = tf.tile(
                    tf.expand_dims(tf.log(tf.constant(d.unigram)), [0]),
                    [d.n_minibatch, 1])
                self.n_idx = tf.multinomial(unigram_logits, d.ns)
                self.n_rho = tf.gather(self.rho, self.n_idx)

            with tf.name_scope('context'):
                self.ctx_idx = tf.squeeze(
                    tf.gather(self.words, self.ctx_mask))
                self.ctx_alphas = tf.gather(self.alpha, self.ctx_idx)

            # Natural parameter
            ctx_sum = tf.reduce_sum(self.ctx_alphas, [1])
            self.p_eta = tf.expand_dims(
                tf.reduce_sum(tf.multiply(self.p_rho, ctx_sum), -1), 1)
            self.n_eta = tf.reduce_sum(
                tf.multiply(
                    self.n_rho,
                    tf.tile(tf.expand_dims(ctx_sum, 1), [1, d.ns, 1])),
                -1)

        # Conditional likelihood
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        self.ll_pos = tf.reduce_sum(self.y_pos.log_prob(1.0))
        self.ll_neg = tf.reduce_sum(self.y_neg.log_prob(0.0))

        self.log_likelihood = self.ll_pos + self.ll_neg

        # Rescale the minibatch likelihood by N / n_minibatch before
        # combining it with the prior.
        scale = 1.0 * d.N / d.n_minibatch
        self.loss = -(scale * self.log_likelihood + self.log_prior)

    # Training
    optimizer = tf.train.AdamOptimizer()
    self.train = optimizer.minimize(self.loss)
    with self.sess.as_default():
        tf.global_variables_initializer().run()

    variable_summaries('rho', self.rho)
    variable_summaries('alpha', self.alpha)
    with tf.name_scope('objective'):
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('priors', self.log_prior)
        tf.summary.scalar('ll_pos', self.ll_pos)
        tf.summary.scalar('ll_neg', self.ll_neg)
    self.summaries = tf.summary.merge_all()
    self.train_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
    self.saver = tf.train.Saver()

    # Register both embedding matrices with the TensorBoard projector.
    config = projector.ProjectorConfig()
    alpha = config.embeddings.add()
    alpha.tensor_name = 'model/embeddings/alpha'
    alpha.metadata_path = '../vocab.tsv'
    rho = config.embeddings.add()
    rho.tensor_name = 'model/embeddings/rho'
    rho.metadata_path = '../vocab.tsv'
    projector.visualize_embeddings(self.train_writer, config)
def _get_NN_prediction(self, state):
    """Build the critic and the Gaussian actor on top of `state`.

    The critic is a stack of eight 200-unit `fc_selu` layers followed by a
    single-unit dense value layer. The actor reads the critic's last hidden
    layer through `tf.stop_gradient` and outputs the mean and standard
    deviation of a 2-D Normal over (steering, acceleration).

    :param state: input tensor fed to the first `fc_selu` layer.
    :return: `(actions, value, dists)` -- one sample from the action
        distribution, the critic output, and the `Normal` distribution.

    Side effects: on the first call, stores the trainable variables of the
    two scopes in `self._weights_critic` / `self._weights_actor`; when the
    current tower context is training, registers moving summaries.
    """
    ctx = get_current_tower_context()
    is_training = ctx.is_training
    hidden = state

    with tf.variable_scope('critic') as vs:
        from autodrive.model.selu import fc_selu
        for lidx in range(8):
            # keep_prob=1.: we train on sensor input only, so no key
            # information may be dropped.
            hidden = fc_selu(hidden, 200,
                             keep_prob=1.,
                             is_training=is_training,
                             name='fc-{}'.format(lidx))
        value = tf.layers.dense(
            hidden, 1, name='fc-value',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    with tf.variable_scope('actor') as vs:
        # Actor gradients must not flow back into the critic's layers.
        hidden = tf.stop_gradient(hidden)

        # Means: steering in (-0.5, 0.5), acceleration in (-1, 1).
        mu_steering = 0.5 * tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-steering',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
        mu_accel = tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-accel',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
        mus = tf.concat([mu_steering, mu_accel], axis=-1)

        # Learned part of the standard deviations: steering in (0, 0.5),
        # acceleration in (0, 1).
        sigma_steering_ = 0.5 * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
        sigma_accel_ = 1. * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel',
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

        # Exponentially decaying offsets added to the learned sigmas.
        # Rationale (from the original author): without this offset,
        # convergence is slow and unstable, presumably because
        #   1. large exploration early in training keeps the network from
        #      getting stuck in a local optimum;
        #   2. a too-small sigma early on makes the Normal log_prob very
        #      large, causing oversized gradient updates that distort the
        #      network from the start and are hard to recover from.
        from tensorpack.tfutils.common import get_global_step_var
        global_step = get_global_step_var()
        sigma_beta_steering_exp = tf.train.exponential_decay(
            0.001, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
        sigma_beta_accel_exp = tf.train.exponential_decay(
            0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')

        sigma_steering = sigma_steering_ + sigma_beta_steering_exp
        sigma_accel = sigma_accel_ + sigma_beta_accel_exp
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)

        from tensorflow.contrib.distributions import Normal
        # The extra 1e-3 keeps the scale strictly positive.
        dists = Normal(mus, sigmas + 1e-3)
        # Draw one sample and drop the leading sample dimension.
        # NOTE: the sampled actions are not clipped here.
        actions = tf.squeeze(dists.sample([1]), [0])

        if is_training:
            summary.add_moving_summary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                sigma_beta_accel_exp,
                sigma_beta_steering_exp,
            )

        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    return actions, value, dists
def _get_NN_prediction(self, state):
    """Build the critic and the Gaussian actor on top of `state`.

    The critic is a stack of eight 200-unit `fc_selu` layers followed by a
    single-unit dense value layer. The actor reads the critic's last hidden
    layer through `tf.stop_gradient` and outputs the mean and standard
    deviation of a 2-D Normal over (steering, acceleration).

    :param state: input tensor fed to the first `fc_selu` layer.
    :return: `(actions, value, dists)` -- one sample from the action
        distribution, the critic output, and the `Normal` distribution.
    :raises AssertionError: if the current tower context name is neither
        empty nor prefixed with "tower".

    Side effects: on the first call, stores the trainable variables of the
    two scopes in `self._weights_critic` / `self._weights_actor`; when the
    current tower context is training, registers moving summaries.
    """
    from tensorpack.tfutils.common import get_global_step_var
    global_step = get_global_step_var()
    ctx = get_current_tower_context()
    is_training = ctx.is_training
    hidden = state

    with tf.variable_scope('critic') as vs:
        from ad_cur.autodrive.model.selu import fc_selu
        for lidx in range(8):
            # keep_prob=1.: we train on sensor input only, so no key
            # information may be dropped.
            hidden = fc_selu(
                hidden,
                200,
                keep_prob=1.,
                is_training=is_training,
                name='fc-{}'.format(lidx))
        value = tf.layers.dense(hidden, 1, name='fc-value')
        if not hasattr(self, '_weights_critic'):
            self._weights_critic = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    with tf.variable_scope('actor') as vs:
        # Actor gradients must not flow back into the critic's layers.
        hidden = tf.stop_gradient(hidden)

        # Means: steering in (-0.5, 0.5), acceleration in (-1, 1).
        mu_steering = 0.5 * tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-steering')
        mu_accel = tf.layers.dense(
            hidden, 1, activation=tf.nn.tanh, name='fc-mu-accel')
        mus = tf.concat([mu_steering, mu_accel], axis=-1)

        # Learned part of the standard deviations: steering in (0, 0.5),
        # acceleration in (0, 1).
        sigma_steering_ = 0.5 * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering')
        sigma_accel_ = 1. * tf.layers.dense(
            hidden, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel')

        # Offsets added to the learned sigmas. Rationale (from the original
        # author): without this offset, convergence is slow and unstable,
        # presumably because
        #   1. large exploration early in training keeps the network from
        #      getting stuck in a local optimum;
        #   2. a too-small sigma early on makes the Normal log_prob very
        #      large, causing oversized gradient updates that distort the
        #      network from the start and are hard to recover from.
        if ctx.name.startswith("tower"):
            # Exponentially decaying offsets (presumably the training
            # towers -- TODO confirm).
            sigma_beta_steering_exp = tf.train.exponential_decay(
                0.3, global_step, 1000, 0.5, name='sigma/beta/steering/exp')
            sigma_beta_accel_exp = tf.train.exponential_decay(
                0.5, global_step, 5000, 0.5, name='sigma/beta/accel/exp')
        elif ctx.name == '':
            # Unnamed context: small constant offsets.
            sigma_beta_steering_exp = 1e-4
            sigma_beta_accel_exp = 1e-4
        else:
            # Raised explicitly (rather than `assert`) so the check is not
            # stripped under `python -O`, which would leave the offsets
            # unbound below.
            raise AssertionError(
                'unexpected tower context name: {!r}'.format(ctx.name))

        sigma_steering = sigma_steering_ + sigma_beta_steering_exp
        sigma_accel = sigma_accel_ + sigma_beta_accel_exp
        sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1)

        from tensorflow.contrib.distributions import Normal
        # The extra 1e-3 keeps the scale strictly positive.
        dists = Normal(mus, sigmas + 1e-3)
        # Draw one sample and drop the leading sample dimension.
        # NOTE: the sampled actions are not clipped here.
        actions = tf.squeeze(dists.sample([1]), [0])

        if is_training:
            summary.add_moving_summary(
                tf.reduce_mean(mu_steering, name='mu/steering/mean'),
                tf.reduce_mean(mu_accel, name='mu/accel/mean'),
                tf.reduce_mean(sigma_steering, name='sigma/steering/mean'),
                tf.reduce_max(sigma_steering, name='sigma/steering/max'),
                tf.reduce_mean(sigma_accel, name='sigma/accel/mean'),
                tf.reduce_max(sigma_accel, name='sigma/accel/max'),
                sigma_beta_accel_exp,
                sigma_beta_steering_exp,
            )

        if not hasattr(self, '_weights_actor'):
            self._weights_actor = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

    return actions, value, dists