def context_infer(pooled_features):
    with tf.variable_scope("fc", reuse=True):
        weights = tf.stop_gradient(tf.get_variable("weights"))
        # b = tf.stop_gradient(tf.get_variable("biases"))

    z = tf.stop_gradient(pooled_features)  # Nx64
    z = tf.expand_dims(z, -1)  # Nx64x1
    w = weights  # 64x10
    w = tf.expand_dims(w, 0)  # 1x64x10
    mean, variance = tf.nn.moments(w, [1], keep_dims=True)  # 1x1x10
    response = tf.reduce_sum(tf.mul(z, w), 1, keep_dims=True)  # Nx1x10
    response_vec = tf.mul(response, w)  # Nx64x10
    response_vec = tf.div(response_vec, variance)  # Nx64x10
    h = tf.sub(z, response_vec)  # Nx64x10

    weights_initializer = tf.truncated_normal_initializer(
        stddev=FC_WEIGHT_STDDEV)
    with tf.variable_scope("context", reuse=True):
        context_weights = tf.stop_gradient(tf.get_variable("weights"))
        biases = tf.stop_gradient(tf.get_variable("biases"))

    context_weights = tf.expand_dims(context_weights, 0)
    biases = tf.expand_dims(biases, 0)
    scores = tf.reduce_sum(tf.mul(h, context_weights), 1) + biases
    # TODO how to deal with b?
    return scores
def get_dynamic_rebar_gradient(self): """Get the dynamic rebar gradient (t, eta optimized).""" tiled_pre_temperature = tf.tile([self.pre_temperature_variable], [self.batch_size]) temperature = tf.exp(tiled_pre_temperature) hardELBO, nvil_gradient, logQHard = self._create_hard_elbo() if self.hparams.quadratic: gumbel_cv, extra = self._create_gumbel_control_variate_quadratic(logQHard, temperature=temperature) else: gumbel_cv, extra = self._create_gumbel_control_variate(logQHard, temperature=temperature) f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient)) eta = {} h_grads, eta_statistics = self.multiply_by_eta_per_layer( self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)), eta) model_grads = U.add_grads_and_vars(f_grads, h_grads) total_grads = model_grads # Construct the variance objective g = U.vectorize(model_grads, set_none_to_zero=True) self.maintain_ema_ops.append(self.ema.apply([g])) gbar = 0 #tf.stop_gradient(self.ema.average(g)) variance_objective = tf.reduce_mean(tf.square(g - gbar)) reinf_g_t = 0 if self.hparams.quadratic: for layer in xrange(self.hparams.n_layer): gumbel_learning_signal, _ = extra[layer] df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0] reinf_g_t_i, _ = self.multiply_by_eta_per_layer( self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * logQHard[layer])), eta) reinf_g_t += U.vectorize(reinf_g_t_i, set_none_to_zero=True) reparam = tf.add_n([reparam_i for _, reparam_i in extra]) else: gumbel_learning_signal, reparam = extra df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0] reinf_g_t, _ = self.multiply_by_eta_per_layer( self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * tf.add_n(logQHard))), eta) reinf_g_t = U.vectorize(reinf_g_t, set_none_to_zero=True) reparam_g, _ = self.multiply_by_eta_per_layer( self.optimizer_class.compute_gradients(tf.reduce_mean(reparam)), eta) reparam_g = U.vectorize(reparam_g, set_none_to_zero=True) reparam_g_t = tf.gradients(tf.reduce_mean(2*tf.stop_gradient(g - gbar)*reparam_g), self.pre_temperature_variable)[0] variance_objective_grad = tf.reduce_mean(2*(g - gbar)*reinf_g_t) + reparam_g_t debug = { 'ELBO': hardELBO, 'etas': eta_statistics, 'variance_objective': variance_objective, } return total_grads, debug, variance_objective, variance_objective_grad
def build_loss(self):
    """Loss function to minimize, whose gradient is a stochastic
    gradient inspired by adaptive importance sampling.

    loss = E_{p(z | x)} [ log p(z | x) - log q(z; lambda) ]

    is equivalent to minimizing

    E_{p(z | x)} [ log p(x, z) - log q(z; lambda) ]
    \approx 1/B sum_{b=1}^B w_norm(z^b; lambda) (log p(x, z^b) - log q(z^b; lambda))

    with gradient

    \approx - 1/B sum_{b=1}^B w_norm(z^b; lambda) grad_{lambda} log q(z^b; lambda)

    where
    + z^b ~ q(z^b; lambda)
    + w_norm(z^b; lambda) = w(z^b; lambda) / sum_{b=1}^B w(z^b; lambda)
    + w(z^b; lambda) = p(x, z^b) / q(z^b; lambda)
    """
    x = self.data.sample(self.n_data)
    z, self.samples = self.variational.sample(self.n_minibatch)

    q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32)
    for i in range(self.variational.num_factors):
        q_log_prob += self.variational.log_prob_i(i, tf.stop_gradient(z))

    # normalized importance weights
    log_w = self.model.log_prob(x, z) - q_log_prob
    log_w_norm = log_w - log_sum_exp(log_w)
    w_norm = tf.exp(log_w_norm)

    self.loss = tf.reduce_mean(w_norm * log_w)
    return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
def _step(self, J, voltage, refractory, dt):
    delta_t = tf.clip_by_value(dt - refractory, self.zero, dt)

    dV = (voltage - J) * tf.expm1(-delta_t / self.tau_rc)
    voltage += dV

    spiked = voltage > self.one
    spikes = tf.cast(spiked, J.dtype) * self.alpha

    partial_ref = -self.tau_rc * tf.log1p((self.one - voltage) /
                                          (J - self.one))
    # FastLIF version (linearly approximate spike time when calculating
    # remaining refractory period)
    # partial_ref = signals.dt * (voltage - self.one) / dV

    refractory = tf.where(spiked, self.tau_ref - partial_ref,
                          refractory - dt)

    voltage = tf.where(spiked, self.zeros,
                       tf.maximum(voltage, self.min_voltage))

    # we use stop_gradient to avoid propagating any nans (those get
    # propagated through the cond even if the spiking version isn't
    # being used at all)
    return (tf.stop_gradient(spikes), tf.stop_gradient(voltage),
            tf.stop_gradient(refractory))
def get_next_input(output):
    # the next location is computed by the location network
    baseline = tf.sigmoid(tf.matmul(output, Wb_h_b) + Bb_h_b)
    baselines.append(baseline)

    # compute the next location, then impose noise
    if eyeCentered:
        # add the last sampled glimpse location
        # TODO max(-1, min(1, u + N(output, sigma) + prevLoc))
        mean_loc = tf.maximum(-1.0, tf.minimum(1.0,
            tf.matmul(output, Wl_h_l) + sampled_locs[-1]))
    else:
        mean_loc = tf.matmul(output, Wl_h_l)

    # mean_loc = tf.stop_gradient(mean_loc)
    mean_locs.append(mean_loc)
    mean_locs_stopGrad.append(tf.stop_gradient(mean_loc))

    # add noise
    # sample_loc = tf.tanh(mean_loc + tf.random_normal(mean_loc.get_shape(), 0, loc_sd))
    sample_loc = tf.maximum(-1.0, tf.minimum(1.0,
        mean_loc + tf.random_normal(mean_loc.get_shape(), 0, loc_sd)))

    # don't propagate through the locations
    # sample_loc = tf.stop_gradient(sample_loc)
    sampled_locs.append(sample_loc)
    sampled_locs_stopGrad.append(tf.stop_gradient(sample_loc))

    return get_glimpse(sample_loc)
def energy(self, visible_state, hidden_state, scope='energy'):
    with tf.variable_scope(scope):
        visible_state = tf.stop_gradient(visible_state, name="visible_state")
        hidden_state = tf.stop_gradient(hidden_state, name="hidden_state")

        energy = -tf.reduce_mean(
            tf.reduce_sum(
                tf.multiply(tf.matmul(visible_state, self.W, name='visible_weights'),
                            hidden_state, name='weights_hidden'),
                axis=1, name='energy_sum'),
            name="batch_energy_mean")

        if self.visible.use_bias:
            if self.visible.binary:
                energy = tf.add(energy, -tf.reduce_mean(
                    tf.reduce_sum(
                        tf.multiply(self.visible.bias, visible_state,
                                    name='visible_bias_energy'),
                        axis=1)))
            else:
                v = visible_state - self.visible.bias
                energy = tf.add(energy,
                                tf.reduce_mean(tf.reduce_sum(tf.multiply(v, v) / 2, axis=1)))

        if self.hidden.use_bias:
            if self.hidden.binary:
                energy = tf.add(energy, -tf.reduce_mean(
                    tf.reduce_sum(
                        tf.multiply(self.hidden.bias, hidden_state,
                                    name='hidden_bias_energy'),
                        axis=1)))
            else:
                h = hidden_state - self.hidden.bias
                energy = tf.add(energy,
                                tf.reduce_mean(tf.reduce_sum(tf.multiply(h, h) / 2, axis=1)))

    return energy
def _create_gumbel_control_variate(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.'''
    if temperature is None:
        temperature = self.hparams.temperature

    logQ, softSamples = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft, temperature=temperature))
    softELBO, _ = self._generator_network(softSamples, logQ)
    logQ = tf.add_n(logQ)

    # Generate the softELBO_v (should be the same value but different grads)
    logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft_v, temperature=temperature))
    softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
    logQ_v = tf.add_n(logQ_v)

    # Compute losses
    learning_signal = tf.stop_gradient(softELBO_v)

    # Control variate
    h = (tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
         - softELBO + softELBO_v)

    extra = (softELBO_v, -softELBO + softELBO_v)

    return h, extra
def latent_prediction_model(inputs, ed_attention_bias, latents_discrete, latents_dense, hparams, name="latent_prediction"): """Transformer-based latent prediction model. It is an autoregressive decoder over latents_discrete given inputs. Args: inputs: Tensor of shape [batch, length_kv, hparams.hidden_size]. Inputs to attend to for the decoder on latents. ed_attention_bias: Tensor which broadcasts with shape [batch, hparams.num_heads, length_q, length_kv]. Encoder-decoder attention bias. latents_discrete: Tensor of shape [batch, length_q, vocab_size]. One-hot latents to compute log-probability of given inputs. latents_dense: Tensor of shape [batch, length_q, hparams.hidden_size]. hparams: tf.contrib.training.HParams. name: string, variable scope. Returns: latents_pred: Tensor of shape [batch, length_q, hparams.hidden_size]. latents_pred_loss: Tensor of shape [batch, length_q]. """ with tf.variable_scope(name): if hparams.mode != tf.estimator.ModeKeys.PREDICT: latents_pred = transformer_latent_decoder( tf.stop_gradient(latents_dense), inputs, ed_attention_bias, hparams, name) _, latent_pred_loss = ae_latent_softmax( latents_pred, tf.stop_gradient(latents_discrete), hparams) return latents_pred, latent_pred_loss
def rnn_decoder(cell, inputs, initial_state, embedding_size, embedding_length, sequence_length, name='RNNDecoder', reuse=False, use_inputs_prob=0.0, static_input=None): with tf.variable_scope(name, reuse=reuse): # print(tf.get_variable_scope().reuse, tf.get_variable_scope().name) with tf.name_scope("embedding"): batch_size = tf.shape(initial_state)[0] embedding_table = tf.get_variable( name='embedding_table', shape=[embedding_length, embedding_size], initializer=tf.truncated_normal_initializer(stddev=glorot_mul(embedding_length, embedding_size)), ) # 0 is index for _SOS_ (start of sentence symbol) initial_embedding = tf.gather(embedding_table, tf.zeros(tf.pack([batch_size]), tf.int32)) states = [initial_state] outputs = [] outputs_softmax = [] decoder_outputs_argmax_embedding = [] for j in range(sequence_length): with tf.variable_scope(tf.get_variable_scope(), reuse=True if j > 0 else None): # get input : # either feedback the previous decoder argmax output # or use the provided input (note that you have to use the previous input (index si therefore -1) input = initial_embedding if j > 0: true_input = tf.gather(embedding_table, inputs[j - 1]) decoded_input = decoder_outputs_argmax_embedding[-1] choice = tf.floor(tf.random_uniform([1], use_inputs_prob, 1 + use_inputs_prob, tf.float32)) input = choice * true_input + (1.0 - choice) * decoded_input if static_input: input = tf.concat(1, [input, static_input]) # print(tf.get_variable_scope().reuse, tf.get_variable_scope().name) output, state = cell(input, states[-1]) projection = linear( input=output, input_size=cell.output_size, output_size=embedding_length, name='output_linear_projection' ) outputs.append(projection) states.append(state) softmax = tf.nn.softmax(projection, name="output_softmax") # we do no compute the gradient trough argmax output_argmax = tf.stop_gradient(tf.argmax(softmax, 1)) # we do no compute the gradient for embeddings when used with noisy argmax outputs output_argmax_embedding = tf.stop_gradient(tf.gather(embedding_table, output_argmax)) decoder_outputs_argmax_embedding.append(output_argmax_embedding) outputs_softmax.append(tf.expand_dims(softmax, 1)) # remove the initial state states = states[1:] return states, outputs, outputs_softmax
def self_kl(self, logits, sampling_dim, act_dim, act_type):
    """Calculate KL of distribution with itself.

    Used layer only for the gradients.
    """
    if self.env_spec.is_discrete(act_type):
        probs = tf.nn.softmax(logits)
        log_probs = tf.nn.log_softmax(logits)
        self_kl = tf.reduce_sum(
            tf.stop_gradient(probs) *
            (tf.stop_gradient(log_probs) - log_probs), -1)
    elif self.env_spec.is_box(act_type):
        means = logits[:, :sampling_dim / 2]
        std = logits[:, sampling_dim / 2:]
        my_means = tf.stop_gradient(means)
        my_std = tf.stop_gradient(std)
        self_kl = tf.reduce_sum(
            tf.log(std / my_std) +
            (tf.square(my_std) + tf.square(my_means - means)) /
            (2.0 * tf.square(std)) - 0.5, -1)
    else:
        assert False

    return self_kl
def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
        prev = nn_ops.xw_plus_b(
            prev, output_projection[0], output_projection[1])
    # prev = prev.get_shape().with_rank(2)[1]

    probs = tf.log(tf.nn.softmax(prev))

    if i > 1:
        probs = tf.reshape(probs + log_beam_probs[-1],
                           [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols       # Which word in vocabulary.
    beam_parent = indices // num_symbols  # Which hypothesis it came from.

    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.
    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev = tf.reshape(emb_prev, [beam_size, embedding_size])
    # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding:
        emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
def virtual_adversarial_loss_bidir(logits, embedded, inputs, logits_from_embedding_fn): """Virtual adversarial loss for bidirectional models.""" logits = tf.stop_gradient(logits) f_inputs, _ = inputs weights = _end_of_seq_mask(f_inputs.labels) perturbs = [ _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) for emb in embedded ] for _ in xrange(FLAGS.num_power_iteration): perturbs = [ _scale_l2(d, FLAGS.small_constant_for_finite_diff) for d in perturbs ] d_logits = logits_from_embedding_fn( [emb + d for (emb, d) in zip(embedded, perturbs)]) kl = _kl_divergence_with_logits(logits, d_logits, weights) perturbs = tf.gradients( kl, perturbs, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) perturbs = [tf.stop_gradient(d) for d in perturbs] perturbs = [ _scale_l2(_mask_by_length(d, f_inputs.length), FLAGS.perturb_norm_length) for d in perturbs ] vadv_logits = logits_from_embedding_fn( [emb + d for (emb, d) in zip(embedded, perturbs)]) return _kl_divergence_with_logits(logits, vadv_logits, weights)
def _logits_cumulative(self, inputs, stop_gradient):
    """Evaluate logits of the cumulative densities.

    Arguments:
      inputs: The values at which to evaluate the cumulative densities, expected
        to be a `Tensor` of shape `(channels, 1, batch)`.
      stop_gradient: Boolean. Whether to add `tf.stop_gradient` calls so
        that the gradient of the output with respect to the density model
        parameters is disconnected (the gradient with respect to `inputs` is
        left untouched).

    Returns:
      A `Tensor` of the same shape as `inputs`, containing the logits of the
      cumulative densities evaluated at the given inputs.
    """
    logits = inputs

    for i in range(len(self.filters) + 1):
        matrix = self._matrices[i]
        if stop_gradient:
            matrix = tf.stop_gradient(matrix)
        logits = tf.linalg.matmul(matrix, logits)

        bias = self._biases[i]
        if stop_gradient:
            bias = tf.stop_gradient(bias)
        logits += bias

        if i < len(self._factors):
            factor = self._factors[i]
            if stop_gradient:
                factor = tf.stop_gradient(factor)
            logits += factor * tf.math.tanh(logits)

    return logits
def _create_gumbel_control_variate_quadratic(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.'''
    if temperature is None:
        temperature = self.hparams.temperature

    h = 0
    extra = []
    for layer in xrange(self.hparams.n_layer):
        logQ, softSamples = self._recognition_network(sampler=functools.partial(
            self._random_sample_switch, switch_layer=layer, temperature=temperature))
        softELBO, _ = self._generator_network(softSamples, logQ)

        # Generate the softELBO_v (should be the same value but different grads)
        logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
            self._random_sample_switch_v, switch_layer=layer, temperature=temperature))
        softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)

        # Compute losses
        learning_signal = tf.stop_gradient(softELBO_v)

        # Control variate
        h += (tf.stop_gradient(learning_signal) * logQHard[layer]
              - softELBO + softELBO_v)

        extra.append((softELBO_v, -softELBO + softELBO_v))

    return h, extra
def target_critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the target critic network.

    The target network is used to compute stable targets for training.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
    Returns:
      q values: A [batch_size] tensor of q values.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    values1 = tf.stop_gradient(
        self._target_critic_net(states, actions,
                                for_critic_loss=for_critic_loss))
    values2 = tf.stop_gradient(
        self._target_critic_net2(states, actions,
                                 for_critic_loss=for_critic_loss))
    if for_critic_loss:
        return values1, values2
    return values1
def build_graph(self, state, action, futurereward, action_prob):
    logits, value = self._get_NN_prediction(state)
    value = tf.squeeze(value, [1], name='pred_value')  # (B,)
    policy = tf.nn.softmax(logits, name='policy')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(policy + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage = tf.subtract(tf.stop_gradient(value), futurereward, name='advantage')

    pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
    importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage * importance, name='policy_loss')
    xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(value, name='predict_reward')
    advantage = tf.sqrt(tf.reduce_mean(tf.square(advantage)), name='rms_advantage')
    entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                   initializer=tf.constant_initializer(0.01),
                                   trainable=False)
    cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    cost = tf.truediv(cost,
                      tf.cast(tf.shape(futurereward)[0], tf.float32),
                      name='cost')
    summary.add_moving_summary(policy_loss, xentropy_loss,
                               value_loss, pred_reward, advantage,
                               cost, tf.reduce_mean(importance, name='importance'))
    return cost
def virtual_adversarial_loss_bidir(logits, embedded, inputs, logits_from_embedding_fn): """Virtual adversarial loss for bidirectional models.""" logits = tf.stop_gradient(logits) f_inputs, _ = inputs weights = f_inputs.eos_weights if FLAGS.single_label: indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1) weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1) assert weights is not None perturbs = [ _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length) for emb in embedded ] for _ in xrange(FLAGS.num_power_iteration): perturbs = [ _scale_l2(d, FLAGS.small_constant_for_finite_diff) for d in perturbs ] d_logits = logits_from_embedding_fn( [emb + d for (emb, d) in zip(embedded, perturbs)]) kl = _kl_divergence_with_logits(logits, d_logits, weights) perturbs = tf.gradients( kl, perturbs, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) perturbs = [tf.stop_gradient(d) for d in perturbs] perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs] vadv_logits = logits_from_embedding_fn( [emb + d for (emb, d) in zip(embedded, perturbs)]) return _kl_divergence_with_logits(logits, vadv_logits, weights)
def get_muprop_gradient(self): """ random sample function that actually returns mean new forward pass that returns logQ as a list can get x_i from samples """ # Hard loss logQHard, hardSamples = self._recognition_network() hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard) # Soft loss logQ, muSamples = self._recognition_network(sampler=self._mean_sample) muELBO, _ = self._generator_network(muSamples, logQ) # Compute gradients muELBOGrads = tf.gradients(tf.reduce_sum(muELBO), [ muSamples[i]['activation'] for i in xrange(self.hparams.n_layer) ]) # Compute MuProp gradient estimates learning_signal = hardELBO optimizerLoss = 0.0 learning_signals = [] for i in xrange(self.hparams.n_layer): dfDiff = tf.reduce_sum( muELBOGrads[i] * (hardSamples[i]['activation'] - muSamples[i]['activation']), axis=1) dfMu = tf.reduce_sum( tf.stop_gradient(muELBOGrads[i]) * tf.nn.sigmoid(hardSamples[i]['log_param']), axis=1) scaling_baseline_0 = self._create_eta(collection='BASELINE') scaling_baseline_1 = self._create_eta(collection='BASELINE') learning_signals.append(learning_signal - scaling_baseline_0 * muELBO - scaling_baseline_1 * dfDiff - self._create_baseline()) self.baseline_loss.append(tf.square(learning_signals[i])) optimizerLoss += ( logQHard[i] * tf.stop_gradient(learning_signals[i]) + tf.stop_gradient(scaling_baseline_1) * dfMu) optimizerLoss += reinforce_model_grad optimizerLoss *= -1 optimizerLoss = tf.reduce_mean(optimizerLoss) muprop_gradient = self.optimizer_class.compute_gradients(optimizerLoss) debug = { 'ELBO': hardELBO, 'muELBO': muELBO, } debug.update(dict([ ('RMS learning signal layer %d' % i, U.rms(learning_signal)) for (i, learning_signal) in enumerate(learning_signals)])) return muprop_gradient, debug
def batch_norm(input_, dim, name, scale=True, train=True, epsilon=1e-8, decay=.1, axes=[0], bn_lag=DEFAULT_BN_LAG): """Batch normalization.""" # create variables with tf.variable_scope(name): var = variable_on_cpu( "var", [dim], tf.constant_initializer(1.), trainable=False) mean = variable_on_cpu( "mean", [dim], tf.constant_initializer(0.), trainable=False) step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False) if scale: gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.)) beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.)) # choose the appropriate moments if train: used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm") cur_mean, cur_var = used_mean, used_var if bn_lag > 0.: used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean)) used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var)) used_mean /= (1. - bn_lag**(step + 1)) used_var /= (1. - bn_lag**(step + 1)) else: used_mean, used_var = mean, var cur_mean, cur_var = used_mean, used_var # normalize res = (input_ - used_mean) / tf.sqrt(used_var + epsilon) # de-normalize if scale: res *= gamma res += beta # update variables if train: with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]): with ops.colocate_with(mean): new_mean = tf.assign_sub( mean, tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean.")) with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]): with ops.colocate_with(var): new_var = tf.assign_sub( var, tf.check_numerics(decay * (var - cur_var), "NaN in moving variance.")) with tf.name_scope(name, "IncrementTime", [step]): with ops.colocate_with(step): new_step = tf.assign_add(step, 1.) res += 0. * new_mean * new_var * new_step return res
def __init__(self, q_t, q_tp1, q_tp0, importance_weights, rewards,
             done_mask, twin_q_t, twin_q_tp1,
             actor_loss_coeff=0.1, critic_loss_coeff=1.0,
             gamma=0.99, n_step=1,
             use_huber=False, huber_threshold=1.0,
             twin_q=False, policy_delay=1):

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

    # compute the error (potentially clipped)
    if twin_q:
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        twin_td_error = twin_q_t_selected - tf.stop_gradient(
            q_t_selected_target)
        self.td_error = td_error + twin_td_error
        if use_huber:
            errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
                twin_td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                twin_td_error)
    else:
        self.td_error = (
            q_t_selected - tf.stop_gradient(q_t_selected_target))
        if use_huber:
            errors = _huber_loss(self.td_error, huber_threshold)
        else:
            errors = 0.5 * tf.square(self.td_error)

    self.critic_loss = critic_loss_coeff * tf.reduce_mean(
        importance_weights * errors)

    # for policy gradient, update policy net one time v.s.
    # update critic net `policy_delay` time(s)
    global_step = tf.train.get_or_create_global_step()
    policy_delay_mask = tf.to_float(
        tf.equal(tf.mod(global_step, policy_delay), 0))
    self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                       tf.reduce_mean(q_tp0))
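# The loss above calls a `_huber_loss` helper that is not shown in this
# snippet. A minimal sketch of such a helper, as an assumption about what it
# looks like rather than the original implementation: quadratic near zero and
# linear beyond `delta`, applied elementwise to the TD error.
def _huber_loss(x, delta=1.0):
    """Elementwise Huber loss on the TD error tensor `x`."""
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta))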
def build_score_loss_and_gradients(inference, var_list): """Build loss function and gradients based on the score function estimator (Paisley et al., 2012). Computed by sampling from $q(z;\lambda)$ and evaluating the expectation using Monte Carlo sampling. """ p_log_prob = [0.0] * inference.n_samples q_log_prob = [0.0] * inference.n_samples for s in range(inference.n_samples): # Form dictionary in order to replace conditioning on prior or # observed variable with conditioning on a specific value. scope = 'inference_' + str(id(inference)) + '/' + str(s) dict_swap = {} for x, qx in six.iteritems(inference.data): if isinstance(x, RandomVariable): if isinstance(qx, RandomVariable): qx_copy = copy(qx, scope=scope) dict_swap[x] = qx_copy.value() else: dict_swap[x] = qx for z, qz in six.iteritems(inference.latent_vars): # Copy q(z) to obtain new set of posterior samples. qz_copy = copy(qz, scope=scope) dict_swap[z] = qz_copy.value() q_log_prob[s] += tf.reduce_sum( inference.scale.get(z, 1.0) * qz_copy.log_prob(tf.stop_gradient(dict_swap[z]))) for z in six.iterkeys(inference.latent_vars): z_copy = copy(z, dict_swap, scope=scope) p_log_prob[s] += tf.reduce_sum( inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z])) for x in six.iterkeys(inference.data): if isinstance(x, RandomVariable): x_copy = copy(x, dict_swap, scope=scope) p_log_prob[s] += tf.reduce_sum( inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x])) p_log_prob = tf.stack(p_log_prob) q_log_prob = tf.stack(q_log_prob) if inference.logging: summary_key = 'summaries_' + str(id(inference)) tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob), collections=[summary_key]) tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob), collections=[summary_key]) losses = p_log_prob - q_log_prob loss = -tf.reduce_mean(losses) grads = tf.gradients( -tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)), var_list) grads_and_vars = list(zip(grads, var_list)) return loss, grads_and_vars
def kmeans(x, means, hparams, name):
    with tf.variable_scope(name):
        x_means_hot = nearest(x, means, hparams)
        x_means = tf.gather(means, tf.argmax(x_means_hot, axis=-1))
        reg_loss1 = tf.nn.l2_loss((tf.stop_gradient(x) - x_means))
        reg_loss2 = hparams.beta * tf.nn.l2_loss((x - tf.stop_gradient(x_means)))
        l = reg_loss1 + reg_loss2
        return x_means_hot, x_means, l
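# A hedged usage sketch, not part of the original source: VQ-style code that
# uses a function like `kmeans` above typically combines the quantized output
# with a straight-through estimator so the forward pass sees `x_means` while
# gradients flow back to `x`. `x`, `means`, `hparams`, and `nearest` are
# assumed to be defined as in the snippet above.
x_means_hot, x_means, extra_loss = kmeans(x, means, hparams, name="vq")
# forward value is x_means, backward gradient is passed straight to x
x_straight_through = x + tf.stop_gradient(x_means - x)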
def build_score_kl_loss_and_gradients(inference, var_list): """Build loss function and gradients based on the score function estimator (Paisley et al., 2012). It assumes the KL is analytic. Computed by sampling from $q(z;\lambda)$ and evaluating the expectation using Monte Carlo sampling. """ p_log_lik = [0.0] * inference.n_samples q_log_prob = [0.0] * inference.n_samples base_scope = tf.get_default_graph().unique_name("inference") + '/' for s in range(inference.n_samples): # Form dictionary in order to replace conditioning on prior or # observed variable with conditioning on a specific value. scope = base_scope + tf.get_default_graph().unique_name("sample") dict_swap = {} for x, qx in six.iteritems(inference.data): if isinstance(x, RandomVariable): if isinstance(qx, RandomVariable): qx_copy = copy(qx, scope=scope) dict_swap[x] = qx_copy.value() else: dict_swap[x] = qx for z, qz in six.iteritems(inference.latent_vars): # Copy q(z) to obtain new set of posterior samples. qz_copy = copy(qz, scope=scope) dict_swap[z] = qz_copy.value() q_log_prob[s] += tf.reduce_sum( inference.scale.get(z, 1.0) * qz_copy.log_prob(tf.stop_gradient(dict_swap[z]))) for x in six.iterkeys(inference.data): if isinstance(x, RandomVariable): x_copy = copy(x, dict_swap, scope=scope) p_log_lik[s] += tf.reduce_sum( inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x])) p_log_lik = tf.stack(p_log_lik) q_log_prob = tf.stack(q_log_prob) kl_penalty = tf.reduce_sum([ inference.kl_scaling.get(z, 1.0) * tf.reduce_sum(kl_divergence(qz, z)) for z, qz in six.iteritems(inference.latent_vars)]) if inference.logging: tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik), collections=[inference._summary_key]) tf.summary.scalar("loss/kl_penalty", kl_penalty, collections=[inference._summary_key]) loss = -(tf.reduce_mean(p_log_lik) - kl_penalty) grads = tf.gradients( -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl_penalty), var_list) grads_and_vars = list(zip(grads, var_list)) return loss, grads_and_vars
def __init__(self, env): self.env = env if not isinstance(env.observation_space, Box) or \ not isinstance(env.action_space, Discrete): print("Incompatible spaces.") exit(-1) print("Observation Space", env.observation_space) print("Action Space", env.action_space) self.session = tf.Session() self.end_count = 0 self.train = True self.obs = obs = tf.placeholder( dtype, shape=[ None, 2 * env.observation_space.shape[0] + env.action_space.n], name="obs") self.prev_obs = np.zeros((1, env.observation_space.shape[0])) self.prev_action = np.zeros((1, env.action_space.n)) self.action = action = tf.placeholder(tf.int64, shape=[None], name="action") self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant") self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, env.action_space.n], name="oldaction_dist") # Create neural network. action_dist_n, _ = (pt.wrap(self.obs). fully_connected(64, activation_fn=tf.nn.tanh). softmax_classifier(env.action_space.n)) eps = 1e-6 self.action_dist_n = action_dist_n N = tf.shape(obs)[0] p_n = slice_2d(action_dist_n, tf.range(0, N), action) oldp_n = slice_2d(oldaction_dist, tf.range(0, N), action) ratio_n = p_n / oldp_n Nf = tf.cast(N, dtype) surr = -tf.reduce_mean(ratio_n * advant) # Surrogate loss var_list = tf.trainable_variables() kl = tf.reduce_sum(oldaction_dist * tf.log((oldaction_dist + eps) / (action_dist_n + eps))) / Nf ent = tf.reduce_sum(-action_dist_n * tf.log(action_dist_n + eps)) / Nf self.losses = [surr, kl, ent] self.pg = flatgrad(surr, var_list) # KL divergence where first arg is fixed # replace old->tf.stop_gradient from previous kl kl_firstfixed = tf.reduce_sum(tf.stop_gradient( action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf grads = tf.gradients(kl_firstfixed, var_list) self.flat_tangent = tf.placeholder(dtype, shape=[None]) shapes = map(var_shape, var_list) start = 0 tangents = [] for shape in shapes: size = np.prod(shape) param = tf.reshape(self.flat_tangent[start:(start + size)], shape) tangents.append(param) start += size gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)] self.fvp = flatgrad(gvp, var_list) self.gf = GetFlat(self.session, var_list) self.sff = SetFromFlat(self.session, var_list) self.vf = VF(self.session) self.session.run(tf.initialize_all_variables())
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True): """ Batch Renormalization layer, as described in the paper: `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_. Args: x (tf.Tensor): a NHWC or NC tensor. rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. """ shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] if len(shape) == 2: x = tf.reshape(x, [-1, 1, 1, n_out]) beta, gamma, moving_mean, moving_var = get_bn_variables( n_out, use_scale, use_bias, tf.constant_initializer(1.0)) ctx = get_current_tower_context() use_local_stat = ctx.is_training # for BatchRenorm, use_local_stat should always be is_training, unless a # different usage comes out in the future. if use_local_stat: xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta, epsilon=epsilon, is_training=True) inv_sigma = tf.rsqrt(moving_var, 'inv_sigma') r = tf.stop_gradient(tf.clip_by_value( tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax)) d = tf.stop_gradient(tf.clip_by_value( (batch_mean - moving_mean) * inv_sigma, -dmax, dmax)) xn = xn * r + d else: xn = tf.nn.batch_normalization( x, moving_mean, moving_var, beta, gamma, epsilon) if len(shape) == 2: xn = tf.squeeze(xn, [1, 2]) if ctx.is_main_training_tower: return update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay) else: return tf.identity(xn, name='output')
def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels): """ Sample some ROIs from all proposals for training. #fg is guaranteed to be > 0, because grount truth boxes are added as RoIs. Args: boxes: nx4 region proposals, floatbox gt_boxes: mx4, floatbox gt_labels: m, int32 Returns: sampled_boxes: tx4 floatbox, the rois sampled_labels: t labels, in [0, #class-1]. Positive means foreground. fg_inds_wrt_gt: #fg indices, each in range [0, m-1]. It contains the matching GT of each foreground roi. """ iou = pairwise_iou(boxes, gt_boxes) # nxm proposal_metrics(iou) # add ground truth as proposals as well boxes = tf.concat([boxes, gt_boxes], axis=0) # (n+m) x 4 iou = tf.concat([iou, tf.eye(tf.shape(gt_boxes)[0])], axis=0) # (n+m) x m # #proposal=n+m from now on def sample_fg_bg(iou): fg_mask = tf.reduce_max(iou, axis=1) >= cfg.FRCNN.FG_THRESH fg_inds = tf.reshape(tf.where(fg_mask), [-1]) num_fg = tf.minimum(int( cfg.FRCNN.BATCH_PER_IM * cfg.FRCNN.FG_RATIO), tf.size(fg_inds), name='num_fg') fg_inds = tf.random_shuffle(fg_inds)[:num_fg] bg_inds = tf.reshape(tf.where(tf.logical_not(fg_mask)), [-1]) num_bg = tf.minimum( cfg.FRCNN.BATCH_PER_IM - num_fg, tf.size(bg_inds), name='num_bg') bg_inds = tf.random_shuffle(bg_inds)[:num_bg] add_moving_summary(num_fg, num_bg) return fg_inds, bg_inds fg_inds, bg_inds = sample_fg_bg(iou) # fg,bg indices w.r.t proposals best_iou_ind = tf.argmax(iou, axis=1) # #proposal, each in 0~m-1 fg_inds_wrt_gt = tf.gather(best_iou_ind, fg_inds) # num_fg all_indices = tf.concat([fg_inds, bg_inds], axis=0) # indices w.r.t all n+m proposal boxes ret_boxes = tf.gather(boxes, all_indices) ret_labels = tf.concat( [tf.gather(gt_labels, fg_inds_wrt_gt), tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0) # stop the gradient -- they are meant to be training targets return tf.stop_gradient(ret_boxes, name='sampled_proposal_boxes'), \ tf.stop_gradient(ret_labels, name='sampled_labels'), \ tf.stop_gradient(fg_inds_wrt_gt)
def __call__(self, batch_size, **kwargs): """Sample a batch of context. Args: batch_size: Batch size. Returns: Two [batch_size, num_context_dims] tensors. """ spec = self._context_spec context_range = self._context_range if isinstance(context_range[0], (int, float)): contexts = tf.random_uniform( shape=[ batch_size, ] + spec.shape.as_list(), minval=context_range[0], maxval=context_range[1], dtype=spec.dtype) elif isinstance(context_range[0], (list, tuple, np.ndarray)): assert len(spec.shape.as_list()) == 1 assert spec.shape.as_list()[0] == len(context_range[0]) assert spec.shape.as_list()[0] == len(context_range[1]) contexts = tf.concat( [ tf.random_uniform( shape=[ batch_size, 1, ] + spec.shape.as_list()[1:], minval=context_range[0][i], maxval=context_range[1][i], dtype=spec.dtype) for i in range(spec.shape.as_list()[0]) ], axis=1) else: raise NotImplementedError(context_range) self._validate_contexts(contexts) if 'sampler_fn' in kwargs: other_contexts = kwargs['sampler_fn']() else: other_contexts = contexts state, next_state = kwargs['state'], kwargs['next_state'] if state is not None and next_state is not None: my_context_range = (np.array(context_range[1]) - np.array(context_range[0])) / 2 * np.ones(spec.shape.as_list()) contexts = tf.concat( [0.1 * my_context_range[:self._k] * tf.random_normal(tf.shape(state[:, :self._k]), dtype=state.dtype) + tf.random_shuffle(state[:, :self._k]) - state[:, :self._k], other_contexts[:, self._k:]], 1) #contexts = tf.Print(contexts, # [contexts, tf.reduce_max(contexts, 0), # tf.reduce_min(state, 0), tf.reduce_max(state, 0)], 'contexts', summarize=15) next_contexts = tf.concat( #LALA [state[:, :self._k] + contexts[:, :self._k] - next_state[:, :self._k], other_contexts[:, self._k:]], 1) next_contexts = contexts #LALA cosine else: next_contexts = contexts return tf.stop_gradient(contexts), tf.stop_gradient(next_contexts)
def generate_fpn_proposals( multilevel_anchors, multilevel_label_logits, multilevel_box_logits, image_shape2d): """ Args: multilevel_anchors: #lvl RPNAnchors multilevel_label_logits: #lvl tensors of shape HxWxA multilevel_box_logits: #lvl tensors of shape HxWxAx4 Returns: boxes: kx4 float scores: k logits """ num_lvl = len(cfg.FPN.ANCHOR_STRIDES) assert len(multilevel_anchors) == num_lvl assert len(multilevel_label_logits) == num_lvl assert len(multilevel_box_logits) == num_lvl ctx = get_current_tower_context() all_boxes = [] all_scores = [] if cfg.FPN.PROPOSAL_MODE == 'Level': fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl]) proposal_boxes, proposal_scores = generate_rpn_proposals( tf.reshape(pred_boxes_decoded, [-1, 4]), tf.reshape(multilevel_label_logits[lvl], [-1]), image_shape2d, fpn_nms_topk) all_boxes.append(proposal_boxes) all_scores.append(proposal_scores) proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 proposal_scores = tf.concat(all_scores, axis=0) # n proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) proposal_boxes = tf.gather(proposal_boxes, topk_indices) else: for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl]) all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) all_scores.append(tf.reshape(multilevel_label_logits[lvl], [-1])) all_boxes = tf.concat(all_boxes, axis=0) all_scores = tf.concat(all_scores, axis=0) proposal_boxes, proposal_scores = generate_rpn_proposals( all_boxes, all_scores, image_shape2d, cfg.RPN.TRAIN_PRE_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PRE_NMS_TOPK, cfg.RPN.TRAIN_POST_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_POST_NMS_TOPK) tf.sigmoid(proposal_scores, name='probs') # for visualization return tf.stop_gradient(proposal_boxes, name='boxes'), \ tf.stop_gradient(proposal_scores, name='scores')
def batch_norm_log_diff(input_, dim, name, train=True, epsilon=1e-8, decay=.1, axes=[0], reuse=None, bn_lag=DEFAULT_BN_LAG): """Batch normalization with corresponding log determinant Jacobian.""" if reuse is None: reuse = not train # create variables with tf.variable_scope(name) as scope: if reuse: scope.reuse_variables() var = variable_on_cpu( "var", [dim], tf.constant_initializer(1.), trainable=False) mean = variable_on_cpu( "mean", [dim], tf.constant_initializer(0.), trainable=False) step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False) # choose the appropriate moments if train: used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm") cur_mean, cur_var = used_mean, used_var if bn_lag > 0.: used_var = stable_var(input_=input_, mean=used_mean, axes=axes) cur_var = used_var used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean)) used_mean /= (1. - bn_lag**(step + 1)) used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var)) used_var /= (1. - bn_lag**(step + 1)) else: used_mean, used_var = mean, var cur_mean, cur_var = used_mean, used_var # update variables if train: with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]): with ops.colocate_with(mean): new_mean = tf.assign_sub( mean, tf.check_numerics( decay * (mean - cur_mean), "NaN in moving mean.")) with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]): with ops.colocate_with(var): new_var = tf.assign_sub( var, tf.check_numerics(decay * (var - cur_var), "NaN in moving variance.")) with tf.name_scope(name, "IncrementTime", [step]): with ops.colocate_with(step): new_step = tf.assign_add(step, 1.) used_var += 0. * new_mean * new_var * new_step used_var += epsilon return used_mean, used_var
def compute_loss():
    labelsf = tf.cast(labels, logits.dtype)
    signs = 2. * labelsf - 1.
    errors = 1. - logits * tf.stop_gradient(signs)
    errors_sorted, perm = tf.nn.top_k(errors, k=tf.shape(errors)[0],
                                      name="descending_sort")
    gt_sorted = tf.gather(labelsf, perm)
    grad = lovasz_grad(gt_sorted)
    loss = tf.tensordot(tf.nn.relu(errors_sorted),
                        tf.stop_gradient(grad),
                        1, name="loss_non_void")
    return loss
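# `lovasz_grad` is assumed above but not shown in this snippet. A sketch of
# the usual helper from the Lovasz-hinge reference formulation (an assumption
# here, not necessarily the code this snippet was paired with): it computes
# the gradient of the Jaccard loss with respect to the sorted errors.
def lovasz_grad(gt_sorted):
    """Gradient of the Lovasz extension w.r.t. sorted errors."""
    gts = tf.reduce_sum(gt_sorted)
    intersection = gts - tf.cumsum(gt_sorted)
    union = gts + tf.cumsum(1. - gt_sorted)
    jaccard = 1. - intersection / union
    # differences of consecutive terms give per-position weights
    jaccard = tf.concat((jaccard[0:1], jaccard[1:] - jaccard[:-1]), 0)
    return jaccard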
def _build_net(self):
    # ------------------ inputs ---------------------
    self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')    # input State
    self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input Next State
    self.r = tf.placeholder(tf.float32, [None, ], name='r')  # input Reward
    self.a = tf.placeholder(tf.int32, [None, ], name='a')    # input Action

    w_initializer, b_initializer = tf.random_normal_initializer(
        0., 0.3), tf.constant_initializer(0.1)

    # ------------------ evaluation_net --------------
    with tf.variable_scope('eval_net'):
        e1 = tf.layers.dense(self.s, FIRSTLAYER_SIZE, tf.nn.relu,
                             kernel_initializer=w_initializer,
                             bias_initializer=b_initializer, name='e1')
        self.q_eval = tf.layers.dense(e1, self.n_actions,
                                      kernel_initializer=w_initializer,
                                      bias_initializer=b_initializer, name='q')

    # ------------------ target_net ------------------
    with tf.variable_scope('target_net'):
        t1 = tf.layers.dense(self.s_, FIRSTLAYER_SIZE, tf.nn.relu,
                             kernel_initializer=w_initializer,
                             bias_initializer=b_initializer, name='t1')
        self.q_next = tf.layers.dense(t1, self.n_actions,
                                      kernel_initializer=w_initializer,
                                      bias_initializer=b_initializer, name='t2')

    with tf.variable_scope('q_target'):
        q_target = self.r + self.gamma * tf.reduce_max(
            self.q_next, axis=1, name='Qmax_s_')  # shape=(None, )
        self.q_target = tf.stop_gradient(q_target)

    with tf.variable_scope('q_eval'):
        a_indices = tf.stack(
            [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
        self.q_eval_wrt_a = tf.gather_nd(
            params=self.q_eval, indices=a_indices)  # shape=(None, )

    with tf.variable_scope('loss'):
        self.loss = tf.reduce_mean(
            tf.squared_difference(self.q_target, self.q_eval_wrt_a,
                                  name='TD_error'))

    with tf.variable_scope('train'):
        self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
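# The graph above stops gradients through `q_target`, but a DQN also needs a
# way to copy eval_net parameters into target_net periodically. A minimal
# sketch of such a sync op, assuming only the two variable scopes used above
# (this helper is not part of the original snippet):
t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')
replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
# ...then run `sess.run(replace_target_op)` every few learning steps.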
import tensorflow as tf
# AlexNet is assumed to be provided by an accompanying module (e.g. alexnet.py)
from alexnet import AlexNet

features = tf.placeholder(tf.float32, (None, 32, 32, 3))
labels = tf.placeholder(tf.int64, None)
resized = tf.image.resize_images(features, (227, 227))

# Returns the second-to-last layer of the AlexNet model; this allows us to
# redo the last layer for the traffic signs model.
fc7 = AlexNet(resized, feature_extract=True)
fc7 = tf.stop_gradient(fc7)
shape = (fc7.get_shape().as_list()[-1], nb_classes)

# designing the new fully connected layer:
fc8W = tf.Variable(tf.truncated_normal(shape, stddev=1e-2))
fc8b = tf.Variable(tf.zeros(nb_classes))
logits = tf.matmul(fc7, fc8W) + fc8b

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=labels)
loss_op = tf.reduce_mean(cross_entropy)
opt = tf.train.AdamOptimizer()
train_op = opt.minimize(loss_op, var_list=[fc8W, fc8b])
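# A hedged sketch of how this graph might be trained (not from the original
# snippet). `X_train`, `y_train`, and `nb_classes` are assumed to be defined
# elsewhere, e.g. loaded from the traffic-sign dataset.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch_size = 128
    for offset in range(0, len(X_train), batch_size):
        batch_x = X_train[offset:offset + batch_size]
        batch_y = y_train[offset:offset + batch_size]
        # only fc8W/fc8b receive gradients; fc7 is frozen via tf.stop_gradient
        sess.run(train_op, feed_dict={features: batch_x, labels: batch_y})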
def _init_actor_update(self): """Create minimization operations for policy and entropy. Creates a `tf.optimizer.minimize` operations for updating policy and entropy with gradient descent, and adds them to `self._training_ops` attribute. See Section 4.2 in [1], for further information of the policy update, and Section 5 in [1] for further information of the entropy update. """ actions = self._policy.actions([self._observations_ph]) log_pis = self._policy.log_pis([self._observations_ph], actions) assert log_pis.shape.as_list() == [None, 1] log_alpha = self._log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0) alpha = tf.exp(log_alpha) if isinstance(self._target_entropy, Number): alpha_loss = -tf.reduce_mean( log_alpha * tf.stop_gradient(log_pis + self._target_entropy)) self._alpha_optimizer = tf.train.AdamOptimizer( self._policy_lr, name='alpha_optimizer') self._alpha_train_op = self._alpha_optimizer.minimize( loss=alpha_loss, var_list=[log_alpha]) self._training_ops.update( {'temperature_alpha': self._alpha_train_op}) self._alpha = alpha if self._action_prior == 'normal': policy_prior = tfp.distributions.MultivariateNormalDiag( loc=tf.zeros(self._action_shape), scale_diag=tf.ones(self._action_shape)) policy_prior_log_probs = policy_prior.log_prob(actions) elif self._action_prior == 'uniform': policy_prior_log_probs = 0.0 Q_log_targets = tuple( Q([self._observations_ph, actions]) for Q in self._Qs) min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0) if self._reparameterize: policy_kl_losses = (alpha * log_pis - min_Q_log_target - policy_prior_log_probs) else: raise NotImplementedError assert policy_kl_losses.shape.as_list() == [None, 1] self._policy_losses = policy_kl_losses policy_loss = tf.reduce_mean(policy_kl_losses) self._policy_optimizer = tf.train.AdamOptimizer( learning_rate=self._policy_lr, name="policy_optimizer") policy_train_op = self._policy_optimizer.minimize( loss=policy_loss, var_list=self._policy.trainable_variables) self._training_ops.update({'policy_train_op': policy_train_op})
def asac(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=200, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=5e-4, alpha_start=0.2, batch_size=100, start_steps=10000, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, loss_threshold=0.0001, delta=0.02, sample_step=2000): alpha = Alpha(alpha_start=alpha_start, delta=delta) alpha_t = alpha() logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None) x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) alpha_ph = core.scale_holder() # Main outputs from computation graph #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs) with tf.variable_scope('main'): mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic( x_ph, a_ph, **ac_kwargs) # Target value network with tf.variable_scope('target'): _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic( x2_ph, a_ph, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in [ 'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R', 'main' ]) print(('\nNumber of parameters: \t pi: %d, \t' + \ 'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts) # Min Double-Q: min_q_pi = tf.minimum(q1_pi, q2_pi) # Targets for Q and V regression q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ) v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi) Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ) R_backup = tf.stop_gradient(Q_pi) adv = Q_pi - R pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi) q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2) q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2) v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2) Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2) R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2) value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss # Policy train op # (has to be separate from value train op, because q1_pi appears in pi_loss) pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) # Value train op # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order) value_optimizer = tf.train.AdamOptimizer(learning_rate=lr) value_params = get_vars('main/q') + get_vars('main/v') + get_vars( 'main/Q') + get_vars('main/R') with tf.control_dependencies([train_pi_op]): train_value_op = value_optimizer.minimize(value_loss, var_list=value_params) """ R_optimizer = tf.train.AdamOptimizer(learning_rate=lr) train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R')) """ # Polyak averaging for target variables # (control flow because sess.run otherwise evaluates in nondeterministic order) with tf.control_dependencies([train_value_op]): target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in 
zip(get_vars('main'), get_vars('target')) ]) # All ops to call during one training step step_ops = [ pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op, train_value_op, target_update, R_loss, Q_loss ] # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) config = tf.ConfigProto(inter_op_parallelism_threads=30, intra_op_parallelism_threads=5) config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'mu': mu, 'pi': pi, 'q1': q1, 'q2': q2, 'v': v, 'Q': Q, 'R': R }) def get_action(o, deterministic=False): act_op = mu if deterministic else pi return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)}) def test_agent(n=10): global sess, mu, pi, q1, q2, q1_pi, q2_pi for j in range(n): o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time o, r, d, _ = test_env.step(get_action(o, True)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] total_steps = steps_per_epoch * epochs counter = 0 ret_epi = [] obs_epi = [] loss_old = 10000 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy. """ if t > start_steps: a = get_action(o) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 if d or (ep_len == max_ep_len): """ Perform all SAC updates at the end of the trajectory. This is a slight difference from the SAC specified in the original paper. 
""" for j in range(ep_len): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'], alpha_ph: alpha_t } outs = sess.run(step_ops, feed_dict) logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2], LossV=outs[3], Q1Vals=outs[4], Q2Vals=outs[5], VVals=outs[6], LogPi=outs[7], LossR=outs[11]) counter += 1 logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 ret_est = sess.run(R, feed_dict={x_ph: [o]})[0] logger.store(RetEst=ret_est) if counter >= 1000: loss_new, _ = logger.get_stats('LossPi') counter = 0 if (loss_old - loss_new) / np.absolute( loss_old) < loss_threshold and t > start_steps: rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32) rho_ptr = 0 for sample_t in range(sample_step): a = get_action(o) o2, r, d, _ = env.step(a) ep_len += 1 d = False if ep_len == max_ep_len else d rho_s[rho_ptr] = o o = o2 if d or (ep_len == max_ep_len): o, r, d, ep_ret, ep_len = env.reset( ), 0, False, 0, 0 advantages = sess.run(adv, feed_dict={x_ph: rho_s}) alpha.update_alpha(advantages) #alpha.update_alpha(rho_q-rho_v) alpha_t = alpha() print(alpha_t) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 loss_old = 10000 else: loss_old = loss_new # End of epoch wrap-up if t > 0 and t % steps_per_epoch == 0: epoch = t // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EntCoeff', alpha_t) logger.log_tabular('RetEst', average_only=True) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('LogPi', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ1', average_only=True) logger.log_tabular('LossQ2', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('LossR', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
def frequency_encoder(features, kernels, biases): # Check for valid weights restore = False if kernels[0] is not None: restore = True # Capture largest frequency dependent features conv_freq1 = tf.layers.Conv2D( 6, (HEIGHT / 2, 2), strides=(HEIGHT / 4, 2), activation='relu', padding='same', name='conv1-', kernel_initializer=kernels.pop(), kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10), bias_initializer=biases.pop())(features) # Capture large frequency dependent features conv_freq2 = tf.layers.Conv2D( 6, (HEIGHT / 4, 2), strides=(HEIGHT / 8, 2), activation='relu', padding='same', name='conv2-', kernel_initializer=kernels.pop(), kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10), bias_initializer=biases.pop())(features) # Capture small frequency dependent features conv_freq3 = tf.layers.Conv2D( 6, (HEIGHT / 8, 2), strides=(HEIGHT / 16, 2), activation='relu', padding='same', name='conv3-', kernel_initializer=kernels.pop(), kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10), bias_initializer=biases.pop())(features) # Capture smallest frequency dependent features conv_freq4 = tf.layers.Conv2D( 6, (HEIGHT / 21, 2), strides=(HEIGHT / 42, 2), activation='relu', padding='same', name='conv4-', kernel_initializer=kernels.pop(), kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10), bias_initializer=biases.pop())(features) # Pool out time scales pool_freq1 = tf.layers.MaxPooling2D((2, WIDTH / 8), (1, WIDTH / 16), padding='same', name='pool5-')(conv_freq1) pool_freq2 = tf.layers.MaxPooling2D((2, WIDTH / 8), (1, WIDTH / 16), padding='same', name='pool6-')(conv_freq2) pool_freq3 = tf.layers.MaxPooling2D((2, WIDTH / 8), (1, WIDTH / 16), padding='same', name='pool7-')(conv_freq3) pool_freq4 = tf.layers.MaxPooling2D((2, WIDTH / 8), (1, WIDTH / 16), padding='same', name='pool8-')(conv_freq4) ''' # Pad smaller feature maps pool_freq1 = tf.pad(pool_freq1, tf.constant([[0, 0], [17, 17], [0, 0], [0, 0]])) pool_freq2 = tf.pad(pool_freq2, tf.constant([[0, 0], [15, 15], [0, 0], [0, 0]])) pool_freq3 = tf.pad(pool_freq3, tf.constant([[0, 0], [11, 11], [0, 0], [0, 0]])) # Concat into same feature map freq_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4], 3) ''' # Upscale smaller frequency maps _, height, width, depth = pool_freq4.get_shape() pool_freq1 = tf.image.resize_nearest_neighbor(pool_freq1, [height, width]) pool_freq2 = tf.image.resize_nearest_neighbor(pool_freq2, [height, width]) pool_freq3 = tf.image.resize_nearest_neighbor(pool_freq3, [height, width]) # Concat into same feature map freq_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4], 3) # Post image of feedback map BROKEN feedback_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4], 2) feedback_image0 = tf.slice(feedback_map, [0, 0, 0, 0], [-1, -1, -1, 3]) feedback_image1 = tf.slice(feedback_map, [0, 0, 0, 3], [-1, -1, -1, 3]) feedback_image = tf.concat([feedback_image0, feedback_image1], 2) tf.summary.image("feedback_map", feedback_image, max_outputs=18) # If valid weights were loaded if restore: # Don't update layers tf.stop_gradient(conv_freq1) tf.stop_gradient(conv_freq2) tf.stop_gradient(conv_freq3) tf.stop_gradient(conv_freq4) return freq_map
def __init__(self, size_obs, size_act, net_struct = [100, 100, 100, 100], name='dbg'): self.tensorboardpath = 'tensorboards/' + name self.train_writer = tf.summary.FileWriter(self.tensorboardpath) self.ModelPath = 'Models/Imitation' + name self.mse_train = [] self.mse_val = [] self.last_epoch = 0 size_inpt = 200 self.obs = tf.placeholder(tf.float32, shape=(None, size_obs)) self.ret = tf.placeholder(tf.float32, shape=(None)) act_trn = self.obs act_tst = self.obs prev_layer_size = size_obs #Hidden layers self.l2_reg = 1e-8 self.Q_lr = tf.placeholder(tf.float32, shape=(None)) self.lr = tf.placeholder(tf.float32, shape=(None)) if 1: for idx, l in enumerate(net_struct): act_trn, act_tst = ops.cascade_bn_relu_trn_tst( act_trn, prev_layer_size, l, name='layer' + str(idx), input_tst = act_tst) prev_layer_size += l w = tf.Variable(tf.random_uniform([prev_layer_size, size_act],minval = -1., maxval = 1.), name='net_output_w') * 1e-3 b = tf.Variable(tf.random_uniform([size_act],minval = -1., maxval = 1.), name='net_output_bias') * 1e-3 else: for idx, l in enumerate(net_struct): act_trn = ops.linear(act_trn, l, 'layer' + str(idx)) w = tf.Variable(tf.random_uniform([l, size_act],minval = -1., maxval = 1.), name='net_output_w') * 1e-2 b = tf.Variable(tf.random_uniform([size_act],minval = -1., maxval = 1.), name='net_output_bias') * 1e-2 self.yhat = tf.reshape(tf.matmul(act_trn, w) + b, [-1, size_act]) self.yhat_tst = tf.reshape(tf.matmul(act_tst, w) + b, [-1, size_act]) self.obs_act = tf.concat((self.obs, self.yhat),1) self.Q = Q(size_obs + size_act, tf.stop_gradient(self.obs_act)) self.act = tf.placeholder(tf.float32, shape=(None)) self.l2_loss = tf.reduce_mean(tf.square(self.yhat - self.act)) self.adv_loss = tf.reduce_mean(tf.square(self.yhat_tst - self.act)) #-1*tf.gather_nd(output_tst, self.y_raw, axis=1)output_tst[list(np.arange(bs)),self.y_raw] self.advers = tf.gradients(self.l2_loss, self.obs) t_vars = tf.trainable_variables() net_vars = [var for var in t_vars if 'net_' in var.name] self.reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(var)) for var in net_vars])*self.l2_reg optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) gvs = optimizer.compute_gradients(self.l2_loss + self.reg_loss - self.Q.yhat * self.Q_lr + self.Q.l2_loss) self.grad_norm = tf.reduce_mean([tf.reduce_mean(grad) for grad, var in gvs if grad is not None]) clip_norm = 100 clip_single = 1 capped_gvs = [(tf.clip_by_value(grad, -1*clip_single,clip_single), var) for grad, var in gvs if grad is not None] capped_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in capped_gvs if grad is not None] self.optimizer = optimizer.apply_gradients(capped_gvs) #self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.l2_loss) self.cur_Q_lr = 0 self.session = tf.Session() self.session.run(tf.global_variables_initializer()) self.Saver = tf.train.Saver()
def train(self, x, y=None, max_entropy=True, epochs=100, batch_size=64, lr=1e-3, tau_rate = 1e-4, task = 'autoencoder'): taskdict = { 'autoencoder': self.L1, 'classification': self.CrossEnt } schedule = lambda i: np.float32(np.max((0.5, np.exp(-tau_rate*i)))) y = (x if y is None else y) if task in taskdict: self.taskLoss = taskdict[task] self.Loss = self.taskLoss - (self.Entropy if max_entropy else tf.stop_gradient(self.Entropy)) else: raise ValueError('task not supported yet') #Optimizer solver = tf.train.AdagradOptimizer(learning_rate = lr).minimize(self.Loss, var_list=self.params) #Need to clip gradients as they get huge for gumbel softmax + stein gd #gradients, variables = zip(*solver.compute_gradients(self.Loss, var_list=self.params)) #gradients, _ = tf.clip_by_global_norm(gradients, 5.0) #solver = solver.apply_gradients(zip(gradients, variables)) #Training init = tf.global_variables_initializer() sess = tf.Session() self.sess = sess with sess.as_default(): sess.run(init) losses = [] tasklosses = [] ents = [] stds = [] n_batches = int(x.shape[0]/float(batch_size)) for epoch in range(epochs): rand_idxs = np.arange(x.shape[0]) np.random.shuffle(rand_idxs) loss = 0 task_loss = 0 ent = 0 std = 0 for batch in range(n_batches): tau = schedule(epoch*n_batches + batch) mb_idx = rand_idxs[batch*batch_size:(batch+1)*batch_size] x_mb = x[mb_idx] y_mb = y[mb_idx] g = np.random.gumbel(size=(len(x_mb), self.k, 2)) _, loss_curr, taskloss_curr, ent_curr, std_curr = sess.run([solver, self.Loss, self.taskLoss, self.Entropy, self.std], feed_dict = {self.X:x_mb, self.Y:y_mb, self.g: g, self.tau: tau}) loss += loss_curr/n_batches task_loss += taskloss_curr/n_batches ent += ent_curr/n_batches std += np.mean(np.abs(std_curr))/n_batches losses.append(loss) tasklosses.append(task_loss) ents.append(ent) stds.append(std) print('Final task loss: %f' %(tasklosses[-1])) plt.figure() plt.plot(losses) plt.title('total loss') plt.figure() plt.plot(tasklosses) plt.title('task loss') plt.figure() plt.plot(np.array(ents) * 1.442695) #in bits plt.title('ent') plt.figure() plt.plot(stds) plt.title('std')
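# The `max_entropy` switch above is a common stop_gradient idiom: the entropy term keeps
# contributing to the *value* of the loss (so it is still computed and can be logged), but
# tf.stop_gradient removes it from the *gradient*.  A minimal sketch of the idiom, assuming
# only `tensorflow` imported as `tf`; the tensor names are illustrative:
import tensorflow as tf

def combined_loss(task_loss, entropy, max_entropy=True):
    # Forward value is identical in both branches; only backpropagation differs.
    ent_term = entropy if max_entropy else tf.stop_gradient(entropy)
    return task_loss - ent_term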
def layers(vgg_layer3_out, vgg_layer4_out, vgg_layer7_out, num_classes):
    """
    Create the layers for a fully convolutional network.  Build skip-layers using the vgg layers.
    :param vgg_layer3_out: TF Tensor for VGG Layer 3 output
    :param vgg_layer4_out: TF Tensor for VGG Layer 4 output
    :param vgg_layer7_out: TF Tensor for VGG Layer 7 output
    :param num_classes: Number of classes to classify
    :return: The Tensor for the last layer of output
    """
    # Freeze the pretrained VGG encoder: tf.stop_gradient returns a new tensor,
    # so it must be applied to the inputs before they are used, not afterwards.
    vgg_layer3_out = tf.stop_gradient(vgg_layer3_out)
    vgg_layer4_out = tf.stop_gradient(vgg_layer4_out)
    vgg_layer7_out = tf.stop_gradient(vgg_layer7_out)

    # 1x1 convolution layer for feature extraction
    tf_conv1x1 = tf.layers.conv2d(
        vgg_layer7_out, num_classes, 1, 1,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    # conv2d-transpose for 2x upsampling
    tf_2x = tf.layers.conv2d_transpose(
        tf_conv1x1, num_classes, 4, 2, padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    # combine with pooling layer 4
    tf_skip1 = tf.add(
        tf_2x,
        tf.layers.conv2d(
            vgg_layer4_out, num_classes, 1, 1,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)))

    # perform conv2d-transpose for 2x upsampling again.
    # output: 4x features + 2x pool4
    tf_4x = tf.layers.conv2d_transpose(
        tf_skip1, num_classes, 4, 2, padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    # combine with pooling layer 3
    tf_skip2 = tf.add(
        tf_4x,
        tf.layers.conv2d(
            vgg_layer3_out, num_classes, 1, 1,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)))

    # perform conv2d-transpose for 8x upsampling.
    # output: 8x features + 4x pool4 + 2x pool3
    tf_final = tf.layers.conv2d_transpose(
        tf_skip2, num_classes, 16, 8, padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    return tf_final
def setup_model(self): with SetVerbosity(self.verbose): self.graph = tf.Graph() with self.graph.as_default(): self.set_random_seed(self.seed) self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph) self.replay_buffer = ReplayBuffer(self.buffer_size) self.full_buffers = [FullBuffer(self.full_size, self.env.observation_space.shape[0], self.env.action_space.shape[0]) for _ in range(len(self.env.unwrapped.tasks))] self.env.task_idx = 0 with tf.variable_scope("input", reuse=False): # Create policy and target TF objects self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) self.target_policy = self.policy(self.sess, self.observation_space, self.action_space, **self.policy_kwargs) # Initialize Placeholders self.observations_ph = self.policy_tf.obs_ph # Normalized observation for pixels self.processed_obs_ph = self.policy_tf.processed_obs self.next_observations_ph = self.target_policy.obs_ph self.processed_next_obs_ph = self.target_policy.processed_obs self.action_target = self.target_policy.action_ph self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals') self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards') self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape, name='actions') self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph") with tf.variable_scope("model", reuse=False): # Create the policy # first return value corresponds to deterministic actions # policy_out corresponds to stochastic actions, used for training # logp_pi is the log probability of actions taken by the policy self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph) # Monitor the entropy of the policy, # this is not used for training self.entropy = tf.reduce_mean(self.policy_tf.entropy) # Use two Q-functions to improve performance by reducing overestimation bias. 
qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph, create_qf=True, create_vf=True) qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph, policy_out, create_qf=True, create_vf=False, reuse=True) # Target entropy is used when learning the entropy coefficient if self.target_entropy == 'auto': # automatically set target entropy if needed self.target_entropy = -np.prod(self.env.action_space.shape).astype(np.float32) else: # Force conversion # this will also throw an error for unexpected string self.target_entropy = float(self.target_entropy) # The entropy coefficient or entropy can be learned automatically # see Automating Entropy Adjustment for Maximum Entropy RL section # of https://arxiv.org/abs/1812.05905 if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'): # Default initial value of ent_coef when learned init_value = 1.0 if '_' in self.ent_coef: init_value = float(self.ent_coef.split('_')[1]) assert init_value > 0., "The initial value of ent_coef must be greater than 0" self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32)) self.ent_coef = tf.exp(self.log_ent_coef) else: # Force conversion to float # this will throw an error if a malformed string (different from 'auto') # is passed self.ent_coef = float(self.ent_coef) with tf.variable_scope("target", reuse=False): # Create the value network _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph, create_qf=False, create_vf=True) self.value_target = value_target with tf.variable_scope("loss", reuse=False): # Take the min of the two Q-Values (Double-Q Learning) min_qf_pi = tf.minimum(qf1_pi, qf2_pi) # Target for Q value regression q_backup = tf.stop_gradient( self.rewards_ph + (1 - self.terminals_ph) * self.gamma * self.value_target ) # Compute Q-Function loss # TODO: test with huber loss (it would avoid too high values) qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2) qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2) # Compute the entropy temperature loss # it is used when the entropy coefficient is learned ent_coef_loss, entropy_optimizer = None, None if not isinstance(self.ent_coef, float): ent_coef_loss = -tf.reduce_mean( self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy)) entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) # Compute the policy loss # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi) policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi) # NOTE: in the original implementation, they have an additional # regularization loss for the Gaussian parameters # this is not used for now # policy_loss = (policy_kl_loss + policy_regularization_loss) policy_loss = policy_kl_loss # Target for value fn regression # We update the vf towards the min of two Q-functions in order to # reduce overestimation bias from function approximation error. 
v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi) value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2) values_losses = qf1_loss + qf2_loss + value_loss # Policy train op # (has to be separate from value train op, because min_qf_pi appears in policy_loss) policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) policy_train_op = policy_optimizer.minimize(policy_loss, var_list=get_vars('model/pi')) # Value train op value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph) values_params = get_vars('model/values_fn') source_params = get_vars("model/values_fn/vf") target_params = get_vars("target/values_fn/vf") # Polyak averaging for target variables self.target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Initializing target to match source variables target_init_op = [ tf.assign(target, source) for target, source in zip(target_params, source_params) ] # Control flow is used because sess.run otherwise evaluates in nondeterministic order # and we first need to compute the policy action before computing q values losses with tf.control_dependencies([policy_train_op]): train_values_op = value_optimizer.minimize(values_losses, var_list=values_params) self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy'] # All ops to call during one training step self.step_ops = [policy_loss, qf1_loss, qf2_loss, value_loss, qf1, qf2, value_fn, logp_pi, self.entropy, policy_train_op, train_values_op] # Add entropy coefficient optimization operation if needed if ent_coef_loss is not None: with tf.control_dependencies([train_values_op]): ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef) self.infos_names += ['ent_coef_loss', 'ent_coef'] self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef] # Monitor losses and entropy in tensorboard tf.summary.scalar('policy_loss', policy_loss) tf.summary.scalar('qf1_loss', qf1_loss) tf.summary.scalar('qf2_loss', qf2_loss) tf.summary.scalar('value_loss', value_loss) tf.summary.scalar('entropy', self.entropy) if ent_coef_loss is not None: tf.summary.scalar('ent_coef_loss', ent_coef_loss) tf.summary.scalar('ent_coef', self.ent_coef) tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph)) # Retrieve parameters that must be saved self.params = get_vars("model") self.target_params = get_vars("target/values_fn/vf") # Initialize Variables and target network with self.sess.as_default(): self.sess.run(tf.global_variables_initializer()) self.sess.run(target_init_op) self.summary = tf.summary.merge_all()
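# The three tf.stop_gradient calls in setup_model above all mark a quantity as a fixed
# regression target or signal, so that only the intended parameters receive gradients.
# A condensed sketch using the same tensor names (self. prefixes dropped):
import tensorflow.compat.v1 as tf

def sac_stop_gradient_targets(rewards, terminals, gamma, value_target,
                              qf1_pi, qf2_pi, logp_pi,
                              ent_coef, log_ent_coef, target_entropy):
    # Q target: no gradient flows into the target value network.
    q_backup = tf.stop_gradient(rewards + (1.0 - terminals) * gamma * value_target)
    # V target: no gradient flows into the Q networks or the policy through this term.
    v_backup = tf.stop_gradient(tf.minimum(qf1_pi, qf2_pi) - ent_coef * logp_pi)
    # Temperature loss: only log_ent_coef is trained; logp_pi is treated as a constant.
    ent_coef_loss = -tf.reduce_mean(
        log_ent_coef * tf.stop_gradient(logp_pi + target_entropy))
    return q_backup, v_backup, ent_coef_loss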
def build_loss(self, predictions, examples, **kwargs): """Build tf graph to compute loss. Args: predictions: dict of prediction results keyed by name. examples: dict of inputs keyed by name. Returns: loss_dict: dict of loss tensors keyed by name. """ options = self._model_proto loss_dict = {} with tf.name_scope('losses'): # Loss of the MIDN module. labels = self._label_extractor.extract_labels(examples) losses = tf.nn.sigmoid_cross_entropy_with_logits( labels=labels, logits=predictions[Cap2DetPredictions.midn_class_logits]) loss_dict['midn_cross_entropy_loss'] = tf.multiply( tf.reduce_mean(losses), options.midn_loss_weight) # Losses of the OICR module. (num_proposals, proposals) = (predictions[DetectionResultFields.num_proposals], predictions[DetectionResultFields.proposal_boxes]) batch, max_num_proposals, _ = utils.get_tensor_shape(proposals) proposal_scores_0 = predictions[ Cap2DetPredictions.oicr_proposal_scores + '_at_0'] if options.oicr_use_proba_r_given_c: proposal_scores_0 = predictions[ Cap2DetPredictions.midn_proba_r_given_c] proposal_scores_0 = tf.concat([ tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0 ], axis=-1) for i in range(options.oicr_iterations): proposal_scores_1 = predictions[ Cap2DetPredictions.oicr_proposal_scores + '_at_{}'.format(i + 1)] oicr_cross_entropy_loss_at_i = model_utils.calc_oicr_loss( labels, num_proposals, proposals, tf.stop_gradient(proposal_scores_0), proposal_scores_1, scope='oicr_{}'.format(i + 1), iou_threshold=options.oicr_iou_threshold) loss_dict['oicr_cross_entropy_loss_at_{}'.format( i + 1)] = tf.multiply(oicr_cross_entropy_loss_at_i, options.oicr_loss_weight) proposal_scores_0 = tf.nn.softmax(proposal_scores_1, axis=-1) return loss_dict
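# In the OICR loop above, the scores of refinement step k are wrapped in tf.stop_gradient
# before supervising step k+1, i.e. they act as fixed pseudo labels while only the next
# head receives gradients.  A bare sketch of that self-training pattern (the cross-entropy
# here is only a stand-in for model_utils.calc_oicr_loss):
import tensorflow as tf

def refinement_loss(prev_scores, next_logits):
    # Teacher: previous-iteration scores, held constant for the backward pass.
    pseudo_labels = tf.stop_gradient(tf.nn.softmax(prev_scores, axis=-1))
    # Student: current-iteration logits, trained toward the pseudo labels.
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=pseudo_labels,
                                                logits=next_logits))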
def forward(self, Ts, images, depths, intrinsics, inds=None, num_fixed=0, init=tf.constant(False)): # motion network performs projection operations in features space cfg = self.cfg batch = tf.shape(images)[0] num = tf.shape(images)[1] if cfg.RESCALE_IMAGES: images = 2 * (images / 255.0) - 1.0 if inds is None: if self.mode == 'keyframe': self.inds = self._keyframe_pairs_indicies(num) num_fixed = 1 elif self.mode == 'global': self.inds = self._all_pairs_indicies(num) else: self.inds = inds (ii, jj) = self.inds intrinsics = intrinsics_vec_to_matrix(intrinsics) # if self.is_training and (not self.is_calibrated): # perturbation = 0.1 * tf.random.normal([batch, 1]) # intrinsics = update_intrinsics(intrinsics, perturbation) depths_low, intrinsics = rescale_depths_and_intrinsics(depths, intrinsics, downscale=4) with tf.variable_scope("motion", reuse=self.reuse) as sc: if Ts is None: Ts = self.pose_regressor_init(images) else: if self.use_regressor: Gs = self.pose_regressor_init(images) Ts = cond_transform(init, Gs, Ts) feats = self.extract_features(images) depths = tf.gather(depths_low, ii, axis=1) + EPS feats1 = tf.gather(feats, ii, axis=1) feats2 = tf.gather(feats, jj, axis=1) Ti = Ts.gather(ii) Tj = Ts.gather(jj) Tij = Tj * Ti.inv() for i in range(cfg.FLOWSE3.ITER_COUNT): Tij = Tij.copy(stop_gradients=True) Ts = Ts.copy(stop_gradients=True) intrinsics = tf.stop_gradient(intrinsics) coords, vmask = Tij.transform(depths, intrinsics, valid_mask=True) featsw = vmask * bilinear_sampler(feats2, coords, batch_dims=2) with tf.name_scope("residual"): flow, weight = self.flownet(feats1, featsw, reuse=i > 0) self.weights_history.append(weight) target = flow + coords weight = vmask * tf.nn.sigmoid(weight) with tf.name_scope("PnP"): if (self.mode == 'keyframe') and self.is_calibrated: Tij = Tij.keyframe_optim(target, weight, depths, intrinsics) Ts = Tij.append_identity( ) # set keyframe pose to identity else: Ts, intrinsics = Ts.global_optim( target, weight, depths, intrinsics, (jj, ii), num_fixed=num_fixed, include_intrinsics=(not self.is_calibrated)) Tij = Ts.gather(jj) * Ts.gather( ii).inv() # relative poses coords, vmask1 = Tij.transform(depths, intrinsics, valid_mask=True) self.transform_history.append(Ts) self.residual_history.append(vmask * vmask1 * (coords - target)) self.intrinsics_history.append( intrinsics_matrix_to_vec(intrinsics)) intrinsics = 4.0 * intrinsics_matrix_to_vec(intrinsics) return Ts, intrinsics
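# The refinement loop above detaches Tij, Ts and the intrinsics at the top of every
# FLOWSE3 iteration (.copy(stop_gradients=True) / tf.stop_gradient), which truncates
# backpropagation so each unrolled step is trained from its own residual rather than
# through the whole chain of earlier updates.  The generic pattern on a plain tensor:
import tensorflow as tf

def unrolled_refinement(x0, step_fn, n_iters, truncate=True):
    """Apply step_fn n_iters times, optionally blocking gradients between iterations."""
    x = x0
    for _ in range(n_iters):
        if truncate:
            x = tf.stop_gradient(x)  # the step sees the previous estimate as a constant
        x = step_fn(x)
    return x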
def train_step(): experience, _ = next(iterator) prior = predictor_net( (experience.observation[:, 0], experience.action[:, 0]), training=False) z_next = encoder_net(experience.observation[:, 1], training=False) # predictor_kl is a vector of size batch_size. predictor_kl = tfp.distributions.kl_divergence(z_next, prior) with tf.GradientTape() as tape: tape.watch(actor_net._log_kl_coefficient) # pylint: disable=protected-access dual_loss = -1.0 * actor_net._log_kl_coefficient * ( # pylint: disable=protected-access tf.stop_gradient(tf.reduce_mean(predictor_kl)) - kl_constraint) dual_grads = tape.gradient(dual_loss, [actor_net._log_kl_coefficient]) # pylint: disable=protected-access grads_and_vars = list( zip(dual_grads, [actor_net._log_kl_coefficient])) # pylint: disable=protected-access dual_optimizer.apply_gradients(grads_and_vars) # Clip the dual variable so exp(log_kl_coef) <= 1e6. log_kl_coef = tf.clip_by_value( actor_net._log_kl_coefficient, # pylint: disable=protected-access -1.0 * np.log(1e6), np.log(1e6)) actor_net._log_kl_coefficient.assign(log_kl_coef) # pylint: disable=protected-access with tf.name_scope('dual_loss'): tf.compat.v2.summary.scalar(name='dual_loss', data=tf.reduce_mean(dual_loss), step=global_step) tf.compat.v2.summary.scalar( name='log_kl_coefficient', data=actor_net._log_kl_coefficient, # pylint: disable=protected-access step=global_step) z_entropy = z_next.entropy() log_prob = prior.log_prob(z_next.sample()) with tf.name_scope('rp-metrics'): common.generate_tensor_summaries('predictor_kl', predictor_kl, global_step) common.generate_tensor_summaries('z_entropy', z_entropy, global_step) common.generate_tensor_summaries('log_prob', log_prob, global_step) common.generate_tensor_summaries('z_mean', z_next.mean(), global_step) common.generate_tensor_summaries('z_stddev', z_next.stddev(), global_step) common.generate_tensor_summaries('prior_mean', prior.mean(), global_step) common.generate_tensor_summaries('prior_stddev', prior.stddev(), global_step) if log_prob_reward_scale == 'auto': coef = tf.stop_gradient(tf.exp(actor_net._log_kl_coefficient)) # pylint: disable=protected-access else: coef = log_prob_reward_scale tf.debugging.check_numerics(tf.reduce_mean(predictor_kl), 'predictor_kl is inf or nan.') tf.debugging.check_numerics(coef, 'coef is inf or nan.') new_reward = experience.reward - coef * predictor_kl[:, None] experience = experience._replace(reward=new_reward) return tf_agent.train(experience)
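# The dual update above is Lagrangian dual ascent: the constraint statistic (the mean
# predictor KL) is wrapped in tf.stop_gradient so the dual loss can only move the
# multiplier, never the networks that produced the KL.  A minimal TF2 sketch with
# illustrative names:
import tensorflow as tf

log_coef = tf.Variable(0.0)                 # log of the dual variable
dual_opt = tf.keras.optimizers.Adam(1e-3)

def dual_update(constraint_value, constraint_target):
    with tf.GradientTape() as tape:
        # Decreases the multiplier while the constraint is satisfied, raises it otherwise.
        dual_loss = -log_coef * (tf.stop_gradient(constraint_value) - constraint_target)
    grads = tape.gradient(dual_loss, [log_coef])
    dual_opt.apply_gradients(zip(grads, [log_coef]))
    return dual_loss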
def loss(self, predictions, policy, cfv):
    # Policy implied by the current advantage/regret predictions (regret matching).
    pi = cpea.rm_policy(predictions)
    # Instantaneous regret of each action against the counterfactual values.
    inst_r = cfv - cpea.utility(pi, cfv)
    # Regression target, held fixed via stop_gradient: regret clipped from below.
    inst_q = tf.stop_gradient(tf.maximum(inst_r, -tf.nn.relu(predictions)))
    # Mean squared error between the predictions and the fixed target.
    return tf.reduce_mean(
        tf.reduce_sum(tf.square(predictions - inst_q), axis=1)) / 2.0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer_f,
                grad_norm_clipping=None,
                gamma=1.0,
                scope="setdeepq",
                reuse=None,
                test_eps=0.05,
                lr_init=0.001,
                lr_period_steps=250000,
                tau=0.05):
    """Creates the train function.

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer_f: float -> tf.train.Optimizer
        a function that, given a learning rate, returns the optimizer to use for the
        Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    test_eps: float
        epsilon used by the greedy (evaluation) action function.
    lr_init: float
        initial learning rate
    lr_period_steps: int
        learning rate schedule following a cosine with this period
    tau: float
        parameter for the soft target network update. tau <= 1.0 and 1.0 for the hard update.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
""" # Build action graphs act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) act_greedy = build_act_greedy(make_obs_ph, q_func, num_actions, scope=scope, reuse=True, eps=test_eps) with tf.compat.v1.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None], name="weight") iteration = tf.compat.v1.placeholder(tf.float32, name="iteration") # Cosine learning rate adjustment lr = tf.Variable(float(lr_init), trainable=False, dtype=tf.float32, name='lr') lr = tf.clip_by_value( 0.0005 * tf.math.cos(math.pi * iteration / lr_period_steps) + 0.000501, 1e-6, 1e-3) optimizer = optimizer_f(learning_rate=lr) # q network evaluation q1_t = q_func.forward(obs_t_input.get(), num_actions, scope="q1_func", reuse=True) # reuse q1 parameters from act q1_func_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q1_func") q2_t = q_func.forward(obs_t_input.get(), num_actions, scope="q2_func", reuse=True) # reuse q2 parameters from act q2_func_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/q2_func") # target q network evalution q1_tp1 = q_func.forward(obs_tp1_input.get(), num_actions, scope="target_q1_func", reuse=False) target_q1_func_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q1_func") q2_tp1 = q_func.forward(obs_tp1_input.get(), num_actions, scope="target_q2_func", reuse=False) target_q2_func_vars = tf.compat.v1.get_collection( tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES, scope=tf.compat.v1.get_variable_scope().name + "/target_q2_func") # q scores for actions which we know were selected in the given state. q1_t_selected = tf.reduce_sum(input_tensor=q1_t * tf.one_hot(act_t_ph, num_actions), axis=1) q2_t_selected = tf.reduce_sum(input_tensor=q2_t * tf.one_hot(act_t_ph, num_actions), axis=1) # Actions selected with current q funcs at state t+1. 
q1_tp1_using_online_net = q_func.forward(obs_tp1_input.get(), num_actions, scope="q1_func", reuse=True) q2_tp1_using_online_net = q_func.forward(obs_tp1_input.get(), num_actions, scope="q2_func", reuse=True) tp1_best_action_using_online_net = tf.argmax( input=q1_tp1_using_online_net + q2_tp1_using_online_net, axis=1) # Using action at t+1 find target value associated with the action q1_tp1_selected = tf.reduce_sum( input_tensor=q1_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1) q2_tp1_selected = tf.reduce_sum( input_tensor=q2_tp1 * tf.one_hot(tp1_best_action_using_online_net, num_actions), axis=1) # Min of target q values to be used bellman equation q_tp1_best = tf.minimum(q1_tp1_selected, q2_tp1_selected) # compute RHS of bellman equation q_tp1_selected_target = rew_t_ph + gamma * q_tp1_best # compute the error (potentially clipped) td_error1 = q1_t_selected - tf.stop_gradient(q_tp1_selected_target) td_error2 = q2_t_selected - tf.stop_gradient(q_tp1_selected_target) errors1 = U.huber_loss(td_error1) errors2 = U.huber_loss(td_error2) errors = errors1 + errors2 weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph * errors) #Print total number of params total_parameters = 0 for variable in tf.compat.v1.trainable_variables(): # shape is an array of tf.Dimension shape = variable.get_shape() variable_parameters = 1 for dim in shape: variable_parameters *= dim.value # print("var params", variable_parameters) total_parameters += variable_parameters print( "===============================================================") print("Total number of trainable params:", total_parameters) print( "===============================================================") # Log for tensorboard tf.summary.scalar('q1_values', tf.math.reduce_mean(q1_t)) tf.summary.scalar('q2_values', tf.math.reduce_mean(q2_t)) tf.summary.scalar('td_1', tf.math.reduce_mean(td_error1)) tf.summary.scalar('td_2', tf.math.reduce_mean(td_error2)) tf.summary.scalar('weighted_loss', weighted_error) tf.summary.scalar('lr_schedule', lr) tf.summary.scalar('td_MSE_1', tf.math.reduce_mean(tf.math.square(td_error1))) tf.summary.scalar('td_MSE_2', tf.math.reduce_mean(tf.math.square(td_error2))) # combine variable scopes q_func_vars = q1_func_vars + q2_func_vars # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called every step to copy Q network to target Q network # target network is updated with polyak averaging update_target_expr1 = [] for var, var_target in zip( sorted(q1_func_vars, key=lambda v: v.name), sorted(target_q1_func_vars, key=lambda v: v.name)): update_target_expr1.append( var_target.assign(tau * var + (1 - tau) * var_target)) update_target_expr1 = tf.group(*update_target_expr1) update_target_expr2 = [] for var, var_target in zip( sorted(q2_func_vars, key=lambda v: v.name), sorted(target_q2_func_vars, key=lambda v: v.name)): update_target_expr2.append( var_target.assign(tau * var + (1 - tau) * var_target)) update_target_expr2 = tf.group(*update_target_expr2) merged_summary = tf.compat.v1.summary.merge_all( scope=tf.compat.v1.get_variable_scope().name) # Create callable functions train = 
U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph, iteration ], outputs=[ td_error1, td_error2, tf.reduce_mean(input_tensor=errors), merged_summary ], updates=[optimize_expr, lr]) update_target = U.function( [], [], updates=[update_target_expr1, update_target_expr2]) q_values = U.function(inputs=[obs_t_input], outputs=[q1_t, q2_t]) return act_f, act_greedy, q_values, train, update_target, { 'q_values': q_values }
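# The TD errors above subtract a stop_gradient'ed Bellman target, so the Huber loss trains
# only the online Q networks and never backpropagates through the target computation (the
# minimum of the two target networks, evaluated at the action picked by the online networks).
# The core of that target in isolation -- as in the snippet above, no terminal mask is applied:
import tensorflow.compat.v1 as tf

def clipped_double_q_td_error(q_online_selected, q1_target_next, q2_target_next,
                              rewards, gamma):
    q_next = tf.minimum(q1_target_next, q2_target_next)   # pessimistic bootstrap value
    target = tf.stop_gradient(rewards + gamma * q_next)   # fixed regression target
    return q_online_selected - target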
def build_learner(pre, post, act_space, num_frames): global_step = tf.train.get_or_create_global_step() init_lr = FLAGS.init_lr decay = FLAGS.lr_decay warmup_steps = FLAGS.warmup_steps use_rmc = FLAGS.use_rmc use_hrmc = FLAGS.use_hrmc use_icm = FLAGS.use_icm use_coex = FLAGS.use_coex use_reward_prediction = FLAGS.use_reward_prediction use_pixel_control = FLAGS.use_pixel_control pq_kl_coef = FLAGS.pq_kl_coef p_kl_coef = FLAGS.p_kl_coef global_step_float = tf.cast(global_step, tf.float32) lr = tf.train.polynomial_decay( init_lr, global_step, FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen), init_lr / 10.) is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32) lr = is_warmup * global_step_float / warmup_steps * init_lr + ( 1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay) optimizer = tf.train.AdamOptimizer(lr) ent_coef = tf.train.polynomial_decay( FLAGS.ent_coef, global_step, FLAGS.total_environment_frames * 2 // (FLAGS.batch_size * FLAGS.seqlen), FLAGS.ent_coef / 10.) if FLAGS.zero_init: pre["state_in"] = tf.zeros_like(pre["state_in"]) if use_hrmc: rnn = TmpHierRMCRNN(4, 64, 4, 4, 4, return_sequences=True, return_state=True, name="hrmcrnn") elif use_rmc: rnn = RMCRNN(64, 4, 4, return_sequences=True, return_state=True, name="rmcrnn") else: rnn = tf.compat.v1.keras.layers.LSTM(256, return_sequences=True, return_state=True, name="lstm") pre_model = Model(act_space, rnn, use_rmc, use_hrmc, use_reward_prediction, use_pixel_control, "agent", **pre) post["state_in"] = tf.stop_gradient(pre_model.state_out) post_model = Model(act_space, rnn, use_rmc, use_hrmc, use_reward_prediction, use_pixel_control, "agent", **post) tf.summary.scalar("adv_mean", post_model.adv_mean) tf.summary.scalar("adv_std", post_model.adv_std) losses = dPPOcC(act=post_model.a_t, policy_logits=post_model.current_act_logits, old_policy_logits=post_model.old_act_logits, advantage=post_model.advantage, policy_clip=FLAGS.ppo_clip, vf=post_model.current_value, vf_target=post_model.ret, value_clip=FLAGS.vf_clip, old_vf=post_model.old_current_value) entropy_loss = tf.reduce_mean( entropy(post_model.current_act_logits) * post_model.slots) p_loss = tf.reduce_mean(losses.p_loss * post_model.slots) v_loss = tf.reduce_mean(losses.v_loss * post_model.slots) add_loss = 0.0 if use_icm: icmloss = icm(post_model.cnn_feature[:, :-1, :], post_model.cnn_feature[:, 1:, :], post_model.a_t[:, :-1], act_space) add_loss += 0.2 * tf.reduce_mean( icmloss.f_loss * post_model.slots[:, :-1]) + 0.8 * tf.reduce_mean( icmloss.i_loss * post_model.slots[:, :-1]) if use_coex: coexloss = coex(post_model.image_feature[:, :-1, :, :, :], post_model.image_feature[:, 1:, :, :, :], post_model.a_t[:, :-1], act_space) add_loss += tf.reduce_mean(coexloss * post_model.slots[:, :-1]) if use_hrmc: pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas, post_model.p_mus, post_model.p_sigmas) pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.slots) tf.summary.scalar("kl_div", pq_kl_loss) add_loss += pq_kl_coef * pq_kl_loss p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas, tf.zeros_like(post_model.p_mus), 0.01 * tf.ones_like(post_model.p_sigmas)) p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.slots) tf.summary.scalar("kl_div_prior", p_kl_loss) add_loss += p_kl_coef * p_kl_loss if use_reward_prediction: r_loss = tf.reduce_mean( mse(post_model.reward_prediction, post_model.r_t) * post_model.slots) tf.summary.scalar("r_loss", r_loss) add_loss += r_loss if use_pixel_control: rec_loss = tf.reduce_mean( 
mse(post_model.pixel_control, post_model.s_t) * post_model.slots[:, :, None, None, None]) tf.summary.scalar("rec_loss", rec_loss) add_loss += rec_loss loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss - ent_coef * entropy_loss + add_loss) train_op = miniOp(optimizer, loss, FLAGS.grad_clip) new_frames = tf.reduce_sum(post["slots"]) with tf.control_dependencies([train_op]): num_frames_and_train = tf.assign_add(num_frames, new_frames) global_step_and_train = tf.assign_add(global_step, 1) tf.summary.scalar("learning_rate", lr) tf.summary.scalar("ent_coef", ent_coef) tf.summary.scalar("ent_loss", entropy_loss) tf.summary.scalar("p_loss", p_loss) tf.summary.scalar("v_loss", v_loss) tf.summary.scalar("all_loss", loss) return num_frames_and_train, global_step_and_train
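# `post["state_in"] = tf.stop_gradient(pre_model.state_out)` above resembles the burn-in
# used in recurrent RL: the "pre" segment only produces an initial recurrent state, and
# gradients from the trained "post" segment are not allowed to flow back into it.
# The pattern with a plain Keras LSTM and illustrative shapes:
import tensorflow as tf

lstm = tf.keras.layers.LSTM(64, return_sequences=True, return_state=True)
burn_in = tf.random.normal([8, 10, 32])     # [batch, burn-in steps, features]
train_seq = tf.random.normal([8, 20, 32])   # [batch, training steps, features]

_, h, c = lstm(burn_in)                                    # burn-in pass builds the state
state_in = [tf.stop_gradient(h), tf.stop_gradient(c)]      # detach it
train_out, _, _ = lstm(train_seq, initial_state=state_in)  # gradients stay in this segment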
def create_variables(self):
    # create the target network T by copying the original network N
    self.target_q_network = self.q_network.copy(scope="target_network")

    # computing the control action
    # FOR REGULAR ACTION SCORE COMPUTATION
    with tf.name_scope("taking_action"):
        # input: the state vector
        self.observation = tf.placeholder(tf.float32,
                                          (None, self.observation_size),
                                          name="observation")
        # compute the utility score (Q-value) of each action
        self.action_scores = tf.identity(self.q_network(self.observation),
                                         name="action_scores")
        tf.histogram_summary("action_scores", self.action_scores)
        # take the action with the highest score
        self.predicted_actions = tf.argmax(self.action_scores,
                                           dimension=1,
                                           name="predicted_actions")

    # computing the future utility
    with tf.name_scope("estimating_future_rewards"):
        # FOR PREDICTING TARGET FUTURE REWARDS
        # input: the next states
        self.next_observation = tf.placeholder(
            tf.float32, (None, self.observation_size),
            name="next_observation")
        # input: masks for the next states
        self.next_observation_mask = tf.placeholder(
            tf.float32, (None, ), name="next_observation_mask")
        # utility (Q-value) estimates
        self.next_action_scores = tf.stop_gradient(
            self.target_q_network(self.next_observation))
        tf.histogram_summary("target_action_scores",
                             self.next_action_scores)
        # input: rewards
        self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")
        # take the maximum estimated action values
        target_values = tf.identity(
            tf.reduce_max(self.next_action_scores,
                          reduction_indices=[1, ]) *
            self.next_observation_mask,
            name="target_values")
        # r + DF * MAX(Q,s), see the Wikipedia article on Q-learning
        #self.future_rewards = self.rewards + self.discount_rate * target_values
        self.future_rewards = tf.identity(
            self.rewards + self.discount_rate * target_values,
            name="future_rewards")

    # training network N
    with tf.name_scope("q_value_prediction"):
        # FOR PREDICTION ERROR
        # input: action masks over the training examples
        self.action_mask = tf.placeholder(tf.float32,
                                          (None, self.num_actions),
                                          name="action_mask")
        # compute the action values of the training examples
        self.masked_action_scores = tf.reduce_sum(
            self.action_scores * self.action_mask,
            reduction_indices=[1, ],
            name="masked_action_scores")
        # difference between the current values and the future ones
        # - (r + DF * MAX(Q,s) - Q[s',a'])
        #temp_diff = self.masked_action_scores - self.future_rewards
        temp_diff = tf.identity(self.masked_action_scores -
                                self.future_rewards,
                                name="temp_diff")
        # the key step of training the network:
        # RMSProp minimizes the mean of the differences above
        self.prediction_error = tf.reduce_mean(tf.square(temp_diff),
                                               name="prediction_error")
        # RMSProp, step one: compute the gradients
        gradients = self.optimizer.compute_gradients(self.prediction_error)
        #def get_zero(): return tf.constant(0.0)
        #def get_perror(): return self.prediction_error
        #gradients = self.optimizer.compute_gradients(tf.cond(tf.is_nan(self.prediction_error), get_zero, get_perror))
        for i, (grad, var) in enumerate(gradients):
            if grad is not None:
                gradients[i] = (tf.clip_by_norm(grad, 5), var)
        # Add histograms for gradients.
        for grad, var in gradients:
            tf.histogram_summary(var.name, var)
            if grad is not None:
                tf.histogram_summary(var.name + '/gradients', grad)
        # step two: optimize the network parameters
        self.train_op = self.optimizer.apply_gradients(gradients,
                                                       name="train_op")

    # this is where the target network T is adjusted:
    # T = (1-alpha)*T + alpha*N
    # UPDATE TARGET NETWORK
    with tf.name_scope("target_network_update"):
        self.target_network_update = []
        for v_source, v_target in zip(self.q_network.variables(),
                                      self.target_q_network.variables()):
            # this is equivalent to target = (1-alpha) * target + alpha * source
            update_op = v_target.assign_sub(self.target_network_update_rate *
                                            (v_target - v_source))
            self.target_network_update.append(update_op)
        self.target_network_update = tf.group(*self.target_network_update,
                                              name="target_network_update")

    # summaries
    tf.scalar_summary("prediction_error", self.prediction_error)
    self.summarize = tf.merge_all_summaries()
    self.no_op1 = tf.no_op()
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.compat.v1.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.compat.v1.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.compat.v1.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.compat.v1.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? 
choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
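# The bc_loss/q_filter branch above combines two masks: `mask` selects the demonstration
# part of the batch, and `maskMain` keeps only those demos where the critic scores the
# demonstrated action above the current policy's action.  Reduced to its core, with
# hypothetical stand-ins for the tensors above (demo_mask is boolean):
import tensorflow as tf

def q_filtered_bc_loss(pi_actions, demo_actions, q_demo, q_pi, demo_mask):
    demo_pi = tf.boolean_mask(pi_actions, demo_mask)        # policy actions on demo samples
    demo_act = tf.boolean_mask(demo_actions, demo_mask)     # demonstrated actions
    q_better = tf.boolean_mask(q_demo > q_pi, demo_mask)    # Q-filter on the demo samples
    diff = tf.boolean_mask(demo_pi, q_better) - tf.boolean_mask(demo_act, q_better)
    return tf.reduce_sum(tf.square(diff))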
def train_eval( root_dir, env_name='HalfCheetah-v2', num_iterations=3000000, actor_fc_layers=(), critic_obs_fc_layers=None, critic_action_fc_layers=None, critic_joint_fc_layers=(256, 256), initial_collect_steps=10000, collect_steps_per_iteration=1, replay_buffer_capacity=1000000, # Params for target update target_update_tau=0.005, target_update_period=1, # Params for train train_steps_per_iteration=1, batch_size=256, actor_learning_rate=3e-4, critic_learning_rate=3e-4, alpha_learning_rate=3e-4, dual_learning_rate=3e-4, td_errors_loss_fn=tf.math.squared_difference, gamma=0.99, reward_scale_factor=0.1, gradient_clipping=None, use_tf_functions=True, # Params for eval num_eval_episodes=30, eval_interval=10000, # Params for summaries and logging train_checkpoint_interval=50000, policy_checkpoint_interval=50000, rb_checkpoint_interval=50000, log_interval=1000, summary_interval=1000, summaries_flush_secs=10, debug_summaries=False, summarize_grads_and_vars=False, eval_metrics_callback=None, latent_dim=10, log_prob_reward_scale=0.0, predictor_updates_encoder=False, predict_prior=True, use_recurrent_actor=False, rnn_sequence_length=20, clip_max_stddev=10.0, clip_min_stddev=0.1, clip_mean=30.0, predictor_num_layers=2, use_identity_encoder=False, identity_encoder_single_stddev=False, kl_constraint=1.0, eval_dropout=(), use_residual_predictor=True, gym_kwargs=None, predict_prior_std=True, random_seed=0, ): """A simple train and eval for SAC.""" np.random.seed(random_seed) tf.random.set_seed(random_seed) if use_recurrent_actor: batch_size = batch_size // rnn_sequence_length root_dir = os.path.expanduser(root_dir) train_dir = os.path.join(root_dir, 'train') eval_dir = os.path.join(root_dir, 'eval') train_summary_writer = tf.compat.v2.summary.create_file_writer( train_dir, flush_millis=summaries_flush_secs * 1000) train_summary_writer.set_as_default() eval_summary_writer = tf.compat.v2.summary.create_file_writer( eval_dir, flush_millis=summaries_flush_secs * 1000) global_step = tf.compat.v1.train.get_or_create_global_step() with tf.compat.v2.summary.record_if( lambda: tf.math.equal(global_step % summary_interval, 0)): _build_env = functools.partial( suite_gym.load, environment_name=env_name, # pylint: disable=invalid-name gym_env_wrappers=(), gym_kwargs=gym_kwargs) tf_env = tf_py_environment.TFPyEnvironment(_build_env()) eval_vec = [] # (name, env, metrics) eval_metrics = [ tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes), tf_metrics.AverageEpisodeLengthMetric( buffer_size=num_eval_episodes) ] eval_tf_env = tf_py_environment.TFPyEnvironment(_build_env()) name = '' eval_vec.append((name, eval_tf_env, eval_metrics)) time_step_spec = tf_env.time_step_spec() observation_spec = time_step_spec.observation action_spec = tf_env.action_spec() if latent_dim == 'obs': latent_dim = observation_spec.shape[0] def _activation(t): t1, t2 = tf.split(t, 2, axis=1) low = -np.inf if clip_mean is None else -clip_mean high = np.inf if clip_mean is None else clip_mean t1 = rpc_utils.squash_to_range(t1, low, high) if clip_min_stddev is None: low = -np.inf else: low = tf.math.log(tf.exp(clip_min_stddev) - 1.0) if clip_max_stddev is None: high = np.inf else: high = tf.math.log(tf.exp(clip_max_stddev) - 1.0) t2 = rpc_utils.squash_to_range(t2, low, high) return tf.concat([t1, t2], axis=1) if use_identity_encoder: assert latent_dim == observation_spec.shape[0] obs_input = tf.keras.layers.Input(observation_spec.shape) zeros = 0.0 * obs_input[:, :1] stddev_dim = 1 if identity_encoder_single_stddev else latent_dim 
pre_stddev = tf.keras.layers.Dense(stddev_dim, activation=None)(zeros) ones = zeros + tf.ones((1, latent_dim)) pre_stddev = pre_stddev * ones # Multiply to broadcast to latent_dim. pre_mean_stddev = tf.concat([obs_input, pre_stddev], axis=1) output = tfp.layers.IndependentNormal(latent_dim)(pre_mean_stddev) encoder_net = tf.keras.Model(inputs=obs_input, outputs=output) else: encoder_net = tf.keras.Sequential([ tf.keras.layers.Dense(256, activation='relu'), tf.keras.layers.Dense(256, activation='relu'), tf.keras.layers.Dense( tfp.layers.IndependentNormal.params_size(latent_dim), activation=_activation, kernel_initializer='glorot_uniform'), tfp.layers.IndependentNormal(latent_dim), ]) # Build the predictor net obs_input = tf.keras.layers.Input(observation_spec.shape) action_input = tf.keras.layers.Input(action_spec.shape) class ConstantIndependentNormal(tfp.layers.IndependentNormal): """A keras layer that always returns N(0, 1) distribution.""" def call(self, inputs): loc_scale = tf.concat([ tf.zeros((latent_dim, )), tf.fill((latent_dim, ), tf.math.log(tf.exp(1.0) - 1)) ], axis=0) # Multiple by [B x 1] tensor to broadcast batch dimension. loc_scale = loc_scale * tf.ones_like(inputs[:, :1]) return super(ConstantIndependentNormal, self).call(loc_scale) if predict_prior: z = encoder_net(obs_input) if not predictor_updates_encoder: z = tf.stop_gradient(z) za = tf.concat([z, action_input], axis=1) if use_residual_predictor: za_input = tf.keras.layers.Input(za.shape[1]) loc_scale = tf.keras.Sequential( predictor_num_layers * [tf.keras.layers.Dense(256, activation='relu')] + [ # pylint: disable=line-too-long tf.keras.layers.Dense(tfp.layers.IndependentNormal. params_size(latent_dim), activation=_activation, kernel_initializer='zeros'), ])(za_input) if predict_prior_std: combined_loc_scale = tf.concat([ loc_scale[:, :latent_dim] + za_input[:, :latent_dim], loc_scale[:, latent_dim:] ], axis=1) else: # Note that softplus(log(e - 1)) = 1. combined_loc_scale = tf.concat([ loc_scale[:, :latent_dim] + za_input[:, :latent_dim], tf.math.log(np.e - 1) * tf.ones_like(loc_scale[:, latent_dim:]) ], axis=1) dist = tfp.layers.IndependentNormal(latent_dim)( combined_loc_scale) output = tf.keras.Model(inputs=za_input, outputs=dist)(za) else: assert predict_prior_std output = tf.keras.Sequential( predictor_num_layers * [tf.keras.layers.Dense(256, activation='relu')] + # pylint: disable=line-too-long [ tf.keras.layers.Dense(tfp.layers.IndependentNormal. params_size(latent_dim), activation=_activation, kernel_initializer='zeros'), tfp.layers.IndependentNormal(latent_dim), ])(za) else: # scale is chosen by inverting the softplus function to equal 1. if len(obs_input.shape) > 2: input_reshaped = tf.reshape( obs_input, [-1, tf.math.reduce_prod(obs_input.shape[1:])]) # Multiply by [B x 1] tensor to broadcast batch dimension. za = tf.zeros(latent_dim + action_spec.shape[0], ) * tf.ones_like(input_reshaped[:, :1]) # pylint: disable=line-too-long else: # Multiple by [B x 1] tensor to broadcast batch dimension. 
za = tf.zeros(latent_dim + action_spec.shape[0], ) * tf.ones_like(obs_input[:, :1]) # pylint: disable=line-too-long output = tf.keras.Sequential([ ConstantIndependentNormal(latent_dim), ])(za) predictor_net = tf.keras.Model(inputs=(obs_input, action_input), outputs=output) if use_recurrent_actor: ActorClass = rpc_utils.RecurrentActorNet # pylint: disable=invalid-name else: ActorClass = rpc_utils.ActorNet # pylint: disable=invalid-name actor_net = ActorClass(input_tensor_spec=observation_spec, output_tensor_spec=action_spec, encoder=encoder_net, predictor=predictor_net, fc_layers=actor_fc_layers) critic_net = rpc_utils.CriticNet( (observation_spec, action_spec), observation_fc_layer_params=critic_obs_fc_layers, action_fc_layer_params=critic_action_fc_layers, joint_fc_layer_params=critic_joint_fc_layers, kernel_initializer='glorot_uniform', last_kernel_initializer='glorot_uniform') critic_net_2 = None target_critic_net_1 = None target_critic_net_2 = None tf_agent = rpc_agent.RpAgent( time_step_spec, action_spec, actor_network=actor_net, critic_network=critic_net, critic_network_2=critic_net_2, target_critic_network=target_critic_net_1, target_critic_network_2=target_critic_net_2, actor_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=actor_learning_rate), critic_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=critic_learning_rate), alpha_optimizer=tf.compat.v1.train.AdamOptimizer( learning_rate=alpha_learning_rate), target_update_tau=target_update_tau, target_update_period=target_update_period, td_errors_loss_fn=td_errors_loss_fn, gamma=gamma, reward_scale_factor=reward_scale_factor, gradient_clipping=gradient_clipping, debug_summaries=debug_summaries, summarize_grads_and_vars=summarize_grads_and_vars, train_step_counter=global_step) dual_optimizer = tf.compat.v1.train.AdamOptimizer( learning_rate=dual_learning_rate) tf_agent.initialize() # Make the replay buffer. 
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer( data_spec=tf_agent.collect_data_spec, batch_size=tf_env.batch_size, max_length=replay_buffer_capacity) replay_observer = [replay_buffer.add_batch] train_metrics = [ tf_metrics.NumberOfEpisodes(), tf_metrics.EnvironmentSteps(), tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes, batch_size=tf_env.batch_size), tf_metrics.AverageEpisodeLengthMetric( buffer_size=num_eval_episodes, batch_size=tf_env.batch_size), ] kl_metric = rpc_utils.AverageKLMetric(encoder=encoder_net, predictor=predictor_net, batch_size=tf_env.batch_size) eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy) initial_collect_policy = random_tf_policy.RandomTFPolicy( tf_env.time_step_spec(), tf_env.action_spec()) collect_policy = tf_agent.collect_policy checkpoint_items = { 'ckpt_dir': train_dir, 'agent': tf_agent, 'global_step': global_step, 'metrics': metric_utils.MetricsGroup(train_metrics, 'train_metrics'), 'dual_optimizer': dual_optimizer, } train_checkpointer = common.Checkpointer(**checkpoint_items) policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( train_dir, 'policy'), policy=eval_policy, global_step=global_step) rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join( train_dir, 'replay_buffer'), max_to_keep=1, replay_buffer=replay_buffer) train_checkpointer.initialize_or_restore() rb_checkpointer.initialize_or_restore() initial_collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, initial_collect_policy, observers=replay_observer + train_metrics, num_steps=initial_collect_steps, transition_observers=[kl_metric]) collect_driver = dynamic_step_driver.DynamicStepDriver( tf_env, collect_policy, observers=replay_observer + train_metrics, num_steps=collect_steps_per_iteration, transition_observers=[kl_metric]) if use_tf_functions: initial_collect_driver.run = common.function( initial_collect_driver.run) collect_driver.run = common.function(collect_driver.run) tf_agent.train = common.function(tf_agent.train) if replay_buffer.num_frames() == 0: # Collect initial replay data. logging.info( 'Initializing replay buffer by collecting experience for %d steps ' 'with a random policy.', initial_collect_steps) initial_collect_driver.run() for name, eval_tf_env, eval_metrics in eval_vec: results = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics-%s' % name, ) if eval_metrics_callback is not None: eval_metrics_callback(results, global_step.numpy()) metric_utils.log_metrics(eval_metrics, prefix=name) time_step = None policy_state = collect_policy.get_initial_state(tf_env.batch_size) timed_at_step = global_step.numpy() time_acc = 0 train_time_acc = 0 env_time_acc = 0 if use_recurrent_actor: # default from sac/train_eval_rnn.py num_steps = rnn_sequence_length + 1 def _filter_invalid_transition(trajectories, unused_arg1): return tf.reduce_all(~trajectories.is_boundary()[:-1]) tf_agent._as_transition = data_converter.AsTransition( # pylint: disable=protected-access tf_agent.data_context, squeeze_time_dim=False) else: num_steps = 2 def _filter_invalid_transition(trajectories, unused_arg1): return ~trajectories.is_boundary()[0] dataset = replay_buffer.as_dataset( sample_batch_size=batch_size, num_steps=num_steps).unbatch().filter(_filter_invalid_transition) dataset = dataset.batch(batch_size).prefetch(5) # Dataset generates trajectories with shape [Bx2x...] 
iterator = iter(dataset) @tf.function def train_step(): experience, _ = next(iterator) prior = predictor_net( (experience.observation[:, 0], experience.action[:, 0]), training=False) z_next = encoder_net(experience.observation[:, 1], training=False) # predictor_kl is a vector of size batch_size. predictor_kl = tfp.distributions.kl_divergence(z_next, prior) with tf.GradientTape() as tape: tape.watch(actor_net._log_kl_coefficient) # pylint: disable=protected-access dual_loss = -1.0 * actor_net._log_kl_coefficient * ( # pylint: disable=protected-access tf.stop_gradient(tf.reduce_mean(predictor_kl)) - kl_constraint) dual_grads = tape.gradient(dual_loss, [actor_net._log_kl_coefficient]) # pylint: disable=protected-access grads_and_vars = list( zip(dual_grads, [actor_net._log_kl_coefficient])) # pylint: disable=protected-access dual_optimizer.apply_gradients(grads_and_vars) # Clip the dual variable so exp(log_kl_coef) <= 1e6. log_kl_coef = tf.clip_by_value( actor_net._log_kl_coefficient, # pylint: disable=protected-access -1.0 * np.log(1e6), np.log(1e6)) actor_net._log_kl_coefficient.assign(log_kl_coef) # pylint: disable=protected-access with tf.name_scope('dual_loss'): tf.compat.v2.summary.scalar(name='dual_loss', data=tf.reduce_mean(dual_loss), step=global_step) tf.compat.v2.summary.scalar( name='log_kl_coefficient', data=actor_net._log_kl_coefficient, # pylint: disable=protected-access step=global_step) z_entropy = z_next.entropy() log_prob = prior.log_prob(z_next.sample()) with tf.name_scope('rp-metrics'): common.generate_tensor_summaries('predictor_kl', predictor_kl, global_step) common.generate_tensor_summaries('z_entropy', z_entropy, global_step) common.generate_tensor_summaries('log_prob', log_prob, global_step) common.generate_tensor_summaries('z_mean', z_next.mean(), global_step) common.generate_tensor_summaries('z_stddev', z_next.stddev(), global_step) common.generate_tensor_summaries('prior_mean', prior.mean(), global_step) common.generate_tensor_summaries('prior_stddev', prior.stddev(), global_step) if log_prob_reward_scale == 'auto': coef = tf.stop_gradient(tf.exp(actor_net._log_kl_coefficient)) # pylint: disable=protected-access else: coef = log_prob_reward_scale tf.debugging.check_numerics(tf.reduce_mean(predictor_kl), 'predictor_kl is inf or nan.') tf.debugging.check_numerics(coef, 'coef is inf or nan.') new_reward = experience.reward - coef * predictor_kl[:, None] experience = experience._replace(reward=new_reward) return tf_agent.train(experience) if use_tf_functions: train_step = common.function(train_step) # Save the hyperparameters operative_filename = os.path.join(root_dir, 'operative.gin') with tf.compat.v1.gfile.Open(operative_filename, 'w') as f: f.write(gin.operative_config_str()) print(gin.operative_config_str()) global_step_val = global_step.numpy() while global_step_val < num_iterations: start_time = time.time() time_step, policy_state = collect_driver.run( time_step=time_step, policy_state=policy_state, ) env_time_acc += time.time() - start_time train_start_time = time.time() for _ in range(train_steps_per_iteration): train_loss = train_step() train_time_acc += time.time() - train_start_time time_acc += time.time() - start_time global_step_val = global_step.numpy() if global_step_val % log_interval == 0: logging.info('step = %d, loss = %f', global_step_val, train_loss.loss) steps_per_sec = (global_step_val - timed_at_step) / time_acc logging.info('%.3f steps/sec', steps_per_sec) tf.compat.v2.summary.scalar(name='global_steps_per_sec', data=steps_per_sec, 
step=global_step) train_steps_per_sec = (global_step_val - timed_at_step) / train_time_acc logging.info('Train: %.3f steps/sec', train_steps_per_sec) tf.compat.v2.summary.scalar(name='train_steps_per_sec', data=train_steps_per_sec, step=global_step) env_steps_per_sec = (global_step_val - timed_at_step) / env_time_acc logging.info('Env: %.3f steps/sec', env_steps_per_sec) tf.compat.v2.summary.scalar(name='env_steps_per_sec', data=env_steps_per_sec, step=global_step) timed_at_step = global_step_val time_acc = 0 train_time_acc = 0 env_time_acc = 0 for train_metric in train_metrics + [kl_metric]: train_metric.tf_summaries(train_step=global_step, step_metrics=train_metrics[:2]) if global_step_val % eval_interval == 0: start_time = time.time() for name, eval_tf_env, eval_metrics in eval_vec: results = metric_utils.eager_compute( eval_metrics, eval_tf_env, eval_policy, num_episodes=num_eval_episodes, train_step=global_step, summary_writer=eval_summary_writer, summary_prefix='Metrics-%s' % name, ) if eval_metrics_callback is not None: eval_metrics_callback(results, global_step_val) metric_utils.log_metrics(eval_metrics, prefix=name) logging.info('Evaluation: %d min', (time.time() - start_time) / 60) for prob_dropout in eval_dropout: rpc_utils.eval_dropout_fn(eval_tf_env, actor_net, global_step, prob_dropout=prob_dropout) if global_step_val % train_checkpoint_interval == 0: train_checkpointer.save(global_step=global_step_val) if global_step_val % policy_checkpoint_interval == 0: policy_checkpointer.save(global_step=global_step_val) if global_step_val % rb_checkpoint_interval == 0: rb_checkpointer.save(global_step=global_step_val)
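
# --- Illustrative sketch (not part of the code above): a standalone TF2-style version of
# the dual-variable update inside train_step(). The measured KL is wrapped in
# tf.stop_gradient so the dual loss only yields a gradient for the Lagrange multiplier,
# never for the encoder/predictor that produced predictor_kl. kl_constraint_sketch and
# the optimizer are assumptions; the 1e6 clip mirrors the snippet.
import numpy as np
import tensorflow as tf

log_kl_coefficient = tf.Variable(0.0)
dual_optimizer_sketch = tf.keras.optimizers.Adam(1e-3)
kl_constraint_sketch = 1.0

def dual_update(predictor_kl_batch):
    with tf.GradientTape() as tape:
        dual_loss = -1.0 * log_kl_coefficient * (
            tf.stop_gradient(tf.reduce_mean(predictor_kl_batch)) - kl_constraint_sketch)
    grads = tape.gradient(dual_loss, [log_kl_coefficient])
    dual_optimizer_sketch.apply_gradients(zip(grads, [log_kl_coefficient]))
    # Keep exp(log_kl_coefficient) within [1e-6, 1e6], as in the clip above.
    log_kl_coefficient.assign(
        tf.clip_by_value(log_kl_coefficient, -1.0 * np.log(1e6), np.log(1e6)))
    return dual_loss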
def angle_cls_focal_loss(self, labels, pred, anchor_state, alpha=None, gamma=2.0, decimal_weight=None): indices = tf.reshape(tf.where(tf.equal(anchor_state, 1)), [ -1, ]) labels = tf.gather(labels, indices) pred = tf.gather(pred, indices) anchor_state = tf.gather(anchor_state, indices) # compute the focal loss per_entry_cross_ent = - labels * tf.log(tf.sigmoid(pred) + self.cfgs.EPSILON) \ - (1 - labels) * tf.log(1 - tf.sigmoid(pred) + self.cfgs.EPSILON) prediction_probabilities = tf.sigmoid(pred) p_t = ((labels * prediction_probabilities) + ((1 - labels) * (1 - prediction_probabilities))) modulating_factor = 1.0 if gamma: modulating_factor = tf.pow(1.0 - p_t, gamma) alpha_weight_factor = 1.0 if alpha is not None: alpha_weight_factor = (labels * alpha + (1 - labels) * (1 - alpha)) if decimal_weight is not None: angle_decode_labels = tf.py_func(func=angle_label_decode, inp=[ labels, self.cfgs.ANGLE_RANGE, self.cfgs.OMEGA, self.cfgs.ANGLE_MODE ], Tout=[tf.float32]) angle_decode_labels = tf.reshape(angle_decode_labels, [ -1, ]) * -1 angle_decode_pred = tf.py_func(func=angle_label_decode, inp=[ tf.sigmoid(pred), self.cfgs.ANGLE_RANGE, self.cfgs.OMEGA, self.cfgs.ANGLE_MODE ], Tout=[tf.float32]) angle_decode_pred = tf.reshape(angle_decode_pred, [ -1, ]) * -1 diff_weight = tf.reshape( tf.log(abs(angle_decode_labels - angle_decode_pred) + 1), [-1, 1]) else: diff_weight = tf.ones_like(tf.reshape(anchor_state, [-1, 1])) focal_cross_entropy_loss = (diff_weight * modulating_factor * alpha_weight_factor * per_entry_cross_ent) # compute the normalizer: the number of positive anchors # normalizer = tf.stop_gradient(tf.where(tf.greater(anchor_state, -2))) normalizer = tf.stop_gradient(tf.where(tf.equal(anchor_state, 1))) normalizer = tf.cast(tf.shape(normalizer)[0], tf.float32) normalizer = tf.maximum(1.0, normalizer) # normalizer = tf.stop_gradient(tf.cast(tf.equal(anchor_state, 1), tf.float32)) # normalizer = tf.maximum(tf.reduce_sum(normalizer), 1) return tf.reduce_sum(focal_cross_entropy_loss) / normalizer
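
# --- Illustrative sketch (TF 1.x assumed, independent of the loss above): the normalizer
# pattern at the end of angle_cls_focal_loss. The positive-anchor count is computed under
# tf.stop_gradient, so dividing by it cannot feed a gradient back into whatever produced
# anchor_state; the loss gradient flows only through `pred`.
import tensorflow as tf

pred = tf.placeholder(tf.float32, [None])          # per-anchor logits
labels = tf.placeholder(tf.float32, [None])        # per-anchor targets in [0, 1]
anchor_state = tf.placeholder(tf.float32, [None])  # 1 = positive, 0 = negative, -1 = ignore

per_anchor_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=pred)
positives = tf.stop_gradient(tf.where(tf.equal(anchor_state, 1)))
normalizer = tf.maximum(1.0, tf.cast(tf.shape(positives)[0], tf.float32))
focal_style_loss = tf.reduce_sum(per_anchor_loss) / normalizer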
def loss(self, predictions, policy, cfv):
    r = tf.stop_gradient(
        cpea.rm_policy(cfv - tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
    error = tf.square(r - predictions) / 2.0
    return tf.reduce_mean(tf.reduce_sum(error, axis=1))
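
# --- Illustrative sketch (TF 1.x assumed): why the target above sits inside
# tf.stop_gradient. r is built from cfv and the current policy but is treated as a
# constant, so the squared error only trains `predictions`. cpea.rm_policy is the repo's
# regret-matching helper; the plain positive part (ReLU) below is my stand-in for it.
import tensorflow as tf

cfv = tf.placeholder(tf.float32, [None, 3])          # counterfactual values per action
policy = tf.placeholder(tf.float32, [None, 3])       # current policy probabilities
predictions = tf.placeholder(tf.float32, [None, 3])  # regression head output

baseline = tf.reduce_sum(cfv * policy, axis=1, keepdims=True)
r = tf.stop_gradient(tf.nn.relu(cfv - baseline))     # frozen regret-style target
loss = tf.reduce_mean(tf.reduce_sum(tf.square(r - predictions) / 2.0, axis=1))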
def model_fn(features, labels, mode, params): ''' Args: features: tensor with shape [BATCH_SIZE, go.N, go.N, features_lib.NEW_FEATURES_PLANES] labels: dict from string to tensor with shape 'pi_tensor': [BATCH_SIZE, go.N * go.N + 1] 'value_tensor': [BATCH_SIZE] mode: a tf.estimator.ModeKeys (batchnorm params update for TRAIN only) params: A dictionary (Typically derived from the FLAGS object.) Returns: tf.estimator.EstimatorSpec with props mode: same as mode arg predictions: dict of tensors 'policy': [BATCH_SIZE, go.N * go.N + 1] 'value': [BATCH_SIZE] loss: a single value tensor train_op: train op eval_metric_ops return dict of tensors logits: [BATCH_SIZE, go.N * go.N + 1] ''' policy_output, value_output, logits = model_inference_fn( features, mode == tf.estimator.ModeKeys.TRAIN, params) # train ops policy_cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=tf.stop_gradient( labels['pi_tensor']))) value_cost = params['value_cost_weight'] * tf.reduce_mean( tf.square(value_output - labels['value_tensor'])) reg_vars = [ v for v in tf.trainable_variables() if 'bias' not in v.name and 'beta' not in v.name ] l2_cost = params['l2_strength'] * \ tf.add_n([tf.nn.l2_loss(v) for v in reg_vars]) combined_cost = policy_cost + value_cost + l2_cost global_step = tf.train.get_or_create_global_step() learning_rate = tf.train.piecewise_constant(global_step, params['lr_boundaries'], params['lr_rates']) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) # Insert quantization ops if requested if params['quantize']: if mode == tf.estimator.ModeKeys.TRAIN: tf.contrib.quantize.create_training_graph( quant_delay=params['quant_delay']) else: tf.contrib.quantize.create_eval_graph() optimizer = tf.train.MomentumOptimizer(learning_rate, params['sgd_momentum']) if params['use_tpu']: optimizer = tpu_optimizer.CrossShardOptimizer(optimizer) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(combined_cost, global_step=global_step) # Computations to be executed on CPU, outside of the main TPU queues. def eval_metrics_host_call_fn(policy_output, value_output, pi_tensor, policy_cost, value_cost, l2_cost, combined_cost, step, est_mode=tf.estimator.ModeKeys.TRAIN): policy_entropy = -tf.reduce_mean( tf.reduce_sum(policy_output * tf.log(policy_output), axis=1)) # pi_tensor is one_hot when generated from sgfs (for supervised learning) # and soft-max when using self-play records. argmax normalizes the two. 
policy_target_top_1 = tf.argmax(pi_tensor, axis=1) policy_output_in_top1 = tf.to_float( tf.nn.in_top_k(policy_output, policy_target_top_1, k=1)) policy_output_in_top3 = tf.to_float( tf.nn.in_top_k(policy_output, policy_target_top_1, k=3)) policy_top_1_confidence = tf.reduce_max(policy_output, axis=1) policy_target_top_1_confidence = tf.boolean_mask( policy_output, tf.one_hot(policy_target_top_1, tf.shape(policy_output)[1])) with tf.variable_scope("metrics"): metric_ops = { 'policy_cost': tf.metrics.mean(policy_cost), 'value_cost': tf.metrics.mean(value_cost), 'l2_cost': tf.metrics.mean(l2_cost), 'policy_entropy': tf.metrics.mean(policy_entropy), 'combined_cost': tf.metrics.mean(combined_cost), 'policy_accuracy_top_1': tf.metrics.mean(policy_output_in_top1), 'policy_accuracy_top_3': tf.metrics.mean(policy_output_in_top3), 'policy_top_1_confidence': tf.metrics.mean(policy_top_1_confidence), 'policy_target_top_1_confidence': tf.metrics.mean(policy_target_top_1_confidence), 'value_confidence': tf.metrics.mean(tf.abs(value_output)), } if est_mode == tf.estimator.ModeKeys.EVAL: return metric_ops # NOTE: global_step is rounded to a multiple of FLAGS.summary_steps. eval_step = tf.reduce_min(step) # Create summary ops so that they show up in SUMMARIES collection # That way, they get logged automatically during training summary_writer = summary.create_file_writer(FLAGS.work_dir) with summary_writer.as_default(), \ summary.record_summaries_every_n_global_steps( params['summary_steps'], eval_step): for metric_name, metric_op in metric_ops.items(): summary.scalar(metric_name, metric_op[1], step=eval_step) # Reset metrics occasionally so that they are mean of recent batches. reset_op = tf.variables_initializer(tf.local_variables("metrics")) cond_reset_op = tf.cond( tf.equal(eval_step % params['summary_steps'], tf.to_int64(1)), lambda: reset_op, lambda: tf.no_op()) return summary.all_summary_ops() + [cond_reset_op] metric_args = [ policy_output, value_output, labels['pi_tensor'], tf.reshape(policy_cost, [1]), tf.reshape(value_cost, [1]), tf.reshape(l2_cost, [1]), tf.reshape(combined_cost, [1]), tf.reshape(global_step, [1]), ] predictions = { 'policy_output': policy_output, 'value_output': value_output, } eval_metrics_only_fn = functools.partial( eval_metrics_host_call_fn, est_mode=tf.estimator.ModeKeys.EVAL) host_call_fn = functools.partial(eval_metrics_host_call_fn, est_mode=tf.estimator.ModeKeys.TRAIN) tpu_estimator_spec = tpu_estimator.TPUEstimatorSpec( mode=mode, predictions=predictions, loss=combined_cost, train_op=train_op, eval_metrics=(eval_metrics_only_fn, metric_args), host_call=(host_call_fn, metric_args)) if params['use_tpu']: return tpu_estimator_spec else: return tpu_estimator_spec.as_estimator_spec()
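
# --- Illustrative sketch (TF 1.x assumed): the policy-cost pattern from model_fn above.
# softmax_cross_entropy_with_logits_v2 backpropagates into both logits and labels, so the
# (possibly soft, self-play generated) pi targets are wrapped in tf.stop_gradient to make
# them act as constants. With placeholder labels, as here, the wrap is purely defensive.
import tensorflow as tf

logits = tf.placeholder(tf.float32, [None, 362])      # go.N * go.N + 1 moves for 19x19
pi_targets = tf.placeholder(tf.float32, [None, 362])  # one-hot or soft visit distributions

policy_cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=tf.stop_gradient(pi_targets)))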
# Selective softmax
extractor = np.zeros((action_space_dimension, asize), dtype=np.float32)
for i, a in enumerate(actionset):
    extractor[a, i] = 1.0
adaptor = np.transpose(extractor)
compact = tf.tensordot(raw_pi, extractor, [[2], [0]])
compact_softmax = tf.nn.softmax(compact)
softmax_policy = tf.tensordot(compact_softmax, adaptor, [[2], [0]])

# build loss
flattened_value = tf.reshape(raw_value, [-1])
policy = tf.multiply(softmax_policy, action_tensor)
policy = tf.reduce_sum(policy, axis=[1, 2])
log_policy = tf.log(tf.clip_by_value(policy, 1e-20, 1.0))
criticism = V_tensor - flattened_value
# stop_gradient: the advantage ("criticism") acts as a constant coefficient in the
# policy term, so the value head is trained only through value_loss below.
policy_per_sample = log_policy * tf.stop_gradient(criticism)
policy_loss = tf.reduce_sum(-policy_per_sample)
value_loss = tf.nn.l2_loss(criticism)
loss = policy_loss + value_loss

optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
grad_op = optimizer.compute_gradients(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dic = {
        action_tensor: actions_to_adist_array([3], action_space_dimension),
        V_tensor: [10.0],
    }
    grad = sess.run(grad_op, feed_dict=dic)
    print("grad 1\n{}".format(grad))
def NN_hidden(currentstate):
    hidden = tf.nn.relu(tf.matmul(currentstate, w['L1']))
    out = tf.matmul(hidden, w['L2'])
    return out

# Bellman's equations
# G_O is 0 if game over, 1 otherwise
G_O = rew + 1

# predictions
# next state (note: the max is taken over all entries of NN_hidden(posteriorstate))
Qnext = tf.reshape(tf.reduce_max(NN_hidden(posteriorstate)), [-1, 1])
Qnext = tf.multiply(G_O, Qnext)  # 0 if G_O is 0
# current state
Q = tf.reshape(tf.gather_nd(NN_hidden(currentstate), action), [-1, 1])
delt = rew + (discount_factor * tf.stop_gradient(Qnext)) - Q

# LOSS/TRAINING DEFINITIONS
loss = tf.multiply(0.50, tf.reduce_mean(tf.square(delt)), name="loss")

# RMSprop parameters
rmsprop_dec = 0.937  # RMSprop decay
rmsprop_mom = 0.52   # RMSprop momentum

# RMSprop optimisation
train = tf.train.RMSPropOptimizer(learning_rate=learningrate,
                                  momentum=rmsprop_mom,
                                  decay=rmsprop_dec).minimize(loss)

prediction = tf.argmax((NN_hidden(currentstate)), axis=1)

# variable initialisation
init = tf.global_variables_initializer()

# saving functionality
saver = tf.train.Saver()

# TRAINING
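
# --- Illustrative sketch (TF 1.x assumed): a self-contained, per-example variant of the
# TD target above. Both Q(s, .) and Q(s', .) come from the same weights, and
# tf.stop_gradient on the bootstrap term ensures the squared TD error only trains the
# Q(s, a) branch. All shapes and the optimizer settings are assumptions.
import tensorflow as tf

state = tf.placeholder(tf.float32, [None, 8])
next_state = tf.placeholder(tf.float32, [None, 8])
action = tf.placeholder(tf.int32, [None])
reward = tf.placeholder(tf.float32, [None])
not_done = tf.placeholder(tf.float32, [None])   # 0 when the episode has ended
discount = 0.99

q_w = tf.get_variable('q_w_sketch', [8, 4])     # one shared linear Q-network
q_s = tf.matmul(state, q_w)                     # Q(s, .)
q_next = tf.matmul(next_state, q_w)             # Q(s', .), same weights
q_sa = tf.reduce_sum(q_s * tf.one_hot(action, 4), axis=1)
target = reward + discount * not_done * tf.stop_gradient(tf.reduce_max(q_next, axis=1))
td_loss = 0.5 * tf.reduce_mean(tf.square(target - q_sa))
train_op = tf.train.RMSPropOptimizer(1e-3).minimize(td_loss)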
def __init__(self, action_size, img_h, img_w, n_channels, c1, epochs, batch_size): self.epochs = epochs self.batch_size = batch_size self.regularizer = None #tf.contrib.layers.l2_regularizer(scale=0.001) self.initializer = None # counters for wrinting summaries to tensorboard self.i = 0 # overall training self.update_r = 0 # reward self.action_size = action_size self.sess = tf.Session() with tf.variable_scope("model"): # Placeholders Model self.o_t = tf.placeholder(shape=[None, img_h, img_w, n_channels], dtype=tf.float32) #self.o_t = self.o_t / 255. # Placeholders PPO self.action = tf.placeholder(shape=[self.batch_size], dtype=tf.int32) self.V_targ = tf.placeholder(shape=[self.batch_size], dtype=tf.float32) self.advantage = tf.placeholder(shape=[self.batch_size], dtype=tf.float32) # Placeholders summaries self.reward = tf.placeholder(shape=(), dtype=tf.float32) # Placeholders for Training self.lr = tf.placeholder(shape=(), dtype=tf.float32) self.lr_v = tf.placeholder(shape=(), dtype=tf.float32) self.epsilon = tf.placeholder(shape=(), dtype=tf.float32) self.c2 = tf.placeholder(shape=(), dtype=tf.float32) # constants self.n = tf.constant(self.action_size, dtype=tf.float32) self.c1 = tf.constant(c1) self.pi_greco = tf.constant(math.pi) # Define models self.V, self.pi = self.build_model("new") _, self.pi_old = self.build_model("old") # Compute Probability of the action taken in log space self.action_taken_one_hot = tf.one_hot(self.action, self.action_size) self.pi_sampled_log = tf.log(tf.reduce_sum(self.pi * self.action_taken_one_hot, -1) + 1e-5) self.pi_old_sampled_log = tf.log(tf.reduce_sum(self.pi_old * self.action_taken_one_hot, -1) + 1e-5) # PPO Loss self.ratio = tf.exp(self.pi_sampled_log - tf.stop_gradient(self.pi_old_sampled_log)) self.sur1 = tf.multiply(self.ratio, self.advantage) self.sur2 = tf.multiply(tf.clip_by_value(self.ratio, 1.0 - self.epsilon, 1.0 + self.epsilon), self.advantage) self.L_CLIP = tf.reduce_mean(tf.minimum(self.sur1, self.sur2)) self.L_V = 0.5 * tf.reduce_mean(tf.squared_difference(self.V_targ, self.V)) self.entropy = - tf.reduce_sum(self.pi * tf.log(self.pi)) self.loss = - self.L_CLIP + self.c1 * self.L_V - self.c2 * self.entropy # Training summaries self.s_pi = tf.summary.scalar('pi', tf.reduce_mean(tf.exp(self.pi_sampled_log))) self.s_ratio = tf.summary.scalar('Ratio', tf.reduce_mean(self.ratio)) self.s_v = tf.summary.scalar('Loss_V', self.L_V) self.s_c = tf.summary.scalar('Loss_CLIP', -self.L_CLIP) self.s_e = tf.summary.scalar('Loss_entropy', -self.entropy) self.merge = tf.summary.merge([self.s_pi, self.s_ratio, self.s_v, self.s_c, self.s_e]) self.s_r = tf.summary.scalar('Reward', self.reward) # Optimization steps self.optimizer = tf.train.AdamOptimizer(self.lr) self.train_ppo = self.optimizer.minimize(self.loss) self.optimizer_v = tf.train.AdamOptimizer(self.lr_v) self.train_ppo_v = self.optimizer_v.minimize(self.L_V) with tf.variable_scope("assign"): self.assign_arr = [] self.col_dict = {} self.col1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/new') for i in range(len(self.col1)): self.col_dict[self.col1[i].name.split('/')[-2] + "/" + self.col1[i].name.split('/')[-1]] = self.col1[i] self.col2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/old') for i in range(len(self.col2)): self.node_name = self.col2[i].name.split('/')[-2] + "/" + self.col2[i].name.split('/')[-1] self.assign0 = self.col2[i].assign(self.col_dict[self.node_name]) self.assign_arr.append(self.assign0) self.init = tf.global_variables_initializer() 
        self.sess.run(self.init)
        self.train_writer = tf.summary.FileWriter('train/', self.sess.graph)
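
# --- Illustrative sketch (TF 1.x assumed): the clipped-surrogate core of the PPO loss
# built in __init__ above, reduced to placeholders. Wrapping the old-policy
# log-probability in tf.stop_gradient (on top of it coming from the frozen "old" network)
# guarantees no gradient can reach the behaviour policy; only pi_sampled_log is trained.
import tensorflow as tf

pi_sampled_log = tf.placeholder(tf.float32, [None])      # log pi_new(a|s)
pi_old_sampled_log = tf.placeholder(tf.float32, [None])  # log pi_old(a|s)
advantage = tf.placeholder(tf.float32, [None])
epsilon = 0.2

ratio = tf.exp(pi_sampled_log - tf.stop_gradient(pi_old_sampled_log))
sur1 = ratio * advantage
sur2 = tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
l_clip = tf.reduce_mean(tf.minimum(sur1, sur2))          # maximized, so the loss is -l_clip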
def _build_critic(self, tensors, v_bonus, q_bonus, output_dim=1, hidden_dim=64, total_loss=False): """ :param placeholders: [obs_ph, action_ph, next_obs_ph, terminal_ph, action_pi, ensemble_mask] :param v_bonus: is used to compute v_backup (for reward value: v_bonus=0, kl_value: v_bonus=-kl) :param q_bonus: is used to compute q_backup (for reward_value: q_bonus=r, kl_value: q_bonus=0) :param output_dim: integer :return: critic_v, critic_q, mean(q_losses), qs_pi, critic_train_op, target_update_op """ obs_ph, action_ph, next_obs_ph, terminal_ph, action_pi, ensemble_mask = tensors # Define V and Q networks critic_v = VNetwork(output_dim, hidden_dim=hidden_dim) critic_q = QNetwork(output_dim, self.num_critics, hidden_dim=hidden_dim) critic_v_target = VNetwork(output_dim, hidden_dim=hidden_dim) # Critic training (V, Q) qs_pi = critic_q([obs_ph, action_pi]) v = critic_v([obs_ph]) qs = critic_q([obs_ph, action_ph]) v_backup = tf.stop_gradient(self._reduce_q(qs_pi) + v_bonus) v_loss = tf.losses.mean_squared_error(v_backup, v) if total_loss and output_dim != 1: print('total_loss_added') v_loss = v_loss * output_dim + tf.losses.mean_squared_error(tf.reduce_sum(v_backup, axis=-1, keepdims=True), tf.reduce_sum(v, axis=-1, keepdims=True)) # Gradient panelty (V) if self.gradient_norm_panelty > 0: v_grad_obs = tf.gradients(v, [obs_ph])[0] # do not average, sum by the output dimension v_grad_norm = tf.sqrt(tf.reduce_sum(tf.square(v_grad_obs), axis=1) + 1e-8) v_grad_panelty_loss = tf.reduce_mean(tf.maximum(v_grad_norm - self.gradient_norm_limit * np.sqrt(self.state_dim), 0) ** 2) v_loss += self.gradient_norm_panelty * v_grad_panelty_loss value_target = critic_v_target([next_obs_ph]) q_backup = tf.stop_gradient((1 - terminal_ph) * self.gamma * value_target + q_bonus) # batch x 1 q_losses = [tf.losses.mean_squared_error(q_backup, qs[k], weights=ensemble_mask[:, k:k+1]) for k in range(self.num_critics)] if total_loss and output_dim != 1: for k in range(self.num_critics): q_losses[k] = q_losses[k] * output_dim + tf.losses.mean_squared_error( tf.reduce_sum(q_backup, axis=-1, keepdims=True), tf.reduce_sum(qs[k], axis=-1, keepdims=True), weights=ensemble_mask[:, k:k+1]) # Gradient panelty (Q) if self.gradient_norm_panelty > 0: qs_grad_obs_action = [tf.concat(tf.gradients(q, [obs_ph, action_ph]), axis=-1) for q in qs] # do not average, sum by the output dimension qs_grad_norm = [tf.sqrt(tf.reduce_sum(tf.square(q_grad_obs_action), axis=1) + 1e-8) for q_grad_obs_action in qs_grad_obs_action] qs_grad_panelty_loss = [tf.reduce_mean(tf.maximum(q_grad_norm - self.gradient_norm_limit * np.sqrt(self.state_dim + self.action_dim), 0) ** 2) for q_grad_norm in qs_grad_norm] for i, q_grad_panelty_loss in enumerate(qs_grad_panelty_loss): q_losses[i] += self.gradient_norm_panelty * q_grad_panelty_loss value_loss = v_loss + tf.reduce_sum(q_losses) critic_optimizer = tf.train.AdamOptimizer(self.learning_rate) critic_train_op = critic_optimizer.minimize(value_loss, var_list=critic_v.trainable_variables + critic_q.trainable_variables) with tf.control_dependencies([critic_train_op]): # Update target network source_params = critic_v.trainable_variables target_params = critic_v_target.trainable_variables target_update_op = [ tf.assign(target, (1 - self.tau) * target + self.tau * source) for target, source in zip(target_params, source_params) ] # Copy weights to target networks self.sess.run(tf.variables_initializer(critic_optimizer.variables())) critic_v_target.set_weights(critic_v.get_weights()) return critic_v, critic_q, 
tf.reduce_mean(q_losses), qs_pi, critic_train_op, target_update_op
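
# --- Illustrative sketch (TF 1.x assumed): the two stop_gradient call sites in
# _build_critic, stripped of the ensembling, bonuses and gradient penalties. The V target
# (built from Q(s, a~pi)) and the Q target (built from the slow-moving target V at s')
# are both frozen, so each regression trains only its own network.
import tensorflow as tf

obs = tf.placeholder(tf.float32, [None, 8])
q_pi = tf.placeholder(tf.float32, [None, 1])            # Q(s, a ~ pi) from the Q ensemble
v_target_next = tf.placeholder(tf.float32, [None, 1])   # V_target(s')
reward = tf.placeholder(tf.float32, [None, 1])
terminal = tf.placeholder(tf.float32, [None, 1])
gamma = 0.99

v = tf.layers.dense(obs, 1, name='v_sketch')            # V network being trained
q = tf.layers.dense(obs, 1, name='q_sketch')            # Q network being trained

v_backup = tf.stop_gradient(q_pi)                       # v_bonus from the docstring omitted
v_loss = tf.losses.mean_squared_error(v_backup, v)
q_backup = tf.stop_gradient(reward + (1 - terminal) * gamma * v_target_next)
q_loss = tf.losses.mean_squared_error(q_backup, q)
critic_train_op = tf.train.AdamOptimizer(3e-4).minimize(v_loss + q_loss)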
def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None, param_noise=False, param_noise_filter_func=None): """Creates the train function: Parameters ---------- make_obs_ph: str -> tf.placeholder or TfInput a function that takes a name and creates a placeholder of input with that name q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. num_actions: int number of actions reuse: bool whether or not to reuse the graph variables optimizer: tf.train.Optimizer optimizer to use for the Q-learning objective. grad_norm_clipping: float or None clip gradient norms to this value. If None no clipping is performed. gamma: float discount rate. double_q: bool if true will use Double Q Learning (https://arxiv.org/abs/1509.06461). In general it is a good idea to keep it enabled. scope: str or VariableScope optional scope for variable_scope. reuse: bool or None whether or not the variables should be reused. To be able to reuse the scope must be given. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) param_noise_filter_func: tf.Variable -> bool function that decides whether or not a variable should be perturbed. Only applicable if param_noise is True. If set to None, default_param_noise_filter is used by default. Returns ------- act: (tf.Variable, bool, float) -> tf.Variable function to select and action given observation. ` See the top of the file for details. train: (object, np.array, np.array, object, np.array, np.array) -> np.array optimize the error in Bellman's equation. ` See the top of the file for details. update_target: () -> () copy the parameters from optimized Q function to the target Q function. ` See the top of the file for details. debug: {str: function} a bunch of functions to print debug data like q_values. """ if param_noise: act_f = build_act_with_param_noise( make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse, param_noise_filter_func=param_noise_filter_func) else: act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse) with tf.variable_scope(scope, reuse=reuse): # set up placeholders obs_t_input = make_obs_ph("obs_t") act_t_ph = tf.placeholder(tf.int32, [None], name="action") rew_t_ph = tf.placeholder(tf.float32, [None], name="reward") obs_tp1_input = make_obs_ph("obs_tp1") done_mask_ph = tf.placeholder(tf.float32, [None], name="done") importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight") # q network evaluation q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True) # reuse parameters from act q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/q_func") # target q network evalution q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func") target_q_func_vars = tf.get_collection( tf.GraphKeys.GLOBAL_VARIABLES, scope=tf.get_variable_scope().name + "/target_q_func") # q scores for actions which we know were selected in the given state. 
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if double_q: q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked # compute the error (potentially clipped) td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) errors = U.huber_loss(td_error) weighted_error = tf.reduce_mean(importance_weights_ph * errors) # compute optimization op (potentially with gradient clipping) if grad_norm_clipping is not None: gradients = optimizer.compute_gradients(weighted_error, var_list=q_func_vars) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm(grad, grad_norm_clipping), var) optimize_expr = optimizer.apply_gradients(gradients) else: optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars) # update_target_fn will be called periodically to copy Q network to target Q network update_target_expr = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) update_target_expr = tf.group(*update_target_expr) # Create callable functions train = U.function(inputs=[ obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, importance_weights_ph ], outputs=td_error, updates=[optimize_expr]) update_target = U.function([], [], updates=[update_target_expr]) q_values = U.function([obs_t_input], q_t) return act_f, train, update_target, {'q_values': q_values}
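
# --- Illustrative sketch (TF 1.x assumed): the Double-Q target above with plain
# placeholders. The next action is picked by the online network's argmax, its value is
# read from the target network, and tf.stop_gradient freezes the whole Bellman target so
# the TD error only trains q_t_selected.
import tensorflow as tf

num_actions = 4
q_t = tf.placeholder(tf.float32, [None, num_actions])           # online net at s_t
q_tp1 = tf.placeholder(tf.float32, [None, num_actions])         # target net at s_{t+1}
q_tp1_online = tf.placeholder(tf.float32, [None, num_actions])  # online net at s_{t+1}
act_t = tf.placeholder(tf.int32, [None])
rew_t = tf.placeholder(tf.float32, [None])
done = tf.placeholder(tf.float32, [None])
gamma = 0.99

q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t, num_actions), 1)
best_act = tf.argmax(q_tp1_online, 1)
q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(best_act, num_actions), 1)
target = rew_t + gamma * (1.0 - done) * q_tp1_best
td_error = q_t_selected - tf.stop_gradient(target)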
def build_model(self): with tf.variable_scope('Model', reuse=tf.AUTO_REUSE): with tf.name_scope('Inputs'): # Model Feeds self.ratings = tf.placeholder(dtype=tf.float32, shape=[None, self.num_item], name='ratings') self.uid = tf.placeholder(dtype=tf.int32, shape=[None], name='user_id') self.istraining = tf.placeholder(dtype=tf.bool, shape=[], name='training_flag') self.layer1_dropout_rate = tf.placeholder( dtype=tf.float32, shape=[], name='layer1_dropout_rate') ######################################################################################################### with tf.name_scope('Variables'): input = self.ratings # Encoder Variables self.layer1_w = tf.get_variable( name='encoder_weights', shape=[self.num_item, self.num_factors], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) self.layer1_b = tf.get_variable( name='encoder_bias', shape=[self.num_factors], initializer=tf.zeros_initializer()) if self.is_user_node: self.user_embedding = tf.get_variable( name='user_embedding', shape=[self.num_user, self.num_factors], initializer=tf.truncated_normal_initializer( mean=0.0, stddev=0.01), dtype=tf.float32) # (users, embedding_size) # Decoder Variables self.layer2_w1 = tf.get_variable( name='decoder_weights', shape=[self.num_factors, self.num_item], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) layer2_w2 = tf.get_variable( name='decoder_concat', shape=[self.num_noise_factor, self.num_item], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.01)) self.layer2_b = tf.get_variable( name='decoder_bias', shape=[self.num_item], initializer=tf.zeros_initializer()) # Noise Variables item_w1_noise = tf.get_variable( name='item_w1_noise', shape=[self.num_item, self.num_factors], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) if self.is_user_node: user_w_noise = tf.get_variable( name='user_w_noise', shape=[self.num_user, self.num_factors], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) item_w2_noise = tf.get_variable( name='item_w2_noise', shape=[self.num_factors, self.num_item], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) hidden_noise_tr = tf.get_variable( name='hidden_noise_tr', shape=[self.batch_size, self.num_factors], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) hidden_noise_eval = tf.get_variable( name='hidden_noise_eval', shape=[self.num_user, self.num_factors], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) noise_vector_tr = tf.get_variable( name='encoder_noise_tr', shape=[self.batch_size, self.num_noise_factor], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) noise_vector_eval = tf.get_variable( name='encoder_noise_eval', shape=[self.num_user, self.num_noise_factor], initializer=tf.zeros_initializer(), dtype=tf.float32, trainable=False) ######################################################################################################### with tf.name_scope('Original_AE'): ############# Original AE Model org_w1, org_w2 = self.layer1_w, self.layer2_w1 if self.robust_test: if self.noise_pos == 'W1': org_w1 += item_w1_noise elif self.noise_pos == 'W2': org_w2 += item_w2_noise elif self.noise_pos == 'USER': self.user_embedding += user_w_noise if self.is_user_node: user_node = tf.nn.embedding_lookup(self.user_embedding, self.uid) org_encoder = tf.sigmoid( tf.matmul(input, org_w1) + self.layer1_b + user_node) # org_encoder = tf.identity(tf.matmul(input, org_w1) + self.layer1_b + user_node) else: 
org_encoder = tf.sigmoid( tf.matmul(input, org_w1) + self.layer1_b) # org_encoder = tf.identity(tf.matmul(input, org_w1) + self.layer1_b) if self.robust_test and self.noise_pos == 'HID': org_encoder += hidden_noise_eval org_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout(org_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: org_encoder) org_decoder = tf.identity( tf.matmul(org_encoder, org_w2) + self.layer2_b) self.org_output = org_decoder org_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.org_output)) # org_base_loss = tf.nn.l2_loss(self.org_output - input) org_base_loss = org_base_loss / tf.cast( tf.shape(input)[0], dtype=org_base_loss.dtype) ######################################################################################################### ###### The Noisy Auto-Encoder if self.adv_training: if self.noise_pos == 'CON': # ConCat Noise AE with tf.name_scope("ConCat_AE"): if self.is_user_node: user_node = tf.nn.embedding_lookup( self.user_embedding, self.uid) concat_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b + user_node) else: concat_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b) concat_noise_encoder = tf.cond( self.istraining, lambda: tf.concat( [concat_noise_encoder, noise_vector_tr], axis=1), lambda: tf.concat( [concat_noise_encoder, noise_vector_eval], axis=1)) concat_noise_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout( concat_noise_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: concat_noise_encoder) concat_w2 = tf.concat([self.layer2_w1, layer2_w2], axis=0) # out_vector = tf.sigmoid(tf.matmul(concat_noise_encoder, layer2_concat_w) + layer2_b) concat_noise_decoder = tf.identity( tf.matmul(concat_noise_encoder, concat_w2) + self.layer2_b) # Output self.concat_noise_output = concat_noise_decoder # Noisy Model Loss concat_noise_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.concat_noise_output)) # concat_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.concat_noise_output) - input) concat_noise_base_loss = concat_noise_base_loss / tf.cast( tf.shape(input)[0], dtype=concat_noise_base_loss.dtype) if self.noise_pos == 'W1' or self.noise_pos == 'W1W2': with tf.name_scope("W1_AE"): w1_noise_w1 = self.layer1_w + item_w1_noise if self.is_user_node: user_node = tf.nn.embedding_lookup( self.user_embedding, self.uid) w1_noise_encoder = tf.sigmoid( tf.matmul(input, w1_noise_w1) + self.layer1_b + user_node) # w1_noise_encoder = tf.identity(tf.matmul(input, w1_noise_w1) + self.layer1_b + user_node) else: w1_noise_encoder = tf.sigmoid( tf.matmul(input, w1_noise_w1) + self.layer1_b) # w1_noise_encoder = tf.identity(tf.matmul(input, w1_noise_w1) + self.layer1_b) w1_noise_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout( w1_noise_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: w1_noise_encoder) w1_noise_decoder = tf.identity( tf.matmul(w1_noise_encoder, self.layer2_w1) + self.layer2_b) # Output self.w1_noise_output = w1_noise_decoder # Noisy Model Loss w1_noise_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.w1_noise_output)) # w1_noise_base_loss = tf.nn.l2_loss(self.w1_noise_output - input) w1_noise_base_loss = w1_noise_base_loss / tf.cast( tf.shape(input)[0], dtype=w1_noise_base_loss.dtype) if self.noise_pos == 'W2' or self.noise_pos == 'W1W2': with tf.name_scope("W2_AE"): w2_noise_w2 = 
self.layer2_w1 + item_w2_noise if self.is_user_node: user_node = tf.nn.embedding_lookup( self.user_embedding, self.uid) w2_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b + user_node) # w2_noise_encoder = tf.identity(tf.matmul(input, self.layer1_w) + self.layer1_b + user_node) else: w2_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b) # w2_noise_encoder = tf.identity(tf.matmul(input, self.layer1_w) + self.layer1_b) w2_noise_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout( w2_noise_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: w2_noise_encoder) w2_noise_decoder = tf.identity( tf.matmul(w2_noise_encoder, w2_noise_w2) + self.layer2_b) # Output self.w2_noise_output = w2_noise_decoder # Noisy Model Loss w2_noise_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.w2_noise_output)) # w2_noise_base_loss = tf.nn.l2_loss(self.w2_noise_output - input) w2_noise_base_loss = w2_noise_base_loss / tf.cast( tf.shape(input)[0], dtype=w2_noise_base_loss.dtype) if self.noise_pos == 'USER': with tf.name_scope("USER_AE"): self.user_embedding += user_w_noise user_node = tf.nn.embedding_lookup( self.user_embedding, self.uid) user_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b + user_node) user_noise_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout( user_noise_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: user_noise_encoder) user_noise_decoder = tf.identity( tf.matmul(user_noise_encoder, self.layer2_w1) + self.layer2_b) # Output self.user_noise_output = user_noise_decoder # Noisy Model Loss user_noise_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.user_noise_output)) # weight_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.weight_noise_output) - input) user_noise_base_loss = user_noise_base_loss / tf.cast( tf.shape(input)[0], dtype=user_noise_base_loss.dtype) if self.noise_pos == 'HID': with tf.name_scope("Hidden_AE"): if self.is_user_node: user_node = tf.nn.embedding_lookup( self.user_embedding, self.uid) hidden_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b + user_node) + hidden_noise_tr else: hidden_noise_encoder = tf.sigmoid( tf.matmul(input, self.layer1_w) + self.layer1_b) + hidden_noise_tr hidden_noise_encoder = tf.cond( self.istraining, lambda: tf.layers.dropout( hidden_noise_encoder, rate=self.layer1_dropout_rate, name='layer1_dropout'), lambda: hidden_noise_encoder) hidden_noise_decoder = tf.identity( tf.matmul(hidden_noise_encoder, self.layer2_w1) + self.layer2_b) # Output self.hidden_noise_output = hidden_noise_decoder hidden_noise_base_loss = tf.reduce_sum( tf.nn.sigmoid_cross_entropy_with_logits( labels=input, logits=self.hidden_noise_output)) # hidden_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.hidden_noise_output) - input) hidden_noise_base_loss = hidden_noise_base_loss / tf.cast( tf.shape(input)[0], dtype=self.hidden_noise_output.dtype) ############# Final Outputs with tf.name_scope('Prediction'): # self.mixed_output = (1-self.output_mix_ratio) * self.org_output + self.output_mix_ratio * self.noisy_output self.pred_y = tf.sigmoid(self.org_output) # self.pred_y = self.org_output # self.pred_y = tf.sigmoid(self.mixed_output) ############# Overall Losses with tf.name_scope('Loss'): if self.adv_training: if self.noise_pos == 'W1': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio_W1 * 
w1_noise_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.noise_pos == 'W2': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * w2_noise_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.noise_pos == 'HID': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * hidden_noise_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.noise_pos == 'USER': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * user_noise_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.noise_pos == 'CON': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * concat_noise_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(concat_w2) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.noise_pos == 'W1W2': base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio_W1 * w1_noise_base_loss + self.noise_loss_ratio * w2_noise_base_loss if self.noise_loss_ratio_W1 != 0 and self.noise_loss_ratio != 0: reg_loss = self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) elif self.noise_loss_ratio_W1 == 0: #Noise on W2 only reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) elif self.noise_loss_ratio == 0: #Noise on W1 only reg_loss = self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) else: base_loss = org_base_loss reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \ self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \ self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \ self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b) if self.is_user_node: reg_loss += self.user_node_regs * tf.nn.l2_loss( self.user_embedding) self.loss = base_loss + reg_loss ############# Optimizer with tf.name_scope('Optimizer'): self.opt = tf.train.GradientDescentOptimizer(self.lr).minimize( self.loss) # self.opt = tf.train.AdagradOptimizer(self.lr).minimize(self.loss) ########### Robustness Testing (Random or Adversial) with tf.name_scope('Noise_Adding'): if self.adv_training or self.robust_test: if self.noise_type == 'random': if self.noise_pos == 'W1': random_noise = tf.random_normal( shape=tf.shape(org_w1), mean=tf.reduce_mean(org_w1), stddev=0.01) self.update_delta = item_w1_noise.assign( self.eps * random_noise / tf.norm(random_noise)) if self.noise_pos == 'W2': random_noise = tf.random_normal( 
shape=tf.shape(org_w2), mean=tf.reduce_mean(org_w2), stddev=0.01) self.update_delta = item_w2_noise.assign( self.eps * random_noise / tf.norm(random_noise)) if self.noise_pos == 'USER': random_noise = tf.random_normal( shape=tf.shape(self.user_embedding), mean=tf.reduce_mean(self.user_embedding), stddev=0.01) self.update_delta = user_w_noise.assign( self.eps * random_noise / tf.norm(random_noise)) if self.noise_pos == 'HID': random_noise = tf.random_normal( shape=tf.shape(org_encoder), mean=tf.reduce_mean(org_encoder), stddev=0.01) if self.robust_test: self.update_delta = hidden_noise_eval.assign( self.eps * random_noise / tf.norm(random_noise)) else: self.update_delta = hidden_noise_tr.assign( self.eps * random_noise / tf.norm(random_noise)) if self.noise_type == 'adv': if self.noise_pos == 'W1': if self.robust_test: self.grad_delta = tf.gradients( ys=org_base_loss, xs=item_w1_noise)[0] else: self.grad_delta = tf.gradients( ys=base_loss, xs=item_w1_noise)[0] self.grad_delta_dense = tf.stop_gradient( self.grad_delta) self.update_delta = item_w1_noise.assign( self.eps * self.grad_delta_dense / tf.norm(self.grad_delta_dense)) if self.noise_pos == 'W2': if self.robust_test: self.grad_delta = tf.gradients( ys=org_base_loss, xs=item_w2_noise)[0] else: self.grad_delta = tf.gradients( ys=base_loss, xs=item_w2_noise)[0] self.grad_delta_dense = tf.stop_gradient( self.grad_delta) self.update_delta = item_w2_noise.assign( self.eps * self.grad_delta_dense / tf.norm(self.grad_delta_dense)) if self.noise_pos == 'USER': if self.robust_test: self.grad_delta = tf.gradients( ys=org_base_loss, xs=user_w_noise)[0] else: self.grad_delta = tf.gradients( ys=base_loss, xs=user_w_noise)[0] self.grad_delta_dense = tf.stop_gradient( self.grad_delta) self.update_delta = user_w_noise.assign( self.eps * self.grad_delta_dense / tf.norm(self.grad_delta_dense)) if self.noise_pos == 'HID': if self.robust_test: self.grad_delta = tf.gradients( ys=org_base_loss, xs=hidden_noise_eval)[0] self.grad_delta_dense = tf.stop_gradient( self.grad_delta) self.update_delta = hidden_noise_eval.assign( self.eps * self.grad_delta_dense / tf.norm(self.grad_delta_dense)) else: self.grad_delta = tf.gradients( ys=base_loss, xs=hidden_noise_tr)[0] self.grad_delta_dense = tf.stop_gradient( self.grad_delta) self.update_delta = hidden_noise_tr.assign( self.eps * self.grad_delta_dense / tf.norm(self.grad_delta_dense)) if self.noise_pos == 'W1W2': if self.robust_test: if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0: self.grad_delta1 = tf.gradients( ys=org_base_loss, xs=item_w1_noise)[0] self.grad_delta2 = tf.gradients( ys=org_base_loss, xs=item_w2_noise)[0] elif self.noise_loss_ratio_W1 == 0: self.grad_delta2 = tf.gradients( ys=org_base_loss, xs=item_w2_noise)[0] elif self.noise_loss_ratio == 0: self.grad_delta1 = tf.gradients( ys=org_base_loss, xs=item_w1_noise)[0] else: if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0: self.grad_delta1 = tf.gradients( ys=base_loss, xs=item_w1_noise)[0] self.grad_delta2 = tf.gradients( ys=base_loss, xs=item_w2_noise)[0] elif self.noise_loss_ratio_W1 == 0: self.grad_delta2 = tf.gradients( ys=base_loss, xs=item_w2_noise)[0] elif self.noise_loss_ratio == 0: self.grad_delta1 = tf.gradients( ys=base_loss, xs=item_w1_noise)[0] if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0: self.grad_delta_dense1 = tf.stop_gradient( self.grad_delta1) self.grad_delta_dense2 = tf.stop_gradient( self.grad_delta2) self.update_delta1 = item_w1_noise.assign( self.eps * self.grad_delta_dense1 / 
tf.norm(self.grad_delta_dense1)) self.update_delta2 = item_w2_noise.assign( self.eps * self.grad_delta_dense2 / tf.norm(self.grad_delta_dense2)) self.update_delta = self.update_delta1 + tf.transpose( self.update_delta2) elif self.noise_loss_ratio_W1 == 0: self.grad_delta_dense2 = tf.stop_gradient( self.grad_delta2) self.update_delta2 = item_w2_noise.assign( self.eps * self.grad_delta_dense2 / tf.norm(self.grad_delta_dense2)) self.update_delta = self.update_delta2 elif self.noise_loss_ratio == 0: self.grad_delta_dense1 = tf.stop_gradient( self.grad_delta1) self.update_delta1 = item_w1_noise.assign( self.eps * self.grad_delta_dense1 / tf.norm(self.grad_delta_dense1)) self.update_delta = self.update_delta1 print('Model Building Completed.')
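
# --- Illustrative sketch (TF 1.x assumed): the adversarial-noise update used above,
# reduced to a single weight matrix. The gradient w.r.t. the non-trainable noise variable
# is passed through tf.stop_gradient before being normalized and assigned, so constructing
# the perturbation adds no second-order terms to the training graph. Shapes and eps are
# assumptions.
import tensorflow as tf

eps = 0.5
w = tf.get_variable('w_sketch', [100, 32])
delta = tf.get_variable('delta_sketch', [100, 32],
                        initializer=tf.zeros_initializer(), trainable=False)
x = tf.placeholder(tf.float32, [None, 100])
target = tf.placeholder(tf.float32, [None, 32])

adv_loss = tf.reduce_mean(tf.square(tf.matmul(x, w + delta) - target))
grad_delta = tf.gradients(ys=adv_loss, xs=delta)[0]
grad_delta = tf.stop_gradient(grad_delta)
update_delta = delta.assign(eps * grad_delta / tf.norm(grad_delta))  # FGM-style step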
def loss(self, predictions, policy, cfv):
    r = tf.stop_gradient(
        cpea.rm_policy(cfv - tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
    log_policy = tf.log(tf.clip_by_value(policy, 1e-15, 1 - 1e-15))
    return -tf.reduce_mean(tf.reduce_sum(r * log_policy, axis=1))
def _build(self): inpts = [self.tiled_obs] if self.coords is not None: inpts.append(self.tiled_coords) self.outputs = self.sequence(*inpts) self.__dict__.update(self.outputs) log_weights = tf.reduce_sum(self.outputs.log_weights_per_timestep, 0) self.log_weights = tf.reshape(log_weights, (self.batch_size, self.k_particles)) self.elbo_vae = tf.reduce_mean(self.log_weights) self.elbo_iwae_per_example = targets.iwae(self.log_weights) self.elbo_iwae = tf.reduce_mean(self.elbo_iwae_per_example) self.normalised_elbo_vae = self.elbo_vae / tf.to_float(self.n_timesteps) self.normalised_elbo_iwae = self.elbo_iwae / tf.to_float(self.n_timesteps) tf.summary.scalar('normalised_vae', self.normalised_elbo_vae) tf.summary.scalar('normalised_iwae', self.normalised_elbo_iwae) self.importance_weights = tf.stop_gradient(tf.nn.softmax(self.log_weights, -1)) self.ess = ops.ess(self.importance_weights, average=True) self.iw_distrib = tf.distributions.Categorical(probs=self.importance_weights) self.iw_resampling_idx = self.iw_distrib.sample() # Logging self._log_resampled(self.data_ll_per_sample, 'data_ll') self._log_resampled(self.log_p_z_per_sample, 'log_p_z') self._log_resampled(self.log_q_z_given_x_per_sample, 'log_q_z_given_x') self._log_resampled(self.kl_per_sample, 'kl') # Mean squared error between inpt and mean of output distribution inpt_obs = self.tiled_obs if inpt_obs.shape[-1] == 1: inpt_obs = tf.squeeze(inpt_obs, -1) axes = [0] + list(range(inpt_obs.shape.ndims)[2:]) self.mse_per_sample = tf.reduce_mean((inpt_obs - self.canvas) ** 2, axes) self._log_resampled(self.mse_per_sample, 'mse') self.raw_mse = tf.reduce_mean(self.mse_per_sample) tf.summary.scalar('raw_mse', self.raw_mse) if hasattr(self, 'num_steps_per_sample'): self._log_resampled(self.num_steps_per_sample, 'num_steps') if self.gt_presence is not None: self.gt_num_steps = tf.reduce_sum(self.gt_presence, -1) num_steps_per_sample = tf.reshape(self.num_steps_per_sample, (-1, self.batch_size, self.k_particles)) gt_num_steps = tf.expand_dims(self.gt_num_steps, -1) self.num_step_accuracy_per_example = tf.to_float(tf.equal(gt_num_steps, num_steps_per_sample)) self.raw_num_step_accuracy = tf.reduce_mean(self.num_step_accuracy_per_example) self.num_step_accuracy = self._imp_weighted_mean(self.num_step_accuracy_per_example) tf.summary.scalar('num_step_acc', self.num_step_accuracy) # For rendering resampled_names = 'obj_id canvas glimpse presence_prob presence presence_logit where'.split() for name in resampled_names: try: setattr(self, 'resampled_' + name, self.resample(getattr(self, name), axis=1)) except AttributeError: pass try: self._log_resampled(self.num_disc_steps_per_sample, 'num_disc_steps') self._log_resampled(self.num_prop_steps_per_sample, 'num_prop_steps') except AttributeError: pass
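
# --- Illustrative sketch (TF 1.x assumed): the importance-weight pattern from _build.
# The per-particle weights are a softmax over log-weights wrapped in tf.stop_gradient,
# so resampling, ESS statistics and anything else weighted by them never differentiates
# through the normalization.
import tensorflow as tf

log_weights = tf.placeholder(tf.float32, [None, 5])   # (batch_size, k_particles)
importance_weights = tf.stop_gradient(tf.nn.softmax(log_weights, -1))
ess = 1.0 / tf.reduce_sum(tf.square(importance_weights), -1)   # effective sample size
iw_distrib = tf.distributions.Categorical(probs=importance_weights)
iw_resampling_idx = iw_distrib.sample()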
def main(_): since = time.time() tf.logging.set_verbosity(tf.logging.INFO) # Create directories to store TensorBoard summaries if tf.gfile.Exists(FLAGS.summaries_dir): tf.gfile.DeleteRecursively(FLAGS.summaries_dir) tf.gfile.MakeDirs(FLAGS.summaries_dir) # Set model Hyperparameter model_config = pretrain_model.get_model_config() # Read the folder structure, and create lists of all the images image_lists = utils.create_image_lists(FLAGS.images_dir) class_count = len(image_lists.keys()) if class_count == 0: tf.logging.error("No valid folders of images found at " + FLAGS.images_dir) return -1 if class_count == 1: tf.logging.error("Only one valid folder of images found at " + FLAGS.images_dir) return -1 # Create output_labels.txt displaying classes being trained with open(FLAGS.output_labels, "w") as f: f.write("\n".join(image_lists.keys()) + "\n") with tf.Session() as sess: # Set up the image decoding jpeg_data, decoded_image = pretrain_model.decode_jpeg( model_config["input_width"], model_config["input_height"], model_config["input_depth"], model_config["input_mean"], model_config["input_std"]) # Load DenseNet model densenet_model, bottlenecks, resized_image, bottlenecks_size = pretrain_model.load_densenet_169( FLAGS.model_dir) # store pretrained model bottlenecks bottleneck.store_bottlenecks(sess, image_lists, FLAGS.images_dir, FLAGS.bottlenecks_dir, FLAGS.model_name, jpeg_data, decoded_image, resized_image, bottlenecks, model=densenet_model) bottlenecks = tf.stop_gradient(bottlenecks) global_step = tf.Variable(tf.constant(0), trainable=False) # Initialized final layer (train_step, cross_entropy, accuracy, bottlenecks_input, labels_input, final_result) = train.final_layer(len(image_lists.keys()), FLAGS.final_name, bottlenecks, bottlenecks_size, FLAGS.learning_rate, global_step) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/train", sess.graph) validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/validation") # Initialize all variables init = tf.global_variables_initializer() sess.run(init) # Get validation bottlenecks for evaluation validation_bottlenecks, validation_labels = ( bottleneck.get_batch_of_bottlenecks( sess, image_lists, FLAGS.validation_batch_size, "validation", FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name, jpeg_data, decoded_image, resized_image, bottlenecks)) # Get test bottlenecks for evaluation test_bottlenecks, test_labels = (bottleneck.get_batch_of_bottlenecks( sess, image_lists, FLAGS.test_batch_size, "testing", FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name, jpeg_data, decoded_image, resized_image, bottlenecks)) best_acc = 0.0 for i in range(FLAGS.iterations): # Get training bottlenecks (train_bottlenecks, train_labels) = bottleneck.get_batch_of_bottlenecks( sess, image_lists, FLAGS.train_batch_size, "training", FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name, jpeg_data, decoded_image, resized_image, bottlenecks) # Training step train_summary, _ = sess.run( [merged, train_step], feed_dict={ bottlenecks_input: train_bottlenecks, labels_input: train_labels, global_step: i }) train_writer.add_summary(train_summary, i) # Show evaluation based on specified frequency final_step = (i + 1 == FLAGS.iterations) if (i % FLAGS.eval_interval) == 0 or final_step: # Evaluation train_accuracy, train_loss = sess.run( [accuracy, cross_entropy], feed_dict={ bottlenecks_input: train_bottlenecks, labels_input: train_labels }) # Run evaluation step on validation bottlenecks validation_summary, 
validation_accuracy, validation_loss = sess.run( [merged, accuracy, cross_entropy], feed_dict={ bottlenecks_input: validation_bottlenecks, labels_input: validation_labels }) validation_writer.add_summary(validation_summary, i) # Save best accuracy and store model if validation_accuracy > best_acc: best_acc = validation_accuracy # Calculate the test accuracy with best validation on test bottlenecks test_accuracy1, test_loss1 = sess.run( [accuracy, cross_entropy], feed_dict={ bottlenecks_input: test_bottlenecks, labels_input: test_labels }) train.save_graph_to_file(sess, FLAGS.output_graph, FLAGS.final_name) train.save_checkpoint_to_file(sess, FLAGS.output_checkpoint_dir) tf.logging.info( "Iteration {}: train loss = {}, train acc = {}, val loss = {}, val acc = {}." .format(i, train_loss, train_accuracy, validation_loss, validation_accuracy)) # Calculate the final test accuracy on test bottlenecks. test_accuracy2, test_loss2 = sess.run([accuracy, cross_entropy], feed_dict={ bottlenecks_input: test_bottlenecks, labels_input: test_labels }) tf.logging.info("Best validation accuracy = {}".format(best_acc * 100)) tf.logging.info("Test accuracy with best validation = {}".format( test_accuracy1 * 100)) tf.logging.info("Final test accuracy = {}".format(test_accuracy2 * 100)) time_elapsed = time.time() - since print("Runtime: {}min, {:0.2f}sec".format(int(time_elapsed // 60), time_elapsed % 60)) with open(os.path.join("..", FLAGS.model_name, "results.txt"), "w") as f: f.write("Best validation accuracy: " + str(best_acc) + "\n") f.write("Test accuracy with best validation: " + str(test_accuracy1) + "\n") f.write("Final test accuracy = {}".format(test_accuracy2 * 100) + "\n") f.write("Runtime: " + str(int(time_elapsed // 60)) + "min," + str(time_elapsed % 60) + "sec")
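
# --- Illustrative sketch (TF 1.x assumed): the "freeze the backbone" idea behind
# bottlenecks = tf.stop_gradient(bottlenecks) in main() above. Only the newly added final
# layer receives gradients; the pretrained feature extractor is left untouched. The tiny
# linear backbone below is a stand-in for DenseNet-169.
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 64])
backbone_w = tf.get_variable('backbone_w_sketch', [64, 16])     # stands in for the pretrained net
bottlenecks_sketch = tf.nn.relu(tf.matmul(images, backbone_w))
bottlenecks_sketch = tf.stop_gradient(bottlenecks_sketch)       # no gradients reach backbone_w
logits = tf.layers.dense(bottlenecks_sketch, 5)                 # new final layer, the only trained part
labels = tf.placeholder(tf.int64, [None])
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)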