Example #1
def context_infer(pooled_features):
    with tf.variable_scope("fc", reuse=True):
        weights = tf.stop_gradient(tf.get_variable("weights"))
        # b = tf.stop_gradient(tf.get_variable("biases"))

    z = tf.stop_gradient(pooled_features) #Nx64
    z = tf.expand_dims(z, -1) # Nx64x1
    
    w = weights # 64x10
    w = tf.expand_dims(w, 0) # 1x64x10
    mean, variance = tf.nn.moments(w, [1], keep_dims=True) # 1x1x10
    response = tf.reduce_sum(tf.multiply(z, w), 1, keep_dims=True) # Nx1x10
    response_vec = tf.multiply(response, w) # Nx64x10
    response_vec = tf.divide(response_vec, variance) # Nx64x10
    h = tf.subtract(z, response_vec) # Nx64x10

    weights_initializer = tf.truncated_normal_initializer(
        stddev=FC_WEIGHT_STDDEV)
    with tf.variable_scope("context", reuse=True):
        context_weights = tf.stop_gradient(tf.get_variable("weights"))
        biases = tf.stop_gradient(tf.get_variable("biases"))
    context_weights = tf.expand_dims(context_weights, 0)
    biases = tf.expand_dims(biases, 0)
    scores = tf.reduce_sum(tf.multiply(h, context_weights), 1) + biases
    
    # TODO how to deal with b?
    return scores
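All of these snippets rely on the same contract: tf.stop_gradient acts as the identity in the forward pass but contributes no gradient in the backward pass. A minimal sketch (TF 1.x, toy values) of that behavior:

import tensorflow as tf

x = tf.constant(3.0)
y = x * tf.stop_gradient(x)     # forward value: 9.0
grad = tf.gradients(y, x)[0]    # the stopped factor is treated as a constant

with tf.Session() as sess:
    print(sess.run([y, grad]))  # [9.0, 3.0] rather than [9.0, 6.0]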
Example #2
  def get_dynamic_rebar_gradient(self):
    """Get the dynamic rebar gradient (t, eta optimized)."""
    tiled_pre_temperature = tf.tile([self.pre_temperature_variable],
                                [self.batch_size])
    temperature = tf.exp(tiled_pre_temperature)

    hardELBO, nvil_gradient, logQHard = self._create_hard_elbo()
    if self.hparams.quadratic:
      gumbel_cv, extra  = self._create_gumbel_control_variate_quadratic(logQHard, temperature=temperature)
    else:
      gumbel_cv, extra  = self._create_gumbel_control_variate(logQHard, temperature=temperature)

    f_grads = self.optimizer_class.compute_gradients(tf.reduce_mean(-nvil_gradient))

    eta = {}
    h_grads, eta_statistics = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(gumbel_cv)),
        eta)

    model_grads = U.add_grads_and_vars(f_grads, h_grads)
    total_grads = model_grads

    # Construct the variance objective
    g = U.vectorize(model_grads, set_none_to_zero=True)
    self.maintain_ema_ops.append(self.ema.apply([g]))
    gbar = 0  #tf.stop_gradient(self.ema.average(g))
    variance_objective = tf.reduce_mean(tf.square(g - gbar))

    reinf_g_t = 0
    if self.hparams.quadratic:
      for layer in xrange(self.hparams.n_layer):
        gumbel_learning_signal, _ = extra[layer]
        df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
        reinf_g_t_i, _ = self.multiply_by_eta_per_layer(
            self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * logQHard[layer])),
            eta)
        reinf_g_t += U.vectorize(reinf_g_t_i, set_none_to_zero=True)

      reparam = tf.add_n([reparam_i for _, reparam_i in extra])
    else:
      gumbel_learning_signal, reparam = extra
      df_dt = tf.gradients(gumbel_learning_signal, tiled_pre_temperature)[0]
      reinf_g_t, _ = self.multiply_by_eta_per_layer(
          self.optimizer_class.compute_gradients(tf.reduce_mean(tf.stop_gradient(df_dt) * tf.add_n(logQHard))),
          eta)
      reinf_g_t = U.vectorize(reinf_g_t, set_none_to_zero=True)

    reparam_g, _ = self.multiply_by_eta_per_layer(
        self.optimizer_class.compute_gradients(tf.reduce_mean(reparam)),
        eta)
    reparam_g = U.vectorize(reparam_g, set_none_to_zero=True)
    reparam_g_t = tf.gradients(tf.reduce_mean(2*tf.stop_gradient(g - gbar)*reparam_g), self.pre_temperature_variable)[0]

    variance_objective_grad = tf.reduce_mean(2*(g - gbar)*reinf_g_t) + reparam_g_t

    debug = { 'ELBO': hardELBO,
             'etas': eta_statistics,
             'variance_objective': variance_objective,
             }
    return total_grads, debug, variance_objective, variance_objective_grad
Example #3
    def build_loss(self):
        """
        Loss function to minimize, whose gradient is a stochastic
        gradient inspired by adaptive importance sampling.

        loss = E_{p(z | x)} [ log p(z | x) - log q(z; lambda) ]

        is equivalent to minimizing

        E_{p(z | x)} [ log p(x, z) - log q(z; lambda) ]
        \approx 1/B sum_{b=1}^B
            w_norm(z^b; lambda) (log p(x, z^b) - log q(z^b; lambda))

        with gradient
        \approx - 1/B sum_{b=1}^B
            w_norm(z^b; lambda) grad_{lambda} log q(z^b; lambda)

        where + z^b ~ q(z^b; lambda)
              + w_norm(z^b; lambda) = w(z^b; lambda) / sum_{b=1}^B w(z^b; lambda)
              + w(z^b; lambda) = p(x, z^b) / q(z^b; lambda)
        """
        x = self.data.sample(self.n_data)
        z, self.samples = self.variational.sample(self.n_minibatch)

        q_log_prob = tf.zeros([self.n_minibatch], dtype=tf.float32)
        for i in range(self.variational.num_factors):
            q_log_prob += self.variational.log_prob_i(i, tf.stop_gradient(z))

        # normalized importance weights
        log_w = self.model.log_prob(x, z) - q_log_prob
        log_w_norm = log_w - log_sum_exp(log_w)
        w_norm = tf.exp(log_w_norm)

        self.loss = tf.reduce_mean(w_norm * log_w)
        return -tf.reduce_mean(q_log_prob * tf.stop_gradient(w_norm))
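The log_sum_exp helper is assumed to come from the surrounding library; tf.reduce_logsumexp computes the same quantity. A minimal sketch of the self-normalized importance weights with toy values:

import tensorflow as tf

log_w = tf.constant([-1.0, -2.0, -3.0])              # log w(z^b) for B = 3 samples
w_norm = tf.exp(log_w - tf.reduce_logsumexp(log_w))  # normalized weights, sum to 1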
Example #4
    def _step(self, J, voltage, refractory, dt):
        delta_t = tf.clip_by_value(dt - refractory, self.zero, dt)

        dV = (voltage - J) * tf.expm1(-delta_t / self.tau_rc)
        voltage += dV

        spiked = voltage > self.one
        spikes = tf.cast(spiked, J.dtype) * self.alpha

        partial_ref = -self.tau_rc * tf.log1p((self.one - voltage) /
                                              (J - self.one))
        # FastLIF version (linearly approximate spike time when calculating
        # remaining refractory period)
        # partial_ref = signals.dt * (voltage - self.one) / dV

        refractory = tf.where(spiked, self.tau_ref - partial_ref,
                              refractory - dt)

        voltage = tf.where(spiked, self.zeros,
                           tf.maximum(voltage, self.min_voltage))

        # we use stop_gradient to avoid propagating any nans (those get
        # propagated through the cond even if the spiking version isn't
        # being used at all)
        return (tf.stop_gradient(spikes), tf.stop_gradient(voltage),
                tf.stop_gradient(refractory))
Example #5
def get_next_input(output):
    # the next location is computed by the location network
    baseline = tf.sigmoid(tf.matmul(output,Wb_h_b) + Bb_h_b)
    baselines.append(baseline)
    # compute the next location, then impose noise
    if eyeCentered:
        # add the last sampled glimpse location
        # TODO max(-1, min(1, u + N(output, sigma) + prevLoc))
        mean_loc = tf.maximum(-1.0, tf.minimum(1.0, tf.matmul(output, Wl_h_l) + sampled_locs[-1] ))
    else:
        mean_loc = tf.matmul(output, Wl_h_l)

    # mean_loc = tf.stop_gradient(mean_loc)
    mean_locs.append(mean_loc)
    mean_locs_stopGrad.append(tf.stop_gradient(mean_loc))

    # add noise
    # sample_loc = tf.tanh(mean_loc + tf.random_normal(mean_loc.get_shape(), 0, loc_sd))
    sample_loc = tf.maximum(-1.0, tf.minimum(1.0, mean_loc + tf.random_normal(mean_loc.get_shape(), 0, loc_sd)))

    # don't propagate through the locations
    # sample_loc = tf.stop_gradient(sample_loc)
    sampled_locs.append(sample_loc)
    sampled_locs_stopGrad.append(tf.stop_gradient(sample_loc))

    return get_glimpse(sample_loc)
Example #6
    def energy(self, visible_state, hidden_state, scope='energy'):
        with tf.variable_scope(scope):
            visible_state = tf.stop_gradient(visible_state, name="visible_state")
            hidden_state = tf.stop_gradient(hidden_state, name="hidden_state")
            energy = -tf.reduce_mean(tf.reduce_sum(tf.multiply(tf.matmul(visible_state, self.W, name='visible_weights'),
                                                               hidden_state, name='weights_hidden')
                                                   , axis=1, name='energy_sum'), name="batch_energy_mean")

            if self.visible.use_bias:
                if self.visible.binary:
                    energy = tf.add(energy, -tf.reduce_mean(
                        tf.reduce_sum(tf.multiply(self.visible.bias, visible_state, name='visible_bias_energy'), axis=1)))
                else:
                    v = visible_state - self.visible.bias
                    energy = tf.add(energy,  tf.reduce_mean(tf.reduce_sum(tf.multiply(v, v) / 2, axis=1)))


            if self.hidden.use_bias:
                if self.hidden.binary:
                    energy = tf.add(energy, -tf.reduce_mean(
                        tf.reduce_sum(tf.multiply(self.hidden.bias, hidden_state, name='hidden_bias_energy'), axis=1)))
                else:
                    h = hidden_state - self.hidden.bias
                    energy = tf.add(energy, tf.reduce_mean(tf.reduce_sum(tf.multiply(h, h) / 2, axis=1)))

        return energy
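With both states wrapped in stop_gradient, this energy backpropagates only into W and the bias variables; the sampled visible and hidden states are treated as constants, the usual setup for contrastive-divergence-style training.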
Example #7
  def _create_gumbel_control_variate(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    logQ, softSamples = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft, temperature=temperature))
    softELBO, _ = self._generator_network(softSamples, logQ)
    logQ = tf.add_n(logQ)

    # Generate the softELBO_v (should be the same value but different grads)
    logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
        self._random_sample_soft_v, temperature=temperature))
    softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)
    logQ_v = tf.add_n(logQ_v)

    # Compute losses
    learning_signal = tf.stop_gradient(softELBO_v)

    # Control variate
    h = (tf.stop_gradient(learning_signal) * tf.add_n(logQHard)
          - softELBO + softELBO_v)

    extra = (softELBO_v, -softELBO + softELBO_v)

    return h, extra
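Since softELBO and softELBO_v are built to have the same value but different gradients, the -softELBO + softELBO_v term cancels in value and contributes only a gradient correction. Note also that learning_signal is already a stop_gradient of softELBO_v, so the second tf.stop_gradient wrapper inside h is redundant but harmless.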
Example #8
def latent_prediction_model(inputs,
                            ed_attention_bias,
                            latents_discrete,
                            latents_dense,
                            hparams,
                            name="latent_prediction"):
  """Transformer-based latent prediction model.

  It is an autoregressive decoder over latents_discrete given inputs.

  Args:
    inputs: Tensor of shape [batch, length_kv, hparams.hidden_size]. Inputs to
      attend to for the decoder on latents.
    ed_attention_bias: Tensor which broadcasts with shape [batch,
      hparams.num_heads, length_q, length_kv]. Encoder-decoder attention bias.
    latents_discrete: Tensor of shape [batch, length_q, vocab_size].
      One-hot latents to compute log-probability of given inputs.
    latents_dense: Tensor of shape [batch, length_q, hparams.hidden_size].
    hparams: tf.contrib.training.HParams.
    name: string, variable scope.

  Returns:
    latents_pred: Tensor of shape [batch, length_q, hparams.hidden_size].
    latents_pred_loss: Tensor of shape [batch, length_q].
  """
  with tf.variable_scope(name):
    if hparams.mode != tf.estimator.ModeKeys.PREDICT:
      latents_pred = transformer_latent_decoder(
          tf.stop_gradient(latents_dense), inputs, ed_attention_bias,
          hparams, name)
      _, latent_pred_loss = ae_latent_softmax(
          latents_pred, tf.stop_gradient(latents_discrete), hparams)
  return latents_pred, latent_pred_loss
Example #9
def rnn_decoder(cell, inputs, initial_state, embedding_size, embedding_length, sequence_length,
                name='RNNDecoder', reuse=False, use_inputs_prob=0.0, static_input=None):
    with tf.variable_scope(name, reuse=reuse):
        # print(tf.get_variable_scope().reuse, tf.get_variable_scope().name)
        with tf.name_scope("embedding"):
            batch_size = tf.shape(initial_state)[0]
            embedding_table = tf.get_variable(
                name='embedding_table',
                shape=[embedding_length, embedding_size],
                initializer=tf.truncated_normal_initializer(stddev=glorot_mul(embedding_length, embedding_size)),
            )
            # 0 is index for _SOS_ (start of sentence symbol)
            initial_embedding = tf.gather(embedding_table, tf.zeros(tf.stack([batch_size]), tf.int32))

        states = [initial_state]
        outputs = []
        outputs_softmax = []
        decoder_outputs_argmax_embedding = []

        for j in range(sequence_length):
            with tf.variable_scope(tf.get_variable_scope(), reuse=True if j > 0 else None):
                # get the input:
                #   either feed back the previous decoder argmax output
                #   or use the provided input (note that we need the previous input, i.e. index j - 1)
                input = initial_embedding
                if j > 0:
                    true_input = tf.gather(embedding_table, inputs[j - 1])
                    decoded_input = decoder_outputs_argmax_embedding[-1]
                    choice = tf.floor(tf.random_uniform([1], use_inputs_prob, 1 + use_inputs_prob, tf.float32))
                    input = choice * true_input + (1.0 - choice) * decoded_input

                if static_input:
                    input = tf.concat([input, static_input], 1)

                # print(tf.get_variable_scope().reuse, tf.get_variable_scope().name)
                output, state = cell(input, states[-1])

                projection = linear(
                    input=output,
                    input_size=cell.output_size,
                    output_size=embedding_length,
                    name='output_linear_projection'
                )

                outputs.append(projection)
                states.append(state)

                softmax = tf.nn.softmax(projection, name="output_softmax")
                # we do not compute the gradient through argmax
                output_argmax = tf.stop_gradient(tf.argmax(softmax, 1))
                # we do not compute the gradient for embeddings when used with noisy argmax outputs
                output_argmax_embedding = tf.stop_gradient(tf.gather(embedding_table, output_argmax))
                decoder_outputs_argmax_embedding.append(output_argmax_embedding)

                outputs_softmax.append(tf.expand_dims(softmax, 1))

    # remove the initial state
    states = states[1:]

    return states, outputs, outputs_softmax
Example #10
  def self_kl(self, logits,
              sampling_dim, act_dim, act_type):
    """Calculate KL of distribution with itself.

    Used only for the gradients; its value is identically zero.
    """

    if self.env_spec.is_discrete(act_type):
      probs = tf.nn.softmax(logits)
      log_probs = tf.nn.log_softmax(logits)
      self_kl = tf.reduce_sum(
          tf.stop_gradient(probs) *
          (tf.stop_gradient(log_probs) - log_probs), -1)
    elif self.env_spec.is_box(act_type):
      means = logits[:, :sampling_dim // 2]
      std = logits[:, sampling_dim // 2:]
      my_means = tf.stop_gradient(means)
      my_std = tf.stop_gradient(std)
      self_kl = tf.reduce_sum(
          tf.log(std / my_std) +
          (tf.square(my_std) + tf.square(my_means - means)) /
          (2.0 * tf.square(std)) - 0.5,
          -1)
    else:
      assert False

    return self_kl
Example #11
  def loop_function(prev, i, log_beam_probs, beam_path, beam_symbols):
    if output_projection is not None:
      prev = nn_ops.xw_plus_b(
          prev, output_projection[0], output_projection[1])
    # prev= prev.get_shape().with_rank(2)[1]

    probs  = tf.log(tf.nn.softmax(prev))

    if i > 1:

        probs = tf.reshape(probs + log_beam_probs[-1],
                               [-1, beam_size * num_symbols])

    best_probs, indices = tf.nn.top_k(probs, beam_size)
    indices = tf.stop_gradient(tf.squeeze(tf.reshape(indices, [-1, 1])))
    best_probs = tf.stop_gradient(tf.reshape(best_probs, [-1, 1]))

    symbols = indices % num_symbols # Which word in vocabulary.
    beam_parent = indices // num_symbols # Which hypothesis it came from.


    beam_symbols.append(symbols)
    beam_path.append(beam_parent)
    log_beam_probs.append(best_probs)

    # Note that gradients will not propagate through the second parameter of
    # embedding_lookup.

    emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    emb_prev  = tf.reshape(emb_prev,[beam_size,embedding_size])
    # emb_prev = embedding_ops.embedding_lookup(embedding, symbols)
    if not update_embedding:
      emb_prev = array_ops.stop_gradient(emb_prev)
    return emb_prev
Example #12
def virtual_adversarial_loss_bidir(logits, embedded, inputs,
                                   logits_from_embedding_fn):
  """Virtual adversarial loss for bidirectional models."""
  logits = tf.stop_gradient(logits)
  f_inputs, _ = inputs
  weights = _end_of_seq_mask(f_inputs.labels)

  perturbs = [
      _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length)
      for emb in embedded
  ]
  for _ in xrange(FLAGS.num_power_iteration):
    perturbs = [
        _scale_l2(d, FLAGS.small_constant_for_finite_diff) for d in perturbs
    ]
    d_logits = logits_from_embedding_fn(
        [emb + d for (emb, d) in zip(embedded, perturbs)])
    kl = _kl_divergence_with_logits(logits, d_logits, weights)
    perturbs = tf.gradients(
        kl,
        perturbs,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
    perturbs = [tf.stop_gradient(d) for d in perturbs]

  perturbs = [
      _scale_l2(_mask_by_length(d, f_inputs.length), FLAGS.perturb_norm_length)
      for d in perturbs
  ]
  vadv_logits = logits_from_embedding_fn(
      [emb + d for (emb, d) in zip(embedded, perturbs)])
  return _kl_divergence_with_logits(logits, vadv_logits, weights)
Example #13
  def _logits_cumulative(self, inputs, stop_gradient):
    """Evaluate logits of the cumulative densities.

    Arguments:
      inputs: The values at which to evaluate the cumulative densities, expected
        to be a `Tensor` of shape `(channels, 1, batch)`.
      stop_gradient: Boolean. Whether to add `tf.stop_gradient` calls so
        that the gradient of the output with respect to the density model
        parameters is disconnected (the gradient with respect to `inputs` is
        left untouched).

    Returns:
      A `Tensor` of the same shape as `inputs`, containing the logits of the
      cumulative densities evaluated at the given inputs.
    """
    logits = inputs

    for i in range(len(self.filters) + 1):
      matrix = self._matrices[i]
      if stop_gradient:
        matrix = tf.stop_gradient(matrix)
      logits = tf.linalg.matmul(matrix, logits)

      bias = self._biases[i]
      if stop_gradient:
        bias = tf.stop_gradient(bias)
      logits += bias

      if i < len(self._factors):
        factor = self._factors[i]
        if stop_gradient:
          factor = tf.stop_gradient(factor)
        logits += factor * tf.math.tanh(logits)

    return logits
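Because stop_gradient is applied only to matrix, bias, and factor, the gradient with respect to inputs flows through unchanged, exactly as the docstring promises.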
Example #14
  def _create_gumbel_control_variate_quadratic(self, logQHard, temperature=None):
    '''Calculate gumbel control variate.
    '''
    if temperature is None:
      temperature = self.hparams.temperature

    h = 0
    extra = []
    for layer in xrange(self.hparams.n_layer):
      logQ, softSamples = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch, switch_layer=layer, temperature=temperature))
      softELBO, _ = self._generator_network(softSamples, logQ)

      # Generate the softELBO_v (should be the same value but different grads)
      logQ_v, softSamples_v = self._recognition_network(sampler=functools.partial(
          self._random_sample_switch_v, switch_layer=layer, temperature=temperature))
      softELBO_v, _ = self._generator_network(softSamples_v, logQ_v)

      # Compute losses
      learning_signal = tf.stop_gradient(softELBO_v)

      # Control variate
      h += (tf.stop_gradient(learning_signal) * logQHard[layer]
            - softELBO + softELBO_v)

      extra.append((softELBO_v, -softELBO + softELBO_v))

    return h, extra
Example #15
  def target_critic_net(self, states, actions, for_critic_loss=False):
    """Returns the output of the target critic network.

    The target network is used to compute stable targets for training.

    Args:
      states: A [batch_size, num_state_dims] tensor representing a batch
        of states.
      actions: A [batch_size, num_action_dims] tensor representing a batch
        of actions.
    Returns:
      q values: A [batch_size] tensor of q values.
    Raises:
      ValueError: If `states` or `actions` do not have the expected dimensions.
    """
    self._validate_states(states)
    self._validate_actions(actions)
    values1 = tf.stop_gradient(
        self._target_critic_net(states, actions,
                                for_critic_loss=for_critic_loss))
    values2 = tf.stop_gradient(
        self._target_critic_net2(states, actions,
                                 for_critic_loss=for_critic_loss))
    if for_critic_loss:
      return values1, values2
    return values1
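Wrapping both target critics in stop_gradient ensures that TD targets built from their outputs never backpropagate into the target networks, which are updated by a separate mechanism (typically a soft copy of the online weights).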
Example #16
    def build_graph(self, state, action, futurereward, action_prob):
        logits, value = self._get_NN_prediction(state)
        value = tf.squeeze(value, [1], name='pred_value')  # (B,)
        policy = tf.nn.softmax(logits, name='policy')
        is_training = get_current_tower_context().is_training
        if not is_training:
            return
        log_probs = tf.log(policy + 1e-6)

        log_pi_a_given_s = tf.reduce_sum(
            log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
        advantage = tf.subtract(tf.stop_gradient(value), futurereward, name='advantage')

        pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, NUM_ACTIONS), 1)  # (B,)
        importance = tf.stop_gradient(tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

        policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage * importance, name='policy_loss')
        xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
        value_loss = tf.nn.l2_loss(value - futurereward, name='value_loss')

        pred_reward = tf.reduce_mean(value, name='predict_reward')
        advantage = tf.sqrt(tf.reduce_mean(tf.square(advantage)), name='rms_advantage')
        entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                       initializer=tf.constant_initializer(0.01), trainable=False)
        cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
        cost = tf.truediv(cost, tf.cast(tf.shape(futurereward)[0], tf.float32), name='cost')
        summary.add_moving_summary(policy_loss, xentropy_loss,
                                   value_loss, pred_reward, advantage,
                                   cost, tf.reduce_mean(importance, name='importance'))
        return cost
Example #17
def virtual_adversarial_loss_bidir(logits, embedded, inputs,
                                   logits_from_embedding_fn):
  """Virtual adversarial loss for bidirectional models."""
  logits = tf.stop_gradient(logits)
  f_inputs, _ = inputs
  weights = f_inputs.eos_weights
  if FLAGS.single_label:
    indices = tf.stack([tf.range(FLAGS.batch_size), f_inputs.length - 1], 1)
    weights = tf.expand_dims(tf.gather_nd(f_inputs.eos_weights, indices), 1)
  assert weights is not None

  perturbs = [
      _mask_by_length(tf.random_normal(shape=tf.shape(emb)), f_inputs.length)
      for emb in embedded
  ]
  for _ in xrange(FLAGS.num_power_iteration):
    perturbs = [
        _scale_l2(d, FLAGS.small_constant_for_finite_diff) for d in perturbs
    ]
    d_logits = logits_from_embedding_fn(
        [emb + d for (emb, d) in zip(embedded, perturbs)])
    kl = _kl_divergence_with_logits(logits, d_logits, weights)
    perturbs = tf.gradients(
        kl,
        perturbs,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
    perturbs = [tf.stop_gradient(d) for d in perturbs]

  perturbs = [_scale_l2(d, FLAGS.perturb_norm_length) for d in perturbs]
  vadv_logits = logits_from_embedding_fn(
      [emb + d for (emb, d) in zip(embedded, perturbs)])
  return _kl_divergence_with_logits(logits, vadv_logits, weights)
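Re-wrapping the perturbations in stop_gradient after each power iteration keeps the next tf.gradients call from differentiating through the whole unrolled loop; every step differentiates only the current KL term.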
Example #18
  def get_muprop_gradient(self):
    """
    random sample function that actually returns mean
    new forward pass that returns logQ as a list

    can get x_i from samples
    """

    # Hard loss
    logQHard, hardSamples = self._recognition_network()
    hardELBO, reinforce_model_grad = self._generator_network(hardSamples, logQHard)

    # Soft loss
    logQ, muSamples = self._recognition_network(sampler=self._mean_sample)
    muELBO, _ = self._generator_network(muSamples, logQ)

    # Compute gradients
    muELBOGrads = tf.gradients(tf.reduce_sum(muELBO),
                               [ muSamples[i]['activation'] for
                                i in xrange(self.hparams.n_layer) ])

    # Compute MuProp gradient estimates
    learning_signal = hardELBO
    optimizerLoss = 0.0
    learning_signals = []
    for i in xrange(self.hparams.n_layer):
      dfDiff = tf.reduce_sum(
          muELBOGrads[i] * (hardSamples[i]['activation'] -
                            muSamples[i]['activation']),
          axis=1)
      dfMu = tf.reduce_sum(
          tf.stop_gradient(muELBOGrads[i]) *
          tf.nn.sigmoid(hardSamples[i]['log_param']),
          axis=1)

      scaling_baseline_0 = self._create_eta(collection='BASELINE')
      scaling_baseline_1 = self._create_eta(collection='BASELINE')
      learning_signals.append(learning_signal - scaling_baseline_0 * muELBO - scaling_baseline_1 * dfDiff - self._create_baseline())
      self.baseline_loss.append(tf.square(learning_signals[i]))

      optimizerLoss += (
          logQHard[i] * tf.stop_gradient(learning_signals[i]) +
          tf.stop_gradient(scaling_baseline_1) * dfMu)
    optimizerLoss += reinforce_model_grad
    optimizerLoss *= -1

    optimizerLoss = tf.reduce_mean(optimizerLoss)

    muprop_gradient = self.optimizer_class.compute_gradients(optimizerLoss)
    debug = {
        'ELBO': hardELBO,
        'muELBO': muELBO,
    }

    debug.update(dict([
        ('RMS learning signal layer %d' % i, U.rms(learning_signal))
        for (i, learning_signal) in enumerate(learning_signals)]))

    return muprop_gradient, debug
Example #19
def batch_norm(input_,
               dim,
               name,
               scale=True,
               train=True,
               epsilon=1e-8,
               decay=.1,
               axes=[0],
               bn_lag=DEFAULT_BN_LAG):
    """Batch normalization."""
    # create variables
    with tf.variable_scope(name):
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
        if scale:
            gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.))
        beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.))
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # normalize
    res = (input_ - used_mean) / tf.sqrt(used_var + epsilon)
    # de-normalize
    if scale:
        res *= gamma
    res += beta

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
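        # multiplying by zero adds a data dependency that forces the three assign ops to run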
        res += 0. * new_mean * new_var * new_step

    return res
Example #20
    def __init__(self,
                 q_t,
                 q_tp1,
                 q_tp0,
                 importance_weights,
                 rewards,
                 done_mask,
                 twin_q_t,
                 twin_q_tp1,
                 actor_loss_coeff=0.1,
                 critic_loss_coeff=1.0,
                 gamma=0.99,
                 n_step=1,
                 use_huber=False,
                 huber_threshold=1.0,
                 twin_q=False,
                 policy_delay=1):

        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
        if twin_q:
            twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
            q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

        q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked

        # compute the error (potentially clipped)
        if twin_q:
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            twin_td_error = twin_q_t_selected - tf.stop_gradient(
                q_t_selected_target)
            self.td_error = td_error + twin_td_error
            if use_huber:
                errors = _huber_loss(td_error, huber_threshold) + _huber_loss(
                    twin_td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(
                    twin_td_error)
        else:
            self.td_error = (
                q_t_selected - tf.stop_gradient(q_t_selected_target))
            if use_huber:
                errors = _huber_loss(self.td_error, huber_threshold)
            else:
                errors = 0.5 * tf.square(self.td_error)

        self.critic_loss = critic_loss_coeff * tf.reduce_mean(
            importance_weights * errors)

        # for policy gradient, update policy net one time v.s.
        # update critic net `policy_delay` time(s)
        global_step = tf.train.get_or_create_global_step()
        policy_delay_mask = tf.to_float(
            tf.equal(tf.mod(global_step, policy_delay), 0))
        self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                           tf.reduce_mean(q_tp0))
Example #21
def build_score_loss_and_gradients(inference, var_list):
  """Build loss function and gradients based on the score function
  estimator (Paisley et al., 2012).

  Computed by sampling from $q(z;\lambda)$ and evaluating the
  expectation using Monte Carlo sampling.
  """
  p_log_prob = [0.0] * inference.n_samples
  q_log_prob = [0.0] * inference.n_samples
  for s in range(inference.n_samples):
    # Form dictionary in order to replace conditioning on prior or
    # observed variable with conditioning on a specific value.
    scope = 'inference_' + str(id(inference)) + '/' + str(s)
    dict_swap = {}
    for x, qx in six.iteritems(inference.data):
      if isinstance(x, RandomVariable):
        if isinstance(qx, RandomVariable):
          qx_copy = copy(qx, scope=scope)
          dict_swap[x] = qx_copy.value()
        else:
          dict_swap[x] = qx

    for z, qz in six.iteritems(inference.latent_vars):
      # Copy q(z) to obtain new set of posterior samples.
      qz_copy = copy(qz, scope=scope)
      dict_swap[z] = qz_copy.value()
      q_log_prob[s] += tf.reduce_sum(
          inference.scale.get(z, 1.0) *
          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))

    for z in six.iterkeys(inference.latent_vars):
      z_copy = copy(z, dict_swap, scope=scope)
      p_log_prob[s] += tf.reduce_sum(
          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))

    for x in six.iterkeys(inference.data):
      if isinstance(x, RandomVariable):
        x_copy = copy(x, dict_swap, scope=scope)
        p_log_prob[s] += tf.reduce_sum(
            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

  p_log_prob = tf.stack(p_log_prob)
  q_log_prob = tf.stack(q_log_prob)

  if inference.logging:
    summary_key = 'summaries_' + str(id(inference))
    tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
                      collections=[summary_key])
    tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
                      collections=[summary_key])

  losses = p_log_prob - q_log_prob
  loss = -tf.reduce_mean(losses)

  grads = tf.gradients(
      -tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)),
      var_list)
  grads_and_vars = list(zip(grads, var_list))
  return loss, grads_and_vars
Example #22
def kmeans(x, means, hparams, name):
  with tf.variable_scope(name):
    x_means_hot = nearest(x, means, hparams)
    x_means = tf.gather(means, tf.argmax(x_means_hot, axis=-1))
    reg_loss1 = tf.nn.l2_loss((tf.stop_gradient(x) - x_means))
    reg_loss2 = hparams.beta * tf.nn.l2_loss((x - tf.stop_gradient(x_means)))
    l = reg_loss1 + reg_loss2
    return x_means_hot, x_means, l
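The two losses split the VQ-VAE gradient: reg_loss1 moves the codebook means toward the (frozen) encodings, while reg_loss2 is the commitment term pulling encodings toward the (frozen) means. Downstream code usually pairs this with a straight-through estimator; a one-line sketch of that pattern (x_means_st is a hypothetical name, not part of this function):

x_means_st = x + tf.stop_gradient(x_means - x)  # forward: x_means; gradient: flows to x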
Example #23
def build_score_kl_loss_and_gradients(inference, var_list):
  """Build loss function and gradients based on the score function
  estimator (Paisley et al., 2012).

  It assumes the KL is analytic.

  Computed by sampling from $q(z;\lambda)$ and evaluating the
  expectation using Monte Carlo sampling.
  """
  p_log_lik = [0.0] * inference.n_samples
  q_log_prob = [0.0] * inference.n_samples
  base_scope = tf.get_default_graph().unique_name("inference") + '/'
  for s in range(inference.n_samples):
    # Form dictionary in order to replace conditioning on prior or
    # observed variable with conditioning on a specific value.
    scope = base_scope + tf.get_default_graph().unique_name("sample")
    dict_swap = {}
    for x, qx in six.iteritems(inference.data):
      if isinstance(x, RandomVariable):
        if isinstance(qx, RandomVariable):
          qx_copy = copy(qx, scope=scope)
          dict_swap[x] = qx_copy.value()
        else:
          dict_swap[x] = qx

    for z, qz in six.iteritems(inference.latent_vars):
      # Copy q(z) to obtain new set of posterior samples.
      qz_copy = copy(qz, scope=scope)
      dict_swap[z] = qz_copy.value()
      q_log_prob[s] += tf.reduce_sum(
          inference.scale.get(z, 1.0) *
          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))

    for x in six.iterkeys(inference.data):
      if isinstance(x, RandomVariable):
        x_copy = copy(x, dict_swap, scope=scope)
        p_log_lik[s] += tf.reduce_sum(
            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))

  p_log_lik = tf.stack(p_log_lik)
  q_log_prob = tf.stack(q_log_prob)

  kl_penalty = tf.reduce_sum([
      inference.kl_scaling.get(z, 1.0) * tf.reduce_sum(kl_divergence(qz, z))
      for z, qz in six.iteritems(inference.latent_vars)])

  if inference.logging:
    tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik),
                      collections=[inference._summary_key])
    tf.summary.scalar("loss/kl_penalty", kl_penalty,
                      collections=[inference._summary_key])

  loss = -(tf.reduce_mean(p_log_lik) - kl_penalty)
  grads = tf.gradients(
      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl_penalty),
      var_list)
  grads_and_vars = list(zip(grads, var_list))
  return loss, grads_and_vars
Example #24
    def __init__(self, env):
        self.env = env
        if not isinstance(env.observation_space, Box) or \
           not isinstance(env.action_space, Discrete):
            print("Incompatible spaces.")
            exit(-1)
        print("Observation Space", env.observation_space)
        print("Action Space", env.action_space)
        self.session = tf.Session()
        self.end_count = 0
        self.train = True
        self.obs = obs = tf.placeholder(
            dtype, shape=[
                None, 2 * env.observation_space.shape[0] + env.action_space.n], name="obs")
        self.prev_obs = np.zeros((1, env.observation_space.shape[0]))
        self.prev_action = np.zeros((1, env.action_space.n))
        self.action = action = tf.placeholder(tf.int64, shape=[None], name="action")
        self.advant = advant = tf.placeholder(dtype, shape=[None], name="advant")
        self.oldaction_dist = oldaction_dist = tf.placeholder(dtype, shape=[None, env.action_space.n], name="oldaction_dist")

        # Create neural network.
        action_dist_n, _ = (pt.wrap(self.obs).
                            fully_connected(64, activation_fn=tf.nn.tanh).
                            softmax_classifier(env.action_space.n))
        eps = 1e-6
        self.action_dist_n = action_dist_n
        N = tf.shape(obs)[0]
        p_n = slice_2d(action_dist_n, tf.range(0, N), action)
        oldp_n = slice_2d(oldaction_dist, tf.range(0, N), action)
        ratio_n = p_n / oldp_n
        Nf = tf.cast(N, dtype)
        surr = -tf.reduce_mean(ratio_n * advant)  # Surrogate loss
        var_list = tf.trainable_variables()
        kl = tf.reduce_sum(oldaction_dist * tf.log((oldaction_dist + eps) / (action_dist_n + eps))) / Nf
        ent = tf.reduce_sum(-action_dist_n * tf.log(action_dist_n + eps)) / Nf

        self.losses = [surr, kl, ent]
        self.pg = flatgrad(surr, var_list)
        # KL divergence where first arg is fixed
        # replace old->tf.stop_gradient from previous kl
        kl_firstfixed = tf.reduce_sum(tf.stop_gradient(
            action_dist_n) * tf.log(tf.stop_gradient(action_dist_n + eps) / (action_dist_n + eps))) / Nf
        grads = tf.gradients(kl_firstfixed, var_list)
        self.flat_tangent = tf.placeholder(dtype, shape=[None])
        shapes = map(var_shape, var_list)
        start = 0
        tangents = []
        for shape in shapes:
            size = np.prod(shape)
            param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
            tangents.append(param)
            start += size
        gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
        self.fvp = flatgrad(gvp, var_list)
        self.gf = GetFlat(self.session, var_list)
        self.sff = SetFromFlat(self.session, var_list)
        self.vf = VF(self.session)
        self.session.run(tf.initialize_all_variables())
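kl_firstfixed freezes the first distribution with stop_gradient, so differentiating it twice (via the gvp inner product with the tangents) yields the Fisher-vector product self.fvp used for the natural-gradient step.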
Example #25
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5,
                use_scale=True, use_bias=True):
    """
    Batch Renormalization layer, as described in the paper:
    `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models
    <https://arxiv.org/abs/1702.03275>`_.

    Args:
        x (tf.Tensor): a NHWC or NC tensor.
        rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections.
        decay (float): decay rate of moving average.
        epsilon (float): epsilon to avoid divide-by-zero.
        use_scale, use_bias (bool): whether to use the extra affine transformation or not.

    Returns:
        tf.Tensor: a tensor named ``output`` with the same shape of x.

    Variable Names:

    * ``beta``: the bias term.
    * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``.
    * ``mean/EMA``: the moving average of mean.
    * ``variance/EMA``: the moving average of variance.
    """

    shape = x.get_shape().as_list()
    assert len(shape) in [2, 4]
    n_out = shape[-1]
    if len(shape) == 2:
        x = tf.reshape(x, [-1, 1, 1, n_out])
    beta, gamma, moving_mean, moving_var = get_bn_variables(
        n_out, use_scale, use_bias, tf.constant_initializer(1.0))

    ctx = get_current_tower_context()
    use_local_stat = ctx.is_training
    # for BatchRenorm, use_local_stat should always be is_training, unless a
    # different usage comes out in the future.

    if use_local_stat:
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta,
                                                           epsilon=epsilon, is_training=True)
        inv_sigma = tf.rsqrt(moving_var, 'inv_sigma')
        r = tf.stop_gradient(tf.clip_by_value(
            tf.sqrt(batch_var) * inv_sigma, 1.0 / rmax, rmax))
        d = tf.stop_gradient(tf.clip_by_value(
            (batch_mean - moving_mean) * inv_sigma,
            -dmax, dmax))
        xn = xn * r + d
    else:
        xn = tf.nn.batch_normalization(
            x, moving_mean, moving_var, beta, gamma, epsilon)

    if len(shape) == 2:
        xn = tf.squeeze(xn, [1, 2])
    if ctx.is_main_training_tower:
        return update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay)
    else:
        return tf.identity(xn, name='output')
Example #26
def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels):
    """
    Sample some ROIs from all proposals for training.
    #fg is guaranteed to be > 0, because ground truth boxes are added as RoIs.

    Args:
        boxes: nx4 region proposals, floatbox
        gt_boxes: mx4, floatbox
        gt_labels: m, int32

    Returns:
        sampled_boxes: tx4 floatbox, the rois
        sampled_labels: t labels, in [0, #class-1]. Positive means foreground.
        fg_inds_wrt_gt: #fg indices, each in range [0, m-1].
            It contains the matching GT of each foreground roi.
    """
    iou = pairwise_iou(boxes, gt_boxes)     # nxm
    proposal_metrics(iou)

    # add ground truth as proposals as well
    boxes = tf.concat([boxes, gt_boxes], axis=0)    # (n+m) x 4
    iou = tf.concat([iou, tf.eye(tf.shape(gt_boxes)[0])], axis=0)   # (n+m) x m
    # #proposal=n+m from now on

    def sample_fg_bg(iou):
        fg_mask = tf.reduce_max(iou, axis=1) >= cfg.FRCNN.FG_THRESH

        fg_inds = tf.reshape(tf.where(fg_mask), [-1])
        num_fg = tf.minimum(int(
            cfg.FRCNN.BATCH_PER_IM * cfg.FRCNN.FG_RATIO),
            tf.size(fg_inds), name='num_fg')
        fg_inds = tf.random_shuffle(fg_inds)[:num_fg]

        bg_inds = tf.reshape(tf.where(tf.logical_not(fg_mask)), [-1])
        num_bg = tf.minimum(
            cfg.FRCNN.BATCH_PER_IM - num_fg,
            tf.size(bg_inds), name='num_bg')
        bg_inds = tf.random_shuffle(bg_inds)[:num_bg]

        add_moving_summary(num_fg, num_bg)
        return fg_inds, bg_inds

    fg_inds, bg_inds = sample_fg_bg(iou)
    # fg,bg indices w.r.t proposals

    best_iou_ind = tf.argmax(iou, axis=1)   # #proposal, each in 0~m-1
    fg_inds_wrt_gt = tf.gather(best_iou_ind, fg_inds)   # num_fg

    all_indices = tf.concat([fg_inds, bg_inds], axis=0)   # indices w.r.t all n+m proposal boxes
    ret_boxes = tf.gather(boxes, all_indices)

    ret_labels = tf.concat(
        [tf.gather(gt_labels, fg_inds_wrt_gt),
         tf.zeros_like(bg_inds, dtype=tf.int64)], axis=0)
    # stop the gradient -- they are meant to be training targets
    return tf.stop_gradient(ret_boxes, name='sampled_proposal_boxes'), \
        tf.stop_gradient(ret_labels, name='sampled_labels'), \
        tf.stop_gradient(fg_inds_wrt_gt)
Example #27
  def __call__(self, batch_size, **kwargs):
    """Sample a batch of context.

    Args:
      batch_size: Batch size.
    Returns:
      Two [batch_size, num_context_dims] tensors.
    """
    spec = self._context_spec
    context_range = self._context_range
    if isinstance(context_range[0], (int, float)):
      contexts = tf.random_uniform(
          shape=[
              batch_size,
          ] + spec.shape.as_list(),
          minval=context_range[0],
          maxval=context_range[1],
          dtype=spec.dtype)
    elif isinstance(context_range[0], (list, tuple, np.ndarray)):
      assert len(spec.shape.as_list()) == 1
      assert spec.shape.as_list()[0] == len(context_range[0])
      assert spec.shape.as_list()[0] == len(context_range[1])
      contexts = tf.concat(
          [
              tf.random_uniform(
                  shape=[
                      batch_size, 1,
                  ] + spec.shape.as_list()[1:],
                  minval=context_range[0][i],
                  maxval=context_range[1][i],
                  dtype=spec.dtype) for i in range(spec.shape.as_list()[0])
          ],
          axis=1)
    else: raise NotImplementedError(context_range)
    self._validate_contexts(contexts)
    if 'sampler_fn' in kwargs:
      other_contexts = kwargs['sampler_fn']()
    else:
      other_contexts = contexts
    state, next_state = kwargs['state'], kwargs['next_state']
    if state is not None and next_state is not None:
      my_context_range = (np.array(context_range[1]) - np.array(context_range[0])) / 2 * np.ones(spec.shape.as_list())
      contexts = tf.concat(
          [0.1 * my_context_range[:self._k] *
           tf.random_normal(tf.shape(state[:, :self._k]), dtype=state.dtype) +
           tf.random_shuffle(state[:, :self._k]) - state[:, :self._k],
           other_contexts[:, self._k:]], 1)
      #contexts = tf.Print(contexts,
      #                    [contexts, tf.reduce_max(contexts, 0),
      #                     tf.reduce_min(state, 0), tf.reduce_max(state, 0)], 'contexts', summarize=15)
      next_contexts = tf.concat( #LALA
          [state[:, :self._k] + contexts[:, :self._k] - next_state[:, :self._k],
           other_contexts[:, self._k:]], 1)
      next_contexts = contexts  #LALA cosine
    else:
      next_contexts = contexts
    return tf.stop_gradient(contexts), tf.stop_gradient(next_contexts)
Example #28
def generate_fpn_proposals(
    multilevel_anchors, multilevel_label_logits,
        multilevel_box_logits, image_shape2d):
    """
    Args:
        multilevel_anchors: #lvl RPNAnchors
        multilevel_label_logits: #lvl tensors of shape HxWxA
        multilevel_box_logits: #lvl tensors of shape HxWxAx4

    Returns:
        boxes: kx4 float
        scores: k logits
    """
    num_lvl = len(cfg.FPN.ANCHOR_STRIDES)
    assert len(multilevel_anchors) == num_lvl
    assert len(multilevel_label_logits) == num_lvl
    assert len(multilevel_box_logits) == num_lvl

    ctx = get_current_tower_context()
    all_boxes = []
    all_scores = []
    if cfg.FPN.PROPOSAL_MODE == 'Level':
        fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK
        for lvl in range(num_lvl):
            with tf.name_scope('Lvl{}'.format(lvl + 2)):
                anchors = multilevel_anchors[lvl]
                pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl])

                proposal_boxes, proposal_scores = generate_rpn_proposals(
                    tf.reshape(pred_boxes_decoded, [-1, 4]),
                    tf.reshape(multilevel_label_logits[lvl], [-1]),
                    image_shape2d, fpn_nms_topk)
                all_boxes.append(proposal_boxes)
                all_scores.append(proposal_scores)

        proposal_boxes = tf.concat(all_boxes, axis=0)  # nx4
        proposal_scores = tf.concat(all_scores, axis=0)  # n
        proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk)
        proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False)
        proposal_boxes = tf.gather(proposal_boxes, topk_indices)
    else:
        for lvl in range(num_lvl):
            with tf.name_scope('Lvl{}'.format(lvl + 2)):
                anchors = multilevel_anchors[lvl]
                pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl])
                all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4]))
                all_scores.append(tf.reshape(multilevel_label_logits[lvl], [-1]))
        all_boxes = tf.concat(all_boxes, axis=0)
        all_scores = tf.concat(all_scores, axis=0)
        proposal_boxes, proposal_scores = generate_rpn_proposals(
            all_boxes, all_scores, image_shape2d,
            cfg.RPN.TRAIN_PRE_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PRE_NMS_TOPK,
            cfg.RPN.TRAIN_POST_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_POST_NMS_TOPK)

    tf.sigmoid(proposal_scores, name='probs')  # for visualization
    return tf.stop_gradient(proposal_boxes, name='boxes'), \
        tf.stop_gradient(proposal_scores, name='scores')
Example #29
def batch_norm_log_diff(input_,
                        dim,
                        name,
                        train=True,
                        epsilon=1e-8,
                        decay=.1,
                        axes=[0],
                        reuse=None,
                        bn_lag=DEFAULT_BN_LAG):
    """Batch normalization with corresponding log determinant Jacobian."""
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(
                        decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
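        # the zero-weighted sum below forces the moving-average updates to execute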
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
Example #30
def compute_loss():
    labelsf = tf.cast(labels, logits.dtype)
    signs = 2. * labelsf - 1.
    errors = 1. - logits * tf.stop_gradient(signs)
    errors_sorted, perm = tf.nn.top_k(errors, k=tf.shape(errors)[0], name="descending_sort")
    gt_sorted = tf.gather(labelsf, perm)
    grad = lovasz_grad(gt_sorted)
    loss = tf.tensordot(tf.nn.relu(errors_sorted), tf.stop_gradient(grad), 1, name="loss_non_void")
    return loss
Example #31
    def _build_net(self):
        # ------------------ inputs ---------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features],
                                name='s')  # input State
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features],
                                 name='s_')  # input Next State
        self.r = tf.placeholder(tf.float32, [
            None,
        ], name='r')  # input Reward
        self.a = tf.placeholder(tf.int32, [
            None,
        ], name='a')  # input Action

        w_initializer, b_initializer = tf.random_normal_initializer(
            0., 0.3), tf.constant_initializer(0.1)

        # ------------------ evaluation_net --------------
        with tf.variable_scope('eval_net'):
            e1 = tf.layers.dense(self.s,
                                 FIRSTLAYER_SIZE,
                                 tf.nn.relu,
                                 kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer,
                                 name='e1')
            self.q_eval = tf.layers.dense(e1,
                                          self.n_actions,
                                          kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer,
                                          name='q')

        # ------------------ target_net ------------------
        with tf.variable_scope('target_net'):
            t1 = tf.layers.dense(self.s_,
                                 FIRSTLAYER_SIZE,
                                 tf.nn.relu,
                                 kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer,
                                 name='t1')
            self.q_next = tf.layers.dense(t1,
                                          self.n_actions,
                                          kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer,
                                          name='t2')

        with tf.variable_scope('q_target'):
            q_target = self.r + self.gamma * tf.reduce_max(
                self.q_next, axis=1, name='Qmax_s_')  # shape=(None, )
            self.q_target = tf.stop_gradient(q_target)
        with tf.variable_scope('q_eval'):
            a_indices = tf.stack(
                [tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a],
                axis=1)
            self.q_eval_wrt_a = tf.gather_nd(
                params=self.q_eval, indices=a_indices)  # shape=(None, )
        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(
                tf.squared_difference(self.q_target,
                                      self.q_eval_wrt_a,
                                      name='TD_error'))
        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(
                self.loss)
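The stop_gradient around q_target makes this a semi-gradient TD update: the squared error adjusts only eval_net, while target_net is presumably refreshed by a separate copy operation not shown here.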

Example #32
features = tf.placeholder(tf.float32, (None, 32, 32, 3))

labels   = tf.placeholder(tf.int64, None)

resized  = tf.image.resize_images(features, (227, 227))


# Returns the second-to-last layer of the AlexNet model;
# this allows us to redo the last layer for the traffic
# signs model.


fc7 = AlexNet(resized, feature_extract=True)
fc7 = tf.stop_gradient(fc7)
shape = (fc7.get_shape().as_list()[-1], nb_classes)


# designing the new fully connected layer:

fc8W = tf.Variable(tf.truncated_normal(shape, stddev=1e-2))
fc8b = tf.Variable(tf.zeros(nb_classes))
logits = tf.matmul(fc7, fc8W) + fc8b


cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=logits, labels=labels)

loss_op  = tf.reduce_mean(cross_entropy)
opt      = tf.train.AdamOptimizer()
train_op = opt.minimize(loss_op, var_list=[fc8W, fc8b])
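Note that this snippet freezes the pretrained features twice over: tf.stop_gradient blocks gradients into fc7, and var_list restricts the optimizer to the new layer's variables; either mechanism alone would suffice.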
Example #33
    def _init_actor_update(self):
        """Create minimization operations for policy and entropy.

        Creates a `tf.optimizer.minimize` operations for updating
        policy and entropy with gradient descent, and adds them to
        `self._training_ops` attribute.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """

        actions = self._policy.actions([self._observations_ph])
        log_pis = self._policy.log_pis([self._observations_ph], actions)

        assert log_pis.shape.as_list() == [None, 1]

        log_alpha = self._log_alpha = tf.get_variable('log_alpha',
                                                      dtype=tf.float32,
                                                      initializer=0.0)
        alpha = tf.exp(log_alpha)

        if isinstance(self._target_entropy, Number):
            alpha_loss = -tf.reduce_mean(
                log_alpha * tf.stop_gradient(log_pis + self._target_entropy))

            self._alpha_optimizer = tf.train.AdamOptimizer(
                self._policy_lr, name='alpha_optimizer')
            self._alpha_train_op = self._alpha_optimizer.minimize(
                loss=alpha_loss, var_list=[log_alpha])

            self._training_ops.update(
                {'temperature_alpha': self._alpha_train_op})

        self._alpha = alpha

        if self._action_prior == 'normal':
            policy_prior = tfp.distributions.MultivariateNormalDiag(
                loc=tf.zeros(self._action_shape),
                scale_diag=tf.ones(self._action_shape))
            policy_prior_log_probs = policy_prior.log_prob(actions)
        elif self._action_prior == 'uniform':
            policy_prior_log_probs = 0.0

        Q_log_targets = tuple(
            Q([self._observations_ph, actions]) for Q in self._Qs)
        min_Q_log_target = tf.reduce_min(Q_log_targets, axis=0)

        if self._reparameterize:
            policy_kl_losses = (alpha * log_pis - min_Q_log_target -
                                policy_prior_log_probs)
        else:
            raise NotImplementedError

        assert policy_kl_losses.shape.as_list() == [None, 1]

        self._policy_losses = policy_kl_losses
        policy_loss = tf.reduce_mean(policy_kl_losses)

        self._policy_optimizer = tf.train.AdamOptimizer(
            learning_rate=self._policy_lr, name="policy_optimizer")

        policy_train_op = self._policy_optimizer.minimize(
            loss=policy_loss, var_list=self._policy.trainable_variables)

        self._training_ops.update({'policy_train_op': policy_train_op})
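
# A self-contained sketch of the temperature update above, assuming a
# 1-D action space; all names here are illustrative, not from the class.
import tensorflow as tf

demo_log_pis = tf.placeholder(tf.float32, [None, 1])  # log pi(a|s) from a policy
demo_target_entropy = -1.0                            # e.g. -|A|
demo_log_alpha = tf.get_variable('demo_log_alpha', dtype=tf.float32,
                                 initializer=0.0)
# Gradient flows only into log_alpha; the policy term is held constant.
demo_alpha_loss = -tf.reduce_mean(
    demo_log_alpha * tf.stop_gradient(demo_log_pis + demo_target_entropy))
demo_alpha_train_op = tf.train.AdamOptimizer(3e-4).minimize(
    demo_alpha_loss, var_list=[demo_log_alpha])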
def asac(env_fn,
         actor_critic=core.mlp_actor_critic,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=200,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=5e-4,
         alpha_start=0.2,
         batch_size=100,
         start_steps=10000,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1,
         loss_threshold=0.0001,
         delta=0.02,
         sample_step=2000):

    alpha = Alpha(alpha_start=alpha_start, delta=delta)
    alpha_t = alpha()

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    #x_ph, a_ph, x2_ph, r_ph, d_ph, ret_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None, None)
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim,
                                                      obs_dim, None, None)
    alpha_ph = core.scale_holder()
    # Main outputs from computation graph

    #R, R_next = return_estimate(x_ph, x2_ph, **ac_kwargs)
    with tf.variable_scope('main'):
        mu, pi, logp_pi, q1, q2, q1_pi, q2_pi, v, Q, Q_pi, R = actor_critic(
            x_ph, a_ph, **ac_kwargs)
    # Target value network
    with tf.variable_scope('target'):
        _, _, _, _, _, _, _, v_targ, _, _, R_targ = actor_critic(
            x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Count variables
    var_counts = tuple(
        core.count_vars(scope) for scope in [
            'main/pi', 'main/q1', 'main/q2', 'main/v', 'main/Q', 'main/R',
            'main'
        ])
    print(('\nNumber of parameters: \t pi: %d, \t' + \
           'q1: %d, \t q2: %d, \t v: %d, \t Q: %d, \t R: %d, \t total: %d\n')%var_counts)
    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi, q2_pi)

    # Targets for Q and V regression
    q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * v_targ)
    v_backup = tf.stop_gradient(min_q_pi - alpha_ph * logp_pi)
    Q_backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * R_targ)
    R_backup = tf.stop_gradient(Q_pi)
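    # All four backups are wrapped in stop_gradient: they are regression
    # targets, so the value losses below must not push gradients into the
    # target networks or the policy through these terms.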
    adv = Q_pi - R

    pi_loss = tf.reduce_mean(alpha_ph * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1)**2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2)**2)
    v_loss = 0.5 * tf.reduce_mean((v_backup - v)**2)
    Q_loss = 0.5 * tf.reduce_mean((Q_backup - Q)**2)
    R_loss = 0.5 * tf.reduce_mean((R_backup - R)**2)
    value_loss = q1_loss + q2_loss + v_loss + Q_loss + R_loss
    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q') + get_vars('main/v') + get_vars(
        'main/Q') + get_vars('main/R')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss,
                                                  var_list=value_params)
    """
    R_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_R_op = R_optimizer.minimize(R_loss, var_list=get_vars('R'))
    """
    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([
            tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
        ])

    # All ops to call during one training step
    step_ops = [
        pi_loss, q1_loss, q2_loss, v_loss, q1, q2, v, logp_pi, train_pi_op,
        train_value_op, target_update, R_loss, Q_loss
    ]

    # Initializing targets to match main variables
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    config = tf.ConfigProto(inter_op_parallelism_threads=30,
                            intra_op_parallelism_threads=5)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={
                              'x': x_ph,
                              'a': a_ph
                          },
                          outputs={
                              'mu': mu,
                              'pi': pi,
                              'q1': q1,
                              'q2': q2,
                              'v': v,
                              'Q': Q,
                              'R': R
                          })

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
    total_steps = steps_per_epoch * epochs

    counter = 0
    ret_epi = []
    obs_epi = []
    loss_old = 10000
    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy. 
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()
        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1
        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)
        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        if d or (ep_len == max_ep_len):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in the
            original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                    alpha_ph: alpha_t
                }
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0],
                             LossQ1=outs[1],
                             LossQ2=outs[2],
                             LossV=outs[3],
                             Q1Vals=outs[4],
                             Q2Vals=outs[5],
                             VVals=outs[6],
                             LogPi=outs[7],
                             LossR=outs[11])
                counter += 1
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            ret_est = sess.run(R, feed_dict={x_ph: [o]})[0]
            logger.store(RetEst=ret_est)
            if counter >= 1000:
                loss_new, _ = logger.get_stats('LossPi')
                counter = 0
                if (loss_old - loss_new) / np.absolute(
                        loss_old) < loss_threshold and t > start_steps:
                    rho_s = np.zeros([sample_step, obs_dim], dtype=np.float32)
                    rho_ptr = 0
                    for sample_t in range(sample_step):
                        a = get_action(o)
                        o2, r, d, _ = env.step(a)
                        ep_len += 1
                        d = False if ep_len == max_ep_len else d
                        rho_s[rho_ptr] = o
                        o = o2
                        if d or (ep_len == max_ep_len):
                            o, r, d, ep_ret, ep_len = env.reset(
                            ), 0, False, 0, 0
                    advantages = sess.run(adv, feed_dict={x_ph: rho_s})
                    alpha.update_alpha(advantages)
                    #alpha.update_alpha(rho_q-rho_v)
                    alpha_t = alpha()
                    print(alpha_t)
                    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
                    loss_old = 10000
                else:
                    loss_old = loss_new
        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EntCoeff', alpha_t)
            logger.log_tabular('RetEst', average_only=True)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('LossR', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
Beispiel #35
0
def frequency_encoder(features, kernels, biases):

    # Check for valid weights
    restore = False
    if kernels[0] is not None:
        restore = True

    # Kernel sizes and strides must be integers, hence floor division below.
    # Capture largest frequency dependent features
    conv_freq1 = tf.layers.Conv2D(
        6, (HEIGHT // 2, 2),
        strides=(HEIGHT // 4, 2),
        activation='relu',
        padding='same',
        name='conv1-',
        kernel_initializer=kernels.pop(),
        kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10),
        bias_initializer=biases.pop())(features)
    # Capture large frequency dependent features
    conv_freq2 = tf.layers.Conv2D(
        6, (HEIGHT // 4, 2),
        strides=(HEIGHT // 8, 2),
        activation='relu',
        padding='same',
        name='conv2-',
        kernel_initializer=kernels.pop(),
        kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10),
        bias_initializer=biases.pop())(features)
    # Capture small frequency dependent features
    conv_freq3 = tf.layers.Conv2D(
        6, (HEIGHT // 8, 2),
        strides=(HEIGHT // 16, 2),
        activation='relu',
        padding='same',
        name='conv3-',
        kernel_initializer=kernels.pop(),
        kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10),
        bias_initializer=biases.pop())(features)
    # Capture smallest frequency dependent features
    conv_freq4 = tf.layers.Conv2D(
        6, (HEIGHT // 21, 2),
        strides=(HEIGHT // 42, 2),
        activation='relu',
        padding='same',
        name='conv4-',
        kernel_initializer=kernels.pop(),
        kernel_regularizer=tf.contrib.layers.l2_regularizer(BETA / 10),
        bias_initializer=biases.pop())(features)

    # Pool out time scales
    pool_freq1 = tf.layers.MaxPooling2D((2, WIDTH // 8), (1, WIDTH // 16),
                                        padding='same',
                                        name='pool5-')(conv_freq1)
    pool_freq2 = tf.layers.MaxPooling2D((2, WIDTH // 8), (1, WIDTH // 16),
                                        padding='same',
                                        name='pool6-')(conv_freq2)
    pool_freq3 = tf.layers.MaxPooling2D((2, WIDTH // 8), (1, WIDTH // 16),
                                        padding='same',
                                        name='pool7-')(conv_freq3)
    pool_freq4 = tf.layers.MaxPooling2D((2, WIDTH // 8), (1, WIDTH // 16),
                                        padding='same',
                                        name='pool8-')(conv_freq4)
    '''
    # Pad smaller feature maps
    pool_freq1 = tf.pad(pool_freq1, tf.constant([[0, 0], [17, 17], [0, 0], [0, 0]]))
    pool_freq2 = tf.pad(pool_freq2, tf.constant([[0, 0], [15, 15], [0, 0], [0, 0]]))
    pool_freq3 = tf.pad(pool_freq3, tf.constant([[0, 0], [11, 11], [0, 0], [0, 0]]))
    
    # Concat into same feature map
    freq_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4], 3)
    '''

    # Upscale smaller frequency maps
    _, height, width, depth = pool_freq4.get_shape().as_list()
    pool_freq1 = tf.image.resize_nearest_neighbor(pool_freq1, [height, width])
    pool_freq2 = tf.image.resize_nearest_neighbor(pool_freq2, [height, width])
    pool_freq3 = tf.image.resize_nearest_neighbor(pool_freq3, [height, width])

    # Concat into same feature map
    freq_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4], 3)

    # Post image of feedback map BROKEN
    feedback_map = tf.concat([pool_freq1, pool_freq2, pool_freq3, pool_freq4],
                             2)
    feedback_image0 = tf.slice(feedback_map, [0, 0, 0, 0], [-1, -1, -1, 3])
    feedback_image1 = tf.slice(feedback_map, [0, 0, 0, 3], [-1, -1, -1, 3])
    feedback_image = tf.concat([feedback_image0, feedback_image1], 2)
    tf.summary.image("feedback_map", feedback_image, max_outputs=18)

    # If valid weights were loaded
    if restore:
        # Don't update the restored conv layers. tf.stop_gradient returns a
        # new tensor rather than modifying its input, so the result must be
        # used: blocking the gradient at freq_map freezes everything upstream.
        freq_map = tf.stop_gradient(freq_map)

    return freq_map
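
# Hedged usage sketch (not from the original): HEIGHT, WIDTH, BETA and the
# single-channel spectrogram input are assumptions. Passing [None] * 4 for
# kernels/biases skips the restore branch and leaves initialization to the
# layers' defaults.
spectrograms = tf.placeholder(tf.float32, [None, HEIGHT, WIDTH, 1])
freq_map = frequency_encoder(spectrograms, kernels=[None] * 4, biases=[None] * 4)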
Beispiel #36
0
 def __init__(self,  size_obs, size_act, net_struct = [100, 100, 100, 100], name='dbg'):
     self.tensorboardpath = 'tensorboards/' + name
     self.train_writer = tf.summary.FileWriter(self.tensorboardpath)
     self.ModelPath = 'Models/Imitation' + name
     
     self.mse_train = []
     self.mse_val = []
     self.last_epoch = 0
     size_inpt = 200
     self.obs = tf.placeholder(tf.float32, shape=(None, size_obs))
     self.ret = tf.placeholder(tf.float32, shape=(None))
     act_trn = self.obs
     act_tst = self.obs
     prev_layer_size = size_obs
     #Hidden layers
     self.l2_reg = 1e-8
     self.Q_lr = tf.placeholder(tf.float32, shape=(None))
     self.lr = tf.placeholder(tf.float32, shape=(None))
     if 1:
         for idx, l in enumerate(net_struct):
             act_trn, act_tst = ops.cascade_bn_relu_trn_tst(
                     act_trn, prev_layer_size, l, name='layer' + str(idx), input_tst = act_tst)
             prev_layer_size += l
             
         w = tf.Variable(tf.random_uniform([prev_layer_size, size_act],minval = -1., maxval = 1.), name='net_output_w') * 1e-3
         b = tf.Variable(tf.random_uniform([size_act],minval = -1., maxval = 1.), name='net_output_bias') * 1e-3
     else:
         for idx, l in enumerate(net_struct):
             act_trn = ops.linear(act_trn, l, 'layer' + str(idx))
         w = tf.Variable(tf.random_uniform([l, size_act],minval = -1., maxval = 1.), name='net_output_w') * 1e-2
         b = tf.Variable(tf.random_uniform([size_act],minval = -1., maxval = 1.), name='net_output_bias') * 1e-2
     self.yhat = tf.reshape(tf.matmul(act_trn, w) + b, [-1, size_act])
     self.yhat_tst = tf.reshape(tf.matmul(act_tst, w) + b, [-1, size_act])
     
     self.obs_act = tf.concat((self.obs, self.yhat),1)
     self.Q = Q(size_obs + size_act, tf.stop_gradient(self.obs_act))
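     # stop_gradient on the state-action pair means Q can be trained on the
     # imitator's outputs without Q's losses back-propagating into the
     # imitator network through this input.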
             
     self.act = tf.placeholder(tf.float32, shape=(None))
     
     self.l2_loss = tf.reduce_mean(tf.square(self.yhat - self.act))
     self.adv_loss = tf.reduce_mean(tf.square(self.yhat_tst - self.act))
     
     self.advers = tf.gradients(self.l2_loss, self.obs)
     
     t_vars = tf.trainable_variables()
     net_vars = [var for var in t_vars if 'net_' in var.name]
     self.reg_loss = tf.reduce_sum([tf.reduce_sum(tf.square(var)) for var in net_vars])*self.l2_reg
     
     
     
     optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
     gvs = optimizer.compute_gradients(self.l2_loss + self.reg_loss - self.Q.yhat * self.Q_lr + self.Q.l2_loss)
     self.grad_norm = tf.reduce_mean([tf.reduce_mean(grad) for grad, var in gvs if grad is not None])
     clip_norm = 100
     clip_single = 1
     capped_gvs = [(tf.clip_by_value(grad, -1*clip_single,clip_single), var) for grad, var in gvs if grad is not None]
     capped_gvs = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in capped_gvs if grad is not None]
     self.optimizer = optimizer.apply_gradients(capped_gvs)
     
     #self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.l2_loss)
     
     self.cur_Q_lr = 0
     
     self.session = tf.Session()
     self.session.run(tf.global_variables_initializer())
     self.Saver = tf.train.Saver()
    def train(self, x, y=None, max_entropy=True, epochs=100, batch_size=64, lr=1e-3, tau_rate = 1e-4, task = 'autoencoder'):
        taskdict = {
            'autoencoder': self.L1,
            'classification': self.CrossEnt
        }
        
        schedule = lambda i: np.float32(np.max((0.5, np.exp(-tau_rate*i))))
        
        y = (x if y is None else y)
        
        if task in taskdict:
            self.taskLoss = taskdict[task]
            self.Loss = self.taskLoss - (self.Entropy if max_entropy else tf.stop_gradient(self.Entropy))
        else:
            raise ValueError('task not supported yet')
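        # With max_entropy=False the entropy term still appears in the
        # reported loss value, but stop_gradient keeps it out of the update.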

        #Optimizer 
        solver = tf.train.AdagradOptimizer(learning_rate = lr).minimize(self.Loss, var_list=self.params)
        #Need to clip gradients as they get huge for gumbel softmax + stein gd
        #gradients, variables = zip(*solver.compute_gradients(self.Loss, var_list=self.params))
        #gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        #solver = solver.apply_gradients(zip(gradients, variables))

        #Training
        init = tf.global_variables_initializer()
        sess = tf.Session()
        self.sess = sess
        with sess.as_default():
            sess.run(init)

            losses = []
            tasklosses = []
            ents = []
            stds = []
            
            n_batches = int(x.shape[0]/float(batch_size))
            for epoch in range(epochs):
                rand_idxs = np.arange(x.shape[0]) 
                np.random.shuffle(rand_idxs)
                
                loss = 0
                task_loss = 0
                ent = 0
                std = 0
                for batch in range(n_batches):
                    tau = schedule(epoch*n_batches + batch)

                    mb_idx = rand_idxs[batch*batch_size:(batch+1)*batch_size]
                    x_mb = x[mb_idx]
                    y_mb = y[mb_idx]

                    g = np.random.gumbel(size=(len(x_mb), self.k, 2))
                    _, loss_curr, taskloss_curr, ent_curr, std_curr = sess.run([solver, self.Loss, self.taskLoss, self.Entropy, self.std], feed_dict = {self.X:x_mb, self.Y:y_mb, self.g: g, self.tau: tau})
                    
                    loss += loss_curr/n_batches
                    task_loss += taskloss_curr/n_batches
                    ent += ent_curr/n_batches
                    std += np.mean(np.abs(std_curr))/n_batches
                    
                losses.append(loss)
                tasklosses.append(task_loss)
                ents.append(ent)
                stds.append(std)

            print('Final task loss: %f' %(tasklosses[-1]))
            
            plt.figure()
            plt.plot(losses)
            plt.title('total loss')
            
            plt.figure()
            plt.plot(tasklosses)
            plt.title('task loss')
            
            plt.figure()
            plt.plot(np.array(ents) * 1.442695) #in bits
            plt.title('ent')
            
            plt.figure()
            plt.plot(stds)
            plt.title('std')
def layers(vgg_layer3_out, vgg_layer4_out, vgg_layer7_out, num_classes):
    """
    Create the layers for a fully convolutional network.  Build skip-layers using the vgg layers.
    :param vgg_layer3_out: TF Tensor for VGG Layer 3 output
    :param vgg_layer4_out: TF Tensor for VGG Layer 4 output
    :param vgg_layer7_out: TF Tensor for VGG Layer 7 output
    :param num_classes: Number of classes to classify
    :return: The Tensor for the last layer of output
    """
    # Freeze the pre-trained VGG layers: tf.stop_gradient returns a new
    # tensor, so the frozen tensors must be the ones actually used below.
    vgg_layer3_out = tf.stop_gradient(vgg_layer3_out)
    vgg_layer4_out = tf.stop_gradient(vgg_layer4_out)
    vgg_layer7_out = tf.stop_gradient(vgg_layer7_out)

    # 1x1 convolution layer for feature extraction
    tf_conv1x1 = tf.layers.conv2d(
        vgg_layer7_out,
        num_classes,
        1,
        1,
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
    # conv2d-transpose for 2x upsampling
    tf_2x = tf.layers.conv2d_transpose(
        tf_conv1x1,
        num_classes,
        4,
        2,
        padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
    # combine with pooling layer 4
    tf_skip1 = tf.add(
        tf_2x,
        tf.layers.conv2d(
            vgg_layer4_out,
            num_classes,
            1,
            1,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)))
    # perform conv2d-transpose for 2x upsampling again.
    # output: 4x features + 2x pool4
    tf_4x = tf.layers.conv2d_transpose(
        tf_skip1,
        num_classes,
        4,
        2,
        padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))
    # combine with pooling layer 3
    tf_skip2 = tf.add(
        tf_4x,
        tf.layers.conv2d(
            vgg_layer3_out,
            num_classes,
            1,
            1,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)))
    # perform conv2d-transpose for 2x upsampling again.
    # output: 8x features + 4x pool4 + 2x pool3
    tf_final = tf.layers.conv2d_transpose(
        tf_skip2,
        num_classes,
        16,
        8,
        padding='SAME',
        kernel_regularizer=tf.contrib.layers.l2_regularizer(1e-3),
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    return tf_final
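
# Hedged usage sketch (not from the original): a per-pixel classification
# loss over the upsampled output; correct_label and learning_rate are assumed.
logits = tf.reshape(tf_final, (-1, num_classes))
flat_labels = tf.reshape(correct_label, (-1, num_classes))
cross_entropy_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=flat_labels,
                                               logits=logits))
train_op = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy_loss)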
Beispiel #39
0
    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.full_buffers = [FullBuffer(self.full_size, self.env.observation_space.shape[0], self.env.action_space.shape[0])
                                            for _ in range(len(self.env.unwrapped.tasks))]
                self.env.task_idx = 0
                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess, self.observation_space, self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy = self.policy(self.sess, self.observation_space, self.action_space,
                                                     **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy.obs_ph
                    self.processed_next_obs_ph = self.target_policy.processed_obs
                    self.action_target = self.target_policy.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32, shape=(None, 1), name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32, shape=(None,) + self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    # first return value corresponds to deterministic actions
                    # policy_out corresponds to stochastic actions, used for training
                    # logp_pi is the log probability of actions taken by the policy
                    self.deterministic_action, policy_out, logp_pi = self.policy_tf.make_actor(self.processed_obs_ph)
                    # Monitor the entropy of the policy,
                    # this is not used for training
                    self.entropy = tf.reduce_mean(self.policy_tf.entropy)
                    #  Use two Q-functions to improve performance by reducing overestimation bias.
                    qf1, qf2, value_fn = self.policy_tf.make_critics(self.processed_obs_ph, self.actions_ph,
                                                                     create_qf=True, create_vf=True)
                    qf1_pi, qf2_pi, _ = self.policy_tf.make_critics(self.processed_obs_ph,
                                                                    policy_out, create_qf=True, create_vf=False,
                                                                    reuse=True)

                    # Target entropy is used when learning the entropy coefficient
                    if self.target_entropy == 'auto':
                        # automatically set target entropy if needed
                        self.target_entropy = -np.prod(self.env.action_space.shape).astype(np.float32)
                    else:
                        # Force conversion
                        # this will also throw an error for unexpected string
                        self.target_entropy = float(self.target_entropy)

                    # The entropy coefficient or entropy can be learned automatically
                    # see Automating Entropy Adjustment for Maximum Entropy RL section
                    # of https://arxiv.org/abs/1812.05905
                    if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
                        # Default initial value of ent_coef when learned
                        init_value = 1.0
                        if '_' in self.ent_coef:
                            init_value = float(self.ent_coef.split('_')[1])
                            assert init_value > 0., "The initial value of ent_coef must be greater than 0"

                        self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                            initializer=np.log(init_value).astype(np.float32))
                        self.ent_coef = tf.exp(self.log_ent_coef)
                    else:
                        # Force conversion to float
                        # this will throw an error if a malformed string (different from 'auto')
                        # is passed
                        self.ent_coef = float(self.ent_coef)

                with tf.variable_scope("target", reuse=False):
                    # Create the value network
                    _, _, value_target = self.target_policy.make_critics(self.processed_next_obs_ph,
                                                                         create_qf=False, create_vf=True)
                    self.value_target = value_target

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two Q-Values (Double-Q Learning)
                    min_qf_pi = tf.minimum(qf1_pi, qf2_pi)

                    # Target for Q value regression
                    q_backup = tf.stop_gradient(
                        self.rewards_ph +
                        (1 - self.terminals_ph) * self.gamma * self.value_target
                    )

                    # Compute Q-Function loss
                    # TODO: test with huber loss (it would avoid too high values)
                    qf1_loss = 0.5 * tf.reduce_mean((q_backup - qf1) ** 2)
                    qf2_loss = 0.5 * tf.reduce_mean((q_backup - qf2) ** 2)

                    # Compute the entropy temperature loss
                    # it is used when the entropy coefficient is learned
                    ent_coef_loss, entropy_optimizer = None, None
                    if not isinstance(self.ent_coef, float):
                        ent_coef_loss = -tf.reduce_mean(
                            self.log_ent_coef * tf.stop_gradient(logp_pi + self.target_entropy))
                        entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)

                    # Compute the policy loss
                    # Alternative: policy_kl_loss = tf.reduce_mean(logp_pi - min_qf_pi)
                    policy_kl_loss = tf.reduce_mean(self.ent_coef * logp_pi - qf1_pi)

                    # NOTE: in the original implementation, they have an additional
                    # regularization loss for the Gaussian parameters
                    # this is not used for now
                    # policy_loss = (policy_kl_loss + policy_regularization_loss)
                    policy_loss = policy_kl_loss


                    # Target for value fn regression
                    # We update the vf towards the min of two Q-functions in order to
                    # reduce overestimation bias from function approximation error.
                    v_backup = tf.stop_gradient(min_qf_pi - self.ent_coef * logp_pi)
                    value_loss = 0.5 * tf.reduce_mean((value_fn - v_backup) ** 2)

                    values_losses = qf1_loss + qf2_loss + value_loss

                    # Policy train op
                    # (has to be separate from value train op, because min_qf_pi appears in policy_loss)
                    policy_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(policy_loss, var_list=get_vars('model/pi'))

                    # Value train op
                    value_optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_ph)
                    values_params = get_vars('model/values_fn')

                    source_params = get_vars("model/values_fn/vf")
                    target_params = get_vars("target/values_fn/vf")

                    # Polyak averaging for target variables
                    self.target_update_op = [
                        tf.assign(target, (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]
                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Control flow is used because sess.run otherwise evaluates in nondeterministic order
                    # and we first need to compute the policy action before computing q values losses
                    with tf.control_dependencies([policy_train_op]):
                        train_values_op = value_optimizer.minimize(values_losses, var_list=values_params)

                        self.infos_names = ['policy_loss', 'qf1_loss', 'qf2_loss', 'value_loss', 'entropy']
                        # All ops to call during one training step
                        self.step_ops = [policy_loss, qf1_loss, qf2_loss,
                                         value_loss, qf1, qf2, value_fn, logp_pi,
                                         self.entropy, policy_train_op, train_values_op]

                        # Add entropy coefficient optimization operation if needed
                        if ent_coef_loss is not None:
                            with tf.control_dependencies([train_values_op]):
                                ent_coef_op = entropy_optimizer.minimize(ent_coef_loss, var_list=self.log_ent_coef)
                                self.infos_names += ['ent_coef_loss', 'ent_coef']
                                self.step_ops += [ent_coef_op, ent_coef_loss, self.ent_coef]

                    # Monitor losses and entropy in tensorboard
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('value_loss', value_loss)
                    tf.summary.scalar('entropy', self.entropy)
                    if ent_coef_loss is not None:
                        tf.summary.scalar('ent_coef_loss', ent_coef_loss)
                        tf.summary.scalar('ent_coef', self.ent_coef)

                    tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/values_fn/vf")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()
Beispiel #40
0
    def build_loss(self, predictions, examples, **kwargs):
        """Build tf graph to compute loss.

        Args:
          predictions: dict of prediction results keyed by name.
          examples: dict of inputs keyed by name.

        Returns:
          loss_dict: dict of loss tensors keyed by name.
        """
        options = self._model_proto

        loss_dict = {}

        with tf.name_scope('losses'):

            # Loss of the MIDN module.

            labels = self._label_extractor.extract_labels(examples)
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=labels,
                logits=predictions[Cap2DetPredictions.midn_class_logits])
            loss_dict['midn_cross_entropy_loss'] = tf.multiply(
                tf.reduce_mean(losses), options.midn_loss_weight)

            # Losses of the OICR module.

            (num_proposals,
             proposals) = (predictions[DetectionResultFields.num_proposals],
                           predictions[DetectionResultFields.proposal_boxes])
            batch, max_num_proposals, _ = utils.get_tensor_shape(proposals)

            proposal_scores_0 = predictions[
                Cap2DetPredictions.oicr_proposal_scores + '_at_0']
            if options.oicr_use_proba_r_given_c:
                proposal_scores_0 = predictions[
                    Cap2DetPredictions.midn_proba_r_given_c]
            proposal_scores_0 = tf.concat([
                tf.fill([batch, max_num_proposals, 1], 0.0), proposal_scores_0
            ],
                                          axis=-1)

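            # Each OICR iteration is supervised by the gradient-blocked scores
            # of the previous iteration: supervision is passed forward, but no
            # gradient flows back into earlier iterations.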
            for i in range(options.oicr_iterations):
                proposal_scores_1 = predictions[
                    Cap2DetPredictions.oicr_proposal_scores +
                    '_at_{}'.format(i + 1)]
                oicr_cross_entropy_loss_at_i = model_utils.calc_oicr_loss(
                    labels,
                    num_proposals,
                    proposals,
                    tf.stop_gradient(proposal_scores_0),
                    proposal_scores_1,
                    scope='oicr_{}'.format(i + 1),
                    iou_threshold=options.oicr_iou_threshold)
                loss_dict['oicr_cross_entropy_loss_at_{}'.format(
                    i + 1)] = tf.multiply(oicr_cross_entropy_loss_at_i,
                                          options.oicr_loss_weight)

                proposal_scores_0 = tf.nn.softmax(proposal_scores_1, axis=-1)

        return loss_dict
Beispiel #41
0
    def forward(self,
                Ts,
                images,
                depths,
                intrinsics,
                inds=None,
                num_fixed=0,
                init=tf.constant(False)):
        # motion network performs projection operations in features space
        cfg = self.cfg
        batch = tf.shape(images)[0]
        num = tf.shape(images)[1]

        if cfg.RESCALE_IMAGES:
            images = 2 * (images / 255.0) - 1.0

        if inds is None:
            if self.mode == 'keyframe':
                self.inds = self._keyframe_pairs_indicies(num)
                num_fixed = 1
            elif self.mode == 'global':
                self.inds = self._all_pairs_indicies(num)
        else:
            self.inds = inds

        (ii, jj) = self.inds
        intrinsics = intrinsics_vec_to_matrix(intrinsics)

        # if self.is_training and (not self.is_calibrated):
        #     perturbation = 0.1 * tf.random.normal([batch, 1])
        #     intrinsics = update_intrinsics(intrinsics, perturbation)

        depths_low, intrinsics = rescale_depths_and_intrinsics(depths,
                                                               intrinsics,
                                                               downscale=4)

        with tf.variable_scope("motion", reuse=self.reuse) as sc:
            if Ts is None:
                Ts = self.pose_regressor_init(images)
            else:
                if self.use_regressor:
                    Gs = self.pose_regressor_init(images)
                    Ts = cond_transform(init, Gs, Ts)

            feats = self.extract_features(images)
            depths = tf.gather(depths_low, ii, axis=1) + EPS

            feats1 = tf.gather(feats, ii, axis=1)
            feats2 = tf.gather(feats, jj, axis=1)

            Ti = Ts.gather(ii)
            Tj = Ts.gather(jj)
            Tij = Tj * Ti.inv()

            for i in range(cfg.FLOWSE3.ITER_COUNT):
                Tij = Tij.copy(stop_gradients=True)
                Ts = Ts.copy(stop_gradients=True)
                intrinsics = tf.stop_gradient(intrinsics)
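                # Restarting each iteration from gradient-blocked poses and
                # intrinsics truncates back-propagation: errors at iteration i
                # cannot push gradients into iteration i-1.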

                coords, vmask = Tij.transform(depths,
                                              intrinsics,
                                              valid_mask=True)
                featsw = vmask * bilinear_sampler(feats2, coords, batch_dims=2)

                with tf.name_scope("residual"):
                    flow, weight = self.flownet(feats1, featsw, reuse=i > 0)
                    self.weights_history.append(weight)

                    target = flow + coords
                    weight = vmask * tf.nn.sigmoid(weight)

                with tf.name_scope("PnP"):
                    if (self.mode == 'keyframe') and self.is_calibrated:
                        Tij = Tij.keyframe_optim(target, weight, depths,
                                                 intrinsics)
                        Ts = Tij.append_identity(
                        )  # set keyframe pose to identity

                    else:
                        Ts, intrinsics = Ts.global_optim(
                            target,
                            weight,
                            depths,
                            intrinsics, (jj, ii),
                            num_fixed=num_fixed,
                            include_intrinsics=(not self.is_calibrated))
                        Tij = Ts.gather(jj) * Ts.gather(
                            ii).inv()  # relative poses

                    coords, vmask1 = Tij.transform(depths,
                                                   intrinsics,
                                                   valid_mask=True)
                    self.transform_history.append(Ts)
                    self.residual_history.append(vmask * vmask1 *
                                                 (coords - target))

                self.intrinsics_history.append(
                    intrinsics_matrix_to_vec(intrinsics))

        intrinsics = 4.0 * intrinsics_matrix_to_vec(intrinsics)
        return Ts, intrinsics
        def train_step():
            experience, _ = next(iterator)

            prior = predictor_net(
                (experience.observation[:, 0], experience.action[:, 0]),
                training=False)
            z_next = encoder_net(experience.observation[:, 1], training=False)
            # predictor_kl is a vector of size batch_size.
            predictor_kl = tfp.distributions.kl_divergence(z_next, prior)

            with tf.GradientTape() as tape:
                tape.watch(actor_net._log_kl_coefficient)  # pylint: disable=protected-access
                dual_loss = -1.0 * actor_net._log_kl_coefficient * (  # pylint: disable=protected-access
                    tf.stop_gradient(tf.reduce_mean(predictor_kl)) -
                    kl_constraint)
            dual_grads = tape.gradient(dual_loss,
                                       [actor_net._log_kl_coefficient])  # pylint: disable=protected-access
            grads_and_vars = list(
                zip(dual_grads, [actor_net._log_kl_coefficient]))  # pylint: disable=protected-access
            dual_optimizer.apply_gradients(grads_and_vars)

            # Clip the dual variable so exp(log_kl_coef) <= 1e6.
            log_kl_coef = tf.clip_by_value(
                actor_net._log_kl_coefficient,  # pylint: disable=protected-access
                -1.0 * np.log(1e6),
                np.log(1e6))
            actor_net._log_kl_coefficient.assign(log_kl_coef)  # pylint: disable=protected-access

            with tf.name_scope('dual_loss'):
                tf.compat.v2.summary.scalar(name='dual_loss',
                                            data=tf.reduce_mean(dual_loss),
                                            step=global_step)
                tf.compat.v2.summary.scalar(
                    name='log_kl_coefficient',
                    data=actor_net._log_kl_coefficient,  # pylint: disable=protected-access
                    step=global_step)

            z_entropy = z_next.entropy()
            log_prob = prior.log_prob(z_next.sample())
            with tf.name_scope('rp-metrics'):
                common.generate_tensor_summaries('predictor_kl', predictor_kl,
                                                 global_step)
                common.generate_tensor_summaries('z_entropy', z_entropy,
                                                 global_step)
                common.generate_tensor_summaries('log_prob', log_prob,
                                                 global_step)
                common.generate_tensor_summaries('z_mean', z_next.mean(),
                                                 global_step)
                common.generate_tensor_summaries('z_stddev', z_next.stddev(),
                                                 global_step)
                common.generate_tensor_summaries('prior_mean', prior.mean(),
                                                 global_step)
                common.generate_tensor_summaries('prior_stddev',
                                                 prior.stddev(), global_step)

            if log_prob_reward_scale == 'auto':
                coef = tf.stop_gradient(tf.exp(actor_net._log_kl_coefficient))  # pylint: disable=protected-access
            else:
                coef = log_prob_reward_scale
            tf.debugging.check_numerics(tf.reduce_mean(predictor_kl),
                                        'predictor_kl is inf or nan.')
            tf.debugging.check_numerics(coef, 'coef is inf or nan.')
            new_reward = experience.reward - coef * predictor_kl[:, None]

            experience = experience._replace(reward=new_reward)
            return tf_agent.train(experience)
Beispiel #43
0
 def loss(self, predictions, policy, cfv):
     # Regression target: instantaneous regrets, clipped from below and
     # gradient-blocked so they enter the squared loss as constants.
     pi = cpea.rm_policy(predictions)
     inst_r = cfv - cpea.utility(pi, cfv)
     inst_q = tf.stop_gradient(tf.maximum(inst_r, -tf.nn.relu(predictions)))
     return tf.reduce_mean(
         tf.reduce_sum(tf.square(predictions - inst_q), axis=1)) / 2.0
Beispiel #44
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer_f,
                grad_norm_clipping=None,
                gamma=1.0,
                scope="setdeepq",
                reuse=None,
                test_eps=0.05,
                lr_init=0.001,
                lr_period_steps=250000,
                tau=0.05):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer_f: callable
        constructor that takes a learning rate and returns the
        tf.train.Optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    test_eps: float
        epsilon used by the greedy evaluation policy.
    lr_init : float
        initial learning rate
    lr_period_steps : int
        learning rate schedule following a cosine with this period
    tau : float
        parameter for the soft target network update. tau <= 1.0 and 1.0 for
        the hard update.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from the optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    # Build action graphs
    act_f = build_act(make_obs_ph,
                      q_func,
                      num_actions,
                      scope=scope,
                      reuse=reuse)

    act_greedy = build_act_greedy(make_obs_ph,
                                  q_func,
                                  num_actions,
                                  scope=scope,
                                  reuse=True,
                                  eps=test_eps)

    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.compat.v1.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.compat.v1.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.compat.v1.placeholder(tf.float32, [None],
                                                name="done")
        importance_weights_ph = tf.compat.v1.placeholder(tf.float32, [None],
                                                         name="weight")
        iteration = tf.compat.v1.placeholder(tf.float32, name="iteration")

        # Cosine learning rate adjustment: the clipped cosine schedule below
        # supersedes the lr variable's initial value (lr_init).
        lr = tf.Variable(float(lr_init),
                         trainable=False,
                         dtype=tf.float32,
                         name='lr')
        lr = tf.clip_by_value(
            0.0005 * tf.math.cos(math.pi * iteration / lr_period_steps) +
            0.000501, 1e-6, 1e-3)
        optimizer = optimizer_f(learning_rate=lr)

        # q network evaluation
        q1_t = q_func.forward(obs_t_input.get(),
                              num_actions,
                              scope="q1_func",
                              reuse=True)  # reuse q1 parameters from act
        q1_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/q1_func")
        q2_t = q_func.forward(obs_t_input.get(),
                              num_actions,
                              scope="q2_func",
                              reuse=True)  # reuse q2 parameters from act
        q2_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/q2_func")

        # target q network evalution
        q1_tp1 = q_func.forward(obs_tp1_input.get(),
                                num_actions,
                                scope="target_q1_func",
                                reuse=False)
        target_q1_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/target_q1_func")
        q2_tp1 = q_func.forward(obs_tp1_input.get(),
                                num_actions,
                                scope="target_q2_func",
                                reuse=False)
        target_q2_func_vars = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES,
            scope=tf.compat.v1.get_variable_scope().name + "/target_q2_func")

        # q scores for actions which we know were selected in the given state.
        q1_t_selected = tf.reduce_sum(input_tensor=q1_t *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)
        q2_t_selected = tf.reduce_sum(input_tensor=q2_t *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)

        # Actions selected with current q funcs at state t+1.
        q1_tp1_using_online_net = q_func.forward(obs_tp1_input.get(),
                                                 num_actions,
                                                 scope="q1_func",
                                                 reuse=True)
        q2_tp1_using_online_net = q_func.forward(obs_tp1_input.get(),
                                                 num_actions,
                                                 scope="q2_func",
                                                 reuse=True)
        tp1_best_action_using_online_net = tf.argmax(
            input=q1_tp1_using_online_net + q2_tp1_using_online_net, axis=1)
        # Using action at t+1 find target value associated with the action
        q1_tp1_selected = tf.reduce_sum(
            input_tensor=q1_tp1 *
            tf.one_hot(tp1_best_action_using_online_net, num_actions),
            axis=1)
        q2_tp1_selected = tf.reduce_sum(
            input_tensor=q2_tp1 *
            tf.one_hot(tp1_best_action_using_online_net, num_actions),
            axis=1)
        # Min of the two target q values (clipped double Q-learning), used in
        # the Bellman equation below to reduce overestimation bias
        q_tp1_best = tf.minimum(q1_tp1_selected, q2_tp1_selected)

        # compute RHS of bellman equation
        q_tp1_selected_target = rew_t_ph + gamma * q_tp1_best

        # compute the error (potentially clipped)
        td_error1 = q1_t_selected - tf.stop_gradient(q_tp1_selected_target)
        td_error2 = q2_t_selected - tf.stop_gradient(q_tp1_selected_target)
        errors1 = U.huber_loss(td_error1)
        errors2 = U.huber_loss(td_error2)
        errors = errors1 + errors2
        weighted_error = tf.reduce_mean(input_tensor=importance_weights_ph *
                                        errors)

        #Print total number of params
        total_parameters = 0
        for variable in tf.compat.v1.trainable_variables():
            # shape is an array of tf.Dimension
            shape = variable.get_shape()
            variable_parameters = 1
            for dim in shape:
                variable_parameters *= dim.value
            # print("var params", variable_parameters)
            total_parameters += variable_parameters
        print(
            "===============================================================")
        print("Total number of trainable params:", total_parameters)
        print(
            "===============================================================")

        # Log for tensorboard
        tf.summary.scalar('q1_values', tf.math.reduce_mean(q1_t))
        tf.summary.scalar('q2_values', tf.math.reduce_mean(q2_t))
        tf.summary.scalar('td_1', tf.math.reduce_mean(td_error1))
        tf.summary.scalar('td_2', tf.math.reduce_mean(td_error2))
        tf.summary.scalar('weighted_loss', weighted_error)
        tf.summary.scalar('lr_schedule', lr)
        tf.summary.scalar('td_MSE_1',
                          tf.math.reduce_mean(tf.math.square(td_error1)))
        tf.summary.scalar('td_MSE_2',
                          tf.math.reduce_mean(tf.math.square(td_error2)))

        # combine variable scopes
        q_func_vars = q1_func_vars + q2_func_vars
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called every step to copy Q network to target Q network
        # target network is updated with polyak averaging
        update_target_expr1 = []
        for var, var_target in zip(
                sorted(q1_func_vars, key=lambda v: v.name),
                sorted(target_q1_func_vars, key=lambda v: v.name)):
            update_target_expr1.append(
                var_target.assign(tau * var + (1 - tau) * var_target))
        update_target_expr1 = tf.group(*update_target_expr1)

        update_target_expr2 = []
        for var, var_target in zip(
                sorted(q2_func_vars, key=lambda v: v.name),
                sorted(target_q2_func_vars, key=lambda v: v.name)):
            update_target_expr2.append(
                var_target.assign(tau * var + (1 - tau) * var_target))
        update_target_expr2 = tf.group(*update_target_expr2)

        merged_summary = tf.compat.v1.summary.merge_all(
            scope=tf.compat.v1.get_variable_scope().name)
        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph, iteration
        ],
                           outputs=[
                               td_error1, td_error2,
                               tf.reduce_mean(input_tensor=errors),
                               merged_summary
                           ],
                           updates=[optimize_expr, lr])
        update_target = U.function(
            [], [], updates=[update_target_expr1, update_target_expr2])

        q_values = U.function(inputs=[obs_t_input], outputs=[q1_t, q2_t])

        return act_f, act_greedy, q_values, train, update_target, {
            'q_values': q_values
        }
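Not part of the snippet above, just a minimal sketch of the pattern it relies on: wrapping the Bellman target in tf.stop_gradient makes the TD loss differentiate only the online Q estimate (TF2 eager style; values are illustrative).

import tensorflow as tf

q_online = tf.Variable([2.0])      # online Q(s, a)
q_target = tf.constant([3.5])      # bootstrapped target Q value
reward, gamma = 1.0, 0.99

with tf.GradientTape() as tape:
    td_target = reward + gamma * q_target
    td_error = q_online - tf.stop_gradient(td_target)
    loss = tf.reduce_mean(tf.square(td_error))

# Only q_online receives a gradient; the target branch is a constant.
print(tape.gradient(loss, q_online).numpy())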
Beispiel #45
0
def build_learner(pre, post, act_space, num_frames):
    global_step = tf.train.get_or_create_global_step()
    init_lr = FLAGS.init_lr
    decay = FLAGS.lr_decay
    warmup_steps = FLAGS.warmup_steps
    use_rmc = FLAGS.use_rmc
    use_hrmc = FLAGS.use_hrmc
    use_icm = FLAGS.use_icm
    use_coex = FLAGS.use_coex
    use_reward_prediction = FLAGS.use_reward_prediction
    use_pixel_control = FLAGS.use_pixel_control
    pq_kl_coef = FLAGS.pq_kl_coef
    p_kl_coef = FLAGS.p_kl_coef

    global_step_float = tf.cast(global_step, tf.float32)

    lr = tf.train.polynomial_decay(
        init_lr, global_step,
        FLAGS.total_environment_frames // (FLAGS.batch_size * FLAGS.seqlen),
        init_lr / 10.)
    is_warmup = tf.cast(global_step_float < warmup_steps, tf.float32)
    lr = is_warmup * global_step_float / warmup_steps * init_lr + (
        1.0 - is_warmup) * (init_lr * (1.0 - decay) + lr * decay)
    optimizer = tf.train.AdamOptimizer(lr)

    ent_coef = tf.train.polynomial_decay(
        FLAGS.ent_coef, global_step, FLAGS.total_environment_frames * 2 //
        (FLAGS.batch_size * FLAGS.seqlen), FLAGS.ent_coef / 10.)

    if FLAGS.zero_init:
        pre["state_in"] = tf.zeros_like(pre["state_in"])

    if use_hrmc:
        rnn = TmpHierRMCRNN(4,
                            64,
                            4,
                            4,
                            4,
                            return_sequences=True,
                            return_state=True,
                            name="hrmcrnn")
    elif use_rmc:
        rnn = RMCRNN(64,
                     4,
                     4,
                     return_sequences=True,
                     return_state=True,
                     name="rmcrnn")
    else:
        rnn = tf.compat.v1.keras.layers.LSTM(256,
                                             return_sequences=True,
                                             return_state=True,
                                             name="lstm")
    pre_model = Model(act_space, rnn, use_rmc, use_hrmc, use_reward_prediction,
                      use_pixel_control, "agent", **pre)

    post["state_in"] = tf.stop_gradient(pre_model.state_out)

    post_model = Model(act_space, rnn, use_rmc, use_hrmc,
                       use_reward_prediction, use_pixel_control, "agent",
                       **post)

    tf.summary.scalar("adv_mean", post_model.adv_mean)
    tf.summary.scalar("adv_std", post_model.adv_std)

    losses = dPPOcC(act=post_model.a_t,
                    policy_logits=post_model.current_act_logits,
                    old_policy_logits=post_model.old_act_logits,
                    advantage=post_model.advantage,
                    policy_clip=FLAGS.ppo_clip,
                    vf=post_model.current_value,
                    vf_target=post_model.ret,
                    value_clip=FLAGS.vf_clip,
                    old_vf=post_model.old_current_value)

    entropy_loss = tf.reduce_mean(
        entropy(post_model.current_act_logits) * post_model.slots)

    p_loss = tf.reduce_mean(losses.p_loss * post_model.slots)
    v_loss = tf.reduce_mean(losses.v_loss * post_model.slots)

    add_loss = 0.0
    if use_icm:
        icmloss = icm(post_model.cnn_feature[:, :-1, :],
                      post_model.cnn_feature[:, 1:, :], post_model.a_t[:, :-1],
                      act_space)
        add_loss += 0.2 * tf.reduce_mean(
            icmloss.f_loss * post_model.slots[:, :-1]) + 0.8 * tf.reduce_mean(
                icmloss.i_loss * post_model.slots[:, :-1])
    if use_coex:
        coexloss = coex(post_model.image_feature[:, :-1, :, :, :],
                        post_model.image_feature[:, 1:, :, :, :],
                        post_model.a_t[:, :-1], act_space)
        add_loss += tf.reduce_mean(coexloss * post_model.slots[:, :-1])
    if use_hrmc:
        pq_kl_loss = KL_from_gaussians(post_model.q_mus, post_model.q_sigmas,
                                       post_model.p_mus, post_model.p_sigmas)
        pq_kl_loss = tf.reduce_mean(pq_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div", pq_kl_loss)
        add_loss += pq_kl_coef * pq_kl_loss

        p_kl_loss = KL_from_gaussians(post_model.p_mus, post_model.p_sigmas,
                                      tf.zeros_like(post_model.p_mus),
                                      0.01 * tf.ones_like(post_model.p_sigmas))
        p_kl_loss = tf.reduce_mean(p_kl_loss * post_model.slots)
        tf.summary.scalar("kl_div_prior", p_kl_loss)
        add_loss += p_kl_coef * p_kl_loss
    if use_reward_prediction:
        r_loss = tf.reduce_mean(
            mse(post_model.reward_prediction, post_model.r_t) *
            post_model.slots)
        tf.summary.scalar("r_loss", r_loss)
        add_loss += r_loss
    if use_pixel_control:
        rec_loss = tf.reduce_mean(
            mse(post_model.pixel_control, post_model.s_t) *
            post_model.slots[:, :, None, None, None])
        tf.summary.scalar("rec_loss", rec_loss)
        add_loss += rec_loss

    loss = (FLAGS.pi_coef * p_loss + FLAGS.vf_coef * v_loss -
            ent_coef * entropy_loss + add_loss)

    train_op = miniOp(optimizer, loss, FLAGS.grad_clip)

    new_frames = tf.reduce_sum(post["slots"])

    with tf.control_dependencies([train_op]):
        num_frames_and_train = tf.assign_add(num_frames, new_frames)
        global_step_and_train = tf.assign_add(global_step, 1)

    tf.summary.scalar("learning_rate", lr)
    tf.summary.scalar("ent_coef", ent_coef)
    tf.summary.scalar("ent_loss", entropy_loss)
    tf.summary.scalar("p_loss", p_loss)
    tf.summary.scalar("v_loss", v_loss)
    tf.summary.scalar("all_loss", loss)

    return num_frames_and_train, global_step_and_train
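As an aside (not from the original code), the warm-up/decay blend above is easier to see outside of graph ops; this plain-Python sketch reproduces the same schedule with illustrative parameter values:

def warmup_poly_lr(step, init_lr, decay, warmup_steps, decay_steps):
    # tf.train.polynomial_decay with power=1 and end learning rate init_lr / 10
    end_lr = init_lr / 10.0
    frac = min(step, decay_steps) / float(decay_steps)
    poly_lr = (init_lr - end_lr) * (1.0 - frac) + end_lr
    if step < warmup_steps:
        return step / float(warmup_steps) * init_lr       # linear warm-up
    return init_lr * (1.0 - decay) + poly_lr * decay      # blended decay

print([round(warmup_poly_lr(s, 1e-3, 0.5, 100, 1000), 6)
       for s in (0, 50, 100, 500, 1000)])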
Beispiel #46
0
    def create_variables(self):
        # create the target network T as a copy of the original network N
        self.target_q_network = self.q_network.copy(scope="target_network")

        # compute the control action
        # FOR REGULAR ACTION SCORE COMPUTATION
        with tf.name_scope("taking_action"):
            # input: the state vector
            self.observation = tf.placeholder(tf.float32,
                                              (None, self.observation_size),
                                              name="observation")
            # compute a utility score for each action
            self.action_scores = tf.identity(self.q_network(self.observation),
                                             name="action_scores")
            tf.histogram_summary("action_scores", self.action_scores)
            # take the action with the highest score
            self.predicted_actions = tf.argmax(self.action_scores,
                                               dimension=1,
                                               name="predicted_actions")

        # estimate the future reward
        with tf.name_scope("estimating_future_rewards"):
            # FOR PREDICTING TARGET FUTURE REWARDS
            # input: the next states
            self.next_observation = tf.placeholder(
                tf.float32, (None, self.observation_size),
                name="next_observation")
            # input: masks for the next states
            self.next_observation_mask = tf.placeholder(
                tf.float32, (None, ), name="next_observation_mask")
            # utility estimates
            self.next_action_scores = tf.stop_gradient(
                self.target_q_network(self.next_observation))
            tf.histogram_summary("target_action_scores",
                                 self.next_action_scores)
            # input: the rewards
            self.rewards = tf.placeholder(tf.float32, (None, ), name="rewards")
            # take the maximum action-utility estimates
            target_values = tf.identity(
                tf.reduce_max(self.next_action_scores, reduction_indices=[
                    1,
                ]) * self.next_observation_mask,
                name="target_values")
            # r + DF * MAX(Q,s); see the Wikipedia article on Q-learning
            #self.future_rewards            = self.rewards + self.discount_rate * target_values
            self.future_rewards = tf.identity(
                self.rewards + self.discount_rate * target_values,
                name="future_rewards")

        # training network N
        with tf.name_scope("q_value_precition"):
            # FOR PREDICTION ERROR
            # input: action masks for the training examples
            self.action_mask = tf.placeholder(tf.float32,
                                              (None, self.num_actions),
                                              name="action_mask")
            # compute the action utilities for the training examples
            self.masked_action_scores = tf.reduce_sum(
                self.action_scores * self.action_mask,
                reduction_indices=[
                    1,
                ],
                name="masked_action_scores")
            # differences between the current utilities and the future ones:
            # Q[s,a] - (r + DF * MAX(Q,s'))
            #temp_diff                       = self.masked_action_scores - self.future_rewards
            temp_diff = tf.identity(self.masked_action_scores -
                                    self.future_rewards,
                                    name="temp_diff")
            # the key step of training the network:
            # RMSProp minimizes the mean of the squared differences above
            self.prediction_error = tf.reduce_mean(tf.square(temp_diff),
                                                   name="prediction_error")
            # RMSProp, step one: compute the gradients
            gradients = self.optimizer.compute_gradients(self.prediction_error)
            #def get_zero(): return tf.constant(0.0)
            #def get_perror(): return self.prediction_error
            #gradients                       = self.optimizer.compute_gradients(tf.cond(tf.is_nan(self.prediction_error), get_zero, get_perror))
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad, 5), var)
            # Add histograms for gradients.
            for grad, var in gradients:
                tf.histogram_summary(var.name, var)
                if grad is not None:
                    tf.histogram_summary(var.name + '/gradients', grad)
            # step two: apply the gradients to optimize the network parameters
            self.train_op = self.optimizer.apply_gradients(gradients,
                                                           name="train_op")

        # this is where the target network T gets updated
        # T = (1-alpha)*T + alpha*N
        # UPDATE TARGET NETWORK
        with tf.name_scope("target_network_update"):
            self.target_network_update = []
            for v_source, v_target in zip(self.q_network.variables(),
                                          self.target_q_network.variables()):
                # this is equivalent to target = (1-alpha) * target + alpha * source
                update_op = v_target.assign_sub(
                    self.target_network_update_rate * (v_target - v_source))
                self.target_network_update.append(update_op)
            self.target_network_update = tf.group(*self.target_network_update,
                                                  name="target_network_update")

        # summaries
        tf.scalar_summary("prediction_error", self.prediction_error)

        self.summarize = tf.merge_all_summaries()
        self.no_op1 = tf.no_op()
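A side note, not from the original snippet: the assign_sub form above is the same soft update written multiplicatively elsewhere in this document; a short numeric check (TF2 eager):

import tensorflow as tf

# target -= rate * (target - source)   is algebraically
# target  = (1 - rate) * target + rate * source
source, target, rate = tf.Variable(1.0), tf.Variable(0.0), 0.01
target.assign_sub(rate * (target - source))
print(target.numpy())  # 0.01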
Beispiel #47
0
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()

        # running averages
        with tf.compat.v1.variable_scope('o_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.compat.v1.variable_scope('g_stats') as vs:
            if reuse:
                vs.reuse_variables()
            self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess)

        # mini-batch sampling.
        batch = self.staging_tf.get()
        batch_tf = OrderedDict([(key, batch[i])
                                for i, key in enumerate(self.stage_shapes.keys())])
        batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1])

        # choose only the demo buffer samples
        mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis=0)

        # networks
        with tf.compat.v1.variable_scope('main') as vs:
            if reuse:
                vs.reuse_variables()
            self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__)
            vs.reuse_variables()
        with tf.compat.v1.variable_scope('target') as vs:
            if reuse:
                vs.reuse_variables()
            target_batch_tf = batch_tf.copy()
            target_batch_tf['o'] = batch_tf['o_2']
            target_batch_tf['g'] = batch_tf['g_2']
            self.target = self.create_actor_critic(
                target_batch_tf, net_type='target', **self.__dict__)
            vs.reuse_variables()
        assert len(self._vars("main")) == len(self._vars("target"))

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf)
        target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range)
        self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf))

        if self.bc_loss == 1 and self.q_filter == 1:  # train with demonstrations, using both bc_loss and q_filter
            # where is the demonstrator action better than the actor action,
            # according to the critic? choose only those samples
            maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1])
            # define the cloning loss on the actor's actions, restricted to the
            # samples selected by the masks above
            self.cloning_loss_tf = tf.reduce_sum(tf.square(
                tf.boolean_mask(tf.boolean_mask(self.main.pi_tf, mask), maskMain, axis=0) -
                tf.boolean_mask(tf.boolean_mask(batch_tf['u'], mask), maskMain, axis=0)))
            # primary loss scaled by its respective weight prm_loss_weight
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
            # L2 loss on action values scaled by the same weight prm_loss_weight
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
            # add the cloning loss to the actor loss as an auxiliary loss scaled by its weight aux_loss_weight
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        elif self.bc_loss == 1 and self.q_filter == 0:  # train with demonstrations but without the q_filter
            self.cloning_loss_tf = tf.reduce_sum(tf.square(
                tf.boolean_mask(self.main.pi_tf, mask) - tf.boolean_mask(batch_tf['u'], mask)))
            self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))
            self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf

        else:  # not training with demonstrations
            self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
            self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u))

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q'))
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi'))
        assert len(self._vars('main/Q')) == len(Q_grads_tf)
        assert len(self._vars('main/pi')) == len(pi_grads_tf)
        self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q'))
        self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi'))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q'))
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi'))

        # optimizers
        self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = self._vars('main/Q') + self._vars('main/pi')
        self.target_vars = self._vars('target/Q') + self._vars('target/pi')
        self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats')
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars)))

        # initialize all variables
        tf.variables_initializer(self._global_vars('')).run()
        self._sync_optimizers()
        self._init_target_net()
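A minimal, self-contained sketch (not the code above; values are illustrative) of what the q_filter mask does: demo samples only contribute to the cloning loss where the critic rates the demonstrator's action above the actor's.

import tensorflow as tf

q_demo  = tf.constant([1.0, 3.0, 2.0])   # Q(s, u_demo)
q_actor = tf.constant([2.0, 1.0, 5.0])   # Q(s, pi(s))
pi      = tf.constant([[0.1], [0.4], [0.9]])
u_demo  = tf.constant([[0.0], [0.5], [1.0]])

mask = q_demo > q_actor                   # demonstrator better -> True
bc_loss = tf.reduce_sum(tf.square(
    tf.boolean_mask(pi, mask) - tf.boolean_mask(u_demo, mask)))
print(bc_loss.numpy())  # only the middle sample contributes: (0.4 - 0.5)^2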
Beispiel #48
0
def train_eval(
    root_dir,
    env_name='HalfCheetah-v2',
    num_iterations=3000000,
    actor_fc_layers=(),
    critic_obs_fc_layers=None,
    critic_action_fc_layers=None,
    critic_joint_fc_layers=(256, 256),
    initial_collect_steps=10000,
    collect_steps_per_iteration=1,
    replay_buffer_capacity=1000000,
    # Params for target update
    target_update_tau=0.005,
    target_update_period=1,
    # Params for train
    train_steps_per_iteration=1,
    batch_size=256,
    actor_learning_rate=3e-4,
    critic_learning_rate=3e-4,
    alpha_learning_rate=3e-4,
    dual_learning_rate=3e-4,
    td_errors_loss_fn=tf.math.squared_difference,
    gamma=0.99,
    reward_scale_factor=0.1,
    gradient_clipping=None,
    use_tf_functions=True,
    # Params for eval
    num_eval_episodes=30,
    eval_interval=10000,
    # Params for summaries and logging
    train_checkpoint_interval=50000,
    policy_checkpoint_interval=50000,
    rb_checkpoint_interval=50000,
    log_interval=1000,
    summary_interval=1000,
    summaries_flush_secs=10,
    debug_summaries=False,
    summarize_grads_and_vars=False,
    eval_metrics_callback=None,
    latent_dim=10,
    log_prob_reward_scale=0.0,
    predictor_updates_encoder=False,
    predict_prior=True,
    use_recurrent_actor=False,
    rnn_sequence_length=20,
    clip_max_stddev=10.0,
    clip_min_stddev=0.1,
    clip_mean=30.0,
    predictor_num_layers=2,
    use_identity_encoder=False,
    identity_encoder_single_stddev=False,
    kl_constraint=1.0,
    eval_dropout=(),
    use_residual_predictor=True,
    gym_kwargs=None,
    predict_prior_std=True,
    random_seed=0,
):
    """A simple train and eval for SAC."""
    np.random.seed(random_seed)
    tf.random.set_seed(random_seed)
    if use_recurrent_actor:
        batch_size = batch_size // rnn_sequence_length
    root_dir = os.path.expanduser(root_dir)
    train_dir = os.path.join(root_dir, 'train')
    eval_dir = os.path.join(root_dir, 'eval')

    train_summary_writer = tf.compat.v2.summary.create_file_writer(
        train_dir, flush_millis=summaries_flush_secs * 1000)
    train_summary_writer.set_as_default()

    eval_summary_writer = tf.compat.v2.summary.create_file_writer(
        eval_dir, flush_millis=summaries_flush_secs * 1000)

    global_step = tf.compat.v1.train.get_or_create_global_step()
    with tf.compat.v2.summary.record_if(
            lambda: tf.math.equal(global_step % summary_interval, 0)):

        _build_env = functools.partial(
            suite_gym.load,
            environment_name=env_name,  # pylint: disable=invalid-name
            gym_env_wrappers=(),
            gym_kwargs=gym_kwargs)

        tf_env = tf_py_environment.TFPyEnvironment(_build_env())
        eval_vec = []  # (name, env, metrics)
        eval_metrics = [
            tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
            tf_metrics.AverageEpisodeLengthMetric(
                buffer_size=num_eval_episodes)
        ]
        eval_tf_env = tf_py_environment.TFPyEnvironment(_build_env())
        name = ''
        eval_vec.append((name, eval_tf_env, eval_metrics))

        time_step_spec = tf_env.time_step_spec()
        observation_spec = time_step_spec.observation
        action_spec = tf_env.action_spec()
        if latent_dim == 'obs':
            latent_dim = observation_spec.shape[0]

        def _activation(t):
            t1, t2 = tf.split(t, 2, axis=1)
            low = -np.inf if clip_mean is None else -clip_mean
            high = np.inf if clip_mean is None else clip_mean
            t1 = rpc_utils.squash_to_range(t1, low, high)

            if clip_min_stddev is None:
                low = -np.inf
            else:
                low = tf.math.log(tf.exp(clip_min_stddev) - 1.0)
            if clip_max_stddev is None:
                high = np.inf
            else:
                high = tf.math.log(tf.exp(clip_max_stddev) - 1.0)
            t2 = rpc_utils.squash_to_range(t2, low, high)
            return tf.concat([t1, t2], axis=1)

        if use_identity_encoder:
            assert latent_dim == observation_spec.shape[0]
            obs_input = tf.keras.layers.Input(observation_spec.shape)
            zeros = 0.0 * obs_input[:, :1]
            stddev_dim = 1 if identity_encoder_single_stddev else latent_dim
            pre_stddev = tf.keras.layers.Dense(stddev_dim,
                                               activation=None)(zeros)
            ones = zeros + tf.ones((1, latent_dim))
            pre_stddev = pre_stddev * ones  # Multiply to broadcast to latent_dim.
            pre_mean_stddev = tf.concat([obs_input, pre_stddev], axis=1)
            output = tfp.layers.IndependentNormal(latent_dim)(pre_mean_stddev)
            encoder_net = tf.keras.Model(inputs=obs_input, outputs=output)
        else:
            encoder_net = tf.keras.Sequential([
                tf.keras.layers.Dense(256, activation='relu'),
                tf.keras.layers.Dense(256, activation='relu'),
                tf.keras.layers.Dense(
                    tfp.layers.IndependentNormal.params_size(latent_dim),
                    activation=_activation,
                    kernel_initializer='glorot_uniform'),
                tfp.layers.IndependentNormal(latent_dim),
            ])

        # Build the predictor net
        obs_input = tf.keras.layers.Input(observation_spec.shape)
        action_input = tf.keras.layers.Input(action_spec.shape)

        class ConstantIndependentNormal(tfp.layers.IndependentNormal):
            """A keras layer that always returns N(0, 1) distribution."""
            def call(self, inputs):
                loc_scale = tf.concat([
                    tf.zeros((latent_dim, )),
                    tf.fill((latent_dim, ), tf.math.log(tf.exp(1.0) - 1))
                ],
                                      axis=0)
                # Multiply by a [B x 1] tensor to broadcast the batch dimension.
                loc_scale = loc_scale * tf.ones_like(inputs[:, :1])
                return super(ConstantIndependentNormal, self).call(loc_scale)

        if predict_prior:
            z = encoder_net(obs_input)
            if not predictor_updates_encoder:
                z = tf.stop_gradient(z)
            za = tf.concat([z, action_input], axis=1)
            if use_residual_predictor:
                za_input = tf.keras.layers.Input(za.shape[1])
                loc_scale = tf.keras.Sequential(
                    predictor_num_layers *
                    [tf.keras.layers.Dense(256, activation='relu')] + [  # pylint: disable=line-too-long
                        tf.keras.layers.Dense(tfp.layers.IndependentNormal.
                                              params_size(latent_dim),
                                              activation=_activation,
                                              kernel_initializer='zeros'),
                    ])(za_input)
                if predict_prior_std:
                    combined_loc_scale = tf.concat([
                        loc_scale[:, :latent_dim] + za_input[:, :latent_dim],
                        loc_scale[:, latent_dim:]
                    ],
                                                   axis=1)
                else:
                    # Note that softplus(log(e - 1)) = 1.
                    combined_loc_scale = tf.concat([
                        loc_scale[:, :latent_dim] + za_input[:, :latent_dim],
                        tf.math.log(np.e - 1) *
                        tf.ones_like(loc_scale[:, latent_dim:])
                    ],
                                                   axis=1)
                dist = tfp.layers.IndependentNormal(latent_dim)(
                    combined_loc_scale)
                output = tf.keras.Model(inputs=za_input, outputs=dist)(za)
            else:
                assert predict_prior_std
                output = tf.keras.Sequential(
                    predictor_num_layers *
                    [tf.keras.layers.Dense(256, activation='relu')] +  # pylint: disable=line-too-long
                    [
                        tf.keras.layers.Dense(tfp.layers.IndependentNormal.
                                              params_size(latent_dim),
                                              activation=_activation,
                                              kernel_initializer='zeros'),
                        tfp.layers.IndependentNormal(latent_dim),
                    ])(za)
        else:
            # scale is chosen by inverting the softplus function to equal 1.
            if len(obs_input.shape) > 2:
                input_reshaped = tf.reshape(
                    obs_input,
                    [-1, tf.math.reduce_prod(obs_input.shape[1:])])
                #  Multiply by [B x 1] tensor to broadcast batch dimension.
                za = tf.zeros(latent_dim + action_spec.shape[0], ) * tf.ones_like(input_reshaped[:, :1])  # pylint: disable=line-too-long
            else:
                # Multiply by a [B x 1] tensor to broadcast the batch dimension.
                za = tf.zeros(latent_dim + action_spec.shape[0], ) * tf.ones_like(obs_input[:, :1])  # pylint: disable=line-too-long
            output = tf.keras.Sequential([
                ConstantIndependentNormal(latent_dim),
            ])(za)
        predictor_net = tf.keras.Model(inputs=(obs_input, action_input),
                                       outputs=output)
        if use_recurrent_actor:
            ActorClass = rpc_utils.RecurrentActorNet  # pylint: disable=invalid-name
        else:
            ActorClass = rpc_utils.ActorNet  # pylint: disable=invalid-name
        actor_net = ActorClass(input_tensor_spec=observation_spec,
                               output_tensor_spec=action_spec,
                               encoder=encoder_net,
                               predictor=predictor_net,
                               fc_layers=actor_fc_layers)

        critic_net = rpc_utils.CriticNet(
            (observation_spec, action_spec),
            observation_fc_layer_params=critic_obs_fc_layers,
            action_fc_layer_params=critic_action_fc_layers,
            joint_fc_layer_params=critic_joint_fc_layers,
            kernel_initializer='glorot_uniform',
            last_kernel_initializer='glorot_uniform')
        critic_net_2 = None
        target_critic_net_1 = None
        target_critic_net_2 = None

        tf_agent = rpc_agent.RpAgent(
            time_step_spec,
            action_spec,
            actor_network=actor_net,
            critic_network=critic_net,
            critic_network_2=critic_net_2,
            target_critic_network=target_critic_net_1,
            target_critic_network_2=target_critic_net_2,
            actor_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=actor_learning_rate),
            critic_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=critic_learning_rate),
            alpha_optimizer=tf.compat.v1.train.AdamOptimizer(
                learning_rate=alpha_learning_rate),
            target_update_tau=target_update_tau,
            target_update_period=target_update_period,
            td_errors_loss_fn=td_errors_loss_fn,
            gamma=gamma,
            reward_scale_factor=reward_scale_factor,
            gradient_clipping=gradient_clipping,
            debug_summaries=debug_summaries,
            summarize_grads_and_vars=summarize_grads_and_vars,
            train_step_counter=global_step)
        dual_optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=dual_learning_rate)
        tf_agent.initialize()

        # Make the replay buffer.
        replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            data_spec=tf_agent.collect_data_spec,
            batch_size=tf_env.batch_size,
            max_length=replay_buffer_capacity)
        replay_observer = [replay_buffer.add_batch]

        train_metrics = [
            tf_metrics.NumberOfEpisodes(),
            tf_metrics.EnvironmentSteps(),
            tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes,
                                           batch_size=tf_env.batch_size),
            tf_metrics.AverageEpisodeLengthMetric(
                buffer_size=num_eval_episodes, batch_size=tf_env.batch_size),
        ]
        kl_metric = rpc_utils.AverageKLMetric(encoder=encoder_net,
                                              predictor=predictor_net,
                                              batch_size=tf_env.batch_size)
        eval_policy = greedy_policy.GreedyPolicy(tf_agent.policy)
        initial_collect_policy = random_tf_policy.RandomTFPolicy(
            tf_env.time_step_spec(), tf_env.action_spec())
        collect_policy = tf_agent.collect_policy

        checkpoint_items = {
            'ckpt_dir': train_dir,
            'agent': tf_agent,
            'global_step': global_step,
            'metrics': metric_utils.MetricsGroup(train_metrics,
                                                 'train_metrics'),
            'dual_optimizer': dual_optimizer,
        }
        train_checkpointer = common.Checkpointer(**checkpoint_items)

        policy_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'policy'),
                                                  policy=eval_policy,
                                                  global_step=global_step)
        rb_checkpointer = common.Checkpointer(ckpt_dir=os.path.join(
            train_dir, 'replay_buffer'),
                                              max_to_keep=1,
                                              replay_buffer=replay_buffer)

        train_checkpointer.initialize_or_restore()
        rb_checkpointer.initialize_or_restore()

        initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
            tf_env,
            initial_collect_policy,
            observers=replay_observer + train_metrics,
            num_steps=initial_collect_steps,
            transition_observers=[kl_metric])

        collect_driver = dynamic_step_driver.DynamicStepDriver(
            tf_env,
            collect_policy,
            observers=replay_observer + train_metrics,
            num_steps=collect_steps_per_iteration,
            transition_observers=[kl_metric])

        if use_tf_functions:
            initial_collect_driver.run = common.function(
                initial_collect_driver.run)
            collect_driver.run = common.function(collect_driver.run)
            tf_agent.train = common.function(tf_agent.train)

        if replay_buffer.num_frames() == 0:
            # Collect initial replay data.
            logging.info(
                'Initializing replay buffer by collecting experience for %d steps '
                'with a random policy.', initial_collect_steps)
            initial_collect_driver.run()

        for name, eval_tf_env, eval_metrics in eval_vec:
            results = metric_utils.eager_compute(
                eval_metrics,
                eval_tf_env,
                eval_policy,
                num_episodes=num_eval_episodes,
                train_step=global_step,
                summary_writer=eval_summary_writer,
                summary_prefix='Metrics-%s' % name,
            )
            if eval_metrics_callback is not None:
                eval_metrics_callback(results, global_step.numpy())
            metric_utils.log_metrics(eval_metrics, prefix=name)

        time_step = None
        policy_state = collect_policy.get_initial_state(tf_env.batch_size)

        timed_at_step = global_step.numpy()
        time_acc = 0
        train_time_acc = 0
        env_time_acc = 0

        if use_recurrent_actor:  # default from sac/train_eval_rnn.py
            num_steps = rnn_sequence_length + 1

            def _filter_invalid_transition(trajectories, unused_arg1):
                return tf.reduce_all(~trajectories.is_boundary()[:-1])

            tf_agent._as_transition = data_converter.AsTransition(  # pylint: disable=protected-access
                tf_agent.data_context,
                squeeze_time_dim=False)
        else:
            num_steps = 2

            def _filter_invalid_transition(trajectories, unused_arg1):
                return ~trajectories.is_boundary()[0]

        dataset = replay_buffer.as_dataset(
            sample_batch_size=batch_size,
            num_steps=num_steps).unbatch().filter(_filter_invalid_transition)

        dataset = dataset.batch(batch_size).prefetch(5)
        # Dataset generates trajectories with shape [Bx2x...]
        iterator = iter(dataset)

        @tf.function
        def train_step():
            experience, _ = next(iterator)

            prior = predictor_net(
                (experience.observation[:, 0], experience.action[:, 0]),
                training=False)
            z_next = encoder_net(experience.observation[:, 1], training=False)
            # predictor_kl is a vector of size batch_size.
            predictor_kl = tfp.distributions.kl_divergence(z_next, prior)

            with tf.GradientTape() as tape:
                tape.watch(actor_net._log_kl_coefficient)  # pylint: disable=protected-access
                dual_loss = -1.0 * actor_net._log_kl_coefficient * (  # pylint: disable=protected-access
                    tf.stop_gradient(tf.reduce_mean(predictor_kl)) -
                    kl_constraint)
            dual_grads = tape.gradient(dual_loss,
                                       [actor_net._log_kl_coefficient])  # pylint: disable=protected-access
            grads_and_vars = list(
                zip(dual_grads, [actor_net._log_kl_coefficient]))  # pylint: disable=protected-access
            dual_optimizer.apply_gradients(grads_and_vars)

            # Clip the dual variable so that 1e-6 <= exp(log_kl_coef) <= 1e6.
            log_kl_coef = tf.clip_by_value(
                actor_net._log_kl_coefficient,  # pylint: disable=protected-access
                -1.0 * np.log(1e6),
                np.log(1e6))
            actor_net._log_kl_coefficient.assign(log_kl_coef)  # pylint: disable=protected-access

            with tf.name_scope('dual_loss'):
                tf.compat.v2.summary.scalar(name='dual_loss',
                                            data=tf.reduce_mean(dual_loss),
                                            step=global_step)
                tf.compat.v2.summary.scalar(
                    name='log_kl_coefficient',
                    data=actor_net._log_kl_coefficient,  # pylint: disable=protected-access
                    step=global_step)

            z_entropy = z_next.entropy()
            log_prob = prior.log_prob(z_next.sample())
            with tf.name_scope('rp-metrics'):
                common.generate_tensor_summaries('predictor_kl', predictor_kl,
                                                 global_step)
                common.generate_tensor_summaries('z_entropy', z_entropy,
                                                 global_step)
                common.generate_tensor_summaries('log_prob', log_prob,
                                                 global_step)
                common.generate_tensor_summaries('z_mean', z_next.mean(),
                                                 global_step)
                common.generate_tensor_summaries('z_stddev', z_next.stddev(),
                                                 global_step)
                common.generate_tensor_summaries('prior_mean', prior.mean(),
                                                 global_step)
                common.generate_tensor_summaries('prior_stddev',
                                                 prior.stddev(), global_step)

            if log_prob_reward_scale == 'auto':
                coef = tf.stop_gradient(tf.exp(actor_net._log_kl_coefficient))  # pylint: disable=protected-access
            else:
                coef = log_prob_reward_scale
            tf.debugging.check_numerics(tf.reduce_mean(predictor_kl),
                                        'predictor_kl is inf or nan.')
            tf.debugging.check_numerics(coef, 'coef is inf or nan.')
            new_reward = experience.reward - coef * predictor_kl[:, None]

            experience = experience._replace(reward=new_reward)
            return tf_agent.train(experience)

        if use_tf_functions:
            train_step = common.function(train_step)

        # Save the hyperparameters
        operative_filename = os.path.join(root_dir, 'operative.gin')
        with tf.compat.v1.gfile.Open(operative_filename, 'w') as f:
            f.write(gin.operative_config_str())
            print(gin.operative_config_str())

        global_step_val = global_step.numpy()
        while global_step_val < num_iterations:
            start_time = time.time()
            time_step, policy_state = collect_driver.run(
                time_step=time_step,
                policy_state=policy_state,
            )
            env_time_acc += time.time() - start_time
            train_start_time = time.time()
            for _ in range(train_steps_per_iteration):
                train_loss = train_step()
            train_time_acc += time.time() - train_start_time
            time_acc += time.time() - start_time

            global_step_val = global_step.numpy()

            if global_step_val % log_interval == 0:
                logging.info('step = %d, loss = %f', global_step_val,
                             train_loss.loss)
                steps_per_sec = (global_step_val - timed_at_step) / time_acc
                logging.info('%.3f steps/sec', steps_per_sec)
                tf.compat.v2.summary.scalar(name='global_steps_per_sec',
                                            data=steps_per_sec,
                                            step=global_step)
                train_steps_per_sec = (global_step_val -
                                       timed_at_step) / train_time_acc
                logging.info('Train: %.3f steps/sec', train_steps_per_sec)
                tf.compat.v2.summary.scalar(name='train_steps_per_sec',
                                            data=train_steps_per_sec,
                                            step=global_step)
                env_steps_per_sec = (global_step_val -
                                     timed_at_step) / env_time_acc
                logging.info('Env: %.3f steps/sec', env_steps_per_sec)
                tf.compat.v2.summary.scalar(name='env_steps_per_sec',
                                            data=env_steps_per_sec,
                                            step=global_step)
                timed_at_step = global_step_val
                time_acc = 0
                train_time_acc = 0
                env_time_acc = 0

            for train_metric in train_metrics + [kl_metric]:
                train_metric.tf_summaries(train_step=global_step,
                                          step_metrics=train_metrics[:2])

            if global_step_val % eval_interval == 0:
                start_time = time.time()
                for name, eval_tf_env, eval_metrics in eval_vec:
                    results = metric_utils.eager_compute(
                        eval_metrics,
                        eval_tf_env,
                        eval_policy,
                        num_episodes=num_eval_episodes,
                        train_step=global_step,
                        summary_writer=eval_summary_writer,
                        summary_prefix='Metrics-%s' % name,
                    )
                    if eval_metrics_callback is not None:
                        eval_metrics_callback(results, global_step_val)
                    metric_utils.log_metrics(eval_metrics, prefix=name)
                logging.info('Evaluation: %d min',
                             (time.time() - start_time) / 60)
                for prob_dropout in eval_dropout:
                    rpc_utils.eval_dropout_fn(eval_tf_env,
                                              actor_net,
                                              global_step,
                                              prob_dropout=prob_dropout)

            if global_step_val % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=global_step_val)

            if global_step_val % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=global_step_val)

            if global_step_val % rb_checkpoint_interval == 0:
                rb_checkpointer.save(global_step=global_step_val)
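Stepping outside the original code for a moment: the dual-variable update buried in train_step above is plain gradient ascent on a Lagrange multiplier, raising the KL coefficient when the measured KL exceeds the constraint and lowering it otherwise. A compact sketch with made-up numbers (TF2 eager):

import numpy as np
import tensorflow as tf

log_kl_coef = tf.Variable(0.0)
opt = tf.keras.optimizers.Adam(3e-4)
kl_constraint, measured_kl = 1.0, 2.5     # illustrative values

with tf.GradientTape() as tape:
    dual_loss = -log_kl_coef * (measured_kl - kl_constraint)
grads = tape.gradient(dual_loss, [log_kl_coef])
opt.apply_gradients(zip(grads, [log_kl_coef]))
# keep exp(log_kl_coef) within [1e-6, 1e6], as in train_step above
log_kl_coef.assign(tf.clip_by_value(log_kl_coef, -np.log(1e6), np.log(1e6)))
print(log_kl_coef.numpy())  # increased, since measured KL > constraint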
Beispiel #49
0
    def angle_cls_focal_loss(self,
                             labels,
                             pred,
                             anchor_state,
                             alpha=None,
                             gamma=2.0,
                             decimal_weight=None):

        indices = tf.reshape(tf.where(tf.equal(anchor_state, 1)), [
            -1,
        ])
        labels = tf.gather(labels, indices)
        pred = tf.gather(pred, indices)
        anchor_state = tf.gather(anchor_state, indices)

        # compute the focal loss
        per_entry_cross_ent = - labels * tf.log(tf.sigmoid(pred) + self.cfgs.EPSILON) \
                              - (1 - labels) * tf.log(1 - tf.sigmoid(pred) + self.cfgs.EPSILON)

        prediction_probabilities = tf.sigmoid(pred)
        p_t = ((labels * prediction_probabilities) +
               ((1 - labels) * (1 - prediction_probabilities)))
        modulating_factor = 1.0
        if gamma:
            modulating_factor = tf.pow(1.0 - p_t, gamma)
        alpha_weight_factor = 1.0
        if alpha is not None:
            alpha_weight_factor = (labels * alpha + (1 - labels) * (1 - alpha))

        if decimal_weight is not None:
            angle_decode_labels = tf.py_func(func=angle_label_decode,
                                             inp=[
                                                 labels, self.cfgs.ANGLE_RANGE,
                                                 self.cfgs.OMEGA,
                                                 self.cfgs.ANGLE_MODE
                                             ],
                                             Tout=[tf.float32])
            angle_decode_labels = tf.reshape(angle_decode_labels, [
                -1,
            ]) * -1

            angle_decode_pred = tf.py_func(func=angle_label_decode,
                                           inp=[
                                               tf.sigmoid(pred),
                                               self.cfgs.ANGLE_RANGE,
                                               self.cfgs.OMEGA,
                                               self.cfgs.ANGLE_MODE
                                           ],
                                           Tout=[tf.float32])

            angle_decode_pred = tf.reshape(angle_decode_pred, [
                -1,
            ]) * -1

            diff_weight = tf.reshape(
                tf.log(abs(angle_decode_labels - angle_decode_pred) + 1),
                [-1, 1])
        else:
            diff_weight = tf.ones_like(tf.reshape(anchor_state, [-1, 1]))

        focal_cross_entropy_loss = (diff_weight * modulating_factor *
                                    alpha_weight_factor * per_entry_cross_ent)

        # compute the normalizer: the number of positive anchors
        # normalizer = tf.stop_gradient(tf.where(tf.greater(anchor_state, -2)))
        normalizer = tf.stop_gradient(tf.where(tf.equal(anchor_state, 1)))
        normalizer = tf.cast(tf.shape(normalizer)[0], tf.float32)
        normalizer = tf.maximum(1.0, normalizer)

        # normalizer = tf.stop_gradient(tf.cast(tf.equal(anchor_state, 1), tf.float32))
        # normalizer = tf.maximum(tf.reduce_sum(normalizer), 1)

        return tf.reduce_sum(focal_cross_entropy_loss) / normalizer
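As a quick aside (illustrative values, not from the snippet): the focal modulating factor (1 - p_t)^gamma shrinks the loss most for examples the model already classifies confidently.

import tensorflow as tf

labels = tf.constant([1.0, 1.0, 0.0])
probs  = tf.constant([0.9, 0.6, 0.3])            # sigmoid outputs
p_t = labels * probs + (1.0 - labels) * (1.0 - probs)
gamma = 2.0
ce    = -tf.math.log(p_t)                        # per-example cross-entropy
focal = tf.pow(1.0 - p_t, gamma) * ce
print(ce.numpy(), focal.numpy())  # the 0.9-confidence example shrinks most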
Beispiel #50
0
 def loss(self, predictions, policy, cfv):
     r = tf.stop_gradient(
         cpea.rm_policy(cfv -
                        tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
     error = tf.square(r - predictions) / 2.0
     return tf.reduce_mean(tf.reduce_sum(error, axis=1))
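cpea.rm_policy is not shown here, so this is only a guess at the pattern: regret matching typically keeps the positive part of the counterfactual advantage. A hypothetical stand-in, with stop_gradient turning the regrets into a fixed regression target:

import tensorflow as tf

cfv = tf.constant([[1.0, -0.5, 0.25]])        # counterfactual values
policy = tf.constant([[0.5, 0.25, 0.25]])
baseline = tf.reduce_sum(cfv * policy, axis=1, keepdims=True)
regret = tf.nn.relu(cfv - baseline)           # hypothetical stand-in for cpea.rm_policy
target = tf.stop_gradient(regret)             # regression target; no gradient flows back
print(target.numpy())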
Beispiel #51
0
def model_fn(features, labels, mode, params):
    '''
    Args:
        features: tensor with shape
            [BATCH_SIZE, go.N, go.N, features_lib.NEW_FEATURES_PLANES]
        labels: dict from string to tensor with shape
            'pi_tensor': [BATCH_SIZE, go.N * go.N + 1]
            'value_tensor': [BATCH_SIZE]
        mode: a tf.estimator.ModeKeys (batchnorm params update for TRAIN only)
        params: A dictionary (Typically derived from the FLAGS object.)
    Returns: tf.estimator.EstimatorSpec with props
        mode: same as mode arg
        predictions: dict of tensors
            'policy': [BATCH_SIZE, go.N * go.N + 1]
            'value': [BATCH_SIZE]
        loss: a single value tensor
        train_op: train op
        eval_metric_ops
    return dict of tensors
        logits: [BATCH_SIZE, go.N * go.N + 1]
    '''

    policy_output, value_output, logits = model_inference_fn(
        features, mode == tf.estimator.ModeKeys.TRAIN, params)

    # train ops
    policy_cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                   labels=tf.stop_gradient(
                                                       labels['pi_tensor'])))

    value_cost = params['value_cost_weight'] * tf.reduce_mean(
        tf.square(value_output - labels['value_tensor']))

    reg_vars = [
        v for v in tf.trainable_variables()
        if 'bias' not in v.name and 'beta' not in v.name
    ]
    l2_cost = params['l2_strength'] * \
        tf.add_n([tf.nn.l2_loss(v) for v in reg_vars])

    combined_cost = policy_cost + value_cost + l2_cost

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.piecewise_constant(global_step,
                                                params['lr_boundaries'],
                                                params['lr_rates'])
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # Insert quantization ops if requested
    if params['quantize']:
        if mode == tf.estimator.ModeKeys.TRAIN:
            tf.contrib.quantize.create_training_graph(
                quant_delay=params['quant_delay'])
        else:
            tf.contrib.quantize.create_eval_graph()

    optimizer = tf.train.MomentumOptimizer(learning_rate,
                                           params['sgd_momentum'])
    if params['use_tpu']:
        optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(combined_cost, global_step=global_step)

    # Computations to be executed on CPU, outside of the main TPU queues.
    def eval_metrics_host_call_fn(policy_output,
                                  value_output,
                                  pi_tensor,
                                  policy_cost,
                                  value_cost,
                                  l2_cost,
                                  combined_cost,
                                  step,
                                  est_mode=tf.estimator.ModeKeys.TRAIN):
        policy_entropy = -tf.reduce_mean(
            tf.reduce_sum(policy_output * tf.log(policy_output), axis=1))
        # pi_tensor is one_hot when generated from sgfs (for supervised learning)
        # and soft-max when using self-play records. argmax normalizes the two.
        policy_target_top_1 = tf.argmax(pi_tensor, axis=1)

        policy_output_in_top1 = tf.to_float(
            tf.nn.in_top_k(policy_output, policy_target_top_1, k=1))
        policy_output_in_top3 = tf.to_float(
            tf.nn.in_top_k(policy_output, policy_target_top_1, k=3))

        policy_top_1_confidence = tf.reduce_max(policy_output, axis=1)
        policy_target_top_1_confidence = tf.boolean_mask(
            policy_output,
            tf.one_hot(policy_target_top_1,
                       tf.shape(policy_output)[1]))

        with tf.variable_scope("metrics"):
            metric_ops = {
                'policy_cost':
                tf.metrics.mean(policy_cost),
                'value_cost':
                tf.metrics.mean(value_cost),
                'l2_cost':
                tf.metrics.mean(l2_cost),
                'policy_entropy':
                tf.metrics.mean(policy_entropy),
                'combined_cost':
                tf.metrics.mean(combined_cost),
                'policy_accuracy_top_1':
                tf.metrics.mean(policy_output_in_top1),
                'policy_accuracy_top_3':
                tf.metrics.mean(policy_output_in_top3),
                'policy_top_1_confidence':
                tf.metrics.mean(policy_top_1_confidence),
                'policy_target_top_1_confidence':
                tf.metrics.mean(policy_target_top_1_confidence),
                'value_confidence':
                tf.metrics.mean(tf.abs(value_output)),
            }

        if est_mode == tf.estimator.ModeKeys.EVAL:
            return metric_ops

        # NOTE: global_step is rounded to a multiple of FLAGS.summary_steps.
        eval_step = tf.reduce_min(step)

        # Create summary ops so that they show up in SUMMARIES collection
        # That way, they get logged automatically during training
        summary_writer = summary.create_file_writer(FLAGS.work_dir)
        with summary_writer.as_default(), \
                summary.record_summaries_every_n_global_steps(
                    params['summary_steps'], eval_step):
            for metric_name, metric_op in metric_ops.items():
                summary.scalar(metric_name, metric_op[1], step=eval_step)

        # Reset metrics occasionally so that they are mean of recent batches.
        reset_op = tf.variables_initializer(tf.local_variables("metrics"))
        cond_reset_op = tf.cond(
            tf.equal(eval_step % params['summary_steps'], tf.to_int64(1)),
            lambda: reset_op, lambda: tf.no_op())

        return summary.all_summary_ops() + [cond_reset_op]

    metric_args = [
        policy_output,
        value_output,
        labels['pi_tensor'],
        tf.reshape(policy_cost, [1]),
        tf.reshape(value_cost, [1]),
        tf.reshape(l2_cost, [1]),
        tf.reshape(combined_cost, [1]),
        tf.reshape(global_step, [1]),
    ]

    predictions = {
        'policy_output': policy_output,
        'value_output': value_output,
    }

    eval_metrics_only_fn = functools.partial(
        eval_metrics_host_call_fn, est_mode=tf.estimator.ModeKeys.EVAL)
    host_call_fn = functools.partial(eval_metrics_host_call_fn,
                                     est_mode=tf.estimator.ModeKeys.TRAIN)

    tpu_estimator_spec = tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=combined_cost,
        train_op=train_op,
        eval_metrics=(eval_metrics_only_fn, metric_args),
        host_call=(host_call_fn, metric_args))
    if params['use_tpu']:
        return tpu_estimator_spec
    else:
        return tpu_estimator_spec.as_estimator_spec()
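A small sketch, separate from the model_fn above: with the v2 softmax cross-entropy, gradients flow into the labels unless they are wrapped in tf.stop_gradient, which is why the soft pi_tensor targets above are wrapped (TF2 eager, toy values):

import tensorflow as tf

logits = tf.Variable([[2.0, 0.5, -1.0]])
labels = tf.constant([[0.7, 0.2, 0.1]])        # soft targets, e.g. visit counts
with tf.GradientTape() as tape:
    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.stop_gradient(labels), logits=logits)
print(tape.gradient(loss, logits).numpy())     # softmax(logits) - labels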
Beispiel #52
0
# Selective softmax
extractor = np.zeros((action_space_dimension, asize), dtype=np.float32)
for i, a in enumerate(actionset):
    extractor[a, i] = 1.0
adaptor = np.transpose(extractor)
compact = tf.tensordot(raw_pi, extractor, [[2], [0]])
compact_softmax = tf.nn.softmax(compact)
softmax_policy = tf.tensordot(compact_softmax, adaptor, [[2], [0]])

# build loss
flattened_value = tf.reshape(raw_value, [-1])
policy = tf.multiply(softmax_policy, action_tensor)
policy = tf.reduce_sum(policy, axis=[1, 2])
log_policy = tf.log(tf.clip_by_value(policy, 1e-20, 1.0))
criticism = V_tensor - flattened_value
policy_per_sample = log_policy * tf.stop_gradient(criticism)
policy_loss = tf.reduce_sum(-policy_per_sample)
value_loss = tf.nn.l2_loss(criticism)

loss = policy_loss + value_loss
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
grad_op = optimizer.compute_gradients(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    dic = {
        action_tensor: actions_to_adist_array([3], action_space_dimension),
        V_tensor: [10.0],
    }
    grad = sess.run(grad_op, feed_dict=dic)
    print("grad 1\n{}".format(grad))
Beispiel #53
0
def NN_hidden(currentstate):
    hidden = tf.nn.relu(tf.matmul(currentstate, w['L1']))
    out = tf.matmul(hidden, w['L2'])
    return out


# Bellman equation
# G_O is 0 if the game is over, 1 otherwise
G_O = rew + 1
# predictions
# next state
Qnext = tf.reshape(tf.reduce_max(NN_hidden(posteriorstate)), [-1, 1])
Qnext = tf.multiply(G_O, Qnext)  # 0 if G_O is 0
# current state
Q = tf.reshape(tf.gather_nd(NN_hidden(currentstate), action), [-1, 1])
delt = rew + (discount_factor * tf.stop_gradient(Qnext)) - Q
# LOSS/TRAINING DEFINITIONS
loss = tf.multiply(0.50, tf.reduce_mean(tf.square(delt)), name="loss")
# RMSprop parameters
rmsprop_dec = 0.937  # RMSprop decay
rmsprop_mom = 0.52  # RMSprop momentum
# RMSprop optimisation
train = tf.train.RMSPropOptimizer(learning_rate=learningrate,
                                  momentum=rmsprop_mom,
                                  decay=rmsprop_dec).minimize(loss)
prediction = tf.argmax(NN_hidden(currentstate), axis=1)
# variable initialisation
init = tf.global_variables_initializer()
# saving functionality
saver = tf.train.Saver()
# TRAINING
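# The snippet above assumes several names defined elsewhere; a minimal,
# hypothetical set of definitions under which the graph builds (shapes and
# hyperparameters are placeholders, not from the source):
import tensorflow as tf

state_dim, n_hidden, n_actions = 8, 64, 4
learningrate = 1e-3
discount_factor = 0.99

currentstate = tf.placeholder(tf.float32, [None, state_dim])
posteriorstate = tf.placeholder(tf.float32, [None, state_dim])
rew = tf.placeholder(tf.float32, [None, 1])
action = tf.placeholder(tf.int32, [None, 2])  # [row, action] pairs for gather_nd

w = {
    'L1': tf.Variable(tf.truncated_normal([state_dim, n_hidden], stddev=0.1)),
    'L2': tf.Variable(tf.truncated_normal([n_hidden, n_actions], stddev=0.1)),
}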
Example #54
0
    def __init__(self, action_size, img_h, img_w, n_channels, c1, epochs, batch_size):

        self.epochs = epochs
        self.batch_size = batch_size

        self.regularizer = None #tf.contrib.layers.l2_regularizer(scale=0.001)
        self.initializer = None

        # counters for writing summaries to TensorBoard
        self.i = 0 # overall training
        self.update_r = 0 # reward

        self.action_size = action_size

        self.sess = tf.Session()

        with tf.variable_scope("model"):

            # Placeholders Model
            self.o_t = tf.placeholder(shape=[None, img_h, img_w, n_channels], dtype=tf.float32)
            #self.o_t = self.o_t / 255.

            # Placeholders PPO
            self.action = tf.placeholder(shape=[self.batch_size], dtype=tf.int32)
            self.V_targ = tf.placeholder(shape=[self.batch_size], dtype=tf.float32)
            self.advantage = tf.placeholder(shape=[self.batch_size], dtype=tf.float32)

            # Placeholders summaries
            self.reward = tf.placeholder(shape=(), dtype=tf.float32)

            # Placeholders for Training
            self.lr = tf.placeholder(shape=(), dtype=tf.float32)
            self.lr_v = tf.placeholder(shape=(), dtype=tf.float32)
            self.epsilon = tf.placeholder(shape=(), dtype=tf.float32)
            self.c2 = tf.placeholder(shape=(), dtype=tf.float32)

            # constants
            self.n = tf.constant(self.action_size, dtype=tf.float32)
            self.c1 = tf.constant(c1)
            self.pi_greco = tf.constant(math.pi)

            # Define models

            self.V, self.pi = self.build_model("new")
            _, self.pi_old = self.build_model("old")

            # Compute Probability of the action taken in log space

            self.action_taken_one_hot = tf.one_hot(self.action, self.action_size)

            self.pi_sampled_log = tf.log(tf.reduce_sum(self.pi * self.action_taken_one_hot, -1) + 1e-5)
            self.pi_old_sampled_log = tf.log(tf.reduce_sum(self.pi_old * self.action_taken_one_hot, -1) + 1e-5)

            # PPO Loss

            self.ratio = tf.exp(self.pi_sampled_log - tf.stop_gradient(self.pi_old_sampled_log))
            self.sur1 = tf.multiply(self.ratio, self.advantage)
            self.sur2 = tf.multiply(tf.clip_by_value(self.ratio, 1.0 - self.epsilon, 1.0 + self.epsilon), self.advantage)
            self.L_CLIP = tf.reduce_mean(tf.minimum(self.sur1, self.sur2))

            self.L_V = 0.5 * tf.reduce_mean(tf.squared_difference(self.V_targ, self.V))

            self.entropy = - tf.reduce_sum(self.pi * tf.log(self.pi))

            self.loss = - self.L_CLIP + self.c1 * self.L_V - self.c2 * self.entropy

            # Training summaries

            self.s_pi = tf.summary.scalar('pi', tf.reduce_mean(tf.exp(self.pi_sampled_log)))
            self.s_ratio = tf.summary.scalar('Ratio', tf.reduce_mean(self.ratio))
            self.s_v = tf.summary.scalar('Loss_V', self.L_V)
            self.s_c = tf.summary.scalar('Loss_CLIP', -self.L_CLIP)
            self.s_e = tf.summary.scalar('Loss_entropy', -self.entropy)

            self.merge = tf.summary.merge([self.s_pi, self.s_ratio, self.s_v, self.s_c, self.s_e])

            self.s_r = tf.summary.scalar('Reward', self.reward)

            # Optimization steps

            self.optimizer = tf.train.AdamOptimizer(self.lr)
            self.train_ppo = self.optimizer.minimize(self.loss)
            self.optimizer_v = tf.train.AdamOptimizer(self.lr_v)
            self.train_ppo_v = self.optimizer_v.minimize(self.L_V)

        with tf.variable_scope("assign"):

            self.assign_arr = []
            self.col_dict = {}
            self.col1 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/new')
            for i in range(len(self.col1)):
                self.col_dict[self.col1[i].name.split('/')[-2] + "/" + self.col1[i].name.split('/')[-1]] = self.col1[i]

            self.col2 = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/old')
            for i in range(len(self.col2)):
                self.node_name = self.col2[i].name.split('/')[-2] + "/" + self.col2[i].name.split('/')[-1]
                self.assign0 = self.col2[i].assign(self.col_dict[self.node_name])
                self.assign_arr.append(self.assign0)

        self.init = tf.global_variables_initializer()
        self.sess.run(self.init)

        self.train_writer = tf.summary.FileWriter('train/', self.sess.graph)
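# Usage sketch (assumptions: `agent` is an instance of the class above, the
# arrays match the declared placeholder shapes, and the 84x84x4 image size is
# hypothetical):
import numpy as np

agent.sess.run(agent.assign_arr)  # sync pi_old <- pi before the PPO update

feed = {
    agent.o_t: np.zeros((agent.batch_size, 84, 84, 4), np.float32),
    agent.action: np.zeros((agent.batch_size,), np.int32),
    agent.V_targ: np.zeros((agent.batch_size,), np.float32),
    agent.advantage: np.zeros((agent.batch_size,), np.float32),
    agent.lr: 2.5e-4,
    agent.epsilon: 0.1,
    agent.c2: 0.01,
}
loss, _, summ = agent.sess.run([agent.loss, agent.train_ppo, agent.merge], feed)
agent.train_writer.add_summary(summ, agent.i)
agent.i += 1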
Example #55
0
    def _build_critic(self, tensors, v_bonus, q_bonus, output_dim=1, hidden_dim=64, total_loss=False):
        """
        :param tensors: [obs_ph, action_ph, next_obs_ph, terminal_ph, action_pi, ensemble_mask]
        :param v_bonus: is used to compute v_backup (for reward value: v_bonus=0, kl_value: v_bonus=-kl)
        :param q_bonus: is used to compute q_backup (for reward_value: q_bonus=r, kl_value: q_bonus=0)
        :param output_dim: integer
        :return: critic_v, critic_q, mean(q_losses), qs_pi, critic_train_op, target_update_op
        """
        obs_ph, action_ph, next_obs_ph, terminal_ph, action_pi, ensemble_mask = tensors

        # Define V and Q networks
        critic_v = VNetwork(output_dim, hidden_dim=hidden_dim)
        critic_q = QNetwork(output_dim, self.num_critics, hidden_dim=hidden_dim)
        critic_v_target = VNetwork(output_dim, hidden_dim=hidden_dim)

        # Critic training (V, Q)
        qs_pi = critic_q([obs_ph, action_pi])
        v = critic_v([obs_ph])
        qs = critic_q([obs_ph, action_ph])

        v_backup = tf.stop_gradient(self._reduce_q(qs_pi) + v_bonus)
        v_loss = tf.losses.mean_squared_error(v_backup, v)
        if total_loss and output_dim != 1:
            print('total_loss_added')
            v_loss = v_loss * output_dim + tf.losses.mean_squared_error(tf.reduce_sum(v_backup, axis=-1, keepdims=True), tf.reduce_sum(v, axis=-1, keepdims=True))

        # Gradient penalty (V)
        if self.gradient_norm_panelty > 0:
            v_grad_obs = tf.gradients(v, [obs_ph])[0]  # do not average, sum by the output dimension
            v_grad_norm = tf.sqrt(tf.reduce_sum(tf.square(v_grad_obs), axis=1) + 1e-8)
            v_grad_panelty_loss = tf.reduce_mean(tf.maximum(v_grad_norm - self.gradient_norm_limit * np.sqrt(self.state_dim), 0) ** 2)
            v_loss += self.gradient_norm_panelty * v_grad_panelty_loss

        value_target = critic_v_target([next_obs_ph])
        q_backup = tf.stop_gradient((1 - terminal_ph) * self.gamma * value_target + q_bonus)  # batch x 1
        q_losses = [tf.losses.mean_squared_error(q_backup, qs[k], weights=ensemble_mask[:, k:k+1]) for k in range(self.num_critics)]
        if total_loss and output_dim != 1:
            for k in range(self.num_critics):
                q_losses[k] = q_losses[k] * output_dim + tf.losses.mean_squared_error(
                    tf.reduce_sum(q_backup, axis=-1, keepdims=True), tf.reduce_sum(qs[k], axis=-1, keepdims=True), weights=ensemble_mask[:, k:k+1])

        # Gradient penalty (Q)
        if self.gradient_norm_panelty > 0:
            qs_grad_obs_action = [tf.concat(tf.gradients(q, [obs_ph, action_ph]), axis=-1) for q in qs]  # do not average, sum by the output dimension
            qs_grad_norm = [tf.sqrt(tf.reduce_sum(tf.square(q_grad_obs_action), axis=1) + 1e-8) for q_grad_obs_action in qs_grad_obs_action]
            qs_grad_panelty_loss = [tf.reduce_mean(tf.maximum(q_grad_norm - self.gradient_norm_limit * np.sqrt(self.state_dim + self.action_dim), 0) ** 2) for q_grad_norm in qs_grad_norm]
            for i, q_grad_panelty_loss in enumerate(qs_grad_panelty_loss):
                q_losses[i] += self.gradient_norm_panelty * q_grad_panelty_loss

        value_loss = v_loss + tf.reduce_sum(q_losses)
        critic_optimizer = tf.train.AdamOptimizer(self.learning_rate)
        critic_train_op = critic_optimizer.minimize(value_loss, var_list=critic_v.trainable_variables + critic_q.trainable_variables)

        with tf.control_dependencies([critic_train_op]):
            # Update target network
            source_params = critic_v.trainable_variables
            target_params = critic_v_target.trainable_variables
            target_update_op = [
                tf.assign(target, (1 - self.tau) * target + self.tau * source)
                for target, source in zip(target_params, source_params)
            ]

        # Copy weights to target networks
        self.sess.run(tf.variables_initializer(critic_optimizer.variables()))
        critic_v_target.set_weights(critic_v.get_weights())

        return critic_v, critic_q, tf.reduce_mean(q_losses), qs_pi, critic_train_op, target_update_op
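# `self._reduce_q` is defined outside this snippet. A common choice, and a
# plausible sketch of it (an assumption, not the original code), is the
# clipped-double-Q style minimum over the critic ensemble:
import tensorflow as tf

def _reduce_q(self, qs):
    # qs: list of [batch, output_dim] tensors, one per critic
    return tf.reduce_min(tf.stack(qs, axis=0), axis=0)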
Example #56
0
def build_train(make_obs_ph,
                q_func,
                num_actions,
                optimizer,
                grad_norm_clipping=None,
                gamma=1.0,
                double_q=True,
                scope="deepq",
                reuse=None,
                param_noise=False,
                param_noise_filter_func=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    param_noise_filter_func: tf.Variable -> bool
        function that decides whether or not a variable should be perturbed. Only applicable
        if param_noise is True. If set to None, default_param_noise_filter is used by default.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation.
        See the top of the file for details.
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation.
        See the top of the file for details.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
        See the top of the file for details.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    if param_noise:
        act_f = build_act_with_param_noise(
            make_obs_ph,
            q_func,
            num_actions,
            scope=scope,
            reuse=reuse,
            param_noise_filter_func=param_noise_filter_func)
    else:
        act_f = build_act(make_obs_ph,
                          q_func,
                          num_actions,
                          scope=scope,
                          reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = make_obs_ph("obs_t")
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = make_obs_ph("obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None],
                                               name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(),
                     num_actions,
                     scope="q_func",
                     reuse=True)  # reuse parameters from act
        q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                        scope=tf.get_variable_scope().name +
                                        "/q_func")

        # target q network evaluation
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES,
            scope=tf.get_variable_scope().name + "/target_q_func")

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions),
                                     1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            q_tp1_using_online_net = q_func(obs_tp1_input.get(),
                                            num_actions,
                                            scope="q_func",
                                            reuse=True)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
                1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)

        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            gradients = optimizer.compute_gradients(weighted_error,
                                                    var_list=q_func_vars)
            for i, (grad, var) in enumerate(gradients):
                if grad is not None:
                    gradients[i] = (tf.clip_by_norm(grad,
                                                    grad_norm_clipping), var)
            optimize_expr = optimizer.apply_gradients(gradients)
        else:
            optimize_expr = optimizer.minimize(weighted_error,
                                               var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(inputs=[
            obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph,
            importance_weights_ph
        ],
                           outputs=td_error,
                           updates=[optimize_expr])
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}
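# Usage sketch for build_train, assuming a gym environment and the OpenAI
# baselines helpers (mlp, ObservationInput); everything outside the function
# above is hypothetical glue.
import gym
import tensorflow as tf
from baselines.deepq.models import mlp
from baselines.deepq.utils import ObservationInput

env = gym.make('CartPole-v0')
act, train, update_target, debug = build_train(
    make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
    q_func=mlp([64]),
    num_actions=env.action_space.n,
    optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
    grad_norm_clipping=10,
    gamma=0.99,
    double_q=True)

sess = tf.Session()
with sess.as_default():
    sess.run(tf.global_variables_initializer())
    update_target()  # initialise the target network from the online network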
Example #57
0
    def build_model(self):
        with tf.variable_scope('Model', reuse=tf.AUTO_REUSE):
            with tf.name_scope('Inputs'):
                # Model Feeds
                self.ratings = tf.placeholder(dtype=tf.float32,
                                              shape=[None, self.num_item],
                                              name='ratings')
                self.uid = tf.placeholder(dtype=tf.int32,
                                          shape=[None],
                                          name='user_id')
                self.istraining = tf.placeholder(dtype=tf.bool,
                                                 shape=[],
                                                 name='training_flag')
                self.layer1_dropout_rate = tf.placeholder(
                    dtype=tf.float32, shape=[], name='layer1_dropout_rate')

            #########################################################################################################
            with tf.name_scope('Variables'):
                input = self.ratings

                # Encoder Variables
                self.layer1_w = tf.get_variable(
                    name='encoder_weights',
                    shape=[self.num_item, self.num_factors],
                    initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.01))

                self.layer1_b = tf.get_variable(
                    name='encoder_bias',
                    shape=[self.num_factors],
                    initializer=tf.zeros_initializer())
                if self.is_user_node:
                    self.user_embedding = tf.get_variable(
                        name='user_embedding',
                        shape=[self.num_user, self.num_factors],
                        initializer=tf.truncated_normal_initializer(
                            mean=0.0, stddev=0.01),
                        dtype=tf.float32)  # (users, embedding_size)

                # Decoder Variables
                self.layer2_w1 = tf.get_variable(
                    name='decoder_weights',
                    shape=[self.num_factors, self.num_item],
                    initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.01))

                layer2_w2 = tf.get_variable(
                    name='decoder_concat',
                    shape=[self.num_noise_factor, self.num_item],
                    initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.01))

                self.layer2_b = tf.get_variable(
                    name='decoder_bias',
                    shape=[self.num_item],
                    initializer=tf.zeros_initializer())

                # Noise Variables
                item_w1_noise = tf.get_variable(
                    name='item_w1_noise',
                    shape=[self.num_item, self.num_factors],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)
                if self.is_user_node:
                    user_w_noise = tf.get_variable(
                        name='user_w_noise',
                        shape=[self.num_user, self.num_factors],
                        initializer=tf.zeros_initializer(),
                        dtype=tf.float32,
                        trainable=False)

                item_w2_noise = tf.get_variable(
                    name='item_w2_noise',
                    shape=[self.num_factors, self.num_item],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)

                hidden_noise_tr = tf.get_variable(
                    name='hidden_noise_tr',
                    shape=[self.batch_size, self.num_factors],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)

                hidden_noise_eval = tf.get_variable(
                    name='hidden_noise_eval',
                    shape=[self.num_user, self.num_factors],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)

                noise_vector_tr = tf.get_variable(
                    name='encoder_noise_tr',
                    shape=[self.batch_size, self.num_noise_factor],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)

                noise_vector_eval = tf.get_variable(
                    name='encoder_noise_eval',
                    shape=[self.num_user, self.num_noise_factor],
                    initializer=tf.zeros_initializer(),
                    dtype=tf.float32,
                    trainable=False)

            #########################################################################################################
            with tf.name_scope('Original_AE'):
                ############# Original AE Model
                org_w1, org_w2 = self.layer1_w, self.layer2_w1

                if self.robust_test:
                    if self.noise_pos == 'W1':
                        org_w1 += item_w1_noise
                    elif self.noise_pos == 'W2':
                        org_w2 += item_w2_noise
                    elif self.noise_pos == 'USER':
                        self.user_embedding += user_w_noise

                if self.is_user_node:
                    user_node = tf.nn.embedding_lookup(self.user_embedding,
                                                       self.uid)
                    org_encoder = tf.sigmoid(
                        tf.matmul(input, org_w1) + self.layer1_b + user_node)
                    # org_encoder = tf.identity(tf.matmul(input, org_w1) + self.layer1_b + user_node)
                else:
                    org_encoder = tf.sigmoid(
                        tf.matmul(input, org_w1) + self.layer1_b)
                    # org_encoder = tf.identity(tf.matmul(input, org_w1) + self.layer1_b)

                if self.robust_test and self.noise_pos == 'HID':
                    org_encoder += hidden_noise_eval

                org_encoder = tf.cond(
                    self.istraining,
                    lambda: tf.layers.dropout(org_encoder,
                                              rate=self.layer1_dropout_rate,
                                              name='layer1_dropout'),
                    lambda: org_encoder)

                org_decoder = tf.identity(
                    tf.matmul(org_encoder, org_w2) + self.layer2_b)

                self.org_output = org_decoder

                org_base_loss = tf.reduce_sum(
                    tf.nn.sigmoid_cross_entropy_with_logits(
                        labels=input, logits=self.org_output))
                # org_base_loss = tf.nn.l2_loss(self.org_output - input)
                org_base_loss = org_base_loss / tf.cast(
                    tf.shape(input)[0], dtype=org_base_loss.dtype)

            #########################################################################################################
            ###### The Noisy Auto-Encoder
            if self.adv_training:
                if self.noise_pos == 'CON':
                    # ConCat Noise AE
                    with tf.name_scope("ConCat_AE"):

                        if self.is_user_node:
                            user_node = tf.nn.embedding_lookup(
                                self.user_embedding, self.uid)
                            concat_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b + user_node)
                        else:
                            concat_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b)

                        concat_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.concat(
                                [concat_noise_encoder, noise_vector_tr],
                                axis=1), lambda: tf.concat(
                                    [concat_noise_encoder, noise_vector_eval],
                                    axis=1))

                        concat_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.layers.dropout(
                                concat_noise_encoder,
                                rate=self.layer1_dropout_rate,
                                name='layer1_dropout'),
                            lambda: concat_noise_encoder)

                        concat_w2 = tf.concat([self.layer2_w1, layer2_w2],
                                              axis=0)
                        # out_vector = tf.sigmoid(tf.matmul(concat_noise_encoder, layer2_concat_w) + layer2_b)
                        concat_noise_decoder = tf.identity(
                            tf.matmul(concat_noise_encoder, concat_w2) +
                            self.layer2_b)

                        # Output
                        self.concat_noise_output = concat_noise_decoder
                        # Noisy Model Loss
                        concat_noise_base_loss = tf.reduce_sum(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                labels=input, logits=self.concat_noise_output))
                        # concat_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.concat_noise_output) - input)
                        concat_noise_base_loss = concat_noise_base_loss / tf.cast(
                            tf.shape(input)[0],
                            dtype=concat_noise_base_loss.dtype)

                if self.noise_pos == 'W1' or self.noise_pos == 'W1W2':
                    with tf.name_scope("W1_AE"):
                        w1_noise_w1 = self.layer1_w + item_w1_noise

                        if self.is_user_node:
                            user_node = tf.nn.embedding_lookup(
                                self.user_embedding, self.uid)
                            w1_noise_encoder = tf.sigmoid(
                                tf.matmul(input, w1_noise_w1) + self.layer1_b +
                                user_node)
                            # w1_noise_encoder = tf.identity(tf.matmul(input, w1_noise_w1) + self.layer1_b + user_node)
                        else:
                            w1_noise_encoder = tf.sigmoid(
                                tf.matmul(input, w1_noise_w1) + self.layer1_b)
                            # w1_noise_encoder = tf.identity(tf.matmul(input, w1_noise_w1) + self.layer1_b)

                        w1_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.layers.dropout(
                                w1_noise_encoder,
                                rate=self.layer1_dropout_rate,
                                name='layer1_dropout'),
                            lambda: w1_noise_encoder)

                        w1_noise_decoder = tf.identity(
                            tf.matmul(w1_noise_encoder, self.layer2_w1) +
                            self.layer2_b)

                        # Output
                        self.w1_noise_output = w1_noise_decoder

                        # Noisy Model Loss
                        w1_noise_base_loss = tf.reduce_sum(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                labels=input, logits=self.w1_noise_output))
                        # w1_noise_base_loss = tf.nn.l2_loss(self.w1_noise_output - input)
                        w1_noise_base_loss = w1_noise_base_loss / tf.cast(
                            tf.shape(input)[0], dtype=w1_noise_base_loss.dtype)

                if self.noise_pos == 'W2' or self.noise_pos == 'W1W2':
                    with tf.name_scope("W2_AE"):
                        w2_noise_w2 = self.layer2_w1 + item_w2_noise

                        if self.is_user_node:
                            user_node = tf.nn.embedding_lookup(
                                self.user_embedding, self.uid)
                            w2_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b + user_node)
                            # w2_noise_encoder = tf.identity(tf.matmul(input, self.layer1_w) + self.layer1_b + user_node)
                        else:
                            w2_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b)
                            # w2_noise_encoder = tf.identity(tf.matmul(input, self.layer1_w) + self.layer1_b)

                        w2_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.layers.dropout(
                                w2_noise_encoder,
                                rate=self.layer1_dropout_rate,
                                name='layer1_dropout'),
                            lambda: w2_noise_encoder)

                        w2_noise_decoder = tf.identity(
                            tf.matmul(w2_noise_encoder, w2_noise_w2) +
                            self.layer2_b)

                        # Output
                        self.w2_noise_output = w2_noise_decoder

                        # Noisy Model Loss
                        w2_noise_base_loss = tf.reduce_sum(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                labels=input, logits=self.w2_noise_output))
                        # w2_noise_base_loss = tf.nn.l2_loss(self.w2_noise_output - input)
                        w2_noise_base_loss = w2_noise_base_loss / tf.cast(
                            tf.shape(input)[0], dtype=w2_noise_base_loss.dtype)

                if self.noise_pos == 'USER':
                    with tf.name_scope("USER_AE"):
                        self.user_embedding += user_w_noise
                        user_node = tf.nn.embedding_lookup(
                            self.user_embedding, self.uid)
                        user_noise_encoder = tf.sigmoid(
                            tf.matmul(input, self.layer1_w) + self.layer1_b +
                            user_node)

                        user_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.layers.dropout(
                                user_noise_encoder,
                                rate=self.layer1_dropout_rate,
                                name='layer1_dropout'),
                            lambda: user_noise_encoder)

                        user_noise_decoder = tf.identity(
                            tf.matmul(user_noise_encoder, self.layer2_w1) +
                            self.layer2_b)

                        # Output
                        self.user_noise_output = user_noise_decoder

                        # Noisy Model Loss
                        user_noise_base_loss = tf.reduce_sum(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                labels=input, logits=self.user_noise_output))
                        # weight_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.weight_noise_output) - input)
                        user_noise_base_loss = user_noise_base_loss / tf.cast(
                            tf.shape(input)[0],
                            dtype=user_noise_base_loss.dtype)

                if self.noise_pos == 'HID':
                    with tf.name_scope("Hidden_AE"):
                        if self.is_user_node:
                            user_node = tf.nn.embedding_lookup(
                                self.user_embedding, self.uid)
                            hidden_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b + user_node) + hidden_noise_tr
                        else:
                            hidden_noise_encoder = tf.sigmoid(
                                tf.matmul(input, self.layer1_w) +
                                self.layer1_b) + hidden_noise_tr

                        hidden_noise_encoder = tf.cond(
                            self.istraining, lambda: tf.layers.dropout(
                                hidden_noise_encoder,
                                rate=self.layer1_dropout_rate,
                                name='layer1_dropout'),
                            lambda: hidden_noise_encoder)

                        hidden_noise_decoder = tf.identity(
                            tf.matmul(hidden_noise_encoder, self.layer2_w1) +
                            self.layer2_b)

                        # Output
                        self.hidden_noise_output = hidden_noise_decoder

                        hidden_noise_base_loss = tf.reduce_sum(
                            tf.nn.sigmoid_cross_entropy_with_logits(
                                labels=input, logits=self.hidden_noise_output))
                        # hidden_noise_base_loss = tf.nn.l2_loss(tf.sigmoid(self.hidden_noise_output) - input)
                        hidden_noise_base_loss = hidden_noise_base_loss / tf.cast(
                            tf.shape(input)[0],
                            dtype=hidden_noise_base_loss.dtype)

            ############# Final Outputs
            with tf.name_scope('Prediction'):
                # self.mixed_output = (1-self.output_mix_ratio) * self.org_output + self.output_mix_ratio * self.noisy_output
                self.pred_y = tf.sigmoid(self.org_output)
                # self.pred_y = self.org_output
                # self.pred_y = tf.sigmoid(self.mixed_output)

            ############# Overall Losses
            with tf.name_scope('Loss'):
                if self.adv_training:
                    if self.noise_pos == 'W1':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio_W1 * w1_noise_base_loss
                        reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \
                                    self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                    self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \
                                    self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                    if self.noise_pos == 'W2':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * w2_noise_base_loss
                        reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                                    self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                    self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \
                                    self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                    if self.noise_pos == 'HID':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * hidden_noise_base_loss
                        reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                                    self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                    self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \
                                    self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                    if self.noise_pos == 'USER':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * user_noise_base_loss
                        reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                                    self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                    self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \
                                    self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                    if self.noise_pos == 'CON':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio * concat_noise_base_loss
                        reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                                    self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                    self.ae_regs[2] * tf.nn.l2_loss(concat_w2) + \
                                    self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                    if self.noise_pos == 'W1W2':
                        base_loss = self.org_loss_ratio * org_base_loss + self.noise_loss_ratio_W1 * w1_noise_base_loss + self.noise_loss_ratio * w2_noise_base_loss
                        if self.noise_loss_ratio_W1 != 0 and self.noise_loss_ratio != 0:
                            reg_loss = self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \
                                       self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                       self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \
                                       self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                        elif self.noise_loss_ratio_W1 == 0:  #Noise on W2 only
                            reg_loss =  self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                                        self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                        self.ae_regs[2] * tf.nn.l2_loss(w2_noise_w2) + \
                                        self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                        elif self.noise_loss_ratio == 0:  #Noise on W1 only
                            reg_loss = self.ae_regs[0] * tf.nn.l2_loss(w1_noise_w1) + \
                                        self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                                        self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \
                                        self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                else:
                    base_loss = org_base_loss
                    reg_loss = self.ae_regs[0] * tf.nn.l2_loss(self.layer1_w) + \
                               self.ae_regs[1] * tf.nn.l2_loss(self.layer1_b) + \
                               self.ae_regs[2] * tf.nn.l2_loss(self.layer2_w1) + \
                               self.ae_regs[3] * tf.nn.l2_loss(self.layer2_b)

                if self.is_user_node:
                    reg_loss += self.user_node_regs * tf.nn.l2_loss(
                        self.user_embedding)

                self.loss = base_loss + reg_loss

            ############# Optimizer
            with tf.name_scope('Optimizer'):
                self.opt = tf.train.GradientDescentOptimizer(self.lr).minimize(
                    self.loss)
                # self.opt = tf.train.AdagradOptimizer(self.lr).minimize(self.loss)
            ########### Robustness Testing (Random or Adversarial)
            with tf.name_scope('Noise_Adding'):
                if self.adv_training or self.robust_test:
                    if self.noise_type == 'random':
                        if self.noise_pos == 'W1':
                            random_noise = tf.random_normal(
                                shape=tf.shape(org_w1),
                                mean=tf.reduce_mean(org_w1),
                                stddev=0.01)
                            self.update_delta = item_w1_noise.assign(
                                self.eps * random_noise /
                                tf.norm(random_noise))
                        if self.noise_pos == 'W2':
                            random_noise = tf.random_normal(
                                shape=tf.shape(org_w2),
                                mean=tf.reduce_mean(org_w2),
                                stddev=0.01)
                            self.update_delta = item_w2_noise.assign(
                                self.eps * random_noise /
                                tf.norm(random_noise))
                        if self.noise_pos == 'USER':
                            random_noise = tf.random_normal(
                                shape=tf.shape(self.user_embedding),
                                mean=tf.reduce_mean(self.user_embedding),
                                stddev=0.01)
                            self.update_delta = user_w_noise.assign(
                                self.eps * random_noise /
                                tf.norm(random_noise))
                        if self.noise_pos == 'HID':
                            random_noise = tf.random_normal(
                                shape=tf.shape(org_encoder),
                                mean=tf.reduce_mean(org_encoder),
                                stddev=0.01)
                            if self.robust_test:
                                self.update_delta = hidden_noise_eval.assign(
                                    self.eps * random_noise /
                                    tf.norm(random_noise))
                            else:
                                self.update_delta = hidden_noise_tr.assign(
                                    self.eps * random_noise /
                                    tf.norm(random_noise))
                    if self.noise_type == 'adv':
                        if self.noise_pos == 'W1':
                            if self.robust_test:
                                self.grad_delta = tf.gradients(
                                    ys=org_base_loss, xs=item_w1_noise)[0]
                            else:
                                self.grad_delta = tf.gradients(
                                    ys=base_loss, xs=item_w1_noise)[0]
                            self.grad_delta_dense = tf.stop_gradient(
                                self.grad_delta)
                            self.update_delta = item_w1_noise.assign(
                                self.eps * self.grad_delta_dense /
                                tf.norm(self.grad_delta_dense))
                        if self.noise_pos == 'W2':
                            if self.robust_test:
                                self.grad_delta = tf.gradients(
                                    ys=org_base_loss, xs=item_w2_noise)[0]
                            else:
                                self.grad_delta = tf.gradients(
                                    ys=base_loss, xs=item_w2_noise)[0]
                            self.grad_delta_dense = tf.stop_gradient(
                                self.grad_delta)
                            self.update_delta = item_w2_noise.assign(
                                self.eps * self.grad_delta_dense /
                                tf.norm(self.grad_delta_dense))
                        if self.noise_pos == 'USER':
                            if self.robust_test:
                                self.grad_delta = tf.gradients(
                                    ys=org_base_loss, xs=user_w_noise)[0]
                            else:
                                self.grad_delta = tf.gradients(
                                    ys=base_loss, xs=user_w_noise)[0]
                            self.grad_delta_dense = tf.stop_gradient(
                                self.grad_delta)
                            self.update_delta = user_w_noise.assign(
                                self.eps * self.grad_delta_dense /
                                tf.norm(self.grad_delta_dense))
                        if self.noise_pos == 'HID':
                            if self.robust_test:
                                self.grad_delta = tf.gradients(
                                    ys=org_base_loss, xs=hidden_noise_eval)[0]
                                self.grad_delta_dense = tf.stop_gradient(
                                    self.grad_delta)
                                self.update_delta = hidden_noise_eval.assign(
                                    self.eps * self.grad_delta_dense /
                                    tf.norm(self.grad_delta_dense))
                            else:
                                self.grad_delta = tf.gradients(
                                    ys=base_loss, xs=hidden_noise_tr)[0]
                                self.grad_delta_dense = tf.stop_gradient(
                                    self.grad_delta)
                                self.update_delta = hidden_noise_tr.assign(
                                    self.eps * self.grad_delta_dense /
                                    tf.norm(self.grad_delta_dense))
                        if self.noise_pos == 'W1W2':
                            if self.robust_test:
                                if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0:
                                    self.grad_delta1 = tf.gradients(
                                        ys=org_base_loss, xs=item_w1_noise)[0]
                                    self.grad_delta2 = tf.gradients(
                                        ys=org_base_loss, xs=item_w2_noise)[0]
                                elif self.noise_loss_ratio_W1 == 0:
                                    self.grad_delta2 = tf.gradients(
                                        ys=org_base_loss, xs=item_w2_noise)[0]
                                elif self.noise_loss_ratio == 0:
                                    self.grad_delta1 = tf.gradients(
                                        ys=org_base_loss, xs=item_w1_noise)[0]

                            else:
                                if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0:
                                    self.grad_delta1 = tf.gradients(
                                        ys=base_loss, xs=item_w1_noise)[0]
                                    self.grad_delta2 = tf.gradients(
                                        ys=base_loss, xs=item_w2_noise)[0]
                                elif self.noise_loss_ratio_W1 == 0:
                                    self.grad_delta2 = tf.gradients(
                                        ys=base_loss, xs=item_w2_noise)[0]
                                elif self.noise_loss_ratio == 0:
                                    self.grad_delta1 = tf.gradients(
                                        ys=base_loss, xs=item_w1_noise)[0]

                            if self.noise_loss_ratio != 0 and self.noise_loss_ratio_W1 != 0:
                                self.grad_delta_dense1 = tf.stop_gradient(
                                    self.grad_delta1)
                                self.grad_delta_dense2 = tf.stop_gradient(
                                    self.grad_delta2)
                                self.update_delta1 = item_w1_noise.assign(
                                    self.eps * self.grad_delta_dense1 /
                                    tf.norm(self.grad_delta_dense1))
                                self.update_delta2 = item_w2_noise.assign(
                                    self.eps * self.grad_delta_dense2 /
                                    tf.norm(self.grad_delta_dense2))
                                # Evaluating this sum runs both assign ops.
                                self.update_delta = self.update_delta1 + tf.transpose(
                                    self.update_delta2)

                            elif self.noise_loss_ratio_W1 == 0:
                                self.grad_delta_dense2 = tf.stop_gradient(
                                    self.grad_delta2)
                                self.update_delta2 = item_w2_noise.assign(
                                    self.eps * self.grad_delta_dense2 /
                                    tf.norm(self.grad_delta_dense2))
                                self.update_delta = self.update_delta2

                            elif self.noise_loss_ratio == 0:
                                self.grad_delta_dense1 = tf.stop_gradient(
                                    self.grad_delta1)
                                self.update_delta1 = item_w1_noise.assign(
                                    self.eps * self.grad_delta_dense1 /
                                    tf.norm(self.grad_delta_dense1))
                                self.update_delta = self.update_delta1

            print('Model Building Completed.')
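# A training-loop sketch suggested by the graph above (the session, data
# iterator and feed values are hypothetical): when adversarial training is on,
# the noise variables are refreshed via update_delta before each descent step.
for batch_ratings, batch_uids in batches:  # assumed data iterator
    feed = {model.ratings: batch_ratings,
            model.uid: batch_uids,
            model.istraining: True,
            model.layer1_dropout_rate: 0.1}
    if model.adv_training:
        sess.run(model.update_delta, feed_dict=feed)
    sess.run(model.opt, feed_dict=feed)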
Example #58
0
    def loss(self, predictions, policy, cfv):
        r = tf.stop_gradient(
            cpea.rm_policy(cfv -
                           tf.reduce_sum(cfv * policy, axis=1, keepdims=True)))
        log_policy = tf.log(tf.clip_by_value(policy, 1e-15, 1 - 1e-15))
        return -tf.reduce_mean(tf.reduce_sum(r * log_policy, axis=1))
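# `cpea.rm_policy` is external to this snippet. Regret matching conventionally
# normalises the positive part of the advantages into a distribution, falling
# back to uniform when no regret is positive; a sketch under that assumption:
import tensorflow as tf

def rm_policy(advantages):
    # advantages: [batch, num_actions] instantaneous regrets
    positives = tf.nn.relu(advantages)
    total = tf.reduce_sum(positives, axis=1, keepdims=True)
    n = tf.cast(tf.shape(advantages)[1], advantages.dtype)
    # Adds 1/n to every entry only in rows where total == 0 (uniform fallback).
    safe = positives + tf.cast(tf.equal(total, 0.0), advantages.dtype) / n
    return safe / tf.reduce_sum(safe, axis=1, keepdims=True)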
Example #59
0
    def _build(self):

        inpts = [self.tiled_obs]
        if self.coords is not None:
            inpts.append(self.tiled_coords)

        self.outputs = self.sequence(*inpts)
        self.__dict__.update(self.outputs)

        log_weights = tf.reduce_sum(self.outputs.log_weights_per_timestep, 0)
        self.log_weights = tf.reshape(log_weights, (self.batch_size, self.k_particles))

        self.elbo_vae = tf.reduce_mean(self.log_weights)
        self.elbo_iwae_per_example = targets.iwae(self.log_weights)
        self.elbo_iwae = tf.reduce_mean(self.elbo_iwae_per_example)

        self.normalised_elbo_vae = self.elbo_vae / tf.to_float(self.n_timesteps)
        self.normalised_elbo_iwae = self.elbo_iwae / tf.to_float(self.n_timesteps)
        tf.summary.scalar('normalised_vae', self.normalised_elbo_vae)
        tf.summary.scalar('normalised_iwae', self.normalised_elbo_iwae)

        self.importance_weights = tf.stop_gradient(tf.nn.softmax(self.log_weights, -1))
        self.ess = ops.ess(self.importance_weights, average=True)
        self.iw_distrib = tf.distributions.Categorical(probs=self.importance_weights)
        self.iw_resampling_idx = self.iw_distrib.sample()


        # Logging
        self._log_resampled(self.data_ll_per_sample, 'data_ll')
        self._log_resampled(self.log_p_z_per_sample, 'log_p_z')
        self._log_resampled(self.log_q_z_given_x_per_sample, 'log_q_z_given_x')
        self._log_resampled(self.kl_per_sample, 'kl')

        # Mean squared error between inpt and mean of output distribution
        inpt_obs = self.tiled_obs
        if inpt_obs.shape[-1] == 1:
            inpt_obs = tf.squeeze(inpt_obs, -1)

        axes = [0] + list(range(inpt_obs.shape.ndims)[2:])
        self.mse_per_sample = tf.reduce_mean((inpt_obs - self.canvas) ** 2, axes)
        self._log_resampled(self.mse_per_sample, 'mse')
        self.raw_mse = tf.reduce_mean(self.mse_per_sample)
        tf.summary.scalar('raw_mse', self.raw_mse)

        if hasattr(self, 'num_steps_per_sample'):
            self._log_resampled(self.num_steps_per_sample, 'num_steps')

        if self.gt_presence is not None:
            self.gt_num_steps = tf.reduce_sum(self.gt_presence, -1)

            num_steps_per_sample = tf.reshape(self.num_steps_per_sample, (-1, self.batch_size, self.k_particles))
            gt_num_steps = tf.expand_dims(self.gt_num_steps, -1)

            self.num_step_accuracy_per_example = tf.to_float(tf.equal(gt_num_steps, num_steps_per_sample))
            self.raw_num_step_accuracy = tf.reduce_mean(self.num_step_accuracy_per_example)
            self.num_step_accuracy = self._imp_weighted_mean(self.num_step_accuracy_per_example)
            tf.summary.scalar('num_step_acc', self.num_step_accuracy)

        # For rendering
        resampled_names = 'obj_id canvas glimpse presence_prob presence presence_logit where'.split()
        for name in resampled_names:
            try:
                setattr(self, 'resampled_' + name, self.resample(getattr(self, name), axis=1))
            except AttributeError:
                pass
        try:
            self._log_resampled(self.num_disc_steps_per_sample, 'num_disc_steps')
            self._log_resampled(self.num_prop_steps_per_sample, 'num_prop_steps')
        except AttributeError:
            pass
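# `targets.iwae` is defined elsewhere; the quantity it is expected to compute
# is the standard per-example IWAE bound log(1/k * sum_i w_i). A sketch under
# that assumption:
import tensorflow as tf

def iwae(log_weights):
    # log_weights: [batch_size, k_particles]
    k = tf.to_float(tf.shape(log_weights)[1])
    return tf.reduce_logsumexp(log_weights, axis=-1) - tf.log(k)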
Example #60
0
def main(_):
    since = time.time()

    tf.logging.set_verbosity(tf.logging.INFO)

    # Create directories to store TensorBoard summaries
    if tf.gfile.Exists(FLAGS.summaries_dir):
        tf.gfile.DeleteRecursively(FLAGS.summaries_dir)
    tf.gfile.MakeDirs(FLAGS.summaries_dir)

    # Set model Hyperparameter
    model_config = pretrain_model.get_model_config()

    # Read the folder structure, and create lists of all the images
    image_lists = utils.create_image_lists(FLAGS.images_dir)
    class_count = len(image_lists.keys())

    if class_count == 0:
        tf.logging.error("No valid folders of images found at " +
                         FLAGS.images_dir)
        return -1
    if class_count == 1:
        tf.logging.error("Only one valid folder of images found at " +
                         FLAGS.images_dir)
        return -1

    # Create output_labels.txt displaying classes being trained
    with open(FLAGS.output_labels, "w") as f:
        f.write("\n".join(image_lists.keys()) + "\n")

    with tf.Session() as sess:
        # Set up the image decoding
        jpeg_data, decoded_image = pretrain_model.decode_jpeg(
            model_config["input_width"], model_config["input_height"],
            model_config["input_depth"], model_config["input_mean"],
            model_config["input_std"])

        # Load DenseNet model
        densenet_model, bottlenecks, resized_image, bottlenecks_size = pretrain_model.load_densenet_169(
            FLAGS.model_dir)

        # store pretrained model bottlenecks
        bottleneck.store_bottlenecks(sess,
                                     image_lists,
                                     FLAGS.images_dir,
                                     FLAGS.bottlenecks_dir,
                                     FLAGS.model_name,
                                     jpeg_data,
                                     decoded_image,
                                     resized_image,
                                     bottlenecks,
                                     model=densenet_model)

        bottlenecks = tf.stop_gradient(bottlenecks)

        global_step = tf.Variable(tf.constant(0), trainable=False)

        # Initialized final layer
        (train_step, cross_entropy, accuracy, bottlenecks_input, labels_input,
         final_result) = train.final_layer(len(image_lists.keys()),
                                           FLAGS.final_name, bottlenecks,
                                           bottlenecks_size,
                                           FLAGS.learning_rate, global_step)

        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + "/train",
                                             sess.graph)

        validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                                  "/validation")

        # Initialize all variables
        init = tf.global_variables_initializer()
        sess.run(init)

        # Get validation bottlenecks for evaluation
        validation_bottlenecks, validation_labels = (
            bottleneck.get_batch_of_bottlenecks(
                sess, image_lists, FLAGS.validation_batch_size, "validation",
                FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name,
                jpeg_data, decoded_image, resized_image, bottlenecks))

        # Get test bottlenecks for evaluation
        test_bottlenecks, test_labels = (bottleneck.get_batch_of_bottlenecks(
            sess, image_lists, FLAGS.test_batch_size, "testing",
            FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name,
            jpeg_data, decoded_image, resized_image, bottlenecks))

        best_acc = 0.0
        test_accuracy1, test_loss1 = 0.0, 0.0  # defined even if validation never improves

        for i in range(FLAGS.iterations):
            # Get training bottlenecks
            (train_bottlenecks,
             train_labels) = bottleneck.get_batch_of_bottlenecks(
                 sess, image_lists, FLAGS.train_batch_size, "training",
                 FLAGS.bottlenecks_dir, FLAGS.images_dir, FLAGS.model_name,
                 jpeg_data, decoded_image, resized_image, bottlenecks)
            # Training step
            train_summary, _ = sess.run(
                [merged, train_step],
                feed_dict={
                    bottlenecks_input: train_bottlenecks,
                    labels_input: train_labels,
                    global_step: i
                })
            train_writer.add_summary(train_summary, i)

            # Show evaluation based on specified frequency
            final_step = (i + 1 == FLAGS.iterations)
            if (i % FLAGS.eval_interval) == 0 or final_step:
                # Evaluation
                train_accuracy, train_loss = sess.run(
                    [accuracy, cross_entropy],
                    feed_dict={
                        bottlenecks_input: train_bottlenecks,
                        labels_input: train_labels
                    })

                # Run evaluation step on validation bottlenecks
                validation_summary, validation_accuracy, validation_loss = sess.run(
                    [merged, accuracy, cross_entropy],
                    feed_dict={
                        bottlenecks_input: validation_bottlenecks,
                        labels_input: validation_labels
                    })
                validation_writer.add_summary(validation_summary, i)

                # Save best accuracy and store model
                if validation_accuracy > best_acc:
                    best_acc = validation_accuracy

                    # Calculate the test accuracy with best validation on test bottlenecks
                    test_accuracy1, test_loss1 = sess.run(
                        [accuracy, cross_entropy],
                        feed_dict={
                            bottlenecks_input: test_bottlenecks,
                            labels_input: test_labels
                        })

                    train.save_graph_to_file(sess, FLAGS.output_graph,
                                             FLAGS.final_name)
                    train.save_checkpoint_to_file(sess,
                                                  FLAGS.output_checkpoint_dir)

                tf.logging.info(
                    "Iteration {}: train loss = {}, train acc = {}, val loss = {}, val acc = {}."
                    .format(i, train_loss, train_accuracy, validation_loss,
                            validation_accuracy))

        # Calculate the final test accuracy on test bottlenecks.
        test_accuracy2, test_loss2 = sess.run([accuracy, cross_entropy],
                                              feed_dict={
                                                  bottlenecks_input:
                                                  test_bottlenecks,
                                                  labels_input: test_labels
                                              })

        tf.logging.info("Best validation accuracy = {}".format(best_acc * 100))
        tf.logging.info("Test accuracy with best validation =  {}".format(
            test_accuracy1 * 100))
        tf.logging.info("Final test accuracy =  {}".format(test_accuracy2 *
                                                           100))

    time_elapsed = time.time() - since

    print("Runtime: {}min, {:0.2f}sec".format(int(time_elapsed // 60),
                                              time_elapsed % 60))

    with open(os.path.join("..", FLAGS.model_name, "results.txt"), "w") as f:
        f.write("Best validation accuracy: " + str(best_acc) + "\n")
        f.write("Test accuracy with best validation: " + str(test_accuracy1) +
                "\n")
        f.write("Final test accuracy =  {}".format(test_accuracy2 * 100) +
                "\n")
        f.write("Runtime: " + str(int(time_elapsed // 60)) + "min," +
                str(time_elapsed % 60) + "sec")