Example #1
    def _prepare_networks(self, hparams, sess):
        self.action = tf.placeholder(shape=(1, ), dtype=tf.int32)
        batch_env = batch_env_factory(hparams)
        self.reward, self.done = batch_env.simulate(self.action)
        self.observation = batch_env.observ
        self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

        environment_wrappers = hparams.environment_spec.wrappers
        wrappers = copy.copy(
            environment_wrappers) if environment_wrappers else []

        # Wrap the batched environment in each configured wrapper class (w[0])
        # with its kwargs (w[1]), keeping every layer so it can be initialized
        # inside the session later.
        to_initialize = [batch_env]
        for w in wrappers:
            batch_env = w[0](batch_env, **w[1])
            to_initialize.append(batch_env)

        def initialization_lambda():
            for batch_env in to_initialize:
                batch_env.initialize(sess)

        self.initialize = initialization_lambda

        # Adding 0 forces a read of the observation variable into a new tensor
        # instead of aliasing the variable itself.
        obs_copy = batch_env.observ + 0

        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        self.policy_probs = actor_critic.policy.probs[0, 0, :]
        self.value = actor_critic.value[0, :]
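
Most of Example #1 is about wrapping the batched environment and deferring initialization until a session exists. The following is a minimal, self-contained sketch of that wrap-then-initialize pattern; DummyBatchEnv and FrameSkipWrapper are illustrative stand-ins, not tensor2tensor classes.

class DummyBatchEnv(object):
    """Stand-in for the batched environment returned by batch_env_factory."""

    def initialize(self, sess):
        print("initializing %s with %r" % (type(self).__name__, sess))


class FrameSkipWrapper(DummyBatchEnv):
    """Stand-in wrapper; real wrappers come from hparams.environment_spec."""

    def __init__(self, batch_env, skip=4):
        self._batch_env = batch_env
        self._skip = skip


batch_env = DummyBatchEnv()
wrappers = [(FrameSkipWrapper, {"skip": 2})]

# Same chaining as in _prepare_networks: wrap, and remember every layer.
to_initialize = [batch_env]
for wrapper_cls, kwargs in wrappers:
    batch_env = wrapper_cls(batch_env, **kwargs)
    to_initialize.append(batch_env)

def initialize_all(sess):
    for env in to_initialize:
        env.initialize(sess)

initialize_all(sess="fake-session")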
Example #2
      def env_step(arg1, arg2):  # pylint: disable=unused-argument
        """Step of the environment."""
        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        policy = actor_critic.policy
        # Use the supplied action-selection lambda if given; otherwise take the
        # distribution mode during evaluation and sample during training.
        if policy_to_actions_lambda:
          action = policy_to_actions_lambda(policy)
        else:
          action = tf.cond(eval_phase,
                           policy.mode,
                           policy.sample)

        postprocessed_action = actor_critic.action_postprocessing(action)
        simulate_output = batch_env.simulate(postprocessed_action[0, ...])

        pdf = policy.prob(action)[0]
        value_function = actor_critic.value[0]
        pdf = tf.reshape(pdf, shape=(hparams.num_agents,))
        value_function = tf.reshape(value_function, shape=(hparams.num_agents,))

        # Ensure the environment step actually runs before the policy
        # statistics are returned.
        with tf.control_dependencies(simulate_output):
          return tf.identity(pdf), tf.identity(value_function)
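
The tf.cond(eval_phase, policy.mode, policy.sample) line is the key piece of Example #2: the deterministic mode of the policy distribution is used during evaluation and a stochastic sample during training. Below is a minimal sketch of that action-selection step, assuming TensorFlow 1.x and a toy categorical policy (the logits and placeholder are made up for illustration).

import tensorflow as tf  # assumes TensorFlow 1.x, like the examples above

eval_phase = tf.placeholder(tf.bool, shape=(), name="eval_phase")
logits = tf.constant([[1.0, 2.0, 0.5]])  # toy policy logits for one agent
policy = tf.distributions.Categorical(logits=logits)

# Deterministic action in eval, stochastic action otherwise; tf.cond takes
# callables, so the bound methods are passed without calling them.
action = tf.cond(eval_phase, policy.mode, policy.sample)

with tf.Session() as sess:
    print(sess.run(action, feed_dict={eval_phase: True}))   # always the argmax
    print(sess.run(action, feed_dict={eval_phase: False}))  # sampled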
Example #3
def define_ppo_step(data_points, optimizer, hparams):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points
    new_policy_dist, new_value, _ = get_policy(observation, hparams)
    new_pdf = new_policy_dist.prob(action)

    # Clipped surrogate objective from PPO: take the pessimistic minimum of
    # the clipped and unclipped probability-ratio terms.
    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]

    gradients = [
        list(zip(*optimizer.compute_gradients(loss))) for loss in losses
    ]

    gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

    # Flatten the per-loss (gradients, variables) pairs so a single
    # apply_gradients call updates all of them together.
    gradients_flat = sum([gradient[0] for gradient in gradients], ())
    gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

    if hparams.max_gradients_norm:
        gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                                   hparams.max_gradients_norm)

    optimize_op = optimizer.apply_gradients(
        zip(gradients_flat, gradients_variables_flat))

    # Force the optimizer update to run before the loss and gradient-norm
    # values are returned.
    with tf.control_dependencies([optimize_op]):
        return [tf.identity(x) for x in losses + gradients_norms]
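
Example #3 implements the clipped surrogate objective of PPO: the probability ratio new_pdf / old_pdf is clipped to [1 - clipping_coef, 1 + clipping_coef], and the pessimistic minimum of the clipped and unclipped terms is maximized. Here is a self-contained sketch of just that loss computation, with made-up numbers standing in for the policy outputs (assumes TensorFlow 1.x; clipping_coef = 0.2 is only illustrative).

import tensorflow as tf  # assumes TensorFlow 1.x, like the examples above

clipping_coef = 0.2

# Made-up per-sample quantities that define_ppo_step would get from get_policy.
new_pdf = tf.constant([0.30, 0.10, 0.60])
old_pdf = tf.constant([0.25, 0.20, 0.50])
norm_advantage = tf.constant([1.0, -0.5, 2.0])

ratio = new_pdf / old_pdf
clipped_ratio = tf.clip_by_value(ratio, 1 - clipping_coef, 1 + clipping_coef)

# The clipped term can only lower the objective, never raise it.
surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                 ratio * norm_advantage)
policy_loss = -tf.reduce_mean(surrogate_objective)

with tf.Session() as sess:
    print(sess.run(policy_loss))  # roughly -1.07 for these numbers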