Example #1
def define_ppo_step(data_points, hparams, action_space, lr):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    (logits, new_value) = get_policy(observation, hparams, action_space)
    new_policy_dist = tfp.distributions.Categorical(logits=logits)

    new_pdf = new_policy_dist.prob(action)

    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]
    loss = sum(losses)
    train_op = optimize.optimize(loss, lr, hparams)

    with tf.control_dependencies([train_op]):
        return [tf.identity(x) for x in losses]
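For orientation, the clipped surrogate objective used in all of these examples can be reproduced with plain NumPy. The values below are made-up placeholders, not data from the library; this is a minimal sketch of the PPO clipping rule, not a drop-in replacement for the graph code above.

import numpy as np

# Hypothetical per-timestep quantities (placeholders, not real rollout data).
new_pdf = np.array([0.30, 0.55, 0.10])
old_pdf = np.array([0.25, 0.50, 0.40])
norm_advantage = np.array([1.2, -0.3, 0.8])
clipping_coef = 0.2

ratio = new_pdf / old_pdf  # importance-sampling ratio of new to old policy
clipped_ratio = np.clip(ratio, 1 - clipping_coef, 1 + clipping_coef)

# Taking the elementwise minimum means the update never profits from pushing
# the ratio outside the clipping interval.
surrogate_objective = np.minimum(clipped_ratio * norm_advantage,
                                 ratio * norm_advantage)
policy_loss = -surrogate_objective.mean()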
Example #2
def define_ppo_step(data_points, hparams, action_space, lr):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    obs_shape = common_layers.shape_list(observation)
    observation = tf.reshape(observation,
                             [obs_shape[0] * obs_shape[1]] + obs_shape[2:])
    (logits, new_value) = get_policy(observation, hparams, action_space)
    logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
    new_value = tf.reshape(new_value, obs_shape[:2])
    new_policy_dist = tf.distributions.Categorical(logits=logits)

    new_pdf = new_policy_dist.prob(action)

    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]
    loss = sum(losses)
    variables = tf.global_variables(hparams.policy_network + "/.*")
    train_op = optimize.optimize(loss, lr, hparams, variables=variables)

    with tf.control_dependencies([train_op]):
        return [tf.identity(x) for x in losses]
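The structural difference from Example #1 is that observations carry separate batch and time dimensions, so they are flattened to [batch * time, ...] before get_policy and the outputs are reshaped back. A NumPy sketch of that round trip, with invented shapes:

import numpy as np

batch, time, height, width, channels = 2, 3, 4, 4, 1
observation = np.zeros((batch, time, height, width, channels))

# Merge batch and time so the policy network sees a flat batch of frames.
flat_obs = observation.reshape((batch * time, height, width, channels))

# Stand-in for the policy output; restore the [batch, time, ...] layout.
num_actions = 6
flat_logits = np.zeros((batch * time, num_actions))
logits = flat_logits.reshape((batch, time, num_actions))
assert logits.shape == (batch, time, num_actions)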
Example #3
  def __init__(
      self, batch_size, observation_space, action_space, policy_hparams,
      policy_dir, sampling_temp
  ):
    super(PolicyAgent, self).__init__(
        batch_size, observation_space, action_space
    )
    self._sampling_temp = sampling_temp
    with tf.Graph().as_default():
      self._observations_t = tf.placeholder(
          shape=((batch_size,) + self.observation_space.shape),
          dtype=self.observation_space.dtype
      )
      (logits, self._values_t) = rl.get_policy(
          self._observations_t, policy_hparams, self.action_space
      )
      actions = common_layers.sample_with_temperature(logits, sampling_temp)
      self._probs_t = tf.nn.softmax(logits / sampling_temp)
      self._actions_t = tf.cast(actions, tf.int32)
      model_saver = tf.train.Saver(
          tf.global_variables(policy_hparams.policy_network + "/.*")  # pylint: disable=unexpected-keyword-arg
      )
      self._sess = tf.Session()
      self._sess.run(tf.global_variables_initializer())
      trainer_lib.restore_checkpoint(policy_dir, model_saver, self._sess)
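PolicyAgent samples actions with common_layers.sample_with_temperature and also exposes the temperature-scaled probabilities. A rough NumPy equivalent of that sampling step (the helper and logits below are invented for illustration, not the library implementation):

import numpy as np

def sample_with_temperature_np(logits, temperature, rng=np.random.default_rng(0)):
    """Sample one action per row from softmax(logits / temperature)."""
    scaled = logits / temperature
    scaled -= scaled.max(axis=-1, keepdims=True)  # numerical stability
    probs = np.exp(scaled) / np.exp(scaled).sum(axis=-1, keepdims=True)
    actions = np.array([rng.choice(len(p), p=p) for p in probs])
    return actions, probs

logits = np.array([[2.0, 0.5, -1.0],
                   [0.1, 0.1, 0.1]])
actions, probs = sample_with_temperature_np(logits, temperature=0.5)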
Example #4
  def _prepare_networks(self, hparams, sess):
    self.action = tf.placeholder(shape=(1,), dtype=tf.int32)
    batch_env = SimulatedBatchEnv(hparams.environment_spec, hparams.num_agents)
    self.reward, self.done = batch_env.simulate(self.action)
    self.observation = batch_env.observ
    self.reset_op = batch_env.reset(tf.constant([0], dtype=tf.int32))

    environment_wrappers = hparams.environment_spec.wrappers
    wrappers = copy.copy(environment_wrappers) if environment_wrappers else []

    to_initialize = [batch_env]
    for w in wrappers:
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)

    def initialization_lambda():
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    self.initialize = initialization_lambda

    obs_copy = batch_env.observ + 0

    actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
    self.policy_probs = actor_critic.policy.probs[0, 0, :]
    self.value = actor_critic.value[0, :]
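_prepare_networks applies each entry of hparams.environment_spec.wrappers as a (wrapper_class, kwargs) pair, rebinding batch_env so the wrappers nest. The composition pattern in isolation, with stand-in classes that are not part of the library:

class BaseEnv(object):
    def step(self, action):
        return "base({})".format(action)

class LoggingWrapper(object):
    def __init__(self, env, prefix):
        self._env, self._prefix = env, prefix
    def step(self, action):
        return "{}:{}".format(self._prefix, self._env.step(action))

# Wrappers are listed innermost-first as (class, kwargs) pairs, mirroring
# hparams.environment_spec.wrappers in the snippet above.
wrappers = [(LoggingWrapper, {"prefix": "inner"}),
            (LoggingWrapper, {"prefix": "outer"})]

env = BaseEnv()
for wrapper_cls, kwargs in wrappers:
    env = wrapper_cls(env, **kwargs)

print(env.step(3))  # outer:inner:base(3)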
Example #5
def define_ppo_step(data_points,
                    hparams,
                    action_space,
                    lr,
                    distributional_size=1,
                    distributional_subscale=0.04):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points

    obs_shape = common_layers.shape_list(observation)
    observation = tf.reshape(observation,
                             [obs_shape[0] * obs_shape[1]] + obs_shape[2:])
    (logits, new_value) = get_policy(observation,
                                     hparams,
                                     action_space,
                                     distributional_size=distributional_size)
    logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
    new_policy_dist = tfp.distributions.Categorical(logits=logits)

    new_pdf = new_policy_dist.prob(action)

    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    if distributional_size > 1:
        new_value = tf.reshape(new_value,
                               obs_shape[:2] + [distributional_size])
        new_value = tf.nn.log_softmax(new_value, axis=-1)
        # We assume the values range from (-half, half) -- set subscale accordingly.
        half = (distributional_size // 2) * distributional_subscale
        # To get integer bucket indices, add half (shifting the range to
        # (0, 2 * half)) and divide by the subscale, then floor to the nearest int.
        quantized_dr = tf.floor(
            (discounted_reward + half) / distributional_subscale)
        hot_dr = tf.one_hot(tf.cast(quantized_dr, tf.int32),
                            distributional_size)
        value_loss = -tf.reduce_sum(new_value * hot_dr, axis=-1)
        value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
    else:
        new_value = tf.reshape(new_value, obs_shape[:2])
        value_error = new_value - discounted_reward
        value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]
    loss = sum(losses)
    variables = tf.global_variables(hparams.policy_network + "/.*")
    train_op = optimize.optimize(loss, lr, hparams, variables=variables)

    with tf.control_dependencies([train_op]):
        return [tf.identity(x) for x in losses]
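With distributional_size > 1 the value head predicts a categorical distribution over reward buckets, the scalar discounted reward is quantized into one of those buckets, and the value loss is the cross-entropy against that one-hot bucket. A NumPy sketch of the quantization for a single timestep, with illustrative numbers:

import numpy as np

distributional_size = 5
distributional_subscale = 0.04
half = (distributional_size // 2) * distributional_subscale  # 0.08

discounted_reward = 0.05
# Shift into (0, 2 * half), divide by the subscale and floor to get the bucket.
bucket = int(np.floor((discounted_reward + half) / distributional_subscale))  # 3
hot_dr = np.eye(distributional_size)[bucket]

# Stand-in value-head logits; the real ones come from get_policy.
value_logits = np.array([0.1, 0.2, 0.4, 1.5, 0.3])
log_probs = value_logits - np.log(np.exp(value_logits).sum())  # log_softmax

value_loss = -(log_probs * hot_dr).sum()  # cross-entropy for the target bucket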
Example #6
      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""
        actor_critic = get_policy(tf.expand_dims(obs_copy, 0), hparams)
        policy = actor_critic.policy
        action = hparams.policy_to_actions_lambda(policy)

        postprocessed_action = actor_critic.action_postprocessing(action)
        reward, done = batch_env.simulate(postprocessed_action[0, ...])

        pdf = policy.prob(action)[0]
        value_function = actor_critic.value[0]
        pdf = tf.reshape(pdf, shape=(num_agents,))
        value_function = tf.reshape(value_function, shape=(num_agents,))
        done = tf.reshape(done, shape=(num_agents,))

        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)
Example #7
    def __init__(self, hparams, action_space, observation_space, policy_dir):
        assert hparams.base_algo == "ppo"
        ppo_hparams = trainer_lib.create_hparams(hparams.base_algo_params)

        frame_stack_shape = (
            1, hparams.frame_stack_size) + observation_space.shape
        self._frame_stack = np.zeros(frame_stack_shape, dtype=np.uint8)

        with tf.Graph().as_default():
            self.obs_t = tf.placeholder(shape=frame_stack_shape,
                                        dtype=np.uint8)
            self.logits_t, self.value_function_t = get_policy(
                self.obs_t, ppo_hparams, action_space)
            model_saver = tf.train.Saver(
                tf.global_variables(scope=ppo_hparams.policy_network + "/.*")  # pylint: disable=unexpected-keyword-arg
            )
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
            trainer_lib.restore_checkpoint(policy_dir, model_saver, self.sess)
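The agent keeps a rolling frame stack of shape (1, frame_stack_size) + observation_space.shape and feeds it through obs_t. How such a stack is typically updated when a new frame arrives, as a NumPy sketch; push_frame is an assumed helper, not part of the snippet:

import numpy as np

frame_stack_size = 4
observation_shape = (84, 84, 3)
frame_stack = np.zeros((1, frame_stack_size) + observation_shape, dtype=np.uint8)

def push_frame(stack, frame):
    """Drop the oldest frame and append the newest along the stack axis."""
    return np.concatenate([stack[:, 1:], frame[np.newaxis, np.newaxis]], axis=1)

new_frame = np.random.randint(0, 256, size=observation_shape, dtype=np.uint8)
frame_stack = push_frame(frame_stack, new_frame)
assert frame_stack.shape == (1, frame_stack_size) + observation_shape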
Example #8
      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""

        (logits, value_function) = get_policy(
            obs_copy, ppo_hparams, batch_env.action_space
        )
        action = common_layers.sample_with_temperature(logits, sampling_temp)
        action = tf.cast(action, tf.int32)

        reward, done = batch_env.simulate(action[:, 0, ...])

        pdf = tfp.distributions.Categorical(logits=logits).prob(action)
        pdf = tf.reshape(pdf, shape=(num_agents,))
        value_function = tf.reshape(value_function, shape=(num_agents,))
        done = tf.reshape(done, shape=(num_agents,))

        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)
Example #9
def define_ppo_step(data_points, optimizer, hparams, action_space):
    """Define ppo step."""
    observation, action, discounted_reward, norm_advantage, old_pdf = data_points
    new_policy_dist, new_value, _ = get_policy(observation, hparams,
                                               action_space)
    new_pdf = new_policy_dist.prob(action)

    ratio = new_pdf / old_pdf
    clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                     1 + hparams.clipping_coef)

    surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                     ratio * norm_advantage)
    policy_loss = -tf.reduce_mean(surrogate_objective)

    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error**2)

    entropy = new_policy_dist.entropy()
    entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

    losses = [policy_loss, value_loss, entropy_loss]

    gradients = [
        list(zip(*optimizer.compute_gradients(loss))) for loss in losses
    ]

    gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

    gradients_flat = sum([gradient[0] for gradient in gradients], ())
    gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

    if hparams.max_gradients_norm:
        gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                                   hparams.max_gradients_norm)

    optimize_op = optimizer.apply_gradients(
        zip(gradients_flat, gradients_variables_flat))

    with tf.control_dependencies([optimize_op]):
        return [tf.identity(x) for x in losses + gradients_norms]
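Examples #9 and #10 report a global gradient norm per loss and clip all gradients jointly by a single global norm. The clipping rule itself, in NumPy with made-up gradients (this mirrors what tf.clip_by_global_norm computes):

import numpy as np

gradients = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]  # two dummy tensors
max_gradients_norm = 5.0

# Global norm: L2 norm over all gradient entries taken together.
global_norm = np.sqrt(sum((g ** 2).sum() for g in gradients))  # 13.0

# Scale every gradient by the same factor when the global norm is too large.
scale = min(1.0, max_gradients_norm / global_norm)
clipped = [g * scale for g in gradients]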
Example #10
def define_ppo_step(data_points, optimizer, hparams, action_space):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points
  new_policy_dist, new_value, _ = get_policy(observation, hparams, action_space)
  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]

  gradients = [list(zip(*optimizer.compute_gradients(loss)))
               for loss in losses]

  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

  gradients_flat = sum([gradient[0] for gradient in gradients], ())
  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

  if hparams.max_gradients_norm:
    gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                               hparams.max_gradients_norm)

  optimize_op = optimizer.apply_gradients(zip(gradients_flat,
                                              gradients_variables_flat))

  with tf.control_dependencies([optimize_op]):
    return [tf.identity(x) for x in losses + gradients_norms]
Example #11
def define_ppo_step(data_points, hparams, action_space, lr, epoch=-1,
                    distributional_size=1, distributional_subscale=0.04):
  """Define ppo step."""
  del distributional_subscale
  (observation, action, discounted_reward, discounted_reward_probs,
   norm_advantage, old_pdf) = data_points

  obs_shape = common_layers.shape_list(observation)
  observation = tf.reshape(
      observation, [obs_shape[0] * obs_shape[1]] + obs_shape[2:]
  )
  (logits, new_value) = get_policy(observation, hparams, action_space,
                                   epoch=epoch,
                                   distributional_size=distributional_size)
  logits = tf.reshape(logits, obs_shape[:2] + [action_space.n])
  new_policy_dist = tfp.distributions.Categorical(logits=logits)

  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  if distributional_size > 1:
    new_value = tf.reshape(new_value, obs_shape[:2] + [distributional_size])
    new_value = tf.nn.log_softmax(new_value, axis=-1)
    value_shape = common_layers.shape_list(new_value)
    # The above is the new value distribution. The discounted reward we are
    # given is itself a value distribution: integer bucket indices together
    # with their probabilities, on a range doubled (2x) for greater fidelity.
    # Double the range of new_value here to match.
    new_value_shifted = tf.concat([new_value[1:], new_value[-1:]], axis=0)
    new_value_mean = (new_value + new_value_shifted) / 2
    new_value = tf.concat([tf.expand_dims(new_value, axis=-1),
                           tf.expand_dims(new_value_mean, axis=-1)], -1)
    new_value = tf.reshape(new_value, value_shape[:-1] + [2 * value_shape[-1]])
    # Cast discounted reward to integers and gather the new log-probs for them.
    discounted_reward = tf.cast(discounted_reward, tf.int32)
    value_loss = tf.batch_gather(new_value, discounted_reward)
    # Weight the gathered (new) log-probs by the old probabilities.
    discounted_reward_probs = tf.expand_dims(discounted_reward_probs, axis=1)
    value_loss = - tf.reduce_sum(value_loss * discounted_reward_probs, axis=-1)
    # Take the mean over batch and time as final loss, multiply by coefficient.
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_loss)
  else:
    new_value = tf.reshape(new_value, obs_shape[:2])
    value_error = new_value - discounted_reward
    value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]
  loss = sum(losses)
  variables = tf.global_variables(hparams.policy_network + "/.*")
  train_op = optimize.optimize(loss, lr, hparams, variables=variables)

  with tf.control_dependencies([train_op]):
    return [tf.identity(x) for x in losses]
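Compared to Example #5, the distributional branch here receives the target as integer bucket indices plus their probabilities, gathers the predicted log-probabilities at those indices and weights them, giving a cross-entropy against a small target distribution. The gather-and-weight step in NumPy, with illustrative arrays:

import numpy as np

# Predicted log-probabilities over 2 * distributional_size value buckets
# for a single timestep (stand-in for the doubled-resolution new_value).
log_probs = np.log(np.array([0.05, 0.10, 0.30, 0.40, 0.10, 0.05]))

# Target: two bucket indices with their probabilities, as provided in
# discounted_reward / discounted_reward_probs.
reward_buckets = np.array([2, 3])
bucket_probs = np.array([0.25, 0.75])

# Gather the predicted log-probs at the target buckets and take the
# probability-weighted negative sum, a cross-entropy against the target.
value_loss = -(log_probs[reward_buckets] * bucket_probs).sum()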