Example 1
def evaluate_policy(eval_env,
                    get_predictions,
                    max_timestep=20000,
                    n_evals=1,
                    len_history_for_policy=32,
                    rng=None):
    """Evaluate the policy."""

    avg_rewards = collections.defaultdict(float)
    avg_rewards_unclipped = collections.defaultdict(float)
    for _ in range(n_evals):
        for policy in [
                env_problem_utils.CATEGORICAL_SAMPLING,
                env_problem_utils.GUMBEL_SAMPLING,
        ]:
            trajs, _, _ = env_problem_utils.play_env_problem_with_policy(
                eval_env,
                get_predictions,
                num_trajectories=eval_env.batch_size,
                max_timestep=max_timestep,
                reset=True,
                policy_sampling=policy,
                rng=rng,
                len_history_for_policy=len_history_for_policy)
            avg_rewards[policy] += float(sum(
                np.sum(traj[2]) for traj in trajs)) / len(trajs)
            avg_rewards_unclipped[policy] += float(
                sum(np.sum(traj[3]) for traj in trajs)) / len(trajs)

    # Now average these out.
    for k in avg_rewards:
        avg_rewards[k] /= n_evals
        avg_rewards_unclipped[k] /= n_evals

    return avg_rewards, avg_rewards_unclipped
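
A minimal usage sketch for this variant, assuming `make_eval_env` and `policy_predictions_fn` are illustrative placeholders for the batched EnvProblem and prediction callable that the surrounding trainer provides:

from jax import random as jax_random

# Hypothetical setup: `make_eval_env` and `policy_predictions_fn` are
# placeholders, not part of the code above.
eval_env = make_eval_env(batch_size=4)
rng = jax_random.PRNGKey(0)

avg_rewards, avg_rewards_unclipped = evaluate_policy(
    eval_env,
    policy_predictions_fn,
    max_timestep=1000,
    n_evals=2,
    rng=rng)

# Both dicts map a sampling mode (CATEGORICAL_SAMPLING or GUMBEL_SAMPLING)
# to the mean per-trajectory return, averaged over the n_evals rounds.
for sampling_mode, reward in avg_rewards.items():
    print(sampling_mode, reward)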
Example 2
def evaluate_policy(eval_env,
                    get_predictions,
                    boundary,
                    max_timestep=20000,
                    n_evals=1,
                    rng=None):
    """Evaluate the policy."""

    avg_rewards = collections.defaultdict(float)
    avg_rewards_unclipped = collections.defaultdict(float)
    for _ in range(n_evals):
        for policy in [
                env_problem_utils.CATEGORICAL_SAMPLING,
                env_problem_utils.GUMBEL_SAMPLING,
                env_problem_utils.EPSILON_GREEDY
        ]:
            trajs, _ = env_problem_utils.play_env_problem_with_policy(
                eval_env,
                get_predictions,
                boundary=boundary,
                max_timestep=max_timestep,
                reset=True,
                policy_sampling=policy,
                rng=rng)
            avg_rewards[policy] += float(sum(
                np.sum(traj[2]) for traj in trajs)) / len(trajs)
            avg_rewards_unclipped[policy] += float(
                sum(np.sum(traj[3]) for traj in trajs)) / len(trajs)

    # Now average these out.
    for k in avg_rewards:
        avg_rewards[k] /= n_evals
        avg_rewards_unclipped[k] /= n_evals

    return avg_rewards, avg_rewards_unclipped
Example 3
def evaluate_policy(eval_env,
                    get_predictions,
                    temperatures,
                    max_timestep=20000,
                    n_evals=1,
                    len_history_for_policy=32,
                    rng=None):
  """Evaluate the policy."""

  processed_reward_sums = collections.defaultdict(list)
  raw_reward_sums = collections.defaultdict(list)
  for eval_rng in jax_random.split(rng, num=n_evals):
    for temperature in temperatures:
      trajs, _, _ = env_problem_utils.play_env_problem_with_policy(
          eval_env,
          get_predictions,
          num_trajectories=eval_env.batch_size,
          max_timestep=max_timestep,
          reset=True,
          temperature=temperature,
          rng=eval_rng,
          len_history_for_policy=len_history_for_policy)
      processed_reward_sums[temperature].extend(sum(traj[2]) for traj in trajs)
      raw_reward_sums[temperature].extend(sum(traj[3]) for traj in trajs)

  # Return the mean and standard deviation for each temperature.
  def compute_stats(reward_dict):
    return {
        temperature: {"mean": onp.mean(rewards), "std": onp.std(rewards)}
        for (temperature, rewards) in reward_dict.items()
    }
  return {
      "processed": compute_stats(processed_reward_sums),
      "raw": compute_stats(raw_reward_sums),
  }
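
To make the shape of the returned statistics concrete, here is a hedged sketch of calling this variant and reading the per-temperature summaries; `eval_env` and `get_predictions` are assumed to come from the surrounding setup:

from jax import random as jax_random

# Sketch only: eval_env and get_predictions are assumed to exist as above.
stats = evaluate_policy(
    eval_env,
    get_predictions,
    temperatures=[0.5, 1.0],
    max_timestep=1000,
    n_evals=2,
    rng=jax_random.PRNGKey(0))

# stats["processed"] and stats["raw"] each map a temperature to a dict with
# the "mean" and "std" of the per-trajectory reward sums.
for temperature, summary in stats["processed"].items():
    print("temperature=%s mean=%.2f std=%.2f"
          % (temperature, summary["mean"], summary["std"]))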
Example 4
  def test_play_env_problem_with_policy(self):
    env = env_problem.EnvProblem(
        base_env_name="CartPole-v0",
        batch_size=2,
        reward_range=(-1, 1))

    def policy_fun(observations, rng=None):
      b, t = observations.shape[:2]
      a = env.action_space.n
      p = np.random.uniform(size=(b, t, a))
      p = np.exp(p)
      p = p / np.sum(p, axis=-1, keepdims=True)
      return np.log(p), (), rng

    max_timestep = 15
    num_trajectories = 2
    trajectories, _ = env_problem_utils.play_env_problem_with_policy(
        env, policy_fun, num_trajectories=num_trajectories,
        max_timestep=max_timestep, boundary=20)

    self.assertEqual(num_trajectories, len(trajectories))

    # Check shapes within trajectories.
    traj = trajectories[0]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T+1, 4), traj[0].shape)  # (4,) is OBS
    self.assertEqual((T,), traj[2].shape)
    self.assertLessEqual(T, max_timestep)

    traj = trajectories[1]
    T = traj[1].shape[0]  # pylint: disable=invalid-name
    self.assertEqual((T+1, 4), traj[0].shape)
    self.assertEqual((T,), traj[2].shape)
    self.assertLessEqual(T, max_timestep)
Example 5
def collect_trajectories(env,
                         policy_fn,
                         n_trajectories=1,
                         max_timestep=None,
                         reset=True,
                         len_history_for_policy=32,
                         boundary=32,
                         state=None,
                         temperature=1.0,
                         rng=None):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface; for now this is not batched.
    policy_fn: callable mapping observations (B, T+1) to log-probabilities
      (B, T+1, A).
    n_trajectories: int, number of trajectories.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when env returns
      done.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or negative.
    len_history_for_policy: int or None, the maximum history to keep for
      applying the policy on. If None, use the full history.
    boundary: int, pad the sequences to the multiples of this number.
    state: state for `policy_fn`.
    temperature: (float) temperature to sample action from policy_fn.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectories, n_done, timing_info, state), where:
    trajectories: list whose element `i` is a tuple of numpy arrays
      (observation, action, processed_reward, raw_reward, infos), with
      observation of shape (T_i + 1,) + OBS and the action and reward arrays
      having leading dimension T_i.
    n_done: number of trajectories that are done.
    timing_info: dict of timing information.
    state: the updated state for `policy_fn`.
  """

  assert isinstance(env, env_problem.EnvProblem)
  # This is an env_problem, run its collect function.
  trajs, n_done, timing_info, state = env_problem_utils.play_env_problem_with_policy(
      env,
      policy_fn,
      num_trajectories=n_trajectories,
      max_timestep=max_timestep,
      reset=reset,
      len_history_for_policy=len_history_for_policy,
      boundary=boundary,
      state=state,
      temperature=temperature,
      rng=rng)
  # Each element of `trajs` is the return value of Trajectory.as_numpy, i.e. a
  # tuple (observation, action, processed_reward, raw_reward, infos).
  return trajs, n_done, timing_info, state
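
A hedged sketch of calling this wrapper; `env`, `policy_fn`, and `policy_state` are placeholders for the objects the surrounding PPO training loop would supply:

from jax import random as jax_random

# Sketch under the assumption that `env` is an EnvProblem and `policy_fn`
# follows the policy signature expected by play_env_problem_with_policy.
policy_state = None  # or the policy's initial state, if it has one
trajs, n_done, timing_info, policy_state = collect_trajectories(
    env,
    policy_fn,
    n_trajectories=env.batch_size,
    max_timestep=128,
    boundary=32,
    state=policy_state,
    temperature=1.0,
    rng=jax_random.PRNGKey(0))

# Each element of trajs is (observation, action, processed_reward,
# raw_reward, infos); observation has shape (T + 1,) + OBS.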
Example 6
    def test_play_env_problem_with_policy(self):
        env = gym_env_problem.GymEnvProblem(base_env_name="CartPole-v0",
                                            batch_size=2,
                                            reward_range=(-1, 1))

        # Let's make sure that at most 4 observations are passed to the policy function.
        len_history_for_policy = 4

        def policy_fun(observations, state=None, rng=None):
            b, t = observations.shape[:2]
            # Assert that observations from time-step len_history_for_policy onwards
            # are zeros.
            self.assertTrue(
                np.all(observations[:, len_history_for_policy:, ...] == 0))
            self.assertFalse(
                np.all(observations[:, :len_history_for_policy, ...] == 0))
            a = env.action_space.n
            p = np.random.uniform(size=(b, t, a))
            p = np.exp(p)
            p = p / np.sum(p, axis=-1, keepdims=True)
            return np.log(p), np.mean(p, axis=-1), state, rng

        def action_index_fn(index):
            return index[:, None]

        max_timestep = 15
        num_trajectories = 2
        trajectories, _, _, _ = env_problem_utils.play_env_problem_with_policy(
            env,
            policy_fun,
            action_index_fn=action_index_fn,
            num_trajectories=num_trajectories,
            max_timestep=max_timestep,
            len_history_for_policy=len_history_for_policy)

        self.assertEqual(num_trajectories, len(trajectories))

        # Check shapes within trajectories.
        traj = trajectories[0]
        T = traj[1].shape[0]  # pylint: disable=invalid-name
        self.assertEqual((T + 1, 4), traj[0].shape)  # (4,) is OBS
        self.assertEqual((T, ), traj[2].shape)
        self.assertEqual(T, len(traj[4]["log_prob_actions"]))
        self.assertEqual(T, len(traj[4]["value_predictions"]))
        self.assertLessEqual(T, max_timestep)

        traj = trajectories[1]
        T = traj[1].shape[0]  # pylint: disable=invalid-name
        self.assertEqual((T + 1, 4), traj[0].shape)
        self.assertEqual((T, ), traj[2].shape)
        self.assertEqual(T, len(traj[4]["log_prob_actions"]))
        self.assertEqual(T, len(traj[4]["value_predictions"]))
        self.assertLessEqual(T, max_timestep)
Example 7
def collect_trajectories(env,
                         policy_fn,
                         n_trajectories=1,
                         policy=env_problem_utils.GUMBEL_SAMPLING,
                         max_timestep=None,
                         epsilon=0.1,
                         reset=True,
                         len_history_for_policy=32,
                         rng=None):
    """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface; for now this is not batched.
    policy_fn: callable mapping observations (B, T+1) to log-probabilities
      (B, T+1, A).
    n_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling" i.e.
      how to use the policy_fn to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when env returns
      done.
    epsilon: float, the epsilon for `epsilon-greedy` policy.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or negative.
    len_history_for_policy: int, the maximum history to keep for applying the
      policy on.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectories, n_done, timing_info), where:
    trajectories: list whose element `i` is a tuple of numpy arrays
      (observation, action, processed_reward, infos), with observation of
      shape (T_i + 1,) + OBS and the action and reward arrays having leading
      dimension T_i.
    n_done: number of trajectories that are done.
    timing_info: dict of timing information.
  """

    assert isinstance(env, env_problem.EnvProblem)
    # This is an env_problem, run its collect function.
    trajs, n_done, timing_info = env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fn,
        num_trajectories=n_trajectories,
        max_timestep=max_timestep,
        policy_sampling=policy,
        eps=epsilon,
        reset=reset,
        len_history_for_policy=len_history_for_policy,
        rng=rng)
    # Skip returning raw_rewards here, since they aren't used.

    # t is the return value of Trajectory.as_numpy, so:
    # (observation, action, processed_reward, raw_reward, infos)
    return [(t[0], t[1], t[2], t[4]) for t in trajs], n_done, timing_info
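
To make the index-based filtering on the last line easier to read, here is an equivalent hedged sketch that unpacks each trajectory tuple by name before dropping the raw rewards:

# Equivalent, more explicit form of the list comprehension above (sketch only).
filtered_trajs = []
for observation, action, processed_reward, raw_reward, infos in trajs:
    # raw_reward is dropped on purpose; callers here only consume the
    # processed (clipped) rewards and the per-step infos.
    filtered_trajs.append((observation, action, processed_reward, infos))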
Example 8
def collect_trajectories(env,
                         policy_fun,
                         num_trajectories=1,
                         policy=env_problem_utils.CATEGORICAL_SAMPLING,
                         max_timestep=None,
                         boundary=20,
                         epsilon=0.1,
                         reset=True,
                         rng=None):
    """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface; for now this is not batched.
    policy_fun: callable mapping observations (B, T+1) to log-probabilities
      (B, T+1, A).
    num_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling" i.e.
      how to use the policy_fun to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when env returns
      done.
    boundary: int, boundary for padding, used in EnvProblem envs.
    epsilon: float, the epsilon for `epsilon-greedy` policy.
    reset: bool, true if we want to reset the envs. The envs are also reset if
      max_timestep is None or negative.
    rng: jax rng, splittable.

  Returns:
    A tuple (trajectory, number of trajectories that are done)
    trajectory: list of (observation, action, reward) tuples, where each element
    `i` is a tuple of numpy arrays with shapes as follows:
    observation[i] = (B, T_i + 1)
    action[i] = (B, T_i)
    reward[i] = (B, T_i)
  """

    assert isinstance(env, env_problem.EnvProblem)
    # This is an env_problem, run its collect function.
    return env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fun,
        num_trajectories=num_trajectories,
        max_timestep=max_timestep,
        boundary=boundary,
        policy_sampling=policy,
        eps=epsilon,
        reset=reset,
        rng=rng)
Example 9
def evaluate_policy(eval_env,
                    get_predictions,
                    boundary,
                    max_timestep=20000,
                    rng=None):
    """Evaluate the policy."""

    avg_rewards = {}
    for policy in [
            env_problem_utils.CATEGORICAL_SAMPLING,
            env_problem_utils.GUMBEL_SAMPLING, env_problem_utils.EPSILON_GREEDY
    ]:
        trajs, _ = env_problem_utils.play_env_problem_with_policy(
            eval_env,
            get_predictions,
            boundary=boundary,
            max_timestep=max_timestep,
            reset=True,
            policy_sampling=policy,
            rng=rng)
        avg_rewards[policy] = float(sum(np.sum(traj[2])
                                        for traj in trajs)) / len(trajs)
    return avg_rewards
Example 10
def collect_trajectories(env,
                         policy_fun,
                         num_trajectories=1,
                         policy="greedy",
                         max_timestep=None,
                         boundary=20,
                         epsilon=0.1):
  """Collect trajectories with the given policy net and behaviour.

  Args:
    env: A gym env interface; for now this is not batched.
    policy_fun: callable mapping observations (B, T+1) to log-probabilities
      (B, T+1, A).
    num_trajectories: int, number of trajectories.
    policy: string, "greedy", "epsilon-greedy", or "categorical-sampling" i.e.
      how to use the policy_fun to return an action.
    max_timestep: int or None, the index of the maximum time-step at which we
      return the trajectory, None for ending a trajectory only when env returns
      done.
    boundary: int, boundary for padding, used in EnvProblem envs.
    epsilon: float, the epsilon for `epsilon-greedy` policy.

  Returns:
    trajectory: list of (observation, action, reward) tuples, where each element
    `i` is a tuple of numpy arrays with shapes as follows:
    observation[i] = (B, T_i + 1)
    action[i] = (B, T_i)
    reward[i] = (B, T_i)
  """

  if isinstance(env, env_problem.EnvProblem):
    # This is an env_problem, run its collect function.
    return env_problem_utils.play_env_problem_with_policy(
        env,
        policy_fun,
        num_trajectories=num_trajectories,
        max_timestep=max_timestep,
        boundary=boundary)

  trajectories = []

  for t in range(num_trajectories):
    t_start = time.time()
    rewards = []
    actions = []
    done = False

    observation = env.reset()

    # This is currently shaped (1, 1) + OBS, but new observations will keep
    # getting added to it, making it eventually (1, T+1) + OBS
    observation_history = observation[np.newaxis, np.newaxis, :]

    # Run either until we're done or, if max_timestep is defined, only until
    # that time-step.
    ts = 0
    while ((not done) and
           (not max_timestep or observation_history.shape[1] < max_timestep)):
      ts_start = time.time()
      # Run the policy to pick an action; the shape is (1, t, A) because
      # observation_history is shaped (1, t) + OBS.
      predictions = policy_fun(observation_history)

      # We need the predictions for the last time-step, so squeeze the batch
      # dimension and take the last time-step.
      predictions = np.squeeze(predictions, axis=0)[-1]

      # Policy can be run in one of the following ways:
      #  - Greedy
      #  - Epsilon-Greedy
      #  - Categorical-Sampling
      action = None
      if policy == "greedy":
        action = np.argmax(predictions)
      elif policy == "epsilon-greedy":
        # One possible schedule for epsilon is 1/k, where k is the episode
        # number.
        if onp.random.random() < epsilon:
          # Choose an action at random.
          action = onp.random.randint(0, high=len(predictions))
        else:
          # Return the best action.
          action = np.argmax(predictions)
      elif policy == "categorical-sampling":
        # NOTE: The predictions aren't probabilities but log-probabilities
        # instead, since they were computed with LogSoftmax.
        # So just np.exp them to make them probabilities.
        predictions = np.exp(predictions)
        action = onp.argwhere(onp.random.multinomial(1, predictions) == 1)
      else:
        raise ValueError("Unknown policy: %s" % policy)

      # NOTE: Assumption, single batch.
      try:
        action = int(action)
      except TypeError as err:
        # Let's dump some information before we die off.
        logging.error("Cannot convert action into an integer: [%s]", err)
        logging.error("action.shape: [%s]", action.shape)
        logging.error("action: [%s]", action)
        logging.error("predictions.shape: [%s]", predictions.shape)
        logging.error("predictions: [%s]", predictions)
        logging.error("observation_history: [%s]", observation_history)
        raise err

      observation, reward, done, _ = env.step(action)

      # observation is of shape OBS, so add extra dims and concatenate on the
      # time dimension.
      observation_history = np.concatenate(
          [observation_history, observation[np.newaxis, np.newaxis, :]], axis=1)

      rewards.append(reward)
      actions.append(action)

      ts += 1
      logging.vlog(
          2, "  Collected time-step[ %5d] of trajectory[ %5d] in [%0.2f] msec.",
          ts, t, get_time(ts_start))
    logging.vlog(2, " Collected trajectory[ %5d] in [%0.2f] msec.", t,
                 get_time(t_start))

    # This means either we are done or we were terminated early.
    assert done or (max_timestep and
                    max_timestep >= observation_history.shape[1])
    # observation_history is (1, T+1) + OBS, let's squeeze out the batch dim.
    observation_history = np.squeeze(observation_history, axis=0)
    trajectories.append(
        (observation_history, np.stack(actions), np.stack(rewards)))

  return trajectories
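
The action-selection branch above is the part that varies across these examples. Here is a small self-contained sketch of the three sampling modes operating on a single vector of log-probabilities (assumed, as in the code above, to come from a LogSoftmax output):

import numpy as onp

def select_action(log_probs, policy="greedy", epsilon=0.1):
  """Pick an action index from an (A,)-shaped vector of log-probabilities."""
  if policy == "greedy":
    return int(onp.argmax(log_probs))
  if policy == "epsilon-greedy":
    # With probability epsilon explore uniformly, otherwise act greedily.
    if onp.random.random() < epsilon:
      return int(onp.random.randint(0, len(log_probs)))
    return int(onp.argmax(log_probs))
  if policy == "categorical-sampling":
    # Exponentiate to recover probabilities before sampling.
    probs = onp.exp(log_probs)
    return int(onp.argmax(onp.random.multinomial(1, probs)))
  raise ValueError("Unknown policy: %s" % policy)

# Example: sample from a uniform two-action policy.
log_probs = onp.log(onp.array([0.5, 0.5]))
print(select_action(log_probs, policy="categorical-sampling"))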