Example #1
def run(num_rollouts: int):
    # Collect 64 e-stop policy-value estimates, each based on hitting
    # probabilities estimated from `num_rollouts` rollouts. `env`,
    # `policy_actions`, and `estop_map_optimal_policy_value` come from the
    # enclosing scope.
    policy_values = []
    while len(policy_values) < 64:
        # Estimate hitting probabilities under the deterministic policy.
        estimated_hp = frozenlake.estimate_hitting_probabilities(
            env, frozenlake.deterministic_policy(env, policy_actions),
            num_rollouts)
        v = estop_map_optimal_policy_value(estimated_hp)
        if v is not None:
            policy_values.append(v)

    return policy_values
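
A minimal driver sketch for the helper above (an assumption, not part of the original: it presumes `env`, `policy_actions`, and `estop_map_optimal_policy_value` are already defined in the surrounding scope, as in the other examples):

import numpy as np

for num_rollouts in [10, 100, 1000]:
    values = run(num_rollouts)
    print(f"num_rollouts={num_rollouts}: "
          f"mean={np.mean(values):.4f}, std={np.std(values):.4f}")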
Example #2
def main():
    np.random.seed(0)

    def build_env(lake: frozenlake.Lake):
        # return frozenlake.FrozenLakeEnv(lake, infinite_time=True)
        return frozenlake.FrozenLakeWithEscapingEnv(
            lake, hole_retention_probability=0.99)

    lake_map = frozenlake.MAP_8x8
    policy_evaluation_frequency = 10
    gamma = 0.99
    num_random_seeds = 96

    results_dir = Path("results/frozenlake_qlearning")
    estop_results_dir = results_dir / "estop"
    full_results_dir = results_dir / "full"
    results_dir.mkdir(parents=True)
    estop_results_dir.mkdir()
    full_results_dir.mkdir()

    # Build the full environment and run value iteration to calculate the optimal
    # policy.
    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, _ = frozenlake.value_iteration(env,
                                                        gamma,
                                                        tolerance=1e-6)
    state_values = np.max(state_action_values, axis=-1)
    optimal_policy_reward = np.dot(state_values,
                                   env.initial_state_distribution)

    # Estimate hitting probabilities.
    optimal_policy = frozenlake.deterministic_policy(
        env, np.argmax(state_action_values, axis=-1))
    estimated_hp = frozenlake.estimate_hitting_probabilities(env,
                                                             optimal_policy,
                                                             num_rollouts=1000)
    estimated_hp2d = lake.reshape(estimated_hp)

    # Build e-stop environment.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    estop_map[estimated_hp2d <= threshold] = "E"

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)

    # Pickle the environment setup/metadata.
    pickle.dump(
        {
            "lake_map": lake_map,
            "policy_evaluation_frequency": policy_evaluation_frequency,
            "gamma": gamma,
            "num_random_seeds": num_random_seeds,
            "lake": lake,
            "env": env,
            "state_action_values": state_action_values,
            "state_values": state_values,
            "optimal_policy_reward": optimal_policy_reward,
            "optimal_policy": optimal_policy,
            "estimated_hp": estimated_hp,
            "estimated_hp2d": estimated_hp2d,
            "estop_map": estop_map,
            "percentile": percentile,
            "threshold": threshold,
            "estop_lake": estop_lake,
            "estop_env": estop_env,
        }, (results_dir / "metadata.pkl").open(mode="wb"))

    pool = Pool()

    # Run Q-learning on the full environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=full_results_dir,
            ), range(num_random_seeds)),
                       desc="full",
                       total=num_random_seeds):
        pass

    # Run Q-learning on the e-stop environment.
    for _ in tqdm.tqdm(pool.imap_unordered(
            functools.partial(
                q_learning_job,
                env=estop_env,
                gamma=gamma,
                policy_evaluation_frequency=policy_evaluation_frequency,
                folder=estop_results_dir,
            ), range(num_random_seeds)),
                       desc="estop",
                       total=num_random_seeds):
        pass
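
A small follow-up sketch (not in the original script) for reloading the metadata dumped above; it uses only the pickle/pathlib calls already present, and the keys match the dict written in main(). Note that unpickling the stored env objects requires the frozenlake module to be importable.

import pickle
from pathlib import Path

results_dir = Path("results/frozenlake_qlearning")
with (results_dir / "metadata.pkl").open(mode="rb") as f:
    metadata = pickle.load(f)

print("Optimal policy reward:", metadata["optimal_policy_reward"])
print("E-stop percentile/threshold:", metadata["percentile"], metadata["threshold"])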
Example #3
def main():
    # pylint: disable=too-many-statements
    np.random.seed(0)

    lake_map = frozenlake.MAP_8x8
    gamma = 0.99

    lake = frozenlake.Lake(lake_map)
    env = build_env(lake)
    state_action_values, policy_rewards_per_iter = frozenlake.value_iteration(
        env, gamma, tolerance=1e-6)
    policy_actions = np.argmax(state_action_values, axis=-1)
    state_values = np.max(state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(lake, state_values)
    # plt.title("FrozenLake-v0 environment")
    plt.tick_params(
        axis="both",
        which="both",
        bottom=False,
        top=False,
        left=False,
        right=False,
        labelbottom=False,
        labeltop=False,
        labelleft=False,
        labelright=False,
    )
    plt.tight_layout()
    plt.savefig("figs/value_function_full_env.pdf")

    # Show hitting probability map.
    policy_transitions = np.array([
        env.transitions[i, policy_actions[i], :]
        for i in range(lake.num_states)
    ])
    hp, esta = frozenlake.markov_chain_stats(env, policy_transitions)
    hp2d = lake.reshape(hp)

    plt.figure()
    viz.plot_heatmap(lake, hp)
    plt.title("Hitting probabilities")
    plt.savefig("figs/hitting_probabilities.pdf")

    # Show estimated hitting probability map.
    estimated_hp = frozenlake.estimate_hitting_probabilities(
        env,
        frozenlake.deterministic_policy(env, policy_actions),
        num_rollouts=1000)
    plt.figure()
    viz.plot_heatmap(lake, estimated_hp)
    plt.title("Estimated hitting probabilities")

    plt.figure()
    viz.plot_heatmap(lake, esta)
    plt.title("Expected number of states to completion")

    # Show optimal policy on top of hitting probabilities.
    plt.figure()
    im = plt.imshow(hp2d)
    for s, a in zip(lake.ij_states, policy_actions):
        i, j = s
        if a == 0:
            arrow = "←"
        elif a == 1:
            arrow = "↓"
        elif a == 2:
            arrow = "→"
        elif a == 3:
            arrow = "↑"
        else:
            raise ValueError(f"Unexpected action: {a}")

        im.axes.text(j, i, arrow, {
            "horizontalalignment": "center",
            "verticalalignment": "center"
        })
    plt.title("Optimal policy overlayed on hitting probabilities")
    plt.savefig("figs/optimal_policy.pdf")

    # Show value CDF.
    plt.figure()
    plt.hist(state_values, bins=100, histtype="step", cumulative=True)
    plt.xlabel("V(s)")
    plt.ylabel(f"Number of states (out of {lake.num_states})")
    plt.title("CDF of state values")
    plt.savefig("figs/value_function_cdf.pdf")

    #######

    # Build the e-stop map: mark states with low estimated hitting probability
    # as e-stop ("E") states.
    estop_map = np.copy(lake_map)
    percentile = 50
    threshold = np.percentile(estimated_hp, percentile)
    # Use <= rather than < because both the estimated hitting probabilities and
    # the threshold can be zero, in which case a strict inequality would leave
    # the map unchanged.
    estop_map[lake.reshape(estimated_hp) <= threshold] = "E"

    estop_lake = frozenlake.Lake(estop_map)
    estop_env = build_env(estop_lake)
    estop_state_action_values, estop_policy_rewards_per_iter = frozenlake.value_iteration(
        estop_env, gamma, tolerance=1e-6)
    estop_state_values = np.max(estop_state_action_values, axis=-1)

    # Show value function map.
    plt.figure()
    viz.plot_heatmap(estop_lake, estop_state_values)
    plt.title(f"E-stop map ({percentile}% of states removed)")
    plt.savefig("figs/estop_map.pdf")

    # Show policy rewards per iteration.
    # Each value iteration sweep costs roughly 4 * S * A * S FLOPs, one
    # S*A*S-sized operation for each of:
    #   * multiplying transitions with state_values
    #   * multiplying by gamma
    #   * adding expected_rewards
    #   * max'ing over state_action_values

    plt.figure()
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(lake_map)**2)) *
        np.arange(len(policy_rewards_per_iter)), policy_rewards_per_iter)
    plt.plot(
        4 * (frozenlake.NUM_ACTIONS *
             (frozenlake.num_mdp_states(estop_map)**2)) *
        np.arange(len(estop_policy_rewards_per_iter)),
        estop_policy_rewards_per_iter)
    plt.xlabel("FLOPS")
    plt.ylabel("Policy reward")
    plt.legend(["Full MDP", "E-stop MDP"])
    plt.title("Convergence comparison")
    plt.savefig("figs/convergence_comparison.pdf")

    print(
        f"Exact solution, policy value: {np.dot(env.initial_state_distribution, state_values)}"
    )
    print(
        f"E-stop solution, policy value: {np.dot(env.initial_state_distribution, estop_state_values)}"
    )

    plt.show()
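
As a quick sanity check of the per-sweep cost used in the convergence plot, here is a worked example of the 4 * S * A * S FLOP estimate (a sketch; the exact state count returned by frozenlake.num_mdp_states may differ from the raw 8x8 grid, so treat the numbers as assumptions):

num_states = 64                # assumed state count for the 8x8 map
num_actions = 4                # frozenlake.NUM_ACTIONS
flops_per_sweep = 4 * num_actions * num_states**2
print(flops_per_sweep)         # 65536 FLOPs per value iteration sweep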
Example #4
def main():
  np.random.seed(0)

  # lake_map = frozenlake.MAP_CORRIDOR_4x1
  lake_map = frozenlake.MAP_8x8
  policy_evaluation_frequency = 100
  gamma = 0.99

  lake = frozenlake.Lake(lake_map)
  env = build_env(lake)
  print(
      f"Optimal policy reward on full env: {frozenlake.optimal_policy_reward(env, gamma)}"
  )

  # Estimate hitting probabilities.
  state_action_values, _ = frozenlake.value_iteration(
      env,
      gamma,
      tolerance=1e-6,
  )
  optimal_policy = frozenlake.deterministic_policy(
      env, np.argmax(state_action_values, axis=-1))
  estimated_hp = frozenlake.estimate_hitting_probabilities(
      env,
      optimal_policy,
      num_rollouts=1000,
  )
  estimated_hp2d = lake.reshape(estimated_hp)

  # Build e-stop environment.
  estop_map = np.copy(lake_map)
  percentile = 50
  threshold = np.percentile(estimated_hp, percentile)
  estop_map[estimated_hp2d <= threshold] = "E"

  estop_lake = frozenlake.Lake(estop_map)
  estop_env = build_env(estop_lake)
  print(
      f"Optimal policy reward on e-stop: {frozenlake.optimal_policy_reward(estop_env, gamma)}"
  )

  plt.figure()
  viz.plot_heatmap(estop_lake, np.zeros(estop_lake.num_states))
  plt.title("E-stop map")

  plt.figure()
  viz.plot_heatmap(lake, np.zeros(lake.num_states))
  plt.title("Full map")

  plt.show()

  plt.figure()
  for seed in range(1):
    np.random.seed(seed)

    x0 = 1e-2 * np.random.randn(estop_env.lake.num_states,
                                frozenlake.NUM_ACTIONS)
    optimizer = optimizers.Adam(x0, learning_rate=1e-3)
    # optimizer = reinforce.Momentum(x0, learning_rate=1e-2, mass=0.0)
    states_seen, policy_rewards = reinforce.run_reinforce(
        estop_env,
        gamma,
        optimizer,
        num_episodes=50000,
        policy_evaluation_frequency=policy_evaluation_frequency)

    plt.plot(states_seen, policy_rewards)

  plt.axhline(frozenlake.optimal_policy_reward(env, gamma),
              color="grey",
              linestyle="--")
  plt.axhline(frozenlake.optimal_policy_reward(estop_env, gamma),
              color="grey",
              linestyle="--")
  plt.title(f"Learning rate={optimizer.learning_rate}")
  plt.show()
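
For reference against the e-stop curve, a hedged sketch of the same REINFORCE loop on the full environment, reusing the calls exactly as they appear above (it assumes env, gamma, and policy_evaluation_frequency are still in scope, and that env exposes .lake the same way estop_env does):

plt.figure()
x0 = 1e-2 * np.random.randn(env.lake.num_states, frozenlake.NUM_ACTIONS)
optimizer = optimizers.Adam(x0, learning_rate=1e-3)
states_seen, policy_rewards = reinforce.run_reinforce(
    env,
    gamma,
    optimizer,
    num_episodes=50000,
    policy_evaluation_frequency=policy_evaluation_frequency)
plt.plot(states_seen, policy_rewards)
plt.axhline(frozenlake.optimal_policy_reward(env, gamma),
            color="grey",
            linestyle="--")
plt.show()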