Example #1
def learn_with_VI(P, nS, nA, terminal_states):
    V = value_iteration(P, nS, nA, terminal_states)
    policy = np.zeros((nS)).astype(int)

    for state in range(nS):
        opt_reward = 0
        opt_action = 0
        for action in range(nA):
            reward = 0
            for i in range(len(P[state][action])):
                # print("possible next state %d, prob %g, V %g, E(V) %g" % (P[state][action][i][1], P[state][action][i][0], V[P[state][action][i][1]], P[state][action][i][0] * V[P[state][action][i][1]]))
                reward += P[state][action][i][0] * V[P[state][action][i][1]]
                # print("state %d, action %d, reward %g" % (state, action, reward))
            if reward > opt_reward:
                opt_reward = reward
                opt_action = action
        policy[state] = opt_action

    return policy
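The snippet above assumes a value_iteration(P, nS, nA, terminal_states) helper that returns the converged optimal state values, which is not shown on this page. A minimal sketch of what it might look like, assuming P[state][action] is a list of (prob, next_state, reward, done) tuples as in Gym's discrete environments, and with gamma, tol, and max_iteration picked as free defaults:

import numpy as np

def value_iteration(P, nS, nA, terminal_states, gamma=0.95, tol=1e-3, max_iteration=1000):
    """Bellman-backup sketch: return converged optimal state values for the model P."""
    V = np.zeros(nS)
    for _ in range(max_iteration):
        V_new = np.zeros(nS)
        for s in range(nS):
            if s in terminal_states:
                continue  # terminal states keep a value of 0
            # Q(s, a) = sum over outcomes of prob * (reward + gamma * V[next_state])
            q = [sum(prob * (reward + gamma * V[ns]) for prob, ns, reward, done in P[s][a])
                 for a in range(nA)]
            V_new[s] = max(q)
        if np.max(np.abs(V_new - V)) < tol:
            return V_new
        V = V_new
    return V

Example #1 then extracts a greedy policy from these values by weighting each successor's value with its transition probability.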
Example #2
def learn_with_mdp_model(env,
                         num_episodes=5000,
                         gamma=0.95,
                         e=0.8,
                         decay_rate=0.999):
    """Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new
    policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number
    of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for
    going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to
    an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA)
    to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to compute Q function for. Must have nS, nA, and P as
      attributes.
    num_episodes: int
      Number of episodes of training.
    gamma: float
      Discount factor. Number in range [0, 1)
    e: float
      Epsilon value used in the epsilon-greedy method.
    decay_rate: float
      Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
      An array of shape [env.nS] representing the action to take at a given state.
    """

    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    ############################
    policy = np.zeros(env.nS, dtype=int)
    rs = []
    for ep in range(num_episodes):
        terminal = False
        s = 0
        history = []
        while not terminal:
            #sample an action
            u = np.random.rand()
            if u > e:
                a = policy[s]
            else:
                a = np.random.randint(env.nA)

            #sample new state
            u = np.random.rand()
            for tup in env.P[s][a]:
                u = u - tup[0]
                if u <= 0:
                    t = tup
                    break
            history.append((s, a, t[1], t[2], t[3]))

            s = t[1]
            if t[3]:
                terminal = True
                rs.append(t[2])

        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        value, policy = value_iteration(P, env.nS, env.nA, gamma, 200, 1e-3)
        e = e * decay_rate
    np.save('model_based_rewards.npy', rs)
    return policy
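Examples #2 to #6 all rely on initialize_counts, initialize_rewards, and update_mdp_model_with_history, none of which are shown on this page. A minimal sketch consistent with the docstrings, assuming the counts and rewards arrays are shaped [nS, nA, nS] and that history entries are (state, action, reward, next_state, done) tuples, the ordering used in Examples #3 to #5 (Example #2 stores (state, action, next_state, reward, done) instead, so it would need the unpacking adjusted):

import numpy as np

def initialize_counts(nS, nA):
    # counts[s, a, s'] = how many times taking action a in state s has led to s'
    return np.zeros((nS, nA, nS))

def initialize_rewards(nS, nA):
    # rewards[s, a, s'] = running average of the reward observed on that transition
    return np.zeros((nS, nA, nS))

def update_mdp_model_with_history(counts, rewards, history):
    """Fold one episode of experience into the transition counts and running-average rewards."""
    for state, action, reward, next_state, done in history:
        n = counts[state, action, next_state]
        # update the running average before bumping the count
        rewards[state, action, next_state] = (rewards[state, action, next_state] * n + reward) / (n + 1)
        counts[state, action, next_state] = n + 1
    return counts, rewards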
Example #3
def learn_with_mdp_model(env, num_episodes=20000, gamma=0.95, e=0.8, decay_rate=0.999):
  """Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new 
    policy using epsilon-greedy exploration. 

    Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number
    of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for 
    going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history". 

    You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to 
    an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA) 
    to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to compute Q function for. Must have nS, nA, and P as
      attributes.
    num_episodes: int 
      Number of episodes of training.
    gamma: float
      Discount factor. Number in range [0, 1)
    e: float
      Epsilon value used in the epsilon-greedy method. 
    decay_rate: float
      Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
      An array of shape [env.nS] representing the action to take at a given state.
    """

  P = initialize_P(env.nS, env.nA)
  counts = initialize_counts(env.nS, env.nA)
  rewards = initialize_rewards(env.nS, env.nA)

  ############################
  # YOUR IMPLEMENTATION HERE #
  ############################

  average_rewards = []
  average_rewards.append(0.0)

  history = []
  policy = np.zeros(env.nS, dtype=int)
  epsilon = e

  for i in range(num_episodes):
    # _, policy = value_iteration(P, env.nS, env.nA)
    state = env.reset()
    history[:] = []
    done = False
    reward = 0

    while not done:
      eps_check = random.random()
      action = 0

      if eps_check < epsilon:
        action = random.randint(0, env.nA - 1)
      else:
        action = policy[state]

      new_state, reward, done, _ = env.step(action)

      new_history = [state, action, reward, new_state, done]
      history.append(new_history)
      state = new_state

    if i < 1000:
      prev_reward = average_rewards[i] * i
      new_reward = (prev_reward + reward) / (i + 1)
      average_rewards.append(new_reward)

    counts, rewards = update_mdp_model_with_history(counts, rewards, history)
    epsilon *= decay_rate
    P = counts_and_rewards_to_P(counts, rewards)
    _, policy = value_iteration(P, env.nS, env.nA)

  """plt.plot(average_rewards)
  plt.ylabel('Average reward')
  plt.xlabel('Episodes')
  plt.show()"""

  return policy
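The other prewritten helper every example calls, counts_and_rewards_to_P, is not shown either. A possible sketch, assuming the [nS, nA, nS] arrays above and the (prob, next_state, reward, done) tuple layout noted in the comment in Example #6; how the real helper sets the done flag and handles never-visited state-action pairs is an assumption here:

def counts_and_rewards_to_P(counts, rewards):
    """Normalize visit counts into transition probabilities in the Gym-style P format."""
    nS, nA, _ = counts.shape
    P = {s: {a: [] for a in range(nA)} for s in range(nS)}
    for s in range(nS):
        for a in range(nA):
            total = counts[s, a].sum()
            if total == 0:
                # never tried (s, a): fall back to a zero-reward self-loop so value iteration stays defined
                P[s][a] = [(1.0, s, 0.0, False)]
                continue
            for s_next in range(nS):
                if counts[s, a, s_next] > 0:
                    P[s][a].append((counts[s, a, s_next] / total, s_next,
                                    rewards[s, a, s_next], False))
    return P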
Example #4
def learn_with_mdp_model(env,
                         num_episodes=5000,
                         gamma=0.95,
                         e=0.8,
                         decay_rate=0.99):
    """Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new 
    policy using epsilon-greedy exploration. 
    
    Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number
    of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for 
    going from "state" with "action" to "next_state".
    
    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history". 
    
    You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to 
    an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA) 
    to get a policy.
    
    Parameters
    ----------
    env: gym.core.Environment
      Environment to compute Q function for. Must have nS, nA, and P as
      attributes.
    num_episodes: int 
      Number of episodes of training.
    gamma: float
      Discount factor. Number in range [0, 1)
    e: float
      Epsilon value used in the epsilon-greedy method. 
    decay_rate: float
      Rate at which epsilon falls. Number in range [0, 1)
    
    Returns
    -------
    policy: np.array
      An array of shape [env.nS] representing the action to take at a given state.
    """

    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    ############################
    policy = np.zeros(env.nS, dtype=int)
    for i in range(num_episodes):
        s = env.reset()
        done = False
        history = []
        while not done:
            # epsilon-greedy action selection
            if np.random.random() < e:
                a = np.random.randint(env.nA)
            else:
                a = policy[s]
            # update
            next_state, reward, done, _ = env.step(a)
            history.append([s, a, reward, next_state, done])
            s = next_state

        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        V, policy = value_iteration(P, env.nS, env.nA)

        if i % 10 == 0:
            e *= decay_rate

    return policy
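Every variant above expects a discrete Gym environment that exposes nS, nA, and P as attributes; in the original assignment this is a FrozenLake-style environment. A usage sketch, assuming an older Gym release where FrozenLake-v0 is still registered and its discrete base class exposes those attributes:

import gym

env = gym.make('FrozenLake-v0')
policy = learn_with_mdp_model(env, num_episodes=5000, gamma=0.95, e=0.8, decay_rate=0.99)

# quick sanity check: roll the learned policy out for 100 episodes
total = 0.0
for _ in range(100):
    state, done = env.reset(), False
    while not done:
        state, reward, done, _ = env.step(int(policy[state]))
    total += reward
print('average episode reward:', total / 100)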
Example #5
def learn_with_mdp_model(env,
                         num_episodes=5000,
                         gamma=0.95,
                         e=0.9,
                         decay_rate=0.996,
                         episode_scores=None):
    """
        Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new
        policy using epsilon-greedy exploration.

        Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number
        of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for
        going from "state" with "action" to "next_state".

        For a single episode, create a list called "history" with all the experience
        from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history".

        You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to
        an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA)
        to get a policy.

        Parameters
        ----------
        env: gym.core.Environment
            Environment to compute Q function for. Must have nS, nA, and P as
            attributes.
        num_episodes: int
            Number of episodes of training.
        gamma: float
            Discount factor. Number in range [0, 1)
        e: float
            Epsilon value used in the epsilon-greedy method.
        decay_rate: float
            Rate at which epsilon falls. Number in range [0, 1)

        Returns
        -------
        policy: np.array
            An array of shape [env.nS] representing the action to take at a given state.
    """

    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    for episode_idx in tqdm.tqdm(range(num_episodes)):
        # Choose a random starting state
        cur_state = env.reset()

        # Calculate a policy for this episode using what we know about the env so far (P)
        _, policy = value_iteration(P,
                                    env.nS,
                                    env.nA,
                                    gamma=gamma,
                                    max_iteration=100,
                                    verbose=False)

        # Start the episode. The episode ends when we reach a terminal state (i.e. "done is True")
        done = False
        history = []
        episode_reward = 0.0
        while not done:
            # Choose an action "epsilon-greedily" (where epsilon is the var "e")
            action = _choose_egreedy_action(env, cur_state, policy, e)

            # Use env's transition probs to "choose" next state
            next_state, reward, done, _ = env.step(action)

            # Record this step in the history
            history.append((cur_state, action, reward, next_state, done))

            # Move to next state
            cur_state = next_state

            episode_reward += reward

        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)

        # If we're running this as part of 5d, then record the scores
        if episode_scores is not None:
            # NOTE: Here I am simply recording 0 or 1 (the undiscounted score).
            episode_scores[episode_idx] = episode_reward

        # Decay the randomness of our action selection (i.e. increase greediness)
        e *= decay_rate

    _, policy = value_iteration(P,
                                env.nS,
                                env.nA,
                                gamma=gamma,
                                max_iteration=100,
                                verbose=False)
    ############################

    return policy
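This example factors action selection out into _choose_egreedy_action, which is not shown on this page. A minimal sketch that matches how the inline versions in the other examples behave:

import numpy as np

def _choose_egreedy_action(env, state, policy, e):
    """With probability e take a uniformly random action, otherwise follow the current policy."""
    if np.random.random() < e:
        return np.random.randint(env.nA)
    return int(policy[state])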
Example #6
def learn_with_mdp_model(env,
                         num_episodes=5000,
                         gamma=0.95,
                         e=0.8,
                         decay_rate=0.99):
    """Build a model of the environment and use value iteration to learn a policy. In the next episode, play with the new 
    policy using epsilon-greedy exploration. 

    Your model of the environment should be based on updating counts and rewards arrays. The counts array counts the number
    of times that "state" with "action" led to "next_state", and the rewards array is the running average of rewards for 
    going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert your counts and rewards arrays to
    an environment data structure P consistent with the Gym environment's one. You may then call on value_iteration(P, nS, nA) 
    to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
      Environment to compute Q function for. Must have nS, nA, and P as
      attributes.
    num_episodes: int 
      Number of episodes of training.
    gamma: float
      Discount factor. Number in range [0, 1)
    e: float
      Epsilon value used in the epsilon-greedy method. 
    decay_rate: float
      Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
      An array of shape [env.nS] representing the action to take at a given state.
    """

    P = initialize_P(env.nS, env.nA)
    policy = np.zeros((env.nS)).astype(int)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    for k in range(num_episodes):
        print(k)
        history = render(env, policy, e)  # play one epsilon-greedy episode and collect its experience
        counts, rewards = update_mdp_model_with_history(
            counts, rewards, history)
        P = counts_and_rewards_to_P(
            counts, rewards
        )  #P[state][action] is a list of (prob, next_state, reward, done) tuples.
        V, policy = value_iteration(P, env.nS, env.nA, gamma)
        e *= decay_rate  # decay epsilon each episode

    ############################

    return policy
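This last example delegates the whole rollout to a render(env, policy, e) helper that is not shown here. Whatever else it does (the name suggests it may also draw the environment), it has to return one episode of experience in the format update_mdp_model_with_history expects; a minimal sketch under that assumption:

import numpy as np

def render(env, policy, e):
    """Play one epsilon-greedy episode and return its experience as (s, a, r, s', done) tuples."""
    history = []
    state = env.reset()
    done = False
    while not done:
        if np.random.random() < e:
            action = np.random.randint(env.nA)
        else:
            action = int(policy[state])
        next_state, reward, done, _ = env.step(action)
        history.append((state, action, reward, next_state, done))
        state = next_state
    return history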