def learn_with_VI(P, nS, nA, terminal_states):
    V = value_iteration(P, nS, nA, terminal_states)
    policy = np.zeros((nS)).astype(int)
    for state in range(nS):
        opt_reward = 0
        opt_action = 0
        for action in range(nA):
            reward = 0
            # Expected value of the next state under this action.
            for i in range(len(P[state][action])):
                # print("possible next state %d, prob %g, V %g, E(V) %g" % (P[state][action][i][1], P[state][action][i][0], V[P[state][action][i][1]], P[state][action][i][0] * V[P[state][action][i][1]]))
                reward += P[state][action][i][0] * V[P[state][action][i][1]]
            # print("state %d, action %d, reward %g" % (state, action, reward))
            if reward > opt_reward:
                opt_reward = reward
                opt_action = action
        policy[state] = opt_action
    return policy
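# The extraction above backs up only the expected value of the next state. Below
# is a minimal sketch of the same greedy extraction with the one-step reward and
# a discount folded into the backup, assuming Gym-style
# (prob, next_state, reward, done) tuples in P. The name extract_greedy_policy
# and the gamma default are illustrative, not part of the original code.
import numpy as np

def extract_greedy_policy(P, V, nS, nA, gamma=0.95):
    policy = np.zeros(nS, dtype=int)
    for state in range(nS):
        q_values = np.zeros(nA)
        for action in range(nA):
            for prob, next_state, reward, done in P[state][action]:
                # Terminal transitions contribute no discounted future value.
                q_values[action] += prob * (reward + gamma * V[next_state] * (not done))
        policy[state] = np.argmax(q_values)
    return policy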
def learn_with_mdp_model(env, num_episodes=5000, gamma=0.95, e=0.8, decay_rate=0.999):
    """Build a model of the environment and use value iteration to learn a policy.
    In the next episode, play with the new policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards
    arrays. The counts array counts the number of times that "state" with "action"
    led to "next_state", and the rewards array is the running average of rewards
    for going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the
    function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert
    your counts and rewards arrays to an environment data structure P consistent
    with the Gym environment's one.

    You may then call on value_iteration(P, nS, nA) to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must have nS, nA, and P as
        attributes.
    num_episodes: int
        Number of episodes of training.
    gamma: float
        Discount factor. Number in range [0, 1)
    e: float
        Epsilon value used in the epsilon-greedy method.
    decay_rate: float
        Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
        An array of shape [env.nS] representing the action to take at a given
        state.
    """
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    ############################
    policy = np.zeros(env.nS, dtype=int)
    rs = []
    for ep in range(num_episodes):
        terminal = False
        s = 0  # every episode starts in state 0; transitions are sampled from env.P directly
        history = []
        while not terminal:
            # Sample an action epsilon-greedily.
            u = np.random.rand()
            if u > e:
                a = policy[s]
            else:
                a = np.random.randint(env.nA)
            # Sample the next state by inverse-CDF sampling over env's transition probs.
            u = np.random.rand()
            for tup in env.P[s][a]:
                u = u - tup[0]
                if u <= 0:
                    t = tup
                    break
            history.append((s, a, t[1], t[2], t[3]))
            s = t[1]
            if t[3]:
                terminal = True
                rs.append(t[2])
        # Update the model from this episode's experience, then re-plan.
        counts, rewards = update_mdp_model_with_history(counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        value, policy = value_iteration(P, env.nS, env.nA, gamma, 200, 1e-3)
        e = e * decay_rate
    np.save('model_based_rewards.npy', rs)
    return policy
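# update_mdp_model_with_history is prewritten in the assignment, so the version
# below is only a plausible reconstruction: it assumes counts and rewards are
# [nS, nA, nS] arrays and that history entries follow the
# (state, action, next_state, reward, done) order used by the implementation
# directly above. Note the other implementations in this file record
# (state, action, reward, next_state, done) instead; each pairs with its own
# copy of the helper.
def update_mdp_model_with_history(counts, rewards, history):
    for state, action, next_state, reward, _ in history:
        n = counts[state][action][next_state]
        # Running average: new_mean = (old_mean * n + reward) / (n + 1).
        rewards[state][action][next_state] = (rewards[state][action][next_state] * n + reward) / (n + 1)
        counts[state][action][next_state] = n + 1
    return counts, rewards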
import random

def learn_with_mdp_model(env, num_episodes=20000, gamma=0.95, e=0.8, decay_rate=0.999):
    """Build a model of the environment and use value iteration to learn a policy.
    In the next episode, play with the new policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards
    arrays. The counts array counts the number of times that "state" with "action"
    led to "next_state", and the rewards array is the running average of rewards
    for going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the
    function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert
    your counts and rewards arrays to an environment data structure P consistent
    with the Gym environment's one.

    You may then call on value_iteration(P, nS, nA) to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must have nS, nA, and P as
        attributes.
    num_episodes: int
        Number of episodes of training.
    gamma: float
        Discount factor. Number in range [0, 1)
    e: float
        Epsilon value used in the epsilon-greedy method.
    decay_rate: float
        Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
        An array of shape [env.nS] representing the action to take at a given
        state.
    """
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    ############################
    average_rewards = [0.0]
    history = []
    policy = np.zeros(env.nS, dtype=int)
    epsilon = e
    for i in range(num_episodes):
        state = env.reset()
        history[:] = []
        done = False
        reward = 0
        while not done:
            # Epsilon-greedy action selection.
            if random.random() < epsilon:
                action = random.randint(0, env.nA - 1)
            else:
                action = policy[state]
            new_state, reward, done, _ = env.step(action)
            history.append([state, action, reward, new_state, done])
            state = new_state
        # Track the running average of final episode rewards for the first 1000 episodes.
        if i < 1000:
            prev_reward = average_rewards[i] * i
            new_reward = (prev_reward + reward) / (i + 1)
            average_rewards.append(new_reward)
        # Update the model from this episode's experience, then re-plan.
        counts, rewards = update_mdp_model_with_history(counts, rewards, history)
        epsilon *= decay_rate
        P = counts_and_rewards_to_P(counts, rewards)
        _, policy = value_iteration(P, env.nS, env.nA)
    # plt.plot(average_rewards)
    # plt.ylabel('Average reward')
    # plt.xlabel('Episodes')
    # plt.show()
    return policy
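# counts_and_rewards_to_P is also prewritten; the sketch below shows what it
# plausibly does, assuming [nS, nA, nS] numpy counts/rewards arrays and a
# Gym-style P of (prob, next_state, reward, done) tuples. The done flag is left
# False and the uniform self-loop fallback for unvisited (s, a) pairs is a
# design assumption, not taken from the assignment.
import numpy as np

def counts_and_rewards_to_P(counts, rewards):
    nS, nA = counts.shape[0], counts.shape[1]
    P = {s: {a: [] for a in range(nA)} for s in range(nS)}
    for s in range(nS):
        for a in range(nA):
            total = counts[s][a].sum()
            if total == 0:
                # Unvisited (s, a): fall back to a zero-reward self-loop.
                P[s][a] = [(1.0, s, 0.0, False)]
                continue
            for s_next in range(nS):
                if counts[s][a][s_next] > 0:
                    # Empirical transition probability and average reward.
                    prob = counts[s][a][s_next] / total
                    P[s][a].append((prob, s_next, rewards[s][a][s_next], False))
    return P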
def learn_with_mdp_model(env, num_episodes=5000, gamma=0.95, e=0.8, decay_rate=0.99):
    """Build a model of the environment and use value iteration to learn a policy.
    In the next episode, play with the new policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards
    arrays. The counts array counts the number of times that "state" with "action"
    led to "next_state", and the rewards array is the running average of rewards
    for going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the
    function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert
    your counts and rewards arrays to an environment data structure P consistent
    with the Gym environment's one.

    You may then call on value_iteration(P, nS, nA) to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must have nS, nA, and P as
        attributes.
    num_episodes: int
        Number of episodes of training.
    gamma: float
        Discount factor. Number in range [0, 1)
    e: float
        Epsilon value used in the epsilon-greedy method.
    decay_rate: float
        Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
        An array of shape [env.nS] representing the action to take at a given
        state.
    """
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    ############################
    policy = np.zeros(env.nS, dtype=int)
    for i in range(num_episodes):
        s = env.reset()
        done = False
        history = []
        while not done:
            # Epsilon-greedy action selection.
            if np.random.random() < e:
                a = np.random.randint(env.nA)
            else:
                a = policy[s]
            # Step the environment and record the transition.
            next_state, reward, done, _ = env.step(a)
            history.append([s, a, reward, next_state, done])
            s = next_state
        # Update the model from this episode's experience, then re-plan.
        counts, rewards = update_mdp_model_with_history(counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        V, policy = value_iteration(P, env.nS, env.nA)
        # Decay epsilon every 10 episodes.
        if i % 10 == 0:
            e *= decay_rate
    return policy
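# A hedged usage sketch for the function above, assuming the old four-tuple Gym
# step API used throughout these implementations and a FrozenLake registration
# whose env exposes nS, nA, and P as the docstring requires:
import gym

env = gym.make('FrozenLake-v0')
policy = learn_with_mdp_model(env)

# Evaluate the learned policy over 100 greedy rollouts.
wins = 0
for _ in range(100):
    s = env.reset()
    done = False
    while not done:
        s, r, done, _ = env.step(int(policy[s]))
    wins += int(r > 0)
print('success rate: %.2f' % (wins / 100.0))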
import tqdm

def learn_with_mdp_model(env, num_episodes=5000, gamma=0.95, e=0.9, decay_rate=0.996, episode_scores=None):
    """Build a model of the environment and use value iteration to learn a policy.
    In the next episode, play with the new policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards
    arrays. The counts array counts the number of times that "state" with "action"
    led to "next_state", and the rewards array is the running average of rewards
    for going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the
    function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert
    your counts and rewards arrays to an environment data structure P consistent
    with the Gym environment's one.

    You may then call on value_iteration(P, nS, nA) to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must have nS, nA, and P as
        attributes.
    num_episodes: int
        Number of episodes of training.
    gamma: float
        Discount factor. Number in range [0, 1)
    e: float
        Epsilon value used in the epsilon-greedy method.
    decay_rate: float
        Rate at which epsilon falls. Number in range [0, 1)
    episode_scores: np.array or None
        Optional array in which to record each episode's undiscounted score.

    Returns
    -------
    policy: np.array
        An array of shape [env.nS] representing the action to take at a given
        state.
    """
    P = initialize_P(env.nS, env.nA)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    for episode_idx in tqdm.tqdm(range(num_episodes)):
        # Choose a random starting state
        cur_state = env.reset()
        # Calculate a policy for this episode using what we know about the env so far (P)
        _, policy = value_iteration(P, env.nS, env.nA, gamma=gamma, max_iteration=100, verbose=False)
        # Start the episode. The episode ends when we reach a terminal state (i.e. "done is True")
        done = False
        history = []
        episode_reward = 0.0
        while not done:
            # Choose an action "epsilon-greedily" (where epsilon is the var "e")
            action = _choose_egreedy_action(env, cur_state, policy, e)
            # Use env's transition probs to "choose" next state
            next_state, reward, done, _ = env.step(action)
            # Record this step in the history
            history.append((cur_state, action, reward, next_state, done))
            # Move to next state
            cur_state = next_state
            episode_reward += reward
        counts, rewards = update_mdp_model_with_history(counts, rewards, history)
        P = counts_and_rewards_to_P(counts, rewards)
        # If we're running this as part of 5d, then record the scores
        if episode_scores is not None:
            # NOTE: Here I am simply recording 0 or 1 (the undiscounted score).
            episode_scores[episode_idx] = episode_reward
        # Decay the randomness of our action selection (i.e. increase greediness)
        e *= decay_rate
    _, policy = value_iteration(P, env.nS, env.nA, gamma=gamma, max_iteration=100, verbose=False)
    ############################
    return policy
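# _choose_egreedy_action is referenced above but defined elsewhere in that
# codebase; a minimal sketch consistent with how it is called:
import numpy as np

def _choose_egreedy_action(env, state, policy, e):
    # With probability e take a uniformly random action; otherwise follow the policy.
    if np.random.random() < e:
        return np.random.randint(env.nA)
    return int(policy[state])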
def learn_with_mdp_model(env, num_episodes=5000, gamma=0.95, e=0.8, decay_rate=0.99):
    """Build a model of the environment and use value iteration to learn a policy.
    In the next episode, play with the new policy using epsilon-greedy exploration.

    Your model of the environment should be based on updating counts and rewards
    arrays. The counts array counts the number of times that "state" with "action"
    led to "next_state", and the rewards array is the running average of rewards
    for going from "state" with "action" to "next_state".

    For a single episode, create a list called "history" with all the experience
    from that episode, then update the "counts" and "rewards" arrays using the
    function "update_mdp_model_with_history".

    You may then call the prewritten function "counts_and_rewards_to_P" to convert
    your counts and rewards arrays to an environment data structure P consistent
    with the Gym environment's one.

    You may then call on value_iteration(P, nS, nA) to get a policy.

    Parameters
    ----------
    env: gym.core.Environment
        Environment to compute Q function for. Must have nS, nA, and P as
        attributes.
    num_episodes: int
        Number of episodes of training.
    gamma: float
        Discount factor. Number in range [0, 1)
    e: float
        Epsilon value used in the epsilon-greedy method.
    decay_rate: float
        Rate at which epsilon falls. Number in range [0, 1)

    Returns
    -------
    policy: np.array
        An array of shape [env.nS] representing the action to take at a given
        state.
    """
    P = initialize_P(env.nS, env.nA)
    policy = np.zeros((env.nS)).astype(int)
    counts = initialize_counts(env.nS, env.nA)
    rewards = initialize_rewards(env.nS, env.nA)

    ############################
    # YOUR IMPLEMENTATION HERE #
    for k in range(num_episodes):
        print(k)
        # Roll out one episode with the current epsilon-greedy policy.
        history = render(env, policy, e)
        counts, rewards = update_mdp_model_with_history(counts, rewards, history)
        # P[state][action] is a list of (prob, next_state, reward, done) tuples.
        P = counts_and_rewards_to_P(counts, rewards)
        V, policy = value_iteration(P, env.nS, env.nA, gamma)
        e *= decay_rate
    ############################
    return policy
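# render(env, policy, e) is this implementation's rollout helper; despite the
# name, it collects experience rather than drawing anything. A sketch under the
# assumption that it returns (state, action, reward, next_state, done) tuples,
# the format the other implementations above feed to
# update_mdp_model_with_history:
import numpy as np

def render(env, policy, e):
    history = []
    s = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection.
        if np.random.random() < e:
            a = np.random.randint(env.nA)
        else:
            a = int(policy[s])
        next_state, reward, done, _ = env.step(a)
        history.append((s, a, reward, next_state, done))
        s = next_state
    return history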