def plot(self, stats):
    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in self.Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plotting.plot_value_function(V, title="Final Value Function")
def main():
    env = BlackjackEnv()

    V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
    plotting.plot_value_function(V_10k, title="10,000 Steps")

    V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    plotting.plot_value_function(V_500k, title="500,000 Steps")
def main():
    # V_10k, stats = mc_prediction(sample_policy, env, num_episodes=100)
    # plotting.plot_value_function(V_10k, title="10,000 Steps")

    V_500k, stats = mc_prediction(sample_policy, env, num_episodes=500000)
    plotting.plot_value_function(V_500k, title="500,000 Steps")
def find_optimal_policy(num_episodes):
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=num_episodes, epsilon=0.1)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plotting.plot_value_function(
        V, title="Optimal Value Function ({} episodes)".format(num_episodes))
def main():
    Q, behaviour_policy = mc_control_importance_sampling(env, num_episodes=50000)

    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
def main():
    # Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=300000, epsilon=0.4)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plotting.plot_value_function(
        V, title="On-Policy MC Control, 300,000 episodes - Value Function")
def main():
    # use a uniformly random behaviour policy
    random_policy = create_random_policy(env.action_space.n)
    Q, policy = mc_control_random_off_policy(
        env, num_episodes=300000, behavior_policy=random_policy)

    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
def main():
    Q, behaviour_policy = mc_control_importance_sampling(env, num_episodes=500000)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
def main():
    # use a uniformly random behaviour policy
    random_policy = create_random_policy(env.action_space.n)
    Q, policy, Advantage_Function = mc_control_importance_sampling(
        env, num_episodes=5000, behavior_policy=random_policy)

    # For plotting: Create a state-value surface by picking the best
    # (greedy) entry of the advantage function at each state
    V = defaultdict(float)
    for state, action_values in Advantage_Function.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
        obs = newObs
    return episode


def compute_return(episode, begin, discount_factor):
    # Discounted return of the episode starting at index `begin`;
    # each transition is an (observation, action, reward) tuple.
    output = 0
    for i, transition in enumerate(episode[begin:]):
        # print(transition[2] * (discount_factor**i))
        output += transition[2] * (discount_factor**i)
    return output


def plot_results(evolution, index):
    fig = pl.figure()
    pl.plot(evolution)
    pl.title('Variations of the computed means')
    pl.xlabel('iteration')
    pl.ylabel('L1 difference of variations')
    pl.legend()
    pl.savefig(index + '_evolutionMean.png')
    pl.show()


V_10k, evolution = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")
plot_results(evolution, '10K')

V_500k, evolution = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")
plot_results(evolution, '500K')
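# A minimal usage sketch for compute_return above. The episode contents are
# made up for illustration: transitions are (observation, action, reward)
# tuples, so with discount_factor=0.9 the return from index 0 is
# 0 + 0.9 * 1 = 0.9.
example_episode = [((14, 10, False), 1, 0), ((19, 10, False), 0, 1)]
assert compute_return(example_episode, begin=0, discount_factor=1.0) == 1.0
assert abs(compute_return(example_episode, begin=0, discount_factor=0.9) - 0.9) < 1e-12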
            # find its first occurrence
            first_occurrence_idx = next(ith for ith, x in enumerate(episode)
                                        if tuple(x[0:2]) == state_action)
            # sum up all rewards since the first occurrence
            G = sum([
                observation[2] * discount_factor**ith
                for ith, observation in enumerate(episode[first_occurrence_idx:])
            ])
            # record total returns of each (state, action) pair and their first-visit numbers
            returns_sum[state_action] += G
            returns_count[state_action] += 1
            # update Q(s,a); the epsilon-greedy policy is updated implicitly
            # because it holds a reference to Q
            Q[state_action[0]][state_action[1]] = \
                returns_sum[state_action] / returns_count[state_action]
            # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    return Q, policy


Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, actions in Q.items():
    print(state, actions)
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")
def plot(Q):
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[tuple([state[1], state[0]])] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
            # Not waiting to generate one full episode before the TD update
            for s in States:
                V[s] = V[s] + alfa * delta * Z[s]
                Z[s] = lmbd * discount * Z[s]
            # Move to the next state
            now_state = next_state
            if done:
                terminate = True
    return V


"""The block of code below evaluates a policy and plots the resulting V-values."""
print("TEMPORAL-DIFFERENCE-LAMBDA EVALUATE POLICY_0")
V = tdlambda_prediction(policy_0, n_episodes=10000)

# Delete states with a player score below 12 to match the plot in the example.
# Because we call V[next_state] at the end, some keys must be removed
# to produce the same plot.
new_V = defaultdict(float)
for key, data in V.items():
    if key[0] >= 12 and key[1] <= 11:
        new_V[key] = data

# Using the plotting library from Denny Britz's repo
plotting.plot_value_function(new_V, title="Policy_0 Evaluation")
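# policy_0 is referenced above but not defined in this excerpt. A minimal
# sketch, assuming it is the same stick-on-20 threshold policy used by the
# other snippets in this section; the name, signature, and return convention
# (0 = stick, 1 = hit) are assumptions.
def policy_0(state):
    player_score, dealer_score, usable_ace = state
    return 0 if player_score >= 20 else 1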
def mc_control_epsilon_greedy(env, num_episodes, discount_factor=1.0, epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.

    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.

    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and
        returns action probabilities.
    """
    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for episode in range(num_episodes):
        if episode % 1000 == 0:
            print("Episode: " + str(episode) + "/" + str(num_episodes))
        if (episode + 1) % 100000 == 0:
            # Plot an intermediate value function every 100,000 episodes
            V = defaultdict(float)
            for state, actions in Q.items():
                V[state] = np.max(actions)
            plotting.plot_value_function(V, title="Optimal Value Function")

        # Generate an episode following the current epsilon-greedy policy
        states_visited = []
        state = env.reset()
        while True:
            action_values = policy(state)
            action = np.random.choice(range(env.action_space.n), p=action_values)
            next_state, reward, done, info = env.step(action)
            states_visited.append((state, action, reward))
            if done:
                break
            state = next_state

        # Work backwards through the episode, accumulating the return G
        # and updating Q on the first visit of each (state, action) pair.
        G = 0
        for t in range(len(states_visited) - 1, -1, -1):
            state, action, reward = states_visited[t]
            G = discount_factor * G + reward
            sa_pair = (state, action)
            if not any((s, a) == sa_pair for s, a, _ in states_visited[:t]):
                returns_sum[sa_pair] += G
                returns_count[sa_pair] += 1
                Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

        # The policy is updated implicitly because it holds a reference to Q
        # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    return Q, policy
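# make_epsilon_greedy_policy is called above but not defined in this excerpt.
# A minimal sketch of the factory it is assumed to be, consistent with how the
# snippet uses it (a function mapping an observation to a vector of action
# probabilities); the exact implementation is an assumption.
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # Explore uniformly with probability epsilon, otherwise act greedily w.r.t. Q
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A
    return policy_fn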
    pl.legend()
    pl.savefig(index + '_evolutionMean.png')
    pl.show()


# on 500K iterations
Q500K, policy, evolution500K = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V500K = defaultdict(float)
for state, actions in Q500K.items():
    action_value = np.max(actions)
    V500K[state] = action_value
plotting.plot_value_function(V500K, title="Optimal Value Function_500K")
plot_results(evolution500K, 'evolution_500K')
plot_policy(Q500K, policy, 'policy_500K')

# on 1M iterations
Q1M, policy, evolution1M = mc_control_epsilon_greedy(env, num_episodes=1000000, epsilon=0.1)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V1M = defaultdict(float)
for state, actions in Q1M.items():
    action_value = np.max(actions)
    V1M[state] = action_value
plotting.plot_value_function(V1M, title="Optimal Value Function_1M")
        V[state] += (episode_rewards[state] - V[state]) / returns_num[state]

    return V


# In[4]:


def sample_policy(observation):
    """
    A policy that sticks if the player score is >= 20 and hits otherwise.
    """
    player_score, dealer_score, usable_ace = observation
    return np.array([1.0, 0.0]) if player_score >= 20 else np.array([0.0, 1.0])


# In[5]:


V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")


# In[18]:


V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")
def plot_value_function(self):
    assert self.policy is not None
    plot_value_function(self.q)
            action = np.random.choice(np.arange(len(A)), p=A)
            next_state, reward, done, info = env.step(action)
            episode.append((observation, action, reward))
            if done:
                break
            observation = next_state

        # Collect the unique (state, action) pairs visited in this episode
        temp_set = set()
        for w in episode:
            temp_set.add((w[0], w[1]))

        for state, action in temp_set:
            ind = 0
            for i, w in enumerate(episode):
                if w[0] == state and w[1] == action:
                    ind = i
            returns_sum[(state, action)] += sum(
                [w[2] * discount_factor**i for i, w in enumerate(episode[ind:])])
            returns_counts[(state, action)] += 1
            Q[state][action] = returns_sum[(state, action)] / returns_counts[(state, action)]

    return Q


if __name__ == "__main__":
    # V_over500k = monte_carlo_prediction(env, num_eps=500000)
    Q_over500k = monte_corlo_control(env, num_eps=500000)
    V_over500k = defaultdict(float)
    for state in Q_over500k.keys():
        V_over500k[state] = np.max(Q_over500k[state])
    pprint(V_over500k)
    plotting.plot_value_function(V_over500k, title="500,000 Steps")
        for t in range(len(episode))[::-1]:
            state, action, reward = episode[t]
            # Update the total reward since step t
            G = discount_factor * G + reward
            # Update the weighted importance sampling formula denominator
            C[state][action] += W
            # Update the action-value function using the incremental update formula.
            # This also improves our target policy, which holds a reference to Q.
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            # If the action taken by the behavior policy is not the action
            # taken by the target policy, the probability will be 0 and we can break
            if action != np.argmax(target_policy(state)):
                break
            W = W * 1. / behavior_policy(state)[action]

    return Q, target_policy


if __name__ == '__main__':
    random_policy = create_random_policy(env.action_space.n)
    Q, policy = mc_control_importance_sampling(
        env, num_episodes=500000, behavior_policy=random_policy)

    # For plotting, create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title='Optimal Value Function')
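# create_random_policy is used above but not defined in this excerpt, and the
# target policy appears only through target_policy(state). A minimal sketch of
# both factories, assuming each returns a function mapping a state to a vector
# of action probabilities; create_greedy_policy is a hypothetical name for the
# target-policy factory.
def create_random_policy(nA):
    # Behaviour policy: a uniform distribution over all nA actions,
    # independent of the observed state.
    A = np.ones(nA, dtype=float) / nA

    def policy_fn(observation):
        return A
    return policy_fn


def create_greedy_policy(Q):
    # Target policy: put probability 1 on the highest-valued action in Q.
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        A[np.argmax(Q[state])] = 1.0
        return A
    return policy_fn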
                for j, e in enumerate(episode[i:])
            ])
            returns_sum[exp[0]][exp[1]] += G
            returns_count[exp[0]][exp[1]] += 1
            Q[exp[0]][exp[1]] = returns_sum[exp[0]][exp[1]] / returns_count[exp[0]][exp[1]]
    ############################ Implement your code end ############################
    return Q, policy


Q_first, policy_first = mc(env, num_episodes=500000, epsilon=0.1)
Q_every, policy_every = mc(env, num_episodes=500000, epsilon=0.1, first_visit=0)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V_first = defaultdict(float)
V_every = defaultdict(float)
for state, actions in Q_first.items():
    action_value = np.max(actions)
    V_first[state] = action_value
for state, actions in Q_every.items():
    action_value = np.max(actions)
    V_every[state] = action_value

plotting.plot_value_function(V_first, title="(first_visit) Optimal Value Function")
plotting.plot_value_function(V_every, title="(every_visit) Optimal Value Function")
                break
            state = ns

        states_in_episode = set([tuple(state) for state, _, _ in episode])
        for s in states_in_episode:
            first_occurrence_idx = next(i for i, (state, _, _) in enumerate(episode)
                                        if s == state)
            G = sum([
                gamma**i * reward
                for i, (_, _, reward) in enumerate(episode[first_occurrence_idx:])
            ])
            # sum over sampled episodes
            returns_sum[s] += G
            returns_count[s] += 1.0
            V[s] = returns_sum[s] / returns_count[s]

    return V


def sample_policy(state):
    score, _, _ = state
    return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])


if __name__ == '__main__':
    env = BlackjackEnv()
    V = mc_prediction(sample_policy, env, 10000)
    plotting.plot_value_function(V, title='10,000 Steps')
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        states_in_episode = set([tuple(x[0]) for x in episode])
        for state in states_in_episode:
            first_time_id = next(i for i, x in enumerate(episode) if x[0] == state)
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_time_id:])
            ])
            returns_sum[state] += G
            returns_count[state] += 1
            V[state] = returns_sum[state] / returns_count[state]

    return V


def sample_policy(observation):
    # Stick (action 0) once the player's score reaches 18, otherwise hit (action 1)
    score, dealer_score, usable_ace = observation
    return 0 if score >= 18 else 1


if __name__ == '__main__':
    env = BlackjackEnv()
    V_500k = mc_prediction(sample_policy, env, 500000, 1.0)
    plotting.plot_value_function(V_500k, title='500k Steps, stick >= 18')
            episode.append(state)
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                # Credit the episode's final reward to every visited state
                for state in episode:
                    returns_sum[state] += reward
                    returns_count[state] += 1.0
                break

    for k in returns_sum.keys():
        V[k] = returns_sum[k] / returns_count[k]

    return V


def sample_policy(observation):
    """
    A policy that sticks if the player score is >= 16 and hits otherwise.
    """
    score, dealer_score, usable_ace = observation
    return np.array([1.0, 0.0]) if score >= 16 else np.array([0.0, 1.0])


V_SMALL = mc_prediction(sample_policy, env, num_episodes=50000)
print(V_SMALL)
plotting.plot_value_function(V_SMALL, title="50,000 Steps")

# V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
# plotting.plot_value_function(V_500k, title="500,000 Steps")
            # Find the first occurrence of the state in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode) if x[0] == state)
            # print(episode[first_occurrence_idx][0])  # debug output

            # Sum up all rewards since the first occurrence
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_occurrence_idx:])
            ])

            # Calculate the average return for this state over all sampled episodes
            returns_sum[state] += G
            returns_count[state] += 1.0
            V[state] = returns_sum[state] / returns_count[state]

    return V


def sample_policy(observation):
    """
    A policy that sticks if the player score is >= 20 and hits otherwise.
    """
    score, deal_score, usable_ace = observation
    return 0 if score >= 20 else 1


if __name__ == '__main__':
    v_10k = mc_prediction(sample_policy, env, num_episodes=10000)
    plotting.plot_value_function(v_10k, title='10,000 Steps')

    v_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    plotting.plot_value_function(v_500k, title='500,000 Steps')