def plot(self, stats):
    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in self.Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plotting.plot_value_function(V, title="Final Value Function")
Example #2
def main():
    env = BlackjackEnv()

    V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
    plotting.plot_value_function(V_10k, title="10,000 Steps")

    V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    plotting.plot_value_function(V_500k, title="500,000 Steps")
Example #3
def main():

    # V_10k, stats = mc_prediction(sample_policy, env, num_episodes=100)
    # plotting.plot_value_function(V_10k, title="10,000 Steps")

    V_500k, stats = mc_prediction(sample_policy, env, num_episodes=500000)

    plotting.plot_value_function(V_500k, title="500,000 Steps")
Example #4
def find_optimal_policy(num_episodes):
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=num_episodes, epsilon=0.1)
    
    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function ({} episodes)".format(num_episodes))
Example #5
def main():

    Q, behaviour_policy = mc_control_importance_sampling(env,
                                                         num_episodes=50000)

    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value

    plotting.plot_value_function(V, title="Optimal Value Function")
Example #6
def main():
    # Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=300000, epsilon=0.4)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value

    plotting.plot_value_function(V, title="On-Policy MC Control, 300,000 episodes - Value Function")
Example #7
def main():

    random_policy = create_random_policy(env.action_space.n)

    # use a uniformly random behavior policy for off-policy control
    Q, policy = mc_control_random_off_policy(env,
                                             num_episodes=300000,
                                             behavior_policy=random_policy)

    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value

    plotting.plot_value_function(V, title="Optimal Value Function")
Example #8
def main():


    Q, behaviour_policy = mc_control_importance_sampling(env, num_episodes=500000)


    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value


    plotting.plot_value_function(V, title="Optimal Value Function")
Example #9
def main():

    random_policy = create_random_policy(env.action_space.n)
    # use a uniformly random behavior policy for off-policy importance sampling
    Q, policy, Advantage_Function = mc_control_importance_sampling(
        env, num_episodes=5000, behavior_policy=random_policy)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, action_values in Advantage_Function.items():
        action_value = np.max(action_values)
        V[state] = action_value

    plotting.plot_value_function(V, title="Optimal Value Function")
Example #10
        obs = newObs
    return (episode)


def compute_return(episode, begin, discount_factor):
    output = 0
    for i, transition in enumerate(episode[begin:]):
        #print(transition[2]*(discount_factor**i))
        output += transition[2] * (discount_factor**i)
    return (output)
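
# Quick illustration of compute_return on a toy, hypothetical episode: each
# transition is (state, action, reward), so with rewards 0, 0, 1 and a discount
# factor of 0.9 the return from index 0 is 0 + 0 * 0.9 + 1 * 0.9**2 = 0.81.
_toy_episode = [("s0", 0, 0.0), ("s1", 1, 0.0), ("s2", 0, 1.0)]
assert abs(compute_return(_toy_episode, 0, 0.9) - 0.81) < 1e-9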


def plot_results(evolution, index):
    fig = pl.figure()
    pl.plot(evolution)
    pl.title('variations of the computed means')
    pl.xlabel('iteration')
    pl.ylabel('L1 difference of variations')
    pl.legend()
    pl.savefig(index + '_evolutionMean.png')
    pl.show()


V_10k, evolution = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")
plot_results(evolution, '10K')

V_500k, evolution = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")
plot_results(evolution, '500K')
Example #11
            # find its first occurrence
            first_occurrence_idx = next((ith for ith, x in enumerate(episode)
                                         if tuple(x[0:2]) == state_action))
            # sum up all rewards since the first occurrence
            G = sum([
                observation[2] * discount_factor**ith for ith, observation in
                enumerate(episode[first_occurrence_idx:])
            ])
            # record total returns of each (state, action) pair and their first-visit numbers
            returns_sum[state_action] += G
            returns_count[state_action] += 1
            # update Q(s,a) and epsilon-greedy policy
            Q[state_action[0]][state_action[
                1]] = returns_sum[state_action] / returns_count[state_action]
            # our policy is updated implicitly
            # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    return Q, policy


Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, actions in Q.items():
    print(state, actions)
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")
Example #12
def plot(Q):
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[tuple([state[1], state[0]])] = action_value
    plotting.plot_value_function(V, title="Optimal Value Function")
Example #13
            # Online TD(lambda) update at every step (no need to wait for the
            # episode to finish): V[s] += alfa * delta * Z[s], then the
            # eligibility trace decays by a factor of lmbd * discount.
            for s in States:
                V[s] = V[s] + alfa * delta * Z[s]
                Z[s] = lmbd * discount * Z[s]

            # Move to the next state
            now_state = next_state

            if done:
                terminate = True

    return V


"""Block code below evaluate a policy and return a plotted V-value
"""
print "TEMPORAL-DIFFERENCE-LAMBDA EVALUATE POLICY_0"

V = tdlambda_prediction(policy_0, n_episodes=10000)

# Delete state with player score below 12 to make it same with example
# Because we call V[next_state] in the end, to make the same plot
# we should delete some keys
new_V = defaultdict(float)
for key, data in V.iteritems():
    if key[0] >= 12 and key[1]<=11:
        new_V[key] = data

# Using plotting library from Denny Britz repo
plotting.plot_value_function(new_V, title="Policy_0 Evaluation")
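
# policy_0 is evaluated above but not defined in this excerpt. A plausible, purely
# hypothetical definition, mirroring the "stick on 20 or more" sample policy used by
# other examples on this page (0 = stick, 1 = hit):
def policy_0(observation):
    player_score, dealer_score, usable_ace = observation
    return 0 if player_score >= 20 else 1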
Example #14
def mc_control_epsilon_greedy(env,
                              num_episodes,
                              discount_factor=1.0,
                              epsilon=0.1):
    """
    Monte Carlo Control using Epsilon-Greedy policies.
    Finds an optimal epsilon-greedy policy.
    
    Args:
        env: OpenAI gym environment.
        num_episodes: Number of episodes to sample.
        discount_factor: Gamma discount factor.
        epsilon: Chance to sample a random action. Float between 0 and 1.
    
    Returns:
        A tuple (Q, policy).
        Q is a dictionary mapping state -> action values.
        policy is a function that takes an observation as an argument and returns
        action probabilities
    """

    # Keeps track of sum and count of returns for each state
    # to calculate an average. We could use an array to save all
    # returns (like in the book) but that's memory inefficient.
    returns_sum = defaultdict(float)
    returns_count = defaultdict(float)

    # The final action-value function.
    # A nested dictionary that maps state -> (action -> action-value).
    Q = defaultdict(lambda: np.zeros(env.action_space.n))

    # The policy we're following
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    for episode in range(num_episodes):
        if episode % 1000 == 0:
            print("Episode: " + str(episode) + "/" + str(num_episodes))

        if (episode + 1) % 100000 == 0:
            print("Nice")
            V = defaultdict(float)
            for state, actions in Q.items():
                action_value = np.max(actions)
                V[state] = action_value
            plotting.plot_value_function(V, title="Optimal Value Function")

        states_visited = []
        state = env.reset()

        while True:
            action_values = policy(state)
            action = np.random.choice(range(env.action_space.n),
                                      p=action_values)
            next_state, reward, done, info = env.step(action)

            states_visited.append([state, action, reward])

            if done:
                break
            state = next_state

        # Walk the episode backwards, accumulating the discounted return G,
        # and apply a first-visit update to each (state, action) pair.
        G = 0
        for t in range(len(states_visited) - 1, -1, -1):
            state, action, reward = states_visited[t]
            G = discount_factor * G + reward
            sa_pair = (state, action)
            # First-visit check: only update if this (state, action) pair does
            # not occur earlier in the episode.
            if not any(s == state and a == action for s, a, _ in states_visited[:t]):
                returns_sum[sa_pair] += G
                returns_count[sa_pair] += 1
                Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

                # The policy is updated implicitly because it holds a reference to Q:
                # policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)

    return Q, policy
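
# make_epsilon_greedy_policy is called above but not included in these excerpts.
# A minimal sketch consistent with its usage (a closure over Q that returns
# epsilon-greedy action probabilities for an observation) might look like this:
import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        # Spread epsilon uniformly over all actions, then put the remaining
        # probability mass on the currently greedy action under Q.
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += 1.0 - epsilon
        return A

    return policy_fn

# Because policy_fn looks up Q at call time, any update to Q changes the policy
# "implicitly", which is what the commented-out re-creation calls above rely on.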
Example #15
    pl.legend()
    pl.savefig(index + '_evolutionMean.png')
    pl.show()


#on 500K iterations
Q500K, policy, evolution500K = mc_control_epsilon_greedy(env,
                                                         num_episodes=500000,
                                                         epsilon=0.1)
# For plotting: Create value function from action-value function
# by picking the best action at each state
V500K = defaultdict(float)
for state, actions in Q500K.items():
    action_value = np.max(actions)
    V500K[state] = action_value
plotting.plot_value_function(V500K, title="Optimal Value Function_500K")

plot_results(evolution500K, 'evolution_500K')
plot_policy(Q500K, policy, 'policy_500K')

#on 1M iterations
Q1M, policy, evolution1M = mc_control_epsilon_greedy(env,
                                                     num_episodes=1000000,
                                                     epsilon=0.1)
# For plotting: Create value function from action-value function
# by picking the best action at each state
V1M = defaultdict(float)
for state, actions in Q1M.items():
    action_value = np.max(actions)
    V1M[state] = action_value
plotting.plot_value_function(V1M, title="Optimal Value Function_1M")
Example #16
            V[state] += (episode_rewards[state] - V[state]) / returns_num[state]
        
    return V


# In[4]:


def sample_policy(observation):
    """
    A policy that sticks if the player score is 20 or more and hits otherwise.
    """
    
    player_score, dealer_score, usable_ace = observation
    return np.array([1.0, 0.0]) if player_score >= 20 else np.array([0.0, 1.0])


# In[5]:


V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")


# In[18]:


V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")

Example #17
def plot_value_function(self):
    assert self.policy is not None
    plot_value_function(self.q)
Example #18
            action = np.random.choice(np.arange(len(A)), p=A)
            next_state, reward, done, info = env.step(action)
            episode.append((observation, action, reward))
            if done:
                break
            observation = next_state

        temp_set = set()
        for w in episode:
            temp_set.add((w[0], w[1]))

        for state, action in temp_set:
            # index of the first occurrence of this (state, action) pair
            ind = 0
            for i, w in enumerate(episode):
                if w[0] == state and w[1] == action:
                    ind = i
                    break
            returns_sum[(state, action)] += sum([w[2] * discount_factor ** i for i, w in enumerate(episode[ind:])])
            returns_counts[(state, action)] += 1
            Q[state][action] = returns_sum[(state, action)] / returns_counts[(state, action)]
    return Q


if __name__ == "__main__":
    #V_over500k = monte_carlo_prediction(env, num_eps=500000)
    Q_over500k = monte_corlo_control(env, num_eps=500000)
    V_over500k = defaultdict(float)
    for state in Q_over500k.keys():
        V_over500k[state] = np.max(Q_over500k[state])
    pprint(V_over500k)
    plotting.plot_value_function(V_over500k, title="500,000 Steps")
Example #19
        for t in range(len(episode))[::-1]:
            state, action, reward = episode[t]
            # Update the total reward since step t
            G = discount_factor * G + reward
            # Update weighted importance sampling formula denominator
            C[state][action] += W
            # Update the action-value function using the incremental update formula
            # This also improves our target policy which holds a reference to Q
            Q[state][action] += (W / C[state][action]) * (G - Q[state][action])
            # If the action taken by the behavior policy is not the action
            # taken by the target policy, then the probability is 0 and we can break
            if action != np.argmax(target_policy(state)):
                break
            W = W * 1. / behavior_policy(state)[action]
    return Q, target_policy


if __name__ == '__main__':
    random_policy = create_random_policy(env.action_space.n)
    Q, policy = mc_control_importance_sampling(env,
                                               num_episodes=500000,
                                               behavior_policy=random_policy)

    # For plotting, create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, action_values in Q.items():
        action_value = np.max(action_values)
        V[state] = action_value
    plotting.plot_value_function(V, title='Optimal Value Function')
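
# The off-policy snippet above relies on a greedy target_policy derived from Q,
# which is not shown in this excerpt. A minimal, hypothetical sketch of such a
# helper (probability 1 on the action that maximizes Q in each state):
import numpy as np


def create_greedy_policy(Q):
    def policy_fn(state):
        A = np.zeros_like(Q[state], dtype=float)
        A[np.argmax(Q[state])] = 1.0
        return A

    return policy_fn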
Example #20
                    for j, e in enumerate(episode[i:])
                ])
                returns_sum[exp[0]][exp[1]] += G
                returns_count[exp[0]][exp[1]] += 1
                Q[exp[0]][exp[1]] = returns_sum[exp[0]][
                    exp[1]] / returns_count[exp[0]][exp[1]]

#############################################Implement your code end###################################################################################################
    return Q, policy

Q_first, policy_first = mc(env, num_episodes=500000, epsilon=0.1)
Q_every, policy_every = mc(env,
                           num_episodes=500000,
                           epsilon=0.1,
                           first_visit=0)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V_first = defaultdict(float)
V_every = defaultdict(float)
for state, actions in Q_first.items():
    action_value = np.max(actions)
    V_first[state] = action_value
for state, actions in Q_every.items():
    action_value = np.max(actions)
    V_every[state] = action_value
plotting.plot_value_function(V_first,
                             title="(first_visit)Optimal Value Function")
plotting.plot_value_function(V_every,
                             title="(every_visit)Optimal Value Function")
Example #21
                break
            state = ns

        states_in_episode = set([tuple(state) for state, _, _ in episode])
        for s in states_in_episode:
            first_occurence_idx = next(i for i, (state, _,
                                                 _) in enumerate(episode)
                                       if s == state)
            G = sum([
                gamma**i * reward
                for i, (_, _,
                        reward) in enumerate(episode[first_occurence_idx:])
            ])

            # sum over sample episodes
            returns_sum[s] += G
            returns_count[s] += 1.0
            V[s] = returns_sum[s] / returns_count[s]
    return V


def sample_policy(state):
    score, _, _ = state
    return np.array([1.0, 0.0]) if score >= 20 else np.array([0.0, 1.0])


if __name__ == '__main__':
    env = BlackjackEnv()
    V = mc_prediction(sample_policy, env, 10000)
    plotting.plot_value_function(V, title='10,000 Steps')
Example #22
            episode.append((state, action, reward))
            if done:
                break
            state = next_state

        states_in_episode = set([tuple(x[0]) for x in episode])
        for state in states_in_episode:
            first_time_id = next(i for i, x in enumerate(episode)
                                 if x[0] == state)
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_time_id:])
            ])
            returns_sum[state] += G
            returns_count[state] += 1
            V[state] = returns_sum[state] / returns_count[state]

    return V


def sample_policy(observation):

    score, dealer_score, usable_ace = observation
    return 0 if score >= 18 else 1


if __name__ == '__main__':
    env = BlackjackEnv()
    V_500k = mc_prediction(sample_policy, env, 500000, 1.0)
    plotting.plot_value_function(V_500k, title='500k >= 18')
Example #23
            episode.append(state)
            probs = policy(state)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)
            state = next_state
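            # Adding only the terminal reward works here because a Blackjack hand
            # yields a single non-zero reward at the very end and no discounting
            # is applied, so each visited state's return equals that final reward.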
            if done:
                for state in episode:
                    returns_sum[state] += reward
                    returns_count[state] += 1.0
                break
    
    for k in returns_sum.keys():
        V[k] = returns_sum[k] / returns_count[k]
    
    return V  

def sample_policy(observation):
    """
    A policy that sticks if the player score is 16 or more and hits otherwise.
    """
    score, dealer_score, usable_ace = observation
    return np.array([1.0, 0.0]) if score >= 16 else np.array([0.0, 1.0])


V_SMALL = mc_prediction(sample_policy, env, num_episodes=50000)
print(V_SMALL)
plotting.plot_value_function(V_SMALL, title="50,000 Steps")

#V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
#plotting.plot_value_function(V_500k, title="500,000 Steps")
Example #24
            # Find the first occurrence of the state in the episode
            first_occurrence_idx = next(i for i, x in enumerate(episode)
                                        if x[0] == state)
            print(episode[first_occurrence_idx][0])
            # Sum up all rewards since the first occurrence
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_occurrence_idx:])
            ])
            # Calculate average return for this state over all sampled episodes
            returns_sum[state] += G
            returns_count[state] += 1.0
            V[state] = returns_sum[state] / returns_count[state]
    return V


def sample_policy(observation):
    """
    A policy that sticks if the player score is 20 or more and hits otherwise.
    """
    score, deal_score, usable_ace = observation
    return 0 if score >= 20 else 1


if __name__ == '__main__':
    v_10k = mc_prediction(sample_policy, env, num_episodes=10000)
    plotting.plot_value_function(v_10k, title='10000 Steps')

    v_500k = mc_prediction(sample_policy, env, num_episodes=500000)
    plotting.plot_value_function(v_500k, title='500000 Steps')