Example #1
def monte_carlo(iterations=1000000,
                policy=policies.epsilon_greedy,
                n_zero=100):
    """ Performs Monte Carlo control in the Easy21 game.

    :param iterations: number of Monte Carlo iterations
    :param policy: exploration strategy to use
    :param n_zero: epsilon-greedy constant (only used with the epsilon-greedy policy)
    :return: the learned value function (the optimal value function is also plotted)
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # number of wins
    wins = 0

    print('Iterations completed:')
    for i in range(iterations):

        if (i % 500000) == 0:
            print(i)

        # create a new random starting state
        state = environment.State()
        # play a round
        observed_keys = []
        while not state.terminal:
            player = state.player_sum
            dealer = state.dealer_first_card

            # find an action defined by the policy
            epsilon = n_zero / float(n_zero + counter_state[(player, dealer)])
            action = policy(epsilon, value_function, state)
            observed_keys.append((player, dealer, action))

            # take a step
            state, reward = environment.step(state, action)

        # we have reached the end of the episode
        if reward is not None:
            # update over all keys
            for key in observed_keys:
                # update counts
                counter_state[key[:-1]] += 1
                counter_state_action[key] += 1

                # update value function
                alpha = 1.0 / counter_state_action[key]
                value_function[key] += alpha * (reward - value_function[key])

        if reward == 1:
            wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    # plot the optimal value function
    plotting.plot_value_function(value_function)
    return value_function
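
The loop above calls policy(epsilon, value_function, state) but the policy itself is not shown. A minimal sketch of what policies.epsilon_greedy could look like, inferred from that call signature and the (player, dealer, action) keys used above; the action pair (0, 1) for stick/hit is an assumption, not part of the original code:

import random

def epsilon_greedy(epsilon, value_function, state, actions=(0, 1)):
    """With probability epsilon take a random action, otherwise the greedy one."""
    player = state.player_sum
    dealer = state.dealer_first_card
    if random.random() < epsilon:
        # explore: uniform random action
        return random.choice(actions)
    # exploit: the action with the highest estimated value for this (player, dealer) pair
    return max(actions, key=lambda a: value_function[(player, dealer, a)])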
Example #2
def q_network_test():
    env = BlackjackEnv()
    estimator = Estimator(0.001)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = q_network(env, sess, estimator, episode_num=10000)
    plotting.plot_value_function(V, title='Optimal Value Function')
def q_learning_test():
    env = BlackjackEnv()
    Q = q_learning(env, episode_nums=10000)
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')
def mc_control_with_epsilon_greedy_test():
    env = BlackjackEnv()
    Q = mc_control_with_epsilon_greedy(env, episode_nums=10000)
    V = defaultdict(float)
    for state, actions in Q.items():
        max_q = np.max(actions)
        V[state] = max_q
    plotting.plot_value_function(V, title='Optimal Value Function')
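
Both q_learning_test and mc_control_with_epsilon_greedy_test above (and several later examples) collapse Q into V by taking the max over the action values of each state. A small helper along these lines could factor that out; this is only a sketch, not part of any of the listed projects:

from collections import defaultdict
import numpy as np

def q_to_v(Q):
    """Collapse an action-value table into a state-value table via the greedy action's value."""
    V = defaultdict(float)
    for state, action_values in Q.items():
        V[state] = np.max(action_values)
    return V

With it, the two tests reduce to V = q_to_v(Q) followed by the plotting call.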
Example #5
def main():
    env = BlackjackEnv()
    actor = Actor()
    estimator = Estimator()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = ac_test4debug(sess, env, actor, estimator, episode_num=10000)
    plotting.plot_value_function(V, title='Optimal Value Function')
Example #7
def td_network_test():
    env = BlackjackEnv()
    estimator = Estimator(learning_rate=0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = td_network(env, sess, estimator)
        #print(sess.run(estimator.w))
        #print(sess.run(estimator.b))
    plotting.plot_value_function(V, title='Optimal Value')
def main():
    env = gym.make('Blackjack-v0')
    env.seed(SEED)

    V = mc_policy_eval(sample_policy, env, 10000)
    plot_value_function(V, title="10,000 Episodes")

    V = mc_policy_eval(sample_policy, env, 500000)
    plot_value_function(V, title="500,000 Episodes")

    env.close()
Example #9
def dyna_q_test():
    env = BlackjackEnv()
    estimator = Estimator(0.003)
    model = Model(0.003)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        V = dyna_q(env,
                   sess,
                   estimator,
                   model,
                   episode_num=3000,
                   train_model_times=3000,
                   train_with_model_times=3)
    plotting.plot_value_function(V, title='Optimal Value Function')
def main():
    env = gym.make('Blackjack-v0')
    env.seed(SEED)

    policy, Q = mc_control_importance_sampling(env, 500000)

    # For plotting: Create value function from action-value function
    # by picking the best action at each state
    V = defaultdict(float)
    for state, actions in Q.items():
        action_value = np.max(actions)
        V[state] = action_value
    plot_value_function(V, title="Optimal Value Function")

    env.close()
Example #11
returns_sum = defaultdict(float)
returns_count = defaultdict(float)

# the final value function
V = defaultdict(float)

for i_episode in range(1, num_episodes+1):
    episode = []
    state = env.reset()
    for t in range(200):
        action = policy(state)
        next_state, reward, done = env.step(action)
        episode.append((state, action, reward))
        if done:
            break
        state = next_state
        
    states_in_episode = set([tuple(x[0]) for x in episode])
    for state in states_in_episode:
        # Find the first occurrence of the state in the episode
        first_occurence_idx = next(i for i, x in enumerate(episode) if x[0] == state)
        G = sum([x[2]*(discount_factor**i) for i, x in enumerate(episode[first_occurence_idx:])])
        
        returns_sum[state] += G
        returns_count[state] += 1.0
        V[state] = returns_sum[state] / returns_count[state]
        

import plotting 
plotting.plot_value_function(V, title="10000 steps")
Example #12
            sum_env, n_sims, omega = omega, epsilon = epsilon, init_val = init_val,
            episode_file=path_fun("sum_state"), warmup=warmup)
        time_to_completion_sum = time.time() - start_time_sum
        print("Number of explored states (sum states): " + str(len(sumQ)))
        print("Cumulative avg. reward = " + str(sum_avg_reward))

        print("Training time: \n " +
              "Expanded state space: {} \n Sum state space: {}".format(
                  time_to_completion_expanded, time_to_completion_sum))

        # Convert Q (extended state) to sum state representation and make 3D plots
        Q_conv = ql.convert_to_sum_states(Q, env)
        V_conv = ql.convert_to_value_function(Q_conv)
        V_conv_filt = ql.fill_missing_sum_states(ql.filter_states(V_conv))
        pl.plot_value_function(V_conv_filt,
                               title = "Expanded state, " + str(decks) + " decks",
                               directory = plot_dir,
                               file_name = "3D_exp_" + str(decks) + "_decks.png")

        # Likewise make 3D plots for sumQ
        V_sum = ql.convert_to_value_function(sumQ)
        V_sum_filt = ql.fill_missing_sum_states(ql.filter_states(V_sum))
        pl.plot_value_function(V_sum_filt,
                               title = "Sum state, " + str(decks) + " decks",
                               directory = plot_dir,
                               file_name = "3D_sum_" + str(decks) + "_decks.png")
        # create line plots
        env_types = ["hand", "sum"]
        fig, lgd = pl.plot_avg_reward_episode(directory, env_types, [str(decks)])
        fig.savefig("{}/avgReturnEp_ndeck{}.png".format(plot_dir, decks),
                                bbox_extra_artists=(lgd,), bbox_inches='tight')
        matplotlib.pyplot.close(fig)
            for i, x in enumerate(episode):
                if x[0] == s_eps:
                    # first visit of this state in the episode
                    first_visit_pos = i
                    break
            G = sum([e[2] * discount ** idx for idx, e in enumerate(episode[first_visit_pos:])])

            return_sum[s_eps] += G
            return_count[s_eps] += 1.0
            V[s_eps] = return_sum[s_eps] * 1.0 / return_count[s_eps]

    return V

env = Blackjack()

V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")

V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")

        sa_in_episode = set([(tuple(x[0]), x[1]) for x in episode])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurence_idx = next(i for i, x in enumerate(episode)
                                       if x[0] == state and x[1] == action)
            # Sum up all rewards since the first occurrence
            G = sum([
                x[2] * (discount_factor**i)
                for i, x in enumerate(episode[first_occurence_idx:])
            ])
            # Calculate the average return for this (state, action) pair over all sampled episodes
            returns_sum[sa_pair] += G
            returns_count[sa_pair] += 1.0
            Q[state][action] = returns_sum[sa_pair] / returns_count[sa_pair]

        # The policy is improved implicitly by changing the Q dictionary

    return Q, policy


Q, policy = mc_control_epsilon_greedy(env, num_episodes=500000, epsilon=0.1)

# For plotting: Create value function from action-value function
# by picking the best action at each state
V = defaultdict(float)
for state, actions in Q.items():
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")
Example #15
    done = False
    while not done:
        reward, next_state = env.step(convert_agent_action(action))
        next_action = agent.take_action(state)

        if next_state == 'terminal':
            done = True

        transition = Transition(state, action, reward, next_state, next_action,
                                done)
        agent.step(transition)

        if not done:
            state = next_state
            action = next_action

    last_episodes_rewards.append(reward)

    if episode % last_episodes_rewards.maxlen == 0:
        success_rates.append(
            last_episodes_rewards.count(1) / last_episodes_rewards.maxlen)
        episodes_x.append(episode)

success_rates.pop(0)
episodes_x.pop(0)

plotting.plot_value_function(agent.q_table)

plt.show()
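
Transition above looks like a plain record of one SARSA step. A hypothetical definition, with the field order taken from the Transition(state, action, reward, next_state, next_action, done) call in the loop, would be a namedtuple:

from collections import namedtuple

# Field order matches the constructor call above; this is an assumption about the project's own class.
Transition = namedtuple('Transition',
                        ['state', 'action', 'reward', 'next_state', 'next_action', 'done'])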
Example #16
            # states.append(state)
            # rewards.append(reward)
            state = next_state

        # num_states = len(states)

        # for i, s in enumerate(states):
        #     G = np.sum(np.array(
        #         rewards[i:]) * np.array([gamma ** i for i in range(0, num_states - i)]))
        #     N[s] += 1.0
        #     V[s] = V[s] + 0.01 * (G - V[s])

    return V


def naive_policy(state):
    player_hand, _, _ = state
    if player_hand >= 20:
        return 0
    else:
        return 1


if __name__ == '__main__':
    env = BlackjackEnv()
    steps = 200000
    v = TDPolicyEvaluation(env, naive_policy, steps, 1.0, 0.5)
    # print(v)
    plotting.plot_value_function(v, title="{} Steps".format(steps))
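
TDPolicyEvaluation itself is not shown in this example. As a rough guide only, a tabular TD(0) evaluation of naive_policy could look like the sketch below; the gym-style step() return value and the reading of the last two arguments as gamma and alpha are assumptions:

from collections import defaultdict

def td_policy_evaluation(env, policy, num_episodes, gamma=1.0, alpha=0.5):
    """Tabular TD(0): move V(s) toward r + gamma * V(s') after every step."""
    V = defaultdict(float)
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            next_state, reward, done, _ = env.step(action)
            target = reward if done else reward + gamma * V[next_state]
            V[state] += alpha * (target - V[state])  # TD(0) update
            state = next_state
    return V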
Example #17
            print("\r{} @ {}/{} ({})".format(t, i + 1, n_episodes,
                                             episode_reward[i]),
                  end="")

            if done:
                break

            state = next_state

        G = 0
        for state, reward, action in episode[::-1]:
            G = reward + discount * G

        for state, reward, action in episode:
            N[state][action] += 1
            Q[state][action] += (G - Q[state][action]) / N[state][action]
            G = (G - reward) / discount

    print()
    return Q, episode_reward, episode_length


Q, rewards, lengths = mc(env, 800000)

plt.plot(pd.Series(rewards).rolling(10000, min_periods=10000).mean())
plt.show()

plotting.plot_value_function(np.amax(Q, 2))
plotting.plot_value_function(np.argmax(Q, 2), title="Policy function")
        W = 1
        prob_b = prob_b[::-1]
        for idx, eps in enumerate(episode[::-1]):
            state, action, reward = eps
            pair = (state, action)
            G = discount_factor * G + reward
            return_count[pair] += W
            Q[state][action] += W * 1.0 / return_count[pair] * (G - Q[state][action])
            target_policy[state] = np.argmax(Q[state])
            if target_policy[state] != action:
                break
            W = W * 1.0 / prob_b[idx]

    return Q

env=Blackjack()
Q = Off_policy_MC_Control(env, episode_nums=500000)

V = defaultdict(float)
for state, actions in Q.items():
    action_value = np.max(actions)
    V[state] = action_value
plotting.plot_value_function(V, title="Optimal Value Function")
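
The weighted-importance-sampling loop above divides by prob_b[idx], the probability the behavior policy assigned to each action, but the episode-generation code is not part of this example. A hypothetical sketch of how the episode and prob_b could be collected together, assuming an epsilon-greedy behavior policy over nA discrete actions and a gym-style env.step():

import numpy as np

def generate_episode_with_probs(env, Q, epsilon, nA):
    """Roll out one episode and record the behavior-policy probability of each chosen action."""
    episode, prob_b = [], []
    state = env.reset()
    done = False
    while not done:
        # epsilon-greedy behavior policy derived from the current Q
        probs = np.ones(nA) * epsilon / nA
        probs[np.argmax(Q[state])] += 1.0 - epsilon
        action = np.random.choice(nA, p=probs)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        prob_b.append(probs[action])
        state = next_state
    return episode, prob_b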

Example #19
    def plot_action_graph(self):
        plotting.plot_value_function(self.state_action_map,
                                     title="100,000 Steps")
Example #20
            Q[state][action] += (G - Q[state][action]) / N[state][action]
            G = (G - reward) / discount

    print()
    return Q, episode_reward, episode_length


Qtrue, _, _ = mc(env, 1000000)
sqerrs = []
lambdas = np.arange(0, 1.01, 0.1)
for lambda0 in lambdas:
    Q, err = sarsa(env, 1000, 1.0, 0.1, 0.05, lambda0, Qtrue)
    sqerrs.append(err)

plt.plot(sqerrs[0])
plt.plot(sqerrs[-1])
plt.title("Q mse over episodes")
plt.xlabel("episode")
plt.ylabel("Q mse")
plt.legend(["lambda=0", "lambda=1"])
plt.show()

plt.plot(lambdas, [err[-1] for err in sqerrs])
plt.title("Q mse for different lambda")
plt.xlabel("lambda")
plt.ylabel("Q mse")
plt.show()

plotting.plot_value_function(np.amax(Qtrue, axis=2))
plotting.plot_value_function(np.amax(Q, axis=2))
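
The error curves above presumably measure how far the SARSA(lambda) estimate is from the Monte Carlo reference Qtrue. The exact metric computed inside sarsa() is not shown; a plausible per-episode definition is the mean squared difference over all state-action entries:

import numpy as np

def q_mse(Q, Qtrue):
    """Mean squared error between a learned Q table and the reference Qtrue (same array shape assumed)."""
    return float(np.mean((np.asarray(Q) - np.asarray(Qtrue)) ** 2))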
Example #21
			next_state, reward, done, _ = env.step(action)
			episode.append((state, action, reward))
			state = next_state

		# Compute states values
		for state, action, reward in episode:
			firstOccurence = next(i for i, x in enumerate(episode) if x[0] == state)
			G = sum([x[2]*discount_factor**i for i, x in enumerate(episode[firstOccurence:])])
			returns_sum[state] += G
			returns_count[state] += 1.0
			V[state] = returns_sum[state]/returns_count[state]
			break

	return V


def sample_policy(observation):
	score, dealer_score, usable_ace = observation
	return 0 if score >= 20 else 1


matplotlib.style.use('ggplot')
env = BlackjackEnv()


V_10k = mc_prediction(sample_policy, env, num_episodes=10000)
plotting.plot_value_function(V_10k, title="10,000 Steps")

V_500k = mc_prediction(sample_policy, env, num_episodes=500000)
plotting.plot_value_function(V_500k, title="500,000 Steps")
Example #22
File: MC.py Project: LouisYZK/dist
    N=100000
    M=1000
    x = (np.random.rand(N)-0.5)*8
    w_x = p(x)/q(x)
    w_x = w_x/sum(w_x)
    w_xc = np.cumsum(w_x) #used for uniform quantile inverse
    # resample from x with replacement with probability of w_x
    X=np.array([])
    for i in range(M):
        u = np.random.rand()
        X = np.hstack((X,x[w_xc>u][0]))

    x = np.linspace(-4,4,500)
    plt.plot(x,p(x))
    plt.hist(X, bins=100, density=True)
    plt.title('Sampling Importance Resampling')
    plt.show()

if __name__ == '__main__':
    policy = sample_policy
    env = gym.make('Blackjack-v0')
    # V = mc_prediction(policy, env, num_episodes=80000)
    # plottig(V)
    Q, policy = mc_control_epsilon_greedy(env, num_episodes=100000, epsilon=0.1)
    V = defaultdict(float)
    for state, action in Q.items():
        action_value = np.max(action)
        V[state] = action_value
    plot_value_function(V)
    # importance_sampling()
    pass