# First-visit Monte Carlo prediction: estimate the state-value function V of
# `policy` by averaging the first-visit returns observed over sampled episodes.
# NOTE(review): `env`, `policy`, and `discount_factor` are defined elsewhere in
# this file — confirm they are in scope before running this chunk in isolation.
returns_sum = defaultdict(float)    # cumulative first-visit return per state
returns_count = defaultdict(float)  # number of first visits per state
V = defaultdict(float)              # running value estimate per state
num_episodes = 10000
t_steps = 100  # hard cap on episode length

R = np.zeros((num_episodes, 1))  # NOTE(review): unused in the visible code; kept for compatibility

for i_episode in range(1, num_episodes + 1):
    # Roll out one episode as a list of (state, action, reward) tuples.
    episode = []
    state = env.reset()
    for t in range(t_steps):
        action = policy(state)
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))

        if done:
            break
        state = next_state

    # Update the estimate for every state visited in this episode.
    states_in_episode = set(s for s, _, _ in episode)
    for s in states_in_episode:
        # BUG FIX: the original compared against the loop-external `state`
        # (the episode's final state) instead of `s`, so the first-occurrence
        # index was wrong and the return was credited to the wrong key.
        first_occurence_idx = next(i for i, x in enumerate(episode)
                                   if x[0] == s)
        # Discounted return from the first occurrence of s onward.
        G = sum(r * (discount_factor**i)
                for i, (_, _, r) in enumerate(episode[first_occurence_idx:]))
        returns_sum[s] += G
        # Complete the first-visit MC update: the original defined
        # returns_count and V above but never updated them (chunk truncated).
        returns_count[s] += 1.0
        V[s] = returns_sum[s] / returns_count[s]
# --- Example 2 ---
# Make the repository root importable so `lib.envs.blackjack` resolves when
# this script runs from inside the examples directory.
import sys  # harmless if already imported above; `sys.path` is used below
if "../" not in sys.path:
    sys.path.append("../")  # was tab-indented; normalized to 4 spaces (PEP 8)
from lib.envs.blackjack import BlackjackEnv

env = BlackjackEnv()



def print_observation(observation):
    """Pretty-print one Blackjack observation tuple.

    `observation` unpacks as (player score, dealer score, usable-ace flag).
    """
    player_score, dealer_score, has_usable_ace = observation
    message = "Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
        player_score, has_usable_ace, dealer_score)
    print(message)

def strategy(observation):
    """Fixed policy: stick (action 0) once the player's score reaches 20,
    otherwise hit (action 1). Dealer score and usable ace are ignored."""
    player_score, _dealer_score, _usable_ace = observation
    if player_score >= 20:
        return 0  # stick
    return 1  # hit

# Play 20 demonstration episodes with the fixed strategy, logging every step.
for episode_idx in range(20):
    observation = env.reset()
    for step in range(100):  # safety cap on episode length
        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format( ["Stick", "Hit"][action]))
        observation, reward, done, _ = env.step(action)
        if not done:
            continue
        # Terminal step: show the final observation and the episode reward.
        print_observation(observation)
        print("Game end. Reward: {}\n".format(float(reward)))
        break