# NOTE(review): This chunk is whitespace-mangled (an entire loop collapsed onto one
# physical line) and is TRUNCATED mid-expression at `G = sum([` — the tail of the
# first-visit Monte Carlo return computation and the V-table update are not visible
# in this chunk, so the code is left byte-identical rather than reconstructed.
#
# What the visible code does: runs `num_episodes` rollouts of at most `t_steps`
# steps each under `policy` (bound to `sample_policy`, defined elsewhere — TODO
# confirm), collecting (state, action, reward) triples; then, for each distinct
# state visited in the episode, locates its first occurrence and starts summing a
# discounted return G toward a Monte Carlo value estimate V (defaultdict(float)).
# `returns_sum` / `returns_count` accumulate per-state return totals and visit
# counts; `R` is allocated but not used in the visible portion.
#
# NOTE(review): likely bug — `first_occurence_idx` is computed with
# `if x[0] == state`, i.e. against the rollout loop's leftover `state` variable,
# not against `s`, the state currently being evaluated in
# `for s in states_in_episode:`. First-visit MC should match `x[0] == s`.
# Confirm against the canonical implementation before changing, since the rest of
# the statement is cut off here.
policy = sample_policy discount_factor = 1.0 returns_sum = defaultdict(float) returns_count = defaultdict(float) V = defaultdict(float) num_episodes = 10000 t_steps = 100 R = np.zeros((num_episodes, 1)) for i_episode in range(1, num_episodes + 1): episode = [] state = env.reset() rewards = 0.0 for t in range(t_steps): action = policy(state) next_state, reward, done, _ = env.step(action) episode.append((state, action, reward)) if done: break state = next_state states_in_episode = set([s for s, _, _ in episode]) for s in states_in_episode: first_occurence_idx = next(i for i, x in enumerate(episode) if x[0] == state) G = sum([
# Demo: play 20 Blackjack hands with a fixed threshold strategy, printing each
# observation and action. (Reformatted from a whitespace-collapsed paste; the
# logic and all printed strings are unchanged.)
import sys  # idempotent if already imported earlier in the file

if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.blackjack import BlackjackEnv

env = BlackjackEnv()


def print_observation(observation):
    """Print one Blackjack observation.

    `observation` is a (player_score, dealer_score, usable_ace) triple, as
    returned by BlackjackEnv.reset()/step().
    """
    score, dealer_score, usable_ace = observation
    print("Player Score: {} (Usable Ace: {}), Dealer Score: {}".format(
        score, usable_ace, dealer_score))


def strategy(observation):
    """Fixed policy for the demo.

    Stick (action 0) when the player's score is >= 20, hit (action 1)
    otherwise.  (The original inline comment said "> 20", which contradicted
    the code; the code's `>= 20` threshold is the intended behavior.)
    """
    score, dealer_score, usable_ace = observation
    return 0 if score >= 20 else 1


# Roll out 20 episodes, capping each at 100 steps as a safety bound; a hand
# ends as soon as the env reports done.
for i_episode in range(20):
    observation = env.reset()
    for t in range(100):
        print_observation(observation)
        action = strategy(observation)
        print("Taking action: {}".format(["Stick", "Hit"][action]))
        observation, reward, done, _ = env.step(action)
        if done:
            print_observation(observation)
            print("Game end. Reward: {}\n".format(float(reward)))
            break