num_states = environment.observation_space.n num_actions = environment.action_space.n agent = QAgent(num_states, num_actions) sum_reward = 0 for episode in range(NUM_EPISODES): done = False last_state = environment.reset() last_reward = None # Number of steps taken. A bit of a safeguard... num_steps = 0 while not done: # Epsilon-greedy policy action = agent.get_action(last_state, environment) state, reward, done, info = environment.step(action) # A crude timeout: If we play too long without # completing the level, kill the game num_steps += 1 if num_steps > 1000: print( "Episode timeout! Could not finish in 1000 steps. Check your actions!" ) done = True # Update Q-table if we have one whole experience of # s, a, r, s', t' if last_state is not None:
# Evaluate a pre-trained Q-agent and build a state-value estimate
# (V-table) from the trajectories it plays.
num_states = environment.observation_space.n
num_actions = environment.action_space.n

vtable = VTable(num_states, discount_factor=0.5)
agent = QAgent(num_states, num_actions)
# Load already trained Q-table
agent.load("q_table.npy")

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Trajectory bookkeeping for this episode: states visited and
    # rewards received, in order.
    visited_states = []
    collected_rewards = []
    while not done:
        # Record the state we act from.
        visited_states.append(state)
        # Action choice is delegated entirely to the trained agent.
        action = agent.get_action(state, environment)
        state, reward, done, info = environment.step(action)
        # Record the reward for the transition just taken.
        collected_rewards.append(reward)
    # Fold the finished episode into the running value estimates.
    vtable.process_trajectory(visited_states, collected_rewards)
    # Periodically render the current V-estimates on a 4x4 grid.
    if (episode + 1) % SHOW_EVERY_EPISODES == 0:
        vtable.visualize_v((4, 4))