num_states = environment.observation_space.n
num_actions = environment.action_space.n

agent = QAgent(num_states, num_actions)

sum_reward = 0

for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken; a safeguard against episodes that never terminate
    num_steps = 0
    while not done:
        # Epsilon-greedy policy
        action = agent.get_action(last_state, environment)

        state, reward, done, info = environment.step(action)

        # A crude timeout: if we play too long without
        # completing the level, kill the episode
        num_steps += 1
        if num_steps > 1000:
            print(
                "Episode timeout! Could not finish in 1000 steps. Check your actions!"
            )
            done = True

        # Update the Q-table now that we have one whole experience
        # tuple (s, a, r, s', done)
        if last_state is not None:
            # An update(s, a, r, s', done) signature is assumed here;
            # call whatever Q-update routine your QAgent exposes
            agent.update(last_state, action, reward, state, done)

        # Roll the bookkeeping forward for the next step
        last_state = state
        last_reward = reward
        sum_reward += reward

# Persist the learned Q-table for the evaluation script below
# (a save method mirroring agent.load is assumed)
agent.save("q_table.npy")
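
The training loop above relies on a QAgent that exposes an epsilon-greedy get_action and some Q-update routine, neither of which is shown here. The following is a minimal sketch of such an agent; the hyperparameters, the q_table attribute, and the update/save method names are assumptions, not the actual class used above.

import numpy as np


class QAgent:
    """Minimal tabular Q-learning agent (a sketch, not the class above)."""

    def __init__(self, num_states, num_actions,
                 learning_rate=0.1, discount_factor=0.99, epsilon=0.1):
        self.q_table = np.zeros((num_states, num_actions))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon

    def get_action(self, state, environment):
        # Epsilon-greedy: random action with probability epsilon,
        # otherwise the greedy action from the Q-table
        if np.random.random() < self.epsilon:
            return environment.action_space.sample()
        return int(np.argmax(self.q_table[state]))

    def update(self, state, action, reward, next_state, done):
        # One-step Q-learning target r + gamma * max_a' Q(s', a');
        # terminal transitions do not bootstrap from the next state
        target = reward
        if not done:
            target += self.discount_factor * np.max(self.q_table[next_state])
        self.q_table[state, action] += self.learning_rate * (
            target - self.q_table[state, action]
        )

    def save(self, path):
        np.save(path, self.q_table)

    def load(self, path):
        self.q_table = np.load(path)

Once the Q-table has been written to disk, a second script reloads it and estimates the state-value function V(s) of the learned policy by recording whole trajectories and feeding them to a VTable:
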
num_states = environment.observation_space.n
num_actions = environment.action_space.n

vtable = VTable(num_states, discount_factor=0.5)
agent = QAgent(num_states, num_actions)
# Load the Q-table trained by the previous script
agent.load("q_table.npy")

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of the visited states and the rewards obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
        # Take action according to Q-agent
        action = agent.get_action(state, environment)
        state, reward, done, info = environment.step(action)
        # Store reward
        rewards.append(reward)

    # Update the V-estimate with the episode we just played
    vtable.process_trajectory(states, rewards)

    if ((episode + 1) % SHOW_EVERY_EPISODES) == 0:
        vtable.visualize_v((4, 4))
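
Only three calls on VTable appear above (the constructor, process_trajectory, and visualize_v), so its internals are left open. A plausible reading is an every-visit Monte Carlo average of discounted returns; the sketch below follows that reading, and everything beyond those three calls is an assumption.

import numpy as np


class VTable:
    """Every-visit Monte Carlo estimate of V(s) (a sketch, not the class above)."""

    def __init__(self, num_states, discount_factor=0.5):
        self.v_table = np.zeros(num_states)
        self.visit_counts = np.zeros(num_states)
        self.discount_factor = discount_factor

    def process_trajectory(self, states, rewards):
        # Walk the episode backwards so each state's discounted
        # return can be accumulated in a single pass
        return_so_far = 0.0
        for state, reward in zip(reversed(states), reversed(rewards)):
            return_so_far = reward + self.discount_factor * return_so_far
            # Incremental mean: nudge V(s) toward the observed return
            self.visit_counts[state] += 1
            self.v_table[state] += (
                return_so_far - self.v_table[state]
            ) / self.visit_counts[state]

    def visualize_v(self, grid_shape):
        # Print the value estimates laid out on the level grid
        print(np.round(self.v_table.reshape(grid_shape), 3))

Walking the trajectory backwards keeps each update linear in the episode length, and with discount_factor=0.5 the estimate for a state is dominated by rewards only a few steps ahead. The (4, 4) shape passed to visualize_v suggests a 4x4 grid level with 16 states.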