Example #1
def play_game(grid: Grid, policy):
    # Reset the game to start at a random position.
    # We need to do this because, under the current deterministic policy,
    # we would never reach certain states otherwise.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    states_and_rewards = [(state, 0)]  # List of tuples (state, reward)
    while not grid.is_game_over():
        action = policy[state]
        action = random_action(action)
        reward = grid.move(action)
        state = grid.current_state()
        states_and_rewards.append((state, reward))

    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_and_returns = []
    first = True
    for state, reward in reversed(states_and_rewards):
        # The terminal state has value 0, so skip it; the final G computed is never used.
        if first:
            first = False
        else:
            states_and_returns.append((state, G))
        G = reward + gamma*G
    
    states_and_returns.reverse()  # Restore chronological order; the list was built backwards
    return states_and_returns
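
A minimal sketch of how this play_game could drive first-visit Monte Carlo prediction. standard_grid() as the environment factory, a module-level gamma, and the policy dict are assumptions carried over from the surrounding gridworld code, not part of the example above.

import numpy as np

def first_visit_mc_prediction(policy, num_episodes=5000):
    grid = standard_grid()  # assumed grid factory from the surrounding code
    V = {}
    returns = {s: [] for s in grid.actions.keys()}  # non-terminal states only

    for _ in range(num_episodes):
        seen = set()
        for state, G in play_game(grid, policy):
            # First-visit MC: only the first occurrence of a state in the
            # episode contributes a sample return.
            if state not in seen:
                returns[state].append(G)
                V[state] = np.mean(returns[state])
                seen.add(state)
    return V
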
Example #2
def play_game(grid: Grid, policy):
    # Start every episode from the fixed start state; actions come from the
    # policy, perturbed by random_action.
    state = (2, 0)
    grid.set_state(state)
    action = random_action(policy[state])

    states_actions_rewards = [(state, action, 0)]
    while True:
        reward = grid.move(action)
        state = grid.current_state()
        if grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = random_action(policy[state])
            states_actions_rewards.append((state, action, reward))
            
    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # The terminal state has value 0, so skip it; the final G computed is never used.
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma*G
    
    states_actions_returns.reverse()  # Restore chronological order; the list was built backwards
    return states_actions_returns
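
One plausible way to consume this version for Monte Carlo control: estimate Q(s, a) from sampled episodes, then make the policy greedy with respect to Q after each episode. ALL_POSSIBLE_ACTIONS (a module-level collection of the moves, as used in Example #4 below), the episode count, and the initial random policy are assumptions, not the original training loop.

import numpy as np

def mc_control(grid, num_episodes=5000):
    policy = {s: np.random.choice(ALL_POSSIBLE_ACTIONS) for s in grid.actions.keys()}
    Q = {s: {a: 0.0 for a in ALL_POSSIBLE_ACTIONS} for s in grid.actions.keys()}
    returns = {(s, a): [] for s in grid.actions.keys() for a in ALL_POSSIBLE_ACTIONS}

    for _ in range(num_episodes):
        seen = set()
        for s, a, G in play_game(grid, policy):
            # First-visit MC for each (state, action) pair.
            if (s, a) not in seen:
                returns[(s, a)].append(G)
                Q[s][a] = np.mean(returns[(s, a)])
                seen.add((s, a))
        # Policy improvement: act greedily with respect to the current Q.
        for s in policy:
            policy[s] = max(Q[s], key=Q[s].get)
    return policy, Q
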
Example #3
def play_game(grid: Grid, policy: dict):
    # returns a list of states and corresponding rewards
    # start at the designated start state
    s = (2, 0)
    grid.set_state(s)
    states_and_rewards = [(s, 0)]  # list of tuples (state, reward)
    while not grid.game_over():
        a = policy[s]
        a = random_action(a)
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))
    return states_and_rewards
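
Unlike the other examples, this version returns raw (state, reward) pairs and leaves any return computation to the caller. A sketch of one possible consumer is a TD(0)-style value update; alpha, gamma, standard_grid(), and grid.all_states() are assumptions here, not part of the snippet above.

def td0_prediction(policy, num_episodes=1000, alpha=0.1, gamma=0.9):
    grid = standard_grid()                    # assumed grid factory
    V = {s: 0.0 for s in grid.all_states()}   # assumed helper listing every state
    for _ in range(num_episodes):
        states_and_rewards = play_game(grid, policy)
        for t in range(len(states_and_rewards) - 1):
            s, _ = states_and_rewards[t]
            s2, r = states_and_rewards[t + 1]
            # TD(0): nudge V(s) toward the bootstrapped target r + gamma * V(s2).
            V[s] = V[s] + alpha * (r + gamma * V[s2] - V[s])
    return V
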
Example #4
def play_game(grid: Grid, policy):
    # Reset the game to start at a random position.
    # We need to do this because, under the current deterministic policy,
    # we would never reach certain states otherwise.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    state = grid.current_state()
    action = np.random.choice(ALL_POSSIBLE_ACTIONS)  # First action is chosen uniformly at random

    states_actions_rewards = [(state, action, 0)]
    seen_states = set()
    seen_states.add(grid.current_state())
    num_steps = 0

    while True:
        reward = grid.move(action)
        num_steps += 1
        state = grid.current_state()

        if state in seen_states:
            # Hack: penalize revisits so we don't end up in an infinitely long
            # episode bumping into a wall.
            r = -10. / num_steps
            states_actions_rewards.append((state, None, r))
            break
        elif grid.is_game_over():
            states_actions_rewards.append((state, None, reward))
            break
        else:
            action = policy[state]
            states_actions_rewards.append((state, action, reward))
        seen_states.add(state)

    # Calculate returns, G, by working backwards from the terminal state
    G = 0
    states_actions_returns = []
    first = True
    for state, action, reward in reversed(states_actions_rewards):
        # The terminal state has value 0, so skip it; the final G computed is never used.
        if first:
            first = False
        else:
            states_actions_returns.append((state, action, G))
        G = reward + gamma * G

    states_actions_returns.reverse()  # Restore chronological order; the list was built backwards
    return states_actions_returns
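
Because this variant already performs exploring starts (a random start state plus a uniformly random first action), a control loop on top of it can use a plain deterministic policy. The sketch below keeps running sample means instead of storing full return lists; apart from ALL_POSSIBLE_ACTIONS and the play_game above, the names and episode count are assumptions.

import numpy as np

def mc_exploring_starts(grid, num_episodes=10000):
    policy = {s: np.random.choice(ALL_POSSIBLE_ACTIONS) for s in grid.actions.keys()}
    Q = {s: {a: 0.0 for a in ALL_POSSIBLE_ACTIONS} for s in grid.actions.keys()}
    counts = {s: {a: 0 for a in ALL_POSSIBLE_ACTIONS} for s in grid.actions.keys()}

    for _ in range(num_episodes):
        seen = set()
        for s, a, G in play_game(grid, policy):
            if (s, a) in seen:  # first-visit MC for each (state, action) pair
                continue
            seen.add((s, a))
            counts[s][a] += 1
            # Incremental sample mean: Q <- Q + (G - Q) / n
            Q[s][a] += (G - Q[s][a]) / counts[s][a]
        # Greedy policy improvement after each episode.
        for s in policy:
            policy[s] = max(Q[s], key=Q[s].get)
    return policy, Q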