Example #1
def play_and_train(env: gym.Env, agent: QLearningAgent, t_max=10 ** 4):
    """ This function should
    - run a full game (for t_max steps), actions given by agent
    - train agent whenever possible
    - return total reward
    """
    total_reward = 0.0
    state = env.reset()

    for _ in range(t_max):
        action = agent.get_action(state)
        new_state, reward, done, _ = env.step(action)
        total_reward += reward
        agent.update(state, action, new_state, reward)
        state = new_state

        if done:
            break

    return total_reward
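Note: the QLearningAgent class that all of these examples instantiate is not included in the fragments. Below is a minimal sketch of what it plausibly looks like, assuming a tabular agent with the standard Q-learning update and an epsilon-greedy policy. The constructor signature and the update(state, action, next_state, reward) call match the examples; the internals are an assumption.

import random
from collections import defaultdict


class QLearningAgent:
    """Tabular Q-learning agent (sketch; internals are an assumption)."""

    def __init__(self, alpha, epsilon, discount, get_legal_actions):
        self.alpha = alpha                    # learning rate
        self.epsilon = epsilon                # exploration probability
        self.discount = discount              # discount factor gamma
        self.get_legal_actions = get_legal_actions
        self._qvalues = defaultdict(float)    # (state, action) -> Q(s, a)

    def get_qvalue(self, state, action):
        return self._qvalues[(state, action)]

    def get_value(self, state):
        # V(s) = max over legal actions of Q(s, a).
        actions = list(self.get_legal_actions(state))
        if not actions:
            return 0.0
        return max(self.get_qvalue(state, a) for a in actions)

    def update(self, state, action, next_state, reward):
        # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * V(s')).
        target = reward + self.discount * self.get_value(next_state)
        new_q = (1 - self.alpha) * self.get_qvalue(state, action) + self.alpha * target
        self._qvalues[(state, action)] = new_q

    def get_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise greedy.
        actions = list(self.get_legal_actions(state))
        if random.random() < self.epsilon:
            return random.choice(actions)
        return max(actions, key=lambda a: self.get_qvalue(state, a))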
Example #2
    n_actions = env.action_space.n

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create q learning agent with
    # alpha=0.5
    # get_legal_actions = lambda s: range(n_actions)
    # epsilon=0.1
    # discount=0.99

    alpha = 0.5
    get_legal_actions = lambda s: range(n_actions)
    epsilon = 0.1
    discount = 0.99

    agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)

    plt.figure(figsize=[10, 4])
    rewards = []

    # Training loop
    for i in range(max_iterations):
        # Play one full game and record its total reward.
        # play_and_train resets the environment itself, so no separate
        # env.reset() call is needed here.
        rewards.append(play_and_train(env, agent))

        # Decay agent epsilon
        # agent.epsilon = ?
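One way to fill in the "# agent.epsilon = ?" placeholder (here and in the identical TODOs in the later examples) is the multiplicative decay with a small floor that Example #6 actually uses; the decay rate and floor are that example's choice, not this fragment's:

        agent.epsilon = max(agent.epsilon * 0.99, 0.00001)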
Example #3
        if done:
            break

    return total_reward


if __name__ == '__main__':
    env = gym.make("CartPole-v0").env
    env.reset()
    n_actions = env.action_space.n

    print(env.observation_space.high)
    print(env.observation_space.low)
    print('CartPole state: %s' % (env.reset()))

    agent = QLearningAgent(0.3, 0.5, 1.0, lambda s: range(n_actions))

    # (x, x', theta, theta')
    state_bins = [  # Cart position.
        discretize_range(-2.4, 2.4, 2),
        # Cart velocity.
        discretize_range(-3.0, 3.0, 2),
        # Pole angle.
        discretize_range(-0.5, 0.5, 7),
        # Tip velocity.
        discretize_range(-2.0, 2.0, 7)
    ]
    max_bins = max(len(bins) for bins in state_bins)

    rewards = []
    for i in range(2000):
Example #4

    env = gym.make('Taxi-v3')
    env.reset()
    env.render()

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create q learning agent with
    # alpha=0.5
    # get_legal_actions = lambda s: range(n_actions)
    # epsilon=0.1
    # discount=0.99

    agent = QLearningAgent(0.5, 0.1, 0.99, lambda s: range(n_actions))

    plt.figure(figsize=[10, 4])
    rewards = []

    # Training loop
    for i in range(max_iterations):

        # Play one full game and record its total reward.
        total_r = play_and_train(env, agent)
        rewards.append(total_r)

        # Decay agent epsilon
        # agent.epsilon = ?
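Several of these examples open a figure with plt.figure(figsize=[10, 4]), but the plotting code itself is cut off. A plausible completion (the smoothing window is an assumption) that shows raw episode rewards plus a moving average:

import numpy as np
import matplotlib.pyplot as plt


def plot_rewards(rewards, window=100):
    # Plot raw episode rewards and a moving average for readability.
    plt.figure(figsize=[10, 4])
    plt.plot(rewards, alpha=0.3, label='episode reward')
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(range(window - 1, len(rewards)), smoothed, label='moving average')
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.legend()
    plt.show()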
Example #5
            break

    return total_reward


if __name__ == '__main__':
    env = gym.make("CartPole-v0").env
    env.reset()
    n_actions = env.action_space.n

    print(env.observation_space.high)
    print(env.observation_space.low)
    print('CartPole state: %s' % (env.reset()))

    agent = QLearningAgent(alpha=0.3,
                           epsilon=0.5,
                           discount=1.0,
                           get_legal_actions=lambda s: range(n_actions))

    # (x, x', theta, theta')
    state_bins = [  # Cart position.
        discretize_range(-2.4, 2.4, 2),
        # Cart velocity.
        discretize_range(-3.0, 3.0, 2),
        # Pole angle.
        discretize_range(-0.5, 0.5, 7),
        # Tip velocity.
        discretize_range(-2.0, 2.0, 7)
    ]
    max_bins = max(len(bins) for bins in state_bins)

    rewards = []
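Examples #3 and #5 call a discretize_range helper that is not shown in the fragments. A sketch of what it might do, assuming it returns the interior bin edges for one continuous state dimension (the function name matches the calls above; the body and the companion lookup are assumptions):

import numpy as np


def discretize_range(lower_bound, upper_bound, num_bins):
    # Interior edges splitting [lower_bound, upper_bound] into num_bins buckets.
    return np.linspace(lower_bound, upper_bound, num_bins + 1)[1:-1]


def discretize_value(value, bins):
    # Index of the bucket that `value` falls into.
    return int(np.digitize(value, bins))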
Example #6
    env = CliffWalkingEnv()
    env.reset()
    env.render()

    n_states = env.nS
    n_actions = env.nA

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # Create Q-learning and SARSA agents with the same hyperparameters.
    alpha = 0.5
    get_legal_actions = lambda s: range(n_actions)
    epsilon = 0.2
    discount = 0.99

    agent = QLearningAgent(alpha, epsilon, discount, get_legal_actions)
    agent2 = SarsaAgent(alpha, epsilon, discount, get_legal_actions)

    plt.figure(figsize=[10, 4])
    rewards1 = []
    rewards2 = []
    # Training loop
    for i in range(max_iterations):
        # Play & train game

        rewards1.append(play_and_train(env, agent))
        rewards2.append(play_and_train(env, agent2))
        if (i + 1) % 100 == 0:
            agent.epsilon = max(agent.epsilon * 0.99, 0.00001)
            agent2.epsilon = max(agent2.epsilon * 0.99, 0.00001)
            # agent.alpha = max(agent.alpha * 0.99, 0.00001)
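SarsaAgent is also not defined in the fragments. Since play_and_train calls update(state, action, next_state, reward) without passing a next action, an expected-value SARSA variant fits the shared interface: it differs from Q-learning only in valuing the next state under the epsilon-greedy policy rather than greedily. A sketch, building on the hypothetical QLearningAgent above:

class SarsaAgent(QLearningAgent):
    """Expected-value SARSA (sketch): same interface as QLearningAgent."""

    def get_value(self, state):
        # E[Q(s, a)] under the epsilon-greedy policy: with probability
        # epsilon pick uniformly at random, otherwise pick the best action.
        actions = list(self.get_legal_actions(state))
        if not actions:
            return 0.0
        q_values = [self.get_qvalue(state, a) for a in actions]
        return (self.epsilon * sum(q_values) / len(actions)
                + (1 - self.epsilon) * max(q_values))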
Example #7

    env = gym.make('Taxi-v2').env
    env.reset()
    env.render()

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create q learning agent with
    # alpha=0.5
    # get_legal_actions = lambda s: range(n_actions)
    # epsilon=0.1
    # discount=0.99

    agent = QLearningAgent(alpha=0.5, get_legal_actions=lambda s: range(n_actions), discount=0.99, epsilon=0.1)

    plt.figure(figsize=[10, 4])
    rewards = []

    # Training loop
    for i in range(max_iterations):
        # Play one full game and record its total reward.
        rewards.append(play_and_train(env, agent))

        # Decay agent epsilon

        if i % 100 == 0:
Example #8
    env.reset()
    env.render()

    n_states = env.observation_space.n
    n_actions = env.action_space.n

    print('States number = %i, Actions number = %i' % (n_states, n_actions))

    # create q learning agent with
    # alpha=0.5
    # get_legal_actions = lambda s: range(n_actions)
    # epsilon=0.1
    # discount=0.99

    epsilon = 0.1
    agent = QLearningAgent(0.5, epsilon, 0.99, lambda s: range(n_actions))

    plt.figure(figsize=[10, 4])
    rewards = []

    # Training loop
    for i in range(max_iterations):
        # Play one full game and record its total reward.
        result = play_and_train(env, agent)
        rewards.append(result)
        # Decay agent epsilon
        # agent.epsilon = ?
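Putting the fragments together, one self-contained run over Taxi might look like the following, assuming the pre-0.26 gym API used throughout these examples, the hypothetical QLearningAgent sketched after Example #1, and play_and_train from Example #1; the episode count and decay schedule are choices, not taken from any single example:

import gym

env = gym.make('Taxi-v3').env
n_actions = env.action_space.n
agent = QLearningAgent(alpha=0.5, epsilon=0.1, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))

rewards = []
for i in range(2000):
    rewards.append(play_and_train(env, agent))
    if (i + 1) % 100 == 0:
        # Decay exploration and report progress every 100 episodes.
        agent.epsilon = max(agent.epsilon * 0.99, 0.00001)
        print('episode %i, mean reward over last 100: %.1f'
              % (i + 1, sum(rewards[-100:]) / 100))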