Example 1
def test_render():
    env = SoccerEnv()
    env.render()
    # both players stick, so the board should look the same after the step
    action = env.encode_action(SoccerEnv.Action.Stick, SoccerEnv.Action.Stick)
    env.step(action)
    env.render()
Example 2
def test_transition2():
    initial_state = SoccerEnv.encode_state(0, 3, 0, 1, False)
    action = SoccerEnv.encode_action(SoccerEnv.Action.S, SoccerEnv.Action.N)
    env = SoccerEnv()
    transitions = env.P[initial_state][action]
    # every transition from this state-action pair should reach the same successor,
    # with zero reward and without terminating the episode
    expected_next_state = SoccerEnv.encode_state(1, 3, 0, 1, False)
    for prob, next_state, reward, done in transitions:
        assert next_state == expected_next_state
        assert reward == 0
        assert not done
Example 3
def test_transitions():
    # the static helper returns (next_state, reward, done) tuples;
    # from this configuration exactly two successor states are possible
    transitions = SoccerEnv.transitions(0, 2, 0, 1, 0, SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    expected_states = {SoccerEnv.encode_state(0, 2, 0, 1, 0), SoccerEnv.encode_state(0, 2, 0, 1, 1)}
    assert len(transitions) == 2
    for next_state, reward, done in transitions:
        assert next_state in expected_states
        assert reward == 0
        assert not done

    # the same two outcomes should appear in the environment's transition table,
    # each with probability 0.5
    state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
    action = SoccerEnv.encode_action(SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    env = SoccerEnv()
    transitions = env.P[state][action]
    assert len(transitions) == 2
    for prob, next_state, reward, done in transitions:
        assert abs(prob - 0.5) < 0.001
        assert next_state in expected_states
        assert reward == 0
        assert not done
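The tests above assume that env.P uses the usual discrete-environment layout: a dict mapping each state to a dict mapping each joint action to a list of (probability, next_state, reward, done) tuples. Under that assumption, a rough sanity check is that the probabilities in every such list sum to one. The snippet below is a minimal sketch of that check, not part of the test suite.

def check_transition_probabilities(env, tol=1e-6):
    # for every (state, joint-action) pair, the outgoing transition
    # probabilities should form a proper distribution
    for state, actions in env.P.items():
        for action, transitions in actions.items():
            total = sum(prob for prob, _, _, _ in transitions)
            assert abs(total - 1.0) < tol, (state, action, total)

env = SoccerEnv()
check_transition_probabilities(env)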
Example 4
def test_action_encode():
    # encoding a pair of per-player actions and decoding it again
    # should give back the original pair
    env = SoccerEnv()
    action1, action2 = 1, 2
    x = env.encode_action(action1, action2)
    assert (action1, action2) == env.decode_action(x)
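For reference, one encoding that satisfies this round-trip property is a mixed-radix scheme over the number of per-player actions. The sketch below is purely illustrative and assumes a num_actions count of individual actions per player; it is not necessarily how SoccerEnv implements encode_action and decode_action.

# illustrative only: an encode/decode pair consistent with the round-trip test above
def encode_action_pair(action1, action2, num_actions):
    return action1 * num_actions + action2

def decode_action_pair(joint_action, num_actions):
    return joint_action // num_actions, joint_action % num_actions

assert decode_action_pair(encode_action_pair(1, 2, 5), 5) == (1, 2)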
Example 5
import numpy as np

# assumes env (a SoccerEnv), Q_agent, num_actions, n_episodes_MAX,
# steps_MAX, and verbose are defined earlier
ref_state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
ref_P1_action = int(SoccerEnv.Action.S)

# per-episode absolute changes of the reference Q-value, collected for plotting
Q_Learner_P1_Q_errors = []

for i_episode in range(n_episodes_MAX):
    state = env.reset()

    # snapshot of the reference Q-value at the start of the episode
    P1_Q_ref = Q_agent.Q[ref_state, ref_P1_action]

    for t in range(steps_MAX):
        # P1 follows the learner's policy; P2 plays uniformly at random
        P1_action = Q_agent.sample_policy(state)
        P2_action = np.random.randint(num_actions)

        joint_action = env.encode_action(P1_action, P2_action)

        # Take action A, observe R, S'
        state_new, reward, done, info = env.step(joint_action)

        Q_agent.learn(reward, state, state_new, P1_action)

        state = state_new

        if done:
            if verbose:
                print("Episode finished after {} timesteps".format(t + 1))
            break

    # absolute change of the reference Q-value over the episode
    Q_Learner_P1_Q_errors.append(np.abs(Q_agent.Q[ref_state, ref_P1_action] - P1_Q_ref))
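The collected Q_Learner_P1_Q_errors can then be plotted to see how the reference Q-value settles over training; the curve should flatten as the estimate converges. A minimal sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

# per-episode absolute change of Q(ref_state, ref_P1_action)
plt.plot(Q_Learner_P1_Q_errors)
plt.xlabel("episode")
plt.ylabel("absolute Q change at reference state-action")
plt.title("Q-learning convergence at the reference state-action pair")
plt.show()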