Example #1
def test_transition2():
    initial_state = SoccerEnv.encode_state(0, 3, 0, 1, False)
    action = SoccerEnv.encode_action(SoccerEnv.Action.S, SoccerEnv.Action.N)
    env = SoccerEnv()
    transitions = env.P[initial_state][action]
    expected_next_state = SoccerEnv.encode_state(1, 3, 0, 1, False)
    for prob, next_state, reward, done in transitions:
        assert next_state == expected_next_state
        assert reward == 0
        assert done == 0
Example #2
def test_transitions():
    transitions = SoccerEnv.transitions(0, 2, 0, 1, 0, SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    expected_states = {SoccerEnv.encode_state(0, 2, 0, 1, 0),
                       SoccerEnv.encode_state(0, 2, 0, 1, 1)}
    assert len(transitions) == 2
    for next_state, reward, done in transitions:
        assert next_state in expected_states
        assert reward == 0
        assert done == 0

    state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
    action = SoccerEnv.encode_action(SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    env = SoccerEnv()
    transitions = env.P[state][action]
    assert len(transitions) == 2
    for prob, next_state, reward, done in transitions:
        assert abs(prob - 0.5) < 0.001
        assert next_state in expected_states
        assert reward == 0
        assert done == 0
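
Both examples read the same Gym-style env.P table, in which each entry of
env.P[state][action] is a (prob, next_state, reward, done) tuple. A handy
extra check, sketched below using only the API the examples already
demonstrate, is that the probabilities of any single entry sum to one. The
helper check_transition_probabilities is ours, not part of SoccerEnv:

def check_transition_probabilities(env, state, action, tol=1e-6):
    # The outgoing probabilities of one (state, action) entry must form a
    # probability distribution.
    total = sum(prob for prob, _, _, _ in env.P[state][action])
    assert abs(total - 1.0) < tol

env = SoccerEnv()
state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
action = SoccerEnv.encode_action(SoccerEnv.Action.W, SoccerEnv.Action.Stick)
check_transition_probabilities(env, state, action)
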
import numpy as np

# Training configuration. env, num_states, num_individual_actions, and
# num_actions are assumed to be defined by the earlier SoccerEnv setup;
# they are not shown in this snippet.
n_episodes_MAX = 2 * 10**5  # 10 * 10**5
steps_MAX = 100
verbose = False
alpha = 0.1

# Instantiate Q-Learner
from QLearner import QLearningAgent

Foe_Q_agent = QLearningAgent(state_size=num_states,
                             action_size=num_individual_actions,
                             value_state_function_learner='minimax-q',
                             learning_rate=alpha)
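
# A minimal sketch of the per-state computation a 'minimax-q' learner
# performs (Littman-style maximin over the opponent's actions, solved as a
# linear program). Illustration only: QLearningAgent's internals are not
# shown in this snippet, and the Q[s, a_P1, a_P2] layout is assumed from
# the indexing used further below.
from scipy.optimize import linprog

def minimax_value(Q_s):
    """max over pi of min over o of sum_a pi[a] * Q_s[a, o], for Q_s = Q[s, :, :]."""
    n_a, n_o = Q_s.shape
    c = np.zeros(n_a + 1)
    c[0] = -1.0                                    # maximize v by minimizing -v
    A_ub = np.hstack([np.ones((n_o, 1)), -Q_s.T])  # v <= pi @ Q_s[:, o] for each o
    b_ub = np.zeros(n_o)
    A_eq = np.zeros((1, n_a + 1))
    A_eq[0, 1:] = 1.0                              # pi must sum to one
    b_eq = np.array([1.0])
    bounds = [(None, None)] + [(0.0, 1.0)] * n_a   # v free, pi in [0, 1]
    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    return res.x[0], res.x[1:]                     # state value, P1 mixed policy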

# Reference state s, P1 action 'S', P2 action 'Stick'
ref_state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
ref_P1_action = int(SoccerEnv.Action.S)
ref_P2_action = int(SoccerEnv.Action.Stick)

# Q-value errors at the reference pair, collected for plotting
Foe_Q_P1_Q_errors = []

for i_episode in range(n_episodes_MAX):
    state = env.reset()

    # Snapshot the reference Q-value at the start of the episode; its
    # per-episode change is the error tracked in Foe_Q_P1_Q_errors.
    P1_Q_ref = Foe_Q_agent.Q[ref_state, ref_P1_action, ref_P2_action]

    for t in range(steps_MAX):
        # Both players explore uniformly at random over the joint action space
        joint_action = np.random.randint(num_actions)

        # Take action A, observe R, S'