def test_render():
    env = SoccerEnv()
    env.render()
    action = env.encode_action(SoccerEnv.Action.Stick, SoccerEnv.Action.Stick)
    env.step(action)
    env.render()
def test_transition2():
    initial_state = SoccerEnv.encode_state(0, 3, 0, 1, False)
    action = SoccerEnv.encode_action(SoccerEnv.Action.S, SoccerEnv.Action.N)
    env = SoccerEnv()
    transitions = env.P[initial_state][action]
    expected_next_state = SoccerEnv.encode_state(1, 3, 0, 1, False)
    for prob, next_state, reward, done in transitions:
        assert next_state == expected_next_state
        assert reward == 0
        assert done == 0
def test_transitions():
    # Static transition enumeration: simultaneous W/Stick from this state
    # should yield exactly two possible next states.
    transitions = SoccerEnv.transitions(
        0, 2, 0, 1, 0, SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    expected_states = {
        SoccerEnv.encode_state(0, 2, 0, 1, 0),
        SoccerEnv.encode_state(0, 2, 0, 1, 1),
    }
    assert len(transitions) == 2
    for next_state, reward, done in transitions:
        assert next_state in expected_states
        assert reward == 0
        assert done == 0

    # The same pair of outcomes should appear in the env's transition
    # table, each with probability 0.5.
    state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
    action = SoccerEnv.encode_action(SoccerEnv.Action.W, SoccerEnv.Action.Stick)
    env = SoccerEnv()
    transitions = env.P[state][action]
    assert len(transitions) == 2
    for prob, next_state, reward, done in transitions:
        assert abs(prob - 0.5) < 0.001
        assert next_state in expected_states
        assert reward == 0
        assert done == 0
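# The 0.5/0.5 split asserted above is what you would get from resolving the
# two simultaneous moves in a random execution order. A hypothetical sketch
# of that resolution; transitions_sketch, apply_move, and its signature are
# assumptions for illustration, not the env's actual internals:
def transitions_sketch(state, a1, a2, apply_move):
    outcomes = []
    for order in ((("P1", a1), ("P2", a2)),
                  (("P2", a2), ("P1", a1))):
        s = state
        for player, action in order:
            s = apply_move(s, player, action)
        outcomes.append(s)
    # Two orderings -> (up to) two next states, each with probability 0.5.
    return outcomes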
def test_action_encode():
    env = SoccerEnv()
    action1, action2 = 1, 2
    x = env.encode_action(action1, action2)
    assert (action1, action2) == env.decode_action(x)
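# test_action_encode only pins down that decode_action inverts encode_action.
# One common way to pack two per-player actions into a single joint index is
# mixed-radix encoding; a minimal sketch, assuming five per-player actions
# (e.g. N, S, E, W, Stick). These standalone helpers are illustrative
# assumptions, not necessarily the env's implementation:
def encode_action_sketch(a1, a2, num_actions=5):
    # Pack two per-player actions into one joint-action index.
    return a1 * num_actions + a2

def decode_action_sketch(x, num_actions=5):
    # Inverse of the packing above.
    return divmod(x, num_actions)

assert decode_action_sketch(encode_action_sketch(1, 2)) == (1, 2)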
ref_state = SoccerEnv.encode_state(0, 2, 0, 1, 0)
ref_P1_action = int(SoccerEnv.Action.S)

# Q errors for plotting: per-episode change of Q at the reference
# state-action pair, used as a convergence measure.
Q_Learner_P1_Q_errors = []

for i_episode in range(n_episodes_MAX):
    state = env.reset()
    # Snapshot Q(ref_state, ref_action) before this episode's updates.
    P1_Q_ref = Q_agent.Q[ref_state, ref_P1_action]
    for t in range(steps_MAX):
        P1_action = Q_agent.sample_policy(state)
        P2_action = np.random.randint(num_actions)  # P2 plays uniformly at random
        joint_action = env.encode_action(P1_action, P2_action)
        # Take action A, observe R, S'
        state_new, reward, done, info = env.step(joint_action)
        Q_agent.learn(reward, state, state_new, P1_action)
        state = state_new
        if done:
            if verbose:
                print("Episode finished after {} timesteps".format(t + 1))
            break
    # Error at end of episode: how far Q moved at the reference entry.
    Q_Learner_P1_Q_errors.append(np.abs(Q_agent.Q[ref_state, ref_P1_action] - P1_Q_ref))
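# The training loop above assumes a Q_agent exposing a Q table indexed by
# (state, action), sample_policy(state), and learn(reward, state, state_new,
# action). A minimal tabular epsilon-greedy sketch matching that interface;
# the class name and the alpha/gamma/epsilon values are assumptions, not the
# original agent's definition:
import numpy as np

class QLearnerSketch:
    def __init__(self, num_states, num_actions,
                 alpha=0.1, gamma=0.9, epsilon=0.1):
        self.Q = np.zeros((num_states, num_actions))
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.num_actions = num_actions

    def sample_policy(self, state):
        # Epsilon-greedy over the current Q estimates.
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.num_actions)
        return int(np.argmax(self.Q[state]))

    def learn(self, reward, state, state_new, action):
        # One-step Q-learning update toward the TD target.
        td_target = reward + self.gamma * np.max(self.Q[state_new])
        self.Q[state, action] += self.alpha * (td_target - self.Q[state, action])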