import math
from typing import Optional

import numpy as np
from mlagents_envs.base_env import BaseEnv

# DQNAgent, Config, and Experience are project-local classes defined elsewhere.


def _run_agent_one_ep(env: BaseEnv, agent: DQNAgent, config: Config, eps: float, behavior_name: str, train: Optional[bool] = True):
    # Get a starting state
    env.reset()
    decision_steps, terminal_steps = env.get_steps(behavior_name)
    state = decision_steps.obs[0]
    agent_id = decision_steps.agent_id[0]

    done = False
    did_win = False
    episode_reward = 0.0

    while not done:
        reward = 0.0

        # Get and perform an action (epsilon-greedy)
        action = agent.act(decision_steps.obs[0], eps)
        env.set_actions(behavior_name, np.expand_dims(action, 0).reshape(-1, 1))
        env.step()
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        # Determine S', R, done
        next_state = None
        if agent_id in decision_steps:
            reward += decision_steps.reward[0]
            next_state = decision_steps.obs[0]
        if agent_id in terminal_steps:
            terminal_reward = terminal_steps.reward[0]
            # A terminal reward of 1.0 marks a win
            did_win = math.isclose(terminal_reward, 1.0)
            reward += terminal_reward
            next_state = terminal_steps.obs[0]
            done = True

        assert next_state is not None, (
            f"next_state cannot be None. Agent {agent_id} did not appear in decision or terminal steps"
        )

        if train:
            # Learn from (S, A, R, S')
            experience = Experience(state, action, reward, next_state, done)
            agent.step(experience)

        # Set new state
        state = next_state
        episode_reward += reward

    return (episode_reward, did_win)
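
# A minimal usage sketch, assuming a Unity editor connection and the project's
# DQNAgent/Config classes; the constructors and the epsilon schedule below are
# hypothetical placeholders, not the author's exact training loop.
from mlagents_envs.environment import UnityEnvironment

env = UnityEnvironment(file_name=None)  # file_name=None attaches to a running Unity editor
env.reset()
behavior_name = list(env.behavior_specs.keys())[0]

config = Config()        # hypothetical: construct however the project does
agent = DQNAgent(config)  # hypothetical: adapt to the project's agent API

for episode in range(1000):
    eps = max(0.01, 0.995 ** episode)  # illustrative exponential epsilon decay
    episode_reward, did_win = _run_agent_one_ep(env, agent, config, eps, behavior_name, train=True)
    print(f"episode {episode}: reward={episode_reward:.2f}, win={did_win}")

env.close()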