def run_single_episode(env: UnityEnvironment, brain_name, agent: Agent = None, train_mode=False, max_t=2000, epsilon=0.0):
    """Run one episode and return the per-agent scores.

    Params
    ======
        env (UnityEnvironment): environment to run the episode in
        brain_name (string): default brain name
        agent (Agent): agent responsible for choosing the actions
            (if None, random actions are sampled instead)
        train_mode (bool): whether the environment runs in train mode
        max_t (int): maximum number of steps in the episode
        epsilon (float): attenuates the noise applied to the action

    Return
    ======
        scores (np.ndarray): episode score accumulated for each agent
    """
    env_info = env.reset(train_mode=train_mode)[brain_name]
    num_agents = len(env_info.agents)
    action_size = env.brains[brain_name].vector_action_space_size
    states = env_info.vector_observations
    scores = np.zeros(num_agents)  # one running score per agent

    for time_step in range(1, max_t + 1):
        if agent:
            # Agent picks one action per agent; noise only while training
            actions = agent.act(states, epsilon=epsilon, add_noise=train_mode)
        else:
            # No agent provided: random actions clipped to [-1, 1]
            actions = np.clip(np.random.randn(num_agents, action_size), -1, 1)

        env_info = env.step(actions)[brain_name]  # advance the environment
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        if agent and train_mode:
            # Record the transition so the agent can learn from it
            agent.step(states, actions, rewards, next_states, dones, time_step)

        states = next_states
        scores += rewards  # accumulate reward for every agent
        if np.any(dones):  # stop as soon as any agent reports done
            break

    return scores
def run_single_episode(env: UnityEnvironment, brain_name, agent: Agent = None, max_t=1000, eps=0., train_mode=False):
    """Run one episode and return its total score.

    Params
    ======
        env (UnityEnvironment): environment to run the episode in
        brain_name (string): default brain name
        agent (Agent): agent responsible for choosing the actions
            (if None, a random action is sampled each step)
        max_t (int): maximum number of steps in the episode
        train_mode (bool): whether the environment runs in train mode

    Return
    ======
        score (float): total score accumulated over the episode
    """
    env_info = env.reset(train_mode=train_mode)[brain_name]
    action_size = env.brains[brain_name].vector_action_space_size
    state = env_info.vector_observations[0]
    score = 0

    for _ in range(max_t):
        if agent:
            action = agent.act(state, eps)
        else:
            # No agent provided: pick a random discrete action
            action = np.random.randint(action_size)

        env_info = env.step(action)[brain_name]  # advance the environment
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]

        if agent and train_mode:
            # Record the transition so the agent can learn from it
            agent.step(state, action, reward, next_state, done)

        state = next_state
        score += reward
        if done:  # stop when the environment signals the episode ended
            break

    return score