def reset(self):
    """
    Reset the environment for a new episode.

    Optionally renders/animates the episode that just finished, then
    re-initializes the agents (and static map, if enabled), zeroes out the
    shared observation buffers, and returns the initial observation.

    Returns:
        initial observation (np array): each agent's observation given the
        initial configuration
    """
    # Only plot if there was a previous episode worth plotting.
    finished_an_episode = (self.episode_step_number is not None
                           and self.episode_step_number > 0)
    if finished_an_episode and self.plot_episodes and self.test_case_index >= 0:
        plot_episode(self.agents, self.evaluate, self.map,
                     self.test_case_index, self.id,
                     circles_along_traj=Config.PLOT_CIRCLES_ALONG_TRAJ,
                     plot_save_dir=self.plot_save_dir,
                     plot_policy_name=self.plot_policy_name,
                     limits=self.plt_limits,
                     fig_size=self.plt_fig_size,
                     show=Config.SHOW_EPISODE_PLOTS,
                     save=Config.SAVE_EPISODE_PLOTS)
        if Config.ANIMATE_EPISODES:
            animate_episode(num_agents=len(self.agents),
                            plot_save_dir=self.plot_save_dir,
                            plot_policy_name=self.plot_policy_name,
                            test_case_index=self.test_case_index,
                            agents=self.agents)

    # Start bookkeeping for the new episode.
    self.episode_number += 1
    self.begin_episode = True
    self.episode_step_number = 0

    self._init_agents()
    if Config.USE_STATIC_MAP:
        self._init_static_map()

    # Clear every agent's slot for every observation field.
    for state_key in Config.STATES_IN_OBS:
        info = Config.STATE_INFO_DICT[state_key]
        for agent_idx in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT):
            self.observation[agent_idx][state_key] = np.zeros(
                (info['size']), dtype=info['dtype'])

    return self._get_obs()
def step(self, actions, dt=None):
    """
    Advance the environment by one timestep.

    An external process computes an action per agent and calls
    ``env.step(actions)``. Agents apply those actions, rewards are computed
    (collision/goal/...), new observations are gathered, and per-agent done
    flags are checked; when every agent is done, the episode is over.

    Args:
        actions (dict): keyed by agent indices, each value has a
            [delta heading angle, speed] command.
        dt (float): time in seconds to run the simulation
            (defaults to :code:`self.dt_nominal`)

    Returns:
        4-element tuple containing

        - **next_observations** (*np array*): (obs_length x num_agents)
          with each agent's observation
        - **rewards** (*list*): 1 scalar reward per agent in self.agents
        - **game_over** (*bool*): true if every agent is done
        - **info_dict** (*dict*): metadata that helps in training
    """
    if dt is None:
        dt = self.dt_nominal

    self.episode_step_number += 1

    # Apply each agent's command, then score the resulting world state.
    self._take_action(actions, dt)
    rewards = self._compute_rewards()
    next_observations = self._get_obs()

    # Periodically dump an animation frame of the in-progress episode.
    animate_now = (Config.ANIMATE_EPISODES
                   and self.episode_step_number % self.animation_period_steps == 0)
    if animate_now:
        plot_episode(self.agents, False, self.map, self.test_case_index,
                     circles_along_traj=Config.PLOT_CIRCLES_ALONG_TRAJ,
                     plot_save_dir=self.plot_save_dir,
                     plot_policy_name=self.plot_policy_name,
                     save_for_animation=True,
                     limits=self.plt_limits,
                     fig_size=self.plt_fig_size,
                     perturbed_obs=self.perturbed_obs,
                     show=False,
                     save=True)

    # Per-agent termination (at goal / collided / out of time).
    which_agents_done, game_over = self._check_which_agents_done()

    which_agents_done_by_id = {
        agent.id: which_agents_done[i]
        for i, agent in enumerate(self.agents)
    }
    learning_by_id = {
        agent.id: agent.policy.is_still_learning
        for agent in self.agents
    }

    info = {
        'which_agents_done': which_agents_done_by_id,
        'which_agents_learning': learning_by_id,
    }
    return next_observations, rewards, game_over, info