Example #1
    def reset(self):
        """ Resets the environment, re-initializes agents, plots episode (if applicable) and returns an initial observation.

        Returns:
            initial observation (np array): each agent's observation given the initial configuration
        """
        if self.episode_step_number is not None and self.episode_step_number > 0 and self.plot_episodes and self.test_case_index >= 0:
            plot_episode(self.agents, self.evaluate, self.map, self.test_case_index, self.id, circles_along_traj=Config.PLOT_CIRCLES_ALONG_TRAJ, plot_save_dir=self.plot_save_dir, plot_policy_name=self.plot_policy_name, limits=self.plt_limits, fig_size=self.plt_fig_size, show=Config.SHOW_EPISODE_PLOTS, save=Config.SAVE_EPISODE_PLOTS)
            if Config.ANIMATE_EPISODES:
                animate_episode(num_agents=len(self.agents), plot_save_dir=self.plot_save_dir, plot_policy_name=self.plot_policy_name, test_case_index=self.test_case_index, agents=self.agents)
            self.episode_number += 1
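        # Start a fresh episode: reset bookkeeping, then re-initialize agents (and the static map, if used)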
        self.begin_episode = True
        self.episode_step_number = 0
        self._init_agents()
        if Config.USE_STATIC_MAP:
            self._init_static_map()
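        # Zero out every agent's observation vector before computing the initial observation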
        for state in Config.STATES_IN_OBS:
            for agent in range(Config.MAX_NUM_AGENTS_IN_ENVIRONMENT):
                self.observation[agent][state] = np.zeros((Config.STATE_INFO_DICT[state]['size']), dtype=Config.STATE_INFO_DICT[state]['dtype'])
        return self._get_obs()
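A minimal usage sketch of reset(), assuming this method belongs to the package's Gym-style environment class; the module path, class name, and no-argument constructor below are assumptions for illustration, not confirmed by the snippet:

    # Assumed module path and class name; adjust to wherever the environment class actually lives.
    from gym_collision_avoidance.envs.collision_avoidance_env import CollisionAvoidanceEnv

    env = CollisionAvoidanceEnv()

    # reset() re-initializes the agents (and the static map, if enabled),
    # zeroes the per-agent observation buffers, and returns the initial observations.
    obs = env.reset()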
Example #2
    def step(self, actions, dt=None):
        """ Run one timestep of environment dynamics.

        This is the main function. An external process computes an action for every agent,
        then calls env.step(actions). The agents take those actions,
        then we check whether any agents have earned a reward (collision/goal/...).
        The agents then observe the new world state. We compute whether each agent is done
        (collided/reached goal/ran out of time); if every agent is done, the episode ends.
        We return the relevant info back to the process that called env.step(actions).

        Args:
            actions (dict): keyed by agent indices, each value is a [delta heading angle, speed] command.
            dt (float): time in seconds to run the simulation (defaults to :code:`self.dt_nominal`)

        Returns:
            4-element tuple containing

            - **next_observations** (*np array*): (obs_length x num_agents) with each agent's observation
            - **rewards** (*list*): 1 scalar reward per agent in self.agents
            - **game_over** (*bool*): true if every agent is done
            - **info_dict** (*dict*): metadata that helps in training

        """
        if dt is None:
            dt = self.dt_nominal

        self.episode_step_number += 1

        # Take action
        self._take_action(actions, dt)

        # Collect rewards
        rewards = self._compute_rewards()

        # Take observation
        next_observations = self._get_obs()

        if Config.ANIMATE_EPISODES and self.episode_step_number % self.animation_period_steps == 0:
            plot_episode(self.agents,
                         False,
                         self.map,
                         self.test_case_index,
                         circles_along_traj=Config.PLOT_CIRCLES_ALONG_TRAJ,
                         plot_save_dir=self.plot_save_dir,
                         plot_policy_name=self.plot_policy_name,
                         save_for_animation=True,
                         limits=self.plt_limits,
                         fig_size=self.plt_fig_size,
                         perturbed_obs=self.perturbed_obs,
                         show=False,
                         save=True)

        # Check which agents' games are finished (at goal/collided/out of time)
        which_agents_done, game_over = self._check_which_agents_done()

        which_agents_done_dict = {}
        which_agents_learning_dict = {}
        for i, agent in enumerate(self.agents):
            which_agents_done_dict[agent.id] = which_agents_done[i]
            which_agents_learning_dict[agent.id] = agent.policy.is_still_learning

        info_dict = {
            'which_agents_done': which_agents_done_dict,
            'which_agents_learning': which_agents_learning_dict,
        }

        return next_observations, rewards, game_over, info_dict
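A hedged sketch of a rollout loop built around step(), following the docstring's contract (actions keyed by agent index, each value a [delta heading angle, speed] command, 4-tuple return). The constant placeholder action and the env object from the earlier sketch are assumptions for illustration only:

    import numpy as np

    obs = env.reset()
    game_over = False
    while not game_over:
        # Placeholder action for every agent: [delta heading angle, speed].
        # In practice an external process (e.g. a trained policy) computes these.
        actions = {i: np.array([0.0, 1.0]) for i in range(len(env.agents))}

        obs, rewards, game_over, info = env.step(actions)

    # info['which_agents_done'] and info['which_agents_learning'] are keyed by agent id.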