def step(self, action_dict):
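        # Overview: each agent receives one observation row per active target
        # (nb_targets), built from that target's belief state, plus the
        # range/bearing of the closest obstacle. Note that the belief loop
        # below runs over num_targets (apparently the maximum slot count),
        # while target movement and feature extraction use nb_targets (the
        # active subset).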
        obs_dict = {}
        reward_dict = {}
        done_dict = {'__all__': False}
        info_dict = {}

        # Targets move (t -> t+1)
        for n in range(self.nb_targets):
            self.targets[n].update()
        # Agents move (t -> t+1) and observe the targets
        for ii, agent_id in enumerate(action_dict):
            obs_dict[self.agents[ii].agent_id] = []
            reward_dict[self.agents[ii].agent_id] = []
            done_dict[self.agents[ii].agent_id] = []

            action_vw = self.action_map[action_dict[agent_id]]
            _ = self.agents[ii].update(
                action_vw,
                [t.state[:2] for t in self.targets[:self.nb_targets]])

            observed = []
            # Update beliefs of all targets
            for jj in range(self.num_targets):
                # Observe
                obs = self.observation(self.targets[jj], self.agents[ii])
                observed.append(obs[0])
                self.belief_targets[jj].predict()  # Belief state at t+1
                if obs[0]:  # if observed, update the target belief.
                    self.belief_targets[jj].update(obs[1],
                                                   self.agents[ii].state)

            obstacles_pt = map_utils.get_closest_obstacle(
                self.MAP, self.agents[ii].state)

            if obstacles_pt is None:
                # No obstacle within sensing range: report the maximum sensor
                # radius at bearing pi (treated as "nothing nearby").
                obstacles_pt = (self.sensor_r, np.pi)
            # Build per-target observation features from the belief of each active target
            for kk in range(self.nb_targets):
                r_b, alpha_b = util.relative_distance_polar(
                    self.belief_targets[kk].state[:2],
                    xy_base=self.agents[ii].state[:2],
                    theta_base=self.agents[ii].state[-1])
                r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                    self.belief_targets[kk].state[:2],
                    self.belief_targets[kk].state[2:],
                    self.agents[ii].state[:2], self.agents[ii].state[-1],
                    action_vw[0], action_vw[1])
                obs_dict[agent_id].append([
                    r_b, alpha_b, r_dot_b, alpha_dot_b,
                    np.log(LA.det(self.belief_targets[kk].cov)),
                    float(observed[kk]), obstacles_pt[0], obstacles_pt[1]
                ])
            obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
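        # NOTE: `obstacles_pt` and `observed` below hold the values from the
        # last agent processed in the loop above. The single (presumably team)
        # reward and done flag are reported only under the '__all__' keys,
        # while the per-agent reward/done entries stay as the empty lists
        # initialized earlier.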
        # Get all rewards after all agents and targets move (t -> t+1)
        reward, done, mean_nlogdetcov = self.get_reward(
            obstacles_pt, observed, self.is_training)
        reward_dict['__all__'] = reward
        done_dict['__all__'] = done
        info_dict['mean_nlogdetcov'] = mean_nlogdetcov
        return obs_dict, reward_dict, done_dict, info_dict
    def step(self, action_dict):
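        # Overview: per-target belief features are flattened into one vector
        # per agent (the obstacle reading is appended once at the end) and
        # augmented with the relative positions of the other agents; a global
        # state is concatenated on top, presumably for centralized training.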
        obs_dict = {}
        locations = []
        full_state = {}
        reward_dict = {}
        done_dict = {'__all__': False}
        info_dict = {}

        # Targets move (t -> t+1)
        for n in range(self.num_targets):
            self.targets[n].update()
            locations.append(self.targets[n].state[:2])
        # Agents move (t -> t+1) and observe the targets
        for ii, agent_id in enumerate(action_dict):
            obs_dict[self.agents[ii].agent_id] = []
            reward_dict[self.agents[ii].agent_id] = []
            done_dict[self.agents[ii].agent_id] = []

            action_vw = self.action_map[action_dict[agent_id]]
            _ = self.agents[ii].update(action_vw,
                                       [t.state[:2] for t in self.targets])
            locations.append(self.agents[ii].state[:2])

            observed = []
            for jj in range(self.num_targets):
                # Observe
                obs = self.observation(self.targets[jj], self.agents[ii])
                observed.append(obs[0])
                self.belief_targets[jj].predict()  # Belief state at t+1
                if obs[0]:  # if observed, update the target belief.
                    self.belief_targets[jj].update(obs[1],
                                                   self.agents[ii].state)

            obstacles_pt = map_utils.get_closest_obstacle(
                self.MAP, self.agents[ii].state)

            if obstacles_pt is None:
                obstacles_pt = (self.sensor_r, np.pi)
            for kk in range(self.num_targets):
                r_b, alpha_b = util.relative_distance_polar(
                    self.belief_targets[kk].state[:2],
                    xy_base=self.agents[ii].state[:2],
                    theta_base=self.agents[ii].state[-1])
                r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                    self.belief_targets[kk].state[:2],
                    self.belief_targets[kk].state[2:],
                    self.agents[ii].state[:2], self.agents[ii].state[-1],
                    action_vw[0], action_vw[1])
                obs_dict[agent_id].extend([
                    r_b, alpha_b, r_dot_b, alpha_dot_b,
                    np.log(LA.det(self.belief_targets[kk].cov)),
                    float(observed[kk])
                ])
            obs_dict[agent_id].extend([obstacles_pt[0], obstacles_pt[1]])
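        # `locations` was filled with target positions first, then agent
        # positions, in the update order above.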
        # Global state (positions measured relative to the map origin),
        # concatenated into every agent's 'state' entry below
        global_state = util.global_relative_measure(np.array(locations),
                                                    self.MAP.origin)
        # Full state dict
        for m, agent_id in enumerate(obs_dict):
            for p, ids in enumerate(obs_dict):
                if agent_id != ids:
                    # Relative position of every other agent
                    r, alpha = util.relative_distance_polar(
                        np.array(self.agents[p].state[:2]),
                        xy_base=self.agents[m].state[:2],
                        theta_base=self.agents[m].state[2])
                    obs_dict[agent_id].extend([r, alpha])
            full_state[agent_id] = {
                'obs': np.asarray(obs_dict[agent_id]),
                'state': np.concatenate((obs_dict[agent_id], global_state))
            }
        # Get all rewards after all agents and targets move (t -> t+1)
        reward, done, mean_nlogdetcov = self.get_reward(
            obstacles_pt, observed, self.is_training)
        reward_dict['__all__'] = reward
        done_dict['__all__'] = done
        info_dict['mean_nlogdetcov'] = mean_nlogdetcov
        return full_state, reward_dict, done_dict, info_dict
    def step(self, action_dict):
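        # Overview: builds per-target observation rows as in the first variant
        # (but with obstacle sensing disabled), then greedily assigns agents
        # to targets so that most agents end up with a single target row.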
        obs_dict = {}
        reward_dict = {}
        done_dict = {'__all__': False}
        info_dict = {}

        # Targets move (t -> t+1)
        for n in range(self.nb_targets):
            self.targets[n].update()
        # Agents move (t -> t+1) and observe the targets
        for ii, agent_id in enumerate(action_dict):
            obs_dict[self.agents[ii].agent_id] = []
            reward_dict[self.agents[ii].agent_id] = []
            done_dict[self.agents[ii].agent_id] = []

            action_vw = self.action_map[action_dict[agent_id]]

            # Positions of the active targets and of the other agents, so the
            # agent update can keep a margin from them
            margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
            for p, ids in enumerate(action_dict):
                if agent_id != ids:
                    margin_pos.append(np.array(self.agents[p].state[:2]))
            _ = self.agents[ii].update(action_vw, margin_pos)

            observed = []
            # Update beliefs of all targets
            for jj in range(self.num_targets):
                # Observe
                obs = self.observation(self.targets[jj], self.agents[ii])
                observed.append(obs[0])
                self.belief_targets[jj].predict()  # Belief state at t+1
                if obs[0]:  # if observed, update the target belief.
                    self.belief_targets[jj].update(obs[1],
                                                   self.agents[ii].state)

            # Obstacle sensing is disabled in this variant: instead of querying
            # map_utils.get_closest_obstacle(), use the fixed "nothing in
            # range" reading for every agent.
            obstacles_pt = (self.sensor_r, np.pi)
            # Build per-target observation features from the belief of each
            # active target (greedy assignment happens below)
            for kk in range(self.nb_targets):
                r_b, alpha_b = util.relative_distance_polar(
                    self.belief_targets[kk].state[:2],
                    xy_base=self.agents[ii].state[:2],
                    theta_base=self.agents[ii].state[-1])
                r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                    self.belief_targets[kk].state[:2],
                    self.belief_targets[kk].state[2:],
                    self.agents[ii].state[:2], self.agents[ii].state[-1],
                    action_vw[0], action_vw[1])
                obs_dict[agent_id].append([
                    r_b, alpha_b, r_dot_b, alpha_dot_b,
                    np.log(LA.det(self.belief_targets[kk].cov)),
                    float(observed[kk]), obstacles_pt[0], obstacles_pt[1]
                ])
        # Greedily assign each agent, in iteration order, to its closest
        # unassigned target; when there are at least as many agents as
        # targets, every target ends up assigned
        mask = np.ones(self.nb_targets, bool)
        oracle = 1 if self.nb_targets > self.nb_agents else 0
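        # In effect: while more targets remain unassigned than the floor
        # max(0, nb_targets - nb_agents + oracle), the current agent keeps
        # only the row of its closest unassigned target; once the floor is
        # reached, any remaining agents keep rows for all targets.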
        for agent_id in obs_dict:
            obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
            if np.sum(mask) != np.maximum(
                    0, self.nb_targets - self.nb_agents + oracle):
                idx = np.flatnonzero(mask)
                close = idx[np.argmin(obs_dict[agent_id][:, 0][mask])]
                obs_dict[agent_id] = obs_dict[agent_id][None, close]
                mask[close] = False
        # Get all rewards after all agents and targets move (t -> t+1)
        reward, done, mean_nlogdetcov = self.get_reward(
            obstacles_pt, observed, self.is_training)
        reward_dict['__all__'] = reward
        done_dict['__all__'] = done
        info_dict['mean_nlogdetcov'] = mean_nlogdetcov
        return obs_dict, reward_dict, done_dict, info_dict
    def step(self, action_dict):
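        # Overview: belief prediction happens once per target before the agent
        # loop, each agent builds one observation row per target inside its
        # observation loop, and obstacle sensing is disabled (fixed reading).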
        obs_dict = {}
        reward_dict = {}
        done_dict = {'__all__': False}
        info_dict = {}

        # Targets move (t -> t+1)
        for n in range(self.nb_targets):
            self.targets[n].update()
            self.belief_targets[n].predict()  # Belief state at t+1
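            # Unlike the variants above, predict() is called once per target
            # per environment step, not once per agent.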
        # Agents move (t -> t+1) and observe the targets
        for ii, agent_id in enumerate(action_dict):
            obs_dict[self.agents[ii].agent_id] = []
            reward_dict[self.agents[ii].agent_id] = []
            done_dict[self.agents[ii].agent_id] = []

            action_vw = self.action_map[action_dict[agent_id]]

            # Positions of the active targets and of the other agents, so the
            # agent update can keep a margin from them
            margin_pos = [t.state[:2] for t in self.targets[:self.nb_targets]]
            for p, ids in enumerate(action_dict):
                if agent_id != ids:
                    margin_pos.append(np.array(self.agents[p].state[:2]))
            _ = self.agents[ii].update(action_vw, margin_pos)

            observed = np.zeros(self.nb_targets, dtype=bool)
            obstacles_pt = (self.sensor_r, np.pi)
            # Update beliefs of all targets
            for jj in range(self.nb_targets):
                # Observe
                obs, z_t = self.observation(self.targets[jj], self.agents[ii])
                observed[jj] = obs

                if obs:  # if observed, update the target belief.
                    self.belief_targets[jj].update(z_t, self.agents[ii].state)

                # Obstacle sensing is disabled here as well (obstacles_pt is
                # fixed above). Per-target belief features are built inside
                # this same observation loop rather than in a separate pass.
                r_b, alpha_b = util.relative_distance_polar(
                    self.belief_targets[jj].state[:2],
                    xy_base=self.agents[ii].state[:2],
                    theta_base=self.agents[ii].state[-1])
                r_dot_b, alpha_dot_b = util.relative_velocity_polar(
                    self.belief_targets[jj].state[:2],
                    self.belief_targets[jj].state[2:],
                    self.agents[ii].state[:2], self.agents[ii].state[-1],
                    action_vw[0], action_vw[1])
                obs_dict[agent_id].append([
                    r_b, alpha_b, r_dot_b, alpha_dot_b,
                    np.log(LA.det(self.belief_targets[jj].cov)),
                    float(obs), obstacles_pt[0], obstacles_pt[1]
                ])
            obs_dict[agent_id] = np.asarray(obs_dict[agent_id])
        # Get all rewards after all agents and targets move (t -> t+1)
        reward, done, mean_nlogdetcov = self.get_reward(
            obstacles_pt, observed, self.is_training)
        reward_dict['__all__'] = reward
        done_dict['__all__'] = done
        info_dict['mean_nlogdetcov'] = mean_nlogdetcov
        return obs_dict, reward_dict, done_dict, info_dict