Example no. 1
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
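        # Advance the wrapped RailEnv one step and repackage the result as a
        # StepOutput whose per-agent info dict carries the episode limits and
        # the running score/step counters.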
        obs, rewards, dones, infos = self.rail_env.step(action_dict)

        o, r, d = self._step_out(obs, dones)

        assert len(obs) > 0
        assert all([x is not None for x in (dones, rewards, obs)])

        return StepOutput(
            obs=o,
            reward=r,
            done=d,
            info={
                agent: {
                    'max_episode_steps': self.rail_env._max_episode_steps,
                    'num_agents': self.rail_env.get_num_agents(),
                    'agent_done': (dones[agent] and
                                   agent not in self.rail_env.active_agents),
                    'agent_score': self._agent_scores[agent],
                    'agent_step': self._agent_steps[agent],
                }
                for agent in o.keys()
            })
Example no. 2
    def _scheduling_step(self, action):
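        # Treat the incoming `action` values as per-agent priorities: sort the
        # handles by priority, then roll the episode out to completion with a
        # ShortestPathAgent, reporting completion rate and malfunctions.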
        norm_factor = (self._env.rail_env._max_episode_steps *
                       self._env.rail_env.get_num_agents())
        sorted_actions = dict(
            sorted(action.items(), key=lambda item: item[1], reverse=True))
        self._env.sorted_handles = list(sorted_actions.keys())

        done = defaultdict(lambda: False)
        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(
                self.last_obs, self._env.rail_env)
            _, _, done, _ = self._env.step(actions)

        pc = sum(1 for a in self._env.rail_env.agents
                 if is_done(a)) / self._env.rail_env.get_num_agents()
        malf = sum(a.malfunction_data['nr_malfunctions']
                   for a in self._env.rail_env.agents)
        print("EPISODE PC:", pc, "NR MALFUNCTIONS:", malf)

        d = {
            a.handle: a.status in (RailAgentStatus.DONE,
                                   RailAgentStatus.DONE_REMOVED)
            for a in self._env.rail_env.agents
        }
        d['__all__'] = True

        r = {
            a.handle: self._env._agent_scores[a.handle] / norm_factor
            for a in self._env.rail_env.agents
        }
        o = self.last_obs
        return StepOutput(
            obs=o,
            reward=r,
            done=d,
            info={
                a.handle: {
                    'max_episode_steps': self._env.rail_env._max_episode_steps,
                    'num_agents': self._env.rail_env.get_num_agents(),
                    'agent_done': d[a.handle],
                    'agent_score': self._env._agent_scores[a.handle],
                    'agent_step': self._env._agent_steps[a.handle],
                }
                for a in self._env.rail_env.agents
            })
Example no. 3
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
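        # Pass the joint action through and replace every agent's reward with
        # the mean reward over all agents (shared reward).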
        obs, reward, done, info = self.env.step(action_dict)

        o, r, d, i = {}, {}, {}, {}
        for agent_id, agent_obs in obs.items():
            o[agent_id] = obs[agent_id]
            d[agent_id] = done[agent_id]
            i[agent_id] = info[agent_id]
            r[agent_id] = np.mean(list(reward.values()))

        d['__all__'] = done['__all__'] or all(d.values())
        return StepOutput(o, r, d, i)
Example no. 4
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
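        # Collapse the multi-agent step into single-agent form: rewards are
        # summed, `done` is the global '__all__' flag, and the running score
        # and step counters are updated.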
        obs, rewards, dones, infos = self.rail_env.step(action_dict)
        done = dones["__all__"]
        reward = np.sum(list(rewards.values()))
        observation = obs

        self._agent_score += reward
        self._agent_steps += 1

        return StepOutput(obs=observation, reward=reward, done=done, info={
            'max_episode_steps': self.rail_env._max_episode_steps,
            'num_agents': self.rail_env.get_num_agents(),
            'agent_done': done,
            'agent_score': self._agent_score,
            'agent_step': self._agent_steps,
        })
Example no. 5
    def step(self, action_dict: Dict[int, float]) -> StepOutput:
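        # Treat the float actions as per-agent priorities, then step the env
        # with shortest-path actions and normalize the resulting rewards.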
        rail_env: RailEnv = self.unwrapped.rail_env
        sorted_actions = dict(
            sorted(action_dict.items(), key=lambda item: item[1],
                   reverse=True))
        self.env.sorted_handles = list(sorted_actions.keys())

        rail_actions = self.sp_agent.compute_actions(
            {h: None for h in action_dict.keys()}, env=rail_env)
        o, r, d, i = self.env.step(rail_actions)
        r = {h: rew / self.norm_factor for h, rew in r.items()}

        return StepOutput(o, r, d, i)
Example no. 6
    def step(self, action_list):
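        # Convert the action list into a handle-indexed dict; unless global
        # observations are used, flatten the per-agent results into a single
        # obs list, a summed reward, and one done flag.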

        action_dict = {i: action for i, action in enumerate(action_list)}

        step_r = self._env.step(action_dict)

        if not self._global_obs:
            return StepOutput(obs=list(step_r.obs.values()),
                              reward=np.sum(list(step_r.reward.values())),
                              done=all(step_r.done.values()),
                              info=step_r.info[0])
        else:
            return step_r
Example no. 7
    def step(self, action: Dict[int, RailEnvActions]) -> StepOutput:
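        # Round-robin control: every still-active agent other than the current
        # handle is forced to STOP_MOVING; only the current handle executes the
        # supplied action (shifted by +1 when DO_NOTHING is disallowed).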
        action_dict = {
            h: RailEnvActions.STOP_MOVING.value
            for h in range(self._num_agents) if h not in self._agents_done
        }
        if self._allow_noop:
            action_dict[self._current_handle] = action[self._current_handle]
        else:
            action_dict[self._current_handle] = (
                action[self._current_handle] + 1)

        obs, rewards, dones, infos = self.rail_env.step(action_dict)

        new_dones = []
        for agent, done in dones.items():
            if agent not in self._agents_done and agent != '__all__' and done:
                new_dones.append(agent)

        if not dones['__all__']:
            self._current_handle = (self._current_handle +
                                    1) % self._num_agents
            while self._current_handle in (self._agents_done + new_dones):
                self._current_handle = (self._current_handle +
                                        1) % self._num_agents

        d, r, o = dict(), dict(), dict()
        for agent in new_dones + [self._current_handle]:
            o[agent] = obs[agent]
            r[agent] = rewards[agent]
            d[agent] = dones[agent]
            self._agent_scores[agent] += rewards[agent]
            if not d[agent]:
                self._agent_steps[agent] += 1

        d['__all__'] = dones['__all__']
        self._agents_done.extend(new_dones)

        i = {
            h: self.get_agent_info(h, d)
            for h in d if h != '__all__'
        }

        return StepOutput(obs=o, reward=r, done=d, info=i)
Example no. 8
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
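        # Deadlock wrapper: agents that become deadlocked this step receive the
        # deadlock reward and are marked done; agents that were already
        # deadlocked are dropped from the output.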
        obs, reward, done, info = self.env.step(action_dict)

        if self._deadlock_reward != 0:
            new_deadlocked_agents = self.check_deadlock()
        else:
            new_deadlocked_agents = []

        o, r, d, i = {}, {}, {}, {}
        for agent_id, agent_obs in obs.items():
            if agent_id not in self._deadlocked_agents or agent_id in new_deadlocked_agents:
                o[agent_id] = obs[agent_id]
                d[agent_id] = done[agent_id]
                i[agent_id] = info[agent_id]
                r[agent_id] = reward[agent_id]
                if agent_id in new_deadlocked_agents:
                    # agent is newly deadlocked -> give deadlock reward and mark it done
                    r[agent_id] += self._deadlock_reward
                    d[agent_id] = True
        d['__all__'] = done['__all__'] or all(d.values())

        return StepOutput(o, r, d, i)
Example no. 9
    def step(self, action_dict: Dict[int, float]) -> StepOutput:
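        # Priority scheduling in blocks: sort handles by the float action, then
        # advance the env with shortest-path actions until the elapsed step
        # count hits the next multiple of 10 (or the episode ends), accumulating
        # normalized rewards and done flags.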
        rail_env: RailEnv = self.unwrapped.rail_env

        sorted_actions = dict(
            sorted(action_dict.items(), key=lambda item: item[1],
                   reverse=True))
        self.env.sorted_handles = list(sorted_actions.keys())

        cum_done = defaultdict(lambda: False)
        cum_rew = defaultdict(lambda: 0)
        rail_actions = self.sp_agent.compute_actions(
            {h: None for h in action_dict.keys()}, env=rail_env)
        o, r, done, i = self.env.step(rail_actions)
        r = {h: rew / self.norm_factor for h, rew in r.items()}
        for h, rew in r.items():
            cum_rew[h] += rew

        for h, curr_d in done.items():
            cum_done[h] = curr_d or cum_done[h]
        while rail_env._elapsed_steps % 10 != 0 and not cum_done.get(
                '__all__', False):

            rail_actions = self.sp_agent.compute_actions(
                {h: None for h in action_dict.keys()}, env=rail_env)
            o, r, done, i = self.env.step(rail_actions)
            r = {h: rew / self.norm_factor for h, rew in r.items()}

            for h, curr_d in done.items():
                cum_done[h] = curr_d or cum_done[h]

            for h, rew in r.items():
                cum_rew[h] += rew

        return StepOutput(o, cum_rew, cum_done, i)
Example no. 10
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
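        # Keep stepping the env (with an empty action dict after the first
        # step) until at least one agent is done or sits on a decision cell;
        # rewards collected on skipped steps can be folded back in with
        # discounting.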
        o, r, d, i = {}, {}, {}, {}
        while len(o) == 0:
            obs, reward, done, info = self.env.step(action_dict)
            for agent_id, agent_obs in obs.items():
                if done[agent_id] or self._on_decision_cell(
                        self.unwrapped.rail_env.agents[agent_id]):
                    o[agent_id] = agent_obs
                    r[agent_id] = reward[agent_id]
                    d[agent_id] = done[agent_id]
                    i[agent_id] = info[agent_id]
                    if self._accumulate_skipped_rewards:
                        discounted_skipped_reward = r[agent_id]
                        for skipped_reward in reversed(
                                self._skipped_rewards[agent_id]):
                            discounted_skipped_reward = (
                                self._discounting * discounted_skipped_reward +
                                skipped_reward)
                        r[agent_id] = discounted_skipped_reward
                        self._skipped_rewards[agent_id] = []
                elif self._accumulate_skipped_rewards:
                    self._skipped_rewards[agent_id].append(reward[agent_id])
            d['__all__'] = done['__all__']
            action_dict = {}
        return StepOutput(o, r, d, i)
Example no. 11
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
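        # Sparse terminal reward: 0 on every step; when an agent is done it
        # gets the finished reward if it reached its target, otherwise the
        # not_finished reward.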
        rail_env: RailEnv = self.unwrapped.rail_env

        obs, reward, done, info = self.env.step(action_dict)

        o, r, d, i = {}, {}, {}, {}
        for agent_id, agent_obs in obs.items():
            o[agent_id] = obs[agent_id]
            d[agent_id] = done[agent_id]
            i[agent_id] = info[agent_id]
            if done[agent_id]:
                if rail_env.agents[agent_id].status in [
                        RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED
                ]:
                    # agent is done and reached its target -> finished reward
                    r[agent_id] = self._finished_reward
                else:
                    # agent is done without reaching its target -> not_finished reward
                    r[agent_id] = self._not_finished_reward
            else:
                r[agent_id] = 0
        d['__all__'] = done['__all__'] or all(d.values())

        return StepOutput(o, r, d, i)
Example no. 12
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
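        # Thin wrapper: step the wrapped env and only transform the
        # observations before returning.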
        obs, reward, done, info = self.env.step(action_dict)
        return StepOutput(self._transform_obs(obs), reward, done, info)
Example no. 13
    def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
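        # Step the underlying RailEnv, possibly several times, until at least
        # one agent produces an observation or all agents are done, then build
        # the per-agent info dict.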
        d, r, o = None, None, None
        obs_or_done = False

        action_dict = {
            k: v
            for k, v in action_dict.items() if k not in self._agents_done
        }

        while not obs_or_done:
            # Step the env until at least one agent has an observation or all
            # agents are done.
            # The observation is `None` if an agent is done or malfunctioning.

            obs, rewards, dones, infos = self.rail_env.step(action_dict)

            d, r, o = dict(), dict(), dict()
            for agent in self.agent_keys + ["__all__"]:
                if agent != '__all__':
                    if dones.get(agent, False):
                        if agent not in self._agents_done:
                            self._agents_done.append(agent)
                    if self.agent_done_independent and agent not in self._agents_done:
                        o[agent] = obs.get(agent, self.prev_obs[agent])
                        r[agent] = rewards.get(
                            agent, 0 if agent in self._agents_done else -1)
                        self._agent_scores[agent] += rewards.get(agent, 0)
                        self._agent_steps[agent] += 1

                    elif not self.agent_done_independent:
                        o[agent] = obs.get(agent, self.prev_obs[agent])
                        r[agent] = rewards.get(
                            agent, 0 if agent in self._agents_done else -1)
                        self._agent_scores[agent] += rewards.get(agent, 0)
                        self._agent_steps[agent] += 1

                if self.agent_done_independent:
                    d[agent] = dones[agent]
                else:
                    d[agent] = dones["__all__"]

            # Reset the action dict in case we perform multiple env steps.
            action_dict = {}
            # Stop once at least one agent has an observation or all agents
            # are done.
            obs_or_done = len(o) > 0 or d['__all__']

        assert all([x is not None for x in (d, r, o)])

        self.prev_obs = o

        return StepOutput(
            obs=o,
            reward=r,
            done=d,
            info={
                agent: {
                    'max_episode_steps': self.rail_env._max_episode_steps,
                    'num_agents': self.rail_env.get_num_agents(),
                    'agent_done': (d.get(agent, False) and
                                   agent not in self.rail_env.active_agents),
                    'agent_score': self._agent_scores.get(agent, 0),
                    'agent_step': self._agent_steps.get(agent, 0),
                }
                for agent in o.keys()
            })