Example #1
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self.dones = {agent: False for agent in self.possible_agents}
        self._agents = self._possible_agents[:]

        self._prev_timestep: rl_environment.TimeStep = None
        self._current_player_id = 0

        opnspl_tmstep = self._environment.reset()
        agent = self.current_agent
        done = self.dones[agent]

        observe = self._to_observation(opnspl_tmstep)
        observation = self._convert_observation(agent, observe, done)

        self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)

        reward = convert_np_type(
            self.reward_spec()[agent].dtype,
            (opnspl_tmstep.rewards[self.current_player_id]
             if opnspl_tmstep.rewards else 0),
        )

        return parameterized_restart(reward, self._discount, observation)
Example #2
    def reset(self) -> Tuple[dm_env.TimeStep, Dict]:
        """Resets the episode."""
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST
        discount_spec = self.discount_spec()
        observe = self._environment.reset()

        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self.possible_agents
        }

        if isinstance(observe, tuple):
            observe, env_extras = observe
        else:
            env_extras = {}

        observations = self._convert_observations(
            observe, {agent: False for agent in self.possible_agents}
        )
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        return parameterized_restart(rewards, self._discounts,
                                     observations), env_extras
Example #3
    def reset(self) -> Tuple[dm_env.TimeStep, np.array]:
        """Resets the episode."""
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST
        observe, state_infos = self._environment.reset()
        observations = self._convert_observations(
            observe, {agent: False for agent in self.possible_agents}
        )
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self.possible_agents
        }
        return (
            parameterized_restart(rewards, self._discounts, observations),
            state_infos,
        )
Example #4
    def reset(self) -> dm_env.TimeStep:
        observation = self._generate_fake_observation()
        discount = convert_np_type("float32", 1)  # Not used in pettingzoo
        reward = convert_np_type("float32", 0)
        self._step = 1
        return parameterized_restart(reward=reward,
                                     discount=discount,
                                     observation=observation)
Example #5
    def reset(self) -> Tuple[dm_env.TimeStep, Dict[str, np.ndarray]]:
        """Resets the env and returns observations from ready agents.
        Returns:
            obs (dict): New observations for each ready agent.
        """
        self._env_done = False
        self._reset_next_step = False
        self._step_type = dm_env.StepType.FIRST

        # reset internal SC2 env
        obs_list, state = self._environment.reset()

        # Convert observations
        observe: Dict[str, Dict[str, np.ndarray]] = {}

        for i, obs in enumerate(obs_list):
            observe[f"agent_{i}"] = {
                "observation": obs,
                "action_mask": np.array(
                    self._environment.get_avail_agent_actions(i),
                    dtype=np.float32,
                ),
            }

        observations = self._convert_observations(
            observe, {agent: False for agent in self._possible_agents}
        )

        self._agents = list(observe.keys())

        # create discount spec
        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self._possible_agents
        }

        # create rewards spec
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self._possible_agents
        }

        # dm_env timestep
        timestep = parameterized_restart(rewards, self._discounts,
                                         observations)

        return timestep, {"s_t": state}
Example #6
    def step(self, actions: Dict[str, np.ndarray]) -> dm_env.TimeStep:
        """Steps the environment."""
        self._pre_step()

        if self._reset_next_step:
            return self.reset()

        self._agents = [
            agent for agent in self.agents
            if not self._environment.dones[get_agent_handle(agent)]
        ]

        observations, rewards, dones, infos = self._environment.step(actions)

        rewards_spec = self.reward_spec()
        #  Handle empty rewards
        if not rewards:
            rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, 0)
                for agent in self.possible_agents
            }
        else:
            rewards = {
                get_agent_id(agent):
                convert_np_type(rewards_spec[get_agent_id(agent)].dtype,
                                reward)
                for agent, reward in rewards.items()
            }

        if observations:
            observations = self._create_observations(observations, infos,
                                                     dones)

        if self.env_done():
            self._step_type = dm_env.StepType.LAST
            self._reset_next_step = True
        else:
            self._step_type = dm_env.StepType.MID

        return dm_env.TimeStep(
            observation=observations,
            reward=rewards,
            discount=self._discounts,
            step_type=self._step_type,
        )
Example #7
    def reset(self) -> dm_env.TimeStep:
        observations = {}
        for agent in self.agents:
            observation = self._generate_fake_observation()
            observations[agent] = observation

        rewards = {
            agent: convert_np_type("float32", 0)
            for agent in self.agents
        }
        discounts = {
            agent: convert_np_type("float32", 1)
            for agent in self.agents
        }

        self._step = 1
        return parameterized_restart(rewards, discounts,
                                     observations)  # type: ignore
Example #8
    def assert_env_reset(
        wrapped_env: dm_env.Environment,
        dm_env_timestep: dm_env.TimeStep,
        env_spec: EnvSpec,
    ) -> None:
        if env_spec.env_type == EnvType.Parallel:
            rewards_spec = wrapped_env.reward_spec()
            expected_rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, 0)
                for agent in wrapped_env.agents
            }

            discount_spec = wrapped_env.discount_spec()
            expected_discounts = {
                agent: convert_np_type(discount_spec[agent].dtype, 1)
                for agent in wrapped_env.agents
            }

            assert Helpers.compare_dicts(
                dm_env_timestep.reward,
                expected_rewards,
            ), "Failed to reset reward."
            assert Helpers.compare_dicts(
                dm_env_timestep.discount,
                expected_discounts,
            ), "Failed to reset discount."

        elif env_spec.env_type == EnvType.Sequential:
            for agent in wrapped_env.agents:
                rewards_spec = wrapped_env.reward_spec()
                expected_reward = convert_np_type(rewards_spec[agent].dtype, 0)

                discount_spec = wrapped_env.discount_spec()
                expected_discount = convert_np_type(discount_spec[agent].dtype,
                                                    1)

                assert dm_env_timestep.reward == expected_reward and type(
                    dm_env_timestep.reward) == type(
                        expected_reward), "Failed to reset reward."
                assert dm_env_timestep.discount == expected_discount and type(
                    dm_env_timestep.discount) == type(
                        expected_discount), "Failed to reset discount."
Example #9
    def step(
        self, actions: Dict[str, np.ndarray]
    ) -> Union[dm_env.TimeStep, Tuple[dm_env.TimeStep, Dict[str, np.ndarray]]]:
        """Steps the environment."""

        if self._reset_next_step:
            return self.reset()

        observations, rewards, dones, state = self._environment.step(actions)

        rewards_spec = self.reward_spec()
        #  Handle empty rewards
        if not rewards:
            rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, 0)
                for agent in self.agent_ids
            }
        else:
            rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, reward)
                for agent, reward in rewards.items()
            }

        if observations:
            observations = self._convert_observations(observations, dones)

        if self._environment.env_done:
            self._step_type = dm_env.StepType.LAST
            self._reset_next_step = True
        else:
            self._step_type = dm_env.StepType.MID

        timestep = dm_env.TimeStep(
            observation=observations,
            reward=rewards,
            discount=self._discounts,
            step_type=self._step_type,
        )
        if self.return_state_info:
            return timestep, {"s_t": state}
        else:
            return timestep
Example #10
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self._environment.reset()
        self._step_types = {
            agent: dm_env.StepType.FIRST
            for agent in self.possible_agents
        }
        self._first_step_performed = {
            agent: False
            for agent in self.possible_agents
        }

        observe, _, done, _ = self._environment.last()
        agent = self.current_agent
        observation = self._convert_observation(agent, observe, done)

        self._discount = convert_np_type(self.discount_spec()[agent].dtype, 1)

        reward = convert_np_type(self.reward_spec()[agent].dtype, 0)

        return parameterized_restart(reward, self._discount, observation)
Example #11
    def reset(self) -> dm_env.TimeStep:
        """Resets the episode."""
        self._reset_next_step = False
        self._agents = self.possible_agents[:]
        self._discounts = {
            agent: np.dtype("float32").type(1.0)
            for agent in self.agents
        }
        observe, info = self._environment.reset()
        observations = self._create_observations(observe, info,
                                                 self._environment.dones)
        rewards_spec = self.reward_spec()
        rewards = {
            agent: convert_np_type(rewards_spec[agent].dtype, 0)
            for agent in self.possible_agents
        }

        discount_spec = self.discount_spec()
        self._discounts = {
            agent: convert_np_type(discount_spec[agent].dtype, 1)
            for agent in self.possible_agents
        }
        return parameterized_restart(rewards, self._discounts, observations)
Example #12
    def step(  # type: ignore[override]
            self, action: Union[int, float]) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        _, _, done, _ = self._environment.last()

        # If current agent is done
        if done:
            self._environment.step(None)
        else:
            self._environment.step(action)

        agent = self.current_agent
        # Reset if all agents are done
        if self.env_done():
            self._reset_next_step = True
            reward = convert_np_type(self.reward_spec()[agent].dtype, 0)
            observation = self._convert_observation(
                agent, self._environment.observe(agent), done)
        else:
            #  observation for next agent
            observe, reward, done, info = self._environment.last()

            # Convert rewards to match spec
            reward = convert_np_type(self.reward_spec()[agent].dtype, reward)
            observation = self._convert_observation(agent, observe, done)

        step_type = dm_env.StepType.LAST if done else dm_env.StepType.MID

        return dm_env.TimeStep(
            observation=observation,
            reward=reward,
            discount=self._discount,
            step_type=step_type,
        )
Example #13
    def step(
        self, actions: Dict[str, np.ndarray]
    ) -> Tuple[dm_env.TimeStep, Dict[str, np.ndarray]]:
        """Returns observations from ready agents.
        The returns are dicts mapping from agent_id strings to values. The
        number of agents in the env can vary over time.
        Returns
        -------
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the
                episode is just started, the value will be None.
            dones (dict): Done values for each ready agent. The special key
                "__all__" (required) is used to indicate env termination.
            infos (dict): Optional info values for each agent id.
        """
        if self._reset_next_step:
            return self.reset()

        actions_feed = list(actions.values())
        reward, terminated, info = self._environment.step(actions_feed)
        obs_list = self._environment.get_obs()
        state = self._environment.get_state()
        self._env_done = terminated

        observe = {}
        rewards = {}
        dones = {}
        for i, obs in enumerate(obs_list):
            agent = f"agent_{i}"
            observe[agent] = {
                "observation": obs,
                "action_mask": np.array(
                    self._environment.get_avail_agent_actions(i),
                    dtype=np.float32,
                ),
            }
            rewards[agent] = reward
            dones[agent] = terminated

        observations = self._convert_observations(observe, dones)
        self._agents = list(observe.keys())
        rewards_spec = self.reward_spec()

        #  Handle empty rewards
        if not rewards:
            rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, 0)
                for agent in self._possible_agents
            }
        else:
            rewards = {
                agent: convert_np_type(rewards_spec[agent].dtype, reward)
                for agent, reward in rewards.items()
            }

        if self.env_done():
            self._step_type = dm_env.StepType.LAST
            self._reset_next_step = True
        else:
            self._step_type = dm_env.StepType.MID

        timestep = dm_env.TimeStep(
            observation=observations,
            reward=rewards,
            discount=self._discounts,
            step_type=self._step_type,
        )

        self.reward = rewards

        return timestep, {"s_t": state}
Example #14
    def step(self, action_list: Tuple[np.ndarray]) -> dm_env.TimeStep:
        """Steps the environment."""
        if self._reset_next_step:
            return self.reset()

        # only action lists are accepted
        if not isinstance(action_list, (list, tuple)):
            action_list = [action_list]

        agent = self.current_agent

        # done agents should be removed and active agents should take steps
        if self.dones[agent]:
            self.agents.remove(agent)
            del self.dones[agent]

            # move to next agent, which should also be done
            self._current_player_id = (self._current_player_id +
                                       1) % self.num_agents
            agent = self.current_agent

            opnspl_timestep = self._prev_timestep

            step_type = dm_env.StepType.LAST

        else:
            opnspl_timestep = self._environment.step(action_list)

            # after a step, a next agent becomes the current
            agent = self.current_agent

            if (self._environment.get_state.current_player() ==
                    pyspiel.PlayerId.TERMINAL):
                # all agents get done at a terminal step in turn-based games
                # current agent/player is updated using _current_player_id
                self.dones = {agnt: True for agnt in self._possible_agents}
                self._current_player_id = (self._current_player_id +
                                           1) % self.num_agents

                agent = self.current_agent
            else:
                self.dones[agent] = False

            step_type = (dm_env.StepType.LAST
                         if self.dones[agent] else dm_env.StepType.MID)
            self._prev_timestep = opnspl_timestep

        observe = self._to_observation(opnspl_timestep)

        # Reset if all agents are done
        if self.env_done():
            self._reset_next_step = True
            reward = convert_np_type(
                self.reward_spec()[agent].dtype,
                0,
            )
            observation = self._convert_observation(agent, observe, True)
        else:
            #  observation for next agent
            reward = convert_np_type(
                self.reward_spec()[agent].dtype,
                opnspl_timestep.rewards[self.current_player_id],
            )
            observation = self._convert_observation(agent, observe,
                                                    self.dones[agent])

        return dm_env.TimeStep(
            observation=observation,
            reward=reward,
            discount=self._discount,
            step_type=step_type,
        )
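
Note on the shared helpers: every example above relies on convert_np_type and parameterized_restart from Mava's wrapper utilities. The following is a minimal sketch of plausible implementations, inferred from the call sites above (e.g. the np.dtype("float32").type(1.0) pattern in Example #11, and the fact that parameterized_restart is used where dm_env.restart would discard reward and discount); it is not the library source.

import dm_env
import numpy as np


def convert_np_type(dtype, value):
    """Cast a Python scalar to the given NumPy dtype.

    Sketch only: e.g. convert_np_type("float32", 0) -> np.float32(0.0),
    matching the np.dtype("float32").type(1.0) pattern seen in Example #11.
    """
    return np.dtype(dtype).type(value)


def parameterized_restart(reward, discount, observation) -> dm_env.TimeStep:
    """Build a FIRST TimeStep with explicit reward and discount.

    Sketch only: dm_env.restart(observation) sets reward and discount to
    None, whereas the wrappers above need concrete (often per-agent) values,
    so they pass them explicitly.
    """
    return dm_env.TimeStep(dm_env.StepType.FIRST, reward, discount, observation)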