Example 1
class MatrixGameEnv(Env):
    def __init__(self,
                 matrices,
                 reward_perturbation=0,
                 rand: th.Generator = th.default_generator):
        super().__init__()

        matrices = th.as_tensor(matrices)
        reward_perturbation = th.as_tensor(reward_perturbation)
        # Check shape of the payoff matrices
        n_agents = matrices.dim() - 1
        if matrices.shape[0] != n_agents:
            raise ValueError(
                "Number of matrices does not match dimensions of each matrix")

        # Check shape of reward perturbation
        if reward_perturbation.shape != () and reward_perturbation.shape != (
                n_agents, ):
            raise ValueError(
                "Reward perturbation must be either same or specified for each agent"
            )
        # Check values of reward perturbation
        if (reward_perturbation < 0).any():
            raise ValueError(
                "Values of reward perturbation must be non-negative")

        ## State space
        self.observation_space = Discrete(1)
        ## Action space
        self.action_space = MultiDiscrete(matrices.shape[1:])

        ## Matrices of the matrix game
        self.matrices = matrices
        ## Standard deviation of reward perturbation
        self.reward_perturbation = reward_perturbation
        ## Random number generator
        self.rand = rand

    def reset(self):
        return th.tensor(0)

    def step(self, actions):
        # Check validity of joint actions
        if not self.action_space.contains(np.array(actions)):
            raise ValueError("Joint actions {} is invalid".format(actions))

        # Rewards for each agent
        rewards = self.matrices[(slice(None), *actions)].clone()
        # Add random perturbation to rewards
        reward_perturbation = self.reward_perturbation
        if (reward_perturbation != 0).any():
            # Draw independent Gaussian noise for each agent's reward
            rewards += th.normal(0.,
                                 reward_perturbation.expand(rewards.shape),
                                 generator=self.rand)

        # Step result
        return th.tensor(0), rewards, True, {}
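
A minimal usage sketch, assuming `th` is PyTorch and `Env` / `Discrete` / `MultiDiscrete` come from a gym-style API; the prisoner's dilemma payoffs and the noise level are illustrative:

# Payoff tensor indexed as (agent, action of agent 0, action of agent 1);
# float entries so the Gaussian perturbation can be added in place.
prisoners_dilemma = [
    [[-1., -3.],   # agent 0's payoffs
     [ 0., -2.]],
    [[-1.,  0.],   # agent 1's payoffs
     [-3., -2.]],
]
env = MatrixGameEnv(prisoners_dilemma, reward_perturbation=0.1)
obs = env.reset()                              # the single dummy state, tensor(0)
obs, rewards, done, info = env.step((0, 1))    # agent 0 cooperates, agent 1 defects
# rewards is roughly tensor([-3., 0.]) plus zero-mean noise with std 0.1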
Example 2
class AlgorithmicEnv(Env):

    metadata = {'render.modes': ['human', 'ansi']}
    # Only 'promote' the length of generated input strings if the worst of the 
    # last n episodes was no more than this far from the maximum reward
    MIN_REWARD_SHORTFALL_FOR_PROMOTION = -1.0

    def __init__(self, base=10, chars=False, starting_min_length=2):
        """
        base: Number of distinct characters. 
        chars: If True, use uppercase alphabet. Otherwise, digits. Only affects
               rendering.
        starting_min_length: Minimum input string length. Ramps up as episodes 
                             are consistently solved.
        """
        self.base = base
        # Keep track of this many past episodes
        self.last = 10
        # Cumulative reward earned this episode
        self.episode_total_reward = None
        # Running tally of reward shortfalls. e.g. if there were 10 points to earn and
        # we got 8, we'd append -2
        AlgorithmicEnv.reward_shortfalls = []
        if chars:
            self.charmap = [chr(ord('A')+i) for i in range(base)]
        else:
            self.charmap = [str(i) for i in range(base)]
        self.charmap.append(' ')
        # TODO: Not clear why this is a class variable rather than instance. 
        # Could lead to some spooky action at a distance if someone is working
        # with multiple algorithmic envs at once. Also makes testing tricky.
        AlgorithmicEnv.min_length = starting_min_length
        # Three sub-actions:
        #       1. Move read head left or right (or up/down)
        #       2. Write or not
        #       3. Which character to write. (Ignored if should_write=0)
        self.action_space = MultiDiscrete(
            [len(self.MOVEMENTS), 2, self.base]
        )
        # Can see just what is on the input tape (one of n characters, or nothing)
        self.observation_space = Discrete(self.base + 1)
        self.seed()
        self.reset()

    @classmethod
    def _movement_idx(kls, movement_name):
        return kls.MOVEMENTS.index(movement_name)

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _get_obs(self, pos=None):
        """Return an observation corresponding to the given read head position
        (or the current read head position, if none is given)."""
        raise NotImplementedError

    def _get_str_obs(self, pos=None):
        ret = self._get_obs(pos)
        return self.charmap[ret]

    def _get_str_target(self, pos):
        """Return the ith character of the target string (or " " if index
        out of bounds)."""
        if pos < 0 or len(self.target) <= pos:
            return " "
        else:
            return self.charmap[self.target[pos]]

    def render_observation(self):
        """Return a string representation of the input tape/grid."""
        raise NotImplementedError

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        inp = "Total length of input instance: %d, step: %d\n" % (self.input_width, self.time)
        outfile.write(inp)
        x, y, action = self.read_head_position, self.write_head_position, self.last_action
        if action is not None:
            inp_act, out_act, pred = action
        outfile.write("=" * (len(inp) - 1) + "\n")
        y_str =      "Output Tape         : "
        target_str = "Targets             : "
        if action is not None:
            pred_str = self.charmap[pred]
        x_str = self.render_observation()
        for i in range(-2, len(self.target) + 2):
            target_str += self._get_str_target(i)
            if i < y - 1:
                y_str += self._get_str_target(i)
            elif i == (y - 1):
                if action is not None and out_act == 1:
                    color = 'green' if pred == self.target[i] else 'red'
                    y_str += colorize(pred_str, color, highlight=True)
                else:
                    y_str += self._get_str_target(i)
        outfile.write(x_str)
        outfile.write(y_str + "\n")
        outfile.write(target_str + "\n\n")

        if action is not None:
            outfile.write("Current reward      :   %.3f\n" % self.last_reward)
            outfile.write("Cumulative reward   :   %.3f\n" % self.episode_total_reward)
            move = self.MOVEMENTS[inp_act]
            outfile.write("Action              :   Tuple(move over input: %s,\n" % move)
            out_act = out_act == 1
            outfile.write("                              write to the output tape: %s,\n" % out_act)
            outfile.write("                              prediction: %s)\n" % pred_str)
        else:
            outfile.write("\n" * 5)

        if mode != 'human':
            with closing(outfile):
                return outfile.getvalue()

    @property
    def input_width(self):
        return len(self.input_data)

    def step(self, action):
        assert self.action_space.contains(action)
        self.last_action = action
        inp_act, out_act, pred = action
        done = False
        reward = 0.0
        self.time += 1
        assert 0 <= self.write_head_position
        if out_act == 1:
            try:
                correct = pred == self.target[self.write_head_position]
            except IndexError:
                logger.warn("It looks like you're calling step() even though this "+
                    "environment has already returned done=True. You should always call "+
                    "reset() once you receive done=True. Any further steps are undefined "+
                    "behaviour.")
                correct = False
            if correct:
                reward = 1.0
            else:
                # Bail as soon as a wrong character is written to the tape
                reward = -0.5
                done = True
            self.write_head_position += 1
            if self.write_head_position >= len(self.target):
                done = True
        self._move(inp_act)
        if self.time > self.time_limit:
            reward = -1.0
            done = True
        obs = self._get_obs()
        self.last_reward = reward
        self.episode_total_reward += reward
        return (obs, reward, done, {})

    @property
    def time_limit(self):
        """If an agent takes more than this many timesteps, end the episode
        immediately and return a negative reward."""
        # (Seemingly arbitrary)
        return self.input_width + len(self.target) + 4

    def _check_levelup(self):
        """Called between episodes. Update our running record of episode rewards 
        and, if appropriate, 'level up' minimum input length."""
        if self.episode_total_reward is None:
            # This is before the first episode/call to reset(). Nothing to do
            return
        AlgorithmicEnv.reward_shortfalls.append(self.episode_total_reward - len(self.target))
        AlgorithmicEnv.reward_shortfalls = AlgorithmicEnv.reward_shortfalls[-self.last:]
        if len(AlgorithmicEnv.reward_shortfalls) == self.last and \
          min(AlgorithmicEnv.reward_shortfalls) >= self.MIN_REWARD_SHORTFALL_FOR_PROMOTION and \
          AlgorithmicEnv.min_length < 30:
            AlgorithmicEnv.min_length += 1
            AlgorithmicEnv.reward_shortfalls = []
        

    def reset(self):
        self._check_levelup()
        self.last_action = None
        self.last_reward = 0
        self.read_head_position = self.READ_HEAD_START
        self.write_head_position = 0
        self.episode_total_reward = 0.0
        self.time = 0
        length = self.np_random.randint(3) + AlgorithmicEnv.min_length
        self.input_data = self.generate_input_data(length)
        self.target = self.target_from_input_data(self.input_data)
        return self._get_obs()

    def generate_input_data(self, size):
        raise NotImplementedError

    def target_from_input_data(self, input_data):
        raise NotImplementedError("Subclasses must implement")

    def _move(self, movement):
        raise NotImplementedError
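
AlgorithmicEnv is abstract: a concrete task has to supply MOVEMENTS, READ_HEAD_START and the tape hooks left as NotImplementedError above. Below is a minimal sketch of a copy task over a one-dimensional input tape; the class name CopyTapeEnv and its internals are illustrative, not part of the original source.

class CopyTapeEnv(AlgorithmicEnv):
    # Movements of the read head along a 1-D input tape
    MOVEMENTS = ['left', 'right']
    READ_HEAD_START = 0

    def generate_input_data(self, size):
        # Random string of `size` symbols drawn from the alphabet
        return [self.np_random.randint(self.base) for _ in range(size)]

    def target_from_input_data(self, input_data):
        # Copy task: the target string equals the input string
        return list(input_data)

    def _get_obs(self, pos=None):
        pos = self.read_head_position if pos is None else pos
        # Off-tape positions observe the blank symbol (index self.base)
        if pos < 0 or pos >= len(self.input_data):
            return self.base
        return self.input_data[pos]

    def _move(self, movement):
        self.read_head_position += -1 if self.MOVEMENTS[movement] == 'left' else 1

    def render_observation(self):
        x_str = "Observation Tape    : "
        for i in range(-2, len(self.input_data) + 2):
            x_str += self._get_str_obs(i)
        return x_str + "\n"

# Example rollout with a random action:
env = CopyTapeEnv(base=5)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())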
Example 3
class FarmEnv(gym.Env):
    """
    Description:
        A wind farm whose turbines are controlled through their yaw angles.
    Observation:
        Type: Box(2n + 4), where n is the number of wind turbines
        Num          Observation                                  Min    Max
        0 .. n-1     current yaw angle / max_yaw                  -1     1
        n .. 2n-1    rescaled turbulence intensity                 0     1
        2n           power ratio vs. nominal (non-yawed) power    -1     1
        2n+1         sin(wind angle)                              -1     1
        2n+2         cos(wind angle)                              -1     1
        2n+3         wind speed / 25.5 m/s                         0     1

    Actions:
        If continuous_action_space is True: Box(n) in [-1, 1], scaled by
        max_yaw degrees in step().
        Otherwise: MultiDiscrete(n) with 2 * max_yaw choices per turbine,
        offset by -max_yaw degrees in step().

    Reward:
        Power ratio (in %) of the yawed farm relative to the nominal,
        non-yawed farm power for the current wind conditions, for every
        step taken, including the termination step.

    Starting State:
        All yaw angles at 0; wind direction and speed are drawn uniformly
        at random within the configured bounds unless passed to reset().

    Episode Termination:
        Episode length is greater than 30 steps.
    """
    def __init__(self, config):
        self.farm = config['farm']
        self.numwt = config['num_wind_turbines']

        # initialize yaw boundaries
        self.allowed_yaw = config["max_yaw"]

        self.min_wind_speed = config["min_wind_speed"]
        self.max_wind_speed = config["max_wind_speed"]
        self.min_wind_angle = config["min_wind_angle"]
        self.max_wind_angle = config["max_wind_angle"]

        self.continuous_action_space = config["continuous_action_space"]

        self.best_explored_power = {}
        self.count_steps = 0
        self.initialized_yaw_angle = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)

        self.turbine_powers = np.full((self.numwt, ), 0, dtype=np.float64)
        self.turbulent_intensities = np.full((self.numwt, ),
                                             0,
                                             dtype=np.float64)
        self.thrust_coefs = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_u = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_v = np.full((self.numwt, ), 0, dtype=np.float64)

        self.cur_wind_speed = [8.]  # in m/s
        self.cur_wind_angle = [270.]  # in degrees

        self.initial_wind_angle = 0  # in degrees
        self.max_wind_direction_variation = 10  # max wind angle variation (degrees) during an episode

        self.cur_nominal_power = 0
        self.cur_power = 0
        self.cur_power_ratio = 0
        self.cur_nominal_ti_sum = 0

        if self.continuous_action_space:
            # action space is the yaw angle of each wind turbine, to be multiplied by allowed_yaw degrees
            action_low = np.full((self.numwt, ), -1., dtype=np.float32)
            action_high = np.full((self.numwt, ), 1., dtype=np.float32)
            self.action_space = Box(low=action_low,
                                    high=action_high,
                                    shape=(self.numwt, ))
        else:
            # discrete action space
            self.action_space = MultiDiscrete(
                np.full((self.numwt, ), 2 * self.allowed_yaw,
                        dtype=np.float32))

        print(f'action space : {self.action_space}')
        print(f'action space sample : {self.action_space.sample()}')

        # observation space TODO
        observation_high = np.concatenate(
            (
                #  np.array([self.max_wind_angle, self.max_wind_speed])),
                np.array([1.] * self.numwt
                         ),  # yaw max positions for all wind turbines
                #  np.array([1] * self.numwt),  # x axis wind speed for all wind turbines
                #  np.array([1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([1] * self.numwt
                         ),  # max turbulence intensity for all wind turbines
                np.array([1]),  # max power ratio
                #  np.array([1] * self.numwt),  # max thrust coef for all wind turbines
                #  np.array([1] * self.numwt),  # max power coef for all wind turbines
                np.array([1]),  # max sinus wind angle
                np.array([1]),  # max cosinus wind angle
                np.array(
                    [1])),  # max normalized wind speed (range 2 to 25.5 m.s-1)
            axis=0)
        observation_low = np.concatenate(
            (
                #  np.array([self.min_wind_angle, self.min_wind_speed])),
                np.array([-1] * self.numwt
                         ),  # yaw min positions for all wind turbines
                #  np.array([-1] * self.numwt),  # x axis wind speed for all wind turbines
                #  np.array([-1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([0.] * self.numwt
                         ),  # min turbulence intensity for all wind turbines
                np.array([-1]),  # min power ratio
                #  np.array([0] * self.numwt),  # min thrust coef for all wind turbines
                #  np.array([0] * self.numwt),  # min power coef for all wind turbines
                np.array([-1]),  # min sinus wind angle
                np.array([-1]),  # min cosinus wind angle
                np.array(
                    [0])),  # min normalized wind speed (range 2 to 25.5 m.s-1)
            axis=0)
        print(f'observation low : {observation_low}')
        print(f'observation high : {observation_high}')
        self.observation_space = Box(low=observation_low,
                                     high=observation_high,
                                     shape=(self.numwt * 2 + 4, ),
                                     dtype=np.float64)
        print(f'observation space : {self.observation_space}')

    def reset(self, wd=None, ws=None):
        self.count_steps = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)

        # Define wind conditions for this episode

        # check wind speed is within range (2 to 25.5 m/s)
        assert self.max_wind_speed < 25.5, "max wind speed too high"
        assert self.min_wind_speed > 2., "min wind speed too low"

        if wd is not None:
            self.cur_wind_angle = wd
        else:
            self.cur_wind_angle = np.random.randint(self.min_wind_angle,
                                                    self.max_wind_angle)

        if ws is not None:
            self.cur_wind_speed = ws
        else:
            self.cur_wind_speed = np.random.uniform(self.min_wind_speed,
                                                    self.max_wind_speed)

        self.initial_wind_angle = self.cur_wind_angle

        # Update the flow in the model
        print(f'wind angle {self.cur_wind_angle}')
        print(f'wind speed {self.cur_wind_speed}')
        self.farm.reinitialize_flow_field(wind_direction=[self.cur_wind_angle],
                                          wind_speed=[self.cur_wind_speed])
        self.farm.calculate_wake()
        self.cur_nominal_power = self.farm.get_farm_power()
        self.best_explored_power[self.cur_wind_angle] = self.cur_nominal_power
        self.cur_nominal_ti_sum = np.sum(self.farm.get_turbine_ti())

        state = self.get_observation()
        # print(f'initial state is {state}')
        # print(f'observation space is {self.observation_space}')

        return state  # return current state of the environment

    def get_observation(self):
        self.turbulent_intensities = (np.array(self.farm.get_turbine_ti()) -
                                      0.055) / 0.07  #rescaling
        self.cur_power = self.farm.get_farm_power()

        # self.thrust_coefs = self.farm.get_turbine_ct()

        #
        # turbine_powers = self.farm.get_turbine_power()
        # self.turbine_powers = turbine_powers / np.max(turbine_powers)
        #
        # wind_speed_points_at_wt = pd.DataFrame(self.farm.get_set_of_points(self.farm_layout[0], self.farm_layout[1], [80.] * self.numwt).head(self.numwt))
        # u_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.u)
        # v_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.v)
        # self.wt_speed_u = u_wind_speed_points_at_wt / self.cur_wind_speed
        # self.wt_speed_v = v_wind_speed_points_at_wt / self.cur_wind_speed

        current_yaws = self.cur_yaws / self.allowed_yaw
        self.cur_power_ratio = (
            self.cur_power - self.cur_nominal_power) / self.cur_nominal_power
        observation = np.concatenate(
            (
                #  self.wt_speed_u,
                # self.wt_speed_v,
                current_yaws,
                self.turbulent_intensities,
                # self.thrust_coefs,
                # self.turbine_powers,
                np.array([self.cur_power_ratio]),
                np.array([sind(self.cur_wind_angle)]),
                np.array([cosd(self.cur_wind_angle)]),
                np.array([self.cur_wind_speed / 25.5]),
            ),
            axis=0)

        return observation

    def step(self, action, no_variation=False):

        # check actions validity
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        #  Execute the actions
        if self.continuous_action_space:
            self.cur_yaws = action * self.allowed_yaw
        else:
            self.cur_yaws = action - self.allowed_yaw

        print(f'current yaws {self.cur_yaws}')

        if not no_variation:
            # Apply a small random wind direction variation, keeping the angle
            # within +/- max_wind_direction_variation of its initial value
            if (self.initial_wind_angle - self.max_wind_direction_variation
                    <= self.cur_wind_angle <=
                    self.initial_wind_angle + self.max_wind_direction_variation):
                self.cur_wind_angle = self.cur_wind_angle + np.random.randint(
                    -1, 2)
            self.farm.reinitialize_flow_field(
                wind_direction=[self.cur_wind_angle],
                wind_speed=[self.cur_wind_speed])
            print(f'new wind angle {self.cur_wind_angle}')
            self.farm.calculate_wake()
            self.cur_nominal_power = self.farm.get_farm_power()
        # Get the Observations from the simulation
        self.farm.calculate_wake(yaw_angles=self.cur_yaws)

        observation = self.get_observation()

        # check observation
        err_msg = "%r (%s) invalid" % (observation, type(observation))
        assert self.observation_space.contains(observation), err_msg

        # reward calc
        # power_ratio = (self.cur_power - self.best_explored_power[self.cur_wind_angle]) / self.best_explored_power[self.cur_wind_angle]

        reward = self.cur_power_ratio * 100
        print(f'power ratio       {self.cur_power_ratio}')

        # if self.cur_power > self.best_explored_power[self.cur_wind_angle]:
        #     self.best_explored_power[self.cur_wind_angle] = self.cur_power

        self.count_steps += 1

        # Done evaluation: episode ends after 30 steps
        done = self.count_steps >= 30

        return observation, reward, done, {}
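
A hedged usage sketch: the config values are illustrative, and `my_floris_interface` stands for a FLORIS-style wake model built elsewhere that exposes the methods FarmEnv calls (reinitialize_flow_field, calculate_wake, get_farm_power, get_turbine_ti).

config = {
    'farm': my_floris_interface,   # assumption: constructed elsewhere
    'num_wind_turbines': 3,
    'max_yaw': 25,                 # degrees
    'min_wind_speed': 4.,          # m/s, must stay within (2, 25.5)
    'max_wind_speed': 12.,
    'min_wind_angle': 250,         # degrees
    'max_wind_angle': 290,
    'continuous_action_space': True,
}
env = FarmEnv(config)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()        # random normalized yaw commands in [-1, 1]
    obs, reward, done, info = env.step(action)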
Example 4
class RandomMDPEnv(Env):
    # Reward correlation scale
    _REWARD_CORRELATION_SCALE = 10

    def __init__(self,
                 n_states: int,
                 n_actions: Union[int, Tuple[int]],
                 n_agents: Optional[int] = None,
                 acyclic: bool = False,
                 reward_correlation=None,
                 reward_perturbation=0,
                 rand: th.Generator = th.default_generator):
        super().__init__()

        # All agents have same number of actions
        if mu.isscalar(n_actions):
            # Number of agents must be given
            if n_agents is None:
                raise ValueError(
                    "Number of agents must be given when number of actions is scalar"
                )
            n_actions = (n_actions, ) * n_agents
        # Infer or check the number of agents from the actions specification
        n_actions_size = len(n_actions)
        if n_agents is None:
            n_agents = n_actions_size
        elif n_actions_size != n_agents:
            raise ValueError(
                "Expected actions for {} agents, got {}".format(
                    n_agents, n_actions_size))

        # Rewards of different agents have no correlation by default
        if reward_correlation is None:
            reward_correlation = self._REWARD_CORRELATION_SCALE * th.eye(
                n_agents)
        else:
            reward_correlation = th.as_tensor(reward_correlation)
            # Check shape of the correlation matrix
            if reward_correlation.shape != (n_agents, n_agents):
                raise ValueError(
                    "Reward correlation matrix must be a {}*{} square matrix".
                    format(n_agents, n_agents))

        # Full shape and allowed dimensions of reward perturbation
        perturbation_full_shape = (n_states, *n_actions, n_states, n_agents)
        perturbation_allowed_dims = [
            0, 1, n_agents + 1, n_agents + 2, n_agents + 3
        ]
        # Check shape and dimensions of the reward perturbation
        reward_perturbation = th.as_tensor(reward_perturbation)
        perturbation_shape = reward_perturbation.shape
        perturbation_dims = len(perturbation_shape)
        if perturbation_dims not in perturbation_allowed_dims:
            raise ValueError(
                "Expect reward perturbation tensor of {} dimensions, got {}".
                format(perturbation_allowed_dims, perturbation_dims))
        if perturbation_full_shape[:perturbation_dims] != perturbation_shape:
            raise ValueError(
                "Expect reward perturbation tensor with shape {}, got {}".
                format(perturbation_full_shape[:perturbation_dims],
                       perturbation_shape))
        # Check values of reward perturbation
        if (reward_perturbation < 0).any():
            raise ValueError(
                "Values of reward perturbation must be non-negative")

        ## State space
        self.observation_space = Discrete(n_states)
        ## Joint action space
        self.action_space = MultiDiscrete(n_actions)

        ## Reward correlation matrix
        self.reward_correlation = reward_correlation
        ## Reward perturbation
        self.reward_perturbation = reward_perturbation
        ## Acyclic MDP
        self.acyclic = acyclic
        ## Random number generator
        self.rand = rand

        # Make multi-agent MDP environment
        self._make_ma_mdp()
        # Initialize environment
        self.reset()

    @property
    def _done(self):
        # Game is done if MDP is acyclic and last state is reached
        return self.acyclic and self._state == self.n_states - 1

    def _make_ma_mdp(self):
        joint_action_shape = self.joint_action_shape
        n_states = self.n_states
        n_agents = len(joint_action_shape)
        rand = self.rand

        # Reward perturbation, broadcast to the full
        # (state, joint actions, next state, agent) number of dimensions
        perturbation = mu.unsqueeze(
            self.reward_perturbation, -1,
            n_agents + 3 - self.reward_perturbation.dim())
        # Generate transition probability tensor
        trans_prob = th.rand(n_states,
                             *joint_action_shape,
                             n_states,
                             generator=rand)
        # Acyclic (episodic) MDP
        if self.acyclic:
            states_idx, next_states_idx = th.tril_indices(n_states, n_states)
            trans_prob[states_idx, ..., next_states_idx] = 0
        # Normalize transition probability matrix
        trans_prob /= trans_prob.sum(dim=-1, keepdim=True)
        trans_prob[th.isnan(trans_prob)] = 0

        # Generate random reward (following method ensures enough variance in rewards)
        # 1) Generate rewards "core" for state, joint actions and agents
        rewards = th.randn(n_states,
                           *joint_action_shape,
                           1,
                           n_agents,
                           generator=rand)
        # 2) Multiply "core" by scales to generate different rewards for next state
        scales_dist = Exponential(th.tensor(1.))
        with mu.use_rand(rand):
            rewards *= scales_dist.sample(
                (n_states, *joint_action_shape, n_states, n_agents))
        # 3) Correlate rewards
        rewards = rewards @ self.reward_correlation

        ## Transition probability
        self._trans_prob = trans_prob
        ## Rewards for state-joint actions
        self._rewards = rewards

    def reset(self):
        ## Current state
        self._state = state = th.tensor(0)
        # Return current state
        return state

    def step(self, actions):
        reward_perturbation = self.reward_perturbation
        n_states = self.n_states
        n_agents = len(self.action_space.nvec)
        rand = self.rand
        state = self._state

        # Validity of joint actions
        if not self.action_space.contains(np.array(actions)):
            raise ValueError("Joint actions {} is invalid".format(actions))
        # Game already done
        if self._done:
            warnings.warn(
                "Attempting to step the environment after game is done")
            # Dummy step result
            return state, th.zeros(n_agents), True, {}

        # Find transition probability distribution for state-joint actions
        trans_prob_sa = self._trans_prob[(state, *actions)]
        # Draw next state from distribution
        next_state_dist = Categorical(probs=trans_prob_sa)
        with mu.use_rand(rand):
            self._state = next_state = next_state_dist.sample()

        # Get reward perturbation
        perturbation_dims = reward_perturbation.dim()
        perturbation_idx = (state, *actions, next_state)
        if perturbation_dims <= len(perturbation_idx):
            # Broadcast the (possibly partially specified) perturbation scale to all agents
            perturbation = reward_perturbation[
                perturbation_idx[:perturbation_dims]].expand(n_agents)
        else:
            perturbation = reward_perturbation[perturbation_idx]
        # Sample perturbation rewards
        perturbation_rewards = th.normal(0., perturbation, generator=rand)

        # Compute total rewards
        rewards = self._rewards[(state, *actions, next_state)].clone()
        rewards += perturbation_rewards
        # Step result
        return next_state, rewards, self._done, {}
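
A usage sketch, assuming `th` is PyTorch, `mu` is the project's tensor utility module, and the `Env` base class provides the `n_states` / `joint_action_shape` attributes used above; all numbers are illustrative.

rand = th.Generator().manual_seed(0)
env = RandomMDPEnv(n_states=5,
                   n_actions=3,            # every agent has 3 actions
                   n_agents=2,
                   acyclic=True,           # episode ends once the last state is reached
                   reward_perturbation=0.1,
                   rand=rand)
state = env.reset()
done = False
while not done:
    actions = tuple(env.action_space.sample())     # random joint action
    state, rewards, done, info = env.step(actions)
    # `rewards` holds one (correlated, possibly perturbed) reward per agent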