def __init__(
    self,
    venv: VecEnv,
    training: bool = True,
    norm_obs: bool = True,
    norm_reward: bool = True,
    clip_obs: float = 10.0,
    clip_reward: float = 10.0,
    gamma: float = 0.99,
    epsilon: float = 1e-8,
):
    VecEnvWrapper.__init__(self, venv)

    if norm_obs:
        if not isinstance(self.observation_space, (gym.spaces.Box, gym.spaces.Dict)):
            raise ValueError(
                "VecNormalize only supports `gym.spaces.Box` and `gym.spaces.Dict` observation spaces"
            )

        if isinstance(self.observation_space, gym.spaces.Dict):
            self.obs_keys = set(self.observation_space.spaces.keys())
            self.obs_spaces = self.observation_space.spaces
            self.obs_rms = {key: RunningMeanStd(shape=space.shape) for key, space in self.obs_spaces.items()}
        else:
            self.obs_keys, self.obs_spaces = None, None
            self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)

    self.ret_rms = RunningMeanStd(shape=())
    self.clip_obs = clip_obs
    self.clip_reward = clip_reward
    # Returns: discounted rewards
    self.returns = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    self.training = training
    self.norm_obs = norm_obs
    self.norm_reward = norm_reward
    self.old_obs = np.array([])
    self.old_reward = np.array([])
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    for (x_1, x_2, x_3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])

        x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
        moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]

        rms.update(x_1)
        rms.update(x_2)
        rms.update(x_3)
        moments_2 = [rms.mean, rms.var]

        assert np.allclose(moments_1, moments_2)
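
# The test above holds because RunningMeanStd merges batch moments exactly.
# A minimal sketch of the underlying parallel-moments update (Chan et al.),
# assuming the same mean/var/count fields as the stable-baselines3 class;
# TinyRunningMeanStd is an illustrative name, not part of the library:
import numpy as np

class TinyRunningMeanStd:
    def __init__(self, epsilon: float = 1e-4, shape: tuple = ()):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon  # avoids division by zero on the first update

    def update(self, batch: np.ndarray) -> None:
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]

        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Combine the two sets of moments without storing the raw samples
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total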
def load_DvD_weight(self, weight_path):
    weights = np.load(weight_path)
    mu = weights[-self.ob_dim * 2:-self.ob_dim]
    std = weights[-self.ob_dim:]
    from stable_baselines3.common.running_mean_std import RunningMeanStd
    self.obs_rms = RunningMeanStd(shape=(self.ob_dim, ))
    self.obs_rms.mean = mu
    self.obs_rms.var = np.square(std)
    # print(mu, std)
    weights = weights[:-self.ob_dim * 2]
    sizes = [(self.h_dim, self.ob_dim), (self.h_dim, self.h_dim), (self.ac_dim, self.h_dim)]
    hiddens, weights = self._get_weights(weights, sizes)
    h1, h2, h999 = hiddens
    b1, b2 = weights
    self.base.actor[0].weight.data = h1
    self.base.actor[0].bias.data = b1
    self.base.actor[2].weight.data = h2
    self.base.actor[2].bias.data = b2
    self.dist.fc_mean[0].weight.data = h999
    self.dist.fc_mean[0].bias.data.fill_(0.)
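
# A minimal sketch of what the loaded statistics are used for: whitening raw
# observations before the policy sees them. The dimension and the values are
# placeholders; in the loader above they come from the tail of the weight file.
import numpy as np
from stable_baselines3.common.running_mean_std import RunningMeanStd

ob_dim = 4  # hypothetical observation dimension
obs_rms = RunningMeanStd(shape=(ob_dim,))
obs_rms.mean = np.zeros(ob_dim)      # would be `mu` from the weight file
obs_rms.var = np.ones(ob_dim) * 4.0  # would be `std ** 2`, here std = 2.0

obs = np.random.randn(ob_dim)
normalized = (obs - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)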
class VecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.
    Has support for saving/loading moving averages.

    :param venv: the vectorized environment to wrap
    :param training: Whether to update or not the moving average
    :param norm_obs: Whether to normalize observation or not (default: True)
    :param norm_reward: Whether to normalize rewards or not (default: True)
    :param clip_obs: Max absolute value for observation
    :param clip_reward: Max absolute value for discounted reward
    :param gamma: discount factor
    :param epsilon: To avoid division by zero
    :param norm_obs_keys: Which keys from observation dict to normalize.
        If not specified, all keys will be normalized.
    """

    def __init__(
        self,
        venv: VecEnv,
        training: bool = True,
        norm_obs: bool = True,
        norm_reward: bool = True,
        clip_obs: float = 10.0,
        clip_reward: float = 10.0,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
        norm_obs_keys: Optional[List[str]] = None,
    ):
        VecEnvWrapper.__init__(self, venv)

        self.norm_obs = norm_obs
        self.norm_obs_keys = norm_obs_keys
        # Check observation spaces
        if self.norm_obs:
            self._sanity_checks()

            if isinstance(self.observation_space, gym.spaces.Dict):
                self.obs_spaces = self.observation_space.spaces
                self.obs_rms = {key: RunningMeanStd(shape=self.obs_spaces[key].shape) for key in self.norm_obs_keys}
            else:
                self.obs_spaces = None
                self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)

        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.returns = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])
        self.old_reward = np.array([])

    def _sanity_checks(self) -> None:
        """
        Check the observations that are going to be normalized are of the correct type (spaces.Box).
        """
        if isinstance(self.observation_space, gym.spaces.Dict):
            # By default, we normalize all keys
            if self.norm_obs_keys is None:
                self.norm_obs_keys = list(self.observation_space.spaces.keys())
            # Check that all keys are of type Box
            for obs_key in self.norm_obs_keys:
                if not isinstance(self.observation_space.spaces[obs_key], gym.spaces.Box):
                    raise ValueError(
                        f"VecNormalize only supports `gym.spaces.Box` observation spaces but {obs_key} "
                        f"is of type {self.observation_space.spaces[obs_key]}. "
                        "You should probably explicitly pass the observation keys "
                        "that should be normalized via the `norm_obs_keys` parameter."
                    )
        elif isinstance(self.observation_space, gym.spaces.Box):
            if self.norm_obs_keys is not None:
                raise ValueError("`norm_obs_keys` param is applicable only with `gym.spaces.Dict` observation spaces")
        else:
            raise ValueError(
                "VecNormalize only supports `gym.spaces.Box` and `gym.spaces.Dict` observation spaces, "
                f"not {self.observation_space}")

    def __getstate__(self) -> Dict[str, Any]:
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # These attributes are not pickleable
        del state["venv"]
        del state["class_attributes"]
        # These attributes depend on the above and so we would prefer not to pickle
        del state["returns"]
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state:
        """
        # Backward compatibility
        if "norm_obs_keys" not in state and isinstance(state["observation_space"], gym.spaces.Dict):
            state["norm_obs_keys"] = list(state["observation_space"].spaces.keys())
        self.__dict__.update(state)
        assert "venv" not in state
        self.venv = None

    def set_venv(self, venv: VecEnv) -> None:
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_env`.

        :param venv:
        """
        if self.venv is not None:
            raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)

        # Check only that the observation_space match
        utils.check_for_correct_spaces(venv, self.observation_space, venv.action_space)
        self.returns = np.zeros(self.num_envs)

    def step_wait(self) -> VecEnvStepReturn:
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, dones)

        where ``dones`` is a boolean vector indicating whether each element is new.
        """
        obs, rewards, dones, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_reward = rewards

        if self.training and self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.obs_rms.keys():
                    self.obs_rms[key].update(obs[key])
            else:
                self.obs_rms.update(obs)

        obs = self.normalize_obs(obs)

        if self.training:
            self._update_reward(rewards)
        rewards = self.normalize_reward(rewards)

        # Normalize the terminal observations
        for idx, done in enumerate(dones):
            if not done:
                continue
            if "terminal_observation" in infos[idx]:
                infos[idx]["terminal_observation"] = self.normalize_obs(infos[idx]["terminal_observation"])

        self.returns[dones] = 0
        return obs, rewards, dones, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.returns = self.returns * self.gamma + reward
        self.ret_rms.update(self.returns)

    def _normalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray:
        """
        Helper to normalize observation.
        :param obs:
        :param obs_rms: associated statistics
        :return: normalized observation
        """
        return np.clip((obs - obs_rms.mean) / np.sqrt(obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs)

    def _unnormalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray:
        """
        Helper to unnormalize observation.
        :param obs:
        :param obs_rms: associated statistics
        :return: unnormalized observation
        """
        return (obs * np.sqrt(obs_rms.var + self.epsilon)) + obs_rms.mean

    def normalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Normalize observations using this VecNormalize's observations statistics.
        Calling this method does not update statistics.
        """
        # Avoid modifying by reference the original object
        obs_ = deepcopy(obs)
        if self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                # Only normalize the specified keys
                for key in self.norm_obs_keys:
                    obs_[key] = self._normalize_obs(obs[key], self.obs_rms[key]).astype(np.float32)
            else:
                obs_ = self._normalize_obs(obs, self.obs_rms).astype(np.float32)
        return obs_

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's rewards statistics.
        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward)
        return reward

    def unnormalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        # Avoid modifying by reference the original object
        obs_ = deepcopy(obs)
        if self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.norm_obs_keys:
                    obs_[key] = self._unnormalize_obs(obs[key], self.obs_rms[key])
            else:
                obs_ = self._unnormalize_obs(obs, self.obs_rms)
        return obs_

    def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray:
        if self.norm_reward:
            return reward * np.sqrt(self.ret_rms.var + self.epsilon)
        return reward

    def get_original_obs(self) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Returns an unnormalized version of the observations from the most recent
        step or reset.
        """
        return deepcopy(self.old_obs)

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_reward.copy()

    def reset(self) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Reset all environments
        :return: first observation of the episode
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.returns = np.zeros(self.num_envs)
        if self.training and self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.obs_rms.keys():
                    self.obs_rms[key].update(obs[key])
            else:
                self.obs_rms.update(obs)
        return self.normalize_obs(obs)

    @staticmethod
    def load(load_path: str, venv: VecEnv) -> "VecNormalize":
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return:
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path: str) -> None:
        """
        Save current VecNormalize object with
        all running statistics and settings (e.g. clip_obs)

        :param save_path: The path to save to
        """
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)

    @property
    def ret(self) -> np.ndarray:
        warnings.warn("`VecNormalize` `ret` attribute is deprecated. Please use `returns` instead.", DeprecationWarning)
        return self.returns
def __init__(
    self,
    path,
    domain,
    viscosity,
    step_count,
    dt,
    diffusion_substeps,
    n_envs,
    final_reward_factor,
    steps_per_rollout,
    n_epochs,
    learning_rate,
    batch_size,
    data_path=None,
    val_range=range(100, 200),
    test_range=range(100),
):
    callbacks = []

    env_kwargs = dict(
        num_envs=n_envs,
        step_count=step_count,
        domain=domain,
        dt=dt,
        viscosity=viscosity,
        diffusion_substeps=diffusion_substeps,
        final_reward_factor=final_reward_factor,
        exp_name=path,
    )

    evaluation_env_kwargs = {k: env_kwargs[k] for k in env_kwargs if k != 'num_envs'}

    if data_path is not None:
        self.val_env = BurgersFixedSetEnv(
            data_path=data_path,
            data_range=val_range,
            num_envs=len(val_range),
            **evaluation_env_kwargs)
        self.test_env = BurgersFixedSetEnv(
            data_path=data_path,
            data_range=test_range,
            num_envs=len(test_range),
            **evaluation_env_kwargs)
        callbacks.append(
            EveryNRolloutsFunctionCallback(
                1, lambda _: self._record_forces(self.val_env, 'val_set_forces')))

    # Only add a fresh running mean to new experiments
    if not ExperimentFolder.exists(path):
        env_kwargs['reward_rms'] = RunningMeanStd()

    agent_kwargs = dict(
        verbose=0,
        policy=CustomActorCriticPolicy,
        policy_kwargs=dict(
            pi_net=RES_UNET,
            vf_net=CNN_FUNNEL,
            vf_latent_dim=16,
            pi_kwargs=dict(sizes=[4, 8, 16, 16, 16]),
            vf_kwargs=dict(sizes=[4, 8, 16, 16, 16]),
        ),
        n_steps=steps_per_rollout,
        n_epochs=n_epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )

    super().__init__(path, BurgersEnv, env_kwargs, agent_kwargs, steps_per_rollout, n_envs, callbacks)
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()

        self.device = device

        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self, expert_state, expert_action, policy_state, policy_action, lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(
            outputs=disc,
            inputs=mixup_data,
            grad_outputs=ones,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action, policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
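
# A minimal sketch of how this GAIL discriminator is queried during rollouts:
# the imitation reward is log D - log(1 - D), rescaled by the running std of
# the discounted returns tracked in ret_rms. The dimensions and batch size
# below are arbitrary assumptions, not values from the source.
import torch

state_dim, action_dim = 8, 2           # hypothetical
disc = Discriminator(state_dim + action_dim, hidden_dim=64, device="cpu")

state = torch.randn(16, state_dim)     # batch of 16 transitions
action = torch.randn(16, action_dim)
masks = torch.ones(16, 1)              # 0 where an episode ended
reward = disc.predict_reward(state, action, gamma=0.99, masks=masks)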
def run(self):
    if self.num_env_steps == 0:
        return
    env = self.env
    args = self.args
    # ref = self.ref
    ref = False
    reset_every = args.num_steps
    # print(env.seed)
    env.seed(self.seed)
    # acquire_all_locks(self.obs_locks)

    reward_filters = {agent: Identity() for agent in self.agents}
    if self.reward_norm:
        reward_filters = {
            agent: RewardFilter(reward_filters[agent], shape=(), gamma=args.gamma, clip=False)
            for agent in self.agents
        }
    self.obs_rms = {
        agent: RunningMeanStd(shape=env.observation_spaces[agent].shape)
        for agent in self.agents
    }

    if self.reseed_step is not None and 0 >= self.reseed_step:
        self.reseed(0, self.reseed_z)
    last_seed = None
    if ref:
        last_seed = self.reseed(0, 1)
        self.np_random.seed(last_seed)

    init_obs = env.reset()
    init_obs = self.normalize_obs(init_obs)
    # self.log(init_obs)
    obs_places = []
    obs_lens = []
    offset = 0
    item_size = np.zeros(1, dtype=self.dtype).nbytes
    actions = dict()
    infos = dict()
    num_episodes = self.num_env_steps // args.episode_steps
    for agent in self.agents:
        actions[agent] = 0
        obs_space = self.env.observation_spaces[agent]
        assert isinstance(obs_space, gym.spaces.Box) and len(obs_space.shape) == 1
        obs_len = obs_space.shape[0]
        full_len = obs_len + 4  # reward, normalized reward, done, bad mask
        place = np.frombuffer(
            self.obs_shm.buf[offset + item_size * full_len * self.env_id:
                             offset + item_size * full_len * (self.env_id + 1)],
            dtype=self.dtype)
        obs_places.append(place)
        obs_lens.append(obs_len)
        self.write(place, init_obs[agent], 0., 0., 0., 0.)
        # np.copyto(place[:obs_len], init_obs[agent])
        # self.log("#{} - obs for {}: {}".format(0, agent, init_obs[agent]))
        offset += item_size * full_len * self.num_envs
    # release_all_locks(self.obs_locks)
    self.main_conn.recv()
    done = False
    step = 0
    finished_episodes = 0
    while True:
        # self.log(step)
        self.np_random.tomaxint()
        # flush state for 1 step
        release_all_locks(self.obs_locks)
        if self.reseed_step is not None and step + 1 == self.reseed_step:
            self.reseed(step + 1, self.reseed_z)
        if done:
            # self.log("done")
            # acquire_all_locks(self.act_locks)
            if args.reject_sampling:
                if self.main_conn.recv():
                    break
            else:
                acquire_all_locks(self.act_locks)
                finished_episodes += 1
                # self.log(finished_episodes)
                if finished_episodes >= num_episodes:
                    break
            obs = env.reset()
            obs = self.normalize_obs(obs)
            for agent in self.agents:
                reward_filters[agent].reset()
            rewards = {agent: 0. for agent in self.agents}
            dones = {agent: False for agent in self.agents}
            # self.log([(self.obs_rms[agent].mean, self.obs_rms[agent].var) for agent in self.agents])
        else:
            # self.log(len(self.act_locks))
            acquire_all_locks(self.act_locks)
            act_pos = self.env_id * sum(self.act_sizes) * item_size
            for i, agent in enumerate(self.agents):
                _action = copy.deepcopy(np.frombuffer(
                    self.act_shm.buf[act_pos: act_pos + self.act_sizes[i] * item_size],
                    dtype=self.dtype))
                actions[agent] = self.act_recover_fns[i](_action)
                # self.log(_action)
                act_pos += self.act_sizes[i] * item_size
                # print(np.isnan(actions[agent]))
                # self.log("step {} from {} - act {}".format(step, agent, actions[agent]))
            obs, rewards, dones, infos = env.step(actions)
            obs = self.normalize_obs(obs)
        # release_all_locks(self.act_locks)
        # acquire_all_locks(self.obs_locks)
        # if ref and (step + 1) % reset_every == 0:
        #     c = (step + 1) // reset_every
        #     if c % 2 == 0:
        #         last_seed = self.reseed(step + 1, 1)
        #         self.np_random.seed(last_seed)
        #     else:
        #         self.env.seed(last_seed)
        # self.log("to write")
        not_done = False
        # self.log("step {}, done {}".format(step, dones))
        for i, agent in enumerate(self.agents):
            # self.log("{}, {}".format(i, agent))
            # self.log("step {} - obs for {}: {}, {}, {}".format(i + 1, agent, obs[agent], rewards[agent], dones[agent]))
            # print(infos, type(agent))
            bad_mask = 0.0 if type(infos[agent]) is dict and 'bad_transition' in infos[agent].keys() else 1.0
            self.write(obs_places[i], obs[agent], rewards[agent], reward_filters[agent](rewards[agent]),
                       dones[agent], bad_mask)
            not_done = not_done or not dones[agent]
        done = not not_done
        # if self.env_id == 0:
        #     self.log("step: {}, done: {}".format(step, done))
        step += 1
    release_all_locks(self.obs_locks)
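
# `normalize_obs` is called above but not shown. A hypothetical reconstruction
# of what it plausibly does with the per-agent RunningMeanStd dict built in
# `run` (an illustrative sketch, not the repository's actual code):
import numpy as np

def normalize_obs(obs, obs_rms, clip=10.0, epsilon=1e-8):
    """Whiten each agent's observation with its running statistics."""
    normalized = {}
    for agent, value in obs.items():
        rms = obs_rms[agent]
        rms.update(value[None, :])  # treat the single observation as a batch of one
        normalized[agent] = np.clip(
            (value - rms.mean) / np.sqrt(rms.var + epsilon), -clip, clip)
    return normalized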
class VecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.
    Has support for saving/loading moving averages.

    :param venv: the vectorized environment to wrap
    :param training: Whether to update or not the moving average
    :param norm_obs: Whether to normalize observation or not (default: True)
    :param norm_reward: Whether to normalize rewards or not (default: True)
    :param clip_obs: Max absolute value for observation
    :param clip_reward: Max absolute value for discounted reward
    :param gamma: discount factor
    :param epsilon: To avoid division by zero
    """

    def __init__(
        self,
        venv: VecEnv,
        training: bool = True,
        norm_obs: bool = True,
        norm_reward: bool = True,
        clip_obs: float = 10.0,
        clip_reward: float = 10.0,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
    ):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])
        self.old_reward = np.array([])

    def __getstate__(self) -> Dict[str, Any]:
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # These attributes are not pickleable
        del state["venv"]
        del state["class_attributes"]
        # These attributes depend on the above and so we would prefer not to pickle
        del state["ret"]
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state:
        """
        self.__dict__.update(state)
        assert "venv" not in state
        self.venv = None

    def set_venv(self, venv: VecEnv) -> None:
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_env`.

        :param venv:
        """
        if self.venv is not None:
            raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)
        if self.obs_rms.mean.shape != self.observation_space.shape:
            raise ValueError("venv is incompatible with current statistics.")
        self.ret = np.zeros(self.num_envs)

    def step_wait(self) -> VecEnvStepReturn:
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where ``news`` is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_reward = rews

        if self.training:
            self.obs_rms.update(obs)
        obs = self.normalize_obs(obs)

        if self.training:
            self._update_reward(rews)
        rews = self.normalize_reward(rews)

        self.ret[news] = 0
        return obs, rews, news, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.ret = self.ret * self.gamma + reward
        self.ret_rms.update(self.ret)

    def normalize_obs(self, obs: np.ndarray) -> np.ndarray:
        """
        Normalize observations using this VecNormalize's observations statistics.
        Calling this method does not update statistics.
        """
        if self.norm_obs:
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's rewards statistics.
        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                             -self.clip_reward, self.clip_reward)
        return reward

    def unnormalize_obs(self, obs: np.ndarray) -> np.ndarray:
        if self.norm_obs:
            return (obs * np.sqrt(self.obs_rms.var + self.epsilon)) + self.obs_rms.mean
        return obs

    def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray:
        if self.norm_reward:
            return reward * np.sqrt(self.ret_rms.var + self.epsilon)
        return reward

    def get_original_obs(self) -> np.ndarray:
        """
        Returns an unnormalized version of the observations from the most recent
        step or reset.
        """
        return self.old_obs.copy()

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_reward.copy()

    def reset(self) -> np.ndarray:
        """
        Reset all environments
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        if self.training:
            self._update_reward(self.ret)
        return self.normalize_obs(obs)

    @staticmethod
    def load(load_path: str, venv: VecEnv) -> "VecNormalize":
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return:
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path: str) -> None:
        """
        Save current VecNormalize object with
        all running statistics and settings (e.g. clip_obs)

        :param save_path: The path to save to
        """
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)
def test_combining_stats():
    np.random.seed(4)
    for shape in [(1, ), (3, ), (3, 4)]:
        values = []
        rms_1 = RunningMeanStd(shape=shape)
        rms_2 = RunningMeanStd(shape=shape)
        rms_3 = RunningMeanStd(shape=shape)
        for _ in range(15):
            value = np.random.randn(*shape)
            rms_1.update(value)
            rms_3.update(value)
            values.append(value)
        for _ in range(19):
            # Shift the values
            value = np.random.randn(*shape) + 1.0
            rms_2.update(value)
            rms_3.update(value)
            values.append(value)
        rms_1.combine(rms_2)
        assert np.allclose(rms_3.mean, rms_1.mean)
        assert np.allclose(rms_3.var, rms_1.var)
        rms_4 = rms_3.copy()
        assert np.allclose(rms_4.mean, rms_3.mean)
        assert np.allclose(rms_4.var, rms_3.var)
        assert np.allclose(rms_4.count, rms_3.count)
        assert id(rms_4.mean) != id(rms_3.mean)
        assert id(rms_4.var) != id(rms_3.var)
        x_cat = np.concatenate(values, axis=0)
        assert np.allclose(x_cat.mean(axis=0), rms_4.mean)
        assert np.allclose(x_cat.var(axis=0), rms_4.var)
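
# `combine` merges two accumulators as if one had seen all samples, which is
# what the test verifies. A sketch of the parallel-variance merge it performs,
# written as a plain function over (mean, var, count) triples:
import numpy as np

def combine_moments(mean_a, var_a, count_a, mean_b, var_b, count_b):
    delta = mean_b - mean_a
    total = count_a + count_b
    mean = mean_a + delta * count_b / total
    m2 = var_a * count_a + var_b * count_b + delta**2 * count_a * count_b / total
    return mean, m2 / total, total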
class BurgersEnv(VecEnv):
    metadata = {'render.modes': ['live', 'gif', 'png']}

    def __init__(
        self,
        num_envs: int,
        step_count: int = 32,
        domain: phiflow.Domain = phiflow.Domain((32, ), box=phiflow.box[0:1]),
        dt: float = 0.03,
        viscosity: float = 0.003,
        diffusion_substeps: int = 1,
        final_reward_factor: float = 32,
        reward_rms: Optional[RunningMeanStd] = None,
        exp_name: str = 'v0',
    ):
        act_shape = self._get_act_shape(domain.resolution)
        obs_shape = self._get_obs_shape(domain.resolution)
        observation_space = gym.spaces.Box(-np.inf, np.inf, shape=obs_shape, dtype=np.float32)
        action_space = gym.spaces.Box(-np.inf, np.inf, shape=act_shape, dtype=np.float32)

        super().__init__(num_envs, observation_space, action_space)

        self.reward_range = (-float('inf'), float('inf'))
        self.spec = None
        self.exp_name = exp_name
        self.domain = domain
        self.step_count = step_count
        self.step_idx = 0
        self.ep_idx = 0
        self.dt = dt
        self.viscosity = viscosity
        self.physics = phiflow.Burgers(diffusion_substeps=diffusion_substeps)
        self.final_reward_factor = final_reward_factor
        self.reward_rms = reward_rms
        if self.reward_rms is None:
            self.reward_rms = RunningMeanStd()
        self.actions = None
        self.test_mode = False
        self.init_state = None
        self.goal_state = None
        self.cont_state = None
        self.pass_state = None
        self.gt_state = None
        self.gt_forces = None
        self.lviz = None
        self.gifviz = None
        self.pngviz = None

    def reset(self) -> VecEnvObs:
        self.step_idx = 0

        self.gt_forces = self._get_gt_forces()
        self.init_state = self._get_init_state()
        self.cont_state = self.init_state.copied_with()
        self.goal_state = self._get_goal_state()

        if self.test_mode:
            self._init_ref_states()

        return self._build_obs()

    def step_async(self, actions: np.ndarray) -> None:
        self.actions = actions.reshape(self.cont_state.velocity.data.shape)

    def step_wait(self) -> VecEnvStepReturn:
        self.step_idx += 1
        forces = self.actions
        forces_effect = phiflow.FieldEffect(
            phiflow.CenteredGrid(self.actions, box=self.domain.box), ['velocity'])
        self.cont_state = self._step_sim(self.cont_state, (forces_effect, ))

        # Perform reference simulation only when evaluating results -> after render was called once
        if self.test_mode:
            self.pass_state = self._step_sim(self.pass_state, ())
            self.gt_state = self._step_gt()

        obs = self._build_obs()
        rew = self._build_rew(forces)
        done = np.full((self.num_envs, ), self.step_idx == self.step_count)
        if self.step_idx == self.step_count:
            self.ep_idx += 1

            missing_forces_field = (self.goal_state.velocity.data - self.cont_state.velocity.data) / self.dt
            missing_forces = phiflow.FieldEffect(
                phiflow.CenteredGrid(missing_forces_field, box=self.domain.box), ['velocity'])
            forces += missing_forces_field
            self.cont_state = self.cont_state.copied_with(
                velocity=(self.cont_state.velocity.data + missing_forces_field * self.dt))

            add_rew = self._build_rew(missing_forces.field.data) * self.final_reward_factor
            rew += add_rew

            obs = self.reset()

        info = [{'rew_unnormalized': rew[i], 'forces': np.abs(forces[i]).sum()} for i in range(self.num_envs)]

        self.reward_rms.update(rew)
        rew = (rew - self.reward_rms.mean) / np.sqrt(self.reward_rms.var)

        return obs, rew, done, info

    def close(self) -> None:
        pass

    def disable_test_mode_wtf(self):
        self.test_mode = False

    def render(self, mode: str = 'live') -> None:
        if not self.test_mode:
            self.test_mode = True
            self._init_ref_states()
            if mode == 'live':
                self.lviz = LivePlotter()
            elif mode == 'gif':
                self.gifviz = GifPlotter('StableBurger-%s' % self.exp_name)
            elif mode == 'png':
                self.pngviz = PngPlotter('StableBurger-%s' % self.exp_name)
            else:
                raise NotImplementedError()

        fields, labels = self._get_fields_and_labels()

        if mode == 'live':
            self.lviz.render(fields, labels, 2, True)
        elif mode == 'gif':
            self.gifviz.render(fields, labels, 2, True, 'Velocity', self.ep_idx, self.step_idx,
                               self.step_count, True)
        elif mode == 'png':
            self.pngviz.render(fields, labels, 2, True, 'Velocity', self.ep_idx, self.step_idx,
                               self.step_count, True)
        else:
            raise NotImplementedError()

    def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]:
        return [None for _ in range(self.num_envs)]

    def get_attr(self, attr_name: str, indices: VecEnvIndices = None):
        return [getattr(self, attr_name) for _ in self._vec_env_indices_to_list(indices)]

    def set_attr(self, attr_name: str, value: Any, indices: VecEnvIndices = None):
        setattr(self, attr_name, value)

    def env_method(self, method_name: str, *method_args, indices: VecEnvIndices = None,
                   **method_kwargs) -> List[Any]:
        # Return the result so the declared List[Any] return type holds
        return [getattr(self, method_name)(*method_args, **method_kwargs)]

    def env_is_wrapped(self, wrapper_class: Type[gym.Wrapper], indices: VecEnvIndices = None) -> List[bool]:
        return [False for _ in self._vec_env_indices_to_list(indices)]

    def _step_sim(self, in_state: phiflow.BurgersVelocity,
                  effects: Tuple[phiflow.FieldEffect, ...]) -> phiflow.BurgersVelocity:
        return self.physics.step(in_state, dt=self.dt, effects=effects)

    def _step_gt(self):
        return self._step_sim(self.gt_state, (self.gt_forces, ))

    def _get_init_state(self) -> phiflow.BurgersVelocity:
        return phiflow.BurgersVelocity(domain=self.domain,
                                       velocity=GaussianClash(self.num_envs),
                                       viscosity=self.viscosity)

    def _get_gt_forces(self) -> phiflow.FieldEffect:
        return phiflow.FieldEffect(GaussianForce(self.num_envs), ['velocity'])

    def _get_goal_state(self) -> phiflow.BurgersVelocity:
        state = self.init_state.copied_with()
        for _ in range(self.step_count):
            state = self._step_sim(state, (self.gt_forces, ))
        return state

    def _init_ref_states(self) -> None:
        self.pass_state = self.init_state.copied_with()
        self.gt_state = self.init_state.copied_with()

    def _build_obs(self) -> List[np.ndarray]:
        curr_data = self.cont_state.velocity.data
        goal_data = self.goal_state.velocity.data

        # Preserve the spatial dimensions, cut off batch dim and use only one channel
        time_shape = curr_data.shape[1:-1] + (1, )
        time_data = np.full(curr_data.shape[1:], self.step_idx / self.step_count)
        # Channels last
        return [np.concatenate(obs + (time_data, ), axis=-1) for obs in zip(curr_data, goal_data)]

    def _build_rew(self, forces: np.ndarray) -> np.ndarray:
        reduced_shape = (forces.shape[0], -1)
        reshaped_forces = forces.reshape(reduced_shape)
        return -np.sum(reshaped_forces**2, axis=-1)

    # The whole field with one parameter in each direction, flattened out
    def _get_act_shape(self, field_shape: Tuple[int, ...]) -> Tuple[int, ...]:
        act_dim = np.prod(field_shape) * len(field_shape)
        return (act_dim, )

    # Current and goal field with one parameter in each direction and one time channel
    def _get_obs_shape(self, field_shape: Tuple[int, ...]) -> Tuple[int, ...]:
        return tuple(field_shape) + (2 * len(field_shape) + 1, )

    def _vec_env_indices_to_list(self, raw_indices: VecEnvIndices) -> List[int]:
        if raw_indices is None:
            return []
        if isinstance(raw_indices, int):
            return [raw_indices]
        return list(raw_indices)

    def _get_fields_and_labels(self) -> Tuple[List[np.ndarray], List[str]]:
        # Take the simulation of the first env
        fields = [f.velocity.data[0].reshape(-1) for f in [
            self.init_state,
            self.goal_state,
            self.pass_state,
            self.gt_state,
            self.cont_state,
        ]]

        labels = [
            'Initial state',
            'Goal state',
            'Uncontrolled simulation',
            'Ground truth simulation',
            'Controlled simulation',
        ]

        return fields, labels
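
# A minimal usage sketch for the environment above, assuming the phiflow-based
# dependencies (GaussianClash, GaussianForce, the plotters) are importable;
# the env count is an arbitrary choice and the domain uses the constructor
# default. Rewards come back normalized by the env's own reward_rms.
import numpy as np

env = BurgersEnv(num_envs=4)
obs = env.reset()
for _ in range(env.step_count):
    # Random forces with the flattened action shape (num_envs, act_dim)
    actions = np.random.randn(env.num_envs, *env.action_space.shape).astype(np.float32)
    obs, rew, done, info = env.step(actions)  # step() = step_async() + step_wait()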