def __init__(
    self,
    venv: VecEnv,
    training: bool = True,
    norm_obs: bool = True,
    norm_reward: bool = True,
    clip_obs: float = 10.0,
    clip_reward: float = 10.0,
    gamma: float = 0.99,
    epsilon: float = 1e-8,
):
    VecEnvWrapper.__init__(self, venv)

    if norm_obs:
        if not isinstance(self.observation_space, (gym.spaces.Box, gym.spaces.Dict)):
            raise ValueError(
                "VecNormalize only supports `gym.spaces.Box` and `gym.spaces.Dict` observation spaces"
            )

        if isinstance(self.observation_space, gym.spaces.Dict):
            self.obs_keys = set(self.observation_space.spaces.keys())
            self.obs_spaces = self.observation_space.spaces
            self.obs_rms = {key: RunningMeanStd(shape=space.shape) for key, space in self.obs_spaces.items()}
        else:
            self.obs_keys, self.obs_spaces = None, None
            self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)

    self.ret_rms = RunningMeanStd(shape=())
    self.clip_obs = clip_obs
    self.clip_reward = clip_reward
    # Returns: discounted rewards
    self.returns = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    self.training = training
    self.norm_obs = norm_obs
    self.norm_reward = norm_reward
    self.old_obs = np.array([])
    self.old_reward = np.array([])
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    for (x_1, x_2, x_3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])

        x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
        moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]

        rms.update(x_1)
        rms.update(x_2)
        rms.update(x_3)
        moments_2 = [rms.mean, rms.var]

        assert np.allclose(moments_1, moments_2)
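
# The test above holds because RunningMeanStd merges batch moments exactly.
# A minimal sketch of the underlying parallel-moments update (Chan et al.),
# assuming the same mean/var/count fields as the stable-baselines3 class;
# TinyRunningMeanStd is an illustrative name, not part of the library:
import numpy as np

class TinyRunningMeanStd:
    def __init__(self, epsilon: float = 1e-4, shape: tuple = ()):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon  # avoids division by zero on the first update

    def update(self, batch: np.ndarray) -> None:
        batch_mean = batch.mean(axis=0)
        batch_var = batch.var(axis=0)
        batch_count = batch.shape[0]

        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Combine the two sets of moments without storing the raw samples
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + delta**2 * self.count * batch_count / total
        self.mean, self.var, self.count = new_mean, m2 / total, total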
def load_DvD_weight(self, weight_path):
    weights = np.load(weight_path)
    mu = weights[-self.ob_dim * 2:-self.ob_dim]
    std = weights[-self.ob_dim:]
    from stable_baselines3.common.running_mean_std import RunningMeanStd
    self.obs_rms = RunningMeanStd(shape=(self.ob_dim, ))
    self.obs_rms.mean = mu
    self.obs_rms.var = np.square(std)
    # print(mu, std)
    weights = weights[:-self.ob_dim * 2]
    sizes = [(self.h_dim, self.ob_dim), (self.h_dim, self.h_dim), (self.ac_dim, self.h_dim)]
    hiddens, weights = self._get_weights(weights, sizes)
    h1, h2, h999 = hiddens
    b1, b2 = weights
    self.base.actor[0].weight.data = h1
    self.base.actor[0].bias.data = b1
    self.base.actor[2].weight.data = h2
    self.base.actor[2].bias.data = b2
    self.dist.fc_mean[0].weight.data = h999
    self.dist.fc_mean[0].bias.data.fill_(0.)
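
# A minimal sketch of what the loaded statistics are used for: whitening raw
# observations before the policy sees them. The dimension and the values are
# placeholders; in the loader above they come from the tail of the weight file.
import numpy as np
from stable_baselines3.common.running_mean_std import RunningMeanStd

ob_dim = 4  # hypothetical observation dimension
obs_rms = RunningMeanStd(shape=(ob_dim,))
obs_rms.mean = np.zeros(ob_dim)      # would be `mu` from the weight file
obs_rms.var = np.ones(ob_dim) * 4.0  # would be `std ** 2`, here std = 2.0

obs = np.random.randn(ob_dim)
normalized = (obs - obs_rms.mean) / np.sqrt(obs_rms.var + 1e-8)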
class VecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.
    Has support for saving/loading moving averages.

    :param venv: the vectorized environment to wrap
    :param training: Whether to update or not the moving average
    :param norm_obs: Whether to normalize observation or not (default: True)
    :param norm_reward: Whether to normalize rewards or not (default: True)
    :param clip_obs: Max absolute value for observation
    :param clip_reward: Max absolute value for discounted reward
    :param gamma: discount factor
    :param epsilon: To avoid division by zero
    :param norm_obs_keys: Which keys from observation dict to normalize.
        If not specified, all keys will be normalized.
    """

    def __init__(
        self,
        venv: VecEnv,
        training: bool = True,
        norm_obs: bool = True,
        norm_reward: bool = True,
        clip_obs: float = 10.0,
        clip_reward: float = 10.0,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
        norm_obs_keys: Optional[List[str]] = None,
    ):
        VecEnvWrapper.__init__(self, venv)

        self.norm_obs = norm_obs
        self.norm_obs_keys = norm_obs_keys
        # Check observation spaces
        if self.norm_obs:
            self._sanity_checks()

            if isinstance(self.observation_space, gym.spaces.Dict):
                self.obs_spaces = self.observation_space.spaces
                self.obs_rms = {key: RunningMeanStd(shape=self.obs_spaces[key].shape) for key in self.norm_obs_keys}
            else:
                self.obs_spaces = None
                self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)

        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.returns = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])
        self.old_reward = np.array([])

    def _sanity_checks(self) -> None:
        """
        Check the observations that are going to be normalized are of the correct type (spaces.Box).
        """
        if isinstance(self.observation_space, gym.spaces.Dict):
            # By default, we normalize all keys
            if self.norm_obs_keys is None:
                self.norm_obs_keys = list(self.observation_space.spaces.keys())
            # Check that all keys are of type Box
            for obs_key in self.norm_obs_keys:
                if not isinstance(self.observation_space.spaces[obs_key], gym.spaces.Box):
                    raise ValueError(
                        f"VecNormalize only supports `gym.spaces.Box` observation spaces but {obs_key} "
                        f"is of type {self.observation_space.spaces[obs_key]}. "
                        "You should probably explicitly pass the observation keys "
                        "that should be normalized via the `norm_obs_keys` parameter."
                    )
        elif isinstance(self.observation_space, gym.spaces.Box):
            if self.norm_obs_keys is not None:
                raise ValueError("`norm_obs_keys` param is applicable only with `gym.spaces.Dict` observation spaces")
        else:
            raise ValueError(
                "VecNormalize only supports `gym.spaces.Box` and `gym.spaces.Dict` observation spaces, "
                f"not {self.observation_space}")

    def __getstate__(self) -> Dict[str, Any]:
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # These attributes are not pickleable
        del state["venv"]
        del state["class_attributes"]
        # These attributes depend on the above and so we would prefer not to pickle
        del state["returns"]
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state:
        """
        # Backward compatibility
        if "norm_obs_keys" not in state and isinstance(state["observation_space"], gym.spaces.Dict):
            state["norm_obs_keys"] = list(state["observation_space"].spaces.keys())
        self.__dict__.update(state)
        assert "venv" not in state
        self.venv = None

    def set_venv(self, venv: VecEnv) -> None:
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_env`.

        :param venv:
        """
        if self.venv is not None:
            raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)

        # Check only that the observation_space match
        utils.check_for_correct_spaces(venv, self.observation_space, venv.action_space)
        self.returns = np.zeros(self.num_envs)

    def step_wait(self) -> VecEnvStepReturn:
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, dones)

        where ``dones`` is a boolean vector indicating whether each element is new.
        """
        obs, rewards, dones, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_reward = rewards

        if self.training and self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.obs_rms.keys():
                    self.obs_rms[key].update(obs[key])
            else:
                self.obs_rms.update(obs)

        obs = self.normalize_obs(obs)

        if self.training:
            self._update_reward(rewards)
        rewards = self.normalize_reward(rewards)

        # Normalize the terminal observations
        for idx, done in enumerate(dones):
            if not done:
                continue
            if "terminal_observation" in infos[idx]:
                infos[idx]["terminal_observation"] = self.normalize_obs(infos[idx]["terminal_observation"])

        self.returns[dones] = 0
        return obs, rewards, dones, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.returns = self.returns * self.gamma + reward
        self.ret_rms.update(self.returns)

    def _normalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray:
        """
        Helper to normalize observation.
        :param obs:
        :param obs_rms: associated statistics
        :return: normalized observation
        """
        return np.clip((obs - obs_rms.mean) / np.sqrt(obs_rms.var + self.epsilon), -self.clip_obs, self.clip_obs)

    def _unnormalize_obs(self, obs: np.ndarray, obs_rms: RunningMeanStd) -> np.ndarray:
        """
        Helper to unnormalize observation.
        :param obs:
        :param obs_rms: associated statistics
        :return: unnormalized observation
        """
        return (obs * np.sqrt(obs_rms.var + self.epsilon)) + obs_rms.mean

    def normalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Normalize observations using this VecNormalize's observations statistics.
        Calling this method does not update statistics.
        """
        # Avoid modifying by reference the original object
        obs_ = deepcopy(obs)
        if self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                # Only normalize the specified keys
                for key in self.norm_obs_keys:
                    obs_[key] = self._normalize_obs(obs[key], self.obs_rms[key]).astype(np.float32)
            else:
                obs_ = self._normalize_obs(obs, self.obs_rms).astype(np.float32)
        return obs_

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's rewards statistics.
        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward)
        return reward

    def unnormalize_obs(self, obs: Union[np.ndarray, Dict[str, np.ndarray]]) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        # Avoid modifying by reference the original object
        obs_ = deepcopy(obs)
        if self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.norm_obs_keys:
                    obs_[key] = self._unnormalize_obs(obs[key], self.obs_rms[key])
            else:
                obs_ = self._unnormalize_obs(obs, self.obs_rms)
        return obs_

    def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray:
        if self.norm_reward:
            return reward * np.sqrt(self.ret_rms.var + self.epsilon)
        return reward

    def get_original_obs(self) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Returns an unnormalized version of the observations from the most recent
        step or reset.
        """
        return deepcopy(self.old_obs)

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_reward.copy()

    def reset(self) -> Union[np.ndarray, Dict[str, np.ndarray]]:
        """
        Reset all environments
        :return: first observation of the episode
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.returns = np.zeros(self.num_envs)
        if self.training and self.norm_obs:
            if isinstance(obs, dict) and isinstance(self.obs_rms, dict):
                for key in self.obs_rms.keys():
                    self.obs_rms[key].update(obs[key])
            else:
                self.obs_rms.update(obs)
        return self.normalize_obs(obs)

    @staticmethod
    def load(load_path: str, venv: VecEnv) -> "VecNormalize":
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return:
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path: str) -> None:
        """
        Save current VecNormalize object with
        all running statistics and settings (e.g. clip_obs)

        :param save_path: The path to save to
        """
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)

    @property
    def ret(self) -> np.ndarray:
        warnings.warn("`VecNormalize` `ret` attribute is deprecated. Please use `returns` instead.", DeprecationWarning)
        return self.returns
def __init__(
    self,
    path,
    domain,
    viscosity,
    step_count,
    dt,
    diffusion_substeps,
    n_envs,
    final_reward_factor,
    steps_per_rollout,
    n_epochs,
    learning_rate,
    batch_size,
    data_path=None,
    val_range=range(100, 200),
    test_range=range(100),
):
    callbacks = []

    env_kwargs = dict(
        num_envs=n_envs,
        step_count=step_count,
        domain=domain,
        dt=dt,
        viscosity=viscosity,
        diffusion_substeps=diffusion_substeps,
        final_reward_factor=final_reward_factor,
        exp_name=path,
    )

    evaluation_env_kwargs = {k: env_kwargs[k] for k in env_kwargs if k != 'num_envs'}

    if data_path is not None:
        self.val_env = BurgersFixedSetEnv(
            data_path=data_path,
            data_range=val_range,
            num_envs=len(val_range),
            **evaluation_env_kwargs)
        self.test_env = BurgersFixedSetEnv(
            data_path=data_path,
            data_range=test_range,
            num_envs=len(test_range),
            **evaluation_env_kwargs)
        callbacks.append(
            EveryNRolloutsFunctionCallback(
                1, lambda _: self._record_forces(self.val_env, 'val_set_forces')))

    # Only add a fresh running mean to new experiments
    if not ExperimentFolder.exists(path):
        env_kwargs['reward_rms'] = RunningMeanStd()

    agent_kwargs = dict(
        verbose=0,
        policy=CustomActorCriticPolicy,
        policy_kwargs=dict(
            pi_net=RES_UNET,
            vf_net=CNN_FUNNEL,
            vf_latent_dim=16,
            pi_kwargs=dict(sizes=[4, 8, 16, 16, 16]),
            vf_kwargs=dict(sizes=[4, 8, 16, 16, 16]),
        ),
        n_steps=steps_per_rollout,
        n_epochs=n_epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
    )

    super().__init__(path, BurgersEnv, env_kwargs, agent_kwargs, steps_per_rollout, n_envs, callbacks)
class Discriminator(nn.Module):
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()

        self.device = device

        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

    def compute_grad_pen(self, expert_state, expert_action, policy_state, policy_action, lambda_=10):
        alpha = torch.rand(expert_state.size(0), 1)
        expert_data = torch.cat([expert_state, expert_action], dim=1)
        policy_data = torch.cat([policy_state, policy_action], dim=1)

        alpha = alpha.expand_as(expert_data).to(expert_data.device)

        mixup_data = alpha * expert_data + (1 - alpha) * policy_data
        mixup_data.requires_grad = True

        disc = self.trunk(mixup_data)
        ones = torch.ones(disc.size()).to(disc.device)
        grad = autograd.grad(
            outputs=disc,
            inputs=mixup_data,
            grad_outputs=ones,
            create_graph=True,
            retain_graph=True,
            only_inputs=True)[0]

        grad_pen = lambda_ * (grad.norm(2, dim=1) - 1).pow(2).mean()
        return grad_pen

    def update(self, expert_loader, rollouts, obsfilt=None):
        self.train()

        policy_data_generator = rollouts.feed_forward_generator(
            None, mini_batch_size=expert_loader.batch_size)

        loss = 0
        n = 0
        for expert_batch, policy_batch in zip(expert_loader, policy_data_generator):
            policy_state, policy_action = policy_batch[0], policy_batch[2]
            policy_d = self.trunk(torch.cat([policy_state, policy_action], dim=1))

            expert_state, expert_action = expert_batch
            expert_state = obsfilt(expert_state.numpy(), update=False)
            expert_state = torch.FloatTensor(expert_state).to(self.device)
            expert_action = expert_action.to(self.device)
            expert_d = self.trunk(torch.cat([expert_state, expert_action], dim=1))

            expert_loss = F.binary_cross_entropy_with_logits(
                expert_d, torch.ones(expert_d.size()).to(self.device))
            policy_loss = F.binary_cross_entropy_with_logits(
                policy_d, torch.zeros(policy_d.size()).to(self.device))

            gail_loss = expert_loss + policy_loss
            grad_pen = self.compute_grad_pen(expert_state, expert_action, policy_state, policy_action)

            loss += (gail_loss + grad_pen).item()
            n += 1

            self.optimizer.zero_grad()
            (gail_loss + grad_pen).backward()
            self.optimizer.step()
        return loss / n

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        with torch.no_grad():
            self.eval()
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()

            if update_rms:
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())

            return reward / np.sqrt(self.ret_rms.var[0] + 1e-8)
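
# A minimal sketch of how this GAIL discriminator is queried during rollouts:
# the imitation reward is log D - log(1 - D), rescaled by the running std of
# the discounted returns tracked in ret_rms. The dimensions and batch size
# below are arbitrary assumptions, not values from the source.
import torch

state_dim, action_dim = 8, 2           # hypothetical
disc = Discriminator(state_dim + action_dim, hidden_dim=64, device="cpu")

state = torch.randn(16, state_dim)     # batch of 16 transitions
action = torch.randn(16, action_dim)
masks = torch.ones(16, 1)              # 0 where an episode ended
reward = disc.predict_reward(state, action, gamma=0.99, masks=masks)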
def run(self):
    if self.num_env_steps == 0:
        return
    env = self.env
    args = self.args
    # ref = self.ref
    ref = False
    reset_every = args.num_steps
    # print(env.seed)
    env.seed(self.seed)
    # acquire_all_locks(self.obs_locks)

    reward_filters = {agent: Identity() for agent in self.agents}
    if self.reward_norm:
        reward_filters = {
            agent: RewardFilter(reward_filters[agent], shape=(), gamma=args.gamma, clip=False)
            for agent in self.agents
        }
    self.obs_rms = {
        agent: RunningMeanStd(shape=env.observation_spaces[agent].shape)
        for agent in self.agents
    }

    if self.reseed_step is not None and 0 >= self.reseed_step:
        self.reseed(0, self.reseed_z)
    last_seed = None
    if ref:
        last_seed = self.reseed(0, 1)
        self.np_random.seed(last_seed)

    init_obs = env.reset()
    init_obs = self.normalize_obs(init_obs)
    # self.log(init_obs)
    obs_places = []
    obs_lens = []
    offset = 0
    item_size = np.zeros(1, dtype=self.dtype).nbytes
    actions = dict()
    infos = dict()
    num_episodes = self.num_env_steps // args.episode_steps
    for agent in self.agents:
        actions[agent] = 0
        obs_space = self.env.observation_spaces[agent]
        assert isinstance(obs_space, gym.spaces.Box) and len(obs_space.shape) == 1
        obs_len = obs_space.shape[0]
        full_len = obs_len + 4  # reward, normalized reward, done, bad mask
        place = np.frombuffer(
            self.obs_shm.buf[offset + item_size * full_len * self.env_id:
                             offset + item_size * full_len * (self.env_id + 1)],
            dtype=self.dtype)
        obs_places.append(place)
        obs_lens.append(obs_len)
        self.write(place, init_obs[agent], 0., 0., 0., 0.)
        # np.copyto(place[:obs_len], init_obs[agent])
        # self.log("#{} - obs for {}: {}".format(0, agent, init_obs[agent]))
        offset += item_size * full_len * self.num_envs
    # release_all_locks(self.obs_locks)
    self.main_conn.recv()
    done = False
    step = 0
    finished_episodes = 0
    while True:
        # self.log(step)
        self.np_random.tomaxint()
        # flush state for 1 step
        release_all_locks(self.obs_locks)
        if self.reseed_step is not None and step + 1 == self.reseed_step:
            self.reseed(step + 1, self.reseed_z)
        if done:
            # self.log("done")
            # acquire_all_locks(self.act_locks)
            if args.reject_sampling:
                if self.main_conn.recv():
                    break
            else:
                acquire_all_locks(self.act_locks)
                finished_episodes += 1
                # self.log(finished_episodes)
                if finished_episodes >= num_episodes:
                    break
            obs = env.reset()
            obs = self.normalize_obs(obs)
            for agent in self.agents:
                reward_filters[agent].reset()
            rewards = {agent: 0. for agent in self.agents}
            dones = {agent: False for agent in self.agents}
            # self.log([(self.obs_rms[agent].mean, self.obs_rms[agent].var) for agent in self.agents])
        else:
            # self.log(len(self.act_locks))
            acquire_all_locks(self.act_locks)
            act_pos = self.env_id * sum(self.act_sizes) * item_size
            for i, agent in enumerate(self.agents):
                _action = copy.deepcopy(np.frombuffer(
                    self.act_shm.buf[act_pos: act_pos + self.act_sizes[i] * item_size],
                    dtype=self.dtype))
                actions[agent] = self.act_recover_fns[i](_action)
                # self.log(_action)
                act_pos += self.act_sizes[i] * item_size
                # print(np.isnan(actions[agent]))
                # self.log("step {} from {} - act {}".format(step, agent, actions[agent]))
            obs, rewards, dones, infos = env.step(actions)
            obs = self.normalize_obs(obs)
        # release_all_locks(self.act_locks)
        # acquire_all_locks(self.obs_locks)
        # if ref and (step + 1) % reset_every == 0:
        #     c = (step + 1) // reset_every
        #     if c % 2 == 0:
        #         last_seed = self.reseed(step + 1, 1)
        #         self.np_random.seed(last_seed)
        #     else:
        #         self.env.seed(last_seed)
        # self.log("to write")
        not_done = False
        # self.log("step {}, done {}".format(step, dones))
        for i, agent in enumerate(self.agents):
            # self.log("{}, {}".format(i, agent))
            # self.log("step {} - obs for {}: {}, {}, {}".format(i + 1, agent, obs[agent], rewards[agent], dones[agent]))
            # print(infos, type(agent))
            bad_mask = 0.0 if type(infos[agent]) is dict and 'bad_transition' in infos[agent].keys() else 1.0
            self.write(obs_places[i], obs[agent], rewards[agent], reward_filters[agent](rewards[agent]),
                       dones[agent], bad_mask)
            not_done = not_done or not dones[agent]
        done = not not_done
        # if self.env_id == 0:
        #     self.log("step: {}, done: {}".format(step, done))
        step += 1
    release_all_locks(self.obs_locks)
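
# `normalize_obs` is called above but not shown. A hypothetical reconstruction
# of what it plausibly does with the per-agent RunningMeanStd dict built in
# `run` (an illustrative sketch, not the repository's actual code):
import numpy as np

def normalize_obs(obs, obs_rms, clip=10.0, epsilon=1e-8):
    """Whiten each agent's observation with its running statistics."""
    normalized = {}
    for agent, value in obs.items():
        rms = obs_rms[agent]
        rms.update(value[None, :])  # treat the single observation as a batch of one
        normalized[agent] = np.clip(
            (value - rms.mean) / np.sqrt(rms.var + epsilon), -clip, clip)
    return normalized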
class VecNormalize(VecEnvWrapper):
    """
    A moving average, normalizing wrapper for vectorized environment.
    Has support for saving/loading moving averages.

    :param venv: the vectorized environment to wrap
    :param training: Whether to update or not the moving average
    :param norm_obs: Whether to normalize observation or not (default: True)
    :param norm_reward: Whether to normalize rewards or not (default: True)
    :param clip_obs: Max absolute value for observation
    :param clip_reward: Max absolute value for discounted reward
    :param gamma: discount factor
    :param epsilon: To avoid division by zero
    """

    def __init__(
        self,
        venv: VecEnv,
        training: bool = True,
        norm_obs: bool = True,
        norm_reward: bool = True,
        clip_obs: float = 10.0,
        clip_reward: float = 10.0,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
    ):
        VecEnvWrapper.__init__(self, venv)
        self.obs_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
        self.clip_obs = clip_obs
        self.clip_reward = clip_reward
        # Returns: discounted rewards
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = training
        self.norm_obs = norm_obs
        self.norm_reward = norm_reward
        self.old_obs = np.array([])
        self.old_reward = np.array([])

    def __getstate__(self) -> Dict[str, Any]:
        """
        Gets state for pickling.

        Excludes self.venv, as in general VecEnv's may not be pickleable.
        """
        state = self.__dict__.copy()
        # These attributes are not pickleable
        del state["venv"]
        del state["class_attributes"]
        # These attributes depend on the above and so we would prefer not to pickle
        del state["ret"]
        return state

    def __setstate__(self, state: Dict[str, Any]) -> None:
        """
        Restores pickled state.

        User must call set_venv() after unpickling before using.

        :param state:
        """
        self.__dict__.update(state)
        assert "venv" not in state
        self.venv = None

    def set_venv(self, venv: VecEnv) -> None:
        """
        Sets the vector environment to wrap to venv.

        Also sets attributes derived from this such as `num_env`.

        :param venv:
        """
        if self.venv is not None:
            raise ValueError("Trying to set venv of already initialized VecNormalize wrapper.")
        VecEnvWrapper.__init__(self, venv)
        if self.obs_rms.mean.shape != self.observation_space.shape:
            raise ValueError("venv is incompatible with current statistics.")
        self.ret = np.zeros(self.num_envs)

    def step_wait(self) -> VecEnvStepReturn:
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where ``news`` is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.old_obs = obs
        self.old_reward = rews

        if self.training:
            self.obs_rms.update(obs)
        obs = self.normalize_obs(obs)

        if self.training:
            self._update_reward(rews)
        rews = self.normalize_reward(rews)

        self.ret[news] = 0
        return obs, rews, news, infos

    def _update_reward(self, reward: np.ndarray) -> None:
        """Update reward normalization statistics."""
        self.ret = self.ret * self.gamma + reward
        self.ret_rms.update(self.ret)

    def normalize_obs(self, obs: np.ndarray) -> np.ndarray:
        """
        Normalize observations using this VecNormalize's observations statistics.
        Calling this method does not update statistics.
        """
        if self.norm_obs:
            obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon),
                          -self.clip_obs, self.clip_obs)
        return obs

    def normalize_reward(self, reward: np.ndarray) -> np.ndarray:
        """
        Normalize rewards using this VecNormalize's rewards statistics.
        Calling this method does not update statistics.
        """
        if self.norm_reward:
            reward = np.clip(reward / np.sqrt(self.ret_rms.var + self.epsilon),
                             -self.clip_reward, self.clip_reward)
        return reward

    def unnormalize_obs(self, obs: np.ndarray) -> np.ndarray:
        if self.norm_obs:
            return (obs * np.sqrt(self.obs_rms.var + self.epsilon)) + self.obs_rms.mean
        return obs

    def unnormalize_reward(self, reward: np.ndarray) -> np.ndarray:
        if self.norm_reward:
            return reward * np.sqrt(self.ret_rms.var + self.epsilon)
        return reward

    def get_original_obs(self) -> np.ndarray:
        """
        Returns an unnormalized version of the observations from the most recent
        step or reset.
        """
        return self.old_obs.copy()

    def get_original_reward(self) -> np.ndarray:
        """
        Returns an unnormalized version of the rewards from the most recent step.
        """
        return self.old_reward.copy()

    def reset(self) -> np.ndarray:
        """
        Reset all environments
        """
        obs = self.venv.reset()
        self.old_obs = obs
        self.ret = np.zeros(self.num_envs)
        if self.training:
            self._update_reward(self.ret)
        return self.normalize_obs(obs)

    @staticmethod
    def load(load_path: str, venv: VecEnv) -> "VecNormalize":
        """
        Loads a saved VecNormalize object.

        :param load_path: the path to load from.
        :param venv: the VecEnv to wrap.
        :return:
        """
        with open(load_path, "rb") as file_handler:
            vec_normalize = pickle.load(file_handler)
        vec_normalize.set_venv(venv)
        return vec_normalize

    def save(self, save_path: str) -> None:
        """
        Save current VecNormalize object with
        all running statistics and settings (e.g. clip_obs)

        :param save_path: The path to save to
        """
        with open(save_path, "wb") as file_handler:
            pickle.dump(self, file_handler)
def test_combining_stats():
    np.random.seed(4)
    for shape in [(1, ), (3, ), (3, 4)]:
        values = []
        rms_1 = RunningMeanStd(shape=shape)
        rms_2 = RunningMeanStd(shape=shape)
        rms_3 = RunningMeanStd(shape=shape)
        for _ in range(15):
            value = np.random.randn(*shape)
            rms_1.update(value)
            rms_3.update(value)
            values.append(value)
        for _ in range(19):
            # Shift the values
            value = np.random.randn(*shape) + 1.0
            rms_2.update(value)
            rms_3.update(value)
            values.append(value)
        rms_1.combine(rms_2)
        assert np.allclose(rms_3.mean, rms_1.mean)
        assert np.allclose(rms_3.var, rms_1.var)
        rms_4 = rms_3.copy()
        assert np.allclose(rms_4.mean, rms_3.mean)
        assert np.allclose(rms_4.var, rms_3.var)
        assert np.allclose(rms_4.count, rms_3.count)
        assert id(rms_4.mean) != id(rms_3.mean)
        assert id(rms_4.var) != id(rms_3.var)
        x_cat = np.concatenate(values, axis=0)
        assert np.allclose(x_cat.mean(axis=0), rms_4.mean)
        assert np.allclose(x_cat.var(axis=0), rms_4.var)
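
# `combine` merges two accumulators as if one had seen all samples, which is
# what the test verifies. A sketch of the parallel-variance merge it performs,
# written as a plain function over (mean, var, count) triples:
import numpy as np

def combine_moments(mean_a, var_a, count_a, mean_b, var_b, count_b):
    delta = mean_b - mean_a
    total = count_a + count_b
    mean = mean_a + delta * count_b / total
    m2 = var_a * count_a + var_b * count_b + delta**2 * count_a * count_b / total
    return mean, m2 / total, total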
class BurgersEnv(VecEnv):
    metadata = {'render.modes': ['live', 'gif', 'png']}

    def __init__(
        self,
        num_envs: int,
        step_count: int = 32,
        domain: phiflow.Domain = phiflow.Domain((32, ), box=phiflow.box[0:1]),
        dt: float = 0.03,
        viscosity: float = 0.003,
        diffusion_substeps: int = 1,
        final_reward_factor: float = 32,
        reward_rms: Optional[RunningMeanStd] = None,
        exp_name: str = 'v0',
    ):
        act_shape = self._get_act_shape(domain.resolution)
        obs_shape = self._get_obs_shape(domain.resolution)
        observation_space = gym.spaces.Box(-np.inf, np.inf, shape=obs_shape, dtype=np.float32)
        action_space = gym.spaces.Box(-np.inf, np.inf, shape=act_shape, dtype=np.float32)

        super().__init__(num_envs, observation_space, action_space)

        self.reward_range = (-float('inf'), float('inf'))
        self.spec = None
        self.exp_name = exp_name
        self.domain = domain
        self.step_count = step_count
        self.step_idx = 0
        self.ep_idx = 0
        self.dt = dt
        self.viscosity = viscosity
        self.physics = phiflow.Burgers(diffusion_substeps=diffusion_substeps)
        self.final_reward_factor = final_reward_factor
        self.reward_rms = reward_rms
        if self.reward_rms is None:
            self.reward_rms = RunningMeanStd()
        self.actions = None
        self.test_mode = False
        self.init_state = None
        self.goal_state = None
        self.cont_state = None
        self.pass_state = None
        self.gt_state = None
        self.gt_forces = None
        self.lviz = None
        self.gifviz = None
        self.pngviz = None

    def reset(self) -> VecEnvObs:
        self.step_idx = 0

        self.gt_forces = self._get_gt_forces()
        self.init_state = self._get_init_state()
        self.cont_state = self.init_state.copied_with()
        self.goal_state = self._get_goal_state()

        if self.test_mode:
            self._init_ref_states()

        return self._build_obs()

    def step_async(self, actions: np.ndarray) -> None:
        self.actions = actions.reshape(self.cont_state.velocity.data.shape)

    def step_wait(self) -> VecEnvStepReturn:
        self.step_idx += 1
        forces = self.actions
        forces_effect = phiflow.FieldEffect(
            phiflow.CenteredGrid(self.actions, box=self.domain.box), ['velocity'])
        self.cont_state = self._step_sim(self.cont_state, (forces_effect, ))

        # Perform reference simulation only when evaluating results -> after render was called once
        if self.test_mode:
            self.pass_state = self._step_sim(self.pass_state, ())
            self.gt_state = self._step_gt()

        obs = self._build_obs()
        rew = self._build_rew(forces)
        done = np.full((self.num_envs, ), self.step_idx == self.step_count)
        if self.step_idx == self.step_count:
            self.ep_idx += 1

            missing_forces_field = (self.goal_state.velocity.data - self.cont_state.velocity.data) / self.dt
            missing_forces = phiflow.FieldEffect(
                phiflow.CenteredGrid(missing_forces_field, box=self.domain.box), ['velocity'])
            forces += missing_forces_field
            self.cont_state = self.cont_state.copied_with(
                velocity=(self.cont_state.velocity.data + missing_forces_field * self.dt))

            add_rew = self._build_rew(missing_forces.field.data) * self.final_reward_factor
            rew += add_rew

            obs = self.reset()

        info = [{'rew_unnormalized': rew[i], 'forces': np.abs(forces[i]).sum()} for i in range(self.num_envs)]

        self.reward_rms.update(rew)
        rew = (rew - self.reward_rms.mean) / np.sqrt(self.reward_rms.var)

        return obs, rew, done, info

    def close(self) -> None:
        pass

    def disable_test_mode_wtf(self):
        self.test_mode = False

    def render(self, mode: str = 'live') -> None:
        if not self.test_mode:
            self.test_mode = True
            self._init_ref_states()
            if mode == 'live':
                self.lviz = LivePlotter()
            elif mode == 'gif':
                self.gifviz = GifPlotter('StableBurger-%s' % self.exp_name)
            elif mode == 'png':
                self.pngviz = PngPlotter('StableBurger-%s' % self.exp_name)
            else:
                raise NotImplementedError()

        fields, labels = self._get_fields_and_labels()

        if mode == 'live':
            self.lviz.render(fields, labels, 2, True)
        elif mode == 'gif':
            self.gifviz.render(fields, labels, 2, True, 'Velocity', self.ep_idx, self.step_idx,
                               self.step_count, True)
        elif mode == 'png':
            self.pngviz.render(fields, labels, 2, True, 'Velocity', self.ep_idx, self.step_idx,
                               self.step_count, True)
        else:
            raise NotImplementedError()

    def seed(self, seed: Optional[int] = None) -> List[Union[None, int]]:
        return [None for _ in range(self.num_envs)]

    def get_attr(self, attr_name: str, indices: VecEnvIndices = None):
        return [getattr(self, attr_name) for _ in self._vec_env_indices_to_list(indices)]

    def set_attr(self, attr_name: str, value: Any, indices: VecEnvIndices = None):
        setattr(self, attr_name, value)

    def env_method(self, method_name: str, *method_args, indices: VecEnvIndices = None,
                   **method_kwargs) -> List[Any]:
        # Return the result so the declared List[Any] return type holds
        return [getattr(self, method_name)(*method_args, **method_kwargs)]

    def env_is_wrapped(self, wrapper_class: Type[gym.Wrapper], indices: VecEnvIndices = None) -> List[bool]:
        return [False for _ in self._vec_env_indices_to_list(indices)]

    def _step_sim(self, in_state: phiflow.BurgersVelocity,
                  effects: Tuple[phiflow.FieldEffect, ...]) -> phiflow.BurgersVelocity:
        return self.physics.step(in_state, dt=self.dt, effects=effects)

    def _step_gt(self):
        return self._step_sim(self.gt_state, (self.gt_forces, ))

    def _get_init_state(self) -> phiflow.BurgersVelocity:
        return phiflow.BurgersVelocity(domain=self.domain,
                                       velocity=GaussianClash(self.num_envs),
                                       viscosity=self.viscosity)

    def _get_gt_forces(self) -> phiflow.FieldEffect:
        return phiflow.FieldEffect(GaussianForce(self.num_envs), ['velocity'])

    def _get_goal_state(self) -> phiflow.BurgersVelocity:
        state = self.init_state.copied_with()
        for _ in range(self.step_count):
            state = self._step_sim(state, (self.gt_forces, ))
        return state

    def _init_ref_states(self) -> None:
        self.pass_state = self.init_state.copied_with()
        self.gt_state = self.init_state.copied_with()

    def _build_obs(self) -> List[np.ndarray]:
        curr_data = self.cont_state.velocity.data
        goal_data = self.goal_state.velocity.data

        # Preserve the spatial dimensions, cut off batch dim and use only one channel
        time_shape = curr_data.shape[1:-1] + (1, )
        time_data = np.full(curr_data.shape[1:], self.step_idx / self.step_count)
        # Channels last
        return [np.concatenate(obs + (time_data, ), axis=-1) for obs in zip(curr_data, goal_data)]

    def _build_rew(self, forces: np.ndarray) -> np.ndarray:
        reduced_shape = (forces.shape[0], -1)
        reshaped_forces = forces.reshape(reduced_shape)
        return -np.sum(reshaped_forces**2, axis=-1)

    # The whole field with one parameter in each direction, flattened out
    def _get_act_shape(self, field_shape: Tuple[int, ...]) -> Tuple[int, ...]:
        act_dim = np.prod(field_shape) * len(field_shape)
        return (act_dim, )

    # Current and goal field with one parameter in each direction and one time channel
    def _get_obs_shape(self, field_shape: Tuple[int, ...]) -> Tuple[int, ...]:
        return tuple(field_shape) + (2 * len(field_shape) + 1, )

    def _vec_env_indices_to_list(self, raw_indices: VecEnvIndices) -> List[int]:
        if raw_indices is None:
            return []
        if isinstance(raw_indices, int):
            return [raw_indices]
        return list(raw_indices)

    def _get_fields_and_labels(self) -> Tuple[List[np.ndarray], List[str]]:
        # Take the simulation of the first env
        fields = [f.velocity.data[0].reshape(-1) for f in [
            self.init_state,
            self.goal_state,
            self.pass_state,
            self.gt_state,
            self.cont_state,
        ]]

        labels = [
            'Initial state',
            'Goal state',
            'Uncontrolled simulation',
            'Ground truth simulation',
            'Controlled simulation',
        ]

        return fields, labels
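
# A minimal usage sketch for the environment above, assuming the phiflow-based
# dependencies (GaussianClash, GaussianForce, the plotters) are importable;
# the env count is an arbitrary choice and the domain uses the constructor
# default. Rewards come back normalized by the env's own reward_rms.
import numpy as np

env = BurgersEnv(num_envs=4)
obs = env.reset()
for _ in range(env.step_count):
    # Random forces with the flattened action shape (num_envs, act_dim)
    actions = np.random.randn(env.num_envs, *env.action_space.shape).astype(np.float32)
    obs, rew, done, info = env.step(actions)  # step() = step_async() + step_wait()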