def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                     rollout_buffer: TrajRolloutBuffer,
                     n_rollout_steps: int) -> bool:
    """
    Collect rollouts using the current policy and fill a ``RolloutBuffer``.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts
    :param n_rollout_steps: (int) Number of experiences to collect per environment
    :return: (bool) True if function returned with at least ``n_rollout_steps``
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # Unlike the stock loop (while n_steps < n_rollout_steps),
    # collection stops once the trajectory buffer is full.
    while not rollout_buffer.full:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor, shape (num_agents,) + (obs_dim,)
            obs_ctx_tensor = th.as_tensor(self._last_obs).to(self.device)
            actions, values, log_probs = self.policy.forward(obs_ctx_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        # TODO: figure out where to put this reset: maybe in an env wrapper like they did
        if dones[self.env.num_agents] == 1:
            env.reset()

        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1
        # One timestep per call, not `env.num_envs` as in the stock implementation
        self.num_timesteps += 1

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)

        # Split the flat per-agent vector into observation and context parts
        _obs = self._last_obs[..., 0:self.env.obs_size]
        _ctx = self._last_obs[..., self.env.obs_size:]
        # TODO: need to fix in the case of a new number of agents,
        # since range(len(last_obs)) will be incorrect
        for i in range(len(self._last_obs)):
            rollout_buffer.add(agent_id=i,
                               context=_ctx[i],
                               done=self._last_dones[i],
                               obs=_obs[i],
                               action=actions[i],
                               reward=rewards[i],
                               value=values[i],
                               log_prob=log_probs[i])

        self._last_obs = new_obs
        self._last_dones = dones

    rollout_buffer.compute_returns_and_advantage(values)

    callback.on_rollout_end()

    return True

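
# Hedged sketch (not the real TrajRolloutBuffer) of the per-agent buffer
# interface the method above relies on: reset(), a `full` flag, add() keyed
# by agent_id, and compute_returns_and_advantage(). Only the call pattern is
# illustrated; the class name, sizing, and field layout are assumptions.
from collections import defaultdict


class PerAgentRolloutBufferSketch:
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size  # total transitions across all agents
        self.reset()

    def reset(self):
        self.transitions = defaultdict(list)  # agent_id -> list of transition dicts
        self.full = False

    def add(self, agent_id, context, done, obs, action, reward, value, log_prob):
        self.transitions[agent_id].append(
            dict(context=context, done=done, obs=obs, action=action,
                 reward=reward, value=value, log_prob=log_prob))
        n_total = sum(len(t) for t in self.transitions.values())
        self.full = n_total >= self.buffer_size

    def compute_returns_and_advantage(self, last_values):
        # Placeholder: the real buffer would compute per-agent returns/advantages here
        pass
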
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    n_episodes: int = 1,
    n_steps: int = -1,
    action_noise: Optional[ActionNoise] = None,
    learning_starts: int = 0,
    replay_buffer: Optional[ReplayBuffer] = None,
    log_interval: Optional[int] = None,
) -> RolloutReturn:
    """
    Collect experiences and store them into a ``ReplayBuffer``.

    :param env: (VecEnv) The training environment
    :param callback: (BaseCallback) Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param n_episodes: (int) Number of episodes to use to collect rollout data.
        You can also specify ``n_steps`` instead.
    :param n_steps: (int) Number of steps to use to collect rollout data.
        You can also specify ``n_episodes`` instead.
    :param action_noise: (Optional[ActionNoise]) Action noise that will be used for exploration.
        Required for deterministic policy (e.g. TD3). This can also be used
        in addition to the stochastic policy for SAC.
    :param learning_starts: (int) Number of steps before learning for the warm-up phase.
    :param replay_buffer: (ReplayBuffer) Buffer in which the collected experiences are stored.
    :param log_interval: (int) Log data every ``log_interval`` episodes
    :return: (RolloutReturn)
    """
    episode_rewards, total_timesteps = [], []
    total_steps, total_episodes = 0, 0

    assert isinstance(env, VecEnv), "You must pass a VecEnv"
    # assert env.num_envs == 1, "OffPolicyAlgorithm only support single environment"

    if self.use_sde:
        self.actor.reset_noise()

    callback.on_rollout_start()
    continue_training = True

    while total_steps < n_steps or total_episodes < n_episodes:
        self._last_obs = env.reset()
        done = False
        episode_reward, episode_timesteps = 0.0, 0

        while not done:
            if self.use_sde and self.sde_sample_freq > 0 and total_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.actor.reset_noise()

            # Select action randomly or according to policy
            action, buffer_action = self._sample_action(learning_starts, action_noise)

            # Rescale and perform action
            new_obs, reward, done_array, infos = env.step(action)
            # The episode is over only when every sub-environment reports done
            done = all(done_array)

            self.num_timesteps += 1
            episode_timesteps += 1
            total_steps += 1

            # Give access to local variables
            callback.update_locals(locals())
            # Only stop training if return value is False, not when it is None.
            if callback.on_step() is False:
                return RolloutReturn(0.0, total_steps, total_episodes, continue_training=False)

            episode_reward += np.sum(np.asarray(reward))

            # Retrieve reward and episode length if using Monitor wrapper
            self._update_info_buffer(infos[0], done)

            # Store data in replay buffer
            if replay_buffer is not None:
                # Store only the unnormalized version
                if self._vec_normalize_env is not None:
                    new_obs_ = self._vec_normalize_env.get_original_obs()
                    reward_ = self._vec_normalize_env.get_original_reward()
                else:
                    # Avoid changing the original ones
                    self._last_original_obs, new_obs_, reward_ = self._last_obs, new_obs, reward

                for i in range(env.num_envs):
                    replay_buffer.add(self._last_original_obs[i], new_obs_[i],
                                      buffer_action[i], reward_[i], done_array[i])

            self._last_obs = new_obs
            # Save the unnormalized observation
            if self._vec_normalize_env is not None:
                self._last_original_obs = new_obs_

            self._update_current_progress_remaining(self.num_timesteps, self._total_timesteps)

            # For DQN, check if the target network should be updated
            # and update the exploration schedule
            # For SAC/TD3, the update is done at the same time as the gradient update
            # see https://github.com/hill-a/stable-baselines/issues/900
            self._on_step()

            if 0 < n_steps <= total_steps:
                break

        if done:
            total_episodes += 1
            self._episode_num += 1
            episode_rewards.append(episode_reward)
            total_timesteps.append(episode_timesteps)

            if action_noise is not None:
                action_noise.reset()

            # Log training infos
            if log_interval is not None and self._episode_num % log_interval == 0:
                self._dump_logs()

    mean_reward = np.mean(episode_rewards) if total_episodes > 0 else 0.0

    callback.on_rollout_end()

    return RolloutReturn(mean_reward, total_steps, total_episodes, continue_training)

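
# Hedged sketch of the warm-up logic that a stable-baselines3-style
# `_sample_action(learning_starts, action_noise)` typically implements:
# uniform random actions until `learning_starts` steps have elapsed, then
# policy actions, optionally perturbed by noise and clipped (Box spaces).
# The function and argument names below are illustrative assumptions, not
# this repo's exact implementation.
import numpy as np


def sample_action_sketch(num_timesteps, learning_starts, action_space,
                         predict_fn, last_obs, action_noise=None):
    if num_timesteps < learning_starts:
        # Warm-up phase: explore with uniformly random actions
        action = np.array([action_space.sample()])
    else:
        # After warm-up: query the (stochastic) policy
        action, _ = predict_fn(last_obs, deterministic=False)
    if action_noise is not None:
        # Add exploration noise (e.g. for TD3) and keep actions within bounds
        action = np.clip(action + action_noise(), action_space.low, action_space.high)
    return action
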
def generate_trajectories(
    policy,
    venv: VecEnv,
    sample_until: GenTrajTerminationFn,
    *,
    deterministic_policy: bool = False,
    rng: np.random.RandomState = np.random,
) -> Sequence[types.TrajectoryWithRew]:
    """Generate trajectory dictionaries from a policy and an environment.

    Args:
        policy (Callable, BasePolicy or BaseAlgorithm): A function mapping
            observation to action, a stable_baselines3 policy, or an algorithm
            trained on the gym environment.
        venv: The vectorized environments to interact with.
        sample_until: A function determining the termination condition.
            It takes a sequence of trajectories and returns a bool.
            Most users will want to use one of `min_episodes` or `min_timesteps`.
        deterministic_policy: If True, asks the policy to deterministically return
            an action. Note the trajectories might still be non-deterministic if
            the environment has non-determinism!
        rng: used for shuffling trajectories.

    Returns:
        Sequence of trajectories, satisfying `sample_until`. Additional trajectories
        may be collected to avoid biasing the process towards short episodes; the
        user should truncate if required.
    """
    if isinstance(policy, BaseAlgorithm):
        policy.set_env(venv)

    # Collect rollout tuples.
    trajectories = []
    # accumulator for incomplete trajectories
    trajectories_accum = TrajectoryAccumulator()
    obs = venv.reset()
    for env_idx, ob in enumerate(obs):
        # Seed with first obs only. Inside the loop, we'll only add the second obs
        # from each (s,a,r,s') tuple, under the same "obs" key again. That way we
        # still get all observations, but they're not duplicated into "next obs"
        # and "previous obs" (this matters for, e.g., Atari, where observations
        # are really big).
        trajectories_accum.add_step(dict(obs=ob), env_idx)

    # Now, we sample until `sample_until(trajectories)` is true.
    # If we just stopped then, this would introduce a bias towards shorter episodes,
    # since longer episodes are more likely to still be active, i.e. in the process
    # of being sampled from. To avoid this, we continue sampling until all episodes
    # are complete.
    #
    # To start with, all environments are active.
    active = np.ones(venv.num_envs, dtype=bool)
    while np.any(active):
        if isinstance(policy, Callable):
            acts = policy(obs)
        else:
            acts, _ = policy.predict(obs, deterministic=deterministic_policy)
        obs, rews, dones, infos = venv.step(acts)

        # If an environment is inactive, i.e. the episode completed for that
        # environment after `sample_until(trajectories)` was true, then we do
        # *not* want to add any subsequent trajectories from it. We avoid this
        # by just making it never done.
        dones &= active

        new_trajs = trajectories_accum.add_steps_and_auto_finish(acts, obs, rews, dones, infos)
        trajectories.extend(new_trajs)

        if sample_until(trajectories):
            # Termination condition has been reached. Mark as inactive any
            # environments where a trajectory was completed this timestep.
            active &= ~dones

    # Note that we just drop partial trajectories. This is not ideal for some
    # algos; e.g. BC can probably benefit from partial trajectories, too.

    # Each trajectory is sampled i.i.d.; however, shorter episodes are added to
    # `trajectories` sooner. Shuffle to avoid bias in order. This is important
    # when callees end up truncating the number of trajectories or transitions.
    # It is also cheap, since we're just shuffling pointers.
    rng.shuffle(trajectories)

    # Sanity checks.
    for trajectory in trajectories:
        n_steps = len(trajectory.acts)
        # extra 1 for the end
        exp_obs = (n_steps + 1,) + venv.observation_space.shape
        real_obs = trajectory.obs.shape
        assert real_obs == exp_obs, f"expected shape {exp_obs}, got {real_obs}"
        exp_act = (n_steps,) + venv.action_space.shape
        real_act = trajectory.acts.shape
        assert real_act == exp_act, f"expected shape {exp_act}, got {real_act}"
        exp_rew = (n_steps,)
        real_rew = trajectory.rews.shape
        assert real_rew == exp_rew, f"expected shape {exp_rew}, got {real_rew}"

    return trajectories

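
# Hedged usage sketch for generate_trajectories() above: a minimal termination
# predicate plus a random callable policy. The environment id, helper names,
# and episode count are placeholders, not part of this repo; make_vec_env is
# the standard stable_baselines3 helper.
import numpy as np
from stable_baselines3.common.env_util import make_vec_env


def min_episodes(n):
    # Stop once at least `n` complete trajectories have been collected.
    return lambda trajectories: len(trajectories) >= n


venv = make_vec_env("CartPole-v1", n_envs=4)
# A plain callable mapping a batch of observations to a batch of actions
random_policy = lambda obs: np.array([venv.action_space.sample() for _ in obs])
trajectories = generate_trajectories(random_policy, venv, min_episodes(10))
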
def collect_rollouts(
    self,
    env: VecEnv,
    callback: BaseCallback,
    rollout_buffer: RolloutBuffer,
    n_rollout_steps: int,
) -> bool:
    """
    Collect experiences using the current policy and fill a ``RolloutBuffer``.

    The term rollout here refers to the model-free notion and should
    not be used with the concept of rollout used in model-based RL or planning.

    :param env: The training environment
    :param callback: Callback that will be called at each step
        (and at the beginning and end of the rollout)
    :param rollout_buffer: Buffer to fill with rollouts
    :param n_rollout_steps: Number of experiences to collect per environment
    :return: True if function returned with at least ``n_rollout_steps``
        collected, False if callback terminated rollout prematurely.
    """
    assert self._last_obs is not None, "No previous observation was provided"
    n_steps = 0
    rollout_buffer.reset()
    # Sample new weights for the state dependent exploration
    if self.use_sde:
        self.policy.reset_noise(env.num_envs)

    callback.on_rollout_start()

    # Here `n_rollout_steps` is the `n_steps` PPO argument; the buffer holds
    # `outer_steps` segments of that length. Noted by Chenyin.
    # Stock implementation: while n_steps < n_rollout_steps:
    while n_steps < n_rollout_steps * self.outer_steps:
        if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
            # Sample a new noise matrix
            self.policy.reset_noise(env.num_envs)

        with th.no_grad():
            # Convert to pytorch tensor or to TensorDict
            obs_tensor = obs_as_tensor(self._last_obs, self.device)
            actions, values, log_probs = self.policy.forward(obs_tensor)
        actions = actions.cpu().numpy()

        # Rescale and perform action
        clipped_actions = actions
        # Clip the actions to avoid out of bound error
        if isinstance(self.action_space, gym.spaces.Box):
            clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

        new_obs, rewards, dones, infos = env.step(clipped_actions)

        self.num_timesteps += env.num_envs

        # Give access to local variables
        callback.update_locals(locals())
        if callback.on_step() is False:
            return False

        self._update_info_buffer(infos)
        n_steps += 1

        # (1) At the T-th step the env is about to be reset, so store the terminal states in advance.
        # (2) If done, new_obs is the state after resetting the env, so the terminal state comes from infos.
        if n_steps % n_rollout_steps == 0 or dones.any():
            # Alternative (second case only), without resetting the env at step T:
            # if dones.any():
            terminal_obs = new_obs.copy()
            infos_array = np.array(infos)  # change list to numpy array
            for i, done in enumerate(dones):
                if done:
                    terminal_obs[i] = infos_array[i]["terminal_observation"]
            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(terminal_obs, self.device)
                # In the infinite-horizon game, V(s_T) is defined
                _, terminal_values, _ = self.policy.forward(obs_tensor)
        else:
            # when dones = [False, ..., False]
            terminal_values = None

        if isinstance(self.action_space, gym.spaces.Discrete):
            # Reshape in case of discrete action
            actions = actions.reshape(-1, 1)
        rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts,
                           values, log_probs, terminal_values)

        # Chenyin: reset the env every `n_rollout_steps` steps instead of simply
        # carrying over new_obs/dones as in the stock implementation
        if n_steps % n_rollout_steps == 0:
            self._last_obs = env.reset()
            self._last_episode_starts = np.ones((env.num_envs,), dtype=bool)
        else:
            self._last_obs = new_obs
            self._last_episode_starts = dones

    with th.no_grad():
        # Compute value for the last timestep
        if n_steps % n_rollout_steps == 0 or dones.any():
            # The terminal values computed above already bootstrap V(s_T)
            values = terminal_values
            assert values is not None
        else:
            obs_tensor = obs_as_tensor(new_obs, self.device)
            _, values, _ = self.policy.forward(obs_tensor)

    rollout_buffer.compute_returns_and_advantage(last_values=values)

    callback.on_rollout_end()

    return True

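
# Hedged, self-contained sketch of the GAE computation that a
# compute_returns_and_advantage(last_values=...) call performs in
# stable-baselines3-style buffers, shown for a single environment.
# It does not model the per-step `terminal_values` stored by the custom
# buffer above; parameter names and defaults here are illustrative.
import numpy as np


def gae_sketch(rewards, values, episode_starts, last_value, last_done,
               gamma=0.99, gae_lambda=0.95):
    # rewards, values, episode_starts: arrays of shape (T,)
    # last_value: bootstrap estimate V(s_T); last_done: whether step T-1 ended the episode
    T = len(rewards)
    advantages = np.zeros(T, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(T)):
        if t == T - 1:
            next_non_terminal = 1.0 - float(last_done)
            next_value = last_value
        else:
            next_non_terminal = 1.0 - float(episode_starts[t + 1])
            next_value = values[t + 1]
        # TD residual, then the usual exponentially weighted recursion
        delta = rewards[t] + gamma * next_value * next_non_terminal - values[t]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[t] = last_gae
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns
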