Example #1
    def collect_rollouts(self,
                         env: VecEnv,
                         callback: BaseCallback,
                         rollout_buffer: RolloutBuffer,
                         n_rollout_steps: int = 256) -> bool:

        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:

            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1
            self.num_timesteps += env.num_envs

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards, dones, values,
                               log_probs)
            self._last_obs = new_obs

        rollout_buffer.compute_returns_and_advantage(values, dones=dones)

        callback.on_rollout_end()

        return True
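
For context, here is a minimal sketch of the outer loop that typically drives a `collect_rollouts` method like the one above, alternating data collection with gradient updates. The helper names (`make_callback`, `train`) and the attributes used here are assumptions for illustration, not the API of the snippet itself.

# Minimal sketch of the collect/train alternation that drives collect_rollouts.
# `make_callback` and `train` are hypothetical stand-ins for the trainer's own
# callback setup and gradient-update step; the attribute names are assumptions.
def learn_sketch(model, total_timesteps: int, n_rollout_steps: int = 256) -> None:
    callback = model.make_callback()          # hypothetical callback factory
    while model.num_timesteps < total_timesteps:
        keep_going = model.collect_rollouts(
            model.env, callback, model.rollout_buffer, n_rollout_steps
        )
        if not keep_going:                    # callback asked to stop early
            break
        model.train()                         # consume the filled buffer
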
    def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                         rollout_buffer: RolloutBuffer,
                         n_rollout_steps: int) -> bool:
        """
        Collect rollouts using the current policy and fill a `RolloutBuffer`.

        :param env: (VecEnv) The training environment
        :param callback: (BaseCallback) Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: (RolloutBuffer) Buffer to fill with rollouts
        :param n_rollout_steps: (int) Number of experiences to collect per environment
        :return: (bool) True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1
            self.num_timesteps += env.num_envs

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_dones, values, log_probs)
            self._last_obs = new_obs
            self._last_dones = dones

        rollout_buffer.compute_returns_and_advantage(values, dones=dones)

        callback.on_rollout_end()

        return True
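
Unlike the first variant, this one passes `self._last_dones` (the done flags from the previous step) to `rollout_buffer.add`, so each buffer row pairs the observation the action was computed from with the flag saying whether that step starts a fresh episode. A small illustration of that alignment, using made-up arrays:

import numpy as np

# Illustration of the done-flag bookkeeping above (made-up data): the flag
# stored with step t is the `dones` returned by the previous env.step call.
dones_from_env = [np.array([False]), np.array([True]), np.array([False])]
last_dones = np.array([False])       # nothing has ended before the first step
stored_flags = []
for dones in dones_from_env:
    stored_flags.append(last_dones)  # what rollout_buffer.add receives
    last_dones = dones
# stored_flags per step: [False], [False], [True]
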
    def collect_rollouts(self,
                         env: VecEnv,
                         callback: BaseCallback,
                         rollout_buffer: RolloutBuffer,
                         n_rollout_steps: int = 256) -> bool:

        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1
            self.num_timesteps += env.num_envs

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards, dones, values, log_probs)
            self._last_obs = new_obs

        rollout_buffer.compute_returns_and_advantage(values, dones=dones)


        # # MSA debugging learning
        # try:
        #     import copy
        #     c_rb = copy.copy (rollout_buffer)
        #     self.rollout_buffer_hist.append(c_rb)
        # except:
        #     pass
        # if len(self.rollout_buffer_hist) == 25:
        #     import matplotlib.pyplot as plt
        #     n_envs = 4
        #     V = np.empty((0,n_envs), float)
        #     A = np.empty((0,n_envs), float)
        #     R = np.empty((0,n_envs), float)
        #     lp = np.empty((0,n_envs), float)
        #     r = np.empty((0,n_envs), float)
        #     a = np.empty((0,n_envs, actions.shape[1]), float)
        #     S = np.empty((0,n_envs, new_obs.shape[1]), float)
        #     for rb in self.rollout_buffer_hist:
        #         V = np.append (V, rb.values, axis=0)
        #         A = np.append (A, rb.advantages, axis=0)
        #         R = np.append (R, rb.returns, axis=0)
        #         lp = np.append (lp, rb.log_probs, axis=0)
        #         r = np.append (r, rb.rewards, axis=0)
        #         a = np.append (a, rb.actions, axis=0)
        #         S = np.append (S, rb.observations, axis=0)
        #     plt.plot (V)
        #     plt.title ('Values')
        #     dir_no = "2"
        #     filename = "RL_detailed_plots/"+ dir_no + "/V.png"
        #     plt.savefig(filename)
        #     plt.close ()
        #
        #     plt.plot (A)
        #     plt.title ('Advantages')
        #     filename = "RL_detailed_plots/"+ dir_no + "/A.png"
        #     plt.savefig(filename)
        #     plt.close ()
        #
        #     plt.plot (R)
        #     plt.title ('Returns')
        #     filename = "RL_detailed_plots/"+ dir_no + "/R.png"
        #     plt.savefig(filename)
        #     plt.close ()
        #
        #     plt.plot (lp)
        #     plt.title ('Log Probs')
        #     filename = "RL_detailed_plots/"+ dir_no + "/lp.png"
        #     plt.savefig(filename)
        #     plt.close ()
        #
        #     plt.plot (r)
        #     plt.title ('rewards')
        #     filename = "RL_detailed_plots/"+ dir_no + "/rew.png"
        #     plt.savefig(filename)
        #     plt.close ()
        #
        #     try:
        #         fig, axes = plt.subplots (nrows=actions.shape[1], ncols=1, figsize=(8, actions.shape[1]))
        #         for i in range (actions.shape[1]):
        #             axes[i].plot (a[:, :, i])
        #         plt.suptitle ('Actions', y=1)
        #         filename = "RL_detailed_plots/" + dir_no + "/act.png"
        #         plt.savefig (filename)
        #         plt.close()
        #     except:
        #         plt.plot (a[:, :, 0])
        #         plt.title ('Actions')
        #         filename = "RL_detailed_plots/" + dir_no + "/act.png"
        #         plt.savefig (filename)
        #         plt.close()
        #
        #     fig, axes = plt.subplots (nrows= new_obs.shape[1], ncols=1, figsize=(8, 2*new_obs.shape[1]))
        #     for i in range ( new_obs.shape[1]):
        #         axes[i].plot (S[:, :, i])
        #         axes[i].plot (S[:, :, i])
        #     plt.suptitle ('States', y=1)
        #     filename = "RL_detailed_plots/" + dir_no + "/S.png"
        #     plt.savefig (filename)
        #     plt.close()


        callback.on_rollout_end()

        return True
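
The large commented-out block above keeps copies of the rollout buffer and plots values, advantages, returns, log probabilities and rewards for debugging. A condensed sketch of that diagnostic, assuming matplotlib is available and a `history` list of `RolloutBuffer` objects collected elsewhere:

import matplotlib.pyplot as plt
import numpy as np

def plot_rollout_history(history, out_dir="RL_detailed_plots/2"):
    # Condensed sketch of the commented-out diagnostics above: stack each
    # quantity across all stored rollouts and save one plot per quantity.
    quantities = {
        "Values": "values",
        "Advantages": "advantages",
        "Returns": "returns",
        "Log Probs": "log_probs",
        "Rewards": "rewards",
    }
    for title, attr in quantities.items():
        data = np.concatenate([getattr(rb, attr) for rb in history], axis=0)
        plt.plot(data)               # one line per environment
        plt.title(title)
        plt.savefig(f"{out_dir}/{attr}.png")
        plt.close()
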
Example #4
    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        # Switch to eval mode (this affects batch norm / dropout)
        self.policy.set_training_mode(False)

        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(self._last_obs, self.device)
                actions, values, log_probs = self.policy(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)

            # Handle timeout by bootstrapping with value function
            # see GitHub issue #633
            for idx, done in enumerate(dones):
                if (
                    done
                    and infos[idx].get("terminal_observation") is not None
                    and infos[idx].get("TimeLimit.truncated", False)
                ):
                    terminal_obs = self.policy.obs_to_tensor(infos[idx]["terminal_observation"])[0]
                    with th.no_grad():
                        terminal_value = self.policy.predict_values(terminal_obs)[0]
                    rewards[idx] += self.gamma * terminal_value

            rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs)
            self._last_obs = new_obs
            self._last_episode_starts = dones

        with th.no_grad():
            # Compute value for the last timestep
            values = self.policy.predict_values(obs_as_tensor(new_obs, self.device))

        rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)

        callback.on_rollout_end()

        return True
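
The first variant in Example #4 closely follows the later Stable-Baselines3 implementation: it switches the policy to eval mode, exposes local variables to the callback, and bootstraps the reward with the value of the terminal observation when an episode is cut off by a time limit (GitHub issue #633). A minimal, self-contained sketch of that bootstrapping step, with `value_fn` as a hypothetical stand-in for the policy's value prediction:

import numpy as np

def bootstrap_truncated_rewards(rewards, dones, infos, value_fn, gamma=0.99):
    # When an episode ended only because of a time limit, add gamma * V(terminal_obs)
    # so the truncation is not treated as a true terminal state.
    rewards = rewards.copy()
    for idx, done in enumerate(dones):
        if (
            done
            and infos[idx].get("terminal_observation") is not None
            and infos[idx].get("TimeLimit.truncated", False)
        ):
            rewards[idx] += gamma * float(value_fn(infos[idx]["terminal_observation"]))
    return rewards
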
    def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                         rollout_buffer: RolloutBuffer,
                         n_rollout_steps: int) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            # Tag on the other agent's action
            submit_actions = clipped_actions
            if self.bridge and self.bridge.other(self.is_protagonist):
                other_actions = self.bridge.other(self.is_protagonist).predict(
                    obs_tensor.cpu().numpy())[0]
                # if isinstance(self.action_space, gym.spaces.Box):
                #     clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)
                if len(other_actions.shape) < len(clipped_actions.shape):
                    other_actions = np.expand_dims(other_actions, axis=1)
                submit_actions = np.concatenate(
                    [other_actions, clipped_actions] if self.is_protagonist
                    else [clipped_actions, other_actions],
                    axis=1)
            elif self.adv_action_space:
                submit_actions = np.concatenate(
                    (np.array([np.full(self.adv_action_space.shape, np.nan)
                               ]), clipped_actions),
                    axis=1)

            new_obs, rewards, dones, infos = env.step(submit_actions)
            if not self.is_protagonist:
                rewards = -rewards

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_dones, values, log_probs)
            self._last_obs = new_obs
            self._last_dones = dones

        with th.no_grad():
            # Compute value for the last timestep
            obs_tensor = th.as_tensor(new_obs).to(self.device)
            _, values, _ = self.policy.forward(obs_tensor)

        rollout_buffer.compute_returns_and_advantage(last_values=values,
                                                     dones=dones)

        callback.on_rollout_end()

        return True
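
The second variant above interleaves an adversarial "bridge" agent: the other agent's actions are concatenated with this agent's along the action axis, with the adversary's actions always placed first, and rewards are negated when collecting for the adversary. A small numpy sketch of that joint-action layout, with names borrowed from the snippet:

import numpy as np

def combine_actions(own_actions, other_actions, is_protagonist):
    # Sketch of the joint-action layout above: the adversary's actions always
    # come first along axis 1, whichever side is currently collecting.
    if other_actions.ndim < own_actions.ndim:
        other_actions = np.expand_dims(other_actions, axis=1)
    parts = [other_actions, own_actions] if is_protagonist else [own_actions, other_actions]
    return np.concatenate(parts, axis=1)
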
Example #6
    def collect_rollouts(
        self,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps * self.outer_steps:  # here n_rollout_steps is n_steps in PPO args. Noted by Chenyin
            # while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor or to TensorDict
                obs_tensor = obs_as_tensor(self._last_obs, self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            # (1) at the T-th step the env is about to be reset, so store the terminal states in advance
            # (2) if done, new_obs is already the state after the env reset, so get the terminal state from infos
            if n_steps % n_rollout_steps == 0 or dones.any():
                # if dones.any():  # second case: do not reset the env when encountering step T
                terminal_obs = new_obs.copy()
                infos_array = np.array(infos)  # change list to numpy array
                i = 0
                for done in dones:
                    if done:
                        terminal_obs[i] = infos_array[i][
                            "terminal_observation"]
                    i += 1
                with th.no_grad():
                    # Convert to pytorch tensor or to TensorDict
                    obs_tensor = obs_as_tensor(terminal_obs, self.device)
                    _, terminal_values, _ = self.policy.forward(
                        obs_tensor)  # in the infinite game, V(s_T) is defined
            else:  # when dones = [False, ..., False]
                terminal_values = None

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_episode_starts, values, log_probs,
                               terminal_values)

            # Chenyin
            if n_steps % n_rollout_steps == 0:
                self._last_obs = env.reset()
                self._last_episode_starts = np.ones((env.num_envs, ),
                                                    dtype=bool)
            else:
                self._last_obs = new_obs
                self._last_episode_starts = dones
            # self._last_obs = new_obs
            # self._last_episode_starts = dones

        with th.no_grad():
            # Compute value for the last timestep
            if n_steps % n_rollout_steps == 0 or dones.any():
                # if dones.any():
                # obs_tensor = obs_as_tensor(terminal_obs, self.device)
                # _, values, _ = self.policy.forward(obs_tensor)
                values = terminal_values
                assert values is not None
            else:
                obs_tensor = obs_as_tensor(new_obs, self.device)
                _, values, _ = self.policy.forward(obs_tensor)

        rollout_buffer.compute_returns_and_advantage(last_values=values)

        callback.on_rollout_end()

        return True
    def collect_rollouts(
        self,
        opponent_model,
        env: VecEnv,
        callback: BaseCallback,
        rollout_buffer: RolloutBuffer,
        n_rollout_steps: int,
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()

        ## Initialize the opponent model's observation
        opponent_model._last_obs = self._last_obs  ### MIGHT NEED TO CHANGE THIS

        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()
        volley_env = gym.make(
            "SlimeVolley-v0"
        )  ### WORKS FOR NOW, MIGHT NEED BETTER WAYS TO DO THIS

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            ####print("line 166: obs_tensor, actions, values,log_probs: ", obs_tensor, actions, values,log_probs)
            actions = actions.cpu().numpy()
            ####print("line 168: agent actions numpy", actions)
            ## OPPONENT MODEL
            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor_op = th.as_tensor(opponent_model._last_obs).to(
                    self.device)
                actions_op, values_op, log_probs_op = opponent_model.policy.forward(
                    obs_tensor_op)

            actions_op = actions_op.cpu().numpy()
            ####print("line 177: opponent actions numpy", actions_op)
            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)
            ####print("line 182: clipped actions numpy", clipped_actions)

            ## OPPONENT MODEL
            # Rescale and perform action
            clipped_actions_op = actions_op
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions_op = np.clip(actions_op, self.action_space.low,
                                             self.action_space.high)

            action_n = np.array([clipped_actions[0], clipped_actions_op[0]])

            new_obs_n, rewards_n, dones_n, info_n = volley_env.step(action_n)
            ####print("line 192: new_obs, rewards, dones, infos", new_obs, rewards, dones, info)
            ################new_obs, rewards, dones, infos = env.step(clipped_actions)
            ################new_obs, rewards, dones, infos = volley_env.step(clipped_actions[0])

            new_obs = numpy.array([new_obs_n[0]])
            ####print("line 209: agent new_obs", new_obs)
            rewards = numpy.array([rewards_n[0]])
            dones = numpy.array([dones_n[0]])
            infos = numpy.array([info_n])

            ## OPPONENT MODEL
            new_obs_op = numpy.array([new_obs_n[1]])
            ####print("line 206: new_obs_op", new_obs_op)

            opponent_model._last_obs = numpy.array([new_obs_op])
            ####print("line 209: opponent new_obs", opponent_model._last_obs)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_dones, values, log_probs)
            self._last_obs = new_obs
            self._last_dones = dones

        with th.no_grad():
            # Compute value for the last timestep
            obs_tensor = th.as_tensor(new_obs).to(self.device)
            _, values, _ = self.policy.forward(obs_tensor)

        rollout_buffer.compute_returns_and_advantage(last_values=values,
                                                     dones=dones)

        callback.on_rollout_end()

        return True
    def collect_rollouts(self, env: VecEnv, callback: BaseCallback,
                         rollout_buffer: RolloutBuffer,
                         n_rollout_steps: int) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)

            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low,
                                          self.action_space.high)

            new_obs, rewards, dones, infos = env.step(clipped_actions)

            if dones[0]:
                for info in infos:
                    goal_diff = info['l_score'] - info['r_score']
                    print(
                        f"Rewards: {goal_diff} | Score: [{info['l_score']} : {info['r_score']}]"
                    )
                    self.scores.append(goal_diff)

                avg_score = sum(self.scores) / len(self.scores)
                print(f"Average Reward: {avg_score}")
                print("")

                if avg_score > self.best_score:
                    self.best_score = avg_score
                    self.save_best_model = True

                if self.log_handler is not None:
                    self.log_handler.log({"Average Reward": avg_score})

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards,
                               self._last_dones, values, log_probs)
            self._last_obs = new_obs
            self._last_dones = dones

        with th.no_grad():
            # Compute value for the last timestep
            obs_tensor = th.as_tensor(new_obs).to(self.device)
            _, values, _ = self.policy.forward(obs_tensor)

        rollout_buffer.compute_returns_and_advantage(last_values=values,
                                                     dones=dones)

        callback.on_rollout_end()

        return True
Example #9
    def collect_rollouts(
        self, env: VecEnv, callback: BaseCallback, rollout_buffer: RolloutBuffer, n_rollout_steps: int
    ) -> bool:
        """
        Collect experiences using the current policy and fill a ``RolloutBuffer``.
        The term rollout here refers to the model-free notion and should not
        be used with the concept of rollout used in model-based RL or planning.

        :param env: The training environment
        :param callback: Callback that will be called at each step
            (and at the beginning and end of the rollout)
        :param rollout_buffer: Buffer to fill with rollouts
        :param n_rollout_steps: Number of experiences to collect per environment
        :return: True if function returned with at least `n_rollout_steps`
            collected, False if callback terminated rollout prematurely.
        """
        assert self._last_obs is not None, "No previous observation was provided"
        n_steps = 0
        rollout_buffer.reset()
        # Sample new weights for the state dependent exploration
        if self.use_sde:
            self.policy.reset_noise(env.num_envs)

        callback.on_rollout_start()
        
        # debug ===============================================================
        if mode == 'debug':
            print(["OPA.collect_rollouts started, let's roll!"])

        while n_steps < n_rollout_steps:
            if self.use_sde and self.sde_sample_freq > 0 and n_steps % self.sde_sample_freq == 0:
                # Sample a new noise matrix
                self.policy.reset_noise(env.num_envs)
            
            # notes ===========================================================
            # use last observation to generate action (with log probs) and value
            with th.no_grad():
                # Convert to pytorch tensor
                obs_tensor = th.as_tensor(self._last_obs).to(self.device)
                actions, values, log_probs = self.policy.forward(obs_tensor)
            actions = actions.cpu().numpy()
            
            # debug ===========================================================
            if mode == 'debug':
                print(['OPA.collect_rollouts loop', 'n_rollout_steps:', n_rollout_steps, 'n_steps:', n_steps])
                print(['OPA.collect_rollouts loop eval', 
                       'last_obs:', self._last_obs, 'actions', actions, 'values', values, 'log_probs', log_probs])

            # Rescale and perform action
            clipped_actions = actions
            # Clip the actions to avoid out of bound error
            if isinstance(self.action_space, gym.spaces.Box):
                clipped_actions = np.clip(actions, self.action_space.low, self.action_space.high)

            # notes ===========================================================
            # use clipped_actions to interact with env
            new_obs, rewards, dones, infos = env.step(clipped_actions)

            self.num_timesteps += env.num_envs

            # Give access to local variables
            callback.update_locals(locals())
            if callback.on_step() is False:
                return False

            self._update_info_buffer(infos)
            n_steps += 1

            if isinstance(self.action_space, gym.spaces.Discrete):
                # Reshape in case of discrete action
                actions = actions.reshape(-1, 1)
            rollout_buffer.add(self._last_obs, actions, rewards, self._last_dones, values, log_probs)
            # debug ===========================================================
            if mode == 'debug':
                print(['OPA.collect_rollouts loop save', 
                       'last_obs:', self._last_obs, 'actions', actions, 'values', values, 'log_probs', log_probs, 
                       'rewards', rewards, 'last_dones', self._last_dones])
            # notes ===========================================================
            # 6 things to save in buffer: last_obs, actions, rewards, last_dones, values, log_probs

            self._last_obs = new_obs
            self._last_dones = dones

        with th.no_grad():
            # Compute value for the last timestep
            obs_tensor = th.as_tensor(new_obs).to(self.device)
            _, values, _ = self.policy.forward(obs_tensor)
        
        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.collect_rollouts last', 'new_obs:', new_obs, 'values:', values, 'dones:', dones])
            print(['OPA.collect_rollouts finished, ready to compute_returns'])
            
        rollout_buffer.compute_returns_and_advantage(last_values=values, dones=dones)
        

        callback.on_rollout_end()

        return True
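
All of these variants finish by calling `rollout_buffer.compute_returns_and_advantage`. For reference, a minimal sketch of what that computation amounts to: GAE(lambda) advantages followed by returns = advantages + values. Shapes are (n_steps, n_envs), and this is an illustration rather than the buffer's own code.

import numpy as np

def gae_sketch(rewards, values, dones, last_values, gamma=0.99, gae_lambda=0.95):
    # GAE(lambda): work backwards through the rollout, masking the bootstrap
    # whenever dones[t] marks the end of an episode at step t.
    n_steps = rewards.shape[0]
    advantages = np.zeros_like(rewards, dtype=np.float64)
    last_gae = np.zeros(rewards.shape[1])
    for step in reversed(range(n_steps)):
        next_non_terminal = 1.0 - dones[step].astype(np.float64)
        next_values = last_values if step == n_steps - 1 else values[step + 1]
        delta = rewards[step] + gamma * next_values * next_non_terminal - values[step]
        last_gae = delta + gamma * gae_lambda * next_non_terminal * last_gae
        advantages[step] = last_gae
    returns = advantages + values
    return advantages, returns
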