class TwoValueHeadsBaseGeneral(ABC):
    """The base class for RL algorithms."""
    def __init__(self,
                 envs,
                 acmodel,
                 num_frames_per_proc,
                 discount,
                 gae_lambda,
                 entropy_coef,
                 value_loss_coef,
                 max_grad_norm,
                 recurrence,
                 preprocess_obss,
                 reshape_reward,
                 exp_used_pred,
                 min_stats_ep_batch=16):
        """
        Initializes a `TwoValueHeadsBaseGeneral` instance.

        Wraps the environments in a `ParallelEnv`, puts the model in training
        mode, resets the environments and allocates every rollout buffer
        (separate extrinsic / intrinsic buffers for values, rewards and
        advantages).

        Parameters:
        ----------
        envs : list
            the environments that will be run in parallel; if the first
            element is itself a list, the number of processes is the total
            length of all sub-lists, otherwise it is `len(envs)`
        acmodel : torch.Module
            the model; must expose `recurrent` and, when recurrent,
            `memory_size`
        num_frames_per_proc : int
            the number of frames collected by every process for an update;
            must be a multiple of `recurrence`
        discount : float
            the discount for future rewards
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time;
            must be 1 for a non-recurrent model
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle;
            a falsy value selects `default_preprocess_obss`
        reshape_reward : function
            a function that shapes the reward, called as
            `reshape_reward(observation, action, reward, done)`; `None`
            keeps the raw environment reward
        exp_used_pred : float
            the proportion of experience used for training predictor
            (only stored by this class)
        min_stats_ep_batch : int
            the minimum number of finished episodes that must accumulate
            before `process_interactions` aggregates their statistics
        """

        # --- Constructor arguments ---------------------------------------

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = (preprocess_obss
                                if preprocess_obss else default_preprocess_obss)
        self.reshape_reward = reshape_reward
        self.exp_used_pred = exp_used_pred

        # --- End-of-episode statistics accumulators -----------------------

        self._finished_episodes = 0
        self._ep_statistics = []
        self._min_stats_ep_batch = min_stats_ep_batch

        # --- Derived helpers ----------------------------------------------

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

        if isinstance(envs[0], list):
            # Environments arrive grouped; every member counts as a process.
            self.num_procs = sum(len(group) for group in envs)
        else:
            self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # --- Sanity checks --------------------------------------------------

        assert self.acmodel.recurrent or self.recurrence == 1
        assert self.num_frames_per_proc % self.recurrence == 0

        # --- Rollout buffers ------------------------------------------------

        num_steps = self.num_frames_per_proc

        def per_step_buffer(*trailing, **options):
            # Zero tensor shaped (steps, processes, *trailing) on the device.
            return torch.zeros(num_steps,
                               self.num_procs,
                               *trailing,
                               device=self.device,
                               **options)

        def per_proc_buffer():
            # Zero vector with one entry per process on the device.
            return torch.zeros(self.num_procs, device=self.device)

        self.obs = self.env.reset()
        self.obss = [None] * num_steps
        if self.acmodel.recurrent:
            self.memory = torch.zeros(self.num_procs,
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = per_step_buffer(self.acmodel.memory_size)
        self.mask = torch.ones(self.num_procs, device=self.device)
        self.masks = per_step_buffer()
        self.actions = per_step_buffer(dtype=torch.int)
        self.values_ext = per_step_buffer()
        self.values_int = per_step_buffer()
        self.rewards_ext = per_step_buffer()
        self.rewards_int = per_step_buffer()
        self.advantages_ext = per_step_buffer()
        self.advantages_int = per_step_buffer()
        self.log_probs = per_step_buffer()

        # --- Logging state --------------------------------------------------

        self.log_episode_return = per_proc_buffer()
        self.log_episode_reshaped_return = per_proc_buffer()
        self.log_episode_num_frames = per_proc_buffer()

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes extrinsic and intrinsic advantages.

        For `self.num_frames_per_proc` steps the model is evaluated once for
        all environments, an action is sampled and all environments are
        stepped together. Afterwards intrinsic rewards are obtained from
        `calculate_intrinsic_reward` and two advantage streams are computed
        with the GAE recursion: the extrinsic one uses the episode masks,
        the intrinsic one ignores them.

        Returns
        -------
        exps : DictList
            Holds `obs`, `action`, `value_ext`, `value_int`, `reward_ext`,
            `reward_int`, `advantage_ext`, `advantage_int`, `returnn_ext`,
            `returnn_int`, `log_prob` (plus `memory` and `mask` for a
            recurrent model, and whatever `add_extra_experience` adds).
            Per-step buffers are transposed before flattening, so the k-th
            block of `self.num_frames_per_proc` consecutive entries comes
            from the k-th process. Be careful not to mix data from
            different processes!
        logs : dict
            `return_per_episode`, `reshaped_return_per_episode`,
            `num_frames_per_episode`, `num_frames`, and every entry
            returned by `process_interactions`.
        """
        for step in range(self.num_frames_per_proc):
            # One agent-environment interaction for all processes at once.
            model_input = self.preprocess_obss(self.obs, device=self.device)

            with torch.no_grad():
                if self.acmodel.recurrent:
                    # Zero the memory of processes whose episode just ended.
                    masked_memory = self.memory * self.mask.unsqueeze(1)
                    dist, value, memory = self.acmodel(model_input,
                                                       masked_memory)
                else:
                    dist, value = self.acmodel(model_input)
            action = dist.sample()
            obs, reward, done, info = self.env.step(action.cpu().numpy())

            self.collect_interactions(info)

            # Store what was seen / decided at this step.
            self.obss[step] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[step] = self.memory
                self.memory = memory
            self.masks[step] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[step] = action
            # The model returns the extrinsic head first, the intrinsic second.
            self.values_ext[step] = value[0]
            self.values_int[step] = value[1]

            if self.reshape_reward is None:
                step_reward = torch.tensor(reward, device=self.device)
            else:
                step_reward = torch.tensor(
                    [
                        self.reshape_reward(obs_, action_, reward_, done_)
                        for obs_, action_, reward_, done_ in zip(
                            obs, action, reward, done)
                    ],
                    device=self.device)
            self.rewards_ext[step] = step_reward

            self.log_probs[step] = dist.log_prob(action)

            # Running per-episode totals used only for logging.
            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards_ext[step]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for proc, finished in enumerate(done):
                if not finished:
                    continue
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[proc].item())
                self.log_reshaped_return.append(
                    self.log_episode_reshaped_return[proc].item())
                self.log_num_frames.append(
                    self.log_episode_num_frames[proc].item())

            # Reset the totals of the processes that just finished an episode.
            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        def flatten(buffer):
            # T x P -> P x T -> (P * T)
            return buffer.transpose(0, 1).reshape(-1)

        # ------------------------------------------------------------------
        # First part of the experience batch: what the intrinsic-reward
        # computation is given (observations, actions, recurrent state).
        exps = DictList()

        exps.obs = [
            self.obss[t][p] for p in range(self.num_procs)
            for t in range(self.num_frames_per_proc)
        ]

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)
        exps.action = flatten(self.actions)
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = flatten(self.masks).unsqueeze(1)

        # Hook for subclasses to attach more data to the batch.
        self.add_extra_experience(exps)

        # ------------------------------------------------------------------
        # Intrinsic rewards come from the subclass.
        self.rewards_int = self.calculate_intrinsic_reward(
            exps, self.rewards_int)

        # Value estimates of the observation following the last stored step,
        # used to bootstrap both advantage recursions.
        model_input = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                masked_memory = self.memory * self.mask.unsqueeze(1)
                _, next_value, _ = self.acmodel(model_input, masked_memory)
            else:
                _, next_value = self.acmodel(model_input)

        last_step = self.num_frames_per_proc - 1
        gae_decay = self.discount * self.gae_lambda

        # Intrinsic stream: the end-of-episode mask is deliberately not used.
        for t in reversed(range(self.num_frames_per_proc)):
            if t < last_step:
                next_value_int = self.values_int[t + 1]
                next_advantage_int = self.advantages_int[t + 1]
            else:
                next_value_int = next_value[1]
                next_advantage_int = 0

            delta = (self.rewards_int[t] + self.discount * next_value_int -
                     self.values_int[t])
            self.advantages_int[t] = delta + gae_decay * next_advantage_int

        # Extrinsic stream: masked so nothing leaks across episode ends.
        for t in reversed(range(self.num_frames_per_proc)):
            if t < last_step:
                next_mask = self.masks[t + 1]
                next_value_ext = self.values_ext[t + 1]
                next_advantage_ext = self.advantages_ext[t + 1]
            else:
                next_mask = self.mask
                next_value_ext = next_value[0]
                next_advantage_ext = 0

            delta = (self.rewards_ext[t] +
                     self.discount * next_value_ext * next_mask -
                     self.values_ext[t])
            self.advantages_ext[t] = (
                delta + gae_decay * next_advantage_ext * next_mask)

        # ------------------------------------------------------------------
        # Remaining experience fields; each is T x P -> P x T -> P * T.
        exps.value_ext = flatten(self.values_ext)
        exps.value_int = flatten(self.values_int)
        exps.reward_ext = flatten(self.rewards_ext)
        exps.reward_int = flatten(self.rewards_int)
        exps.advantage_ext = flatten(self.advantages_ext)
        exps.advantage_int = flatten(self.advantages_int)
        exps.returnn_ext = exps.value_ext + exps.advantage_ext
        exps.returnn_int = exps.value_int + exps.advantage_int
        exps.log_prob = flatten(self.log_probs)

        # Report at least `num_procs` entries, more if more episodes ended.
        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        # Merge the statistics about the agent's interactions into the log.
        aux_logs = self.process_interactions()
        for name in aux_logs:
            log[name] = aux_logs[name]

        # Keep only the newest `num_procs` entries for the next call.
        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    def collect_interactions(self, info):
        """Record the non-empty per-environment info entries of one step.

        Every entry of `info` with a non-zero length is counted as a
        finished episode and a deep copy of it is appended to the pending
        statistics; empty entries are ignored.
        """
        for env_info in info:
            if len(env_info) == 0:
                continue
            self._finished_episodes += 1
            # Deep copy so later changes by the environment cannot alter it.
            self._ep_statistics.append(deepcopy(env_info))

    def process_interactions(self):
        """Aggregate the pending end-of-episode statistics.

        While fewer than `self._min_stats_ep_batch` episodes have finished
        since the last aggregation, the pending statistics are left
        untouched and the result of `get_interactions_stats([])` is
        returned. Otherwise the pending statistics are passed to
        `get_interactions_stats`, the accumulators are cleared and the
        result is returned.
        """
        # Not enough finished episodes yet: keep accumulating.
        if self._finished_episodes < self._min_stats_ep_batch:
            return get_interactions_stats([])

        logs = get_interactions_stats(self._ep_statistics)

        # Start a fresh batch of statistics.
        self._finished_episodes = 0
        self._ep_statistics = []

        return logs

    @abstractmethod
    def update_parameters(self):
        """Update the model parameters; must be overridden by subclasses.

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is invoked.
        """
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it fails with a TypeError instead of the intended error.
        raise NotImplementedError

    @abstractmethod
    def get_save_data(self):
        """Return the data to be saved; must be overridden by subclasses.

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is invoked.
        """
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it fails with a TypeError instead of the intended error.
        raise NotImplementedError

    @abstractmethod
    def calculate_intrinsic_reward(self, exps: DictList,
                                   dst_intrinsic_r: torch.Tensor):
        """Compute the intrinsic rewards; must be overridden by subclasses.

        `collect_experiences` calls this with the partially filled
        experience batch and the current intrinsic-reward buffer, and
        stores the returned value as the new intrinsic-reward buffer.

        Raises
        ------
        NotImplementedError
            Always, when this base implementation is invoked.
        """
        # `NotImplemented` is a comparison sentinel, not an exception;
        # raising it fails with a TypeError instead of the intended error.
        raise NotImplementedError

    def add_extra_experience(self, exps: DictList):
        """Hook for attaching extra data to `exps`; does nothing here.

        `collect_experiences` calls it before the intrinsic rewards are
        computed. The return value is ignored by that caller.
        """
        return None

    def evaluate(self):
        """Optional evaluation hook; this base version returns `None`."""
        return
Example #2
0
class BaseAlgo(ABC):
    """The base class for RL algorithms."""
    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr,
                 gae_lambda, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgo` instance.

        Wraps the environments in a `ParallelEnv`, puts the model in training
        mode, resets the environments and allocates every rollout buffer.

        Parameters:
        ----------
        envs : list
            the environments that will be run in parallel; one process
            per element
        acmodel : torch.Module
            the model; must expose `recurrent` and, when recurrent,
            `memory_size`
        num_frames_per_proc : int
            the number of frames collected by every process for an update;
            must be a multiple of the effective recurrence
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        recurrence : int
            the number of steps the gradient is propagated back in time;
            forced to 1 when the model is not recurrent
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle;
            a falsy value selects `default_preprocess_obss`
        reshape_reward : function
            a function that shapes the reward, called as
            `reshape_reward(observation, action, reward, done)`; `None`
            keeps the raw environment reward
        """

        # --- Constructor arguments ---------------------------------------

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = (preprocess_obss
                                if preprocess_obss else default_preprocess_obss)
        self.reshape_reward = reshape_reward

        # --- Derived helpers ----------------------------------------------

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # --- Sanity checks --------------------------------------------------

        # Without recurrence there is nothing to backpropagate through time.
        if not self.acmodel.recurrent:
            self.recurrence = 1

        assert self.num_frames_per_proc % self.recurrence == 0

        # --- Rollout buffers ------------------------------------------------

        num_steps = self.num_frames_per_proc

        def per_step_buffer(*trailing, **options):
            # Zero tensor shaped (steps, processes, *trailing) on the device.
            return torch.zeros(num_steps,
                               self.num_procs,
                               *trailing,
                               device=self.device,
                               **options)

        def per_proc_buffer():
            # Zero vector with one entry per process on the device.
            return torch.zeros(self.num_procs, device=self.device)

        self.obs = self.env.reset()
        self.obss = [None] * num_steps
        if self.acmodel.recurrent:
            self.memory = torch.zeros(self.num_procs,
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = per_step_buffer(self.acmodel.memory_size)
        self.mask = torch.ones(self.num_procs, device=self.device)
        self.masks = per_step_buffer()
        self.actions = per_step_buffer(dtype=torch.int)
        self.values = per_step_buffer()
        self.rewards = per_step_buffer()
        self.advantages = per_step_buffer()
        self.log_probs = per_step_buffer()

        # --- Logging state --------------------------------------------------

        self.log_episode_return = per_proc_buffer()
        self.log_episode_reshaped_return = per_proc_buffer()
        self.log_episode_num_frames = per_proc_buffer()

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        For `self.num_frames_per_proc` steps the model is evaluated once for
        all environments, an action is sampled and all environments are
        stepped together. Advantages are then computed backwards in time
        with the GAE recursion, masked at episode boundaries.

        Returns
        -------
        exps : DictList
            Holds `obs`, `action`, `value`, `reward`, `advantage`,
            `returnn`, `log_prob` (plus `memory` and `mask` for a recurrent
            model). Per-step buffers are transposed before flattening, so
            the k-th block of `self.num_frames_per_proc` consecutive
            entries comes from the k-th process. Be careful not to mix
            data from different processes!
        logs : dict
            `return_per_episode`, `reshaped_return_per_episode`,
            `num_frames_per_episode` and `num_frames`.
        """

        for step in range(self.num_frames_per_proc):
            # One agent-environment interaction for all processes at once.
            model_input = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    # Zero the memory of processes whose episode just ended.
                    masked_memory = self.memory * self.mask.unsqueeze(1)
                    dist, value, memory = self.acmodel(model_input,
                                                       masked_memory)
                else:
                    dist, value = self.acmodel(model_input)
            action = dist.sample()

            obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Store what was seen / decided at this step.
            self.obss[step] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[step] = self.memory
                self.memory = memory
            self.masks[step] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[step] = action
            self.values[step] = value

            if self.reshape_reward is None:
                step_reward = torch.tensor(reward, device=self.device)
            else:
                step_reward = torch.tensor(
                    [
                        self.reshape_reward(obs_, action_, reward_, done_)
                        for obs_, action_, reward_, done_ in zip(
                            obs, action, reward, done)
                    ],
                    device=self.device)
            self.rewards[step] = step_reward
            self.log_probs[step] = dist.log_prob(action)

            # Running per-episode totals used only for logging.
            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[step]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for proc, finished in enumerate(done):
                if not finished:
                    continue
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[proc].item())
                self.log_reshaped_return.append(
                    self.log_episode_reshaped_return[proc].item())
                self.log_num_frames.append(
                    self.log_episode_num_frames[proc].item())

            # Reset the totals of the processes that just finished an episode.
            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Value estimate of the observation following the last stored step,
        # used to bootstrap the advantage recursion.
        model_input = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                masked_memory = self.memory * self.mask.unsqueeze(1)
                _, bootstrap_value, _ = self.acmodel(model_input,
                                                     masked_memory)
            else:
                _, bootstrap_value = self.acmodel(model_input)

        last_step = self.num_frames_per_proc - 1
        gae_decay = self.discount * self.gae_lambda

        for t in reversed(range(self.num_frames_per_proc)):
            if t < last_step:
                next_mask = self.masks[t + 1]
                next_value = self.values[t + 1]
                next_advantage = self.advantages[t + 1]
            else:
                next_mask = self.mask
                next_value = bootstrap_value
                next_advantage = 0

            delta = (self.rewards[t] +
                     self.discount * next_value * next_mask - self.values[t])
            self.advantages[t] = delta + gae_decay * next_advantage * next_mask

        # Build the experience batch: the concatenation of the experience
        # of each process. Below, T is self.num_frames_per_proc, P is
        # self.num_procs and D is the dimensionality.

        def flatten(buffer):
            # T x P -> P x T -> (P * T)
            return buffer.transpose(0, 1).reshape(-1)

        exps = DictList()
        exps.obs = [
            self.obss[t][p] for p in range(self.num_procs)
            for t in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = flatten(self.masks).unsqueeze(1)
        exps.action = flatten(self.actions)
        exps.value = flatten(self.values)
        exps.reward = flatten(self.rewards)
        exps.advantage = flatten(self.advantages)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = flatten(self.log_probs)

        # Convert the collected observations into the model's input format.
        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Report at least `num_procs` entries, more if more episodes ended.
        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        # Keep only the newest `num_procs` entries for the next call.
        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        """Update the model parameters; to be implemented by subclasses.

        This base implementation does nothing and returns `None`.
        """
Example #3
0
class BaseAlgo(ABC):
    def __init__(self, envs, acmodel, num_frames_per_proc, discount, lr,
                 gae_tau, entropy_coef, value_loss_coef, max_grad_norm,
                 recurrence, preprocess_obss, reshape_reward):
        """
        Initializes a `BaseAlgo` instance.

        Wraps the environments in a `ParallelEnv`, puts the model in training
        mode, resets the environments and allocates every rollout buffer.

        Parameters:
        ----------
        envs : list
            the environments that will be run in parallel; one process
            per element
        acmodel : torch.Module
            the model; must expose `recurrent` and, when recurrent,
            `memory_size`
        num_frames_per_proc : int
            the number of frames collected by every process for an update;
            must be a multiple of the effective recurrence
        discount : float
            the discount applied to the next value and next advantage in
            the advantage recursion
        lr : float
            stored as `self.lr`; not used by the code visible here
        gae_tau : float
            the extra decay factor multiplied with `discount` on the next
            advantage in the advantage recursion
        entropy_coef, value_loss_coef, max_grad_norm :
            stored as attributes; not used by the code visible here
        recurrence : int
            stored as `self.recurrence`; forced to 1 when the model is not
            recurrent
        preprocess_obss : function
            converts environment observations into model input, called as
            `preprocess_obss(observations, device=...)`; a falsy value
            selects `default_preprocess_obss`
        reshape_reward : function
            reward shaping, called as
            `reshape_reward(observation, action, reward, done)`; `None`
            keeps the raw environment reward
        """
        # --- Constructor arguments ---------------------------------------

        self.env = ParallelEnv(envs)
        self.acmodel = acmodel
        self.acmodel.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_tau = gae_tau
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.recurrence = recurrence
        self.preprocess_obss = (preprocess_obss
                                if preprocess_obss else default_preprocess_obss)
        self.reshape_reward = reshape_reward

        # --- Derived helpers ----------------------------------------------

        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # --- Sanity checks --------------------------------------------------

        # Without recurrence there is nothing to backpropagate through time.
        if not self.acmodel.recurrent:
            self.recurrence = 1

        assert self.num_frames_per_proc % self.recurrence == 0

        # --- Rollout buffers ------------------------------------------------

        num_steps = self.num_frames_per_proc

        def per_step_buffer(*trailing, **options):
            # Zero tensor shaped (steps, processes, *trailing) on the device.
            return torch.zeros(num_steps,
                               self.num_procs,
                               *trailing,
                               device=self.device,
                               **options)

        def per_proc_buffer():
            # Zero vector with one entry per process on the device.
            return torch.zeros(self.num_procs, device=self.device)

        self.obs = self.env.reset()
        self.obss = [None] * num_steps
        if self.acmodel.recurrent:
            self.memory = torch.zeros(self.num_procs,
                                      self.acmodel.memory_size,
                                      device=self.device)
            self.memories = per_step_buffer(self.acmodel.memory_size)
        self.mask = torch.ones(self.num_procs, device=self.device)
        self.masks = per_step_buffer()
        self.actions = per_step_buffer(dtype=torch.int)
        self.values = per_step_buffer()
        self.rewards = per_step_buffer()
        self.advantages = per_step_buffer()
        self.log_probs = per_step_buffer()

        # --- Logging state --------------------------------------------------

        self.log_episode_return = per_proc_buffer()
        self.log_episode_reshaped_return = per_proc_buffer()
        self.log_episode_num_frames = per_proc_buffer()

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        For `self.num_frames_per_proc` steps the model is evaluated once for
        all environments, an action is sampled and all environments are
        stepped together. Advantages are then computed backwards in time,
        masked at episode boundaries.

        Returns
        -------
        exps : DictList
            Holds `obs`, `action`, `value`, `reward`, `advantage`,
            `returnn`, `log_prob` (plus `memory` and `mask` for a recurrent
            model). The (step, process) buffers are flattened WITHOUT
            transposing, so entries are ordered step by step: all
            processes of step 0 first, then all processes of step 1, ...
        logs : dict
            `return_per_episode`, `reshaped_return_per_episode`,
            `num_frames_per_episode` and `num_frames`.
        """
        for step in range(self.num_frames_per_proc):
            # One agent-environment interaction for all processes at once.
            model_input = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    # Zero the memory of processes whose episode just ended.
                    masked_memory = self.memory * self.mask.unsqueeze(1)
                    dist, value, memory = self.acmodel(model_input,
                                                       masked_memory)
                else:
                    dist, value = self.acmodel(model_input)
            action = dist.sample()

            obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Store what was seen / decided at this step.
            self.obss[step] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[step] = self.memory
                self.memory = memory
            self.masks[step] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[step] = action
            self.values[step] = value

            if self.reshape_reward is None:
                step_reward = torch.tensor(reward, device=self.device)
            else:
                step_reward = torch.tensor(
                    [
                        self.reshape_reward(obs_, action_, reward_, done_)
                        for obs_, action_, reward_, done_ in zip(
                            obs, action, reward, done)
                    ],
                    device=self.device)
            self.rewards[step] = step_reward
            self.log_probs[step] = dist.log_prob(action)

            # Running per-episode totals used only for logging.
            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[step]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for proc, finished in enumerate(done):
                if not finished:
                    continue
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[proc].item())
                self.log_reshaped_return.append(
                    self.log_episode_reshaped_return[proc].item())
                self.log_num_frames.append(
                    self.log_episode_num_frames[proc].item())

            # Reset the totals of the processes that just finished an episode.
            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Value estimate of the observation following the last stored step,
        # used to bootstrap the advantage recursion.
        model_input = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                masked_memory = self.memory * self.mask.unsqueeze(1)
                _, bootstrap_value, _ = self.acmodel(model_input,
                                                     masked_memory)
            else:
                _, bootstrap_value = self.acmodel(model_input)

        last_step = self.num_frames_per_proc - 1
        advantage_decay = self.discount * self.gae_tau

        for t in reversed(range(self.num_frames_per_proc)):
            if t < last_step:
                next_mask = self.masks[t + 1]
                next_value = self.values[t + 1]
                next_advantage = self.advantages[t + 1]
            else:
                next_mask = self.mask
                next_value = bootstrap_value
                next_advantage = 0

            delta = (self.rewards[t] +
                     self.discount * next_value * next_mask - self.values[t])
            self.advantages[t] = (
                delta + advantage_decay * next_advantage * next_mask)

        # Build the experience batch by merging the two leading
        # (step, process) dimensions of every buffer, in storage order.

        def merge_leading(buffer):
            return buffer.view(-1, *buffer.shape[2:])

        exps = DictList()
        flat_obs = []
        for frame_obs in self.obss:
            flat_obs.extend(frame_obs)
        exps.obs = flat_obs
        if self.acmodel.recurrent:
            exps.memory = merge_leading(self.memories)
            exps.mask = merge_leading(self.masks).unsqueeze(1)
        exps.action = merge_leading(self.actions)
        exps.value = merge_leading(self.values)
        exps.reward = merge_leading(self.rewards)
        exps.advantage = merge_leading(self.advantages)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = merge_leading(self.log_probs)

        # Convert the collected observations into the model's input format.
        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Report at least `num_procs` entries, more if more episodes ended.
        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        # Keep only the newest `num_procs` entries for the next call.
        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        """Update the model parameters; to be implemented by subclasses.

        This base implementation does nothing and returns `None`.
        """
def train_environment_model(environment_class,
                            agent_name,
                            n_environments=16,
                            seed=0,
                            learning_rate=5e-4,
                            batch_per_environment=4,
                            observation_weight=1,
                            reward_weight=1,
                            note=None,
                            tensorboard=True,
                            train_for_n_frames=None,
                            log_interval=1,
                            store_interval=10):
    """Train an `EnvironmentModel` on transitions generated by a pre-trained agent.

    For every update, each of the parallel environments is stepped
    `batch_per_environment` times with actions sampled from the loaded agent.
    The environment model is then fit, with MSE losses, to predict the next
    observation image and the reward from the current observation image and
    the action taken.

    Parameters:
    ----------
    environment_class :
        passed to `instantiate_environment`, `environment_name` and
        `EnvironmentModel`
    agent_name : str
        name of the stored agent model; it is loaded from
        `utils.get_model_dir(agent_name)`
    n_environments : int
        number of environments stepped in parallel; environment `i` is seeded
        with `seed + 10000 * i`
    seed : int
        seed passed to `utils.seed` and used to derive the environment seeds
    learning_rate : float
        learning rate of the Adam optimizer
    batch_per_environment : int
        number of steps collected from every environment per update
    observation_weight : float
        weight of the observation loss in the total loss
    reward_weight : float
        weight of the reward loss in the total loss
    note : str or None
        optional tag inserted into the model name
    tensorboard : bool
        if true, logged values are also written with tensorboardX
    train_for_n_frames : int or None
        stop once this many frames were collected; `None` trains forever
    log_interval : int
        log every `log_interval` updates
    store_interval : int
        save the environment model every `store_interval` updates
    """
    saved_arguments = locals()

    date_suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    note = note + "_" if note else ""

    model_name = "EM_{}{}_s{}_{}".format(note,
                                         environment_name(environment_class),
                                         seed, date_suffix)
    model_directory = utils.get_model_dir(model_name)
    logger = utils.get_logger(model_directory)
    csv_file, csv_writer = utils.get_csv_writer(model_directory)
    logger.info("{}\n".format(saved_arguments))

    if tensorboard:
        from tensorboardX import SummaryWriter
        tensorboard_writer = SummaryWriter(model_directory)

    total_start_time = time.time()

    utils.seed(seed)
    agent_model = utils.load_model(utils.get_model_dir(agent_name))
    environment_model = EnvironmentModel(environment_class)
    optimizer = torch.optim.Adam(environment_model.parameters(),
                                 lr=learning_rate)

    logger.info("Using pre-trained agent model: {}\n".format(agent_name))
    logger.info("{}\n".format(agent_model))

    # Label the architecture dump with the environment model's own name
    # (the original reused `agent_name` here, which mislabelled the log).
    logger.info("Environment model architecture: {}\n".format(model_name))
    logger.info("{}\n".format(environment_model))

    environments = []
    for i in range(n_environments):
        environment = instantiate_environment(environment_class)
        environment.seed(seed + 10000 * i)
        environments.append(environment)

    observation_preprocessor = MyObssPreprocessor(
        environments[0].observation_space)
    environments = ParallelEnv(environments)

    n_updates = 0
    n_frames = 0
    # Tracked explicitly so the header is also written when log_interval > 1.
    csv_header_written = False

    last_observations = environments.reset()

    while train_for_n_frames is None or n_frames < train_for_n_frames:
        batch_start_time = time.time()

        old_observation_batch = torch.Tensor()
        action_batch = torch.LongTensor()
        new_observation_batch = torch.Tensor()
        reward_batch = torch.Tensor()

        for _step in range(batch_per_environment):
            preprocessed_observations = observation_preprocessor(
                last_observations)
            # The agent is only used to generate data; no gradients needed.
            with torch.no_grad():
                distributions, _, _ = agent_model(preprocessed_observations,
                                                  memory=None)
                actions = distributions.sample()
            new_observations, rewards, _, _ = environments.step(
                actions.numpy())

            old_observation_batch = torch.cat(
                (old_observation_batch, preprocessed_observations.image))
            action_batch = torch.cat((action_batch, actions))
            new_observation_batch = torch.cat(
                (new_observation_batch,
                 observation_preprocessor(new_observations).image))
            reward_batch = torch.cat(
                (reward_batch, torch.tensor(rewards, dtype=torch.float)))

            # Advance the rollout: the next action must be chosen from, and
            # paired with, the observation the environments are actually in.
            last_observations = new_observations

        optimizer.zero_grad()
        predicted_observations, predicted_rewards = environment_model(
            old_observation_batch, action_batch)

        # Axis order (0, 1, 2, 3) -> (0, 3, 1, 2), to match the layout of the
        # model's predicted observations.
        transposed_new_observation_batch = torch.transpose(
            torch.transpose(new_observation_batch, 1, 3), 2, 3)
        observation_loss = nn.functional.mse_loss(
            predicted_observations, transposed_new_observation_batch)

        reward_loss = nn.functional.mse_loss(predicted_rewards.squeeze(),
                                             reward_batch)
        total_loss = observation_loss * observation_weight + reward_loss * reward_weight
        total_loss.backward()
        optimizer.step()

        additional_frames = n_environments * batch_per_environment
        n_frames += additional_frames
        n_updates += 1

        if n_updates % log_interval == 0:
            batch_end_time = time.time()
            fps = additional_frames / (batch_end_time - batch_start_time)
            duration = int(time.time() - total_start_time)

            header = ["update", "frames", "FPS", "duration"]
            data = [n_updates, n_frames, fps, duration]

            header += ["observation_loss", "reward_loss", "total_loss"]
            data += [
                observation_loss.item(),
                reward_loss.item(),
                total_loss.item()
            ]

            logger.info(
                "U {} | F {:06} | FPS {:04.0f} | D {} | obsL {:.3f} | rewL {:.3f} | L {:.3f}"
                .format(*data))

            if not csv_header_written:
                csv_writer.writerow(header)
                csv_header_written = True
            csv_writer.writerow(data)
            csv_file.flush()

            if tensorboard:
                for field, value in zip(header, data):
                    tensorboard_writer.add_scalar(field, value, n_frames)

        if n_updates % store_interval == 0:
            utils.save_model(environment_model, model_directory)
            logger.info("Model successfully saved")
# Example #5
# 0
class BaseAlgo(ABC):
    """The base class for RL algorithms with an old (teacher) and a trained (student) policy."""

    def __init__(self, envs, pi_old, pi_train, num_frames_per_proc, discount, lr, gae_lambda, entropy_coef,
                 policy_reg_coef, value_reg_coef,
                 value_loss_coef, max_grad_norm, preprocess_obss, reshape_reward, iter_type):
        """
        Initializes a `BaseAlgo` instance.

        Parameters:
        ----------
        envs : list
            a list of environments that will be run in parallel
        pi_old : torch.Module
            the old model (=teacher); may be `None`
        pi_train : torch.Module
            the new model (=student).
            During 'normal' RL training, we execute this model, not the 'old' one.
        num_frames_per_proc : int
            the number of frames collected by every process for an update
        discount : float
            the discount for future rewards
        lr : float
            the learning rate for optimizers
        gae_lambda : float
            the lambda coefficient in the GAE formula
            ([Schulman et al., 2015](https://arxiv.org/abs/1506.02438))
        entropy_coef : float
            the weight of the entropy cost in the final objective
        policy_reg_coef : float
            stored as-is for subclasses; not used by this base class
        value_reg_coef : float
            stored as-is for subclasses; not used by this base class
        value_loss_coef : float
            the weight of the value loss in the final objective
        max_grad_norm : float
            gradient will be clipped to be at most this value
        preprocess_obss : function
            a function that takes observations returned by the environment
            and converts them into the format that the model can handle;
            falls back to `default_preprocess_obss` when falsy
        reshape_reward : function
            a function that shapes the reward, takes an
            (observation, action, reward, done) tuple as an input
        iter_type :  string
            which type of ITER to use: "none", "distill" (the normal one) or
            "kickstarting" (executing the student during distillation)
        """

        # Store parameters

        self.env = ParallelEnv(envs)
        self.pi_train = pi_train
        self.pi_train.train()
        self.pi_old = pi_old
        if self.pi_old is not None:
            self.pi_old.train()
        self.num_frames_per_proc = num_frames_per_proc
        self.discount = discount
        self.lr = lr
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.policy_reg_coef = policy_reg_coef
        self.value_reg_coef = value_reg_coef
        self.value_loss_coef = value_loss_coef
        self.max_grad_norm = max_grad_norm
        self.preprocess_obss = preprocess_obss or default_preprocess_obss
        self.reshape_reward = reshape_reward
        self.iter_type = iter_type

        # Store helpers values

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.num_procs = len(envs)
        self.num_frames = self.num_frames_per_proc * self.num_procs

        # Initialize experience values

        shape = (self.num_frames_per_proc, self.num_procs)

        self.reset_env()
        self.obss = [None]*(shape[0])
        self.masks = torch.zeros(*shape, device=self.device)
        self.actions = torch.zeros(*shape, device=self.device, dtype=torch.int)
        self.values_old = torch.zeros(*shape, device=self.device)
        self.values_train = torch.zeros(*shape, device=self.device)
        self.rewards = torch.zeros(*shape, device=self.device)
        self.advantages_old = torch.zeros(*shape, device=self.device)
        self.advantages_train = torch.zeros(*shape, device=self.device)
        self.log_probs = torch.zeros(*shape, device=self.device)

        # Initialize log values
        self.log_episode_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_reshaped_return = torch.zeros(self.num_procs, device=self.device)
        self.log_episode_num_frames = torch.zeros(self.num_procs, device=self.device)

        self.log_done_counter = 0
        self.log_return = [0] * self.num_procs
        self.log_reshaped_return = [0] * self.num_procs
        self.log_num_frames = [0] * self.num_procs

    def switch_models(self, new_pi):
        """Promote the current student to teacher and start training `new_pi`.

        Rebuilds `self.optimizer` over the parameters of the new student and,
        if `exp_config.also_update_old_policy` is set, of the new teacher too.

        NOTE: relies on `self.adam_eps`, which this base class never sets
        (presumably provided by the subclass), and on a module-level
        `exp_config`.
        """
        self.pi_old = self.pi_train
        self.pi_train = new_pi
        self.pi_train.train()
        # if self.pi_old is not None:
        #     self.pi_old.eval()

        parameters = list(self.pi_train.parameters())
        if exp_config.also_update_old_policy:
            parameters += list(self.pi_old.parameters())

        self.optimizer = torch.optim.Adam(parameters, self.lr, eps=self.adam_eps)

    def execute_model(self, alpha, **kwargs):
        """Execute model.

        Args:
            alpha: float between 0 and 1. If it's 0, we know we're not distilling and
                the old policy does not need to be executed.
            **kwargs: Other arguments for the `compute` function of the model. Should at least contain `obs`.

        Returns:
            dict with keys 'dist_old', 'value_old', 'dist_train', 'value_train' and
            'dist_execute'. 'dist_execute' is the distribution to act with: the student's
            when not distilling, when there is no old policy, or when
            iter_type == 'kickstarting'; the old policy's when iter_type == 'distill'.
            'dist_old' and 'value_old' are None whenever the old policy was not executed.
        """

        if alpha == 0 or self.pi_old is None:
            # Either we're not currently distilling, or there is no old policy
            # to distill from -> only the student is executed.
            dist_train, value_train = self.pi_train.compute(**kwargs)
            dist_old, value_old = None, None
            dist_execute = dist_train

        else:
            dist_old, value_old = self.pi_old.compute(**kwargs)
            dist_train, value_train = self.pi_train.compute(**kwargs)

            assert self.iter_type in ["kickstarting", "distill"]
            dist_execute = dist_train if self.iter_type == "kickstarting" else dist_old

        return {"dist_execute": dist_execute,
                "dist_old": dist_old, "value_old": value_old,
                "dist_train": dist_train, "value_train": value_train}

    def reset_env(self):
        """Reset all environments and mark every process as 'not done'."""
        self.obs = self.env.reset()
        self.mask = torch.ones(self.num_procs, device=self.device)

    def _compute_gae(self, values, advantages, bootstrap_value):
        """Fill `advantages` in place with GAE estimates for `values`.

        Walks the rollout backwards; `bootstrap_value` is the value estimate of
        the observation following the last collected frame.
        """
        last_frame = self.num_frames_per_proc - 1
        for t in reversed(range(self.num_frames_per_proc)):
            is_last = t == last_frame
            next_mask = self.mask if is_last else self.masks[t + 1]
            next_value = bootstrap_value if is_last else values[t + 1]
            next_advantage = 0 if is_last else advantages[t + 1]

            delta = self.rewards[t] + self.discount * next_value * next_mask - values[t]
            advantages[t] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    def collect_experiences(self, alpha):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.
        Args
        ------
        alpha: float between 0 and 1
            used to determine which policy to execute, based on whether
            we're currently distilling or not and what iter_type is

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
            `value_old` and `returnn_old` are only present when alpha > 0
            and an old policy exists.
        logs : dict
            Per-episode returns, reshaped returns and frame counts of the
            recently finished episodes, plus the number of frames collected.
        """

        # Old-policy values are only produced when distilling with a teacher.
        use_old = alpha > 0 and self.pi_old is not None

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                model_results = self.execute_model(alpha=alpha, obs=preprocessed_obs)
                dist = model_results['dist_execute']
                value_old = model_results['value_old']
                value_train = model_results['value_train']
            action = dist.sample()

            obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Update experiences values
            self.obss[i] = self.obs
            self.obs = obs
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)

            self.actions[i] = action
            self.values_train[i] = value_train

            if use_old:
                self.values_old[i] = value_old

            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            # Update log values
            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

            # A distinct index name keeps the frame index `i` intact.
            for proc_idx, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[proc_idx].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[proc_idx].item())
                    self.log_num_frames.append(self.log_episode_num_frames[proc_idx].item())

            # Zero the running statistics of the processes whose episode ended
            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            model_results = self.execute_model(alpha=alpha, obs=preprocessed_obs)
            next_value_old = model_results['value_old']
            next_value_train = model_results['value_train']

        if use_old:
            self._compute_gae(self.values_old, self.advantages_old, next_value_old)
        self._compute_gae(self.values_train, self.advantages_train, next_value_train)

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [self.obss[i][j]
                    for j in range(self.num_procs)
                    for i in range(self.num_frames_per_proc)]
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.advantage_old = self.advantages_old.transpose(0, 1).reshape(-1)
        exps.advantage_train = self.advantages_train.transpose(0, 1).reshape(-1)

        if use_old:
            exps.value_old = self.values_old.transpose(0, 1).reshape(-1)
            exps.returnn_old = exps.value_old + exps.advantage_old
        exps.value_train = self.values_train.transpose(0, 1).reshape(-1)
        exps.returnn_train = exps.value_train + exps.advantage_train

        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences
        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values
        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log

    @abstractmethod
    def update_parameters(self):
        """Abstract hook that every concrete algorithm must override."""
        pass