Code Example #1
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)
            action = dist.sample()

            obs, reward, done, _ = self.env.step(action.cpu().numpy())

            self.step_counter = [x + 1 for x in self.step_counter]

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward,
                                               device=self.device,
                                               dtype=torch.float)
            self.log_probs[i] = dist.log_prob(action)

            # get discounts
            discounts = [self.discount**(x - 1) for x in self.step_counter]
            discounts = torch.tensor(discounts,
                                     device=self.device,
                                     dtype=torch.float)

            # Update log values

            self.log_episode_return += torch.tensor(
                reward, device=self.device, dtype=torch.float) * discounts
            self.log_episode_reshaped_return += self.rewards[i] * discounts
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())
                    self.step_counter[i] = 0

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)
        logs = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, logs
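
The backward pass over frames above is the standard GAE(lambda) recursion: delta_t = r_t + discount * mask_{t+1} * V(s_{t+1}) - V(s_t) and A_t = delta_t + discount * gae_lambda * mask_{t+1} * A_{t+1}, where mask_{t+1} is 0 if the episode ended at step t. Below is a minimal standalone sketch of that recursion over (T, P)-shaped tensors; the function name and signature are illustrative and not part of torch-ac.

import torch


def compute_gae(rewards, values, masks, last_value, last_mask, discount, gae_lambda):
    """Standalone sketch (not part of torch-ac) of the backward GAE recursion above.

    rewards, values, masks: tensors of shape (T, P); last_value and last_mask
    are the bootstrap value and mask for the state after the last frame.
    """
    advantages = torch.zeros_like(rewards)
    next_value, next_mask = last_value, last_mask
    next_advantage = torch.zeros_like(last_value)
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + discount * next_value * next_mask - values[t]
        advantages[t] = delta + discount * gae_lambda * next_advantage * next_mask
        next_value, next_mask, next_advantage = values[t], masks[t], advantages[t]
    return advantages

The return target used for the value loss then follows as value + advantage, exactly as in exps.returnn above.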
Code Example #2
File: base.py  Project: klemenkotar/torch-ac
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        gazes = []
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            # Initialize gaze to none - this is only used with variable view
            gaze = None
            with torch.no_grad():
                if self.acmodel.recurrent:
                    if self.variable_view:
                        dist, gaze, value, memory = self.acmodel(
                            preprocessed_obs,
                            self.memory * self.mask.unsqueeze(1))
                    else:
                        dist, value, memory = self.acmodel(
                            preprocessed_obs,
                            self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)

            # If variable view is enabled, the last two fields of the action vector are gaze offsets, not actions
            if self.variable_view:
                action = dist.sample()
                gaze_action = torch.stack(
                    (3.0 * gaze[0].sample(), 3.0 * gaze[1].sample()), dim=1)
                action_data = torch.cat(
                    (action.view([-1, 1]), gaze_action.long()), dim=1)
                obs, reward, done, _ = self.env.step(action_data.cpu().numpy())
            else:
                action = dist.sample()
                obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            # Compute joint (action, x gaze, y gaze) log probs if using variable view
            if self.variable_view:
                log_probs = []
                gaze_action /= 3.0
                for j in range(gaze_action.shape[0]):
                    gaze_dist = gaze[0].probs[j].ger(gaze[1].probs[j]).view(
                        [-1])
                    full_action_space_dist = Categorical(
                        (dist.probs[j].ger(gaze_dist)).view(-1))
                    action[j] = action[j] * 22 + gaze_action[j][
                        0] * 11 + gaze_action[j][1]
                    log_prob = full_action_space_dist.log_prob(action[j])
                    log_probs.append(log_prob)
                self.log_probs[i] = torch.Tensor(log_probs)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

            if self.variable_view:
                gazes.append(gaze_action)

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                if self.variable_view:
                    _, _, next_value, _ = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    _, next_value, _ = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # If using variable view store the gaze
        if self.variable_view:
            gazes = torch.cat(gazes)
            exps.gaze = gazes

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        logs = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, logs, gazes * 3
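
Code Example #2 factorizes the policy into an action head and two gaze heads and then builds a single Categorical over the flattened joint space from outer products of the per-head probabilities. The toy sketch below shows why that construction yields the same log probability as summing the per-head log probabilities; the head sizes (3 bins each) and variable names are made up for illustration and are not taken from the project (torch.outer is the current name for .ger).

import torch
from torch.distributions import Categorical

# Toy sketch; per-head probabilities for a single sample, sizes are illustrative only.
action_probs = torch.tensor([0.7, 0.2, 0.1])
gaze_x_probs = torch.tensor([0.5, 0.3, 0.2])
gaze_y_probs = torch.tensor([0.6, 0.3, 0.1])

# Joint distribution over the flattened (action, gaze_x, gaze_y) space built
# from outer products, mirroring the probs[j].ger(...) construction above.
gaze_joint = torch.outer(gaze_x_probs, gaze_y_probs).reshape(-1)              # 9 entries
full_joint = Categorical(torch.outer(action_probs, gaze_joint).reshape(-1))   # 27 entries

# For independent heads, the joint log prob of a flattened index equals the
# sum of the per-head log probs.
a, gx, gy = 2, 1, 0
flat_index = a * 9 + gx * 3 + gy
expected = (Categorical(action_probs).log_prob(torch.tensor(a))
            + Categorical(gaze_x_probs).log_prob(torch.tensor(gx))
            + Categorical(gaze_y_probs).log_prob(torch.tensor(gy)))
assert torch.isclose(full_joint.log_prob(torch.tensor(flat_index)), expected)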
Code Example #3
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            if self.continuous_action:
                self.obs = [
                    self.model.scaler.transform(self.obs[0].reshape(
                        1, -1)).reshape(-1).astype('float64')
                ]

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)

            with torch.no_grad():
                if self.model.recurrent:
                    dist, value, embedding, _, successor, _, memory = self.model(
                        preprocessed_obs,
                        memory=self.memory * self.mask.unsqueeze(1))
                    _, target_value, target_embedding, _, target_successor, _, _ = self.target(
                        preprocessed_obs,
                        memory=self.memory * self.mask.unsqueeze(1))
                # target
                else:
                    dist, value, embedding, _, successor, _, _ = self.model(
                        preprocessed_obs)
                    _, target_value, target_embedding, _, target_successor, _, _ = self.target(
                        preprocessed_obs)

            if self.continuous_action:
                # Should this (eps + stochastic policy) be done? Or use (eps + det policy) or just stochastic policy?
                epsample = random.random()
                eps_threshold = 0.02 + (0.9 - 0.02) * math.exp(
                    -1. * self.total_updates / 200)
                if epsample > eps_threshold:
                    noise_dist = torch.distributions.normal.Normal(0, 0.03)
                    action = dist.sample() + noise_dist.sample()
                    action = torch.clamp(action, self.env.envs[0].min_action,
                                         self.env.envs[0].max_action)
                else:
                    action = torch.Tensor(
                        self.env.envs[0].action_space.sample())

                obs, reward, done, _ = self.env.step([action.cpu().numpy()])
                obs = (obs[0].reshape(1, -1))
            else:
                action = dist.sample()
                obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Update experiences values
            self.replay_memory.push((self.FloatTensor([obs[0]['image']]),
                                     self.FloatTensor([reward])))

            self.obss[i] = self.obs
            self.obs = obs
            if self.model.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            self.target_values[i] = target_value
            self.embeddings[i] = embedding
            self.target_embeddings[i] = target_embedding
            self.successors[i] = successor
            self.target_successors[i] = target_successor
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        if self.continuous_action:
            # Assuming flat observations for the continuous-action case:
            # this is true for the MountainCar example but may not hold in general.
            # Ideally the continuous-action code should be modified to handle flat or image input,
            # the use of a scaler should be an option to train.py,
            # and either checks should be added here to do the following
            # or a wrapper that does the scaling should be created and set up in train.py.
            # Note: self.obs is a list of observations, so cast the array itself.
            self.obs[0] = self.model.scaler.transform(self.obs[0].reshape(
                1, -1)).reshape(-1).astype('float32')

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.model.recurrent:
                _, next_value, _, _, next_successor, _, _ = self.target(
                    preprocessed_obs,
                    memory=self.memory * self.mask.unsqueeze(1))  # target
            else:
                _, next_value, _, _, next_successor, _, _ = self.target(
                    preprocessed_obs)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_successor = self.target_successors[
                i + 1] if i < self.num_frames_per_proc - 1 else next_successor
            next_value = self.target_values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_SR_advantage = self.SR_advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0
            next_V_advantage = self.V_advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            SR_delta = self.target_embeddings[i] + (
                self.discount * next_successor *
                next_mask.reshape(-1, 1)) - self.successors[i]
            self.SR_advantages[i] = SR_delta + (
                self.discount * self.gae_lambda * next_SR_advantage *
                next_mask.reshape(-1, 1))

            V_delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.V_advantages[
                i] = V_delta + self.discount * self.gae_lambda * next_V_advantage * next_mask

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.model.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.SR_advantage = self.SR_advantages.transpose(0, 1).reshape(
            -1, self.model.embedding_size)
        exps.successor = self.successors.transpose(0, 1).reshape(
            -1, self.model.embedding_size)
        exps.successorn = exps.successor + exps.SR_advantage
        exps.V_advantage = self.V_advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.V_advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        logs = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, logs
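
Code Example #3 runs the same GAE-style recursion twice: once on scalar rewards and values (V_advantages) and once on the embedding/successor-feature pair (SR_advantages), where the embedding phi(s_t) plays the role of the reward and the successor features psi(s_t) the role of the value function. A standalone sketch of the successor-feature recursion is below; the function name and tensor layout are assumptions based on the code, and in the code above the embeddings and bootstrap successors come from the target network.

import torch


def compute_sr_gae(embeddings, successors, masks, last_successor, last_mask,
                   discount, gae_lambda):
    """Standalone sketch (not the project's code) of the SR recursion above.

    embeddings, successors: tensors of shape (T, P, D); masks: (T, P);
    last_successor: (P, D) bootstrap; last_mask: (P,).
    """
    sr_advantages = torch.zeros_like(successors)
    next_successor, next_mask = last_successor, last_mask
    next_advantage = torch.zeros_like(last_successor)
    for t in reversed(range(embeddings.shape[0])):
        m = next_mask.reshape(-1, 1)  # broadcast the (P,) mask over the D dimension
        delta = embeddings[t] + discount * next_successor * m - successors[t]
        sr_advantages[t] = delta + discount * gae_lambda * next_advantage * m
        next_successor, next_mask = successors[t], masks[t]
        next_advantage = sr_advantages[t]
    return sr_advantages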
Code Example #4
File: multiQ.py  Project: mcavolowsky/torch-ac
    def collect_experiences_old(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            with torch.no_grad():
                if self.model.recurrent:
                    value, memory = self.model(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    value = self.model(preprocessed_obs)
            action = self.pareto_action(value, self.weights)
            eps_mask = torch.rand(action.shape, device=action.device) < self.eps
            action[eps_mask] = torch.randint(0,
                                             self.env.action_space.n,
                                             (int(eps_mask.sum()), ),
                                             device=action.device)

            obs, reward, done, _ = self.env.step(action.cpu().numpy())

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.model.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(
                        self.log_episode_return[i])  # vector-valued return, so no .item()
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i])  # vector-valued return, so no .item()
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

                    # reroll the weights for that episode
                    if self.reward_size == 1:
                        self.weights[i, 0] = 1
                    elif self.reward_size == 2:
                        self.weights[i, 0] = torch.rand(1)
                        self.weights[i, 1] = 1 - self.weights[i, 0]
                    else:
                        raise NotImplementedError

            self.log_episode_return = (self.log_episode_return.T * self.mask).T
            self.log_episode_reshaped_return = (
                self.log_episode_reshaped_return.T * self.mask).T
            self.log_episode_num_frames *= self.mask

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.model.recurrent:
                next_value, _ = self.eval_model(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                next_value = self.eval_model(preprocessed_obs)
            next_value_clipped = torch.clip(next_value,
                                            *self.env.envs[0].reward_range)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_mask = torch.vstack([next_mask] * self.reward_size).T

            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value

            self.expected_values[i] = self.rewards[i] + (
                self.pareto_rewards(next_value_clipped, self.weights) *
                (self.discount * next_mask))
            # self.advantages[i] = delta + (next_advantage.T * (self.discount * self.gae_lambda * next_mask)).T

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.model.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1, self.reward_size)
        exps.reward = self.rewards.transpose(0,
                                             1).reshape(-1, self.reward_size)
        exps.exp_value = self.expected_values.transpose(0, 1).reshape(
            -1, self.reward_size)
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        logs = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, logs
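
Code Example #4 rerolls the scalarization weights for an environment whenever one of its episodes ends, but only handles reward_size 1 and 2 explicitly. One way to generalize the reroll to any number of objectives is a Dirichlet(1, ..., 1) draw, which keeps the weights nonnegative and summing to 1 and, for two objectives, matches the uniform (w, 1 - w) scheme above. The sketch below is an illustration under that assumption, not the project's implementation.

import torch


def reroll_weights(weights, env_idx):
    """Illustrative sketch, not the project's code for reward_size > 2.

    weights: tensor of shape (num_procs, reward_size). Samples a fresh
    scalarization weight vector for the environment at env_idx.
    """
    reward_size = weights.shape[1]
    if reward_size == 1:
        weights[env_idx, 0] = 1.0
    else:
        # Dirichlet(1, ..., 1): uniform over the probability simplex.
        weights[env_idx] = torch.distributions.Dirichlet(
            torch.ones(reward_size, device=weights.device)).sample()
    return weights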
Code Example #5
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.
        """

        hasMesses = False
        hasPerf = False
        hasPerfFull = False
        hasButtonPresses = False
        hasPhonesCleaned = False
        hasDirtCleaned = False

        addedAllMyData = False
        loggedAllMyData = False
        allMyData = None

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs,
                                                    device=self.device)
            with torch.no_grad():
                if self.acmodel.recurrent:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value = self.acmodel(preprocessed_obs)
            action = dist.sample()

            obs, reward, done, info = self.env.step(action.cpu().numpy())

            if 'messes_cleaned' in info[0]:
                hasMesses = True
                messes = tuple(d['messes_cleaned'] for d in info)

            if 'performance_full' in info[0]:
                hasPerfFull = True
                performancesFULL = tuple(d['performance_full'] for d in info)

            if 'performance' in info[0]:
                hasPerf = True
                performances = tuple(d['performance'] for d in info)

            if 'button_presses' in info[0]:
                hasButtonPresses = True
                button_presses = tuple(d['button_presses'] for d in info)

            if 'phones_cleaned' in info[0]:
                hasPhonesCleaned = True
                phones_cleaned = tuple(d['phones_cleaned'] for d in info)

            if 'dirt_cleaned' in info[0]:
                hasDirtCleaned = True
                dirt_cleaned = tuple(d['dirt_cleaned'] for d in info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs
            if self.acmodel.recurrent:
                self.memories[i] = self.memory
                self.memory = memory
            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(
                done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)

                assert False
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
                if hasPerf:
                    self.rewards_PERFORMANCE[i] = torch.tensor(
                        performances, device=self.device)
                if hasButtonPresses:
                    self.rewards_BUTTON_PRESSES[i] = torch.tensor(
                        button_presses, device=self.device)
                if hasPhonesCleaned:
                    self.rewards_PHONES_CLEANED[i] = torch.tensor(
                        phones_cleaned, device=self.device)
                if hasDirtCleaned:
                    self.rewards_DIRT_CLEANED[i] = torch.tensor(
                        dirt_cleaned, device=self.device)

            self.log_probs[i] = dist.log_prob(action)

            # Update log values
            if hasMesses:
                self.log_episode_return_MESSES += torch.tensor(
                    messes, device=self.device, dtype=torch.float)
            # Update log values
            if hasPerfFull:
                self.log_episode_return_PERFORMANCE_FULL += torch.tensor(
                    performancesFULL, device=self.device, dtype=torch.float)
            # Update log values
            if hasPerf:
                self.log_episode_return_PERFORMANCE += torch.tensor(
                    performances, device=self.device, dtype=torch.float)
            # Update log values
            if hasButtonPresses:
                self.log_episode_return_BUTTON_PRESSES += torch.tensor(
                    button_presses, device=self.device, dtype=torch.float)
            # Update log values
            if hasPhonesCleaned:
                self.log_episode_return_PHONES_CLEANED += torch.tensor(
                    phones_cleaned, device=self.device, dtype=torch.float)
            # Update log values
            if hasDirtCleaned:
                self.log_episode_return_DIRT_CLEANED += torch.tensor(
                    dirt_cleaned, device=self.device, dtype=torch.float)

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]

            self.log_episode_reshaped_return_PERFORMANCE += self.rewards_PERFORMANCE[
                i]
            self.log_episode_reshaped_return_BUTTON_PRESSES += self.rewards_BUTTON_PRESSES[
                i]
            self.log_episode_reshaped_return_PHONES_CLEANED += self.rewards_PHONES_CLEANED[
                i]
            self.log_episode_reshaped_return_DIRT_CLEANED += self.rewards_DIRT_CLEANED[
                i]

            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1

                    if hasMesses:
                        self.log_return_MESSES.append(
                            self.log_episode_return_MESSES[i].item())

                    if hasPerfFull:
                        self.log_return_PERFORMANCE_FULL.append(
                            self.log_episode_return_PERFORMANCE_FULL[i].item())

                    if hasPerf:
                        self.log_return_PERFORMANCE.append(
                            self.log_episode_return_PERFORMANCE[i].item())
                        self.log_reshaped_return_PERFORMANCE.append(
                            self.log_episode_reshaped_return_PERFORMANCE[i].
                            item())

                    if hasButtonPresses:
                        self.log_return_BUTTON_PRESSES.append(
                            self.log_episode_return_BUTTON_PRESSES[i].item())
                        self.log_reshaped_return_BUTTON_PRESSES.append(
                            self.log_episode_reshaped_return_BUTTON_PRESSES[i].
                            item())

                    if hasPhonesCleaned:
                        self.log_return_PHONES_CLEANED.append(
                            self.log_episode_return_PHONES_CLEANED[i].item())
                        self.log_reshaped_return_PHONES_CLEANED.append(
                            self.log_episode_reshaped_return_PHONES_CLEANED[i].
                            item())

                    if hasDirtCleaned:
                        self.log_return_DIRT_CLEANED.append(
                            self.log_episode_return_DIRT_CLEANED[i].item())
                        self.log_reshaped_return_DIRT_CLEANED.append(
                            self.log_episode_reshaped_return_DIRT_CLEANED[i].
                            item())

                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())

                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_return_PERFORMANCE_FULL *= self.mask
            self.log_episode_return_MESSES *= self.mask
            self.log_episode_return_PERFORMANCE *= self.mask
            self.log_episode_return_BUTTON_PRESSES *= self.mask
            self.log_episode_return_PHONES_CLEANED *= self.mask
            self.log_episode_return_DIRT_CLEANED *= self.mask

            self.log_episode_reshaped_return *= self.mask
            self.log_episode_reshaped_return_PERFORMANCE *= self.mask
            self.log_episode_reshaped_return_BUTTON_PRESSES *= self.mask
            self.log_episode_reshaped_return_PHONES_CLEANED *= self.mask
            self.log_episode_reshaped_return_DIRT_CLEANED *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value = self.acmodel(preprocessed_obs)

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Define experiences:
        #   the whole experience is the concatenation of the experience
        #   of each process.
        # In comments below:
        #   - T is self.num_frames_per_proc,
        #   - P is self.num_procs,
        #   - D is the dimensionality.

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        if self.acmodel.recurrent:
            # T x P x D -> P x T x D -> (P * T) x D
            exps.memory = self.memories.transpose(0, 1).reshape(
                -1, *self.memories.shape[2:])
            # T x P -> P x T -> (P * T) x 1
            exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        logs = {
            "return_per_episode":
            self.log_return[-keep:],
            "reshaped_return_per_episode":
            self.log_reshaped_return[-keep:],
            "num_frames_per_episode":
            self.log_num_frames[-keep:],
            "num_frames":
            self.num_frames,
            "messes_per_episode":
            self.log_return_MESSES[-keep:],
            "performance_full_per_episode":
            self.log_return_PERFORMANCE_FULL[-keep:],
            "performance_per_episode":
            self.log_return_PERFORMANCE[-keep:],
            "reshaped_performance_per_episode":
            self.log_reshaped_return_PERFORMANCE[-keep:],
            "buttons_per_episode":
            self.log_return_BUTTON_PRESSES[-keep:],
            "reshaped_buttons_per_episode":
            self.log_reshaped_return_BUTTON_PRESSES[-keep:],
            "phones_per_episode":
            self.log_return_PHONES_CLEANED[-keep:],
            "reshaped_phones_per_episode":
            self.log_reshaped_return_PHONES_CLEANED[-keep:],
            "dirt_per_episode":
            self.log_return_DIRT_CLEANED[-keep:],
            "reshaped_dirt_per_episode":
            self.log_reshaped_return_DIRT_CLEANED[-keep:],
            "numberOfPermutes":
            info[0]['numberOfPermutes'],
            "buttonValue":
            info[0]['buttonValue'],
            "episodesDone":
            self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, logs
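
Code Example #5 tracks each extra statistic from the env's info dicts with its own hasX flag, per-episode accumulator, and log list. The sketch below shows the same bookkeeping driven generically by the keys of info itself; the function and argument names are made up for illustration, it assumes the tracked info values are numeric, and it is not code from the project.

import torch


def update_info_logs(episode_totals, finished_logs, info, done, device="cpu"):
    """Generic sketch, not the project's code.

    episode_totals: dict mapping key -> (num_procs,) running totals for the
    current episodes; finished_logs: dict mapping key -> list of totals of
    finished episodes; info: per-process list of dicts from the vectorized
    env step; done: per-process list of booleans.
    """
    num_procs = len(info)
    for key in info[0]:
        step_values = torch.tensor(
            [float(proc_info[key]) for proc_info in info], device=device)
        totals = episode_totals.setdefault(
            key, torch.zeros(num_procs, device=device))
        totals += step_values
        for p, done_ in enumerate(done):
            if done_:
                # Record the finished episode's total, then reset that slot.
                finished_logs.setdefault(key, []).append(totals[p].item())
                totals[p] = 0.0
    return episode_totals, finished_logs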