Example #1
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps0, log0, exps1, log1
            Experiences (`DictList`) and log stats (`dict`) for the sender
            agent (0) and the receiver agent (1), in that order.
            Each experience attribute, e.g. `exps0.reward`, has a shape
            (self.num_frames_per_proc * num_envs, ...). The k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
            Each log dict contains per-episode returns, reshaped returns,
            frame counts, and the number of episodes completed.

        """
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs0 = self.preprocess_obss(self.obs0,
                                                     device=self.device)

            preprocessed_obs1 = self.preprocess_obss(self.obs1,
                                                     device=self.device)

            with torch.no_grad():

                model_results0 = self.acmodel0(
                    preprocessed_obs1,
                    self.memory0 * self.mask0.unsqueeze(1))  ### NOTE

                dist0 = model_results0['dist']  ### NOTE
                value0 = model_results0['value']
                memory0 = model_results0['memory']
                msg0 = model_results0['message']
                dists_speaker0 = model_results0['dists_speaker']
                extra_predictions0 = model_results0['extra_predictions']
                #self.rng_states0[i] = model_results0['rng_states']
                #if torch.cuda.is_available():
                #    self.cuda_rng_states0[i] = model_results0['cuda_rng_states']

                preprocessed_obs0.instr *= 0
                preprocessed_obs0.image *= 0
                model_results1 = self.acmodel1(
                    preprocessed_obs0,
                    self.memory1 * self.mask1.unsqueeze(1),
                    msg=(msg0.transpose(0, 1) *
                         self.mask1.unsqueeze(1).unsqueeze(2)).transpose(
                             0, 1))  ### NOTE

                dist1 = model_results1['dist']
                value1 = model_results1['value']
                memory1 = model_results1['memory']
                msg1 = model_results1['message']
                dists_speaker1 = model_results1['dists_speaker']
                extra_predictions1 = model_results1['extra_predictions']
                #self.rng_states1[i] = model_results1['rng_states']
                #if torch.cuda.is_available():
                #    self.cuda_rng_states1[i] = model_results1['cuda_rng_states']

            #state = torch.get_rng_state()
            action0 = dist0.sample()

            #torch.set_rng_state(state)
            action1 = dist1.sample()

            obs0, reward0, done0, env_info0 = self.env0.step(
                action0.cpu().numpy())

            obs1, reward1, done1, env_info1 = self.env1.step(
                action1.cpu().numpy())

            # mask any rewards based on (previous) been_done
            rewardos0 = [0] * self.num_procs
            rewardos1 = [0] * self.num_procs
            for j in range(self.num_procs):
                rewardos0[j] = reward0[j] * (1 - self.been_done0[j].item())
                rewardos1[j] = reward1[j] * (1 - self.been_done1[j].item())

            reward0 = tuple(rewardos0)
            reward1 = tuple(rewardos1)

            #reward0 = tuple(0.5*r0 + 0.5*r1 for r0, r1 in zip(reward0, reward1)) ### NOTE
            #reward1 = reward0

            # reward sender agent (0) equally for success of receiver agent (1) ### NOTE
            reward0 = reward1

            self.been_done0 = (1 - (1 - self.been_done0) * (1 - torch.tensor(
                done0, device=self.device, dtype=torch.float)))
            self.been_done1 = (1 - (1 - self.been_done1) * (1 - torch.tensor(
                done1, device=self.device, dtype=torch.float)))
            both_done = self.been_done0 * self.been_done1

            # reset if receiver agent (1) is done ### NOTE
            both_done = self.been_done1

            obs0 = self.env0.sync_reset(both_done, obs0)
            obs1 = self.env1.sync_reset(both_done, obs1)

            if self.aux_info:
                env_info0 = self.aux_info_collector0.process(env_info0)
                # env_info0 = self.process_aux_info0(env_info0)

                env_info1 = self.aux_info_collector1.process(env_info1)
                # env_info1 = self.process_aux_info1(env_info1)

            # Update experiences values

            self.obss0[i] = self.obs0
            self.obs0 = obs0

            self.obss1[i] = self.obs1
            self.obs1 = obs1

            self.memories0[i] = self.memory0
            self.memory0 = memory0

            self.memories1[i] = self.memory1
            self.memory1 = memory1

            self.msgs0[i] = self.msg0
            self.msg0 = msg0

            self.msgs1[i] = self.msg1
            self.msg1 = msg1

            self.msgs_out0[i] = msg0

            self.msgs_out1[i] = msg1

            self.masks0[i] = self.mask0
            #self.mask0       = 1 - torch.tensor(done0, device=self.device, dtype=torch.float)
            self.mask0 = 1 - both_done
            self.actions0[i] = action0
            self.values0[i] = value0
            if self.reshape_reward is not None:
                self.rewards0[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs0, action0, reward0, done0)
                ],
                                                device=self.device)
            else:
                self.rewards0[i] = torch.tensor(reward0, device=self.device)
            self.log_probs0[i] = dist0.log_prob(action0)
            self.speaker_log_probs0[i] = self.acmodel0.speaker_log_prob(
                dists_speaker0, msg0)

            self.masks1[i] = self.mask1
            #self.mask1       = 1 - torch.tensor(done1, device=self.device, dtype=torch.float)
            self.mask1 = 1 - both_done
            self.actions1[i] = action1
            self.values1[i] = value1
            if self.reshape_reward is not None:
                self.rewards1[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs1, action1, reward1, done1)
                ],
                                                device=self.device)
            else:
                self.rewards1[i] = torch.tensor(reward1, device=self.device)
            self.log_probs1[i] = dist1.log_prob(action1)
            self.speaker_log_probs1[i] = self.acmodel1.speaker_log_prob(
                dists_speaker1, msg1)

            if self.aux_info:
                self.aux_info_collector0.fill_dictionaries(
                    i, env_info0, extra_predictions0)

                self.aux_info_collector1.fill_dictionaries(
                    i, env_info1, extra_predictions1)

            # Update log values

            self.log_episode_return0 += torch.tensor(reward0,
                                                     device=self.device,
                                                     dtype=torch.float)
            self.log_episode_reshaped_return0 += self.rewards0[i]

            self.log_episode_return1 += torch.tensor(reward1,
                                                     device=self.device,
                                                     dtype=torch.float)
            self.log_episode_reshaped_return1 += self.rewards1[i]

            self.log_episode_num_frames0 += torch.ones(self.num_procs,
                                                       device=self.device)
            self.log_episode_num_frames1 += torch.ones(self.num_procs,
                                                       device=self.device)

            #for i, done_ in enumerate(done0):
            for i in range(self.num_procs):
                #if done_:
                if both_done[i]:
                    self.log_done_counter0 += 1
                    self.log_return0.append(self.log_episode_return0[i].item())
                    self.log_reshaped_return0.append(
                        self.log_episode_reshaped_return0[i].item())
                    self.log_num_frames0.append(
                        self.log_episode_num_frames0[i].item())

                    #for i, done_ in enumerate(done1):
                    #if done_:
                    self.log_done_counter1 += 1
                    self.log_return1.append(self.log_episode_return1[i].item())
                    self.log_reshaped_return1.append(
                        self.log_episode_reshaped_return1[i].item())
                    self.log_num_frames1.append(
                        self.log_episode_num_frames1[i].item())

            # if both are done, reset both to not done
            self.been_done0 *= (1 - both_done)
            self.been_done1 *= (1 - both_done)

            self.log_episode_return0 *= self.mask0
            self.log_episode_reshaped_return0 *= self.mask0
            self.log_episode_num_frames0 *= self.mask0

            self.log_episode_return1 *= self.mask1
            self.log_episode_reshaped_return1 *= self.mask1
            self.log_episode_num_frames1 *= self.mask1

        # Add advantage and return to experiences

        preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device)
        preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device)

        with torch.no_grad():
            tmp = self.acmodel0(preprocessed_obs1,
                                self.memory0 *
                                self.mask0.unsqueeze(1))  ### NOTE
            next_value0 = tmp['value']

            preprocessed_obs0.instr *= 0
            preprocessed_obs0.image *= 0
            next_value1 = self.acmodel1(
                preprocessed_obs0,
                self.memory1 * self.mask1.unsqueeze(1),
                msg=(tmp['message'].transpose(0, 1) *
                     self.mask1.unsqueeze(1).unsqueeze(2)).transpose(
                         0, 1))['value']  ### NOTE

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask0 = self.masks0[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask0
            next_value0 = self.values0[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value0
            next_advantage0 = self.advantages0[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            next_mask1 = self.masks1[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask1
            next_value1 = self.values1[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value1
            next_advantage1 = self.advantages1[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta0 = self.rewards0[
                i] + self.discount * next_value0 * next_mask0 - self.values0[i]
            self.advantages0[
                i] = delta0 + self.discount * self.gae_lambda * next_advantage0 * next_mask0

            delta1 = self.rewards1[
                i] + self.discount * next_value1 * next_mask1 - self.values1[i]
            self.advantages1[
                i] = delta1 + self.discount * self.gae_lambda * next_advantage1 * next_mask1

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps0 = DictList()
        exps0.obs = [
            self.obss0[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        exps1 = DictList()
        exps1.obs = [
            self.obss1[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps0.memory = self.memories0.transpose(0, 1).reshape(
            -1, *self.memories0.shape[2:])

        exps1.memory = self.memories1.transpose(0, 1).reshape(
            -1, *self.memories1.shape[2:])

        exps0.message = self.msgs0.transpose(1, 2).transpose(0, 1).reshape(
            -1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols)

        exps1.message = self.msgs1.transpose(1, 2).transpose(0, 1).reshape(
            -1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols)

        exps0.message_out = self.msgs_out0.transpose(1, 2).transpose(
            0, 1).reshape(-1, self.acmodel0.max_len_msg,
                          self.acmodel0.num_symbols)

        exps1.message_out = self.msgs_out1.transpose(1, 2).transpose(
            0, 1).reshape(-1, self.acmodel1.max_len_msg,
                          self.acmodel1.num_symbols)

        #exps0.rng_states = self.rng_states0.transpose(0, 1).reshape(-1, *self.rng_states0.shape[2:])
        #if torch.cuda.is_available():
        #    exps0.cuda_rng_states = self.cuda_rng_states0.transpose(0, 1).reshape(-1, *self.cuda_rng_states0.shape[2:])

        #exps1.rng_states = self.rng_states1.transpose(0, 1).reshape(-1, *self.rng_states1.shape[2:])
        #if torch.cuda.is_available():
        #    exps1.cuda_rng_states = self.cuda_rng_states1.transpose(0, 1).reshape(-1, *self.cuda_rng_states1.shape[2:])

        # T x P -> P x T -> (P * T) x 1
        exps0.mask = self.masks0.transpose(0, 1).reshape(-1).unsqueeze(1)

        exps1.mask = self.masks1.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps0.action = self.actions0.transpose(0, 1).reshape(-1)
        exps0.value = self.values0.transpose(0, 1).reshape(-1)
        exps0.reward = self.rewards0.transpose(0, 1).reshape(-1)
        exps0.advantage = self.advantages0.transpose(0, 1).reshape(-1)
        exps0.returnn = exps0.value + exps0.advantage
        exps0.log_prob = self.log_probs0.transpose(0, 1).reshape(-1)
        exps0.speaker_log_prob = self.speaker_log_probs0.transpose(
            0, 1).reshape(-1)

        exps1.action = self.actions1.transpose(0, 1).reshape(-1)
        exps1.value = self.values1.transpose(0, 1).reshape(-1)
        exps1.reward = self.rewards1.transpose(0, 1).reshape(-1)
        exps1.advantage = self.advantages1.transpose(0, 1).reshape(-1)
        exps1.returnn = exps1.value + exps1.advantage
        exps1.log_prob = self.log_probs1.transpose(0, 1).reshape(-1)
        exps1.speaker_log_prob = self.speaker_log_probs1.transpose(
            0, 1).reshape(-1)

        if self.aux_info:
            exps0 = self.aux_info_collector0.end_collection(exps0)

            exps1 = self.aux_info_collector1.end_collection(exps1)

        # Preprocess experiences

        exps0.obs = self.preprocess_obss(exps0.obs, device=self.device)

        exps1.obs = self.preprocess_obss(exps1.obs, device=self.device)

        # Log some values

        keep0 = max(self.log_done_counter0, self.num_procs)

        keep1 = max(self.log_done_counter1, self.num_procs)

        log0 = {
            "return_per_episode": self.log_return0[-keep0:],
            "reshaped_return_per_episode": self.log_reshaped_return0[-keep0:],
            "num_frames_per_episode": self.log_num_frames0[-keep0:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter0,
        }

        log1 = {
            "return_per_episode": self.log_return1[-keep1:],
            "reshaped_return_per_episode": self.log_reshaped_return1[-keep1:],
            "num_frames_per_episode": self.log_num_frames1[-keep1:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter1,
        }

        self.log_done_counter0 = 0
        self.log_return0 = self.log_return0[-self.num_procs:]
        self.log_reshaped_return0 = self.log_reshaped_return0[-self.num_procs:]
        self.log_num_frames0 = self.log_num_frames0[-self.num_procs:]

        self.log_done_counter1 = 0
        self.log_return1 = self.log_return1[-self.num_procs:]
        self.log_reshaped_return1 = self.log_reshaped_return1[-self.num_procs:]
        self.log_num_frames1 = self.log_num_frames1[-self.num_procs:]

        return exps0, log0, exps1, log1
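
A minimal side sketch (toy tensors, not from the original code) of the masking arithmetic used in the rollout loop above: `been_done` accumulates like a logical OR written with arithmetic, and rewards from environments that have already finished are zeroed before being stored.

import torch

been_done = torch.tensor([0., 1., 0.])        # env 1 finished on an earlier frame
done = torch.tensor([0., 0., 1.])             # env 2 finishes on this frame
reward = torch.tensor([0.3, 0.9, 0.5])

masked_reward = reward * (1 - been_done)      # tensor([0.3000, 0.0000, 0.5000])
been_done = 1 - (1 - been_done) * (1 - done)  # tensor([0., 1., 1.]), i.e. been_done OR done
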
Example #2
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)

            with torch.no_grad():
                model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                dist = model_results['dist']
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']

            action = dist.sample()
            obs, reward, done, env_info = self.env.step(action.cpu().numpy())
            if self.aux_info:
                env_info = self.aux_info_collector.process(env_info)
                # env_info = self.process_aux_info(env_info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask

            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)

            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

            # Update log values

            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value']

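        # Generalized advantage estimation, computed backwards over the rollout:
        #   delta_t = r_t + discount * V_{t+1} * mask_{t+1} - V_t
        #   A_t     = delta_t + discount * gae_lambda * A_{t+1} * mask_{t+1}
        # where mask_{t+1} zeroes the bootstrap across episode boundaries.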
        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps = DictList()
        exps.obs = [self.obss[i][j]
                    for j in range(self.num_procs)
                    for i in range(self.num_frames_per_proc)]
        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log
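
A self-contained sketch (made-up shapes, not from the original code) of the `T x P -> P x T -> P * T` flattening used when building `exps`, showing that the k-th block of `num_frames_per_proc` entries ends up holding the k-th process's rollout.

import torch

T, P = 3, 2                                  # frames per process, number of processes
values = torch.arange(T * P).reshape(T, P)   # values[t, p] = frame t of process p
flat = values.transpose(0, 1).reshape(-1)    # tensor([0, 2, 4, 1, 3, 5])
# The first T entries all come from process 0, the next T from process 1.
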
Example #3
    def update_parameters(self):
        # Collect experiences
        t_collect, (exps, logs) = timer(self.collect_experiences)()
        logs['t_collect'] = t_collect
        '''
        exps is a DictList with the following keys ['obs', 'memory', 'mask', 'action', 'value', 'reward',
         'advantage', 'returnn', 'log_prob'] and ['collected_info', 'extra_predictions'] if we use aux_info
        exps.obs is a DictList with the following keys ['image', 'instr']
        exps.obs.image is a (n_procs * n_frames_per_proc) x image_size 4D tensor
        exps.obs.instr is a (n_procs * n_frames_per_proc) x (max number of words in an instruction) 2D tensor
        exps.memory is a (n_procs * n_frames_per_proc) x (memory_size = 2*image_embedding_size) 2D tensor
        exps.mask is (n_procs * n_frames_per_proc) x 1 2D tensor
        if we use aux_info: exps.collected_info and exps.extra_predictions are DictLists with keys
        being the added information. They are either (n_procs * n_frames_per_proc) 1D tensors or
        (n_procs * n_frames_per_proc) x k 2D tensors where k is the number of classes for multiclass classification
        '''
        t0_train = time.time()
        t_details_train_forward_model = {}
        t_train_backward = 0
        n = 0
        for _ in range(self.epochs):
            n = n + 1
            # Initialize log values

            log_entropies = []
            log_values = []
            log_policy_losses = []
            log_value_losses = []
            log_grad_norms = []

            log_losses = []
            '''
            For each epoch, we create int(total_frames / batch_size + 1) batches, each of size batch_size (except
            maybe the last one). Each batch is divided into sub-batches of size recurrence (frames are contiguous in
            a sub-batch), but the position of each sub-batch in a batch and the position of each batch in the whole
            list of frames is random thanks to self._get_batches_starting_indexes().
            '''

            for inds in self._get_batches_starting_indexes():
                # inds is a numpy array of indices that correspond to the beginning of a sub-batch
                # there are as many inds as there are batches
                # Initialize batch values

                batch_entropy = 0
                batch_value = 0
                batch_policy_loss = 0
                batch_value_loss = 0
                batch_loss = 0

                # Initialize memory

                # Extract first memories
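                # Each frame index in `inds` corresponds to a block of
                # memory_dim[0] consecutive rows in exps.memory; expand every
                # index into the full list of its memory-row indices.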
                inds_mem = [
                    item for sublist in [
                        list(
                            range(
                                self.acmodel.memory_dim[0] *
                                i, self.acmodel.memory_dim[0] * i +
                                self.acmodel.memory_dim[0])) for i in inds
                    ] for item in sublist
                ]
                memory = exps.memory[inds_mem]

                all_obs_inds = exps.obs[1].image
                sb = DictList()

                for i in range(self.recurrence):

                    # Extract scene level quantities:
                    sb.action = exps.action[inds + i]
                    sb.log_prob = exps.log_prob[inds + i]
                    sb.advantage = exps.advantage[inds + i]
                    sb.value = exps.value[inds + i]
                    sb.returnn = exps.returnn[inds + i]

                    m_batch = torch.IntTensor([
                        j + i for j in inds
                        for _ in range(self.acmodel.memory_dim[0])
                    ])

                    # Extract subatch of observation and observation batch indices
                    sb.obs = torch.zeros((0, self.acmodel.image_dim))
                    sb.obs_batch = torch.zeros(0).int()
                    for j in inds + i:
                        idx_j = all_obs_inds == j
                        sb.obs = torch.cat([sb.obs, exps.obs[0].image[idx_j]],
                                           dim=0)
                        sb.obs_batch = torch.cat(
                            [sb.obs_batch, exps.obs[1].image[idx_j]], dim=0)

                    # TODO rename obs[0] and obs[1] into obs.obs and obs.obs_batch

                    # Reshape mask
                    sb.mask = exps.mask[list(
                        numpy.array(inds_mem) + self.acmodel.memory_dim[0] *
                        (i))].flatten()

                    # Compute loss
                    model_results = self.acmodel(sb.obs,
                                                 sb.mask.unsqueeze(1) * memory,
                                                 sb.obs_batch, m_batch)
                    dist = model_results['dist']
                    value = model_results['value']
                    memory = model_results['memory']
                    extra_predictions = model_results['extra_predictions']

                    entropy = dist.entropy().mean()

                    t_details_train_forward_model = cumulate_value(
                        t_details_train_forward_model,
                        model_results['log_time'])

                    ratio = torch.exp(dist.log_prob(sb.action) - sb.log_prob)
                    surr1 = ratio * sb.advantage
                    surr2 = torch.clamp(ratio, 1.0 - self.clip_eps,
                                        1.0 + self.clip_eps) * sb.advantage
                    policy_loss = -torch.min(surr1, surr2).mean()

                    value_clipped = sb.value + torch.clamp(
                        value - sb.value, -self.clip_eps, self.clip_eps)
                    surr1 = (value - sb.returnn).pow(2)
                    surr2 = (value_clipped - sb.returnn).pow(2)
                    value_loss = torch.max(surr1, surr2).mean()

                    loss = policy_loss - self.entropy_coef * entropy + self.value_loss_coef * value_loss

                    # Update batch values

                    batch_entropy += entropy.item()
                    batch_value += value.mean().item()
                    batch_policy_loss += policy_loss.item()
                    batch_value_loss += value_loss.item()
                    batch_loss += loss

                    # Update memories for next epoch

                    if i < self.recurrence - 1:
                        exps.memory[list(
                            numpy.array(inds_mem) +
                            self.acmodel.memory_dim[0] *
                            (i + 1))] = memory.detach()

                # Update batch values

                batch_entropy /= self.recurrence
                batch_value /= self.recurrence
                batch_policy_loss /= self.recurrence
                batch_value_loss /= self.recurrence
                batch_loss /= self.recurrence

                # Update actor-critic
                t0_train_backward = time.time()
                self.optimizer.zero_grad()
                batch_loss.backward()
                grad_norm = sum(
                    p.grad.data.norm(2)**2 for p in self.acmodel.parameters()
                    if p.grad is not None)**0.5
                torch.nn.utils.clip_grad_norm_(self.acmodel.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()
                t_train_backward += time.time() - t0_train_backward
                # Update log values

                log_entropies.append(batch_entropy)
                log_values.append(batch_value)
                log_policy_losses.append(batch_policy_loss)
                log_value_losses.append(batch_value_loss)
                log_grad_norms.append(grad_norm.item())
                log_losses.append(batch_loss.item())

        t_train = time.time() - t0_train

        # Log some values

        logs["entropy"] = numpy.mean(log_entropies)
        logs["value"] = numpy.mean(log_values)
        logs["policy_loss"] = numpy.mean(log_policy_losses)
        logs["value_loss"] = numpy.mean(log_value_losses)
        logs["grad_norm"] = numpy.mean(log_grad_norms)
        logs["loss"] = numpy.mean(log_losses)
        logs['t_collect'] = t_collect
        logs['t_train'] = t_train
        logs['t_details_train_forward_model'] = t_details_train_forward_model
        logs['t_backward'] = t_train_backward
        return logs
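
For reference, a standalone sketch with made-up numbers of the clipped PPO surrogate computed inside the recurrence loop above; the real code applies it per sub-batch together with the clipped value loss and the entropy bonus.

import torch

clip_eps = 0.2
new_log_prob = torch.tensor([-0.9, -1.2, -0.4])   # log-probs under the current policy
old_log_prob = torch.tensor([-1.0, -1.0, -1.0])   # log-probs stored at collection time
advantage = torch.tensor([1.0, -0.5, 2.0])

ratio = torch.exp(new_log_prob - old_log_prob)
surr1 = ratio * advantage
surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
policy_loss = -torch.min(surr1, surr2).mean()
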
Example #4
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """

        t0 = time.time()
        t_forward_process = 0
        t_forward_step = 0
        t_details_forward_model = {}
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            tt_process, preprocessed_obs = timer(self.preprocess_obss)(
                self.obs, device=self.device)
            t_forward_process += tt_process
            obs_flat = preprocessed_obs.image[0]
            obs_batch = preprocessed_obs.image[1]

            with torch.no_grad():
                model_results = self.acmodel(
                    obs_flat,
                    self.mask.unsqueeze(1) * self.memory, obs_batch,
                    self.m_batch)
                dist = model_results['dist']
                value = model_results['value'].flatten()
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']

            t_details_forward_model = cumulate_value(t_details_forward_model,
                                                     model_results['log_time'])

            action = dist.sample()

            tt_step, (obs, reward, done,
                      env_info) = timer(self.env.step)(action.cpu().numpy())
            t_forward_step += tt_step

            if self.aux_info:
                env_info = self.aux_info_collector.process(env_info)
                # env_info = self.process_aux_info(env_info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask
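            # Each process owns memory_size[0] memory slots, so its done flag
            # is repeated across all of its slots before flattening.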
            done_as_int = torch.tensor(done,
                                       device=self.device,
                                       dtype=torch.float).unsqueeze(1)
            self.mask = 1 - done_as_int.expand(
                done_as_int.shape[0], self.acmodel.memory_size[0]).flatten()

            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(
                    i, env_info, extra_predictions)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            episode_mask = torch.tensor([
                self.mask[i * self.acmodel.memory_size[0]]
                for i in range(self.num_procs)
            ])
            self.log_episode_return *= episode_mask
            self.log_episode_reshaped_return *= episode_mask
            self.log_episode_num_frames *= episode_mask

        t_collect_forward = time.time() - t0

        # Add advantage and return to experiences
        t0 = time.time()
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            # TODO: Add split obs_flat, obs_batch in preprocess_obss ?
            obs_flat = preprocessed_obs.image[0]
            obs_batch = preprocessed_obs.image[1]
            next_value = self.acmodel(obs_flat,
                                      self.mask.unsqueeze(1) * self.memory,
                                      obs_batch,
                                      self.m_batch)['value'].flatten()

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = torch.tensor([
                self.masks[i + 1][j * self.acmodel.memory_size[0]]
                for j in range(self.num_procs)
            ]) if i < self.num_frames_per_proc - 1 else torch.tensor([
                self.mask[j * self.acmodel.memory_size[0]]
                for j in range(self.num_procs)
            ])

            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0
            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        t_collect_backward = time.time() - t0

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk
        t0 = time.time()
        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]

        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # D is the dimensionality and M the number of memory slots

        # T x (P * M) x D -> T x P x M x D -> P x T x M x D -> (P * T * M) x D
        exps.memory = self.memories.reshape(
            (self.num_frames_per_proc, self.num_procs,
             self.acmodel.memory_size[0],
             self.acmodel.memory_size[1])).transpose(0, 1).reshape(
                 -1, *self.memories.shape[2:])

        # T x (P * M) -> T x P x M -> P x T x M -> (P * T * M) x 1
        exps.mask = self.masks.reshape(self.num_frames_per_proc,
                                       self.num_procs,
                                       self.acmodel.memory_size[0]).transpose(
                                           0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        t_organize_exp = time.time() - t0
        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
            "t_collect_forward": t_collect_forward,
            "t_details_forward_model": t_details_forward_model,
            "t_forward_process": t_forward_process,
            "t_forward_step": t_forward_step,
            "t_collect_backward": t_collect_backward,
            "t_collect_organize": t_organize_exp
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log
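
The `timer` helper used above is not shown in these snippets; judging from the call sites it wraps a callable and returns the elapsed time together with the callable's result. A hypothetical sketch consistent with that usage:

import time

def timer(fn):
    """Wrap `fn` so that calling it returns (elapsed_seconds, result)."""
    def wrapped(*args, **kwargs):
        t0 = time.time()
        result = fn(*args, **kwargs)
        return time.time() - t0, result
    return wrapped

# e.g.  tt_step, (obs, reward, done, env_info) = timer(self.env.step)(action.cpu().numpy())
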
Example #5
    def __getitem__(self, index):
        return DictList({key: [subvalue[index] for subvalue in value]
                         for key, value in dict.items(self)})
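
A short usage sketch with toy data (assuming `DictList` subclasses `dict` and accepts a plain dict in its constructor, as `dict.items(self)` and the calls elsewhere in these examples suggest): every value is a list of indexable sub-values, and `d[index]` selects that position from each sub-value of each key.

d = DictList({"image": [[10, 11, 12], [20, 21, 22]],
              "instr": [["a", "b", "c"], ["x", "y", "z"]]})
d[1]  # -> DictList({"image": [11, 21], "instr": ["b", "y"]})
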
Example #6
    def collect_experiences(self, teacher_dict):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Parameters
        ----------
        teacher_dict : dict
            Maps teacher names to booleans; forwarded to
            `self.preprocess_obss` when preprocessing observations.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """
        teacher_keys = list(teacher_dict.keys())
        all_teachers_dict = dict(zip(teacher_keys, [True] * len(teacher_keys)))

        # TODO: Make this handle the case where the meta_rollout length > 1
        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction
            preprocessed_obs = self.preprocess_obss(self.obs, teacher_dict)

            with torch.no_grad():
                dist, model_results = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = None

            action = dist.sample()

            obs, reward, done, env_info = self.env.step(action.cpu().numpy())

            # Update experiences values

            self.env_infos[i] = env_info
            self.obss[i] = self.obs
            self.obs = obs
            self.teacher_actions[i] = torch.FloatTensor(
                [ei['teacher_action'][0] for ei in env_info]).to(self.device)

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask
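            # Episodes are grouped into meta-tasks: count finished episodes per
            # process and clear the mask only once rollouts_per_meta_task of
            # them have completed.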
            done_tensor = torch.FloatTensor(done).to(self.device)
            self.done_index = done_tensor + self.done_index

            done_meta = self.done_index == self.rollouts_per_meta_task
            self.done_index = torch.remainder(self.done_index,
                                              self.rollouts_per_meta_task)
            self.dones[i] = done_tensor
            self.mask = 1 - done_meta.to(torch.int32)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(
                        obs, action, reward, done)
                ],
                                               device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(
                    i, env_info, extra_predictions)

            # Update log values

            self.log_episode_return += torch.tensor(reward,
                                                    device=self.device,
                                                    dtype=torch.float)
            self.log_episode_success += torch.tensor(
                [e['success'] for e in env_info],
                device=self.device,
                dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs,
                                                      device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_success.append(self.log_episode_success[i].item())
                    self.log_reshaped_return.append(
                        self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(
                        self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_success *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # Add advantage and return to experiences

        preprocessed_obs = self.preprocess_obss(self.obs, teacher_dict)
        with torch.no_grad():
            next_value = self.acmodel(preprocessed_obs,
                                      self.memory *
                                      self.mask.unsqueeze(1))[1]['value']

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[
                i + 1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[
                i + 1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[
                i + 1] if i < self.num_frames_per_proc - 1 else 0

            delta = self.rewards[
                i] + self.discount * next_value * next_mask - self.values[i]
            self.advantages[
                i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps = DictList()
        exps.obs = [
            self.obss[i][j] for j in range(self.num_procs)
            for i in range(self.num_frames_per_proc)
        ]
        keys = list(env_info[0].keys())
        batch = len(env_info)
        timesteps = len(self.env_infos)
        env_info_dict = {}
        for k in keys:
            arr = []
            for b in range(batch):
                for t in range(timesteps):
                    arr.append(self.env_infos[t][b][k])
            env_info_dict[k] = np.stack(arr)
        env_info_dict = DictList(env_info_dict)
        exps.env_infos = env_info_dict
        # In the comments below, T is self.num_frames_per_proc, P is self.num_procs,
        # D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(
            -1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)
        exps.teacher_action = self.teacher_actions.transpose(0, 1).reshape(-1)
        exps.done = self.dones.transpose(0, 1).reshape(-1)

        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, all_teachers_dict)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "success_per_episode": self.log_success[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_success = self.log_success[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log
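
A minimal sketch with made-up data of the `env_infos` flattening performed above: a list over timesteps of per-process info dicts is collapsed, process-major, into one stacked array per key.

import numpy as np

env_infos = [  # one entry per timestep, each a list over processes
    [{"success": 0, "teacher_action": 2}, {"success": 1, "teacher_action": 0}],  # t = 0
    [{"success": 0, "teacher_action": 1}, {"success": 0, "teacher_action": 3}],  # t = 1
]
timesteps, batch = len(env_infos), len(env_infos[0])
flat = {k: np.stack([env_infos[t][b][k]
                     for b in range(batch) for t in range(timesteps)])
        for k in env_infos[0][0].keys()}
flat["success"]  # array([0, 0, 1, 0]): process 0's frames first, then process 1's
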
Example #7
    def collect_experiences(self):
        """Collects rollouts and computes advantages.

        Runs several environments concurrently. The next actions are computed
        in a batch mode for all environments at the same time. The rollouts
        and advantages from all environments are concatenated together.

        Returns
        -------
        exps : DictList
            Contains actions, rewards, advantages etc as attributes.
            Each attribute, e.g. `exps.reward` has a shape
            (self.num_frames_per_proc * num_envs, ...). k-th block
            of consecutive `self.num_frames_per_proc` frames contains
            data obtained from the k-th environment. Be careful not to mix
            data from different environments!
        logs : dict
            Useful stats about the training process, including the average
            reward, policy loss, value loss, etc.

        """

        # Reset cpv buffer if needed. 
        if self.reward_fn == 'cpv' or self.reward_fn == 'both':
            self.reset_cpv_buffer()

        start = time.time()

        for i in range(self.num_frames_per_proc):
            # Do one agent-environment interaction

            preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
            with torch.no_grad():
                model_results = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                dist = model_results['dist']
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']

            action = dist.sample()

            # Take a step in env and process reward if not using default reward function. 
            obs, old_reward, done, env_info = self.env.step(action.cpu().numpy())

            if self.reward_fn == 'cpv' or self.reward_fn == 'both': 

                reward = old_reward # TODO Do we even need this if-else block here anymore?
                """
                unnormalized_reward = self.reward_model.calculate_reward(self.cpv_buffer, self.obs)

                if self.aux_info:
                    env_info = self.aux_info_collector.process(env_info)
                    env_info = self.process_aux_info(env_info)

                std = numpy.std(self.all_rewards) if self.all_rewards != [] else numpy.std(unnormalized_reward)
                mean = numpy.mean(self.all_rewards) if self.all_rewards != [] else numpy.mean(unnormalized_reward)
                reward = numpy.clip([(r - mean) /  std for r in unnormalized_reward], 0, 1)
                self.all_rewards.extend(unnormalized_reward)
                if len(self.all_rewards) > 1000:
                    self.all_rewards = self.all_rewards[-1000:]
                """

            elif self.reward_fn == 'babyai': 
                reward = old_reward

                if self.aux_info: 
                    env_info = self.aux_info_collector.process(env_info)
                    #env_info = self.process_aux_info(env_info)

            # Update experiences values

            self.obss[i] = self.obs
            self.obs = obs

            self.memories[i] = self.memory
            self.memory = memory

            self.masks[i] = self.mask
            self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
            self.actions[i] = action
            self.values[i] = value
            if self.reshape_reward is not None:
                self.rewards[i] = torch.tensor([
                    self.reshape_reward(obs_, action_, reward_, done_)
                    for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
                ], device=self.device)
            else:
                self.rewards[i] = torch.tensor(reward, device=self.device)
            self.log_probs[i] = dist.log_prob(action)

            if self.aux_info:
                self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

            # Update log values
            self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
            self.log_episode_reshaped_return += self.rewards[i]
            self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

            for i, done_ in enumerate(done):
                if done_:
                    self.log_done_counter += 1
                    self.log_return.append(self.log_episode_return[i].item())
                    self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                    self.log_num_frames.append(self.log_episode_num_frames[i].item())

            self.log_episode_return *= self.mask
            self.log_episode_reshaped_return *= self.mask
            self.log_episode_num_frames *= self.mask

        # If CPV, recompute reward based on trajectory. 
        if self.reward_fn == 'cpv': 

            # Make single run through CPV model to compute all rewards at once. 
            self.rewards = self.reward_model.calculate_reward(self.obss).permute(1,0)

            # TODO normalize rewards? 
            std, mean = torch.std_mean(self.rewards, dim=1)
            std = std.view(-1, 1).expand_as(self.rewards)
            mean = mean.view(-1, 1).expand_as(self.rewards)
            self.rewards = torch.clamp((self.rewards - mean) / std, 0.0, 1.0)

        # Add advantage and return to experiences
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            next_value = self.acmodel(preprocessed_obs, self.memory * self.mask.unsqueeze(1))['value']

        for i in reversed(range(self.num_frames_per_proc)):
            next_mask = self.masks[i+1] if i < self.num_frames_per_proc - 1 else self.mask
            next_value = self.values[i+1] if i < self.num_frames_per_proc - 1 else next_value
            next_advantage = self.advantages[i+1] if i < self.num_frames_per_proc - 1 else 0

            # print("Reward")
            # print(self.rewards[i])
            # # print("Discount")
            # # print(self.discount[i])
            # print("Values")
            # print(self.values[i])

            delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]

            self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

        # Flatten the data correctly, making sure that
        # each episode's data is a continuous chunk

        exps = DictList()
        exps.obs = [self.obss[i][j]
                    for j in range(self.num_procs)
                    for i in range(self.num_frames_per_proc)]
        # In commments below T is self.num_frames_per_proc, P is self.num_procs,
        # D is the dimensionality

        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

        # for all tensors below, T x P -> P x T -> P * T
        exps.action = self.actions.transpose(0, 1).reshape(-1)
        exps.value = self.values.transpose(0, 1).reshape(-1)
        exps.reward = self.rewards.transpose(0, 1).reshape(-1)
        exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
        exps.returnn = exps.value + exps.advantage
        exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

        if self.aux_info:
            exps = self.aux_info_collector.end_collection(exps)

        # Preprocess experiences

        exps.obs = self.preprocess_obss(exps.obs, device=self.device)

        # Log some values

        keep = max(self.log_done_counter, self.num_procs)

        log = {
            "return_per_episode": self.log_return[-keep:],
            "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
            "num_frames_per_episode": self.log_num_frames[-keep:],
            "num_frames": self.num_frames,
            "episodes_done": self.log_done_counter,
        }

        self.log_done_counter = 0
        self.log_return = self.log_return[-self.num_procs:]
        self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
        self.log_num_frames = self.log_num_frames[-self.num_procs:]

        return exps, log
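
A standalone sketch with toy numbers of the reward standardization applied in the CPV branch above: standardize each row of the reward matrix with `torch.std_mean` and clamp the result into [0, 1].

import torch

rewards = torch.tensor([[0.1, 0.5, 0.9],
                        [1.0, 1.0, 4.0]])
std, mean = torch.std_mean(rewards, dim=1)
std = std.view(-1, 1).expand_as(rewards)
mean = mean.view(-1, 1).expand_as(rewards)
normalized = torch.clamp((rewards - mean) / std, 0.0, 1.0)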