def collect_experiences(self):
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)
        action = dist.sample()

        obs, reward, done, _ = self.env.step(action.cpu().numpy())

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_tau * next_advantage * next_mask

    # Defines experiences
    exps = DictList()
    exps.obs = [obs for obss in self.obss for obs in obss]
    if self.acmodel.recurrent:
        exps.memory = self.memories.view(-1, *self.memories.shape[2:])
        exps.mask = self.masks.view(-1, *self.masks.shape[2:]).unsqueeze(1)
    exps.action = self.actions.view(-1, *self.actions.shape[2:])
    exps.value = self.values.view(-1, *self.values.shape[2:])
    exps.reward = self.rewards.view(-1, *self.rewards.shape[2:])
    exps.advantage = self.advantages.view(-1, *self.advantages.shape[2:])
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.view(-1, *self.log_probs.shape[2:])

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)
    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
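

# The backward pass above is Generalized Advantage Estimation (GAE):
#     delta_t = r_t + discount * V(s_{t+1}) * mask_{t+1} - V(s_t)
#     A_t     = delta_t + discount * gae_tau * mask_{t+1} * A_{t+1}
# The helper below is a minimal standalone sketch of that recursion, added for
# illustration only. It is not part of this class; the argument names and the
# (T, P) tensor layout are assumptions chosen to mirror the buffers used above.
import torch


def gae_advantages_sketch(rewards, values, masks, last_value, last_mask,
                          discount, gae_tau):
    """rewards, values, masks: (T, P) tensors; returns a (T, P) advantage tensor."""
    num_frames = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    next_advantage = torch.zeros_like(last_value)
    for i in reversed(range(num_frames)):
        # mask_{t+1} zeroes the bootstrap and the carried advantage across episode boundaries
        next_mask = masks[i + 1] if i < num_frames - 1 else last_mask
        next_value = values[i + 1] if i < num_frames - 1 else last_value
        delta = rewards[i] + discount * next_value * next_mask - values[i]
        next_advantage = delta + discount * gae_tau * next_advantage * next_mask
        advantages[i] = next_advantage
    return advantages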
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)
        action = dist.sample()

        obs, reward, done, info = self.env.step(action.cpu().numpy())

        self.collect_interactions(info)

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values_ext[i] = value[0]
        self.values_int[i] = value[1]
        if self.reshape_reward is not None:
            self.rewards_ext[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards_ext[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards_ext[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # ==========================================================================================
    # Define experiences (observations first):
    # the whole experience is the concatenation of the experience
    # of each process.
    exps = DictList()
    exps.obs = [
        self.obss[i][j]
        for j in range(self.num_procs)
        for i in range(self.num_frames_per_proc)
    ]

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    exps.action = self.actions.transpose(0, 1).reshape(-1)

    if self.acmodel.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)

    # Add other data to experience buffer
    self.add_extra_experience(exps)

    # ==========================================================================================
    # -- Calculate intrinsic return
    self.rewards_int = self.calculate_intrinsic_reward(exps, self.rewards_int)

    # Add advantage and return to experiences
    # (don't use the end-of-episode signal for intrinsic rewards)
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    # Calculate intrinsic rewards and advantages
    for i in reversed(range(self.num_frames_per_proc)):
        next_value_int = self.values_int[i + 1] if i < self.num_frames_per_proc - 1 else next_value[1]
        next_advantage_int = self.advantages_int[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards_int[i] + self.discount * next_value_int - self.values_int[i]
        self.advantages_int[i] = delta + self.discount * self.gae_lambda * next_advantage_int

    # Calculate extrinsic rewards and advantages
    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value_ext = self.values_ext[i + 1] if i < self.num_frames_per_proc - 1 else next_value[0]
        next_advantage_ext = self.advantages_ext[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards_ext[i] + self.discount * next_value_ext * next_mask - self.values_ext[i]
        self.advantages_ext[i] = delta + self.discount * self.gae_lambda * next_advantage_ext * next_mask

    # ==========================================================================================
    # Continue defining experiences:
    # the whole experience is the concatenation of the experience
    # of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.

    # for all tensors below, T x P -> P x T -> P * T
    exps.value_ext = self.values_ext.transpose(0, 1).reshape(-1)
    exps.value_int = self.values_int.transpose(0, 1).reshape(-1)
    exps.reward_ext = self.rewards_ext.transpose(0, 1).reshape(-1)
    exps.reward_int = self.rewards_int.transpose(0, 1).reshape(-1)
    exps.advantage_ext = self.advantages_ext.transpose(0, 1).reshape(-1)
    exps.advantage_int = self.advantages_int.transpose(0, 1).reshape(-1)
    exps.returnn_ext = exps.value_ext + exps.advantage_ext
    exps.returnn_int = exps.value_int + exps.advantage_int
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)
    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    # Add extra logs with agent interactions
    aux_logs = self.process_interactions()
    for k in aux_logs:
        log[k] = aux_logs[k]

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
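

# `collect_interactions`, `add_extra_experience`, `calculate_intrinsic_reward`
# and `process_interactions` are hooks that concrete exploration algorithms
# supply; only their call sites appear above. The helper below sketches one
# *possible* shape for the intrinsic-reward hook -- a prediction-error bonus in
# the style of Random Network Distillation -- written as a free function so it
# stays self-contained. The `predictor` / `random_target` networks, the
# preprocessed `obs_batch` input and the (T, P) layout of `dst_intrinsic` are
# assumptions for illustration, not this repository's actual implementation.
import torch


def intrinsic_reward_sketch(predictor, random_target, obs_batch,
                            num_frames_per_proc, num_procs, dst_intrinsic):
    """obs_batch: (P * T, ...) tensor flattened process-major, like `exps` above."""
    with torch.no_grad():
        target_feat = random_target(obs_batch)   # frozen, randomly initialised net
        pred_feat = predictor(obs_batch)         # trained elsewhere to match the target
        # per-frame prediction error acts as a novelty bonus, shape (P * T,)
        error = (pred_feat - target_feat).pow(2).mean(dim=1)
    # exps is flattened P-major (P x T) while the rollout buffers are T x P,
    # so fold the bonus back before writing it into the destination buffer.
    dst_intrinsic.copy_(error.view(num_procs, num_frames_per_proc).transpose(0, 1))
    return dst_intrinsic


# During the update the two advantage streams are typically mixed, e.g.
# advantage = exps.advantage_ext + int_coef * exps.advantage_int, where
# `int_coef` is a weighting assumed here rather than an attribute defined above.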
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). Because the per-step
        buffers are flattened here without transposing, the k-th block of
        `num_envs` consecutive frames contains the data of all environments
        at the k-th time step. Be careful not to mix data from different
        environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)
        action = dist.sample()

        obs, reward, done, _ = self.env.step(action.cpu().numpy())

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Defines experiences
    exps = DictList()
    exps.obs = [obs for obss in self.obss for obs in obss]
    if self.acmodel.recurrent:
        exps.memory = self.memories.view(-1, *self.memories.shape[2:])
        exps.mask = self.masks.view(-1, *self.masks.shape[2:]).unsqueeze(1)
    exps.action = self.actions.view(-1, *self.actions.shape[2:])
    exps.value = self.values.view(-1, *self.values.shape[2:])
    exps.reward = self.rewards.view(-1, *self.rewards.shape[2:])
    exps.advantage = self.advantages.view(-1, *self.advantages.shape[2:])
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.view(-1, *self.log_probs.shape[2:])

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)
    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
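

# A hedged sketch of how a training script typically drives this method: collect
# a batch of rollouts, then hand the flattened experience to the optimiser. The
# `algo` object and the `update_parameters(exps)` method name follow the usual
# torch-ac-style layout and are assumptions, not guaranteed by the code above.
def training_loop_sketch(algo, total_frames):
    """Alternates rollout collection and parameter updates until enough frames are seen."""
    num_frames = 0
    while num_frames < total_frames:
        exps, logs = algo.collect_experiences()   # rollouts + per-episode stats
        algo.update_parameters(exps)              # assumed PPO/A2C-style update step
        num_frames += logs["num_frames"]
    return num_frames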