def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)
        action = dist.sample()

        obs, reward, done, _ = self.env.step(action.cpu().numpy())
        self.step_counter = [x + 1 for x in self.step_counter]

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_probs[i] = dist.log_prob(action)

        # Get discounts
        discounts = [self.discount ** (x - 1) for x in self.step_counter]
        discounts = torch.tensor(discounts, device=self.device, dtype=torch.float)

        # Update log values
        self.log_episode_return += torch.tensor(
            reward, device=self.device, dtype=torch.float) * discounts
        self.log_episode_reshaped_return += self.rewards[i] * discounts
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())
                self.step_counter[i] = 0

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    if self.acmodel.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, logs

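# --- Illustrative sketch (not part of the original code) ---
# The docstring above warns that `exps` is laid out as P consecutive blocks of
# T = num_frames_per_proc frames, one block per environment. The helper below
# is a hypothetical utility (its name is an assumption, not part of the
# original API) showing how a single environment's block could be recovered
# from any flattened (P * T, ...) attribute such as `exps.reward`.
def slice_env_block(flat_tensor, env_index, num_frames_per_proc):
    """Return the rows of a (P * T, ...) tensor that came from one environment."""
    start = env_index * num_frames_per_proc
    return flat_tensor[start:start + num_frames_per_proc]
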
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    gazes = []
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        # Initialize gaze to None - it is only used with variable view
        gaze = None
        with torch.no_grad():
            if self.acmodel.recurrent:
                if self.variable_view:
                    dist, gaze, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
                else:
                    dist, value, memory = self.acmodel(
                        preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)

        # If variable view is enabled, the last two fields in the action vector
        # are gaze offsets, not actions
        if self.variable_view:
            action = dist.sample()
            gaze_action = torch.stack(
                (3.0 * gaze[0].sample(), 3.0 * gaze[1].sample()), dim=1)
            action_data = torch.cat((action.view([-1, 1]), gaze_action.long()), dim=1)
            obs, reward, done, _ = self.env.step(action_data.cpu().numpy())
        else:
            action = dist.sample()
            obs, reward, done, _ = self.env.step(action.cpu().numpy())

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Compute the joint distribution log probs over action, x gaze and
        # y gaze when using variable view
        if self.variable_view:
            log_probs = []
            gaze_action /= 3.0
            for j in range(gaze_action.shape[0]):
                gaze_dist = gaze[0].probs[j].ger(gaze[1].probs[j]).view([-1])
                full_action_space_dist = Categorical(
                    (dist.probs[j].ger(gaze_dist)).view(-1))
                action[j] = action[j] * 22 + gaze_action[j][0] * 11 + gaze_action[j][1]
                log_prob = full_action_space_dist.log_prob(action[j])
                log_probs.append(log_prob)
            self.log_probs[i] = torch.stack(log_probs)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

        # Gaze actions only exist when variable view is enabled
        if self.variable_view:
            gazes.append(gaze_action)

    # Add advantage and return to experiences
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            if self.variable_view:
                _, _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                _, next_value, _ = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    if self.acmodel.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # If using variable view, store the gaze
    if self.variable_view:
        gazes = torch.cat(gazes)
        exps.gaze = gazes

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, logs, gazes * 3

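# --- Illustrative sketch (not part of the original code) ---
# The variable-view branch above builds one Categorical over the joint
# (action, gaze_x, gaze_y) space by taking outer products of the factor
# probabilities. For independent factors, the log prob of a flattened joint
# index equals the sum of the factor log probs; the helper below checks that
# equivalence on made-up probabilities (all names and numbers here are
# assumptions chosen only for illustration).
import torch
from torch.distributions import Categorical

def _factored_log_prob_example():
    """Hypothetical check: joint log prob equals the sum of factor log probs."""
    p_action = torch.tensor([0.7, 0.3])      # action factor (made-up numbers)
    p_gaze = torch.tensor([0.2, 0.5, 0.3])   # gaze factor (made-up numbers)
    joint = Categorical(p_action.ger(p_gaze).view(-1))
    a, g = 1, 2
    flat_index = torch.tensor(a * p_gaze.numel() + g)
    return torch.allclose(joint.log_prob(flat_index),
                          torch.log(p_action[a]) + torch.log(p_gaze[g]))
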
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        if self.continuous_action:
            self.obs = [
                self.model.scaler.transform(
                    self.obs[0].reshape(1, -1)).reshape(-1).astype('float64')
            ]
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.model.recurrent:
                dist, value, embedding, _, successor, _, memory = self.model(
                    preprocessed_obs, memory=self.memory * self.mask.unsqueeze(1))
                _, target_value, target_embedding, _, target_successor, _, _ = self.target(
                    preprocessed_obs, memory=self.memory * self.mask.unsqueeze(1))  # target
            else:
                dist, value, embedding, _, successor, _, _ = self.model(preprocessed_obs)
                _, target_value, target_embedding, _, target_successor, _, _ = self.target(
                    preprocessed_obs)

        if self.continuous_action:
            # Should this (eps + stochastic policy) be done? Or use
            # (eps + deterministic policy), or just the stochastic policy?
            epsample = random.random()
            eps_threshold = 0.02 + (0.9 - 0.02) * math.exp(-1. * self.total_updates / 200)
            if epsample > eps_threshold:
                noise_dist = torch.distributions.normal.Normal(0, 0.03)
                action = dist.sample() + noise_dist.sample()
                action = torch.clamp(action, self.env.envs[0].min_action,
                                     self.env.envs[0].max_action)
            else:
                action = torch.Tensor(self.env.envs[0].action_space.sample())
            obs, reward, done, _ = self.env.step([action.cpu().numpy()])
            obs = obs[0].reshape(1, -1)
        else:
            action = dist.sample()
            obs, reward, done, _ = self.env.step(action.cpu().numpy())

        # Update experiences values
        self.replay_memory.push((self.FloatTensor([obs[0]['image']]),
                                 self.FloatTensor([reward])))
        self.obss[i] = self.obs
        self.obs = obs
        if self.model.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        self.target_values[i] = target_value
        self.embeddings[i] = embedding
        self.target_embeddings[i] = target_embedding
        self.successors[i] = successor
        self.target_successors[i] = target_successor
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences
    if self.continuous_action:
        # Assuming flat observations for the continuous-action case:
        # this is true for the Mountain Car example but may not be in general.
        # Ideally the continuous-action code should be modified to handle flat
        # or image input, the use of a scaler should be an option to train.py,
        # and either checks should be used here to do the following, or a
        # wrapper that does the scaling should be created and set up in train.py.
        self.obs[0] = self.model.scaler.transform(self.obs[0].reshape(1, -1)).reshape(-1)
        self.obs = self.obs.astype('float32')
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.model.recurrent:
            _, next_value, _, _, next_successor, _, _ = self.target(
                preprocessed_obs, memory=self.memory * self.mask.unsqueeze(1))  # target
        else:
            _, next_value, _, _, next_successor, _ = self.target(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_successor = self.target_successors[i + 1] if i < self.num_frames_per_proc - 1 else next_successor
        next_value = self.target_values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_SR_advantage = self.SR_advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0
        next_V_advantage = self.V_advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        SR_delta = self.target_embeddings[i] + (
            self.discount * next_successor * next_mask.reshape(-1, 1)) - self.successors[i]
        self.SR_advantages[i] = SR_delta + (
            self.discount * self.gae_lambda * next_SR_advantage * next_mask.reshape(-1, 1))
        V_delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.V_advantages[i] = V_delta + self.discount * self.gae_lambda * next_V_advantage * next_mask

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    if self.model.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.SR_advantage = self.SR_advantages.transpose(0, 1).reshape(-1, self.model.embedding_size)
    exps.successor = self.successors.transpose(0, 1).reshape(-1, self.model.embedding_size)
    exps.successorn = exps.successor + exps.SR_advantage
    exps.V_advantage = self.V_advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.V_advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, logs

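# --- Illustrative sketch (not part of the original code) ---
# The successor-representation branch above runs the usual GAE recursion, but
# with the target embedding phi(s_t) playing the role of the reward and the
# successor feature psi(s_t) playing the role of the value, so the TD error is
# a vector per process. Below is a minimal standalone version of that
# recursion; the tensor shapes and the single-network simplification are
# assumptions (the method above bootstraps from a separate target network).
import torch

def successor_feature_gae(embeddings, successors, next_masks, last_successor,
                          discount, gae_lambda):
    # embeddings, successors: (T, P, D); next_masks: (T, P); last_successor: (P, D)
    T = embeddings.shape[0]
    advantages = torch.zeros_like(successors)
    next_successor = last_successor
    next_advantage = torch.zeros_like(last_successor)
    for t in reversed(range(T)):
        m = next_masks[t].unsqueeze(-1)  # (P, 1), broadcasts over D
        delta = embeddings[t] + discount * next_successor * m - successors[t]
        advantages[t] = delta + discount * gae_lambda * next_advantage * m
        next_successor = successors[t]
        next_advantage = advantages[t]
    return advantages
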
def collect_experiences_old(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.model.recurrent:
                value, memory = self.model(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                value = self.model(preprocessed_obs)
        action = self.pareto_action(value, self.weights)
        eps_mask = torch.rand(action.shape) < self.eps
        action[eps_mask] = torch.randint(0, self.env.action_space.n, (sum(eps_mask), ))

        obs, reward, done, _ = self.env.step(action.cpu().numpy())

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.model.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)

        # Update log values
        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i])  # .item()
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i])  # .item()
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

                # Reroll the weights for that episode
                if self.reward_size == 1:
                    self.weights[i, 0] = 1
                elif self.reward_size == 2:
                    self.weights[i, 0] = torch.rand(1)
                    self.weights[i, 1] = 1 - self.weights[i, 0]
                else:
                    raise NotImplementedError

        self.log_episode_return = (self.log_episode_return.T * self.mask).T
        self.log_episode_reshaped_return = (self.log_episode_reshaped_return.T * self.mask).T
        self.log_episode_num_frames *= self.mask

    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.model.recurrent:
            next_value, _ = self.eval_model(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            next_value = self.eval_model(preprocessed_obs)
    next_value_clipped = torch.clip(next_value, *self.env.envs[0].reward_range)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_mask = torch.vstack([next_mask] * self.reward_size).T
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value

        self.expected_values[i] = self.rewards[i] + (
            self.pareto_rewards(next_value_clipped, self.weights) * (self.discount * next_mask))
        # self.advantages[i] = delta + (next_advantage.T * (self.discount * self.gae_lambda * next_mask)).T

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    if self.model.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1, self.reward_size)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1, self.reward_size)
    exps.exp_value = self.expected_values.transpose(0, 1).reshape(-1, self.reward_size)
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, logs

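# --- Illustrative sketch (not part of the original code) ---
# `self.pareto_action` and `self.pareto_rewards` are defined elsewhere and are
# not shown here. As a rough, purely hypothetical stand-in, one common choice
# for multi-objective action selection is linear scalarisation: weight each
# objective, then pick the action with the highest weighted value. The helper
# below only illustrates that idea, assuming a value tensor shaped
# (P, n_actions, reward_size) and weights shaped (P, reward_size).
import torch

def scalarized_greedy_action(values, weights):
    scalarized = (values * weights.unsqueeze(1)).sum(dim=-1)  # (P, n_actions)
    return scalarized.argmax(dim=-1)                          # (P,)
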
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc as attributes.
        Each attribute, e.g. `exps.reward` has a shape
        (self.num_frames_per_proc * num_envs, ...). k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to mix
        data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    hasMesses = False
    hasPerf = False
    hasPerfFull = False
    hasButtonPresses = False
    hasPhonesCleaned = False
    hasDirtCleaned = False
    addedAllMyData = False
    loggedAllMyData = False
    allMyData = None

    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction
        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            if self.acmodel.recurrent:
                dist, value, memory = self.acmodel(
                    preprocessed_obs, self.memory * self.mask.unsqueeze(1))
            else:
                dist, value = self.acmodel(preprocessed_obs)
        action = dist.sample()

        obs, reward, done, info = self.env.step(action.cpu().numpy())

        # Pull optional per-environment metrics out of the info dicts
        if 'messes_cleaned' in info[0]:
            hasMesses = True
            messes = tuple([env_info['messes_cleaned'] for env_info in info])
        if 'performance_full' in info[0]:
            hasPerfFull = True
            performancesFULL = tuple([env_info['performance_full'] for env_info in info])
        if 'performance' in info[0]:
            hasPerf = True
            performances = tuple([env_info['performance'] for env_info in info])
        if 'button_presses' in info[0]:
            hasButtonPresses = True
            button_presses = tuple([env_info['button_presses'] for env_info in info])
        if 'phones_cleaned' in info[0]:
            hasPhonesCleaned = True
            phones_cleaned = tuple([env_info['phones_cleaned'] for env_info in info])
        if 'dirt_cleaned' in info[0]:
            hasDirtCleaned = True
            dirt_cleaned = tuple([env_info['dirt_cleaned'] for env_info in info])

        # Update experiences values
        self.obss[i] = self.obs
        self.obs = obs
        if self.acmodel.recurrent:
            self.memories[i] = self.memory
            self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
            assert False  # reshape_reward is not supported by this variant
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        if hasPerf:
            self.rewards_PERFORMANCE[i] = torch.tensor(performances, device=self.device)
        if hasButtonPresses:
            self.rewards_BUTTON_PRESSES[i] = torch.tensor(button_presses, device=self.device)
        if hasPhonesCleaned:
            self.rewards_PHONES_CLEANED[i] = torch.tensor(phones_cleaned, device=self.device)
        if hasDirtCleaned:
            self.rewards_DIRT_CLEANED[i] = torch.tensor(dirt_cleaned, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        # Update log values
        if hasMesses:
            self.log_episode_return_MESSES += torch.tensor(
                messes, device=self.device, dtype=torch.float)
        if hasPerfFull:
            self.log_episode_return_PERFORMANCE_FULL += torch.tensor(
                performancesFULL, device=self.device, dtype=torch.float)
        if hasPerf:
            self.log_episode_return_PERFORMANCE += torch.tensor(
                performances, device=self.device, dtype=torch.float)
        if hasButtonPresses:
            self.log_episode_return_BUTTON_PRESSES += torch.tensor(
                button_presses, device=self.device, dtype=torch.float)
        if hasPhonesCleaned:
            self.log_episode_return_PHONES_CLEANED += torch.tensor(
                phones_cleaned, device=self.device, dtype=torch.float)
        if hasDirtCleaned:
            self.log_episode_return_DIRT_CLEANED += torch.tensor(
                dirt_cleaned, device=self.device, dtype=torch.float)

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_reshaped_return_PERFORMANCE += self.rewards_PERFORMANCE[i]
        self.log_episode_reshaped_return_BUTTON_PRESSES += self.rewards_BUTTON_PRESSES[i]
        self.log_episode_reshaped_return_PHONES_CLEANED += self.rewards_PHONES_CLEANED[i]
        self.log_episode_reshaped_return_DIRT_CLEANED += self.rewards_DIRT_CLEANED[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                if hasMesses:
                    self.log_return_MESSES.append(self.log_episode_return_MESSES[i].item())
                if hasPerfFull:
                    self.log_return_PERFORMANCE_FULL.append(
                        self.log_episode_return_PERFORMANCE_FULL[i].item())
                if hasPerf:
                    self.log_return_PERFORMANCE.append(
                        self.log_episode_return_PERFORMANCE[i].item())
                    self.log_reshaped_return_PERFORMANCE.append(
                        self.log_episode_reshaped_return_PERFORMANCE[i].item())
                if hasButtonPresses:
                    self.log_return_BUTTON_PRESSES.append(
                        self.log_episode_return_BUTTON_PRESSES[i].item())
                    self.log_reshaped_return_BUTTON_PRESSES.append(
                        self.log_episode_reshaped_return_BUTTON_PRESSES[i].item())
                if hasPhonesCleaned:
                    self.log_return_PHONES_CLEANED.append(
                        self.log_episode_return_PHONES_CLEANED[i].item())
                    self.log_reshaped_return_PHONES_CLEANED.append(
                        self.log_episode_reshaped_return_PHONES_CLEANED[i].item())
                if hasDirtCleaned:
                    self.log_return_DIRT_CLEANED.append(
                        self.log_episode_return_DIRT_CLEANED[i].item())
                    self.log_reshaped_return_DIRT_CLEANED.append(
                        self.log_episode_reshaped_return_DIRT_CLEANED[i].item())
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_return_PERFORMANCE_FULL *= self.mask
        self.log_episode_return_MESSES *= self.mask
        self.log_episode_return_PERFORMANCE *= self.mask
        self.log_episode_return_BUTTON_PRESSES *= self.mask
        self.log_episode_return_PHONES_CLEANED *= self.mask
        self.log_episode_return_DIRT_CLEANED *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_reshaped_return_PERFORMANCE *= self.mask
        self.log_episode_reshaped_return_BUTTON_PRESSES *= self.mask
        self.log_episode_reshaped_return_PHONES_CLEANED *= self.mask
        self.log_episode_reshaped_return_DIRT_CLEANED *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        if self.acmodel.recurrent:
            _, next_value, _ = self.acmodel(
                preprocessed_obs, self.memory * self.mask.unsqueeze(1))
        else:
            _, next_value = self.acmodel(preprocessed_obs)

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Define experiences:
    #   the whole experience is the concatenation of the experience
    #   of each process.
    # In comments below:
    #   - T is self.num_frames_per_proc,
    #   - P is self.num_procs,
    #   - D is the dimensionality.
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    if self.acmodel.recurrent:
        # T x P x D -> P x T x D -> (P * T) x D
        exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
        # T x P -> P x T -> (P * T) x 1
        exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    # Preprocess experiences
    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values
    keep = max(self.log_done_counter, self.num_procs)

    logs = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "messes_per_episode": self.log_return_MESSES[-keep:],
        "performance_full_per_episode": self.log_return_PERFORMANCE_FULL[-keep:],
        "performance_per_episode": self.log_return_PERFORMANCE[-keep:],
        "reshaped_performance_per_episode": self.log_reshaped_return_PERFORMANCE[-keep:],
        "buttons_per_episode": self.log_return_BUTTON_PRESSES[-keep:],
        "reshaped_buttons_per_episode": self.log_reshaped_return_BUTTON_PRESSES[-keep:],
        "phones_per_episode": self.log_return_PHONES_CLEANED[-keep:],
        "reshaped_phones_per_episode": self.log_reshaped_return_PHONES_CLEANED[-keep:],
        "dirt_per_episode": self.log_return_DIRT_CLEANED[-keep:],
        "reshaped_dirt_per_episode": self.log_reshaped_return_DIRT_CLEANED[-keep:],
        "numberOfPermutes": info[0]['numberOfPermutes'],
        "buttonValue": info[0]['buttonValue'],
        "episodesDone": self.log_done_counter,
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, logs

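# --- Illustrative sketch (not part of the original code) ---
# In the logging code above, `keep = max(self.log_done_counter, self.num_procs)`
# means each update reports every episode finished during that update, and at
# least `num_procs` episode entries (padding with episodes carried over from
# earlier updates when fewer finished). A hypothetical consumer of `logs` would
# typically average those per-episode lists, for example:
def summarize_logs(logs):
    returns = logs["return_per_episode"]
    frames = logs["num_frames_per_episode"]
    return {
        "mean_return": sum(returns) / max(len(returns), 1),
        "mean_episode_length": sum(frames) / max(len(frames), 1),
        "frames_this_update": logs["num_frames"],
    }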