def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc. as attributes.
        Each attribute, e.g. `exps.reward`, has a shape
        (self.num_frames_per_proc * num_envs, ...). The k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to
        mix data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device)
        preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device)

        with torch.no_grad():
            model_results0 = self.acmodel0(
                preprocessed_obs1,
                self.memory0 * self.mask0.unsqueeze(1))  ### NOTE

            dist0 = model_results0['dist']  ### NOTE
            value0 = model_results0['value']
            memory0 = model_results0['memory']
            msg0 = model_results0['message']
            dists_speaker0 = model_results0['dists_speaker']
            extra_predictions0 = model_results0['extra_predictions']
            #self.rng_states0[i] = model_results0['rng_states']
            #if torch.cuda.is_available():
            #    self.cuda_rng_states0[i] = model_results0['cuda_rng_states']

            # Zero out agent 1's own instruction and image so it must rely
            # on the message received from agent 0.
            preprocessed_obs0.instr *= 0
            preprocessed_obs0.image *= 0

            model_results1 = self.acmodel1(
                preprocessed_obs0,
                self.memory1 * self.mask1.unsqueeze(1),
                msg=(msg0.transpose(0, 1) *
                     self.mask1.unsqueeze(1).unsqueeze(2)).transpose(0, 1))  ### NOTE

            dist1 = model_results1['dist']
            value1 = model_results1['value']
            memory1 = model_results1['memory']
            msg1 = model_results1['message']
            dists_speaker1 = model_results1['dists_speaker']
            extra_predictions1 = model_results1['extra_predictions']
            #self.rng_states1[i] = model_results1['rng_states']
            #if torch.cuda.is_available():
            #    self.cuda_rng_states1[i] = model_results1['cuda_rng_states']

        #state = torch.get_rng_state()
        action0 = dist0.sample()
        #torch.set_rng_state(state)
        action1 = dist1.sample()

        obs0, reward0, done0, env_info0 = self.env0.step(action0.cpu().numpy())
        obs1, reward1, done1, env_info1 = self.env1.step(action1.cpu().numpy())

        # Mask any rewards based on the (previous) been_done flags
        rewardos0 = [0] * self.num_procs
        rewardos1 = [0] * self.num_procs
        for j in range(self.num_procs):
            rewardos0[j] = reward0[j] * (1 - self.been_done0[j].item())
            rewardos1[j] = reward1[j] * (1 - self.been_done1[j].item())
        reward0 = tuple(rewardos0)
        reward1 = tuple(rewardos1)

        #reward0 = tuple(0.5 * r0 + 0.5 * r1 for r0, r1 in zip(reward0, reward1))  ### NOTE
        #reward1 = reward0

        # Reward the sender agent (0) equally for the success of the
        # receiver agent (1).  ### NOTE
        reward0 = reward1

        self.been_done0 = (1 - (1 - self.been_done0) *
                           (1 - torch.tensor(done0, device=self.device, dtype=torch.float)))
        self.been_done1 = (1 - (1 - self.been_done1) *
                           (1 - torch.tensor(done1, device=self.device, dtype=torch.float)))
        both_done = self.been_done0 * self.been_done1

        # Reset if the receiver agent (1) is done.  ### NOTE
        both_done = self.been_done1

        obs0 = self.env0.sync_reset(both_done, obs0)
        obs1 = self.env1.sync_reset(both_done, obs1)

        if self.aux_info:
            env_info0 = self.aux_info_collector0.process(env_info0)
            # env_info0 = self.process_aux_info0(env_info0)
            env_info1 = self.aux_info_collector1.process(env_info1)
            # env_info1 = self.process_aux_info1(env_info1)

        # Update experiences values

        self.obss0[i] = self.obs0
        self.obs0 = obs0
        self.obss1[i] = self.obs1
        self.obs1 = obs1

        self.memories0[i] = self.memory0
        self.memory0 = memory0
        self.memories1[i] = self.memory1
        self.memory1 = memory1

        self.msgs0[i] = self.msg0
        self.msg0 = msg0
        self.msgs1[i] = self.msg1
        self.msg1 = msg1

        self.msgs_out0[i] = msg0
        self.msgs_out1[i] = msg1

        self.masks0[i] = self.mask0
        #self.mask0 = 1 - torch.tensor(done0, device=self.device, dtype=torch.float)
        self.mask0 = 1 - both_done
        self.actions0[i] = action0
        self.values0[i] = value0
        if self.reshape_reward is not None:
            self.rewards0[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs0, action0, reward0, done0)
            ], device=self.device)
        else:
            self.rewards0[i] = torch.tensor(reward0, device=self.device)
        self.log_probs0[i] = dist0.log_prob(action0)
        self.speaker_log_probs0[i] = self.acmodel0.speaker_log_prob(dists_speaker0, msg0)

        self.masks1[i] = self.mask1
        #self.mask1 = 1 - torch.tensor(done1, device=self.device, dtype=torch.float)
        self.mask1 = 1 - both_done
        self.actions1[i] = action1
        self.values1[i] = value1
        if self.reshape_reward is not None:
            self.rewards1[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs1, action1, reward1, done1)
            ], device=self.device)
        else:
            self.rewards1[i] = torch.tensor(reward1, device=self.device)
        self.log_probs1[i] = dist1.log_prob(action1)
        self.speaker_log_probs1[i] = self.acmodel1.speaker_log_prob(dists_speaker1, msg1)

        if self.aux_info:
            self.aux_info_collector0.fill_dictionaries(i, env_info0, extra_predictions0)
            self.aux_info_collector1.fill_dictionaries(i, env_info1, extra_predictions1)

        # Update log values

        self.log_episode_return0 += torch.tensor(reward0, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return0 += self.rewards0[i]
        self.log_episode_return1 += torch.tensor(reward1, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return1 += self.rewards1[i]

        self.log_episode_num_frames0 += torch.ones(self.num_procs, device=self.device)
        self.log_episode_num_frames1 += torch.ones(self.num_procs, device=self.device)

        # Loop over `j` here so the frame index `i` of the outer loop is
        # not shadowed.
        #for j, done_ in enumerate(done0):
        for j in range(self.num_procs):
            #if done_:
            if both_done[j]:
                self.log_done_counter0 += 1
                self.log_return0.append(self.log_episode_return0[j].item())
                self.log_reshaped_return0.append(self.log_episode_reshaped_return0[j].item())
                self.log_num_frames0.append(self.log_episode_num_frames0[j].item())

                #for j, done_ in enumerate(done1):
                #if done_:
                self.log_done_counter1 += 1
                self.log_return1.append(self.log_episode_return1[j].item())
                self.log_reshaped_return1.append(self.log_episode_reshaped_return1[j].item())
                self.log_num_frames1.append(self.log_episode_num_frames1[j].item())

        # If both are done, reset both to not done
        self.been_done0 *= (1 - both_done)
        self.been_done1 *= (1 - both_done)

        self.log_episode_return0 *= self.mask0
        self.log_episode_reshaped_return0 *= self.mask0
        self.log_episode_num_frames0 *= self.mask0
        self.log_episode_return1 *= self.mask1
        self.log_episode_reshaped_return1 *= self.mask1
        self.log_episode_num_frames1 *= self.mask1

    # Add advantage and return to experiences

    preprocessed_obs0 = self.preprocess_obss(self.obs0, device=self.device)
    preprocessed_obs1 = self.preprocess_obss(self.obs1, device=self.device)

    with torch.no_grad():
        tmp = self.acmodel0(preprocessed_obs1,
                            self.memory0 * self.mask0.unsqueeze(1))  ### NOTE
        next_value0 = tmp['value']

        preprocessed_obs0.instr *= 0
        preprocessed_obs0.image *= 0

        next_value1 = self.acmodel1(
            preprocessed_obs0,
            self.memory1 * self.mask1.unsqueeze(1),
            msg=(tmp['message'].transpose(0, 1) *
                 self.mask1.unsqueeze(1).unsqueeze(2)).transpose(0, 1))['value']  ### NOTE

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask0 = self.masks0[i + 1] if i < self.num_frames_per_proc - 1 else self.mask0
        next_value0 = self.values0[i + 1] if i < self.num_frames_per_proc - 1 else next_value0
        next_advantage0 = self.advantages0[i + 1] if i < self.num_frames_per_proc - 1 else 0

        next_mask1 = self.masks1[i + 1] if i < self.num_frames_per_proc - 1 else self.mask1
        next_value1 = self.values1[i + 1] if i < self.num_frames_per_proc - 1 else next_value1
        next_advantage1 = self.advantages1[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta0 = self.rewards0[i] + self.discount * next_value0 * next_mask0 - self.values0[i]
        self.advantages0[i] = delta0 + self.discount * self.gae_lambda * next_advantage0 * next_mask0

        delta1 = self.rewards1[i] + self.discount * next_value1 * next_mask1 - self.values1[i]
        self.advantages1[i] = delta1 + self.discount * self.gae_lambda * next_advantage1 * next_mask1

    # Flatten the data correctly, making sure that
    # each episode's data is a continuous chunk

    exps0 = DictList()
    exps0.obs = [self.obss0[i][j]
                 for j in range(self.num_procs)
                 for i in range(self.num_frames_per_proc)]
    exps1 = DictList()
    exps1.obs = [self.obss1[i][j]
                 for j in range(self.num_procs)
                 for i in range(self.num_frames_per_proc)]

    # In comments below T is self.num_frames_per_proc, P is self.num_procs,
    # D is the dimensionality

    # T x P x D -> P x T x D -> (P * T) x D
    exps0.memory = self.memories0.transpose(0, 1).reshape(-1, *self.memories0.shape[2:])
    exps1.memory = self.memories1.transpose(0, 1).reshape(-1, *self.memories1.shape[2:])

    exps0.message = self.msgs0.transpose(1, 2).transpose(0, 1).reshape(
        -1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols)
    exps1.message = self.msgs1.transpose(1, 2).transpose(0, 1).reshape(
        -1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols)

    exps0.message_out = self.msgs_out0.transpose(1, 2).transpose(0, 1).reshape(
        -1, self.acmodel0.max_len_msg, self.acmodel0.num_symbols)
    exps1.message_out = self.msgs_out1.transpose(1, 2).transpose(0, 1).reshape(
        -1, self.acmodel1.max_len_msg, self.acmodel1.num_symbols)

    #exps0.rng_states = self.rng_states0.transpose(0, 1).reshape(-1, *self.rng_states0.shape[2:])
    #if torch.cuda.is_available():
    #    exps0.cuda_rng_states = self.cuda_rng_states0.transpose(0, 1).reshape(-1, *self.cuda_rng_states0.shape[2:])
    #exps1.rng_states = self.rng_states1.transpose(0, 1).reshape(-1, *self.rng_states1.shape[2:])
    #if torch.cuda.is_available():
    #    exps1.cuda_rng_states = self.cuda_rng_states1.transpose(0, 1).reshape(-1, *self.cuda_rng_states1.shape[2:])

    # T x P -> P x T -> (P * T) x 1
    exps0.mask = self.masks0.transpose(0, 1).reshape(-1).unsqueeze(1)
    exps1.mask = self.masks1.transpose(0, 1).reshape(-1).unsqueeze(1)

    # For all tensors below, T x P -> P x T -> P * T
    exps0.action = self.actions0.transpose(0, 1).reshape(-1)
    exps0.value = self.values0.transpose(0, 1).reshape(-1)
    exps0.reward = self.rewards0.transpose(0, 1).reshape(-1)
    exps0.advantage = self.advantages0.transpose(0, 1).reshape(-1)
    exps0.returnn = exps0.value + exps0.advantage
    exps0.log_prob = self.log_probs0.transpose(0, 1).reshape(-1)
    exps0.speaker_log_prob = self.speaker_log_probs0.transpose(0, 1).reshape(-1)

    exps1.action = self.actions1.transpose(0, 1).reshape(-1)
    exps1.value = self.values1.transpose(0, 1).reshape(-1)
    exps1.reward = self.rewards1.transpose(0, 1).reshape(-1)
    exps1.advantage = self.advantages1.transpose(0, 1).reshape(-1)
    exps1.returnn = exps1.value + exps1.advantage
    exps1.log_prob = self.log_probs1.transpose(0, 1).reshape(-1)
    exps1.speaker_log_prob = self.speaker_log_probs1.transpose(0, 1).reshape(-1)

    if self.aux_info:
        exps0 = self.aux_info_collector0.end_collection(exps0)
        exps1 = self.aux_info_collector1.end_collection(exps1)

    # Preprocess experiences

    exps0.obs = self.preprocess_obss(exps0.obs, device=self.device)
    exps1.obs = self.preprocess_obss(exps1.obs, device=self.device)

    # Log some values

    keep0 = max(self.log_done_counter0, self.num_procs)
    keep1 = max(self.log_done_counter1, self.num_procs)

    log0 = {
        "return_per_episode": self.log_return0[-keep0:],
        "reshaped_return_per_episode": self.log_reshaped_return0[-keep0:],
        "num_frames_per_episode": self.log_num_frames0[-keep0:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter0,
    }
    log1 = {
        "return_per_episode": self.log_return1[-keep1:],
        "reshaped_return_per_episode": self.log_reshaped_return1[-keep1:],
        "num_frames_per_episode": self.log_num_frames1[-keep1:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter1,
    }

    self.log_done_counter0 = 0
    self.log_return0 = self.log_return0[-self.num_procs:]
    self.log_reshaped_return0 = self.log_reshaped_return0[-self.num_procs:]
    self.log_num_frames0 = self.log_num_frames0[-self.num_procs:]

    self.log_done_counter1 = 0
    self.log_return1 = self.log_return1[-self.num_procs:]
    self.log_reshaped_return1 = self.log_reshaped_return1[-self.num_procs:]
    self.log_num_frames1 = self.log_num_frames1[-self.num_procs:]

    return exps0, log0, exps1, log1
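# The two backward loops above implement generalized advantage estimation
# (GAE); a minimal standalone sketch of the same recursion follows, assuming
# plain (T, P) reward/value/mask tensors. The function name and arguments
# are illustrative, not part of this codebase.
import torch

def compute_gae(rewards, values, masks, last_value, last_mask,
                discount=0.99, gae_lambda=0.95):
    """Return (T, P) advantages for T frames and P parallel envs.

    `masks[t]` is 0 where the episode had ended entering frame t, which
    cuts the bootstrap so credit does not leak across episode boundaries;
    `last_value`/`last_mask` bootstrap the final frame of the rollout.
    """
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    next_value, next_mask = last_value, last_mask
    next_advantage = torch.zeros_like(last_value)
    for t in reversed(range(T)):
        delta = rewards[t] + discount * next_value * next_mask - values[t]
        advantages[t] = delta + discount * gae_lambda * next_advantage * next_mask
        next_value, next_advantage, next_mask = values[t], advantages[t], masks[t]
    return advantages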
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc. as attributes.
        Each attribute, e.g. `exps.reward`, has a shape
        (self.num_frames_per_proc * num_envs, ...). The k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to
        mix data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            model_results = self.acmodel(preprocessed_obs,
                                         self.memory * self.mask.unsqueeze(1))
            dist = model_results['dist']
            value = model_results['value']
            memory = model_results['memory']
            extra_predictions = model_results['extra_predictions']
        action = dist.sample()

        obs, reward, done, env_info = self.env.step(action.cpu().numpy())
        if self.aux_info:
            env_info = self.aux_info_collector.process(env_info)
            # env_info = self.process_aux_info(env_info)

        # Update experiences values

        self.obss[i] = self.obs
        self.obs = obs
        self.memories[i] = self.memory
        self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        if self.aux_info:
            self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

        # Update log values

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences

    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        next_value = self.acmodel(preprocessed_obs,
                                  self.memory * self.mask.unsqueeze(1))['value']

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Flatten the data correctly, making sure that
    # each episode's data is a continuous chunk

    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    # In comments below T is self.num_frames_per_proc, P is self.num_procs,
    # D is the dimensionality

    # T x P x D -> P x T x D -> (P * T) x D
    exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
    # T x P -> P x T -> (P * T) x 1
    exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    if self.aux_info:
        exps = self.aux_info_collector.end_collection(exps)

    # Preprocess experiences

    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values

    keep = max(self.log_done_counter, self.num_procs)

    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter,
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
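# Why transpose before reshape: a plain reshape of a T x P tensor would
# interleave environments, while transpose(0, 1) first makes the k-th block
# of T consecutive entries come from the k-th environment, as the docstring
# above promises. A small self-contained check (illustrative sizes only):
import torch

T, P = 3, 2  # frames per process, number of processes
x = torch.arange(T * P).reshape(T, P)  # x[t, p] = frame t of env p
flat = x.transpose(0, 1).reshape(-1)
# env 0 occupies flat[0:T], env 1 occupies flat[T:2T]
assert flat.tolist() == [0, 2, 4, 1, 3, 5]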
def update_parameters(self):
    # Collect experiences

    t_collect, (exps, logs) = timer(self.collect_experiences)()
    logs['t_collect'] = t_collect
    '''
    exps is a DictList with the following keys ['obs', 'memory', 'mask',
    'action', 'value', 'reward', 'advantage', 'returnn', 'log_prob'] and
    ['collected_info', 'extra_predictions'] if we use aux_info
    exps.obs is a DictList with the following keys ['image', 'instr']
    exps.obs.image is a (n_procs * n_frames_per_proc) x image_size 4D tensor
    exps.obs.instr is a (n_procs * n_frames_per_proc) x (max number of words
        in an instruction) 2D tensor
    exps.memory is a (n_procs * n_frames_per_proc) x (memory_size =
        2 * image_embedding_size) 2D tensor
    exps.mask is a (n_procs * n_frames_per_proc) x 1 2D tensor
    if we use aux_info: exps.collected_info and exps.extra_predictions are
        DictLists with keys being the added information. They are either
        (n_procs * n_frames_per_proc) 1D tensors or
        (n_procs * n_frames_per_proc) x k 2D tensors where k is the number
        of classes for multiclass classification
    '''
    t0_train = time.time()
    t_details_train_forward_model = {}
    t_train_backward = 0

    for _ in range(self.epochs):
        # Initialize log values

        log_entropies = []
        log_values = []
        log_policy_losses = []
        log_value_losses = []
        log_grad_norms = []
        log_losses = []

        '''
        For each epoch, we create int(total_frames / batch_size + 1) batches,
        each of size batch_size (except maybe the last one). Each batch is
        divided into sub-batches of size recurrence (frames are contiguous
        in a sub-batch), but the position of each sub-batch in a batch and
        the position of each batch in the whole list of frames is random
        thanks to self._get_batches_starting_indexes().
        '''
        for inds in self._get_batches_starting_indexes():
            # inds is a numpy array of indices that correspond to the
            # beginning of a sub-batch; there are as many inds as there
            # are batches

            # Initialize batch values

            batch_entropy = 0
            batch_value = 0
            batch_policy_loss = 0
            batch_value_loss = 0
            batch_loss = 0

            # Initialize memory: extract the first memories, expanding each
            # sub-batch index into its block of memory_dim[0] memory slots

            inds_mem = [
                item for sublist in [
                    list(range(self.acmodel.memory_dim[0] * i,
                               self.acmodel.memory_dim[0] * i + self.acmodel.memory_dim[0]))
                    for i in inds
                ] for item in sublist
            ]
            memory = exps.memory[inds_mem]
            all_obs_inds = exps.obs[1].image

            sb = DictList()
            for i in range(self.recurrence):
                # Extract scene-level quantities
                sb.action = exps.action[inds + i]
                sb.log_prob = exps.log_prob[inds + i]
                sb.advantage = exps.advantage[inds + i]
                sb.value = exps.value[inds + i]
                sb.returnn = exps.returnn[inds + i]

                m_batch = torch.IntTensor([
                    j + i for j in inds for _ in range(self.acmodel.memory_dim[0])
                ])

                # Extract sub-batch of observations and observation batch indices
                sb.obs = torch.zeros((0, self.acmodel.image_dim))
                sb.obs_batch = torch.zeros(0).int()
                for j in inds + i:
                    idx_j = all_obs_inds == j
                    sb.obs = torch.cat([sb.obs, exps.obs[0].image[idx_j]], dim=0)
                    sb.obs_batch = torch.cat([sb.obs_batch, exps.obs[1].image[idx_j]], dim=0)
                # TODO rename obs[0] and obs[1] into obs.obs and obs.obs_batch

                # Reshape mask
                sb.mask = exps.mask[list(
                    numpy.array(inds_mem) + self.acmodel.memory_dim[0] * i)].flatten()

                # Compute loss

                model_results = self.acmodel(sb.obs, sb.mask.unsqueeze(1) * memory,
                                             sb.obs_batch, m_batch)
                dist = model_results['dist']
                value = model_results['value']
                memory = model_results['memory']
                extra_predictions = model_results['extra_predictions']

                entropy = dist.entropy().mean()

                t_details_train_forward_model = cumulate_value(
                    t_details_train_forward_model, model_results['log_time'])

                ratio = torch.exp(dist.log_prob(sb.action) - sb.log_prob)
                surr1 = ratio * sb.advantage
                surr2 = torch.clamp(ratio, 1.0 - self.clip_eps,
                                    1.0 + self.clip_eps) * sb.advantage
                policy_loss = -torch.min(surr1, surr2).mean()

                value_clipped = sb.value + torch.clamp(value - sb.value,
                                                       -self.clip_eps, self.clip_eps)
                surr1 = (value - sb.returnn).pow(2)
                surr2 = (value_clipped - sb.returnn).pow(2)
                value_loss = torch.max(surr1, surr2).mean()

                loss = policy_loss - self.entropy_coef * entropy \
                    + self.value_loss_coef * value_loss

                # Update batch values

                batch_entropy += entropy.item()
                batch_value += value.mean().item()
                batch_policy_loss += policy_loss.item()
                batch_value_loss += value_loss.item()
                batch_loss += loss

                # Update memories for next epoch

                if i < self.recurrence - 1:
                    exps.memory[list(
                        numpy.array(inds_mem) + self.acmodel.memory_dim[0] * (i + 1)
                    )] = memory.detach()

            # Update batch values

            batch_entropy /= self.recurrence
            batch_value /= self.recurrence
            batch_policy_loss /= self.recurrence
            batch_value_loss /= self.recurrence
            batch_loss /= self.recurrence

            # Update actor-critic

            t0_train_backward = time.time()
            self.optimizer.zero_grad()
            batch_loss.backward()
            grad_norm = sum(p.grad.data.norm(2) ** 2
                            for p in self.acmodel.parameters()
                            if p.grad is not None) ** 0.5
            torch.nn.utils.clip_grad_norm_(self.acmodel.parameters(), self.max_grad_norm)
            self.optimizer.step()
            t_train_backward += time.time() - t0_train_backward

            # Update log values

            log_entropies.append(batch_entropy)
            log_values.append(batch_value)
            log_policy_losses.append(batch_policy_loss)
            log_value_losses.append(batch_value_loss)
            log_grad_norms.append(grad_norm.item())
            log_losses.append(batch_loss.item())

    t_train = time.time() - t0_train

    # Log some values

    logs["entropy"] = numpy.mean(log_entropies)
    logs["value"] = numpy.mean(log_values)
    logs["policy_loss"] = numpy.mean(log_policy_losses)
    logs["value_loss"] = numpy.mean(log_value_losses)
    logs["grad_norm"] = numpy.mean(log_grad_norms)
    logs["loss"] = numpy.mean(log_losses)
    logs['t_collect'] = t_collect
    logs['t_train'] = t_train
    logs['t_details_train_forward_model'] = t_details_train_forward_model
    logs['t_backward'] = t_train_backward

    return logs
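# The policy and value losses above are the standard PPO clipped objectives;
# a condensed sketch with plain tensor arguments in place of the sub-batch
# `sb` (function and argument names are illustrative, not from this codebase):
import torch

def ppo_losses(new_log_prob, old_log_prob, advantage,
               new_value, old_value, returnn, clip_eps=0.2):
    # Clipped surrogate policy loss: the probability ratio is clamped so a
    # single update cannot move the policy too far from the one that
    # collected the data.
    ratio = torch.exp(new_log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    policy_loss = -torch.min(surr1, surr2).mean()

    # Clipped value loss: the new value prediction is likewise kept within
    # clip_eps of the value recorded during collection.
    value_clipped = old_value + torch.clamp(new_value - old_value,
                                            -clip_eps, clip_eps)
    value_loss = torch.max((new_value - returnn).pow(2),
                           (value_clipped - returnn).pow(2)).mean()
    return policy_loss, value_loss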
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc. as attributes.
        Each attribute, e.g. `exps.reward`, has a shape
        (self.num_frames_per_proc * num_envs, ...). The k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to
        mix data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    t0 = time.time()
    t_forward_process = 0
    t_forward_step = 0
    t_details_forward_model = {}

    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        tt_process, preprocessed_obs = timer(self.preprocess_obss)(self.obs,
                                                                   device=self.device)
        t_forward_process += tt_process
        obs_flat = preprocessed_obs.image[0]
        obs_batch = preprocessed_obs.image[1]
        with torch.no_grad():
            model_results = self.acmodel(obs_flat,
                                         self.mask.unsqueeze(1) * self.memory,
                                         obs_batch, self.m_batch)
            dist = model_results['dist']
            value = model_results['value'].flatten()
            memory = model_results['memory']
            extra_predictions = model_results['extra_predictions']
            t_details_forward_model = cumulate_value(t_details_forward_model,
                                                     model_results['log_time'])
        action = dist.sample()

        tt_step, (obs, reward, done, env_info) = timer(self.env.step)(action.cpu().numpy())
        t_forward_step += tt_step

        if self.aux_info:
            env_info = self.aux_info_collector.process(env_info)
            # env_info = self.process_aux_info(env_info)

        # Update experiences values

        self.obss[i] = self.obs
        self.obs = obs
        self.memories[i] = self.memory
        self.memory = memory
        self.masks[i] = self.mask
        # Each process owns memory_size[0] memory slots, so the done flag is
        # expanded across its whole slot block before masking.
        done_as_int = torch.tensor(done, device=self.device, dtype=torch.float).unsqueeze(1)
        self.mask = 1 - done_as_int.expand(done_as_int.shape[0],
                                           self.acmodel.memory_size[0]).flatten()
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        if self.aux_info:
            self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

        # Update log values

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        # Recover one mask entry per process from the slot-expanded mask
        episode_mask = torch.tensor([
            self.mask[i * self.acmodel.memory_size[0]] for i in range(self.num_procs)
        ])
        self.log_episode_return *= episode_mask
        self.log_episode_reshaped_return *= episode_mask
        self.log_episode_num_frames *= episode_mask

    t_collect_forward = time.time() - t0

    # Add advantage and return to experiences

    t0 = time.time()
    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        # TODO: Add split obs_flat, obs_batch in preprocess_obss ?
        obs_flat = preprocessed_obs.image[0]
        obs_batch = preprocessed_obs.image[1]
        next_value = self.acmodel(obs_flat,
                                  self.mask.unsqueeze(1) * self.memory,
                                  obs_batch, self.m_batch)['value'].flatten()

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = torch.tensor([
            self.masks[i + 1][j * self.acmodel.memory_size[0]]
            for j in range(self.num_procs)
        ]) if i < self.num_frames_per_proc - 1 else torch.tensor([
            self.mask[j * self.acmodel.memory_size[0]]
            for j in range(self.num_procs)
        ])
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    t_collect_backward = time.time() - t0

    # Flatten the data correctly, making sure that
    # each episode's data is a continuous chunk

    t0 = time.time()
    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    # In comments below T is self.num_frames_per_proc, P is self.num_procs,
    # D is the dimensionality and M the number of memory slots

    # T x (P * M) x D -> T x P x M x D -> P x T x M x D -> (P * T * M) x D
    exps.memory = self.memories.reshape(
        (self.num_frames_per_proc, self.num_procs,
         self.acmodel.memory_size[0], self.acmodel.memory_size[1])
    ).transpose(0, 1).reshape(-1, *self.memories.shape[2:])
    # T x (P * M) -> T x P x M -> P x T x M -> (P * T * M) x 1
    exps.mask = self.masks.reshape(
        self.num_frames_per_proc, self.num_procs, self.acmodel.memory_size[0]
    ).transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)
    t_organize_exp = time.time() - t0

    if self.aux_info:
        exps = self.aux_info_collector.end_collection(exps)

    # Preprocess experiences

    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values

    keep = max(self.log_done_counter, self.num_procs)

    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter,
        "t_collect_forward": t_collect_forward,
        "t_details_forward_model": t_details_forward_model,
        "t_forward_process": t_forward_process,
        "t_forward_step": t_forward_step,
        "t_collect_backward": t_collect_backward,
        "t_collect_organize": t_organize_exp,
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
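# The slot-memory variant stores memories as T x (P * M) x D and must first
# split the P and M axes apart before the usual P-major flatten; a tiny shape
# check of the reshape chain used above (sizes are illustrative):
import torch

T, P, M, D = 4, 2, 3, 5  # frames, processes, memory slots, slot dimension
memories = torch.randn(T, P * M, D)
flat = memories.reshape(T, P, M, D).transpose(0, 1).reshape(-1, D)
assert flat.shape == (P * T * M, D)
# flat[0 : T * M] all belongs to process 0, with the M slots of each
# frame stored contiguously.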
def __getitem__(self, index):
    return DictList({
        key: [subvalue[index] for subvalue in value]
        for key, value in dict.items(self)
    })
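# This __getitem__ differs from the usual DictList indexing (key: value[index]):
# here each value is assumed to be a *list* of per-source tensors, and indexing
# slices every element of that list. A minimal usage sketch, assuming DictList
# also exposes keys as attributes the way the upstream babyai DictList does:
import torch

d = DictList({"image": [torch.randn(8, 3), torch.randn(8, 3)]})
sub = d[2:4]  # DictList({"image": [2 x 3 tensor, 2 x 3 tensor]})
assert len(sub.image) == 2 and sub.image[0].shape == (2, 3)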
def collect_experiences(self, teacher_dict):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc. as attributes.
        Each attribute, e.g. `exps.reward`, has a shape
        (self.num_frames_per_proc * num_envs, ...). The k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to
        mix data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    teacher_keys = list(teacher_dict.keys())
    all_teachers_dict = dict(zip(teacher_keys, [True] * len(teacher_keys)))
    # TODO: Make this handle the case where the meta_rollout length > 1
    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        preprocessed_obs = self.preprocess_obss(self.obs, teacher_dict)
        with torch.no_grad():
            dist, model_results = self.acmodel(preprocessed_obs,
                                               self.memory * self.mask.unsqueeze(1))
            value = model_results['value']
            memory = model_results['memory']
            extra_predictions = None
        action = dist.sample()

        obs, reward, done, env_info = self.env.step(action.cpu().numpy())

        # Update experiences values

        self.env_infos[i] = env_info
        self.obss[i] = self.obs
        self.obs = obs
        self.teacher_actions[i] = torch.FloatTensor(
            [ei['teacher_action'][0] for ei in env_info]).to(self.device)
        self.memories[i] = self.memory
        self.memory = memory
        self.masks[i] = self.mask
        # A meta-episode only ends after rollouts_per_meta_task episodes;
        # done_index counts episode terminations per process.
        done_tensor = torch.FloatTensor(done).to(self.device)
        self.done_index = done_tensor + self.done_index
        done_meta = self.done_index == self.rollouts_per_meta_task
        self.done_index = torch.remainder(self.done_index, self.rollouts_per_meta_task)
        self.dones[i] = done_tensor
        self.mask = 1 - done_meta.to(torch.int32)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        if self.aux_info:
            self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

        # Update log values

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_success += torch.tensor([e['success'] for e in env_info],
                                                 device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_success.append(self.log_episode_success[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_success *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # Add advantage and return to experiences

    preprocessed_obs = self.preprocess_obss(self.obs, teacher_dict)
    with torch.no_grad():
        next_value = self.acmodel(preprocessed_obs,
                                  self.memory * self.mask.unsqueeze(1))[1]['value']

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Flatten the data correctly, making sure that
    # each episode's data is a continuous chunk

    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]

    # Stack the per-step env_info dicts into one array per key, in the same
    # P-major order as the flattened tensors below. env_info from the final
    # step supplies the key set.
    keys = list(env_info[0].keys())
    batch = len(env_info)
    timesteps = len(self.env_infos)
    env_info_dict = {}
    for k in keys:
        arr = []
        for b in range(batch):
            for t in range(timesteps):
                arr.append(self.env_infos[t][b][k])
        env_info_dict[k] = np.stack(arr)
    env_info_dict = DictList(env_info_dict)
    exps.env_infos = env_info_dict

    # In comments below T is self.num_frames_per_proc, P is self.num_procs,
    # D is the dimensionality

    # T x P x D -> P x T x D -> (P * T) x D
    exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
    # T x P -> P x T -> (P * T) x 1
    exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)
    exps.teacher_action = self.teacher_actions.transpose(0, 1).reshape(-1)
    exps.done = self.dones.transpose(0, 1).reshape(-1)

    if self.aux_info:
        exps = self.aux_info_collector.end_collection(exps)

    # Preprocess experiences

    exps.obs = self.preprocess_obss(exps.obs, all_teachers_dict)

    # Log some values

    keep = max(self.log_done_counter, self.num_procs)

    log = {
        "return_per_episode": self.log_return[-keep:],
        "success_per_episode": self.log_success[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter,
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_success = self.log_success[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
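# The env_infos above arrive as a T-long list of P-long lists of dicts; the
# stacking loop converts them to one array per key, in the same P-major order
# as every other flattened tensor. An equivalent comprehension per key, with
# illustrative data:
import numpy as np

env_infos = [[{"success": t * 10 + b} for b in range(2)] for t in range(3)]  # T=3, P=2
stacked = {k: np.stack([env_infos[t][b][k]
                        for b in range(2) for t in range(3)])
           for k in env_infos[0][0].keys()}
assert stacked["success"].tolist() == [0, 10, 20, 1, 11, 21]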
def collect_experiences(self):
    """Collects rollouts and computes advantages.

    Runs several environments concurrently. The next actions are computed
    in a batch mode for all environments at the same time. The rollouts
    and advantages from all environments are concatenated together.

    Returns
    -------
    exps : DictList
        Contains actions, rewards, advantages etc. as attributes.
        Each attribute, e.g. `exps.reward`, has a shape
        (self.num_frames_per_proc * num_envs, ...). The k-th block
        of consecutive `self.num_frames_per_proc` frames contains
        data obtained from the k-th environment. Be careful not to
        mix data from different environments!
    logs : dict
        Useful stats about the training process, including the average
        reward, policy loss, value loss, etc.
    """
    # Reset CPV buffer if needed.
    if self.reward_fn == 'cpv' or self.reward_fn == 'both':
        self.reset_cpv_buffer()

    for i in range(self.num_frames_per_proc):
        # Do one agent-environment interaction

        preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
        with torch.no_grad():
            model_results = self.acmodel(preprocessed_obs,
                                         self.memory * self.mask.unsqueeze(1))
            dist = model_results['dist']
            value = model_results['value']
            memory = model_results['memory']
            extra_predictions = model_results['extra_predictions']
        action = dist.sample()

        # Take a step in the env; the reward is post-processed after the
        # rollout if we are not using the default reward function.
        obs, old_reward, done, env_info = self.env.step(action.cpu().numpy())

        if self.reward_fn == 'cpv' or self.reward_fn == 'both':
            reward = old_reward
            # TODO Do we even need this if-else block here anymore?
            """
            unnormalized_reward = self.reward_model.calculate_reward(self.cpv_buffer, self.obs)
            if self.aux_info:
                env_info = self.aux_info_collector.process(env_info)
                env_info = self.process_aux_info(env_info)
            std = numpy.std(self.all_rewards) if self.all_rewards != [] else numpy.std(unnormalized_reward)
            mean = numpy.mean(self.all_rewards) if self.all_rewards != [] else numpy.mean(unnormalized_reward)
            reward = numpy.clip([(r - mean) / std for r in unnormalized_reward], 0, 1)
            self.all_rewards.extend(unnormalized_reward)
            if len(self.all_rewards) > 1000:
                self.all_rewards = self.all_rewards[-1000:]
            """
        elif self.reward_fn == 'babyai':
            reward = old_reward

        if self.aux_info:
            env_info = self.aux_info_collector.process(env_info)
            #env_info = self.process_aux_info(env_info)

        # Update experiences values

        self.obss[i] = self.obs
        self.obs = obs
        self.memories[i] = self.memory
        self.memory = memory
        self.masks[i] = self.mask
        self.mask = 1 - torch.tensor(done, device=self.device, dtype=torch.float)
        self.actions[i] = action
        self.values[i] = value
        if self.reshape_reward is not None:
            self.rewards[i] = torch.tensor([
                self.reshape_reward(obs_, action_, reward_, done_)
                for obs_, action_, reward_, done_ in zip(obs, action, reward, done)
            ], device=self.device)
        else:
            self.rewards[i] = torch.tensor(reward, device=self.device)
        self.log_probs[i] = dist.log_prob(action)

        if self.aux_info:
            self.aux_info_collector.fill_dictionaries(i, env_info, extra_predictions)

        # Update log values

        self.log_episode_return += torch.tensor(reward, device=self.device, dtype=torch.float)
        self.log_episode_reshaped_return += self.rewards[i]
        self.log_episode_num_frames += torch.ones(self.num_procs, device=self.device)

        for i, done_ in enumerate(done):
            if done_:
                self.log_done_counter += 1
                self.log_return.append(self.log_episode_return[i].item())
                self.log_reshaped_return.append(self.log_episode_reshaped_return[i].item())
                self.log_num_frames.append(self.log_episode_num_frames[i].item())

        self.log_episode_return *= self.mask
        self.log_episode_reshaped_return *= self.mask
        self.log_episode_num_frames *= self.mask

    # If CPV, recompute rewards based on the full trajectory.
    if self.reward_fn == 'cpv':
        # Make a single run through the CPV model to compute all rewards at once.
        self.rewards = self.reward_model.calculate_reward(self.obss).permute(1, 0)
        # TODO normalize rewards?
        std, mean = torch.std_mean(self.rewards, dim=1)
        std = std.view(-1, 1).expand_as(self.rewards)
        mean = mean.view(-1, 1).expand_as(self.rewards)
        # Standardize per process and clamp to [0, 1]
        self.rewards = torch.clamp((self.rewards - mean) / std, 0.0, 1.0)

    # Add advantage and return to experiences

    preprocessed_obs = self.preprocess_obss(self.obs, device=self.device)
    with torch.no_grad():
        next_value = self.acmodel(preprocessed_obs,
                                  self.memory * self.mask.unsqueeze(1))['value']

    for i in reversed(range(self.num_frames_per_proc)):
        next_mask = self.masks[i + 1] if i < self.num_frames_per_proc - 1 else self.mask
        next_value = self.values[i + 1] if i < self.num_frames_per_proc - 1 else next_value
        next_advantage = self.advantages[i + 1] if i < self.num_frames_per_proc - 1 else 0

        delta = self.rewards[i] + self.discount * next_value * next_mask - self.values[i]
        self.advantages[i] = delta + self.discount * self.gae_lambda * next_advantage * next_mask

    # Flatten the data correctly, making sure that
    # each episode's data is a continuous chunk

    exps = DictList()
    exps.obs = [self.obss[i][j]
                for j in range(self.num_procs)
                for i in range(self.num_frames_per_proc)]
    # In comments below T is self.num_frames_per_proc, P is self.num_procs,
    # D is the dimensionality

    # T x P x D -> P x T x D -> (P * T) x D
    exps.memory = self.memories.transpose(0, 1).reshape(-1, *self.memories.shape[2:])
    # T x P -> P x T -> (P * T) x 1
    exps.mask = self.masks.transpose(0, 1).reshape(-1).unsqueeze(1)
    # For all tensors below, T x P -> P x T -> P * T
    exps.action = self.actions.transpose(0, 1).reshape(-1)
    exps.value = self.values.transpose(0, 1).reshape(-1)
    exps.reward = self.rewards.transpose(0, 1).reshape(-1)
    exps.advantage = self.advantages.transpose(0, 1).reshape(-1)
    exps.returnn = exps.value + exps.advantage
    exps.log_prob = self.log_probs.transpose(0, 1).reshape(-1)

    if self.aux_info:
        exps = self.aux_info_collector.end_collection(exps)

    # Preprocess experiences

    exps.obs = self.preprocess_obss(exps.obs, device=self.device)

    # Log some values

    keep = max(self.log_done_counter, self.num_procs)

    log = {
        "return_per_episode": self.log_return[-keep:],
        "reshaped_return_per_episode": self.log_reshaped_return[-keep:],
        "num_frames_per_episode": self.log_num_frames[-keep:],
        "num_frames": self.num_frames,
        "episodes_done": self.log_done_counter,
    }

    self.log_done_counter = 0
    self.log_return = self.log_return[-self.num_procs:]
    self.log_reshaped_return = self.log_reshaped_return[-self.num_procs:]
    self.log_num_frames = self.log_num_frames[-self.num_procs:]

    return exps, log
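# The CPV branch normalizes each process's reward sequence to zero mean and
# unit variance and then clamps to [0, 1]; a standalone sketch of that
# transform on a P x T tensor (function name is illustrative):
import torch

def normalize_rewards(rewards, low=0.0, high=1.0):
    # Per-row standardization followed by clamping, mirroring the
    # std/mean/clamp steps in the CPV branch above.
    std, mean = torch.std_mean(rewards, dim=1, keepdim=True)
    return torch.clamp((rewards - mean) / std, low, high)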