def _get_loss_inputs_dict(self, batch, shuffle):
    """Return a feed dict from a batch.

    Arguments:
        batch (SampleBatch): batch of data to derive inputs from.
        shuffle (bool): whether to shuffle batch sequences. Shuffle may
            be done in-place. This only makes sense if you're further
            applying minibatch SGD after getting the outputs.

    Returns:
        Feed dict of data.
    """
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        batch,
        shuffle=shuffle,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self._batch_divisibility_req,
        feature_keys=[k for k, v in self._loss_inputs])

    # Build the feed dict from the batch.
    feed_dict = {}
    for k, ph in self._loss_inputs:
        feed_dict[ph] = batch[k]

    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    for k in state_keys:
        feed_dict[self._loss_input_dict[k]] = batch[k]
    if state_keys:
        feed_dict[self._seq_lens] = batch["seq_lens"]

    return feed_dict
def test_pad_batch_fixed_max(self):
    """Test pad_batch_to_sequences_of_same_size when dynamic_max = False."""
    view_requirements = {
        "state_in_0": ViewRequirement(
            "state_out_0",
            shift="-3:-1",
            used_for_training=False,
            used_for_compute_actions=True,
            batch_repeat_value=1,
        )
    }
    max_seq_len = 20
    num_seqs = np.random.randint(1, 20)
    seq_lens = np.random.randint(1, max_seq_len, size=(num_seqs,))
    sum_seq_lens = np.sum(seq_lens)
    s1 = SampleBatch(
        {
            "a": np.arange(sum_seq_lens),
            "b": np.arange(sum_seq_lens),
            "seq_lens": seq_lens,
            "state_in_0": [[0]] * num_seqs,
        },
        _max_seq_len=max_seq_len,
    )
    pad_batch_to_sequences_of_same_size(
        s1,
        max_seq_len=max_seq_len,
        feature_keys=["a", "b"],
        view_requirements=view_requirements,
    )
    check(s1.max_seq_len, max_seq_len)
    check(s1["a"].shape[0], max_seq_len * num_seqs)
    check(s1["b"].shape[0], max_seq_len * num_seqs)
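For intuition, here is a minimal NumPy-only sketch of the shape arithmetic the test above verifies: each variable-length sequence is copied into a zero-filled window of length max_seq_len, so a padded feature column always ends up with num_seqs * max_seq_len rows. The helper name zero_pad_sequences is ours, not RLlib's.

import numpy as np


def zero_pad_sequences(flat, seq_lens, max_seq_len):
    # Copy each variable-length sequence into a zero-filled window of
    # length max_seq_len, mirroring what the padding util does to a
    # time-major feature column.
    padded = np.zeros((len(seq_lens) * max_seq_len,) + flat.shape[1:],
                      dtype=flat.dtype)
    src = dst = 0
    for sl in seq_lens:
        padded[dst:dst + sl] = flat[src:src + sl]
        src += sl
        dst += max_seq_len
    return padded


seq_lens = [4, 7, 3]
flat = np.arange(sum(seq_lens))
padded = zero_pad_sequences(flat, seq_lens, max_seq_len=10)
assert padded.shape[0] == len(seq_lens) * 10  # 3 sequences * 10 steps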
def learn_on_batch(self, postprocessed_batch):
    # Callback handling.
    learn_stats = {}
    self.callbacks.on_learn_on_batch(
        policy=self, train_batch=postprocessed_batch, result=learn_stats
    )

    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        max_seq_len=self._max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )

    self._is_training = True
    postprocessed_batch = self._lazy_tensor_dict(postprocessed_batch)
    postprocessed_batch.set_training(True)
    stats = self._learn_on_batch_helper(postprocessed_batch)
    stats.update(
        {
            "custom_metrics": learn_stats,
            NUM_AGENT_STEPS_TRAINED: postprocessed_batch.count,
        }
    )
    return convert_to_numpy(stats)
def learn_on_batch(self, postprocessed_batch):
    # Set Model to train mode.
    if self.model:
        self.model.train()

    # Tile the per-sequence state inputs so there is one row per
    # timestep; this assumes all trajectories have the same length.
    for k, v in postprocessed_batch.items():
        if k.startswith("state_in"):
            postprocessed_batch[k] = np.tile(
                v, (postprocessed_batch.count // v.shape[0], 1))
    # Remove seq_lens so that `.copy()` can be used on minibatches.
    postprocessed_batch.seq_lens = None

    c = 0
    for ep in range(self.ppo_epochs):
        for mb in minibatches(postprocessed_batch, self.minibatch_size):
            c += 1
            # Pad the minibatch for RNNs.
            pad_batch_to_sequences_of_same_size(
                mb,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )
            mb["is_training"] = True
            mb["advantages"] = standardize(mb["advantages"])
            minibatch = self._lazy_tensor_dict(mb)

            # Compute the loss and its gradient.
            loss = ppo_surrogate_loss(
                self, self.model, self.dist_class, minibatch)
            self.optimizer.zero_grad()
            loss.backward()
            # Clip the gradient norm, if configured.
            if self.config["grad_clip"]:
                grad_norm = nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.config["grad_clip"])
            self.optimizer.step()

    # TODO: move this to the inner loop and use an average instead.
    stats = kl_and_loss_stats(self, postprocessed_batch)
    return {LEARNER_STATS_KEY: stats}
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    assert len(self.devices) == 1

    # If not done yet, see whether we have to zero-pad this batch.
    if not postprocessed_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            batch=postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    postprocessed_batch.set_training(True)
    self._lazy_tensor_dict(postprocessed_batch, device=self.devices[0])

    # Do the (maybe parallelized) gradient calculation step.
    tower_outputs = self._multi_gpu_parallel_grad_calc(
        [postprocessed_batch])

    all_grads, grad_info = tower_outputs[0]

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.stats_fn(postprocessed_batch))

    fetches = self.extra_compute_grad_fetches()

    return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
def learn_on_batch(self, postprocessed_batch):
    # Callback handling.
    learn_stats = {}
    self.callbacks.on_learn_on_batch(
        policy=self, train_batch=postprocessed_batch, result=learn_stats)

    if not isinstance(postprocessed_batch, SampleBatch) or \
            not postprocessed_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self._max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )
    else:
        postprocessed_batch["seq_lens"] = postprocessed_batch.seq_lens

    self._is_training = True
    postprocessed_batch["is_training"] = True
    stats = self._learn_on_batch_eager(postprocessed_batch)
    stats.update({"custom_metrics": learn_stats})
    return stats
def compute_gradients(self, samples):
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        samples,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req)
    return self._compute_gradients_eager(samples)
def learn_on_batch(
        self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        max_seq_len=self.max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req)

    train_batch = self._lazy_tensor_dict(postprocessed_batch)
    loss_out = force_list(
        self._loss(self, self.model, self.dist_class, train_batch))

    # Call Model's custom-loss with Policy loss outputs and train_batch.
    if self.model:
        loss_out = self.model.custom_loss(loss_out, train_batch)

    assert len(loss_out) == len(self._optimizers)

    # assert not any(torch.isnan(l) for l in loss_out)
    fetches = self.extra_compute_grad_fetches()

    # Loop through all optimizers.
    grad_info = {"allreduce_latency": 0.0}
    for i, opt in enumerate(self._optimizers):
        # Erase gradients in all vars of this optimizer.
        opt.zero_grad()
        # Recompute gradients of loss over all variables.
        loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))

        grad_info.update(self.extra_grad_process(opt, loss_out[i]))

        if self.distributed_world_size:
            grads = []
            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        grads.append(p.grad)

            start = time.time()
            if torch.cuda.is_available():
                # Sadly, allreduce_coalesced does not work with CUDA yet.
                for g in grads:
                    torch.distributed.all_reduce(
                        g, op=torch.distributed.ReduceOp.SUM)
            else:
                torch.distributed.all_reduce_coalesced(
                    grads, op=torch.distributed.ReduceOp.SUM)

            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        p.grad /= self.distributed_world_size

            grad_info["allreduce_latency"] += time.time() - start

        # Step the optimizer.
        opt.step()

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(train_batch))

    return dict(fetches, **{LEARNER_STATS_KEY: grad_info})
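The distributed branch above implements plain gradient averaging: sum every gradient across workers with an all-reduce, then divide by the world size. A self-contained sketch of just that step (the function name average_gradients is ours; the single-process gloo group exists only to make the sketch runnable):

import os
import torch
import torch.distributed as dist


def average_gradients(model, world_size):
    # Sum each gradient across all workers, then divide by the world
    # size, so every worker ends up holding the mean gradient.
    for p in model.parameters():
        if p.grad is not None:
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
            p.grad /= world_size


if __name__ == "__main__":
    # Single-process "cluster", only so this runs standalone.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    model = torch.nn.Linear(4, 2)
    model(torch.randn(8, 4)).sum().backward()
    average_gradients(model, dist.get_world_size())
    dist.destroy_process_group()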
def compute_gradients(self, samples):
    pad_batch_to_sequences_of_same_size(
        samples,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req)
    self._is_training = True
    samples["is_training"] = True
    return self._compute_gradients_eager(samples)
def learn_on_batch(self, postprocessed_batch):
    # Callback handling.
    self.callbacks.on_learn_on_batch(
        policy=self, train_batch=postprocessed_batch)

    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req)
    return self._learn_on_batch_eager(postprocessed_batch)
def load_batch_into_buffer(
    self,
    batch: SampleBatch,
    buffer_index: int = 0,
) -> int:
    # Set the is_training flag of the batch.
    batch.set_training(True)

    # Shortcut for 1 CPU only: Store batch in `self._loaded_batches`.
    if len(self.devices) == 1 and self.devices[0].type == "cpu":
        assert buffer_index == 0
        pad_batch_to_sequences_of_same_size(
            batch=batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )
        self._lazy_tensor_dict(batch)
        self._loaded_batches[0] = [batch]
        return len(batch)

    # Batch (len=28, seq-lens=[4, 7, 4, 10, 3]):
    # 0123 0123456 0123 0123456789ABC

    # 1) Split into n per-GPU sub-batches (n=2).
    # [0123 0123456 012] [3 0123456789 ABC]
    # (len=14, 14; seq-lens=[4, 7, 3] and [1, 10, 3])
    slices = batch.timeslices(num_slices=len(self.devices))

    # 2) Zero-padding (max-seq-len=10).
    # - [0123000000 0123456000 0120000000]
    # - [3000000000 0123456789 ABC0000000]
    for slice in slices:
        pad_batch_to_sequences_of_same_size(
            batch=slice,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    # 3) Load splits into the given buffer (consisting of n GPUs).
    slices = [
        slice.to_device(self.devices[i]) for i, slice in enumerate(slices)
    ]
    self._loaded_batches[buffer_index] = slices

    # Return loaded samples per-device.
    return len(slices[0])
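The worked example in the comments above can be checked directly. A dependency-free sketch of the arithmetic (all names are ours):

# The 28-step batch from the comment, as its per-sequence lengths.
seq_lens = [4, 7, 4, 10, 3]
assert sum(seq_lens) == 28

# 1) Time-slicing into 2 per-GPU sub-batches of 14 steps each cuts the
#    third (4-step) sequence into a 3-step tail and a 1-step head:
per_gpu_seq_lens = [[4, 7, 3], [1, 10, 3]]
assert all(sum(s) == 14 for s in per_gpu_seq_lens)

# 2) Zero-padding every sequence to max_seq_len=10 gives each sub-batch
#    3 * 10 = 30 rows.
max_seq_len = 10
assert [len(s) * max_seq_len for s in per_gpu_seq_lens] == [30, 30]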
def _get_loss_inputs_dict(self, train_batch: SampleBatch, shuffle: bool):
    """Return a feed dict from a batch.

    Args:
        train_batch: batch of data to derive inputs from.
        shuffle: whether to shuffle batch sequences. Shuffle may be done
            in-place. This only makes sense if you're further applying
            minibatch SGD after getting the outputs.

    Returns:
        Feed dict of data.
    """
    # Get batch ready for RNNs, if applicable.
    if not isinstance(train_batch, SampleBatch) or \
            not train_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            train_batch,
            max_seq_len=self._max_seq_len,
            shuffle=shuffle,
            batch_divisibility_req=self._batch_divisibility_req,
            feature_keys=list(self._loss_input_dict_no_rnn.keys()),
            view_requirements=self.view_requirements,
        )

    # Mark the batch as "is_training" so the Model can use this
    # information.
    train_batch.set_training(True)

    # Build the feed dict from the batch.
    feed_dict = {}
    for key, placeholders in self._loss_input_dict.items():
        tree.map_structure(
            lambda ph, v: feed_dict.__setitem__(ph, v),
            placeholders,
            train_batch[key],
        )

    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    for key in state_keys:
        feed_dict[self._loss_input_dict[key]] = train_batch[key]
    if state_keys:
        feed_dict[self._seq_lens] = train_batch[SampleBatch.SEQ_LENS]

    return feed_dict
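The tree.map_structure call above pairs possibly nested placeholder structures with equally nested batch values, leaf by leaf, writing each pair into the feed dict. A standalone sketch with made-up placeholder names (requires the dm-tree package):

import numpy as np
import tree  # pip install dm-tree

feed_dict = {}
placeholders = {"obs": "ph_obs",
                "action_dist_inputs": ("ph_mu", "ph_sigma")}
values = {"obs": np.array([1, 2]),
          "action_dist_inputs": (np.array([0.0]), np.array([1.0]))}
for key, phs in placeholders.items():
    # Walk the placeholder and value structures in lockstep, pairing
    # each leaf placeholder with its matching array.
    tree.map_structure(
        lambda ph, v: feed_dict.__setitem__(ph, v), phs, values[key])
assert set(feed_dict) == {"ph_obs", "ph_mu", "ph_sigma"}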
def learn_on_batch(self, postprocessed_batch):
    # Callback handling.
    self.callbacks.on_learn_on_batch(
        policy=self, train_batch=postprocessed_batch)

    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )

    self._is_training = True
    postprocessed_batch["is_training"] = True
    return self._learn_on_batch_eager(postprocessed_batch)
def compute_gradients(self, postprocessed_batch: SampleBatch) -> \
        Tuple[ModelGradients, Dict[str, TensorType]]:
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        shuffle=False,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self.batch_divisibility_req,
        view_requirements=self.view_requirements,
    )
    self._is_training = True
    self._lazy_tensor_dict(postprocessed_batch)
    postprocessed_batch.set_training(True)
    grads_and_vars, grads, stats = self._compute_gradients_helper(
        postprocessed_batch)
    return convert_to_numpy((grads, stats))
def learn_on_batch(self, postprocessed_batch):
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        max_seq_len=self.max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req)

    train_batch = self._lazy_tensor_dict(postprocessed_batch)
    loss_out = self._loss(self, self.model, self.dist_class, train_batch)
    self._optimizer.zero_grad()
    loss_out.backward()

    info = {}
    info.update(self.extra_grad_process())

    if self.distributed_world_size:
        grads = []
        for p in self.model.parameters():
            if p.grad is not None:
                grads.append(p.grad)

        start = time.time()
        if torch.cuda.is_available():
            # Sadly, allreduce_coalesced does not work with CUDA yet.
            for g in grads:
                torch.distributed.all_reduce(
                    g, op=torch.distributed.ReduceOp.SUM)
        else:
            torch.distributed.all_reduce_coalesced(
                grads, op=torch.distributed.ReduceOp.SUM)

        for p in self.model.parameters():
            if p.grad is not None:
                p.grad /= self.distributed_world_size

        info["allreduce_latency"] = time.time() - start

    self._optimizer.step()
    info.update(self.extra_grad_info(train_batch))

    return {LEARNER_STATS_KEY: info}
def _get_loss_inputs_dict(self, train_batch, shuffle):
    """Return a feed dict from a batch.

    Args:
        train_batch (SampleBatch): batch of data to derive inputs from.
        shuffle (bool): whether to shuffle batch sequences. Shuffle may
            be done in-place. This only makes sense if you're further
            applying minibatch SGD after getting the outputs.

    Returns:
        Feed dict of data.
    """
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        train_batch,
        shuffle=shuffle,
        max_seq_len=self._max_seq_len,
        batch_divisibility_req=self._batch_divisibility_req,
        feature_keys=list(self._loss_input_dict_no_rnn.keys()),
        view_requirements=self.view_requirements,
    )

    # Mark the batch as "is_training" so the Model can use this
    # information.
    train_batch["is_training"] = True

    # Build the feed dict from the batch.
    feed_dict = {}
    for key, placeholder in self._loss_input_dict.items():
        feed_dict[placeholder] = train_batch[key]

    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    for key in state_keys:
        feed_dict[self._loss_input_dict[key]] = train_batch[key]
    if state_keys:
        feed_dict[self._seq_lens] = train_batch["seq_lens"]

    return feed_dict
def do_minibatch_sgd(samples, policies, local_worker, num_sgd_iter,
                     sgd_minibatch_size, standardize_fields):
    """Execute minibatch SGD.

    Arguments:
        samples (SampleBatch): batch of samples to optimize.
        policies (dict): dictionary of policies to optimize.
        local_worker (RolloutWorker): master rollout worker instance.
        num_sgd_iter (int): number of epochs of optimization to take.
        sgd_minibatch_size (int): size of minibatches to use for
            optimization.
        standardize_fields (list): list of sample field names that should
            be normalized prior to optimization.

    Returns:
        Averaged info fetches over the last SGD epoch taken.
    """
    # Get batch.
    global nepochs
    global seg_buf
    if isinstance(samples, SampleBatch):
        samples = MultiAgentBatch({DEFAULT_POLICY_ID: samples},
                                  samples.count)

    fetches = {}
    for policy_id, policy in policies.items():
        model = policy.model
        dist_class = policy.dist_class
        if policy_id not in samples.policy_batches:
            continue

        batch = samples.policy_batches[policy_id]
        for field in standardize_fields:
            batch[field] = standardized(batch[field])
        seg_buf.append(batch)

        for i in range(num_sgd_iter):
            iter_extra_fetches = defaultdict(list)
            # Pass the whole batch to the worker and let it break the
            # batch down into minibatches, so that policy and value
            # function training can be handled separately.
            batch_fetches = (local_worker.learn_on_batch(
                MultiAgentBatch({policy_id: batch},
                                batch.count)))[policy_id]
            for k, v in batch_fetches.get(LEARNER_STATS_KEY, {}).items():
                iter_extra_fetches[k].append(v)
            logger.debug("{} {}".format(i, averaged(iter_extra_fetches)))
        fetches[policy_id] = averaged(iter_extra_fetches)

    nepochs += 1
    if nepochs % 16 == 0:

        def forward(seg):
            logits, state = model.from_batch(seg)
            return logits, state

        REPLAY_MB_SIZE = 512

        # Compute the probability distributions on the replay buffer
        # before replay-buffer training.
        for seg in seg_buf:
            np_data = {}
            np_data["obs"] = th.from_numpy(seg.data["obs"]).to(
                th.cuda.current_device())
            logits, state = tu.minibatched_call(
                forward, REPLAY_MB_SIZE, seg=np_data)
            seg.data["oldpd"] = logits.cpu().numpy()

        replay_batch = SampleBatch.concat_samples(seg_buf)

        # Train on the replay buffer.
        for i in range(3):
            for mb in minibatches(replay_batch, REPLAY_MB_SIZE):
                pad_batch_to_sequences_of_same_size(
                    mb,
                    max_seq_len=20,
                    shuffle=False,
                    batch_divisibility_req=1)
                mb["obs"] = th.from_numpy(mb["obs"]).to(
                    th.cuda.current_device())
                logits, vpredaux = model.forward_aux(mb)
                oldpd = dist_class(
                    th.from_numpy(mb["oldpd"]).to(
                        th.cuda.current_device()))
                pd = dist_class(logits, model)
                pol_distance = oldpd.kl(pd).mean()
                vpredtrue = model.value_function()
                vtarg = th.from_numpy(
                    mb[Postprocessing.VALUE_TARGETS]).to(
                        th.cuda.current_device())
                vf_aux = 0.5 * th.mean(th.pow(vpredaux - vtarg, 2.0))
                vf_true = 0.5 * th.mean(th.pow(vpredtrue - vtarg, 2.0))
                loss = pol_distance + vf_aux + vf_true
                policy.aux_learn(loss)
        seg_buf.clear()

    return fetches
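The two value terms above (vf_aux and vf_true) are both of the form 0.5 * MSE against the value targets. A tiny runnable check of that arithmetic in PyTorch (values are made up):

import torch

vpred = torch.tensor([0.5, 1.0, 2.0])
vtarg = torch.ones(3)
# Same form as the vf_aux / vf_true terms above: half the mean squared
# error against the value targets.
vf_loss = 0.5 * torch.mean(torch.pow(vpred - vtarg, 2.0))
assert torch.isclose(vf_loss, torch.tensor((0.25 + 0.0 + 1.0) / 6))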
def learn_on_batch(self, postprocessed_batch):
    # Set Model to train mode.
    if self.model:
        self.model.train()

    rew = defaultdict(float)
    traj = {}

    # Tile the per-sequence state inputs so there is one row per
    # timestep; this assumes all trajectories have the same length.
    for k, v in postprocessed_batch.items():
        if k.startswith("state_in"):
            postprocessed_batch[k] = np.tile(
                v, (postprocessed_batch.count // v.shape[0], 1))
    # Remove seq_lens so that `split_by_episode` can be used.
    postprocessed_batch.seq_lens = None

    # Group the batch into per-partner trajectories. Iterating over
    # `rows()` is very slow; `split_by_episode` is much faster.
    for i, ep in enumerate(postprocessed_batch.split_by_episode()):
        # Assume a fixed set of partners in one episode.
        partner_id = tuple(ep["partner_id"][0])
        rew[partner_id] += sum(ep["rewards"])
        if partner_id not in traj:
            traj[partner_id] = ep
        else:
            traj[partner_id] = traj[partner_id].concat(ep)

    # Train only on the trajectories of the lowest-reward partner.
    rew_list = list(rew.items())
    rew_list.sort(key=lambda x: x[1])
    lowest_rew_partner = rew_list[0][0]
    train_traj = traj[lowest_rew_partner]
    stats = {"timesteps_used": train_traj.count}

    c = 0
    for ep in range(self.ppo_epochs):
        for mb in minibatches(train_traj, self.minibatch_size):
            c += 1
            # Pad the minibatch for RNNs.
            pad_batch_to_sequences_of_same_size(
                mb,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )
            mb["is_training"] = True
            mb["advantages"] = standardize(mb["advantages"])
            minibatch = self._lazy_tensor_dict(mb)

            # Compute the loss and its gradient.
            loss = ppo_surrogate_loss(
                self, self.model, self.dist_class, minibatch)
            self.optimizer.zero_grad()
            loss.backward()
            # Clip the gradient norm, if configured.
            if self.config["grad_clip"]:
                grad_norm = nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.config["grad_clip"])
            self.optimizer.step()

    # TODO: move this to the inner loop and use an average instead.
    stats.update(kl_and_loss_stats(self, postprocessed_batch))
    return {LEARNER_STATS_KEY: stats}
def learn_on_batch(
        self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
    # Get batch ready for RNNs, if applicable.
    pad_batch_to_sequences_of_same_size(
        postprocessed_batch,
        max_seq_len=self.max_seq_len,
        shuffle=False,
        batch_divisibility_req=self.batch_divisibility_req,
        _use_trajectory_view_api=self.config["_use_trajectory_view_api"],
    )

    train_batch = self._lazy_tensor_dict(postprocessed_batch)

    # Calculate the actual policy loss.
    loss_out = force_list(
        self._loss(self, self.model, self.dist_class, train_batch))

    # Call Model's custom-loss with Policy loss outputs and train_batch.
    if self.model:
        loss_out = self.model.custom_loss(loss_out, train_batch)

    # Give the Exploration component a chance to modify the loss (or add
    # its own terms).
    if hasattr(self, "exploration"):
        loss_out = self.exploration.get_exploration_loss(
            loss_out, train_batch)

    assert len(loss_out) == len(self._optimizers)

    # assert not any(torch.isnan(l) for l in loss_out)
    fetches = self.extra_compute_grad_fetches()

    # Loop through all optimizers.
    grad_info = {"allreduce_latency": 0.0}
    for i, opt in enumerate(self._optimizers):
        # Erase gradients in all vars of this optimizer.
        opt.zero_grad()
        # Recompute gradients of loss over all variables.
        loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))

        grad_info.update(self.extra_grad_process(opt, loss_out[i]))

        if self.distributed_world_size:
            grads = []
            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        grads.append(p.grad)

            start = time.time()
            if torch.cuda.is_available():
                # Sadly, allreduce_coalesced does not work with CUDA yet.
                for g in grads:
                    torch.distributed.all_reduce(
                        g, op=torch.distributed.ReduceOp.SUM)
            else:
                torch.distributed.all_reduce_coalesced(
                    grads, op=torch.distributed.ReduceOp.SUM)

            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        p.grad /= self.distributed_world_size

            grad_info["allreduce_latency"] += time.time() - start

    # Step the optimizers. This is the difference for TPU use: step via
    # torch_xla instead of calling opt.step() directly.
    for i, opt in enumerate(self._optimizers):
        xm.optimizer_step(opt, barrier=True)

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(train_batch))
    if self.model:
        grad_info["model"] = self.model.metrics()
    return dict(fetches, **{LEARNER_STATS_KEY: grad_info})
def learn_on_batch(self, postprocessed_batch):
    grad_info = {"allreduce_latency": 0.0}

    def minibatches(samples, sgd_minibatch_size):
        """Return a generator yielding minibatches from a sample batch.

        Arguments:
            samples (SampleBatch): batch of samples to split up.
            sgd_minibatch_size (int): size of minibatches to return.

        Returns:
            Generator that returns mini-SampleBatches of size
            sgd_minibatch_size.
        """
        if not sgd_minibatch_size:
            yield samples
            return

        if isinstance(samples, MultiAgentBatch):
            raise NotImplementedError(
                "Minibatching not implemented for multi-agent in simple "
                "mode")

        samples.shuffle()

        i = 0
        slices = []
        while i < samples.count:
            slices.append((i, i + sgd_minibatch_size))
            i += sgd_minibatch_size
        random.shuffle(slices)

        for i, j in slices:
            yield samples.slice(i, j)

    # Train the policy function.
    train_batch = None
    for minibatch in minibatches(postprocessed_batch, 1024):
        # Get batch ready for RNNs, if applicable.
        pad_batch_to_sequences_of_same_size(
            minibatch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req)
        train_batch = self._lazy_tensor_dict(minibatch)
        loss_out = force_list(
            self._loss(self, self.model, self.dist_class, train_batch,
                       True))
        for i, opt in enumerate(self._optimizers):
            opt.zero_grad()
            pi_loss = loss_out[i]
            self.backprop(grad_info, opt, pi_loss, False)
            opt.step()

    # Train the value function.
    for vtrain_i in range(3):
        for minibatch in minibatches(postprocessed_batch, 1024):
            # Get batch ready for RNNs, if applicable.
            pad_batch_to_sequences_of_same_size(
                minibatch,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req)
            train_batch = self._lazy_tensor_dict(minibatch)
            loss_out = force_list(
                self._loss(self, self.model, self.dist_class, train_batch,
                           False))
            for i, opt in enumerate(self._optimizers):
                opt.zero_grad()
                vf_loss = loss_out[i]
                self.backprop(grad_info, opt, vf_loss, False)
                opt.step()

    grad_info["allreduce_latency"] /= len(self._optimizers)
    # Is it OK to update this with just the last minibatch?
    grad_info.update(self.extra_grad_info(train_batch))
    return {LEARNER_STATS_KEY: grad_info}
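The inner generator above slices the batch into contiguous windows and visits them in random order. A dependency-free sketch of just that index logic (the function name index_slices is ours):

import random


def index_slices(count, minibatch_size):
    # Contiguous [i, i + minibatch_size) windows, visited in random
    # order -- the same scheme the generator above uses before calling
    # samples.slice(i, j).
    slices = [(i, i + minibatch_size)
              for i in range(0, count, minibatch_size)]
    random.shuffle(slices)
    return slices


# A 10-sample batch with minibatch size 4 yields the windows (0, 4),
# (4, 8) and (8, 12) in shuffled order (the last window is short).
print(index_slices(10, 4))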
def analyze_rnn_batch(batch, max_seq_len):
    count = batch.count

    # Check prev_reward/action, next_obs consistency.
    for idx in range(count):
        # If timestep is tracked by the batch, use it directly.
        if "t" in batch:
            ts = batch["t"][idx]
        # Else, derive ts from the obs (here stored at index 3).
        else:
            ts = batch["obs"][idx][3]
        obs_t = batch["obs"][idx]
        a_t = batch["actions"][idx]
        r_t = batch["rewards"][idx]
        state_in_0 = batch["state_in_0"][idx]
        state_in_1 = batch["state_in_1"][idx]

        # Check postprocessing outputs.
        if "2xobs" in batch:
            postprocessed_col_t = batch["2xobs"][idx]
            assert (obs_t == postprocessed_col_t / 2.0).all()

        # Check state-in/out and next-obs values.
        if idx > 0:
            next_obs_t_m_1 = batch["new_obs"][idx - 1]
            state_out_0_t_m_1 = batch["state_out_0"][idx - 1]
            state_out_1_t_m_1 = batch["state_out_1"][idx - 1]
            # Same trajectory as for t-1 -> Should be able to match.
            if (batch[SampleBatch.AGENT_INDEX][idx] ==
                    batch[SampleBatch.AGENT_INDEX][idx - 1]
                    and batch[SampleBatch.EPS_ID][idx] ==
                    batch[SampleBatch.EPS_ID][idx - 1]):
                assert batch["unroll_id"][idx - 1] == \
                    batch["unroll_id"][idx]
                assert (obs_t == next_obs_t_m_1).all()
                assert (state_in_0 == state_out_0_t_m_1).all()
                assert (state_in_1 == state_out_1_t_m_1).all()
            # Different trajectory.
            else:
                assert batch["unroll_id"][idx - 1] != \
                    batch["unroll_id"][idx]
                assert not (obs_t == next_obs_t_m_1).all()
                assert not (state_in_0 == state_out_0_t_m_1).all()
                assert not (state_in_1 == state_out_1_t_m_1).all()
                # Check initial 0-internal states.
                if ts == 0:
                    assert (state_in_0 == 0.0).all()
                    assert (state_in_1 == 0.0).all()

        # Check initial 0-internal states (at ts=0).
        if ts == 0:
            assert (state_in_0 == 0.0).all()
            assert (state_in_1 == 0.0).all()

        # Check prev. a/r values.
        if idx < count - 1:
            prev_actions_t_p_1 = batch["prev_actions"][idx + 1]
            prev_rewards_t_p_1 = batch["prev_rewards"][idx + 1]
            # Same trajectory as for t+1 -> Should be able to match.
            if batch[SampleBatch.AGENT_INDEX][idx] == \
                    batch[SampleBatch.AGENT_INDEX][idx + 1] and \
                    batch[SampleBatch.EPS_ID][idx] == \
                    batch[SampleBatch.EPS_ID][idx + 1]:
                assert (a_t == prev_actions_t_p_1).all()
                assert r_t == prev_rewards_t_p_1
            # Different (new) trajectory. Assume t-1 (prev-a/r) to be
            # always 0.0s. [3]=ts
            elif ts == 0:
                assert (prev_actions_t_p_1 == 0).all()
                assert prev_rewards_t_p_1 == 0.0

    pad_batch_to_sequences_of_same_size(
        batch, max_seq_len=max_seq_len, shuffle=False,
        batch_divisibility_req=1)

    # Check after seq-len 0-padding.
    cursor = 0
    for i, seq_len in enumerate(batch["seq_lens"]):
        state_in_0 = batch["state_in_0"][i]
        state_in_1 = batch["state_in_1"][i]
        for j in range(seq_len):
            k = cursor + j
            ts = batch["t"][k]
            obs_t = batch["obs"][k]
            a_t = batch["actions"][k]
            r_t = batch["rewards"][k]

            # Check postprocessing outputs.
            if "2xobs" in batch:
                postprocessed_col_t = batch["2xobs"][k]
                assert (obs_t == postprocessed_col_t / 2.0).all()

            # Check state-in/out and next-obs values.
            if j > 0:
                next_obs_t_m_1 = batch["new_obs"][k - 1]
                # state_out_0_t_m_1 = batch["state_out_0"][k - 1]
                # state_out_1_t_m_1 = batch["state_out_1"][k - 1]
                # Always same trajectory as for t-1.
                assert batch["unroll_id"][k - 1] == batch["unroll_id"][k]
                assert (obs_t == next_obs_t_m_1).all()
                # assert (state_in_0 == state_out_0_t_m_1).all()
                # assert (state_in_1 == state_out_1_t_m_1).all()
            # Check initial 0-internal states.
            elif ts == 0:
                assert (state_in_0 == 0.0).all()
                assert (state_in_1 == 0.0).all()

        # The rest of each max_seq_len window must be all zeros.
        for j in range(seq_len, max_seq_len):
            k = cursor + j
            obs_t = batch["obs"][k]
            a_t = batch["actions"][k]
            r_t = batch["rewards"][k]
            assert (obs_t == 0.0).all()
            assert (a_t == 0.0).all()
            assert (r_t == 0.0).all()
        cursor += max_seq_len
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    if not isinstance(postprocessed_batch, SampleBatch) or \
            not postprocessed_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )

    # Mark the batch as "is_training" so the Model can use this
    # information.
    postprocessed_batch.is_training = True

    # Single device case: Use batch as-is (no slicing).
    if len(self.devices) == 1:
        batches = [self._lazy_tensor_dict(postprocessed_batch)]
    # Multi-GPU case: Slice inputs into n (roughly) equal batches.
    else:
        len_ = len(postprocessed_batch)
        batches = []
        start = 0
        for i, device in enumerate(self.devices):
            shard_len = len_ // (len(self.devices) - i)
            batch = self._lazy_tensor_dict(
                postprocessed_batch.slice(start, start + shard_len),
                device=device)
            batches.append(batch)
            len_ -= shard_len
            start += shard_len

    # Copy weights of main model to all towers.
    state_dict = self.model.state_dict()
    for tower in self.model_gpu_towers:
        tower.load_state_dict(state_dict)

    # Do the (maybe parallelized) gradient calculation step.
    tower_outputs = self._multi_gpu_parallel_grad_calc(batches)

    # Multi device (GPU) case.
    if len(self.devices) > 1:
        # Mean-reduce over GPU-towers.
        all_grads = []
        for i in range(len(tower_outputs[0][0])):
            if tower_outputs[0][0][i] is not None:
                all_grads.append(
                    torch.mean(
                        torch.stack([
                            t[0][i].to(self.device) for t in tower_outputs
                        ]),
                        dim=0))
            else:
                all_grads.append(None)
        # Set main model's grads to mean-reduced values.
        for i, p in enumerate(self.model.parameters()):
            p.grad = all_grads[i]
        # Reduce stats over towers as well.
        from ray.rllib.execution.train_ops import all_tower_reduce
        grad_info = tree.map_structure_with_path(
            lambda p, *t: all_tower_reduce(p, *t),
            *[t[1] for t in tower_outputs])
    # Single device case.
    else:
        all_grads, grad_info = tower_outputs[0]

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(postprocessed_batch))

    fetches = self.extra_compute_grad_fetches()

    return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    if not isinstance(postprocessed_batch, SampleBatch) or \
            not postprocessed_batch.zero_padded:
        pad_batch_to_sequences_of_same_size(
            postprocessed_batch,
            max_seq_len=self.max_seq_len,
            shuffle=False,
            batch_divisibility_req=self.batch_divisibility_req,
            view_requirements=self.view_requirements,
        )
    else:
        postprocessed_batch["seq_lens"] = postprocessed_batch.seq_lens

    # Mark the batch as "is_training" so the Model can use this
    # information.
    postprocessed_batch.is_training = True
    train_batch = self._lazy_tensor_dict(postprocessed_batch)

    # Calculate the actual policy loss.
    loss_out = force_list(
        self._loss(self, self.model, self.dist_class, train_batch))

    # Call Model's custom-loss with Policy loss outputs and train_batch.
    if self.model:
        loss_out = self.model.custom_loss(loss_out, train_batch)

    # Give the Exploration component a chance to modify the loss (or add
    # its own terms).
    if hasattr(self, "exploration"):
        loss_out = self.exploration.get_exploration_loss(
            loss_out, train_batch)

    assert len(loss_out) == len(self._optimizers)

    # assert not any(torch.isnan(l) for l in loss_out)
    fetches = self.extra_compute_grad_fetches()

    # Loop through all optimizers.
    grad_info = {"allreduce_latency": 0.0}

    all_grads = []
    for i, opt in enumerate(self._optimizers):
        # Erase gradients in all vars of this optimizer.
        opt.zero_grad()
        # Recompute gradients of loss over all variables.
        loss_out[i].backward(retain_graph=(i < len(self._optimizers) - 1))

        grad_info.update(self.extra_grad_process(opt, loss_out[i]))

        grads = []
        # Note that return values are just references;
        # calling zero_grad would modify the values.
        for param_group in opt.param_groups:
            for p in param_group["params"]:
                if p.grad is not None:
                    grads.append(p.grad)
                    all_grads.append(p.grad.data.cpu().numpy())
                else:
                    all_grads.append(None)

        if self.distributed_world_size:
            start = time.time()
            if torch.cuda.is_available():
                # Sadly, allreduce_coalesced does not work with CUDA yet.
                for g in grads:
                    torch.distributed.all_reduce(
                        g, op=torch.distributed.ReduceOp.SUM)
            else:
                torch.distributed.all_reduce_coalesced(
                    grads, op=torch.distributed.ReduceOp.SUM)

            for param_group in opt.param_groups:
                for p in param_group["params"]:
                    if p.grad is not None:
                        p.grad /= self.distributed_world_size

            grad_info["allreduce_latency"] += time.time() - start

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(train_batch))

    return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})
def compute_gradients(self,
                      postprocessed_batch: SampleBatch) -> ModelGradients:
    # For multi-GPU, split the batch into n slices (n=#GPUs).
    if len(self.devices) == 1:
        batches = [postprocessed_batch]
    else:
        from ray.rllib.utils.sgd import minibatches
        batches = list(
            minibatches(
                postprocessed_batch,
                len(postprocessed_batch) // len(self.devices),
                shuffle=False))

    if not isinstance(postprocessed_batch, SampleBatch) or \
            not postprocessed_batch.zero_padded:
        for b in batches:
            pad_batch_to_sequences_of_same_size(
                b,
                max_seq_len=self.max_seq_len,
                shuffle=False,
                batch_divisibility_req=self.batch_divisibility_req,
                view_requirements=self.view_requirements,
            )

    for b, d in zip(batches, self.devices):
        b.is_training = True
        self._lazy_tensor_dict(b, device=d)

    # Multi-GPU case: Slice inputs into n (roughly) equal batches.
    if len(self.devices) > 1:
        # Copy weights of main model to all towers.
        state_dict = self.model.state_dict()
        for tower in self.model_gpu_towers:
            tower.load_state_dict(state_dict)

    # Do the (maybe parallelized) gradient calculation step.
    tower_outputs = self._multi_gpu_parallel_grad_calc(batches)

    # Multi device (GPU) case.
    if len(self.devices) > 1:
        # Mean-reduce over GPU-towers.
        all_grads = []
        for i in range(len(tower_outputs[0][0])):
            if tower_outputs[0][0][i] is not None:
                all_grads.append(
                    torch.mean(
                        torch.stack([
                            t[0][i].to(self.device) for t in tower_outputs
                        ]),
                        dim=0))
            else:
                all_grads.append(None)
        # Set main model's grads to mean-reduced values.
        for i, p in enumerate(self.model.parameters()):
            p.grad = all_grads[i]
        # Reduce stats over towers as well.
        from ray.rllib.execution.train_ops import all_tower_reduce
        grad_info = tree.map_structure_with_path(
            lambda p, *t: all_tower_reduce(p, *t),
            *[t[1] for t in tower_outputs])
    # Single device case.
    else:
        all_grads, grad_info = tower_outputs[0]

    grad_info["allreduce_latency"] /= len(self._optimizers)
    grad_info.update(self.extra_grad_info(postprocessed_batch))

    fetches = self.extra_compute_grad_fetches()

    return all_grads, dict(fetches, **{LEARNER_STATS_KEY: grad_info})