def testDynamicMaxLen(self):
    """Chop a short two-episode batch using the default (dynamic) padding.

    The expected padded feature column holds four entries for two
    sequences (two slots each), even though 4 is passed as the maximum
    sequence length, and the expected sequence lengths are 1 and 2.
    """
    episode_ids = [5, 2, 2]
    agent_indices = [2, 2, 2]
    features = [[1, 1, 1]]
    states = [[1, 1, 1]]

    padded, initial, lengths = chop_into_sequences(
        episode_ids, agent_indices, features, states, 4)

    # One padded column per feature; the 0 is the pad after the 1-step chunk.
    padded_lists = [column.tolist() for column in padded]
    self.assertEqual(padded_lists, [[1, 0, 1, 1]])

    # One initial-state entry per produced sequence.
    initial_lists = [column.tolist() for column in initial]
    self.assertEqual(initial_lists, [[1, 1]])

    self.assertEqual(lengths.tolist(), [1, 2])
def testMultiAgent(self):
    """Chop a batch with several agent ids per episode, fixed-size padding.

    With ``dynamic_max=False`` and a max length of 4, the test expects five
    sequences (lengths 2, 1, 2, 2, 1), a padded feature column of
    5 * 4 = 20 entries and five initial-state entries.
    """
    episode_ids = [1, 1, 1, 5, 5, 5, 5, 5]
    agent_indices = [1, 1, 2, 1, 1, 2, 2, 3]

    # Two feature columns: a flat one and the same values wrapped as
    # single-element rows.
    flat_feature = [101, 102, 103, 201, 202, 203, 204, 205]
    features = [flat_feature, [[value] for value in flat_feature]]
    states = [[209, 208, 207, 109, 108, 107, 106, 105]]

    padded, initial, lengths = chop_into_sequences(
        episode_ids, agent_indices, features, states, 4, dynamic_max=False)

    self.assertEqual(lengths.tolist(), [2, 1, 2, 2, 1])
    self.assertEqual(len(padded[0]), 20)
    self.assertEqual(len(initial[0]), 5)
def testBasic(self):
    """Chop a single-agent, two-episode batch with a max length of 4.

    The 3-step episode is expected to yield one sequence of length 3 and
    the 5-step episode two sequences of lengths 4 and 1, each padded with
    zeros to four slots.
    """
    episode_ids = [1, 1, 1, 5, 5, 5, 5, 5]
    agent_indices = [1] * 8

    # Two feature columns: a flat one and the same values wrapped as
    # single-element rows.
    flat_feature = [101, 102, 103, 201, 202, 203, 204, 205]
    features = [flat_feature, [[value] for value in flat_feature]]
    states = [[209, 208, 207, 109, 108, 107, 106, 105]]

    padded, initial, lengths = chop_into_sequences(
        episode_ids, agent_indices, features, states, 4)

    expected_flat = [101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0]
    expected_padded = [expected_flat, [[value] for value in expected_flat]]
    self.assertEqual(
        [column.tolist() for column in padded], expected_padded)

    # The expected initial states are the state values at the first step
    # of each produced sequence.
    self.assertEqual(
        [column.tolist() for column in initial], [[209, 109, 105]])
    self.assertEqual(lengths.tolist(), [3, 4, 1])
def _get_loss_inputs_dict(self, batch):
    """Build the feed dict of loss inputs for ``batch``.

    Arguments:
        batch: mapping from column name to data; must provide the loss
            input keys and, on the padded path, "eps_id", "agent_index"
            and the "state_in_{i}" columns.

    Returns:
        dict mapping loss-input placeholders to data. On the padded path
        it also contains the initial states and the sequence lengths.
    """
    divisor = self._batch_divisibility_req
    if divisor > 1:
        # Only an evenly divisible, non-multiagent batch may skip padding.
        evenly_divisible = (
            len(batch["obs"]) % divisor == 0
            and max(batch["agent_index"]) == 0)
    else:
        evenly_divisible = True

    # Fast path: no recurrent state and nothing to pad.
    if not self._state_inputs and evenly_divisible:
        return {placeholder: batch[key]
                for key, placeholder in self._loss_inputs}

    # Recurrent models chop by max_seq_len with dynamic padding; otherwise
    # sequences are padded to the divisibility requirement.
    if self._state_inputs:
        max_seq_len, dynamic_max = self._max_seq_len, True
    else:
        max_seq_len, dynamic_max = divisor, False

    feature_keys = [key for key, _ in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch["eps_id"],
        batch["agent_index"],
        [batch[key] for key in feature_keys],
        [batch[key] for key in state_keys],
        max_seq_len,
        dynamic_max=dynamic_max)

    feed_dict = {}
    for key, sequence in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[key]] = sequence
    for key, state in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[key]] = state
    feed_dict[self._seq_lens] = seq_lens
    return feed_dict
def _get_loss_inputs_dict(self, batch):
    """Build the feed dict of loss inputs for ``batch``.

    Arguments:
        batch: mapping from column name to data; must provide the loss
            input keys and, for recurrent models, "eps_id" and the
            "state_in_{i}" columns.

    Returns:
        dict mapping loss-input placeholders to data. For recurrent
        models it also contains the initial states and sequence lengths.
    """
    # Non-recurrent model: feed the batch columns straight through.
    if not self._state_inputs:
        return {placeholder: batch[key]
                for key, placeholder in self._loss_inputs}

    # Recurrent model: chop the batch into padded sequences first.
    feature_keys = [key for key, _ in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch["eps_id"],
        [batch[key] for key in feature_keys],
        [batch[key] for key in state_keys],
        self._max_seq_len)

    feed_dict = {}
    for key, sequence in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[key]] = sequence
    for key, state in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[key]] = state
    feed_dict[self._seq_lens] = seq_lens
    return feed_dict
def learn_on_batch(self, samples):
    """Run one optimiser step on ``samples`` and report learner stats.

    The batch columns are chopped into padded sequences, reshaped to
    ``[B, T, ...]`` tensors, passed to ``self.loss`` and the resulting
    loss is back-propagated with gradient-norm clipping.

    Arguments:
        samples: batch indexed by ``SampleBatch`` column constants and
            by "state_in_{k}" keys.

    Returns:
        dict with a single ``LEARNER_STATS_KEY`` entry holding the loss,
        gradient norm and mask-averaged TD error / Q / target values.
    """
    cur_obs, cur_action_mask = self._unpack_observation(
        samples[SampleBatch.CUR_OBS])
    nxt_obs, nxt_action_mask = self._unpack_observation(
        samples[SampleBatch.NEXT_OBS])
    team_rewards = self._get_group_rewards(samples[SampleBatch.INFOS])

    episode_ids = samples[SampleBatch.EPS_ID]
    unroll_ids = samples[SampleBatch.UNROLL_ID]
    agent_indices = samples[SampleBatch.AGENT_INDEX]
    feature_columns = [
        team_rewards, cur_action_mask, nxt_action_mask,
        samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES],
        cur_obs, nxt_obs
    ]
    state_columns = [
        samples["state_in_{}".format(k)]
        for k in range(len(self.get_initial_state()))
    ]
    seq_cap = self.config["model"]["max_seq_len"]

    # These will be padded to shape [B * T, ...]
    padded, _initial_states, seq_lens = chop_into_sequences(
        episode_ids,
        unroll_ids,
        agent_indices,
        feature_columns,
        state_columns,
        max_seq_len=seq_cap,
        dynamic_max=True)
    (rew, pad_action_mask, pad_next_action_mask, act, dones, pad_obs,
     pad_next_obs) = padded

    n_seqs, n_steps = len(seq_lens), max(seq_lens)

    def fold(arr):
        # Split the flat leading [B * T] axis into [B, T].
        return th.from_numpy(
            np.reshape(arr, [n_seqs, n_steps] + list(arr.shape[1:])))

    rewards = fold(rew).float()
    actions = fold(act).long()
    obs = fold(pad_obs).reshape(
        [n_seqs, n_steps, self.n_agents, self.obs_size]).float()
    action_mask = fold(pad_action_mask)
    next_obs = fold(pad_next_obs).reshape(
        [n_seqs, n_steps, self.n_agents, self.obs_size]).float()
    next_action_mask = fold(pad_next_action_mask)

    # TODO(ekl) this treats group termination as individual termination
    done_flags = fold(dones.astype(np.float32))
    terminated = done_flags.unsqueeze(2).expand(
        n_seqs, n_steps, self.n_agents)

    # Create mask for where index is < unpadded sequence length
    step_index = np.reshape(
        np.tile(np.arange(n_steps), n_seqs), [n_seqs, n_steps])
    filled = (step_index < np.expand_dims(seq_lens, 1)).astype(np.float32)
    mask = th.from_numpy(filled).unsqueeze(2).expand(
        n_seqs, n_steps, self.n_agents)

    # Compute loss
    loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
        self.loss(rewards, actions, terminated, mask, obs, next_obs,
                  action_mask, next_action_mask)

    # Optimise
    self.optimiser.zero_grad()
    loss_out.backward()
    grad_norm = th.nn.utils.clip_grad_norm_(
        self.params, self.config["grad_norm_clipping"])
    self.optimiser.step()

    # Per-element averages are taken over the mask returned by the loss.
    mask_elems = mask.sum().item()
    stats = {
        "loss": loss_out.item(),
        "grad_norm": grad_norm
        if isinstance(grad_norm, float) else grad_norm.item(),
        "td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
        "q_taken_mean": (chosen_action_qvals * mask).sum().item() /
        mask_elems,
        "target_mean": (targets * mask).sum().item() / mask_elems,
    }
    return {LEARNER_STATS_KEY: stats}
def compute_apply(self, samples):
    """Compute the loss on ``samples`` and apply one optimiser step.

    The batch columns are chopped into sequences padded with one extra
    step, reshaped to ``[B, T, ...]`` tensors, passed to ``self.loss``
    and the resulting loss is back-propagated with gradient-norm clipping.

    Arguments:
        samples: batch indexed by "obs", "infos", "eps_id",
            "agent_index", "actions", "dones" and "state_in_{k}" keys.

    Returns:
        tuple of ``({"stats": stats}, {})`` where ``stats`` holds the
        loss, gradient norm and mask-averaged TD error / Q / target values.
    """
    obs_batch, action_mask = self._unpack_observation(samples["obs"])
    group_rewards = self._get_group_rewards(samples["infos"])

    # These will be padded to shape [B * T, ...]
    # The state columns are counted from the policy's own initial-state
    # list rather than assuming one state tensor per agent.
    [rew, action_mask, act, dones, obs], initial_states, seq_lens = \
        chop_into_sequences(
            samples["eps_id"],
            samples["agent_index"], [
                group_rewards, action_mask, samples["actions"],
                samples["dones"], obs_batch
            ],
            [samples["state_in_{}".format(k)]
             for k in range(len(self.get_initial_state()))],
            max_seq_len=self.config["model"]["max_seq_len"],
            dynamic_max=True,
            _extra_padding=1)
    # TODO(ekl) adding 1 extra unit of padding here, since otherwise we
    # lose the terminating reward and the Q-values will be unanchored!
    B, T = len(seq_lens), max(seq_lens) + 1

    def to_batches(arr):
        # Split the flat leading [B * T] axis into [B, T].
        new_shape = [B, T] + list(arr.shape[1:])
        return th.from_numpy(np.reshape(arr, new_shape))

    # Rewards, actions, terminations and the mask drop the extra padded
    # step; observations and action masks keep it.
    rewards = to_batches(rew)[:, :-1].float()
    actions = to_batches(act)[:, :-1].long()
    obs = to_batches(obs).reshape([B, T, self.n_agents,
                                   self.obs_size]).float()
    action_mask = to_batches(action_mask)

    # TODO(ekl) this treats group termination as individual termination
    terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand(
        B, T, self.n_agents)[:, :-1]

    # 1.0 where the step index is below the unpadded sequence length.
    filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) <
              np.expand_dims(seq_lens, 1)).astype(np.float32)
    # expand() yields a view whose agent axis aliases a single memory
    # location; clone it so the in-place update below writes to real,
    # non-overlapping storage.
    mask = th.from_numpy(filled).unsqueeze(2).expand(
        B, T, self.n_agents)[:, :-1].clone()
    # Zero the mask at every step that directly follows a terminated step.
    mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])

    # Compute loss
    loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
        self.loss(rewards, actions, terminated, mask, obs, action_mask)

    # Optimise
    self.optimiser.zero_grad()
    loss_out.backward()
    grad_norm = th.nn.utils.clip_grad_norm_(
        self.params, self.config["grad_norm_clipping"])
    self.optimiser.step()

    # Per-element averages are taken over the mask returned by the loss.
    mask_elems = mask.sum().item()
    stats = {
        "loss": loss_out.item(),
        "grad_norm": grad_norm
        if isinstance(grad_norm, float) else grad_norm.item(),
        "td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
        "q_taken_mean": (chosen_action_qvals * mask).sum().item() /
        mask_elems,
        "target_mean": (targets * mask).sum().item() / mask_elems,
    }
    return {"stats": stats}, {}
def _get_loss_inputs_dict(self, batch, shuffle):
    """Return a feed dict from a batch.

    Arguments:
        batch (SampleBatch): batch of data to derive inputs from
        shuffle (bool): whether to shuffle batch sequences. Shuffle may
            be done in-place. This only makes sense if you're further
            applying minibatch SGD after getting the outputs.

    Returns:
        feed dict of data
    """
    divisor = self._batch_divisibility_req
    if divisor > 1:
        # Only an evenly divisible, non-multiagent batch may skip padding.
        evenly_divisible = (
            len(batch[SampleBatch.CUR_OBS]) % divisor == 0
            and max(batch[SampleBatch.AGENT_INDEX]) == 0)
    else:
        evenly_divisible = True

    # Fast path: no recurrent state and nothing to pad.
    if not self._state_inputs and evenly_divisible:
        if shuffle:
            batch.shuffle()
        return {placeholder: batch[key]
                for key, placeholder in self._loss_inputs}

    # Recurrent models chop by max_seq_len with dynamic padding; otherwise
    # sequences are padded to the divisibility requirement.
    if self._state_inputs:
        max_seq_len, dynamic_max = self._max_seq_len, True
    else:
        max_seq_len, dynamic_max = divisor, False

    feature_keys = [key for key, _ in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch[SampleBatch.EPS_ID],
        batch[SampleBatch.UNROLL_ID],
        batch[SampleBatch.AGENT_INDEX],
        [batch[key] for key in feature_keys],
        [batch[key] for key in state_keys],
        max_seq_len,
        dynamic_max=dynamic_max,
        shuffle=shuffle)

    feed_dict = {}
    for key, sequence in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[key]] = sequence
    for key, state in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[key]] = state
    feed_dict[self._seq_lens] = seq_lens

    # Log a summary of the padded inputs, gated by log_once.
    if log_once("rnn_feed_dict"):
        padded_summary = summarize({
            "features": feature_sequences,
            "initial_states": initial_states,
            "seq_lens": seq_lens,
            "max_seq_len": max_seq_len,
        })
        logger.info("Padded input for RNN:\n\n{}\n".format(padded_summary))
    return feed_dict
def learn_on_batch(self, samples):
    """Run one optimiser step on ``samples`` and report learner stats.

    The batch columns are chopped into sequences padded with one extra
    step, reshaped to ``[B, T, ...]`` tensors, passed to ``self.loss``
    and the resulting loss is back-propagated with gradient-norm clipping.

    Arguments:
        samples: batch indexed by "obs", "infos", "eps_id",
            "agent_index", "actions", "dones" and "state_in_{k}" keys.

    Returns:
        tuple of ``({"stats": stats}, {})`` where ``stats`` holds the
        loss, gradient norm and mask-averaged TD error / Q / target values.
    """
    obs_batch, action_mask = self._unpack_observation(samples["obs"])
    group_rewards = self._get_group_rewards(samples["infos"])

    # These will be padded to shape [B * T, ...]
    [rew, action_mask, act, dones, obs], initial_states, seq_lens = \
        chop_into_sequences(
            samples["eps_id"],
            samples["agent_index"], [
                group_rewards, action_mask, samples["actions"],
                samples["dones"], obs_batch
            ],
            [samples["state_in_{}".format(k)]
             for k in range(len(self.get_initial_state()))],
            max_seq_len=self.config["model"]["max_seq_len"],
            dynamic_max=True,
            _extra_padding=1)
    # TODO(ekl) adding 1 extra unit of padding here, since otherwise we
    # lose the terminating reward and the Q-values will be unanchored!
    B, T = len(seq_lens), max(seq_lens) + 1

    def to_batches(arr):
        # Split the flat leading [B * T] axis into [B, T].
        new_shape = [B, T] + list(arr.shape[1:])
        return th.from_numpy(np.reshape(arr, new_shape))

    # Rewards, actions, terminations and the mask drop the extra padded
    # step; observations and action masks keep it.
    rewards = to_batches(rew)[:, :-1].float()
    actions = to_batches(act)[:, :-1].long()
    obs = to_batches(obs).reshape([B, T, self.n_agents,
                                   self.obs_size]).float()
    action_mask = to_batches(action_mask)

    # TODO(ekl) this treats group termination as individual termination
    terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand(
        B, T, self.n_agents)[:, :-1]

    # 1.0 where the step index is below the unpadded sequence length.
    filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) <
              np.expand_dims(seq_lens, 1)).astype(np.float32)
    # expand() yields a view whose agent axis aliases a single memory
    # location; clone it so the in-place update below writes to real,
    # non-overlapping storage.
    mask = th.from_numpy(filled).unsqueeze(2).expand(
        B, T, self.n_agents)[:, :-1].clone()
    # Zero the mask at every step that directly follows a terminated step.
    mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1])

    # Compute loss
    loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
        self.loss(rewards, actions, terminated, mask, obs, action_mask)

    # Optimise
    self.optimiser.zero_grad()
    loss_out.backward()
    grad_norm = th.nn.utils.clip_grad_norm_(
        self.params, self.config["grad_norm_clipping"])
    self.optimiser.step()

    # Per-element averages are taken over the mask returned by the loss.
    mask_elems = mask.sum().item()
    stats = {
        "loss": loss_out.item(),
        "grad_norm": grad_norm
        if isinstance(grad_norm, float) else grad_norm.item(),
        "td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
        "q_taken_mean": (chosen_action_qvals * mask).sum().item() /
        mask_elems,
        "target_mean": (targets * mask).sum().item() / mask_elems,
    }
    return {"stats": stats}, {}