def to_transition(trajectory, next_trajectory=None): """Create a transition from a trajectory or two adjacent trajectories. **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are sliced along their *second* (`time`) dimension; for example: ``` time_steps.observation = trajectory.observation[:, :-1] next_time_steps.observation = trajectory.observation[:, 1:] ``` Args: trajectory: An instance of `Trajectory`. next_trajectory: (optional) An instance of `Trajectory`. Returns: A tuple `(time_steps, policy_steps, next_time_steps)`. The `reward` and `discount` fields of `time_steps` are filled with zeros because these cannot be deduced. """ if next_trajectory is None: next_trajectory = nest.map_structure(lambda x: x[:, 1:], trajectory) trajectory = nest.map_structure(lambda x: x[:, :-1], trajectory) policy_steps = policy_step.PolicyStep(trajectory.action, (), trajectory.policy_info) # TODO(kbanoop): Consider replacing 0 rewards & discounts with (). time_steps = ts.TimeStep( trajectory.step_type, reward=nest.map_structure(tf.zeros_like, trajectory.reward), # unknown discount=tf.zeros_like(trajectory.discount), # unknown observation=trajectory.observation) next_time_steps = ts.TimeStep(trajectory.next_step_type, trajectory.reward, trajectory.discount, next_trajectory.observation) return [time_steps, policy_steps, next_time_steps]
def _get_mock_env_episode(self):
  mock_env = mock.MagicMock()
  mock_env.step.side_effect = [
      ts.TimeStep(ts.StepType.FIRST, 2, 1, [0]),
      ts.TimeStep(ts.StepType.MID, 3, 1, [1]),
      ts.TimeStep(ts.StepType.MID, 5, 1, [2]),
      ts.TimeStep(ts.StepType.LAST, 7, 1, [3]),
  ]
  return mock_env
def to_transition(trajectory, next_trajectory=None): """Create a transition from a trajectory or two adjacent trajectories. **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are sliced along their *second* (`time`) dimension; for example: ``` time_steps.step_type = trajectory.step_type[:,:-1] time_steps.observation = trajectory.observation[:,:-1] next_time_steps.observation = trajectory.observation[:,1:] next_time_steps. step_type = trajectory. next_step_type[:,:-1] next_time_steps.reward = trajectory.reward[:,:-1] next_time_steps. discount = trajectory. discount[:,:-1] ``` Notice that reward and discount for time_steps are undefined, therefore filled with zero. Args: trajectory: An instance of `Trajectory`. The tensors in Trajectory must have shape `[ B, T, ...]` when next_trajectory is None. next_trajectory: (optional) An instance of `Trajectory`. Returns: A tuple `(time_steps, policy_steps, next_time_steps)`. The `reward` and `discount` fields of `time_steps` are filled with zeros because these cannot be deduced (please do not use them). """ _validate_rank(trajectory.discount, min_rank=1, max_rank=2) if next_trajectory is not None: _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2) if next_trajectory is None: next_trajectory = tf.nest.map_structure(lambda x: x[:, 1:], trajectory) trajectory = tf.nest.map_structure(lambda x: x[:, :-1], trajectory) policy_steps = policy_step.PolicyStep(action=trajectory.action, state=(), info=trajectory.policy_info) # TODO(kbanoop): Consider replacing 0 rewards & discounts with (). time_steps = ts.TimeStep( trajectory.step_type, reward=tf.nest.map_structure(tf.zeros_like, trajectory.reward), # unknown discount=tf.zeros_like(trajectory.discount), # unknown observation=trajectory.observation) next_time_steps = ts.TimeStep(step_type=trajectory.next_step_type, reward=trajectory.reward, discount=trajectory.discount, observation=next_trajectory.observation) return [time_steps, policy_steps, next_time_steps]
def setUp(self):
  super(PolicySaverTest, self).setUp()
  self._time_step_spec = ts.TimeStep(
      step_type=tensor_spec.BoundedTensorSpec(
          dtype=tf.int32, shape=(), name='st', minimum=0, maximum=2),
      reward=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(), name='reward', minimum=0.0, maximum=5.0),
      discount=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(), name='discount', minimum=0.0,
          maximum=1.0),
      observation=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(4,), name='obs', minimum=-10.0,
          maximum=10.0))
  self._action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=10, name='act_0')
  self._global_seed = 12345
  tf.compat.v1.set_random_seed(self._global_seed)
def current_time_step(self):

  def first():
    return (tf.constant(FIRST, dtype=tf.int32),
            tf.constant(0.0, dtype=tf.float32),
            tf.constant(1.0, dtype=tf.float32))

  def mid():
    return (tf.constant(MID, dtype=tf.int32),
            tf.constant(0.0, dtype=tf.float32),
            tf.constant(1.0, dtype=tf.float32))

  def last():
    return (tf.constant(LAST, dtype=tf.int32),
            tf.constant(1.0, dtype=tf.float32),
            tf.constant(0.0, dtype=tf.float32))

  state_value = tf.mod(self._state.value(), 3)
  step_type, reward, discount = tf.case(
      {
          tf.equal(state_value, FIRST): first,
          tf.equal(state_value, MID): mid,
          tf.equal(state_value, LAST): last
      },
      exclusive=True,
      strict=True)
  return ts.TimeStep(step_type, reward, discount, state_value)
def _train(self, experience, weights=None):
  # TODO(b/126593927): Support batch dimensions >1.
  if experience.step_type.shape[0] != 1:
    raise NotImplementedError('ReinforceAgent does not yet support batch '
                              'dimensions greater than 1.')
  experience = tf.nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
  returns = common.compute_returns(experience.reward, experience.discount)
  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  # TODO(b/126592060): replace with tensor normalizer.
  if self._normalize_returns:
    ret_mean, ret_var = tf.nn.moments(x=returns, axes=[0])
    returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='normalized_returns',
          data=returns,
          step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  variables_to_train = self._actor_network.variables
  with tf.GradientTape() as tape:
    loss_info = self._loss(
        time_step,
        experience.action,
        tf.stop_gradient(returns),
        weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
  grads = tape.gradient(loss_info.loss, variables_to_train)

  grads_and_vars = zip(grads, variables_to_train)
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
def loop_body(time, time_step, policy_state, output_action_tas,
              output_policy_info_tas):
  """Runs a step in environment.  The while loop will call it multiple times.

  Args:
    time: Step time.
    time_step: Previous step's `TimeStep`.
    policy_state: Policy state tensor or nested structure of tensors.
    output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
    output_policy_info_tas: Updated nest of `tf.TensorArray`, the new
      policy info.

  Returns:
    loop_vars for the next iteration of tf.while_loop.
  """
  policy_state, next_output_action_tas, next_output_policy_info_tas = (
      process_step(time, time_step, policy_state, output_action_tas,
                   output_policy_info_tas))

  ta_read = lambda ta: ta.read(time)
  ta_read_prev = lambda ta: ta.read(time - 1)
  time_step = ts.TimeStep(
      step_type=ta_read(trajectory_tas.step_type),
      observation=tf.nest.map_structure(ta_read, trajectory_tas.observation),
      reward=tf.nest.map_structure(ta_read_prev, trajectory_tas.reward),
      discount=ta_read_prev(trajectory_tas.discount))

  return (time + 1, time_step, policy_state, next_output_action_tas,
          next_output_policy_info_tas)
def _set_names_and_shapes(self, step_type, reward, discount,
                          *flat_observations):
  """Returns a `TimeStep` namedtuple."""
  step_type = tf.identity(step_type, name='step_type')
  reward = tf.identity(reward, name='reward')
  discount = tf.identity(discount, name='discount')
  batch_shape = () if not self.batched else (self.batch_size,)
  batch_shape = tf.TensorShape(batch_shape)
  if not tf.executing_eagerly():
    # Shapes are not required in eager mode.
    reward.set_shape(batch_shape)
    step_type.set_shape(batch_shape)
    discount.set_shape(batch_shape)
  # Give each tensor a meaningful name and set the static shape.
  named_observations = []
  for obs, spec in zip(flat_observations,
                       tf.nest.flatten(self.observation_spec())):
    named_observation = tf.identity(obs, name=spec.name)
    if not tf.executing_eagerly():
      named_observation.set_shape(batch_shape.concatenate(spec.shape))
    named_observations.append(named_observation)

  observations = tf.nest.pack_sequence_as(self.observation_spec(),
                                          named_observations)

  return ts.TimeStep(step_type, reward, discount, observations)
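# Stand-alone sketch of the flatten -> rename -> pack round-trip used by
# _set_names_and_shapes above, with a hypothetical two-field observation
# nest (the field names are made up for illustration).
import tensorflow as tf

observation_spec = {'camera': 'camera', 'position': 'position'}
flat_observations = [tf.constant([[0], [1]]),   # camera
                     tf.constant([0.0, 1.0])]   # position

named_observations = [
    tf.identity(obs, name=name)
    for obs, name in zip(flat_observations, tf.nest.flatten(observation_spec))
]
# Packs the renamed leaves back into the original dict structure.
observations = tf.nest.pack_sequence_as(observation_spec, named_observations)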
def testTrain(self, num_epochs, use_td_lambda_return):
  if tf.executing_eagerly():
    self.skipTest('b/123777119')  # Secondary bug: ('b/123770140')
  with tf.compat.v2.summary.record_if(False):
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(self._action_spec,),
        value_net=DummyValueNet(outer_rank=2),
        normalize_observations=False,
        num_epochs=num_epochs,
        use_gae=use_td_lambda_return,
        use_td_lambda_return=use_td_lambda_return)
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ], dtype=tf.float32)

    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                          dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }

    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Mock the build_train_op to return an op for incrementing this counter.
    counter = tf.compat.v1.train.get_or_create_global_step()
    zero = tf.constant(0, dtype=tf.float32)
    agent.build_train_op = (
        lambda *_, **__: tf_agent.LossInfo(  # pylint: disable=g-long-lambda
            counter.assign_add(1), ppo_agent.PPOLossInfo(*[zero] * 5)))

    train_op = agent.train(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Assert that counter starts out at zero.
    self.assertEqual(0, self.evaluate(counter))

    self.evaluate(train_op)

    # Assert that train_op ran increment_counter num_epochs times.
    self.assertEqual(num_epochs, self.evaluate(counter))
def _pack_and_filter_timestep_observation(self, timestep):
  """Pack and filter observations into a single dimension.

  Args:
    timestep: A `TimeStep` namedtuple containing:
      - step_type: A `StepType` value.
      - reward: Reward at this timestep.
      - discount: A discount in the range [0, 1].
      - observation: A NumPy array, or a nested dict, list or tuple of arrays
        corresponding to `observation_spec()`.

  Returns:
    A new `TimeStep` namedtuple whose observations are filtered and packed
    into a single dimension.
  """
  # We can't set attributes on the TimeStep tuple, so we make a copy of the
  # observations.
  observations = timestep.observation
  if self._observations_whitelist is not None:
    observations = self._filter_observations(observations)

  return ts.TimeStep(
      timestep.step_type, timestep.reward, timestep.discount,
      self._flatten_nested_observations(
          observations, is_batched=self._env.batched))
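# What "packed into a single dimension" amounts to, sketched in plain numpy
# under the assumption that the wrapper concatenates the flattened leaves of
# the (optionally whitelisted) observation nest; names are illustrative.
import numpy as np

observation = {'position': np.array([0.1, 0.2]), 'target': np.array([1.0])}
whitelist = ['position', 'target']

filtered = {k: observation[k] for k in whitelist}
packed = np.concatenate([np.ravel(filtered[k]) for k in sorted(filtered)])
# packed -> array([0.1, 0.2, 1.0]), a single flat observation vector.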
def _time_step_batch(self):
  return ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[2], name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      tf.constant([[1, 2], [3, 4]], dtype=tf.float32, name='observation'))
def testTrain(self, num_epochs, use_td_lambda_return):
  with tf.compat.v2.summary.record_if(False):
    # Mock the build_train_op to return an op for incrementing this counter.
    counter = common.create_variable('test_train_counter')
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=num_epochs,
        use_gae=use_td_lambda_return,
        use_td_lambda_return=use_td_lambda_return,
        train_step_counter=counter)
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ], dtype=tf.float32)

    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]],
                          dtype=tf.float32)

    action_distribution_parameters = {
        'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
        'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
    }

    policy_info = action_distribution_parameters

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()

    if not tf.executing_eagerly():
      loss = agent.train(experience)
    else:
      loss = lambda: agent.train(experience)

    # Assert that counter starts out at zero.
    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(0, self.evaluate(counter))
    self.evaluate(loss)
    # Assert that train_op ran increment_counter num_epochs times.
    self.assertEqual(num_epochs, self.evaluate(counter))
def _apply_actor_network(self, time_step, policy_state):
  if self._observation_normalizer:
    observation = self._observation_normalizer.normalize(
        time_step.observation)
    time_step = ts.TimeStep(time_step.step_type, time_step.reward,
                            time_step.discount, observation)
  return self._actor_network(
      time_step.observation, time_step.step_type, network_state=policy_state)
def _train(self, experience, weights=None, train_step_counter=None):
  # TODO(sfishman): Support batch dimensions >1.
  if experience.step_type.shape[0] != 1:
    raise NotImplementedError('ReinforceAgent does not yet support batch '
                              'dimensions greater than 1.')
  experience = nest.map_structure(lambda t: tf.squeeze(t, 0), experience)
  returns = common.compute_returns(experience.reward, experience.discount)
  if self._debug_summaries:
    tf.contrib.summary.histogram('rewards', experience.reward)
    tf.contrib.summary.histogram('discounts', experience.discount)
    tf.contrib.summary.histogram('returns', returns)

  # TODO(kbanoop): replace with tensor normalizer.
  if self._normalize_returns:
    ret_mean, ret_var = tf.nn.moments(returns, axes=[0])
    returns = (returns - ret_mean) / (tf.sqrt(ret_var) + 1e-6)
    if self._debug_summaries:
      tf.contrib.summary.histogram('normalized_returns', returns)

  # TODO(kbanoop): remove after changing network interface to accept
  # observations and step_types, instead of time_steps.
  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  # TODO(kbanoop): Filter boundary steps.
  loss_info = self._loss(
      time_step, experience.action, tf.stop_gradient(returns),
      weights=weights)

  clip_gradients = (
      tf.contrib.training.clip_gradient_norms_fn(self._gradient_clipping)
      if self._gradient_clipping else None)

  # TODO(sguada): create_train_step should not return a Future.
  loss_info = eager_utils.create_train_step(
      loss_info,
      self._optimizer,
      total_loss_fn=lambda loss_info: loss_info.loss,
      global_step=train_step_counter,
      transform_grads_fn=clip_gradients,
      summarize_gradients=self._summarize_grads_and_vars,
      variables_to_train=lambda: self._actor_network.trainable_weights,
  )

  if isinstance(loss_info, eager_utils.Future):
    loss_info = loss_info()

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      for var in self._actor_network.trainable_weights:
        tf.contrib.summary.histogram(var.name.replace(':', '_'), var)

  return loss_info
def _step(self, action):
  total_reward = 0
  for _ in range(self._times):
    time_step = self._env.step(action)
    total_reward += time_step.reward
    if time_step.is_last():
      break
  return ts.TimeStep(time_step.step_type, total_reward, time_step.discount,
                     time_step.observation)
def testLastNumpy(self):
  observation = -1
  reward = 2.0
  discount = 1.0
  time_step = ts.TimeStep(
      np.asarray(ts.StepType.LAST), np.asarray(reward), np.asarray(discount),
      np.asarray(observation))
  self.assertTrue(time_step.is_last())
  self.assertEqual(ts.StepType.LAST, time_step.step_type)
  self.assertEqual(-1, time_step.observation)
  self.assertEqual(2.0, time_step.reward)
  self.assertEqual(1.0, time_step.discount)
def _train(self, experience, weights=None):
  returns = value_ops.discounted_return(
      experience.reward, experience.discount, time_major=False)

  if self._debug_summaries:
    tf.compat.v2.summary.histogram(
        name='rewards', data=experience.reward, step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='discounts',
        data=experience.discount,
        step=self.train_step_counter)
    tf.compat.v2.summary.histogram(
        name='returns', data=returns, step=self.train_step_counter)

  # TODO(b/126592060): replace with tensor normalizer.
  if self._normalize_returns:
    returns = _standard_normalize(returns, axes=(0, 1))
    if self._debug_summaries:
      tf.compat.v2.summary.histogram(
          name='normalized_returns',
          data=returns,
          step=self.train_step_counter)

  time_step = ts.TimeStep(experience.step_type,
                          tf.zeros_like(experience.reward),
                          tf.zeros_like(experience.discount),
                          experience.observation)

  variables_to_train = self._actor_network.variables
  with tf.GradientTape() as tape:
    loss_info = self._loss(
        time_step,
        experience.action,
        tf.stop_gradient(returns),
        weights=weights)
    tf.debugging.check_numerics(loss_info.loss, 'Loss is inf or nan')
  grads = tape.gradient(loss_info.loss, variables_to_train)

  grads_and_vars = zip(grads, variables_to_train)
  if self._gradient_clipping:
    grads_and_vars = eager_utils.clip_gradient_norms(grads_and_vars,
                                                     self._gradient_clipping)

  if self._summarize_grads_and_vars:
    eager_utils.add_variables_summaries(grads_and_vars,
                                        self.train_step_counter)
    eager_utils.add_gradients_summaries(grads_and_vars,
                                        self.train_step_counter)

  self._optimizer.apply_gradients(
      grads_and_vars, global_step=self.train_step_counter)

  return tf.nest.map_structure(tf.identity, loss_info)
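# For reference, a plain-numpy sketch of the backwards recurrence
# returns[t] = reward[t] + discount[t] * returns[t + 1] that the returns
# computation above is assumed to follow (batch-major, as with
# time_major=False); check value_ops.discounted_return for the exact
# handling of the final step and of any bootstrap value.
import numpy as np

def _discounted_return_sketch(rewards, discounts):
  returns = np.zeros_like(rewards)
  next_return = np.zeros(rewards.shape[:-1])
  for t in reversed(range(rewards.shape[-1])):
    returns[..., t] = rewards[..., t] + discounts[..., t] * next_return
    next_return = returns[..., t]
  return returns

# Example: a single episode of three steps with 0.9 discounts.
print(_discounted_return_sketch(
    np.array([[1.0, 1.0, 1.0]]),
    np.array([[0.9, 0.9, 0.0]])))  # [[2.71, 1.9, 1.0]]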
def testTrain(self, num_epochs):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._action_spec,),
      value_net=DummyValueNet(outer_rank=2),
      normalize_observations=False,
      num_epochs=num_epochs,
  )
  observations = tf.constant([
      [[1, 2], [3, 4], [5, 6]],
      [[1, 2], [3, 4], [5, 6]],
  ], dtype=tf.float32)

  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

  action_distribution_parameters = {
      'loc': tf.constant([[0.0, 0.0], [0.0, 0.0]], dtype=tf.float32),
      'scale': tf.constant([[1.0, 1.0], [1.0, 1.0]], dtype=tf.float32),
  }

  policy_info = action_distribution_parameters

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  # Mock the build_train_op to return an op for incrementing this counter.
  counter = tf.train.get_or_create_global_step()
  zero = tf.constant(0, dtype=tf.float32)
  agent.build_train_op = (
      lambda *_, **__: (counter.assign_add(1), [zero] * 5))

  train_op = agent.train(experience)

  with self.test_session() as sess:
    sess.run(tf.global_variables_initializer())

    # Assert that counter starts out at zero.
    counter_ = sess.run(counter)
    self.assertEqual(0, counter_)

    sess.run(train_op)

    # Assert that train_op ran increment_counter num_epochs times.
    counter_ = sess.run(counter)
    self.assertEqual(num_epochs, counter_)
def convert_time_step(time_step):
  """Convert to agents time_step type as the __hash__ method is different."""
  reward = time_step.reward
  if reward is None:
    reward = 0.0
  discount = time_step.discount
  if discount is None:
    discount = 1.0
  return ts.TimeStep(
      ts.StepType(time_step.step_type),
      _as_float32_array(reward),
      _as_float32_array(discount),
      time_step.observation,
  )
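# Usage sketch for convert_time_step, assuming the incoming step comes from
# an environment API (e.g. dm_env) where reward and discount are None on the
# first step; _EnvTimeStep is a hypothetical stand-in for that source type.
class _EnvTimeStep(object):

  def __init__(self, step_type, reward, discount, observation):
    self.step_type = step_type
    self.reward = reward
    self.discount = discount
    self.observation = observation

first_step = _EnvTimeStep(ts.StepType.FIRST, None, None, [0.0, 0.0])
agents_step = convert_time_step(first_step)
# reward/discount come back as float32 0.0 and 1.0 (assuming
# _as_float32_array casts to np.float32), and step_type is a ts.StepType.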
def test(self):
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST
  step_types = [first, mid, mid, last, mid, mid, mid, last]
  discounts = [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]
  time_steps = ts.TimeStep(
      step_type=step_types,
      discount=discounts,
      reward=discounts,
      observation=discounts)
  episode_mask = common.get_episode_mask(time_steps)
  expected_mask = [1, 1, 1, 0, 1, 1, 1, 0]
  self.evaluate(tf.global_variables_initializer())
  self.assertAllEqual(expected_mask, self.evaluate(episode_mask))
def testTrainWithRnn(self):
  with tf.compat.v2.summary.record_if(False):
    actor_net = actor_distribution_rnn_network.ActorDistributionRnnNetwork(
        self._obs_spec,
        self._action_spec,
        input_fc_layer_params=None,
        output_fc_layer_params=None,
        conv_layer_params=None,
        lstm_size=(40,))

    counter = common.create_variable('test_train_counter')

    agent = reinforce_agent.ReinforceAgent(
        self._time_step_spec,
        self._action_spec,
        actor_network=actor_net,
        optimizer=tf.compat.v1.train.AdamOptimizer(0.001),
        train_step_counter=counter)

    batch_size = 5
    observations = tf.constant(
        [[[1, 2], [3, 4], [5, 6]]] * batch_size, dtype=tf.float32)
    time_steps = ts.TimeStep(
        step_type=tf.constant([[1] * 3] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        discount=tf.constant([[1] * 3] * batch_size, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]]] * batch_size, dtype=tf.float32)

    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, (), time_steps.step_type,
                                       time_steps.reward, time_steps.discount)

    # Force variable creation.
    agent.policy.variables()

    if tf.executing_eagerly():
      loss = lambda: agent.train(experience)
    else:
      loss = agent.train(experience)

    self.evaluate(tf.compat.v1.initialize_all_variables())
    self.assertEqual(self.evaluate(counter), 0)
    self.evaluate(loss)
    self.assertEqual(self.evaluate(counter), 1)
def testMakeTimestepMaskWithPartialEpisode(self):
  first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

  next_step_types = tf.constant(
      [[mid, mid, last, first, mid, mid, last, first, mid, mid],
       [mid, mid, last, first, mid, mid, mid, mid, mid, last]])
  zeros = tf.zeros_like(next_step_types)
  next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

  # Mask should be 0.0 for transition timesteps (3, 7) and for all timesteps
  # belonging to the final, incomplete episode.
  expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                   [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
  timestep_mask = ppo_utils.make_timestep_mask(next_time_step)

  timestep_mask_ = self.evaluate(timestep_mask)
  self.assertAllClose(expected_mask, timestep_mask_)
def run(self, trajectory, policy_state=None):
  """Apply the policy to trajectory steps and store actions/info.

  If `self.time_major == True`, the tensors in `trajectory` are assumed to
  have shape `[time, batch, ...]`.  Otherwise they are assumed to have shape
  `[batch, time, ...]`.

  Args:
    trajectory: The `Trajectory` to run against.  If the replay class was
      created with `time_major=True`, then the tensors in trajectory must be
      shaped `[time, batch, ...]`.  Otherwise they must be shaped
      `[batch, time, ...]`.
    policy_state: (optional) A nest Tensor with initial step policy state.

  Returns:
    output_actions: A nest of the actions that the policy took.  If the
      replay class was created with `time_major=True`, then the tensors here
      will be shaped `[time, batch, ...]`.  Otherwise they'll be shaped
      `[batch, time, ...]`.
    output_policy_info: A nest of the policy info that the policy emitted.
      If the replay class was created with `time_major=True`, then the
      tensors here will be shaped `[time, batch, ...]`.  Otherwise they'll be
      shaped `[batch, time, ...]`.
    policy_state: A nest Tensor with final step policy state.

  Raises:
    TypeError: If `policy_state` structure doesn't match
      `self.policy.policy_state_spec`, or `trajectory` structure doesn't
      match `self.policy.trajectory_spec`.
    ValueError: If `policy_state` doesn't match
      `self.policy.policy_state_spec`, or `trajectory` structure doesn't
      match `self.policy.trajectory_spec`.
    ValueError: If `trajectory` lacks two outer dims.
  """
  trajectory_spec = self._policy.trajectory_spec()
  outer_dims = nest_utils.get_outer_shape(trajectory, trajectory_spec)

  if tf.compat.dimension_value(outer_dims.shape[0]) != 2:
    raise ValueError(
        "Expected two outer dimensions, but saw '{}' dimensions.\n"
        "Trajectory:\n{}.\nTrajectory spec from policy:\n{}.".format(
            tf.compat.dimension_value(outer_dims.shape[0]), trajectory,
            trajectory_spec))
  if self._time_major:
    sequence_length = outer_dims[0]
    batch_size = outer_dims[1]
    static_batch_size = tf.compat.dimension_value(
        trajectory.discount.shape[1])
  else:
    batch_size = outer_dims[0]
    sequence_length = outer_dims[1]
    static_batch_size = tf.compat.dimension_value(
        trajectory.discount.shape[0])

  if policy_state is None:
    policy_state = self._policy.get_initial_state(batch_size)
  else:
    tf.nest.assert_same_structure(policy_state,
                                  self._policy.policy_state_spec())

  if not self._time_major:
    # Make trajectory time-major.
    trajectory = tf.nest.map_structure(common_utils.transpose_batch_time,
                                       trajectory)

  trajectory_tas = tf.nest.map_structure(
      lambda t: tf.TensorArray(t.dtype, size=sequence_length).unstack(t),
      trajectory)

  def create_output_ta(spec):
    return tf.TensorArray(
        spec.dtype,
        size=sequence_length,
        element_shape=(tf.TensorShape([static_batch_size]).concatenate(
            spec.shape)))

  output_action_tas = tf.nest.map_structure(create_output_ta,
                                            trajectory_spec.action)
  output_policy_info_tas = tf.nest.map_structure(create_output_ta,
                                                 trajectory_spec.policy_info)

  read0 = lambda ta: ta.read(0)
  zeros_like0 = lambda t: tf.zeros_like(t[0])
  ones_like0 = lambda t: tf.ones_like(t[0])
  time_step = ts.TimeStep(
      step_type=read0(trajectory_tas.step_type),
      reward=tf.nest.map_structure(zeros_like0, trajectory.reward),
      discount=ones_like0(trajectory.discount),
      observation=tf.nest.map_structure(read0, trajectory_tas.observation))

  def process_step(time, time_step, policy_state, output_action_tas,
                   output_policy_info_tas):
    """Take an action on the given step, and update output TensorArrays.

    Args:
      time: Step time.  Describes which row to read from the trajectory
        TensorArrays and which location to write into in the output
        TensorArrays.
      time_step: Previous step's `TimeStep`.
      policy_state: Policy state tensor or nested structure of tensors.
      output_action_tas: Nest of `tf.TensorArray` containing new actions.
      output_policy_info_tas: Nest of `tf.TensorArray` containing new
        policy info.

    Returns:
      policy_state: The next policy state.
      next_output_action_tas: Updated `output_action_tas`.
      next_output_policy_info_tas: Updated `output_policy_info_tas`.
    """
    action_step = self._policy.action(time_step, policy_state)
    policy_state = action_step.state
    write_ta = lambda ta, t: ta.write(time - 1, t)
    next_output_action_tas = tf.nest.map_structure(write_ta,
                                                   output_action_tas,
                                                   action_step.action)
    next_output_policy_info_tas = tf.nest.map_structure(
        write_ta, output_policy_info_tas, action_step.info)

    return (action_step.state, next_output_action_tas,
            next_output_policy_info_tas)

  def loop_body(time, time_step, policy_state, output_action_tas,
                output_policy_info_tas):
    """Runs a step in environment.  The while loop will call it multiple times.

    Args:
      time: Step time.
      time_step: Previous step's `TimeStep`.
      policy_state: Policy state tensor or nested structure of tensors.
      output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
      output_policy_info_tas: Updated nest of `tf.TensorArray`, the new
        policy info.

    Returns:
      loop_vars for the next iteration of tf.while_loop.
    """
    policy_state, next_output_action_tas, next_output_policy_info_tas = (
        process_step(time, time_step, policy_state, output_action_tas,
                     output_policy_info_tas))

    ta_read = lambda ta: ta.read(time)
    ta_read_prev = lambda ta: ta.read(time - 1)
    time_step = ts.TimeStep(
        step_type=ta_read(trajectory_tas.step_type),
        observation=tf.nest.map_structure(ta_read,
                                          trajectory_tas.observation),
        reward=tf.nest.map_structure(ta_read_prev, trajectory_tas.reward),
        discount=ta_read_prev(trajectory_tas.discount))

    return (time + 1, time_step, policy_state, next_output_action_tas,
            next_output_policy_info_tas)

  time = tf.constant(1)
  (time, time_step, policy_state, output_action_tas,
   output_policy_info_tas) = (
       tf.while_loop(
           cond=lambda time, *_: time < sequence_length,
           body=loop_body,
           loop_vars=[
               time, time_step, policy_state, output_action_tas,
               output_policy_info_tas
           ],
           back_prop=False,
           name="trajectory_replay_loop"))

  # Run the last time step.
  last_policy_state, output_action_tas, output_policy_info_tas = (
      process_step(time, time_step, policy_state, output_action_tas,
                   output_policy_info_tas))

  def stack_ta(ta):
    t = ta.stack()
    if not self._time_major:
      t = common_utils.transpose_batch_time(t)
    return t

  stacked_output_actions = tf.nest.map_structure(stack_ta, output_action_tas)
  stacked_output_policy_info = tf.nest.map_structure(stack_ta,
                                                     output_policy_info_tas)

  return (stacked_output_actions, stacked_output_policy_info,
          last_policy_state)
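# Usage sketch for the run() method above, assuming it belongs to a
# TrajectoryReplay-style class constructed from a policy; the class name,
# constructor arguments, and variable names below are assumptions.
replay = TrajectoryReplay(policy=my_policy, time_major=False)

# `collected` is a Trajectory with [batch, time, ...] tensors, e.g. read
# back from a replay buffer.  run() replays the policy over it and returns
# the actions/policy info the policy would emit now, plus its final state.
output_actions, output_policy_info, final_policy_state = replay.run(collected)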
def _time_step(self):
  return ts.TimeStep(
      step_type=tf.constant([1], dtype=tf.int32),
      reward=tf.constant([1], dtype=tf.float32),
      discount=tf.constant([1], dtype=tf.float32),
      observation=tf.constant([[1, 2]], dtype=tf.float32))
def testAgentDoesNotFailWhenNestedObservationActionAndDebugSummaries(self):
  summary_writer = tf.compat.v2.summary.create_file_writer(
      FLAGS.test_tmpdir, flush_millis=10000)
  summary_writer.set_as_default()

  nested_obs_spec = (self._obs_spec, self._obs_spec, {
      'a': self._obs_spec,
      'b': self._obs_spec,
  })
  nested_time_spec = ts.time_step_spec(nested_obs_spec)

  nested_act_spec = (self._action_spec, {
      'c': self._action_spec,
      'd': self._action_spec
  })

  class NestedActorNet(network.DistributionNetwork):

    def __init__(self, dummy_model):
      output_spec = (dummy_model.output_spec, {
          'c': dummy_model.output_spec,
          'd': dummy_model.output_spec,
      })
      super(NestedActorNet, self).__init__(
          dummy_model.input_tensor_spec, (),
          output_spec=output_spec,
          name='NestedActorNet')
      self.dummy_model = dummy_model

    def call(self, *args, **kwargs):
      dummy_ans, _ = self.dummy_model(*args, **kwargs)
      return (dummy_ans, {'c': dummy_ans, 'd': dummy_ans}), ()

  dummy_model = DummyActorNet(nested_obs_spec, self._action_spec)
  agent = ppo_agent.PPOAgent(
      nested_time_spec,
      nested_act_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=NestedActorNet(dummy_model),
      value_net=DummyValueNet(nested_obs_spec),
      debug_summaries=True)

  observations = tf.constant([
      [[1, 2], [3, 4], [5, 6]],
      [[1, 2], [3, 4], [5, 6]],
  ], dtype=tf.float32)

  observations = (observations, observations, {
      'a': observations,
      'b': observations,
  })

  time_steps = ts.TimeStep(
      step_type=tf.constant([[1] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

  actions = (actions, {
      'c': actions,
      'd': actions,
  })

  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  action_distribution_parameters = (action_distribution_parameters, {
      'c': action_distribution_parameters,
      'd': action_distribution_parameters,
  })
  policy_info = action_distribution_parameters

  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  agent.train(experience)