def __call__(self, observation, state=()):
  # The action proposed by the actor network; the network call returns a
  # tuple and the action is taken from its second element.
  action = self._a_network(observation)[1]
  # Sample one uniformly random action per batch entry.
  rand_action = tensor_spec.sample_bounded_spec(
      self._a_network.action_spec, outer_dims=[observation.shape[0]])
  # Epsilon-greedy: with probability `epsilon`, use the random action
  # instead of the network's action.
  seed = tf.random.uniform([observation.shape[0]])
  is_random = tf.less(seed, self._epsilon)
  action = tf.compat.v2.where(is_random, rand_action, action)
  return action, state
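# A minimal, self-contained sketch of the epsilon-greedy mixing step
# above, using constant tensors in place of a network. All names and
# values below are illustrative assumptions, not part of the original
# code.
import tensorflow as tf

batch_size = 4
greedy_actions = tf.fill([batch_size], 2)   # stand-in for network output
random_actions = tf.fill([batch_size], 1)   # stand-in for sampled actions
epsilon = 0.5
# Each entry is independently replaced with probability `epsilon`.
is_random = tf.less(tf.random.uniform([batch_size]), epsilon)
mixed_actions = tf.compat.v2.where(is_random, random_actions, greedy_actions)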
def make_random_trajectory(): """Creates a random trajectory. This trajectory contains Tensors shaped `[1, 6, ...]` where `1` is the batch and `6` is the number of time steps. Observations are unbounded but actions are bounded to take values within `[1, 2]`. Policy info is also provided, and is equal to the actions. It can be removed via: ```python traj = make_random_trajectory().clone(policy_info=()) ``` Returns: A `Trajectory`. """ time_step_spec = ts.time_step_spec( tensor_spec.TensorSpec([], tf.int32, name='observation')) action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=1, maximum=2, name='action') # info and policy state specs match that of TFPolicyMock. outer_dims = [1, 6] # (batch_size, time) traj = trajectory.Trajectory( observation=tensor_spec.sample_spec_nest(time_step_spec.observation, outer_dims=outer_dims), action=tensor_spec.sample_bounded_spec(action_spec, outer_dims=outer_dims), policy_info=tensor_spec.sample_bounded_spec(action_spec, outer_dims=outer_dims), reward=tf.fill(outer_dims, tf.constant(0, dtype=tf.float32)), # step_type is F M L F M L. step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims), # next_step_type is M L F M L F. next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims), discount=tf.fill(outer_dims, tf.constant(1, dtype=tf.float32)), ) return traj, time_step_spec, action_spec
def __call__(self, observation, state=()):
  # The action proposed by the actor network; the network call returns a
  # tuple and the action is taken from its second element.
  action = self._a_network(observation)[1]
  # Add zero-mean Gaussian exploration noise.
  noise = tf.random.normal(shape=action.shape, stddev=self._std)
  action = action + noise
  # Clip the noisy action back into the valid range, shrunk by
  # `clip_eps` on each side so it stays strictly inside the bounds.
  spec = self._a_network.action_spec
  action = tf.clip_by_value(action, spec.minimum + self._clip_eps,
                            spec.maximum - self._clip_eps)
  # With probability `eps`, replace the noisy action with a uniformly
  # random one.
  rand_action = tensor_spec.sample_bounded_spec(
      self._a_network.action_spec, outer_dims=[observation.shape[0]])
  seed = tf.random.uniform([observation.shape[0]])
  is_random = tf.less(seed, self._eps)
  action = tf.compat.v2.where(is_random, rand_action, action)
  return action, state
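# A self-contained sketch of the noise-then-clip step above, using a
# plain tensor in place of the network output. The bounds and the `1e-3`
# margin are illustrative assumptions.
import tensorflow as tf

action = tf.constant([0.9, -0.9])
noise = tf.random.normal(shape=action.shape, stddev=0.1)
noisy_action = action + noise
# Keep the noisy action strictly inside [-1, 1] by shrinking the range
# slightly on each side, mirroring the `clip_eps` logic above.
clipped_action = tf.clip_by_value(noisy_action, -1.0 + 1e-3, 1.0 - 1e-3)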
def __call__(self, observation, state=()):
  # Ignore the observation and sample a uniformly random action for
  # each batch entry.
  action = tensor_spec.sample_bounded_spec(
      self._action_spec, outer_dims=[observation.shape[0]])
  return action, state
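# A sketch of what `sample_bounded_spec` yields for a policy like the
# one above; the spec's bounds and the batch size are illustrative
# assumptions.
import tensorflow as tf
from tf_agents.specs import tensor_spec

spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=1, maximum=2)
# An int32 tensor of shape [4] with values drawn uniformly from {1, 2}.
sampled_actions = tensor_spec.sample_bounded_spec(spec, outer_dims=[4])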