def _create_trajectory(
    observation,
    action,
    policy_info,
    reward,
    discount,
    step_type,
    next_step_type,
    name_scope):
  """Builds a `Trajectory` holding either Tensors or numpy arrays.

  If any of `observation`, `action`, `policy_info`, `reward` or `discount`
  contains a tensor, every field of the result is a `Tensor` created under
  `name_scope`; otherwise every field is an `np.ndarray`.

  The outer shape of the result is read from `discount`, which is treated as a
  single array (not a nest). `step_type` and `next_step_type` are used as fill
  values for arrays of that outer shape.

  Args:
    observation: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    action: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    reward: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[B, ...]`, `[T, ...]`, or `[B, T, ...]`.
    discount: A floating point vector `Tensor` or `np.ndarray`; shaped `[B]`,
      `[T]`, or `[B, T]`. Its shape determines the shape of the generated
      `step_type` and `next_step_type` fields.
    step_type: `ts.StepType` value that the `step_type` field is filled with.
    next_step_type: `ts.StepType` value that the `next_step_type` field is
      filled with.
    name_scope: Python string, name scope used when creating tensors. Unused
      when the result is made of numpy arrays.

  Returns:
    A `Trajectory` instance.
  """

  def _assemble(fill_fn, leaf_fn, outer_shape, converted_discount):
    """Constructs the Trajectory using one backend's fill/convert functions."""
    convert = lambda struct: tf.nest.map_structure(leaf_fn, struct)
    # Keyword order is kept deliberately: it fixes the order in which ops are
    # created when the TensorFlow backend is used.
    return Trajectory(
        step_type=fill_fn(outer_shape, step_type),
        observation=convert(observation),
        action=convert(action),
        policy_info=convert(policy_info),
        next_step_type=fill_fn(outer_shape, next_step_type),
        reward=convert(reward),
        discount=converted_discount)

  contains_tensors = nest_utils.has_tensors(
      observation, action, policy_info, reward, discount)

  if not contains_tensors:
    # Pure numpy inputs: stay in numpy and use the static array shape.
    discount_array = np.asarray(discount)
    return _assemble(
        np.full, np.asarray, discount_array.shape, discount_array)

  with tf.name_scope(name_scope):
    discount_tensor = tf.identity(discount)
    return _assemble(
        tf.fill,
        tf.identity,
        tf.shape(input=discount_tensor),
        discount_tensor)
def from_episode(observation, action, policy_info, reward, discount=None):
  """Create a Trajectory from tensors representing a single episode.

  If none of the inputs are tensors, then numpy arrays are generated instead.

  If `discount` is not provided, the first entry in `reward` is used to
  estimate `T`:

  ```
  reward_0 = tf.nest.flatten(reward)[0]
  T = shape(reward_0)[0]
  ```

  In this case, a `discount` of all ones having dtype `float32` is generated.

  The generated `step_type` is `FIRST` followed by `MID` entries, and the
  generated `next_step_type` is `MID` entries followed by `LAST`.

  Args:
    observation: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[T, ...]`.
    action: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[T, ...]`.
    reward: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    discount: A floating point vector `Tensor` or `np.ndarray`; shaped `[T]`
      (optional).

  Returns:
    An instance of `Trajectory`.

  Raises:
    ValueError: If `T` is statically known and any input has a statically
      known outer dimension that differs from it.
  """
  use_tensors = nest_utils.has_tensors(
      observation, action, policy_info, reward, discount)

  # Select the TensorFlow or NumPy implementation of each primitive once, so
  # the construction logic below is backend agnostic.
  if use_tensors:
    ones_fn = tf.ones
    float_dtype = tf.float32
    convert_fn = tf.convert_to_tensor
    concat_fn = tf.concat
    maximum_fn = tf.maximum
    fill_fn = tf.fill
    identity_map = lambda struct: tf.nest.map_structure(tf.identity, struct)
  else:
    ones_fn = np.ones
    float_dtype = np.float32
    convert_fn = np.asarray
    concat_fn = np.concatenate
    maximum_fn = np.maximum
    fill_fn = np.full
    identity_map = lambda struct: tf.nest.map_structure(np.asarray, struct)

  def _from_episode(observation, action, policy_info, reward, discount):
    """Implementation of from_episode."""
    # `discount` is the preferred source for the episode length; fall back to
    # the first reward entry when it was not supplied.
    if discount is not None:
      time_source = discount
    else:
      time_source = tf.nest.flatten(reward)[0]
    if tf.is_tensor(time_source):
      # Prefer the static length; fall back to a dynamic shape tensor.
      num_frames = (tf.compat.dimension_value(time_source.shape[0]) or
                    tf.shape(input=time_source)[0])
    else:
      num_frames = np.shape(time_source)[0]
    if discount is None:
      discount = ones_fn([num_frames], dtype=float_dtype)

    if not tf.is_tensor(num_frames):
      # The length is statically known, so mismatches can be caught eagerly.

      def check_num_frames(t):
        # Normalise to a plain int (or None when unknown) so the comparison
        # and the error message do not depend on the shape representation.
        outer_dim = tf.compat.dimension_value(t.shape[0])
        if outer_dim is not None and outer_dim != num_frames:
          # Report the offending dimension rather than dumping the whole
          # tensor/array into the message.
          raise ValueError('Expected first dimension to be {}, '
                           'but saw outer dim: {}'.format(
                               num_frames, outer_dim))

      tf.nest.map_structure(
          check_num_frames,
          (observation, action, policy_info, reward, discount))

    ts_first = convert_fn(ts.StepType.FIRST)
    ts_last = convert_fn(ts.StepType.LAST)
    # Clamp at zero so the fill size is never negative.
    mid_size = maximum_fn(0, num_frames - 1)
    ts_mid = fill_fn([mid_size], ts.StepType.MID)
    step_type = concat_fn(([ts_first], ts_mid), axis=0)
    next_step_type = concat_fn((ts_mid, [ts_last]), axis=0)

    return Trajectory(
        step_type=step_type,
        observation=identity_map(observation),
        action=identity_map(action),
        policy_info=identity_map(policy_info),
        next_step_type=next_step_type,
        reward=identity_map(reward),
        discount=identity_map(discount))

  if use_tensors:
    with tf.name_scope('from_episode'):
      return _from_episode(observation, action, policy_info, reward, discount)
  else:
    return _from_episode(observation, action, policy_info, reward, discount)
def from_episode(
    observation: types.NestedSpecTensorOrArray,
    action: types.NestedSpecTensorOrArray,
    policy_info: types.NestedSpecTensorOrArray,
    reward: types.NestedSpecTensorOrArray,
    discount: Optional[types.SpecTensorOrArray] = None) -> Trajectory:
  """Builds a single-episode `Trajectory` from per-step data.

  The result is made of Tensors when any input contains a tensor, and of numpy
  arrays otherwise. `step_type` is generated as `FIRST` followed by `MID`
  entries, and `next_step_type` as `MID` entries followed by `LAST`.

  The episode length `T` is read from `discount` when it is given. Otherwise
  it is read from the first entry of `reward`:

  ```
  reward_0 = tf.nest.flatten(reward)[0]
  T = shape(reward_0)[0]
  ```

  and a `discount` of all ones with dtype `float32` is generated.

  **NOTE**: all tensors/numpy arrays passed to this function are expected to
  share the same time dimension `T`. When the generated trajectory passes
  through `to_transition`, it will only return a
  `(time_steps, next_time_steps)` pair with `T - 1` in the time dimension,
  which means the reward at step `T` is dropped. So if the reward at step `T`
  is important, please make sure the episode passed to this function contains
  an additional step.

  Args:
    observation: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[T, ...]`.
    action: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    policy_info: (possibly nested tuple of) `Tensor` or `np.ndarray`; all
      shaped `[T, ...]`.
    reward: (possibly nested tuple of) `Tensor` or `np.ndarray`; all shaped
      `[T, ...]`.
    discount: A floating point vector `Tensor` or `np.ndarray`; shaped `[T]`
      (optional).

  Returns:
    An instance of `Trajectory`.

  Raises:
    ValueError: If `T` is statically known and an input has a statically
      known outer dimension that differs from it.
  """
  use_tensors = nest_utils.has_tensors(
      observation, action, policy_info, reward, discount)
  map_composites = functools.partial(
      tf.nest.map_structure, expand_composites=True)

  # Bind the TensorFlow or NumPy flavour of every primitive up front so the
  # builder below does not need to branch on the backend.
  if use_tensors:
    (make_ones, ones_dtype, to_value, join, at_least, fill, copy_leaf) = (
        tf.ones, tf.float32, tf.convert_to_tensor, tf.concat, tf.maximum,
        tf.fill, tf.identity)
  else:
    (make_ones, ones_dtype, to_value, join, at_least, fill, copy_leaf) = (
        np.ones, np.float32, np.asarray, np.concatenate, np.maximum,
        np.full, np.asarray)

  def copy_nest(struct):
    """Applies the backend's leaf conversion across a (composite) nest."""
    return map_composites(copy_leaf, struct)

  def _build(obs, act, info, rew, disc):
    """Assembles the Trajectory; runs inside the name scope for tensors."""
    # Prefer `discount` as the source of the episode length.
    time_source = disc if disc is not None else tf.nest.flatten(rew)[0]
    if tf.is_tensor(time_source):
      num_frames = _maybe_static_outer_dim(time_source)
    else:
      num_frames = np.shape(time_source)[0]

    if disc is None:
      disc = make_ones([num_frames], dtype=ones_dtype)

    if not tf.is_tensor(num_frames):
      # The length is static, so inputs can be validated right away.

      def _verify_outer_dim(leaf):
        outer_dim = (
            _maybe_static_outer_dim(leaf) if tf.is_tensor(leaf)
            else leaf.shape[0])
        if tf.is_tensor(outer_dim):
          # Only known at run time; nothing to compare statically.
          return
        if outer_dim != num_frames:
          raise ValueError('Expected first dimension to be {}, '
                           'but saw outer dim: {}'.format(
                               num_frames, outer_dim))

      # Composites are checked as a whole rather than component by component.
      tf.nest.map_structure(
          _verify_outer_dim,
          (obs, act, info, rew, disc),
          expand_composites=False)

    first_step = to_value(ts.StepType.FIRST)
    last_step = to_value(ts.StepType.LAST)
    # Clamp at zero so the fill size is never negative.
    mid_steps = fill([at_least(0, num_frames - 1)], ts.StepType.MID)
    step_type = join(([first_step], mid_steps), axis=0)
    next_step_type = join((mid_steps, [last_step]), axis=0)

    return Trajectory(
        step_type=step_type,
        observation=copy_nest(obs),
        action=copy_nest(act),
        policy_info=copy_nest(info),
        next_step_type=next_step_type,
        reward=copy_nest(rew),
        discount=copy_nest(disc))

  if not use_tensors:
    return _build(observation, action, policy_info, reward, discount)

  with tf.name_scope('from_episode'):
    return _build(observation, action, policy_info, reward, discount)