def testTrajectoryNotSingleStepTransition(self):
  converter = data_converter.AsTransition(self._data_context)
  traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                      outer_dims=[2, 3])
  converted = converter(traj)
  expected = trajectory.to_transition(traj)
  (expected, converted) = self.evaluate((expected, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, expected)
def testTransitionNoTimeDimensionRaises(self):
  converter = data_converter.AsTrajectory(self._data_context)
  traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                      outer_dims=[2])
  transition = trajectory.to_transition(traj, traj)
  with self.assertRaisesRegex(
      ValueError, r'must have two outer dimensions: batch size and time'):
    converter(transition)
def testTransitionNoTimeDimensionRaises(self):
  converter = data_converter.AsTrajectory(self._data_context)
  traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                      outer_dims=[2])
  transition = trajectory.to_transition(traj, traj)
  with self.assertRaisesRegex(
      ValueError, r'tensors must have shape \`\[B, T\] \+ spec.shape\`'):
    converter(transition)
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  if not self._q_network.state_spec:
    transitions = tf.nest.map_structure(
        lambda x: composite.squeeze(x, 1), transitions)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  return time_steps, actions, next_time_steps
def testFromBatchTimeTrajectory(self):
  converter = data_converter.AsTransition(self._data_context,
                                          squeeze_time_dim=True)
  traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                      outer_dims=[4, 2])  # [B, T=2]
  converted = converter(traj)
  expected = trajectory.to_transition(traj)
  # Remove the now-singleton time dim.
  expected = tf.nest.map_structure(lambda x: tf.squeeze(x, 1), expected)
  (expected, converted) = self.evaluate((expected, converted))
  tf.nest.map_structure(self.assertAllEqual, converted, expected)
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  if (self.train_sequence_length is not None and
      self.train_sequence_length == 2):
    # Squeeze the singleton time dimension if the critic network is stateless.
    time_steps, actions, next_time_steps = tf.nest.map_structure(
        lambda t: tf.squeeze(t, axis=1),
        (time_steps, actions, next_time_steps))
  return time_steps, actions, next_time_steps
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  # Remove time dim if we are not using a recurrent network.
  if not self._actor_network.state_spec:
    transitions = tf.nest.map_structure(lambda x: tf.squeeze(x, [1]),
                                        transitions)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  return time_steps, actions, next_time_steps
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action
  rewards = next_time_steps.reward
  discounts = next_time_steps.discount

  if self._reward_normalizer:
    rewards = self._reward_normalizer.normalize(
        rewards, center_mean=False, clip_value=self._reward_norm_clipping)

  value_preds = self.double_batch_pred(
      self._mod_net, experience.observation, is_training=True)
  returns = self.compute_return(next_time_steps, value_preds)

  value_estimation_losses = []
  loss_info = None
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(time_steps, returns, weights)

      variables_to_train = self._mod_net.trainable_weights
      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      self._optimizer.apply_gradients(grads_and_vars)

      value_estimation_losses.append(loss_info.extra.value_estimation_loss)

  loss_info = tf.nest.map_structure(tf.identity, loss_info)
  return loss_info
def __call__(self, value: typing.Any):
  """Converts `value` to a Transition.  Performs data validation and pruning.

  - If `value` is already a `Transition`, only validation is performed.
  - If `value` is a `Trajectory` and `squeeze_time_dim = True`, then `value`
    must have tensors with shape `[B, T=2]` outer dims.  This is converted
    to a `Transition` object without a time dimension.
  - If `value` is a `Trajectory` with tensors containing a time dimension
    having `T != 2`, a `ValueError` is raised.

  Args:
    value: A `Trajectory` or `Transition` object to convert.

  Returns:
    A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
    the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
    the tensors will have shape `[B, T - 1, ...]`.

  Raises:
    TypeError: If `value` is not one of `Trajectory` or `Transition`.
    ValueError: If `value` has structure that doesn't match the converter's
      spec.
    TypeError: If `value` has a structure that doesn't match the converter's
      spec.
    ValueError: If `squeeze_time_dim=True` and `value` is a `Trajectory`
      with a time dimension having value other than `T=2`.
  """
  if isinstance(value, trajectory.Transition):
    pass
  elif isinstance(value, trajectory.Trajectory):
    required_sequence_length = 2 if self._squeeze_time_dim else None
    _validate_trajectory(
        value,
        self._data_context.trajectory_spec,
        sequence_length=required_sequence_length)
    value = trajectory.to_transition(value)
    # Remove the now-singleton time dim.
    if self._squeeze_time_dim:
      value = tf.nest.map_structure(
          lambda x: composite.squeeze(x, axis=1), value)
  else:
    raise TypeError('Input type not supported: {}'.format(value))
  self._validate_transition(value)
  value = nest_utils.prune_extra_keys(
      self._data_context.transition_spec, value)
  return value
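# A hedged illustration of the validation path described in the docstring
# above. This is a sketch written against the same test fixtures used in the
# surrounding tests (`self._data_context`, `tensor_spec.sample_spec_nest`),
# not library code: with `squeeze_time_dim=True` the converter only accepts
# trajectories whose time dimension is exactly T=2, while the non-squeezing
# converter accepts any `[B, T]` trajectory and returns `[B, T - 1, ...]`
# tensors.
def example_sequence_length_validation(self):
  squeezing = data_converter.AsTransition(
      self._data_context, squeeze_time_dim=True)
  traj = tensor_spec.sample_spec_nest(
      self._data_context.trajectory_spec, outer_dims=[2, 3])  # [B=2, T=3]
  try:
    squeezing(traj)  # T != 2, so validation raises ValueError.
  except ValueError:
    pass
  non_squeezing = data_converter.AsTransition(self._data_context)
  transition = non_squeezing(traj)  # OK: tensors keep shape [B, T - 1, ...].
  return transition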
def _experience_to_transitions(self, experience):
  boundary_mask = tf.logical_not(experience.is_boundary()[:, 0])
  experience = nest_utils.fast_map_structure(
      lambda *x: tf.boolean_mask(*x, boundary_mask), experience)
  time_steps, policy_steps, next_time_steps = trajectory.to_transition(
      experience)
  actions = policy_steps.action
  if (self.train_sequence_length is not None and
      self.train_sequence_length == 2):
    # Squeeze the singleton time dimension if the critic network is stateless.
    time_steps, actions, next_time_steps = tf.nest.map_structure(
        lambda t: tf.squeeze(t, axis=1),
        (time_steps, actions, next_time_steps))
  return time_steps, actions, policy_steps.info.alpha[:, 0], next_time_steps
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  if (self.train_sequence_length is not None and
      self.train_sequence_length == 2):
    # Squeeze the singleton time dimension if the critic network is stateless.
    time_steps, actions, next_time_steps = tf.nest.map_structure(
        lambda t: tf.squeeze(t, axis=1),
        (time_steps, actions, next_time_steps))
  return time_steps, actions, next_time_steps
def _train(self, experience, weights=None):
  # Unpack trajectories.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)

  batch_size = nest_utils.get_outer_shape(time_steps,
                                          self._time_step_spec)[0]
  value_state = self._collect_policy.get_initial_value_state(
      batch_size=batch_size)

  weights = ppo_utils.make_timestep_mask(next_time_steps)

  value_preds, _ = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, value_state=value_state)
  value_preds = tf.stop_gradient(value_preds)

  rewards = next_time_steps.reward
  # Normalize rewards.
  if self._reward_normalizer is not None:
    rewards = self._reward_normalizer.normalize(
        rewards, center_mean=False, clip_value=self._reward_norm_clipping)

  returns, normalized_advantages = compute_return_and_advantage(
      self._discount_factor, self._lambda, rewards, next_time_steps,
      value_preds)

  policy_loss = self._update_policy(time_steps, policy_steps_,
                                    normalized_advantages, weights)
  value_loss = self._update_values(time_steps, returns, weights)

  return tf_agent.LossInfo(
      loss=value_loss + policy_loss,
      extra=TRPOLossInfo(
          value_estimation_loss=value_loss,
          policy_gradient_loss=policy_loss),
  )
def data_generation(self):
  # Set up a random policy.
  initial_collect_policy = random_tf_policy.RandomTFPolicy(
      self._train_env.time_step_spec(), self._train_env.action_spec())

  # Set up a driver that uses the random policy to collect data.
  init_driver = dynamic_step_driver.DynamicStepDriver(
      self._train_env,
      # a random policy that can be used to collect data from the environment
      initial_collect_policy,
      # a list of observers that are updated after every step in the environment
      observers=[
          self._replay_buffer_observer,
          Progress_viz(param.DATASET_STEPS)
      ],
      # the number of steps in the dataset
      num_steps=param.DATASET_STEPS)

  # Record the sequence of state transitions and results in the observers.
  final_time_step, final_policy_state = init_driver.run()

  # Verify collected trajectories (optional).
  if self._visual_flag:
    trajectories, buffer_info = self._replay_buffer.get_next(
        sample_batch_size=2, num_steps=10)
    time_steps, action_steps, next_time_steps = trajectory.to_transition(
        trajectories)
    print("trajectories._fields", trajectories._fields)
    print("time_steps.observation.shape = ", time_steps.observation.shape)

  # Create a dataset from the replay buffer.
  self._dataset = self._replay_buffer.as_dataset(
      sample_batch_size=param.DATASET_BATCH,
      num_steps=param.DATASET_BUFFER_STEP,
      num_parallel_calls=param.DATASET_PARALLEL).prefetch(
          param.DATASET_PREFETCH)
def testToTransitionHandlesTrajectoryFromDriverCorrectly(self):
  env = tf_py_environment.TFPyEnvironment(
      drivers_test_utils.PyEnvironmentMock())
  policy = drivers_test_utils.TFPolicyMock(env.time_step_spec(),
                                           env.action_spec())
  replay_buffer = drivers_test_utils.make_replay_buffer(policy)

  driver = dynamic_episode_driver.DynamicEpisodeDriver(
      env, policy, num_episodes=3, observers=[replay_buffer.add_batch])

  run_driver = driver.run()
  rb_gather_all = replay_buffer.gather_all()

  self.evaluate(tf.compat.v1.global_variables_initializer())
  self.evaluate(run_driver)
  trajectories = self.evaluate(rb_gather_all)

  transitions = trajectory.to_transition(trajectories)
  self.assertIsInstance(transitions, trajectory.Transition)
  time_steps, policy_step, next_time_steps = transitions

  self.assertAllEqual(time_steps.observation,
                      trajectories.observation[:, :-1])
  self.assertAllEqual(time_steps.step_type, trajectories.step_type[:, :-1])
  self.assertAllEqual(next_time_steps.observation,
                      trajectories.observation[:, 1:])
  self.assertAllEqual(next_time_steps.step_type,
                      trajectories.step_type[:, 1:])
  self.assertAllEqual(next_time_steps.reward, trajectories.reward[:, :-1])
  self.assertAllEqual(next_time_steps.discount,
                      trajectories.discount[:, :-1])
  self.assertAllEqual(policy_step.action, trajectories.action[:, :-1])
  self.assertAllEqual(policy_step.info, trajectories.policy_info[:, :-1])
def testSequencePreprocess(self, strategy_fn):
  with strategy_fn().scope():
    counter = common.create_variable('test_train_counter')
    batch_size = 2
    n_time_steps = 3
    agent = ppo_agent.PPOAgent(
        self._time_step_spec,
        self._action_spec,
        tf.compat.v1.train.AdamOptimizer(),
        actor_net=DummyActorNet(
            self._obs_spec,
            self._action_spec,
        ),
        value_net=DummyValueNet(self._obs_spec),
        normalize_observations=False,
        num_epochs=1,
        use_gae=False,
        use_td_lambda_return=False,
        compute_value_and_advantage_in_train=False,
        train_step_counter=counter)
    agent.initialize()
  observations = tf.constant(
      [
          [[1, 2], [3, 4], [5, 6]],
          [[1, 2], [3, 4], [5, 6]],
      ],
      dtype=tf.float32)

  mid_time_step_val = ts.StepType.MID.tolist()
  time_steps = ts.TimeStep(
      step_type=tf.constant(
          [[mid_time_step_val] * n_time_steps] * batch_size, dtype=tf.int32),
      reward=tf.constant(
          [[1] * n_time_steps] * batch_size, dtype=tf.float32),
      discount=tf.constant(
          [[1] * n_time_steps] * batch_size, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

  old_action_distribution_parameters = {
      'loc': tf.constant(
          [[[0.0]] * n_time_steps] * batch_size, dtype=tf.float32),
      'scale': tf.constant(
          [[[1.0]] * n_time_steps] * batch_size, dtype=tf.float32),
  }

  value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                            dtype=tf.float32)
  policy_info = {
      'dist_params': old_action_distribution_parameters,
      'value_prediction': value_preds,
  }
  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)

  returned_experience = agent.preprocess_sequence(experience)
  self.evaluate(tf.compat.v1.initialize_all_variables())

  self.assertAllClose(observations, returned_experience.observation)
  self.assertAllClose(actions, returned_experience.action)

  expected_value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                     dtype=tf.float32)
  (_, _, next_time_steps) = trajectory.to_transition(experience)
  expected_returns, expected_advantages = agent.compute_return_and_advantage(
      next_time_steps, expected_value_preds)
  self.assertAllClose(old_action_distribution_parameters,
                      returned_experience.policy_info['dist_params'])

  self.assertEqual((batch_size, n_time_steps),
                   returned_experience.policy_info['return'].shape)
  self.assertAllClose(expected_returns,
                      returned_experience.policy_info['return'][:, :-1])
  self.assertEqual((batch_size, n_time_steps),
                   returned_experience.policy_info['advantage'].shape)
  self.assertAllClose(expected_advantages,
                      returned_experience.policy_info['advantage'][:, :-1])
def _experience_to_transitions(self, experience):
  transitions = trajectory.to_transition(experience)
  time_steps, policy_steps, next_time_steps = transitions
  actions = policy_steps.action
  return time_steps, actions, next_time_steps
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action

  if self._debug_summaries:
    actions_list = tf.nest.flatten(actions)
    show_action_index = len(actions_list) != 1
    for i, single_action in enumerate(actions_list):
      action_name = ('actions_{}'.format(i)
                     if show_action_index else 'actions')
      tf.compat.v2.summary.histogram(
          name=action_name, data=single_action, step=self.train_step_counter)

  action_distribution_parameters = policy_steps_.info

  # Reconstruct per-timestep policy distribution from stored distribution
  # parameters.
  old_actions_distribution = (
      distribution_spec.nested_distributions_from_specs(
          self._action_distribution_spec, action_distribution_parameters))

  # Compute log probability of actions taken during data collection, using the
  # collect policy distribution.
  act_log_probs = common.log_probability(old_actions_distribution, actions,
                                         self._action_spec)

  # Compute the value predictions for states using the current value function.
  # To be used for return & advantage computation.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(
      batch_size=batch_size)

  value_preds, unused_policy_state = self._collect_policy.apply_value_network(
      experience.observation, experience.step_type, policy_state=policy_state)
  value_preds = tf.stop_gradient(value_preds)

  valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

  if weights is None:
    weights = valid_mask
  else:
    weights *= valid_mask

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Only save debug summaries for first and last epochs.
      debug_summaries = (
          self._debug_summaries and
          (i_epoch == 0 or i_epoch == self._num_epochs - 1))

      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, action_distribution_parameters, weights,
            self.train_step_counter, debug_summaries)

      variables_to_train = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      # If summarize_gradients, create functions for summarizing both
      # gradients and variables.
      if self._summarize_grads_and_vars and debug_summaries:
        eager_utils.add_gradients_summaries(grads_and_vars,
                                            self.train_step_counter)
        eager_utils.add_variables_summaries(grads_and_vars,
                                            self.train_step_counter)

      self._optimizer.apply_gradients(
          grads_and_vars, global_step=self.train_step_counter)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  batch_size = nest_utils.get_outer_shape(time_steps, self._time_step_spec)[0]
  policy_state = self._collect_policy.get_initial_state(batch_size)
  # Compute the mean kl from previous action distribution.
  kl_divergence = self._kl_divergence(
      time_steps, action_distribution_parameters,
      self._collect_policy.distribution(time_steps, policy_state).action)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs are
    # not normalized
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)

  # Make summaries for total loss across all epochs.
  # The *_losses lists will have been populated by
  # calls to self.get_epoch_loss.
  with tf.name_scope('Losses/'):
    total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
    total_value_estimation_loss = tf.add_n(value_estimation_losses)
    total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
    total_entropy_regularization_loss = tf.add_n(
        entropy_regularization_losses)
    total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
    tf.compat.v2.summary.scalar(
        name='policy_gradient_loss',
        data=total_policy_gradient_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='value_estimation_loss',
        data=total_value_estimation_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='l2_regularization_loss',
        data=total_l2_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='entropy_regularization_loss',
        data=total_entropy_regularization_loss,
        step=self.train_step_counter)
    tf.compat.v2.summary.scalar(
        name='kl_penalty_loss',
        data=total_kl_penalty_loss,
        step=self.train_step_counter)

    total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                      tf.abs(total_value_estimation_loss) +
                      tf.abs(total_entropy_regularization_loss) +
                      tf.abs(total_l2_regularization_loss) +
                      tf.abs(total_kl_penalty_loss))

    tf.compat.v2.summary.scalar(
        name='total_abs_loss',
        data=total_abs_loss,
        step=self.train_step_counter)

  if self._summarize_grads_and_vars:
    with tf.name_scope('Variables/'):
      all_vars = (self._actor_net.trainable_weights +
                  self._value_net.trainable_weights)
      for var in all_vars:
        tf.compat.v2.summary.histogram(
            name=var.name.replace(':', '_'),
            data=var,
            step=self.train_step_counter)

  return loss_info
def _train(self, experience, weights):
  # Get individual tensors from transitions.
  (time_steps, policy_steps_,
   next_time_steps) = trajectory.to_transition(experience)
  actions = policy_steps_.action
  old_actions_distribution = policy_steps_.info

  act_log_probs = get_neglopacs(logits=old_actions_distribution,
                                labels=actions)

  # Compute the value predictions for states using the current value function.
  # Value preds are needed at all time steps plus the final step observation.
  value_preds = double_batch_pred2(self._value_net, experience.observation,
                                   self._observation_spec, is_training=True)
  value_preds = tf.squeeze(value_preds, -1)

  returns, normalized_advantages = self.compute_return_and_advantage(
      next_time_steps, value_preds)

  # Loss tensors across batches will be aggregated for summaries.
  policy_gradient_losses = []
  value_estimation_losses = []
  l2_regularization_losses = []
  entropy_regularization_losses = []
  kl_penalty_losses = []

  loss_info = None  # TODO(b/123627451): Remove.
  # For each epoch, create its own train op that depends on the previous one.
  for i_epoch in range(self._num_epochs):
    with tf.name_scope('epoch_%d' % i_epoch):
      # Build one epoch train op.
      with tf.GradientTape() as tape:
        loss_info = self.get_epoch_loss(
            time_steps, actions, act_log_probs, returns,
            normalized_advantages, old_actions_distribution, weights)

      variables_to_train = (self._actor_net.trainable_variables +
                            self._value_net.trainable_variables)
      grads = tape.gradient(loss_info.loss, variables_to_train)
      # Tuple is used for py3, where zip is a generator producing values once.
      grads_and_vars = tuple(zip(grads, variables_to_train))
      if self._gradient_clipping > 0:
        grads_and_vars = eager_utils.clip_gradient_norms(
            grads_and_vars, self._gradient_clipping)

      self._optimizer.apply_gradients(grads_and_vars)

      policy_gradient_losses.append(loss_info.extra.policy_gradient_loss)
      value_estimation_losses.append(loss_info.extra.value_estimation_loss)
      l2_regularization_losses.append(loss_info.extra.l2_regularization_loss)
      entropy_regularization_losses.append(
          loss_info.extra.entropy_regularization_loss)
      kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

  # After update epochs, update adaptive kl beta, then update observation
  # normalizer and reward normalizer.
  # Compute the mean kl from previous action distribution.
  temp_ = double_batch_pred2(self._actor_net, time_steps.observation,
                             self._observation_spec, is_training=True)
  kl_divergence = self._kl_divergence(time_steps, old_actions_distribution,
                                      temp_)
  self.update_adaptive_kl_beta(kl_divergence)

  if self._observation_normalizer:
    self._observation_normalizer.update(
        time_steps.observation, outer_dims=[0, 1])
  else:
    # TODO(b/127661780): Verify performance of reward_normalizer when obs are
    # not normalized
    if self._reward_normalizer:
      self._reward_normalizer.update(
          next_time_steps.reward, outer_dims=[0, 1])

  loss_info = tf.nest.map_structure(tf.identity, loss_info)
  return loss_info
def __call__(self, value: typing.Any) -> trajectory.Transition:
  """Converts `value` to a Transition.  Performs data validation and pruning.

  - If `value` is already a `Transition`, only validation is performed.
  - If `value` is a `Trajectory` and `squeeze_time_dim = True`, then `value`
    must have tensors with shape `[B, T=2]` outer dims.  This is converted
    to a `Transition` object without a time dimension.
  - If `value` is a `Trajectory` with tensors containing a time dimension
    having `T != 2`, a `ValueError` is raised.

  Args:
    value: A `Trajectory` or `Transition` object to convert.

  Returns:
    A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
    the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
    the tensors will have shape `[B, T - 1, ...]`.

  Raises:
    TypeError: If `value` is not one of `Trajectory` or `Transition`.
    ValueError: If `value` has structure that doesn't match the converter's
      spec.
    TypeError: If `value` has a structure that doesn't match the converter's
      spec.
    ValueError: If `squeeze_time_dim=True` and `value` is a `Trajectory`
      with a time dimension having value other than `T=2`.
  """
  if _is_transition_like(value):
    value = _as_tfa_transition(value)
  elif _is_trajectory_like(value):
    required_sequence_length = 2 if self._squeeze_time_dim else None
    _validate_trajectory(
        value,
        self._data_context.trajectory_spec,
        sequence_length=required_sequence_length)
    value = trajectory.to_transition(value)
    # Remove the now-singleton time dim.
    if self._squeeze_time_dim:
      value = tf.nest.map_structure(
          lambda x: composite.squeeze(x, axis=1), value)
  else:
    raise TypeError('Input type not supported: {}'.format(value))

  num_outer_dims = 1 if self._squeeze_time_dim else 2
  _validate_transition(
      value, self._data_context.transition_spec, num_outer_dims)

  value = nest_utils.prune_extra_keys(
      self._data_context.transition_spec, value)

  if self._prepend_t0_to_next_time_step:
    # This is useful when using a sequential model.  It allows the target_q
    # network to take all the information.
    next_time_step_with_t0 = value.next_time_step._replace(
        observation=tf.nest.map_structure(
            lambda x, y: tf.concat([x[:, :1, ...], y], axis=1),
            value.time_step.observation, value.next_time_step.observation))
    value = value._replace(next_time_step=next_time_step_with_t0)
  return value
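# A minimal, hedged usage sketch of the conversion described above, mirroring
# the converter tests earlier in this section.  It assumes a test fixture that
# provides `self._data_context` built from the agent's specs (as in those
# tests); it is not part of the library itself.  With `squeeze_time_dim=True`
# a `[B, T=2]` trajectory becomes a `Transition` whose tensors have shape
# `[B, ...]`; without squeezing, a `[B, T]` trajectory yields tensors of shape
# `[B, T - 1, ...]`.
def example_transition_shapes(self):
  converter = data_converter.AsTransition(
      self._data_context, squeeze_time_dim=True)
  traj = tensor_spec.sample_spec_nest(
      self._data_context.trajectory_spec, outer_dims=[4, 2])  # [B=4, T=2]
  transition = converter(traj)
  # time_step, action_step and next_time_step now carry `[B, ...]` tensors.
  print(transition.time_step.observation.shape)
  print(transition.action_step.action.shape)
  print(transition.next_time_step.reward.shape)
  return transition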
def _time_step_batch(trajectory_batch):
  # to_transition returns (time_steps, policy_steps, next_time_steps);
  # keep only the next time steps.
  return trajectory.to_transition(trajectory_batch)[-1]
        ShowProgress(20000)],
    num_steps=20000)  # <=> 80,000 ALE frames
final_time_step, final_policy_state = init_driver.run()

# Let's sample 2 sub-episodes, with 3 time steps each and display them:
tf.random.set_seed(
    888)  # chosen to show an example of trajectory at the end of an episode

trajectories, buffer_info = replay_buffer.get_next(sample_batch_size=2,
                                                   num_steps=3)
print(trajectories._fields)
print(trajectories.observation.shape)

time_steps, action_steps, next_time_steps = to_transition(trajectories)
print(time_steps.observation.shape)
print(trajectories.step_type.numpy())

plt.figure(figsize=(10, 6.8))
for row in range(2):
    for col in range(3):
        plt.subplot(2, 3, row * 3 + col + 1)
        plot_observation(trajectories.observation[row, col].numpy())
plt.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0.02)
save_fig("sub_episodes_plot")
plt.show()

dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,