def testTrajectoryNotSingleStepTransition(self):
     converter = data_converter.AsTransition(self._data_context)
     traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                         outer_dims=[2, 3])
     converted = converter(traj)
     expected = trajectory.to_transition(traj)
     (expected, converted) = self.evaluate((expected, converted))
     tf.nest.map_structure(self.assertAllEqual, converted, expected)
 def testTransitionNoTimeDimensionRaises(self):
     converter = data_converter.AsTrajectory(self._data_context)
     traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                         outer_dims=[2])
     transition = trajectory.to_transition(traj, traj)
     with self.assertRaisesRegex(
             ValueError,
             r'must have two outer dimensions: batch size and time'):
         converter(transition)
 def testTransitionNoTimeDimensionRaises(self):
     converter = data_converter.AsTrajectory(self._data_context)
     traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                         outer_dims=[2])
     transition = trajectory.to_transition(traj, traj)
     with self.assertRaisesRegex(
             ValueError,
             r'tensors must have shape \`\[B, T\] \+ spec.shape\`'):
         converter(transition)
Example #4
    def _experience_to_transitions(self, experience):
        transitions = trajectory.to_transition(experience)
        if not self._q_network.state_spec:
            transitions = tf.nest.map_structure(
                lambda x: composite.squeeze(x, 1), transitions)

        time_steps, policy_steps, next_time_steps = transitions
        actions = policy_steps.action
        return time_steps, actions, next_time_steps
 def testFromBatchTimeTrajectory(self):
     converter = data_converter.AsTransition(self._data_context,
                                             squeeze_time_dim=True)
     traj = tensor_spec.sample_spec_nest(self._data_context.trajectory_spec,
                                         outer_dims=[4, 2])  # [B, T=2]
     converted = converter(traj)
     expected = trajectory.to_transition(traj)
     # Remove the now-singleton time dim.
     expected = tf.nest.map_structure(lambda x: tf.squeeze(x, 1), expected)
     (expected, converted) = self.evaluate((expected, converted))
     tf.nest.map_structure(self.assertAllEqual, converted, expected)
Example #6
 def _experience_to_transitions(self, experience):
     transitions = trajectory.to_transition(experience)
     time_steps, policy_steps, next_time_steps = transitions
     actions = policy_steps.action
     if (self.train_sequence_length is not None
             and self.train_sequence_length == 2):
         # Squeeze the now-singleton time dimension if the critic network is stateless.
         time_steps, actions, next_time_steps = tf.nest.map_structure(
             lambda t: tf.squeeze(t, axis=1),
             (time_steps, actions, next_time_steps))
     return time_steps, actions, next_time_steps
Example #7
  def _experience_to_transitions(self, experience):
    transitions = trajectory.to_transition(experience)

    # Remove time dim if we are not using a recurrent network.
    if not self._actor_network.state_spec:
      transitions = tf.nest.map_structure(lambda x: tf.squeeze(x, [1]),
                                          transitions)

    time_steps, policy_steps, next_time_steps = transitions
    actions = policy_steps.action
    return time_steps, actions, next_time_steps
Example #8
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        rewards = next_time_steps.reward
        discounts = next_time_steps.discount
        if self._reward_normalizer:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        value_preds = self.double_batch_pred(self._mod_net,
                                             experience.observation,
                                             is_training=True)
        #print("VPRED",value_preds.shape,value_preds_2.shape)

        returns = self.compute_return(next_time_steps, value_preds)
        value_estimation_losses = []

        loss_info = None
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, returns,
                        weights)  #action_distribution_parameters

                variables_to_train = self._mod_net.trainable_weights
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                self._optimizer.apply_gradients(
                    grads_and_vars)  #, global_step=self.train_step_counter)

                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)

        loss_info = tf.nest.map_structure(tf.identity, loss_info)
        return loss_info
  def __call__(self, value: typing.Any):
    """Converts `value` to a Transition.  Performs data validation and pruning.

    - If `value` is already a `Transition`, only validation is performed.
    - If `value` is a `Trajectory` and `squeeze_time_dim = True`, then
      `value` must have tensors whose outer dims are `[B, T=2]`.
      This is converted to a `Transition` object without a time
      dimension.
    - If `squeeze_time_dim = True` and `value` is a `Trajectory` whose tensors
      have a time dimension with `T != 2`, a `ValueError` is raised.

    Args:
      value: A `Trajectory` or `Transition` object to convert.

    Returns:
      A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
      the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
      the tensors will have shape `[B, T - 1, ...]`.

    Raises:
      TypeError: If `value` is not a `Trajectory` or `Transition`.
      ValueError or TypeError: If the structure of `value` doesn't match the
        converter's spec.
      ValueError: If `squeeze_time_dim=True` and `value` is a `Trajectory`
        whose time dimension is not `T=2`.
    """
    if isinstance(value, trajectory.Transition):
      pass
    elif isinstance(value, trajectory.Trajectory):
      required_sequence_length = 2 if self._squeeze_time_dim else None
      _validate_trajectory(
          value,
          self._data_context.trajectory_spec,
          sequence_length=required_sequence_length)
      value = trajectory.to_transition(value)
      # Remove the now-singleton time dim.
      if self._squeeze_time_dim:
        value = tf.nest.map_structure(
            lambda x: composite.squeeze(x, axis=1), value)
    else:
      raise TypeError('Input type not supported: {}'.format(value))

    self._validate_transition(value)
    value = nest_utils.prune_extra_keys(
        self._data_context.transition_spec, value)
    return value
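
To make the conversion contract above concrete, here is a minimal usage sketch. It is not taken from the examples: the toy observation/action specs are assumptions, and the exact `DataContext` constructor signature may vary across TF-Agents versions.

import tensorflow as tf
from tf_agents.agents import data_converter
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Hypothetical minimal specs, for illustration only.
observation_spec = tensor_spec.TensorSpec([3], tf.float32, 'observation')
action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, minimum=0, maximum=1,
                                            name='action')
data_context = data_converter.DataContext(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    info_spec=())

converter = data_converter.AsTransition(data_context, squeeze_time_dim=True)
# Sample a dummy trajectory with outer dims [B=4, T=2], as in the tests above.
traj = tensor_spec.sample_spec_nest(data_context.trajectory_spec,
                                    outer_dims=[4, 2])
transition = converter(traj)
# With squeeze_time_dim=True the resulting tensors have shape [B, ...].
print(transition.time_step.observation.shape)  # (4, 3)
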
Example #10
    def _experience_to_transitions(self, experience):
        boundary_mask = tf.logical_not(experience.is_boundary()[:, 0])
        experience = nest_utils.fast_map_structure(
            lambda *x: tf.boolean_mask(*x, boundary_mask), experience)
        time_steps, policy_steps, next_time_steps = trajectory.to_transition(
            experience)

        actions = policy_steps.action
        if (self.train_sequence_length is not None
                and self.train_sequence_length == 2):
            # Squeeze the now-singleton time dimension if the critic network is stateless.
            time_steps, actions, next_time_steps = tf.nest.map_structure(
                lambda t: tf.squeeze(t, axis=1),
                (time_steps, actions, next_time_steps))
        return (time_steps, actions, policy_steps.info.alpha[:, 0],
                next_time_steps)
  def _experience_to_transitions(self, experience):
    transitions = trajectory.to_transition(experience)
    time_steps, policy_steps, next_time_steps = transitions
    actions = policy_steps.action
    if (self.train_sequence_length is not None and
        self.train_sequence_length == 2):
      # Squeeze the now-singleton time dimension if the critic network is stateless.
      time_steps, actions, next_time_steps = tf.nest.map_structure(
          lambda t: tf.squeeze(t, axis=1),
          (time_steps, actions, next_time_steps))
    return time_steps, actions, next_time_steps
Example #12
    def _train(self, experience, weights=None):
        # unpack trajectories
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)

        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        value_state = self._collect_policy.get_initial_value_state(
            batch_size=batch_size)

        weights = ppo_utils.make_timestep_mask(next_time_steps)

        value_preds, _ = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            value_state=value_state)
        value_preds = tf.stop_gradient(value_preds)

        rewards = next_time_steps.reward

        # normalize rewards
        if self._reward_normalizer is not None:
            rewards = self._reward_normalizer.normalize(
                rewards,
                center_mean=False,
                clip_value=self._reward_norm_clipping)

        returns, normalized_advantages = compute_return_and_advantage(
            self._discount_factor, self._lambda, rewards, next_time_steps,
            value_preds)

        policy_loss = self._update_policy(time_steps, policy_steps_,
                                          normalized_advantages, weights)

        value_loss = self._update_values(time_steps, returns, weights)

        return tf_agent.LossInfo(
            loss=value_loss + policy_loss,
            extra=TRPOLossInfo(value_estimation_loss=value_loss,
                               policy_gradient_loss=policy_loss),
        )
Example #13
    def data_generation(self):
        # set up random policy
        initial_collect_policy = random_tf_policy.RandomTFPolicy(
            self._train_env.time_step_spec(), self._train_env.action_spec())
        # set up a driver with the random policy to collect data
        init_driver = dynamic_step_driver.DynamicStepDriver(
            self._train_env,
            #  a random policy that can be used to collect data from the environment
            initial_collect_policy,
            # a list of observers that are updated after every step in the environment
            observers=[
                self._replay_buffer_observer,
                Progress_viz(param.DATASET_STEPS)
            ],
            # the number of steps in the dataset
            num_steps=param.DATASET_STEPS)

        # run the driver, recording the sequence of state transitions into the observers
        final_time_step, final_policy_state = init_driver.run()

        # Verify collected trajectories (optional)
        if self._visual_flag:
            trajectories, buffer_info = self._replay_buffer.get_next(
                sample_batch_size=2, num_steps=10)
            time_steps, action_steps, next_time_steps = trajectory.to_transition(
                trajectories)
            print("trajectories._fields", trajectories._fields)
            print("time_steps.observation.shape = ",
                  time_steps.observation.shape)

        # Create a dataset from the replay buffer.
        self._dataset = self._replay_buffer.as_dataset(
            sample_batch_size=param.DATASET_BATCH,
            num_steps=param.DATASET_BUFFER_STEP,
            num_parallel_calls=param.DATASET_PARALLEL).prefetch(
                param.DATASET_PREFETCH)
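
A hedged sketch of how such a dataset is typically consumed (hypothetical variable names; it assumes the buffer's `num_steps` is at least 2): each element is a `(trajectories, buffer_info)` pair, and `trajectory.to_transition` splits the batch the same way as in the verification block above.

from tf_agents.trajectories import trajectory

# `dataset` stands in for the replay_buffer.as_dataset(...) object built above.
iterator = iter(dataset)
trajectories, buffer_info = next(iterator)
time_steps, action_steps, next_time_steps = trajectory.to_transition(trajectories)
# time_steps.observation now has outer dims [sample_batch_size, num_steps - 1].
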
    def testToTransitionHandlesTrajectoryFromDriverCorrectly(self):
        env = tf_py_environment.TFPyEnvironment(
            drivers_test_utils.PyEnvironmentMock())
        policy = drivers_test_utils.TFPolicyMock(env.time_step_spec(),
                                                 env.action_spec())
        replay_buffer = drivers_test_utils.make_replay_buffer(policy)

        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env, policy, num_episodes=3, observers=[replay_buffer.add_batch])

        run_driver = driver.run()
        rb_gather_all = replay_buffer.gather_all()

        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(run_driver)
        trajectories = self.evaluate(rb_gather_all)

        transitions = trajectory.to_transition(trajectories)
        self.assertIsInstance(transitions, trajectory.Transition)
        time_steps, policy_step, next_time_steps = transitions

        self.assertAllEqual(time_steps.observation,
                            trajectories.observation[:, :-1])
        self.assertAllEqual(time_steps.step_type,
                            trajectories.step_type[:, :-1])
        self.assertAllEqual(next_time_steps.observation,
                            trajectories.observation[:, 1:])
        self.assertAllEqual(next_time_steps.step_type,
                            trajectories.step_type[:, 1:])
        self.assertAllEqual(next_time_steps.reward,
                            trajectories.reward[:, :-1])
        self.assertAllEqual(next_time_steps.discount,
                            trajectories.discount[:, :-1])

        self.assertAllEqual(policy_step.action, trajectories.action[:, :-1])
        self.assertAllEqual(policy_step.info, trajectories.policy_info[:, :-1])
Example #15
  def testSequencePreprocess(self, strategy_fn):
    with strategy_fn().scope():
      counter = common.create_variable('test_train_counter')
      batch_size = 2
      n_time_steps = 3
      agent = ppo_agent.PPOAgent(
          self._time_step_spec,
          self._action_spec,
          tf.compat.v1.train.AdamOptimizer(),
          actor_net=DummyActorNet(
              self._obs_spec,
              self._action_spec,
          ),
          value_net=DummyValueNet(self._obs_spec),
          normalize_observations=False,
          num_epochs=1,
          use_gae=False,
          use_td_lambda_return=False,
          compute_value_and_advantage_in_train=False,
          train_step_counter=counter)
      agent.initialize()
    observations = tf.constant([
        [[1, 2], [3, 4], [5, 6]],
        [[1, 2], [3, 4], [5, 6]],
    ],
                               dtype=tf.float32)

    mid_time_step_val = ts.StepType.MID.tolist()
    time_steps = ts.TimeStep(
        step_type=tf.constant(
            [[mid_time_step_val] * n_time_steps] * batch_size, dtype=tf.int32),
        reward=tf.constant([[1] * n_time_steps] * batch_size, dtype=tf.float32),
        discount=tf.constant(
            [[1] * n_time_steps] * batch_size, dtype=tf.float32),
        observation=observations)
    actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)

    old_action_distribution_parameters = {
        'loc':
            tf.constant(
                [[[0.0]] * n_time_steps] * batch_size, dtype=tf.float32),
        'scale':
            tf.constant(
                [[[1.0]] * n_time_steps] * batch_size, dtype=tf.float32),
    }

    value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                              dtype=tf.float32)
    policy_info = {
        'dist_params': old_action_distribution_parameters,
        'value_prediction': value_preds,
    }
    experience = trajectory.Trajectory(time_steps.step_type, observations,
                                       actions, policy_info,
                                       time_steps.step_type, time_steps.reward,
                                       time_steps.discount)

    returned_experience = agent.preprocess_sequence(experience)
    self.evaluate(tf.compat.v1.initialize_all_variables())

    self.assertAllClose(observations, returned_experience.observation)
    self.assertAllClose(actions, returned_experience.action)

    expected_value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                                       dtype=tf.float32)
    (_, _, next_time_steps) = trajectory.to_transition(experience)
    expected_returns, expected_advantages = agent.compute_return_and_advantage(
        next_time_steps, expected_value_preds)
    self.assertAllClose(old_action_distribution_parameters,
                        returned_experience.policy_info['dist_params'])
    self.assertEqual((batch_size, n_time_steps),
                     returned_experience.policy_info['return'].shape)
    self.assertAllClose(expected_returns,
                        returned_experience.policy_info['return'][:, :-1])
    self.assertEqual((batch_size, n_time_steps),
                     returned_experience.policy_info['advantage'].shape)
    self.assertAllClose(expected_advantages,
                        returned_experience.policy_info['advantage'][:, :-1])
Example #16
 def _experience_to_transitions(self, experience):
     transitions = trajectory.to_transition(experience)
     time_steps, policy_steps, next_time_steps = transitions
     actions = policy_steps.action
     return time_steps, actions, next_time_steps
Example #17
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)
        actions = policy_steps_.action

        if self._debug_summaries:
            actions_list = tf.nest.flatten(actions)
            show_action_index = len(actions_list) != 1
            for i, single_action in enumerate(actions_list):
                action_name = ('actions_{}'.format(i)
                               if show_action_index else 'actions')
                tf.compat.v2.summary.histogram(name=action_name,
                                               data=single_action,
                                               step=self.train_step_counter)

        action_distribution_parameters = policy_steps_.info

        # Reconstruct per-timestep policy distribution from stored distribution
        #   parameters.
        old_actions_distribution = (
            distribution_spec.nested_distributions_from_specs(
                self._action_distribution_spec,
                action_distribution_parameters))

        # Compute log probability of actions taken during data collection, using the
        #   collect policy distribution.
        act_log_probs = common.log_probability(old_actions_distribution,
                                               actions, self._action_spec)

        # Compute the value predictions for states using the current value function.
        # To be used for return & advantage computation.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(
            batch_size=batch_size)

        value_preds, unused_policy_state = self._collect_policy.apply_value_network(
            experience.observation,
            experience.step_type,
            policy_state=policy_state)
        value_preds = tf.stop_gradient(value_preds)

        valid_mask = ppo_utils.make_timestep_mask(next_time_steps)

        if weights is None:
            weights = valid_mask
        else:
            weights *= valid_mask

        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):
                # Only save debug summaries for first and last epochs.
                debug_summaries = (self._debug_summaries
                                   and (i_epoch == 0
                                        or i_epoch == self._num_epochs - 1))

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, action_distribution_parameters,
                        weights, self.train_step_counter, debug_summaries)

                variables_to_train = (self._actor_net.trainable_weights +
                                      self._value_net.trainable_weights)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                # If summarize_gradients, create functions for summarizing both
                # gradients and variables.
                if self._summarize_grads_and_vars and debug_summaries:
                    eager_utils.add_gradients_summaries(
                        grads_and_vars, self.train_step_counter)
                    eager_utils.add_variables_summaries(
                        grads_and_vars, self.train_step_counter)

                self._optimizer.apply_gradients(
                    grads_and_vars, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        batch_size = nest_utils.get_outer_shape(time_steps,
                                                self._time_step_spec)[0]
        policy_state = self._collect_policy.get_initial_state(batch_size)
        # Compute the mean kl from previous action distribution.
        kl_divergence = self._kl_divergence(
            time_steps, action_distribution_parameters,
            self._collect_policy.distribution(time_steps, policy_state).action)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)

        # Make summaries for total loss across all epochs.
        # The *_losses lists will have been populated by
        #   calls to self.get_epoch_loss.
        with tf.name_scope('Losses/'):
            total_policy_gradient_loss = tf.add_n(policy_gradient_losses)
            total_value_estimation_loss = tf.add_n(value_estimation_losses)
            total_l2_regularization_loss = tf.add_n(l2_regularization_losses)
            total_entropy_regularization_loss = tf.add_n(
                entropy_regularization_losses)
            total_kl_penalty_loss = tf.add_n(kl_penalty_losses)
            tf.compat.v2.summary.scalar(name='policy_gradient_loss',
                                        data=total_policy_gradient_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='value_estimation_loss',
                                        data=total_value_estimation_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='l2_regularization_loss',
                                        data=total_l2_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='entropy_regularization_loss',
                                        data=total_entropy_regularization_loss,
                                        step=self.train_step_counter)
            tf.compat.v2.summary.scalar(name='kl_penalty_loss',
                                        data=total_kl_penalty_loss,
                                        step=self.train_step_counter)

            total_abs_loss = (tf.abs(total_policy_gradient_loss) +
                              tf.abs(total_value_estimation_loss) +
                              tf.abs(total_entropy_regularization_loss) +
                              tf.abs(total_l2_regularization_loss) +
                              tf.abs(total_kl_penalty_loss))

            tf.compat.v2.summary.scalar(name='total_abs_loss',
                                        data=total_abs_loss,
                                        step=self.train_step_counter)

        if self._summarize_grads_and_vars:
            with tf.name_scope('Variables/'):
                all_vars = (self._actor_net.trainable_weights +
                            self._value_net.trainable_weights)
                for var in all_vars:
                    tf.compat.v2.summary.histogram(
                        name=var.name.replace(':', '_'),
                        data=var,
                        step=self.train_step_counter)

        return loss_info
    def _train(self, experience, weights):
        # Get individual tensors from transitions.
        (time_steps, policy_steps_,
         next_time_steps) = trajectory.to_transition(experience)

        actions = policy_steps_.action

        old_actions_distribution = policy_steps_.info

        act_log_probs = get_neglopacs(logits=old_actions_distribution,
                                      labels=actions)

        # Compute the value predictions for states using the current value function.

        value_preds = double_batch_pred2(self._value_net,
                                         experience.observation,
                                         self._observation_spec,
                                         is_training=True)
        value_preds = tf.squeeze(value_preds, -1)

        # Need value preds at all time steps, plus one more for the final step observation.
        returns, normalized_advantages = self.compute_return_and_advantage(
            next_time_steps, value_preds)

        #print("RET",returns)
        #print(normalized_advantages)
        # Loss tensors across batches will be aggregated for summaries.
        policy_gradient_losses = []
        value_estimation_losses = []
        l2_regularization_losses = []
        entropy_regularization_losses = []
        kl_penalty_losses = []

        loss_info = None  # TODO(b/123627451): Remove.
        # For each epoch, create its own train op that depends on the previous one.
        for i_epoch in range(self._num_epochs):
            with tf.name_scope('epoch_%d' % i_epoch):

                # Build one epoch train op.
                with tf.GradientTape() as tape:
                    loss_info = self.get_epoch_loss(
                        time_steps, actions, act_log_probs, returns,
                        normalized_advantages, old_actions_distribution,
                        weights)  #action_distribution_parameters

                variables_to_train = (self._actor_net.trainable_variables +
                                      self._value_net.trainable_variables)
                grads = tape.gradient(loss_info.loss, variables_to_train)
                # Tuple is used for py3, where zip is a generator producing values once.
                grads_and_vars = tuple(zip(grads, variables_to_train))
                if self._gradient_clipping > 0:
                    grads_and_vars = eager_utils.clip_gradient_norms(
                        grads_and_vars, self._gradient_clipping)

                self._optimizer.apply_gradients(
                    grads_and_vars)  #, global_step=self.train_step_counter)

                policy_gradient_losses.append(
                    loss_info.extra.policy_gradient_loss)
                value_estimation_losses.append(
                    loss_info.extra.value_estimation_loss)
                l2_regularization_losses.append(
                    loss_info.extra.l2_regularization_loss)
                entropy_regularization_losses.append(
                    loss_info.extra.entropy_regularization_loss)
                kl_penalty_losses.append(loss_info.extra.kl_penalty_loss)

        # After update epochs, update adaptive kl beta, then update observation
        #   normalizer and reward normalizer.
        # Compute the mean kl from previous action distribution.
        temp_ = double_batch_pred2(self._actor_net,
                                   time_steps.observation,
                                   self._observation_spec,
                                   is_training=True)
        kl_divergence = self._kl_divergence(time_steps,
                                            old_actions_distribution, temp_)
        self.update_adaptive_kl_beta(kl_divergence)

        if self._observation_normalizer:
            self._observation_normalizer.update(time_steps.observation,
                                                outer_dims=[0, 1])
        else:
            # TODO(b/127661780): Verify performance of reward_normalizer when obs are
            #                    not normalized
            if self._reward_normalizer:
                self._reward_normalizer.update(next_time_steps.reward,
                                               outer_dims=[0, 1])

        loss_info = tf.nest.map_structure(tf.identity, loss_info)
        return loss_info
Example #19
  def __call__(self, value: typing.Any) -> trajectory.Transition:
    """Converts `value` to a Transition.  Performs data validation and pruning.

    - If `value` is already a `Transition`, only validation is performed.
    - If `value` is a `Trajectory` and `squeeze_time_dim = True`, then
      `value` must have tensors whose outer dims are `[B, T=2]`.
      This is converted to a `Transition` object without a time
      dimension.
    - If `squeeze_time_dim = True` and `value` is a `Trajectory` whose tensors
      have a time dimension with `T != 2`, a `ValueError` is raised.

    Args:
      value: A `Trajectory` or `Transition` object to convert.

    Returns:
      A validated and pruned `Transition`.  If `squeeze_time_dim = True`,
      the resulting `Transition` has tensors with shape `[B, ...]`.  Otherwise,
      the tensors will have shape `[B, T - 1, ...]`.

    Raises:
      TypeError: If `value` is not a `Trajectory` or `Transition`.
      ValueError or TypeError: If the structure of `value` doesn't match the
        converter's spec.
      ValueError: If `squeeze_time_dim=True` and `value` is a `Trajectory`
        whose time dimension is not `T=2`.
    """
    if _is_transition_like(value):
      value = _as_tfa_transition(value)
    elif _is_trajectory_like(value):
      required_sequence_length = 2 if self._squeeze_time_dim else None
      _validate_trajectory(
          value,
          self._data_context.trajectory_spec,
          sequence_length=required_sequence_length)
      value = trajectory.to_transition(value)
      # Remove the now-singleton time dim.
      if self._squeeze_time_dim:
        value = tf.nest.map_structure(
            lambda x: composite.squeeze(x, axis=1), value)
    else:
      raise TypeError('Input type not supported: {}'.format(value))

    num_outer_dims = 1 if self._squeeze_time_dim else 2
    _validate_transition(
        value, self._data_context.transition_spec, num_outer_dims)

    value = nest_utils.prune_extra_keys(
        self._data_context.transition_spec, value)

    if self._prepend_t0_to_next_time_step:
      # This is useful when using a sequential model. It allows the target_q
      # network to take all the information.
      next_time_step_with_t0 = value.next_time_step._replace(
          observation=tf.nest.map_structure(
              lambda x, y: tf.concat([x[:, :1, ...], y], axis=1),
              value.time_step.observation, value.next_time_step.observation))

      value = value._replace(next_time_step=next_time_step_with_t0)
    return value
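
For illustration only (toy tensors, not part of the example above): the `prepend_t0_to_next_time_step` branch concatenates the first observation of `time_step` onto `next_time_step`, growing the time axis from `T` to `T + 1`.

import tensorflow as tf

# Toy observations with shape [B=2, T=3, 1].
obs = tf.reshape(tf.range(6, dtype=tf.float32), [2, 3, 1])   # time_step.observation
next_obs = obs + 1.0                                          # next_time_step.observation
with_t0 = tf.concat([obs[:, :1, ...], next_obs], axis=1)
print(with_t0.shape)  # (2, 4, 1)
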
Example #20
def _time_step_batch(trajectory_batch):
    # to_transition returns (time_steps, policy_steps, next_time_steps);
    # index [-1] keeps only the next time steps.
    return trajectory.to_transition(trajectory_batch)[-1]
               ShowProgress(20000)],
    num_steps=20000)  # <=> 80,000 ALE frames
final_time_step, final_policy_state = init_driver.run()

# Let's sample 2 sub-episodes, with 3 time steps each and display them:

tf.random.set_seed(
    888)  # chosen to show an example of trajectory at the end of an episode

trajectories, buffer_info = replay_buffer.get_next(sample_batch_size=2,
                                                   num_steps=3)

print(trajectories._fields)
print(trajectories.observation.shape)

time_steps, action_steps, next_time_steps = to_transition(trajectories)
print(time_steps.observation.shape)

print(trajectories.step_type.numpy())

plt.figure(figsize=(10, 6.8))
for row in range(2):
    for col in range(3):
        plt.subplot(2, 3, row * 3 + col + 1)
        plot_observation(trajectories.observation[row, col].numpy())
plt.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0, wspace=0.02)
save_fig("sub_episodes_plot")
plt.show()

dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,