Example #1
 def _get_mock_py_policy(self):
   mock_py_policy = mock.create_autospec(py_policy.Base)
   observation_spec = tensor_spec.TensorSpec([5], dtype=tf.float32)
   mock_py_policy.time_step_spec.return_value = ts.time_step_spec(
       observation_spec)
   mock_py_policy.action_spec.return_value = tensor_spec.BoundedTensorSpec(
       [3], tf.float32, -1.0, 1.0)
   mock_py_policy.policy_state_spec.return_value = ()
   mock_py_policy.info_spec.return_value = ()
   return mock_py_policy
Example #2
    def testBuilds(self):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, ))

        action_spec = [
            tensor_spec.BoundedTensorSpec((2, ), tf.float32, 2, 3),
            tensor_spec.BoundedTensorSpec((3, ), tf.float32, 0, 3)
        ]
        net = actor_rnn_network.ActorRnnNetwork(observation_spec,
                                                action_spec,
                                                conv_layer_params=[(4, 2, 2)],
                                                input_fc_layer_params=(5, ),
                                                lstm_size=(3, ),
                                                output_fc_layer_params=(5, ))

        actions, network_state = net(time_step.observation,
                                     time_step.step_type)
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.assertEqual([1, 2], actions[0].shape.as_list())
        self.assertEqual([1, 3], actions[1].shape.as_list())

        self.assertEqual(13, len(net.variables))
        # Conv Net Kernel
        self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
        # Conv Net bias
        self.assertEqual((4, ), net.variables[1].shape)
        # Fc Kernel
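        # (the 8x8x3 observation, convolved with a 2x2 kernel at stride 2,
        # yields a 4x4x4 feature map, i.e. 64 inputs to the dense layer)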
        self.assertEqual((64, 5), net.variables[2].shape)
        # Fc Bias
        self.assertEqual((5, ), net.variables[3].shape)
        # LSTM Cell Kernel
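        # (Keras LSTM cells concatenate the four gate kernels, so the width is
        # 4 * lstm_size = 4 * 3 = 12)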
        self.assertEqual((5, 12), net.variables[4].shape)
        # LSTM Cell Recurrent Kernel
        self.assertEqual((3, 12), net.variables[5].shape)
        # LSTM Cell Bias
        self.assertEqual((12, ), net.variables[6].shape)
        # Fc Kernel
        self.assertEqual((3, 5), net.variables[7].shape)
        # Fc Bias
        self.assertEqual((5, ), net.variables[8].shape)
        # Action 1 Kernel
        self.assertEqual((5, 2), net.variables[9].shape)
        # Action 1 Bias
        self.assertEqual((2, ), net.variables[10].shape)
        # Action 2 Kernel
        self.assertEqual((5, 3), net.variables[11].shape)
        # Action 2 Bias
        self.assertEqual((3, ), net.variables[12].shape)

        # Assert LSTM cell is created.
        self.assertEqual((1, 3), network_state[0].shape)
        self.assertEqual((1, 3), network_state[1].shape)
Example #3
 def setUp(self):
     super(PyTFPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32, 'obs')
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([], tf.int32, 0, 1,
                                                       'action')
     self._float_action_spec = tensor_spec.BoundedTensorSpec([], tf.float32,
                                                             0, 1, 'action')
     self._tf_policy = q_policy.QPolicy(self._time_step_spec,
                                        self._action_spec,
                                        q_network=DummyNet())
Example #4
 def setUp(self):
   super(OuNoisePolicyTest, self).setUp()
   self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3)
   actor_network = DummyActionNet(self._obs_spec, self._action_spec)
   self._wrapped_policy = actor_policy.ActorPolicy(
       time_step_spec=self._time_step_spec,
       action_spec=self._action_spec,
       actor_network=actor_network,
       clip=False)
Example #5
 def _make_parallel_py_environment(self, constructor=None, num_envs=2):
     self.observation_spec = array_spec.ArraySpec((3, 3), np.float32)
     self.time_step_spec = ts.time_step_spec(self.observation_spec)
     self.action_spec = array_spec.BoundedArraySpec([7],
                                                    dtype=np.float32,
                                                    minimum=-1.0,
                                                    maximum=1.0)
     constructor = constructor or functools.partial(
         random_py_environment.RandomPyEnvironment, self.observation_spec,
         self.action_spec)
     return parallel_py_environment.ParallelPyEnvironment(
         env_constructors=[constructor] * num_envs, blocking=True)
Example #6
  def setUp(self):
    super(DdpgAgentTest, self).setUp()
    self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._action_spec = [tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)]

    network_input_spec = (self._obs_spec, self._action_spec)
    self._critic_net = DummyCriticNetwork(network_input_spec)
    self._bounded_actor_net = DummyActorNetwork(
        self._obs_spec, self._action_spec, unbounded_actions=False)
    self._unbounded_actor_net = DummyActorNetwork(
        self._obs_spec, self._action_spec, unbounded_actions=True)
Example #7
    def time_step_spec(self):
        """Describes the `TimeStep` fields returned by `step()`.

        Override this method to define an environment that uses non-standard values
        for any of the items returned by `step`. For example, an environment with
        array-valued rewards.

        Returns:
          A `TimeStep` namedtuple containing (possibly nested) `ArraySpec`s defining
          the step_type, reward, discount, and observation structure.
        """
        return ts.time_step_spec(self.observation_spec())
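A minimal sketch of this default in action, assuming TF-Agents' py_environment.PyEnvironment base class; the CountingEnv class and its specs below are illustrative and not part of the example above:

import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CountingEnv(py_environment.PyEnvironment):
  """Toy environment that only defines specs plus trivial reset/step."""

  def observation_spec(self):
    return array_spec.ArraySpec((1,), np.int32, 'obs')

  def action_spec(self):
    return array_spec.BoundedArraySpec((), np.int32, 0, 1, 'action')

  def _reset(self):
    return ts.restart(np.zeros((1,), dtype=np.int32))

  def _step(self, action):
    return ts.transition(np.zeros((1,), dtype=np.int32), reward=0.0)


# The inherited time_step_spec() wraps observation_spec() with the standard
# step_type, reward and discount specs, exactly as the method above shows.
print(CountingEnv().time_step_spec())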
Example #8
 def setUp(self):
     super(EpsilonGreedyPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._num_actions = 3
     self._greedy_action = 1
     self._action_spec = tensor_spec.BoundedTensorSpec(
         (1, ), tf.int32, 0, self._num_actions - 1)
     self._policy = fixed_policy.FixedPolicy(
         np.asarray([self._greedy_action], dtype=np.int32),
         self._time_step_spec, self._action_spec)
     observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
     self._time_step = ts.restart(observations, batch_size=2)
Example #9
    def testAction(self):
        py_observation_spec = array_spec.BoundedArraySpec((3, ), np.int32, 1,
                                                          1)
        py_time_step_spec = ts.time_step_spec(py_observation_spec)
        py_action_spec = array_spec.BoundedArraySpec((7, ), np.int32, 1, 1)
        py_policy_state_spec = array_spec.BoundedArraySpec((5, ), np.int32, 0,
                                                           1)
        py_policy_info_spec = array_spec.BoundedArraySpec((3, ), np.int32, 0,
                                                          1)

        mock_py_policy = mock.create_autospec(py_policy.Base)
        mock_py_policy.time_step_spec = py_time_step_spec
        mock_py_policy.action_spec = py_action_spec
        mock_py_policy.policy_state_spec = py_policy_state_spec
        mock_py_policy.info_spec = py_policy_info_spec

        expected_py_policy_state = np.ones(py_policy_state_spec.shape,
                                           py_policy_state_spec.dtype)
        expected_py_time_step = tf.nest.map_structure(
            lambda arr_spec: np.ones(arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        expected_py_action = np.ones(py_action_spec.shape,
                                     py_action_spec.dtype)
        expected_new_py_policy_state = np.zeros(py_policy_state_spec.shape,
                                                py_policy_state_spec.dtype)
        expected_py_info = np.zeros(py_policy_info_spec.shape,
                                    py_policy_info_spec.dtype)

        mock_py_policy.action.return_value = policy_step.PolicyStep(
            expected_py_action, expected_new_py_policy_state, expected_py_info)

        tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
        time_step = tf.nest.map_structure(
            lambda arr_spec: tf.ones(arr_spec.shape, arr_spec.dtype),
            py_time_step_spec)
        action_step = tf_mock_py_policy.action(
            time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
        py_action_step = self.evaluate(action_step)

        self.assertEqual(1, mock_py_policy.action.call_count)
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['time_step'],
            expected_py_time_step)
        np.testing.assert_equal(
            mock_py_policy.action.call_args[1]['policy_state'],
            expected_py_policy_state)
        np.testing.assert_equal(py_action_step.action, expected_py_action)
        np.testing.assert_equal(py_action_step.state,
                                expected_new_py_policy_state)
        np.testing.assert_equal(py_action_step.info, expected_py_info)
Example #10
    def test_get_distribution_class_spec(self):
        ones = tf.ones(shape=[2], dtype=tf.float32)
        obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
        time_step_spec = ts.time_step_spec(obs_spec)
        mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
        mock_policy.distribution.return_value = policy_step.PolicyStep(
            (tfp.distributions.Categorical(logits=ones),
             tfp.distributions.Normal(ones, ones)), None)

        class_spec = ppo_utils.get_distribution_class_spec(
            mock_policy, time_step_spec)
        self.assertAllEqual(
            (tfp.distributions.Categorical, tfp.distributions.Normal),
            class_spec)
Example #11
    def __init__(self,
                 time_step_spec,
                 action_spec,
                 seed=None,
                 outer_dims=None):

        self._seed = seed
        self._outer_dims = outer_dims
        self._rng = np.random.RandomState(seed)
        if time_step_spec is None:
            time_step_spec = ts.time_step_spec()

        super(RandomPyPolicy, self).__init__(time_step_spec=time_step_spec,
                                             action_spec=action_spec)
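A brief usage sketch (my own illustration, not from the source above) of the fallback in this constructor: passing time_step_spec=None makes the policy build the default spec via ts.time_step_spec().

import numpy as np
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec

action_spec = array_spec.BoundedArraySpec((2,), np.int32, minimum=-1, maximum=1)
policy = random_py_policy.RandomPyPolicy(time_step_spec=None,
                                         action_spec=action_spec)
# The policy now reports the default spec (standard step_type/reward/discount).
print(policy.time_step_spec)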
Example #12
    def test_get_distribution_params_spec(self):
        ones = tf.ones(shape=[1, 2], dtype=tf.float32)
        obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
        time_step_spec = ts.time_step_spec(obs_spec)
        mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
        mock_policy._distribution.return_value = policy_step.PolicyStep(
            (tfp.distributions.Categorical(logits=ones),
             tfp.distributions.Normal(ones, ones)))

        params_spec = ppo_utils.get_distribution_params_spec(
            mock_policy, time_step_spec)
        self.assertAllEqual(
            [set(['logits']), set(['loc', 'scale'])],
            [set(d.keys()) for d in params_spec])
        self.assertAllEqual([[[2]], [[2], [2]]],
                            [[d[k].shape for k in d] for d in params_spec])
Example #13
 def _initial_collect(self):
   """Collect initial experience before training begins."""
   logging.info('Collecting initial experience...')
   time_step_spec = ts.time_step_spec(self._env.observation_spec())
   random_policy = random_py_policy.RandomPyPolicy(
       time_step_spec, self._env.action_spec())
   time_step = self._env.reset()
   while self._replay_buffer.size < self._initial_collect_steps:
     if self.game_over():
       time_step = self._env.reset()
     action_step = random_policy.action(time_step)
     next_time_step = self._env.step(action_step.action)
     self._replay_buffer.add_batch(trajectory.from_transition(
         time_step, action_step, next_time_step))
     time_step = next_time_step
   logging.info('Done.')
Example #14
    def setUp(self):
        tf.compat.v1.enable_resource_variables()
        super(TD3AgentTest, self).setUp()
        self._obs_spec = [tensor_spec.TensorSpec([2], tf.float32)]
        self._time_step_spec = ts.time_step_spec(self._obs_spec)
        self._action_spec = [
            tensor_spec.BoundedTensorSpec([1], tf.float32, -1, 1)
        ]

        input_spec = (self._obs_spec, self._action_spec)
        self._critic_net = DummyCriticNetwork(input_spec)
        self._bounded_actor_net = DummyActorNetwork(self._obs_spec,
                                                    self._action_spec,
                                                    unbounded_actions=False)
        self._unbounded_actor_net = DummyActorNetwork(self._obs_spec,
                                                      self._action_spec,
                                                      unbounded_actions=True)
Example #15
 def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
     self._dtype = dtype
     self._scope = scope
     self._initial_state = tf.cast(initial_state, dtype=self._dtype)
     observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
     action_spec = specs.BoundedTensorSpec([],
                                           tf.int32,
                                           minimum=0,
                                           maximum=10)
     time_step_spec = ts.time_step_spec(observation_spec)
     super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
     self._state = common.create_variable('state',
                                          initial_state,
                                          dtype=self._dtype)
     self.steps = common.create_variable('steps', 0)
     self.episodes = common.create_variable('episodes', 0)
     self.resets = common.create_variable('resets', 0)
Example #16
    def testBuilds(self):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.int32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, 3))

        net = value_rnn_network.ValueRnnNetwork(observation_spec,
                                                conv_layer_params=[(4, 2, 2)],
                                                input_fc_layer_params=(5, ),
                                                lstm_size=(7, ),
                                                output_fc_layer_params=(3, ))

        value, state = net(time_step.observation, time_step.step_type)
        self.evaluate(tf.compat.v1.global_variables_initializer())

        self.assertEqual((1, 3), value.shape)

        self.assertEqual(11, len(net.variables))
        # Conv Net Kernel
        self.assertEqual((2, 2, 3, 4), net.variables[0].shape)
        # Conv Net bias
        self.assertEqual((4, ), net.variables[1].shape)
        # Fc Kernel
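        # (the 8x8x3 observation, convolved with a 2x2 kernel at stride 2,
        # yields a 4x4x4 feature map, i.e. 64 inputs to the dense layer)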
        self.assertEqual((64, 5), net.variables[2].shape)
        # Fc Bias
        self.assertEqual((5, ), net.variables[3].shape)
        # LSTM Cell Kernel
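        # (Keras LSTM cells concatenate the four gate kernels, so the width is
        # 4 * lstm_size = 4 * 7 = 28)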
        self.assertEqual((5, 28), net.variables[4].shape)
        # LSTM Cell Recurrent Kernel
        self.assertEqual((7, 28), net.variables[5].shape)
        # LSTM Cell Bias
        self.assertEqual((28, ), net.variables[6].shape)
        # Fc Kernel
        self.assertEqual((7, 3), net.variables[7].shape)
        # Fc Bias
        self.assertEqual((3, ), net.variables[8].shape)
        # Value Shrink Kernel
        self.assertEqual((3, 1), net.variables[9].shape)
        # Value Shrink bias
        self.assertEqual((1, ), net.variables[10].shape)

        # Assert LSTM cell is created.
        self.assertEqual((1, 7), state[0].shape)
        self.assertEqual((1, 7), state[1].shape)
Example #17
  def setUp(self):
    super(FixedPolicyTest, self).setUp()
    # Creates an MDP with:
    # - dim(observation) = 2
    # - number of actions = 4
    self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
    self._time_step_spec = ts.time_step_spec(self._obs_spec)
    self._num_actions = 4
    self._action_spec = tensor_spec.BoundedTensorSpec(
        shape=(1,), dtype=tf.int32,
        minimum=0, maximum=self._num_actions - 1)

    # The policy always outputs the same action.
    self._fixed_action = 1
    self._policy = fixed_policy.FixedPolicy(
        np.asarray([self._fixed_action], dtype=np.int32),
        self._time_step_spec,
        self._action_spec)
Example #18
    def _create_replay_buffer(self, rb_cls):
        self._stack_count = 4
        self._single_shape = (15, 15, 1)
        shape = (15, 15, self._stack_count)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = 32
        self._replay_buffer = rb_cls(data_spec=self._trajectory_spec,
                                     capacity=self._capacity)
Example #19
 def __init__(self, initial_state=0, dtype=tf.int64, scope='TFEnvironment'):
     self._dtype = dtype
     self._scope = scope
     self._initial_state = tf.cast(initial_state, dtype=self._dtype)
     observation_spec = specs.TensorSpec([1], self._dtype, 'observation')
     action_spec = specs.BoundedTensorSpec([],
                                           tf.int32,
                                           minimum=0,
                                           maximum=10)
     time_step_spec = ts.time_step_spec(observation_spec)
     super(TFEnvironmentMock, self).__init__(time_step_spec, action_spec)
     with tf.compat.v1.variable_scope(self._scope):
         self._state = tf.Variable(initial_state,
                                   name='state',
                                   dtype=self._dtype)
         self.steps = tf.Variable(0, name='steps')
         self.episodes = tf.Variable(0, name='episodes')
         self.resets = tf.Variable(0, name='resets')
Example #20
    def testObservationSpec(self):
        observation_spec = [
            array_spec.ArraySpec((1, 2, 3), np.int32, 'obs1'),
            array_spec.ArraySpec((1, 2, 3), np.float32, 'obs2')
        ]
        time_step_spec = ts.time_step_spec(observation_spec)

        self.assertEqual(time_step_spec.observation, observation_spec)
        self.assertEqual(time_step_spec.step_type,
                         array_spec.ArraySpec([], np.int32, name='step_type'))
        self.assertEqual(time_step_spec.reward,
                         array_spec.ArraySpec([], np.float32, name='reward'))
        self.assertEqual(
            time_step_spec.discount,
            array_spec.BoundedArraySpec([],
                                        np.float32,
                                        minimum=0.0,
                                        maximum=1.0,
                                        name='discount'))
Example #21
def make_random_trajectory():
    """Creates a random trajectory.

    This trajectory contains Tensors shaped `[1, 6, ...]` where `1` is the batch
    and `6` is the number of time steps.

    Observations are unbounded but actions are bounded to take values within
    `[1, 2]`.

    Policy info is also provided, and is equal to the actions.  It can be removed
    via:

    ```python
    traj, time_step_spec, action_spec = make_random_trajectory()
    traj = traj.clone(policy_info=())
    ```

    Returns:
      A tuple `(traj, time_step_spec, action_spec)`.
    """
    time_step_spec = ts.time_step_spec(
        tensor_spec.TensorSpec([], tf.int64, name='observation'))
    action_spec = tensor_spec.BoundedTensorSpec([],
                                                tf.int32,
                                                minimum=1,
                                                maximum=2,
                                                name='action')
    # info and policy state specs match that of TFPolicyMock.
    outer_dims = [1, 6]  # (batch_size, time)
    traj = trajectory.Trajectory(
        observation=tensor_spec.sample_spec_nest(time_step_spec.observation,
                                                 outer_dims=outer_dims),
        action=tensor_spec.sample_bounded_spec(action_spec,
                                               outer_dims=outer_dims),
        policy_info=tensor_spec.sample_bounded_spec(action_spec,
                                                    outer_dims=outer_dims),
        reward=tf.fill(outer_dims, 0.0),
        # step_type is F M L F M L.
        step_type=tf.reshape(tf.range(0, 6) % 3, outer_dims),
        # next_step_type is M L F M L F.
        next_step_type=tf.reshape(tf.range(1, 7) % 3, outer_dims),
        discount=tf.fill(outer_dims, 1.0),
    )
    return traj, time_step_spec, action_spec
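A short usage sketch for the helper above (illustrative only): every field of the sampled trajectory carries the outer_dims of [1, 6].

traj, time_step_spec, action_spec = make_random_trajectory()
# batch dimension 1, time dimension 6; policy_info mirrors the action field.
assert traj.action.shape.as_list() == [1, 6]
assert traj.step_type.shape.as_list() == [1, 6]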
Example #22
    def testObservationSpec(self):
        observation_spec = [
            tensor_spec.TensorSpec((1, 2, 3), tf.int32, 'obs1'),
            tensor_spec.TensorSpec((1, 2, 3), tf.float32, 'obs2')
        ]
        time_step_spec = ts.time_step_spec(observation_spec)

        self.assertEqual(time_step_spec.observation, observation_spec)
        self.assertEqual(
            time_step_spec.step_type,
            tensor_spec.TensorSpec([], tf.int32, name='step_type'))
        self.assertEqual(time_step_spec.reward,
                         tensor_spec.TensorSpec([], tf.float32, name='reward'))
        self.assertEqual(
            time_step_spec.discount,
            tensor_spec.BoundedTensorSpec([],
                                          tf.float32,
                                          minimum=0.0,
                                          maximum=1.0,
                                          name='discount'))
Example #23
  def testGeneratesBatchedActionsWithoutSpecifyingOuterDims(self):
    action_spec = [
        array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10),
        array_spec.BoundedArraySpec((1, 2), np.int32, -10, 10)
    ]
    time_step_spec = time_step.time_step_spec(
        observation_spec=array_spec.ArraySpec((1,), np.int32))
    policy = random_py_policy.RandomPyPolicy(
        time_step_spec=time_step_spec, action_spec=action_spec)

    action_step = policy.action(
        time_step.restart(np.array([[1], [2], [3]], dtype=np.int32)))
    tf.nest.assert_same_structure(action_spec, action_step.action)
    self.assertEqual((3, 2, 3), action_step.action[0].shape)
    self.assertEqual((3, 1, 2), action_step.action[1].shape)

    self.assertTrue(np.all(action_step.action[0] >= -10))
    self.assertTrue(np.all(action_step.action[0] <= 10))
    self.assertTrue(np.all(action_step.action[1] >= -10))
    self.assertTrue(np.all(action_step.action[1] <= 10))
Example #24
    def testClipping(self):
        action_spec = (tensor_spec.BoundedTensorSpec([1], tf.float32, 2, 3),
                       tensor_spec.TensorSpec([1], tf.float32),
                       tensor_spec.BoundedTensorSpec([1], tf.int32, 2, 3),
                       tensor_spec.TensorSpec([1], tf.int32))
        time_step_spec = ts.time_step_spec(action_spec)

        policy = TfPassThroughPolicy(time_step_spec, action_spec, clip=True)

        observation = (tf.constant(1, shape=(1, ), dtype=tf.float32),
                       tf.constant(1, shape=(1, ), dtype=tf.float32),
                       tf.constant(1, shape=(1, ), dtype=tf.int32),
                       tf.constant(1, shape=(1, ), dtype=tf.int32))
        time_step = ts.restart(observation)

        clipped_action = self.evaluate(policy.action(time_step).action)
        self.assertEqual(2, clipped_action[0])
        self.assertEqual(1, clipped_action[1])
        self.assertEqual(2, clipped_action[2])
        self.assertEqual(1, clipped_action[3])
Example #25
    def _generate_replay_buffer(self, rb_cls):
        stack_count = 4
        shape = (15, 15, stack_count)
        single_shape = (15, 15, 1)
        observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
        time_step_spec = ts.time_step_spec(observation_spec)
        action_spec = policy_step.PolicyStep(
            array_spec.BoundedArraySpec(shape=(),
                                        dtype=np.int32,
                                        minimum=0,
                                        maximum=1,
                                        name='action'))
        self._trajectory_spec = trajectory.from_transition(
            time_step_spec, action_spec, time_step_spec)

        self._capacity = 32
        self._replay_buffer = rb_cls(data_spec=self._trajectory_spec,
                                     capacity=self._capacity)

        # Generate N frames: the value of pixels is the frame index.
        # The observations will be generated by stacking K frames out of those N,
        # generating some redundancies between the observations.
        single_frames = []
        frame_count = 100
        for k in range(frame_count):
            single_frames.append(np.full(single_shape, k, dtype=np.int32))

        # Add stack of frames to the replay buffer.
        time_steps = []
        for k in range(len(single_frames) - stack_count + 1):
            observation = np.concatenate(single_frames[k:k + stack_count],
                                         axis=-1)
            time_steps.append(ts.transition(observation, reward=0.0))

        self._transition_count = len(time_steps) - 1
        dummy_action = policy_step.PolicyStep(np.int32(0))
        for k in range(self._transition_count):
            self._replay_buffer.add_batch(
                nest_utils.batch_nested_array(
                    trajectory.from_transition(time_steps[k], dummy_action,
                                               time_steps[k + 1])))
Example #26
    def testRandomPyPolicyGeneratesActionTensors(self):

        py_action_spec = array_spec.BoundedArraySpec((7, ), np.int32, -10, 10)

        observation = tf.ones([3], tf.float32)
        time_step = ts.restart(observation)
        observation_spec = tensor_spec.TensorSpec.from_tensor(observation)
        time_step_spec = ts.time_step_spec(observation_spec)

        tf_py_random_policy = tf_py_policy.TFPyPolicy(
            random_py_policy.RandomPyPolicy(time_step_spec=time_step_spec,
                                            action_spec=py_action_spec))

        action_step = tf_py_random_policy.action(time_step=time_step)
        py_action, py_new_policy_state = self.evaluate(
            [action_step.action, action_step.state])

        self.assertEqual(py_action.shape, py_action_spec.shape)
        self.assertTrue(np.all(py_action >= py_action_spec.minimum))
        self.assertTrue(np.all(py_action <= py_action_spec.maximum))
        self.assertEqual(py_new_policy_state, ())
Example #27
  def testBuilds(self):
    observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0,
                                                     1)
    time_step_spec = ts.time_step_spec(observation_spec)
    time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))

    action_spec = [
        tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3),
        tensor_spec.BoundedTensorSpec((3,), tf.int32, 0, 3)
    ]

    net = actor_distribution_network.ActorDistributionNetwork(
        observation_spec,
        action_spec,
        conv_layer_params=[(4, 2, 2)],
        fc_layer_params=(5,))

    action_distributions, _ = net(time_step.observation, time_step.step_type,
                                  ())
    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertEqual([1, 2], action_distributions[0].mode().shape.as_list())
    self.assertEqual([1, 3], action_distributions[1].mode().shape.as_list())
Example #28
    def testBuildsStackedLstm(self):
        observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.int32,
                                                         0, 1)
        time_step_spec = ts.time_step_spec(observation_spec)
        time_step = tensor_spec.sample_spec_nest(time_step_spec,
                                                 outer_dims=(1, 3))

        net = value_rnn_network.ValueRnnNetwork(observation_spec,
                                                conv_layer_params=[(4, 2, 2)],
                                                input_fc_layer_params=(5, ),
                                                lstm_size=(7, 5),
                                                output_fc_layer_params=(3, ))

        _, state = net(time_step.observation, time_step.step_type)
        self.evaluate(tf.compat.v1.global_variables_initializer())

        # Assert LSTM cell is created.
        self.assertEqual((1, 7), state[0][0].shape)
        self.assertEqual((1, 7), state[0][1].shape)

        # Assert LSTM cell is created.
        self.assertEqual((1, 5), state[1][0].shape)
        self.assertEqual((1, 5), state[1][1].shape)
Example #29
 def setUp(self):
     super(GreedyPolicyTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
Example #30
 def setUp(self):
     super(ReinforceAgentTest, self).setUp()
     self._obs_spec = tensor_spec.TensorSpec([2], tf.float32)
     self._time_step_spec = ts.time_step_spec(self._obs_spec)
     self._action_spec = tensor_spec.BoundedTensorSpec([1], tf.float32, -1,
                                                       1)