def _action(self, time_step, policy_state, seed: Optional[types.Seed] = None):
    """Produce a random action, using the action-constraint mask when set.

    Args:
      time_step: Current time step; its observation is used to infer the
        outer dimensions (when `self._outer_dims` is None) and is passed to
        the constraint splitter when one is configured.
      policy_state: Passed through unchanged into the returned step.
      seed: Ignored; sampling uses `self._rng`.

    Returns:
      A `policy_step.PolicyStep` holding the random action, the given
      `policy_state`, and an info nest sampled from `self._info_spec`.
    """
    del seed  # Unused. Seed passed to the class.

    batch_dims = self._outer_dims
    if batch_dims is None:
        if self.time_step_spec.observation:
            batch_dims = nest_utils.get_outer_array_shape(
                time_step.observation, self.time_step_spec.observation)
        else:
            batch_dims = ()

    splitter = self.observation_and_action_constraint_splitter

    if splitter is None:
        # No action constraints: draw directly from the action spec.
        random_action = array_spec.sample_spec_nest(
            self._action_spec, self._rng, outer_dims=batch_dims)
    else:
        _, mask = splitter(time_step.observation)

        # All logits are zero, so only the mask distinguishes actions
        # (presumably uniform over the allowed ones -- see MaskedCategorical).
        uniform_logits = tf.cast(tf.zeros_like(mask), tf.float32)
        distribution = masked.MaskedCategorical(uniform_logits, mask)
        shifted_sample = distribution.sample() + self.action_spec.minimum
        random_action = tf.cast(shifted_sample, self.action_spec.dtype)

        # If the action spec says each action should be shaped (1,), add
        # another dimension so the final shape is (B, 1) rather than (B,).
        if len(self.action_spec.shape) == 1:
            random_action = tf.expand_dims(random_action, axis=-1)

    info = array_spec.sample_spec_nest(
        self._info_spec, self._rng, outer_dims=batch_dims)

    return policy_step.PolicyStep(random_action, policy_state, info)
def testArraySpecSample(self, dtype):
    # A sample drawn from a plain ArraySpec must lie within the limits of the
    # BoundedArraySpec derived from that same spec.
    plain_spec = array_spec.ArraySpec((2, 3), dtype)
    drawn = array_spec.sample_spec_nest(plain_spec, self.rng)

    limits = array_spec.BoundedArraySpec.from_spec(plain_spec)
    self.assertTrue(np.all(drawn >= limits.minimum))
    self.assertTrue(np.all(drawn <= limits.maximum))
def testSavedModel(self):
    # Export the TF policy, reload it as an eager Python policy, and check
    # both produce the same action for one randomly sampled time step.
    export_dir = os.path.join(self.get_temp_dir(), 'saved_policy')
    policy_saver.PolicySaver(self.tf_policy).save(export_dir)
    loaded_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
        export_dir, self.time_step_spec, self.action_spec)

    random_state = np.random.RandomState()
    time_step = array_spec.sample_spec_nest(self.time_step_spec, random_state)
    batched_time_step = nest_utils.batch_nested_array(time_step)

    # Reference action from the in-memory TF policy, unbatched and converted
    # to numpy for comparison.
    reference_step = nest_utils.unbatch_nested_tensors(
        self.tf_policy.action(batched_time_step))
    reference_np = tf.nest.map_structure(
        lambda tensor: tensor.numpy(), reference_step)

    loaded_step = loaded_policy.action(time_step)

    tf.nest.assert_same_structure(loaded_step.action, self.action_spec)
    np.testing.assert_array_almost_equal(
        reference_np.action, loaded_step.action)
def testBoundedArraySpecNoBounds(self, dtype):
    # A BoundedArraySpec built from an unbounded spec should yield samples
    # within the min/max of the corresponding TensorFlow dtype.
    base_spec = array_spec.ArraySpec((2, 3), dtype)
    derived_spec = array_spec.BoundedArraySpec.from_spec(base_spec)
    drawn = array_spec.sample_spec_nest(derived_spec, self.rng)

    dtype_limits = tf.as_dtype(base_spec.dtype)
    self.assertTrue(np.all(drawn >= dtype_limits.min))
    self.assertTrue(np.all(drawn <= dtype_limits.max))
def _action(self, time_step: ts.TimeStep,
            policy_state: types.NestedArray) -> ps.PolicyStep:
    """Return a step whose action is sampled from `self._action_spec`.

    Args:
      time_step: Not read by this implementation.
      policy_state: Passed through unchanged into the returned step.

    Returns:
      A `ps.PolicyStep` with the sampled action and the given policy state.
    """
    sampled_action = array_spec.sample_spec_nest(self._action_spec, self._rng)
    return ps.PolicyStep(sampled_action, policy_state)
def testBoundedArraySpecSampleMultipleBounds(self, dtype):
    # Each element of the spec has its own [minimum, maximum]; verify every
    # sampled element against its own pair.
    spec = array_spec.BoundedArraySpec((2, ), dtype, [-10, 1], [10, 3])
    drawn = array_spec.sample_spec_nest(spec, self.rng)

    per_element_bounds = ((-10, 10), (1, 3))
    for index, (lower, upper) in enumerate(per_element_bounds):
        self.assertGreaterEqual(drawn[index], lower)
        self.assertLessEqual(drawn[index], upper)
def testNestSampleOuterDims(self, dtype):
    # Sampling a nested spec with outer_dims must respect the bounds of each
    # leaf, keep the nest structure, and prepend outer_dims to every shape.
    nested_spec = example_nested_spec(dtype)
    outer_dims = [2, 3]
    drawn = array_spec.sample_spec_nest(
        nested_spec, self.rng, outer_dims=outer_dims)

    limits = array_spec.BoundedArraySpec.from_spec(
        nested_spec["array_spec_1"])
    self.assertTrue(np.all(drawn["array_spec_1"] >= limits.minimum))
    self.assertTrue(np.all(drawn["array_spec_1"] <= limits.maximum))

    self.assertTrue(np.all(drawn["bounded_spec_1"] >= -10))
    self.assertTrue(np.all(drawn["bounded_spec_1"] <= 10))

    for nested_key in ("array_spec_2", "bounded_spec_2"):
        self.assertIn(nested_key, drawn["dict_spec"])
    for top_level_key in ("tuple_spec", "list_spec"):
        self.assertIn(top_level_key, drawn)

    nested_list_leaf = drawn["list_spec"][1][1]
    self.assertTrue(np.all(nested_list_leaf >= -10))
    self.assertTrue(np.all(nested_list_leaf <= 10))

    def _check_leaf_shape(leaf_sample, leaf_spec):
        # Expected shape is the outer dims followed by the spec's own shape.
        expected_shape = outer_dims + list(leaf_spec.shape)
        self.assertSequenceEqual(leaf_sample.shape, expected_shape)

    tf.nest.map_structure(_check_leaf_shape, drawn, nested_spec)
def testArraySpecSampleWithName(self, dtype):
    # Same bound check as for an unnamed spec, plus: the spec name must carry
    # over to the BoundedArraySpec built from it.
    named_spec = array_spec.ArraySpec((2, 3), dtype, name="test_spec")
    drawn = array_spec.sample_spec_nest(named_spec, self.rng)

    limits = array_spec.BoundedArraySpec.from_spec(named_spec)
    self.assertTrue(np.all(drawn >= limits.minimum))
    self.assertTrue(np.all(drawn <= limits.maximum))
    self.assertEqual("test_spec", limits.name)
def test_serialize_deserialize(self, dtype):
    # Round trip: serialize a sampled nest, decode it again, and expect the
    # recovered values to be almost equal to the original ones.
    nested_spec = example_nested_spec(dtype)
    to_example = example_encoding.get_example_serializer(nested_spec)
    from_example = example_encoding.get_example_decoder(nested_spec)

    original = array_spec.sample_spec_nest(
        nested_spec, np.random.RandomState(0))
    serialized = to_example(original)
    round_tripped = self.evaluate(from_example(serialized))

    tf.nest.map_structure(
        np.testing.assert_almost_equal, original, round_tripped)
def _action(self, time_step, policy_state):
    """Sample a random action from `self._action_spec`.

    Args:
      time_step: Current time step; its observation is only used to infer the
        outer dimensions when `self._outer_dims` is None.
      policy_state: Passed through unchanged into the returned step.

    Returns:
      A `policy_step.PolicyStep` with the sampled action and the given
      policy state.
    """
    batch_dims = self._outer_dims
    if batch_dims is None:
        # Infer outer dims from the observation; an empty (falsy)
        # observation spec means there are none to infer.
        batch_dims = (
            nest_utils.get_outer_array_shape(
                time_step.observation, self.time_step_spec.observation)
            if self.time_step_spec.observation else ())

    sampled_action = array_spec.sample_spec_nest(
        self._action_spec, self._rng, outer_dims=batch_dims)
    return policy_step.PolicyStep(sampled_action, policy_state)
def testInferenceFromCheckpoint(self):
    # Export a SavedModel, change the policy variables, write a checkpoint,
    # then verify that the loaded SavedModel policy picks up the checkpointed
    # values via update_from_checkpoint.
    export_dir = os.path.join(self.get_temp_dir(), 'saved_policy')
    saver = policy_saver.PolicySaver(self.tf_policy)
    saver.save(export_dir)

    random_state = np.random.RandomState()
    time_step = array_spec.sample_spec_nest(self.time_step_spec, random_state)
    batched_time_step = nest_utils.batch_nested_array(time_step)

    def _overwrite_with_minus_one(variable):
        return variable.assign(variable * 0 + -1)

    # Change the live policy's variables after the SavedModel was written.
    self.evaluate(
        tf.nest.map_structure(
            _overwrite_with_minus_one, self.tf_policy.variables()))

    checkpoint_dir = os.path.join(self.get_temp_dir(), 'checkpoint')
    saver.save_checkpoint(checkpoint_dir)

    loaded_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
        export_dir, self.time_step_spec, self.action_spec)

    # Use evaluate to force a copy.
    variables_before_update = self.evaluate(loaded_policy.variables())

    checkpoint = tf.train.Checkpoint(policy=loaded_policy._policy)
    manager = tf.train.CheckpointManager(
        checkpoint, directory=checkpoint_dir, max_to_keep=None)
    loaded_policy.update_from_checkpoint(manager.latest_checkpoint)

    def _assert_not_all_equal(left, right):
        return self.assertFalse(np.equal(left, right).all())

    # The checkpoint must have changed the loaded policy's variables ...
    tf.nest.map_structure(
        _assert_not_all_equal,
        variables_before_update,
        self.evaluate(loaded_policy.variables()))

    def _assert_all_equal(left, right):
        return self.assertTrue(np.equal(left, right).all())

    # ... and they must now match the live policy's variables.
    tf.nest.map_structure(
        _assert_all_equal,
        self.evaluate(self.tf_policy.variables()),
        self.evaluate(loaded_policy.variables()))

    # Can't check if the action is different as in some cases depending on
    # variable initialization it will be the same. Checking that they are at
    # least always the same.
    checkpoint_action = loaded_policy.action(time_step)
    live_action = self.tf_policy.action(batched_time_step)
    live_action = self.evaluate(
        nest_utils.unbatch_nested_tensors(live_action))
    tf.nest.map_structure(_assert_all_equal, live_action, checkpoint_action)
def testValidateOk(self):
    # A mock environment whose observations are always sampled from its own
    # observation spec should pass validation.
    env = get_mock_env(self._action_spec, self._observation_spec, None)
    rng = np.random.RandomState()

    def _sample_observation():
        return array_spec.sample_spec_nest(env.observation_spec(), rng)

    def _step(unused_time_step):
        # Terminate when the random draw falls below 0.10.
        if rng.rand() < 0.10:
            return ts.termination(_sample_observation(), 0.0)  # pytype: disable=wrong-arg-types
        return ts.transition(_sample_observation(), 1.0)  # pytype: disable=wrong-arg-types

    def _reset():
        return ts.restart(_sample_observation())

    env.step = _step
    env.reset = _reset

    utils.validate_py_environment(env, episodes=2)
def testValidateBoundedSpecDistinctBounds(self):
    # Validation should pass for an observation spec whose elements each have
    # a different [minimum, maximum] range.
    per_element_spec = array_spec.BoundedArraySpec(
        (3,), np.int32, [-10, -5, -2], [10, 5, 2])
    env = get_mock_env(self._action_spec, per_element_spec, None)
    rng = np.random.RandomState()

    def _sample_observation():
        return array_spec.sample_spec_nest(env.observation_spec(), rng)

    def _step(unused_time_step):
        # Terminate when the random draw falls below 0.10.
        if rng.rand() < 0.10:
            return ts.termination(_sample_observation(), 0.0)  # pytype: disable=wrong-arg-types
        return ts.transition(_sample_observation(), 1.0)  # pytype: disable=wrong-arg-types

    def _reset():
        return ts.restart(_sample_observation())

    env.step = _step
    env.reset = _reset

    utils.validate_py_environment(env, episodes=1)
def testSavedModel(self):
    # Build a small actor policy, export it, reload it as an eager Python
    # policy, and check both produce the same action for one sampled step.
    if not common.has_eager_been_enabled():
        self.skipTest('Only supported in eager.')

    # Array specs and their tensor-spec counterparts.
    obs_array_spec = array_spec.ArraySpec([2], np.float32)
    act_array_spec = array_spec.BoundedArraySpec([1], np.float32, 2, 3)
    step_array_spec = ts.time_step_spec(obs_array_spec)

    obs_tensor_spec = tensor_spec.from_spec(obs_array_spec)
    act_tensor_spec = tensor_spec.from_spec(act_array_spec)
    step_tensor_spec = tensor_spec.from_spec(step_array_spec)

    network = actor_network.ActorNetwork(
        obs_tensor_spec,
        act_tensor_spec,
        fc_layer_params=(10, ),
    )
    tf_policy = actor_policy.ActorPolicy(
        step_tensor_spec, act_tensor_spec, actor_network=network)

    export_dir = os.path.join(self.get_temp_dir(), 'saved_policy')
    policy_saver.PolicySaver(tf_policy).save(export_dir)
    loaded_policy = py_tf_eager_policy.SavedModelPyTFEagerPolicy(
        export_dir, step_array_spec, act_array_spec)

    random_state = np.random.RandomState()
    time_step = array_spec.sample_spec_nest(step_array_spec, random_state)
    batched_time_step = nest_utils.batch_nested_array(time_step)

    # Reference action from the in-memory TF policy, unbatched and converted
    # to numpy for comparison.
    reference_step = nest_utils.unbatch_nested_tensors(
        tf_policy.action(batched_time_step))
    reference_np = tf.nest.map_structure(
        lambda tensor: tensor.numpy(), reference_step)

    loaded_step = loaded_policy.action(time_step)

    tf.nest.assert_same_structure(loaded_step.action, act_array_spec)
    np.testing.assert_array_almost_equal(
        reference_np.action, loaded_step.action)
def testNestSample(self, dtype):
    # Sampling a nested spec must respect the bounds of each checked leaf and
    # keep the expected keys of the nest.
    nested_spec = example_nested_spec(dtype)
    drawn = array_spec.sample_spec_nest(nested_spec, self.rng)

    limits = array_spec.BoundedArraySpec.from_spec(
        nested_spec["array_spec_1"])
    self.assertTrue(np.all(drawn["array_spec_1"] >= limits.minimum))
    self.assertTrue(np.all(drawn["array_spec_1"] <= limits.maximum))

    self.assertTrue(np.all(drawn["bounded_spec_1"] >= -10))
    self.assertTrue(np.all(drawn["bounded_spec_1"] <= 10))

    for nested_key in ("array_spec_2", "bounded_spec_2"):
        self.assertIn(nested_key, drawn["dict_spec"])
    for top_level_key in ("tuple_spec", "list_spec"):
        self.assertIn(top_level_key, drawn)

    nested_list_leaf = drawn["list_spec"][1][1]
    self.assertTrue(np.all(nested_list_leaf >= -10))
    self.assertTrue(np.all(nested_list_leaf <= 10))
def _action(self, time_step, policy_state):
    """Choose an action with `simple_human_policy` for the observation.

    Args:
      time_step: Current time step; its observation is converted to numpy,
        wrapped by `human_agent_wrapper`, and fed to `simple_human_policy`.
      policy_state: Passed through unchanged into the returned step.

    Returns:
      A `policy_step.PolicyStep` whose action is a one-element numpy array
      holding the `.value` of the policy's result, with an info nest sampled
      from `self._info_spec`.
    """
    batch_dims = self._outer_dims
    if batch_dims is None:
        # Infer outer dims from the observation; an empty (falsy)
        # observation spec means there are none to infer.
        batch_dims = (
            nest_utils.get_outer_array_shape(
                time_step.observation, self.time_step_spec.observation)
            if self.time_step_spec.observation else ())

    numpy_observation = self.__convert_tf_obs_to_numpy(time_step.observation)
    wrapped_observation = human_agent_wrapper(numpy_observation)
    decision = simple_human_policy(wrapped_observation)
    # Note: the action is always a single-element array; batch_dims only
    # affects the sampled info below.
    random_action = np.array([decision.value])

    info = array_spec.sample_spec_nest(
        self._info_spec, self._rng, outer_dims=batch_dims)
    return policy_step.PolicyStep(random_action, policy_state, info)
def testBoundedArraySpecSample(self, dtype):
    # Every element sampled from a spec bounded to [-10, 10] must fall inside
    # that range.
    bounded_spec = array_spec.BoundedArraySpec((2, 3), dtype, -10, 10)
    drawn = array_spec.sample_spec_nest(bounded_spec, self.rng)

    self.assertTrue(np.all(drawn >= -10))
    self.assertTrue(np.all(drawn <= 10))
def _get_observation(self):
    """Sample an observation from `self._observation_spec`.

    Returns:
      The result of `array_spec.sample_spec_nest`, called with outer
      dimensions `(self._batch_size,)` when `self._batch_size` is truthy and
      `()` otherwise.
    """
    if self._batch_size:
        outer_dims = (self._batch_size, )
    else:
        outer_dims = ()
    return array_spec.sample_spec_nest(
        self._observation_spec, self._rng, outer_dims)
def testMatch(self, dtype):
    # A nest sampled from a spec must be accepted by check_arrays_nest for
    # that same spec.
    nested_spec = example_nested_spec(dtype)
    drawn = array_spec.sample_spec_nest(nested_spec, np.random.RandomState())
    self.assertTrue(array_spec.check_arrays_nest(drawn, nested_spec))