def testTrainArgspec(self):
  train_argspec = {'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])}
  agent = MyAgent(train_argspec=train_argspec)
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={'obs': tf.constant([1.0])},
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  loss_info = agent.train(experience, extra=extra)
  tf.nest.map_structure(
      self.assertAllEqual, (experience, extra), loss_info.extra)

  # An extra leading (outer/batch) dimension not present in the spec is
  # accepted.
  extra_newdim = tf.ones(shape=[2, 3, 4], dtype=tf.float32)
  loss_info_newdim = agent.train(experience, extra=extra_newdim)
  self.assertAllEqual(loss_info_newdim.extra[1], extra_newdim)

  # A mismatched shape or dtype is rejected.
  with self.assertRaisesRegex(
      ValueError, 'Inconsistent dtypes or shapes between'):
    agent.train(experience, extra=tf.ones(shape=[3, 5], dtype=tf.float32))
  with self.assertRaisesRegex(
      ValueError, 'Inconsistent dtypes or shapes between'):
    agent.train(experience, extra=tf.ones(shape=[3, 4], dtype=tf.int32))

def testFromEpisodeWithCompositeTensorOfTensors(self):
  observation = tf.SparseTensor(
      indices=tf.random.uniform((7, 2), maxval=9, dtype=tf.int64),
      values=tf.random.uniform((7,)),
      dense_shape=[4, 10])  # The 4 is important: it must match the reward length.
  action = ()
  policy_info = ()
  reward = tf.random.uniform((4,))
  traj = trajectory.from_episode(
      observation, action, policy_info, reward, discount=None)
  self.assertTrue(tf.is_tensor(traj.step_type))
  traj_val, obs_val, reward_val = self.evaluate((traj, observation, reward))
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST
  self.assertAllEqual(traj_val.step_type, [first, mid, mid, mid])
  self.assertAllEqual(traj_val.next_step_type, [mid, mid, mid, last])
  self.assertAllClose(traj_val.observation, obs_val)
  self.assertAllEqual(traj_val.reward, reward_val)
  self.assertAllEqual(traj_val.discount, [1.0, 1.0, 1.0, 1.0])

def testLossNotMatching(self):

  class MyAgentWithLossNotMatching(MyAgent):

    def _loss(self, experience, weights=None, extra=None):
      return tf_agent.LossInfo(loss=(), extra=(experience, ()))

  train_argspec = {'extra': tf.TensorSpec(dtype=tf.float32, shape=[3, 4])}
  agent = MyAgentWithLossNotMatching(train_argspec=train_argspec)
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={'obs': tf.constant([1.0])},
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  with self.assertRaisesRegex(
      ValueError,
      r'.*`LossInfo` from train\(\) and `LossInfo` from loss\(\) do not have '
      'matching structures.*'):
    test_util.test_loss_and_train_output(
        test=self,
        expect_equal_loss_values=True,
        agent=agent,
        experience=experience,
        extra=extra)

def run(self, time_step, policy_state=()):
  """Run policy in environment given initial time_step and policy_state.

  Args:
    time_step: The initial time_step.
    policy_state: The initial policy_state.

  Returns:
    A tuple (final time_step, final policy_state).
  """
  for _ in range(self._max_episodes):
    time_step = self.env.reset()
    policy_state = self.policy.get_initial_state()
    observation = []
    action = []
    policy_info = []
    reward = []
    while not self.env.done:
      action_step = self.policy.action(time_step, policy_state)
      if self.env.debug:
        self.env.visualize(action_step.action, action_step.info)
      next_time_step = self.env.step(action_step.action)
      next_policy_state = action_step.state
      if self.observers:
        observation.append(time_step.observation)
        action.append(action_step.action)
        policy_info.append(action_step.info)
        reward.append(next_time_step.reward)
      time_step = next_time_step
      policy_state = next_policy_state
    if self.observers:
      # TODO: Find a better way than repeating the last action.
      observation.append(time_step.observation)
      action.append(action_step.action)
      policy_info.append(action_step.info)
      reward.append(next_time_step.reward)
      observation = stack_nested_arrays(observation)
      action = stack_nested_arrays(action)
      policy_info = stack_nested_arrays(policy_info)
      reward = stack_nested_arrays(reward)
      traj = trajectory.from_episode(observation, action, policy_info, reward)
      for observer in self.observers:
        observer(traj)
  return time_step, policy_state

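# A minimal usage sketch for the `run` method above. The enclosing class is
# not shown here, so `EpisodeDriver`, its constructor signature
# (env, policy, observers, max_episodes), and the `env`/`policy` objects are
# assumed names for illustration; only run(time_step, policy_state) and its
# (final time_step, final policy_state) return value come from the method
# itself.
collected = []  # each observer is called with one Trajectory per episode
driver = EpisodeDriver(env, policy, observers=[collected.append],
                       max_episodes=5)
final_time_step, final_policy_state = driver.run(
    time_step=env.reset(), policy_state=policy.get_initial_state())
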
def testLoss(self):
  agent = MyAgent()
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={'obs': tf.constant([1.0])},
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  test_util.test_loss_and_train_output(
      test=self,
      expect_equal_loss_values=True,
      agent=agent,
      experience=experience,
      extra=extra)

def experience_to_traj(rlt):
  """Converts a raw result array into a Trajectory via from_episode."""
  rlt = np.array(rlt)
  d = rlt[:, 2]          # Observation: column 2.
  v = rlt[:, 3] / 0.001  # Action: column 3, rescaled and cast to int32.
  v = v.astype(np.int32)
  discount = np.ones_like(v) * 0.99
  policy_info = rlt[:, 4]
  reward = -np.abs(d - 1.22)  # Reward: negative absolute deviation from 1.22.
  traj = from_episode(
      observation=d,
      action=v,
      reward=reward,
      discount=discount,
      policy_info=policy_info)
  return traj

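# A minimal usage sketch for `experience_to_traj` above, with made-up rows.
# Each row is read positionally: column 2 is the observation, column 3 the
# pre-scaled action, column 4 the policy info; columns 0-1 are unused here.
sample_rlt = [
    [0.0, 0.0, 1.20, 0.005, 0.1],
    [0.0, 0.0, 1.25, 0.003, 0.2],
]
sample_traj = experience_to_traj(sample_rlt)
# sample_traj.reward is approximately [-0.02, -0.03], i.e.
# -|observation - 1.22| per step.
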
def testTrainIgnoresExtraFields(self):
  agent = MyAgent()
  extra = tf.ones(shape=[3, 4], dtype=tf.float32)
  experience = tf.nest.map_structure(
      lambda x: x[tf.newaxis, ...],
      trajectory.from_episode(
          observation={
              'obs': tf.constant([1.0]),
              'ignored': tf.constant([2.0])
          },
          action=(),
          policy_info=(),
          reward=tf.constant([1.0])))
  loss_info = agent.train(experience, extra=extra)
  reduced_experience = experience._replace(
      observation=copy.copy(experience.observation))
  del reduced_experience.observation['ignored']
  tf.nest.map_structure(
      self.assertAllEqual, (reduced_experience, extra), loss_info.extra)

def testFromEpisodeArray(self):
  observation = np.random.rand(4, 5)
  action = ()
  policy_info = ()
  reward = np.random.rand(4)
  traj = trajectory.from_episode(
      observation, action, policy_info, reward, discount=None)
  self.assertFalse(tf.is_tensor(traj.step_type))
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST
  self.assertAllEqual(traj.step_type, [first, mid, mid, mid])
  self.assertAllEqual(traj.next_step_type, [mid, mid, mid, last])
  self.assertAllEqual(traj.observation, observation)
  self.assertAllEqual(traj.reward, reward)
  self.assertAllEqual(traj.discount, [1.0, 1.0, 1.0, 1.0])

def testFromEpisodeTensor(self):
  observation = tf.random.uniform((4, 5))
  action = ()
  policy_info = ()
  reward = tf.random.uniform((4,))
  traj = trajectory.from_episode(
      observation, action, policy_info, reward, discount=None)
  self.assertTrue(tf.is_tensor(traj.step_type))
  traj_val, obs_val, reward_val = self.evaluate((traj, observation, reward))
  first = ts.StepType.FIRST
  mid = ts.StepType.MID
  last = ts.StepType.LAST
  self.assertAllEqual(traj_val.step_type, [first, mid, mid, mid])
  self.assertAllEqual(traj_val.next_step_type, [mid, mid, mid, last])
  self.assertAllEqual(traj_val.observation, obs_val)
  self.assertAllEqual(traj_val.reward, reward_val)
  self.assertAllEqual(traj_val.discount, [1.0, 1.0, 1.0, 1.0])

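# A standalone sketch of the from_episode behavior the tests above verify,
# assuming the standard TF-Agents import paths: a length-T episode yields
# step_type FIRST, MID, ..., MID, next_step_type MID, ..., MID, LAST, and an
# all-ones discount when discount=None.
import numpy as np
from tf_agents.trajectories import trajectory

episode = trajectory.from_episode(
    observation=np.zeros((4, 5)),
    action=(),
    policy_info=(),
    reward=np.zeros(4),
    discount=None)
print(episode.step_type)       # [FIRST, MID, MID, MID]
print(episode.next_step_type)  # [MID, MID, MID, LAST]
print(episode.discount)        # [1., 1., 1., 1.]
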
def _parser_fn(serialized_proto):
  """Helper function that is returned by `create_parser_fn`."""
  # We copy through all context features at each frame, so even though we
  # know they don't change from frame to frame, they are still sequence
  # features and stored in the feature list.
  context_features = {}
  # pylint: disable=g-complex-comprehension
  sequence_features = dict(
      (tensor_spec.name,
       tf.io.FixedLenSequenceFeature(
           shape=tensor_spec.shape, dtype=tensor_spec.dtype))
      for tensor_spec in time_step_spec.observation.values())
  sequence_features[action_spec.name] = tf.io.FixedLenSequenceFeature(
      shape=action_spec.shape, dtype=action_spec.dtype)
  sequence_features[
      time_step_spec.reward.name] = tf.io.FixedLenSequenceFeature(
          shape=time_step_spec.reward.shape,
          dtype=time_step_spec.reward.dtype)
  sequence_features.update(
      _get_policy_info_parsing_dict(agent_name, action_spec))
  # pylint: enable=g-complex-comprehension

  with tf.name_scope('parse'):
    _, parsed_sequence = tf.io.parse_single_sequence_example(
        serialized_proto,
        context_features=context_features,
        sequence_features=sequence_features)
    # TODO(yundi): make the transformed reward configurable.
    action = parsed_sequence[action_spec.name]
    reward = tf.cast(parsed_sequence[time_step_spec.reward.name], tf.float32)

    policy_info = _process_parsed_sequence_and_get_policy_info(
        parsed_sequence, agent_name, action_spec)

    del parsed_sequence[time_step_spec.reward.name]
    del parsed_sequence[action_spec.name]
    full_trajectory = trajectory.from_episode(
        observation=parsed_sequence,
        action=action,
        policy_info=policy_info,
        reward=reward)
    return full_trajectory
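
# A hedged usage sketch: assuming the enclosing factory is
# `create_parser_fn(agent_name, time_step_spec, action_spec)` (per the
# docstring above) and that the records are serialized SequenceExample protos
# stored in TFRecord files, the returned parser would typically be mapped
# over a tf.data pipeline. The file path is illustrative only.
parser_fn = create_parser_fn(agent_name, time_step_spec, action_spec)
dataset = (
    tf.data.TFRecordDataset(['/path/to/sequence_examples.tfrecord'])
    .map(parser_fn, num_parallel_calls=tf.data.AUTOTUNE))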