def _get_initial_and_final_steps(batch_size, context_dim):
  observation = np.array(range(batch_size * context_dim)).reshape(
      [batch_size, context_dim])
  reward = np.random.uniform(0.0, 1.0, [batch_size])
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(
          observation, dtype=tf.float32, shape=[batch_size, context_dim],
          name='observation'))
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(reward, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(
          observation + 100.0, dtype=tf.float32,
          shape=[batch_size, context_dim], name='observation'))
  return initial_step, final_step
def _get_initial_and_final_steps_nested_rewards(observations, rewards):
  batch_size = tf.nest.flatten(observations)[0].shape[0]
  if isinstance(observations, np.ndarray):
    observations = tf.constant(
        observations, dtype=tf.float32, name='observation')
  zero_rewards = {
      'reward': tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      'constraint': tf.constant(0.0, dtype=tf.float32, shape=[batch_size])
  }
  initial_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      zero_rewards,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      observations)
  rewards_nest = tf.nest.map_structure(
      lambda t: tf.convert_to_tensor(t, dtype=tf.float32), rewards)
  final_step = ts.TimeStep(
      tf.constant(
          ts.StepType.LAST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      rewards_nest,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      observations)
  return initial_step, final_step
def _get_initial_and_final_steps_action_mask_nested_rewards(
    observations, rewards):
  batch_size = tf.nest.flatten(observations)[0].shape[0]
  zero_rewards = {
      'reward': tf.constant(0.0, dtype=tf.float32, shape=[batch_size]),
      'constraint': tf.constant(0.0, dtype=tf.float32, shape=[batch_size])
  }
  initial_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      zero_rewards,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      (observations[0], observations[1]))
  rewards_nest = tf.nest.map_structure(
      lambda t: tf.convert_to_tensor(t, dtype=tf.float32), rewards)
  final_step = ts.TimeStep(
      tf.constant(
          ts.StepType.LAST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      rewards_nest,
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      (tf.nest.map_structure(lambda x: x + 100., observations[0]),
       observations[1]))
  return initial_step, final_step
def _get_initial_and_final_steps_with_action_mask(batch_size,
                                                  context_dim,
                                                  num_actions=None):
  observation = np.array(range(batch_size * context_dim)).reshape(
      [batch_size, context_dim])
  observation = tf.constant(observation, dtype=tf.float32)
  mask = 1 - tf.eye(batch_size, num_columns=num_actions, dtype=tf.int32)
  reward = np.random.uniform(0.0, 1.0, [batch_size])
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      (observation, mask))
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(reward, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      (observation + 100.0, mask))
  return initial_step, final_step
def to_transition(trajectory: Trajectory,
                  next_trajectory: Optional[Trajectory] = None) -> Transition:
  """Creates a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, the tensors of `trajectory`
  are sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:, :-1]
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  next_time_steps.step_type = trajectory.next_step_type[:, :-1]
  next_time_steps.reward = trajectory.reward[:, :-1]
  next_time_steps.discount = trajectory.discount[:, :-1]
  ```

  Note that the `reward` and `discount` of `time_steps` cannot be deduced, so
  they are filled with zeros.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in `Trajectory` must
      have shape `[B, T, ...]` when `next_trajectory` is `None`. `discount` is
      assumed to be a scalar float; hence the shape of `trajectory.discount`
      must be `[B, T]`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`. The `reward` and
    `discount` fields of `time_steps` are filled with zeros because they
    cannot be deduced (please do not use them).

  Raises:
    ValueError: If the rank of `discount` is not within the range [1, 2].
  """
  _validate_rank(trajectory.discount, min_rank=1, max_rank=2)
  if next_trajectory is not None:
    _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

  if next_trajectory is None:
    next_trajectory = tf.nest.map_structure(
        lambda t: composite.slice_from(t, axis=1, start=1), trajectory)
    trajectory = tf.nest.map_structure(
        lambda t: composite.slice_to(t, axis=1, end=-1), trajectory)
  policy_steps = policy_step.PolicyStep(
      action=trajectory.action, state=(), info=trajectory.policy_info)
  # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
  time_steps = ts.TimeStep(
      trajectory.step_type,
      reward=tf.nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
      discount=tf.zeros_like(trajectory.discount),  # unknown
      observation=trajectory.observation)
  next_time_steps = ts.TimeStep(
      step_type=trajectory.next_step_type,
      reward=trajectory.reward,
      discount=trajectory.discount,
      observation=next_trajectory.observation)
  return Transition(time_steps, policy_steps, next_time_steps)
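# A minimal usage sketch (not part of the library source): it builds a toy
# batched `Trajectory` with shape [B=1, T=3, ...] and converts it with
# `to_transition`; the resulting tensors have time dimension T - 1 = 2. The
# helper name `_example_to_transition_usage` is hypothetical.
def _example_to_transition_usage():
  step_type = tf.constant(
      [[ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST]],
      dtype=tf.int32)
  next_step_type = tf.constant(
      [[ts.StepType.MID, ts.StepType.LAST, ts.StepType.FIRST]],
      dtype=tf.int32)
  observation = tf.constant([[[1.0], [2.0], [3.0]]], dtype=tf.float32)
  action = tf.constant([[[0.0], [1.0], [0.0]]], dtype=tf.float32)
  reward = tf.constant([[0.0, 1.0, 2.0]], dtype=tf.float32)
  discount = tf.constant([[1.0, 1.0, 0.0]], dtype=tf.float32)
  traj = Trajectory(step_type, observation, action, (), next_step_type,
                    reward, discount)
  time_steps, policy_steps, next_time_steps = to_transition(traj)
  # time_steps.observation is [[[1.], [2.]]]; next_time_steps.observation is
  # [[[2.], [3.]]]; time_steps.reward is a zero-filled placeholder.
  return time_steps, policy_steps, next_time_steps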
def testTrainPerArmAgentVariableActions(self):
  num_actions = 5
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, num_actions, add_num_actions_feature=True)
  time_step_spec = time_step.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  encoding_dim = 10
  encoder = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2), encoding_dim))
  agent = neural_linucb_agent.NeuralLinUCBAgent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      encoding_network=encoder,
      encoding_network_num_train_steps=10,
      encoding_dim=encoding_dim,
      accepts_per_arm_features=True,
      optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.001))
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(30), shape=[2, 5, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.constant([3, 4], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[2],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[2], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[2],
          name='step_type'),
      tf.constant(rewards, dtype=tf.float32, name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[2], name='discount'),
      observations)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  loss_info, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  loss_value = self.evaluate(loss_info)
  self.assertGreater(loss_value, 0.0)
def __call__(self, value: typing.Any) -> trajectory.Transition:
  """Converts `value` to an N-step Transition; validates data & prunes.

  - If `value` is already a `Transition`, only validation is performed.
  - If `value` is a `Trajectory` whose tensors contain a time dimension with
    `T != n + 1`, a `ValueError` is raised.

  Args:
    value: A `Trajectory` or `Transition` object to convert.

  Returns:
    A validated and pruned `Transition`. If `squeeze_time_dim = True`, the
    resulting `Transition` has tensors with shape `[B, ...]`. Otherwise, the
    tensors will have shape `[B, T - 1, ...]`.

  Raises:
    TypeError: If `value` is not one of `Trajectory` or `Transition`.
    ValueError or TypeError: If the structure of `value` doesn't match the
      converter's spec.
    ValueError: If `n != None` and `value` is a `Trajectory` with a time
      dimension having value other than `T = n + 1`.
  """
  if _is_transition_like(value):
    value = _as_tfa_transition(value)
  elif _is_trajectory_like(value):
    required_sequence_length = 1 if self._squeeze_time_dim else None
    _validate_trajectory(
        value,
        self._data_context.trajectory_spec,
        sequence_length=required_sequence_length)
    if self._squeeze_time_dim:
      value = tf.nest.map_structure(lambda e: tf.squeeze(e, axis=1), value)
    policy_steps = policy_step.PolicyStep(
        action=value.action, state=(), info=value.policy_info)
    # TODO(b/130244652): Consider replacing 0 rewards & discounts with ().
    time_steps = ts.TimeStep(
        value.step_type,
        reward=tf.nest.map_structure(tf.zeros_like, value.reward),  # unknown
        discount=tf.zeros_like(value.discount),  # unknown
        observation=value.observation)
    next_time_steps = ts.TimeStep(
        step_type=value.next_step_type,
        reward=value.reward,
        discount=value.discount,
        observation=tf.zeros_like(value.discount))
    value = trajectory.Transition(time_steps, policy_steps, next_time_steps)
  else:
    raise TypeError('Input type not supported: {}'.format(value))

  num_outer_dims = 1 if self._squeeze_time_dim else 2
  _validate_transition(
      value, self._data_context.transition_spec, num_outer_dims)
  value = nest_utils.prune_extra_keys(
      self._data_context.transition_spec, value)
  return value
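# A minimal usage sketch (an assumption, not library code): `converter` is an
# instance of the class defining `__call__` above, and `traj` is a
# `Trajectory` that matches the converter's trajectory spec, with a time
# dimension of length 1 when `squeeze_time_dim=True` (per the validation
# above). Both argument names are hypothetical.
def _example_converter_call(converter, traj):
  transition = converter(traj)  # Invokes `__call__` above.
  # When `squeeze_time_dim=True` the time dimension is squeezed away, so e.g.
  # `transition.next_time_step.reward` has shape [B].
  return transition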
def _get_mock_env_episode(self):
  mock_env = mock.MagicMock()
  mock_env.step.side_effect = [
      ts.TimeStep(ts.StepType.FIRST, 2, 1, [0]),
      ts.TimeStep(ts.StepType.MID, 3, 1, [1]),
      ts.TimeStep(ts.StepType.MID, 5, 1, [2]),
      ts.TimeStep(ts.StepType.LAST, 7, 1, [3]),
  ]
  return mock_env
def _get_mock_env_step(self):
  mock_env = mock.MagicMock()
  mock_env.observation_spec.side_effect = [
      array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
      array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
      array_spec.BoundedArraySpec((3,), np.int32, -10, 10),
  ]
  mock_env.reset.side_effect = [ts.TimeStep(ts.StepType.MID, 5, 1, [3, 5, 2])]
  mock_env.step.side_effect = [ts.TimeStep(ts.StepType.MID, 5, 1, [1, 2, 3])]
  return mock_env
def _get_initial_and_final_steps_with_per_arm_features(batch_size,
                                                       global_context_dim,
                                                       num_actions,
                                                       arm_context_dim):
  global_observation = np.array(
      range(batch_size * global_context_dim)).reshape(
          [batch_size, global_context_dim])
  arm_observation = np.array(
      range(batch_size * num_actions * arm_context_dim)).reshape(
          [batch_size, num_actions, arm_context_dim])
  reward = np.random.uniform(0.0, 1.0, [batch_size])
  initial_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      {
          'global':
              tf.constant(
                  global_observation,
                  dtype=tf.float32,
                  shape=[batch_size, global_context_dim],
                  name='global_observation'),
          'per_arm':
              tf.constant(
                  arm_observation,
                  dtype=tf.float32,
                  shape=[batch_size, num_actions, arm_context_dim],
                  name='arm_observation')
      })
  final_step = time_step.TimeStep(
      tf.constant(
          time_step.StepType.LAST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(reward, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      {
          'global':
              tf.constant(
                  global_observation + 100.0,
                  dtype=tf.float32,
                  shape=[batch_size, global_context_dim],
                  name='global_observation'),
          # Use the same 'per_arm' key as in `initial_step` so the observation
          # structures of the two steps match.
          'per_arm':
              tf.constant(
                  arm_observation + 100.0,
                  dtype=tf.float32,
                  shape=[batch_size, num_actions, arm_context_dim],
                  name='arm_observation')
      })
  return initial_step, final_step
def _get_mock_env_episode(self):
  mock_env = mock.MagicMock()
  mock_env.step.side_effect = [
      # In practice, the first reward would be 0, but test with a reward of 1.
      ts.TimeStep(ts.StepType.FIRST, 1, 1, [0]),
      ts.TimeStep(ts.StepType.MID, 2, 1, [1]),
      ts.TimeStep(ts.StepType.MID, 3, 1, [2]),
      ts.TimeStep(ts.StepType.MID, 5, 1, [3]),
      ts.TimeStep(ts.StepType.LAST, 7, 1, [4]),
  ]
  return mock_env
def testMixturePolicyDynamicBatchSize(self):
  context_dim = 35
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(), dtype=tf.int32, minimum=0, maximum=9, name='action')
  sub_policies = [
      ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
  ]
  weights = [0, 0, 0.2, 0, 0, 0.3, 0, 0, 0.5, 0]
  dist = tfd.Categorical(probs=weights)
  policy = mixture_policy.MixturePolicy(dist, sub_policies)
  batch_size = tf.random.uniform(
      shape=(), minval=10, maxval=15, dtype=tf.int32)
  time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  action_step = policy.action(time_step)
  actions, bsize = self.evaluate([action_step.action, batch_size])
  self.assertAllEqual(actions.shape, [bsize])
  self.assertAllInSet(actions, [2, 5, 8])
  saver = policy_saver.PolicySaver(policy)
  location = os.path.join(self.get_temp_dir(), 'saved_policy')
  saver.save(location)
  loaded_policy = tf.compat.v2.saved_model.load(location)
  new_batch_size = 3
  new_time_step = ts.TimeStep(
      tf.fill(
          tf.expand_dims(new_batch_size, axis=0),
          ts.StepType.FIRST,
          name='step_type'),
      tf.zeros(shape=[new_batch_size], dtype=tf.float32, name='reward'),
      tf.ones(shape=[new_batch_size], dtype=tf.float32, name='discount'),
      tf.reshape(
          tf.range(
              tf.cast(new_batch_size * context_dim, dtype=tf.float32),
              dtype=tf.float32),
          shape=[-1, context_dim],
          name='observation'))
  new_action = self.evaluate(loaded_policy.action(new_time_step).action)
  self.assertAllEqual(new_action.shape, [new_batch_size])
  self.assertAllInSet(new_action, [2, 5, 8])
def testMixturePolicyNegativeProb(self):
  context_dim = 11
  observation_spec = tensor_spec.TensorSpec([context_dim], tf.float32)
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = tensor_spec.BoundedTensorSpec(
      shape=(), dtype=tf.int32, minimum=0, maximum=9, name='action')
  sub_policies = [
      ConstantPolicy(action_spec, time_step_spec, i) for i in range(10)
  ]
  weights = [0, 0, 0.2, 0, 0, -0.3, 0, 0, 0.5, 0]
  policy = mixture_policy.MixturePolicy(weights, sub_policies)
  batch_size = 15
  time_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(
          list(range(batch_size * context_dim)),
          dtype=tf.float32,
          shape=[batch_size, context_dim],
          name='observation'))
  with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                               'Negative probability'):
    policy.action(time_step)
def _set_names_and_shapes(self, step_type, reward, discount,
                          *flat_observations):
  """Returns a `TimeStep` namedtuple."""
  step_type = tf.identity(step_type, name='step_type')
  reward = tf.identity(reward, name='reward')
  discount = tf.identity(discount, name='discount')
  batch_shape = () if not self.batched else (self.batch_size,)
  batch_shape = tf.TensorShape(batch_shape)
  if not tf.executing_eagerly():
    # Shapes are not required in eager mode.
    reward.set_shape(batch_shape)
    step_type.set_shape(batch_shape)
    discount.set_shape(batch_shape)
  # Give each tensor a meaningful name and set the static shape.
  named_observations = []
  for obs, spec in zip(flat_observations,
                       tf.nest.flatten(self.observation_spec())):
    named_observation = tf.identity(obs, name=spec.name)
    if not tf.executing_eagerly():
      named_observation.set_shape(batch_shape.concatenate(spec.shape))
    named_observations.append(named_observation)
  observations = tf.nest.pack_sequence_as(self.observation_spec(),
                                          named_observations)
  return ts.TimeStep(step_type, reward, discount, observations)
def _reset(self):
  """Starts a new sequence and returns the first `TimeStep`."""
  time_step = self._env.reset()
  observations = time_step.observation

  # Initial frame stacking.
  for _ in range(self.stack_size):
    self._frames.append(observations['pixels'])
  observations['pixels'] = np.concatenate(self._frames, axis=2)

  # Initial action stacking.
  if self.actions_in_obs:
    for _ in range(self.stack_size - 1):
      self._actions.append(
          np.zeros(self._env.action_spec().shape, dtype=np.float32))
    observations['actions'] = np.stack(self._actions)

  # Initial reward stacking.
  if self.rewards_in_obs:
    for _ in range(self.stack_size):
      self._rewards.append(np.array(0.0, dtype=np.float32))
    observations['rewards'] = np.stack(self._rewards)

  return ts.TimeStep(time_step.step_type, time_step.reward,
                     time_step.discount, observations)
def updated_sample(sample: Any, reward_shift: float,
                   action_clipping: Optional[Tuple[float, float]],
                   use_trajectories: bool):
  """Create a sample with reward_shift and action_clipping."""

  def _clip_actions(actions):
    return tf.clip_by_value(
        actions,
        clip_value_min=action_clipping[0],
        clip_value_max=action_clipping[1])

  if use_trajectories:
    # Update trajectory.
    shifted_reward = sample.reward + reward_shift
    if action_clipping:
      return sample._replace(
          action=tf.nest.map_structure(_clip_actions, sample.action),
          reward=shifted_reward)
    else:
      return sample._replace(reward=shifted_reward)
  else:
    # Update transition.
    next_time_step = sample.next_time_step
    next_time_step = ts.TimeStep(
        step_type=next_time_step.step_type,
        reward=next_time_step.reward + reward_shift,
        discount=next_time_step.discount,
        observation=next_time_step.observation)
    action_step = sample.action_step
    if action_clipping:
      action_step = action_step._replace(
          action=tf.nest.map_structure(_clip_actions, action_step.action))
    return trajectory.Transition(
        time_step=sample.time_step,
        action_step=action_step,
        next_time_step=next_time_step)
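# A minimal usage sketch (an assumption, not library code) showing
# `updated_sample` applied to a hand-built `Transition` with a reward shift of
# -1.0 and action clipping to [-0.5, 0.5]. It assumes `policy_step` is
# imported from `tf_agents.trajectories`; the helper name is hypothetical.
def _example_updated_sample():
  step = ts.TimeStep(
      step_type=tf.constant([1], dtype=tf.int32),
      reward=tf.constant([2.0]),
      discount=tf.constant([1.0]),
      observation=tf.constant([[0.0, 0.0]]))
  action_step = policy_step.PolicyStep(
      action=tf.constant([[0.9, -0.7]]), state=(), info=())
  sample = trajectory.Transition(
      time_step=step, action_step=action_step, next_time_step=step)
  # The result has next_time_step.reward == [1.0] and the action clipped to
  # [[0.5, -0.5]]; the incoming time_step is passed through unchanged.
  return updated_sample(
      sample,
      reward_shift=-1.0,
      action_clipping=(-0.5, 0.5),
      use_trajectories=False)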
def testWithAdvantageFn(self, with_value_network):
  advantage_fn = mock.Mock(side_effect=lambda returns, _: returns)
  value_network = (
      DummyValueNet(self._obs_spec) if with_value_network else None)
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=False),
      value_network=value_network,
      advantage_fn=advantage_fn,
      optimizer=None,
  )
  step_type = tf.constant([[
      ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
      ts.StepType.LAST
  ]])
  reward = tf.constant([[0, 0, 0, 0]], dtype=tf.float32)
  discount = tf.constant([[1, 1, 1, 1]], dtype=tf.float32)
  observations = tf.constant(
      [[[1, 2], [1, 2], [1, 2], [1, 2]]], dtype=tf.float32)
  time_steps = ts.TimeStep(step_type, reward, discount, observations)
  actions = tf.constant([[[0], [1], [2], [3]]], dtype=tf.float32)
  agent.total_loss(time_steps, actions, time_steps.reward, None)
  advantage_fn.assert_called_once()
def testPolicyGradientLossMultipleEpisodes(self):
  agent = reinforce_agent.ReinforceAgent(
      self._time_step_spec,
      self._action_spec,
      actor_network=DummyActorNet(
          self._obs_spec, self._action_spec, unbounded_actions=True),
      optimizer=None,
  )
  step_type = tf.constant([
      ts.StepType.FIRST, ts.StepType.LAST, ts.StepType.FIRST,
      ts.StepType.LAST
  ])
  reward = tf.constant([0, 0, 0, 0], dtype=tf.float32)
  discount = tf.constant([1, 1, 1, 1], dtype=tf.float32)
  observations = tf.constant(
      [[1, 2], [1, 2], [1, 2], [1, 2]], dtype=tf.float32)
  time_steps = ts.TimeStep(step_type, reward, discount, observations)
  actions = tf.constant([[0], [1], [2], [3]], dtype=tf.float32)
  actions_distribution = agent.collect_policy.distribution(time_steps).action
  returns = tf.constant([1.9, 1.9, 1.0, 1.0], dtype=tf.float32)
  expected_loss = 5.140229225158691
  loss = agent.policy_gradient_loss(
      actions_distribution, actions, time_steps.is_last(), returns, 2)
  self.evaluate(tf.compat.v1.global_variables_initializer())
  loss_ = self.evaluate(loss)
  self.assertAllClose(loss_, expected_loss)
def testObservationShapeMismatch(self, batch_size, exploration_strategy):
  policy = linear_policy.LinearBanditPolicy(self._action_spec, self._a,
                                            self._b,
                                            self._num_samples_per_arm,
                                            self._time_step_spec,
                                            exploration_strategy)
  current_time_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(
          np.array(range(batch_size * (self._obs_dim + 1))),
          dtype=tf.float32,
          shape=[batch_size, self._obs_dim + 1],
          name='observation'))
  with self.assertRaisesRegexp(
      ValueError, r'Observation shape is expected to be \[None, 2\].'
      r' Got \[%d, 3\].' % batch_size):
    policy.action(current_time_step)
def _per_arm_time_step_batch(self, batch_size):
  return ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      {
          bandit_spec_utils.GLOBAL_FEATURE_KEY:
              tf.constant(
                  np.array(range(batch_size * self._obs_dim)),
                  dtype=tf.float32,
                  shape=[batch_size, self._obs_dim],
                  name='observation'),
          bandit_spec_utils.PER_ARM_FEATURE_KEY:
              tf.constant(
                  np.array(range(batch_size * self._num_actions * 4)),
                  dtype=tf.float32,
                  shape=[batch_size, self._num_actions, 4],
                  name='observation'),
          bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
              tf.ones([batch_size], dtype=tf.int32) * 2
      })
def _create_experience(_):
  observations = tf.constant(
      [
          [[1, 2], [3, 4], [5, 6]],
          [[1, 2], [3, 4], [5, 6]],
      ], dtype=tf.float32)
  mid_time_step_val = ts.StepType.MID.tolist()
  time_steps = ts.TimeStep(
      step_type=tf.constant([[mid_time_step_val] * 3] * 2, dtype=tf.int32),
      reward=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      discount=tf.constant([[1] * 3] * 2, dtype=tf.float32),
      observation=observations)
  actions = tf.constant([[[0], [1], [1]], [[0], [1], [1]]], dtype=tf.float32)
  action_distribution_parameters = {
      'loc': tf.constant([[[0.0]] * 3] * 2, dtype=tf.float32),
      'scale': tf.constant([[[1.0]] * 3] * 2, dtype=tf.float32),
  }
  value_preds = tf.constant([[9., 15., 21.], [9., 15., 21.]],
                            dtype=tf.float32)
  policy_info = {
      'dist_params': action_distribution_parameters,
  }
  policy_info['value_prediction'] = value_preds
  experience = trajectory.Trajectory(time_steps.step_type, observations,
                                     actions, policy_info,
                                     time_steps.step_type, time_steps.reward,
                                     time_steps.discount)
  return agent._preprocess(experience)  # pylint: disable=protected-access
def _step(self, action):
  """Steps the environment."""
  if self.current_time_step().is_last():
    return self.reset()

  total_reward = 0
  for _ in range(self._action_repeat):
    time_step = self._env.step(action)
    if self._frames is not None and self._stack_within_repeat:
      self._frames.append(time_step.observation['pixels'])
    total_reward += time_step.reward
    if time_step.is_first() or time_step.is_last():
      break

  # Only add the last frame of the action repeat if we don't stack within.
  if self._frames is not None and not self._stack_within_repeat:
    self._frames.append(time_step.observation['pixels'])

  total_reward = np.asarray(
      total_reward, dtype=np.asarray(time_step.reward).dtype)

  # Stack frames.
  if self._frames is not None:
    time_step.observation['pixels'] = np.concatenate(self._frames, axis=2)

  return ts.TimeStep(time_step.step_type, total_reward, time_step.discount,
                     time_step.observation)
def testMakeTimestepMaskWithPartialEpisode(self, allow_partial):
  first, mid, last = ts.StepType.FIRST, ts.StepType.MID, ts.StepType.LAST

  next_step_types = tf.constant(
      [[mid, mid, last, first, mid, mid, last, first, mid, mid],
       [mid, mid, last, first, mid, mid, mid, mid, mid, last]])
  zeros = tf.zeros_like(next_step_types)
  next_time_step = ts.TimeStep(next_step_types, zeros, zeros, zeros)

  if not allow_partial:
    # Mask should be 0.0 for transition timesteps (indices 3 and 7) and for
    # all timesteps belonging to the final, incomplete episode.
    expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0],
                     [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
  else:
    # Zeros only between episodes. Incomplete episodes are valid and not
    # zeroed out.
    expected_mask = [[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0],
                     [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]

  timestep_mask = ppo_utils.make_timestep_mask(
      next_time_step, allow_partial_episodes=allow_partial)

  timestep_mask_ = self.evaluate(timestep_mask)
  self.assertAllClose(expected_mask, timestep_mask_)
def test_collect_data_spec_transition(self):
  episode_dict = {
      'states':
          np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]],
                   dtype=np.float32),
      'actions':
          np.array([[1.], [2.], [3.], [4.]], dtype=np.float32),
      'rewards':
          np.array([[0.], [1.], [0.], [1.]], dtype=np.float32),
      'discounts':
          np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32),
      'episode_start_index':
          np.array([0, 2], dtype=np.int32)
  }
  time_step_spec = time_step.TimeStep(
      step_type=ArraySpec(shape=[], dtype=np.int32),
      reward=ArraySpec(shape=[1], dtype=np.float32),
      discount=ArraySpec(shape=[], dtype=np.float32),
      observation=ArraySpec(shape=[2], dtype=np.float32))
  action_spec = policy_step.PolicyStep(
      action=ArraySpec(shape=[1], dtype=np.float32), state=(), info=())
  expected_spec = trajectory.Transition(
      time_step=time_step_spec,
      action_step=action_spec,
      next_time_step=time_step_spec)
  actual_spec = create_collect_data_spec(
      episode_dict, use_trajectories=False)
  self.assertEqual(actual_spec, expected_spec)
def get_single_agent_specs(self, time_step_spec, action_spec):
  """Get single agent version of environment specs to feed to baby agents."""

  def make_single_agent_spec(spec):
    if len(spec.shape) == 1:
      shape = 1
    else:
      shape = spec.shape[1:]
    return tensor_spec.BoundedTensorSpec(
        shape=shape,
        name=spec.name,
        minimum=spec.minimum,
        maximum=spec.maximum,
        dtype=spec.dtype)

  single_obs_spec = tf.nest.map_structure(make_single_agent_spec,
                                          time_step_spec.observation)
  single_reward_spec = tensor_spec.TensorSpec(
      shape=(), dtype=time_step_spec.reward.dtype, name='reward')
  single_time_step_spec = ts.TimeStep(time_step_spec.step_type,
                                      single_reward_spec,
                                      time_step_spec.discount,
                                      single_obs_spec)
  single_action_spec = action_spec[0]
  return single_obs_spec, single_time_step_spec, single_action_spec
def testObservationShapeMismatch(self, batch_size, actions_from_reward_layer):
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      DummyNet(),
      self._encoding_dim,
      get_reward_layer(),
      actions_from_reward_layer=actions_from_reward_layer,
      cov_matrix=self._a,
      data_vector=self._b,
      num_samples=self._num_samples_per_arm,
      epsilon_greedy=0.0,
      time_step_spec=self._time_step_spec)
  current_time_step = ts.TimeStep(
      tf.constant(
          ts.StepType.FIRST, dtype=tf.int32, shape=[batch_size],
          name='step_type'),
      tf.constant(0.0, dtype=tf.float32, shape=[batch_size], name='reward'),
      tf.constant(1.0, dtype=tf.float32, shape=[batch_size], name='discount'),
      tf.constant(
          np.array(range(batch_size * (self._obs_dim + 1))),
          dtype=tf.float32,
          shape=[batch_size, self._obs_dim + 1],
          name='observation'))
  with self.assertRaisesRegexp(
      ValueError, r'Observation shape is expected to be \[None, 2\].'
      r' Got \[%d, 3\].' % batch_size):
    policy.action(current_time_step)
def setUp(self):
  super(PolicySaverTest, self).setUp()
  self._time_step_spec = ts.TimeStep(
      step_type=tensor_spec.BoundedTensorSpec(
          dtype=tf.int32, shape=(), name='st', minimum=0, maximum=2),
      reward=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(), name='reward', minimum=0.0,
          maximum=5.0),
      discount=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(), name='discount', minimum=0.0,
          maximum=1.0),
      observation=tensor_spec.BoundedTensorSpec(
          dtype=tf.float32, shape=(4,), name='obs', minimum=-10.0,
          maximum=10.0))
  self._action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=10, name='act_0')
  self._global_seed = 12345
  tf.compat.v1.set_random_seed(self._global_seed)
def _convert_string_vector_to_action_input(self, example):
  return (ts.TimeStep(
      step_type=tf.cast(
          tf.strings.to_number(example[:, 0], tf.float32), tf.int32),
      reward=tf.strings.to_number(example[:, 1], tf.float32),
      discount=tf.strings.to_number(example[:, 2], tf.float32),
      observation=tf.strings.to_number(example[:, 3:7], tf.float32)), ())
def _pack_and_filter_timestep_observation(self, timestep):
  """Packs and filters observations into a single dimension.

  Args:
    timestep: A `TimeStep` namedtuple containing:
      - step_type: A `StepType` value.
      - reward: Reward at this timestep.
      - discount: A discount in the range [0, 1].
      - observation: A NumPy array, or a nested dict, list or tuple of
        arrays corresponding to `observation_spec()`.

  Returns:
    A new `TimeStep` namedtuple whose observations have been filtered and
    packed into a single dimension.
  """
  # We can't set an attribute on the TimeStep tuple, so we make a copy of
  # the observations.
  observations = timestep.observation
  if self._observations_allowlist is not None:
    observations = self._filter_observations(observations)

  return ts.TimeStep(
      timestep.step_type, timestep.reward, timestep.discount,
      self._flatten_nested_observations(
          observations, is_batched=self._env.batched))
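# A hypothetical sketch (not library code) of the packing behavior above. It
# assumes the wrapped env returns dict observations such as
# {'position': [2], 'velocity': [2]} and that the wrapper was constructed with
# an observations allowlist of ['position']; the helper name is illustrative.
def _example_pack_and_filter(self):
  raw_timestep = self._env.reset()
  packed = self._pack_and_filter_timestep_observation(raw_timestep)
  # `packed.observation` is a single flat array holding only the allowlisted
  # 'position' values; step_type, reward and discount are passed through.
  return packed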
def loop_body(time, time_step, policy_state, output_action_tas,
              output_policy_info_tas):
  """Runs one step in the environment; called repeatedly by the while loop.

  Args:
    time: Step time.
    time_step: Previous step's `TimeStep`.
    policy_state: Policy state tensor or nested structure of tensors.
    output_action_tas: Updated nest of `tf.TensorArray`, the new actions.
    output_policy_info_tas: Updated nest of `tf.TensorArray`, the new policy
      info.

  Returns:
    loop_vars for the next iteration of `tf.while_loop`.
  """
  policy_state, next_output_action_tas, next_output_policy_info_tas = (
      process_step(time, time_step, policy_state, output_action_tas,
                   output_policy_info_tas))

  ta_read = lambda ta: ta.read(time)
  ta_read_prev = lambda ta: ta.read(time - 1)
  time_step = ts.TimeStep(
      step_type=ta_read(trajectory_tas.step_type),
      observation=tf.nest.map_structure(ta_read, trajectory_tas.observation),
      reward=tf.nest.map_structure(ta_read_prev, trajectory_tas.reward),
      discount=ta_read_prev(trajectory_tas.discount))

  return (time + 1, time_step, policy_state, next_output_action_tas,
          next_output_policy_info_tas)
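# A minimal wiring sketch (an assumption, not the library's exact code):
# `loop_body` is meant to be driven by `tf.while_loop`, advancing `time` until
# a bound is reached. `max_time`, the initial loop state, and the TensorArray
# nests are assumed to exist in the enclosing scope; the helper name is
# hypothetical.
def _example_run_loop(max_time, initial_time_step, initial_policy_state,
                      output_action_tas, output_policy_info_tas):
  # Start at time=1 because the body reads reward/discount at `time - 1`.
  loop_vars = (tf.constant(1, dtype=tf.int32), initial_time_step,
               initial_policy_state, output_action_tas,
               output_policy_info_tas)
  cond = lambda time, *_: time < max_time
  return tf.while_loop(cond=cond, body=loop_body, loop_vars=loop_vars)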