def _generate_replay_buffer(self, rb_cls):
  stack_count = 4
  shape = (15, 15, stack_count)
  single_shape = (15, 15, 1)
  observation_spec = array_spec.ArraySpec(shape, np.int32, 'obs')
  time_step_spec = ts.time_step_spec(observation_spec)
  action_spec = policy_step.PolicyStep(
      array_spec.BoundedArraySpec(
          shape=(), dtype=np.int32, minimum=0, maximum=1, name='action'))
  self._trajectory_spec = trajectory.from_transition(
      time_step_spec, action_spec, time_step_spec)

  self._capacity = 32
  self._replay_buffer = rb_cls(
      data_spec=self._trajectory_spec, capacity=self._capacity)

  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(single_shape, k, dtype=np.int32))

  # Add stacks of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - stack_count + 1):
    observation = np.concatenate(single_frames[k:k + stack_count], axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(
                time_steps[k], dummy_action, time_steps[k + 1])))
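# A standalone sketch (illustrative only, not part of the test fixture above)
# of the frame stacking described in the comments: each single frame is filled
# with its own index, and an observation is stack_count consecutive frames
# concatenated along the channel axis, so adjacent observations share frames.
import numpy as np

single_shape, stack_count = (15, 15, 1), 4
frames = [np.full(single_shape, k, dtype=np.int32) for k in range(6)]
obs_0 = np.concatenate(frames[0:stack_count], axis=-1)      # channels hold 0, 1, 2, 3
obs_1 = np.concatenate(frames[1:1 + stack_count], axis=-1)  # channels hold 1, 2, 3, 4
assert obs_0.shape == (15, 15, 4)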
def _action(self, time_step, policy_state):
  if not self._built:
    self._build_from_time_step(time_step)

  batch_size = None
  if time_step.step_type.shape:
    batch_size = time_step.step_type.shape[0]
  if self._batch_size != batch_size:
    raise ValueError(
        'The batch size of time_step is different from the batch size '
        'provided previously. Expected {}, but saw {}.'.format(
            self._batch_size, batch_size))

  if not self._batched:
    # Since policy_state is given in a batched form from the policy and we
    # simply have to send it back we do not need to worry about it. Only
    # update time_step.
    time_step = nest_utils.batch_nested_array(time_step)

  tf.nest.assert_same_structure(self._time_step, time_step)
  feed_dict = {self._time_step: time_step}
  if policy_state is not None:
    # Flatten policy_state to handle specs that are not hashable due to lists.
    for state_ph, state in zip(tf.nest.flatten(self._policy_state),
                               tf.nest.flatten(policy_state)):
      feed_dict[state_ph] = state

  action_step = self.session.run(self._action_step, feed_dict)
  action, state, info = action_step
  if not self._batched:
    action, info = nest_utils.unbatch_nested_array([action, info])
  return policy_step.PolicyStep(action, state, info)
def _action(self, time_step, policy_state, seed):
  del seed

  def _mode(dist, spec):
    action = dist.mode()
    return tf.reshape(action, [-1] + spec.shape.as_list())

  # TODO(oars): Remove batched data checks when tf_env is batched.
  time_step_batched = nest_utils.is_batched_nested_tensors(
      time_step, self._time_step_spec)
  if not time_step_batched:
    time_step = nest_utils.batch_nested_tensors(time_step,
                                                self._time_step_spec)

  distribution_step = self._wrapped_policy.distribution(time_step,
                                                        policy_state)
  actions = nest.map_structure(_mode, distribution_step.action,
                               self._action_spec)

  if not time_step_batched:
    actions = nest_utils.unbatch_nested_tensors(actions, self._action_spec)
  return policy_step.PolicyStep(actions, distribution_step.state,
                                distribution_step.info)
def _distribution(self, time_step, policy_state):
  # In DQN, we always either take a uniformly random action, or the action
  # with the highest Q-value. However, to support more complicated policies,
  # we expose all Q-values as a categorical distribution with Q-values as
  # logits, and apply the GreedyPolicy wrapper in dqn_agent.py to select the
  # action with the highest Q-value.
  q_values, policy_state = self._q_network(time_step.observation,
                                           time_step.step_type, policy_state)
  q_values.shape.assert_has_rank(2)

  # TODO(b/122314058): Validate and enforce that sampling distributions
  # created with the q_network logits generate the right action shapes. This
  # is currently patching the problem.

  # If the action spec says each action should be shaped (1,), add another
  # dimension so the final shape is (B, 1, A), where A is the number of
  # actions. This will make Categorical emit events shaped (B, 1) rather than
  # (B,). Using axis -2 to allow for (B, T, 1, A) shaped q_values.
  if self._action_shape.ndims == 1:
    q_values = tf.expand_dims(q_values, -2)

  # TODO(kbanoop): Handle distributions over nests.
  distribution = tfp.distributions.Categorical(logits=q_values,
                                               dtype=self._action_dtype)
  distribution = tf.nest.pack_sequence_as(self._action_spec, [distribution])
  return policy_step.PolicyStep(distribution, policy_state)
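# A minimal, self-contained sketch (not part of the policy above) of the idea
# described in the comments: exposing Q-values as the logits of a Categorical
# distribution, so that mode() recovers the greedy (argmax) action while
# sample() gives a stochastic action. The q_values tensor below is made up
# purely for illustration.
import tensorflow as tf
import tensorflow_probability as tfp

q_values = tf.constant([[1.0, 3.0, 2.0, 0.5],
                        [0.2, 0.1, 5.0, 1.0]])  # shape (B=2, A=4)
dist = tfp.distributions.Categorical(logits=q_values, dtype=tf.int32)
greedy_actions = dist.mode()     # argmax over Q-values: [1, 2]
sampled_actions = dist.sample()  # samples actions with softmax(Q) probabilities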
def _setup_specs(self):
  self._policy_step_spec = policy_step.PolicyStep(
      action=self._action_spec,
      state=self._policy_state_spec,
      info=self._info_spec)
  self._trajectory_spec = trajectory.from_transition(self._time_step_spec,
                                                     self._policy_step_spec,
                                                     self._time_step_spec)
def _action(self, time_step, policy_state, seed):
  seed_stream = tfd.SeedStream(seed=seed, salt='ou_noise')

  def _create_ou_process(action_spec):
    return common.OUProcess(
        lambda: tf.zeros(action_spec.shape, dtype=action_spec.dtype),
        self._ou_damping,
        self._ou_stddev,
        seed=seed_stream())

  if self._ou_process is None:
    self._ou_process = nest.map_structure(_create_ou_process,
                                          self._action_spec)

  action_step = self._wrapped_policy.action(time_step, policy_state,
                                            seed_stream())

  def _add_ou_noise(action, ou_process, action_spec):
    noisy_action = action + ou_process()
    if self._clip:
      return common.clip_to_spec(noisy_action, action_spec)
    return noisy_action

  actions = nest.map_structure(_add_ou_noise, action_step.action,
                               self._ou_process, self._action_spec)
  return policy_step.PolicyStep(actions, action_step.state, action_step.info)
def _distribution(self, time_step, policy_state):
  q_values, policy_state = self._q_network(
      time_step.observation,
      time_step.step_type,
      policy_state,
  )
  q_values.shape.assert_has_rank(2)
  if self._action_shape.ndims == 1:
    q_values = tf.expand_dims(q_values, -2)

  observation = time_step.observation.numpy()[0]
  amount_now = observation[-3]  # amount that can be sold
  amount_available = observation[-2]  # amount that can be bought
  q_values_np = q_values.numpy()[0]

  # Mask out actions outside [lower_bound, upper_bound) by setting their
  # logits to -inf, so the Categorical distribution never selects them.
  lower_bound = int(500 - amount_now)
  upper_bound = int(amount_available + 1)
  q_values_np[:lower_bound] = -np.inf
  q_values_np[upper_bound:] = -np.inf
  new_q_values = ops.convert_to_tensor(
      np.array([q_values_np], dtype=np.float32))

  distribution = tfp.distributions.Categorical(logits=new_q_values,
                                               dtype=self._action_dtype)
  distribution = tf.nest.pack_sequence_as(self._action_spec, [distribution])
  return policy_step.PolicyStep(distribution, policy_state)
def to_transition(trajectory, next_trajectory=None):
  """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  ```

  Args:
    trajectory: An instance of `Trajectory`.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced.
  """
  if next_trajectory is None:
    next_trajectory = nest.map_structure(lambda x: x[:, 1:], trajectory)
    trajectory = nest.map_structure(lambda x: x[:, :-1], trajectory)
  policy_steps = policy_step.PolicyStep(trajectory.action, (),
                                        trajectory.policy_info)
  # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
  time_steps = ts.TimeStep(
      trajectory.step_type,
      reward=nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
      discount=tf.zeros_like(trajectory.discount),  # unknown
      observation=trajectory.observation)
  next_time_steps = ts.TimeStep(trajectory.next_step_type, trajectory.reward,
                                trajectory.discount,
                                next_trajectory.observation)
  return [time_steps, policy_steps, next_time_steps]
def _fill_replay_buffer(self):
  # Generate N frames: the value of pixels is the frame index.
  # The observations will be generated by stacking K frames out of those N,
  # generating some redundancies between the observations.
  single_frames = []
  frame_count = 100
  for k in range(frame_count):
    single_frames.append(np.full(self._single_shape, k, dtype=np.int32))

  # Add stacks of frames to the replay buffer.
  time_steps = []
  for k in range(len(single_frames) - self._stack_count + 1):
    observation = np.concatenate(single_frames[k:k + self._stack_count],
                                 axis=-1)
    time_steps.append(ts.transition(observation, reward=0.0))

  self._transition_count = len(time_steps) - 1
  dummy_action = policy_step.PolicyStep(np.int32(0))
  for k in range(self._transition_count):
    self._replay_buffer.add_batch(
        nest_utils.batch_nested_array(
            trajectory.from_transition(time_steps[k], dummy_action,
                                       time_steps[k + 1])))
def testCreateWithDefaultInfo(self):
  action = 1
  state = 2
  info = ()
  step = policy_step.PolicyStep(action, state)
  self.assertEqual(step.action, action)
  self.assertEqual(step.state, state)
  self.assertEqual(step.info, info)
def _distribution(self, time_step, policy_state):
  q_values = self._q_network(time_step).q_values
  # TODO(kbanoop): Handle distributions over nests.
  distribution_ = tfp.distributions.Categorical(logits=q_values,
                                                dtype=self._action_dtype)
  distribution_ = nest.pack_sequence_as(self._action_spec, [distribution_])
  return policy_step.PolicyStep(distribution_, policy_state)
def testCreate(self):
  action = 1
  state = 2
  info = 3
  step = policy_step.PolicyStep(action=action, state=state, info=info)
  self.assertEqual(step.action, action)
  self.assertEqual(step.state, state)
  self.assertEqual(step.info, info)
def _distribution(self, time_step, policy_state):
  outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
  action = common.replicate(self._action_value, outer_shape)

  def dist_fn(action):
    """Return a deterministic distribution with all mass on the fixed action."""
    return tfp.distributions.Deterministic(loc=action)

  return policy_step.PolicyStep(nest.map_structure(dist_fn, action),
                                policy_state)
def _distribution(self, time_step, policy_state):

  def dist_fn(dist):
    greedy_action = dist.mode()
    return tfp.distributions.Deterministic(loc=greedy_action)

  distribution_step = self._wrapped_policy.distribution(time_step,
                                                        policy_state)
  return policy_step.PolicyStep(
      tf.nest.map_structure(dist_fn, distribution_step.action),
      distribution_step.state, distribution_step.info)
def _action(self, time_step, policy_state, seed):
  outer_dims = nest_utils.get_outer_shape(time_step, self._time_step_spec)

  action_ = tensor_spec.sample_spec_nest(
      self._action_spec, seed=seed, outer_dims=outer_dims)
  # TODO(b/78181147): Investigate why this control dependency is required.
  if time_step is not None:
    with tf.control_dependencies(nest.flatten(time_step)):
      action_ = nest.map_structure(tf.identity, action_)
  return policy_step.PolicyStep(action_, policy_state)
def make_replay_buffer(tf_env):
  """Default replay buffer factory."""
  time_step_spec = tf_env.time_step_spec()
  action_step_spec = policy_step.PolicyStep(
      tf_env.action_spec(), (), tensor_spec.TensorSpec((), tf.int32))
  trajectory_spec = trajectory.from_transition(time_step_spec,
                                               action_step_spec,
                                               time_step_spec)
  return tf_uniform_replay_buffer.TFUniformReplayBuffer(trajectory_spec,
                                                        batch_size=1)
def _action(self, time_step, policy_state, seed):
  q_values = self._q_network(time_step).q_values
  q_values.shape.assert_has_rank(2)
  # TODO(kbanoop): Add a test for temperature.
  logits = q_values / self._temperature
  actions = tf.multinomial(logits, num_samples=1, seed=seed)
  actions = tf.reshape(actions, [-1] + self._action_shape.as_list())
  actions = tf.cast(actions, self._action_dtype, name='action')
  actions = nest.pack_sequence_as(self._action_spec, [actions])
  return policy_step.PolicyStep(actions, policy_state)
def _make_ppo_trajectory_spec(self, action_distribution_params_spec):
  # Make policy_step_spec with action_spec, empty tuple for policy_state, and
  # (act_log_prob_spec, value_pred_spec, action_distribution_params_spec) for
  # info.
  policy_step_spec = policy_step.PolicyStep(
      action=self.action_spec(),
      state=self._policy.policy_state_spec(),
      info=action_distribution_params_spec)
  trajectory_spec = trajectory.from_transition(self.time_step_spec(),
                                               policy_step_spec,
                                               self.time_step_spec())
  return trajectory_spec
def _setup_mocks(self):
  self.trainer = train_eval_atari.TrainEval(
      self.get_temp_dir(), 'Pong-v0', terminal_on_life_loss=True)
  self.trainer._env = mock.MagicMock()
  self.trainer._env.envs[0].game_over = False
  self.trainer._replay_buffer = mock.MagicMock()
  self.trainer._collect_policy = mock.MagicMock()
  action_step = policy_step.PolicyStep(action=1)
  self.trainer._collect_policy.action.return_value = action_step
  self.observer = mock.MagicMock()
  self.metric_observers = [self.observer]
def _action(self, time_step, policy_state):
  outer_dims = self._outer_dims
  if outer_dims is None:
    if self.time_step_spec.observation:
      outer_dims = nest_utils.get_outer_array_shape(
          time_step.observation, self.time_step_spec.observation)
    else:
      outer_dims = ()

  random_action = array_spec.sample_spec_nest(
      self._action_spec, self._rng, outer_dims=outer_dims)
  return policy_step.PolicyStep(random_action, policy_state)
def _action(self, time_step, policy_state, seed):
  del seed
  # Reset the policy for batch indices that have restarted an episode.
  policy_state = tf.where(time_step.is_first(), self._initial_policy_state,
                          policy_state)
  # Take actions 1 and 2 alternating.
  action = tf.floormod(policy_state, 2) + 1
  new_policy_state = policy_state + tf.constant(
      1, shape=self._batch_shape, dtype=tf.int32)
  policy_info = action * 2
  return policy_step.PolicyStep(action, new_policy_state, policy_info)
def to_transition(trajectory, next_trajectory=None):
  """Create a transition from a trajectory or two adjacent trajectories.

  **NOTE** If `next_trajectory` is not provided, tensors of `trajectory` are
  sliced along their *second* (`time`) dimension; for example:

  ```
  time_steps.step_type = trajectory.step_type[:, :-1]
  time_steps.observation = trajectory.observation[:, :-1]
  next_time_steps.observation = trajectory.observation[:, 1:]
  next_time_steps.step_type = trajectory.next_step_type[:, :-1]
  next_time_steps.reward = trajectory.reward[:, :-1]
  next_time_steps.discount = trajectory.discount[:, :-1]
  ```

  Notice that reward and discount for time_steps are undefined, therefore
  filled with zero.

  Args:
    trajectory: An instance of `Trajectory`. The tensors in Trajectory must
      have shape `[B, T, ...]` when next_trajectory is None.
    next_trajectory: (optional) An instance of `Trajectory`.

  Returns:
    A tuple `(time_steps, policy_steps, next_time_steps)`.  The `reward` and
    `discount` fields of `time_steps` are filled with zeros because these
    cannot be deduced (please do not use them).
  """
  _validate_rank(trajectory.discount, min_rank=1, max_rank=2)
  if next_trajectory is not None:
    _validate_rank(next_trajectory.discount, min_rank=1, max_rank=2)

  if next_trajectory is None:
    next_trajectory = tf.nest.map_structure(lambda x: x[:, 1:], trajectory)
    trajectory = tf.nest.map_structure(lambda x: x[:, :-1], trajectory)
  policy_steps = policy_step.PolicyStep(
      action=trajectory.action, state=(), info=trajectory.policy_info)
  # TODO(kbanoop): Consider replacing 0 rewards & discounts with ().
  time_steps = ts.TimeStep(
      trajectory.step_type,
      reward=tf.nest.map_structure(tf.zeros_like, trajectory.reward),  # unknown
      discount=tf.zeros_like(trajectory.discount),  # unknown
      observation=trajectory.observation)
  next_time_steps = ts.TimeStep(
      step_type=trajectory.next_step_type,
      reward=trajectory.reward,
      discount=trajectory.discount,
      observation=next_trajectory.observation)
  return [time_steps, policy_steps, next_time_steps]
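# A minimal sketch (illustrative tensors, not a real Trajectory) of the
# [:, :-1] / [:, 1:] time slicing described in the docstring above: from a
# [B, T, ...] tensor, the current steps drop the last time index and the next
# steps drop the first, so step t is paired with step t + 1.
import tensorflow as tf

observation = tf.reshape(tf.range(2 * 5, dtype=tf.float32), [2, 5, 1])  # [B=2, T=5, 1]
time_steps_observation = observation[:, :-1]       # steps 0 .. T-2
next_time_steps_observation = observation[:, 1:]   # steps 1 .. T-1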
def _action(self, time_step, policy_state=()):
  self._count += 1
  # _random_function()'s range should be [0, 1), so if epsilon is 1, we
  # always use the random policy, and if epsilon is 0, we always use the
  # greedy policy since the if condition won't be met.
  if self._random_function() < self._get_epsilon():
    # Avoid mixing policy_state from greedy_policy and random_policy;
    # always return policy_state from greedy_policy.
    action_step = self._random_policy.action(time_step)
    return policy_step.PolicyStep(action_step.action, policy_state)
  else:
    return self._greedy_policy.action(time_step, policy_state=policy_state)
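# A toy, standalone sketch (hypothetical names, plain Python) of the epsilon
# branching described in the comment above: with probability epsilon take a
# uniformly random action, otherwise take the greedy one.
import random

def epsilon_greedy(epsilon, greedy_action, num_actions, rng=random):
  # rng.random() is in [0, 1), so epsilon == 1 always goes random and
  # epsilon == 0 always goes greedy.
  if rng.random() < epsilon:
    return rng.randrange(num_actions)
  return greedy_action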
def _action(self, time_step, policy_state):
  # Reset the policy when starting a new episode.
  is_time_step_first = time_step.is_first()
  if np.isscalar(is_time_step_first):
    if is_time_step_first:
      policy_state = self._initial_policy_state
  else:
    policy_state[is_time_step_first] = self._initial_policy_state[
        is_time_step_first]

  # Take actions 1 and 2 alternating.
  action = (policy_state % 2) + 1
  policy_info = action * 2
  return policy_step.PolicyStep(action, policy_state + 1, policy_info)
def testAction(self):
  py_observation_spec = array_spec.BoundedArraySpec((3,), np.int32, 1, 1)
  py_time_step_spec = ts.time_step_spec(py_observation_spec)
  py_action_spec = array_spec.BoundedArraySpec((7,), np.int32, 1, 1)
  py_policy_state_spec = array_spec.BoundedArraySpec((5,), np.int32, 0, 1)
  py_policy_info_spec = array_spec.BoundedArraySpec((3,), np.int32, 0, 1)

  mock_py_policy = mock.create_autospec(py_policy.Base)
  mock_py_policy.time_step_spec = py_time_step_spec
  mock_py_policy.action_spec = py_action_spec
  mock_py_policy.policy_state_spec = py_policy_state_spec
  mock_py_policy.info_spec = py_policy_info_spec

  expected_py_policy_state = np.ones(py_policy_state_spec.shape,
                                     py_policy_state_spec.dtype)
  expected_py_time_step = tf.nest.map_structure(
      lambda arr_spec: np.ones(arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  expected_py_action = np.ones(py_action_spec.shape, py_action_spec.dtype)
  expected_new_py_policy_state = np.zeros(py_policy_state_spec.shape,
                                          py_policy_state_spec.dtype)
  expected_py_info = np.zeros(py_policy_info_spec.shape,
                              py_policy_info_spec.dtype)

  mock_py_policy.action.return_value = policy_step.PolicyStep(
      expected_py_action, expected_new_py_policy_state, expected_py_info)

  tf_mock_py_policy = tf_py_policy.TFPyPolicy(mock_py_policy)
  time_step = tf.nest.map_structure(
      lambda arr_spec: tf.ones(arr_spec.shape, arr_spec.dtype),
      py_time_step_spec)
  action_step = tf_mock_py_policy.action(
      time_step, tf.ones(py_policy_state_spec.shape, tf.int32))
  py_action_step = self.evaluate(action_step)

  self.assertEqual(1, mock_py_policy.action.call_count)
  np.testing.assert_equal(mock_py_policy.action.call_args[1]['time_step'],
                          expected_py_time_step)
  np.testing.assert_equal(mock_py_policy.action.call_args[1]['policy_state'],
                          expected_py_policy_state)
  np.testing.assert_equal(py_action_step.action, expected_py_action)
  np.testing.assert_equal(py_action_step.state, expected_new_py_policy_state)
  np.testing.assert_equal(py_action_step.info, expected_py_info)
def test_get_distribution_class_spec(self):
  ones = tf.ones(shape=[2], dtype=tf.float32)
  obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
  mock_policy.distribution.return_value = policy_step.PolicyStep(
      (tfp.distributions.Categorical(logits=ones),
       tfp.distributions.Normal(ones, ones)), None)

  class_spec = ppo_utils.get_distribution_class_spec(mock_policy,
                                                     time_step_spec)
  self.assertAllEqual(
      (tfp.distributions.Categorical, tfp.distributions.Normal), class_spec)
def _action(self, time_step, policy_state, seed):
  distribution_step = self.distribution(time_step, policy_state)

  def _sample(dist, action_spec):
    action = dist.sample(seed=seed)
    if self._clip:
      return common.clip_to_spec(action, action_spec)
    return action

  actions = nest.map_structure(_sample, distribution_step.action,
                               self._action_spec)
  return policy_step.PolicyStep(actions, distribution_step.state,
                                distribution_step.info)
def _distribution(self, time_step, policy_state):
  # Actor network outputs nested structure of distributions or actions.
  actions_or_distributions, policy_state = self._apply_actor_network(
      time_step, policy_state)

  def _to_distribution(action_or_distribution):
    if isinstance(action_or_distribution, tf.Tensor):
      # This is an action tensor, so wrap it in a deterministic distribution.
      return tfp.distributions.Deterministic(loc=action_or_distribution)
    return action_or_distribution

  distributions = tf.nest.map_structure(_to_distribution,
                                        actions_or_distributions)
  return policy_step.PolicyStep(distributions, policy_state)
def _action(self, time_step, policy_state, seed):
  seed_stream = tfd.SeedStream(seed=seed, salt='ppo_policy')

  def _sample(dist, action_spec):
    action = dist.sample(seed=seed_stream())
    if self._clip:
      return common_utils.clip_to_spec(action, action_spec)
    return action

  distribution_step = self.distribution(time_step, policy_state)
  actions = nest.map_structure(_sample, distribution_step.action,
                               self._action_spec)
  return policy_step.PolicyStep(actions, distribution_step.state,
                                distribution_step.info)
def test_get_distribution_params_spec(self):
  ones = tf.ones(shape=[1, 2], dtype=tf.float32)
  obs_spec = tensor_spec.TensorSpec(shape=[5], dtype=tf.float32)
  time_step_spec = ts.time_step_spec(obs_spec)
  mock_policy = mock.create_autospec(actor_policy.ActorPolicy)
  mock_policy._distribution.return_value = policy_step.PolicyStep(
      (tfp.distributions.Categorical(logits=ones),
       tfp.distributions.Normal(ones, ones)))

  params_spec = ppo_utils.get_distribution_params_spec(mock_policy,
                                                       time_step_spec)
  self.assertAllEqual([set(['logits']), set(['loc', 'scale'])],
                      [set(d.keys()) for d in params_spec])
  self.assertAllEqual([[[2]], [[2], [2]]],
                      [[d[k].shape for k in d] for d in params_spec])