def wrap(self, input_action):
        """
        Args:
            input_action (dict): nested tensor action produced by the neural
                                 net. Dictionary keys are those marked True
                                 in 'to_learn'.

        Returns:
            actions (dict): nested tensor action which includes all action
                            components expected by the GKP class.

        """
        # position within the script, which repeats with period 'period'
        i = self._env._elapsed_steps % self.period
        out_shape = nest_utils.get_outer_shape(input_action, self._action_spec)

        action = {}
        for a in self.to_learn.keys():
            C1 = self.use_mask and self.mask[a][i]==0
            C2 = not self.to_learn[a]
            if C1 or C2: # if not learning: replicate scripted action
                action[a] = common.replicate(self.script[a][i], out_shape)
            else: # if learning: rescale input tensor
                action[a] = input_action[a]*self.scale[a]
                if self.learn_residuals:
                    action[a] += common.replicate(self.script[a][i], out_shape)
                
        return action
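For context, here is a minimal stand-alone sketch of the branching above for a single, hypothetical action component 'alpha' at a fixed script position; the names and values are illustrative only, and the real method operates on nested action dictionaries via common.replicate.

import tensorflow as tf

# Hypothetical scalar scripted value and network output for one component.
batch_size = 4
scripted_alpha = tf.constant(0.5)            # stands in for self.script['alpha'][i]
net_alpha = tf.random.normal([batch_size])   # stands in for input_action['alpha']
scale, to_learn, learn_residuals = 2.0, True, True

if not to_learn:
    # not learning: replicate the scripted value across the batch
    alpha = tf.fill([batch_size], scripted_alpha)
else:
    # learning: rescale the network output, optionally adding the scripted residual
    alpha = net_alpha * scale
    if learn_residuals:
        alpha += tf.fill([batch_size], scripted_alpha)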
Example #2
def test_trajectory_optimiser_pathological_trajectories(
        action_space, horizon, batch_size):
    """
    The replay buffer is a FIFO buffer of fixed capacity. Ensure that the capacity is sufficient
    such that the initial observation is still present in the buffer even in the pathological case
    where all trajectories are of length 2.
    """

    # construct the environment model
    observations = list(
        chain.from_iterable(
            repeat(
                [
                    replicate(tf.constant(StepType.FIRST), [batch_size]),
                    replicate(tf.constant(StepType.LAST), [batch_size]),
                ],
                horizon,
            )))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)
    policy = RandomTFPolicy(time_step_space, action_space)
    stub_policy_state_updater = StubPolicyStateUpdater()
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon,
        population_size=batch_size,
        max_iterations=1,
        policy_state_updater=stub_policy_state_updater,
    )

    time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST), axis=0),
                        batch_size=1)

    trajectory_optimiser.optimise(time_step, environment_model)

    stored_trajectory = stub_policy_state_updater.step_types[0]
    assert stored_trajectory[0][0] == StepType.FIRST
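To make the "pathological trajectories" concrete: the schedule built above is `horizon` back-to-back length-2 trajectories, each batched to `batch_size` rows. A small sketch with assumed concrete values (in TF-Agents, StepType.FIRST is 0 and StepType.LAST is 2):

from itertools import chain, repeat

horizon, batch_size = 3, 2  # hypothetical values for illustration
FIRST, LAST = 0, 2          # integer values of StepType.FIRST / StepType.LAST

step_types = list(chain.from_iterable(
    repeat([[FIRST] * batch_size, [LAST] * batch_size], horizon)))
print(step_types)
# [[0, 0], [2, 2], [0, 0], [2, 2], [0, 0], [2, 2]]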
Example #3
    def _distribution(self, time_step, policy_state):
        """Implementation of `distribution`. Returns a `Categorical` distribution.

        The returned `Categorical` distribution has (unnormalized) probabilities
        `exp(inverse_temperature * weights)`.

        Args:
          time_step: A `TimeStep` tuple corresponding to `time_step_spec()`.
          policy_state: Unused in `CategoricalPolicy`. It is simply passed through.

        Returns:
          A `PolicyStep` named tuple containing:
            `action`: An (optionally nested) `tfp.distributions.Distribution`
              capturing the distribution of next actions.
            `state`: A policy state tensor for the next call to `distribution`.
            `info`: Optional side information such as action log probabilities.
        """
        outer_shape = nest_utils.get_outer_shape(time_step,
                                                 self._time_step_spec)
        logits = (self._inverse_temperature *
                  common.replicate(self._weights, outer_shape))
        action_distribution = tfd.Independent(
            tfd.Categorical(logits=logits,
                            dtype=tf.nest.flatten(self.action_spec)[0].dtype))
        return policy_step.PolicyStep(action_distribution, policy_state)
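As a rough illustration of the docstring above, the following sketch builds the same kind of Categorical directly from hypothetical weights and an inverse temperature; logits of inverse_temperature * weights correspond to unnormalized probabilities exp(inverse_temperature * weights).

import tensorflow as tf
import tensorflow_probability as tfp

weights = tf.constant([0.1, 2.0, -1.0])   # hypothetical per-action weights
inverse_temperature = 0.5

dist = tfp.distributions.Categorical(logits=inverse_temperature * weights)
samples = dist.sample(5)          # five action indices drawn from {0, 1, 2}
probs = dist.probs_parameter()    # softmax(inverse_temperature * weights)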
Example #4
  def _action(self, time_step, policy_state, seed):
    del seed
    outer_shape = nest_utils.get_outer_shape(time_step,
                                             self._time_step_spec)
    action = tf.nest.map_structure(
        lambda t: common.replicate(t, outer_shape), self._action_value)
    return policy_step.PolicyStep(action, policy_state, self._policy_info)
Example #5
def test_generate_virtual_rollouts(observation_space, action_space, batch_size,
                                   horizon):
    observation = create_uniform_distribution_from_spec(
        observation_space).sample()
    network = DummyEnsembleTransitionNetwork(observation_space)
    model = KerasTransitionModel([network], observation_space, action_space)
    env_model = EnvironmentModel(
        transition_model=model,
        reward_model=ConstantReward(observation_space, action_space, -1.0),
        termination_model=ConstantFalseTermination(observation_space),
        initial_state_distribution_model=DeterministicInitialStateModel(
            observation),
        batch_size=batch_size,
    )
    random_policy = RandomTFPolicy(time_step_spec(observation_space),
                                   action_space)

    replay_buffer, driver, wrapped_env_model = virtual_rollouts_buffer_and_driver(
        env_model, random_policy, horizon)

    driver.run(wrapped_env_model.reset())
    trajectory = replay_buffer.gather_all()

    mid_steps = repeat(1, horizon - 1)
    expected_step_types = tf.constant(list(chain([0], mid_steps, [2])))
    batched_step_types = replicate(expected_step_types, (batch_size, ))
    np.testing.assert_array_equal(batched_step_types, trajectory.step_type)
Example #6
  def testReplicateScalarTensor(self):
    value = 1
    outer_shape = [2, 1]
    expected_replicated_value = np.array([[value], [value]])

    tf_value = tf.constant(value, shape=())
    replicated_value = self.evaluate(common.replicate(tf_value, outer_shape))
    self.assertAllEqual(expected_replicated_value, replicated_value)
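The test above pins down the contract of common.replicate: the result has shape outer_shape + value.shape, with value copied into every outer position. A minimal stand-in with the same assumed semantics, built on tf.broadcast_to:

import tensorflow as tf

def replicate_sketch(value, outer_shape):
    # Target shape is outer_shape followed by value's own shape; broadcasting
    # then copies `value` into each outer position.
    target = tf.concat(
        [tf.convert_to_tensor(outer_shape, tf.int32), tf.shape(value)], axis=0)
    return tf.broadcast_to(value, target)

replicate_sketch(tf.constant(1), [2, 1])      # shape [2, 1], all ones
replicate_sketch(tf.constant([[1., 2., 3.],
                              [4., 5., 6.]]), [2, 1])  # shape [2, 1, 2, 3]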
Example #7
  def _distribution(self, time_step, policy_state):
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action = common.replicate(self._action_value, outer_shape)

    def dist_fn(action):
      """Return a categorical distribution with all density on fixed action."""
      return tfp.distributions.Deterministic(loc=action)
    return policy_step.PolicyStep(nest.map_structure(dist_fn, action),
                                  policy_state)
Example #8
  def _get_policy_info_and_action(self, time_step):
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)

    log_probability = tf.nest.map_structure(
        lambda _: tf.zeros(outer_shape, tf.float32), self._action_spec)
    policy_info = policy_step.set_log_probability(
        self._policy_info, log_probability=log_probability)
    action = tf.nest.map_structure(lambda t: common.replicate(t, outer_shape),
                                   self._action_value)
    return policy_info, action
Example #9
    def _action(self, time_step, policy_state, seed):
        outer_shape = nest_utils.get_outer_shape(time_step,
                                                 self._time_step_spec)
        action = common.replicate(self._next_action, outer_shape)

        self._action_index += 1
        self._action_index %= self._actions.shape[0]
        self._next_action.assign(self._actions[self._action_index])

        return policy_step.PolicyStep(action, policy_state, info=())
Example #10
    def _action(self, time_step, policy_state, seed):
        i = policy_state[0] % self.period # position within the policy period
        out_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
        action = {}
        for a in self.script:
            A = common.replicate(self.script[a][i], out_shape)
            if a == 'alpha': # do Markovian feedback
                A *= time_step.observation['msmt'][:,-1,None]
                if policy_state[0] == 0: A *= 0
            action[a] = A

        return policy_step.PolicyStep(action, policy_state+1, self._policy_info)
Example #11
  def testReplicateTensor(self, outer_shape_type):
    value = np.array([[1., 2., 3.], [4., 5., 6.]])
    if outer_shape_type == 'tf_constant':
      outer_shape = tf.constant([2, 1])
    else:
      outer_shape = [2, 1]
    expected_replicated_value = np.array([[value], [value]])

    tf_value = tf.constant(value)
    replicated_value = self.evaluate(common.replicate(tf_value, outer_shape))
    self.assertAllEqual(expected_replicated_value, replicated_value)

    if isinstance(outer_shape, np.ndarray):
      # The shape should be fully defined in this case.
      self.assertEqual(tf.TensorShape(outer_shape + list(value.shape)),
                       replicated_value.shape)
Example #12
    def _action(self, time_step, policy_state, seed):
        i = policy_state[0] % self.period # position within the policy period
        out_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
        action = {}
        for a in self.script.keys():
            A = common.replicate(self.script[a][i], out_shape)
            if a == 'alpha':
                m = time_step.observation['msmt']
                if i not in [2*self.K-1, 2*self.K]:
                    # feedback after trimming rounds is Markovian, and after
                    # intermediate sharpening rounds is simply zero.
                    A *= m[:,-1,:]
                    if policy_state[0] == 0: A *= 0
                else: # after K sharpening rounds do the Bayesian feedback
                    A = self.Bayesian_feedback(i, m)
            action[a] = A

        return policy_step.PolicyStep(action, policy_state+1, self._policy_info)
Example #13
  def testReplicateTensor(self, outer_shape_type):
    value = np.array([[1., 2., 3.], [4., 5., 6.]])
    if outer_shape_type == 'placeholder':
      outer_shape = tf.placeholder(tf.int32, shape=[2])
    elif outer_shape_type == 'tf_constant':
      outer_shape = tf.constant([2, 1])
    else:
      outer_shape = [2, 1]
    expected_replicated_value = np.array([[value], [value]])

    tf_value = tf.constant(value)
    tf_replicated_value = common.replicate(tf_value, outer_shape)
    if isinstance(outer_shape, np.ndarray):
      # The shape should be fully defined in this case.
      self.assertEqual(tf.TensorShape(outer_shape + list(value.shape)),
                       tf_replicated_value.shape)

    with self.test_session() as sess:
      feed_dict = {}
      if outer_shape_type == 'placeholder':
        feed_dict = {outer_shape: np.array([2, 1])}
      replicated_value = sess.run(tf_replicated_value, feed_dict)
      self.assertAllEqual(expected_replicated_value, replicated_value)
Example #14
  def _action(self, time_step, policy_state, seed):
    del seed
    outer_shape = nest_utils.get_outer_shape(time_step, self._time_step_spec)
    action = common.replicate(self._action_value, outer_shape)
    return policy_step.PolicyStep(action, policy_state)
Example #15
def test_trajectory_optimiser_each_iteration_starts_with_the_initial_observation(
        action_space, horizon, batch_size, max_iterations):
    class WrappedRandomTFPolicy(TFPolicy):
        def __init__(
            self,
            ts_spec: ts.TimeStep,
            action_spec: types.NestedTensorSpec,
            env_model: EnvironmentModel,
        ):
            super().__init__(ts_spec, action_spec)

            self._internal_policy = RandomTFPolicy(ts_spec, action_space)

            self._environment_model = env_model

        def _action(
            self,
            time_step: ts.TimeStep,
            policy_state: types.NestedTensor,
            seed: Optional[types.Seed],
        ) -> policy_step.PolicyStep:
            np.testing.assert_array_equal(
                time_step.observation,
                self._environment_model.current_time_step().observation)
            return self._internal_policy._action(time_step, policy_state, seed)

        def _distribution(
                self, time_step: ts.TimeStep,
                policy_state: types.NestedTensorSpec
        ) -> policy_step.PolicyStep:
            raise NotImplementedError()

    observations = list(
        repeat(replicate(tf.constant(StepType.MID), [batch_size]),
               max_iterations * (horizon + 1)))

    transition_model = TrajectoryOptimiserTransitionModel(
        action_space, iter(observations))
    reward = ConstantReward(OBSERVATION_SPACE_SPEC, action_space, -1.0)
    termination_model = TrajectoryOptimiserTerminationModel(
        OBSERVATION_SPACE_SPEC)
    environment_model = EnvironmentModel(
        transition_model=transition_model,
        reward_model=reward,
        termination_model=termination_model,
        initial_state_distribution_model=DeterministicInitialStateModel(
            StepType.FIRST),
        batch_size=batch_size,
    )

    time_step_space = time_step_spec(OBSERVATION_SPACE_SPEC)

    policy = WrappedRandomTFPolicy(time_step_space, action_space,
                                   environment_model)
    trajectory_optimiser = PolicyTrajectoryOptimiser(
        policy,
        horizon=horizon,
        population_size=batch_size,
        max_iterations=max_iterations,
    )

    initial_time_step = restart(tf.expand_dims(tf.constant(StepType.FIRST),
                                               axis=0),
                                batch_size=1)

    trajectory_optimiser.optimise(initial_time_step, environment_model)