Example #1
    def _get_tf_exploration_action_op(
        self,
        action_dist: ActionDistribution,
        explore: bool,
        timestep: Union[int, TensorType],
    ):
        ts = timestep if timestep is not None else self.last_timestep

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Take a Gaussian sample with our stddev (mean=0.0) and scale it.
        gaussian_sample = self.scale_schedule(ts) * tf.random.normal(
            tf.shape(deterministic_actions), stddev=self.stddev)

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = self.random_exploration.get_tf_exploration_action_op(
            action_dist, explore)
        stochastic_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda: random_actions,
            false_fn=lambda: tf.clip_by_value(
                deterministic_actions + gaussian_sample,
                self.action_space.low * tf.ones_like(deterministic_actions),
                self.action_space.high * tf.ones_like(deterministic_actions),
            ),
        )

        # Choose via `explore` (the main exploration switch).
        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool) if isinstance(
                explore, bool) else explore,
            true_fn=lambda: stochastic_actions,
            false_fn=lambda: deterministic_actions,
        )
        # Logp is always zero.
        logp = zero_logps_from_actions(deterministic_actions)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(tf.cast(timestep, tf.int64))
            return action, logp
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op]):
                return action, logp
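
The core idea of Example #1 is to add zero-mean Gaussian noise whose magnitude is annealed over time by a schedule, then clip the result back into the action bounds. Below is a minimal NumPy sketch of just that computation; the linear `scale_schedule`, `stddev`, and bounds are illustrative assumptions, not RLlib's actual defaults.

import numpy as np

def scale_schedule(ts, initial=1.0, final=0.02, horizon=10000):
    # Hypothetical linear annealing from `initial` to `final` over `horizon` steps.
    frac = min(ts / horizon, 1.0)
    return initial + frac * (final - initial)

def gaussian_explore(deterministic_action, ts, low, high, stddev=0.1, rng=np.random):
    # Zero-mean Gaussian sample, scaled by the (annealed) schedule value.
    noise = scale_schedule(ts) * rng.normal(0.0, stddev, size=deterministic_action.shape)
    # Add the noise and clip back into the action space.
    return np.clip(deterministic_action + noise, low, high)

# E.g., a 2-D continuous action in [-1, 1]:
print(gaussian_explore(np.array([0.5, -0.2]), ts=500, low=-1.0, high=1.0))
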
Example #2
    def _get_tf_exploration_action_op(
        self,
        action_dist: ActionDistribution,
        explore: Union[bool, TensorType],
        timestep: Union[int, TensorType],
    ):
        ts = timestep if timestep is not None else self.last_timestep
        scale = self.scale_schedule(ts)

        # The deterministic actions (if explore=False).
        deterministic_actions = action_dist.deterministic_sample()

        # Apply base-scaled and time-annealed OU noise to the
        # deterministic actions.
        gaussian_sample = tf.random.normal(shape=[self.action_space.low.size],
                                           stddev=self.stddev)
        ou_new = self.ou_theta * -self.ou_state + self.ou_sigma * gaussian_sample
        if self.framework in ["tf2", "tfe"]:
            self.ou_state.assign_add(ou_new)
            ou_state_new = self.ou_state
        else:
            ou_state_new = tf1.assign_add(self.ou_state, ou_new)
        high_m_low = self.action_space.high - self.action_space.low
        high_m_low = tf.where(tf.math.is_inf(high_m_low),
                              tf.ones_like(high_m_low), high_m_low)
        noise = scale * self.ou_base_scale * ou_state_new * high_m_low
        stochastic_actions = tf.clip_by_value(
            deterministic_actions + noise,
            self.action_space.low * tf.ones_like(deterministic_actions),
            self.action_space.high * tf.ones_like(deterministic_actions),
        )

        # Stochastic actions could either be: random OR action + noise.
        random_actions, _ = self.random_exploration.get_tf_exploration_action_op(
            action_dist, explore)
        exploration_actions = tf.cond(
            pred=tf.convert_to_tensor(ts < self.random_timesteps),
            true_fn=lambda: random_actions,
            false_fn=lambda: stochastic_actions,
        )

        # Choose via `explore` (the main exploration switch).
        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool) if isinstance(
                explore, bool) else explore,
            true_fn=lambda: exploration_actions,
            false_fn=lambda: deterministic_actions,
        )
        # Logp is always zero.
        logp = zero_logps_from_actions(deterministic_actions)

        # Increment `last_timestep` by 1 (or set to `timestep`).
        if self.framework in ["tf2", "tfe"]:
            if timestep is None:
                self.last_timestep.assign_add(1)
            else:
                self.last_timestep.assign(tf.cast(timestep, tf.int64))
        else:
            assign_op = (tf1.assign_add(self.last_timestep, 1)
                         if timestep is None else tf1.assign(
                             self.last_timestep, timestep))
            with tf1.control_dependencies([assign_op, ou_state_new]):
                action = tf.identity(action)
                logp = tf.identity(logp)

        return action, logp
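
Example #2 keeps a running Ornstein-Uhlenbeck state that is updated each step with `delta = theta * (0 - x) + sigma * N(0, 1)`, and only then scales it by the schedule, the base scale, and the action range before adding it to the deterministic action. A standalone NumPy sketch of that state update (the parameter values are illustrative, not RLlib's defaults):

import numpy as np

def ou_step(state, theta=0.15, sigma=0.2, rng=np.random):
    # Discretized OU update: mean-revert toward 0, plus Gaussian diffusion.
    return state + theta * (0.0 - state) + sigma * rng.normal(size=state.shape)

state = np.zeros(2)
for t in range(5):
    state = ou_step(state)
    print(t, state)  # temporally correlated noise that drifts back toward 0

Unlike the independent Gaussian noise in Example #1, consecutive OU samples are correlated, which tends to produce smoother exploration trajectories in continuous control.
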
Example #3
    def get_tf_exploration_action_op(self, action_dist: ActionDistribution,
                                     explore: Optional[Union[bool,
                                                             TensorType]]):
        def true_fn():
            batch_size = 1
            req = force_tuple(
                action_dist.required_model_output_shape(
                    self.action_space, getattr(self.model, "model_config",
                                               None)))
            # Add a batch dimension?
            if len(action_dist.inputs.shape) == len(req) + 1:
                batch_size = tf.shape(action_dist.inputs)[0]

            # Function to produce random samples from primitive space
            # components: (Multi)Discrete or Box.
            def random_component(component):
                # Have at least an additional shape of (1,), even if the
                # component is Box(-1.0, 1.0, shape=()).
                shape = component.shape or (1, )

                if isinstance(component, Discrete):
                    return tf.random.uniform(shape=(batch_size, ) +
                                             component.shape,
                                             maxval=component.n,
                                             dtype=component.dtype)
                elif isinstance(component, MultiDiscrete):
                    return tf.concat([
                        tf.random.uniform(shape=(batch_size, 1),
                                          maxval=n,
                                          dtype=component.dtype)
                        for n in component.nvec
                    ],
                                     axis=1)
                elif isinstance(component, Box):
                    if component.bounded_above.all() and \
                            component.bounded_below.all():
                        if component.dtype.name.startswith("int"):
                            return tf.random.uniform(
                                shape=(batch_size, ) + shape,
                                minval=component.low.flat[0],
                                maxval=component.high.flat[0],
                                dtype=component.dtype)
                        else:
                            return tf.random.uniform(shape=(batch_size, ) +
                                                     shape,
                                                     minval=component.low,
                                                     maxval=component.high,
                                                     dtype=component.dtype)
                    else:
                        return tf.random.normal(shape=(batch_size, ) + shape,
                                                dtype=component.dtype)
                else:
                    assert isinstance(component, Simplex), \
                        "Unsupported distribution component '{}' for random " \
                        "sampling!".format(component)
                    return tf.nn.softmax(
                        tf.random.uniform(shape=(batch_size, ) + shape,
                                          minval=0.0,
                                          maxval=1.0,
                                          dtype=component.dtype))

            actions = tree.map_structure(random_component,
                                         self.action_space_struct)
            return actions

        def false_fn():
            return action_dist.deterministic_sample()

        action = tf.cond(pred=tf.constant(explore, dtype=tf.bool)
                         if isinstance(explore, bool) else explore,
                         true_fn=true_fn,
                         false_fn=false_fn)

        logp = zero_logps_from_actions(action)
        return action, logp
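
Example #3 draws a uniform random sample per primitive space component (Discrete, MultiDiscrete, Box, Simplex) and maps that sampler over the nested action-space structure with `tree.map_structure`. A small sketch of the same pattern using `gym` spaces and `dm-tree`; the space layout is made up for illustration, and whether your setup uses `gym` or `gymnasium` may differ.

import numpy as np
import tree  # dm-tree
from gym.spaces import Box, Dict, Discrete

# Illustrative nested action space.
action_space = Dict({
    "steer": Box(-1.0, 1.0, shape=(2,), dtype=np.float32),
    "gear": Discrete(5),
})

def random_component(component):
    # Discrete: uniform integer in [0, n); bounded Box: uniform inside the bounds.
    if isinstance(component, Discrete):
        return np.random.randint(component.n)
    return np.random.uniform(component.low, component.high).astype(component.dtype)

# Map the per-component sampler over the nested space structure,
# mirroring what the snippet above does with `self.action_space_struct`.
print(tree.map_structure(random_component, dict(action_space.spaces)))
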