Example #1
    def tf_sample(self, parameters, deterministic):
        mean, stddev, _ = parameters

        # Deterministic: mean as action
        definite = mean

        # Non-deterministic: sample action using default normal distribution
        normal_distribution = tf.random.normal(
            shape=tf.shape(input=mean), dtype=util.tf_dtype(dtype='float')
        )
        sampled = mean + stddev * normal_distribution

        action = tf.where(condition=deterministic, x=definite, y=sampled)

        # Clip if bounded action
        if 'min_value' in self.action_spec:
            min_value = tf.constant(
                value=self.action_spec['min_value'], dtype=util.tf_dtype(dtype='float')
            )
            max_value = tf.constant(
                value=self.action_spec['max_value'], dtype=util.tf_dtype(dtype='float')
            )
            action = tf.clip_by_value(t=action, clip_value_min=min_value, clip_value_max=max_value)

        return action
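
A minimal NumPy sketch of the same selection logic outside the TensorFlow graph; the action bounds used below are illustrative, not taken from the snippet above:

import numpy as np

def sample_action(mean, stddev, deterministic, min_value=-1.0, max_value=1.0):
    # Deterministic: return the mean; otherwise add Gaussian noise scaled by stddev
    noise = np.random.normal(size=np.shape(mean))
    action = mean if deterministic else mean + stddev * noise
    # Clip to the bounded action range
    return np.clip(action, min_value, max_value)

print(sample_action(mean=np.array([0.2, 0.9]), stddev=np.array([0.1, 0.5]), deterministic=False))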
Example #2
    def tf_parametrize(self, x, mask):
        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        shape = (-1,) + self.action_spec['shape'] + (self.action_spec['num_values'],)
        value_shape = (-1,) + self.action_spec['shape'] + (1,)

        # Deviations
        action_values = self.deviations.apply(x=x)
        action_values = tf.reshape(tensor=action_values, shape=shape)
        min_float = tf.fill(
            dims=tf.shape(input=action_values), value=util.tf_dtype(dtype='float').min
        )

        # States value
        if self.value is None:
            action_values = tf.where(condition=mask, x=action_values, y=min_float)
            states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)
        else:
            states_value = self.value.apply(x=x)
            if len(self.embedding_shape) == 1:
                states_value = tf.reshape(tensor=states_value, shape=value_shape)
            action_values = states_value + action_values - tf.math.reduce_mean(
                input_tensor=action_values, axis=-1, keepdims=True
            )
            states_value = tf.squeeze(input=states_value, axis=-1)
            action_values = tf.where(condition=mask, x=action_values, y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=action_values, axis=-1)

        # "Normalized" logits
        logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

        Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)

        return logits, probabilities, states_value, action_values
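
A NumPy sketch of the masked logsumexp/softmax path above (the branch taken when self.value is None), using made-up action values and mask:

import numpy as np

action_values = np.array([1.2, 0.3, -0.5, 2.0])   # illustrative advantage values
mask = np.array([True, True, False, True])        # third action is unavailable

min_float = np.finfo(action_values.dtype).min
masked = np.where(mask, action_values, min_float)

# State value as logsumexp over the available actions
states_value = np.max(masked) + np.log(np.sum(np.exp(masked - np.max(masked))))

# Softmax probabilities; masked-out actions end up with (numerically) zero probability
exp_values = np.exp(masked - np.max(masked))
probabilities = exp_values / np.sum(exp_values)

# "Normalized" logits, clamped away from log(0)
logits = np.log(np.maximum(probabilities, 1e-6))

print(states_value, probabilities, logits)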
Example #3
        def apply_update():
            one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
            axes = tuple(1 + axis for axis in self.axes)

            # Per-step decay raised to the batch size covered by this update
            decay = self.decay.value()
            batch_size = tf.dtypes.cast(x=tf.shape(input=x)[0], dtype=util.tf_dtype(dtype='float'))
            decay = tf.math.pow(x=decay, y=batch_size)

            # Batch mean, blended with the moving mean after the first call
            mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True)
            mean = tf.where(
                condition=self.after_first_call,
                x=(decay * self.moving_mean + (one - decay) * mean), y=mean
            )

            # Batch variance around that mean, blended with the moving variance after the first call
            variance = tf.reduce_mean(
                input_tensor=tf.math.squared_difference(x=x, y=mean), axis=axes, keepdims=True
            )
            variance = tf.where(
                condition=self.after_first_call,
                x=(decay * self.moving_variance + (one - decay) * variance), y=variance
            )

            # Mark the first call as done, then assign the updated moving statistics
            with tf.control_dependencies(control_inputs=(mean, variance)):
                assignment = self.after_first_call.assign(
                    value=tf.constant(value=True, dtype=util.tf_dtype(dtype='bool')),
                    read_value=False
                )

            with tf.control_dependencies(control_inputs=(assignment,)):
                variance = self.moving_variance.assign(value=variance)
                mean = self.moving_mean.assign(value=mean)

            return mean, variance
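
A NumPy sketch of the same exponential moving-average normalization outside the graph; the class name and decay value are illustrative, and the None check stands in for the after_first_call flag:

import numpy as np

class ExponentialNormalization:
    def __init__(self, decay=0.999):
        self.decay = decay
        self.moving_mean = None
        self.moving_variance = None

    def apply_update(self, x):
        # Per-step decay raised to the batch size, as in the graph code above
        decay = self.decay ** x.shape[0]
        mean = np.mean(x, axis=0)
        if self.moving_mean is not None:
            mean = decay * self.moving_mean + (1.0 - decay) * mean
        # Variance around the (blended) mean, blended the same way
        variance = np.mean((x - mean) ** 2, axis=0)
        if self.moving_variance is not None:
            variance = decay * self.moving_variance + (1.0 - decay) * variance
        self.moving_mean, self.moving_variance = mean, variance
        return mean, variance

norm = ExponentialNormalization()
print(norm.apply_update(np.random.randn(32, 4)))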
Example #4
    def tf_retrieve_timesteps(self, n):
        one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))
        capacity = tf.constant(value=self.capacity, dtype=util.tf_dtype(dtype='long'))

        # Start index of oldest episode
        oldest_episode_start = self.terminal_indices[0] + one

        # Number of timesteps (minus/plus one to prevent zero but allow capacity)
        num_timesteps = self.memory_index - oldest_episode_start - one
        num_timesteps = tf.mod(x=num_timesteps, y=capacity) + one

        # Check whether memory contains enough timesteps
        assertion = tf.debugging.assert_less_equal(x=n, y=num_timesteps)

        # Randomly sampled timestep indices
        with tf.control_dependencies(control_inputs=(assertion,)):
            indices = tf.random_uniform(
                shape=(n,), maxval=num_timesteps, dtype=util.tf_dtype(dtype='long')
            )
            indices = tf.mod(x=(self.memory_index - one - indices), y=capacity)

        # Retrieve timestep indices
        timesteps = self.retrieve_indices(indices=indices)

        return timesteps
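
A NumPy sketch of the circular-buffer index arithmetic above; capacity, memory_index and the oldest terminal index are made-up values:

import numpy as np

capacity = 10
memory_index = 3                 # next write position (the buffer has wrapped around)
oldest_episode_start = 6         # terminal_indices[0] + 1

# Number of stored timesteps, mapped into the range (0, capacity]
num_timesteps = (memory_index - oldest_episode_start - 1) % capacity + 1

n = 4
assert n <= num_timesteps
offsets = np.random.randint(0, num_timesteps, size=n)
indices = (memory_index - 1 - offsets) % capacity
print(num_timesteps, indices)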
Example #5
    def tf_retrieve_timesteps(self, n, past_padding, future_padding):
        one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))
        capacity = tf.constant(value=self.capacity, dtype=util.tf_dtype(dtype='long'))

        # # Start index of oldest episode
        # oldest_episode_start = self.terminal_indices[0] + one + past_padding

        # # Number of timesteps (minus/plus one to prevent zero but allow capacity)
        # num_timesteps = self.buffer_index - oldest_episode_start - future_padding - one
        # num_timesteps = tf.mod(x=num_timesteps, y=capacity) + one

        # Check whether memory contains enough timesteps
        num_timesteps = tf.minimum(x=self.buffer_index, y=capacity) - past_padding - future_padding
        assertion = tf.debugging.assert_less_equal(x=n, y=num_timesteps)

        # Most recent timestep indices range
        # Assertions in memory as warning!!!
        with tf.control_dependencies(control_inputs=(assertion,)):
            indices = tf.range(start=(self.buffer_index - n), limit=self.buffer_index)
            indices = tf.mod(x=(indices - future_padding), y=capacity)

        return indices
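
A NumPy sketch of the padded most-recent-range retrieval above, again with illustrative values:

import numpy as np

capacity, buffer_index = 10, 8
past_padding, future_padding = 2, 1
n = 4

num_timesteps = min(buffer_index, capacity) - past_padding - future_padding
assert n <= num_timesteps
indices = (np.arange(buffer_index - n, buffer_index) - future_padding) % capacity
print(indices)   # the n most recent indices, shifted back by future_padding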
Example #6
    def tf_step(self, x, conjugate, residual, squared_residual):
        """
        Iteration loop body of the conjugate gradient algorithm.

        Args:
            x: Current solution estimate $x_t$.
            conjugate: Current conjugate $c_t$.
            residual: Current residual $r_t$.
            squared_residual: Current squared residual $r_t^2$.

        Returns:
            Updated arguments for next iteration.
        """

        # Ac := A * c_t
        A_conjugate = self.fn_x(conjugate)

        # TODO: reference?
        damping = self.damping.value()

        def no_damping():
            return A_conjugate

        def apply_damping():
            return [A_conj + damping * conj for A_conj, conj in zip(A_conjugate, conjugate)]

        zero = tf.constant(value=0.0, dtype=util.tf_dtype(dtype='float'))
        skip_damping = tf.math.equal(x=damping, y=zero)
        A_conjugate = self.cond(pred=skip_damping, true_fn=no_damping, false_fn=apply_damping)

        # cAc := c_t^T * Ac
        conjugate_A_conjugate = tf.add_n(
            inputs=[
                tf.reduce_sum(input_tensor=(conj * A_conj))
                for conj, A_conj in zip(conjugate, A_conjugate)
            ]
        )

        # \alpha := r_t^2 / cAc
        epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
        alpha = squared_residual / tf.maximum(x=conjugate_A_conjugate, y=epsilon)

        # x_{t+1} := x_t + \alpha * c_t
        next_x = [t + alpha * conj for t, conj in zip(x, conjugate)]

        # r_{t+1} := r_t - \alpha * Ac
        next_residual = [res - alpha * A_conj for res, A_conj in zip(residual, A_conjugate)]

        # r_{t+1}^2 := r_{t+1}^T * r_{t+1}
        next_squared_residual = tf.add_n(
            inputs=[tf.reduce_sum(input_tensor=(res * res)) for res in next_residual]
        )

        # \beta = r_{t+1}^2 / r_t^2
        beta = next_squared_residual / tf.maximum(x=squared_residual, y=epsilon)

        # c_{t+1} := r_{t+1} + \beta * c_t
        next_conjugate = [res + beta * conj for res, conj in zip(next_residual, conjugate)]

        return next_x, next_conjugate, next_residual, next_squared_residual
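
A NumPy sketch of the same update equations applied to a small symmetric positive-definite system A x = b (no damping; the matrix and right-hand side are made up):

import numpy as np

A = np.array([[4.0, 1.0], [1.0, 3.0]])
b = np.array([1.0, 2.0])

x = np.zeros(2)
residual = b - A @ x
conjugate = residual.copy()
squared_residual = residual @ residual

for _ in range(10):
    A_conjugate = A @ conjugate
    alpha = squared_residual / max(conjugate @ A_conjugate, 1e-8)
    x = x + alpha * conjugate                      # x_{t+1} = x_t + alpha * c_t
    residual = residual - alpha * A_conjugate      # r_{t+1} = r_t - alpha * A c_t
    next_squared_residual = residual @ residual
    beta = next_squared_residual / max(squared_residual, 1e-8)
    conjugate = residual + beta * conjugate        # c_{t+1} = r_{t+1} + beta * c_t
    squared_residual = next_squared_residual

print(x, np.linalg.solve(A, b))                    # the two should agree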
Example #7
    def create_tf_operations(self, config):
        """
        Creates generic TensorFlow operations and placeholders required for models.
        
        Args:
            config: Model configuration which must contain entries for states and actions.

        Returns:

        """
        self.action_taken = dict()
        self.internal_inputs = list()
        self.internal_outputs = list()
        self.internal_inits = list()

        # Placeholders
        with tf.variable_scope('placeholder'):
            # States
            self.state = dict()
            for name, state in config.states.items():
                self.state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=name)

            # Actions
            self.action = dict()
            self.discrete_actions = []
            self.continuous_actions = []
            for name, action in config.actions.items():
                if action.continuous:
                    if not self.__class__.allows_continuous_actions:
                        raise TensorForceError("Error: Model does not support continuous actions.")
                    self.action[name] = tf.placeholder(dtype=util.tf_dtype('float'), shape=(None,), name=name)
                else:
                    if not self.__class__.allows_discrete_actions:
                        raise TensorForceError("Error: Model does not support discrete actions.")
                    self.action[name] = tf.placeholder(dtype=util.tf_dtype('int'), shape=(None,), name=name)

            # Reward & terminal
            self.reward = tf.placeholder(dtype=tf.float32, shape=(None,), name='reward')
            self.terminal = tf.placeholder(dtype=tf.bool, shape=(None,), name='terminal')

            # Deterministic action flag
            self.deterministic = tf.placeholder(dtype=tf.bool, shape=(), name='deterministic')

        # Optimizer
        if config.optimizer is not None:
            learning_rate = config.learning_rate
            with tf.variable_scope('optimization'):
                optimizer = util.function(config.optimizer, optimizers)
                args = config.optimizer_args or ()
                kwargs = config.optimizer_kwargs or {}
                self.optimizer = optimizer(learning_rate, *args, **kwargs)
        else:
            self.optimizer = None
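
A minimal sketch of building the same kind of per-state placeholders from a spec dict, assuming TensorFlow 1.x behaviour via tf.compat.v1; the spec and the tf_dtype mapping below are stand-ins for the config and util.tf_dtype used above:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

states_spec = {'vision': {'type': 'float', 'shape': (64, 64, 3)},
               'status': {'type': 'int', 'shape': (2,)}}
tf_dtype = {'float': tf.float32, 'int': tf.int32, 'bool': tf.bool}

state_placeholders = dict()
with tf.compat.v1.variable_scope('placeholder'):
    for name, spec in states_spec.items():
        state_placeholders[name] = tf.compat.v1.placeholder(
            dtype=tf_dtype[spec['type']], shape=(None,) + tuple(spec['shape']), name=name
        )

print(state_placeholders)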