Example #1
    def sample(self, *, parameters, temperature):
        logits, probabilities, action_values = parameters.get(
            ('logits', 'probabilities', 'action_values'))

        # Distribution parameter summaries
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
            return [probs[n] for n in range(self.action_spec.num_values)]

        prefix = 'distributions/' + self.name + '-probability'
        names = [prefix + str(n) for n in range(self.action_spec.num_values)]
        dependencies = self.summary(label='distribution',
                                    name=names,
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            entropy = -tf.reduce_sum(input_tensor=(probabilities * logits),
                                     axis=-1)
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: maximum likelihood action
        definite = tf.argmax(input=action_values, axis=-1)
        definite = tf_util.cast(x=definite, dtype='int')

        # Set logits to minimal value
        min_float = tf.fill(dims=tf.shape(input=logits),
                            value=tf_util.get_dtype(type='float').min)
        logits = logits / temperature
        logits = tf.where(condition=(probabilities < epsilon),
                          x=min_float,
                          y=logits)

        # Non-deterministic: sample action using Gumbel distribution
        uniform_distribution = tf.random.uniform(
            shape=tf.shape(input=logits),
            minval=epsilon,
            maxval=(one - epsilon),
            dtype=tf_util.get_dtype(type='float'))
        gumbel_distribution = -tf.math.log(
            x=-tf.math.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
        sampled = tf_util.cast(x=sampled, dtype='int')

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.where(condition=(temperature < epsilon),
                            x=definite,
                            y=sampled)
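
The sampling above uses the Gumbel-max trick: adding independent Gumbel noise -log(-log(U)) to the logits and taking the argmax draws a sample from the softmax distribution over those logits. A minimal self-contained sketch of just that trick, in plain TensorFlow without the Tensorforce tf_util helpers (names are illustrative):

import tensorflow as tf

def gumbel_max_sample(logits, epsilon=1e-6):
    # Uniform noise in (epsilon, 1 - epsilon) keeps both logs finite.
    uniform = tf.random.uniform(
        shape=tf.shape(logits), minval=epsilon, maxval=1.0 - epsilon)
    gumbel = -tf.math.log(-tf.math.log(uniform))
    # argmax(logits + Gumbel noise) is a sample from softmax(logits).
    return tf.argmax(logits + gumbel, axis=-1)
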
Example #2
 def fn_sample():
     # Non-deterministic: sample action using gamma distribution
     alpha_sample = tf.random.gamma(
         shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
     beta_sample = tf.random.gamma(
         shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))
     return beta_sample / (alpha_sample + beta_sample)
Example #3
    def sample(self, *, parameters, temperature):
        alpha, beta, alpha_beta, log_norm = parameters.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm')
        )

        # Distribution parameter summaries
        def fn_summary():
            return tf.math.reduce_mean(input_tensor=alpha, axis=range(self.action_spec.rank + 1)), \
                tf.math.reduce_mean(input_tensor=beta, axis=range(self.action_spec.rank + 1))

        prefix = 'distributions/' + self.name
        dependencies = self.summary(
            label='distribution', name=(prefix + '-alpha', prefix + '-beta'), data=fn_summary,
            step='timesteps'
        )

        # Entropy summary
        def fn_summary():
            one = tf_util.constant(value=1.0, dtype='float')
            digamma_alpha = tf_util.cast(
                x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float'
            )
            digamma_beta = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
            digamma_alpha_beta = tf_util.cast(
                x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float'
            )
            entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
                (alpha_beta - one - one) * digamma_alpha_beta
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy', name=name, data=fn_summary, step='timesteps')
        )

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: mean as action
        definite = beta / alpha_beta

        # Non-deterministic: sample action using gamma distribution
        alpha_sample = tf.random.gamma(shape=(), alpha=alpha, dtype=tf_util.get_dtype(type='float'))
        beta_sample = tf.random.gamma(shape=(), alpha=beta, dtype=tf_util.get_dtype(type='float'))

        sampled = beta_sample / tf.maximum(x=(alpha_sample + beta_sample), y=epsilon)

        action = tf.where(condition=(temperature < epsilon), x=definite, y=sampled)

        min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
        max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')

        with tf.control_dependencies(control_inputs=dependencies):
            return min_value + (max_value - min_value) * action
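
Example #3 samples from a Beta distribution through the standard Gamma construction: if X ~ Gamma(a, 1) and Y ~ Gamma(b, 1), then X / (X + Y) ~ Beta(a, b). A minimal sketch of that step alone (illustrative names, plain TensorFlow):

import tensorflow as tf

def sample_beta(a, b, shape=()):
    # X ~ Gamma(a, 1), Y ~ Gamma(b, 1)  =>  X / (X + Y) ~ Beta(a, b)
    x = tf.random.gamma(shape=shape, alpha=a)
    y = tf.random.gamma(shape=shape, alpha=b)
    return x / (x + y)

The deterministic branch above returns beta / alpha_beta, i.e. the mean of the corresponding Beta distribution.
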
Example #4
 def body(lengths, predecessor_indices, mask):
     previous_index = tf.math.mod(x=(predecessor_indices[:, :1] - one), y=capacity)
     predecessor_indices = tf.concat(values=(previous_index, predecessor_indices), axis=1)
     previous_terminal = tf.gather(params=self.buffers['terminal'], indices=previous_index)
     is_not_terminal = tf.math.logical_and(
         x=tf.math.logical_not(x=tf.math.greater(x=previous_terminal, y=zero)),
         y=mask[:, :1]
     )
     mask = tf.concat(values=(is_not_terminal, mask), axis=1)
     is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
     zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
     ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
     lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
     return lengths, predecessor_indices, mask
Example #5
 def body(lengths, successor_indices, mask):
     current_index = successor_indices[:, -1:]
     current_terminal = tf.gather(params=self.buffers['terminal'], indices=current_index)
     is_not_terminal = tf.math.logical_and(
         x=tf.math.logical_not(x=tf.math.greater(x=current_terminal, y=zero)),
         y=mask[:, -1:]
     )
     next_index = tf.math.mod(x=(current_index + one), y=capacity)
     successor_indices = tf.concat(values=(successor_indices, next_index), axis=1)
     mask = tf.concat(values=(mask, is_not_terminal), axis=1)
     is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
     zeros = tf.zeros_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
     ones = tf.ones_like(input=is_not_terminal, dtype=tf_util.get_dtype(type='int'))
     lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
     return lengths, successor_indices, mask
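
Both loop bodies (Examples #4 and #5) walk a ring buffer one step per iteration: neighbouring indices wrap around the capacity via modular arithmetic, and the mask switches off once a terminal marker is encountered. The index arithmetic in isolation (hypothetical standalone helpers):

import tensorflow as tf

def predecessor_index(index, capacity):
    # mod wraps index 0 back around to capacity - 1
    return tf.math.mod(index - 1, capacity)

def successor_index(index, capacity):
    # mod wraps index capacity - 1 forward to 0
    return tf.math.mod(index + 1, capacity)
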
Example #6
 def function(name, spec, a_value):
     advantage_value = a_value.apply(x=embedding)
     if spec.type == 'bool':
         shape = (-1, ) + spec.shape + (2, )
     elif spec.type == 'int':
         shape = (-1, ) + spec.shape + (spec.num_values, )
     advantage_value = tf.reshape(tensor=advantage_value,
                                  shape=shape)
     mean = tf.math.reduce_mean(input_tensor=advantage_value,
                                axis=-1,
                                keepdims=True)
     shape = (-1, ) + tuple(1 for _ in range(spec.rank + 1))
     _state_value = tf.reshape(tensor=state_value, shape=shape)
     action_value = _state_value + (advantage_value - mean)
     if spec.type == 'bool':
         return tf.math.maximum(x=action_value[..., 0],
                                y=action_value[..., 1])
     elif spec.type == 'int':
         if self.config.enable_int_action_masking:
             mask = auxiliaries[name]['mask']
             min_float = tf_util.get_dtype(type='float').min
             min_float = tf.fill(dims=tf.shape(input=action_value),
                                 value=min_float)
             action_value = tf.where(condition=mask,
                                     x=action_value,
                                     y=min_float)
         return tf.math.reduce_max(input_tensor=action_value,
                                   axis=-1)
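
Example #6 implements the dueling-network aggregation Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)); subtracting the mean advantage makes the value/advantage decomposition identifiable. Reduced to its core (a sketch assuming shapes [batch] for the state value and [batch, num_actions] for the advantages):

import tensorflow as tf

def dueling_q(state_value, advantages):
    # Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
    mean_advantage = tf.reduce_mean(advantages, axis=-1, keepdims=True)
    return tf.expand_dims(state_value, axis=-1) + (advantages - mean_advantage)
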
Example #7
        def fn_sample():
            # Set logits to minimal value
            min_float = tf.fill(dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
            temp_logits = logits / tf.math.maximum(x=temperature, y=epsilon)
            temp_logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=temp_logits)

            # Non-deterministic: sample action using Gumbel distribution
            one = tf_util.constant(value=1.0, dtype='float')
            uniform_distribution = tf.random.uniform(
                shape=tf.shape(input=temp_logits), minval=epsilon, maxval=(one - epsilon),
                dtype=tf_util.get_dtype(type='float')
            )
            # Second log numerically stable since log(1-eps) ~ -eps
            gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
            action = tf.math.argmax(input=(temp_logits + gumbel_distribution), axis=-1)
            return tf_util.cast(x=action, dtype='int')
Example #8
    def retrieve_episodes(self, *, n):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        # Check whether memory contains at least one episode
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_greater_equal(x=self.episode_count, y=one))

        # Get start and limit indices for randomly sampled n episodes
        with tf.control_dependencies(control_inputs=assertions):
            n = tf.math.minimum(x=n, y=self.episode_count)
            random_indices = tf.random.uniform(
                shape=(n, ),
                maxval=self.episode_count,
                dtype=tf_util.get_dtype(type='int'))

            # Episode start is one index after the previous episode's terminal
            starts = tf.gather(params=self.terminal_indices,
                               indices=random_indices) + one
            limits = tf.gather(params=self.terminal_indices,
                               indices=(random_indices + one)) + one

            # Correct limit index if smaller than start index
            limits = limits + tf.where(
                condition=(limits < starts), x=capacity, y=zero)

            # Random episode indices ranges
            indices = tf.ragged.range(starts=starts, limits=limits).values
            indices = tf.math.mod(x=indices, y=capacity)

        return indices
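
The ranges of the sampled episodes are flattened with tf.ragged.range, which concatenates integer ranges of different lengths into one index vector. A small illustration with hypothetical values:

import tensorflow as tf

starts = tf.constant([3, 10])
limits = tf.constant([6, 12])
indices = tf.ragged.range(starts=starts, limits=limits).values
# indices == [3, 4, 5, 10, 11]
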
Example #9
    def retrieve_timesteps(self, *, n, past_horizon, future_horizon):
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        # Check whether memory contains at least one valid timestep
        num_timesteps = tf.math.minimum(x=self.buffer_index, y=capacity)
        num_timesteps -= (past_horizon + future_horizon)
        num_timesteps = tf.math.maximum(x=num_timesteps, y=self.episode_count)

        # Check whether memory contains at least one timestep
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_greater_equal(x=num_timesteps, y=one))

        # Randomly sampled timestep indices
        with tf.control_dependencies(control_inputs=assertions):
            n = tf.math.minimum(x=n, y=num_timesteps)
            indices = tf.random.uniform(shape=(n, ),
                                        maxval=num_timesteps,
                                        dtype=tf_util.get_dtype(type='int'))
            indices = tf.math.mod(x=(self.buffer_index - one - indices -
                                     future_horizon),
                                  y=capacity)

        return indices
Example #10
        def function(name, spec, action_value):
            if spec.type == 'bool':

                def fn_summary():
                    axis = range(spec.rank + 1)
                    values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                    return [values[0], values[1]]

                if name is None:
                    names = ['action-values/true', 'action-values/false']
                else:
                    names = ['action-values/' + name + '-true', 'action-values/' + name + '-false']
                dependencies = self.summary(
                    label='action-value', name=names, data=fn_summary, step='timesteps'
                )

                def fn_tracking():
                    return tf.math.reduce_mean(input_tensor=action_value, axis=0)

                if name is None:
                    n = 'action-values'
                else:
                    n = name + '-values'
                dependencies.extend(self.track(label='action-value', name=n, data=fn_tracking))

                with tf.control_dependencies(control_inputs=dependencies):
                    return (action_value[..., 0] > action_value[..., 1])

            elif spec.type == 'int':

                def fn_summary():
                    axis = range(spec.rank + 1)
                    values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                    return [values[n] for n in range(spec.num_values)]

                if name is None:
                    prefix = 'action-values/action'
                else:
                    prefix = 'action-values/' + name + '-action'
                names = [prefix + str(n) for n in range(spec.num_values)]
                dependencies = self.summary(
                    label='action-value', name=names, data=fn_summary, step='timesteps'
                )

                def fn_tracking():
                    return tf.math.reduce_mean(input_tensor=action_value, axis=0)

                if name is None:
                    n = 'action-values'
                else:
                    n = name + '-values'
                dependencies.extend(self.track(label='action-value', name=n, data=fn_tracking))

                with tf.control_dependencies(control_inputs=dependencies):
                    if self.config.enable_int_action_masking:
                        mask = auxiliaries[name]['mask']
                        min_float = tf_util.get_dtype(type='float').min
                        min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                        action_value = tf.where(condition=mask, x=action_value, y=min_float)
                    return tf.math.argmax(input=action_value, axis=-1, output_type=spec.tf_type())
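
The masking pattern used here (and in Examples #6, #15 and #20) replaces the values of invalid actions with the smallest representable float so that argmax/reduce_max can never select them. As a standalone sketch (assuming float32 values and a boolean mask of the same shape):

import tensorflow as tf

def masked_argmax(action_values, mask):
    # Invalid entries become float min, so they never win the argmax.
    min_float = tf.fill(dims=tf.shape(action_values), value=action_values.dtype.min)
    return tf.argmax(tf.where(mask, action_values, min_float), axis=-1)
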
Example #11
        def body(deltas, previous_perturbations):
            with tf.control_dependencies(control_inputs=deltas):
                perturbations = [
                    learning_rate *
                    tf.random.normal(shape=tf_util.shape(x=variable),
                                     dtype=tf_util.get_dtype(type='float'))
                    for variable in variables
                ]
                perturbation_deltas = [
                    pert - prev_pert for pert, prev_pert in zip(
                        perturbations, previous_perturbations)
                ]
                assignments = list()
                for variable, delta in zip(variables, perturbation_deltas):
                    assignments.append(
                        variable.assign_add(delta=delta, read_value=False))

            with tf.control_dependencies(control_inputs=assignments):
                perturbed_loss = fn_loss(**arguments.to_kwargs())
                direction = tf.math.sign(x=(unperturbed_loss - perturbed_loss))
                deltas = [
                    delta + direction * perturbation
                    for delta, perturbation in zip(deltas, perturbations)
                ]

            return deltas, perturbations
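
Examples #11 and #12 implement one iteration of a random-perturbation (evolution-strategy-like) optimizer: apply Gaussian noise to the variables, re-evaluate the loss, and accumulate the noise signed by whether it improved on the unperturbed loss (the real loop also removes the previous iteration's perturbation first). A heavily simplified single-variable sketch; fn_loss and unperturbed_loss are placeholders:

import tensorflow as tf

def perturbation_step(variable, delta, learning_rate, fn_loss, unperturbed_loss):
    perturbation = learning_rate * tf.random.normal(shape=variable.shape)
    variable.assign_add(perturbation)
    # +1 if the perturbation reduced the loss, -1 otherwise
    direction = tf.math.sign(unperturbed_loss - fn_loss())
    return delta + direction * perturbation
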
Example #12
            def body(deltas, previous_perturbations):
                with tf.control_dependencies(control_inputs=deltas):
                    perturbations = list()
                    for variable in variables:
                        perturbation = tf.random.normal(shape=variable.shape,
                                                        dtype=variable.dtype)
                        if variable.dtype == tf_util.get_dtype(type='float'):
                            perturbations.append(learning_rate * perturbation)
                        else:
                            perturbations.append(
                                tf.cast(x=learning_rate, dtype=variable.dtype)
                                * perturbation)

                    perturbation_deltas = [
                        pert - prev_pert for pert, prev_pert in zip(
                            perturbations, previous_perturbations)
                    ]
                    assignments = list()
                    for variable, delta in zip(variables, perturbation_deltas):
                        assignments.append(
                            variable.assign_add(delta=delta, read_value=False))

                with tf.control_dependencies(control_inputs=assignments):
                    perturbed_loss = fn_loss(**arguments.to_kwargs())

                    one_float = tf_util.constant(value=1.0, dtype='float')
                    neg_one_float = tf_util.constant(value=-1.0, dtype='float')
                    direction = tf.where(
                        condition=(perturbed_loss < unperturbed_loss),
                        x=one_float,
                        y=neg_one_float)

                    next_deltas = list()
                    for variable, delta, perturbation in zip(
                            variables, deltas, perturbations):
                        if variable.dtype == tf_util.get_dtype(type='float'):
                            next_deltas.append(delta +
                                               direction * perturbation)
                        else:
                            next_deltas.append(
                                delta +
                                tf.cast(x=direction, dtype=variable.dtype) *
                                perturbation)

                return next_deltas, perturbations
Example #13
 def fn_sample():
     # Non-deterministic: sample true if >= uniform distribution
     # Exp numerically stable since logits <= 0.0
     e_true_logit = tf.math.exp(x=(true_logit / tf.math.maximum(x=temperature, y=epsilon)))
     e_false_logit = tf.math.exp(x=(false_logit / tf.math.maximum(x=temperature, y=epsilon)))
     probability = e_true_logit / tf.math.maximum(x=(e_true_logit + e_false_logit), y=epsilon)
     uniform = tf.random.uniform(
         shape=tf.shape(input=probability), dtype=tf_util.get_dtype(type='float')
     )
     return tf.greater_equal(x=probability, y=uniform)
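
The two-logit normalization in Example #13 (and in Example #17 below) is a temperature-scaled sigmoid, since e^(a/t) / (e^(a/t) + e^(b/t)) = sigmoid((a - b) / t). A minimal sketch of the same sampling:

import tensorflow as tf

def sample_bool(true_logit, false_logit, temperature, epsilon=1e-6):
    t = tf.maximum(temperature, epsilon)
    probability = tf.sigmoid((true_logit - false_logit) / t)
    return probability >= tf.random.uniform(shape=tf.shape(probability))
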
Example #14
    def reset(self):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        three = tf_util.constant(value=3, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')
        last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

        def correct_terminal():
            # Replace last observation terminal marker with abort terminal
            dependencies = list()
            two = tf_util.constant(value=2, dtype='int')
            sparse_delta = tf.IndexedSlices(values=two, indices=last_index)
            dependencies.append(self.buffers['terminal'].scatter_update(
                sparse_delta=sparse_delta))
            sparse_delta = tf.IndexedSlices(values=last_index,
                                            indices=(self.episode_count + one))
            dependencies.append(
                self.terminal_indices.scatter_update(
                    sparse_delta=sparse_delta))
            with tf.control_dependencies(control_inputs=dependencies):
                return self.episode_count.assign_add(delta=one,
                                                     read_value=False)

        last_terminal = tf.gather(params=self.buffers['terminal'],
                                  indices=last_index)
        is_incorrect = tf.math.equal(x=last_terminal, y=three)
        corrected = tf.cond(pred=is_incorrect,
                            true_fn=correct_terminal,
                            false_fn=tf.no_op)

        with tf.control_dependencies(control_inputs=(corrected, )):
            assertions = [corrected]
            if self.config.create_tf_assertions:
                # general check: all terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.gather(
                            params=tf.math.greater(x=self.buffers['terminal'],
                                                   y=zero),
                            indices=self.terminal_indices[:self.episode_count +
                                                          one])),
                        y=tf_util.constant(value=True, dtype='bool'),
                        message="Memory consistency check."))
                # general check: only terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.math.count_nonzero(
                            input=self.buffers['terminal'],
                            dtype=tf_util.get_dtype(type='int')),
                        y=(self.episode_count + one),
                        message="Memory consistency check."))

        with tf.control_dependencies(control_inputs=assertions):
            return one < zero  # dummy false return value
Example #15
 def function(name, spec, a_value):
     action_value = a_value.apply(x=embedding)
     if spec.type == 'bool':
         shape = (-1,) + spec.shape + (2,)
     elif spec.type == 'int':
         shape = (-1,) + spec.shape + (spec.num_values,)
     action_value = tf.reshape(tensor=action_value, shape=shape)
     if spec.type == 'bool':
         return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
     elif spec.type == 'int':
         mask = auxiliaries[name]['mask']
         min_float = tf_util.get_dtype(type='float').min
         min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
         action_value = tf.where(condition=mask, x=action_value, y=min_float)
         return tf.math.reduce_max(input_tensor=action_value, axis=-1)
Example #16
    def __init__(self,
                 *,
                 layer,
                 l2_regularization=None,
                 name=None,
                 input_spec=None,
                 **kwargs):
        super().__init__(l2_regularization=l2_regularization,
                         name=name,
                         input_spec=input_spec)

        self.keras_layer = getattr(tf.keras.layers, layer)(
            name=name,
            dtype=tf_util.get_dtype(type='float'),
            input_shape=input_spec.shape,
            **kwargs)
Example #17
    def sample(self, *, parameters, temperature):
        true_logit, false_logit, probability = parameters.get(
            ('true_logit', 'false_logit', 'probability'))

        # Distribution parameter summaries
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            return tf.math.reduce_mean(input_tensor=probability, axis=axis)

        name = 'distributions/' + self.name + '-probability'
        dependencies = self.summary(label='distribution',
                                    name=name,
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            one = tf_util.constant(value=1.0, dtype='float')
            entropy = -probability * true_logit - (one -
                                                   probability) * false_logit
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        half = tf_util.constant(value=0.5, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: true if >= 0.5
        definite = tf.greater_equal(x=probability, y=half)

        # Non-deterministic: sample true if >= uniform distribution
        e_true_logit = tf.math.exp(x=(true_logit / temperature))
        e_false_logit = tf.math.exp(x=(false_logit / temperature))
        probability = e_true_logit / (e_true_logit + e_false_logit)
        uniform = tf.random.uniform(shape=tf.shape(input=probability),
                                    dtype=tf_util.get_dtype(type='float'))
        sampled = tf.greater_equal(x=probability, y=uniform)

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.where(condition=(temperature < epsilon),
                            x=definite,
                            y=sampled)
Example #18
                def negate_deltas():
                    neg_two_float = tf_util.constant(value=-2.0, dtype='float')
                    assignments = list()
                    for variable, delta in zip(variables, deltas):
                        if variable.dtype == tf_util.get_dtype(type='float'):
                            assignments.append(
                                variable.assign_add(delta=(neg_two_float *
                                                           delta),
                                                    read_value=False))
                        else:
                            _ng_two_float = tf.constant(value=-2.0,
                                                        dtype=variable.dtype)
                            assignments.append(
                                variable.assign_add(delta=(_ng_two_float *
                                                           delta),
                                                    read_value=False))

                    with tf.control_dependencies(control_inputs=assignments):
                        return [tf.math.negative(x=delta) for delta in deltas]
Example #19
    def parametrize(self, *, x, conditions):
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        shape = (-1, ) + self.action_spec.shape + (
            self.action_spec.num_values, )

        # Action values
        action_values = self.action_values.apply(x=x)
        action_values = tf.reshape(tensor=action_values, shape=shape)

        # States value
        if self.state_value is None:
            # Implicit states value (TODO: experimental)
            states_value = tf.reduce_logsumexp(input_tensor=action_values,
                                               axis=-1)

        else:
            # Explicit states value and advantage-based action values
            states_value = self.state_value.apply(x=x)
            states_value = tf.reshape(tensor=states_value, shape=shape[:-1])
            action_values = tf.expand_dims(input=states_value,
                                           axis=-1) + action_values
            action_values -= tf.math.reduce_mean(input_tensor=action_values,
                                                 axis=-1,
                                                 keepdims=True)

        # Masking (TODO: before or after above?)
        if self.config.enable_int_action_masking:
            min_float = tf.fill(dims=tf.shape(input=action_values),
                                value=tf_util.get_dtype(type='float').min)
            action_values = tf.where(condition=conditions['mask'],
                                     x=action_values,
                                     y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=action_values, axis=-1)

        # "Normalized" logits
        logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

        return TensorDict(logits=logits,
                          probabilities=probabilities,
                          states_value=states_value,
                          action_values=action_values)
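
The final step above recovers log-probabilities from the softmax output as log(max(p, epsilon)); the epsilon floor keeps the logits finite for masked actions whose probability is exactly zero. Reduced to a sketch:

import tensorflow as tf

def normalized_logits(action_values, epsilon=1e-6):
    probabilities = tf.nn.softmax(action_values, axis=-1)
    # finite even where probabilities == 0 (e.g. masked-out actions)
    return tf.math.log(tf.maximum(probabilities, epsilon))
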
Example #20
 def function(name, spec, a_value):
     if name is None:
         x = embedding.get('action-embedding', embedding['embedding'])
     else:
         x = embedding.get(name + '-embedding', embedding['embedding'])
     action_value = a_value.apply(x=x)
     if spec.type == 'bool':
         shape = (-1,) + spec.shape + (2,)
     elif spec.type == 'int':
         shape = (-1,) + spec.shape + (spec.num_values,)
     action_value = tf.reshape(tensor=action_value, shape=shape)
     if spec.type == 'bool':
         return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
     elif spec.type == 'int':
         if self.config.enable_int_action_masking:
             mask = auxiliaries[name]['mask']
             min_float = tf_util.get_dtype(type='float').min
             min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
             action_value = tf.where(condition=mask, x=action_value, y=min_float)
         return tf.math.reduce_max(input_tensor=action_value, axis=-1)
Example #21
        def subsampled_step():
            subsampled_arguments = TensorDict()
            indices = tf.random.uniform(
                shape=(fraction,), maxval=batch_size, dtype=tf_util.get_dtype(type='int')
            )

            if 'states' in arguments and 'horizons' in arguments:
                horizons = tf.gather(params=arguments['horizons'], indices=indices)
                starts = horizons[:, 0]
                lengths = horizons[:, 1]
                states_indices = tf.ragged.range(starts=starts, limits=(starts + lengths)).values
                function = (lambda x: tf.gather(params=x, indices=states_indices))
                subsampled_arguments['states'] = arguments['states'].fmap(function=function)
                starts = tf.math.cumsum(x=lengths, exclusive=True)
                subsampled_arguments['horizons'] = tf.stack(values=(starts, lengths), axis=1)

            for name, argument in arguments.items():
                if name not in subsampled_arguments:
                    subsampled_arguments[name] = tf.gather(params=argument, indices=indices)

            return self.optimizer.step(arguments=subsampled_arguments, **kwargs)
Example #22
    def __init__(self,
                 *,
                 layer,
                 l2_regularization=None,
                 name=None,
                 input_spec=None,
                 **kwargs):
        super().__init__(l2_regularization=l2_regularization,
                         name=name,
                         input_spec=input_spec)

        self.keras_layer = getattr(tf.keras.layers, layer)(
            name=name,
            dtype=tf_util.get_dtype(type='float'),
            input_shape=input_spec.shape,
            **kwargs)

        self.architecture_kwargs['layer'] = str(layer)
        if l2_regularization is not None:
            self.architecture_kwargs['l2_regularization'] = str(
                l2_regularization)
Example #23
    def sample(self, *, parameters, temperature):
        mean, stddev, log_stddev = parameters.get(
            ('mean', 'stddev', 'log_stddev'))

        # Distribution parameter and entropy summaries
        def fn_summary():
            return tf.math.reduce_mean(input_tensor=mean, axis=range(self.action_spec.rank + 1)), \
                tf.math.reduce_mean(input_tensor=stddev, axis=range(self.action_spec.rank + 1))

        prefix = 'distributions/' + self.name
        dependencies = self.summary(label='distribution',
                                    name=(prefix + '-mean',
                                          prefix + '-stddev'),
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            half_log_two_pi_e = tf_util.constant(
                value=(0.5 * np.log(2.0 * np.pi * np.e)), dtype='float')
            entropy = log_stddev + half_log_two_pi_e
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        normal_distribution = tf.random.normal(
            shape=tf.shape(input=mean), dtype=tf_util.get_dtype(type='float'))

        with tf.control_dependencies(control_inputs=dependencies):
            action = mean + stddev * temperature * normal_distribution

            # Bounded transformation
            if self.bounded_transform is not None:
                if self.bounded_transform == 'tanh':
                    action = tf.math.tanh(x=action)

                if self.action_spec.min_value is not None and \
                        self.action_spec.max_value is not None:
                    one = tf_util.constant(value=1.0, dtype='float')
                    half = tf_util.constant(value=0.5, dtype='float')
                    min_value = tf_util.constant(
                        value=self.action_spec.min_value, dtype='float')
                    max_value = tf_util.constant(
                        value=self.action_spec.max_value, dtype='float')
                    action = min_value + (max_value -
                                          min_value) * half * (action + one)

                elif self.action_spec.min_value is not None:
                    min_value = tf_util.constant(
                        value=self.action_spec.min_value, dtype='float')
                    action = tf.maximum(x=min_value, y=action)
                else:
                    assert self.action_spec.max_value is not None
                    max_value = tf_util.constant(
                        value=self.action_spec.max_value, dtype='float')
                    action = tf.minimum(x=max_value, y=action)

            return action
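
The bounded transformation at the end of Example #23 squashes the Gaussian sample with tanh and rescales it from [-1, 1] to the action bounds. The rescaling in isolation (a sketch with illustrative names):

import tensorflow as tf

def rescale_bounded(action, min_value, max_value):
    squashed = tf.math.tanh(action)  # now in [-1, 1]
    return min_value + (max_value - min_value) * 0.5 * (squashed + 1.0)
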
Example #24
    def parametrize(self, *, x, conditions):
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_epsilon = tf_util.constant(value=np.log(util.epsilon),
                                       dtype='float')
        log_two = tf_util.constant(value=np.log(2.0), dtype='float')

        # Action values
        action_values = self.action_values.apply(x=x)
        shape = (-1, ) + self.action_spec.shape + (
            self.action_spec.num_values, )
        action_values = tf.reshape(tensor=action_values, shape=shape)

        # Softplus standard deviation
        if self.temperature_mode == 'global':
            multiples = (tf.shape(input=x)[0], ) + tuple(
                1 for _ in range(self.action_spec.rank + 1))
            softplus_temperature = tf.tile(input=self.softplus_temperature,
                                           multiples=multiples)
        elif self.temperature_mode == 'predicted':
            softplus_temperature = self.softplus_temperature.apply(x=x)
            shape = (-1, ) + self.action_spec.shape + (1, )
            softplus_temperature = tf.reshape(tensor=softplus_temperature,
                                              shape=shape)

        if self.temperature_mode is None:
            # Logits
            logits = action_values

            # Implicit states value
            state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        else:
            # Clip softplus_temperature for numerical stability (epsilon < 1.0, hence negative)
            softplus_temperature = tf.clip_by_value(
                t=softplus_temperature,
                clip_value_min=log_epsilon,
                clip_value_max=-log_epsilon)

            # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
            softplus_shift = tf_util.constant(value=0.2, dtype='float')
            temperature = (tf.nn.softplus(features=softplus_temperature) + softplus_shift) / \
                (log_two + softplus_shift)

            # Logits
            logits = action_values / temperature

            # Implicit states value
            temperature = tf.squeeze(input=temperature, axis=-1)
            state_value = temperature * tf.reduce_logsumexp(
                input_tensor=logits, axis=-1)

        # # Explicit states value and advantage-based action values
        # state_value = self.state_value.apply(x=x)
        # state_value = tf.reshape(tensor=state_value, shape=shape[:-1])
        # action_values = tf.expand_dims(input=state_value, axis=-1) + action_values
        # action_values -= tf.math.reduce_mean(input_tensor=action_values, axis=-1, keepdims=True)

        # Action masking, affects action_values/probabilities/logits but not state_value
        if self.config.enable_int_action_masking:
            min_float = tf.fill(dims=tf.shape(input=action_values),
                                value=tf_util.get_dtype(type='float').min)
            action_values = tf.where(condition=conditions['mask'],
                                     x=action_values,
                                     y=min_float)
            logits = tf.where(condition=conditions['mask'],
                              x=logits,
                              y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # "Normalized" logits
        logits = tf.math.log(x=(probabilities + epsilon))
        # Unstable
        # logits = tf.nn.log_softmax(logits=logits, axis=-1)
        # Doesn't take masking into account
        # logits = action_values - tf.expand_dims(input=state_value, axis=-1) ... / temperature

        if self.temperature_mode is None:
            return TensorDict(probabilities=probabilities,
                              logits=logits,
                              action_values=action_values,
                              state_value=state_value)
        else:
            return TensorDict(probabilities=probabilities,
                              temperature=temperature,
                              logits=logits,
                              action_values=action_values,
                              state_value=state_value)
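
The softplus transformation in Example #24 (based on https://arxiv.org/abs/2007.06059) maps an unconstrained parameter to a strictly positive temperature that equals exactly 1 when the parameter is 0, because softplus(0) = log 2. A NumPy check of that property:

import numpy as np

def temperature(x, shift=0.2):
    return (np.log1p(np.exp(x)) + shift) / (np.log(2.0) + shift)

print(temperature(0.0))  # 1.0
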
Example #25
    def observe(self, *, terminal, reward, parallel):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
        expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel,
                                                                axis=0),
                                           axis=1)
        is_terminal = tf.math.greater(x=terminal[-1], y=zero)

        # Input assertions
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.extend(
                self.terminal_spec.tf_assert(
                    x=terminal,
                    batch_size=batch_size,
                    message='Agent.observe: invalid {issue} for terminal input.'
                ))
            assertions.extend(
                self.reward_spec.tf_assert(
                    x=reward,
                    batch_size=batch_size,
                    message='Agent.observe: invalid {issue} for reward input.'
                ))
            assertions.extend(
                self.parallel_spec.tf_assert(
                    x=parallel,
                    message='Agent.observe: invalid {issue} for parallel input.'
                ))
            # Assertion: at most one terminal
            num_terms = tf.math.count_nonzero(
                input=terminal, dtype=tf_util.get_dtype(type='int'))
            assertions.append(
                tf.debugging.assert_less_equal(
                    x=num_terms,
                    y=one,
                    message=
                    "Agent.observe: input contains more than one terminal."))
            # Assertion: if terminal, last timestep in batch
            assertions.append(
                tf.debugging.assert_equal(
                    x=tf.math.greater(x=num_terms, y=zero),
                    y=is_terminal,
                    message=
                    "Agent.observe: terminal is not the last input timestep."))

        with tf.control_dependencies(control_inputs=assertions):
            dependencies = list()

            # Reward summary
            if self.summaries == 'all' or 'reward' in self.summaries:
                with self.summarizer.as_default():
                    x = tf.math.reduce_mean(input_tensor=reward)
                    dependencies.append(
                        tf.summary.scalar(name='reward',
                                          data=x,
                                          step=self.timesteps))

            # Update episode length/reward
            updates = tf.expand_dims(input=batch_size, axis=0)
            value = tf.tensor_scatter_nd_add(tensor=self.episode_length,
                                             indices=expanded_parallel,
                                             updates=updates)
            dependencies.append(self.episode_length.assign(value=value))
            # sparse_delta = tf.IndexedSlices(values=batch_size, indices=parallel)
            # dependencies.append(self.episode_length.scatter_add(sparse_delta=sparse_delta))
            sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True)
            value = tf.tensor_scatter_nd_add(tensor=self.episode_reward,
                                             indices=expanded_parallel,
                                             updates=sum_reward)
            dependencies.append(self.episode_reward.assign(value=value))
            # sum_reward = tf.math.reduce_sum(input_tensor=reward)
            # sparse_delta = tf.IndexedSlices(values=sum_reward, indices=parallel)
            # dependencies.append(self.episode_reward.scatter_add(sparse_delta=sparse_delta))

            # Core observe (before terminal handling)
            updated = self.core_observe(terminal=terminal,
                                        reward=reward,
                                        parallel=parallel)
            dependencies.append(updated)

        # Handle terminal (after core observe and episode reward)
        with tf.control_dependencies(control_inputs=dependencies):

            def fn_terminal():
                operations = list()

                # Reset internals
                def function(spec, initial):
                    return tf_util.constant(value=initial, dtype=spec.type)

                initials = self.internals_spec.fmap(
                    function=function,
                    cls=TensorDict,
                    zip_values=self.initial_internals)
                for name, previous, initial in self.previous_internals.zip_items(
                        initials):
                    updates = tf.expand_dims(input=initial, axis=0)
                    value = tf.tensor_scatter_nd_update(
                        tensor=previous,
                        indices=expanded_parallel,
                        updates=updates)
                    operations.append(previous.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
                    # operations.append(previous.scatter_update(sparse_delta=sparse_delta))

                # Episode length/reward summaries (before episode reward reset / episodes increment)
                dependencies = list()
                if self.summaries == 'all' or 'reward' in self.summaries:
                    with self.summarizer.as_default():
                        x = tf.gather(params=self.episode_length,
                                      indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-length',
                                              data=x,
                                              step=self.episodes))
                        x = tf.gather(params=self.episode_reward,
                                      indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-reward',
                                              data=x,
                                              step=self.episodes))

                # Reset episode length/reward
                with tf.control_dependencies(control_inputs=dependencies):
                    zeros = tf_util.zeros(shape=(1, ), dtype='int')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_length,
                        indices=expanded_parallel,
                        updates=zeros)
                    operations.append(self.episode_length.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel)
                    # operations.append(self.episode_length.scatter_update(sparse_delta=sparse_delta))
                    zeros = tf_util.zeros(shape=(1, ), dtype='float')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_reward,
                        indices=expanded_parallel,
                        updates=zeros)
                    operations.append(self.episode_reward.assign(value=value))
                    # zero_float = tf_util.constant(value=0.0, dtype='float')
                    # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
                    # operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

                # Increment episodes counter
                operations.append(
                    self.episodes.assign_add(delta=one, read_value=False))

                return tf.group(*operations)

            handle_terminal = tf.cond(pred=is_terminal,
                                      true_fn=fn_terminal,
                                      false_fn=tf.no_op)

        with tf.control_dependencies(control_inputs=(handle_terminal, )):
            episodes = tf_util.identity(input=self.episodes)
            updates = tf_util.identity(input=self.updates)
            return updated, episodes, updates
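
Example #25 keeps per-parallel-environment statistics and updates a single slot by index via tf.tensor_scatter_nd_add. A small illustration with hypothetical values:

import tensorflow as tf

episode_reward = tf.Variable([0.0, 0.0, 0.0])
parallel = 1
value = tf.tensor_scatter_nd_add(
    tensor=episode_reward, indices=[[parallel]], updates=[2.5])
episode_reward.assign(value)  # episode_reward is now [0.0, 2.5, 0.0]
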
Example #26
    def enqueue(self, *, states, internals, auxiliaries, actions, terminal,
                reward):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        three = tf_util.constant(value=3, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')
        num_timesteps = tf_util.cast(x=tf.shape(input=terminal)[0],
                                     dtype='int')

        last_index = tf.math.mod(x=(self.buffer_index - one), y=capacity)

        def correct_terminal():
            # Remove last observation terminal marker
            sparse_delta = tf.IndexedSlices(values=zero, indices=last_index)
            assignment = self.buffers['terminal'].scatter_update(
                sparse_delta=sparse_delta)
            with tf.control_dependencies(control_inputs=(assignment, )):
                return last_index < zero  # dummy false return value

        last_terminal = tf.gather(params=self.buffers['terminal'],
                                  indices=last_index)
        is_incorrect = tf.math.equal(x=last_terminal, y=three)
        corrected = tf.cond(pred=is_incorrect,
                            true_fn=correct_terminal,
                            false_fn=tf.no_op)

        # Assertions
        last_terminal = tf.concat(values=([zero], terminal), axis=0)[-1]
        assertions = [corrected]
        if self.config.create_tf_assertions:
            with tf.control_dependencies(control_inputs=(corrected, )):
                # check: number of timesteps fit into effectively available buffer
                assertions.append(
                    tf.debugging.assert_less_equal(
                        x=num_timesteps,
                        y=capacity,
                        message="Memory does not have enough capacity."))
                # at most one terminal
                assertions.append(
                    tf.debugging.assert_less_equal(
                        x=tf.math.count_nonzero(
                            input=terminal,
                            dtype=tf_util.get_dtype(type='int')),
                        y=one,
                        message="Timesteps contain more than one terminal."))
                # if terminal, last timestep in batch
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.math.reduce_any(
                            input_tensor=tf.math.greater(x=terminal, y=zero)),
                        y=tf.math.greater(x=last_terminal, y=zero),
                        message="Terminal is not the last timestep."))
                # general check: all terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.gather(
                            params=tf.math.greater(x=self.buffers['terminal'],
                                                   y=zero),
                            indices=self.terminal_indices[:self.episode_count +
                                                          one])),
                        y=tf_util.constant(value=True, dtype='bool'),
                        message="Memory consistency check."))
                # general check: only terminal indices true
                assertions.append(
                    tf.debugging.assert_equal(
                        x=tf.math.count_nonzero(
                            input=self.buffers['terminal'],
                            dtype=tf_util.get_dtype(type='int')),
                        y=(self.episode_count + one),
                        message="Memory consistency check."))

        # Buffer indices to overwrite
        with tf.control_dependencies(control_inputs=assertions):
            overwritten_indices = tf.range(start=self.buffer_index,
                                           limit=(self.buffer_index +
                                                  num_timesteps))
            overwritten_indices = tf.math.mod(x=overwritten_indices,
                                              y=capacity)

            # Count number of overwritten episodes
            num_episodes = tf.math.count_nonzero(
                input=tf.gather(params=self.buffers['terminal'],
                                indices=overwritten_indices),
                axis=0,
                dtype=tf_util.get_dtype(type='int'))

            # Shift remaining terminal indices accordingly
            index = self.episode_count + one
            assertions = list()
            if self.config.create_tf_assertions:
                assertions.append(
                    tf.debugging.assert_greater_equal(
                        x=index,
                        y=num_episodes,
                        message="Memory episode overwriting check."))

        with tf.control_dependencies(control_inputs=assertions):
            sparse_delta = tf.IndexedSlices(
                values=self.terminal_indices[num_episodes:index],
                indices=tf.range(index - num_episodes))
            assignment = self.terminal_indices.scatter_update(
                sparse_delta=sparse_delta)

        # Decrement episode count accordingly
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = self.episode_count.assign_sub(delta=num_episodes,
                                                       read_value=False)

        # Write new observations
        with tf.control_dependencies(control_inputs=(assignment, )):
            # Add last observation terminal marker
            corrected_terminal = tf.where(condition=tf.math.equal(
                x=terminal[-1:], y=zero),
                                          x=three,
                                          y=terminal[-1:])
            corrected_terminal = tf.concat(values=(terminal[:-1],
                                                   corrected_terminal),
                                           axis=0)
            values = TensorDict(states=states,
                                internals=internals,
                                auxiliaries=auxiliaries,
                                actions=actions,
                                terminal=corrected_terminal,
                                reward=reward)
            indices = tf.range(start=self.buffer_index,
                               limit=(self.buffer_index + num_timesteps))
            indices = tf.math.mod(x=indices, y=capacity)

            def function(buffer, value):
                sparse_delta = tf.IndexedSlices(values=value, indices=indices)
                return buffer.scatter_update(sparse_delta=sparse_delta)

            assignments = self.buffers.fmap(function=function,
                                            cls=list,
                                            zip_values=values)

        # Increment buffer index
        with tf.control_dependencies(control_inputs=assignments):
            assignment = self.buffer_index.assign_add(delta=num_timesteps,
                                                      read_value=False)

        # Count number of new episodes
        with tf.control_dependencies(control_inputs=(assignment, )):
            num_new_episodes = tf.math.count_nonzero(
                input=terminal, dtype=tf_util.get_dtype(type='int'))

            # Write new terminal indices
            new_terminal_indices = tf.boolean_mask(tensor=overwritten_indices,
                                                   mask=tf.math.greater(
                                                       x=terminal, y=zero))
            start = self.episode_count + one
            sparse_delta = tf.IndexedSlices(
                values=new_terminal_indices,
                indices=tf.range(start=start,
                                 limit=(start + num_new_episodes)))
            assignment = self.terminal_indices.scatter_update(
                sparse_delta=sparse_delta)

        # Increment episode count accordingly
        with tf.control_dependencies(control_inputs=(assignment, )):
            assignment = self.episode_count.assign_add(delta=num_new_episodes)
            return assignment < zero  # dummy false return value
Example #27
    def successors(self, *, indices, horizon, sequence_values, final_values):
        assert isinstance(sequence_values, tuple)
        assert isinstance(final_values, tuple)

        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        capacity = tf_util.constant(value=self.capacity, dtype='int')

        def body(lengths, successor_indices, mask):
            current_index = successor_indices[:, -1:]
            current_terminal = tf.gather(params=self.buffers['terminal'],
                                         indices=current_index)
            is_not_terminal = tf.math.logical_and(x=tf.math.logical_not(
                x=tf.math.greater(x=current_terminal, y=zero)),
                                                  y=mask[:, -1:])
            next_index = tf.math.mod(x=(current_index + one), y=capacity)
            successor_indices = tf.concat(values=(successor_indices,
                                                  next_index),
                                          axis=1)
            mask = tf.concat(values=(mask, is_not_terminal), axis=1)
            is_not_terminal = tf.squeeze(input=is_not_terminal, axis=1)
            zeros = tf.zeros_like(input=is_not_terminal,
                                  dtype=tf_util.get_dtype(type='int'))
            ones = tf.ones_like(input=is_not_terminal,
                                dtype=tf_util.get_dtype(type='int'))
            lengths += tf.where(condition=is_not_terminal, x=ones, y=zeros)
            return lengths, successor_indices, mask

        lengths = tf.ones_like(input=indices,
                               dtype=tf_util.get_dtype(type='int'))
        successor_indices = tf.expand_dims(input=indices, axis=1)
        mask = tf.ones_like(input=successor_indices,
                            dtype=tf_util.get_dtype(type='bool'))
        shape = tf.TensorShape(dims=(None, None))

        lengths, successor_indices, mask = tf.while_loop(
            cond=tf_util.always_true,
            body=body,
            loop_vars=(lengths, successor_indices, mask),
            shape_invariants=(lengths.get_shape(), shape, shape),
            maximum_iterations=tf_util.int32(x=horizon))

        successor_indices = tf.reshape(tensor=successor_indices, shape=(-1, ))
        mask = tf.reshape(tensor=mask, shape=(-1, ))
        successor_indices = tf.boolean_mask(tensor=successor_indices,
                                            mask=mask,
                                            axis=0)

        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(tf.debugging.assert_greater_equal(
                x=tf.math.mod(x=(self.buffer_index - one - successor_indices),
                              y=capacity),
                y=zero, message="Successor check."))

        with tf.control_dependencies(control_inputs=assertions):
            function = (lambda buffer: tf.gather(params=buffer,
                                                 indices=successor_indices))
            values = self.buffers[sequence_values].fmap(function=function,
                                                        cls=TensorDict)
            sequence_values = tuple(values[name] for name in sequence_values)

            starts = tf.math.cumsum(x=lengths, exclusive=True)
            ends = tf.math.cumsum(x=lengths) - one
            final_indices = tf.gather(params=successor_indices, indices=ends)
            function = (
                lambda buffer: tf.gather(params=buffer, indices=final_indices))
            values = self.buffers[final_values].fmap(function=function,
                                                     cls=TensorDict)
            final_values = tuple(values[name] for name in final_values)

        if len(sequence_values) == 0:
            if len(final_values) == 0:
                return lengths
            else:
                return lengths, final_values

        elif len(final_values) == 0:
            return tf.stack(values=(starts, lengths), axis=1), sequence_values

        else:
            return tf.stack(values=(starts, lengths),
                            axis=1), sequence_values, final_values
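
The while_loop above advances each index until a terminal timestep or the horizon cuts it off; `lengths` counts how many indices each trajectory actually keeps. An eager Python sketch of the same traversal for a single start index (the `terminal`, `start`, `horizon` and `capacity` values are illustrative stand-ins):

def successor_indices(terminal, start, horizon, capacity):
    # Walk forward from `start`; stop after a terminal timestep or `horizon` steps.
    indices = [start]
    index = start
    for _ in range(horizon):
        if terminal[index] > 0:   # episode ended here, no valid successor
            break
        index = (index + 1) % capacity
        indices.append(index)
    return indices

# successor_indices(terminal=[0, 0, 1, 0], start=1, horizon=3, capacity=4) -> [1, 2]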
Example #28
 def fn_sample():
     # Non-deterministic: sample action from the Gaussian, scaled by temperature
     # (mean, stddev and temperature are captured from the enclosing sample method)
     normal_distribution = tf.random.normal(
         shape=tf.shape(input=mean),
         dtype=tf_util.get_dtype(type='float'))
     return mean + stddev * temperature * normal_distribution
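
A standalone, eager-mode sketch of the same temperature-scaled Gaussian sampling, with illustrative values for the mean and stddev parameters that the closure above captures from its enclosing scope:

import tensorflow as tf

mean = tf.constant([0.0, 1.0])
stddev = tf.constant([1.0, 0.5])
temperature = 0.1   # temperature -> 0 recovers the deterministic mean

noise = tf.random.normal(shape=tf.shape(mean))
sample = mean + stddev * temperature * noise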
Example #29
    def core_act(self, *, states, internals, auxiliaries, parallel,
                 deterministic, independent):
        assert len(internals) == 0

        actions = TensorDict()
        for name, spec in self.actions_spec.items():
            batch_size = tf_util.cast(x=tf.shape(input=states.value())[:1],
                                      dtype='int')
            action_shape = tf_util.constant(value=spec.shape, dtype='int')
            shape = tf.concat(values=(batch_size, action_shape), axis=0)

            if spec.type == 'bool':
                # Random bool action: uniform[True, False]
                half = tf_util.constant(value=0.5, dtype='float')
                uniform = tf.random.uniform(
                    shape=shape, dtype=tf_util.get_dtype(type='float'))
                actions[name] = (uniform < half)

            elif self.config.enable_int_action_masking and spec.type == 'int' and \
                    spec.num_values is not None:
                # Random masked action: uniform[unmasked]
                # (Similar code as for Model.apply_exploration)
                mask = auxiliaries[name]['mask']
                choices = tf_util.constant(
                    value=list(range(spec.num_values)), dtype=spec.type,
                    shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)))
                one = tf_util.constant(value=1, dtype='int', shape=(1, ))
                multiples = tf.concat(values=(shape, one), axis=0)
                choices = tf.tile(input=choices, multiples=multiples)
                choices = tf.boolean_mask(tensor=choices, mask=mask)
                mask = tf_util.cast(x=mask, dtype='int')
                num_valid = tf.math.reduce_sum(input_tensor=mask,
                                               axis=(spec.rank + 1))
                num_valid = tf.reshape(tensor=num_valid, shape=(-1, ))
                masked_offset = tf.math.cumsum(x=num_valid,
                                               axis=0,
                                               exclusive=True)
                uniform = tf.random.uniform(
                    shape=shape, dtype=tf_util.get_dtype(type='float'))
                uniform = tf.reshape(tensor=uniform, shape=(-1, ))
                num_valid = tf_util.cast(x=num_valid, dtype='float')
                random_offset = tf.dtypes.cast(x=(uniform * num_valid),
                                               dtype=tf.dtypes.int64)
                action = tf.gather(params=choices,
                                   indices=(masked_offset + random_offset))
                actions[name] = tf.reshape(tensor=action, shape=shape)

            elif spec.type != 'bool' and spec.min_value is not None:
                if spec.max_value is not None:
                    # Random bounded action: uniform[min_value, max_value]
                    actions[name] = tf.random.uniform(shape=shape,
                                                      minval=spec.min_value,
                                                      maxval=spec.max_value,
                                                      dtype=spec.tf_type())

                else:
                    # Random left-bounded action: not implemented
                    raise NotImplementedError

            elif spec.type != 'bool' and spec.max_value is not None:
                # Random right-bounded action: not implemented
                raise NotImplementedError

            else:
                # Random unbounded int/float action
                actions[name] = tf.random.normal(shape=shape,
                                                 dtype=spec.tf_type())

        return actions, TensorDict()
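
The masked-int branch above relies on a flatten-and-offset trick: all unmasked choices are concatenated into one flat tensor, and a per-row uniform index plus a cumulative offset picks one valid action per row. A standalone sketch of the trick with a hypothetical 2x4 mask:

import tensorflow as tf

mask = tf.constant([[True, False, True, True],
                    [False, True, False, False]])
num_rows = tf.shape(mask)[0]
num_values = tf.shape(mask)[1]
choices = tf.tile(tf.range(num_values)[None, :], multiples=[num_rows, 1])

valid = tf.boolean_mask(choices, mask)                       # flat valid actions: [0, 2, 3, 1]
num_valid = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)   # valid count per row: [3, 1]
offsets = tf.cumsum(num_valid, exclusive=True)               # start of each row in `valid`: [0, 3]

uniform = tf.random.uniform(shape=[num_rows])                # in [0, 1), so index < num_valid
index = tf.cast(uniform * tf.cast(num_valid, tf.float32), tf.int32)
actions = tf.gather(valid, offsets + index)                  # one unmasked action per row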
Example #30
    def update(self, *, arguments, variables, **kwargs):
        assert self.is_initialized_given_variables
        assert all(variable.dtype.is_floating for variable in variables)

        deltas = self.step(arguments=arguments, variables=variables, **kwargs)

        assertions = list(deltas)
        if self.config.create_debug_assertions:
            from tensorforce.core.optimizers import DoublecheckStep, NaturalGradient, \
                Synchronization, UpdateModifier
            optimizer = self
            while isinstance(optimizer, UpdateModifier):
                if isinstance(optimizer, DoublecheckStep):
                    break
                optimizer = optimizer.optimizer
            if not isinstance(optimizer, DoublecheckStep) and (
                    not isinstance(optimizer, NaturalGradient)
                    or not optimizer.only_positive_updates) and (
                        not isinstance(self, Synchronization)
                        or self.sync_frequency is None):
                for delta, variable in zip(deltas, variables):
                    if '_distribution/mean/linear/' in variable.name:
                        # Gaussian.state_value does not use mean
                        continue
                    # if variable.name.endswith('/bias:0') and isinstance(self, Synchronization) \
                    #         and self.root.updates.numpy() == 0:
                    #     # Initialization values are equivalent for bias
                    #     continue
                    # Each step must actually change the variable, unless all
                    # rewards in the batch are zero
                    has_nonzero_delta = tf.math.greater(
                        x=tf.math.count_nonzero(
                            input=delta, dtype=tf_util.get_dtype(type='int')),
                        y=tf_util.constant(value=0, dtype='int'))
                    all_rewards_zero = tf.reduce_all(
                        input_tensor=tf.math.equal(
                            x=arguments['reward'],
                            y=tf_util.constant(value=0.0, dtype='float')))
                    assertions.append(tf.debugging.assert_equal(
                        x=tf.math.logical_or(x=has_nonzero_delta,
                                             y=all_rewards_zero),
                        y=tf_util.constant(value=True, dtype='bool'),
                        message=variable.name))

        with tf.control_dependencies(control_inputs=assertions):
            dependencies = list()

            if self.root.summaries == 'all' or 'update-norm' in self.root.summaries:
                with self.root.summarizer.as_default():
                    x = tf.linalg.global_norm(t_list=[
                        tf_util.cast(x=delta, dtype='float')
                        for delta in deltas
                    ])
                    dependencies.append(
                        tf.summary.scalar(name='update-norm',
                                          data=x,
                                          step=self.root.updates))

            if self.root.summaries == 'all' or 'updates' in self.root.summaries:
                with self.root.summarizer.as_default():
                    for var in variables:
                        assert var.name.startswith(self.root.name + '/')
                        assert var.name[-2:] == ':0'
                        base_name = var.name[len(self.root.name) + 1:-2]
                        mean_name = base_name + '-mean'
                        var_name = base_name + '-variance'
                        mean, variance = tf.nn.moments(
                            x=var, axes=list(range(tf_util.rank(x=var))))
                        dependencies.append(
                            tf.summary.scalar(name=mean_name,
                                              data=mean,
                                              step=self.root.updates))
                        dependencies.append(
                            tf.summary.scalar(name=var_name,
                                              data=variance,
                                              step=self.root.updates))

        with tf.control_dependencies(control_inputs=dependencies):
            return tf_util.identity(
                input=tf_util.constant(value=True, dtype='bool'))
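
The 'update-norm' summary above collapses all per-variable deltas into one scalar via the global norm, i.e. the l2 norm over every entry of every delta tensor. A standalone check with made-up deltas:

import tensorflow as tf

deltas = [tf.constant([0.1, -0.2]), tf.constant([[0.3]])]
# sqrt(0.1**2 + 0.2**2 + 0.3**2) = sqrt(0.14) ~= 0.374
update_norm = tf.linalg.global_norm(deltas)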