Example #1
    def sample(self, *, parameters, temperature):
        logits, probabilities, action_values = parameters.get(
            ('logits', 'probabilities', 'action_values'))

        # Distribution parameter summaries
        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
            return [probs[n] for n in range(self.action_spec.num_values)]

        prefix = 'distributions/' + self.name + '-probability'
        names = [prefix + str(n) for n in range(self.action_spec.num_values)]
        dependencies = self.summary(label='distribution',
                                    name=names,
                                    data=fn_summary,
                                    step='timesteps')

        # Entropy summary
        def fn_summary():
            entropy = -tf.reduce_sum(input_tensor=(probabilities * logits),
                                     axis=-1)
            return tf.math.reduce_mean(input_tensor=entropy)

        name = 'entropies/' + self.name
        dependencies.extend(
            self.summary(label='entropy',
                         name=name,
                         data=fn_summary,
                         step='timesteps'))

        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')

        # Deterministic: maximum likelihood action
        definite = tf.argmax(input=action_values, axis=-1)
        definite = tf_util.cast(x=definite, dtype='int')

        # Set logits to minimal value
        min_float = tf.fill(dims=tf.shape(input=logits),
                            value=tf_util.get_dtype(type='float').min)
        logits = logits / temperature
        logits = tf.where(condition=(probabilities < epsilon),
                          x=min_float,
                          y=logits)

        # Non-deterministic: sample action using Gumbel distribution
        uniform_distribution = tf.random.uniform(
            shape=tf.shape(input=logits),
            minval=epsilon,
            maxval=(one - epsilon),
            dtype=tf_util.get_dtype(type='float'))
        gumbel_distribution = -tf.math.log(
            x=-tf.math.log(x=uniform_distribution))
        sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
        sampled = tf_util.cast(x=sampled, dtype='int')

        with tf.control_dependencies(control_inputs=dependencies):
            return tf.where(condition=(temperature < epsilon),
                            x=definite,
                            y=sampled)
Example #2
    def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
        assert len(internals) == 0

        actions = TensorDict()
        for name, spec in self.actions_spec.items():
            shape = tf.concat(values=(
                tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int'),
                tf_util.constant(value=spec.shape, dtype='int')
            ), axis=0)

            if self.action_values is not None and name in self.action_values:
                # If user-specified, choose given action
                action = tf_util.constant(value=self.action_values[name], dtype=spec.type)
                actions[name] = tf.fill(dims=shape, value=action)

            elif self.config.enable_int_action_masking and spec.type == 'int' and \
                    spec.num_values is not None:
                # If masking, choose first unmasked action
                mask = auxiliaries[name]['mask']
                choices = tf_util.constant(
                    value=list(range(spec.num_values)), dtype='int',
                    shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values))
                )
                one = tf_util.constant(value=1, dtype='int', shape=(1,))
                multiples = tf.concat(values=(shape, one), axis=0)
                choices = tf.tile(input=choices, multiples=multiples)
                choices = tf.boolean_mask(tensor=choices, mask=mask)
                mask = tf_util.cast(x=mask, dtype='int')
                num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1))
                num_valid = tf.reshape(tensor=num_valid, shape=(-1,))
                masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True)
                action = tf.gather(params=choices, indices=masked_offset)
                actions[name] = tf.reshape(tensor=action, shape=shape)

            elif spec.type != 'bool' and spec.min_value is not None:
                if spec.max_value is not None:
                    # If min/max_value given, choose mean action
                    action = spec.min_value + 0.5 * (spec.max_value - spec.min_value)
                    action = tf_util.constant(value=action, dtype=spec.type)
                    actions[name] = tf.fill(dims=shape, value=action)

                else:
                    # If only min_value given, choose min_value
                    action = tf_util.constant(value=spec.min_value, dtype=spec.type)
                    actions[name] = tf.fill(dims=shape, value=action)

            elif spec.type != 'bool' and spec.max_value is not None:
                # If only max_value given, choose max_value
                action = tf_util.constant(value=spec.max_value, dtype=spec.type)
                actions[name] = tf.fill(dims=shape, value=action)

            else:
                # Else choose zero
                actions[name] = tf_util.zeros(shape=shape, dtype=spec.type)

        return actions, TensorDict()
Example #3
    def apply(self, *, x):
        x = tf_util.float32(x=x)
        x = self.rnn(inputs=x, initial_state=None)

        if not self.return_final_state:
            x = tf_util.cast(x=x[0], dtype='float')
        elif self.cell_type == 'gru':
            x = tf_util.cast(x=x[1], dtype='float')
        elif self.cell_type == 'lstm':
            x = tf_util.cast(x=tf.concat(values=(x[1], x[2]), axis=1), dtype='float')

        return super().apply(x=x)
Example #4
 def fn_summary():
     one = tf_util.constant(value=1.0, dtype='float')
     digamma_alpha = tf_util.cast(
         x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float'
     )
     digamma_beta = tf_util.cast(x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
     digamma_alpha_beta = tf_util.cast(
         x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float'
     )
     entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
         (alpha_beta - one - one) * digamma_alpha_beta
     return tf.math.reduce_mean(input_tensor=entropy)
Example #5
    def entropy(self, *, parameters):
        alpha, beta, alpha_beta, log_norm = parameters.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm'))

        one = tf_util.constant(value=1.0, dtype='float')

        digamma_alpha = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float')
        digamma_beta = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
        digamma_alpha_beta = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float')

        return log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
            (alpha_beta - one - one) * digamma_alpha_beta
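Here alpha_beta is α+β and log_norm is log B(α, β), so the return value is the closed-form differential entropy of a Beta distribution: H = ln B(α, β) − (α−1)ψ(α) − (β−1)ψ(β) + (α+β−2)ψ(α+β). A short SciPy check of the formula, with hand-picked illustrative parameters:

    from scipy import special, stats

    alpha, beta = 2.0, 5.0
    log_norm = special.betaln(alpha, beta)  # log B(alpha, beta)

    entropy = (log_norm
               - (beta - 1.0) * special.digamma(beta)
               - (alpha - 1.0) * special.digamma(alpha)
               + (alpha + beta - 2.0) * special.digamma(alpha + beta))

    print(entropy)                            # closed form, as in the code above
    print(stats.beta(alpha, beta).entropy())  # SciPy reference; the values agree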
Example #6
    def parameter_value(self, *, step):
        parameter = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
            boundaries=self.boundaries, values=self.values)(step=step)

        parameter = tf_util.cast(x=parameter, dtype=self.spec.type)

        return parameter
Example #7
    def log_probability(self, *, parameters, action):
        mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

        # Inverse bounded transformation
        if self.bounded_transform is not None:
            if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
                one = tf_util.constant(value=1.0, dtype='float')
                two = tf_util.constant(value=2.0, dtype='float')
                min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
                max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
                action = two * (action - min_value) / (max_value - min_value) - one

            if self.bounded_transform == 'tanh':
                clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
                action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
                action = tf_util.cast(x=tf.math.atanh(x=tf_util.float32(x=action)), dtype='float')

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        half = tf_util.constant(value=0.5, dtype='float')
        half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')

        sq_mean_distance = tf.square(x=(action - mean))
        sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

        log_prob = -half * sq_mean_distance / sq_stddev - log_stddev - half_log_two_pi

        if self.bounded_transform == 'tanh':
            # Define two here as well, since the min/max branch above may not have run
            two = tf_util.constant(value=2.0, dtype='float')
            log_two = tf_util.constant(value=np.log(2.0), dtype='float')
            log_prob -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

        return log_prob
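The core term is the standard Gaussian log-density, log p(a) = −(a − μ)²/(2σ²) − log σ − ½ log 2π, and the final subtraction is the tanh change-of-variables correction familiar from SAC-style squashed Gaussians. A quick NumPy/SciPy check of the unsquashed part, with illustrative values:

    import numpy as np
    from scipy import stats

    mean, stddev, action = 0.3, 0.8, 1.1
    log_prob = (-0.5 * (action - mean) ** 2 / stddev ** 2
                - np.log(stddev) - 0.5 * np.log(2.0 * np.pi))

    print(log_prob)                                 # closed form, as in the code above
    print(stats.norm(mean, stddev).logpdf(action))  # SciPy reference; the values agree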
Example #8
 def apply_l2_regularization():
     l2_variables = list()
     for variable in self.this_trainable_variables:
         variable = tf_util.cast(x=variable, dtype='float')
         l2_variables.append(
             tf.reduce_sum(input_tensor=tf.square(x=variable)))
     return l2_regularization * tf.math.add_n(inputs=l2_variables)
Example #9
    def kl_divergence(self, *, parameters1, parameters2):
        alpha1, beta1, alpha_beta1, log_norm1 = parameters1.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm'))
        alpha2, beta2, alpha_beta2, log_norm2 = parameters2.get(
            ('alpha', 'beta', 'alpha_beta', 'log_norm'))

        digamma_alpha1 = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha1)), dtype='float')
        digamma_beta1 = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=beta1)), dtype='float')
        digamma_alpha_beta1 = tf_util.cast(
            x=tf.math.digamma(x=tf_util.float32(x=alpha_beta1)), dtype='float')

        return log_norm2 - log_norm1 - digamma_beta1 * (beta2 - beta1) - \
            digamma_alpha1 * (alpha2 - alpha1) + digamma_alpha_beta1 * \
            (alpha_beta2 - alpha_beta1)
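This is the closed-form KL divergence between two Beta distributions, expressed through digamma terms of the first distribution's parameters. A SciPy sketch that checks it against numerical integration of E_p1[log p1 − log p2], with illustrative parameters:

    from scipy import integrate, special, stats

    a1, b1, a2, b2 = 2.0, 5.0, 3.0, 3.0

    kl = (special.betaln(a2, b2) - special.betaln(a1, b1)
          - special.digamma(b1) * (b2 - b1)
          - special.digamma(a1) * (a2 - a1)
          + special.digamma(a1 + b1) * ((a2 + b2) - (a1 + b1)))

    p1, p2 = stats.beta(a1, b1), stats.beta(a2, b2)
    reference, _ = integrate.quad(
        lambda x: p1.pdf(x) * (p1.logpdf(x) - p2.logpdf(x)), 0.0, 1.0)
    print(kl, reference)  # closed form and numerical integral agree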
Example #10
    def step(self, *, arguments, variables, fn_loss, **kwargs):
        learning_rate = self.learning_rate.value()

        unperturbed_loss = fn_loss(**arguments.to_kwargs())

        deltas = [tf.zeros_like(input=variable) for variable in variables]
        previous_perturbations = [
            tf.zeros_like(input=variable) for variable in variables
        ]

        def body(deltas, previous_perturbations):
            with tf.control_dependencies(control_inputs=deltas):
                perturbations = [
                    learning_rate *
                    tf.random.normal(shape=tf_util.shape(x=variable),
                                     dtype=tf_util.get_dtype(type='float'))
                    for variable in variables
                ]
                perturbation_deltas = [
                    pert - prev_pert for pert, prev_pert in zip(
                        perturbations, previous_perturbations)
                ]
                assignments = list()
                for variable, delta in zip(variables, perturbation_deltas):
                    assignments.append(
                        variable.assign_add(delta=delta, read_value=False))

            with tf.control_dependencies(control_inputs=assignments):
                perturbed_loss = fn_loss(**arguments.to_kwargs())
                direction = tf.math.sign(x=(unperturbed_loss - perturbed_loss))
                deltas = [
                    delta + direction * perturbation
                    for delta, perturbation in zip(deltas, perturbations)
                ]

            return deltas, perturbations

        num_samples = self.num_samples.value()
        deltas, perturbations = tf.while_loop(
            cond=tf_util.always_true,
            body=body,
            loop_vars=(deltas, previous_perturbations),
            maximum_iterations=tf_util.int32(x=num_samples))

        with tf.control_dependencies(control_inputs=deltas):
            num_samples = tf_util.cast(x=num_samples, dtype='float')
            deltas = [delta / num_samples for delta in deltas]
            perturbation_deltas = [
                delta - pert for delta, pert in zip(deltas, perturbations)
            ]
            assignments = list()
            for variable, delta in zip(variables, perturbation_deltas):
                assignments.append(
                    variable.assign_add(delta=delta, read_value=False))

        with tf.control_dependencies(control_inputs=assignments):
            # Trivial operation to enforce control dependency
            return [tf_util.identity(input=delta) for delta in deltas]
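This optimizer estimates an update direction without gradients: it repeatedly perturbs the variables with Gaussian noise scaled by the learning rate, weights each perturbation by the sign of the resulting loss change, and averages over the samples. A compact NumPy sketch of the same idea, assuming a single flat parameter vector instead of a list of TF variables:

    import numpy as np

    def evolutionary_step(theta, loss_fn, learning_rate=0.1, num_samples=16, rng=None):
        rng = rng or np.random.default_rng()
        unperturbed_loss = loss_fn(theta)
        delta = np.zeros_like(theta)
        for _ in range(num_samples):
            perturbation = learning_rate * rng.normal(size=theta.shape)
            # Move along the perturbation if it reduced the loss, against it otherwise
            direction = np.sign(unperturbed_loss - loss_fn(theta + perturbation))
            delta += direction * perturbation
        return theta + delta / num_samples

    rng = np.random.default_rng(0)
    theta = np.array([3.0, -2.0])
    for _ in range(200):
        theta = evolutionary_step(theta, lambda t: np.sum(t ** 2), rng=rng)
    print(theta)  # hovers near the minimum at the origin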
Example #11
    def apply(self, *, x, horizons, internals, deterministic, independent):
        if x.is_singleton():
            inputs = x.singleton()
        else:
            inputs = list(x.values())

        x = self.keras_model(inputs=inputs, training=(not independent))

        return tf_util.cast(x=x, dtype='float'), internals
Example #12
    def mode(self, *, parameters, independent):
        if self.temperature_mode is None:
            probabilities, action_values = parameters.get(
                ('probabilities', 'action_values'))
        else:
            probabilities, temperature, action_values = parameters.get(
                ('probabilities', 'temperature', 'action_values'))

        # Distribution parameter summaries
        dependencies = list()
        if not independent:

            def fn_summary():
                axis = range(self.action_spec.rank + 1)
                probs = tf.math.reduce_mean(input_tensor=probabilities,
                                            axis=axis)
                probs = [probs[n] for n in range(self.action_spec.num_values)]
                if self.temperature_mode is not None:
                    probs.append(
                        tf.math.reduce_mean(input_tensor=temperature,
                                            axis=axis))
                return probs

            prefix = 'distributions/' + self.name + '-probability'
            names = [
                prefix + str(n) for n in range(self.action_spec.num_values)
            ]
            if self.temperature_mode is not None:
                names.append('distributions/' + self.name + '-temperature')
            dependencies.extend(
                self.summary(label='distribution',
                             name=names,
                             data=fn_summary,
                             step='timesteps'))

        # Distribution parameter tracking
        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=probabilities, axis=0)

        dependencies.extend(
            self.track(label='distribution',
                       name='probabilities',
                       data=fn_tracking))

        if self.temperature_mode is not None:

            def fn_tracking():
                return tf.math.reduce_mean(input_tensor=temperature, axis=0)

            dependencies.extend(
                self.track(label='distribution',
                           name='temperature',
                           data=fn_tracking))

        with tf.control_dependencies(control_inputs=dependencies):
            action = tf.math.argmax(input=action_values, axis=-1)
            return tf_util.cast(x=action, dtype='int')
Example #13
    def parameter_value(self, *, step):
        delta = self.theta * (self.mu - self.process) + self.sigma * tf.random.normal(shape=())
        if self.absolute:
            parameter = self.process.assign(value=tf.math.abs(x=(self.process + delta)))
        else:
            parameter = self.process.assign_add(delta=delta)

        parameter = tf_util.cast(x=parameter, dtype=self.spec.type)

        return parameter
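parameter_value above advances an Ornstein-Uhlenbeck process: the stored value mean-reverts toward mu at rate theta, with Gaussian noise of scale sigma added at each step (a common choice for temporally correlated exploration noise, e.g. in DDPG). A standalone NumPy sketch of the same recursion:

    import numpy as np

    def ornstein_uhlenbeck(theta=0.15, mu=0.0, sigma=0.3, steps=1000, rng=None):
        rng = rng or np.random.default_rng(0)
        x = np.empty(steps)
        x[0] = mu
        for t in range(1, steps):
            # Same recursion as above: mean-reversion toward mu plus Gaussian noise
            x[t] = x[t - 1] + theta * (mu - x[t - 1]) + sigma * rng.normal()
        return x

    process = ornstein_uhlenbeck()
    print(process.mean(), process.std())  # stays centered near mu with bounded spread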
Example #14
    def iterative_apply(self, *, x, internals):
        x = tf_util.float32(x=x)
        state = tf_util.float32(x=internals['state'])

        if self.cell_type == 'gru':
            state = (state, )
        elif self.cell_type == 'lstm':
            state = (state[:, 0, :], state[:, 1, :])

        x, state = self.cell(inputs=x, states=state)

        if self.cell_type == 'gru':
            state = state[0]
        elif self.cell_type == 'lstm':
            state = tf.stack(values=state, axis=1)

        x = tf_util.cast(x=x, dtype='float')
        internals['state'] = tf_util.cast(x=state, dtype='float')

        return x, internals
Example #15
    def apply(self, *, x):
        output_shape = tf.concat(values=[
            tf_util.cast(x=tf.shape(input=x)[:1], dtype='int'),
            tf_util.constant(value=self.output_shape, dtype='int')
        ], axis=0)
        x = tf.nn.conv2d_transpose(
            input=x, filters=self.weights, output_shape=tf_util.int32(x=output_shape),
            strides=self.stride, padding=self.padding.upper(), dilations=self.dilation
        )

        return super().apply(x=x)
Example #16
    def step(self, *, arguments, **kwargs):
        if not self.is_fraction_absolute and self.fraction.is_constant(value=1.0):
            return self.optimizer.step(arguments=arguments, **kwargs)

        batch_size = tf_util.cast(x=tf.shape(input=arguments['reward'])[0], dtype='int')
        if self.is_fraction_absolute:
            fraction = self.fraction.is_constant()
            if fraction is None:
                fraction = self.fraction.value()
        else:
            fraction = self.fraction.value() * tf_util.cast(x=batch_size, dtype='float')
            fraction = tf_util.cast(x=fraction, dtype='int')
            one = tf_util.constant(value=1, dtype='int')
            fraction = tf.math.maximum(x=fraction, y=one)

        def subsampled_step():
            subsampled_arguments = TensorDict()
            indices = tf.random.uniform(
                shape=(fraction,), maxval=batch_size, dtype=tf_util.get_dtype(type='int')
            )

            if 'states' in arguments and 'horizons' in arguments:
                horizons = tf.gather(params=arguments['horizons'], indices=indices)
                starts = horizons[:, 0]
                lengths = horizons[:, 1]
                states_indices = tf.ragged.range(starts=starts, limits=(starts + lengths)).values
                function = (lambda x: tf.gather(params=x, indices=states_indices))
                subsampled_arguments['states'] = arguments['states'].fmap(function=function)
                starts = tf.math.cumsum(x=lengths, exclusive=True)
                subsampled_arguments['horizons'] = tf.stack(values=(starts, lengths), axis=1)

            for name, argument in arguments.items():
                if name not in subsampled_arguments:
                    subsampled_arguments[name] = tf.gather(params=argument, indices=indices)

            return self.optimizer.step(arguments=subsampled_arguments, **kwargs)

        def normal_step():
            return self.optimizer.step(arguments=arguments, **kwargs)

        return tf.cond(pred=(fraction < batch_size), true_fn=subsampled_step, false_fn=normal_step)
Example #17
    def mode(self, *, parameters):
        probabilities, action_values = parameters.get(('probabilities', 'action_values'))

        # Distribution parameter tracking
        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=probabilities, axis=0)

        dependencies = self.track(label='distribution', name='probabilities', data=fn_tracking)

        with tf.control_dependencies(control_inputs=dependencies):
            action = tf.math.argmax(input=action_values, axis=-1)
            return tf_util.cast(x=action, dtype='int')
Example #18
    def apply(self, *, x, independent):
        dependencies = list()

        if independent:
            mean = self.moving_mean
            variance = self.moving_variance

        else:
            one = tf_util.constant(value=1.0, dtype='float')
            axes = (0, ) + tuple(1 + axis for axis in self.axes)

            decay = self.decay.value()
            batch_size = tf_util.cast(x=tf.shape(input=x)[0], dtype='float')
            decay = tf.math.pow(x=decay, y=batch_size)
            condition = tf.math.logical_or(x=self.after_first_call,
                                           y=tf.math.equal(x=batch_size, y=0))

            mean = tf.math.reduce_mean(input_tensor=x,
                                       axis=axes,
                                       keepdims=True)
            mean = tf.where(condition=condition,
                            x=(decay * self.moving_mean +
                               (one - decay) * mean),
                            y=mean)

            variance = tf.reduce_mean(input_tensor=tf.math.squared_difference(
                x=x, y=mean),
                                      axis=axes,
                                      keepdims=True)
            variance = tf.where(condition=condition,
                                x=(decay * self.moving_variance +
                                   (one - decay) * variance),
                                y=variance)

            with tf.control_dependencies(control_inputs=(mean, variance)):
                value = tf.math.logical_or(x=self.after_first_call,
                                           y=(batch_size > 0))
                dependencies.append(
                    self.after_first_call.assign(value=value,
                                                 read_value=False))

            mean = self.moving_mean.assign(value=mean)
            variance = self.moving_variance.assign(value=variance)

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

        with tf.control_dependencies(control_inputs=dependencies):
            x = (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(
                input=reciprocal_stddev)

        return x
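This layer keeps exponential moving averages of mean and variance (with the decay raised to the batch size), uses the batch statistics directly on the first call, and normalizes with gradients stopped through the running statistics. A minimal NumPy sketch of the moving-average update, assuming a single feature axis and no TF control dependencies:

    import numpy as np

    def ema_normalize(x, state, decay=0.99):
        batch_decay = decay ** x.shape[0]  # as in decay ** batch_size above
        mean = x.mean(axis=0, keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=0, keepdims=True)
        if state['initialized']:  # blend with running statistics after the first call
            mean = batch_decay * state['mean'] + (1.0 - batch_decay) * mean
            variance = batch_decay * state['variance'] + (1.0 - batch_decay) * variance
        state.update(mean=mean, variance=variance, initialized=True)
        return (x - mean) / np.sqrt(np.maximum(variance, 1e-6))

    state = dict(mean=None, variance=None, initialized=False)
    x = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=(32, 3))
    print(ema_normalize(x, state).mean(axis=0))  # approximately zero per feature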
Example #19
    def independent_act(self, *, states, internals=None, auxiliaries=None):
        if internals is None:
            assert len(self.internals_spec) == 0
            internals = TensorDict()
        if auxiliaries is None:
            assert len(self.auxiliaries_spec) == 0
            auxiliaries = TensorDict()
        true = tf_util.constant(value=True, dtype='bool')
        batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int')

        # Input assertions
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.extend(self.states_spec.tf_assert(
                x=states, batch_size=batch_size,
                message='Agent.independent_act: invalid {issue} for {name} state input.'
            ))
            assertions.extend(self.internals_spec.tf_assert(
                x=internals, batch_size=batch_size,
                message='Agent.independent_act: invalid {issue} for {name} internal input.'
            ))
            assertions.extend(self.auxiliaries_spec.tf_assert(
                x=auxiliaries, batch_size=batch_size,
                message='Agent.independent_act: invalid {issue} for {name} input.'
            ))
            # Mask assertions
            if self.config.enable_int_action_masking:
                for name, spec in self.actions_spec.items():
                    if spec.type == 'int':
                        assertions.append(tf.debugging.assert_equal(
                            x=tf.reduce_all(input_tensor=tf.math.reduce_any(
                                input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1)
                            )), y=true,
                            message="Agent.independent_act: at least one action has to be valid."
                        ))

        with tf.control_dependencies(control_inputs=assertions):
            # Core act
            parallel = tf_util.zeros(shape=(1,), dtype='int')
            actions, internals = self.core_act(
                states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
                deterministic=true, independent=True
            )
            # Skip action assertions

            # SavedModel requires flattened output
            if len(self.internals_spec) > 0:
                return OrderedDict(TensorDict(actions=actions, internals=internals))
            else:
                return OrderedDict(actions)
Example #20
        def fn_sample():
            # Set logits to minimal value
            min_float = tf.fill(dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
            temp_logits = logits / tf.math.maximum(x=temperature, y=epsilon)
            temp_logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=temp_logits)

            # Non-deterministic: sample action using Gumbel distribution
            one = tf_util.constant(value=1.0, dtype='float')
            uniform_distribution = tf.random.uniform(
                shape=tf.shape(input=temp_logits), minval=epsilon, maxval=(one - epsilon),
                dtype=tf_util.get_dtype(type='float')
            )
            # Second log numerically stable since log(1-eps) ~ -eps
            gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
            action = tf.math.argmax(input=(temp_logits + gumbel_distribution), axis=-1)
            return tf_util.cast(x=action, dtype='int')
Example #21
    def apply(self, *, x):
        queries = self.query.apply(x=x)
        keys = self.key.apply(x=x)
        values = self.value.apply(x=x)

        if self.input_spec.rank > 2:
            batch_size = tf_util.cast(x=tf.shape(input=x)[:1], dtype='int')

            flattened_shape = tf_util.constant(
                value=(util.product(xs=self.input_spec.shape[:-1]),
                       self.attention_size),
                dtype='int')
            flattened_shape = tf.concat(values=(batch_size, flattened_shape),
                                        axis=0)
            queries = tf.reshape(tensor=queries, shape=flattened_shape)
            keys = tf.reshape(tensor=keys, shape=flattened_shape)

            flattened_shape = tf_util.constant(
                value=(util.product(xs=self.input_spec.shape[:-1]), self.size),
                dtype='int')
            flattened_shape = tf.concat(values=(batch_size, flattened_shape),
                                        axis=0)
            values = tf.reshape(tensor=values, shape=flattened_shape)

        attention = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)
        attention = attention / tf_util.constant(
            value=np.sqrt(self.attention_size), dtype='float')
        attention = tf.nn.softmax(logits=attention, axis=-1)
        x = tf.linalg.matmul(a=attention, b=values)

        if self.input_spec.rank > 2:
            shape = tf_util.constant(value=self.output_spec().shape,
                                     dtype='int')
            shape = tf.concat(values=(batch_size, shape), axis=0)
            x = tf.reshape(tensor=x, shape=shape)

        return super().apply(x=x)
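Aside from the flattening of higher-rank inputs, this is standard scaled dot-product attention: softmax(Q K^T / sqrt(d)) V. A minimal NumPy sketch of that core computation, with illustrative shapes:

    import numpy as np

    def scaled_dot_product_attention(queries, keys, values):
        scores = queries @ keys.transpose(0, 2, 1) / np.sqrt(queries.shape[-1])
        weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
        weights /= weights.sum(axis=-1, keepdims=True)  # softmax over the key axis
        return weights @ values

    x = np.random.default_rng(0).normal(size=(2, 5, 8))  # (batch, length, size)
    print(scaled_dot_product_attention(x, x, x).shape)   # (2, 5, 8)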
Example #22
    def iterative_body(self, x, indices, remaining, current_x,
                       current_internals):
        batch_size = tf_util.cast(x=tf.shape(input=current_x)[:1], dtype='int')
        zeros = tf_util.zeros(shape=batch_size, dtype='int')
        ones = tf_util.ones(shape=batch_size, dtype='int')
        batch_size = batch_size[0]

        current_x = tf.gather(params=x, indices=indices)
        next_x, next_internals = self.iterative_apply(
            x=current_x, internals=current_internals)

        with tf.control_dependencies(control_inputs=(current_x, next_x)):
            is_finished = tf.math.equal(x=remaining, y=zeros)
            if isinstance(next_internals, dict):
                for name, current_internal, next_internal in current_internals.zip_items(
                        next_internals):
                    condition = is_finished
                    for _ in range(tf_util.rank(x=current_internal) - 1):
                        condition = tf.expand_dims(input=condition, axis=1)
                    next_internals[name] = tf.where(condition=condition,
                                                    x=current_internal,
                                                    y=next_internal)

            else:
                condition = is_finished
                for _ in range(tf_util.rank(x=current_internals) - 1):
                    condition = tf.expand_dims(input=condition, axis=1)
                next_internals = tf.where(condition=condition,
                                          x=current_internals,
                                          y=next_internals)

            remaining -= tf.where(condition=is_finished, x=zeros, y=ones)
            indices += tf.where(condition=tf.math.equal(x=remaining, y=zeros),
                                x=zeros,
                                y=ones)

        return x, indices, remaining, next_x, next_internals
Example #23
    def mode(self, *, parameters):
        action_values = parameters['action_values']

        action = tf.math.argmax(input=action_values, axis=-1)
        return tf_util.cast(x=action, dtype='int')
Example #24
 def fn_mode():
     # Deterministic: maximum likelihood action
     action = tf.math.argmax(input=action_values, axis=-1)
     return tf_util.cast(x=action, dtype='int')
Example #25
    def core_act(self, *, states, internals, auxiliaries, parallel,
                 deterministic, independent):
        assert len(internals) == 0

        actions = TensorDict()
        for name, spec in self.actions_spec.items():
            shape = tf.concat(values=(tf_util.cast(
                x=tf.shape(input=states.value())[:1],
                dtype='int'), tf_util.constant(value=spec.shape, dtype='int')),
                              axis=0)

            if spec.type == 'bool':
                # Random bool action: uniform[True, False]
                half = tf_util.constant(value=0.5, dtype='float')
                uniform = tf.random.uniform(
                    shape=shape, dtype=tf_util.get_dtype(type='float'))
                actions[name] = (uniform < half)

            elif self.config.enable_int_action_masking and spec.type == 'int' and \
                    spec.num_values is not None:
                # Random masked action: uniform[unmasked]
                # (Similar code as for Model.apply_exploration)
                mask = auxiliaries[name]['mask']
                choices = tf_util.constant(value=list(range(spec.num_values)),
                                           dtype=spec.type,
                                           shape=(tuple(1
                                                        for _ in spec.shape) +
                                                  (1, spec.num_values)))
                one = tf_util.constant(value=1, dtype='int', shape=(1, ))
                multiples = tf.concat(values=(shape, one), axis=0)
                choices = tf.tile(input=choices, multiples=multiples)
                choices = tf.boolean_mask(tensor=choices, mask=mask)
                mask = tf_util.cast(x=mask, dtype='int')
                num_valid = tf.math.reduce_sum(input_tensor=mask,
                                               axis=(spec.rank + 1))
                num_valid = tf.reshape(tensor=num_valid, shape=(-1, ))
                masked_offset = tf.math.cumsum(x=num_valid,
                                               axis=0,
                                               exclusive=True)
                uniform = tf.random.uniform(
                    shape=shape, dtype=tf_util.get_dtype(type='float'))
                uniform = tf.reshape(tensor=uniform, shape=(-1, ))
                num_valid = tf_util.cast(x=num_valid, dtype='float')
                random_offset = tf.dtypes.cast(x=(uniform * num_valid),
                                               dtype=tf.dtypes.int64)
                action = tf.gather(params=choices,
                                   indices=(masked_offset + random_offset))
                actions[name] = tf.reshape(tensor=action, shape=shape)

            elif spec.type != 'bool' and spec.min_value is not None:
                if spec.max_value is not None:
                    # Random bounded action: uniform[min_value, max_value]
                    actions[name] = tf.random.uniform(shape=shape,
                                                      minval=spec.min_value,
                                                      maxval=spec.max_value,
                                                      dtype=spec.tf_type())

                else:
                    # Random left-bounded action: not implemented
                    raise NotImplementedError

            elif spec.type != 'bool' and spec.max_value is not None:
                # Random right-bounded action: not implemented
                raise NotImplementedError

            else:
                # Random unbounded int/float action
                actions[name] = tf.random.normal(shape=shape,
                                                 dtype=spec.tf_type())

        return actions, TensorDict()
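The masked branch above samples uniformly among unmasked actions by flattening all valid choices with boolean_mask, computing per-row counts and exclusive-cumsum offsets into that flat list, and adding a random offset within each row's count. The same indexing trick in NumPy, with an illustrative mask:

    import numpy as np

    rng = np.random.default_rng(0)
    mask = np.array([[True, False, True, True],   # rows: batch entries, columns: actions
                     [False, True, False, False]])

    choices = np.tile(np.arange(mask.shape[1]), (mask.shape[0], 1))
    valid = choices[mask]                                       # flat list of valid actions
    num_valid = mask.sum(axis=1)
    offsets = np.concatenate(([0], np.cumsum(num_valid)[:-1]))  # exclusive cumsum

    random_offset = (rng.uniform(size=mask.shape[0]) * num_valid).astype(np.int64)
    print(valid[offsets + random_offset])  # one uniformly chosen unmasked action per row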
Example #26
    def observe(self, *, terminal, reward, parallel):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
        expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel,
                                                                axis=0),
                                           axis=1)
        is_terminal = tf.math.greater(x=terminal[-1], y=zero)

        # Input assertions
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.extend(
                self.terminal_spec.tf_assert(
                    x=terminal,
                    batch_size=batch_size,
                    message='Agent.observe: invalid {issue} for terminal input.'
                ))
            assertions.extend(
                self.reward_spec.tf_assert(
                    x=reward,
                    batch_size=batch_size,
                    message='Agent.observe: invalid {issue} for reward input.'
                ))
            assertions.extend(
                self.parallel_spec.tf_assert(
                    x=parallel,
                    message='Agent.observe: invalid {issue} for parallel input.'
                ))
            # Assertion: at most one terminal
            num_terms = tf.math.count_nonzero(
                input=terminal, dtype=tf_util.get_dtype(type='int'))
            assertions.append(
                tf.debugging.assert_less_equal(
                    x=num_terms,
                    y=one,
                    message=
                    "Agent.observe: input contains more than one terminal."))
            # Assertion: if terminal, last timestep in batch
            assertions.append(
                tf.debugging.assert_equal(
                    x=tf.math.greater(x=num_terms, y=zero),
                    y=is_terminal,
                    message=
                    "Agent.observe: terminal is not the last input timestep."))

        with tf.control_dependencies(control_inputs=assertions):
            dependencies = list()

            # Reward summary
            if self.summaries == 'all' or 'reward' in self.summaries:
                with self.summarizer.as_default():
                    x = tf.math.reduce_mean(input_tensor=reward)
                    dependencies.append(
                        tf.summary.scalar(name='reward',
                                          data=x,
                                          step=self.timesteps))

            # Update episode length/reward
            updates = tf.expand_dims(input=batch_size, axis=0)
            value = tf.tensor_scatter_nd_add(tensor=self.episode_length,
                                             indices=expanded_parallel,
                                             updates=updates)
            dependencies.append(self.episode_length.assign(value=value))
            # sparse_delta = tf.IndexedSlices(values=batch_size, indices=parallel)
            # dependencies.append(self.episode_length.scatter_add(sparse_delta=sparse_delta))
            sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True)
            value = tf.tensor_scatter_nd_add(tensor=self.episode_reward,
                                             indices=expanded_parallel,
                                             updates=sum_reward)
            dependencies.append(self.episode_reward.assign(value=value))
            # sum_reward = tf.math.reduce_sum(input_tensor=reward)
            # sparse_delta = tf.IndexedSlices(values=sum_reward, indices=parallel)
            # dependencies.append(self.episode_reward.scatter_add(sparse_delta=sparse_delta))

            # Core observe (before terminal handling)
            updated = self.core_observe(terminal=terminal,
                                        reward=reward,
                                        parallel=parallel)
            dependencies.append(updated)

        # Handle terminal (after core observe and episode reward)
        with tf.control_dependencies(control_inputs=dependencies):

            def fn_terminal():
                operations = list()

                # Reset internals
                def function(spec, initial):
                    return tf_util.constant(value=initial, dtype=spec.type)

                initials = self.internals_spec.fmap(
                    function=function,
                    cls=TensorDict,
                    zip_values=self.initial_internals)
                for name, previous, initial in self.previous_internals.zip_items(
                        initials):
                    updates = tf.expand_dims(input=initial, axis=0)
                    value = tf.tensor_scatter_nd_update(
                        tensor=previous,
                        indices=expanded_parallel,
                        updates=updates)
                    operations.append(previous.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
                    # operations.append(previous.scatter_update(sparse_delta=sparse_delta))

                # Episode length/reward summaries (before episode reward reset / episodes increment)
                dependencies = list()
                if self.summaries == 'all' or 'reward' in self.summaries:
                    with self.summarizer.as_default():
                        x = tf.gather(params=self.episode_length,
                                      indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-length',
                                              data=x,
                                              step=self.episodes))
                        x = tf.gather(params=self.episode_reward,
                                      indices=parallel)
                        dependencies.append(
                            tf.summary.scalar(name='episode-reward',
                                              data=x,
                                              step=self.episodes))

                # Reset episode length/reward
                with tf.control_dependencies(control_inputs=dependencies):
                    zeros = tf_util.zeros(shape=(1, ), dtype='int')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_length,
                        indices=expanded_parallel,
                        updates=zeros)
                    operations.append(self.episode_length.assign(value=value))
                    # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel)
                    # operations.append(self.episode_length.scatter_update(sparse_delta=sparse_delta))
                    zeros = tf_util.zeros(shape=(1, ), dtype='float')
                    value = tf.tensor_scatter_nd_update(
                        tensor=self.episode_reward,
                        indices=expanded_parallel,
                        updates=zeros)
                    operations.append(self.episode_reward.assign(value=value))
                    # zero_float = tf_util.constant(value=0.0, dtype='float')
                    # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
                    # operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

                # Increment episodes counter
                operations.append(
                    self.episodes.assign_add(delta=one, read_value=False))

                return tf.group(*operations)

            handle_terminal = tf.cond(pred=is_terminal,
                                      true_fn=fn_terminal,
                                      false_fn=tf.no_op)

        with tf.control_dependencies(control_inputs=(handle_terminal, )):
            episodes = tf_util.identity(input=self.episodes)
            updates = tf_util.identity(input=self.updates)
            return updated, episodes, updates
Example #27
    def act(self, *, states, auxiliaries, parallel):
        batch_size = tf_util.cast(x=tf.shape(input=parallel)[0], dtype='int')

        # Input assertions
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.extend(
                self.states_spec.tf_assert(
                    x=states,
                    batch_size=batch_size,
                    message='Agent.act: invalid {issue} for {name} state input.'
                ))
            assertions.extend(
                self.auxiliaries_spec.tf_assert(
                    x=auxiliaries,
                    batch_size=batch_size,
                    message='Agent.act: invalid {issue} for {name} input.'))
            assertions.extend(
                self.parallel_spec.tf_assert(
                    x=parallel,
                    batch_size=batch_size,
                    message='Agent.act: invalid {issue} for parallel input.'))
            # Mask assertions
            if self.config.enable_int_action_masking:
                true = tf_util.constant(value=True, dtype='bool')
                for name, spec in self.actions_spec.items():
                    if spec.type == 'int':
                        assertions.append(
                            tf.debugging.assert_equal(
                                x=tf.reduce_all(
                                    input_tensor=tf.math.reduce_any(
                                        input_tensor=auxiliaries[name]['mask'],
                                        axis=(spec.rank + 1))),
                                y=true,
                                message=
                                "Agent.independent_act: at least one action has to be valid."
                            ))

        with tf.control_dependencies(control_inputs=assertions):
            # Retrieve internals
            internals = self.previous_internals.fmap(
                function=(lambda x: tf.gather(params=x, indices=parallel)),
                cls=TensorDict)

            # Core act
            deterministic = tf_util.constant(value=False, dtype='bool')
            actions, internals = self.core_act(states=states,
                                               internals=internals,
                                               auxiliaries=auxiliaries,
                                               parallel=parallel,
                                               deterministic=deterministic,
                                               independent=False)

        # Action assertions
        assertions = list()
        if self.config.create_tf_assertions:
            assertions.extend(
                self.actions_spec.tf_assert(x=actions, batch_size=batch_size))
            if self.config.enable_int_action_masking:
                for name, spec, action in self.actions_spec.zip_items(actions):
                    if spec.type == 'int':
                        is_valid = tf.reduce_all(input_tensor=tf.gather(
                            params=auxiliaries[name]['mask'],
                            indices=tf.expand_dims(input=action,
                                                   axis=(spec.rank + 1)),
                            batch_dims=(spec.rank + 1)))
                        assertions.append(
                            tf.debugging.assert_equal(
                                x=is_valid,
                                y=true,
                                message="Agent.act: sampled action is excluded by the action mask."))

        # Remember internals
        dependencies = list()
        for name, previous, internal in self.previous_internals.zip_items(
                internals):
            indices = tf.expand_dims(input=parallel, axis=1)
            value = tf.tensor_scatter_nd_update(tensor=previous,
                                                indices=indices,
                                                updates=internal)
            dependencies.append(previous.assign(value=value))
            # sparse_delta = tf.IndexedSlices(values=internal, indices=parallel)
            # dependencies.append(previous.scatter_update(sparse_delta=sparse_delta))

        # Increment timestep (after core act)
        with tf.control_dependencies(control_inputs=(actions.flatten() +
                                                     internals.flatten())):
            dependencies.append(
                self.timesteps.assign_add(delta=batch_size, read_value=False))

        with tf.control_dependencies(control_inputs=(dependencies +
                                                     assertions)):
            actions = actions.fmap(function=tf_util.identity)
            timestep = tf_util.identity(input=self.timesteps)
            return actions, timestep
Example #28
    def parameter_value(self, *, step):
        initial_value = tf_util.constant(value=self.initial_value,
                                         dtype='float')

        if self.decay == 'cosine':
            assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0
            parameter = tf.keras.experimental.CosineDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                alpha=self.kwargs.get('alpha', 0.0))(step=step)

        elif self.decay == 'cosine_restarts':
            assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0
            parameter = tf.keras.experimental.CosineDecayRestarts(
                initial_learning_rate=initial_value,
                first_decay_steps=(self.num_steps + 1),
                t_mul=self.kwargs.get('t_mul', 2.0),
                m_mul=self.kwargs.get('m_mul', 1.0),
                alpha=self.kwargs.get('alpha', 0.0))(step=step)

        elif self.decay == 'exponential':
            assert self.kwargs['decay_rate'] >= 0.0
            parameter = tf.keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                decay_rate=self.kwargs['decay_rate'],
                staircase=self.kwargs.get('staircase', False))(step=step)

        elif self.decay == 'inverse_time':
            assert self.kwargs['decay_rate'] >= 0.0
            parameter = tf.keras.optimizers.schedules.InverseTimeDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                decay_rate=self.kwargs['decay_rate'],
                staircase=self.kwargs.get('staircase', False))(step=step)

        elif self.decay == 'linear_cosine':
            assert self.kwargs.get('beta', 0.001) >= 0.0
            parameter = tf.keras.experimental.LinearCosineDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                num_periods=self.kwargs.get('num_periods', 0.5),
                alpha=self.kwargs.get('alpha', 0.0),
                beta=self.kwargs.get('beta', 0.001))(step=step)

        elif self.decay == 'linear_cosine_noisy':
            assert self.kwargs.get('beta', 0.001) >= 0.0
            parameter = tf.keras.experimental.NoisyLinearCosineDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                initial_variance=self.kwargs.get('initial_variance', 1.0),
                variance_decay=self.kwargs.get('variance_decay', 0.55),
                num_periods=self.kwargs.get('num_periods', 0.5),
                alpha=self.kwargs.get('alpha', 0.0),
                beta=self.kwargs.get('beta', 0.001))(step=step)

        elif self.decay == 'polynomial':
            assert self.kwargs.get('power', 1.0) >= 0.0
            parameter = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=initial_value,
                decay_steps=(self.num_steps + 1),
                end_learning_rate=self.kwargs['final_value'],
                power=self.kwargs.get('power', 1.0),
                cycle=self.kwargs.get('cycle', False))(step=step)

        if self.increasing:
            one = tf_util.constant(value=1.0, dtype='float')
            parameter = one - parameter

        if self.inverse:
            one = tf_util.constant(value=1.0, dtype='float')
            parameter = tf.math.reciprocal(x=parameter)

        if self.scale != 1.0:
            scale = tf_util.constant(value=self.scale, dtype='float')
            parameter = parameter * scale

        parameter = tf_util.cast(x=parameter, dtype=self.spec.type)

        return parameter
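All branches defer to Keras learning-rate schedules, each instantiated once and then called with the current step. A standalone use of one of them, outside Tensorforce's parameter wrapper, with illustrative hyperparameters:

    import tensorflow as tf

    schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.1, decay_steps=1000, decay_rate=0.96)
    for step in (0, 1000, 2000):
        # 0.1, 0.096, 0.09216, i.e. 0.1 * 0.96 ** (step / 1000)
        print(step, float(schedule(step)))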
Example #29
 def fn_summary():
     return tf.linalg.global_norm(t_list=[
         tf_util.cast(x=delta, dtype='float') for delta in deltas
     ])
Example #30
    def apply(self, *, x, horizons, internals):
        zero = tf_util.constant(value=0, dtype='int')
        one = tf_util.constant(value=1, dtype='int')
        batch_size = tf_util.cast(x=tf.shape(input=horizons)[0], dtype='int')
        zeros = tf_util.zeros(shape=(batch_size, ), dtype='int')
        ones = tf_util.ones(shape=(batch_size, ), dtype='int')

        # including 0th step
        horizon = self.horizon.value() + one
        # in case of longer horizon than necessary (e.g. main vs baseline policy)
        starts = horizons[:, 0] + tf.maximum(x=(horizons[:, 1] - horizon),
                                             y=zeros)
        lengths = horizons[:, 1] - tf.maximum(x=(horizons[:, 1] - horizon),
                                              y=zeros)
        horizon = tf.minimum(x=horizon,
                             y=tf.math.reduce_max(input_tensor=lengths,
                                                  axis=0))
        output_spec = self.output_spec()

        if self.temporal_processing == 'cumulative':
            if self.horizon.is_constant(value=0):
                # Cumulative aggregation over a single-step window (horizon 0)
                x = self.cumulative_apply(xs=tf.expand_dims(input=x, axis=1), lengths=ones)

            else:

                def body(x, indices, remaining, xs):
                    current_x = tf.gather(params=x, indices=indices)
                    current_x = tf.expand_dims(input=current_x, axis=1)
                    xs = tf.concat(values=(xs, current_x), axis=1)
                    remaining -= tf.where(condition=tf.math.equal(x=remaining,
                                                                  y=zeros),
                                          x=zeros,
                                          y=ones)
                    indices += tf.where(condition=tf.math.equal(x=remaining,
                                                                y=zeros),
                                        x=zeros,
                                        y=ones)
                    return x, indices, remaining, xs

                initial_xs = tf_util.zeros(shape=((batch_size, 0) +
                                                  output_spec.shape),
                                           dtype=output_spec.type)

                _, final_indices, final_remaining, xs = tf.while_loop(
                    cond=tf_util.always_true,
                    body=body,
                    loop_vars=(x, starts, lengths, initial_xs),
                    maximum_iterations=tf_util.int64(x=horizon))

                x = self.cumulative_apply(xs=xs, lengths=lengths)

        elif self.temporal_processing == 'iterative':
            if self.horizon.is_constant(value=0):
                x, final_internals = self.iterative_apply(x=x,
                                                          internals=internals)

            else:
                initial_x = tf_util.zeros(shape=((batch_size, ) +
                                                 output_spec.shape),
                                          dtype=output_spec.type)

                signature = self.input_signature(function='iterative_body')
                internals = signature['current_internals'].kwargs_to_args(
                    kwargs=internals)
                _, final_indices, final_remaining, x, final_internals = tf.while_loop(
                    cond=tf_util.always_true,
                    body=self.iterative_body,
                    loop_vars=(x, starts, lengths, initial_x, internals),
                    maximum_iterations=tf_util.int32(x=horizon))
                internals = signature['current_internals'].args_to_kwargs(
                    args=final_internals)

        assertions = list()
        if self.config.create_tf_assertions:
            assertions.append(
                tf.debugging.assert_equal(x=final_indices,
                                          y=(tf.math.cumsum(x=lengths) -
                                             ones)))
            assertions.append(
                tf.debugging.assert_equal(
                    x=tf.math.reduce_sum(input_tensor=final_remaining),
                    y=zero))

        with tf.control_dependencies(control_inputs=assertions):
            if self.temporal_processing == 'cumulative':
                return tf_util.identity(input=super().apply(x=x))
            elif self.temporal_processing == 'iterative':
                return tf_util.identity(input=super().apply(x=x)), internals