def linesearch():
                """Run the line-search solver starting from `deltas` and return its result.

                Uses `fn_loss`, `arguments`, `variables`, `deltas`, `loss_before` and
                `self.line_search` from the enclosing scope.

                Returns:
                    Tuple of the values of the TensorDict returned by
                    `self.line_search.solve`.
                """
                # Loss at this point, handed to the solver as `zero_value`
                loss_after = fn_loss(**arguments.to_kwargs())

                # Everything below is built under a control dependency on `loss_after`
                with tf.control_dependencies(control_inputs=(loss_after, )):
                    # Replace "/" with "_" to ensure TensorDict is flat
                    # (the name's last two characters are dropped, presumably the ":0"
                    # suffix -- TODO confirm)
                    _deltas = TensorDict(
                        ((var.name[:-2].replace('/', '_'), delta)
                         for var, delta in zip(variables, deltas)))

                    # TODO: should be moved to initialize_given_variables, but fn_loss...
                    def evaluate_step(arguments, deltas):
                        """Add `deltas` to `variables`, then return the loss for `arguments`."""
                        assignments = list()
                        for variable, delta in zip(variables, deltas.values()):
                            assignments.append(
                                variable.assign_add(delta=delta,
                                                    read_value=False))
                        # The loss is only evaluated once all assignments have run
                        with tf.control_dependencies(
                                control_inputs=assignments):
                            return fn_loss(**arguments.to_kwargs())

                    _deltas = self.line_search.solve(arguments=arguments,
                                                     x_init=_deltas,
                                                     base_value=loss_before,
                                                     zero_value=loss_after,
                                                     fn_x=evaluate_step)
                    return tuple(_deltas.values())
    def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
        """Build a constant action tensor for every entry of `self.actions_spec`.

        Per action, the first matching rule decides the value: the user-specified
        value in `self.action_values`; the first unmasked choice (int actions with
        `num_values`, if int-action masking is enabled); the midpoint of `min_value`
        and `max_value`; the single bound that is given; otherwise zeros.

        Returns:
            Tuple of the actions TensorDict and an empty TensorDict.
        """
        assert len(internals) == 0

        def constant_action(value, spec, dims):
            # Tensor of shape `dims` filled with `value`, typed according to the spec
            scalar = tf_util.constant(value=value, dtype=spec.type)
            return tf.fill(dims=dims, value=scalar)

        result = TensorDict()
        for name, spec in self.actions_spec.items():
            # Output shape: leading dimension of the state input followed by the action shape
            leading_dims = tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int')
            action_dims = tf_util.constant(value=spec.shape, dtype='int')
            shape = tf.concat(values=(leading_dims, action_dims), axis=0)

            if self.action_values is not None and name in self.action_values:
                # User-specified constant
                result[name] = constant_action(self.action_values[name], spec, shape)

            elif self.config.enable_int_action_masking and spec.type == 'int' and \
                    spec.num_values is not None:
                # Masking: pick the first unmasked choice of every entry
                valid = auxiliaries[name]['mask']
                candidates = tf_util.constant(
                    value=list(range(spec.num_values)), dtype='int',
                    shape=((1,) * len(spec.shape) + (1, spec.num_values))
                )
                unit = tf_util.constant(value=1, dtype='int', shape=(1,))
                repeats = tf.concat(values=(shape, unit), axis=0)
                candidates = tf.tile(input=candidates, multiples=repeats)
                candidates = tf.boolean_mask(tensor=candidates, mask=valid)
                valid = tf_util.cast(x=valid, dtype='int')
                valid_counts = tf.math.reduce_sum(input_tensor=valid, axis=(spec.rank + 1))
                valid_counts = tf.reshape(tensor=valid_counts, shape=(-1,))
                # Exclusive cumulative sum: index of each entry's first valid choice
                # within the flattened masked candidates
                first_valid = tf.math.cumsum(x=valid_counts, axis=0, exclusive=True)
                chosen = tf.gather(params=candidates, indices=first_valid)
                result[name] = tf.reshape(tensor=chosen, shape=shape)

            elif spec.type == 'bool' or (spec.min_value is None and spec.max_value is None):
                # No usable bounds: zeros
                result[name] = tf_util.zeros(shape=shape, dtype=spec.type)

            elif spec.max_value is None:
                # Only lower bound given
                result[name] = constant_action(spec.min_value, spec, shape)

            elif spec.min_value is None:
                # Only upper bound given
                result[name] = constant_action(spec.max_value, spec, shape)

            else:
                # Both bounds given: midpoint
                midpoint = spec.min_value + 0.5 * (spec.max_value - spec.min_value)
                result[name] = constant_action(midpoint, spec, shape)

        return result, TensorDict()
Beispiel #3
0
    def independent_act(self, *, states, internals=None, auxiliaries=None):
        """Compute actions for `states` via `core_act` with `independent=True`.

        Args:
            states: State inputs; the first dimension of `states.value()` is used as
                batch size for the input assertions.
            internals: Internal inputs; may only be omitted if `self.internals_spec`
                is empty.
            auxiliaries: Auxiliary inputs; may only be omitted if
                `self.auxiliaries_spec` is empty.

        Returns:
            OrderedDict of the actions, or of actions plus internals if
            `self.internals_spec` is non-empty.
        """
        if internals is None:
            assert len(self.internals_spec) == 0
            internals = TensorDict()
        if auxiliaries is None:
            assert len(self.auxiliaries_spec) == 0
            auxiliaries = TensorDict()
        true = tf_util.constant(value=True, dtype='bool')
        batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int')

        # Optional input validation, attached below as control dependencies
        assertions = list()
        if self.config.create_tf_assertions:
            input_checks = (
                (self.states_spec, states,
                 'Agent.independent_act: invalid {issue} for {name} state input.'),
                (self.internals_spec, internals,
                 'Agent.independent_act: invalid {issue} for {name} internal input.'),
                (self.auxiliaries_spec, auxiliaries,
                 'Agent.independent_act: invalid {issue} for {name} input.'),
            )
            for inputs_spec, inputs, message in input_checks:
                assertions.extend(
                    inputs_spec.tf_assert(x=inputs, batch_size=batch_size, message=message)
                )

            # Every int action needs at least one valid choice in its mask
            if self.config.enable_int_action_masking:
                int_actions = (
                    (name, spec) for name, spec in self.actions_spec.items()
                    if spec.type == 'int'
                )
                for name, spec in int_actions:
                    any_valid = tf.math.reduce_any(
                        input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1)
                    )
                    all_valid = tf.reduce_all(input_tensor=any_valid)
                    assertions.append(tf.debugging.assert_equal(
                        x=all_valid, y=true,
                        message="Agent.independent_act: at least one action has to be valid."
                    ))

        with tf.control_dependencies(control_inputs=assertions):
            # Core act (action assertions are skipped)
            parallel = tf_util.zeros(shape=(1,), dtype='int')
            actions, internals = self.core_act(
                states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
                independent=True
            )

            # SavedModel requires flattened output
            if len(self.internals_spec) == 0:
                return OrderedDict(actions)
            return OrderedDict(TensorDict(actions=actions, internals=internals))
Beispiel #4
0
    def apply(self, *, x, deterministic, independent):
        """Pass the singleton input through `self.layers` in order and return the result.

        Register layers store their output under `layer.tensor`; multi-input layers
        take their input from the previously registered tensors instead of the
        running value.

        Raises:
            The error built by `TensorforceError.exists` if a Register layer's name is
            already registered, or by `TensorforceError.exists_not` if a multi-input
            layer refers to tensors which are not registered.
        """
        assert x.is_singleton()
        x = x.singleton()
        registered_tensors = TensorDict(input=x)

        for layer in self.layers:
            if isinstance(layer, Register):
                if layer.tensor in registered_tensors:
                    raise TensorforceError.exists(name='registered tensor',
                                                  value=layer.tensor)
                x = registered_tensors[layer.tensor] = layer.apply(x=x)
                continue

            # Remaining layer kinds differ only in their input and extra keyword arguments
            if isinstance(layer, MultiInputLayer):
                if layer.tensors not in registered_tensors:
                    raise TensorforceError.exists_not(name='registered tensor',
                                                      value=layer.tensors)
                layer_input, extra = registered_tensors[layer.tensors], {}
            elif isinstance(layer, NondeterministicLayer):
                layer_input, extra = x, {'deterministic': deterministic}
            elif isinstance(layer, StatefulLayer):
                layer_input, extra = x, {'independent': independent}
            else:
                layer_input, extra = x, {}

            x = layer.apply(x=layer_input, **extra)

        return x
Beispiel #5
0
    def state_value(self, *, states, horizons, internals, auxiliaries):
        """Return the state value for `states`.

        In 'separate' mode the value is computed by `self.s_value` on the network's
        'state-embedding' output (falling back to 'embedding'); otherwise the parent
        implementation is used.
        """
        if self.state_value_mode != 'separate':
            return super().state_value(
                states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries
            )

        embedding, _ = self.network.apply(
            x=states, horizons=horizons, internals=internals,
            deterministic=tf_util.constant(value=True, dtype='bool'), independent=True
        )
        # Plain network output is wrapped so both cases can be handled alike
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        fallback = embedding['embedding']
        return self.s_value.apply(x=embedding.get('state-embedding', fallback))
Beispiel #6
0
        def fisher_matrix_product(arguments, deltas):
            """Compute [delta' * F] as grad(delta' * grad(kldiv)) w.r.t. `variables`.

            Uses `variables`, `fn_kl_divergence` and `self.config` from the enclosing
            scope.

            Args:
                arguments: Provides the keyword arguments for `fn_kl_divergence` via
                    `to_kwargs()`.
                deltas: Mapping whose values are paired in order with `variables`.

            Returns:
                TensorDict keyed by `var.name`, with zeros for variables without gradient.
                NOTE(review): keys are the raw variable names -- confirm this matches
                the key format the caller expects.
            """
            # Second-order gradients
            # Outer tape records the first-order gradient computation so that it can be
            # differentiated a second time below
            with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape1:
                for variable in variables:
                    tape1.watch(tensor=variable)
                # Inner tape records only the KL-divergence computation
                with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape2:
                    for variable in variables:
                        tape2.watch(tensor=variable)

                    # kldiv
                    kldiv = fn_kl_divergence(**arguments.to_kwargs())

                # grad(kldiv)
                kldiv_grads = tape2.gradient(target=kldiv, sources=variables)
                # Missing gradients (None) are replaced by zeros
                kldiv_grads = [
                    tf.zeros_like(input=var) if grad is None else grad
                    for var, grad in zip(variables, kldiv_grads)
                ]

                # delta' * grad(kldiv)
                multiply = functools.partial(
                    tf_util.lift_indexedslices, tf.math.multiply,
                    with_assertions=self.config.create_tf_assertions
                )
                # Sum of the per-variable elementwise products, i.e. a scalar inner product
                delta_kldiv_grads = tf.math.add_n(inputs=[
                    tf.math.reduce_sum(input_tensor=multiply(delta, grad))
                    for delta, grad in zip(deltas.values(), kldiv_grads)
                ])

            # [delta' * F] = grad(delta' * grad(kldiv))
            delta_kldiv_grads2 = tape1.gradient(target=delta_kldiv_grads, sources=variables)
            return TensorDict((
                (var.name, tf.zeros_like(input=var) if grad is None else grad)
                for var, grad in zip(variables, delta_kldiv_grads2)
            ))
Beispiel #7
0
    def action_values(self, *, states, horizons, internals, auxiliaries,
                      actions):
        """Return the per-action values of `actions` under each distribution.

        The network is applied deterministically and independently; each
        distribution is parametrized from its own '<name>-embedding' output
        ('action-embedding' if the name is None), falling back to 'embedding'.

        Returns:
            TensorDict produced by mapping over `self.distributions`.
        """
        embedding, _ = self.network.apply(
            x=states, horizons=horizons, internals=internals,
            deterministic=tf_util.constant(value=True, dtype='bool'),
            independent=True
        )
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        def function(name, distribution, action):
            key = 'action-embedding' if name is None else name + '-embedding'
            parameters = distribution.parametrize(
                x=embedding.get(key, embedding['embedding']),
                conditions=auxiliaries.get(name, default=TensorDict())
            )
            return distribution.action_value(parameters=parameters, action=action)

        return self.distributions.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=actions
        )
Beispiel #8
0
    def parametrize(self, *, x, conditions):
        """Compute alpha, beta, their sum and the log normalization term for input `x`.

        Both parameters are obtained as softplus(clipped layer output) + 1, so they
        are at least one.

        Returns:
            TensorDict with keys `alpha`, `beta`, `alpha_beta` and `log_norm`.
        """
        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        target_shape = (-1,) + self.action_spec.shape

        def concentration(layer):
            # Clip symmetrically (log_epsilon is negative since epsilon < 1.0), then
            # softplus + 1
            raw = tf.clip_by_value(
                t=layer.apply(x=x), clip_value_min=log_epsilon, clip_value_max=-log_epsilon
            )
            value = tf.math.softplus(features=raw) + one
            if len(self.input_spec.shape) == 1:
                value = tf.reshape(tensor=value, shape=target_shape)
            return value

        alpha = concentration(self.alpha)
        beta = concentration(self.beta)

        # Alpha + Beta, bounded from below by epsilon
        alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

        # Log norm: lgamma(alpha) + lgamma(beta) - lgamma(alpha + beta)
        log_gamma_sum = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta)
        log_norm = log_gamma_sum - tf.math.lgamma(x=alpha_beta)

        return TensorDict(alpha=alpha, beta=beta, alpha_beta=alpha_beta, log_norm=log_norm)
Beispiel #9
0
    def parametrize(self, *, x, conditions):
        """Compute mean, standard deviation and log standard deviation for input `x`.

        With `self.global_stddev`, `self.log_stddev` is tiled along the first
        dimension of `x`; otherwise it is a layer applied to `x`. If both action
        bounds are given, the log stddev is shifted by log(0.1) before clipping.

        Returns:
            TensorDict with keys `mean`, `stddev` and `log_stddev`.
        """
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        target_shape = (-1,) + self.action_spec.shape
        flat_input = len(self.input_spec.shape) == 1

        # Mean
        mean = self.mean.apply(x=x)
        if flat_input:
            mean = tf.reshape(tensor=mean, shape=target_shape)

        # Log standard deviation
        if not self.global_stddev:
            log_stddev = self.log_stddev.apply(x=x)
            if flat_input:
                log_stddev = tf.reshape(tensor=log_stddev, shape=target_shape)
        else:
            multiples = (tf.shape(input=x)[0],) + (1,) * self.action_spec.rank
            log_stddev = tf.tile(input=self.log_stddev, multiples=multiples)

        # Shift for bounded actions (TODO: 0.1 random choice)
        bounded = self.action_spec.min_value is not None and \
            self.action_spec.max_value is not None
        if bounded:
            log_stddev = log_stddev + tf_util.constant(value=np.log(0.1), dtype='float')

        # Symmetric clipping for numerical stability (log_epsilon is negative)
        log_stddev = tf.clip_by_value(
            t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )

        return TensorDict(mean=mean, stddev=tf.math.exp(x=log_stddev), log_stddev=log_stddev)
 def function(name, distribution):
     """Sample from `distribution`, parametrized by the shared `embedding`.

     `embedding`, `auxiliaries`, `temperature` and `independent` come from the
     enclosing scope.
     """
     parameters = distribution.parametrize(
         x=embedding, conditions=auxiliaries.get(name, default=TensorDict())
     )
     return distribution.sample(
         parameters=parameters, temperature=temperature, independent=independent
     )
Beispiel #11
0
    def parametrize(self, *, x, conditions):
        """Compute mean, standard deviation and log standard deviation for input `x`.

        With `self.global_stddev`, `self.log_stddev` is used directly; otherwise it
        is a layer applied to `x`.

        Returns:
            TensorDict with keys `mean`, `stddev` and `log_stddev`.
        """
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        target_shape = (-1,) + self.action_spec.shape
        flat_input = len(self.input_spec.shape) == 1

        # Mean
        mean = self.mean.apply(x=x)
        if flat_input:
            mean = tf.reshape(tensor=mean, shape=target_shape)

        # Log standard deviation
        if not self.global_stddev:
            log_stddev = self.log_stddev.apply(x=x)
            if flat_input:
                log_stddev = tf.reshape(tensor=log_stddev, shape=target_shape)
        else:
            log_stddev = self.log_stddev

        # Symmetric clipping for numerical stability (log_epsilon is negative)
        log_stddev = tf.clip_by_value(
            t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )

        return TensorDict(mean=mean, stddev=tf.exp(x=log_stddev), log_stddev=log_stddev)
Beispiel #12
0
    def apply(self, *, x, horizons, internals, deterministic, independent):
        """Pass `x` through `self.layers` in order.

        Register layers store their output under `layer.tensor`, multi-input layers
        read from the registered tensors, and temporal layers additionally read and
        update `internals[layer.name]`.

        Returns:
            Tuple of the final layer output and `internals`.
        """
        # Initial registry: the single input as 'state', or a copy of all inputs
        registered_tensors = TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()
        x = x.value()

        for layer in self.layers:
            if isinstance(layer, Register):
                if layer.tensor in registered_tensors:
                    raise TensorforceError.exists(name='registered tensor',
                                                  value=layer.tensor)
                x = registered_tensors[layer.tensor] = layer.apply(x=x)
                continue

            if isinstance(layer, MultiInputLayer):
                if layer.tensors not in registered_tensors:
                    raise TensorforceError.exists_not(name='registered tensor',
                                                      value=layer.tensors)
                x = layer.apply(x=registered_tensors[layer.tensors])
                continue

            if isinstance(layer, NondeterministicLayer):
                x = layer.apply(x=x, deterministic=deterministic)
            elif isinstance(layer, StatefulLayer):
                x = layer.apply(x=x, independent=independent)
            elif isinstance(layer, TemporalLayer):
                layer_output = layer.apply(
                    x=x, horizons=horizons, internals=internals[layer.name])
                x, internals[layer.name] = layer_output
            else:
                x = layer.apply(x=x)

        return x, internals
Beispiel #13
0
 def function(name, distribution):
     """Parametrize `distribution` from its embedding output.

     The '<name>-embedding' entry is used ('action-embedding' if `name` is None),
     with 'embedding' as fallback; `embedding` and `auxiliaries` come from the
     enclosing scope.
     """
     key = 'action-embedding' if name is None else name + '-embedding'
     fallback = embedding['embedding']
     return distribution.parametrize(
         x=embedding.get(key, fallback),
         conditions=auxiliaries.get(name, default=TensorDict())
     )
Beispiel #14
0
    def parametrize(self, *, x, conditions):
        """Compute logits, probability and state value for input `x`.

        The probability is the sigmoid of the logit layer output, clipped to
        [epsilon, 1 - epsilon]; the state value is the unclipped logit itself.

        Returns:
            TensorDict with keys `true_logit`, `false_logit`, `probability` and
            `state_value`.
        """
        one = tf_util.constant(value=1.0, dtype='float')
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        target_shape = (-1,) + self.action_spec.shape

        # Logit, which also serves as the state value
        logit = self.logit.apply(x=x)
        if len(self.input_spec.shape) == 1:
            logit = tf.reshape(tensor=logit, shape=target_shape)

        # Probability, clipped for numerical stability
        probability = tf.clip_by_value(
            t=tf.sigmoid(x=logit), clip_value_min=epsilon, clip_value_max=(one - epsilon)
        )

        # "Normalized" logits: logs of the probability and of its complement
        return TensorDict(
            true_logit=tf.math.log(x=probability),
            false_logit=tf.math.log(x=(one - probability)),
            probability=probability,
            state_value=logit
        )
 def function(name, distribution):
     """Return the mode and the entropy of `distribution` for the shared `embedding`.

     `embedding`, `auxiliaries` and `independent` come from the enclosing scope.
     """
     parameters = distribution.parametrize(
         x=embedding, conditions=auxiliaries.get(name, default=TensorDict())
     )
     mode = distribution.mode(parameters=parameters, independent=independent)
     return mode, distribution.entropy(parameters=parameters)
Beispiel #16
0
    def apply(self, *, x, deterministic, independent):
        """Apply `self.layers` to the singleton input via `Preprocessor._recursive_apply`.

        The input is registered under the name 'input' before the layers are applied.
        """
        assert x.is_singleton()
        value = x.singleton()

        return Preprocessor._recursive_apply(
            layer=self.layers, x=value, deterministic=deterministic,
            independent=independent, registered_tensors=TensorDict(input=value)
        )
Beispiel #17
0
 def function(name, distribution):
     """Sample from `distribution`, parametrized from its embedding output.

     The '<name>-embedding' entry is used ('action-embedding' if `name` is None),
     with 'embedding' as fallback; `embedding`, `auxiliaries`, `temperature` and
     `independent` come from the enclosing scope.
     """
     key = 'action-embedding' if name is None else name + '-embedding'
     parameters = distribution.parametrize(
         x=embedding.get(key, embedding['embedding']),
         conditions=auxiliaries.get(name, default=TensorDict())
     )
     return distribution.sample(
         parameters=parameters, temperature=temperature, independent=independent
     )
Beispiel #18
0
 def function(name, distribution):
     """Return the mode and the entropy of `distribution` for its embedding output.

     The '<name>-embedding' entry is used ('action-embedding' if `name` is None),
     with 'embedding' as fallback; `embedding`, `auxiliaries` and `independent`
     come from the enclosing scope.
     """
     key = 'action-embedding' if name is None else name + '-embedding'
     parameters = distribution.parametrize(
         x=embedding.get(key, embedding['embedding']),
         conditions=auxiliaries.get(name, default=TensorDict())
     )
     mode = distribution.mode(parameters=parameters, independent=independent)
     return mode, distribution.entropy(parameters=parameters)
Beispiel #19
0
    def next_internals(self, *, states, horizons, internals, actions, deterministic, independent):
        """Combine states and actions into one input and delegate to the parent class.

        Singleton specs are unwrapped, so the combined input holds the plain value
        under 'states' / 'actions' in that case.
        """
        inputs = TensorDict()
        inputs['states'] = states.singleton() if self.states_spec.is_singleton() else states
        inputs['actions'] = actions.singleton() if self.actions_spec.is_singleton() else actions

        return super().next_internals(
            states=inputs, horizons=horizons, internals=internals, deterministic=deterministic,
            independent=independent
        )
Beispiel #20
0
    def apply(self, *, x, horizons, internals, deterministic, independent):
        """Apply `self.layers` via `LayeredNetwork._recursive_apply`.

        Returns:
            Tuple of the network output and `internals`. If `self.outputs` is set, the
            output is a TensorDict holding the final value under 'embedding' plus the
            registered tensor of every name in `self.outputs`.
        """
        # Initial registry: the single input as 'state', or a copy of all inputs
        registered_tensors = TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()

        x, _ = LayeredNetwork._recursive_apply(
            layer=self.layers, x=x.value(), horizons=horizons, internals=internals,
            deterministic=deterministic, independent=independent,
            registered_tensors=registered_tensors, temporal_layer_check=False
        )

        if self.outputs is None:
            return x, internals

        result = TensorDict(embedding=x)
        result.update(((output, registered_tensors[output])
                       for output in self.outputs))
        return result, internals
Beispiel #21
0
    def parametrize(self, *, x, conditions):
        """Compute mean, standard deviation and log standard deviation for input `x`.

        The standard deviation is (softplus(s) + 0.2) / (log(2) + 0.2) for the clipped
        raw value `s`, additionally scaled by 0.25 if both action bounds are given.

        Returns:
            TensorDict with keys `mean`, `stddev` and `log_stddev`.
        """
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
        flat_input = self.input_spec.rank == 1

        def restore_action_shape(tensor):
            # Only rank-1 inputs need to be reshaped to the action shape
            if not flat_input:
                return tensor
            return tf.reshape(tensor=tensor, shape=((-1,) + self.action_spec.shape))

        # Mean
        mean = restore_action_shape(self.mean.apply(x=x))

        # Raw (pre-softplus) standard deviation
        if self.stddev_mode == 'global':
            multiples = (tf.shape(input=x)[0],) + (1,) * self.action_spec.rank
            raw_stddev = tf.tile(input=self.softplus_stddev, multiples=multiples)
        else:
            raw_stddev = restore_action_shape(self.softplus_stddev.apply(x=x))

        # Symmetric clipping for numerical stability (log_epsilon is negative)
        raw_stddev = tf.clip_by_value(
            t=raw_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )

        # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
        softplus_shift = tf_util.constant(value=0.2, dtype='float')
        log_two = tf_util.constant(value=np.log(2.0), dtype='float')
        numerator = tf.nn.softplus(features=raw_stddev) + softplus_shift
        stddev = numerator / (log_two + softplus_shift)

        # Scale stddev by 0.25 for bounded actions (TODO: 0.25 random choice)
        bounded = self.action_spec.min_value is not None and \
            self.action_spec.max_value is not None
        if bounded:
            stddev = stddev * tf_util.constant(value=0.25, dtype='float')

        return TensorDict(
            mean=mean, stddev=stddev, log_stddev=tf.math.log(x=(stddev + epsilon))
        )
Beispiel #22
0
    def action_value(self, *, states, horizons, internals, auxiliaries, actions):
        """Return the value of `actions` in `states`.

        States and actions are combined into one network input (singleton specs are
        unwrapped), the network is applied deterministically and independently, and
        `self.value` is applied to the resulting embedding.
        """
        inputs = TensorDict()
        inputs['states'] = states.singleton() if self.states_spec.is_singleton() else states
        inputs['actions'] = actions.singleton() if self.actions_spec.is_singleton() else actions

        embedding, _ = self.network.apply(
            x=inputs, horizons=horizons, internals=internals,
            deterministic=tf_util.constant(value=True, dtype='bool'), independent=True
        )
        return self.value.apply(x=embedding)
Beispiel #23
0
    def parametrize(self, *, x, conditions):
        """Compute logits, probabilities, state value and action values for input `x`.

        Without an explicit state-value layer, the state value is the log-sum-exp of
        the action values. With one, the action-value layer output is treated as
        advantages: it is centred over the last axis and the explicit state value is
        added, i.e. action_values = state_value + (advantage - mean(advantage)).

        Args:
            x: Input passed to the action-value (and state-value) layer.
            conditions: Must provide a boolean 'mask' entry if
                `self.config.enable_int_action_masking` is set.

        Returns:
            TensorDict with keys `logits`, `probabilities`, `states_value` and
            `action_values`.
        """
        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        shape = (-1, ) + self.action_spec.shape + (
            self.action_spec.num_values, )

        # Action values
        action_values = self.action_values.apply(x=x)
        action_values = tf.reshape(tensor=action_values, shape=shape)

        # States value
        if self.state_value is None:
            # Implicit states value (TODO: experimental)
            states_value = tf.reduce_logsumexp(input_tensor=action_values,
                                               axis=-1)

        else:
            # Explicit states value and advantage-based action values
            states_value = self.state_value.apply(x=x)
            states_value = tf.reshape(tensor=states_value, shape=shape[:-1])
            # Centre the advantages BEFORE adding the state value: the state value is
            # constant along the last axis, so subtracting the mean afterwards would
            # cancel it again and leave only zero-mean advantages.
            action_values -= tf.math.reduce_mean(input_tensor=action_values,
                                                 axis=-1,
                                                 keepdims=True)
            action_values += tf.expand_dims(input=states_value, axis=-1)

        # Masking (TODO: before or after above?)
        if self.config.enable_int_action_masking:
            # Masked-out choices get the smallest float, hence (near) zero probability
            min_float = tf.fill(dims=tf.shape(input=action_values),
                                value=tf_util.get_dtype(type='float').min)
            action_values = tf.where(condition=conditions['mask'],
                                     x=action_values,
                                     y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=action_values, axis=-1)

        # "Normalized" logits, bounded from below by log(epsilon)
        logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

        return TensorDict(logits=logits,
                          probabilities=probabilities,
                          states_value=states_value,
                          action_values=action_values)
Beispiel #24
0
    def apply(self, *, x, horizons, internals, deterministic, independent):
        """Apply `self.layers` via `LayeredNetwork._recursive_apply`.

        Returns:
            Tuple of the final layer output and `internals`.
        """
        # Initial registry: the single input as 'state', or a copy of all inputs
        registered_tensors = TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()

        output, _ = LayeredNetwork._recursive_apply(
            layer=self.layers, x=x.value(), horizons=horizons, internals=internals,
            deterministic=deterministic, independent=independent,
            registered_tensors=registered_tensors, temporal_layer_check=False
        )
        return output, internals
Beispiel #25
0
    def apply(self, *, x, horizons, internals, deterministic, independent):
        """Pass `x` through `self.layers` in order.

        Register layers store their output under `layer.tensor`, multi-input layers
        read from the registered tensors, and temporal layers additionally read and
        update `internals[layer.name]`. A second temporal layer is rejected unless a
        multi-input layer was applied since the previous one.

        Returns:
            Tuple of the final layer output and `internals`.
        """
        # Initial registry: the single input as 'state', or a copy of all inputs
        registered_tensors = TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()
        x = x.value()

        # True once a temporal layer was applied; reset only by a multi-input layer
        seen_temporal = False
        for layer in self.layers:
            if isinstance(layer, Register):
                if layer.tensor in registered_tensors:
                    raise TensorforceError.exists(name='registered tensor',
                                                  value=layer.tensor)
                x = registered_tensors[layer.tensor] = layer.apply(x=x)
                continue

            if isinstance(layer, MultiInputLayer):
                if layer.tensors not in registered_tensors:
                    raise TensorforceError.exists_not(name='registered tensor',
                                                      value=layer.tensors)
                x = layer.apply(x=registered_tensors[layer.tensors])
                seen_temporal = False
                continue

            if isinstance(layer, NondeterministicLayer):
                x = layer.apply(x=x, deterministic=deterministic)
            elif isinstance(layer, StatefulLayer):
                x = layer.apply(x=x, independent=independent)
            elif isinstance(layer, TemporalLayer):
                if seen_temporal:
                    raise TensorforceError(
                        "Multiple successive temporal layers like RNNs are currently not supported."
                    )
                layer_output = layer.apply(
                    x=x, horizons=horizons, internals=internals[layer.name])
                x, internals[layer.name] = layer_output
                seen_temporal = True
            else:
                x = layer.apply(x=x)

        return x, internals
Beispiel #26
0
        def fn_initial_gradients(*, states, horizons, internals, auxiliaries,
                                 actions, reward, reference):
            """Return the negated gradient of the baseline's action value w.r.t. the action.

            The action is recomputed by `policy.act` (the `actions` argument is
            overwritten); `reward` and `reference` are not used here. `policy` and
            `baseline` come from the enclosing scope.

            Only a single action of rank <= 2 is supported, and only the first entry
            (`[0]`, or `[0][0]` for shape (?, 1)) of the gradient is returned.
            """
            if 'policy' in internals:
                policy_internals = internals['policy']
                baseline_internals = internals['baseline']
            else:
                policy_internals = internals
                # TODO: Baseline currently cannot have internal states, since generally only policy
                # internals are passed to policy optimizer
                assert len(baseline.internals_spec) == 0
                baseline_internals = TensorDict()

            # Action chosen by the policy for the given states
            actions = policy.act(states=states,
                                 horizons=horizons,
                                 internals=policy_internals,
                                 auxiliaries=auxiliaries,
                                 independent=True,
                                 return_internals=False)
            assert len(actions) == 1
            action = actions.value()
            shape = tf_util.shape(x=action)
            assert len(shape) <= 2

            with tf.GradientTape(persistent=False,
                                 watch_accessed_variables=False) as tape:
                # Only the action tensor is watched, so the gradient is w.r.t. the action
                tape.watch(tensor=action)
                actions_value = baseline.actions_value(
                    states=states,
                    horizons=horizons,
                    internals=baseline_internals,
                    auxiliaries=auxiliaries,
                    actions=actions,
                    reduced=True,
                    return_per_action=False)
                if len(shape) == 1:
                    return -tape.gradient(target=actions_value,
                                          sources=action)[0]
                elif len(shape) == 2 and shape[1] == 1:
                    return -tape.gradient(target=actions_value,
                                          sources=action)[0][0]
                else:
                    # Other action shapes are not supported
                    assert False
Beispiel #27
0
    def apply(self, *, x, independent):
        """Feed ``x`` through ``self.layers`` in order and return the final output.

        ``Register`` layers store their output under ``layer.tensor``;
        ``MultiInputLayer`` layers read their input from the stored tensors via
        ``layer.tensors``; ``StatefulLayer`` layers additionally receive
        ``independent``. The initial input is stored under the name ``x``.

        Raises:
            TensorforceError: if a ``Register`` layer reuses an already stored
                name, or a ``MultiInputLayer`` refers to a name not stored yet.
        """
        stored = TensorDict(x=x)

        for layer in self.layers:
            if isinstance(layer, Register):
                target = layer.tensor
                if target in stored:
                    raise TensorforceError.exists(name='registered tensor',
                                                  value=target)
                x = layer.apply(x=x)
                stored[target] = x
                continue

            if isinstance(layer, MultiInputLayer):
                sources = layer.tensors
                if sources not in stored:
                    raise TensorforceError.exists_not(name='registered tensor',
                                                      value=sources)
                x = layer.apply(x=stored[sources])
                continue

            if isinstance(layer, StatefulLayer):
                # Stateful layers need to know whether this is an independent call
                x = layer.apply(x=x, independent=independent)
            else:
                x = layer.apply(x=x)

        return x
Beispiel #28
0
    def act_entropy(self, *, states, horizons, internals, auxiliaries,
                    deterministic, independent):
        """Choose actions and compute the entropy of the action distributions.

        Depending on ``deterministic``, each distribution's mode or a
        temperature-controlled sample is taken (selected via ``tf.cond``).

        Returns:
            Tuple ``(actions, internals, entropy)``, where ``internals`` comes
            from ``self.network.apply`` and ``entropy`` is the mean over axis 1
            of the per-action entropies, each reshaped to ``(-1, spec.size)``
            and concatenated along axis 1.
        """
        assertions = []
        if self.config.create_tf_assertions and not independent:
            # Non-independent calls are asserted to be non-deterministic
            assertions.append(tf.debugging.assert_equal(
                x=deterministic,
                y=tf_util.constant(value=False, dtype='bool')))

        embedding, internals = self.network.apply(
            x=states, horizons=horizons, internals=internals,
            deterministic=deterministic, independent=independent
        )
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        def parametrized(name, distribution):
            # Prefer the action-specific embedding, fall back to the shared one
            key = 'action-embedding' if name is None else name + '-embedding'
            features = embedding.get(key, embedding['embedding'])
            conditions = auxiliaries.get(name, default=TensorDict())
            return distribution.parametrize(x=features, conditions=conditions)

        def sample_with(name, distribution, temp):
            parameters = parametrized(name, distribution)
            sample = distribution.sample(parameters=parameters,
                                         temperature=temp,
                                         independent=independent)
            return sample, distribution.entropy(parameters=parameters)

        def fn_mode():
            def function(name, distribution):
                parameters = parametrized(name, distribution)
                mode = distribution.mode(parameters=parameters,
                                         independent=independent)
                return mode, distribution.entropy(parameters=parameters)

            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True)

        def fn_sample():
            if isinstance(self.temperature, dict):
                # One temperature per action, zipped with the distributions
                temperature = self.temperature.fmap(
                    function=(lambda x: x.value()), cls=TensorDict)
                return self.distributions.fmap(
                    function=sample_with, cls=TensorDict, with_names=True,
                    zip_values=(temperature, ))

            # Single temperature shared by all actions
            temperature = self.temperature.value()

            def function(name, distribution):
                return sample_with(name, distribution, temperature)

            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True)

        with tf.control_dependencies(control_inputs=assertions):
            actions, entropies = tf.cond(
                pred=deterministic, true_fn=fn_mode, false_fn=fn_sample)

            # See also implementation of StochasticPolicy.entropy()
            entropies = entropies.fmap(
                function=(lambda value, spec: tf.reshape(
                    tensor=value, shape=(-1, spec.size))),
                zip_values=self.actions_spec)
            flat_entropies = tf.concat(values=tuple(entropies.values()),
                                       axis=1)
            entropy = tf.math.reduce_mean(input_tensor=flat_entropies, axis=1)

            return actions, internals, entropy
Beispiel #29
0
    def act(self, *, states, horizons, internals, auxiliaries, deterministic,
            independent):
        """Choose actions from the action distributions.

        Depending on ``deterministic``, each distribution's mode or a
        temperature-controlled sample is taken (selected via ``tf.cond``).

        Returns:
            Tuple ``(actions, internals)``, where ``internals`` comes from
            ``self.network.apply``.
        """
        assertions = []
        if self.config.create_tf_assertions and not independent:
            # Non-independent calls are asserted to be non-deterministic
            assertions.append(tf.debugging.assert_equal(
                x=deterministic,
                y=tf_util.constant(value=False, dtype='bool')))

        embedding, internals = self.network.apply(
            x=states, horizons=horizons, internals=internals,
            deterministic=deterministic, independent=independent
        )
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        def parametrized(name, distribution):
            # Prefer the action-specific embedding, fall back to the shared one
            key = 'action-embedding' if name is None else name + '-embedding'
            features = embedding.get(key, embedding['embedding'])
            conditions = auxiliaries.get(name, default=TensorDict())
            return distribution.parametrize(x=features, conditions=conditions)

        def sample_with(name, distribution, temp):
            return distribution.sample(
                parameters=parametrized(name, distribution),
                temperature=temp,
                independent=independent)

        def fn_mode():
            def function(name, distribution):
                return distribution.mode(
                    parameters=parametrized(name, distribution),
                    independent=independent)

            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True)

        def fn_sample():
            if isinstance(self.temperature, dict):
                # One temperature per action, zipped with the distributions
                temperature = self.temperature.fmap(
                    function=(lambda x: x.value()), cls=TensorDict)
                return self.distributions.fmap(
                    function=sample_with, cls=TensorDict, with_names=True,
                    zip_values=(temperature, ))

            # Single temperature shared by all actions
            temperature = self.temperature.value()

            def function(name, distribution):
                return sample_with(name, distribution, temperature)

            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True)

        with tf.control_dependencies(control_inputs=assertions):
            chosen = tf.cond(
                pred=deterministic, true_fn=fn_mode, false_fn=fn_sample)
            return chosen, internals
Beispiel #30
0
    def parametrize(self, *, x, conditions):
        """Compute the distribution parameters for the input ``x``.

        Returns a ``TensorDict`` with ``probabilities``, ``logits``,
        ``action_values`` and ``state_value``, plus ``temperature`` when
        ``self.temperature_mode`` is not None. ``conditions['mask']`` is only
        read if ``self.config.enable_int_action_masking`` is set.
        """
        spec = self.action_spec
        mode = self.temperature_mode

        epsilon = tf_util.constant(value=util.epsilon, dtype='float')
        log_epsilon = tf_util.constant(value=np.log(util.epsilon),
                                       dtype='float')
        log_two = tf_util.constant(value=np.log(2.0), dtype='float')

        # Action values, reshaped to (-1,) + action shape + (num_values,)
        action_values = tf.reshape(
            tensor=self.action_values.apply(x=x),
            shape=((-1, ) + spec.shape + (spec.num_values, )))

        if mode is None:
            # Without temperature, the action values are used as logits directly
            logits = action_values

            # Implicit states value
            state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

        else:
            # Softplus temperature, either one global value or predicted from x
            if mode == 'global':
                multiples = (tf.shape(input=x)[0], ) + (1, ) * (spec.rank + 1)
                softplus_temperature = tf.tile(
                    input=self.softplus_temperature, multiples=multiples)
            elif mode == 'predicted':
                softplus_temperature = tf.reshape(
                    tensor=self.softplus_temperature.apply(x=x),
                    shape=((-1, ) + spec.shape + (1, )))

            # Clip softplus_temperature for numerical stability (epsilon < 1.0, hence negative)
            softplus_temperature = tf.clip_by_value(
                t=softplus_temperature,
                clip_value_min=log_epsilon,
                clip_value_max=-log_epsilon)

            # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
            softplus_shift = tf_util.constant(value=0.2, dtype='float')
            numerator = tf.nn.softplus(features=softplus_temperature) + softplus_shift
            temperature = numerator / (log_two + softplus_shift)

            # Logits
            logits = action_values / temperature

            # Implicit states value (the trailing temperature axis is dropped first)
            temperature = tf.squeeze(input=temperature, axis=-1)
            state_value = temperature * tf.reduce_logsumexp(
                input_tensor=logits, axis=-1)

        # # Explicit states value and advantage-based action values
        # state_value = self.state_value.apply(x=x)
        # state_value = tf.reshape(tensor=state_value, shape=shape[:-1])
        # action_values = tf.expand_dims(input=state_value, axis=-1) + action_values
        # action_values -= tf.math.reduce_mean(input_tensor=action_values, axis=-1, keepdims=True)

        # Action masking, affects action_values/probabilities/logits but not state_value
        if self.config.enable_int_action_masking:
            min_float = tf.fill(dims=tf.shape(input=action_values),
                                value=tf_util.get_dtype(type='float').min)
            mask = conditions['mask']
            action_values = tf.where(condition=mask, x=action_values,
                                     y=min_float)
            logits = tf.where(condition=mask, x=logits, y=min_float)

        # Softmax for corresponding probabilities
        probabilities = tf.nn.softmax(logits=logits, axis=-1)

        # "Normalized" logits
        logits = tf.math.log(x=(probabilities + epsilon))
        # Unstable
        # logits = tf.nn.log_softmax(logits=logits, axis=-1)
        # Doesn't take masking into account
        # logits = action_values - tf.expand_dims(input=state_value, axis=-1) ... / temperature

        # Assemble the result once; temperature is only included if a mode is set
        parameters = dict(probabilities=probabilities)
        if mode is not None:
            parameters['temperature'] = temperature
        parameters.update(logits=logits,
                          action_values=action_values,
                          state_value=state_value)
        return TensorDict(**parameters)