Code example #1
    def state_value(self, *, states, horizons, internals, auxiliaries):
        """Compute the state value V(s).

        Only the 'separate' mode has a dedicated state-value head
        (``self.s_value``); every other mode defers to the superclass.

        Args:
            states: State tensors for the network pass.
            horizons: Horizon tensors for the network pass.
            internals: Internal/recurrent state tensors.
            auxiliaries: Auxiliary tensors; unused here but part of the
                superclass signature.

        Returns:
            State-value tensor produced by the state-value head.
        """
        if self.state_value_mode == 'separate':
            # Value estimation must not sample: force a deterministic pass.
            deterministic = tf_util.constant(value=True, dtype='bool')
            embedding, _ = self.network.apply(
                x=states, horizons=horizons, internals=internals, deterministic=deterministic,
                independent=True
            )
            # Normalize a bare-tensor network output to the dict form used below.
            if not isinstance(embedding, TensorDict):
                embedding = TensorDict(embedding=embedding)

            # Prefer a dedicated 'state-embedding' output, falling back to the
            # generic 'embedding'. Checked lazily: dict.get would evaluate the
            # embedding['embedding'] fallback eagerly and raise KeyError for a
            # network that only provides 'state-embedding'.
            if 'state-embedding' in embedding:
                x = embedding['state-embedding']
            else:
                x = embedding['embedding']
            return self.s_value.apply(x=x)

        else:
            return super().state_value(
                states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries
            )
Code example #2
    def state_values(self, *, states, horizons, internals, auxiliaries):
        """Compute a state value per action component, i.e. max_a Q(s, a).

        A deterministic network pass produces embeddings, then one state-value
        tensor per action component is derived according to
        ``self.state_value_mode``:

        - 'implicit': maximum over the raw action-value head outputs.
        - 'separate': dueling decomposition Q = V + (A - mean(A)) with one
          shared state-value head, then maximized over actions.
        - 'separate-per-action': a dedicated state-value head per component.

        Args:
            states: State tensors for the network pass.
            horizons: Horizon tensors for the network pass.
            internals: Internal/recurrent state tensors.
            auxiliaries: Auxiliary tensors; supply the per-action 'mask' when
                int-action masking is enabled.

        Returns:
            TensorDict mapping each action name to a state-value tensor of
            shape (batch,) + spec.shape (per the reshapes below).
        """
        # Value estimation must not sample, so force a deterministic pass.
        deterministic = tf_util.constant(value=True, dtype='bool')
        embedding, _ = self.network.apply(
            x=states, horizons=horizons, internals=internals, deterministic=deterministic,
            independent=True
        )
        # Normalize a bare-tensor network output to the dict form used below.
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        if self.state_value_mode == 'implicit':

            def function(name, spec, a_value):
                # Use the action-specific embedding if the network provides
                # one, else the generic embedding.
                # NOTE(review): .get evaluates the fallback eagerly, so
                # embedding['embedding'] is assumed to always exist — confirm.
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                action_value = a_value.apply(x=x)
                # Restore (batch,) + action shape + num-alternatives layout.
                # Only 'bool'/'int' specs are handled; any other type would
                # leave `shape` unbound — presumably excluded upstream (TODO
                # confirm).
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                action_value = tf.reshape(tensor=action_value, shape=shape)
                if spec.type == 'bool':
                    # State value = better of the two boolean alternatives.
                    return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
                elif spec.type == 'int':
                    if self.config.enable_int_action_masking:
                        # Exclude invalid actions from the max by assigning the
                        # smallest representable float.
                        mask = auxiliaries[name]['mask']
                        min_float = tf_util.get_dtype(type='float').min
                        min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                        action_value = tf.where(condition=mask, x=action_value, y=min_float)
                    return tf.math.reduce_max(input_tensor=action_value, axis=-1)

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
            )

        elif self.state_value_mode == 'separate':
            # Shared state-value head; per-action heads output advantages.
            state_value = self.s_value.apply(
                x=embedding.get('state-embedding', embedding['embedding'])
            )

            def function(name, spec, a_value):
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                advantage_value = a_value.apply(x=x)
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
                # Dueling: center advantages over the alternatives axis, then
                # add the state value broadcast over all action dimensions.
                mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
                shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
                _state_value = tf.reshape(tensor=state_value, shape=shape)
                action_value = _state_value + (advantage_value - mean)
                if spec.type == 'bool':
                    return tf.math.maximum(x=action_value[..., 0], y=action_value[..., 1])
                elif spec.type == 'int':
                    if self.config.enable_int_action_masking:
                        # Same masking scheme as in the 'implicit' branch.
                        mask = auxiliaries[name]['mask']
                        min_float = tf_util.get_dtype(type='float').min
                        min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                        action_value = tf.where(condition=mask, x=action_value, y=min_float)
                    return tf.math.reduce_max(input_tensor=action_value, axis=-1)

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
            )

        elif self.state_value_mode == 'separate-per-action':

            def function(name, spec, s_value):
                # Each action component has its own state-value head, so the
                # value is read directly — no max over alternatives needed.
                if name is None:
                    x = embedding.get('state-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-state-embedding', embedding['embedding'])
                state_value = s_value.apply(x=x)
                # Both branches produce the same (batch,) + action shape.
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape
                return tf.reshape(tensor=state_value, shape=shape)

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, zip_values=(self.s_values,)
            )
Code example #3
    def action_values(self, *, states, horizons, internals, auxiliaries, actions):
        """Compute Q(s, a) for the given actions, one tensor per component.

        A deterministic network pass produces embeddings; full action-value
        tensors are built per component according to
        ``self.state_value_mode`` (see ``state_values``), then the value of
        the supplied action is selected from the alternatives axis.

        Args:
            states: State tensors for the network pass.
            horizons: Horizon tensors for the network pass.
            internals: Internal/recurrent state tensors.
            auxiliaries: Auxiliary tensors; unused here beyond the signature.
            actions: TensorDict of chosen actions, one entry per component.

        Returns:
            TensorDict mapping each action name to a Q-value tensor of shape
            (batch,) + spec.shape (alternatives axis removed by the
            where/gather-squeeze below).
        """
        # Value estimation must not sample, so force a deterministic pass.
        deterministic = tf_util.constant(value=True, dtype='bool')
        embedding, _ = self.network.apply(
            x=states, horizons=horizons, internals=internals, deterministic=deterministic,
            independent=True
        )
        # Normalize a bare-tensor network output to the dict form used below.
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        if self.state_value_mode == 'implicit':

            def function(name, spec, a_value, action):
                # Use the action-specific embedding if available.
                # NOTE(review): .get evaluates the fallback eagerly, so
                # embedding['embedding'] is assumed to always exist — confirm.
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                action_value = a_value.apply(x=x)
                # Restore (batch,) + action shape + num-alternatives layout.
                # Only 'bool'/'int' specs are handled; other types would leave
                # `shape` unbound — presumably excluded upstream (TODO confirm).
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                action_value = tf.reshape(tensor=action_value, shape=shape)
                if spec.type == 'bool':
                    # Index 0 holds the True value, index 1 the False value
                    # (consistent with the comparison in act()).
                    return tf.where(
                        condition=action, x=action_value[..., 0], y=action_value[..., 1]
                    )
                elif spec.type == 'int':
                    # Add a trailing index axis, gather one value per batched
                    # action position, then drop that axis again.
                    action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                    action_value = tf.gather(
                        params=action_value, indices=action, batch_dims=(spec.rank + 1)
                    )
                    return tf.squeeze(input=action_value, axis=(spec.rank + 1))

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(self.a_values, actions)
            )

        elif self.state_value_mode == 'separate':
            # Shared state-value head; per-action heads output advantages.
            state_value = self.s_value.apply(
                x=embedding.get('state-embedding', embedding['embedding'])
            )

            def function(name, spec, a_value, action):
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                advantage_value = a_value.apply(x=x)
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
                # Dueling: center advantages, then add the state value
                # broadcast over all action dimensions.
                mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
                shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
                _state_value = tf.reshape(tensor=state_value, shape=shape)
                action_value = _state_value + (advantage_value - mean)
                if spec.type == 'bool':
                    return tf.where(
                        condition=action, x=action_value[..., 0], y=action_value[..., 1]
                    )
                elif spec.type == 'int':
                    # Same gather-and-squeeze selection as the implicit branch.
                    action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                    action_value = tf.gather(
                        params=action_value, indices=action, batch_dims=(spec.rank + 1)
                    )
                    return tf.squeeze(input=action_value, axis=(spec.rank + 1))

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(self.a_values, actions)
            )

        elif self.state_value_mode == 'separate-per-action':

            def function(name, spec, s_value, a_value, action):
                # Per-component state-value head plus advantage head.
                if name is None:
                    state_value = s_value.apply(
                        x=embedding.get('state-embedding', embedding['embedding'])
                    )
                    advantage_value = a_value.apply(
                        x=embedding.get('action-embedding', embedding['embedding'])
                    )
                else:
                    state_value = s_value.apply(
                        x=embedding.get(name + '-state-embedding', embedding['embedding'])
                    )
                    advantage_value = a_value.apply(
                        x=embedding.get(name + '-embedding', embedding['embedding'])
                    )
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
                mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
                # State value already matches the action shape here, so a
                # trailing expand_dims suffices for broadcasting.
                action_value = tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean)
                if spec.type == 'bool':
                    return tf.where(
                        condition=action, x=action_value[..., 0], y=action_value[..., 1]
                    )
                elif spec.type == 'int':
                    action = tf.expand_dims(input=action, axis=(spec.rank + 1))
                    action_value = tf.gather(
                        params=action_value, indices=action, batch_dims=(spec.rank + 1)
                    )
                    return tf.squeeze(input=action_value, axis=(spec.rank + 1))

            return self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(self.s_values, self.a_values, actions)
            )
Code example #4
    def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent):
        """Compute greedy actions argmax_a Q(s, a) and updated internals.

        A network pass produces embeddings and updated internals; full
        action-value tensors are built per action component according to
        ``self.state_value_mode`` (see ``state_values``), then the
        value-maximizing alternative is selected per component. Summary and
        tracking ops for the action values are attached as control
        dependencies of the selection so they execute in graph mode.

        Args:
            states: State tensors for the network pass.
            horizons: Horizon tensors for the network pass.
            internals: Internal/recurrent state tensors (updated and returned).
            auxiliaries: Auxiliary tensors; supply the per-action 'mask' when
                int-action masking is enabled.
            deterministic: Bool tensor forwarded to the network pass.
            independent: Flag forwarded to the network pass.

        Returns:
            Tuple of (actions TensorDict, updated internals).
        """
        embedding, internals = self.network.apply(
            x=states, horizons=horizons, internals=internals, deterministic=deterministic,
            independent=independent
        )
        # Normalize a bare-tensor network output to the dict form used below.
        if not isinstance(embedding, TensorDict):
            embedding = TensorDict(embedding=embedding)

        if self.state_value_mode == 'implicit':
            # Q-values come directly from the per-action value heads.

            def function(name, spec, a_value):
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                action_value = a_value.apply(x=x)
                # Restore (batch,) + action shape + num-alternatives layout.
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                return tf.reshape(tensor=action_value, shape=shape)

            action_values = self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
            )

        elif self.state_value_mode == 'separate':
            # Dueling decomposition with one shared state-value head:
            # Q = V + (A - mean(A)).
            state_value = self.s_value.apply(
                x=embedding.get('state-embedding', embedding['embedding'])
            )

            def function(name, spec, a_value):
                if name is None:
                    x = embedding.get('action-embedding', embedding['embedding'])
                else:
                    x = embedding.get(name + '-embedding', embedding['embedding'])
                advantage_value = a_value.apply(x=x)
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
                mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
                # Broadcast the per-batch state value over all action dims.
                shape = (-1,) + tuple(1 for _ in range(spec.rank + 1))
                _state_value = tf.reshape(tensor=state_value, shape=shape)
                return _state_value + (advantage_value - mean)

            action_values = self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True, zip_values=(self.a_values,)
            )

        elif self.state_value_mode == 'separate-per-action':
            # Dueling decomposition with one state-value head per component.

            def function(name, spec, s_value, a_value):
                if name is None:
                    state_value = s_value.apply(
                        x=embedding.get('state-embedding', embedding['embedding'])
                    )
                    advantage_value = a_value.apply(
                        x=embedding.get('action-embedding', embedding['embedding'])
                    )
                else:
                    state_value = s_value.apply(
                        x=embedding.get(name + '-state-embedding', embedding['embedding'])
                    )
                    advantage_value = a_value.apply(
                        x=embedding.get(name + '-embedding', embedding['embedding'])
                    )
                if spec.type == 'bool':
                    shape = (-1,) + spec.shape + (2,)
                elif spec.type == 'int':
                    shape = (-1,) + spec.shape + (spec.num_values,)
                advantage_value = tf.reshape(tensor=advantage_value, shape=shape)
                mean = tf.math.reduce_mean(input_tensor=advantage_value, axis=-1, keepdims=True)
                return tf.expand_dims(input=state_value, axis=-1) + (advantage_value - mean)

            action_values = self.actions_spec.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(self.s_values, self.a_values)
            )

        def function(name, spec, action_value):
            # Greedy selection per action component. Summary and tracking ops
            # are accumulated into one dependency list; the original code
            # reassigned `dependencies` after self.summary(), silently
            # dropping the summary ops from the control dependencies.
            if spec.type == 'bool':

                def fn_summary():
                    axis = range(spec.rank + 1)
                    values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                    return [values[0], values[1]]

                if name is None:
                    names = ['action-values/true', 'action-values/false']
                else:
                    names = ['action-values/' + name + '-true', 'action-values/' + name + '-false']
                dependencies = self.summary(
                    label='action-value', name=names, data=fn_summary, step='timesteps'
                )

                def fn_tracking():
                    return tf.math.reduce_mean(input_tensor=action_value, axis=0)

                if name is None:
                    n = 'action-values'
                else:
                    n = name + '-values'
                # Accumulate instead of overwriting so both summary and
                # tracking ops run under the control-dependency scope below.
                dependencies = list(dependencies) + list(
                    self.track(label='action-value', name=n, data=fn_tracking)
                )

                with tf.control_dependencies(control_inputs=dependencies):
                    # True is chosen when its value strictly exceeds False's.
                    return (action_value[..., 0] > action_value[..., 1])

            elif spec.type == 'int':

                def fn_summary():
                    axis = range(spec.rank + 1)
                    values = tf.math.reduce_mean(input_tensor=action_value, axis=axis)
                    return [values[n] for n in range(spec.num_values)]

                if name is None:
                    prefix = 'action-values/action'
                else:
                    prefix = 'action-values/' + name + '-action'
                names = [prefix + str(n) for n in range(spec.num_values)]
                dependencies = self.summary(
                    label='action-value', name=names, data=fn_summary, step='timesteps'
                )

                def fn_tracking():
                    return tf.math.reduce_mean(input_tensor=action_value, axis=0)

                if name is None:
                    n = 'action-values'
                else:
                    n = name + '-values'
                # Accumulate for the same reason as in the bool branch.
                dependencies = list(dependencies) + list(
                    self.track(label='action-value', name=n, data=fn_tracking)
                )

                with tf.control_dependencies(control_inputs=dependencies):
                    if self.config.enable_int_action_masking:
                        # Invalid actions get the smallest representable float
                        # so argmax never selects them.
                        mask = auxiliaries[name]['mask']
                        min_float = tf_util.get_dtype(type='float').min
                        min_float = tf.fill(dims=tf.shape(input=action_value), value=min_float)
                        action_value = tf.where(condition=mask, x=action_value, y=min_float)
                    return tf.math.argmax(input=action_value, axis=-1, output_type=spec.tf_type())

        actions = self.actions_spec.fmap(
            function=function, cls=TensorDict, with_names=True, zip_values=(action_values,)
        )

        return actions, internals