class DummyNNWithDictInput(NeuralNetwork):
    """
    Dummy NN with dict input taking a dict with keys "a" and "b" passes them both through two different (parallel,
    not connected in any way) dense layers and then concatenating the outputs to yield the final output.
    """

    def __init__(self, num_units_a=3, num_units_b=2, scope="dummy-nn-with-dict-input", **kwargs):
        super(DummyNNWithDictInput, self).__init__(scope=scope, **kwargs)

        self.num_units_a = num_units_a
        self.num_units_b = num_units_b

        # Splits the input into two streams.
        self.splitter = ContainerSplitter("a", "b")
        self.stack_a = DenseLayer(units=self.num_units_a, scope="dense-a")
        self.stack_b = DenseLayer(units=self.num_units_b, scope="dense-b")
        self.concat_layer = ConcatLayer()

        # Add all sub-components to this one.
        self.add_components(self.splitter, self.stack_a, self.stack_b, self.concat_layer)

    @rlgraph_api
    def call(self, input_dict):
        # Split the input dict into two streams.
        input_a, input_b = self.splitter.call(input_dict)

        # Get the two stack outputs.
        output_a = self.stack_a.call(input_a)
        output_b = self.stack_b.call(input_b)

        # Concat everything together, that's the output.
        concatenated_data = self.concat_layer.call(output_a, output_b)

        return concatenated_data
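A minimal usage sketch for the dummy network above (not part of the original snippet; the Dict-space constructor, import paths, and the ComponentTest pattern are assumed from the other examples on this page, and the shapes are illustrative only):

# Hypothetical usage sketch only -- names and import paths are assumptions.
from rlgraph.spaces import Dict, FloatBox
from rlgraph.tests import ComponentTest

input_space = Dict(dict(a=FloatBox(shape=(4,)), b=FloatBox(shape=(5,))), add_batch_rank=True)
nn = DummyNNWithDictInput(num_units_a=3, num_units_b=2)
test = ComponentTest(component=nn, input_spaces=dict(input_dict=input_space))

# Push a batch of 2 dict samples through the custom `call` API-method.
# Expected output shape: (2, num_units_a + num_units_b) = (2, 5).
out = test.test(("call", input_space.sample(2)))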
Example #2
    def __init__(self, network_spec, units_state_value_stream,
                 weights_spec_state_value_stream=None, biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu", scope="dueling-policy", **kwargs):
        super(DuelingPolicy, self).__init__(network_spec, scope=scope, **kwargs)

        self.action_space_flattened = self.action_space.flatten()

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # Our softmax component to produce probabilities.
        self.softmax = Softmax()

        # Create the extra state-value stream layers.
        # TODO: Make this a NN-spec as well (right now it's one layer fixed plus the final value node).
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream, weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream"
        )
        self.state_value_node = DenseLayer(
            units=1,
            activation="linear",
            scope="state-value-node"
        )

        self.add_components(self.dense_layer_state_value_stream, self.state_value_node)
    def __init__(self, num_units_a=3, num_units_b=2, scope="dummy-nn-with-dict-input", **kwargs):
        super(DummyNNWithDictInput, self).__init__(scope=scope, **kwargs)

        self.num_units_a = num_units_a
        self.num_units_b = num_units_b

        # Splits the input into two streams.
        self.splitter = ContainerSplitter("a", "b")
        self.stack_a = DenseLayer(units=self.num_units_a, scope="dense-a")
        self.stack_b = DenseLayer(units=self.num_units_b, scope="dense-b")
        self.concat_layer = ConcatLayer()

        # Add all sub-components to this one.
        self.add_components(self.splitter, self.stack_a, self.stack_b, self.concat_layer)
    def test_add_layer_to_simple_nn(self):
        # Space must contain a batch dimension (otherwise, NNLayer will complain).
        space = FloatBox(shape=(3, ), add_batch_rank=True)

        # Create a simple neural net from json.
        neural_net = NeuralNetwork.from_spec(
            config_from_path(
                "configs/test_simple_nn.json"))  # type: NeuralNetwork
        # Add another layer to it.
        neural_net.add_layer(DenseLayer(units=10, scope="last-layer"))

        # Do not seed; we calculate expectations manually.
        test = ComponentTest(component=neural_net,
                             input_spaces=dict(nn_input=space))

        # Batch of size=3.
        input_ = space.sample(3)
        # Calculate output manually.
        var_dict = test.read_variable_values(neural_net.variable_registry)

        expected = dense_layer(
            dense_layer(input_,
                        var_dict["test-network/hidden-layer/dense/kernel"],
                        var_dict["test-network/hidden-layer/dense/bias"]),
            var_dict["test-network/last-layer/dense/kernel"],
            var_dict["test-network/last-layer/dense/bias"])

        test.test(("apply", input_),
                  expected_outputs=dict(output=expected),
                  decimals=5)

        test.terminate()
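For reference, the `dense_layer` helper used above to compute expectations by hand can be approximated in NumPy roughly as follows (a sketch under the assumption of a linear activation; the actual test helper may also apply the layer's activation function):

import numpy as np

def dense_layer_np(inputs, kernel, bias):
    # Plain affine transform: inputs [batch, in-dim] x kernel [in-dim, units] + bias [units].
    # For ReLU layers, additionally wrap the result in np.maximum(..., 0.0).
    return np.matmul(inputs, kernel) + bias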
Example #5
    def __init__(self,
                 network_spec,
                 value_weights_spec=None,
                 value_biases_spec=None,
                 value_activation=None,
                 value_fold_time_rank=False,
                 value_unfold_time_rank=False,
                 scope="shared-value-function-policy",
                 **kwargs):
        super(SharedValueFunctionPolicy, self).__init__(network_spec,
                                                        scope=scope,
                                                        **kwargs)

        # Create the extra value dense layer with 1 node.
        self.value_unfold_time_rank = value_unfold_time_rank
        self.value_network = NeuralNetwork(
            DenseLayer(
                units=1,
                activation=value_activation,
                weights_spec=value_weights_spec,
                biases_spec=value_biases_spec,
            ),
            fold_time_rank=value_fold_time_rank,
            unfold_time_rank=value_unfold_time_rank,
            scope="value-function-node")

        self.add_components(self.value_network)
Example #6
    def test_faulty_op_catching(self):
        """
        Adds a single component with 2-to-2 graph_fn to the core and passes two containers through it
        with flatten/split options enabled.
        """
        # Construct some easy component containing a sub-component.
        dense_layer = DenseLayer(units=2, scope="dense-layer")
        string_layer = EmbeddingLookup(embed_dim=3,
                                       vocab_size=4,
                                       scope="embed-layer")
        container_component = Component(dense_layer, string_layer)

        # Add the component's API method.
        @rlgraph_api(component=container_component)
        def test_api(self, a):
            dense_result = self.get_sub_component_by_name("dense-layer").call(
                a)
            # First call dense to get a vector output, then call embedding, which is expecting an int input.
            # This should fail EmbeddingLookup's input space checking (only during the build phase).
            return self.get_sub_component_by_name("embed-layer").call(
                dense_result)

        # Test graphviz component graph drawing.
        draw_meta_graph(container_component, apis=True)

        test = ComponentTest(
            component=container_component,
            input_spaces=dict(
                a=spaces.FloatBox(shape=(4, ), add_batch_rank=True)))
Example #7
    def build_image_processing_stack():
        """
        Constructs a ReShape preprocessor to fold the time rank into the batch rank.

        Then builds the 2 Conv2D Layers followed by ReLUs.

        Then adds: fc(256) + ReLU.
        """
        # Collect components for image stack before unfolding time-rank going into main LSTM.
        sub_components = list()

        # Divide by 255
        sub_components.append(Divide(divisor=255, scope="divide-255"))

        for i, (num_filters, kernel_size, stride) in enumerate(zip([16, 32], [8, 4], [4, 2])):
            # Conv2D plus ReLU activation function.
            conv2d = Conv2DLayer(
                filters=num_filters, kernel_size=kernel_size, strides=stride, padding="same",
                activation="relu", scope="conv2d-{}".format(i)
            )
            sub_components.append(conv2d)

        # A Flatten preprocessor, then an fc(256) layer followed by a ReLU.
        sub_components.extend([
            ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
            DenseLayer(units=256),  # Dense layer.
            NNLayer(activation="relu", scope="relu-before-lstm"),
        ])

        # (This stack was formerly known as `stack_before_unfold`.)
        image_stack = Stack(sub_components, scope="image-stack")

        return image_stack
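A rough sanity check of the feature-map sizes this stack produces, assuming (purely for illustration; the snippet itself does not fix a resolution) 84x84 input images: with "same" padding each Conv2D outputs ceil(in / stride) pixels per side, i.e. 84 -> 21 -> 11, leaving an 11x11x32 volume (3872 values) to be flattened into the fc(256) layer.

import math

# Illustrative only: the 84x84 input resolution is an assumption, not part of the snippet above.
size = 84
for num_filters, stride in zip([16, 32], [4, 2]):
    size = math.ceil(size / stride)      # "same" padding: out = ceil(in / stride)
    print(size, size, num_filters)       # 21 21 16, then 11 11 32
print(size * size * 32)                  # 3872 inputs feeding the fc(256) layer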
Example #8
    def __init__(self,
                 action_space=None,
                 final_shape=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 pre_network_spec=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Optional[Space]): The action Space within which this Component will create actions.
                NOTE: Exactly one of `action_space` or `final_shape` must be provided.

            final_shape (Optional[Tuple[int]]): An optional final output shape (in case action_space is not provided).
                If None, will calculate the shape automatically from the given `action_space`.
                NOTE: Exactly one of `action_space` or `final_shape` must be provided.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).

            pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before the
                last action layer. If None, only the action layer itself is applied.
        """
        # Build the action layer for this adapter based on the given action-space.
        self.action_space = None
        if action_space is not None:
            self.action_space = action_space.with_batch_rank()
            assert not isinstance(self.action_space, ContainerSpace),\
                "ERROR: ActionAdapter cannot handle ContainerSpaces!"

        units, self.final_shape = self.get_units_and_shape()

        action_layer = DenseLayer(units=units,
                                  activation=activation,
                                  weights_spec=weights_spec,
                                  biases_spec=biases_spec,
                                  scope="action-layer")

        # Do we have a pre-NN?
        self.network = NeuralNetwork.from_spec(
            pre_network_spec, scope="action-network")  # type: NeuralNetwork
        self.network.add_layer(action_layer)

        # Add the reshape layer to match the action space's shape.
        self.network.add_layer(ReShape(new_shape=self.final_shape))

        super(ActionAdapter, self).__init__(self.network,
                                            scope=scope,
                                            **kwargs)
    def __init__(self, z_units, encoder_network_spec, decoder_network_spec,
                 **kwargs):
        """
        Args:
            z_units (int): Number of units of the latent (z) vectors that the encoder will produce.

            encoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct an encoder
                NeuralNetwork object from or a NeuralNetwork Component directly.

            decoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct a decoder
                NeuralNetwork object from or a NeuralNetwork Component directly.
        """
        super(VariationalAutoEncoder,
              self).__init__(scope="variational-auto-encoder", **kwargs)

        self.z_units = z_units

        # Create encoder and decoder networks.
        self.encoder_network = NeuralNetwork.from_spec(encoder_network_spec,
                                                       scope="encoder-network")
        self.decoder_network = NeuralNetwork.from_spec(decoder_network_spec,
                                                       scope="decoder-network")

        # Create the two Gaussian layers.
        self.mean_layer = DenseLayer(units=self.z_units, scope="mean-layer")
        self.stddev_layer = DenseLayer(units=self.z_units,
                                       scope="stddev-layer")

        # Create the Normal Distribution from which to sample.
        self.normal_distribution = Normal()

        # A concat layer to concat mean and stddev before passing it to the Normal distribution.
        # No longer needed: Pass Tuple (mean + stddev) into API-method instead of concat'd tensor.
        #self.concat_layer = ConcatLayer(axis=-1)

        # Add all sub-Components.
        self.add_components(
            self.encoder_network,
            self.decoder_network,
            self.mean_layer,
            self.stddev_layer,
            self.normal_distribution  #, self.concat_layer
        )
Example #10
    def build_image_processing_stack():
        """
        Constructs a ReShape preprocessor to fold the time rank into the batch rank.

        Then builds the 3 sequential Conv2D blocks that process the image information.
        Each of these 3 blocks consists of:
        - 1 Conv2D layer followed by a MaxPool2D
        - 2 residual blocks, each of which looks like:
            - ReLU + Conv2D + ReLU + Conv2D + element-wise add with original input

        Then adds: ReLU + fc(256) + ReLU.
        """
        # Collect components for image stack before unfolding time-rank going into main LSTM.
        sub_components = list()

        # Divide by 255
        sub_components.append(Divide(divisor=255, scope="divide-255"))

        for i, num_filters in enumerate([16, 32, 32]):
            # Conv2D plus MaxPool2D.
            conv2d_plus_maxpool = Stack(
                Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
                MaxPool2DLayer(pool_size=3, strides=2, padding="same"),
                scope="conv-max"
            )

            # Single unit for the residual layers (ReLU + Conv2D 3x3 stride=1).
            residual_unit = Stack(
                NNLayer(activation="relu"),  # single ReLU
                Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
                scope="relu-conv"
            )
            # Residual Layer.
            residual_layer = ResidualLayer(residual_unit=residual_unit, repeats=2)
            # Repeat same residual layer 2x.
            residual_repeater = RepeaterStack(sub_component=residual_layer, repeats=2)

            sub_components.append(Stack(conv2d_plus_maxpool, residual_repeater, scope="conv-unit-{}".format(i)))

        # A Flatten preprocessor, followed by an fc(256) block surrounded by ReLUs.
        sub_components.extend([
            ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
            NNLayer(activation="relu", scope="relu-1"),  # ReLU 1
            DenseLayer(units=256),  # Dense layer.
            NNLayer(activation="relu", scope="relu-2"),  # ReLU 2
        ])

        image_stack = Stack(sub_components, scope="image-stack")

        return image_stack
Example #11
    def __init__(self,
                 units_state_value_stream,
                 units_advantage_stream,
                 weights_spec_state_value_stream=None,
                 biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu",
                 weights_spec_advantage_stream=None,
                 biases_spec_advantage_stream=None,
                 activation_advantage_stream="relu",
                 scope="dueling-action-adapter",
                 **kwargs):
        # TODO: change add_units=-1 once we have a true base class for action-adapters.
        super(DuelingActionAdapter, self).__init__(add_units=0,
                                                   scope=scope,
                                                   **kwargs)

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # The advantage stream.
        self.units_advantage_stream = units_advantage_stream
        self.weights_spec_advantage_stream = weights_spec_advantage_stream
        self.biases_spec_advantage_stream = biases_spec_advantage_stream
        self.activation_advantage_stream = activation_advantage_stream

        # Create all 4 extra DenseLayers.
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream,
            weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream")
        self.dense_layer_advantage_stream = DenseLayer(
            units=self.units_advantage_stream,
            weights_spec=self.weights_spec_advantage_stream,
            biases_spec=self.biases_spec_advantage_stream,
            activation=self.activation_advantage_stream,
            scope="dense-layer-advantage-stream")
        self.state_value_node = DenseLayer(units=1,
                                           activation="linear",
                                           scope="state-value-node")
        # self.action_layer is our advantage layer

        self.add_components(self.dense_layer_state_value_stream,
                            self.dense_layer_advantage_stream,
                            self.state_value_node)
Example #12
class ActionAdapter(Component):
    """
    A Component that cleans up a neural network's flat output and gets it ready for parameterizing a
    Distribution Component.
    Processing steps include:
    - Sending the raw, flattened NN output through a Dense layer whose number of units matches the flattened
    action space.
    - Reshaping (according to the action Space).
    - Translating the reshaped outputs (logits) into probabilities (by softmaxing) and log-probabilities (log).
    """
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Only one of `add_units` or `units` may be provided.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: Only one of `add_units` or `units` may be provided.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).
        """
        super(ActionAdapter, self).__init__(scope=scope, **kwargs)

        self.action_space = action_space.with_batch_rank()
        self.weights_spec = weights_spec
        self.biases_spec = biases_spec
        self.activation = activation

        # Our (dense) action layer representing the flattened action space.
        self.action_layer = None

        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            self.reshape = ReShape(
                new_shape=self.action_space.get_shape(with_category_rank=True),
                flatten_categories=False)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # Those two dimensions are the mean and log sd
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))
            self.reshape = ReShape(new_shape=new_shape)

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        # Create the action-layer and add it to this component.
        self.action_layer = DenseLayer(units=units,
                                       activation=self.activation,
                                       weights_spec=self.weights_spec,
                                       biases_spec=self.biases_spec,
                                       scope="action-layer")

        self.add_components(self.action_layer, self.reshape)

    def check_input_spaces(self, input_spaces, action_space=None):
        # Check the input Space.
        last_nn_layer_space = input_spaces["nn_output"]  # type: Space
        sanity_check_space(last_nn_layer_space,
                           non_allowed_types=[ContainerSpace])

        # Check the action Space.
        sanity_check_space(self.action_space, must_have_batch_rank=True)
        if isinstance(self.action_space, IntBox):
            sanity_check_space(self.action_space, must_have_categories=True)
        else:
            # Fixme: Are there other restraints on continuous action spaces? E.g. no dueling layers?
            pass

    @rlgraph_api
    def get_action_layer_output(self, nn_output):
        """
        Returns the raw, non-reshaped output of the action-layer (DenseLayer) after passing the raw
        nn_output (coming from the previous Component) through it.

        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            DataOpRecord: The output of the action layer (a DenseLayer) after passing `nn_output` through it.
        """
        out = self.action_layer.apply(nn_output)
        return dict(output=out)

    @rlgraph_api
    def get_logits(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            SingleDataOp: The logits (raw nn_output, BUT reshaped).
        """
        aa_output = self.get_action_layer_output(nn_output)
        logits = self.reshape.apply(aa_output["output"])
        return logits

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            Tuple[SingleDataOp]:
                - logits (raw nn_output, BUT reshaped)
                - probabilities (softmaxed(logits))
                - log(probabilities)
        """
        logits = self.get_logits(nn_output)
        probabilities, log_probs = self._graph_fn_get_probabilities_log_probs(
            logits)
        return dict(logits=logits,
                    probabilities=probabilities,
                    log_probs=log_probs)

    # TODO: Use a SoftMax Component instead (uses the same code as the one below).
    @graph_fn
    def _graph_fn_get_probabilities_log_probs(self, logits):
        """
        Creates properties/parameters and log-probs from some reshaped output.

        Args:
            logits (SingleDataOp): The output of some layer that is already reshaped
                according to our action Space.

        Returns:
            tuple (2x SingleDataOp):
                parameters (DataOp): The parameters, ready to be passed to a Distribution object's
                    get_distribution API-method (usually some probabilities or loc/scale pairs).
                log_probs (DataOp): Simply the log(parameters).
        """
        if get_backend() == "tf":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = tf.maximum(x=tf.nn.softmax(logits=logits,
                                                        axis=-1),
                                        y=SMALL_NUMBER)
                # Log probs.
                log_probs = tf.log(x=parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = tf.split(value=logits,
                                        num_or_size_splits=2,
                                        axis=1)
                # Remove moments rank.
                mean = tf.squeeze(input=mean, axis=1)
                log_sd = tf.squeeze(input=log_sd, axis=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = tf.clip_by_value(t=log_sd,
                                          clip_value_min=log(SMALL_NUMBER),
                                          clip_value_max=-log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                softmax_logits = torch.softmax(logits, dim=-1)
                parameters = torch.max(softmax_logits, SMALL_NUMBER_TORCH)
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = torch.split(logits,
                                           split_size_or_sections=2,
                                           dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(log_sd,
                                     min=LOG_SMALL_NUMBER,
                                     max=-LOG_SMALL_NUMBER)

                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
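The discrete branch of `_graph_fn_get_probabilities_log_probs` above simply clamps the softmax output away from zero before taking the log. A backend-free NumPy sketch of that idea (the 1e-6 constant is a placeholder; the library defines its own SMALL_NUMBER):

import numpy as np

SMALL_NUMBER = 1e-6  # placeholder; the library's constant may differ

def probabilities_and_log_probs(logits):
    # Numerically stable softmax along the last axis, then clamp so log() never sees exact zeros.
    exp = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs = np.maximum(exp / exp.sum(axis=-1, keepdims=True), SMALL_NUMBER)
    return probs, np.log(probs)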
    def test_lstm_nn_with_custom_apply(self):
        # Space must contain a batch dimension (otherwise, NNLayer will complain).
        units = 3
        batch_size = 2
        time_steps = 4
        input_nodes = 2
        input_space = FloatBox(shape=(input_nodes, ),
                               add_batch_rank=True,
                               add_time_rank=True)
        internal_states_space = Tuple(FloatBox(shape=(units, )),
                                      FloatBox(shape=(units, )),
                                      add_batch_rank=True)

        def custom_apply(self, input_, internal_states=None):
            d0_out = self.get_sub_component_by_name("d0").apply(input_)
            lstm_out = self.get_sub_component_by_name("lstm").apply(
                d0_out, internal_states)
            d1_out = self.get_sub_component_by_name("d1").apply(
                lstm_out["output"])
            return dict(output=d1_out,
                        last_internal_states=lstm_out["last_internal_states"])

        # Create a simple neural net with the above custom API-method.
        neural_net = NeuralNetwork(DenseLayer(units, scope="d0"),
                                   LSTMLayer(units, scope="lstm"),
                                   DenseLayer(units, scope="d1"),
                                   api_methods={("apply", custom_apply)})

        # Do not seed; we calculate expectations manually.
        test = ComponentTest(component=neural_net,
                             input_spaces=dict(
                                 input_=input_space,
                                 internal_states=internal_states_space))

        # Batch of size=2, time-steps=4.
        input_ = input_space.sample((batch_size, time_steps))
        internal_states = internal_states_space.sample(batch_size)

        # Calculate output manually.
        w0_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d0/dense/kernel"])
        b0_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d0/dense/bias"])
        w1_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/d1/dense/bias"])
        lstm_w_value = test.read_variable_values(
            neural_net.
            variable_registry["neural-network/lstm/lstm-cell/kernel"])
        lstm_b_value = test.read_variable_values(
            neural_net.variable_registry["neural-network/lstm/lstm-cell/bias"])

        d0_out = dense_layer(input_, w0_value, b0_value)
        lstm_out, last_internal_states = lstm_layer(
            d0_out,
            lstm_w_value,
            lstm_b_value,
            initial_internal_states=internal_states,
            time_major=False)
        d1_out = dense_layer(lstm_out, w1_value, b1_value)

        expected = dict(output=d1_out,
                        last_internal_states=last_internal_states)
        test.test(("apply", [input_, internal_states]),
                  expected_outputs=expected,
                  decimals=5)

        test.terminate()
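The `lstm_layer` helper used for the manual expectation above runs a plain LSTM forward pass over the time axis. Below is a NumPy sketch of that computation, assuming TensorFlow's default LSTMCell conventions (kernel gate order i, j, f, o; forget-gate bias of 1.0; internal state ordered as (c, h); batch-major inputs). The actual test helper may differ in these details:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_layer_np(inputs, kernel, bias, initial_internal_states, forget_bias=1.0):
    # inputs: [batch, time, in-dim]; kernel: [in-dim + units, 4 * units]; bias: [4 * units].
    # initial_internal_states: (c, h) tuple, each of shape [batch, units] (ordering assumed).
    c, h = initial_internal_states
    outputs = []
    for t in range(inputs.shape[1]):
        gates = np.matmul(np.concatenate([inputs[:, t], h], axis=-1), kernel) + bias
        i, j, f, o = np.split(gates, 4, axis=-1)  # assumed TF gate ordering
        c = sigmoid(f + forget_bias) * c + sigmoid(i) * np.tanh(j)
        h = sigmoid(o) * np.tanh(c)
        outputs.append(h)
    return np.stack(outputs, axis=1), (c, h)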
class VariationalAutoEncoder(NeuralNetwork):
    def __init__(self, z_units, encoder_network_spec, decoder_network_spec,
                 **kwargs):
        """
        Args:
            z_units (int): Number of units of the latent (z) vectors that the encoder will produce.

            encoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct an encoder
                NeuralNetwork object from or a NeuralNetwork Component directly.

            decoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct a decoder
                NeuralNetwork object from or a NeuralNetwork Component directly.
        """
        super(VariationalAutoEncoder,
              self).__init__(scope="variational-auto-encoder", **kwargs)

        self.z_units = z_units

        # Create encoder and decoder networks.
        self.encoder_network = NeuralNetwork.from_spec(encoder_network_spec,
                                                       scope="encoder-network")
        self.decoder_network = NeuralNetwork.from_spec(decoder_network_spec,
                                                       scope="decoder-network")

        # Create the two Gaussian layers.
        self.mean_layer = DenseLayer(units=self.z_units, scope="mean-layer")
        self.stddev_layer = DenseLayer(units=self.z_units,
                                       scope="stddev-layer")

        # Create the Normal Distribution from which to sample.
        self.normal_distribution = Normal()

        # A concat layer to concat mean and stddev before passing it to the Normal distribution.
        # No longer needed: Pass Tuple (mean + stddev) into API-method instead of concat'd tensor.
        #self.concat_layer = ConcatLayer(axis=-1)

        # Add all sub-Components.
        self.add_components(
            self.encoder_network,
            self.decoder_network,
            self.mean_layer,
            self.stddev_layer,
            self.normal_distribution  #, self.concat_layer
        )

    @rlgraph_api
    def call(self, input_):
        """
        Our custom `call` method.
        """
        encoder_out = self.encode(input_)
        decoder_out = self.decode(encoder_out["z_sample"])
        return decoder_out

    @rlgraph_api
    def encode(self, input_):
        # Get the encoder raw output.
        encoder_output = self.encoder_network.call(input_)
        # Push it through our two mean/std layers.
        mean = self.mean_layer.call(encoder_output)
        log_stddev = self.stddev_layer.call(encoder_output)
        stddev = self._graph_fn_exp(log_stddev)
        # Generate a Tuple to be passed into `sample_stochastic` as parameters of a Normal distribution.
        z_sample = self.normal_distribution.sample_stochastic(
            tuple([mean, stddev]))
        return dict(z_sample=z_sample, mean=mean, stddev=stddev)

    @rlgraph_api
    def decode(self, z_vector):
        return self.decoder_network.call(z_vector)

    @graph_fn
    def _graph_fn_exp(self, input_):
        if get_backend() == "tf":
            return tf.exp(input_)
        elif get_backend() == "pytorch":
            return torch.exp(input_)
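A minimal usage sketch for this variational auto-encoder (the import paths, the list-of-layer-dicts network-spec format, and the extra `z_vector` input space for the `decode` API-method are assumptions, following the ComponentTest pattern used by the other examples here):

from rlgraph.spaces import FloatBox
from rlgraph.tests import ComponentTest

input_space = FloatBox(shape=(8,), add_batch_rank=True)
z_space = FloatBox(shape=(2,), add_batch_rank=True)

vae = VariationalAutoEncoder(
    z_units=2,
    encoder_network_spec=[dict(type="dense", units=4, activation="relu")],
    decoder_network_spec=[dict(type="dense", units=8, activation="relu")]
)
test = ComponentTest(component=vae, input_spaces=dict(input_=input_space, z_vector=z_space))

# Encode a batch of 3 samples: returns a dict with "z_sample", "mean" and "stddev".
encoded = test.test(("encode", input_space.sample(3)))
# Full round trip: encode, sample z, then decode.
reconstruction = test.test(("call", input_space.sample(3)))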
Example #15
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Only one of `add_units` or `units` may be provided.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: Only one of `add_units` or `units` may be provided.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).
        """
        super(ActionAdapter, self).__init__(scope=scope, **kwargs)

        self.action_space = action_space.with_batch_rank()
        self.weights_spec = weights_spec
        self.biases_spec = biases_spec
        self.activation = activation

        # Our (dense) action layer representing the flattened action space.
        self.action_layer = None

        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            self.reshape = ReShape(
                new_shape=self.action_space.get_shape(with_category_rank=True),
                flatten_categories=False)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # Those two dimensions are the mean and log sd
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))
            self.reshape = ReShape(new_shape=new_shape)

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        # Create the action-layer and add it to this component.
        self.action_layer = DenseLayer(units=units,
                                       activation=self.activation,
                                       weights_spec=self.weights_spec,
                                       biases_spec=self.biases_spec,
                                       scope="action-layer")

        self.add_components(self.action_layer, self.reshape)
Example #16
class DuelingPolicy(Policy):
    def __init__(self,
                 network_spec,
                 units_state_value_stream,
                 weights_spec_state_value_stream=None,
                 biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu",
                 scope="dueling-policy",
                 **kwargs):
        super(DuelingPolicy, self).__init__(network_spec,
                                            scope=scope,
                                            **kwargs)

        self.action_space_flattened = self.action_space.flatten()

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # Our softmax component to produce probabilities.
        self.softmax = Softmax()

        # Create the extra state-value stream layers.
        # TODO: Make this a NN-spec as well (right now it's one layer fixed plus the final value node).
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream,
            weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream")
        self.state_value_node = DenseLayer(units=1,
                                           activation="linear",
                                           scope="state-value-node")

        self.add_components(self.dense_layer_state_value_stream,
                            self.state_value_node)

    @rlgraph_api
    def get_state_values(self, nn_input, internal_states=None):
        """
        Returns the state-value node's output after passing some NN input through the policy's network and
        the state-value stream.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        state_values_tmp = self.dense_layer_state_value_stream.apply(
            nn_output["output"])
        state_values = self.state_value_node.apply(state_values_tmp)

        return dict(state_values=state_values,
                    last_internal_states=nn_output.get("last_internal_states"))

    @rlgraph_api
    def get_state_values_logits_parameters_log_probs(self,
                                                     nn_input,
                                                     internal_states=None):
        """
        Similar to `get_values_logits_probabilities_log_probs`, but additionally returns the output of our
        state-value function node under the key `state_values` in the return dict.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
                logits: The (reshaped) logits from the ActionAdapter.
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
                last_internal_states: The last internal states (if network is RNN-based).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        advantages, _, _ = self._graph_fn_get_action_adapter_logits_parameters_log_probs(
            nn_output["output"], nn_input)
        state_values_tmp = self.dense_layer_state_value_stream.apply(
            nn_output["output"])
        state_values = self.state_value_node.apply(state_values_tmp)

        q_values = self._graph_fn_calculate_q_values(state_values, advantages)

        parameters, log_probs = self._graph_fn_get_action_adapter_parameters_log_probs(
            q_values)

        return dict(state_values=state_values,
                    logits=q_values,
                    parameters=parameters,
                    log_probs=log_probs,
                    last_internal_states=nn_output.get("last_internal_states"),
                    advantages=advantages,
                    q_values=q_values)

    def get_state_values_logits_probabilities_log_probs(
            self, nn_input, internal_states=None):
        raise RLGraphObsoletedError(
            "API method", "get_state_values_logits_probabilities_log_probs",
            "get_state_values_logits_parameters_log_probs")

    @rlgraph_api
    def get_logits_parameters_log_probs(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                logits: The q-values after adding advantages to state values (and subtracting the mean advantage).
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
                last_internal_states: The final internal states after passing through a possible RNN.
        """
        out = self.get_state_values_logits_parameters_log_probs(
            nn_input, internal_states)
        return dict(logits=out["logits"],
                    parameters=out["parameters"],
                    log_probs=out["log_probs"],
                    last_internal_states=out.get("last_internal_states"))

    def get_logits_probabilities_log_probs(self,
                                           nn_input,
                                           internal_states=None):
        raise RLGraphObsoletedError("API method",
                                    "get_logits_probabilities_log_probs",
                                    "get_logits_parameters_log_probs")

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_calculate_q_values(self, state_value, advantage_values):
        """
        Args:
            state_value (SingleDataOp): The single node state-value output.
            advantage_values (SingleDataOp): The already reshaped advantage-values.

        Returns:
            SingleDataOp: The calculated, reshaped Q values (for each composite action) based on:
                Q = V + [A - mean(A)]
        """
        # Use the very first node as value function output.
        # Use all following nodes as advantage function output.
        if get_backend() == "tf":
            # Calculate the q-values according to [1] and return.
            mean_advantages = tf.reduce_mean(input_tensor=advantage_values,
                                             axis=-1,
                                             keepdims=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = tf.expand_dims(state_value_expanded,
                                                      axis=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

        elif get_backend() == "pytorch":
            mean_advantages = torch.mean(advantage_values,
                                         dim=-1,
                                         keepdim=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = torch.unsqueeze(state_value_expanded,
                                                       dim=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

    @graph_fn(flatten_ops=True,
              split_ops=True,
              add_auto_key_as_first_param=True)
    def _graph_fn_get_action_adapter_parameters_log_probs(self, key, q_values):
        """
        """
        out = self.action_adapters[key].get_parameters_log_probs(q_values)
        return out["parameters"], out["log_probs"]
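The dueling aggregation in `_graph_fn_calculate_q_values` above is simply Q = V + (A - mean(A)). A tiny NumPy check of that formula with made-up numbers:

import numpy as np

state_value = np.array([[0.5]])             # V(s), shape [batch, 1]
advantages = np.array([[2.0, 0.0, 1.0]])    # A(s, a), shape [batch, num-actions]

q_values = state_value + advantages - advantages.mean(axis=-1, keepdims=True)
print(q_values)  # [[ 1.5 -0.5  0.5]]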
Example #17
class DuelingPolicy(Policy):
    def __init__(self,
                 network_spec,
                 units_state_value_stream,
                 weights_spec_state_value_stream=None,
                 biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu",
                 scope="dueling-policy",
                 **kwargs):
        super(DuelingPolicy, self).__init__(network_spec,
                                            scope=scope,
                                            **kwargs)

        self.action_space_flattened = self.action_space.flatten()

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # Our softmax component to produce probabilities.
        self.softmax = Softmax()

        # Create the extra state-value stream layers.
        # TODO: Make this a NN-spec as well (right now it's one layer fixed plus the final value node).
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream,
            weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream")
        self.state_value_node = DenseLayer(units=1,
                                           activation="linear",
                                           scope="state-value-node")

        self.add_components(self.dense_layer_state_value_stream,
                            self.state_value_node)

    @rlgraph_api
    def get_state_values(self, nn_input, internal_states=None):
        """
        Returns the state-value node's output after passing some NN input through the policy's network and
        the state-value stream.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        state_values_tmp = self.dense_layer_state_value_stream.apply(
            nn_output["output"])
        state_values = self.state_value_node.apply(state_values_tmp)

        return dict(state_values=state_values,
                    last_internal_states=nn_output.get("last_internal_states"))

    @rlgraph_api
    def get_state_values_logits_parameters_log_probs(self,
                                                     nn_input,
                                                     internal_states=None):
        """
        Similar to `get_values_logits_probabilities_log_probs`, but additionally returns the output of our
        state-value function node under the key `state_values` in the return dict.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
                logits: The (reshaped) logits from the ActionAdapter.
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
                last_internal_states: The last internal states (if network is RNN-based).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        advantages, _, _ = self._graph_fn_get_action_adapter_logits_parameters_log_probs(
            nn_output["output"], nn_input)
        state_values_tmp = self.dense_layer_state_value_stream.apply(
            nn_output["output"])
        state_values = self.state_value_node.apply(state_values_tmp)

        q_values = self._graph_fn_calculate_q_values(state_values, advantages)

        parameters, log_probs = self._graph_fn_get_parameters_log_probs(
            q_values)

        return dict(state_values=state_values,
                    logits=q_values,
                    parameters=parameters,
                    log_probs=log_probs,
                    last_internal_states=nn_output.get("last_internal_states"),
                    advantages=advantages,
                    q_values=q_values)

    @rlgraph_api
    def get_state_values_logits_probabilities_log_probs(
            self, nn_input, internal_states=None):
        """
        Similar to `get_values_logits_probabilities_log_probs`, but additionally returns the output of our
        state-value function node under the key `state_values` in the return dict.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
                logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
                last_internal_states: The last internal states (if network is RNN-based).
        """
        self.logger.warn(
            "Deprecated API method `get_state_values_logits_probabilities_log_probs` used! "
            "Use `get_state_values_logits_parameters_log_probs` instead.")

        nn_output = self.get_nn_output(nn_input, internal_states)
        advantages, _, _ = self._graph_fn_get_action_adapter_logits_parameters_log_probs(
            nn_output["output"], nn_input)
        state_values_tmp = self.dense_layer_state_value_stream.apply(
            nn_output["output"])
        state_values = self.state_value_node.apply(state_values_tmp)

        q_values = self._graph_fn_calculate_q_values(state_values, advantages)

        parameters, log_probs = self._graph_fn_get_parameters_log_probs(
            q_values)

        return dict(state_values=state_values,
                    logits=q_values,
                    probabilities=parameters,
                    parameters=parameters,
                    log_probs=log_probs,
                    last_internal_states=nn_output.get("last_internal_states"),
                    advantages=advantages,
                    q_values=q_values)

    @rlgraph_api
    def get_logits_parameters_log_probs(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                logits: The q-values after adding advantages to state values (and subtracting the mean advantage).
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
                last_internal_states: The final internal states after passing through a possible RNN.
        """
        out = self.get_state_values_logits_parameters_log_probs(
            nn_input, internal_states)
        return dict(logits=out["logits"],
                    parameters=out["parameters"],
                    log_probs=out["log_probs"],
                    last_internal_states=out.get("last_internal_states"))

    @rlgraph_api
    def get_logits_probabilities_log_probs(self,
                                           nn_input,
                                           internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                logits: The q-values after adding advantages to state values (and subtracting the mean advantage).
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
                last_internal_states: The final internal states after passing through a possible RNN.
        """
        self.logger.warn(
            "Deprecated API method `get_logits_probabilities_log_probs` used! "
            "Use `get_logits_parameters_log_probs` instead.")
        out = self.get_state_values_logits_parameters_log_probs(
            nn_input, internal_states)
        return dict(logits=out["logits"],
                    probabilities=out["parameters"],
                    parameters=out["parameters"],
                    log_probs=out["log_probs"],
                    last_internal_states=out.get("last_internal_states"))

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_calculate_q_values(self, state_value, advantage_values):
        """
        Args:
            state_value (SingleDataOp): The single node state-value output.
            advantage_values (SingleDataOp): The already reshaped advantage-values.

        Returns:
            SingleDataOp: The calculated, reshaped Q values (for each composite action) based on:
                Q = V + [A - mean(A)]
        """
        # Use the very first node as value function output.
        # Use all following nodes as advantage function output.
        if get_backend() == "tf":
            # Calculate the q-values according to [1] and return.
            mean_advantages = tf.reduce_mean(input_tensor=advantage_values,
                                             axis=-1,
                                             keepdims=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = tf.expand_dims(state_value_expanded,
                                                      axis=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

        elif get_backend() == "pytorch":
            mean_advantages = torch.mean(advantage_values,
                                         dim=-1,
                                         keepdim=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = torch.unsqueeze(state_value_expanded,
                                                       dim=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

    @graph_fn(flatten_ops=True,
              split_ops=True,
              add_auto_key_as_first_param=True)
    def _graph_fn_get_parameters_log_probs(self, key, logits):
        """
        Creates parameters and log-probs from some reshaped output.

        Args:
            key (str): The flat key of the action-Space component currently being processed
                (auto-passed in, due to `add_auto_key_as_first_param=True`).
            logits (SingleDataOp): The output of some layer that is already reshaped
                according to our action Space.

        Returns:
            tuple (2x SingleDataOp):
                parameters (DataOp): The parameters, ready to be passed to a Distribution object's
                    get_distribution API-method (usually some probabilities or loc/scale pairs).

                log_probs (DataOp): Simply the log(parameters).
        """

        if get_backend() == "tf":
            if isinstance(self.action_space_flattened[key], IntBox):
                # Discrete actions.
                parameters = tf.maximum(x=tf.nn.softmax(logits=logits,
                                                        axis=-1),
                                        y=SMALL_NUMBER)
                # Log probs.
                log_probs = tf.log(x=parameters)
            elif isinstance(self.action_space_flattened[key], FloatBox):
                # Continuous actions.
                mean, log_sd = tf.split(value=logits,
                                        num_or_size_splits=2,
                                        axis=1)
                # Remove moments rank.
                mean = tf.squeeze(input=mean, axis=1)
                log_sd = tf.squeeze(input=log_sd, axis=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = tf.clip_by_value(
                    t=log_sd,
                    clip_value_min=math.log(SMALL_NUMBER),
                    clip_value_max=-math.log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError
            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space_flattened[key], IntBox):
                # Discrete actions.
                parameters = torch.max(torch.softmax(logits, dim=-1),
                                       torch.tensor(SMALL_NUMBER))
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space_flattened[key], FloatBox):
                # Continuous actions.
                # torch.split expects the chunk size (not the number of chunks), so split the
                # moments rank into two size-1 chunks.
                mean, log_sd = torch.split(logits,
                                           split_size_or_sections=1,
                                           dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(log_sd,
                                     min=math.log(SMALL_NUMBER),
                                     max=-math.log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
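
# A minimal, standalone sketch (NumPy only, not part of RLGraph) of the dueling aggregation that
# `_graph_fn_calculate_q_values` above implements: Q = V + (A - mean(A)). Shapes are assumed to be
# (batch, 1) for the state values and (batch, num_actions) for the (already reshaped) advantages.
import numpy as np

state_values = np.array([[1.0], [0.5]])                       # V(s), shape (2, 1)
advantages = np.array([[2.0, 0.0, -2.0], [1.0, 1.0, -2.0]])   # A(s, a), shape (2, 3)
mean_advantages = advantages.mean(axis=-1, keepdims=True)     # mean over the action axis
q_values = state_values + advantages - mean_advantages        # V broadcasts over the action axis
print(q_values)  # [[ 3.   1.  -1. ], [ 1.5  1.5 -1.5]]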
Exemple #18
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 pre_network_spec=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: At most one of `add_units` or `units` should be provided.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: At most one of `add_units` or `units` should be provided.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).

            pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before the
                last action layer. If None, only the action layer itself is applied.
        """
        # Build the action layer for this adapter based on the given action-space.
        self.action_space = action_space.with_batch_rank()
        assert not isinstance(
            self.action_space, ContainerSpace
        ), "ERROR: ActionAdapter cannot handle ContainerSpaces!"
        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            new_shape = self.action_space.get_shape(with_category_rank=True)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # Those two dimensions are the mean and log sd
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        action_layer = DenseLayer(units=units,
                                  activation=activation,
                                  weights_spec=weights_spec,
                                  biases_spec=biases_spec,
                                  scope="action-layer")

        # Do we have a pre-NN?
        self.network = NeuralNetwork.from_spec(
            pre_network_spec, scope="action-network")  # type: NeuralNetwork
        self.network.add_layer(action_layer)

        # Add the reshape layer to match the action space's shape.
        self.network.add_layer(ReShape(new_shape=new_shape))

        super(ActionAdapter, self).__init__(self.network,
                                            scope=scope,
                                            **kwargs)
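
# A minimal, standalone sketch (plain Python, not part of RLGraph) of how the action layer's `units`
# and the ReShape `new_shape` above are derived, assuming a hypothetical discrete action space of
# shape (2,) with 3 categories per dimension and a hypothetical continuous action space of shape (2,).

# IntBox-like space, shape (2,), 3 categories: flat_dim_with_categories = 2 * 3 = 6 action-layer
# nodes, reshaped to the action shape plus a trailing category rank -> (2, 3).
int_box_units = 2 * 3
int_box_new_shape = (2, 3)

# FloatBox-like space, shape (2,): 2 * flat_dim = 4 action-layer nodes (a mean and a log-sd per
# dimension), reshaped so the moments rank comes first -> (2, 2).
float_box_units = 2 * 2
float_box_new_shape = (2, 2)

print(int_box_units, int_box_new_shape, float_box_units, float_box_new_shape)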
Exemple #19
class DuelingActionAdapter(ActionAdapter):
    """
    An ActionAdapter that adds a dueling Q calculation to the flattened output of a neural network.

    API:
        get_action_layer_output(nn_output) (dict): The single state-value node output and the (still flat)
            advantage nodes after passing `nn_output` through the two streams.
        get_logits_probabilities_log_probs(nn_output) (dict): The state-values, the (reshaped) q-values
            (logits), the softmaxed probabilities and the log-probs.
    """
    def __init__(self,
                 units_state_value_stream,
                 units_advantage_stream,
                 weights_spec_state_value_stream=None,
                 biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu",
                 weights_spec_advantage_stream=None,
                 biases_spec_advantage_stream=None,
                 activation_advantage_stream="relu",
                 scope="dueling-action-adapter",
                 **kwargs):
        # TODO: change add_units=-1 once we have a true base class for action-adapters.
        super(DuelingActionAdapter, self).__init__(add_units=0,
                                                   scope=scope,
                                                   **kwargs)

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # The advantage stream.
        self.units_advantage_stream = units_advantage_stream
        self.weights_spec_advantage_stream = weights_spec_advantage_stream
        self.biases_spec_advantage_stream = biases_spec_advantage_stream
        self.activation_advantage_stream = activation_advantage_stream

        # Create the three extra DenseLayers (the parent's action_layer serves as the advantage-output layer).
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream,
            weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream")
        self.dense_layer_advantage_stream = DenseLayer(
            units=self.units_advantage_stream,
            weights_spec=self.weights_spec_advantage_stream,
            biases_spec=self.biases_spec_advantage_stream,
            activation=self.activation_advantage_stream,
            scope="dense-layer-advantage-stream")
        self.state_value_node = DenseLayer(units=1,
                                           activation="linear",
                                           scope="state-value-node")
        # self.action_layer is our advantage layer

        self.add_components(self.dense_layer_state_value_stream,
                            self.dense_layer_advantage_stream,
                            self.state_value_node)

    @rlgraph_api
    def get_action_layer_output(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            tuple:
                DataOpRecord: The output of the state-value stream (a DenseLayer) after passing `nn_output` through it.

                DataOpRecord: The output of the advantage-value stream (a DenseLayer) after passing `nn_output` through
                    it. Note: These will be flat advantage nodes that have not been reshaped yet according to the
                    action_space.
        """
        output_state_value_dense = self.dense_layer_state_value_stream.apply(
            nn_output)
        output_advantage_dense = self.dense_layer_advantage_stream.apply(
            nn_output)
        state_value_node = self.state_value_node.apply(
            output_state_value_dense)
        advantage_nodes = self.action_layer.apply(output_advantage_dense)
        return dict(state_value_node=state_value_node, output=advantage_nodes)

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            tuple (4x DataOpRecord):
                - The single state value node output.
                - The (already reshaped) q-values (the logits).
                - The probabilities obtained by softmaxing the q-values.
                - The log-probs.
        """
        out = self.get_action_layer_output(nn_output)
        advantage_values_reshaped = self.reshape.apply(out["output"])
        q_values = self._graph_fn_calculate_q_values(
            out["state_value_node"], advantage_values_reshaped)
        probabilities, log_probs = self._graph_fn_get_probabilities_log_probs(
            q_values)
        return dict(state_values=out["state_value_node"],
                    logits=q_values,
                    probabilities=probabilities,
                    log_probs=log_probs)

    @graph_fn
    def _graph_fn_calculate_q_values(self, state_value, advantage_values):
        """
        Args:
            state_value (SingleDataOp): The single node state-value output.
            advantage_values (SingleDataOp): The already reshaped advantage-values.

        Returns:
            SingleDataOp: The calculated, reshaped Q values (for each composite action) based on:
                Q = V + [A - mean(A)]
        """
        # Use the very first node as value function output.
        # Use all following nodes as advantage function output.
        if get_backend() == "tf":
            # Calculate the q-values according to [1] and return.
            mean_advantages = tf.reduce_mean(input_tensor=advantage_values,
                                             axis=-1,
                                             keepdims=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = tf.expand_dims(state_value_expanded,
                                                      axis=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values
        elif get_backend() == "pytorch":
            mean_advantages = torch.mean(advantage_values,
                                         dim=-1,
                                         keepdim=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = torch.unsqueeze(state_value_expanded,
                                                       dim=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

    # TODO: Use a SoftMax Component instead (uses the same code as the one below).
    @graph_fn
    def _graph_fn_get_probabilities_log_probs(self, logits):
        """
        Creates parameters and log-probs from some reshaped output.

        Args:
            logits (SingleDataOp): The output of some layer that is already reshaped
                according to our action Space.

        Returns:
            tuple (2x SingleDataOp):
                parameters (DataOp): The parameters, ready to be passed to a Distribution object's
                    get_distribution API-method (usually some probabilities or loc/scale pairs).

                log_probs (DataOp): Simply the log(parameters).
        """
        if get_backend() == "tf":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = tf.maximum(x=tf.nn.softmax(logits=logits,
                                                        axis=-1),
                                        y=SMALL_NUMBER)
                # Log probs.
                log_probs = tf.log(x=parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = tf.split(value=logits,
                                        num_or_size_splits=2,
                                        axis=1)
                # Remove moments rank.
                mean = tf.squeeze(input=mean, axis=1)
                log_sd = tf.squeeze(input=log_sd, axis=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = tf.clip_by_value(
                    t=log_sd,
                    clip_value_min=math.log(SMALL_NUMBER),
                    clip_value_max=-math.log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError
            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = torch.max(torch.softmax(logits, dim=-1),
                                       torch.tensor(SMALL_NUMBER))
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                # torch.split expects the chunk size (not the number of chunks), so split the
                # moments rank into two size-1 chunks.
                mean, log_sd = torch.split(logits,
                                           split_size_or_sections=1,
                                           dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(log_sd,
                                     min=math.log(SMALL_NUMBER),
                                     max=-math.log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
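
# A minimal, standalone sketch (PyTorch only, not part of RLGraph) of the two branches of
# `_graph_fn_get_probabilities_log_probs` above. SMALL_NUMBER is assumed to be 1e-6 here.
import math
import torch

SMALL_NUMBER = 1e-6

# Discrete actions: softmax the (dueling) q-values, clamp them away from zero, then take the log.
q_values = torch.tensor([[3.0, 1.0, -1.0]])                   # shape (batch, num_actions)
probabilities = torch.max(torch.softmax(q_values, dim=-1),
                          torch.tensor(SMALL_NUMBER))
log_probs = torch.log(probabilities)

# Continuous actions: split the reshaped output into mean and log-sd along the moments rank (dim 1),
# clamp log-sd into a numerically safe range, then exponentiate to obtain sd.
logits = torch.randn(4, 2, 3)                                 # shape (batch, 2, action-dims)
mean, log_sd = torch.chunk(logits, chunks=2, dim=1)
mean, log_sd = mean.squeeze(dim=1), log_sd.squeeze(dim=1)
log_sd = torch.clamp(log_sd, min=math.log(SMALL_NUMBER), max=-math.log(SMALL_NUMBER))
sd = torch.exp(log_sd)

print(probabilities, log_probs, mean.shape, sd.shape)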