class DummyNNWithDictInput(NeuralNetwork):
    """
    Dummy NN that takes a dict input with keys "a" and "b", passes each value through its own
    (parallel, otherwise unconnected) dense layer, and concatenates the two outputs to yield the final output.
    """

    def __init__(self, num_units_a=3, num_units_b=2, scope="dummy-nn-with-dict-input", **kwargs):
        super(DummyNNWithDictInput, self).__init__(scope=scope, **kwargs)

        self.num_units_a = num_units_a
        self.num_units_b = num_units_b

        # Splits the input into two streams.
        self.splitter = ContainerSplitter("a", "b")
        self.stack_a = DenseLayer(units=self.num_units_a, scope="dense-a")
        self.stack_b = DenseLayer(units=self.num_units_b, scope="dense-b")
        self.concat_layer = ConcatLayer()

        # Add all sub-components to this one.
        self.add_components(self.splitter, self.stack_a, self.stack_b, self.concat_layer)

    @rlgraph_api
    def call(self, input_dict):
        # Split the input dict into two streams.
        input_a, input_b = self.splitter.call(input_dict)

        # Get the two stack outputs.
        output_a = self.stack_a.call(input_a)
        output_b = self.stack_b.call(input_b)

        # Concat everything together, that's the output.
        concatenated_data = self.concat_layer.call(output_a, output_b)

        return concatenated_data
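
# --- Illustration (not part of the RLgraph component above) -----------------------------------
# A minimal NumPy sketch of what `DummyNNWithDictInput.call` computes: split the input dict,
# push each stream through its own dense layer, then concatenate the two outputs.
# All shapes and weights below are made-up assumptions for illustration only.
import numpy as np

rng = np.random.default_rng(0)
batch = dict(a=rng.normal(size=(4, 5)), b=rng.normal(size=(4, 7)))  # hypothetical input dict

w_a, b_a = rng.normal(size=(5, 3)), np.zeros(3)   # "dense-a": num_units_a=3
w_b, b_b = rng.normal(size=(7, 2)), np.zeros(2)   # "dense-b": num_units_b=2

out_a = batch["a"] @ w_a + b_a
out_b = batch["b"] @ w_b + b_b
concatenated = np.concatenate([out_a, out_b], axis=-1)
assert concatenated.shape == (4, 3 + 2)           # (batch, num_units_a + num_units_b)
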
class VariationalAutoEncoder(NeuralNetwork):
    def __init__(self, z_units, encoder_network_spec, decoder_network_spec,
                 **kwargs):
        """
        Args:
            z_units (int): Number of units of the latent (z) vectors that the encoder will produce.

            encoder_network_spec (Union[dict,NeuralNetwork]): A specification dict from which to construct the
                encoder NeuralNetwork, or a NeuralNetwork Component directly.

            decoder_network_spec (Union[dict,NeuralNetwork]): A specification dict from which to construct the
                decoder NeuralNetwork, or a NeuralNetwork Component directly.
        """
        super(VariationalAutoEncoder,
              self).__init__(scope="variational-auto-encoder", **kwargs)

        self.z_units = z_units

        # Create encoder and decoder networks.
        self.encoder_network = NeuralNetwork.from_spec(encoder_network_spec,
                                                       scope="encoder-network")
        self.decoder_network = NeuralNetwork.from_spec(decoder_network_spec,
                                                       scope="decoder-network")

        # Create the two Gaussian layers.
        self.mean_layer = DenseLayer(units=self.z_units, scope="mean-layer")
        self.stddev_layer = DenseLayer(units=self.z_units,
                                       scope="stddev-layer")

        # Create the Normal Distribution from which to sample.
        self.normal_distribution = Normal()

        # A concat layer to concat mean and stddev before passing it to the Normal distribution.
        # No longer needed: Pass Tuple (mean + stddev) into API-method instead of concat'd tensor.
        #self.concat_layer = ConcatLayer(axis=-1)

        # Add all sub-Components.
        self.add_components(
            self.encoder_network,
            self.decoder_network,
            self.mean_layer,
            self.stddev_layer,
            self.normal_distribution  #, self.concat_layer
        )

    @rlgraph_api
    def call(self, input_):
        """
        Our custom `call` method.
        """
        encoder_out = self.encode(input_)
        decoder_out = self.decode(encoder_out["z_sample"])
        return decoder_out

    @rlgraph_api
    def encode(self, input_):
        # Get the encoder raw output.
        encoder_output = self.encoder_network.call(input_)
        # Push it through our two mean/std layers.
        mean = self.mean_layer.call(encoder_output)
        log_stddev = self.stddev_layer.call(encoder_output)
        stddev = self._graph_fn_exp(log_stddev)
        # Generate a Tuple to be passed into `sample_stochastic` as parameters of a Normal distribution.
        z_sample = self.normal_distribution.sample_stochastic(
            tuple([mean, stddev]))
        return dict(z_sample=z_sample, mean=mean, stddev=stddev)

    @rlgraph_api
    def decode(self, z_vector):
        return self.decoder_network.call(z_vector)

    @graph_fn
    def _graph_fn_exp(self, input_):
        if get_backend() == "tf":
            return tf.exp(input_)
        elif get_backend() == "pytorch":
            return torch.exp(input_)
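
# --- Illustration (not part of the RLgraph component above) -----------------------------------
# A minimal NumPy sketch of what `VariationalAutoEncoder.encode` computes conceptually:
# map the encoder output to mean and log-stddev, exponentiate the log-stddev, and sample a
# z-vector from N(mean, stddev). Shapes and weights are made-up assumptions for illustration.
import numpy as np

rng = np.random.default_rng(0)
encoder_output = rng.normal(size=(4, 16))                   # e.g. batch of 4, 16 encoder units
w_mean = rng.normal(size=(16, 8))                           # "mean-layer", z_units=8
w_log_stddev = rng.normal(size=(16, 8))                     # "stddev-layer", z_units=8

mean = encoder_output @ w_mean
stddev = np.exp(encoder_output @ w_log_stddev)              # cf. `_graph_fn_exp`
z_sample = mean + stddev * rng.standard_normal(mean.shape)  # sample from N(mean, stddev)
assert z_sample.shape == (4, 8)
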
class DuelingPolicy(Policy):
    def __init__(self, network_spec, units_state_value_stream,
                 weights_spec_state_value_stream=None, biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu", scope="dueling-policy", **kwargs):
        super(DuelingPolicy, self).__init__(network_spec, scope=scope, **kwargs)

        self.action_space_flattened = self.action_space.flatten()

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # Our softmax component to produce probabilities.
        self.softmax = Softmax()

        # Create all state value extra Layers.
        # TODO: Make this a NN-spec as well (right now it's one layer fixed plus the final value node).
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream, weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream"
        )
        self.state_value_node = DenseLayer(
            units=1,
            activation="linear",
            scope="state-value-node"
        )

        self.add_components(self.dense_layer_state_value_stream, self.state_value_node)

    @rlgraph_api
    def get_state_values(self, nn_inputs):  #, internal_states=None):
        """
        Returns the state-value node's output after passing some NN input through the policy's network
        and the state-value stream.

        Args:
            nn_inputs (any): The input to our neural network.
            #internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                state_values: The single (but batched) value function node output.
                nn_outputs: The raw NN outputs.
        """
        nn_outputs = self.get_nn_outputs(nn_inputs)
        state_values_tmp = self.dense_layer_state_value_stream.call(nn_outputs)
        state_values = self.state_value_node.call(state_values_tmp)

        return dict(state_values=state_values, nn_outputs=nn_outputs)

    @rlgraph_api
    def get_state_values_adapter_outputs_and_parameters(self, nn_inputs):
        """
        Similar to `get_adapter_outputs_and_parameters`, but also returns the output of our state-value
        function node under the key `state_values` of the return dict.

        Args:
            nn_inputs (any): The input to our neural network.
            #internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                nn_outputs: The raw NN outputs.
                state_values: The single (but batched) value function node output.
                adapter_outputs: The q-values, i.e. state values plus mean-centered advantages.
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
                advantages: The raw advantage-stream outputs of the ActionAdapter.
                q_values: Same as `adapter_outputs`.
        """
        nn_outputs = self.get_nn_outputs(nn_inputs)
        advantages, _, _ = self._graph_fn_get_adapter_outputs_and_parameters(nn_outputs)
        state_values_tmp = self.dense_layer_state_value_stream.call(nn_outputs)
        state_values = self.state_value_node.call(state_values_tmp)

        q_values = self._graph_fn_calculate_q_values(state_values, advantages)

        parameters, log_probs = self._graph_fn_get_parameters_from_q_values(q_values)

        return dict(
            nn_outputs=nn_outputs, adapter_outputs=q_values, state_values=state_values,
            parameters=parameters, log_probs=log_probs,
            advantages=advantages, q_values=q_values
        )

    @rlgraph_api
    def get_adapter_outputs(self, nn_inputs):
        """
        Args:
            nn_inputs (any): The input to our neural network.

        Returns:
            Dict:
                nn_outputs: The raw NN outputs.
                adapter_outputs: The q-values after adding advantages to state values (and subtracting the
                    mean advantage).
                advantages: The raw advantage-stream outputs of the ActionAdapter.
                q_values: Same as `adapter_outputs`.
        """
        nn_outputs = self.get_nn_outputs(nn_inputs)
        advantages, _, _ = self._graph_fn_get_adapter_outputs_and_parameters(nn_outputs)
        state_values_tmp = self.dense_layer_state_value_stream.call(nn_outputs)
        state_values = self.state_value_node.call(state_values_tmp)

        q_values = self._graph_fn_calculate_q_values(state_values, advantages)

        return dict(
            nn_outputs=nn_outputs,
            adapter_outputs=q_values,
            advantages=advantages,
            q_values=q_values
        )

    @rlgraph_api
    def get_adapter_outputs_and_parameters(self, nn_inputs):
        """
        Args:
            nn_inputs (any): The input to our neural network.
            #internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                nn_outputs: The raw NN outputs.
                adapter_outputs: The q-values after adding advantages to state values (and subtracting the
                    mean advantage).
                parameters: The parameters for the distribution (gained from the softmaxed logits or interpreting
                    logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values iff we have a discrete action space.
        """
        out = self.get_state_values_adapter_outputs_and_parameters(nn_inputs)
        return dict(
            nn_outputs=out["nn_outputs"],
            adapter_outputs=out["adapter_outputs"],
            parameters=out["parameters"],
            log_probs=out["log_probs"]
        )

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_calculate_q_values(self, state_value, advantage_values):
        """
        Args:
            state_value (SingleDataOp): The single node state-value output.
            advantage_values (SingleDataOp): The already reshaped advantage-values.

        Returns:
            SingleDataOp: The calculated, reshaped Q values (for each composite action) based on:
                Q = V + [A - mean(A)]
        """
        # Use the very first node as value function output.
        # Use all following nodes as advantage function output.
        if get_backend() == "tf":
            # Calculate the q-values according to [1] and return.
            mean_advantages = tf.reduce_mean(input_tensor=advantage_values, axis=-1, keepdims=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = tf.expand_dims(state_value_expanded, axis=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

        elif get_backend() == "pytorch":
            mean_advantages = torch.mean(advantage_values, dim=-1, keepdim=True)

            # Make sure we broadcast the state_value correctly for the upcoming q_value calculation.
            state_value_expanded = state_value
            for _ in range(get_rank(advantage_values) - 2):
                state_value_expanded = torch.unsqueeze(state_value_expanded, dim=1)
            q_values = state_value_expanded + advantage_values - mean_advantages

            # q-values
            return q_values

    @graph_fn(flatten_ops=True, split_ops=True, add_auto_key_as_first_param=True)
    def _graph_fn_get_parameters_from_q_values(self, key, q_values):
        """
        """
        out = self.action_adapters[key].get_parameters_from_adapter_outputs(q_values)
        return out["parameters"], out["log_probs"]

    def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        raise RLGraphObsoletedError(
            "API method", "get_state_values_logits_probabilities_log_probs",
            "get_state_values_adpater_outputs_and_parameters"
        )

    def get_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        raise RLGraphObsoletedError(
            "API method", "get_logits_probabilities_log_probs",
            "get_adapter_outputs_and_parameters"
        )
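
# --- Illustration (not part of the RLgraph component above) -----------------------------------
# A standalone NumPy check of the dueling aggregation used in `_graph_fn_calculate_q_values`:
# Q = V + (A - mean(A)). The numbers are made up for illustration only.
import numpy as np

state_value = np.array([[1.0], [2.0]])              # shape (batch, 1)
advantages = np.array([[0.5, 1.5, -2.0],
                       [1.0, -1.0, 0.0]])           # shape (batch, num_actions)

q_values = state_value + advantages - advantages.mean(axis=-1, keepdims=True)
# -> [[ 1.5  2.5 -1. ]
#     [ 3.   1.   2. ]]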