Example 1
class BatchApply(Component):
    """
    Takes an input with batch and time ranks, then folds the time rank into the batch rank,
    calls a certain API of some arbitrary child component, and unfolds the time rank again.
    """
    def __init__(self,
                 sub_component,
                 api_method_name,
                 scope="batch-apply",
                 **kwargs):
        """
        Args:
            sub_component (Component): The sub-Component to apply the batch to.
            api_method_name (str): The name of the API-method to call on the sub-component.
        """
        super(BatchApply, self).__init__(scope=scope, **kwargs)

        self.sub_component = sub_component
        self.api_method_name = api_method_name

        # Create the necessary reshape components.
        self.folder = ReShape(fold_time_rank=True, scope="folder")
        self.unfolder = ReShape(unfold_time_rank=True, scope="unfolder")

        self.add_components(self.sub_component, self.folder, self.unfolder)

    @rlgraph_api
    def call(self, input_):
        folded = self.folder.call(input_)
        applied = getattr(self.sub_component, self.api_method_name)(folded)
        unfolded = self.unfolder.call(applied,
                                      input_before_time_rank_folding=input_)
        return unfolded
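
A minimal usage sketch (the `DenseLayer` import path and the wrapped sub-component are assumptions for illustration; the `BatchApply` API follows the example above):

from rlgraph.components.layers.nn.dense_layer import DenseLayer

# Wrap a DenseLayer so a [batch x time]-ranked input can be passed through it.
dense = DenseLayer(units=64, scope="dense")
batch_apply = BatchApply(sub_component=dense, api_method_name="call")
# `batch_apply.call(input_)` folds the time rank into the batch rank, applies
# the dense layer, then unfolds the time rank again.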
Example 2
class BatchApply(Component):
    """
    Takes an input with batch and time ranks, then folds the time rank into the batch rank,
    calls a certain API of some arbitrary child component, and unfolds the time rank again.
    """
    def __init__(self,
                 sub_component,
                 api_method_name,
                 scope="batch-apply",
                 **kwargs):
        """
        Args:
            sub_component (Component): The sub-Component to apply the batch to.
            api_method_name (str): The name of the API-method to call on the sub-component.
        """
        super(BatchApply, self).__init__(scope=scope, **kwargs)

        self.sub_component = sub_component
        self.api_method_name = api_method_name

        # Create the necessary reshape components.
        self.folder = ReShape(fold_time_rank=True, scope="folder")
        self.unfolder = ReShape(unfold_time_rank=True, scope="unfolder")

        self.add_components(self.sub_component, self.folder, self.unfolder)

    @rlgraph_api
    def apply(self, input_):
        folded = self._graph_fn_fold(input_)
        applied = self._graph_fn_apply(folded)
        unfolded = self._graph_fn_unfold(applied, input_)
        return unfolded

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_fold(self, input_):
        if get_backend() == "tf":
            # Fold the time rank.
            input_folded = self.folder.apply(input_)
            return input_folded

    @graph_fn
    def _graph_fn_apply(self, input_folded):
        if get_backend() == "tf":
            # Send the folded input through the sub-component.
            sub_component_out = getattr(self.sub_component,
                                        self.api_method_name)(input_folded)
            return sub_component_out

    @graph_fn(flatten_ops=True, split_ops=True)
    def _graph_fn_unfold(self, sub_component_out, orig_input):
        if get_backend() == "tf":
            # Un-fold the time rank again.
            output = self.unfolder.apply(
                sub_component_out, input_before_time_rank_folding=orig_input)
            return output
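
Independent of the RLGraph machinery, the fold/unfold pair amounts to the following shape manipulation (a pure-NumPy illustration, not RLGraph code):

import numpy as np

x = np.zeros((5, 10, 4))                # (batch=5, time=10, feature=4)
folded = x.reshape((-1, 4))             # (50, 4): time rank folded into batch rank
unfolded = folded.reshape((5, 10, 4))   # original batch and time ranks restored
assert unfolded.shape == x.shape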
Example 3
    def test_keras_style_one_container_input_space(self):
        # Define one container input Space.
        input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg to let the network trace the data flow back to the input space.
        # `inputs` is not needed here, as there is only a single input (the Tuple).
        neural_net = NeuralNetwork(outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size n=4.
        input_ = input_space.sample(4)

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", tuple([input_])), expected_outputs=expected)

        test.terminate()
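
The helpers `one_hot` and `dense_layer` come from RLGraph's test utilities; as used here, they behave roughly like the following NumPy sketch (an assumption, not the actual implementations):

import numpy as np

def one_hot(x, depth=3):
    # depth=3 matches the IntBox(3) sub-space above.
    out = np.zeros(x.shape + (depth,), dtype=np.float32)
    np.put_along_axis(out, x[..., None], 1.0, axis=-1)
    return out

def dense_layer(x, weights, biases):
    # Plain linear layer without activation: x @ W + b.
    return np.dot(x, weights) + biases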
Example 4
    def build_image_processing_stack():
        """
        Constructs a ReShape preprocessor to fold the time rank into the batch rank.

        Then builds the 2 Conv2D Layers followed by ReLUs.

        Then adds: fc(256) + ReLU.
        """
        # Collect components for image stack before unfolding time-rank going into main LSTM.
        sub_components = list()

        # Divide by 255
        sub_components.append(Divide(divisor=255, scope="divide-255"))

        for i, (num_filters, kernel_size, stride) in enumerate(zip([16, 32], [8, 4], [4, 2])):
            # Conv2D plus ReLU activation function.
            conv2d = Conv2DLayer(
                filters=num_filters, kernel_size=kernel_size, strides=stride, padding="same",
                activation="relu", scope="conv2d-{}".format(i)
            )
            sub_components.append(conv2d)

        # A Flatten preprocessor and then an fc block (surrounded by ReLUs) and a time-rank-unfolding.
        sub_components.extend([
            ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
            DenseLayer(units=256),  # Dense layer.
            NNLayer(activation="relu", scope="relu-before-lstm"),
        ])

        image_stack = Stack(sub_components, scope="image-stack")

        return image_stack
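
For orientation, a shape walk-through of this stack for a hypothetical 84x84 input (with "same" padding, each conv output size per dimension is ceil(input / stride)):

# divide-255: (84, 84, C), values scaled to [0, 1]
# conv2d-0 (kernel 8, stride 4): ceil(84 / 4) = 21 -> (21, 21, 16)
# conv2d-1 (kernel 4, stride 2): ceil(21 / 2) = 11 -> (11, 11, 32)
# flatten: 11 * 11 * 32 = 3872
# dense(256) followed by relu-before-lstm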
Example 5
    def test_keras_style_two_separate_input_spaces(self):
        # Define two input Spaces first. Independently (no container).
        input_space_1 = IntBox(3, add_batch_rank=True)
        input_space_2 = FloatBox(shape=(4,), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space_1)
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space_2)
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg to let the network trace the data flow back to the input spaces.
        neural_net = NeuralNetwork(inputs=[input_space_1, input_space_2], outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=[input_space_1, input_space_2]))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size n=4.
        input_ = [input_space_1.sample(4), input_space_2.sample(4)]

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", input_), expected_outputs=expected)

        test.terminate()
Example 6
    def __init__(self, worker_sample_size=100, scope="impala-network", **kwargs):
        """
        Args:
            worker_sample_size (int): How many time-steps an IMPALA actor will have performed in one rollout.
        """
        super(IMPALANetwork, self).__init__(scope=scope, **kwargs)

        self.worker_sample_size = worker_sample_size

        # Create all needed sub-components.

        # ContainerSplitter for the Env signal (dict of 4 keys: for env image, env text, previous action and reward).
        self.splitter = ContainerSplitter("RGB_INTERLEAVED", "INSTR", "previous_action", "previous_reward",
                                          scope="input-splitter")

        # Fold the time rank into the batch rank.
        self.time_rank_fold_before_lstm = ReShape(fold_time_rank=True, scope="time-rank-fold-before-lstm")
        self.time_rank_unfold_before_lstm = ReShape(unfold_time_rank=True, time_major=True,
                                                    scope="time-rank-unfold-before-lstm")

        # The Image Processing Stack (left side of "Large Architecture" Figure 3 in [1]).
        # Conv2D column + ReLU + fc(256) + ReLU.
        self.image_processing_stack = self.build_image_processing_stack()

        # The text processing pipeline: Takes a batch of string tensors as input, creates a hash-bucket thereof,
        # and passes the output of the hash bucket through an embedding-lookup(20) layer. The output of the embedding
        # lookup is then passed through an LSTM(64).
        self.text_processing_stack = self.build_text_processing_stack()

        #self.debug_slicer = Slice(scope="internal-states-slicer", squeeze=True)

        # The concatenation layer (concatenates outputs from image/text processing stacks, previous action/reward).
        self.concat_layer = ConcatLayer()

        # The main LSTM (going into the ActionAdapter (next in the Policy Component that uses this NN Component)).
        # Use time-major, as it is faster (according to the tf docs).
        self.main_lstm = LSTMLayer(units=256, scope="lstm-256", time_major=True, static_loop=self.worker_sample_size)

        # Add all sub-components to this one.
        self.add_components(
            self.splitter, self.image_processing_stack, self.text_processing_stack,
            self.concat_layer,
            self.main_lstm,
            self.time_rank_fold_before_lstm, self.time_rank_unfold_before_lstm,
            #self.debug_slicer
        )
Example 7
    def __init__(self,
                 action_space=None,
                 final_shape=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 pre_network_spec=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Optional[Space]): The action Space within which this Component will create actions.
                NOTE: Exactly one of `action_space` or `final_shape` must be provided.

            final_shape (Optional[Tuple[int]]): An optional final output shape (in case `action_space` is not
                provided). If None, will calculate the shape automatically from the given `action_space`.
                NOTE: Exactly one of `action_space` or `final_shape` must be provided.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).

            pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before the
                last action layer. If None, only the action layer itself is applied.
        """
        # Build the action layer for this adapter based on the given action-space.
        self.action_space = None
        if action_space is not None:
            self.action_space = action_space.with_batch_rank()
            assert not isinstance(self.action_space, ContainerSpace),\
                "ERROR: ActionAdapter cannot handle ContainerSpaces!"

        units, self.final_shape = self.get_units_and_shape()

        action_layer = DenseLayer(units=units,
                                  activation=activation,
                                  weights_spec=weights_spec,
                                  biases_spec=biases_spec,
                                  scope="action-layer")

        # Do we have a pre-NN?
        self.network = NeuralNetwork.from_spec(
            pre_network_spec, scope="action-network")  # type: NeuralNetwork
        self.network.add_layer(action_layer)

        # Add the reshape layer to match the action space's shape.
        self.network.add_layer(ReShape(new_shape=self.final_shape))

        super(ActionAdapter, self).__init__(self.network,
                                            scope=scope,
                                            **kwargs)
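
`get_units_and_shape()` itself is not shown; as an assumed illustration of its contract:

# Assumption (not from the source): for action_space=IntBox(4) with shape (),
# get_units_and_shape() would return units=4 and final_shape=(4,), i.e. one
# output node per discrete action, which the final ReShape maps back onto the
# action space's shape.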
Example 8
    def __init__(self,
                 sub_component,
                 api_method_name,
                 scope="batch-apply",
                 **kwargs):
        """
        Args:
            sub_component (Component): The sub-Component to apply the batch to.
            api_method_name (str): The name of the API-method to call on the sub-component.
        """
        super(BatchApply, self).__init__(scope=scope, **kwargs)

        self.sub_component = sub_component
        self.api_method_name = api_method_name

        # Create the necessary reshape components.
        self.folder = ReShape(fold_time_rank=True, scope="folder")
        self.unfolder = ReShape(unfold_time_rank=True, scope="unfolder")

        self.add_components(self.sub_component, self.folder, self.unfolder)
Example 9
    def build_image_processing_stack():
        """
        Constructs a ReShape preprocessor to fold the time rank into the batch rank.

        Then builds the 3 sequential Conv2D blocks that process the image information.
        Each of these 3 blocks consists of:
        - 1 Conv2D layer followed by a MaxPool2D
        - 2 residual blocks, each of which looks like:
            - ReLU + Conv2D + ReLU + Conv2D + element-wise add with original input

        Then adds: ReLU + fc(256) + ReLU.
        """
        # Collect components for image stack before unfolding time-rank going into main LSTM.
        sub_components = list()

        # Divide by 255
        sub_components.append(Divide(divisor=255, scope="divide-255"))

        for i, num_filters in enumerate([16, 32, 32]):
            # Conv2D plus MaxPool2D.
            conv2d_plus_maxpool = Stack(
                Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
                MaxPool2DLayer(pool_size=3, strides=2, padding="same"),
                scope="conv-max"
            )

            # Single unit for the residual layers (ReLU + Conv2D 3x3 stride=1).
            residual_unit = Stack(
                NNLayer(activation="relu"),  # single ReLU
                Conv2DLayer(filters=num_filters, kernel_size=3, strides=1, padding="same"),
                scope="relu-conv"
            )
            # Residual Layer.
            residual_layer = ResidualLayer(residual_unit=residual_unit, repeats=2)
            # Repeat same residual layer 2x.
            residual_repeater = RepeaterStack(sub_component=residual_layer, repeats=2)

            sub_components.append(Stack(conv2d_plus_maxpool, residual_repeater, scope="conv-unit-{}".format(i)))

        # A Flatten preprocessor and then an fc block (surrounded by ReLUs) and a time-rank-unfolding.
        sub_components.extend([
            ReShape(flatten=True, scope="flatten"),  # Flattener (to flatten Conv2D output for the fc layer).
            NNLayer(activation="relu", scope="relu-1"),  # ReLU 1
            DenseLayer(units=256),  # Dense layer.
            NNLayer(activation="relu", scope="relu-2"),  # ReLU 2
        ])

        image_stack = Stack(sub_components, scope="image-stack")

        return image_stack
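
A spatial-shape sketch for a hypothetical 96x72 input (conv stride 1, max-pool stride 2, "same" padding, so each conv-unit halves height and width, rounding up):

# conv-unit-0: (96, 72) -> (48, 36), 16 filters
# conv-unit-1: (48, 36) -> (24, 18), 32 filters
# conv-unit-2: (24, 18) -> (12, 9), 32 filters
# flatten: 12 * 9 * 32 = 3456 -> relu-1 -> dense(256) -> relu-2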
Example 10
    def __init__(self, distribution_specs, scope="joint-cumulative-distribution", **kwargs):
        """
        Args:
            distribution_specs (dict): Dict with flat-keys containing the specifications of the single
                sub-distributions.
        """
        super(JointCumulativeDistribution, self).__init__(scope=scope, **kwargs)

        # Create the flattened sub-distributions and add them.
        self.flattened_sub_distributions = {
            flat_key: Distribution.from_spec(spec, scope="sub-distribution-{}".format(i))
            for i, (flat_key, spec) in enumerate(distribution_specs.items())
        }
        self.flattener = ReShape(flatten=True)

        self.add_components(self.flattener, *list(self.flattened_sub_distributions.values()))
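
A hypothetical instantiation (the flat keys and sub-distribution spec types below are made up for illustration):

joint = JointCumulativeDistribution(distribution_specs={
    "/a": dict(type="categorical"),
    "/b": dict(type="normal"),
})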
Example 11
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 worker_sample_size=100,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        self.worker_sample_size = worker_sample_size

        # Network-spec by default is a "large architecture" IMPALA network.
        self.network_spec = kwargs.pop(
            "network_spec",
            dict(type=("rlgraph.components.neural_networks.impala.impala_networks."
                       "{}IMPALANetwork").format("Large" if architecture == "large" else "Small"))
        )
        if isinstance(self.network_spec, dict) and "type" in self.network_spec and \
                "IMPALANetwork" in self.network_spec["type"]:
            self.network_spec = default_dict(
                self.network_spec,
                dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
            )

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        self.exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        self.feed_previous_action_through_nn = feed_previous_action_through_nn
        self.feed_previous_reward_through_nn = feed_previous_reward_through_nn

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        self.preprocessing_spec = kwargs.pop(
            "preprocessing_spec",
            dict(
                type="dict-preprocessor-stack",
                preprocessors=dict(
                    # Flatten actions.
                    previous_action=[
                        dict(type="reshape",
                             flatten=True,
                             flatten_categories=kwargs.get(
                                 "action_space").num_categories)
                    ],
                    # Bump reward and convert to float32, so that it can be concatenated by the Concat layer.
                    previous_reward=[dict(type="reshape", new_shape=(1, ))])))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get("mode") == "distributed":
            default_dict(
                execution_spec["session_config"],
                dict(type="monitored-training-session",
                     allow_soft_placement=True,
                     device_filters=["/job:learner/task:0"] + ([
                         "/job:actor/task:{}".format(
                             execution_spec["distributed_spec"]["task_index"])
                     ] if self.type == "actor" else ["/job:learner/task:0"])))
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                    self.type, execution_spec["distributed_spec"]["task_index"])

        self.policy_spec = kwargs.pop("policy_spec", dict())
        # TODO: Create some auto-setting based on LSTM inside the NN.
        default_dict(
            self.policy_spec,
            dict(type="shared-value-function-policy",
                 deterministic=False,
                 reuse_variable_scope="shared-policy",
                 action_space=kwargs.get("action_space")))

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent,
              self).__init__(discount=discount,
                             preprocessing_spec=self.preprocessing_spec,
                             network_spec=self.network_spec,
                             policy_spec=self.policy_spec,
                             exploration_spec=self.exploration_spec,
                             optimizer_spec=optimizer_spec,
                             observe_spec=observe_spec,
                             update_spec=update_spec,
                             execution_spec=execution_spec,
                             name=kwargs.pop(
                                 "name", "impala-{}-agent".format(self.type)),
                             **kwargs)
        # Always use 1st learner as the parameter server for all policy variables.
        if self.execution_spec["mode"] == "distributed" and self.execution_spec[
                "distributed_spec"]["cluster_spec"]:
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.policy.neural_network.has_rnn()
        # Check whether we are running with GPUs.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states"] + \
                               (["actions"] if not self.feed_previous_action_through_nn else []) + \
                               (["rewards"] if not self.feed_previous_reward_through_nn else []) + \
                               ["action_probs"] + \
                               (["initial_internal_states"] if self.has_rnn else [])
        # Define FIFO record space.
        # Note that only states and internal_states (RNN) contain num-steps+1 items, all other sub-records only contain
        # num-steps items.
        self.fifo_record_space = Dict(
            {
                "terminals": bool,
                "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
            },
            add_batch_rank=False,
            add_time_rank=self.worker_sample_size
        )
        self.fifo_record_space["states"] = self.state_space.with_time_rank(
            self.worker_sample_size + 1)
        # Feed previous action/reward through the NN (as part of `states`) or give them their own records?
        if self.feed_previous_action_through_nn:
            self.fifo_record_space["states"]["previous_action"] = \
                self.action_space.with_time_rank(self.worker_sample_size + 1)
        else:
            self.fifo_record_space["actions"] = self.action_space.with_time_rank(
                self.worker_sample_size)
        if self.feed_previous_reward_through_nn:
            self.fifo_record_space["states"]["previous_reward"] = FloatBox(
                add_time_rank=self.worker_sample_size + 1)
        else:
            self.fifo_record_space["rewards"] = FloatBox(
                add_time_rank=self.worker_sample_size)

        if self.has_rnn:
            self.fifo_record_space["initial_internal_states"] = \
                self.internal_states_space.with_time_rank(False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu"
            if self.execution_spec["mode"] == "distributed"
            and self.execution_spec["distributed_spec"]["cluster_spec"] else
            None)

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            pass

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(
                tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = None

            # Slice some data from the EnvStepper (e.g. only the first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer",
                                                squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            # Dummy Flattener to calculate action-probs space.
            dummy_flattener = ReShape(
                flatten=True,
                flatten_categories=self.action_space.num_categories)
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor,
                                                    self.policy,
                                                    self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO: float64 for deepmind? May not work for other envs.
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action_to_state=True,
                add_previous_reward_to_state=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space))
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.fifo_queue
            ]
        # Learner.
        else:
            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(
                *self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                *list(self.fifo_record_space["states"].keys()),
                scope="states-dict-splitter")
            self.internal_states_slicer = None

            self.transposer = Transpose(
                scope="transposer", device=dict(ops="/job:learner/task:0/cpu"))
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount,
                weight_pg=weight_pg,
                weight_baseline=weight_baseline,
                weight_entropy=weight_entropy,
                slice_actions=self.feed_previous_action_through_nn,
                slice_rewards=self.feed_previous_reward_through_nn,
                device="/job:learner/task:0/gpu")

            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables="/job:learner/task:0/cpu",
                                 ops="/job:learner/task:0/gpu")))
            for component in [
                    self.staging_area, self.preprocessor, self.optimizer
            ]:
                component.propagate_sub_component_properties(
                    dict(device="/job:learner/task:0/gpu"))

            sub_components = [
                self.fifo_output_splitter, self.fifo_queue,
                self.states_dict_splitter, self.transposer, self.staging_area,
                self.preprocessor, self.policy, self.loss_function,
                self.optimizer
            ]

        if self.type != "single":
            # Add all the agent's sub-components to the root.
            self.root_component.add_components(*sub_components)

            # Define the Agent's (root Component's) API.
            self.define_graph_api(*sub_components)

        if self.type != "single" and self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu")
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=build_options)
            else:
                self._build_graph([self.root_component],
                                  self.input_spaces,
                                  optimizer=self.optimizer,
                                  build_options=None)

            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op))

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op
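
Summarizing the three job types handled by this constructor:

# "single"  -> env stepping and learning in one process (keeps optimizer,
#              observe and update specs).
# "actor"   -> steps the environment and enqueues records into the FIFO queue
#              (optimizer_spec=None, update_spec=dict(do_updates=False)).
# "learner" -> dequeues records and updates the policy
#              (observe_spec=None, no environment).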
Example 12
class IMPALANetwork(NeuralNetwork):
    """
    The base class for both the "large architecture" and "small architecture" versions of the networks used in [1].

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures - Espeholt, Soyer,
        Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """
    def __init__(self,
                 worker_sample_size=100,
                 scope="impala-network",
                 **kwargs):
        """
        Args:
            worker_sample_size (int): How many time-steps an IMPALA actor will have performed in one rollout.
        """
        super(IMPALANetwork, self).__init__(scope=scope, **kwargs)

        self.worker_sample_size = worker_sample_size

        # Create all needed sub-components.

        # ContainerSplitter for the Env signal (dict of 4 keys: for env image, env text, previous action and reward).
        self.splitter = ContainerSplitter("RGB_INTERLEAVED",
                                          "INSTR",
                                          "previous_action",
                                          "previous_reward",
                                          scope="input-splitter")

        # Fold the time rank into the batch rank.
        self.time_rank_fold_before_lstm = ReShape(
            fold_time_rank=True, scope="time-rank-fold-before-lstm")
        self.time_rank_unfold_before_lstm = ReShape(
            unfold_time_rank=True,
            time_major=True,
            scope="time-rank-unfold-before-lstm")

        # The Image Processing Stack (left side of "Large Architecture" Figure 3 in [1]).
        # Conv2D column + ReLU + fc(256) + ReLU.
        self.image_processing_stack = self.build_image_processing_stack()

        # The text processing pipeline: Takes a batch of string tensors as input, creates a hash-bucket thereof,
        # and passes the output of the hash bucket through an embedding-lookup(20) layer. The output of the embedding
        # lookup is then passed through an LSTM(64).
        self.text_processing_stack = self.build_text_processing_stack()

        #self.debug_slicer = Slice(scope="internal-states-slicer", squeeze=True)

        # The concatenation layer (concatenates outputs from image/text processing stacks, previous action/reward).
        self.concat_layer = ConcatLayer()

        # The main LSTM (going into the ActionAdapter (next in the Policy Component that uses this NN Component)).
        # Use time-major, as it is faster (according to the tf docs).
        self.main_lstm = LSTMLayer(units=256,
                                   scope="lstm-256",
                                   time_major=True,
                                   static_loop=self.worker_sample_size)

        # Add all sub-components to this one.
        self.add_components(
            self.splitter,
            self.image_processing_stack,
            self.text_processing_stack,
            self.concat_layer,
            self.main_lstm,
            self.time_rank_fold_before_lstm,
            self.time_rank_unfold_before_lstm,
            #self.debug_slicer
        )

    @staticmethod
    def build_image_processing_stack():
        """
        Builds the image processing pipeline for IMPALA and returns it.
        """
        raise NotImplementedError

    @staticmethod
    def build_text_processing_stack():
        """
        Helper function to build the text processing pipeline for both the large and small architectures, consisting of:
        - ReShape preprocessor to fold the incoming time rank into the batch rank.
        - StringToHashBucket Layer taking a batch of sentences and converting them to an indices-table of dimensions:
          cols = length of the longest sentence in the input
          rows = number of items in the batch
          The cols dimension can be interpreted as the time rank for a subsequent LSTM. The StringToHashBucket
          Component returns the sequence length of each batch item for exactly that purpose.
        - Embedding Lookup Layer of embedding size 20 and number of rows == num_hash_buckets (see previous layer).
        - LSTM processing the batched sequences of words coming from the embedding layer as batches of rows.
        """
        num_hash_buckets = 1000

        # Create a hash bucket from the sentences and use that bucket to do an embedding lookup (instead of
        # a vocabulary).
        string_to_hash_bucket = StringToHashBucket(
            num_hash_buckets=num_hash_buckets)
        embedding = EmbeddingLookup(embed_dim=20,
                                    vocab_size=num_hash_buckets,
                                    pad_empty=True)
        # The time rank for the LSTM is now the sequence of words in a sentence, NOT the original env time rank.
        # We will only use the last output of the LSTM-64 for further processing as that is the output after having
        # seen all words in the sentence.
        # The original env stepping time rank is currently folded into the batch rank and must be unfolded again before
        # passing it into the main LSTM.
        lstm64 = LSTMLayer(units=64, scope="lstm-64", time_major=False)

        tuple_splitter = ContainerSplitter(tuple_length=2,
                                           scope="tuple-splitter")

        def custom_apply(self, inputs):
            hash_bucket, lengths = self.sub_components[
                "string-to-hash-bucket"].apply(inputs)

            embedding_output = self.sub_components["embedding-lookup"].apply(
                hash_bucket)

            # Return only the last output (sentence of words, where we are not interested in intermediate results
            # where the LSTM has not seen the entire sentence yet).
            # Last output is the final internal h-state (slot 1 in the returned LSTM tuple; slot 0 is final c-state).
            lstm_output = self.sub_components["lstm-64"].apply(
                embedding_output, sequence_length=lengths)
            lstm_final_internals = lstm_output["last_internal_states"]

            # Need to split once more because the LSTM state is always a tuple of final c- and h-states.
            _, lstm_final_h_state = self.sub_components[
                "tuple-splitter"].split(lstm_final_internals)

            return lstm_final_h_state

        text_processing_stack = Stack(string_to_hash_bucket,
                                      embedding,
                                      lstm64,
                                      tuple_splitter,
                                      api_methods={("apply", custom_apply)},
                                      scope="text-stack")

        return text_processing_stack
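
    # Shape sketch for the text stack above (B = folded batch*time; a hedged
    # reading of the docstring, not traced through the actual code):
    #   strings (B,) -> hash-bucket ints (B, max_words) + sequence lengths (B,)
    #   -> embedding-lookup: (B, max_words, 20)
    #   -> lstm-64 final h-state: (B, 64), the value returned by custom_apply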

    @rlgraph_api
    def apply(self, input_dict, internal_states=None):
        # Split the input dict coming directly from the Env.
        _, _, _, orig_previous_reward = self.splitter.split(input_dict)

        folded_input = self.time_rank_fold_before_lstm.apply(input_dict)
        image, text, previous_action, previous_reward = self.splitter.split(
            folded_input)

        # Get the left-stack (image) and right-stack (text) output (see [1] for details).
        text_processing_output = self.text_processing_stack.apply(text)
        image_processing_output = self.image_processing_stack.apply(image)

        # Concat everything together.
        concatenated_data = self.concat_layer.apply(image_processing_output,
                                                    text_processing_output,
                                                    previous_action,
                                                    previous_reward)

        unfolded_concatenated_data = self.time_rank_unfold_before_lstm.apply(
            concatenated_data, orig_previous_reward)

        # Feed concat'd input into main LSTM(256).
        lstm_output = self.main_lstm.apply(unfolded_concatenated_data,
                                           internal_states)

        return lstm_output
Example 13
    def __init__(self,
                 network_spec,
                 action_space=None,
                 action_adapter_spec=None,
                 max_likelihood=True,
                 scope="policy",
                 **kwargs):
        """
        Args:
            network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict to build
                one.

            action_space (Space): The action Space within which this Component will create actions.

            action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the default
                ActionAdapter object.

            max_likelihood (bool): Whether to pick actions according to the max-likelihood value or via sampling.
                Default: True.
        """
        super(Policy, self).__init__(scope=scope, **kwargs)

        self.neural_network = NeuralNetwork.from_spec(network_spec)
        if action_space is None:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
            action_space = self.action_adapter.action_space
        else:
            self.action_adapter = ActionAdapter.from_spec(
                action_adapter_spec, action_space=action_space)
        self.action_space = action_space
        self.max_likelihood = max_likelihood

        # TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
        # TODO: Replace entirely via sonnet-like BatchApply Component.
        is_impala = "IMPALANetwork" in type(self.neural_network).__name__

        # Add API-method to get baseline output (if we use an extra value function baseline node).
        if isinstance(self.action_adapter, BaselineActionAdapter):
            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                self.time_rank_folder = ReShape(fold_time_rank=True,
                                                scope="time-rank-fold")
                self.time_rank_unfolder_v = ReShape(unfold_time_rank=True,
                                                    time_major=True,
                                                    scope="time-rank-unfold-v")
                self.time_rank_unfolder_a_probs = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-a-probs")
                self.time_rank_unfolder_logits = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-logits")
                self.time_rank_unfolder_log_probs = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-log-probs")
                self.add_components(self.time_rank_folder,
                                    self.time_rank_unfolder_v,
                                    self.time_rank_unfolder_a_probs,
                                    self.time_rank_unfolder_log_probs,
                                    self.time_rank_unfolder_logits)

            @rlgraph_api(component=self)
            def get_state_values_logits_probabilities_log_probs(
                    self, nn_input, internal_states=None):
                nn_output = self.neural_network.apply(nn_input,
                                                      internal_states)
                last_internal_states = nn_output.get("last_internal_states")
                nn_output = nn_output["output"]

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    nn_output = self.time_rank_folder.apply(nn_output)

                out = self.action_adapter.get_logits_probabilities_log_probs(
                    nn_output)

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    state_values = self.time_rank_unfolder_v.apply(
                        out["state_values"], nn_output)
                    logits = self.time_rank_unfolder_logits.apply(
                        out["logits"], nn_output)
                    probs = self.time_rank_unfolder_a_probs.apply(
                        out["probabilities"], nn_output)
                    log_probs = self.time_rank_unfolder_log_probs.apply(
                        out["log_probs"], nn_output)
                else:
                    state_values = out["state_values"]
                    logits = out["logits"]
                    probs = out["probabilities"]
                    log_probs = out["log_probs"]

                return dict(state_values=state_values,
                            logits=logits,
                            probabilities=probs,
                            log_probs=log_probs,
                            last_internal_states=last_internal_states)

        # Figure out our Distribution.
        if isinstance(action_space, IntBox):
            self.distribution = Categorical()
        # Continuous action space -> Normal distribution (each action needs mean and variance from network).
        elif isinstance(action_space, FloatBox):
            self.distribution = Normal()
        else:
            raise RLGraphError(
                "ERROR: `action_space` is of type {} and not allowed in {} Component!"
                .format(type(action_space).__name__, self.name))

        self.add_components(self.neural_network, self.action_adapter,
                            self.distribution)

        if is_impala:
            self.add_components(self.time_rank_folder,
                                self.time_rank_unfolder_v,
                                self.time_rank_unfolder_a_probs,
                                self.time_rank_unfolder_log_probs,
                                self.time_rank_unfolder_logits)
Example 14
    def test_keras_style_complex_multi_stream_nn(self):
        # 3 inputs.
        input_spaces = [
            Dict({
                "img": FloatBox(shape=(6, 6, 3)),
                "int": IntBox(3)
            }, add_batch_rank=True, add_time_rank=True),
            FloatBox(shape=(2,), add_batch_rank=True),
            Tuple(IntBox(2), TextBox(), add_batch_rank=True, add_time_rank=True)
        ]

        # Same NN as in test above, only using some of the sub-Spaces from the input spaces.
        # Tests whether this NN can automatically add the correct splitters.
        folded_text = ReShape(fold_time_rank=True)(input_spaces[2][1])
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10, vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2, return_sequences=False, scope="lstm-layer-txt")(
            embedding_out, sequence_length=lengths
        )
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(string_lstm_out, input_spaces[2][1])

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(input_spaces[0]["img"])
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True, scope="img-unfold")(cnn_out, input_spaces[0]["img"])
        unfolded_cnn_out_flattened = ReShape(flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2, scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3, scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 3 outputs.
        neural_net = NeuralNetwork(inputs=input_spaces, outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_spaces))

        # Sample with batch rank 4 and time rank 2.
        sample_shape = (4, 2)
        input_ = [input_spaces[0].sample(sample_shape), input_spaces[1].sample(sample_shape[0]),
                  input_spaces[2].sample(sample_shape)]

        out = test.test(("call", tuple(input_)), expected_outputs=None)
        # Main output (Dense out after LSTM).
        self.assertTrue(out[0].shape == sample_shape + (1,))  # 1=1 unit in dense layer
        self.assertTrue(out[0].dtype == np.float32)
        # main-LSTM out.
        self.assertTrue(out[1].shape == sample_shape + (2,))  # 2=2 LSTM units
        self.assertTrue(out[1].dtype == np.float32)
        # main-LSTM internal-states.
        self.assertTrue(out[2][0].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][0].dtype == np.float32)
        self.assertTrue(out[2][1].shape == sample_shape[:1] + (2,))  # 2=2 LSTM units
        self.assertTrue(out[2][1].dtype == np.float32)

        test.terminate()
Example 15
    def __init__(
            self, action_space, world_option_model_network, encoder_network, num_features, num_mixtures, beta=0.2,
            post_phi_concat_network=None,
            reward_clipping=1.0,
            intrinsic_rewards_weight=0.1,
            concat_with_command_vector=False,
            optimizer=None, deterministic=False, scope="intrinsic-curiosity-world-option-model",
            **kwargs
    ):
        """
        Args:
            action_space (Space): The action Space to be fed into the model together with the latent feature vector
                for the states. Will be flattened automatically and then concatenated by this component.

            world_option_model_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly) to
                construct the world-option-model's neural network.

            encoder_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly) to
                construct the inverse dynamics model's encoder network leading from s to phi (feature vector).

            num_features (int): The size of the feature vectors phi.

            num_mixtures (int): The number of mixture Normals to use for the next-state distribution output.

            beta (float): The weight for the phi' loss (action loss is then 1.0 - beta).

            post_phi_concat_network (Optional[Union[NeuralNetwork,dict]]): A specification dict (or NN object
                directly) for the network applied after concatenating phi and phi' (producing the action prediction).

            reward_clipping (float): 0.0 for no clipping, some other value for +/- reward value clipping.
                Default: 1.0.

            concat_with_command_vector (bool): If True, this model needs an additional command vector (coming from the
                policy above) to concat it together with the latent state vector.

            optimizer (Optional[Optimizer]): The optimizer to use for supervised learning of the two networks
                (ICM and WOM).
        """
        self.num_features = num_features
        self.num_mixtures = num_mixtures
        self.deterministic = deterministic
        self.beta = beta
        assert 0.0 < self.beta < 1.0, "ERROR: `beta` must be between 0 and 1!"
        self.reward_clipping = reward_clipping
        self.intrinsic_rewards_weight = intrinsic_rewards_weight

        # Create the encoder network inside a SupervisedPredictor (so we get the adapter + distribution with it).
        self.state_encoder = SupervisedPredictor(
            network_spec=encoder_network, output_space=FloatBox(shape=(num_features,), add_batch_rank=True),
            scope="state-encoder"
        )

        # Create the container loss function for the two prediction tasks:
        # a) Action prediction and b) next-state prediction, each of them using a simple neg log likelihood loss
        # comparing the actual action and s' with their log-likelihood value vs the respective distributions.
        self.loss_functions = dict(
            # Action prediction loss (neg log likelihood of observed action vs the parameterized distribution).
            predicted_actions=NegativeLogLikelihoodLoss(
                distribution_spec=get_default_distribution_from_space(action_space),
                scope="action-loss"
            ),
            # s' prediction loss (neg log likelihood of observed s' vs the parameterized mixed normal distribution).
            predicted_phi_=NegativeLogLikelihoodLoss(distribution_spec=dict(type="mixture", _args=[
                "multi-variate-normal" for _ in range(num_mixtures)
            ]), scope="phi-loss")
        )

        # TODO: Support for command vector concatenation.
        #self.concat_with_command_vector = concat_with_command_vector

        # Define the Model's network's custom call method.
        def custom_call(self, inputs):
            phi = inputs["phi"]
            actions = inputs["actions"]
            phi_ = inputs["phi_"]
            actions_flat = self.get_sub_component_by_name("action-flattener").call(actions)
            concat_phis = self.get_sub_component_by_name("concat-phis").call(phi, phi_)
            # Predict the action that led from s to s'.
            predicted_actions = self.get_sub_component_by_name("post-phi-concat-nn").call(concat_phis)

            # Concat phi with flattened actions.
            phi_and_actions = self.get_sub_component_by_name("concat-states-and-actions").call(
                phi, actions_flat
            )
            # Add stop-gradient to phi here before predicting phi'
            # (the phis should only be trained by the inverse dynamics model, not by the world option model).
            # NOT DONE IN ORIGINAL PAPER's CODE AND ALSO NOT IN MLAGENTS EQUIVALENT.
            # phi_and_actions = self.get_sub_component_by_name("stop-gradient").stop(phi_and_actions)
            # Predict phi' (through a mixture gaussian distribution).
            predicted_phi_ = self.get_sub_component_by_name("wom-nn").call(phi_and_actions)

            return dict(
                # Predictions (actions and next-state-features (mixture distribution)).
                predicted_actions=predicted_actions,
                predicted_phi_=predicted_phi_
                ## Also return the two feature vectors for s and s'.
                #phi=phi, phi_=phi_
            )

        # Create the SupervisedPredictor's neural network.
        predictor_network = NeuralNetwork(
            # The world option model network taking action-cat-phi and mapping them to the predicted phi'.
            NeuralNetwork.from_spec(world_option_model_network, scope="wom-nn"),
            # The concat component concatenating both latent state vectors (phi and phi').
            ConcatLayer(scope="concat-phis"),
            # The NN mapping from phi-cat-phi' to the action prediction.
            NeuralNetwork.from_spec(post_phi_concat_network, scope="post-phi-concat-nn"),
            # The ReShape component for flattening all actions in arbitrary action spaces.
            ReShape(flatten=True, flatten_categories=True, flatten_containers=True, scope="action-flattener"),
            # The concat component concatenating latent state feature vector and incoming (flattened) actions.
            ConcatLayer(scope="concat-states-and-actions"),
            # Set the `call` method.
            api_methods={("call", custom_call)}
        )

        if optimizer is None:
            optimizer = dict(type="adam", learning_rate=3e-4)

        super(IntrinsicCuriosityWorldOptionModel, self).__init__(
            predictor=dict(
                network_spec=predictor_network,
                output_space=Dict({
                    "predicted_actions": action_space,
                    "predicted_phi_": FloatBox(shape=(self.num_features,))
                }, add_batch_rank=action_space.has_batch_rank, add_time_rank=action_space.has_time_rank),
                distribution_adapter_spec=dict(
                    # for `predicted_actions`: use default adapter
                    # for predicted_phi': use normal-mixture adapter & distribution.
                    predicted_phi_={"type": "normal-mixture-adapter", "num_mixtures": num_mixtures}
                ),
                deterministic=deterministic
            ),
            loss_function=self.loss_functions["predicted_actions"],
            optimizer=optimizer, scope=scope, **kwargs
        )

        self.add_components(self.state_encoder, self.loss_functions["predicted_phi_"])
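
For orientation, here is a minimal instantiation sketch. The spec dicts and sizes are illustrative assumptions (not taken from the original source); only the argument names mirror the constructor above.

# Hypothetical instantiation sketch -- layer specs and sizes are illustrative only.
icm_wom = IntrinsicCuriosityWorldOptionModel(
    encoder_network=[{"type": "dense", "units": 64}],             # assumed spec-dict layer syntax
    world_option_model_network=[{"type": "dense", "units": 64}],
    post_phi_concat_network=[{"type": "dense", "units": 32}],
    action_space=IntBox(4, add_batch_rank=True),                  # assumed IntBox signature
    num_features=16,   # dimensionality of the latent state vector phi
    num_mixtures=5,    # number of Gaussians in the mixture predicting phi'
    beta=0.2           # must lie strictly between 0 and 1 (see assert above)
)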
Example 16
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Provide only one of `add_units` or `units`, not both.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: Provide only one of `add_units` or `units`, not both.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).
        """
        super(ActionAdapter, self).__init__(scope=scope, **kwargs)

        self.action_space = action_space.with_batch_rank()
        self.weights_spec = weights_spec
        self.biases_spec = biases_spec
        self.activation = activation

        # Our (dense) action layer representing the flattened action space.
        self.action_layer = None

        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            self.reshape = ReShape(
                new_shape=self.action_space.get_shape(with_category_rank=True),
                flatten_categories=False)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # The factor 2 accounts for mean and log-sd.
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))
            self.reshape = ReShape(new_shape=new_shape)

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        # Create the action-layer and add it to this component.
        self.action_layer = DenseLayer(units=units,
                                       activation=self.activation,
                                       weights_spec=self.weights_spec,
                                       biases_spec=self.biases_spec,
                                       scope="action-layer")

        self.add_components(self.action_layer, self.reshape)
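
The unit calculation above can be checked by hand. A quick sketch (the spaces and numbers are illustrative; the IntBox/FloatBox constructor signatures are assumed from their use elsewhere in these examples):

# Discrete: an IntBox with 4 categories and shape (2,) yields
#   flat_dim_with_categories = 2 * 4 = 8  ->  8 action-layer units,
#   reshaped to (2, 4) (category rank appended).
discrete_adapter = ActionAdapter(action_space=IntBox(4, shape=(2,)))

# Continuous: a FloatBox(shape=(3,)) needs a mean and a log-sd per dimension:
#   2 * flat_dim = 2 * 3 = 6  ->  6 action-layer units, reshaped to (2, 3).
continuous_adapter = ActionAdapter(action_space=FloatBox(shape=(3,)))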
Example 17
class ActionAdapter(Component):
    """
    A Component that cleans up a neural network's flat output and gets it ready for parameterizing a
    Distribution Component.
    Processing steps include:
    - Sending the raw, flattened NN output through a Dense layer whose number of units matches the flattened
    action space.
    - Reshaping (according to the action Space).
    - Translating the reshaped outputs (logits) into probabilities (by softmaxing) and log-probabilities (log).
    """
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Provide only one of `add_units` or `units`, not both.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: Provide only one of `add_units` or `units`, not both.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).
        """
        super(ActionAdapter, self).__init__(scope=scope, **kwargs)

        self.action_space = action_space.with_batch_rank()
        self.weights_spec = weights_spec
        self.biases_spec = biases_spec
        self.activation = activation

        # Our (dense) action layer representing the flattened action space.
        self.action_layer = None

        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            self.reshape = ReShape(
                new_shape=self.action_space.get_shape(with_category_rank=True),
                flatten_categories=False)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # The factor 2 accounts for mean and log-sd.
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))
            self.reshape = ReShape(new_shape=new_shape)

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        # Create the action-layer and add it to this component.
        self.action_layer = DenseLayer(units=units,
                                       activation=self.activation,
                                       weights_spec=self.weights_spec,
                                       biases_spec=self.biases_spec,
                                       scope="action-layer")

        self.add_components(self.action_layer, self.reshape)

    def check_input_spaces(self, input_spaces, action_space=None):
        # Check the input Space.
        last_nn_layer_space = input_spaces["nn_output"]  # type: Space
        sanity_check_space(last_nn_layer_space,
                           non_allowed_types=[ContainerSpace])

        # Check the action Space.
        sanity_check_space(self.action_space, must_have_batch_rank=True)
        if isinstance(self.action_space, IntBox):
            sanity_check_space(self.action_space, must_have_categories=True)
        else:
            # Fixme: Are there other restraints on continuous action spaces? E.g. no dueling layers?
            pass

    @rlgraph_api
    def get_action_layer_output(self, nn_output):
        """
        Returns the raw, non-reshaped output of the action-layer (DenseLayer) after passing the raw
        nn_output (coming from the previous Component) through it.

        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            DataOpRecord: The output of the action layer (a DenseLayer) after passing `nn_output` through it.
        """
        out = self.action_layer.apply(nn_output)
        return dict(output=out)

    @rlgraph_api
    def get_logits(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            SingleDataOp: The logits (raw nn_output, BUT reshaped).
        """
        aa_output = self.get_action_layer_output(nn_output)
        logits = self.reshape.apply(aa_output["output"])
        return logits

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            Tuple[SingleDataOp]:
                - logits (raw nn_output, BUT reshaped)
                - probabilities (softmaxed(logits))
                - log(probabilities)
        """
        logits = self.get_logits(nn_output)
        probabilities, log_probs = self._graph_fn_get_probabilities_log_probs(
            logits)
        return dict(logits=logits,
                    probabilities=probabilities,
                    log_probs=log_probs)

    # TODO: Use a SoftMax Component instead (uses the same code as the one below).
    @graph_fn
    def _graph_fn_get_probabilities_log_probs(self, logits):
        """
        Creates properties/parameters and log-probs from some reshaped output.

        Args:
            logits (SingleDataOp): The output of some layer that is already reshaped
                according to our action Space.

        Returns:
            tuple (2x SingleDataOp):
                parameters (DataOp): The parameters, ready to be passed to a Distribution object's
                    get_distribution API-method (usually some probabilities or loc/scale pairs).
                log_probs (DataOp): Simply the log(parameters).
        """
        if get_backend() == "tf":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = tf.maximum(x=tf.nn.softmax(logits=logits,
                                                        axis=-1),
                                        y=SMALL_NUMBER)
                # Log probs.
                log_probs = tf.log(x=parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                mean, log_sd = tf.split(value=logits,
                                        num_or_size_splits=2,
                                        axis=1)
                # Remove moments rank.
                mean = tf.squeeze(input=mean, axis=1)
                log_sd = tf.squeeze(input=log_sd, axis=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = tf.clip_by_value(t=log_sd,
                                          clip_value_min=log(SMALL_NUMBER),
                                          clip_value_max=-log(SMALL_NUMBER))

                # Turn log sd into sd.
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                softmax_logits = torch.softmax(logits, dim=-1)
                parameters = torch.max(softmax_logits, SMALL_NUMBER_TORCH)
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions.
                # Split off the two moments; torch.chunk mirrors tf.split with
                # num_or_size_splits=2 (two chunks along the moments axis),
                # whereas torch.split(logits, 2, dim=1) would return a single chunk here.
                mean, log_sd = torch.chunk(logits, chunks=2, dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)

                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(log_sd,
                                     min=LOG_SMALL_NUMBER,
                                     max=-LOG_SMALL_NUMBER)

                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
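
To make the continuous-action branch concrete: after `get_logits`, the tensor carries a moments rank of size 2 right after the batch rank. The following NumPy stand-in mirrors what the split/squeeze code above computes (shapes are illustrative; 1e-6 stands in for SMALL_NUMBER, whose actual value lives elsewhere in the library):

import numpy as np

batch_size, action_dim = 4, 3
# Raw dense output: 2 * action_dim units per sample (see the `units` calculation above).
nn_out = np.random.randn(batch_size, 2 * action_dim).astype(np.float32)
# ReShape(new_shape=(2, action_dim)) inserts the moments rank after the batch rank.
logits = nn_out.reshape(batch_size, 2, action_dim)
# What tf.split(..., num_or_size_splits=2, axis=1) plus the squeezes produce:
mean, log_sd = logits[:, 0, :], logits[:, 1, :]
# Clip log-sd and exponentiate, mirroring the graph_fn above.
log_sd = np.clip(log_sd, np.log(1e-6), -np.log(1e-6))
sd = np.exp(log_sd)
assert mean.shape == sd.shape == (batch_size, action_dim)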
Example 18
    def __init__(self,
                 action_space,
                 add_units=0,
                 units=None,
                 weights_spec=None,
                 biases_spec=None,
                 activation=None,
                 pre_network_spec=None,
                 scope="action-adapter",
                 **kwargs):
        """
        Args:
            action_space (Space): The action Space within which this Component will create actions.

            add_units (Optional[int]): An optional number of units to add to the auto-calculated number of action-
                layer nodes. Can be negative to subtract units from the auto-calculated value.
                NOTE: Provide only one of `add_units` or `units`, not both.

            units (Optional[int]): An optional number of units to use for the action-layer. If None, will calculate
                the number of units automatically from the given action_space.
                NOTE: Provide only one of `add_units` or `units`, not both.

            weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                weights of `self.action_layer`. Default: None (use default initializer).

            biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize the
                biases of `self.action_layer`. Default: None (use default initializer, which is usually 0.0).

            activation (Optional[str]): The activation function to use for `self.action_layer`.
                Default: None (=linear).

            pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before the
                last action layer. If None, only the action layer itself is applied.
        """
        # Build the action layer for this adapter based on the given action-space.
        self.action_space = action_space.with_batch_rank()
        assert not isinstance(
            self.action_space, ContainerSpace
        ), "ERROR: ActionAdapter cannot handle ContainerSpaces!"
        # Calculate the number of nodes in the action layer (DenseLayer object) depending on our action Space
        # or using a given fixed number (`units`).
        # Also generate the ReShape sub-Component and give it the new_shape.
        if isinstance(self.action_space, IntBox):
            if units is None:
                units = add_units + self.action_space.flat_dim_with_categories
            new_shape = self.action_space.get_shape(with_category_rank=True)
        else:
            if units is None:
                units = add_units + 2 * self.action_space.flat_dim  # The factor 2 accounts for mean and log-sd.
            # Manually add moments after batch/time ranks.
            new_shape = tuple([2] + list(self.action_space.shape))

        assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger than 0.".format(
            units)

        action_layer = DenseLayer(units=units,
                                  activation=activation,
                                  weights_spec=weights_spec,
                                  biases_spec=biases_spec,
                                  scope="action-layer")

        # Do we have a pre-NN?
        self.network = NeuralNetwork.from_spec(
            pre_network_spec, scope="action-network")  # type: NeuralNetwork
        self.network.add_layer(action_layer)

        # Add the reshape layer to match the action space's shape.
        self.network.add_layer(ReShape(new_shape=new_shape))

        super(ActionAdapter, self).__init__(self.network,
                                            scope=scope,
                                            **kwargs)
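
A usage sketch for this variant; the hidden-layer spec is a hypothetical example of the `pre_network_spec` argument:

# Hypothetical usage: one small hidden layer in front of the action layer.
adapter = ActionAdapter(
    action_space=FloatBox(shape=(2,)),
    pre_network_spec=[{"type": "dense", "units": 16, "activation": "relu"}]
)
# Internally this assembles: dense(16) -> action-layer(2 * 2 = 4 units) -> ReShape((2, 2)).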
Example 19
    def __init__(self,
                 discount=0.99,
                 fifo_queue_spec=None,
                 architecture="large",
                 environment_spec=None,
                 feed_previous_action_through_nn=True,
                 feed_previous_reward_through_nn=True,
                 weight_pg=None,
                 weight_baseline=None,
                 weight_entropy=None,
                 num_workers=1,
                 worker_sample_size=100,
                 dynamic_batching=False,
                 visualize=False,
                 **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
            feed_previous_action_through_nn (bool): Whether to add the previous action as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_action". Default: True.
            feed_previous_reward_through_nn (bool): Whether to add the previous reward as another input channel to the
                ActionComponent's (NN's) input at each step. This is only possible if the state space is already a Dict.
                It will be added under the key "previous_reward". Default: True.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            num_workers (int): How many actors (workers) should be run in separate threads.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
            dynamic_batching (bool): Whether to use the deepmind's custom dynamic batching op for wrapping the
                optimizer's step call. The batcher.so file must be compiled for this to work (see Docker file).
                Default: False.
            visualize (Union[int,bool]): Whether and how many workers to visualize.
                Default: False (no visualization).
        """
        # Now that we fixed the Agent's spec, call the super constructor.
        super(SingleIMPALAAgent, self).__init__(
            type="single",
            discount=discount,
            architecture=architecture,
            fifo_queue_spec=fifo_queue_spec,
            environment_spec=environment_spec,
            feed_previous_action_through_nn=feed_previous_action_through_nn,
            feed_previous_reward_through_nn=feed_previous_reward_through_nn,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            worker_sample_size=worker_sample_size,
            name=kwargs.pop("name", "impala-single-agent"),
            **kwargs)
        self.dynamic_batching = dynamic_batching
        self.num_workers = num_workers
        self.visualize = visualize

        # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
        # actually call below during our build.
        if self.dynamic_batching:
            self.policy = DynamicBatchingPolicy(policy_spec=self.policy,
                                                scope="")

        self.env_output_splitter = ContainerSplitter(
            tuple_length=3 if self.has_rnn is False else 4,
            scope="env-output-splitter")
        self.fifo_output_splitter = ContainerSplitter(
            *self.fifo_queue_keys, scope="fifo-output-splitter")
        self.states_dict_splitter = ContainerSplitter(
            *list(self.fifo_record_space["states"].keys()
                  if isinstance(self.state_space, Dict) else "dummy"),
            scope="states-dict-splitter")

        self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

        # Slice some data from the EnvStepper (e.g only first internal states are needed).
        if self.has_rnn:
            internal_states_slicer = Slice(scope="internal-states-slicer",
                                           squeeze=True)
        else:
            internal_states_slicer = None

        self.transposer = Transpose(scope="transposer")

        # Create an IMPALALossFunction with some parameters.
        self.loss_function = IMPALALossFunction(
            discount=self.discount,
            weight_pg=weight_pg,
            weight_baseline=weight_baseline,
            weight_entropy=weight_entropy,
            slice_actions=self.feed_previous_action_through_nn,
            slice_rewards=self.feed_previous_reward_through_nn)

        # Merge back to insert into FIFO.
        self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

        # Dummy Flattener to calculate action-probs space.
        dummy_flattener = ReShape(
            flatten=True, flatten_categories=self.action_space.num_categories)

        self.environment_steppers = list()
        for i in range(self.num_workers):
            environment_spec_ = copy.deepcopy(environment_spec)
            if self.visualize is True or (isinstance(self.visualize, int)
                                          and i + 1 <= self.visualize):
                environment_spec_["visualize"] = True

            # Force worker_sample_size for IMPALA NNs (LSTM) in env-stepper to be 1.
            policy_spec = copy.deepcopy(self.policy_spec)
            if isinstance(policy_spec, dict) and isinstance(policy_spec["network_spec"], dict) and \
                    "type" in policy_spec["network_spec"] and "IMPALANetwork" in policy_spec["network_spec"]["type"]:
                policy_spec["network_spec"]["worker_sample_size"] = 1

            env_stepper = EnvironmentStepper(
                environment_spec=environment_spec_,
                actor_component_spec=ActorComponent(
                    preprocessor_spec=self.preprocessing_spec,
                    policy_spec=policy_spec,
                    exploration_spec=self.exploration_spec),
                state_space=self.state_space.with_batch_rank(),
                action_space=self.action_space.with_batch_rank(),
                reward_space=float,
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_action=not self.feed_previous_action_through_nn,
                add_reward=not self.feed_previous_reward_through_nn,
                add_previous_action_to_state=self.feed_previous_action_through_nn,
                add_previous_reward_to_state=self.feed_previous_reward_through_nn,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(
                    self.action_space),
                scope="env-stepper-{}".format(i))
            if self.dynamic_batching:
                env_stepper.actor_component.policy.parent_component = None
                env_stepper.actor_component.policy = DynamicBatchingPolicy(
                    policy_spec=env_stepper.actor_component.policy, scope="")
                env_stepper.actor_component.add_components(
                    env_stepper.actor_component.policy)

            self.environment_steppers.append(env_stepper)

        # Create the QueueRunners (one for each env-stepper).
        self.queue_runner = QueueRunner(
            self.fifo_queue,
            "step",
            -1,  # -1: Take entire return value of API-method `step` as record to insert.
            self.env_output_splitter,
            self.fifo_input_merger,
            internal_states_slicer,
            *self.environment_steppers)

        sub_components = [
            self.fifo_output_splitter, self.fifo_queue, self.queue_runner,
            self.transposer, self.staging_area, self.preprocessor,
            self.states_dict_splitter, self.policy, self.loss_function,
            self.optimizer
        ]

        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_graph_api()

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              build_options=None)
            self.graph_built = True

            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage.
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op))
                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
Example 20
    def test_functional_api_multi_stream_nn(self):
        # Input Space of the network.
        input_space = Dict(
            {
                "img": FloatBox(shape=(6, 6, 3)),  # some RGB img
                "txt": TextBox()  # some text
            },
            add_batch_rank=True,
            add_time_rank=True)

        img, txt = ContainerSplitter("img", "txt")(input_space)
        # Complex NN assembly via our Keras-style functional API.
        # Fold text input into single batch rank.
        folded_text = ReShape(fold_time_rank=True)(txt)
        # String layer will create batched AND time-ranked (individual words) hash outputs (int64).
        string_bucket_out, lengths = StringToHashBucket(
            num_hash_buckets=5)(folded_text)
        # Batched and time-ranked embedding output (floats) with embed dim=n.
        embedding_out = EmbeddingLookup(embed_dim=10,
                                        vocab_size=5)(string_bucket_out)
        # Pass embeddings through a text LSTM and use last output (reduce time-rank).
        string_lstm_out, _ = LSTMLayer(units=2,
                                       return_sequences=False,
                                       scope="lstm-layer-txt")(
                                           embedding_out,
                                           sequence_length=lengths)
        # Unfold to get original time-rank back.
        string_lstm_out_unfolded = ReShape(unfold_time_rank=True)(
            string_lstm_out, txt)

        # Parallel image stream via 1 CNN layer plus dense.
        folded_img = ReShape(fold_time_rank=True, scope="img-fold")(img)
        cnn_out = Conv2DLayer(filters=1, kernel_size=2, strides=2)(folded_img)
        unfolded_cnn_out = ReShape(unfold_time_rank=True,
                                   scope="img-unfold")(cnn_out, img)
        unfolded_cnn_out_flattened = ReShape(
            flatten=True, scope="img-flat")(unfolded_cnn_out)
        dense_out = DenseLayer(units=2,
                               scope="dense-0")(unfolded_cnn_out_flattened)

        # Concat everything.
        concat_out = ConcatLayer()(string_lstm_out_unfolded, dense_out)

        # LSTM output has batch+time.
        main_lstm_out, internal_states = LSTMLayer(
            units=2, scope="lstm-layer-main")(concat_out)

        dense1_after_lstm_out = DenseLayer(units=3,
                                           scope="dense-1")(main_lstm_out)
        dense2_after_lstm_out = DenseLayer(
            units=2, scope="dense-2")(dense1_after_lstm_out)
        dense3_after_lstm_out = DenseLayer(
            units=1, scope="dense-3")(dense2_after_lstm_out)

        # A NN with 2 outputs.
        neural_net = NeuralNetwork(
            outputs=[dense3_after_lstm_out, main_lstm_out, internal_states])

        test = ComponentTest(component=neural_net,
                             input_spaces=dict(inputs=input_space))

        # Batch of size=n.
        sample_shape = (4, 2)
        input_ = input_space.sample(sample_shape)

        out = test.test(("call", input_), expected_outputs=None)
        # Main output (Dense out after LSTM).
        self.assertTrue(out[0].shape == sample_shape +
                        (1, ))  # 1=1 unit in dense layer
        self.assertTrue(out[0].dtype == np.float32)
        # main-LSTM out.
        self.assertTrue(out[1].shape == sample_shape + (2, ))  # 2=2 LSTM units
        self.assertTrue(out[1].dtype == np.float32)
        # main-LSTM internal-states.
        self.assertTrue(out[2][0].shape == sample_shape[:1] +
                        (2, ))  # 2=2 LSTM units
        self.assertTrue(out[2][0].dtype == np.float32)
        self.assertTrue(out[2][1].shape == sample_shape[:1] +
                        (2, ))  # 2=2 LSTM units
        self.assertTrue(out[2][1].dtype == np.float32)

        test.terminate()
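
Reduced to its essentials, the same Keras-style functional pattern looks as follows (a sketch with a plain FloatBox input; class names as in the test above):

# Minimal functional-API sketch (illustrative sizes).
input_space = FloatBox(shape=(4,), add_batch_rank=True)
hidden = DenseLayer(units=8, scope="hidden")(input_space)
output = DenseLayer(units=2, scope="out")(hidden)
neural_net = NeuralNetwork(outputs=[output])

test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))
out = test.test(("call", input_space.sample(3)), expected_outputs=None)  # shape: (3, 2)
test.terminate()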
Example 21
class Policy(Component):
    """
    A Policy is a wrapper Component that contains a NeuralNetwork, an ActionAdapter and a Distribution Component.
    """
    def __init__(self,
                 network_spec,
                 action_space=None,
                 action_adapter_spec=None,
                 max_likelihood=True,
                 scope="policy",
                 **kwargs):
        """
        Args:
            network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict to build
                one.

            action_space (Space): The action Space within which this Component will create actions.

            action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the default
                ActionAdapter object.

            max_likelihood (bool): Whether to pick actions according to the max-likelihood value or via sampling.
                Default: True.
        """
        super(Policy, self).__init__(scope=scope, **kwargs)

        self.neural_network = NeuralNetwork.from_spec(network_spec)
        if action_space is None:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
            action_space = self.action_adapter.action_space
        else:
            self.action_adapter = ActionAdapter.from_spec(
                action_adapter_spec, action_space=action_space)
        self.action_space = action_space
        self.max_likelihood = max_likelihood

        # TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
        # TODO: Replace entirely via sonnet-like BatchApply Component.
        is_impala = "IMPALANetwork" in type(self.neural_network).__name__

        # Add API-method to get baseline output (if we use an extra value function baseline node).
        if isinstance(self.action_adapter, BaselineActionAdapter):
            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                self.time_rank_folder = ReShape(fold_time_rank=True,
                                                scope="time-rank-fold")
                self.time_rank_unfolder_v = ReShape(unfold_time_rank=True,
                                                    time_major=True,
                                                    scope="time-rank-unfold-v")
                self.time_rank_unfolder_a_probs = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-a-probs")
                self.time_rank_unfolder_logits = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-logits")
                self.time_rank_unfolder_log_probs = ReShape(
                    unfold_time_rank=True,
                    time_major=True,
                    scope="time-rank-unfold-log-probs")
                self.add_components(self.time_rank_folder,
                                    self.time_rank_unfolder_v,
                                    self.time_rank_unfolder_a_probs,
                                    self.time_rank_unfolder_log_probs,
                                    self.time_rank_unfolder_logits)

            @rlgraph_api(component=self)
            def get_state_values_logits_probabilities_log_probs(
                    self, nn_input, internal_states=None):
                nn_output = self.neural_network.apply(nn_input,
                                                      internal_states)
                last_internal_states = nn_output.get("last_internal_states")
                nn_output = nn_output["output"]

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    nn_output = self.time_rank_folder.apply(nn_output)

                out = self.action_adapter.get_logits_probabilities_log_probs(
                    nn_output)

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    state_values = self.time_rank_unfolder_v.apply(
                        out["state_values"], nn_output)
                    logits = self.time_rank_unfolder_logits.apply(
                        out["logits"], nn_output)
                    probs = self.time_rank_unfolder_a_probs.apply(
                        out["probabilities"], nn_output)
                    log_probs = self.time_rank_unfolder_log_probs.apply(
                        out["log_probs"], nn_output)
                else:
                    state_values = out["state_values"]
                    logits = out["logits"]
                    probs = out["probabilities"]
                    log_probs = out["log_probs"]

                return dict(state_values=state_values,
                            logits=logits,
                            probabilities=probs,
                            log_probs=log_probs,
                            last_internal_states=last_internal_states)

        # Figure out our Distribution.
        if isinstance(action_space, IntBox):
            self.distribution = Categorical()
        # Continuous action space -> Normal distribution (each action needs mean and variance from network).
        elif isinstance(action_space, FloatBox):
            self.distribution = Normal()
        else:
            raise RLGraphError(
                "ERROR: `action_space` is of type {} and not allowed in {} Component!"
                .format(type(action_space).__name__, self.name))

        self.add_components(self.neural_network, self.action_adapter,
                            self.distribution)

        if is_impala:
            self.add_components(self.time_rank_folder,
                                self.time_rank_unfolder_v,
                                self.time_rank_unfolder_a_probs,
                                self.time_rank_unfolder_log_probs,
                                self.time_rank_unfolder_logits)

    # Define our interface.
    @rlgraph_api
    def get_nn_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: The raw output of the neural network (before it's cleaned-up and passed through the ActionAdapter).
        """
        out = self.neural_network.apply(nn_input, internal_states)
        return dict(output=out["output"],
                    last_internal_states=out.get("last_internal_states"))

    @rlgraph_api
    def get_action(self, nn_input, internal_states=None, max_likelihood=None):
        """
        Returns an action based on NN output, action adapter output and distribution sampling.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
            max_likelihood (Optional[bool]): If not None, use this to determine whether actions should be drawn
                from the distribution in max-likelihood or stochastic fashion.

        Returns:
            any: The drawn action.
        """
        max_likelihood = self.max_likelihood if max_likelihood is None else max_likelihood

        nn_output = self.get_nn_output(nn_input, internal_states)

        # Skip our distribution if we have a discrete action space and act greedily (max-likelihood).
        # In that case, there is no need to build a distribution in the graph for each act, since the
        # argmax over the logits equals the argmax over the probabilities (or log-probabilities).
        if max_likelihood is True and isinstance(self.action_space, IntBox):
            out = self.action_adapter.get_logits_probabilities_log_probs(
                nn_output["output"])
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(
                out["logits"])
        else:
            out = self.action_adapter.get_logits_probabilities_log_probs(
                nn_output["output"])
            action = self.distribution.draw(out["probabilities"],
                                            max_likelihood)
        return dict(action=action,
                    last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_max_likelihood_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: See `get_action`, but with max_likelihood force set to True.
        """
        out = self.get_logits_probabilities_log_probs(nn_input,
                                                      internal_states)

        if isinstance(self.action_space, IntBox):
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(
                out["logits"])
        else:
            action = self.distribution.sample_deterministic(
                out["probabilities"])

        return dict(action=action,
                    last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_stochastic_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: See `get_action`, but with max_likelihood force set to False.
        """
        out = self.get_logits_probabilities_log_probs(nn_input,
                                                      internal_states)
        action = self.distribution.sample_stochastic(out["probabilities"])
        return dict(action=action,
                    last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_action_layer_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: The raw output of the action layer of the ActionAdapter (including possibly the last internal states
                of a RNN-based NN).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        action_layer_output = self.action_adapter.get_action_layer_output(
            nn_output["output"])
        # Add last internal states to return value.
        return dict(output=action_layer_output["output"],
                    last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_logits_probabilities_log_probs(self,
                                           nn_input,
                                           internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        aa_output = self.action_adapter.get_logits_probabilities_log_probs(
            nn_output["output"])
        return dict(logits=aa_output["logits"],
                    probabilities=aa_output["probabilities"],
                    log_probs=aa_output["log_probs"],
                    last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_entropy(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: See Distribution component.
        """
        out = self.get_logits_probabilities_log_probs(nn_input,
                                                      internal_states)
        entropy = self.distribution.entropy(out["probabilities"])
        return dict(entropy=entropy,
                    last_internal_states=out["last_internal_states"])

    @graph_fn
    def _graph_fn_get_max_likelihood_action_wo_distribution(self, logits):
        """
        Use this function only for discrete action spaces to circumvent using a full-blown
        backend-specific distribution object (e.g. tf.distribution.Multinomial).

        Args:
            logits (SingleDataOp): Logits over which to pick the argmax (greedy action).

        Returns:
            SingleDataOp: The argmax over the last rank of the input logits.
        """
        if get_backend() == "tf":
            return tf.argmax(logits, axis=-1, output_type=tf.int32)
        elif get_backend() == "pytorch":
            return torch.argmax(logits, dim=-1).int()
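
Finally, a minimal usage sketch for the Policy above. The network spec and spaces are illustrative assumptions; the ComponentTest harness calls are sketched after the pattern of the functional-API test earlier:

# Hypothetical usage sketch.
policy = Policy(
    network_spec=[{"type": "dense", "units": 32}],
    action_space=IntBox(4, add_batch_rank=True),
    max_likelihood=True
)
# In a ComponentTest harness, acting would then look roughly like:
# test = ComponentTest(component=policy,
#                      input_spaces=dict(nn_input=FloatBox(shape=(8,), add_batch_rank=True)),
#                      action_space=IntBox(4, add_batch_rank=True))
# out = test.test(("get_action", states_batch), expected_outputs=None)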