Beispiel #1
0
    def __init__(
        self,
        network_spec,
        value_weights_spec=None,
        value_biases_spec=None,
        value_activation=None,
        value_fold_time_rank=False,
        value_unfold_time_rank=False,
        scope="shared-value-function-policy",
        **kwargs
    ):
        """
        Sets up the parent policy and attaches a separate one-unit value network to it.

        Args:
            network_spec: Passed through unchanged to the parent constructor.
            value_weights_spec: `weights_spec` handed to the value DenseLayer.
            value_biases_spec: `biases_spec` handed to the value DenseLayer.
            value_activation: `activation` handed to the value DenseLayer.
            value_fold_time_rank (bool): `fold_time_rank` setting of the value NeuralNetwork.
            value_unfold_time_rank (bool): `unfold_time_rank` setting of the value NeuralNetwork.
                Also kept on this object as `self.value_unfold_time_rank`.
            scope (str): Scope handed to the parent constructor.
            **kwargs: Any further keyword arguments for the parent constructor.
        """
        super(SharedValueFunctionPolicy, self).__init__(network_spec, scope=scope, **kwargs)

        self.value_unfold_time_rank = value_unfold_time_rank

        # The value head is a single dense layer with exactly one output node.
        value_output_layer = DenseLayer(
            units=1,
            activation=value_activation,
            weights_spec=value_weights_spec,
            biases_spec=value_biases_spec,
        )
        # Wrap the layer into its own network so the time-rank folding options can be applied to it.
        self.value_network = NeuralNetwork(
            value_output_layer,
            fold_time_rank=value_fold_time_rank,
            unfold_time_rank=value_unfold_time_rank,
            scope="value-function-node"
        )

        self.add_components(self.value_network)
    def __init__(
            self,
            action_space,
            world_option_model_network,
            encoder_network,
            num_features,
            num_mixtures,
            beta=0.2,
            post_phi_concat_network=None,
            reward_clipping=1.0,
            intrinsic_rewards_weight=0.1,
            concat_with_command_vector=False,
            optimizer=None,
            deterministic=False,
            scope="intrinsic-curiosity-world-option-model",
            **kwargs
    ):
        """
        Builds the state encoder, the two negative-log-likelihood losses and the predictor network
        (action prediction plus next-feature prediction) and hands them to the parent constructor.

        Args:
            action_space (Space): The action Space to be fed into the model together with the latent feature
                vector for the states. Will be flattened automatically and then concatenated by this component.

            world_option_model_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly)
                to construct the world-option-model's neural network (scope "wom-nn").

            encoder_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly) to
                construct the inverse dynamics model's encoder network leading from s to phi (feature vector).

            num_features (int): The size of the feature vectors phi.

            num_mixtures (int): The number of mixture Normals to use for the next-state distribution output.

            beta (float): The weight for the phi' loss (action loss is then 1.0 - beta).
                Must lie strictly between 0.0 and 1.0 (checked via `assert`).

            post_phi_concat_network: Spec passed to `NeuralNetwork.from_spec` for the network that maps the
                concatenated phi/phi' vector to the action prediction (scope "post-phi-concat-nn").

            reward_clipping (float): 0.0 for no clipping, some other value for +/- reward value clipping.
                Default: 1.0.

            intrinsic_rewards_weight (float): Stored as `self.intrinsic_rewards_weight`; not used inside
                this constructor. Default: 0.1.

            concat_with_command_vector (bool): If True, this model needs an additional command vector (coming
                from the policy above) to concat it together with the latent state vector.
                Currently accepted but ignored by this constructor (see TODO below).

            optimizer (Optional[Optimizer]): The optimizer to use for supervised learning of the two networks
                (ICM and WOM). When None, an "adam" spec with learning_rate=3e-4 is used.

            deterministic (bool): Stored as `self.deterministic` and forwarded inside the predictor spec.

            scope (str): Scope handed to the parent constructor.

            **kwargs: Any further keyword arguments for the parent constructor.
        """
        self.num_features = num_features
        self.num_mixtures = num_mixtures
        self.deterministic = deterministic
        self.beta = beta
        assert 0.0 < self.beta < 1.0, "ERROR: `beta` must be between 0 and 1!"
        self.reward_clipping = reward_clipping
        self.intrinsic_rewards_weight = intrinsic_rewards_weight

        # The encoder lives inside its own SupervisedPredictor (so we get the adapter + distribution with it).
        encoder_output_space = FloatBox(shape=(num_features,), add_batch_rank=True)
        self.state_encoder = SupervisedPredictor(
            network_spec=encoder_network, output_space=encoder_output_space, scope="state-encoder"
        )

        # Two prediction tasks, each scored with a plain negative log likelihood loss:
        # a) the action that was taken, b) the next state's feature vector phi'.
        # Action loss: observed action vs the default distribution for the action space.
        action_nll = NegativeLogLikelihoodLoss(
            distribution_spec=get_default_distribution_from_space(action_space),
            scope="action-loss"
        )
        # phi' loss: observed phi' vs a mixture of `num_mixtures` multi-variate normals.
        mixture_distribution_spec = dict(type="mixture", _args=["multi-variate-normal"] * num_mixtures)
        phi_nll = NegativeLogLikelihoodLoss(distribution_spec=mixture_distribution_spec, scope="phi-loss")
        self.loss_functions = dict(predicted_actions=action_nll, predicted_phi_=phi_nll)

        # TODO: Support for command vector concatenation.
        #self.concat_with_command_vector = concat_with_command_vector

        # Custom `call` implementation for the predictor network built below.
        # `self` inside this function is the network the method is registered on, not this model.
        def custom_call(self, inputs):
            features = inputs["phi"]
            taken_actions = inputs["actions"]
            next_features = inputs["phi_"]

            flattener = self.get_sub_component_by_name("action-flattener")
            flat_actions = flattener.call(taken_actions)

            phi_joiner = self.get_sub_component_by_name("concat-phis")
            joined_features = phi_joiner.call(features, next_features)

            # Inverse model: predict the action that lead from s to s'.
            inverse_nn = self.get_sub_component_by_name("post-phi-concat-nn")
            action_prediction = inverse_nn.call(joined_features)

            # Forward model input: phi concatenated with the flattened actions.
            state_action_joiner = self.get_sub_component_by_name("concat-states-and-actions")
            forward_input = state_action_joiner.call(features, flat_actions)

            # A stop-gradient on phi could be inserted here before predicting phi'
            # (so that the phis are only trained by the inverse dynamics model, not by the world option model).
            # NOT DONE IN ORIGINAL PAPER's CODE AND ALSO NOT IN MLAGENTS EQUIVALENT.
            # forward_input = self.get_sub_component_by_name("stop-gradient").stop(forward_input)

            # Forward model: predict phi' (through a mixture gaussian distribution).
            forward_nn = self.get_sub_component_by_name("wom-nn")
            next_feature_prediction = forward_nn.call(forward_input)

            # Only the two predictions are returned (phi and phi' themselves are not passed on).
            return dict(
                predicted_actions=action_prediction,
                predicted_phi_=next_feature_prediction
            )

        # Sub-components of the predictor network, created in the order they are registered.
        # World option model: maps action-cat-phi to the predicted phi'.
        wom_network = NeuralNetwork.from_spec(world_option_model_network, scope="wom-nn")
        # Concatenates both latent state vectors (phi and phi').
        phi_concat_layer = ConcatLayer(scope="concat-phis")
        # Maps phi-cat-phi' to the action prediction.
        inverse_network = NeuralNetwork.from_spec(post_phi_concat_network, scope="post-phi-concat-nn")
        # Flattens all actions in arbitrary action spaces.
        action_flattener = ReShape(
            flatten=True, flatten_categories=True, flatten_containers=True, scope="action-flattener"
        )
        # Concatenates the latent state feature vector with the incoming (flattened) actions.
        state_action_concat_layer = ConcatLayer(scope="concat-states-and-actions")

        predictor_network = NeuralNetwork(
            wom_network,
            phi_concat_layer,
            inverse_network,
            action_flattener,
            state_action_concat_layer,
            # Register `custom_call` as the network's `call` method.
            api_methods={("call", custom_call)}
        )

        if optimizer is None:
            optimizer = dict(type="adam", learning_rate=3e-4)

        # Output space of the predictor: one entry per prediction; batch/time ranks follow the action space.
        predictor_output_space = Dict(
            {
                "predicted_actions": action_space,
                "predicted_phi_": FloatBox(shape=(self.num_features,))
            },
            add_batch_rank=action_space.has_batch_rank,
            add_time_rank=action_space.has_time_rank
        )
        # `predicted_actions` keeps the default adapter; only phi' gets the normal-mixture adapter.
        adapter_spec = dict(
            predicted_phi_={"type": "normal-mixture-adapter", "num_mixtures": num_mixtures}
        )
        predictor_spec = dict(
            network_spec=predictor_network,
            output_space=predictor_output_space,
            distribution_adapter_spec=adapter_spec,
            deterministic=deterministic
        )

        super(IntrinsicCuriosityWorldOptionModel, self).__init__(
            predictor=predictor_spec,
            loss_function=self.loss_functions["predicted_actions"],
            optimizer=optimizer,
            scope=scope,
            **kwargs
        )

        # The encoder and the phi' loss are not handed to the parent constructor, so add them explicitly.
        self.add_components(self.state_encoder, self.loss_functions["predicted_phi_"])