def __init__(self, network_spec, value_weights_spec=None, value_biases_spec=None, value_activation=None,
             value_fold_time_rank=False, value_unfold_time_rank=False,
             scope="shared-value-function-policy", **kwargs):
    """
    Sets up the parent policy and attaches an extra single-unit value network.

    Args:
        network_spec: Forwarded unchanged to the parent constructor.
        value_weights_spec: Passed as `weights_spec` to the value DenseLayer.
        value_biases_spec: Passed as `biases_spec` to the value DenseLayer.
        value_activation: Passed as `activation` to the value DenseLayer.
        value_fold_time_rank: Passed as `fold_time_rank` to the value NeuralNetwork.
        value_unfold_time_rank: Passed as `unfold_time_rank` to the value NeuralNetwork and
            stored on `self.value_unfold_time_rank`.
        scope (str): Scope handed to the parent constructor.
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super(SharedValueFunctionPolicy, self).__init__(network_spec, scope=scope, **kwargs)

    self.value_unfold_time_rank = value_unfold_time_rank

    # The value head: a single dense layer with exactly one output unit.
    value_layer = DenseLayer(
        units=1,
        activation=value_activation,
        weights_spec=value_weights_spec,
        biases_spec=value_biases_spec,
    )
    value_network_settings = dict(
        fold_time_rank=value_fold_time_rank,
        unfold_time_rank=value_unfold_time_rank,
        scope="value-function-node",
    )
    self.value_network = NeuralNetwork(value_layer, **value_network_settings)

    self.add_components(self.value_network)
def test_time_rank_folding_for_large_dense_nn(self):
    """
    Times repeated passes through a large dense network for a batch+time ranked input space and
    for a batch-only ("folded") input space, then checks that both runtimes agree to `decimals=0`.
    """
    vector_dim = 256
    runs = 10
    sample_shape = (256, 200)

    input_space = FloatBox(shape=(vector_dim, ), add_batch_rank=True, add_time_rank=True)
    base_config = config_from_path("configs/test_large_dense_nn.json")

    # --- Pass 1: network fed from the batch+time ranked space. ---
    unfolded_net = NeuralNetwork.from_spec(base_config)
    test = ComponentTest(component=unfolded_net, input_spaces=dict(nn_input=input_space))

    # Pull a large batch+time ranked sample.
    inputs = input_space.sample(sample_shape)

    started_at = time.monotonic()
    for _ in range(runs):
        print(".", flush=True, end="")
        test.test(("call", inputs), expected_outputs=None)
    runtime_wo_folding = time.monotonic() - started_at
    print(
        "\nTesting large dense NN w/o time-rank folding: {}x pass through with {}-data took "
        "{}s".format(runs, sample_shape, runtime_wo_folding))

    # --- Pass 2: fresh network fed from the batch-only space. ---
    folded_net = NeuralNetwork.from_spec(base_config)
    input_space_folded = FloatBox(shape=(vector_dim, ), add_batch_rank=True)
    # NOTE(review): this samples from `input_space` (the time-ranked space), not from
    # `input_space_folded` -- confirm this is intended.
    inputs = input_space.sample(sample_shape[0] * sample_shape[1])

    test = ComponentTest(component=folded_net, input_spaces=dict(nn_input=input_space_folded))

    started_at = time.monotonic()
    for _ in range(runs):
        print(".", flush=True, end="")
        test.test(("call", inputs), expected_outputs=None)
    runtime_w_folding = time.monotonic() - started_at
    print(
        "\nTesting large dense NN w/ time-rank folding: {}x pass through with {}-data took "
        "{}s".format(runs, sample_shape, runtime_w_folding))

    recursive_assert_almost_equal(runtime_w_folding, runtime_wo_folding, decimals=0)
def test_time_rank_folding_for_large_cnn_nn(self):
    """
    Runs a time-major image batch through a CNN config that is wrapped in a time-rank folding
    reshape (front) and a time-rank unfolding reshape (back), then checks output shape and dtype.
    """
    width = 86
    height = 86
    time_rank = 20

    input_space = FloatBox(shape=(width, height, 3), add_batch_rank=True, add_time_rank=True, time_major=True)

    # Wrap the loaded layer list: fold the time rank before the CNN, unfold it again afterwards.
    base_config = config_from_path("configs/test_3x_cnn_nn.json")
    base_config.insert(0, {"type": "reshape", "fold_time_rank": True})
    base_config.append({
        "type": "reshape",
        "unfold_time_rank": time_rank,
        "time_major": True
    })

    neural_net = NeuralNetwork.from_spec(base_config)
    test = ComponentTest(component=neural_net, input_spaces=dict(nn_input=input_space))

    # Pull a large batch+time ranked sample.
    sample_shape = (time_rank, 256)
    inputs = input_space.sample(sample_shape)

    out = test.test(("call", inputs), expected_outputs=None)["output"]

    # `assertEqual` reports both values on failure, unlike `assertTrue(a == b)`.
    self.assertEqual(out.shape, (time_rank, 256, 7 * 7 * 64))
    self.assertEqual(out.dtype, np.float32)
def __init__(self, input_network_specs, post_network_spec=None, **kwargs):
    """
    Builds one NeuralNetwork per input stream, a concat layer merging the streams and a
    post-concat NeuralNetwork, and registers all of them as sub-components.

    Args:
        input_network_specs (Union[Dict[str,dict],Tuple[dict]]): A specification dict or tuple with
            values being the spec dicts for the single streams. The `call` method expects a dict input
            or a single tuple input (not as *args) in its first parameter.
        post_network_spec (Optional[Union[dict,NeuralNetwork]]): The specification dict of the
            post-concat network or the post-concat network object itself.
    """
    super(MultiInputStreamNeuralNetwork, self).__init__(scope="multi-input-stream-nn", **kwargs)

    if isinstance(input_network_specs, dict):
        # Dict input: one stream network per flattened key; the concat layer is told the keys.
        self.input_stream_nns = {}
        for index, (flat_key, stream_spec) in enumerate(flatten_op(input_network_specs).items()):
            stream_scope = "input-stream-nn-{}".format(index)
            self.input_stream_nns[flat_key] = NeuralNetwork.from_spec(stream_spec, scope=stream_scope)
        self.concat_layer = ConcatLayer(dict_keys=list(self.input_stream_nns.keys()), axis=-1)
        stream_components = list(self.input_stream_nns.values())
    else:
        assert isinstance(input_network_specs, (list, tuple)),\
            "ERROR: `input_network_specs` must be dict or tuple/list!"
        # Sequence input: stream networks are kept in a list, in the given order.
        self.input_stream_nns = []
        for index, stream_spec in enumerate(input_network_specs):
            stream_scope = "input-stream-nn-{}".format(index)
            self.input_stream_nns.append(NeuralNetwork.from_spec(stream_spec, scope=stream_scope))
        self.concat_layer = ConcatLayer(axis=-1)
        stream_components = list(self.input_stream_nns)

    # The network applied after the concat.
    self.post_nn = NeuralNetwork.from_spec(post_network_spec, scope="post-concat-nn")  # type: NeuralNetwork

    # Register everything: post network, concat layer, then all stream networks.
    self.add_components(self.post_nn, self.concat_layer, *stream_components)
def __init__(self, action_space=None, final_shape=None, weights_spec=None, biases_spec=None, activation=None,
             pre_network_spec=None, scope="action-adapter", **kwargs):
    """
    Assembles the adapter's network: optional pre-network, then a dense action layer, then a
    reshape to `self.final_shape`.

    Args:
        action_space (Optional[Space]): The action Space within which this Component will create actions.
            NOTE: Exactly one of `action_space` or `final_shape` must be provided.
        final_shape (Optional[Tuple[int]]): An optional final output shape (in case action_space is not
            provided). If None, will calculate the shape automatically from the given `action_space`.
            NOTE: Exactly one of `action_space` or `final_shape` must be provided.
        weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize
            the weights of the action layer. Default: None (use default initializer).
        biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize
            the biases of the action layer. Default: None (use default initializer, which is usually 0.0).
        activation (Optional[str]): The activation function to use for the action layer.
            Default: None (=linear).
        pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before
            the last action layer. If None, only the action layer itself is applied.
    """
    # Store the (batch-ranked) action space; container spaces are rejected.
    if action_space is None:
        self.action_space = None
    else:
        self.action_space = action_space.with_batch_rank()
        assert not isinstance(self.action_space, ContainerSpace),\
            "ERROR: ActionAdapter cannot handle ContainerSpaces!"

    # NOTE(review): the `final_shape` argument is never read in this constructor; both the unit
    # count and `self.final_shape` come from `get_units_and_shape()` -- confirm this is intended.
    units, self.final_shape = self.get_units_and_shape()

    layer_settings = dict(
        units=units,
        activation=activation,
        weights_spec=weights_spec,
        biases_spec=biases_spec,
        scope="action-layer",
    )
    action_layer = DenseLayer(**layer_settings)

    # Start from the pre-network spec and append the action layer.
    self.network = NeuralNetwork.from_spec(pre_network_spec, scope="action-network")  # type: NeuralNetwork
    self.network.add_layer(action_layer)

    # Final reshape so the output matches `self.final_shape`.
    reshape_layer = ReShape(new_shape=self.final_shape)
    self.network.add_layer(reshape_layer)

    super(ActionAdapter, self).__init__(self.network, scope=scope, **kwargs)
def __init__(self, z_units, encoder_network_spec, decoder_network_spec, **kwargs):
    """
    Creates encoder/decoder networks, the mean and stddev dense layers and a Normal distribution,
    and registers all five as sub-components.

    Args:
        z_units (int): Number of units of the latent (z) vectors that the encoder will produce.
        encoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct an encoder
            NeuralNetwork object from or a NeuralNetwork Component directly.
        decoder_network_spec (Union[dict,NeuralNetwork]): Specification dict to construct a decoder
            NeuralNetwork object from or a NeuralNetwork Component directly.
    """
    super(VariationalAutoEncoder, self).__init__(scope="variational-auto-encoder", **kwargs)

    self.z_units = z_units

    # Encoder and decoder networks.
    self.encoder_network = NeuralNetwork.from_spec(encoder_network_spec, scope="encoder-network")
    self.decoder_network = NeuralNetwork.from_spec(decoder_network_spec, scope="decoder-network")

    # The two Gaussian layers, each `z_units` wide.
    self.mean_layer = DenseLayer(units=self.z_units, scope="mean-layer")
    self.stddev_layer = DenseLayer(units=self.z_units, scope="stddev-layer")

    # The Normal distribution from which to sample.
    # No concat layer is created here: mean and stddev are passed as a tuple instead of a
    # concatenated tensor.
    self.normal_distribution = Normal()

    sub_components = [
        self.encoder_network,
        self.decoder_network,
        self.mean_layer,
        self.stddev_layer,
        self.normal_distribution,
    ]
    self.add_components(*sub_components)
def __init__(self, network_spec, action_space=None, action_adapter_spec=None, deterministic=True,
             scope="policy", distributions_spec=None, **kwargs):
    """
    Builds the policy's network, reads the distribution settings, creates the action adapters
    and distributions, and registers everything as sub-components.

    Args:
        network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict
            to build one.
        action_space (Union[dict,Space]): A specification dict to create the Space within which this
            Component will create actions or the action Space object directly.
        action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the
            default ActionAdapter object.
        deterministic (bool): Whether to pick actions according to the max-likelihood value or via
            sampling. Default: True.
        distributions_spec (dict): Specifies bounded and discrete distribution types, and optionally
            additional configuration parameters such as temperature. The keys read here are
            `bounded_distribution_type` (default "beta"), `discrete_distribution_type`
            (default "categorical") and `gumbel_softmax_temperature` (default 1.0).
    """
    super(Policy, self).__init__(scope=scope, **kwargs)

    self.neural_network = NeuralNetwork.from_spec(network_spec)  # type: NeuralNetwork
    self.deterministic = deterministic
    self.action_adapters = {}
    self.distributions = {}

    # A missing spec behaves like an empty one: every setting falls back to its default.
    if distributions_spec is None:
        distributions_spec = {}
    self.distributions_spec = distributions_spec

    setting = self.distributions_spec.get
    self.bounded_distribution_type = setting("bounded_distribution_type", "beta")
    self.discrete_distribution_type = setting("discrete_distribution_type", "categorical")
    # For discrete approximations.
    self.gumbel_softmax_temperature = setting("gumbel_softmax_temperature", 1.0)

    self._create_action_adapters_and_distributions(
        action_space=action_space, action_adapter_spec=action_adapter_spec
    )

    # Register the network first, then all adapters, then all distributions.
    sub_components = [self.neural_network]
    sub_components.extend(self.action_adapters.values())
    sub_components.extend(self.distributions.values())
    self.add_components(*sub_components)

    self.flat_action_space = None
def __init__(self, network_spec, scope="value-function", **kwargs):
    """
    Builds a value-function network from the given layer list plus a final single-unit linear
    dense layer, and registers it as a sub-component.

    Args:
        network_spec (list): Layer specification for baseline network. The list is not modified;
            the value output layer is appended to a shallow copy.
    """
    import copy

    super(ValueFunction, self).__init__(scope=scope, **kwargs)

    # Attach VF output to hidden layers.
    value_layer = {
        "type": "dense",
        "units": 1,
        "activation": "linear",
        "scope": "value-function-output"
    }
    # Append to a shallow copy so the caller's list stays untouched. Appending in place would
    # stack one more output layer onto the spec every time the same list object is reused.
    full_spec = copy.copy(network_spec)
    full_spec.append(value_layer)

    self.neural_network = NeuralNetwork.from_spec(full_spec)
    self.add_components(self.neural_network)
def __init__(self, network_spec, action_space=None, action_adapter_spec=None, deterministic=True,
             scope="policy", bounded_distribution_type="beta", discrete_distribution_type="categorical",
             **kwargs):
    """
    Builds the policy's network, stores the distribution type choices, creates the action
    adapters and distributions, and registers everything as sub-components.

    Args:
        network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict
            to build one.
        action_space (Space): The action Space within which this Component will create actions.
        action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the
            default ActionAdapter object.
        deterministic (bool): Whether to pick actions according to the max-likelihood value or via
            sampling. Default: True.
        bounded_distribution_type (str): The class of distributions to use for bounded action spaces.
            For options check the components.distributions package. Default: beta.
        discrete_distribution_type (str): The class of distributions to use for discrete action spaces.
            For options check the components.distributions package. Default: categorical.
            Agents requiring reparameterization may require a GumbelSoftmax distribution instead.
    """
    super(Policy, self).__init__(scope=scope, **kwargs)

    self.neural_network = NeuralNetwork.from_spec(network_spec)  # type: NeuralNetwork
    self.deterministic = deterministic

    # Filled by `_create_action_adapters_and_distributions` below.
    self.action_adapters = {}
    self.distributions = {}

    self.bounded_distribution_type = bounded_distribution_type
    self.discrete_distribution_type = discrete_distribution_type

    self._create_action_adapters_and_distributions(
        action_space=action_space, action_adapter_spec=action_adapter_spec
    )

    # Register the network first, then all adapters, then all distributions.
    sub_components = [self.neural_network]
    sub_components.extend(self.action_adapters.values())
    sub_components.extend(self.distributions.values())
    self.add_components(*sub_components)
class SharedValueFunctionPolicy(Policy):
    """
    A Policy with an additional single-unit value network that is applied to the policy
    network's output.
    """

    def __init__(self, network_spec, value_weights_spec=None, value_biases_spec=None, value_activation=None,
                 value_fold_time_rank=False, value_unfold_time_rank=False,
                 scope="shared-value-function-policy", **kwargs):
        """
        Args:
            network_spec: Forwarded unchanged to the Policy constructor.
            value_weights_spec: Passed as `weights_spec` to the value DenseLayer.
            value_biases_spec: Passed as `biases_spec` to the value DenseLayer.
            value_activation: Passed as `activation` to the value DenseLayer.
            value_fold_time_rank: Passed as `fold_time_rank` to the value NeuralNetwork.
            value_unfold_time_rank: Passed as `unfold_time_rank` to the value NeuralNetwork. When it
                is exactly `True`, the API methods also hand `nn_input` to the value network.
        """
        super(SharedValueFunctionPolicy, self).__init__(network_spec, scope=scope, **kwargs)

        self.value_unfold_time_rank = value_unfold_time_rank

        # The extra value head: a dense layer with 1 node.
        value_layer = DenseLayer(
            units=1,
            activation=value_activation,
            weights_spec=value_weights_spec,
            biases_spec=value_biases_spec,
        )
        self.value_network = NeuralNetwork(
            value_layer,
            fold_time_rank=value_fold_time_rank,
            unfold_time_rank=value_unfold_time_rank,
            scope="value-function-node"
        )

        self.add_components(self.value_network)

    @rlgraph_api
    def get_state_values(self, nn_input, internal_states=None):
        """
        Returns the state value node's output.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            Dict:
                state_values: The "output" entry of the value network's result.
                last_internal_states: The "last_internal_states" entry of the NN output (None if absent).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        features = nn_output["output"]

        # The value network additionally receives the original input only when the flag is `True`.
        if self.value_unfold_time_rank is True:
            value_args = (features, nn_input)
        else:
            value_args = (features,)
        state_values = self.value_network.apply(*value_args)

        return {
            "state_values": state_values["output"],
            "last_internal_states": nn_output.get("last_internal_states"),
        }

    @rlgraph_api
    def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        """
        Returns the action-adapter outputs together with the state value node's output.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based
                neural network.

        Returns:
            Dict:
                state_values: The "output" entry of the value network's result.
                logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
                last_internal_states: The last internal states (if network is RNN-based).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        features = nn_output["output"]

        logits, probabilities, log_probs = \
            self._graph_fn_get_action_adapter_logits_probabilities_log_probs(features, nn_input)

        # The value network additionally receives the original input only when the flag is `True`.
        if self.value_unfold_time_rank is True:
            value_args = (features, nn_input)
        else:
            value_args = (features,)
        state_values = self.value_network.apply(*value_args)

        return {
            "state_values": state_values["output"],
            "logits": logits,
            "probabilities": probabilities,
            "log_probs": log_probs,
            "last_internal_states": nn_output.get("last_internal_states"),
        }
def build_value_function(self):
    """
    Appends `self.value_layer` to `self.network_spec` (in place), builds the NeuralNetwork from
    the extended spec, stores it on `self.neural_network` and registers it as a sub-component.
    """
    # Attach VF output to hidden layers. Note that this grows `self.network_spec` itself.
    layer_specs = self.network_spec
    layer_specs.append(self.value_layer)

    value_net = NeuralNetwork.from_spec(layer_specs)
    self.neural_network = value_net
    self.add_components(value_net)
def __init__(self, network_spec, action_space=None, action_adapter_spec=None, deterministic=True,
             scope="policy", **kwargs):
    """
    Builds the policy's network, one ActionAdapter and one Distribution per flat action
    component, and registers everything as sub-components.

    Args:
        network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict
            to build one.
        action_space (Space): The action Space within which this Component will create actions.
            If None, a single ActionAdapter is built from `action_adapter_spec` and its
            `action_space` is used.
        action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the
            default ActionAdapter object. May hold per-component specs under the flat keys of the
            action space; otherwise the whole dict is used for every component. The dict is not
            modified.
        deterministic (bool): Whether to pick actions according to the max-likelihood value or via
            sampling. Default: True.

    Raises:
        RLGraphError: If an action component is neither an IntBox nor a FloatBox.
    """
    super(Policy, self).__init__(scope=scope, **kwargs)

    self.neural_network = NeuralNetwork.from_spec(network_spec)  # type: NeuralNetwork

    # Create the necessary action adapters for this Policy. One for each action space component.
    self.action_adapters = dict()
    if action_space is None:
        self.action_adapters[""] = ActionAdapter.from_spec(action_adapter_spec)
        self.action_space = self.action_adapters[""].action_space
        # Assert single component action space.
        assert len(self.action_space.flatten()) == 1,\
            "ERROR: Action space must not be ContainerSpace if no `action_space` is given in Policy c'tor!"
    else:
        self.action_space = Space.from_spec(action_space)
        for i, (flat_key, action_component) in enumerate(self.action_space.flatten().items()):
            if action_adapter_spec is not None:
                # Work on a shallow copy: writing `action_space` into the looked-up dict would
                # modify the caller's spec (and, for a shared spec, the same dict on every
                # iteration).
                aa_spec = dict(action_adapter_spec.get(flat_key, action_adapter_spec))
                aa_spec["action_space"] = action_component
            else:
                aa_spec = dict(action_space=action_component)
            self.action_adapters[flat_key] = ActionAdapter.from_spec(
                aa_spec, scope="action-adapter-{}".format(i)
            )

    self.deterministic = deterministic

    # Figure out our Distributions.
    self.distributions = dict()
    for i, (flat_key, action_component) in enumerate(self.action_space.flatten().items()):
        if isinstance(action_component, IntBox):
            self.distributions[flat_key] = Categorical(scope="categorical-{}".format(i))
        # Continuous action space -> Normal distribution (each action needs mean and variance from network).
        elif isinstance(action_component, FloatBox):
            self.distributions[flat_key] = Normal(scope="normal-{}".format(i))
        else:
            # Report the type of the offending component, not of the `action_space` argument.
            raise RLGraphError(
                "ERROR: `action_component` is of type {} and not allowed in {} Component!"
                .format(type(action_component).__name__, self.name)
            )

    self.add_components(
        *[self.neural_network] + list(self.action_adapters.values()) + list(self.distributions.values())
    )
class SharedValueFunctionPolicy(Policy):
    """
    A Policy with an additional single-unit value network that is called on the policy
    network's outputs.
    """

    def __init__(self, network_spec, value_weights_spec=None, value_biases_spec=None, value_activation=None,
                 value_fold_time_rank=False, value_unfold_time_rank=False,
                 scope="shared-value-function-policy", **kwargs):
        """
        Args:
            network_spec: Forwarded unchanged to the Policy constructor.
            value_weights_spec: Passed as `weights_spec` to the value DenseLayer.
            value_biases_spec: Passed as `biases_spec` to the value DenseLayer.
            value_activation: Passed as `activation` to the value DenseLayer.
            value_fold_time_rank: Passed as `fold_time_rank` to the value NeuralNetwork.
            value_unfold_time_rank: Passed as `unfold_time_rank` to the value NeuralNetwork and
                stored on `self.value_unfold_time_rank`.
        """
        super(SharedValueFunctionPolicy, self).__init__(network_spec, scope=scope, **kwargs)

        self.value_unfold_time_rank = value_unfold_time_rank

        # The extra value head: a dense layer with 1 node.
        value_layer = DenseLayer(
            units=1,
            activation=value_activation,
            weights_spec=value_weights_spec,
            biases_spec=value_biases_spec,
        )
        self.value_network = NeuralNetwork(
            value_layer,
            fold_time_rank=value_fold_time_rank,
            unfold_time_rank=value_unfold_time_rank,
            scope="value-function-node"
        )

        self.add_components(self.value_network)

    @rlgraph_api
    def get_state_values(self, nn_inputs):
        """
        Returns the state value node's output.

        Args:
            nn_inputs (any): The input to our neural network.

        Returns:
            Dict:
                state_values: The value network's output for the NN outputs.
                nn_outputs: The raw NN outputs.
        """
        network_out = self.get_nn_outputs(nn_inputs)
        # The value network is always called on the NN outputs only.
        values = self.value_network.call(network_out)

        return {"state_values": values, "nn_outputs": network_out}

    @rlgraph_api
    def get_state_values_adapter_outputs_and_parameters(self, nn_inputs):
        """
        Returns the adapter outputs, distribution parameters and log-probs together with the
        state value node's output.

        Args:
            nn_inputs (any): The input to our neural network.

        Returns:
            Dict:
                nn_outputs: The raw NN outputs.
                state_values: The value network's output for the NN outputs.
                adapter_outputs: The (reshaped) logits from the ActionAdapter.
                parameters: The parameters for the distribution (gained from the softmaxed logits or
                    interpreting logits as mean and stddev for a normal distribution).
                log_probs: The log(probabilities) values.
        """
        network_out = self.get_nn_outputs(nn_inputs)
        adapter_out, dist_parameters, log_probabilities = \
            self._graph_fn_get_adapter_outputs_and_parameters(network_out)
        # The value network is always called on the NN outputs only.
        values = self.value_network.call(network_out)

        return {
            "nn_outputs": network_out,
            "state_values": values,
            "adapter_outputs": adapter_out,
            "parameters": dist_parameters,
            "log_probs": log_probabilities,
        }

    def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        """Obsoleted: always raises, pointing to `get_state_values_adapter_outputs_and_parameters`."""
        raise RLGraphObsoletedError(
            "API-method",
            "get_state_values_logits_probabilities_log_probs",
            "get_state_values_adapter_outputs_and_parameters"
        )

    def get_state_values_logits_parameters_log_probs(self, nn_input, internal_states=None):
        """Obsoleted: always raises, pointing to `get_state_values_adapter_outputs_and_parameters`."""
        raise RLGraphObsoletedError(
            "API-method",
            "get_state_values_logits_parameters_log_probs",
            "get_state_values_adapter_outputs_and_parameters"
        )
def __init__(self, action_space, add_units=0, units=None, weights_spec=None, biases_spec=None,
             activation=None, pre_network_spec=None, scope="action-adapter", **kwargs):
    """
    Assembles the adapter's network: optional pre-network, then a dense action layer, then a
    reshape matching the action space.

    Args:
        action_space (Space): The action Space within which this Component will create actions.
        add_units (Optional[int]): An optional number of units to add to the auto-calculated number of
            action-layer nodes. Can be negative to subtract units from the auto-calculated value.
            NOTE: Only one of either `add_units` or `units` must be provided.
        units (Optional[int]): An optional number of units to use for the action-layer. If None, will
            calculate the number of units automatically from the given action_space.
            NOTE: Only one of either `add_units` or `units` must be provided.
        weights_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize
            the weights of the action layer. Default: None (use default initializer).
        biases_spec (Optional[any]): An optional RLGraph Initializer spec that will be used to initialize
            the biases of the action layer. Default: None (use default initializer, which is usually 0.0).
        activation (Optional[str]): The activation function to use for the action layer.
            Default: None (=linear).
        pre_network_spec (Optional[dict,NeuralNetwork]): A spec dict for a neural network coming before
            the last action layer. If None, only the action layer itself is applied.
    """
    # Store the batch-ranked action space; container spaces are rejected.
    self.action_space = action_space.with_batch_rank()
    assert not isinstance(self.action_space, ContainerSpace),\
        "ERROR: ActionAdapter cannot handle ContainerSpaces!"

    is_int_box = isinstance(self.action_space, IntBox)

    # Number of action-layer nodes: either the given `units`, or `add_units` plus a value derived
    # from the space (categories for IntBox, otherwise two values per flat dimension).
    if units is None:
        if is_int_box:
            auto_units = self.action_space.flat_dim_with_categories
        else:
            # Those two dimensions are the mean and log sd.
            auto_units = 2 * self.action_space.flat_dim
        units = add_units + auto_units

    # Target shape for the final reshape.
    if is_int_box:
        new_shape = self.action_space.get_shape(with_category_rank=True)
    else:
        # Manually add moments after batch/time ranks.
        new_shape = (2,) + tuple(self.action_space.shape)

    assert units > 0, "ERROR: Number of nodes for action-layer calculated as {}! Must be larger 0.".format(
        units)

    action_layer = DenseLayer(
        units=units,
        activation=activation,
        weights_spec=weights_spec,
        biases_spec=biases_spec,
        scope="action-layer"
    )

    # Start from the pre-network spec, then append the action layer and the reshape.
    self.network = NeuralNetwork.from_spec(pre_network_spec, scope="action-network")  # type: NeuralNetwork
    self.network.add_layer(action_layer)
    self.network.add_layer(ReShape(new_shape=new_shape))

    super(ActionAdapter, self).__init__(self.network, scope=scope, **kwargs)
def __init__(self, network_spec, action_space=None, action_adapter_spec=None, max_likelihood=True,
             scope="policy", **kwargs):
    """
    Builds the policy's network, its ActionAdapter and its Distribution. For a
    BaselineActionAdapter, additionally defines the API-method
    `get_state_values_logits_probabilities_log_probs`; for IMPALA networks that method folds
    the time rank before the adapter and unfolds it again on each output.

    Args:
        network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict
            to build one.
        action_space (Space): The action Space within which this Component will create actions.
            If None, the ActionAdapter's own `action_space` is used.
        action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the
            default ActionAdapter object.
        max_likelihood (bool): Whether to pick actions according to the max-likelihood value or via
            sampling. Default: True.

    Raises:
        RLGraphError: If the action space is neither an IntBox nor a FloatBox.
    """
    super(Policy, self).__init__(scope=scope, **kwargs)

    self.neural_network = NeuralNetwork.from_spec(network_spec)
    if action_space is None:
        self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
        action_space = self.action_adapter.action_space
    else:
        self.action_adapter = ActionAdapter.from_spec(action_adapter_spec, action_space=action_space)
    self.action_space = action_space
    self.max_likelihood = max_likelihood

    # TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
    # TODO: Replace entirely via sonnet-like BatchApply Component.
    is_impala = "IMPALANetwork" in type(self.neural_network).__name__

    # Add API-method to get baseline output (if we use an extra value function baseline node).
    if isinstance(self.action_adapter, BaselineActionAdapter):
        # TODO: IMPALA attempt to speed up final pass after LSTM.
        if is_impala:
            self.time_rank_folder = ReShape(fold_time_rank=True, scope="time-rank-fold")
            self.time_rank_unfolder_v = ReShape(
                unfold_time_rank=True, time_major=True, scope="time-rank-unfold-v"
            )
            self.time_rank_unfolder_a_probs = ReShape(
                unfold_time_rank=True, time_major=True, scope="time-rank-unfold-a-probs"
            )
            self.time_rank_unfolder_logits = ReShape(
                unfold_time_rank=True, time_major=True, scope="time-rank-unfold-logits"
            )
            self.time_rank_unfolder_log_probs = ReShape(
                unfold_time_rank=True, time_major=True, scope="time-rank-unfold-log-probs"
            )
            # The only registration of the folding components: they exist only in this branch.
            self.add_components(
                self.time_rank_folder, self.time_rank_unfolder_v, self.time_rank_unfolder_a_probs,
                self.time_rank_unfolder_log_probs, self.time_rank_unfolder_logits
            )

        @rlgraph_api(component=self)
        def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
            nn_output = self.neural_network.apply(nn_input, internal_states)
            last_internal_states = nn_output.get("last_internal_states")
            nn_output = nn_output["output"]

            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                nn_output = self.time_rank_folder.apply(nn_output)

            out = self.action_adapter.get_logits_probabilities_log_probs(nn_output)

            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                # NOTE(review): the unfolders receive the already-folded `nn_output` as their
                # second argument -- confirm that this is the intended reference input.
                state_values = self.time_rank_unfolder_v.apply(out["state_values"], nn_output)
                logits = self.time_rank_unfolder_logits.apply(out["logits"], nn_output)
                probs = self.time_rank_unfolder_a_probs.apply(out["probabilities"], nn_output)
                log_probs = self.time_rank_unfolder_log_probs.apply(out["log_probs"], nn_output)
            else:
                state_values = out["state_values"]
                logits = out["logits"]
                probs = out["probabilities"]
                log_probs = out["log_probs"]

            return dict(
                state_values=state_values, logits=logits, probabilities=probs, log_probs=log_probs,
                last_internal_states=last_internal_states
            )

    # Figure out our Distribution.
    if isinstance(action_space, IntBox):
        self.distribution = Categorical()
    # Continuous action space -> Normal distribution (each action needs mean and variance from network).
    elif isinstance(action_space, FloatBox):
        self.distribution = Normal()
    else:
        raise RLGraphError(
            "ERROR: `action_space` is of type {} and not allowed in {} Component!"
            .format(type(action_space).__name__, self.name)
        )

    # The folding components are deliberately not added again here. A second
    # `if is_impala: add_components(...)` would register them twice, or raise AttributeError for
    # an IMPALA network without a BaselineActionAdapter, where they were never created.
    self.add_components(self.neural_network, self.action_adapter, self.distribution)
def __init__(
    self, action_space, world_option_model_network, encoder_network, num_features, num_mixtures, beta=0.2,
    post_phi_concat_network=None, reward_clipping=1.0, intrinsic_rewards_weight=0.1,
    concat_with_command_vector=False, optimizer=None, deterministic=False,
    scope="intrinsic-curiosity-world-option-model", **kwargs
):
    """
    Builds the state encoder, the two negative-log-likelihood losses and the predictor network
    (with a custom `call` method), then initializes the parent with them.

    Args:
        action_space (Space): The action Space to be fed into the model together with the latent feature
            vector for the states. Will be flattened automatically and then concatenated by this component.
        world_option_model_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly)
            to construct the world-option-model's neural network.
        encoder_network (Union[NeuralNetwork,dict]): A specification dict (or NN object directly) to
            construct the inverse dynamics model's encoder network leading from s to phi (feature vector).
        num_features (int): The size of the feature vectors phi.
        num_mixtures (int): The number of mixture Normals to use for the next-state distribution output.
        beta (float): The weight for the phi' loss (action loss is then 1.0 - beta). Must lie strictly
            between 0 and 1.
        post_phi_concat_network: Spec used to build the network mapping the concatenated phi/phi'
            vectors to the action prediction.
        reward_clipping (float): 0.0 for no clipping, some other value for +/- reward value clipping.
            Default: 1.0.
        intrinsic_rewards_weight (float): Stored on `self.intrinsic_rewards_weight`; not used otherwise
            in this constructor.
        concat_with_command_vector (bool): If True, this model needs an additional command vector (coming
            from the policy above) to concat it together with the latent state vector.
            Currently not read by this constructor.
        optimizer (Optional[Optimizer]): The optimizer to use for supervised learning of the two networks
            (ICM and WOM). Default: Adam with a learning rate of 3e-4.
        deterministic (bool): Stored on `self.deterministic` and passed on in the predictor spec.
    """
    # Plain settings first (these are assigned before the parent constructor runs).
    self.num_features = num_features
    self.num_mixtures = num_mixtures
    self.deterministic = deterministic
    self.beta = beta
    assert 0.0 < self.beta < 1.0, "ERROR: `beta` must be between 0 and 1!"
    self.reward_clipping = reward_clipping
    self.intrinsic_rewards_weight = intrinsic_rewards_weight

    # Create the encoder network inside a SupervisedPredictor (so we get the adapter + distribution with it).
    encoder_output_space = FloatBox(shape=(num_features,), add_batch_rank=True)
    self.state_encoder = SupervisedPredictor(
        network_spec=encoder_network, output_space=encoder_output_space, scope="state-encoder"
    )

    # The two prediction losses, both negative log likelihood:
    # a) observed action vs. the parameterized action distribution,
    # b) observed s' features vs. the parameterized mixture distribution.
    action_loss = NegativeLogLikelihoodLoss(
        distribution_spec=get_default_distribution_from_space(action_space), scope="action-loss"
    )
    mixture_spec = dict(type="mixture", _args=["multi-variate-normal" for _ in range(num_mixtures)])
    phi_loss = NegativeLogLikelihoodLoss(distribution_spec=mixture_spec, scope="phi-loss")
    self.loss_functions = {"predicted_actions": action_loss, "predicted_phi_": phi_loss}

    # TODO: Support for command vector concatenation.

    # Define the Model's network's custom call method.
    def custom_call(self, inputs):
        sub = self.get_sub_component_by_name

        phi = inputs["phi"]
        actions = inputs["actions"]
        phi_ = inputs["phi_"]

        actions_flat = sub("action-flattener").call(actions)
        concat_phis = sub("concat-phis").call(phi, phi_)

        # Predict the action that led from s to s'.
        predicted_actions = sub("post-phi-concat-nn").call(concat_phis)

        # Concat phi with flattened actions.
        # No stop-gradient is applied to phi before predicting phi' (a stop-gradient on
        # `phi_and_actions` is deliberately not used here).
        phi_and_actions = sub("concat-states-and-actions").call(phi, actions_flat)

        # Predict phi' (through a mixture gaussian distribution).
        predicted_phi_ = sub("wom-nn").call(phi_and_actions)

        # Predictions (actions and next-state-features (mixture distribution)).
        return {"predicted_actions": predicted_actions, "predicted_phi_": predicted_phi_}

    # Create the SupervisedPredictor's neural network from its sub-components.
    predictor_parts = [
        # The world option model network taking action-cat-phi and mapping them to the predicted phi'.
        NeuralNetwork.from_spec(world_option_model_network, scope="wom-nn"),
        # The concat component concatenating both latent state vectors (phi and phi').
        ConcatLayer(scope="concat-phis"),
        # The NN mapping from phi-cat-phi' to the action prediction.
        NeuralNetwork.from_spec(post_phi_concat_network, scope="post-phi-concat-nn"),
        # The ReShape component for flattening all actions in arbitrary action spaces.
        ReShape(flatten=True, flatten_categories=True, flatten_containers=True, scope="action-flattener"),
        # The concat component concatenating latent state feature vector and incoming (flattened) actions.
        ConcatLayer(scope="concat-states-and-actions"),
    ]
    # Set the `call` method.
    predictor_network = NeuralNetwork(*predictor_parts, api_methods={("call", custom_call)})

    if optimizer is None:
        optimizer = dict(type="adam", learning_rate=3e-4)

    predictor_output_space = Dict(
        {
            "predicted_actions": action_space,
            "predicted_phi_": FloatBox(shape=(self.num_features,))
        },
        add_batch_rank=action_space.has_batch_rank,
        add_time_rank=action_space.has_time_rank
    )
    predictor_spec = dict(
        network_spec=predictor_network,
        output_space=predictor_output_space,
        distribution_adapter_spec=dict(
            # for `predicted_actions`: use default adapter
            # for predicted_phi': use normal-mixture adapter & distribution.
            predicted_phi_={"type": "normal-mixture-adapter", "num_mixtures": num_mixtures}
        ),
        deterministic=deterministic
    )

    super(IntrinsicCuriosityWorldOptionModel, self).__init__(
        predictor=predictor_spec,
        loss_function=self.loss_functions["predicted_actions"],
        optimizer=optimizer,
        scope=scope,
        **kwargs
    )

    self.add_components(self.state_encoder, self.loss_functions["predicted_phi_"])