Example #1
def make_model_and_action_dist(policy, obs_space, action_space, config):
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    device = (torch.device("cuda")
              if torch.cuda.is_available() else torch.device("cpu"))
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_ensemble",
    ).to(device)

    action_dist, num_outputs = ModelCatalog.get_action_dist(action_space,
                                                            config,
                                                            framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
Example #2
def build_model_and_distribution(policy, obs_space, action_space, config):

    if isinstance(action_space, Discrete):
        num_outputs = action_space.n
        dist = TorchCategorical
    else:
        num_outputs = np.prod(action_space.shape) * 2
        dist = TorchDiagGaussian

    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        framework="torch",
        model_interface=FullyConnectedNetwork,
        name="ac",
        model_config=config["model"],
    )
    policy.model_variables = policy.model.variables()

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        framework="torch",
        model_interface=FullyConnectedNetwork,
        name="ac_target",
        model_config=config["model"],
    )
    policy.target_model_variables = policy.target_model.variables()

    return policy.model, dist
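
The branch above doubles the output count for continuous action spaces because TorchDiagGaussian expects one mean and one log-std per action dimension, while TorchCategorical needs exactly one logit per discrete action. A small self-contained check of that arithmetic (hypothetical spaces, not part of the original snippet):

import numpy as np
from gym.spaces import Box, Discrete

# One logit per discrete action.
assert Discrete(4).n == 4
# Mean + log-std per action dimension for a diagonal Gaussian.
box = Box(-1.0, 1.0, shape=(3,))
assert int(np.prod(box.shape) * 2) == 6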
Example #3
def build_q_model_and_distribution(policy, obs_space, action_space, config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])

    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])

    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchMultiObjCategorical
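
In the snippet above, when `config["hiddens"]` is non-empty, the Q/dueling heads defined by `hiddens` sit on top of the model's last hidden layer, so `num_outputs` is taken from `fcnet_hiddens` (falling back to 256 for an empty list) rather than from `action_space.n`. A tiny sketch of just that fallback expression:

# ([256] + fcnet_hiddens)[-1] picks the last hidden size, or 256 if none given.
for fcnet_hiddens in ([], [64], [512, 256, 128]):
    num_outputs = ([256] + fcnet_hiddens)[-1]
    print(fcnet_hiddens, "->", num_outputs)  # [] -> 256, [64] -> 64, [...] -> 128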
Example #4
def build_ddpg_models(
    policy: Policy,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = TorchNoopModel if config[
            "framework"] == "torch" else NoopModel
        num_outputs = int(np.product(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel
                         if config["framework"] == "torch" else DDPGTFModel),
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework=config["framework"],
        model_interface=(DDPGTorchModel
                         if config["framework"] == "torch" else DDPGTFModel),
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
Example #5
def make_model_and_action_dist(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]:
    """Constructs the necessary ModelV2 and action dist class for the Policy.

    Args:
        policy (Policy): The Policy that will use the models.
        obs_space (gym.spaces.Space): The observation space.
        action_space (gym.spaces.Space): The action space.
        config (TrainerConfigDict): The Trainer's config dict.

    Returns:
        Tuple[ModelV2, Type[TorchDistributionWrapper]]: The ModelV2 and the
            action distribution class for the Policy to use. Note: An
            additional dynamics model is created in this function and assigned
            to `policy.dynamics_model`.
    """
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_ensemble",
    ).to(device)

    action_dist, num_outputs = ModelCatalog.get_action_dist(action_space,
                                                            config,
                                                            framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
Example #6
    def test_conv2d_default_stacks(self):
        """Tests whether conv2d defaults are available for image obs spaces.
        """
        action_space = gym.spaces.Discrete(2)

        shapes = [
            (480, 640, 3),
            (240, 320, 3),
            (96, 96, 3),
            (84, 84, 3),
            (42, 42, 3),
            (10, 10, 3),
        ]
        for shape in shapes:
            print(f"shape={shape}")
            obs_space = gym.spaces.Box(-1.0, 1.0, shape=shape)
            for fw in framework_iterator():
                model = ModelCatalog.get_model_v2(obs_space,
                                                  action_space,
                                                  2,
                                                  MODEL_DEFAULTS.copy(),
                                                  framework=fw)
                self.assertTrue(isinstance(model,
                                           (VisionNetwork, TorchVision)))
                if fw == "torch":
                    output, _ = model(
                        {"obs": torch.from_numpy(obs_space.sample()[None])})
                else:
                    output, _ = model({"obs": obs_space.sample()[None]})
                # B x [action logits]
                self.assertTrue(output.shape == (1, 2))
                print("ok")
Example #7
    def build_q_model(self, obs_space, action_space, num_outputs,
                      q_model_config, name):
        """Builds one of the (twin) Q-nets used by this SAC.

        Override this method in a sub-class of SACTFModel to implement your
        own Q-nets. Alternatively, simply set `custom_model` within the
        top level SAC `q_model_config` config key to make this default implementation
        of `build_q_model` use your custom Q-nets.

        Returns:
            TFModelV2: The TFModelV2 Q-net sub-model.
        """
        self.concat_obs_and_actions = False
        if self.discrete:
            input_space = obs_space
        else:
            orig_space = getattr(obs_space, "original_space", obs_space)
            if isinstance(orig_space, Box) and len(orig_space.shape) == 1:
                input_space = Box(
                    float("-inf"),
                    float("inf"),
                    shape=(orig_space.shape[0] + action_space.shape[0],),
                )
                self.concat_obs_and_actions = True
            else:
                input_space = gym.spaces.Tuple([orig_space, action_space])

        model = ModelCatalog.get_model_v2(
            input_space,
            action_space,
            num_outputs,
            q_model_config,
            framework="tf",
            name=name,
        )
        return model
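
The docstring above notes that, instead of overriding `build_q_model`, you can point this default implementation at your own Q-net via `custom_model` inside the top-level `q_model_config` key. A minimal, hedged sketch of that wiring; `MyQNet` is a hypothetical TFModelV2 subclass defined elsewhere, and the exact top-level key name may differ across RLlib versions:

from ray.rllib.models import ModelCatalog

# Register the (hypothetical) custom Q-net class under a string name.
ModelCatalog.register_custom_model("my_q_net", MyQNet)

sac_config_overrides = {
    "q_model_config": {
        "custom_model": "my_q_net",     # picked up by build_q_model above
        "custom_model_config": {},      # extra kwargs forwarded to MyQNet
    },
}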
Example #8
    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # TODO: (sven) Support Dicts as well.
        assert isinstance(obs_space.original_space, (Tuple)), \
            "`obs_space.original_space` must be Tuple!"

        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)

        # Build the CNN(s) given obs_space's image components.
        self.cnns = {}
        concat_size = 0
        for i, component in enumerate(obs_space.original_space):
            # Image space.
            if len(component.shape) == 3:
                config = {
                    "conv_filters":
                    model_config.get("conv_filters",
                                     get_filter_config(component.shape)),
                    "conv_activation":
                    model_config.get("conv_activation"),
                }
                cnn = ModelCatalog.get_model_v2(component,
                                                action_space,
                                                num_outputs=None,
                                                model_config=config,
                                                framework="tf",
                                                name="cnn_{}".format(i))
                concat_size += cnn.num_outputs
                self.cnns[i] = cnn
            # Discrete inputs -> One-hot encode.
            elif isinstance(component, Discrete):
                concat_size += component.n
            # TODO: (sven) Multidiscrete (see e.g. our auto-LSTM wrappers).
            # Everything else (1D Box).
            else:
                assert len(component.shape) == 1, \
                    "Only input Box 1D or 3D spaces allowed!"
                concat_size += component.shape[-1]

        self.logits_and_value_model = None
        self._value_out = None
        if num_outputs:
            # Action-distribution head.
            concat_layer = tf.keras.layers.Input((concat_size, ))
            logits_layer = tf.keras.layers.Dense(
                num_outputs,
                activation=tf.keras.activations.linear,
                name="logits")(concat_layer)

            # Create the value branch model.
            value_layer = tf.keras.layers.Dense(
                1,
                name="value_out",
                activation=None,
                kernel_initializer=normc_initializer(0.01))(concat_layer)
            self.logits_and_value_model = tf.keras.models.Model(
                concat_layer, [logits_layer, value_layer])
        else:
            self.num_outputs = concat_size
Example #9
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            if get_default_config:
                config = dict(get_default_config(), **config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config

            if action_sampler_fn:
                if not make_model:
                    raise ValueError(
                        "make_model is required if action_sampler_fn is given")
                self.dist_class = None
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework="tf",
                )

            self.model({
                SampleBatch.CUR_OBS: tf.convert_to_tensor(
                    np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
                    [_flatten_action(action_space.sample())]),
                SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
            }, [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ], tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.train.AdamOptimizer(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
Example #10
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if before_init:
                before_init(self, obs_space, action_space, config)

            if make_model_and_action_dist:
                self.model, self.dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], torch=True)
                self.model = ModelCatalog.get_model_v2(obs_space,
                                                       action_space,
                                                       logit_dim,
                                                       self.config["model"],
                                                       framework="torch")

            TorchPolicy.__init__(self, obs_space, action_space, config,
                                 self.model, loss_fn, self.dist_class)

            if after_init:
                after_init(self, obs_space, action_space, config)
Example #11
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if before_init:
                before_init(self, obs_space, action_space, config)

            if make_model_and_action_dist:
                self.model, self.dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
                # Make sure we passed in a correct Model factory.
                assert isinstance(self.model, TorchModelV2), \
                    "ERROR: TorchPolicy::make_model_and_action_dist must " \
                    "return a TorchModelV2 object!"
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(obs_space,
                                                       action_space,
                                                       logit_dim,
                                                       self.config["model"],
                                                       framework="torch")

            TorchPolicy.__init__(self, obs_space, action_space, config,
                                 self.model, loss_fn, self.dist_class)

            if after_init:
                after_init(self, obs_space, action_space, config)
Example #12
    def __init__(self, obs_space, action_space, config):
        model = ModelCatalog.get_model_v2(obs_space, action_space,
                                          action_space.n, config["model"],
                                          "torch")
        _, env_creator = Trainer._get_env_id_and_creator(config["env"], config)
        if config["ranked_rewards"]["enable"]:
            # if r2 is enabled, the env is wrapped to include a rewards buffer
            # used to normalize rewards
            env_cls = get_r2_env_wrapper(env_creator, config["ranked_rewards"])

            # the wrapped env is used only in the mcts, not in the
            # rollout workers
            def _env_creator():
                return env_cls(config["env_config"])

        else:

            def _env_creator():
                return env_creator(config["env_config"])

        def mcts_creator():
            return MCTS(model, config["mcts_config"])

        super().__init__(
            obs_space,
            action_space,
            config,
            model,
            alpha_zero_loss,
            TorchCategorical,
            mcts_creator,
            _env_creator,
        )
Example #13
    def _init_model_and_dist_class(self):
        if is_overridden(self.make_model) and is_overridden(
                self.make_model_and_action_dist):
            raise ValueError(
                "Only one of make_model or make_model_and_action_dist "
                "can be overridden.")

        if is_overridden(self.make_model):
            model = self.make_model()
            dist_class, _ = ModelCatalog.get_action_dist(
                self.action_space,
                self.config["model"],
                framework=self.framework)
        elif is_overridden(self.make_model_and_action_dist):
            model, dist_class = self.make_model_and_action_dist()
        else:
            dist_class, logit_dim = ModelCatalog.get_action_dist(
                self.action_space,
                self.config["model"],
                framework=self.framework)
            model = ModelCatalog.get_model_v2(
                obs_space=self.observation_space,
                action_space=self.action_space,
                num_outputs=logit_dim,
                model_config=self.config["model"],
                framework=self.framework,
            )
        return model, dist_class
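
A hedged sketch of how the second branch above gets taken: a policy subclass (here a hypothetical `MyTorchPolicy` based on RLlib's TorchPolicyV2; import paths vary across Ray versions) overrides `make_model_and_action_dist` and returns both the model and the distribution class, so the catalog defaults are skipped entirely:

from ray.rllib.models.torch.torch_action_dist import TorchCategorical
from ray.rllib.policy.torch_policy_v2 import TorchPolicyV2


class MyTorchPolicy(TorchPolicyV2):
    def make_model_and_action_dist(self):
        # MyModel is a hypothetical TorchModelV2 subclass defined elsewhere.
        model = MyModel(
            self.observation_space,
            self.action_space,
            num_outputs=self.action_space.n,
            model_config=self.config["model"],
            name="my_model",
        )
        return model, TorchCategorical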
Example #14
def make_appo_model(
    policy: Policy,
    obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: TrainerConfigDict,
) -> ModelV2:
    """Builds model and target model for APPO.

    Args:
        policy (Policy): The Policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict):

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target model will not be returned, just assigned to
            `policy.target_model`.
    """
    # Get the num_outputs for the following model construction calls.
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    # Construct the (main) model.
    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="torch" if config["framework"] == "torch" else "tf",
    )
    policy.model_variables = policy.model.variables()

    # Construct the target model.
    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="torch" if config["framework"] == "torch" else "tf",
    )
    policy.target_model_variables = policy.target_model.variables()

    # Return only the model (not the target model).
    return policy.model
Example #15
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if validate_spaces:
                validate_spaces(self, obs_space, action_space, self.config)

            if before_init:
                before_init(self, obs_space, action_space, self.config)

            # Model is customized (use default action dist class).
            if make_model:
                assert make_model_and_action_dist is None, \
                    "Either `make_model` or `make_model_and_action_dist`" \
                    " must be None!"
                self.model = make_model(self, obs_space, action_space, config)
                dist_class, _ = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
            # Model and action dist class are customized.
            elif make_model_and_action_dist:
                self.model, dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
            # Use default model and default action dist.
            else:
                dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(
                    obs_space=obs_space,
                    action_space=action_space,
                    num_outputs=logit_dim,
                    model_config=self.config["model"],
                    framework="torch")

            # Make sure we passed in a correct Model factory.
            assert isinstance(self.model, TorchModelV2), \
                "ERROR: Generated Model must be a TorchModelV2 object!"

            policy_class.__init__(
                self,
                observation_space=obs_space,
                action_space=action_space,
                config=config,
                model=self.model,
                loss=loss_fn,
                action_distribution_class=dist_class,
                action_sampler_fn=action_sampler_fn,
                action_distribution_fn=action_distribution_fn,
                max_seq_len=config["model"]["max_seq_len"],
                get_batch_divisibility_req=get_batch_divisibility_req,
            )

            if callable(training_view_requirements_fn):
                self.training_view_requirements.update(
                    training_view_requirements_fn(self))

            if after_init:
                after_init(self, obs_space, action_space, config)
Example #16
    def make_model(self) -> ModelV2:
        # copying ddpg build model to here to be explicit
        model_config = self.config["model"]
        model_config.update(
            dict(
                actor_hidden_activation=self.config["actor_hidden_activation"],
                actor_hiddens=self.config["actor_hiddens"],
                critic_hidden_activation=self.
                config["critic_hidden_activation"],
                critic_hiddens=self.config["critic_hiddens"],
                twin_q=self.config["twin_q"],
            ))
        num_outputs = int(np.product(self.observation_space.shape))

        # TODO: why do we even have to go through this get_model_v2 function?
        self.model = ModelCatalog.get_model_v2(
            obs_space=self.observation_space,
            action_space=self.action_space,
            num_outputs=num_outputs,
            model_config=model_config,
            framework=self.config["framework"],
            # use this model for interface (get_q, get_q_twin, etc.)
            model_interface=CRRModel,
            default_model=TorchNoopModel,
            name="model",
        )

        # TODO: this is a bad python pattern to assign attributes that do not exist in
        #  the constructor
        self.target_model = ModelCatalog.get_model_v2(
            obs_space=self.observation_space,
            action_space=self.action_space,
            num_outputs=num_outputs,
            model_config=model_config,
            framework=self.config["framework"],
            # use this model for interface (get_q, get_q_twin, etc.)
            model_interface=CRRModel,
            default_model=TorchNoopModel,
            name="target_model",
        )

        return self.model
Example #17
def build_dreamer_model(policy, obs_space, action_space, config):

    policy.model = ModelCatalog.get_model_v2(obs_space,
                                             action_space,
                                             1,
                                             config["dreamer_model"],
                                             name="DreamerModel",
                                             framework="torch")

    policy.model_variables = policy.model.variables()

    return policy.model, None
Example #18
    def make_model(self):

        model = ModelCatalog.get_model_v2(
            self.observation_space,
            self.action_space,
            1,
            self.config["dreamer_model"],
            name="DreamerModel",
            framework="torch",
        )

        self.model_variables = model.variables()

        return model
Example #19
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if before_init:
                before_init(self, obs_space, action_space, config)

            # Model is customized (use default action dist class).
            if make_model:
                assert make_model_and_action_dist is None
                self.model = make_model(self, obs_space, action_space, config)
                dist_class, _ = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
            # Model and action dist class are customized.
            elif make_model_and_action_dist:
                self.model, dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
            # Use default model and default action dist.
            else:
                dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(
                    obs_space=obs_space,
                    action_space=action_space,
                    num_outputs=logit_dim,
                    model_config=self.config["model"],
                    framework="torch",
                    **self.config["model"].get("custom_model_config", {}))

            # Make sure we passed in a correct Model factory.
            assert isinstance(self.model, TorchModelV2), \
                "ERROR: Generated Model must be a TorchModelV2 object!"

            TorchPolicy.__init__(
                self,
                observation_space=obs_space,
                action_space=action_space,
                config=config,
                model=self.model,
                loss=loss_fn,
                action_distribution_class=dist_class,
                action_sampler_fn=action_sampler_fn,
                action_distribution_fn=action_distribution_fn,
                max_seq_len=config["model"]["max_seq_len"],
                get_batch_divisibility_req=get_batch_divisibility_req,
            )

            if after_init:
                after_init(self, obs_space, action_space, config)
Example #20
def make_appo_model(policy) -> ModelV2:
    """Builds model and target model for APPO.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target model will not be returned, just assigned to
            `policy.target_model`.
    """
    # Get the num_outputs for the following model construction calls.
    _, logit_dim = ModelCatalog.get_action_dist(
        policy.action_space, policy.config["model"]
    )

    # Construct the (main) model.
    policy.model = ModelCatalog.get_model_v2(
        policy.observation_space,
        policy.action_space,
        logit_dim,
        policy.config["model"],
        name=POLICY_SCOPE,
        framework=policy.framework,
    )
    policy.model_variables = policy.model.variables()

    # Construct the target model.
    policy.target_model = ModelCatalog.get_model_v2(
        policy.observation_space,
        policy.action_space,
        logit_dim,
        policy.config["model"],
        name=TARGET_POLICY_SCOPE,
        framework=policy.framework,
    )
    policy.target_model_variables = policy.target_model.variables()

    # Return only the model (not the target model).
    return policy.model
Example #21
def make_model_and_dist(policy, obs_space, action_space, config):
    # Get the output distribution class for predicting rewards and next-obs.
    policy.distr_cls_next_obs, num_outputs = ModelCatalog.get_action_dist(
        obs_space, config, dist_type="deterministic", framework="torch")
    if config["predict_reward"]:
        # TODO: (sven) implement reward prediction.
        _ = ModelCatalog.get_action_dist(gym.spaces.Box(
            float("-inf"), float("inf"), ()),
                                         config,
                                         dist_type="")

    # Build one dynamics model if we are a Worker.
    # If we are the main MAML learner, build n (num_workers) dynamics Models
    # for being able to create checkpoints for the current state of training.
    policy.dynamics_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["dynamics_model"],
        framework="torch",
        name="dynamics_model",
        model_interface=DYNATorchModel,
    )

    action_dist, num_outputs = ModelCatalog.get_action_dist(
        action_space, config, dist_type="deterministic", framework="torch")
    # Create the pi-model and register it with the Policy.
    policy.pi = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        name="policy_model",
    )

    return policy.pi, action_dist
Example #22
def make_q_models(policy):
    if not isinstance(policy.action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            f"Action space {policy.action_space} is not supported for DQN.")

    model = ModelCatalog.get_model_v2(
        obs_space=policy.observation_space,
        action_space=policy.action_space,
        num_outputs=policy.action_space.n,
        model_config=policy.config["model"],
        framework=policy.config["framework"],
        name=Q_SCOPE,
    )

    target_model = ModelCatalog.get_model_v2(
        obs_space=policy.observation_space,
        action_space=policy.action_space,
        num_outputs=policy.action_space.n,
        model_config=policy.config["model"],
        framework=policy.config["framework"],
        name=Q_TARGET_SCOPE,
    )

    return model, target_model
Example #23
    def make_model(self) -> ModelV2:
        """Build underlying model for this Policy.

        Returns:
            The Model for the Policy to use.
        """
        # Default ModelV2 model.
        _, logit_dim = ModelCatalog.get_action_dist(self.action_space,
                                                    self.config["model"])
        return ModelCatalog.get_model_v2(
            self.observation_space,
            self.action_space,
            logit_dim,
            self.config["model"],
            framework=self.framework,
        )
Example #24
    def __init__(self, obs_space, act_space, config):
        super(LearnableSignalerPolicy, self).__init__(obs_space, act_space,
                                                      config)
        self.framework = "torch"
        self.exploration = self._create_exploration()
        self.n_signals = act_space.spaces[0].n if isinstance(
            act_space, Tuple) else act_space.n
        self.device = (torch.device("cuda")
                       if torch.cuda.is_available() else torch.device("cpu"))
        self.model = ModelCatalog.get_model_v2(
            MultiDiscrete([2]),
            Discrete(self.n_signals),
            self.n_signals,
            config['sig_model'],
            framework="torch",
            name="SignalerNet",
            default_model=FullyConnectedNetwork)
Example #25
    def build_policy_model(self, obs_space, num_outputs, policy_model_config,
                           name):
        """Builds the policy model used by this SAC.

        Override this method in a sub-class of SACTorchModel to implement your
        own policy net. Alternatively, simply set `custom_model` within the
        top level SAC `policy_model_config` config key to make this default
        implementation of `build_policy_model` use your custom policy network.

        Returns:
            TorchModelV2: The TorchModelV2 policy sub-model.
        """
        model = ModelCatalog.get_model_v2(obs_space,
                                          self.action_space,
                                          num_outputs,
                                          policy_model_config,
                                          framework="torch",
                                          name=name)
        return model
Example #26
def build_model_and_distribution(policy, obs_space, action_space, config):

    if isinstance(action_space, Discrete):
        num_outputs = action_space.n
        dist = TorchCategorical
    else:
        num_outputs = np.prod(action_space.shape) * 2
        dist = TorchDiagGaussian

    model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                      action_space=action_space,
                                      num_outputs=num_outputs,
                                      framework="torch",
                                      model_interface=FCN_MultiV_MultiObj,
                                      name="ac",
                                      model_config=config["model"],
                                      num_decompose=decompose)

    return model, dist
Example #27
        def __init__(self, obs_space, action_space, config):
            if get_default_config:
                config = dict(get_default_config(), **config)
            self.config = config

            if before_init:
                before_init(self, obs_space, action_space, config)

            if make_model_and_action_dist:
                self.model, dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
                # Make sure we passed in a correct Model factory.
                assert isinstance(self.model, TorchModelV2), \
                    "ERROR: TorchPolicy::make_model_and_action_dist must " \
                    "return a TorchModelV2 object!"
            else:
                dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(
                    obs_space=obs_space,
                    action_space=action_space,
                    num_outputs=logit_dim,
                    model_config=self.config["model"],
                    framework="torch",
                    **self.config["model"].get("custom_options", {}))

            TorchPolicy.__init__(
                self,
                obs_space,
                action_space,
                config,
                model=self.model,
                loss=loss_fn,
                action_distribution_class=dist_class,
                action_sampler_fn=action_sampler_fn,
                action_distribution_fn=action_distribution_fn,
                max_seq_len=config["model"]["max_seq_len"],
                get_batch_divisibility_req=get_batch_divisibility_req,
            )

            if after_init:
                after_init(self, obs_space, action_space, config)
Example #28
    def __init__(self, obs_space, action_space, config):
        # update policy attr for loss calculation
        self.minibatch_size = config['sgd_minibatch_size']
        self.ppo_epochs = config['ppo_epochs']
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"], framework='torch')
        self.model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=logit_dim,
                                               model_config=config["model"],
                                               framework='torch')

        super().__init__(obs_space,
                         action_space,
                         config,
                         model=self.model,
                         loss=ppo_surrogate_loss,
                         action_distribution_class=self.dist_class)

        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.config["lr"])
Example #29
    def __init__(self, obs_space, action_space, config):
        # update policy attr for loss calculation
        print('DROPPOPolicy init...')
        # self.framework = config['framework'] = 'torch'
        # self.kl_coeff = config['kl_coeff']
        # self.kl_target = config['kl_target']
        # self.entropy_coeff = config['entropy_coeff']
        # self.cur_lr = config['lr']
        # # setup ._value() for gae computation
        # self.setup_value(config)
        self.minibatch_size = config['sgd_minibatch_size']
        self.ppo_epochs = config['ppo_epochs']
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, config["model"], framework='torch')
        self.model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=logit_dim,
                                               model_config=config["model"],
                                               framework='torch')
        super().__init__(obs_space,
                         action_space,
                         config,
                         model=self.model,
                         loss=ppo_surrogate_loss,
                         action_distribution_class=self.dist_class)

        # Merge Model's view requirements into Policy's.
        # self.view_requirements.update(self.model.view_requirements)
        # init mixins
        # setup_mixins(self, obs_space, action_space, config)
        # Perform test runs through postprocessing- and loss functions.
        # self._initialize_loss_from_dummy_batch(
        #     auto_remove_unneeded_view_reqs=True,
        #     stats_fn=kl_and_loss_stats,
        # )
        self.view_requirements.update({'rewards': ViewRequirement()})
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.config["lr"])
Example #30
    def __init__(self,
                 action_space: Space,
                 *,
                 framework: str,
                 model: ModelV2,
                 feature_dim: int = 288,
                 feature_net_config: Optional[ModelConfigDict] = None,
                 inverse_net_hiddens: Tuple[int] = (256, ),
                 inverse_net_activation: str = "relu",
                 forward_net_hiddens: Tuple[int] = (256, ),
                 forward_net_activation: str = "relu",
                 beta: float = 0.2,
                 eta: float = 1.0,
                 lr: float = 1e-3,
                 sub_exploration: Optional[FromConfigSpec] = None,
                 **kwargs):
        """Initializes a Curiosity object.

        Uses as defaults the hyperparameters described in [1].

        Args:
             feature_dim (int): The dimensionality of the feature (phi)
                vectors.
             feature_net_config (Optional[ModelConfigDict]): Optional model
                configuration for the feature network, producing feature
                vectors (phi) from observations. This can be used to configure
                fcnet- or conv_net setups to properly process any observation
                space.
             inverse_net_hiddens (Tuple[int]): Tuple of the layer sizes of the
                inverse (action predicting) NN head (on top of the feature
                outputs for phi and phi').
             inverse_net_activation (str): Activation specifier for the inverse
                net.
             forward_net_hiddens (Tuple[int]): Tuple of the layer sizes of the
                forward (phi' predicting) NN head.
             forward_net_activation (str): Activation specifier for the forward
                net.
             beta (float): Weight for the forward loss (over the inverse loss,
                which gets weight=1.0-beta) in the common loss term.
             eta (float): Weight for intrinsic rewards before being added to
                extrinsic ones.
             lr (float): The learning rate for the curiosity-specific
                optimizer, optimizing feature-, inverse-, and forward nets.
             sub_exploration (Optional[FromConfigSpec]): The config dict for
                the underlying Exploration to use (e.g. epsilon-greedy for
                DQN). If None, uses the FromSpecDict provided in the Policy's
                default config.
        """
        if not isinstance(action_space, (Discrete, MultiDiscrete)):
            raise ValueError(
                "Only (Multi)Discrete action spaces supported for Curiosity "
                "so far!")

        super().__init__(
            action_space, model=model, framework=framework, **kwargs)

        if self.policy_config["num_workers"] != 0:
            raise ValueError(
                "Curiosity exploration currently does not support parallelism."
                " `num_workers` must be 0!")

        self.feature_dim = feature_dim
        if feature_net_config is None:
            feature_net_config = self.policy_config["model"].copy()
        self.feature_net_config = feature_net_config
        self.inverse_net_hiddens = inverse_net_hiddens
        self.inverse_net_activation = inverse_net_activation
        self.forward_net_hiddens = forward_net_hiddens
        self.forward_net_activation = forward_net_activation

        self.action_dim = self.action_space.n if isinstance(
            self.action_space, Discrete) else np.sum(self.action_space.nvec)

        self.beta = beta
        self.eta = eta
        self.lr = lr
        # TODO: (sven) if sub_exploration is None, use Trainer's default
        #  Exploration config.
        if sub_exploration is None:
            raise NotImplementedError
        self.sub_exploration = sub_exploration

        # Creates modules/layers inside the actual ModelV2.
        self._curiosity_feature_net = ModelCatalog.get_model_v2(
            self.model.obs_space,
            self.action_space,
            self.feature_dim,
            model_config=self.feature_net_config,
            framework=self.framework,
            name="feature_net",
        )

        self._curiosity_inverse_fcnet = self._create_fc_net(
            [2 * self.feature_dim] + list(self.inverse_net_hiddens) +
            [self.action_dim],
            self.inverse_net_activation,
            name="inverse_net")

        self._curiosity_forward_fcnet = self._create_fc_net(
            [self.feature_dim + self.action_dim] + list(
                self.forward_net_hiddens) + [self.feature_dim],
            self.forward_net_activation,
            name="forward_net")

        # This is only used to select the correct action
        self.exploration_submodule = from_config(
            cls=Exploration,
            config=self.sub_exploration,
            action_space=self.action_space,
            framework=self.framework,
            policy_config=self.policy_config,
            model=self.model,
            num_workers=self.num_workers,
            worker_index=self.worker_index,
        )
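
The constructor arguments documented above are normally supplied through the policy's `exploration_config` rather than by instantiating the class directly. A hedged sketch of that config, with key names mirroring the constructor parameters and the documented defaults (`feature_net_config` shown with an illustrative value; `sub_exploration` must be set, since the code above raises NotImplementedError when it is None):

exploration_config = {
    "type": "Curiosity",             # use this Exploration sub-class
    "feature_dim": 288,              # dimensionality of the phi vectors
    "feature_net_config": {"fcnet_hiddens": [256]},
    "inverse_net_hiddens": [256],
    "inverse_net_activation": "relu",
    "forward_net_hiddens": [256],
    "forward_net_activation": "relu",
    "beta": 0.2,                     # weight of the forward loss
    "eta": 1.0,                      # intrinsic-reward scale
    "lr": 1e-3,                      # lr for the curiosity-specific optimizer
    "sub_exploration": {"type": "StochasticSampling"},
}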