Example #1
def build_sac_model(policy, obs_space, action_space, config):
    if config["model"].get("custom_model"):
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    # # Infer num_outputs as the action space dimension (not the embedding size!).
    # _, num_outputs = ModelCatalog.get_action_dist(
    #     action_space, config["model"], framework="torch")
    num_outputs = action_space.n

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and "policy_model"
    # settings.
    policy.model = BaselineSACTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
        # Custom settings.
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"])

    policy.target_model = BaselineSACTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
        # Custom settings.
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"])

    return policy.model
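Note that num_outputs = action_space.n only works for Discrete action spaces, even though the validation above also admits Box. A minimal sketch with plain gym (no RLlib) of the difference; the 2 * shape[0] Gaussian-head sizing mentioned in the comment is the usual alternative, not something the builder above does:

import gym

# Discrete(4) has an `n` attribute, so num_outputs = 4 as in the builder above.
discrete_space = gym.spaces.Discrete(4)
print(discrete_space.n)  # 4

# Box spaces have no `n`; the commented-out ModelCatalog.get_action_dist route
# (e.g. 2 * shape[0] outputs for a diagonal Gaussian head) would be needed instead.
box_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
print(hasattr(box_space, "n"))  # False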
Example #2
def validate_spaces(policy: Policy, observation_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> None:
    """Validates the observation- and action spaces used for the Policy.

    Args:
        policy (Policy): The policy, whose spaces are being validated.
        observation_space (gym.spaces.Space): The observation space to
            validate.
        action_space (gym.spaces.Space): The action space to validate.
        config (TrainerConfigDict): The Policy's config dict.

    Raises:
        UnsupportedSpaceException: If one of the spaces is not supported.
    """
    # Only support single Box or single Discrete spaces.
    if not isinstance(action_space, (Box, Discrete, Simplex)):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "SAC. Must be [Box|Discrete|Simplex].".format(
                action_space, policy))
    # If Box, make sure it's a 1D vector space.
    elif isinstance(action_space,
                    (Box, Simplex)) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, policy, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
Example #3
def build_ddpg_model(policy, obs_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        num_outputs = int(np.product(obs_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    return policy.model
Example #4
def validate_spaces(pid, observation_space, action_space, config):
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "SAC.".format(action_space, pid))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, pid, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
Example #5
def validate_spaces(pid: PolicyID, observation_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> None:
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "DDPG.".format(action_space, pid))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, pid, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
Example #6
def build_q_networks(policy, input_dict, observation_space, action_space,
                     config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # Action Q network
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values, q_logits, q_dist, _ = _build_q_network(
            policy, input_dict[SampleBatch.CUR_OBS], observation_space,
            action_space)
        policy.q_values = q_values
        policy.q_func_vars = _scope_vars(scope.name)

    # Noise vars for Q network except for layer normalization vars
    if config["parameter_noise"]:
        _build_parameter_noise(
            policy,
            [var for var in policy.q_func_vars if "LayerNorm" not in var.name])
        policy.action_probs = tf.nn.softmax(policy.q_values)

    # Action outputs
    qvp = QValuePolicy(q_values, input_dict[SampleBatch.CUR_OBS],
                       action_space.n, policy.stochastic, policy.eps,
                       policy.config["soft_q"], policy.config["softmax_temp"])
    policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob

    return policy.output_actions, policy.action_prob
Example #7
        def _make_box_from_dict(space):
            """
            Convert a spaces.Dict to a spaces.Box

            """
            sp = list(space.spaces.values())
            lows = []
            highs = []

            for s in sp:
                if isinstance(s, gym.spaces.Discrete):
                    highs.append(s.n)
                    lows.append(0)

                elif isinstance(s, gym.spaces.MultiBinary):
                    sh = reduce(lambda x, y: x * y, s.shape)
                    highs += [1] * sh
                    lows += [0] * sh

                elif isinstance(s, gym.spaces.Box):
                    highs += s.high.flatten().tolist()
                    lows += s.low.flatten().tolist()

                else:
                    raise UnsupportedSpaceException(
                        "Space {} is not supported.".format(space))

            highs = np.asarray(highs)
            lows = np.asarray(lows)
            return gym.spaces.Box(high=highs, low=lows)
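A usage sketch for the helper above, assuming it is available at module scope and that gym is importable; the concrete key names are illustrative only:

import gym

dict_space = gym.spaces.Dict({
    "flags": gym.spaces.MultiBinary(2),                     # two 0/1 entries
    "mode": gym.spaces.Discrete(3),                         # one entry, low 0, high 3
    "pos": gym.spaces.Box(low=-1.0, high=1.0, shape=(2,)),  # two [-1, 1] entries
})

box = _make_box_from_dict(dict_space)
# Expected bounds (one flat entry per underlying scalar dimension):
#   low  = [0, 0, 0, -1, -1]
#   high = [1, 1, 3,  1,  1]
print(box.low, box.high)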
Example #8
def validate_spaces(
    policy: Policy,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> None:
    """Validates the observation- and action spaces used for the Policy.

    Args:
        policy: The policy, whose spaces are being validated.
        observation_space: The observation space to validate.
        action_space: The action space to validate.
        config: The Policy's config dict.

    Raises:
        UnsupportedSpaceException: If one of the spaces is not supported.
    """
    # Only support single Box or single Discrete spaces.
    if not isinstance(action_space, gym.spaces.Discrete):
        msg = (
            f"Action space ({action_space}) of {policy} is not supported for "
            f"Bandit algorithms. Must be `Discrete`."
        )
        # Hint at using the MultiDiscrete to Discrete wrapper for Bandits.
        if isinstance(action_space, gym.spaces.MultiDiscrete):
            msg += (
                " Try to wrap your environment with the "
                "`ray.rllib.env.wrappers.recsim::"
                "MultiDiscreteToDiscreteActionWrapper` class: `tune.register_env("
                "[some str], lambda ctx: MultiDiscreteToDiscreteActionWrapper("
                "[your gym env])); config = {'env': [some str]}`"
            )
        raise UnsupportedSpaceException(msg)
Example #9
def build_q_models(policy, obs_space, action_space, config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        num_outputs = 256
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_TARGET_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    return policy.q_model
Example #10
def build_q_models(policy, obs_space, action_space, config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=action_space.n,
                                               model_config=config["model"],
                                               framework=config["framework"],
                                               name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
Example #11
def build_q_models(policy, obs_space, action_space, config):
    policy.log_stats = config["log_stats"]
    if policy.log_stats:
        policy.stats_dict = {}
        policy.stats_fn = config["stats_fn"]

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))
    policy.device = (torch.device("cuda")
                     if torch.cuda.is_available() else torch.device("cpu"))
    default_model = (RNNModel if config["recurrent_dqn"]
                     else FullyConnectedNetwork)
    policy.q_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=action_space.n,
                                               model_config=config["model"],
                                               framework=config["framework"],
                                               default_model=default_model,
                                               name=Q_SCOPE).to(policy.device)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model,
        name=Q_TARGET_SCOPE).to(policy.device)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
Example #12
    def setup_loss(self, action_space):
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob = self.action_dist.logp(self.ac)

        # The "policy gradients" loss: its derivative is precisely the policy
        # gradient. Notice that self.ac is a placeholder that is provided
        # externally. adv will contain the advantages, as calculated in
        # compute_advantages.
        self.pi_loss = -tf.reduce_sum(log_prob * self.adv)

        delta = self.vf - self.r
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
        self.entropy = tf.reduce_sum(self.action_dist.entropy())
        self.loss = (self.pi_loss +
                     self.vf_loss * self.config["vf_loss_coeff"] +
                     self.entropy * self.config["entropy_coeff"])
Example #13
def build_q_model_and_distribution_comp(policy, obs_space, action_space,
                                        config):
    # Keys of the observation space that must be used at train and test time
    policy.train_obs_keys = config["train_obs_keys"]
    policy.test_obs_keys = config["test_obs_keys"]

    # Check whether policy observation space is inside a Tuple space
    policy.requires_tupling = False
    if isinstance(action_space, Tuple) and len(action_space.spaces) == 1:
        policy.action_space = action_space.spaces[0]
        action_space = action_space.spaces[0]
        policy.requires_tupling = True
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # Get real observation space
    if isinstance(obs_space, Box):
        assert hasattr(obs_space,
                       "original_space"), "Invalid observation space"
        obs_space = obs_space.original_space
        if isinstance(obs_space, Tuple):
            obs_space = obs_space.spaces[0]
    assert isinstance(obs_space, Dict), "Invalid observation space"
    policy.has_action_mask = "action_mask" in obs_space.spaces
    assert all([k in obs_space.spaces for k in policy.train_obs_keys
                ]), "Invalid train keys specification"
    assert all([k in obs_space.spaces for k in policy.test_obs_keys
                ]), "Invalid test keys specification"

    # Get observation space used for training
    if config["train_obs_space"] is None:
        train_obs_space = obs_space
    else:
        train_obs_space = config["train_obs_space"]
        if isinstance(train_obs_space, Box):
            assert hasattr(train_obs_space,
                           "original_space"), "Invalid observation space"
            train_obs_space = train_obs_space.original_space
            if isinstance(train_obs_space, Tuple):
                train_obs_space = train_obs_space.spaces[0]

    # Obs spaces used for training and testing
    sp = Dict({k: obs_space.spaces[k] for k in policy.test_obs_keys})
    policy.real_test_obs_space = flatten_space(sp)
    policy.real_test_obs_space.original_space = sp

    sp = Dict({k: train_obs_space.spaces[k] for k in policy.train_obs_keys})
    policy.real_train_obs_space = flatten_space(sp)
    policy.real_train_obs_space.original_space = sp
    policy.n_actions = action_space.n

    model_space = Dict({
        k: obs_space.spaces[k]
        for k in policy.test_obs_keys if k != "action_mask" and k != "signal"
    })
    return build_q_models(policy, flatten_space(model_space), action_space, config), \
           TorchCategorical
Example #14
def build_q_model(policy: Policy, obs_space: gym.Space,
                  action_space: gym.Space,
                  config: TrainerConfigDict) -> ModelV2:

    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    return policy.q_model
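A quick sanity check of the last-layer inference used above (the expression simply takes the last fcnet_hiddens entry and falls back to 256 when the list is empty):

# The "last hidden layer size, else 256" expression used above:
assert ([256] + [64, 32])[-1] == 32   # fcnet_hiddens = [64, 32] -> last layer 32
assert ([256] + [])[-1] == 256        # empty fcnet_hiddens -> fallback of 256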
Example #15
def build_q_model_and_distribution(policy, obs_space, action_space, config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])

    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])

    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchMultiObjCategorical
Example #16
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
        self.config = config
        self.sess = tf.get_default_session()

        # Setup the policy
        self.observations = tf.placeholder(
            tf.float32, [None] + list(observation_space.shape))
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])
        self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                            self.config["model"])
        action_dist = dist_class(self.model.outputs)
        self.vf = tf.reshape(
            linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
            [-1])
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          tf.get_variable_scope().name)

        # Setup the policy loss
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            actions = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        advantages = tf.placeholder(tf.float32, [None], name="advantages")
        v_target = tf.placeholder(tf.float32, [None], name="v_target")
        self.loss = A3CLoss(action_dist, actions, advantages, v_target,
                            self.vf, self.config["vf_loss_coeff"],
                            self.config["entropy_coeff"])

        # Initialize TFPolicyGraph
        loss_in = [
            ("obs", self.observations),
            ("actions", actions),
            ("advantages", advantages),
            ("value_targets", v_target),
        ]
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.observations,
            action_sampler=action_dist.sample(),
            loss=self.loss.total_loss,
            loss_inputs=loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            seq_lens=self.model.seq_lens,
            max_seq_len=self.config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
Example #17
def _make_continuous_space(space):
    if isinstance(space, Box):
        return space
    elif isinstance(space, Discrete):
        return Box(low=np.zeros((space.n, )),
                   high=np.ones((space.n, )))
    else:
        raise UnsupportedSpaceException(
            "Space {} is not supported.".format(space))
Example #18
    def __init__(self, env: gym.Env):
        super().__init__(env)

        if not isinstance(env.action_space, MultiDiscrete):
            raise UnsupportedSpaceException(
                f"Action space {env.action_space} "
                f"is not supported by {self.__class__.__name__}")
        self.action_space_dimensions = env.action_space.nvec
        self.action_space = Discrete(np.prod(self.action_space_dimensions))
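To illustrate the flattening above: the wrapped Discrete space has prod(nvec) actions. How a flat action is decoded back into its MultiDiscrete components is not shown in the snippet; the np.unravel_index call below is only one plausible convention:

import numpy as np
from gym.spaces import Discrete, MultiDiscrete

multi = MultiDiscrete([3, 4])
flat = Discrete(int(np.prod(multi.nvec)))  # what the wrapper above constructs
assert flat.n == 12

# Hypothetical decoding of a flat action into its MultiDiscrete components.
flat_action = 7
components = np.unravel_index(flat_action, multi.nvec)
print(tuple(int(c) for c in components))  # (1, 3): index 7 unraveled over shape (3, 4)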
Example #19
    def __init__(self, observation_space, action_space, config):
        # Validate spaces.
        # Only support single Box or single Discrete spaces.
        if not isinstance(action_space, (Box, Discrete)):
            raise UnsupportedSpaceException(
                "Action space ({}) of {} is not supported for "
                "MB-MPO. Must be [Box|Discrete].".format(action_space, self))
        # If Box, make sure it's a 1D vector space.
        elif isinstance(action_space, Box) and len(action_space.shape) > 1:
            raise UnsupportedSpaceException(
                "Action space ({}) of {} has multiple dimensions "
                "{}. ".format(action_space, self, action_space.shape) +
                "Consider reshaping this into a single dimension Box space "
                "or using the multi-agent API.")

        config = dict(ray.rllib.algorithms.mbmpo.mbmpo.DEFAULT_CONFIG,
                      **config)
        super().__init__(observation_space, action_space, config)
Example #20
    def get_action_dist(action_space, config, dist_type=None, torch=False):
        """Returns action distribution class and size for the given action space.

        Args:
            action_space (Space): Action space of the target gym env.
            config (dict): Optional model config.
            dist_type (str): Optional identifier of the action distribution.
            torch (bool): Whether to return a PyTorch distribution class (optional).

        Returns:
            dist_class (ActionDistribution): Python class of the distribution.
            dist_dim (int): The size of the input vector to the distribution.
        """

        config = config or MODEL_DEFAULTS
        if isinstance(action_space, gym.spaces.Box):
            if len(action_space.shape) > 1:
                raise UnsupportedSpaceException(
                    "Action space has multiple dimensions "
                    "{}. ".format(action_space.shape) +
                    "Consider reshaping this into a single dimension, "
                    "using a Tuple action space, or the multi-agent API.")
            if dist_type is None:
                dist = TorchDiagGaussian if torch else DiagGaussian
                return dist, action_space.shape[0] * 2
            elif dist_type == "deterministic":
                return Deterministic, action_space.shape[0]
        elif isinstance(action_space, gym.spaces.Discrete):
            dist = TorchCategorical if torch else Categorical
            return dist, action_space.n
        elif isinstance(action_space, gym.spaces.Tuple):
            child_dist = []
            input_lens = []
            for action in action_space.spaces:
                dist, action_size = ModelCatalog.get_action_dist(
                    action, config)
                child_dist.append(dist)
                input_lens.append(action_size)
            if torch:
                raise NotImplementedError
            return partial(MultiActionDistribution,
                           child_distributions=child_dist,
                           action_space=action_space,
                           input_lens=input_lens), sum(input_lens)
        elif isinstance(action_space, Simplex):
            if torch:
                raise NotImplementedError
            return Dirichlet, action_space.shape[0]
        elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
            if torch:
                raise NotImplementedError
            return partial(MultiCategorical, input_lens=action_space.nvec), \
                int(sum(action_space.nvec))

        raise NotImplementedError("Unsupported args: {} {}".format(
            action_space, dist_type))
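The output sizes returned above can be checked without RLlib: a DiagGaussian head over a Box uses two outputs per action dimension (mean and log-std), a Categorical over a Discrete uses one logit per action, and a MultiCategorical uses the sum of the nvec entries. A quick illustration:

import gym

box = gym.spaces.Box(low=-1.0, high=1.0, shape=(3,))
disc = gym.spaces.Discrete(5)
multi = gym.spaces.MultiDiscrete([2, 3, 4])

assert box.shape[0] * 2 == 6      # DiagGaussian: mean + log-std per dimension
assert disc.n == 5                # Categorical: one logit per action
assert int(sum(multi.nvec)) == 9  # MultiCategorical: sum of branch sizes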
Example #21
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        # when env.action_space is of Box type, e.g., Pendulum-v0
        # action_space.low is [-2.0], high is [2.0]
        # take action by calling, e.g., env.step([3.5])
        if not isinstance(env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                config["noise_scale"] * 0.4 **
                (1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(config["exploration_fraction"] *
                                       config["schedule_max_timesteps"]),
                initial_p=config["noise_scale"] * 1.0,
                final_p=config["noise_scale"] *
                config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        # hard instead of soft
        self.ddpg_graph.update_target(self.sess, 1.0)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
Example #22
def build_q_model(policy, obs_space, action_space, config):

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    return policy.q_model
Example #23
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.ddpg_graph.copy_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0
        nb_actions = env.action_space.shape[-1]
        stddev = config["exploration_noise"]
        self.exploration_noise = OUNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        self.action_range = (-1., 1.)

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.critic_loss, self.ddpg_graph.action_loss),
            self.sess)
        self.max_action = env.action_space.high
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"],
                                     env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter
        }

        self.obs = self.env.reset()
Example #24
def _build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for Simple Q learning

    Note that this function works for both Tensorflow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The Policy's config dict.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=action_space.n,
                                               model_config=config["model"],
                                               framework=config["framework"],
                                               name=Q_SCOPE)
    if torch.cuda.is_available():
        policy.q_model = policy.q_model.to("cuda")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)
    if torch.cuda.is_available():
        policy.target_q_model = policy.target_q_model.to("cuda")

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
Example #25
def build_avg_model_and_distribution(
    policy: Policy, obs_space: gym.spaces.Space,
    action_space: gym.spaces.Space, config: TrainerConfigDict
) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            f"Action space {action_space} is not supported for NFSP.")

    policy.avg_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                                 action_space=action_space,
                                                 num_outputs=action_space.n,
                                                 model_config=config["model"],
                                                 framework=config["framework"],
                                                 name=AVG_POL_SCOPE)

    policy.avg_func_vars = policy.avg_model.variables()

    return policy.avg_model, TorchCategorical
Example #26
    def __init__(self, registry, env_creator, config, logdir, worker_index):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
Example #27
def _make_array_from_obs(obs, size, spaces):
    """
    Transform original obs dict to one dimensional np.array
    :param obs: dict, original observation dictionary
    :param size: total size of the wrapped env
    :return: np.array, flatten out array of observations
    """
    # get size of space
    # initialize zeros array with correct shape
    array = np.zeros(size)
    # get space dict
    offset = 0
    # for every observation
    for k in spaces.keys():

        # get gym space related to observation
        sp = spaces[k]
        v = obs[k]

        # if MultiBinary, get shape and add values to array
        if isinstance(sp, gym.spaces.MultiBinary):
            size = reduce(lambda x, y: x * y, sp.shape)
            array[offset:offset + size] = v

        # if Discrete, use a one-hot representation: set the v-th entry to one
        elif isinstance(sp, gym.spaces.Discrete):
            size = sp.n
            array[offset + v] = 1

        # if Box, get the size and assign the flattened values
        elif isinstance(sp, gym.spaces.Box):
            size = reduce(lambda x, y: x * y, sp.shape)
            array[offset:offset + size] = v.flatten()

        # else raise exception
        else:
            raise UnsupportedSpaceException(f"space {type(sp)} is not supported for ParametricWrapper")

        # update offset
        offset += size

    return np.asarray(array)
Example #28
def _make_box_from_obs(space):
    """
    Convert a spaces.Dict to a spaces.Box by building the low/high bound vectors.

    :param space: gym.spaces.Dict
    :return: gym.spaces.Box
    """
    sp = list(space.spaces.values())
    lows = []
    highs = []

    # for every space
    for s in sp:

        # if Discrete, the observation will be transformed to a one-hot
        # representation, so add n 0/1 entries as lows/highs
        if isinstance(s, gym.spaces.Discrete):
            highs += [1] * s.n
            lows += [0] * s.n

        # if multibinary then do the same as before but get shape with reduce
        elif isinstance(s, gym.spaces.MultiBinary):
            sh = reduce(lambda x, y: x * y, s.shape)
            highs += [1] * sh
            lows += [0] * sh

        # if Box, just flatten the highs and lows
        elif isinstance(s, gym.spaces.Box):
            highs += s.high.flatten().tolist()
            lows += s.low.flatten().tolist()

        # else raise exception
        else:
            raise UnsupportedSpaceException(
                "Space {} is not supported.".format(space))

    # convert to array
    highs = np.asarray(highs)
    lows = np.asarray(lows)
    # return box as high/low initialization
    return gym.spaces.Box(high=highs, low=lows)
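A usage sketch tying this helper together with _make_array_from_obs from Example #27, assuming both are importable; the key names and values are illustrative only:

import gym
import numpy as np

spaces = {
    "cmd": gym.spaces.Discrete(3),                           # one-hot, 3 slots
    "img": gym.spaces.Box(low=0.0, high=1.0, shape=(2, 2)),  # 4 flat slots
}
dict_space = gym.spaces.Dict(spaces)

# The Box built from the Dict space gives the total flattened size.
box = _make_box_from_obs(dict_space)
size = box.shape[0]  # 3 + 4 = 7

obs = {"cmd": 1, "img": np.full((2, 2), 0.5)}
flat = _make_array_from_obs(obs, size, spaces)
print(flat)  # [0. 1. 0. 0.5 0.5 0.5 0.5] (one-hot cmd followed by the flattened img)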
Example #29
    def __init__(self, registry, env_creator, config, logdir):
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()
Example #30
    def __init__(self, registry, env_creator, config, worker_index):
        env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.env = env
        self.config = config

        if isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.ddpg_graph = models.DDPGGraph(registry, env, config)

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.ddpg_graph.copy_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0
        nb_actions = env.action_space.shape[-1]
        stddev = config["exploration_noise"]
        self.exploration_noise = OUNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        self.action_range = (-1., 1.)

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.td_error, self.ddpg_graph.action_lost),
            self.sess)
        self.max_action = env.action_space.high
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()