Example #1
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model(np.zeros((10, 3), dtype=np.float32), 5)
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model(
                np.zeros((10, 84, 84, 3), dtype=np.float32), 5)
            self.assertEqual(type(p2), VisionNetwork)
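
These default-model tests (here and in the variants that follow) all assert the same dispatch: flat observations yield a FullyConnectedNetwork, image-shaped observations a VisionNetwork. The toy predicate below only restates what the assertions expect; it is not the ModelCatalog's actual dispatch logic.

def expected_default_model(obs_shape):
    # Rank-3, image-like observation shapes get the conv net in these tests;
    # everything else falls back to the fully connected net.
    return "VisionNetwork" if len(obs_shape) == 3 else "FullyConnectedNetwork"

assert expected_default_model((3,)) == "FullyConnectedNetwork"
assert expected_default_model((84, 84, 3)) == "VisionNetwork"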
Example #2
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
            self.assertEqual(type(p2), VisionNetwork)
Example #3
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 3), dtype=np.float32), 5)
            assert type(p1) == FullyConnectedNetwork

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model(
                get_registry(), np.zeros((10, 80, 80, 3), dtype=np.float32), 5)
            assert type(p2) == VisionNetwork
Example #4
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model({
                "obs": tf.zeros((10, 3), dtype=tf.float32)
            }, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {})
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model({
                "obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32)
            }, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {})
            self.assertEqual(type(p2), VisionNetwork)
Example #5
    def testDefaultModels(self):
        ray.init()

        with tf.variable_scope("test1"):
            p1 = ModelCatalog.get_model(
                {"obs": tf.zeros((10, 3), dtype=tf.float32)},
                Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5, {})
            self.assertEqual(type(p1), FullyConnectedNetwork)

        with tf.variable_scope("test2"):
            p2 = ModelCatalog.get_model(
                {"obs": tf.zeros((10, 84, 84, 3), dtype=tf.float32)},
                Box(0, 1, shape=(84, 84, 3), dtype=np.float32), Discrete(5), 5,
                {})
            self.assertEqual(type(p2), VisionNetwork)
Example #6
def _build_q_network(registry, inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(
                action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(
            action_out, num_outputs=num_actions, activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(
                    state_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(
                state_out, num_outputs=1, activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
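
The dueling branch above combines a state-value stream with mean-centered advantages before returning Q-values. A minimal NumPy sketch of that aggregation (shapes and numbers are invented for illustration):

import numpy as np

action_scores = np.array([[1.0, 2.0, 3.0]])  # advantage stream, [batch, num_actions]
state_score = np.array([[10.0]])             # value stream, [batch, 1]

# Center the advantages so the value stream alone carries the state value,
# then broadcast-add: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)).
centered = action_scores - action_scores.mean(axis=1, keepdims=True)
q_values = state_score + centered
print(q_values)  # [[ 9. 10. 11.]]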
Example #7
 def _build_value_network(self, obs, obs_space):
     value_model = ModelCatalog.get_model(
         {
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, 1, self.config["model"])
     return value_model.outputs
Example #8
    def __init__(self, registry, sess, action_space, preprocessor,
                 observation_filter):
        self.sess = sess
        self.action_space = action_space
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(
            observation_filter, self.preprocessor.shape)
        self.inputs = tf.placeholder(
            tf.float32, [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, dist_type="deterministic")
        model = ModelCatalog.get_model(registry, self.inputs, dist_dim,
                                       options={"fcnet_hiddens": [32, 32]})
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum([np.prod(variable.shape.as_list())
                               for _, variable
                               in self.variables.variables.items()])
        self.sess.run(tf.global_variables_initializer())
Example #9
    def _build_critic_network(
        self,
        obs_n,
        act_n,
        obs_space_n,
        act_space_n,
        use_state_preprocessor,
        hiddens,
        activation=None,
        scope=None,
    ):
        """ Build critic network

        Args:
            obs_n: list, the observation placeholder list contains at least one.
            act_n: list, the action placeholder list contains at least one.
            obs_space_n: list, the observation space list contains at least one.
            act_space_n: list, the action space list contains at least one.
            use_state_preprocessor: bool, if true, there are `n` preprocessor models for each observation placeholder
                otherwise, no.
            hiddens: list, a list of unit definition.
            activation: tf.nn, default is None, to initialize the activation function.
            scope: str, name the variable scope

        Returns:
            out: tf.Tensor, logits out.
            feature: tf.Tensor, intputs of logits output.
            model_n: list, preprocessor models for observation inputs.
            variables: list, return global variables of this critic network.
        """

        with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model_n = [
                    ModelCatalog.get_model(
                        {
                            "obs": obs,
                            "is_training": self._get_is_training_placeholder(),
                        },
                        obs_space,
                        act_space,
                        1,
                        self.config["model"],
                    )
                    for obs, obs_space, act_space in zip(
                        obs_n, obs_space_n, act_space_n
                    )
                ]
                out_n = [model.last_layer for model in model_n]
                out = tf.concat(out_n + act_n, axis=1)
            else:
                model_n = [None] * len(obs_n)
                out = tf.concat(obs_n + act_n, axis=1)

            for hidden in hiddens:
                out = tf1.layers.dense(out, units=hidden, activation=activation)
            feature = out
            out = tf1.layers.dense(feature, units=1, activation=None)

        return out, feature, model_n, tf1.global_variables(scope.name)
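
When use_state_preprocessor is disabled, the critic above simply concatenates every agent's observation and action placeholder along the feature axis before the dense stack. A quick NumPy stand-in for that shape bookkeeping (agent counts and dimensions are invented for illustration):

import numpy as np

# Two agents: observation dims 4 and 6, action dims 2 and 3.
obs_n = [np.zeros((32, 4)), np.zeros((32, 6))]
act_n = [np.zeros((32, 2)), np.zeros((32, 3))]

# Mirrors tf.concat(obs_n + act_n, axis=1) in the snippet above.
critic_in = np.concatenate(obs_n + act_n, axis=1)
print(critic_in.shape)  # (32, 15): one joint row per sample for the centralized critic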
Example #10
def _build_q_network(inputs, num_actions, config):
    dueling = config["dueling"]
    hiddens = config["hiddens"]
    frontend = ModelCatalog.get_model(inputs, 1, config["model"])
    frontend_out = frontend.last_layer

    with tf.variable_scope("action_value"):
        action_out = frontend_out
        for hidden in hiddens:
            action_out = layers.fully_connected(action_out,
                                                num_outputs=hidden,
                                                activation_fn=tf.nn.relu)
        action_scores = layers.fully_connected(action_out,
                                               num_outputs=num_actions,
                                               activation_fn=None)

    if dueling:
        with tf.variable_scope("state_value"):
            state_out = frontend_out
            for hidden in hiddens:
                state_out = layers.fully_connected(state_out,
                                                   num_outputs=hidden,
                                                   activation_fn=tf.nn.relu)
            state_score = layers.fully_connected(state_out,
                                                 num_outputs=1,
                                                 activation_fn=None)
        action_scores_mean = tf.reduce_mean(action_scores, 1)
        action_scores_centered = action_scores - tf.expand_dims(
            action_scores_mean, 1)
        return state_score + action_scores_centered
    else:
        return action_scores
Example #11
    def _build_critic_network(self,
                              obs_n,
                              act_n,
                              obs_space_n,
                              act_space_n,
                              use_state_preprocessor,
                              hiddens,
                              activation=None,
                              scope=None):
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model_n = [
                    ModelCatalog.get_model(
                        {
                            "obs": obs,
                            "is_training": self._get_is_training_placeholder(),
                        }, obs_space, act_space, 1, self.config["model"])
                    for obs, obs_space, act_space in zip(
                        obs_n, obs_space_n, act_space_n)
                ]
                out_n = [model.last_layer for model in model_n]
                out = tf.concat(out_n + act_n, axis=1)
            else:
                model_n = [None] * len(obs_n)
                out = tf.concat(obs_n + act_n, axis=1)

            for hidden in hiddens:
                out = tf.layers.dense(out, units=hidden, activation=activation)
            feature = out
            out = tf.layers.dense(feature, units=1, activation=None)

        return out, feature, model_n, tf.global_variables(scope.name)
Example #12
    def __init__(self, obs_space, action_space, config):
        self.action_space = action_space
        self.action_noise_std = config["action_noise_std"]
        self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
        self.observation_filter = get_filter(config["observation_filter"],
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, config["model"], dist_type="deterministic")
        model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                       obs_space, action_space, dist_dim,
                                       config["model"])
        dist = dist_class(model.outputs, model)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example #13
 def _build_q_network(self, obs, obs_space, actions):
     return QNetwork(
         ModelCatalog.get_model({
             "obs": obs
         }, obs_space, 1, self.config["model"]), actions,
         self.config["critic_hiddens"],
         self.config["critic_hidden_activation"]).value
Example #14
    def __init__(self, observation_space, action_space, observations,
                 advantages, actions, prev_logits, logit_dim, kl_coeff,
                 distribution_class, config, sess):
        assert (isinstance(action_space, gym.spaces.Discrete)
                or isinstance(action_space, gym.spaces.Box))
        self.prev_dist = distribution_class(prev_logits)

        # Saved so that we can compute actions given different observations
        self.observations = observations

        self.curr_logits = ModelCatalog.get_model(observations, logit_dim,
                                                  config["model"]).outputs
        self.curr_dist = distribution_class(self.curr_logits)
        self.sampler = self.curr_dist.sample()

        # Make loss functions.
        self.ratio = tf.exp(
            self.curr_dist.logp(actions) - self.prev_dist.logp(actions))
        self.kl = self.prev_dist.kl(self.curr_dist)
        self.mean_kl = tf.reduce_mean(self.kl)
        self.entropy = self.curr_dist.entropy()
        self.mean_entropy = tf.reduce_mean(self.entropy)
        self.surr1 = self.ratio * advantages
        self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                      1 + config["clip_param"]) * advantages
        self.surr = tf.minimum(self.surr1, self.surr2)
        self.loss = tf.reduce_mean(-self.surr + kl_coeff * self.kl -
                                   config["entropy_coeff"] * self.entropy)
        self.sess = sess
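
For intuition on the surrogate terms built above, here is the same ratio-clipping arithmetic in plain NumPy (the sample values and the clip_param of 0.2 are illustrative, not taken from the snippet's config):

import numpy as np

clip_param = 0.2
ratio = np.array([0.5, 1.0, 1.5])      # exp(logp_new - logp_old) per sample
advantages = np.array([1.0, 1.0, 1.0])

surr1 = ratio * advantages
surr2 = np.clip(ratio, 1 - clip_param, 1 + clip_param) * advantages
surr = np.minimum(surr1, surr2)        # [0.5, 1.0, 1.2]: large ratios stop earning credit
print(-surr.mean())                    # policy term of the loss, before the KL and entropy terms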
Example #15
    def __init__(self, sess, action_space, preprocessor, observation_filter,
                 action_noise_std):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor

        if observation_filter == "MeanStdFilter":
            self.observation_filter = MeanStdFilter(self.preprocessor.shape,
                                                    clip=None)
        elif observation_filter == "NoFilter":
            self.observation_filter = NoFilter()
        else:
            raise Exception("Unknown observation_filter: " +
                            str("observation_filter"))

        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, dist_type="deterministic")
        model = ModelCatalog.get_model(self.inputs, dist_dim)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum([
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items()
        ])
        self.sess.run(tf.global_variables_initializer())
Example #16
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example #17
    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")

        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, action_space, dist_dim, model_config)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example #18
 def _build_q_network(self, obs):
     qnet = QNetwork(
         ModelCatalog.get_model(obs, 1, self.config["model"]),
         self.num_actions, self.config["dueling"], self.config["hiddens"],
         self.config["noisy"], self.config["num_atoms"],
         self.config["v_min"], self.config["v_max"], self.config["sigma0"])
     return qnet.value, qnet.logits, qnet.dist
Example #19
    def _build_policy_network(self, obs, obs_space, action_space):
        if self.config["use_state_preprocessor"]:
            model = ModelCatalog.get_model(
                {
                    "obs": obs,
                    "is_training": self._get_is_training_placeholder(),
                }, obs_space, action_space, 1, self.config["model"])
            action_out = model.last_layer
        else:
            model = None
            action_out = obs

        activation = getattr(tf.nn, self.config["actor_hidden_activation"])
        for hidden in self.config["actor_hiddens"]:
            action_out = tf.layers.dense(action_out,
                                         units=hidden,
                                         activation=activation)
            if self.config["parameter_noise"]:
                action_out = tf.keras.layers.LayerNormalization()(action_out)
        action_out = tf.layers.dense(action_out,
                                     units=action_space.shape[0],
                                     activation=None)

        # Use sigmoid to scale to [0,1], but also double magnitude of input to
        # emulate behaviour of tanh activation used in DDPG and TD3 papers.
        sigmoid_out = tf.nn.sigmoid(2 * action_out)
        # Rescale to actual env policy scale
        # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
        # get same dims)
        action_range = (action_space.high - action_space.low)[None]
        low_action = action_space.low[None]
        actions = action_range * sigmoid_out + low_action

        return actions, model
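
The tail of the policy builder above rescales the squashed output into the environment's action bounds. A small numeric sketch of that mapping, with made-up Box bounds:

import numpy as np

low = np.array([-2.0, 0.0])
high = np.array([2.0, 1.0])

action_out = np.array([[0.0, 3.0]])                    # raw network output
sigmoid_out = 1.0 / (1.0 + np.exp(-2.0 * action_out))  # sigmoid(2x), roughly tanh-shaped
actions = (high - low)[None] * sigmoid_out + low[None]
print(actions)  # [[0. 0.9975...]], always inside [low, high]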
Example #20
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model({"obs": tf.constant([1, 2, 3])},
                                 Box(0, 1, shape=(3, ), dtype=np.float32),
                                 Discrete(5), 5, {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))
Example #21
    def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
        self.ac_space = ac_space
        self.ac_noise_std = ac_noise_std
        self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

        with tf.variable_scope(type(self).__name__) as scope:
            # Observation normalization.
            ob_mean = tf.get_variable(
                'ob_mean', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            ob_std = tf.get_variable(
                'ob_std', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
            in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
            self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
                tf.assign(ob_mean, in_mean),
                tf.assign(ob_std, in_std),
            ])

            inputs = tf.placeholder(
                tf.float32, [None] + list(self.preprocessor_shape))

            # TODO(ekl): we should do clipping in a standard RLlib preprocessor
            clipped_inputs = tf.clip_by_value(
                (inputs - ob_mean) / ob_std, -5.0, 5.0)

            # Policy network.
            dist_class, dist_dim = ModelCatalog.get_action_dist(
                self.ac_space, dist_type='deterministic')
            model = ModelCatalog.get_model(clipped_inputs, dist_dim)
            dist = dist_class(model.outputs)
            self._act = U.function([inputs], dist.sample())
        return scope
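
The observation normalization in this snippet standardizes inputs with running statistics and then clips the result to [-5, 5]. A quick NumPy illustration with made-up statistics:

import numpy as np

ob_mean, ob_std = 1.0, 2.0
inputs = np.array([-20.0, 1.0, 9.0])
clipped = np.clip((inputs - ob_mean) / ob_std, -5.0, 5.0)
print(clipped)  # [-5.  0.  4.]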
Example #22
    def _build_actor_network(self,
                             obs,
                             obs_space,
                             act_space,
                             use_state_preprocessor,
                             hiddens,
                             activation=None,
                             scope=None):
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model = ModelCatalog.get_model(
                    {
                        "obs": obs,
                        "is_training": self._get_is_training_placeholder(),
                    }, obs_space, act_space, 1, self.config["model"])
                out = model.last_layer
            else:
                model = None
                out = obs

            for hidden in hiddens:
                out = tf.layers.dense(out, units=hidden, activation=activation)
            feature = tf.layers.dense(out,
                                      units=act_space.shape[0],
                                      activation=None)
            sampler = tfp.distributions.RelaxedOneHotCategorical(
                temperature=1.0, logits=feature).sample()

        return sampler, feature, model, tf.global_variables(scope.name)
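
The actor above samples near-one-hot actions from a RelaxedOneHotCategorical. A rough NumPy stand-in for that sampler, via the Gumbel-softmax reparameterization (the function name and RNG seed are mine, not from the snippet):

import numpy as np

def relaxed_one_hot_sample(logits, temperature=1.0, rng=np.random.default_rng(0)):
    # Perturb logits with Gumbel noise, then take a temperature-scaled softmax
    # to obtain a differentiable, approximately one-hot sample.
    gumbel = -np.log(-np.log(rng.uniform(size=logits.shape)))
    z = (logits + gumbel) / temperature
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

print(relaxed_one_hot_sample(np.array([[2.0, 0.5, -1.0]])))  # each row sums to 1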
Example #23
 def _build_p_network(self, obs, obs_space):
     return PNetwork(
         ModelCatalog.get_model({
             "obs": obs
         }, obs_space, 1, self.config["model"]), self.dim_actions,
         self.config["actor_hiddens"],
         self.config["actor_hidden_activation"]).action_scores
Example #24
 def _build_p_network(self, obs, obs_space):
     return PNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, 1, self.config["model"]), self.dim_actions,
         self.config["actor_hiddens"],
         self.config["actor_hidden_activation"]).action_scores
Example #25
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model({
         "obs": tf.constant([1, 2, 3])
     }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
                                 {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))
Example #26
 def _build_q_network(self, obs, obs_space, action_space, actions):
     q_net = QNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, action_space, 1, self.config["model"]), actions,
         self.config["critic_hiddens"],
         self.config["critic_hidden_activation"])
     return q_net.value, q_net.model
Example #27
 def _build_q_network(self, obs, obs_space, actions):
     q_net = QNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, 1, self.config["model"]), actions,
         self.config["critic_hiddens"],
         self.config["critic_hidden_activation"])
     return q_net.value, q_net.model
Example #28
 def _build_p_network(self, obs, obs_space):
     policy_net = PNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, obs_space, 1, self.config["model"]), self.dim_actions,
         self.config["actor_hiddens"],
         self.config["actor_hidden_activation"])
     return policy_net.action_scores, policy_net.model
Example #29
 def _build_q_network(self, obs, space):
     qnet = QNetwork(
         ModelCatalog.get_model({
             "obs": obs,
             "is_training": self._get_is_training_placeholder(),
         }, space, self.num_actions, self.config["model"]),
         self.num_actions, self.config["dueling"], self.config["hiddens"],
         self.config["noisy"], self.config["num_atoms"],
         self.config["v_min"], self.config["v_max"], self.config["sigma0"])
     return qnet.value, qnet.logits, qnet.dist, qnet.model
Example #31
 def _build_p_network(self, obs, obs_space):
     policy_net = PNetwork(
         ModelCatalog.get_model(
             {
                 "obs": obs,
                 "is_training": self._get_is_training_placeholder(),
             }, obs_space, 1, self.config["model"]), self.dim_actions,
         self.config["actor_hiddens"],
         self.config["actor_hidden_activation"],
         self.config["parameter_noise"])
     return policy_net.action_scores, policy_net.model
Example #32
def _build_q_network(policy, obs, obs_space, action_space):
    config = policy.config
    qnet = QNetwork(
        ModelCatalog.get_model(
            {
                "obs": obs,
                "is_training": policy._get_is_training_placeholder(),
            }, obs_space, action_space, action_space.n, config["model"]),
        action_space.n, config["dueling"], config["hiddens"], config["noisy"],
        config["num_atoms"], config["v_min"], config["v_max"],
        config["sigma0"], config["parameter_noise"])
    return qnet.value, qnet.logits, qnet.dist, qnet.model
Example #33
def _build_q_network(inputs, action_inputs, config):
    frontend = ModelCatalog.get_model(inputs, 1, config["model"])

    hiddens = config["critic_hiddens"]

    q_out = tf.concat([frontend.last_layer, action_inputs], axis=1)
    for hidden in hiddens:
        q_out = layers.fully_connected(q_out,
                                       num_outputs=hidden,
                                       activation_fn=tf.nn.relu)
    q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None)

    return q_scores
Example #34
    def _build_q_network(self, obs, obs_space, action_space, actions):
        if self.config["use_state_preprocessor"]:
            q_model = ModelCatalog.get_model({
                "obs": obs,
                "is_training": self._get_is_training_placeholder(),
            }, obs_space, action_space, 1, self.config["model"])
            q_out = tf.concat([q_model.last_layer, actions], axis=1)
        else:
            q_model = None
            q_out = tf.concat([obs, actions], axis=1)

        activation = getattr(tf.nn, self.config["critic_hidden_activation"])
        for hidden in self.config["critic_hiddens"]:
            q_out = tf.layers.dense(q_out, units=hidden, activation=activation)
        q_values = tf.layers.dense(q_out, units=1, activation=None)

        return q_values, q_model
Example #35
def _build_p_network(inputs, dim_actions, config):
    """
    map an observation (i.e., state) to an action where
    each entry takes value from (0, 1) due to the sigmoid function
    """
    frontend = ModelCatalog.get_model(inputs, 1, config["model"])

    hiddens = config["actor_hiddens"]
    action_out = frontend.last_layer
    for hidden in hiddens:
        action_out = layers.fully_connected(action_out,
                                            num_outputs=hidden,
                                            activation_fn=tf.nn.relu)
    # Use sigmoid layer to bound values within (0, 1)
    # shape of action_scores is [batch_size, dim_actions]
    action_scores = layers.fully_connected(action_out,
                                           num_outputs=dim_actions,
                                           activation_fn=tf.nn.sigmoid)

    return action_scores
Example #36
    def _build_actor_network(self,
                             obs,
                             obs_space,
                             act_space,
                             use_state_preprocessor,
                             hiddens,
                             activation=None,
                             scope=None):
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model = ModelCatalog.get_model({
                    "obs": obs,
                    "is_training": self._get_is_training_placeholder(),
                }, obs_space, act_space, 1, self.config["model"])
                out = model.last_layer
            else:
                model = None
                out = obs

            for hidden in hiddens:
                out = tf.layers.dense(out, units=hidden, activation=activation)

            feature = tf.layers.dense(
                out, units=act_space.shape[0], activation=None)

            """

            sampler = tfp.distributions.RelaxedOneHotCategorical(
                temperature=1.0, logits=feature).sample()
            """
            # Use sigmoid to scale to [0,1], but also double magnitude of input to
            # emulate behaviour of tanh activation used in DDPG and TD3 papers.
            sigmoid_out = tf.nn.sigmoid(2 * feature)
            # Rescale to actual env policy scale
            # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
            # get same dims)
            action_range = (act_space.high - act_space.low)[None]
            low_action = act_space.low[None]
            actions = action_range * sigmoid_out + low_action

        return actions, feature, model, tf.global_variables(scope.name)
Example #37
    def __init__(self,
                 sess,
                 action_space,
                 preprocessor,
                 observation_filter,
                 action_noise_std,
                 options={}):

        if len(preprocessor.shape) > 1:
            raise UnsupportedSpaceException(
                "Observation space {} is not supported with ARS.".format(
                    preprocessor.shape))

        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32,
                                     [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, dist_type="deterministic")

        model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
        dist = dist_class(model.outputs)
        self.sampler = dist.sample()

        self.variables = ray.experimental.TensorFlowVariables(
            model.outputs, self.sess)

        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
        self.sess.run(tf.global_variables_initializer())
Example #38
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(tf.float32,
                                    shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(tf.float32, [None],
                                         name="prev_reward")

        with tf.variable_scope(POLICY_SCOPE) as scope:
            self.model = ModelCatalog.get_model(
                {
                    "obs": self.obs_t,
                    "prev_actions": prev_actions_ph,
                    "prev_rewards": prev_rewards_ph,
                    "is_training": self._get_is_training_placeholder(),
                }, observation_space, action_space, logit_dim,
                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(VALUE_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (self.p_loss.loss +
                     self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicy
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            (SampleBatch.CUR_OBS, self.obs_t),
            (SampleBatch.ACTIONS, self.act_t),
            (Postprocessing.ADVANTAGES, self.cum_rew_t),
        ]
        TFPolicy.__init__(self,
                          observation_space,
                          action_space,
                          self.sess,
                          obs_input=self.obs_t,
                          action_sampler=self.output_actions,
                          action_prob=action_dist.sampled_action_prob(),
                          loss=objective,
                          model=self.model,
                          loss_inputs=self.loss_inputs,
                          state_inputs=self.model.state_in,
                          state_outputs=self.model.state_out,
                          prev_action_input=prev_actions_ph,
                          prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }
Example #39
    def __init__(self, observation_space, action_space, config):
        config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")

        with tf.variable_scope(P_SCOPE) as scope:
            self.model = ModelCatalog.get_model({
                "obs": self.obs_t,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }, observation_space, action_space, logit_dim,
                                                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(V_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = _scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (
            self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("advantages", self.cum_rew_t),
        ]
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.obs_t,
            action_sampler=self.output_actions,
            action_prob=action_dist.sampled_action_prob(),
            loss=objective,
            model=self.model,
            loss_inputs=self.loss_inputs,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }
Example #40
    def __init__(
            self, observation_space, action_space,
            observations, value_targets, advantages, actions,
            prev_logits, prev_vf_preds, logit_dim,
            kl_coeff, distribution_class, config, sess, registry):
        self.prev_dist = distribution_class(prev_logits)

        # Saved so that we can compute actions given different observations
        self.observations = observations

        self.curr_logits = ModelCatalog.get_model(
            registry, observations, logit_dim, config["model"]).outputs
        self.curr_dist = distribution_class(self.curr_logits)
        self.sampler = self.curr_dist.sample()

        if config["use_gae"]:
            vf_config = config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    registry, observations, 1, vf_config).outputs
            self.value_function = tf.reshape(self.value_function, [-1])

        # Make loss functions.
        self.ratio = tf.exp(self.curr_dist.logp(actions) -
                            self.prev_dist.logp(actions))
        self.kl = self.prev_dist.kl(self.curr_dist)
        self.mean_kl = tf.reduce_mean(self.kl)
        self.entropy = self.curr_dist.entropy()
        self.mean_entropy = tf.reduce_mean(self.entropy)
        self.surr1 = self.ratio * advantages
        self.surr2 = tf.clip_by_value(self.ratio, 1 - config["clip_param"],
                                      1 + config["clip_param"]) * advantages
        self.surr = tf.minimum(self.surr1, self.surr2)
        self.mean_policy_loss = tf.reduce_mean(-self.surr)

        if config["use_gae"]:
            # We use a huber loss here to be more robust against outliers,
            # which seem to occur when the rollouts get longer (the variance
            # scales superlinearly with the length of the rollout)
            self.vf_loss1 = tf.square(self.value_function - value_targets)
            vf_clipped = prev_vf_preds + tf.clip_by_value(
                self.value_function - prev_vf_preds,
                -config["clip_param"], config["clip_param"])
            self.vf_loss2 = tf.square(vf_clipped - value_targets)
            self.vf_loss = tf.minimum(self.vf_loss1, self.vf_loss2)
            self.mean_vf_loss = tf.reduce_mean(self.vf_loss)
            self.loss = tf.reduce_mean(
                -self.surr + kl_coeff * self.kl +
                config["vf_loss_coeff"] * self.vf_loss -
                config["entropy_coeff"] * self.entropy)
        else:
            self.mean_vf_loss = tf.constant(0.0)
            self.loss = tf.reduce_mean(
                -self.surr +
                kl_coeff * self.kl -
                config["entropy_coeff"] * self.entropy)

        self.sess = sess

        if config["use_gae"]:
            self.policy_results = [
                self.sampler, self.curr_logits, self.value_function]
        else:
            self.policy_results = [
                self.sampler, self.curr_logits, tf.constant("NA")]
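
The value branch above clips the change in the value prediction around the previous prediction, then takes the element-wise minimum of the clipped and unclipped squared errors (exactly what the tf.minimum call does). A worked NumPy example with invented numbers (clip_param of 0.3 is illustrative):

import numpy as np

clip_param = 0.3
prev_vf_preds = np.array([1.0])    # value prediction from the previous iteration
value_function = np.array([2.0])   # current value prediction
value_targets = np.array([1.1])

vf_loss1 = (value_function - value_targets) ** 2               # [0.81]
vf_clipped = prev_vf_preds + np.clip(
    value_function - prev_vf_preds, -clip_param, clip_param)   # [1.3]
vf_loss2 = (vf_clipped - value_targets) ** 2                   # [0.04]
print(np.minimum(vf_loss1, vf_loss2))                          # [0.04] (up to float rounding)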
Example #41
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model(
         get_registry(), 1, 5, {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))