Example #1
    def _initialize(self, ob_space, ac_space, ac_bins, ac_noise_std,
                    nonlin_type, hidden_dims, connection_type):
        self.ac_space = ac_space
        self.ac_bins = ac_bins
        self.ac_noise_std = ac_noise_std
        self.hidden_dims = hidden_dims
        self.connection_type = connection_type

        assert len(ob_space.shape) == len(self.ac_space.shape) == 1
        assert (np.all(np.isfinite(self.ac_space.low))
                and np.all(np.isfinite(self.ac_space.high))), ("Action bounds "
                                                               "required")

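        # Resolve the configured nonlinearity name to the corresponding TF op.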
        self.nonlin = {
            'tanh': tf.tanh,
            'relu': tf.nn.relu,
            'lrelu': U.lrelu,
            'elu': tf.nn.elu
        }[nonlin_type]

        with tf.variable_scope(type(self).__name__) as scope:
            # Observation normalization. The stats start as NaN so the
            # policy fails loudly if _set_ob_mean_std is never called.
            ob_mean = tf.get_variable('ob_mean',
                                      ob_space.shape,
                                      tf.float32,
                                      tf.constant_initializer(np.nan),
                                      trainable=False)
            ob_std = tf.get_variable('ob_std',
                                     ob_space.shape,
                                     tf.float32,
                                     tf.constant_initializer(np.nan),
                                     trainable=False)
            in_mean = tf.placeholder(tf.float32, ob_space.shape)
            in_std = tf.placeholder(tf.float32, ob_space.shape)
            self._set_ob_mean_std = U.function([in_mean, in_std], [],
                                               updates=[
                                                   tf.assign(ob_mean, in_mean),
                                                   tf.assign(ob_std, in_std),
                                               ])

            # Policy network.
            o = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
            a = self._make_net(
                tf.clip_by_value((o - ob_mean) / ob_std, -5.0, 5.0))
            self._act = U.function([o], a)
        return scope
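
The (o - ob_mean) / ob_std transform with clipping to [-5, 5] standardizes
observations using externally supplied running statistics and bounds outliers
before they reach the network. Below is a minimal NumPy sketch of the same
transform (the name normalize_obs is illustrative, not part of the code above):

    import numpy as np

    def normalize_obs(ob, ob_mean, ob_std, clip=5.0):
        # Standardize with running statistics, then clip so extreme
        # observations cannot saturate the policy network.
        return np.clip((ob - ob_mean) / ob_std, -clip, clip)

    ob = np.array([0.3, -2.0, 80.0])
    print(normalize_obs(ob, np.zeros(3), np.array([1.0, 2.0, 10.0])))
    # [ 0.3 -1.   5. ]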
Example #2
    def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
        self.ac_space = ac_space
        self.ac_noise_std = ac_noise_std
        self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

        with tf.variable_scope(type(self).__name__) as scope:
            # Observation normalization.
            ob_mean = tf.get_variable('ob_mean',
                                      self.preprocessor_shape,
                                      tf.float32,
                                      tf.constant_initializer(np.nan),
                                      trainable=False)
            ob_std = tf.get_variable('ob_std',
                                     self.preprocessor_shape,
                                     tf.float32,
                                     tf.constant_initializer(np.nan),
                                     trainable=False)
            in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
            in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
            self._set_ob_mean_std = U.function([in_mean, in_std], [],
                                               updates=[
                                                   tf.assign(ob_mean, in_mean),
                                                   tf.assign(ob_std, in_std),
                                               ])

            inputs = tf.placeholder(tf.float32,
                                    [None] + list(self.preprocessor_shape))

            # TODO(ekl): we should do clipping in a standard RLlib preprocessor
            clipped_inputs = tf.clip_by_value((inputs - ob_mean) / ob_std,
                                              -5.0, 5.0)

            # Policy network: a deterministic action distribution, since
            # ES explores by perturbing parameters rather than by sampling
            # stochastic actions.
            dist_class, dist_dim = ModelCatalog.get_action_dist(
                self.ac_space, dist_type='deterministic')
            model = ModelCatalog.get_model(clipped_inputs, dist_dim)
            dist = dist_class(model.outputs)
            self._act = U.function([inputs], dist.sample())
        return scope
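
Both examples lean on a U.function helper to turn placeholders, outputs, and
update ops into a plain Python callable. Here is a hedged sketch of what such
a helper typically does (modeled on OpenAI Baselines' tf_util.function; the
helper in this codebase may differ in detail):

    import tensorflow as tf

    def make_function(inputs, outputs, updates=None):
        # Bundle the update ops so they run in the same session call
        # that evaluates the requested outputs.
        update_group = tf.group(*(updates or []))

        def call(*args):
            sess = tf.get_default_session()
            results = sess.run(list(outputs) + [update_group],
                               feed_dict=dict(zip(inputs, args)))
            return results[:len(outputs)]

        return call

    x = tf.placeholder(tf.float32, [])
    total = tf.Variable(0.0)
    scale = make_function([x], [x * 2.0], updates=[tf.assign_add(total, x)])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(scale(3.0))       # [6.0]
        print(sess.run(total))  # 3.0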
Example #3
    def __init__(self, *args, **kwargs):
        self.args, self.kwargs = args, kwargs
        self.scope = self._initialize(*args, **kwargs)
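        # Collect every variable created by _initialize under its scope,
        # trainable or not, so the full policy state can be serialized.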
        self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               self.scope.name)

        self.trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope.name)
        self.num_params = sum(
            int(np.prod(v.get_shape().as_list()))
            for v in self.trainable_variables)
        self._setfromflat = U.SetFromFlat(self.trainable_variables)
        self._getflat = U.GetFlat(self.trainable_variables)

        logger.info('Trainable variables ({} parameters)'.format(
            self.num_params))
        for v in self.trainable_variables:
            shp = v.get_shape().as_list()
            logger.info('- {} shape:{} size:{}'.format(v.name, shp,
                                                       np.prod(shp)))
        logger.info('All variables')
        for v in self.all_variables:
            shp = v.get_shape().as_list()
            logger.info('- {} shape:{} size:{}'.format(v.name, shp,
                                                       np.prod(shp)))

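        # One placeholder per variable lets set_all_vars overwrite every
        # weight (trainable or not) in a single session call.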
        placeholders = [
            tf.placeholder(v.value().dtype,
                           v.get_shape().as_list()) for v in self.all_variables
        ]
        self.set_all_vars = U.function(
            inputs=placeholders,
            outputs=[],
            updates=[
                tf.group(*[
                    v.assign(p)
                    for v, p in zip(self.all_variables, placeholders)
                ])
            ])
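
The flat get/set pair is the core ES plumbing: all trainable tensors are
concatenated into one 1-D vector that the optimizer can perturb and broadcast,
then written back in place. A NumPy illustration of what U.GetFlat and
U.SetFromFlat compute (the function names below are hypothetical):

    import numpy as np

    def get_flat(arrays):
        # Concatenate all parameter tensors into a single 1-D vector.
        return np.concatenate([a.ravel() for a in arrays])

    def set_from_flat(arrays, theta):
        # Write slices of the flat vector back into each tensor in place.
        start = 0
        for a in arrays:
            a[...] = theta[start:start + a.size].reshape(a.shape)
            start += a.size

    weights = [np.zeros((2, 3)), np.zeros(3)]
    theta = np.arange(9, dtype=np.float64)
    set_from_flat(weights, theta)
    assert np.array_equal(get_flat(weights), theta)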