Ejemplo n.º 1
0
    def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
        self.ac_space = ac_space
        self.ac_noise_std = ac_noise_std
        self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

        with tf.variable_scope(type(self).__name__) as scope:
            # Observation normalization.
            ob_mean = tf.get_variable(
                'ob_mean', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            ob_std = tf.get_variable(
                'ob_std', self.preprocessor_shape, tf.float32,
                tf.constant_initializer(np.nan), trainable=False)
            in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
            in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
            self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
                tf.assign(ob_mean, in_mean),
                tf.assign(ob_std, in_std),
            ])

            inputs = tf.placeholder(
                tf.float32, [None] + list(self.preprocessor_shape))

            # TODO(ekl): we should do clipping in a standard RLlib preprocessor
            clipped_inputs = tf.clip_by_value(
                (inputs - ob_mean) / ob_std, -5.0, 5.0)

            # Policy network.
            dist_class, dist_dim = ModelCatalog.get_action_dist(
                self.ac_space, dist_type='deterministic')
            model = ModelCatalog.get_model(clipped_inputs, dist_dim)
            dist = dist_class(model.outputs)
            self._act = U.function([inputs], dist.sample())
        return scope
Ejemplo n.º 2
0
    def __init__(self, *args, **kwargs):
        self.args, self.kwargs = args, kwargs
        self.scope = self._initialize(*args, **kwargs)
        self.all_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                               self.scope.name)

        self.trainable_variables = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope.name)
        self.num_params = sum(int(np.prod(v.get_shape().as_list()))
                              for v in self.trainable_variables)
        self._setfromflat = U.SetFromFlat(self.trainable_variables)
        self._getflat = U.GetFlat(self.trainable_variables)

        logger.info('Trainable variables ({} parameters)'
                    .format(self.num_params))
        for v in self.trainable_variables:
            shp = v.get_shape().as_list()
            logger.info('- {} shape:{} size:{}'.format(v.name, shp,
                                                       np.prod(shp)))
        logger.info('All variables')
        for v in self.all_variables:
            shp = v.get_shape().as_list()
            logger.info('- {} shape:{} size:{}'.format(v.name, shp,
                                                       np.prod(shp)))

        placeholders = [tf.placeholder(v.value().dtype,
                                       v.get_shape().as_list())
                        for v in self.all_variables]
        self.set_all_vars = U.function(
            inputs=placeholders,
            outputs=[],
            updates=[tf.group(*[v.assign(p) for v, p
                     in zip(self.all_variables, placeholders)])]
        )