def _build(self, state_input, step_input, hidden_input, name=None):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share a GRU
                (outputs, step_outputs, step_hidden, hidden_init_var) = gru(
                    name='mean_std_network',
                    gru_cell=self._mean_std_gru_cell,
                    all_input_var=state_input,
                    step_input_var=step_input,
                    step_hidden_var=hidden_input,
                    hidden_state_init=self._hidden_state_init,
                    hidden_state_init_trainable=self._hidden_state_init_trainable,
                    output_nonlinearity_layer=self._mean_std_output_nonlinearity_layer)
                with tf.variable_scope('mean_network'):
                    mean_var = outputs[..., :action_dim]
                    step_mean_var = step_outputs[..., :action_dim]
                with tf.variable_scope('log_std_network'):
                    log_std_var = outputs[..., action_dim:]
                    step_log_std_var = step_outputs[..., action_dim:]

            else:
                # separate outputs: a GRU for the mean, a parameter for the std
                # mean network
                (mean_var, step_mean_var, step_hidden, hidden_init_var) = gru(
                    name='mean_network',
                    gru_cell=self._mean_gru_cell,
                    all_input_var=state_input,
                    step_input_var=step_input,
                    step_hidden_var=hidden_input,
                    hidden_state_init=self._hidden_state_init,
                    hidden_state_init_trainable=self._hidden_state_init_trainable,
                    output_nonlinearity_layer=self._mean_output_nonlinearity_layer)
                log_std_var = parameter(state_input,
                                        length=action_dim,
                                        initializer=tf.constant_initializer(
                                            self._init_std_param),
                                        trainable=self._learn_std,
                                        name='log_std_param')
                step_log_std_var = parameter(
                    step_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(self._init_std_param),
                    trainable=self._learn_std,
                    name='step_log_std_param')

        dist = DiagonalGaussian(self._output_dim)
        rnd = tf.random.normal(shape=step_mean_var.get_shape().as_list()[1:])
        action_var = rnd * tf.exp(step_log_std_var) + step_mean_var

        return (action_var, mean_var, step_mean_var, log_std_var,
                step_log_std_var, step_hidden, hidden_init_var, dist)
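
Both the GRU build above and the tests that follow rely on a `parameter`
helper that broadcasts a (possibly trainable) vector across the leading
dimensions of an input tensor. A minimal sketch of such a helper, assuming
TF1-style graph mode and `tf.broadcast_to` (TF >= 1.12); the library's real
implementation may differ:

import tensorflow as tf

def parameter(input_var, length, initializer, trainable=True,
              name='parameter'):
    # Create a vector variable of size `length` and broadcast it across
    # all leading (batch) dimensions of `input_var`, replacing its last
    # dimension -- the shape contract the tests below check.
    with tf.variable_scope(name):
        p = tf.get_variable('parameter',
                            shape=(length, ),
                            dtype=tf.float32,
                            initializer=initializer,
                            trainable=trainable)
        batch_shape = tf.shape(input_var)[:-1]
        out_shape = tf.concat([batch_shape, [length]], axis=0)
        return tf.broadcast_to(p, out_shape)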
Example #2
    def test_param(self):
        param = parameter(input_var=self.input_vars,
                          length=3,
                          initializer=tf.constant_initializer(
                              self.initial_params))
        self.sess.run(tf.global_variables_initializer())
        p = self.sess.run(param, feed_dict=self.feed_dict)

        assert p.shape == (5, 3)
        assert np.all(p == self.initial_params)
Example #3
    def test_param(self):
        input_vars = tf.placeholder(shape=[None, 2, 3, 4], dtype=tf.float32)
        initial_params = np.array([48, 21, 33])

        params = parameter(input_var=input_vars,
                           length=3,
                           initializer=tf.constant_initializer(initial_params))

        data = np.zeros(shape=[5, 2, 3, 4])
        feed_dict = {
            input_vars: data,
        }

        self.sess.run(tf.global_variables_initializer())
        p = self.sess.run(params, feed_dict=feed_dict)

        assert p.shape[:-1] == data.shape[:-1]
        assert np.all(p[0, 0, 0, :] == initial_params)
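
Both tests assume a fixture that supplies `self.sess` (and, in Example #2,
`self.input_vars`, `self.initial_params`, and `self.feed_dict`). A
self-contained TF1-style rendering of Example #3's shape check, assuming the
`parameter` sketch above:

import numpy as np
import tensorflow as tf

input_vars = tf.placeholder(shape=[None, 2, 3, 4], dtype=tf.float32)
initial_params = np.array([48, 21, 33], dtype=np.float32)
params = parameter(input_var=input_vars,
                   length=3,
                   initializer=tf.constant_initializer(initial_params))

data = np.zeros(shape=[5, 2, 3, 4], dtype=np.float32)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    p = sess.run(params, feed_dict={input_vars: data})

# Leading dims follow the input; the trailing dim is `length`.
assert p.shape == (5, 2, 3, 3)
assert np.all(p[0, 0, 0, :] == initial_params)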
Example #4
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable
                b = tf.constant_initializer(b)
                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    output_b_init=b,
                    name='mean_std_network')
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('std_network'):
                    std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    name='mean_network')

                # std network
                if self._adaptive_std:
                    b = tf.constant_initializer(self._init_std_param)
                    std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_b_init=b,
                        name='std_network')
                else:
                    p = tf.constant_initializer(self._init_std_param)
                    std_network = parameter(state_input,
                                            length=action_dim,
                                            initializer=p,
                                            trainable=self._learn_std,
                                            name='std_network')

        mean_var = mean_network
        std_param_var = std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                pass  # std_param_var already holds the log std
            elif self._std_parameterization == 'softplus':
                # softplus(param) is the std, so take its log to keep
                # std_param_var in log space for the tf.exp() below
                std_param_var = tf.log(tf.log(1. + tf.exp(std_param_var)))
            else:
                raise NotImplementedError

        with tf.variable_scope('std_limits'):
            # clamp sequentially so both limits can apply, and compare
            # against None so a limit of 0. is not silently skipped
            std_var = std_param_var
            if self._min_std_param is not None:
                std_var = tf.maximum(std_var, self._min_std_param)
            if self._max_std_param is not None:
                std_var = tf.minimum(std_var, self._max_std_param)

        dist = DiagonalGaussian(action_dim)

        rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                               seed=deterministic.get_seed())
        action_var = rnd * tf.exp(std_var) + mean_var

        return action_var, mean_var, std_var, std_param_var, dist
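
The sampling at the end of each `_build` is the reparameterization trick:
draw eps ~ N(0, I) once, then shift and scale it, so gradients can flow
through the mean and (log) std. A framework-free NumPy sketch with
illustrative names:

import numpy as np

def sample_action(mean, log_std, rng):
    # eps carries all the randomness; mean and log_std stay
    # differentiable in a framework that tracks gradients through
    # this expression.
    eps = rng.standard_normal(np.shape(mean))
    return mean + eps * np.exp(log_std)

rng = np.random.default_rng(0)
print(sample_action(np.zeros(3), np.zeros(3), rng))  # unit-std sample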
Example #5
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable

                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=tf.constant_initializer(b),
                    name='mean_std_network',
                    layer_normalization=self._layer_normalization)
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('log_std_network'):
                    log_std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=self._output_b_init,
                    name='mean_network',
                    layer_normalization=self._layer_normalization)

                # std network
                if self._adaptive_std:
                    log_std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        hidden_w_init=self._std_hidden_w_init,
                        hidden_b_init=self._std_hidden_b_init,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_w_init=self._std_output_w_init,
                        output_b_init=tf.constant_initializer(
                            self._init_std_param),
                        name='log_std_network',
                        layer_normalization=self._layer_normalization)
                else:
                    log_std_network = parameter(
                        state_input,
                        length=action_dim,
                        initializer=tf.constant_initializer(
                            self._init_std_param),
                        trainable=self._learn_std,
                        name='log_std_network')

        mean_var = mean_network
        std_param = log_std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                log_std_var = std_param
            else:  # we know it must be softplus here
                log_std_var = tf.log(1. + tf.exp(std_param))

        with tf.variable_scope('std_limits'):
            if self._min_std_param is not None:
                log_std_var = tf.maximum(log_std_var, self._min_std_param)
            if self._max_std_param is not None:
                log_std_var = tf.minimum(log_std_var, self._max_std_param)

        dist = DiagonalGaussian(self._output_dim)

        rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                               seed=deterministic.get_seed())
        action_var = rnd * tf.exp(log_std_var) + mean_var

        return action_var, mean_var, log_std_var, std_param, dist
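
The 'exp' branch treats the raw network output as a log value, while
'softplus' squashes it through log(1 + exp(.)), which is smooth, always
positive, and roughly linear for large inputs. A small numeric sketch of
the two transforms (pure NumPy, names illustrative):

import numpy as np

def positive_transform(param, parameterization='exp'):
    # Map an unconstrained value to a positive one.
    if parameterization == 'exp':
        return np.exp(param)
    if parameterization == 'softplus':
        return np.log1p(np.exp(param))  # log(1 + exp(param))
    raise NotImplementedError

for x in (-2.0, 0.0, 2.0):
    print(x, positive_transform(x, 'exp'), positive_transform(x, 'softplus'))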
Example #6
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable
                b = tf.constant_initializer(b)
                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    output_b_init=b,
                    name='mean_std_network')
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('std_network'):
                    std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    name='mean_network')

                # std network
                if self._adaptive_std:
                    b = tf.constant_initializer(self._init_std_param)
                    std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_b_init=b,
                        name='std_network')
                else:
                    p = tf.constant_initializer(self._init_std_param)
                    std_network = parameter(state_input,
                                            length=action_dim,
                                            initializer=p,
                                            trainable=self._learn_std,
                                            name='std_network')

        mean_var = mean_network
        log_std_var = std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                pass
            elif self._std_parameterization == 'softplus':
                softplus_std_var = tf.log(1. + tf.exp(log_std_var))
                log_std_var = tf.log(softplus_std_var)
            else:
                raise NotImplementedError

        with tf.variable_scope('std_limits'):
            # compare against None so a limit of 0. is not skipped
            if self._min_std_param is not None:
                log_std_var = tf.maximum(log_std_var, self._min_std_param)
            if self._max_std_param is not None:
                log_std_var = tf.minimum(log_std_var, self._max_std_param)

        distribution = tfp.distributions.MultivariateNormalDiag(
            mean_var, tf.exp(log_std_var))

        action_var = distribution.sample(seed=ext.get_seed())

        return action_var, log_std_var, distribution
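
Example #6 returns a tfp distribution object, which downstream code can use
directly for log-likelihoods and entropies. A minimal sketch, assuming TF2
eager mode with tensorflow_probability installed (shapes illustrative):

import tensorflow as tf
import tensorflow_probability as tfp

mean = tf.zeros([4, 3])          # batch of 4, action_dim = 3
log_std = tf.fill([4, 3], -0.5)  # per-dimension log std

dist = tfp.distributions.MultivariateNormalDiag(
    loc=mean, scale_diag=tf.exp(log_std))

actions = dist.sample()             # reparameterized sample, shape [4, 3]
log_probs = dist.log_prob(actions)  # shape [4]
entropies = dist.entropy()          # shape [4]
print(log_probs.numpy(), entropies.numpy())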