Beispiel #1
0
 def __init__(self,
              output_dim,
              hidden_dim=32,
              name=None,
              hidden_nonlinearity=tf.nn.tanh,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              recurrent_nonlinearity=tf.nn.sigmoid,
              recurrent_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_nonlinearity=None,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              hidden_state_init=tf.zeros_initializer(),
              hidden_state_init_trainable=False,
              cell_state_init=tf.zeros_initializer(),
              cell_state_init_trainable=False,
              forget_bias=True,
              learn_std=True,
              init_std=1.0,
              std_share_network=False,
              layer_normalization=False):
     """Store hyper-parameters for a Gaussian LSTM model.

     Args:
         output_dim (int): Dimension of the model output.
         hidden_dim (int): LSTM hidden-state dimension.
         name (str): Model name, also used as the variable scope.
         hidden_nonlinearity (callable): Activation for the hidden layer.
         hidden_w_init (callable): Weight initializer for hidden layers.
         hidden_b_init (callable): Bias initializer for hidden layers.
         recurrent_nonlinearity (callable): Activation for recurrent gates.
         recurrent_w_init (callable): Weight initializer for recurrent
             layers.
         output_nonlinearity (callable): Activation for the output layer.
         output_w_init (callable): Weight initializer for the output layer.
         output_b_init (callable): Bias initializer for the output layer.
         hidden_state_init (callable): Initializer for the initial hidden
             state.
         hidden_state_init_trainable (bool): Whether the initial hidden
             state is trainable.
         cell_state_init (callable): Initializer for the initial cell
             state.
         cell_state_init_trainable (bool): Whether the initial cell state
             is trainable.
         forget_bias (bool): Whether to add a bias of 1 to the forget gate.
         learn_std (bool): Whether the std is trainable.
         init_std (float): Initial standard deviation.
         std_share_network (bool): Whether mean and std share a network.
         layer_normalization (bool): Whether to use layer normalization.
     """
     super().__init__(name)
     # Core sizes.
     self._output_dim = output_dim
     self._hidden_dim = hidden_dim
     # LSTM cell configuration.
     self._hidden_nonlinearity = hidden_nonlinearity
     self._hidden_w_init = hidden_w_init
     self._hidden_b_init = hidden_b_init
     self._recurrent_nonlinearity = recurrent_nonlinearity
     self._recurrent_w_init = recurrent_w_init
     self._forget_bias = forget_bias
     # Output layer configuration.
     self._output_nonlinearity = output_nonlinearity
     self._output_w_init = output_w_init
     self._output_b_init = output_b_init
     # Initial hidden/cell state handling.
     self._hidden_state_init = hidden_state_init
     self._hidden_state_init_trainable = hidden_state_init_trainable
     self._cell_state_init = cell_state_init
     self._cell_state_init_trainable = cell_state_init_trainable
     # Std parameterization: stored in log-space.
     self._learn_std = learn_std
     self._std_share_network = std_share_network
     self._layer_normalization = layer_normalization
     # pylint: disable=assignment-from-no-return
     self._init_std_param = np.log(init_std)
     self._initialize()
Beispiel #2
0
    def _initialize(self):
        """Build policy to support sampling.

        After build, get_action_*() methods will be available.

        Builds the encoder network, two policy heads (one fed by latent
        samples, one fed by the encoder's task embedding), and caches
        session callables for sampling from each.
        """
        # Placeholders shaped (batch, time, dim).
        obs_input = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, None, self.obs_dim))
        encoder_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, self._encoder.input_dim))
        latent_input = tf.compat.v1.placeholder(
            tf.float32, shape=(None, None, self._encoder.output_dim))

        # Build the encoder inside its own variable scope so its
        # variables stay grouped under the encoder's name.
        with tf.compat.v1.variable_scope(self._encoder.name):
            encoder_dist = self._encoder.build(encoder_input,
                                               name='encoder').dist

        with tf.compat.v1.variable_scope('concat_obs_latent'):
            obs_latent_input = tf.concat([obs_input, latent_input], -1)

        dist, mean_var, log_std_var = super().build(
            obs_latent_input,
            # Must be named 'default' so the TF default worker finds this
            # network — TODO confirm against the sampler implementation.
            name='default').outputs

        # Second head: observation concatenated with a latent sampled
        # from the encoder distribution (conditioned on the task input).
        embed_state_input = tf.concat([
            obs_input,
            encoder_dist.sample(seed=deterministic.get_tf_seed_stream())
        ], -1)
        dist_given_task, mean_g_t, log_std_g_t = super().build(
            embed_state_input, name='given_task').outputs

        # Pre-bound session callables: (obs, latent) -> (action, mean,
        # log_std) and (obs, task) -> (action, mean, log_std).
        self._f_dist_obs_latent = tf.compat.v1.get_default_session(
        ).make_callable([
            dist.sample(seed=deterministic.get_tf_seed_stream()), mean_var,
            log_std_var
        ],
                        feed_list=[obs_input, latent_input])

        self._f_dist_obs_task = tf.compat.v1.get_default_session(
        ).make_callable([
            dist_given_task.sample(seed=deterministic.get_tf_seed_stream()),
            mean_g_t, log_std_g_t
        ],
                        feed_list=[obs_input, encoder_input])
    def __init__(self,
                 env_spec,
                 name='CategoricalMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=tf.nn.softmax,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 layer_normalization=False):
        """Categorical MLP policy for discrete action spaces.

        Args:
            env_spec (EnvSpec): Environment specification.
            name (str): Policy name, also the variable scope.
            hidden_sizes (tuple[int]): Output dimensions of the hidden
                layers.
            hidden_nonlinearity (callable): Activation for hidden layers.
            hidden_w_init (callable): Weight initializer for hidden layers.
            hidden_b_init (callable): Bias initializer for hidden layers.
            output_nonlinearity (callable): Activation for the output
                layer (softmax produces category probabilities).
            output_w_init (callable): Weight initializer for the output
                layer.
            output_b_init (callable): Bias initializer for the output
                layer.
            layer_normalization (bool): Whether to use layer normalization.

        Raises:
            ValueError: If the action space is not akro.Discrete.
        """
        if not isinstance(env_spec.action_space, akro.Discrete):
            # Note the trailing space: these adjacent literals are
            # concatenated into one message.
            raise ValueError('CategoricalMLPPolicy only works '
                             'with akro.Discrete action space.')

        self._env_spec = env_spec
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.n

        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        # Lazily-built session callable for action probabilities.
        self._f_prob = None

        super().__init__(output_dim=self._action_dim,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         layer_normalization=layer_normalization,
                         name=name)

        self._initialize()
    def __init__(self,
                 env_spec,
                 name='ContinuousMLPQFunction',
                 hidden_sizes=(32, 32),
                 action_merge_layer=-2,
                 hidden_nonlinearity=tf.nn.relu,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 layer_normalization=False):
        """Q-function Q(s, a) implemented as an MLP.

        Args:
            env_spec (EnvSpec): Environment specification.
            name (str): Name of the Q-function, also the variable scope.
            hidden_sizes (tuple[int]): Output dimensions of hidden layers.
            action_merge_layer (int): Index of the layer at which actions
                are concatenated with the observation stream.
            hidden_nonlinearity (callable): Activation for hidden layers.
            hidden_w_init (callable): Weight initializer for hidden layers.
            hidden_b_init (callable): Bias initializer for hidden layers.
            output_nonlinearity (callable): Activation for the output
                layer.
            output_w_init (callable): Weight initializer for the output
                layer.
            output_b_init (callable): Bias initializer for the output
                layer.
            layer_normalization (bool): Whether to use layer normalization.
        """
        # Dimensions derived from the environment spec.
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim

        self._env_spec = env_spec
        self._hidden_sizes = hidden_sizes
        self._action_merge_layer = action_merge_layer
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        super().__init__(name=name,
                         output_dim=1,
                         hidden_sizes=hidden_sizes,
                         concat_layer=action_merge_layer,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         layer_normalization=layer_normalization)
        self._network = None

        self._initialize()
    def __init__(self,
                 env_spec,
                 num_seq_inputs=1,
                 name='ContinuousMLPBaseline',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 optimizer=None,
                 optimizer_args=None,
                 normalize_inputs=True):
        """MLP value baseline with a scalar output.

        Args:
            env_spec (EnvSpec): Environment specification.
            num_seq_inputs (int): Number of sequential observations
                concatenated into one input.
            name (str): Name of the baseline, also the variable scope.
            hidden_sizes (tuple[int]): Output dimensions of hidden layers.
            hidden_nonlinearity (callable): Activation for hidden layers.
            hidden_w_init (callable): Weight initializer for hidden layers.
            hidden_b_init (callable): Bias initializer for hidden layers.
            output_nonlinearity (callable): Activation for the output
                layer.
            output_w_init (callable): Weight initializer for the output
                layer.
            output_b_init (callable): Bias initializer for the output
                layer.
            optimizer (type): Optimizer class; defaults to LbfgsOptimizer.
            optimizer_args (dict): Keyword arguments for the optimizer.
            normalize_inputs (bool): Whether to normalize inputs.
        """
        self._env_spec = env_spec
        self._normalize_inputs = normalize_inputs
        self._name = name

        # Default to L-BFGS when no optimizer is supplied.
        optimizer_args = {} if optimizer_args is None else optimizer_args
        optimizer_cls = LbfgsOptimizer if optimizer is None else optimizer
        self._optimizer = make_optimizer(optimizer_cls, **optimizer_args)

        # Input is the flattened observation, repeated num_seq_inputs
        # times; output is the scalar value estimate.
        super().__init__(input_shape=(env_spec.observation_space.flat_dim *
                                      num_seq_inputs, ),
                         output_dim=1,
                         name=name,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init)

        # Filled in by _initialize().
        self._x_mean = None
        self._x_std = None
        self._y_hat = None
        self._initialize()
Beispiel #6
0
def cnn(input_var,
        filters,
        strides,
        name,
        padding,
        hidden_nonlinearity=tf.nn.relu,
        hidden_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        hidden_b_init=tf.zeros_initializer()):
    """Build a convolutional neural network (CNN).

    Note:
        Based on 'NHWC' data format: [batch, height, width, channel].

    Args:
        input_var (tf.Tensor): Input tf.Tensor to the CNN.
        filters (Tuple[Tuple[int, Tuple[int, int]], ...]): Number and
            dimension of filters, one entry per convolutional layer. For
            example, ((3, (3, 5)), (32, (3, 3))) builds two layers: the
            first with 3 channels and a 3x5 filter, the second with 32
            channels and a 3x3 filter.
        strides (tuple[int]): Sliding-window stride for each layer, e.g.
            (1, 2) gives the first layer stride 1 and the second stride 2.
        name (str): Network name, also the variable scope.
        padding (str): Padding algorithm, either 'SAME' or 'VALID'.
        hidden_nonlinearity (callable): Activation applied after each
            convolution; None keeps the layers linear.
        hidden_w_init (callable): Weight initializer for the conv layers.
        hidden_b_init (callable): Bias initializer for the conv layers.

    Return:
        tf.Tensor: The flattened output tf.Tensor of the CNN.

    """
    with tf.compat.v1.variable_scope(name):
        out = input_var
        for idx, (filter_spec, stride) in enumerate(zip(filters, strides)):
            # Stride is applied only over height and width (NHWC).
            out = _conv(out, 'h{}'.format(idx), filter_spec[1],
                        filter_spec[0], [1, stride, stride, 1],
                        hidden_w_init, hidden_b_init, padding)
            if hidden_nonlinearity is not None:
                out = hidden_nonlinearity(out)

        # Flatten all non-batch dimensions into one.
        flat_dim = tf.reduce_prod(out.get_shape()[1:].as_list())
        return tf.reshape(out, [-1, flat_dim])
Beispiel #7
0
 def _initialize_policy(self):
     """Initialize policy.

     Builds the policy network on a (batch, time, obs_dim) observation
     placeholder and caches a session callable that returns a sampled
     action, the distribution mean and the log standard deviation.
     """
     state_input = tf.compat.v1.placeholder(tf.float32,
                                            shape=(None, None,
                                                   self._obs_dim))
     dist, mean, log_std = self.build(state_input).outputs
     # Pre-bound callable on the default session: one feed, three fetches.
     self._f_dist = tf.compat.v1.get_default_session().make_callable(
         [
             dist.sample(seed=deterministic.get_tf_seed_stream()), mean,
             log_std
         ],
         feed_list=[state_input])
 def __init__(self,
              output_dim,
              hidden_dim,
              name=None,
              hidden_nonlinearity=tf.nn.tanh,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              recurrent_nonlinearity=tf.nn.sigmoid,
              recurrent_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_nonlinearity=tf.nn.softmax,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              hidden_state_init=tf.zeros_initializer(),
              hidden_state_init_trainable=False,
              cell_state_init=tf.zeros_initializer(),
              cell_state_init_trainable=False,
              forget_bias=True,
              layer_normalization=False):
     """Categorical LSTM model.

     A thin wrapper that forwards every hyper-parameter unchanged to the
     parent LSTM model; the softmax output nonlinearity makes the output
     a categorical distribution over output_dim classes.

     Args:
         output_dim (int): Dimension of the model output.
         hidden_dim (int): LSTM hidden-state dimension.
         name (str): Model name, also the variable scope.
         hidden_nonlinearity (callable): Activation for the hidden layer.
         hidden_w_init (callable): Weight initializer for hidden layers.
         hidden_b_init (callable): Bias initializer for hidden layers.
         recurrent_nonlinearity (callable): Activation for recurrent
             gates.
         recurrent_w_init (callable): Weight initializer for recurrent
             layers.
         output_nonlinearity (callable): Activation for the output layer.
         output_w_init (callable): Weight initializer for the output
             layer.
         output_b_init (callable): Bias initializer for the output layer.
         hidden_state_init (callable): Initializer for the initial hidden
             state.
         hidden_state_init_trainable (bool): Whether the initial hidden
             state is trainable.
         cell_state_init (callable): Initializer for the initial cell
             state.
         cell_state_init_trainable (bool): Whether the initial cell state
             is trainable.
         forget_bias (bool): Whether to add a bias of 1 to the forget
             gate.
         layer_normalization (bool): Whether to use layer normalization.
     """
     super().__init__(output_dim=output_dim,
                      hidden_dim=hidden_dim,
                      name=name,
                      hidden_nonlinearity=hidden_nonlinearity,
                      hidden_w_init=hidden_w_init,
                      hidden_b_init=hidden_b_init,
                      recurrent_nonlinearity=recurrent_nonlinearity,
                      recurrent_w_init=recurrent_w_init,
                      output_nonlinearity=output_nonlinearity,
                      output_w_init=output_w_init,
                      output_b_init=output_b_init,
                      hidden_state_init=hidden_state_init,
                      hidden_state_init_trainable=(
                          hidden_state_init_trainable),
                      cell_state_init=cell_state_init,
                      cell_state_init_trainable=cell_state_init_trainable,
                      forget_bias=forget_bias,
                      layer_normalization=layer_normalization)
Beispiel #9
0
 def __init__(self,
              output_dim,
              name='MLPModel',
              hidden_sizes=(32, 32),
              hidden_nonlinearity=tf.nn.relu,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              output_nonlinearity=None,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              layer_normalization=False):
     """Multi-layer perceptron model.

     Args:
         output_dim (int): Dimension of the model output.
         name (str): Model name, also the variable scope.
         hidden_sizes (tuple[int]): Output dimensions of hidden layers.
         hidden_nonlinearity (callable): Activation for hidden layers.
         hidden_w_init (callable): Weight initializer for hidden layers.
         hidden_b_init (callable): Bias initializer for hidden layers.
         output_nonlinearity (callable): Activation for the output layer.
         output_w_init (callable): Weight initializer for the output
             layer.
         output_b_init (callable): Bias initializer for the output layer.
         layer_normalization (bool): Whether to use layer normalization.
     """
     super().__init__(name)
     self._output_dim = output_dim
     # Hidden-layer configuration.
     self._hidden_sizes = hidden_sizes
     self._hidden_nonlinearity = hidden_nonlinearity
     self._hidden_w_init = hidden_w_init
     self._hidden_b_init = hidden_b_init
     # Output-layer configuration.
     self._output_nonlinearity = output_nonlinearity
     self._output_w_init = output_w_init
     self._output_b_init = output_b_init
     self._layer_normalization = layer_normalization
Beispiel #10
0
 def __init__(self,
              input_dim,
              output_dim,
              filters,
              strides,
              padding,
              name=None,
              is_image=True,
              hidden_sizes=(32, 32),
              hidden_nonlinearity=tf.nn.relu,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              output_nonlinearity=tf.nn.softmax,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              layer_normalization=False):
     """Categorical CNN model: CNN feature extractor + categorical MLP head.

     Args:
         input_dim (tuple): Dimensions of the CNN input.
         output_dim (int): Number of output categories.
         filters (tuple): Number and dimension of conv filters per layer.
         strides (tuple[int]): Stride per conv layer.
         padding (str): Padding algorithm, 'SAME' or 'VALID'.
         name (str): Model name, also the variable scope.
         is_image (bool): Whether the input is an image.
         hidden_sizes (tuple[int]): Output dimensions of MLP hidden layers.
         hidden_nonlinearity (callable): Activation shared by CNN and MLP
             hidden layers.
         hidden_w_init (callable): Weight initializer for MLP hidden
             layers.
         hidden_b_init (callable): Bias initializer for MLP hidden layers.
         output_nonlinearity (callable): Activation for the MLP output
             layer.
         output_w_init (callable): Weight initializer for the MLP output
             layer.
         output_b_init (callable): Bias initializer for the MLP output
             layer.
         layer_normalization (bool): Whether to use layer normalization
             in the MLP.
     """
     super().__init__(name)
     self._is_image = is_image
     # NOTE(review): hidden_w_init/hidden_b_init are forwarded only to the
     # MLP head, not to the CNN — confirm this is intentional.
     self._cnn_model = CNNModel(input_dim=input_dim,
                                filters=filters,
                                strides=strides,
                                padding=padding,
                                hidden_nonlinearity=hidden_nonlinearity,
                                name='CNNModel')
     self._mlp_model = CategoricalMLPModel(
         output_dim=output_dim,
         hidden_sizes=hidden_sizes,
         hidden_nonlinearity=hidden_nonlinearity,
         hidden_w_init=hidden_w_init,
         hidden_b_init=hidden_b_init,
         output_nonlinearity=output_nonlinearity,
         output_w_init=output_w_init,
         output_b_init=output_b_init,
         layer_normalization=layer_normalization,
         name='MLPModel')
    def __init__(self,
                 env_spec,
                 name='ContinuousMLPPolicy',
                 hidden_sizes=(64, 64),
                 hidden_nonlinearity=tf.nn.relu,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=tf.nn.tanh,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 layer_normalization=False):
        """Continuous MLP policy mapping observations to actions.

        Args:
            env_spec (EnvSpec): Environment specification.
            name (str): Policy name, also the variable scope.
            hidden_sizes (tuple[int]): Output dimensions of hidden layers.
            hidden_nonlinearity (callable): Activation for hidden layers.
            hidden_w_init (callable): Weight initializer for hidden layers.
            hidden_b_init (callable): Bias initializer for hidden layers.
            output_nonlinearity (callable): Activation for the output
                layer (tanh bounds the action output).
            output_w_init (callable): Weight initializer for the output
                layer.
            output_b_init (callable): Bias initializer for the output
                layer.
            layer_normalization (bool): Whether to use layer normalization.
        """
        # Dimensions derived from the environment spec.
        self._obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        self._env_spec = env_spec
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        super().__init__(output_dim=action_dim,
                         name=name,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         layer_normalization=layer_normalization)

        self._initialize()
Beispiel #12
0
 def _initialize(self):
     """Initialize encoder.

     Builds the encoder network on a (batch, time, input_dim) placeholder
     and caches a session callable returning a sampled embedding, the
     distribution mean and the log standard deviation.
     """
     embedding_input = tf.compat.v1.placeholder(tf.float32,
                                                shape=(None, None,
                                                       self._input_dim),
                                                name='default_encoder')
     with tf.compat.v1.variable_scope(self._name) as vs:
         # Keep the scope so variables can be retrieved later.
         self._variable_scope = vs
         self._network = self.model.build(embedding_input)
         # Pre-bound callable: one feed, three fetches.
         self._f_dist = tf.compat.v1.get_default_session().make_callable(
             [
                 self._network.dist.sample(
                     seed=deterministic.get_tf_seed_stream()),
                 self._network.mean, self._network.log_std
             ],
             feed_list=[embedding_input])
Beispiel #13
0
 def __init__(self,
              filters,
              strides,
              padding,
              name=None,
              hidden_nonlinearity=tf.nn.relu,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer()):
     """CNN model.

     Args:
         filters (tuple): Number and dimension of conv filters per layer.
         strides (tuple[int]): Stride per conv layer.
         padding (str): Padding algorithm, 'SAME' or 'VALID'.
         name (str): Model name, also the variable scope.
         hidden_nonlinearity (callable): Activation for conv layers.
         hidden_w_init (callable): Weight initializer for conv layers.
         hidden_b_init (callable): Bias initializer for conv layers.
     """
     super().__init__(name)
     # Convolution architecture.
     self._filters = filters
     self._strides = strides
     self._padding = padding
     # Activation and initializers.
     self._hidden_nonlinearity = hidden_nonlinearity
     self._hidden_w_init = hidden_w_init
     self._hidden_b_init = hidden_b_init
Beispiel #14
0
 def _initialize(self):
     """Initialize policy.

     Builds the network on a (batch, time) + obs_dim placeholder and
     caches a session callable returning a sampled action index and the
     category probabilities.
     """
     state_input = tf.compat.v1.placeholder(tf.float32,
                                            shape=(None, None) +
                                            self._obs_dim)
     if isinstance(self.env_spec.observation_space, akro.Image):
         # Scale pixel observations from [0, 255] to [0, 1].
         augmented_state_input = tf.cast(state_input, tf.float32)
         augmented_state_input /= 255.0
     else:
         augmented_state_input = state_input
     dist = self.build(augmented_state_input).outputs
     # The sample is presumably one-hot, so argmax over the last axis
     # recovers the action index — TODO confirm the distribution type.
     self._f_prob = tf.compat.v1.get_default_session().make_callable(
         [
             tf.argmax(dist.sample(seed=deterministic.get_tf_seed_stream()),
                       -1), dist.probs
         ],
         feed_list=[state_input])
Beispiel #15
0
 def __init__(self,
              input_dim,
              filters,
              strides,
              name=None,
              padding='SAME',
              pool_strides=(2, 2),
              pool_shapes=(2, 2),
              hidden_nonlinearity=tf.nn.relu,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer()):
     """CNN model with max pooling.

     Args:
         input_dim (tuple): Dimensions of the CNN input.
         filters (tuple): Number and dimension of conv filters per layer.
         strides (tuple[int]): Stride per conv layer.
         name (str): Model name, also the variable scope.
         padding (str): Padding algorithm, 'SAME' or 'VALID'.
         pool_strides (tuple[int]): Strides of the pooling layer(s).
         pool_shapes (tuple[int]): Shape of the pooling window(s).
         hidden_nonlinearity (callable): Activation for conv layers.
         hidden_w_init (callable): Weight initializer for conv layers.
         hidden_b_init (callable): Bias initializer for conv layers.
     """
     super().__init__(name)
     self._input_dim = input_dim
     # Convolution architecture.
     self._filters = filters
     self._strides = strides
     self._padding = padding
     # Max-pooling configuration.
     self._pool_strides = pool_strides
     self._pool_shapes = pool_shapes
     # Activation and initializers.
     self._hidden_nonlinearity = hidden_nonlinearity
     self._hidden_w_init = hidden_w_init
     self._hidden_b_init = hidden_b_init
Beispiel #16
0
    def _build(self, state_input, name=None):
        """Build model given input placeholder(s).

        Args:
            state_input (tf.Tensor): Place holder for state input.
            name (str): Inner model name, also the variable scope of the
                inner model, if exist. One example is
                garage.tf.models.Sequential.

        Return:
            tf.Tensor: Sampled action.
            tf.Tensor: Mean.
            tf.Tensor: Parameterized log_std.
            tf.Tensor: log_std.
            tfp.distributions.MultivariateNormalDiag: Distribution.

        """
        # name is unused: variable scopes below are fixed.
        del name
        action_dim = self._output_dim

        with tf.compat.v1.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share a CNN + MLP trunk; the
                # output bias packs [mean init (zeros), log-std init].
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable

                mean_std_conv = cnn(
                    input_var=state_input,
                    filters=self._filters,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    strides=self._strides,
                    padding=self._padding,
                    name='mean_std_cnn')
                # One MLP emits 2*action_dim units: mean then log-std.
                mean_std_network = mlp(
                    mean_std_conv,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=tf.constant_initializer(b),
                    name='mean_std_network',
                    layer_normalization=self._layer_normalization)
                # Split the shared output into mean / log-std halves.
                with tf.compat.v1.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.compat.v1.variable_scope('log_std_network'):
                    log_std_network = mean_std_network[..., action_dim:]

            else:
                # separate CNN+MLP networks for mean and std
                # mean network
                mean_conv = cnn(input_var=state_input,
                                filters=self._filters,
                                hidden_nonlinearity=self._hidden_nonlinearity,
                                hidden_w_init=self._hidden_w_init,
                                hidden_b_init=self._hidden_b_init,
                                strides=self._strides,
                                padding=self._padding,
                                name='mean_cnn')

                mean_network = mlp(
                    mean_conv,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=self._output_b_init,
                    name='mean_network',
                    layer_normalization=self._layer_normalization)

                # std network
                if self._adaptive_std:
                    # State-dependent std: its own CNN + MLP.
                    log_std_conv = cnn(
                        input_var=state_input,
                        filters=self._std_filters,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        hidden_w_init=self._std_hidden_w_init,
                        hidden_b_init=self._std_hidden_b_init,
                        strides=self._std_strides,
                        padding=self._std_padding,
                        name='log_std_cnn')

                    log_std_network = mlp(
                        log_std_conv,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        hidden_w_init=self._std_hidden_w_init,
                        hidden_b_init=self._std_hidden_b_init,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_w_init=self._std_output_w_init,
                        output_b_init=tf.constant_initializer(
                            self._init_std_param),
                        name='log_std_network',
                        layer_normalization=self._layer_normalization)
                else:
                    # State-independent std: a single trainable parameter
                    # vector broadcast over the batch.
                    log_std_network = parameter(
                        input_var=state_input,
                        length=action_dim,
                        initializer=tf.constant_initializer(
                            self._init_std_param),
                        trainable=self._learn_std,
                        name='log_std_network')

        mean_var = mean_network
        std_param = log_std_network

        with tf.compat.v1.variable_scope('std_limits'):
            # Clamp the raw std parameterization, not the final std.
            if self._min_std_param is not None:
                std_param = tf.maximum(std_param, self._min_std_param)
            if self._max_std_param is not None:
                std_param = tf.minimum(std_param, self._max_std_param)

        with tf.compat.v1.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                log_std_var = std_param
            else:  # we know it must be softplus here
                # log_std = log(softplus(param)) = log(log(1 + e^param))
                log_std_var = tf.math.log(tf.math.log(1. + tf.exp(std_param)))

        dist = tfp.distributions.MultivariateNormalDiag(
            loc=mean_var, scale_diag=tf.exp(log_std_var))
        # NOTE(review): noise shape drops the leading (batch) dimension
        # and relies on broadcasting over the batch — confirm intended.
        rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                               seed=deterministic.get_tf_seed_stream())
        # Reparameterized sample: mean + std * noise.
        action_var = rnd * tf.exp(log_std_var) + mean_var

        return action_var, mean_var, log_std_var, std_param, dist
    def __init__(self,
                 env_spec,
                 name='CategoricalLSTMPolicy',
                 hidden_dim=32,
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 recurrent_nonlinearity=tf.nn.sigmoid,
                 recurrent_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_nonlinearity=tf.nn.softmax,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 hidden_state_init=tf.zeros_initializer(),
                 hidden_state_init_trainable=False,
                 cell_state_init=tf.zeros_initializer(),
                 cell_state_init_trainable=False,
                 state_include_action=True,
                 forget_bias=True,
                 layer_normalization=False):
        """Categorical LSTM policy for discrete action spaces.

        Args:
            env_spec (EnvSpec): Environment specification.
            name (str): Policy name, also the variable scope.
            hidden_dim (int): LSTM hidden-state dimension.
            hidden_nonlinearity (callable): Activation for the hidden
                layer.
            hidden_w_init (callable): Weight initializer for hidden
                layers.
            hidden_b_init (callable): Bias initializer for hidden layers.
            recurrent_nonlinearity (callable): Activation for recurrent
                gates.
            recurrent_w_init (callable): Weight initializer for recurrent
                layers.
            output_nonlinearity (callable): Activation for the output
                layer.
            output_w_init (callable): Weight initializer for the output
                layer.
            output_b_init (callable): Bias initializer for the output
                layer.
            hidden_state_init (callable): Initializer for the initial
                hidden state.
            hidden_state_init_trainable (bool): Whether the initial
                hidden state is trainable.
            cell_state_init (callable): Initializer for the initial cell
                state.
            cell_state_init_trainable (bool): Whether the initial cell
                state is trainable.
            state_include_action (bool): Whether the previous action is
                appended to the observation as network input.
            forget_bias (bool): Whether to add a bias of 1 to the forget
                gate.
            layer_normalization (bool): Whether to use layer
                normalization.

        Raises:
            ValueError: If the action space is not akro.Discrete.
        """
        if not isinstance(env_spec.action_space, akro.Discrete):
            # Note the trailing space: these adjacent literals are
            # concatenated into one message.
            raise ValueError('CategoricalLSTMPolicy only works '
                             'with akro.Discrete action space.')

        self._env_spec = env_spec
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.n

        self._hidden_dim = hidden_dim
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._recurrent_nonlinearity = recurrent_nonlinearity
        self._recurrent_w_init = recurrent_w_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._hidden_state_init = hidden_state_init
        self._hidden_state_init_trainable = hidden_state_init_trainable
        self._cell_state_init = cell_state_init
        self._cell_state_init_trainable = cell_state_init_trainable
        self._forget_bias = forget_bias
        self._layer_normalization = layer_normalization
        self._state_include_action = state_include_action

        # Network input is observation (+ previous action if requested).
        if state_include_action:
            self._input_dim = self._obs_dim + self._action_dim
        else:
            self._input_dim = self._obs_dim

        self._f_step_prob = None

        super().__init__(
            output_dim=self._action_dim,
            hidden_dim=self._hidden_dim,
            name=name,
            forget_bias=forget_bias,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            recurrent_nonlinearity=recurrent_nonlinearity,
            recurrent_w_init=recurrent_w_init,
            hidden_state_init=hidden_state_init,
            hidden_state_init_trainable=hidden_state_init_trainable,
            cell_state_init=cell_state_init,
            cell_state_init_trainable=cell_state_init_trainable,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            layer_normalization=layer_normalization)

        # Per-episode sampling state, filled in by _initialize_policy().
        self._prev_actions = None
        self._prev_hiddens = None
        self._prev_cells = None
        self._init_hidden = None
        self._init_cell = None

        self._initialize_policy()
Beispiel #18
0
    def __init__(self,
                 env_spec,
                 hidden_dim=32,
                 name='GaussianGRUPolicy',
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 recurrent_nonlinearity=tf.nn.sigmoid,
                 recurrent_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 hidden_state_init=tf.zeros_initializer(),
                 hidden_state_init_trainable=False,
                 learn_std=True,
                 std_share_network=False,
                 init_std=1.0,
                 layer_normalization=False,
                 state_include_action=True):
        """Initialize a Gaussian policy with a GRU core.

        Args:
            env_spec: Environment specification; its action space must be
                a continuous akro.Box.
            hidden_dim (int): Hidden dimension for the GRU cell.
            name (str): Policy name, also the variable scope.
            hidden_nonlinearity (callable): Activation function for the
                hidden units.
            hidden_w_init (callable): Initializer for the hidden weights.
            hidden_b_init (callable): Initializer for the hidden biases.
            recurrent_nonlinearity (callable): Activation function for the
                recurrent (gate) units.
            recurrent_w_init (callable): Initializer for the recurrent
                weights.
            output_nonlinearity (callable): Activation function for the
                output layer; None keeps it linear.
            output_w_init (callable): Initializer for the output weights.
            output_b_init (callable): Initializer for the output biases.
            hidden_state_init (callable): Initializer for the initial
                hidden state.
            hidden_state_init_trainable (bool): Whether the initial hidden
                state is trainable.
            learn_std (bool): Whether the std is a trainable parameter.
            std_share_network (bool): Whether mean and std share network
                parameters.
            init_std (float): Initial value for the std.
            layer_normalization (bool): Whether to use layer normalization.
            state_include_action (bool): Whether the previous action is fed
                back as part of the network input.

        Raises:
            ValueError: If the action space is not an akro.Box.

        """
        # Guard clause: this policy only emits continuous actions.
        if not isinstance(env_spec.action_space, akro.Box):
            raise ValueError('GaussianGRUPolicy only works with '
                             'akro.Box action space, but not {}'.format(
                                 env_spec.action_space))

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        self._env_spec = env_spec
        self._obs_dim = obs_dim
        self._action_dim = action_dim
        self._hidden_dim = hidden_dim
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._recurrent_nonlinearity = recurrent_nonlinearity
        self._recurrent_w_init = recurrent_w_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._hidden_state_init = hidden_state_init
        self._hidden_state_init_trainable = hidden_state_init_trainable
        self._learn_std = learn_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._layer_normalization = layer_normalization
        self._state_include_action = state_include_action

        # Feeding the previous action back enlarges the network input.
        self._input_dim = (obs_dim + action_dim
                           if state_include_action else obs_dim)

        # Compiled step function; presumably built by
        # _initialize_policy() below — confirm there.
        self._f_step_mean_std = None

        super().__init__(
            output_dim=action_dim,
            hidden_dim=hidden_dim,
            name=name,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            recurrent_nonlinearity=recurrent_nonlinearity,
            recurrent_w_init=recurrent_w_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            hidden_state_init=hidden_state_init,
            hidden_state_init_trainable=hidden_state_init_trainable,
            layer_normalization=layer_normalization,
            learn_std=learn_std,
            std_share_network=std_share_network,
            init_std=init_std)

        # Per-rollout bookkeeping, populated during sampling.
        self._prev_actions = None
        self._prev_hiddens = None
        self._init_hidden = None

        self._initialize_policy()
Beispiel #19
0
    def _build_entropy_terms(self, i):
        """Build policy entropy tensors.

        Args:
            i (namedtuple): Collection of variables to compute policy loss.

        Returns:
            tuple:
                * tf.Tensor: Encoder entropy (scalar, gradient-stopped).
                * tf.Tensor: Inference cross-entropy.
                * tf.Tensor: Policy entropy (gradient-stopped).

        """
        pol_dist = self._policy_network.dist
        infer_dist = self._infer_network.dist
        enc_dist = self._encoder_network.dist
        with tf.name_scope('entropy_terms'):
            # 1. Encoder distribution total entropy
            with tf.name_scope('encoder_entropy'):
                encoder_dist, _, _ = self.policy.encoder.build(
                    i.task_var, name='encoder_entropy').outputs
                encoder_all_task_entropies = -encoder_dist.log_prob(
                    i.latent_var)

                # BUG FIX: `encoder_entropy` was previously assigned only
                # inside the softplus branch, raising UnboundLocalError at
                # the reduce_mean below when `self._use_softplus_entropy`
                # was False. Define it on both paths.
                encoder_entropy = encoder_all_task_entropies
                if self._use_softplus_entropy:
                    # Softplus keeps the entropy estimate non-negative
                    # for small distribution stds.
                    encoder_entropy = tf.nn.softplus(encoder_entropy)

                encoder_entropy = tf.reduce_mean(encoder_entropy,
                                                 name='encoder_entropy')
                encoder_entropy = tf.stop_gradient(encoder_entropy)

            # 2. Inference distribution cross-entropy (log-likelihood)
            with tf.name_scope('inference_ce'):
                # Build inference with trajectory windows

                traj_ll = infer_dist.log_prob(
                    enc_dist.sample(seed=deterministic.get_tf_seed_stream()),
                    name='traj_ll')

                inference_ce_raw = -traj_ll
                # Clip to a fixed range to stabilize the loss against
                # outlier likelihoods.
                inference_ce = tf.clip_by_value(inference_ce_raw, -3, 3)

                if self._use_softplus_entropy:
                    inference_ce = tf.nn.softplus(inference_ce)

                if self._stop_ce_gradient:
                    inference_ce = tf.stop_gradient(inference_ce)

            # 3. Policy path entropies
            with tf.name_scope('policy_entropy'):
                policy_entropy = -pol_dist.log_prob(i.action_var,
                                                    name='policy_log_likeli')

                # This prevents entropy from becoming negative
                # for small policy std
                if self._use_softplus_entropy:
                    policy_entropy = tf.nn.softplus(policy_entropy)

                policy_entropy = tf.stop_gradient(policy_entropy)

        # Diagnostic functions used for logging, not for optimization.
        self._f_task_entropies = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            encoder_all_task_entropies)
        self._f_encoder_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), encoder_entropy)
        self._f_inference_ce = compile_function(
            flatten_inputs(self._policy_opt_inputs),
            tf.reduce_mean(inference_ce * i.valid_var))
        self._f_policy_entropy = compile_function(
            flatten_inputs(self._policy_opt_inputs), policy_entropy)

        return encoder_entropy, inference_ce, policy_entropy
    def __init__(self,
                 env_spec,
                 filters,
                 strides,
                 hidden_sizes=(256, ),
                 action_merge_layer=-2,
                 name=None,
                 padding='SAME',
                 max_pooling=False,
                 pool_strides=(2, 2),
                 pool_shapes=(2, 2),
                 cnn_hidden_nonlinearity=tf.nn.relu,
                 hidden_nonlinearity=tf.nn.relu,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 layer_normalization=False):
        """Initialize a Q-function with a CNN torso and an MLP head.

        Args:
            env_spec: Environment specification; the observation space
                must be a 2D or 3D akro.Box (or akro.Image).
            filters: Filter specification for the convolutional layers.
            strides: Stride of each convolutional layer.
            hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) following the CNN.
            action_merge_layer (int): Index of the dense layer at which
                the action input is merged in (python-style indexing).
            name (str): Variable scope name.
            padding (str): Convolution padding mode ('SAME' or 'VALID').
            max_pooling (bool): Whether to apply max pooling after the
                convolutions.
            pool_strides (tuple[int]): Strides of the pooling layers.
            pool_shapes (tuple[int]): Shapes of the pooling windows.
            cnn_hidden_nonlinearity (callable): Activation for the
                convolutional layers.
            hidden_nonlinearity (callable): Activation for the dense
                layers.
            hidden_w_init (callable): Initializer for dense weights.
            hidden_b_init (callable): Initializer for dense biases.
            output_nonlinearity (callable): Activation for the output
                layer; None keeps it linear.
            output_w_init (callable): Initializer for output weights.
            output_b_init (callable): Initializer for output biases.
            layer_normalization (bool): Whether to use layer
                normalization.

        Raises:
            ValueError: If the observation space is not a 2D/3D akro.Box.

        """
        obs_space = env_spec.observation_space
        # Only image-like (2D/3D box) observations can go through a CNN.
        if (not isinstance(obs_space, akro.Box)
                or len(obs_space.shape) not in (2, 3)):
            raise ValueError(
                '{} can only process 2D, 3D akro.Image or'
                ' akro.Box observations, but received an env_spec with '
                'observation_space of type {} and shape {}'.format(
                    type(self).__name__,
                    type(obs_space).__name__,
                    obs_space.shape))

        # Cache constructor arguments so the object can be rebuilt.
        self._env_spec = env_spec
        self._filters = filters
        self._strides = strides
        self._hidden_sizes = hidden_sizes
        self._action_merge_layer = action_merge_layer
        self._padding = padding
        self._max_pooling = max_pooling
        self._pool_strides = pool_strides
        self._pool_shapes = pool_shapes
        self._cnn_hidden_nonlinearity = cnn_hidden_nonlinearity
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        self._obs_dim = obs_space.shape
        self._action_dim = env_spec.action_space.shape

        # Forward the constructor arguments to the underlying model.
        super().__init__(name=name,
                         filters=filters,
                         strides=strides,
                         hidden_sizes=hidden_sizes,
                         action_merge_layer=action_merge_layer,
                         padding=padding,
                         max_pooling=max_pooling,
                         pool_strides=pool_strides,
                         pool_shapes=pool_shapes,
                         cnn_hidden_nonlinearity=cnn_hidden_nonlinearity,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         layer_normalization=layer_normalization)

        self._initialize()
Beispiel #21
0
    def __init__(self,
                 env_spec,
                 name='GaussianMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        """Initialize a Gaussian MLP policy.

        Args:
            env_spec: Environment specification; its action space must be
                a continuous akro.Box.
            name (str): Policy name, also the variable scope.
            hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the mean network.
            hidden_nonlinearity (callable): Activation function for the
                hidden dense layer(s).
            hidden_b_init (callable): Initializer for hidden biases.
            hidden_w_init (callable): Initializer for hidden weights.
            output_nonlinearity (callable): Activation function for the
                output layer; None keeps it linear.
            output_w_init (callable): Initializer for output weights.
            output_b_init (callable): Initializer for output biases.
            learn_std (bool): Whether the std is a trainable parameter.
            adaptive_std (bool): Whether the std uses its own network.
            std_share_network (bool): Whether mean and std share network
                parameters.
            init_std (float): Initial value for the std.
            min_std (float): Lower bound for the std; None means no bound.
            max_std (float): Upper bound for the std; None means no bound.
            std_hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the std network — presumably only used when
                adaptive_std is True; confirm in the superclass.
            std_hidden_nonlinearity (callable): Activation for the std
                network's hidden layer(s).
            std_output_nonlinearity (callable): Activation for the std
                network's output layer.
            std_parameterization (str): How the std is parameterized,
                'exp' or 'softplus'.
            layer_normalization (bool): Whether to use layer normalization.

        Raises:
            ValueError: If the action space is not an akro.Box.

        """
        if not isinstance(env_spec.action_space, akro.Box):
            raise ValueError('GaussianMLPPolicy only works with '
                             'akro.Box action space, but not {}'.format(
                                 env_spec.action_space))

        self._env_spec = env_spec
        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim

        # Cache every constructor argument so the policy can be rebuilt.
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        # Compiled distribution function; None until built (presumably
        # by _initialize_policy() below — confirm there).
        self._f_dist = None

        super().__init__(output_dim=self._action_dim,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         learn_std=learn_std,
                         adaptive_std=adaptive_std,
                         std_share_network=std_share_network,
                         init_std=init_std,
                         min_std=min_std,
                         max_std=max_std,
                         std_hidden_sizes=std_hidden_sizes,
                         std_hidden_nonlinearity=std_hidden_nonlinearity,
                         std_output_nonlinearity=std_output_nonlinearity,
                         std_parameterization=std_parameterization,
                         layer_normalization=layer_normalization,
                         name=name)

        self._initialize_policy()
Beispiel #22
0
    def __init__(self,
                 embedding_spec,
                 name='GaussianMLPEncoder',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        """Initialize a Gaussian MLP encoder.

        Wraps a GaussianMLPModel that maps the embedding input space to
        a Gaussian distribution over the embedding output (latent) space.

        Args:
            embedding_spec: Embedding specification; must provide
                ``input_space.flat_dim`` and ``output_space.flat_dim``.
            name (str): Encoder name, also the variable scope.
            hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the mean network.
            hidden_nonlinearity (callable): Activation for hidden
                layer(s).
            hidden_w_init (callable): Initializer for hidden weights.
            hidden_b_init (callable): Initializer for hidden biases.
            output_nonlinearity (callable): Activation for the output
                layer; None keeps it linear.
            output_w_init (callable): Initializer for output weights.
            output_b_init (callable): Initializer for output biases.
            learn_std (bool): Whether the std is a trainable parameter.
            adaptive_std (bool): Whether the std uses its own network.
            std_share_network (bool): Whether mean and std share network
                parameters.
            init_std (float): Initial value for the std.
            min_std (float): Lower bound for the std; None means no bound.
            max_std (float): Upper bound for the std; None means no bound.
            std_hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the std network.
            std_hidden_nonlinearity (callable): Activation for the std
                network's hidden layer(s).
            std_output_nonlinearity (callable): Activation for the std
                network's output layer.
            std_parameterization (str): How the std is parameterized,
                'exp' or 'softplus'.
            layer_normalization (bool): Whether to use layer normalization.

        """
        super().__init__(name)
        self._embedding_spec = embedding_spec
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        # Dimensions come from the embedding spec: latent (output) and
        # flattened input.
        self._latent_dim = embedding_spec.output_space.flat_dim
        self._input_dim = embedding_spec.input_space.flat_dim
        # Both are None until built (presumably in _initialize() below).
        self._network = None
        self._f_dist = None

        # Underlying model that actually parameterizes the distribution.
        self.model = GaussianMLPModel(
            output_dim=self._latent_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            hidden_w_init=hidden_w_init,
            hidden_b_init=hidden_b_init,
            output_nonlinearity=output_nonlinearity,
            output_w_init=output_w_init,
            output_b_init=output_b_init,
            learn_std=learn_std,
            adaptive_std=adaptive_std,
            std_share_network=std_share_network,
            init_std=init_std,
            min_std=min_std,
            max_std=max_std,
            std_hidden_sizes=std_hidden_sizes,
            std_hidden_nonlinearity=std_hidden_nonlinearity,
            std_output_nonlinearity=std_output_nonlinearity,
            std_parameterization=std_parameterization,
            layer_normalization=layer_normalization,
            name='GaussianMLPModel')

        self._initialize()
Beispiel #23
0
def mlp(input_var,
        output_dim,
        hidden_sizes,
        name,
        input_var2=None,
        concat_layer=-2,
        hidden_nonlinearity=tf.nn.relu,
        hidden_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        hidden_b_init=tf.zeros_initializer(),
        output_nonlinearity=None,
        output_w_init=tf.initializers.glorot_uniform(
            seed=deterministic.get_tf_seed_stream()),
        output_b_init=tf.zeros_initializer(),
        layer_normalization=False):
    """Multi-layer perceptron (MLP).

    It maps real-valued inputs to real-valued outputs.

    Args:
        input_var (tf.Tensor): Input tf.Tensor to the MLP.
        output_dim (int): Dimension of the network output.
        hidden_sizes (list[int]): Output dimension of dense layer(s).
            For example, (32, 32) means this MLP consists of two
            hidden layers, each with 32 hidden units.
        name (str): Network name, also the variable scope.
        input_var2 (tf.Tensor): Second input tf.Tensor to the MLP if input
            needs to be concatenated with a layer in the model.
        concat_layer (int): The index of layers at which to concatenate
            input_var2 with the network. If input_var2 is not supplied,
            this argument is ignored. The indexing works like standard
            python list indexing. Index of 0 refers to the input layer
            (input_var) while an index of -1 points to the last hidden
            layer. Default parameter points to second layer from the end.
            If the model has only one layer, input_var2 is concatenated
            with that layer.
        hidden_nonlinearity (callable): Activation function for intermediate
            dense layer(s). It should return a tf.Tensor. Set it to
            None to maintain a linear activation.
        hidden_w_init (callable): Initializer function for the weight
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        hidden_b_init (callable): Initializer function for the bias
            of intermediate dense layer(s). The function should return a
            tf.Tensor.
        output_nonlinearity (callable): Activation function for output dense
            layer. It should return a tf.Tensor. Set it to None to
            maintain a linear activation.
        output_w_init (callable): Initializer function for the weight
            of output dense layer(s). The function should return a
            tf.Tensor.
        output_b_init (callable): Initializer function for the bias
            of output dense layer(s). The function should return a
            tf.Tensor.
        layer_normalization (bool): Bool for using layer normalization or not.

    Returns:
        tf.Tensor: The output tf.Tensor of the MLP.

    """
    # Total layer count includes the output layer.
    n_layers = len(hidden_sizes) + 1
    _merge_inputs = False

    if input_var2 is not None:
        _merge_inputs = True
        # Python's % already yields a result in [0, n_layers) for a
        # positive modulus, so a single modulo normalizes negative
        # (python-style) indices; the previous C-style
        # `(x % n + n) % n` double modulo was redundant.
        if n_layers > 1:
            _concat_layer = concat_layer % n_layers
        else:
            # With a single (output-only) layer, concatenate at the input.
            _concat_layer = 0

    with tf.compat.v1.variable_scope(name):
        l_hid = input_var
        for idx, hidden_size in enumerate(hidden_sizes):
            # Merge the second input just before the chosen hidden layer.
            if _merge_inputs and idx == _concat_layer:
                l_hid = tf.keras.layers.concatenate([l_hid, input_var2])

            l_hid = tf.compat.v1.layers.dense(inputs=l_hid,
                                              units=hidden_size,
                                              activation=hidden_nonlinearity,
                                              kernel_initializer=hidden_w_init,
                                              bias_initializer=hidden_b_init,
                                              name='hidden_{}'.format(idx))
            if layer_normalization:
                l_hid = tf.keras.layers.LayerNormalization()(l_hid)

        # The concat index may point just past the last hidden layer,
        # i.e. immediately before the output layer.
        if _merge_inputs and _concat_layer == len(hidden_sizes):
            l_hid = tf.keras.layers.concatenate([l_hid, input_var2])

        l_out = tf.compat.v1.layers.dense(inputs=l_hid,
                                          units=output_dim,
                                          activation=output_nonlinearity,
                                          kernel_initializer=output_w_init,
                                          bias_initializer=output_b_init,
                                          name='output')
    return l_out
    def __init__(self,
                 env_spec,
                 filters,
                 strides,
                 hidden_sizes=(256, ),
                 name=None,
                 padding='SAME',
                 max_pooling=False,
                 pool_strides=(2, 2),
                 pool_shapes=(2, 2),
                 cnn_hidden_nonlinearity=tf.nn.relu,
                 hidden_nonlinearity=tf.nn.relu,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 dueling=False,
                 layer_normalization=False):
        """Initialize a discrete-action Q-function with a CNN torso.

        Builds a CNN (optionally with max pooling) followed by an MLP
        head (optionally a dueling head) that outputs one Q-value per
        discrete action.

        Args:
            env_spec: Environment specification; the observation space
                must be a 2D or 3D akro.Box (or akro.Image), and the
                action space must be discrete (provides ``n``).
            filters: Filter specification for the convolutional layers.
            strides: Stride of each convolutional layer.
            hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) following the CNN.
            name (str): Variable scope name.
            padding (str): Convolution padding mode ('SAME' or 'VALID').
            max_pooling (bool): Whether to use CNNModelWithMaxPooling
                instead of the plain CNNModel.
            pool_strides (tuple[int]): Strides of the pooling layers.
            pool_shapes (tuple[int]): Shapes of the pooling windows.
            cnn_hidden_nonlinearity (callable): Activation for the
                convolutional layers.
            hidden_nonlinearity (callable): Activation for the dense
                layers.
            hidden_w_init (callable): Initializer for dense weights.
            hidden_b_init (callable): Initializer for dense biases.
            output_nonlinearity (callable): Activation for the output
                layer; None keeps it linear.
            output_w_init (callable): Initializer for output weights.
            output_b_init (callable): Initializer for output biases.
            dueling (bool): Whether to use a dueling (value + advantage)
                output head.
            layer_normalization (bool): Whether to use layer
                normalization.

        Raises:
            ValueError: If the observation space is not a 2D/3D akro.Box.

        """
        if not isinstance(env_spec.observation_space, akro.Box) or \
                not len(env_spec.observation_space.shape) in (2, 3):
            raise ValueError(
                '{} can only process 2D, 3D akro.Image or'
                ' akro.Box observations, but received an env_spec with '
                'observation_space of type {} and shape {}'.format(
                    type(self).__name__,
                    type(env_spec.observation_space).__name__,
                    env_spec.observation_space.shape))

        self._env_spec = env_spec
        self._action_dim = env_spec.action_space.n
        self._filters = filters
        self._strides = strides
        self._hidden_sizes = hidden_sizes
        self._padding = padding
        self._max_pooling = max_pooling
        self._pool_strides = pool_strides
        self._pool_shapes = pool_shapes
        self._cnn_hidden_nonlinearity = cnn_hidden_nonlinearity
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization
        self._dueling = dueling

        self.obs_dim = self._env_spec.observation_space.shape
        action_dim = self._env_spec.action_space.flat_dim

        # Select the CNN torso: plain convolutions, or convolutions
        # interleaved with max pooling.
        if not max_pooling:
            cnn_model = CNNModel(input_dim=self.obs_dim,
                                 filters=filters,
                                 strides=strides,
                                 padding=padding,
                                 hidden_nonlinearity=cnn_hidden_nonlinearity)
        else:
            cnn_model = CNNModelWithMaxPooling(
                input_dim=self.obs_dim,
                filters=filters,
                strides=strides,
                padding=padding,
                pool_strides=pool_strides,
                pool_shapes=pool_shapes,
                hidden_nonlinearity=cnn_hidden_nonlinearity)
        # Select the head: plain MLP, or dueling (value + advantage).
        if not dueling:
            output_model = MLPModel(output_dim=action_dim,
                                    hidden_sizes=hidden_sizes,
                                    hidden_nonlinearity=hidden_nonlinearity,
                                    hidden_w_init=hidden_w_init,
                                    hidden_b_init=hidden_b_init,
                                    output_nonlinearity=output_nonlinearity,
                                    output_w_init=output_w_init,
                                    output_b_init=output_b_init,
                                    layer_normalization=layer_normalization)
        else:
            output_model = MLPDuelingModel(
                output_dim=action_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                hidden_w_init=hidden_w_init,
                hidden_b_init=hidden_b_init,
                output_nonlinearity=output_nonlinearity,
                output_w_init=output_w_init,
                output_b_init=output_b_init,
                layer_normalization=layer_normalization)

        # The superclass composes torso and head into one network.
        super().__init__(cnn_model, output_model, name=name)
        # Built lazily (presumably in _initialize() below — confirm).
        self._network = None

        self._initialize()
Beispiel #25
0
    def __init__(self,
                 env_spec,
                 num_seq_inputs=1,
                 name='GaussianMLPBaseline',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 optimizer=None,
                 optimizer_args=None,
                 use_trust_region=True,
                 max_kl_step=0.01,
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_hidden_sizes=(32, 32),
                 std_nonlinearity=None,
                 layer_normalization=False,
                 normalize_inputs=True,
                 normalize_outputs=True,
                 subsample_factor=1.0):
        """Initialize a Gaussian MLP baseline (value-function regressor).

        Args:
            env_spec: Environment specification; its observation space
                (flattened, times num_seq_inputs) sets the input shape.
            num_seq_inputs (int): Number of sequential observations
                concatenated into one input.
            name (str): Baseline name, also the variable scope.
            hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the mean network.
            hidden_nonlinearity (callable): Activation for hidden
                layer(s).
            hidden_w_init (callable): Initializer for hidden weights.
            hidden_b_init (callable): Initializer for hidden biases.
            output_nonlinearity (callable): Activation for the output
                layer; None keeps it linear.
            output_w_init (callable): Initializer for output weights.
            output_b_init (callable): Initializer for output biases.
            optimizer: Optimizer class; None selects a default based on
                use_trust_region.
            optimizer_args (dict): Keyword arguments for the optimizer.
            use_trust_region (bool): Whether to constrain updates by a
                KL trust region.
            max_kl_step (float): KL divergence constraint per step.
            learn_std (bool): Whether the std is a trainable parameter.
            init_std (float): Initial value for the std.
            adaptive_std (bool): Whether the std uses its own network.
            std_share_network (bool): Whether mean and std share network
                parameters.
            std_hidden_sizes (tuple[int]): Output dimension of the dense
                layer(s) of the std network.
            std_nonlinearity (callable): Activation for the std network's
                hidden layer(s).
            layer_normalization (bool): Whether to use layer normalization.
            normalize_inputs (bool): Whether to normalize inputs.
            normalize_outputs (bool): Whether to normalize outputs.
            subsample_factor (float): Fraction of samples used for
                fitting.

        """
        self._env_spec = env_spec
        self._num_seq_inputs = num_seq_inputs
        self._use_trust_region = use_trust_region
        self._max_kl_step = max_kl_step
        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._subsample_factor = subsample_factor

        # Default optimizer choice depends on whether a trust region is
        # requested: penalized L-BFGS enforces the KL constraint.
        if optimizer_args is None:
            optimizer_args = dict()
        if optimizer is None:
            if use_trust_region:
                self._optimizer = make_optimizer(PenaltyLBFGSOptimizer,
                                                 **optimizer_args)
            else:
                self._optimizer = make_optimizer(LBFGSOptimizer,
                                                 **optimizer_args)
        else:
            self._optimizer = make_optimizer(optimizer, **optimizer_args)

        super().__init__(name=name,
                         input_shape=(env_spec.observation_space.flat_dim *
                                      num_seq_inputs, ),
                         output_dim=1,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         learn_std=learn_std,
                         adaptive_std=adaptive_std,
                         std_share_network=std_share_network,
                         init_std=init_std,
                         min_std=None,
                         max_std=None,
                         std_hidden_sizes=std_hidden_sizes,
                         std_hidden_nonlinearity=std_nonlinearity,
                         std_output_nonlinearity=None,
                         std_parameterization='exp',
                         layer_normalization=layer_normalization)
        # Model holding the old distribution, used when the trust region
        # is on (KL is measured against it).
        self._old_model = self.clone_model(name=name + '_old_model')
        # Normalization statistics; None until set (presumably in
        # _initialize() / fitting — confirm).
        self._x_mean = None
        self._x_std = None
        self._y_mean = None
        self._y_std = None
        self._old_network = None

        self._initialize()
Beispiel #26
0
 def __init__(self,
              output_dim,
              name=None,
              hidden_sizes=(32, 32),
              hidden_nonlinearity=tf.nn.tanh,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              output_nonlinearity=None,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              learn_std=True,
              adaptive_std=False,
              std_share_network=False,
              init_std=1.0,
              min_std=1e-6,
              max_std=None,
              std_hidden_sizes=(32, 32),
              std_hidden_nonlinearity=tf.nn.tanh,
              std_hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              std_hidden_b_init=tf.zeros_initializer(),
              std_output_nonlinearity=None,
              std_output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              std_parameterization='exp',
              layer_normalization=False):
     """Initialize a Gaussian MLP model.

     Stores the network configuration and converts the std bounds into
     the chosen parameterized space ('exp' or 'softplus').

     Raises:
         ValueError: If std_parameterization is neither 'exp' nor
             'softplus'.

     """
     # Network parameters
     super().__init__(name)
     self._hidden_sizes = hidden_sizes
     self._output_dim = output_dim
     self._learn_std = learn_std
     self._adaptive_std = adaptive_std
     self._std_share_network = std_share_network
     self._std_hidden_sizes = std_hidden_sizes
     self._init_std = init_std
     self._min_std = min_std
     self._max_std = max_std
     self._std_hidden_nonlinearity = std_hidden_nonlinearity
     self._std_hidden_w_init = std_hidden_w_init
     self._std_hidden_b_init = std_hidden_b_init
     self._std_output_nonlinearity = std_output_nonlinearity
     self._std_output_w_init = std_output_w_init
     self._std_parameterization = std_parameterization
     self._hidden_nonlinearity = hidden_nonlinearity
     self._hidden_w_init = hidden_w_init
     self._hidden_b_init = hidden_b_init
     self._output_nonlinearity = output_nonlinearity
     self._output_w_init = output_w_init
     self._output_b_init = output_b_init
     self._layer_normalization = layer_normalization
     # Transform std arguments to parameterized space
     self._init_std_param = None
     self._min_std_param = None
     self._max_std_param = None
     # pylint: disable=assignment-from-no-return
     if self._std_parameterization == 'exp':
         # std = exp(param)  =>  param = log(std)
         self._init_std_param = np.log(init_std)
         if min_std is not None:
             self._min_std_param = np.log(min_std)
         if max_std is not None:
             self._max_std_param = np.log(max_std)
     elif self._std_parameterization == 'softplus':
         # std = log(1 + exp(param))  =>  param = log(exp(std) - 1)
         self._init_std_param = np.log(np.exp(init_std) - 1)
         if min_std is not None:
             self._min_std_param = np.log(np.exp(min_std) - 1)
         if max_std is not None:
             self._max_std_param = np.log(np.exp(max_std) - 1)
     else:
         # Error message fixed: previously read
         # "should be or 'exp' or 'softplus'".
         raise ValueError("std parameterization should be 'exp' or "
                          "'softplus' but got {}".format(
                              self._std_parameterization))
Beispiel #27
0
    def __init__(self,
                 output_dim,
                 filters,
                 strides,
                 padding,
                 hidden_sizes,
                 name=None,
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_filters=(),
                 std_strides=(),
                 std_padding='SAME',
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 std_hidden_b_init=tf.zeros_initializer(),
                 std_output_nonlinearity=None,
                 std_output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 std_parameterization='exp',
                 layer_normalization=False):
        """Initialize a Gaussian CNN model.

        Stores the network hyperparameters for the mean (and, when
        ``adaptive_std`` is set, the std) network, then maps ``init_std`` /
        ``min_std`` / ``max_std`` into the chosen parameterized space.

        Args:
            output_dim (int): Dimension of the network output.
            filters, strides, padding: CNN layer configuration for the mean
                network.
            hidden_sizes: Sizes of the fully-connected hidden layers.
            name (str): Model name, also used as the variable scope.
            hidden_nonlinearity, hidden_w_init, hidden_b_init,
            output_nonlinearity, output_w_init, output_b_init: Layer
                activation functions and initializers for the mean network.
            learn_std (bool): Whether the std is trainable.
            adaptive_std (bool): Whether to use a separate std network.
            std_share_network (bool): Whether mean and std share a network.
            init_std (float): Initial std value.
            min_std (float or None): Lower bound on std; None disables it.
            max_std (float or None): Upper bound on std; None disables it.
            std_filters, std_strides, std_padding, std_hidden_sizes,
            std_hidden_nonlinearity, std_hidden_w_init, std_hidden_b_init,
            std_output_nonlinearity, std_output_w_init: Configuration for the
                adaptive std network.
            std_parameterization (str): How the std is parameterized; one of
                'exp' (network output is log(std)) or 'softplus' (std is
                log(1 + exp(output))).
            layer_normalization (bool): Whether to use layer normalization.

        Raises:
            ValueError: If ``std_parameterization`` is not 'exp' or
                'softplus'.
        """
        # Network parameters
        super().__init__(name)
        self._output_dim = output_dim
        self._filters = filters
        self._strides = strides
        self._padding = padding
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._learn_std = learn_std
        self._adaptive_std = adaptive_std
        self._std_share_network = std_share_network
        self._init_std = init_std
        self._min_std = min_std
        self._max_std = max_std
        self._std_filters = std_filters
        self._std_strides = std_strides
        self._std_padding = std_padding
        self._std_hidden_sizes = std_hidden_sizes
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_hidden_w_init = std_hidden_w_init
        self._std_hidden_b_init = std_hidden_b_init
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_output_w_init = std_output_w_init
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        # Transform std arguments to parameterized space.
        self._init_std_param = None
        self._min_std_param = None
        self._max_std_param = None
        if self._std_parameterization == 'exp':
            self._init_std_param = np.log(init_std)
            if min_std is not None:
                self._min_std_param = np.log(min_std)
            if max_std is not None:
                self._max_std_param = np.log(max_std)
        elif self._std_parameterization == 'softplus':
            self._init_std_param = np.log(np.exp(init_std) - 1)
            if min_std is not None:
                self._min_std_param = np.log(np.exp(min_std) - 1)
            if max_std is not None:
                self._max_std_param = np.log(np.exp(max_std) - 1)
        else:
            # Raise a descriptive ValueError (consistent with the other
            # Gaussian models in this file) instead of a bare
            # NotImplementedError.
            raise ValueError("std parameterization should be 'exp' or "
                             "'softplus' but got {}".format(
                                 self._std_parameterization))
    def __init__(self,
                 env_spec,
                 encoder,
                 name='GaussianMLPTaskEmbeddingPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 learn_std=True,
                 adaptive_std=False,
                 std_share_network=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=tf.nn.tanh,
                 std_output_nonlinearity=None,
                 std_parameterization='exp',
                 layer_normalization=False):
        """Initialize the task-embedding Gaussian MLP policy.

        Validates the environment spaces, stores the constructor arguments,
        builds the augmented observation space (environment observation
        concatenated with the task space), and delegates network construction
        to the parent class.
        """
        assert isinstance(env_spec.action_space, akro.Box)
        assert not isinstance(env_spec.observation_space, akro.Dict)

        self._env_spec = env_spec
        self._name = name
        self._encoder = encoder
        # self.task_space is derived from the encoder, so the encoder must be
        # assigned before the augmented space can be constructed.
        self._augmented_observation_space = akro.concat(
            self._env_spec.observation_space, self.task_space)

        # Keep copies of the remaining arguments for later use.
        (self._hidden_sizes, self._hidden_nonlinearity,
         self._hidden_w_init, self._hidden_b_init) = (
             hidden_sizes, hidden_nonlinearity, hidden_w_init, hidden_b_init)
        (self._output_nonlinearity, self._output_w_init,
         self._output_b_init) = (output_nonlinearity, output_w_init,
                                 output_b_init)
        self._learn_std, self._adaptive_std, self._std_share_network = (
            learn_std, adaptive_std, std_share_network)
        self._init_std, self._min_std, self._max_std = (init_std, min_std,
                                                        max_std)
        (self._std_hidden_sizes, self._std_hidden_nonlinearity,
         self._std_output_nonlinearity) = (std_hidden_sizes,
                                           std_hidden_nonlinearity,
                                           std_output_nonlinearity)
        self._std_parameterization = std_parameterization
        self._layer_normalization = layer_normalization

        self.obs_dim = env_spec.observation_space.flat_dim
        self.action_dim = env_spec.action_space.flat_dim

        super().__init__(name=name,
                         output_dim=self.action_dim,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         learn_std=learn_std,
                         adaptive_std=adaptive_std,
                         std_share_network=std_share_network,
                         init_std=init_std,
                         min_std=min_std,
                         max_std=max_std,
                         std_hidden_sizes=std_hidden_sizes,
                         std_hidden_nonlinearity=std_hidden_nonlinearity,
                         std_output_nonlinearity=std_output_nonlinearity,
                         std_parameterization=std_parameterization,
                         layer_normalization=layer_normalization)

        self._initialize()
# Beispiel #29 (score: 0)
    def __init__(self,
                 env_spec,
                 filters,
                 strides,
                 padding,
                 hidden_sizes,
                 hidden_nonlinearity=tf.nn.tanh,
                 hidden_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 hidden_b_init=tf.zeros_initializer(),
                 output_nonlinearity=None,
                 output_w_init=tf.initializers.glorot_uniform(
                     seed=deterministic.get_tf_seed_stream()),
                 output_b_init=tf.zeros_initializer(),
                 name='GaussianCNNBaseline',
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_filters=(),
                 std_strides=(),
                 std_padding='SAME',
                 std_hidden_sizes=(),
                 std_hidden_nonlinearity=None,
                 std_output_nonlinearity=None,
                 layer_normalization=False,
                 normalize_inputs=True,
                 normalize_outputs=True,
                 subsample_factor=1.,
                 optimizer=None,
                 optimizer_args=None,
                 use_trust_region=True,
                 max_kl_step=0.01):
        """Initialize a Gaussian CNN baseline.

        Validates the observation space, configures the optimizer, and
        delegates network construction to the parent class. The baseline
        fits a scalar output (``output_dim=1``) to image-like (2D/3D Box)
        observations.

        Raises:
            ValueError: If the observation space is not a 2D or 3D
                ``akro.Box`` (or ``akro.Image``).
        """
        # CNN baselines only make sense for image-like observations.
        if not isinstance(env_spec.observation_space, akro.Box) or \
                len(env_spec.observation_space.shape) not in (2, 3):
            raise ValueError(
                '{} can only process 2D, 3D akro.Image or'
                ' akro.Box observations, but received an env_spec with '
                'observation_space of type {} and shape {}'.format(
                    type(self).__name__,
                    type(env_spec.observation_space).__name__,
                    env_spec.observation_space.shape))

        self._env_spec = env_spec
        self._use_trust_region = use_trust_region
        self._subsample_factor = subsample_factor
        self._max_kl_step = max_kl_step
        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs

        if optimizer_args is None:
            optimizer_args = {}
        # Default to an L-BFGS optimizer; a penalized variant is used when
        # trust-region constrained optimization is requested.
        if optimizer is None:
            optimizer = (PenaltyLbfgsOptimizer
                         if use_trust_region else LbfgsOptimizer)
        self._optimizer = make_optimizer(optimizer, **optimizer_args)

        # NOTE(review): min_std/max_std are not exposed by this class and the
        # std parameterization is fixed to 'exp' — confirm this is intended.
        super().__init__(input_shape=env_spec.observation_space.shape,
                         output_dim=1,
                         filters=filters,
                         strides=strides,
                         padding=padding,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         learn_std=learn_std,
                         adaptive_std=adaptive_std,
                         std_share_network=std_share_network,
                         init_std=init_std,
                         min_std=None,
                         max_std=None,
                         std_filters=std_filters,
                         std_strides=std_strides,
                         std_padding=std_padding,
                         std_hidden_sizes=std_hidden_sizes,
                         std_hidden_nonlinearity=std_hidden_nonlinearity,
                         std_output_nonlinearity=std_output_nonlinearity,
                         std_parameterization='exp',
                         layer_normalization=layer_normalization,
                         name=name)
        # Model for the old distribution, used when the trust region is on.
        self._old_model = self.clone_model(name=name + '_old_model')
        self._old_network = None

        # Input/output normalization statistics, populated during fitting.
        self._x_mean = None
        self._x_std = None
        self._y_mean = None
        self._y_std = None

        self._initialize()
 def __init__(self,
              input_shape,
              output_dim,
              filters,
              strides,
              padding,
              hidden_sizes,
              name='GaussianCNNRegressorModel',
              hidden_nonlinearity=tf.nn.tanh,
              hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              hidden_b_init=tf.zeros_initializer(),
              output_nonlinearity=None,
              output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              output_b_init=tf.zeros_initializer(),
              learn_std=True,
              adaptive_std=False,
              std_share_network=False,
              init_std=1.0,
              min_std=1e-6,
              max_std=None,
              std_filters=(),
              std_strides=(),
              std_padding='SAME',
              std_hidden_sizes=(32, 32),
              std_hidden_nonlinearity=tf.nn.tanh,
              std_hidden_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              std_hidden_b_init=tf.zeros_initializer(),
              std_output_nonlinearity=None,
              std_output_w_init=tf.initializers.glorot_uniform(
                  seed=deterministic.get_tf_seed_stream()),
              std_parameterization='exp',
              layer_normalization=False):
     """Initialize a Gaussian CNN regressor model.

     Thin wrapper constructor: every argument except ``input_shape`` is
     forwarded unchanged to the parent Gaussian CNN model; ``input_shape``
     is stored locally (presumably used by the regressor when building its
     input placeholders — confirm against the rest of the class).
     """
     super().__init__(output_dim=output_dim,
                      filters=filters,
                      strides=strides,
                      padding=padding,
                      hidden_sizes=hidden_sizes,
                      hidden_nonlinearity=hidden_nonlinearity,
                      hidden_w_init=hidden_w_init,
                      hidden_b_init=hidden_b_init,
                      output_nonlinearity=output_nonlinearity,
                      output_w_init=output_w_init,
                      output_b_init=output_b_init,
                      learn_std=learn_std,
                      adaptive_std=adaptive_std,
                      std_share_network=std_share_network,
                      init_std=init_std,
                      min_std=min_std,
                      max_std=max_std,
                      std_filters=std_filters,
                      std_strides=std_strides,
                      std_padding=std_padding,
                      std_hidden_sizes=std_hidden_sizes,
                      std_hidden_nonlinearity=std_hidden_nonlinearity,
                      std_hidden_w_init=std_hidden_w_init,
                      std_hidden_b_init=std_hidden_b_init,
                      std_output_nonlinearity=std_output_nonlinearity,
                      std_output_w_init=std_output_w_init,
                      std_parameterization=std_parameterization,
                      layer_normalization=layer_normalization,
                      name=name)
     # Shape of a single observation/input, not including the batch axis.
     self._input_shape = input_shape