Beispiel #1
0
 def dist_info_sym(self, obs_var, state_info_vars, name=None):
     with tf.name_scope(name, 'dist_info_sym', [obs_var, state_info_vars]):
         n_batches = tf.shape(obs_var)[0]
         n_steps = tf.shape(obs_var)[1]
         obs_var = tf.reshape(obs_var, tf.stack([n_batches, n_steps, -1]))
         if self.state_include_action:
             prev_action_var = state_info_vars['prev_action']
             all_input_var = tf.concat(
                 axis=2, values=[obs_var, prev_action_var])
         else:
             all_input_var = obs_var
         if self.feature_network is None:
             with tf.name_scope(
                     self._mean_network_name, values=[all_input_var]):
                 means = L.get_output(self.mean_network.output_layer,
                                      {self.l_input: all_input_var})
             with tf.name_scope(
                     self._std_network_name, values=[all_input_var]):
                 log_stds = L.get_output(self.l_log_std,
                                         {self.l_input: all_input_var})
         else:
             flat_input_var = tf.reshape(all_input_var,
                                         (-1, self.input_dim))
             with tf.name_scope(
                     self._mean_network_name,
                     values=[all_input_var, flat_input_var]):
                 means = L.get_output(
                     self.mean_network.output_layer, {
                         self.l_input: all_input_var,
                         self.feature_network.input_layer: flat_input_var
                     })
             with tf.name_scope(
                     self._mean_network_name,
                     values=[all_input_var, flat_input_var]):
                 log_stds = L.get_output(
                     self.l_log_std, {
                         self.l_input: all_input_var,
                         self.feature_network.input_layer: flat_input_var
                     })
         return dict(mean=means, log_std=log_stds)
Beispiel #2
0
    def __init__(
        self,
        env_spec,
        name='CategoricalMLPPolicy',
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        prob_network=None,
    ):
        """
        CategoricalMLPPolicy.

        A policy that uses a MLP to estimate a categorical distribution.

        Args:
            env_spec (garage.envs.env_spec.EnvSpec): Environment specification.
            hidden_sizes (list[int]): Output dimension of dense layer(s).
                For example, (32, 32) means the MLP of this policy consists
                of two hidden layers, each with 32 hidden units.
            hidden_nonlinearity: Activation function for
                intermediate dense layer(s).
            prob_network (tf.Tensor): manually specified network for this
                policy. If None, a MLP with the network parameters will be
                created. If not None, other network params are ignored.
        """
        assert isinstance(env_spec.action_space, akro.Discrete)

        Serializable.quick_init(self, locals())

        self.name = name
        self._prob_network_name = 'prob_network'
        with tf.variable_scope(name, 'CategoricalMLPPolicy'):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim, ),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name=self._prob_network_name,
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            with tf.name_scope(self._prob_network_name):
                prob_network_outputs = L.get_output(prob_network.output_layer)
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], prob_network_outputs)

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalMLPPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
Beispiel #3
0
    def log_likelihood_sym(self, x_var, y_var, name=None):
        with tf.name_scope(name, 'log_likelihood_sym', [x_var, y_var]):
            normalized_xs_var = (x_var - self._x_mean_var) / self._x_std_var

            with tf.name_scope(self._mean_network_name,
                               values=[normalized_xs_var]):
                normalized_means_var = L.get_output(
                    self._l_mean,
                    {self._mean_network.input_layer: normalized_xs_var})
            with tf.name_scope(self._std_network_name,
                               values=[normalized_xs_var]):
                normalized_log_stds_var = L.get_output(
                    self._l_log_std,
                    {self._mean_network.input_layer: normalized_xs_var})

            means_var = (normalized_means_var * self._y_std_var +
                         self._y_mean_var)
            log_stds_var = normalized_log_stds_var + tf.math.log(
                self._y_std_var)

            return self._dist.log_likelihood_sym(
                y_var, dict(mean=means_var, log_std=log_stds_var))
Beispiel #4
0
    def _build_net(self, reuse=None, custom_getter=None, trainable=None):
        """
        Set up q network based on class attributes.

        This function uses layers defined in garage.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            custom_getter: A customized getter object used to get variables.
            trainable: A bool indicates whether variables are trainable.
        """
        with tf.variable_scope(self.name,
                               reuse=reuse,
                               custom_getter=custom_getter):
            l_in = layers.InputLayer(shape=(None, self._obs_dim), name="obs")

            l_hidden = l_in
            for idx, hidden_size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                l_hidden = layers.DenseLayer(
                    l_hidden,
                    hidden_size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name="hidden_%d" % idx)

            l_output = layers.DenseLayer(
                l_hidden,
                self._action_dim,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name="output")

            with tf.name_scope(self._policy_network_name):
                action = layers.get_output(l_output)
                scaled_action = tf.multiply(action,
                                            self._action_bound,
                                            name="scaled_action")

        self._f_prob_online = tensor_utils.compile_function(
            inputs=[l_in.input_var], outputs=scaled_action)
        self._output_layer = l_output
        self._obs_layer = l_in

        LayersPowered.__init__(self, [l_output])
Beispiel #5
0
    def __init__(
            self,
            env_spec,
            name="CategoricalMLPPolicy",
            hidden_sizes=(32, 32),
            hidden_nonlinearity=tf.nn.tanh,
            prob_network=None,
    ):
        """
        :param env_spec: A spec for the mdp.
        :param hidden_sizes: list of sizes for the fully connected
        hidden layers
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param prob_network: manually specified network for this policy, other
         network params
        are ignored
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        Serializable.quick_init(self, locals())

        self.name = name
        self._prob_network_name = "prob_network"
        with tf.variable_scope(name, "CategoricalMLPPolicy"):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim, ),
                    output_dim=env_spec.action_space.n,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=tf.nn.softmax,
                    name=self._prob_network_name,
                )

            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            with tf.name_scope(self._prob_network_name):
                prob_network_outputs = L.get_output(prob_network.output_layer)
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], prob_network_outputs)

            self._dist = Categorical(env_spec.action_space.n)

            super(CategoricalMLPPolicy, self).__init__(env_spec)
            LayersPowered.__init__(self, [prob_network.output_layer])
Beispiel #6
0
    def build_net(self, trainable=True, name=None):
        """
        Set up q network based on class attributes.

        This function uses layers defined in garage.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            trainable: A bool indicates whether variables are trainable.
        """
        with tf.variable_scope(name):
            l_in = layers.InputLayer(shape=(None, self._obs_dim), name='obs')

            l_hidden = l_in
            for idx, hidden_size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                l_hidden = layers.DenseLayer(
                    l_hidden,
                    hidden_size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name='hidden_%d' % idx)

            l_output = layers.DenseLayer(
                l_hidden,
                self._action_dim,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name='output')

            with tf.name_scope(self._policy_network_name):
                action = layers.get_output(l_output)
                scaled_action = tf.multiply(action,
                                            self._action_bound,
                                            name='scaled_action')

        f_prob_online = tensor_utils.compile_function(inputs=[l_in.input_var],
                                                      outputs=scaled_action)
        output_layer = l_output
        obs_layer = l_in

        return f_prob_online, output_layer, obs_layer
Beispiel #7
0
    def __init__(self,
                 env_spec,
                 name='DeterministicMLPPolicy',
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.relu,
                 output_nonlinearity=tf.nn.tanh,
                 prob_network=None,
                 bn=False):
        assert isinstance(env_spec.action_space, akro.Box)

        Serializable.quick_init(self, locals())

        self._prob_network_name = 'prob_network'
        with tf.compat.v1.variable_scope(name, 'DeterministicMLPPolicy'):
            if prob_network is None:
                prob_network = MLP(
                    input_shape=(env_spec.observation_space.flat_dim, ),
                    output_dim=env_spec.action_space.flat_dim,
                    hidden_sizes=hidden_sizes,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    # batch_normalization=True,
                    name='mlp_prob_network',
                )

            with tf.name_scope(self._prob_network_name):
                prob_network_output = L.get_output(
                    prob_network.output_layer, deterministic=True)
            self._l_prob = prob_network.output_layer
            self._l_obs = prob_network.input_layer
            self._f_prob = tensor_utils.compile_function(
                [prob_network.input_layer.input_var], prob_network_output)

        self.prob_network = prob_network
        self.name = name

        # Note the deterministic=True argument. It makes sure that when getting
        # actions from single observations, we do not update params in the
        # batch normalization layers.
        # TODO: this doesn't currently work properly in the tf version so we
        # leave out batch_norm
        super(DeterministicMLPPolicy, self).__init__(env_spec)
        LayersPowered.__init__(self, [prob_network.output_layer])
    def __init__(
        self,
        env_spec,
        name="GaussianGRUPolicy",
        hidden_dim=32,
        feature_network=None,
        state_include_action=True,
        hidden_nonlinearity=tf.tanh,
        gru_layer_cls=L.GRULayer,
        learn_std=True,
        init_std=1.0,
        output_nonlinearity=None,
        std_share_network=False,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Box)

        self._mean_network_name = "mean_network"
        self._std_network_name = "std_network"

        with tf.variable_scope(name, "GaussianGRUPolicy"):
            Serializable.quick_init(self, locals())
            super(GaussianGRUPolicy, self).__init__(env_spec)

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            if state_include_action:
                input_dim = obs_dim + action_dim
            else:
                input_dim = obs_dim

            l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

            if feature_network is None:
                feature_dim = input_dim
                l_flat_feature = None
                l_feature = l_input
            else:
                feature_dim = feature_network.output_layer.output_shape[-1]
                l_flat_feature = feature_network.output_layer
                l_feature = L.OpLayer(
                    l_flat_feature,
                    extras=[l_input],
                    name="reshape_feature",
                    op=lambda flat_feature, input: tf.reshape(
                        flat_feature,
                        tf.stack([
                            tf.shape(input)[0],
                            tf.shape(input)[1], feature_dim
                        ])),
                    shape_op=lambda _, input_shape:
                    (input_shape[0], input_shape[1], feature_dim))

            if std_share_network:
                mean_network = GRUNetwork(
                    input_shape=(feature_dim, ),
                    input_layer=l_feature,
                    output_dim=2 * action_dim,
                    hidden_dim=hidden_dim,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    gru_layer_cls=gru_layer_cls,
                    name="gru_mean_network")

                l_mean = L.SliceLayer(mean_network.output_layer,
                                      slice(action_dim),
                                      name="mean_slice")

                l_step_mean = L.SliceLayer(mean_network.step_output_layer,
                                           slice(action_dim),
                                           name="step_mean_slice")

                l_log_std = L.SliceLayer(mean_network.output_layer,
                                         slice(action_dim, 2 * action_dim),
                                         name="log_std_slice")

                l_step_log_std = L.SliceLayer(mean_network.step_output_layer,
                                              slice(action_dim,
                                                    2 * action_dim),
                                              name="step_log_std_slice")
            else:
                mean_network = GRUNetwork(
                    input_shape=(feature_dim, ),
                    input_layer=l_feature,
                    output_dim=action_dim,
                    hidden_dim=hidden_dim,
                    hidden_nonlinearity=hidden_nonlinearity,
                    output_nonlinearity=output_nonlinearity,
                    gru_layer_cls=gru_layer_cls,
                    name="gru_mean_network")

                l_mean = mean_network.output_layer

                l_step_mean = mean_network.step_output_layer

                l_log_std = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

                l_step_log_std = L.ParamLayer(
                    mean_network.step_input_layer,
                    num_units=action_dim,
                    param=l_log_std.param,
                    name="step_output_log_std",
                    trainable=learn_std,
                )

            self.mean_network = mean_network
            self.feature_network = feature_network
            self.l_input = l_input
            self.state_include_action = state_include_action

            flat_input_var = tf.placeholder(dtype=tf.float32,
                                            shape=(None, input_dim),
                                            name="flat_input")
            if feature_network is None:
                feature_var = flat_input_var
            else:
                feature_var = L.get_output(
                    l_flat_feature,
                    {feature_network.input_layer: flat_input_var})

            with tf.name_scope(self._mean_network_name):
                out_step_mean, out_step_hidden_mean = L.get_output(
                    [l_step_mean, mean_network.step_hidden_layer],
                    {mean_network.step_input_layer: feature_var})
                out_step_mean = tf.identity(out_step_mean, "step_mean")
                out_step_hidden_mean = tf.identity(out_step_hidden_mean,
                                                   "step_hidden_mean")

            with tf.name_scope(self._std_network_name):
                out_step_log_std = L.get_output(
                    l_step_log_std,
                    {mean_network.step_input_layer: feature_var})
                out_step_log_std = tf.identity(out_step_log_std,
                                               "step_log_std")

            self.f_step_mean_std = tensor_utils.compile_function([
                flat_input_var,
                mean_network.step_prev_state_layer.input_var,
            ], [out_step_mean, out_step_log_std, out_step_hidden_mean])

            self.l_mean = l_mean
            self.l_log_std = l_log_std

            self.input_dim = input_dim
            self.action_dim = action_dim
            self.hidden_dim = hidden_dim

            self.prev_actions = None
            self.prev_hiddens = None
            self.dist = RecurrentDiagonalGaussian(action_dim)
            self.name = name

            out_layers = [l_mean, l_log_std, l_step_log_std]
            if feature_network is not None:
                out_layers.append(feature_network.output_layer)

            LayersPowered.__init__(self, out_layers)
    def __init__(
        self,
        input_shape,
        output_dim,
        name='CategoricalMLPRegressor',
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        max_kl_step=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
        network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param max_kl_step: KL divergence constraint for each iteration
        """
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        with tf.compat.v1.variable_scope(name, 'CategoricalMLPRegressor'):
            if optimizer is None:
                optimizer = LbfgsOptimizer()
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            self._prob_network_name = 'prob_network'
            if prob_network is None:
                prob_network = MLP(input_shape=input_shape,
                                   output_dim=output_dim,
                                   hidden_sizes=hidden_sizes,
                                   hidden_nonlinearity=hidden_nonlinearity,
                                   output_nonlinearity=tf.nn.softmax,
                                   name=self._prob_network_name)

            l_prob = prob_network.output_layer

            LayersPowered.__init__(self, [l_prob])

            xs_var = prob_network.input_layer.input_var
            ys_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                              shape=[None, output_dim],
                                              name='ys')
            old_prob_var = tf.compat.v1.placeholder(dtype=tf.float32,
                                                    shape=[None, output_dim],
                                                    name='old_prob')

            x_mean_var = tf.compat.v1.get_variable(
                name='x_mean',
                shape=(1, ) + input_shape,
                initializer=tf.constant_initializer(0., dtype=tf.float32))
            x_std_var = tf.compat.v1.get_variable(
                name='x_std',
                shape=(1, ) + input_shape,
                initializer=tf.constant_initializer(1., dtype=tf.float32))

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            with tf.name_scope(self._prob_network_name,
                               values=[normalized_xs_var]):
                prob_var = L.get_output(
                    l_prob, {prob_network.input_layer: normalized_xs_var})

            old_info_vars = dict(prob=old_prob_var)
            info_vars = dict(prob=prob_var)

            dist = self._dist = Categorical(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = tf.one_hot(tf.argmax(prob_var, axis=1),
                                   depth=output_dim)

            self.prob_network = prob_network
            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_prob = tensor_utils.compile_function([xs_var], prob_var)
            self.l_prob = l_prob

            self.optimizer.update_opt(loss=loss,
                                      target=self,
                                      network_outputs=[prob_var],
                                      inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss,
                                         target=self,
                                         network_outputs=[prob_var],
                                         inputs=[xs_var, ys_var, old_prob_var],
                                         leq_constraint=(mean_kl, max_kl_step))

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
Beispiel #10
0
 def dist_info_sym(self, obs_var, state_info_vars=None, name=None):
     with tf.name_scope(name, "dist_info_sym", [obs_var, state_info_vars]):
         with tf.name_scope(self._prob_network_name, values=[obs_var]):
             prob = L.get_output(
                 self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})
         return dict(prob=prob)
Beispiel #11
0
    def __init__(
        self,
        input_shape,
        output_dim,
        name="DeterministicMLPRegressor",
        network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.tanh,
        output_nonlinearity=None,
        optimizer=None,
        optimizer_args=None,
        normalize_inputs=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the
        mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        """
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name, "DeterministicMLPRegressor"):
            if optimizer_args is None:
                optimizer_args = dict()

            if optimizer is None:
                optimizer = LbfgsOptimizer(**optimizer_args)
            else:
                optimizer = optimizer(**optimizer_args)

            self.output_dim = output_dim
            self.optimizer = optimizer

            self._network_name = "network"
            if network is None:
                network = MLP(input_shape=input_shape,
                              output_dim=output_dim,
                              hidden_sizes=hidden_sizes,
                              hidden_nonlinearity=hidden_nonlinearity,
                              output_nonlinearity=output_nonlinearity,
                              name=self._network_name)

            l_out = network.output_layer

            LayersPowered.__init__(self, [l_out])

            xs_var = network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=[None, output_dim],
                                    name="ys")

            x_mean_var = tf.get_variable(name="x_mean",
                                         shape=(1, ) + input_shape,
                                         initializer=tf.constant_initializer(
                                             0., dtype=tf.float32))
            x_std_var = tf.get_variable(name="x_std",
                                        shape=(1, ) + input_shape,
                                        initializer=tf.constant_initializer(
                                            1., dtype=tf.float32))

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            with tf.name_scope(self._network_name, values=[normalized_xs_var]):
                fit_ys_var = L.get_output(
                    l_out, {network.input_layer: normalized_xs_var})

            loss = -tf.reduce_mean(tf.square(fit_ys_var - ys_var))

            self.f_predict = tensor_utils.compile_function([xs_var],
                                                           fit_ys_var)

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[fit_ys_var],
            )

            optimizer_args["inputs"] = [xs_var, ys_var]

            self.optimizer.update_opt(**optimizer_args)

            self.name = name
            self.l_out = l_out

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
Beispiel #12
0
 def predict_sym(self, xs, name=None):
     with tf.name_scope(name, "predict_sym", values=[xs]):
         return L.get_output(self.l_out, xs)
Beispiel #13
0
    def build_net(self, trainable=True, name=None):
        """
        Set up policy network based on class attributes.

        This function uses layers defined in garage.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            trainable: A bool indicates whether variables are trainable.
        """
        input_shape = self._env_spec.observation_space.shape
        assert len(input_shape) in [2, 3]
        if len(input_shape) == 2:
            input_shape = (1, ) + input_shape

        with tf.variable_scope(name):
            l_in = layers.InputLayer(shape=(None, self._obs_dim), name="obs")
            l_hid = layers.reshape(
                l_in, ([0], ) + input_shape, name="reshape_input")

            if self._batch_norm:
                l_hid = layers.batch_norm(l_hid)

            for idx, conv_filter, filter_size, stride, pad in zip(
                    range(len(self._conv_filters)),
                    self._conv_filters,
                    self._conv_filter_sizes,
                    self._conv_strides,
                    self._conv_pads,
            ):
                l_hid = layers.Conv2DLayer(
                    l_hid,
                    num_filters=conv_filter,
                    filter_size=filter_size,
                    stride=(stride, stride),
                    pad=pad,
                    nonlinearity=self._hidden_nonlinearity,
                    name="conv_hidden_%d" % idx,
                    weight_normalization=self._weight_normalization,
                    trainable=trainable,
                )
                if self._pooling:
                    l_hid = layers.Pool2DLayer(l_hid, pool_size=self._pool_size)
                if self._batch_norm:
                    l_hid = layers.batch_norm(l_hid)

            l_hid = layers.flatten(l_hid, name="conv_flatten")
            for idx, hidden_size in enumerate(self._hidden_sizes):
                l_hid = layers.DenseLayer(
                    l_hid,
                    num_units=hidden_size,
                    nonlinearity=self._hidden_nonlinearity,
                    name="hidden_%d" % idx,
                    weight_normalization=self._weight_normalization,
                    trainable=trainable,
                )
                if self._batch_norm:
                    l_hid = layers.batch_norm(l_hid)
            l_output = layers.DenseLayer(
                l_hid,
                num_units=self._action_dim,
                nonlinearity=self._output_nonlinearity,
                name="output",
                weight_normalization=self._weight_normalization,
                trainable=trainable,
            )

            with tf.name_scope(self._policy_network_name):
                action = layers.get_output(l_output)
                # scaled_action = tf.multiply(
                #     action, self._action_bound, name="scaled_action")

        f_prob_online = tensor_utils.compile_function(
            inputs=[l_in.input_var], outputs=action)
        output_layer = l_output
        obs_layer = l_in

        return f_prob_online, output_layer, obs_layer
    def _build_net(self, reuse=None, custom_getter=None, trainable=None):
        """
        Set up q network based on class attributes. This function uses layers
        defined in rllab.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            custom_getter: A customized getter object used to get variables.
            trainable: A bool indicates whether variables are trainable.
        """
        with tf.variable_scope(
                self.name, reuse=reuse, custom_getter=custom_getter):
            l_obs = L.InputLayer(
                shape=(None, flat_dim(self._env_spec.observation_space)),
                name="obs")
            l_action = L.InputLayer(
                shape=(None, flat_dim(self._env_spec.action_space)),
                name="actions")

            n_layers = len(self._hidden_sizes) + 1

            if n_layers > 1:
                action_merge_layer = \
                    (self._action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            l_hidden = l_obs

            for idx, size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hidden = batch_norm(l_hidden)

                if idx == action_merge_layer:
                    l_hidden = L.ConcatLayer([l_hidden, l_action])

                l_hidden = L.DenseLayer(
                    l_hidden,
                    num_units=size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name="hidden_%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_output = L.DenseLayer(
                l_hidden,
                num_units=1,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name="output")

            output_var = L.get_output(l_output)

        self._f_qval = tensor_utils.compile_function(
            [l_obs.input_var, l_action.input_var], output_var)
        self._output_layer = l_output
        self._obs_layer = l_obs
        self._action_layer = l_action

        LayersPowered.__init__(self, [l_output])
Beispiel #15
0
    def __init__(self,
                 env_spec,
                 name='CategoricalLSTMPolicy',
                 hidden_dim=32,
                 hidden_nonlinearity=tf.nn.tanh,
                 recurrent_nonlinearity=tf.nn.sigmoid,
                 recurrent_w_x_init=L.XavierUniformInitializer(),
                 recurrent_w_h_init=L.OrthogonalInitializer(),
                 output_nonlinearity=tf.nn.softmax,
                 output_w_init=L.XavierUniformInitializer(),
                 feature_network=None,
                 prob_network=None,
                 state_include_action=True,
                 forget_bias=1.0,
                 use_peepholes=False,
                 lstm_layer_cls=L.LSTMLayer):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        self._prob_network_name = 'prob_network'
        with tf.variable_scope(name, 'CategoricalLSTMPolicy'):
            Serializable.quick_init(self, locals())
            super(CategoricalLSTMPolicy, self).__init__(env_spec)

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            if state_include_action:
                input_dim = obs_dim + action_dim
            else:
                input_dim = obs_dim

            l_input = L.InputLayer(shape=(None, None, input_dim), name='input')

            if feature_network is None:
                feature_dim = input_dim
                l_flat_feature = None
                l_feature = l_input
            else:
                feature_dim = feature_network.output_layer.output_shape[-1]
                l_flat_feature = feature_network.output_layer
                l_feature = L.OpLayer(
                    l_flat_feature,
                    extras=[l_input],
                    name='reshape_feature',
                    op=lambda flat_feature, input: tf.reshape(
                        flat_feature,
                        tf.stack([
                            tf.shape(input)[0],
                            tf.shape(input)[1], feature_dim
                        ])),
                    shape_op=lambda _, input_shape:
                    (input_shape[0], input_shape[1], feature_dim))

            if prob_network is None:
                prob_network = LSTMNetwork(
                    input_shape=(feature_dim, ),
                    input_layer=l_feature,
                    output_dim=env_spec.action_space.n,
                    hidden_dim=hidden_dim,
                    hidden_nonlinearity=hidden_nonlinearity,
                    recurrent_nonlinearity=recurrent_nonlinearity,
                    recurrent_w_x_init=recurrent_w_x_init,
                    recurrent_w_h_init=recurrent_w_h_init,
                    output_nonlinearity=output_nonlinearity,
                    output_w_init=output_w_init,
                    forget_bias=forget_bias,
                    use_peepholes=use_peepholes,
                    lstm_layer_cls=lstm_layer_cls,
                    name=self._prob_network_name)

            self.prob_network = prob_network
            self.feature_network = feature_network
            self.l_input = l_input
            self.state_include_action = state_include_action

            flat_input_var = tf.placeholder(dtype=tf.float32,
                                            shape=(None, input_dim),
                                            name='flat_input')
            if feature_network is None:
                feature_var = flat_input_var
            else:
                with tf.name_scope('feature_network', values=[flat_input_var]):
                    feature_var = L.get_output(
                        l_flat_feature,
                        {feature_network.input_layer: flat_input_var})

            with tf.name_scope(self._prob_network_name, values=[feature_var]):
                out_prob_step, out_prob_hidden, out_step_cell = L.get_output(
                    [
                        prob_network.step_output_layer,
                        prob_network.step_hidden_layer,
                        prob_network.step_cell_layer
                    ], {prob_network.step_input_layer: feature_var})

            self.f_step_prob = tensor_utils.compile_function([
                flat_input_var,
                prob_network.step_prev_state_layer.input_var,
            ], [out_prob_step, out_prob_hidden, out_step_cell])

            self.input_dim = input_dim
            self.action_dim = action_dim
            self.hidden_dim = hidden_dim
            self.name = name

            self.prev_actions = None
            self.prev_hiddens = None
            self.prev_cells = None
            self.dist = RecurrentCategorical(env_spec.action_space.n)

            out_layers = [prob_network.output_layer]
            if feature_network is not None:
                out_layers.append(feature_network.output_layer)

            LayersPowered.__init__(self, out_layers)
Beispiel #16
0
 def get_action_sym(self, obs_var, name=None):
     with tf.name_scope(name, 'get_action_sym', values=[obs_var]):
         with tf.name_scope(self._prob_network_name, values=[obs_var]):
             return L.get_output(self.prob_network.output_layer, obs_var)
Beispiel #17
0
    def __init__(
        self,
        env_spec,
        name='GaussianLSTMPolicy',
        hidden_dim=32,
        hidden_nonlinearity=tf.tanh,
        recurrent_nonlinearity=tf.nn.sigmoid,
        recurrent_w_x_init=L.XavierUniformInitializer(),
        recurrent_w_h_init=L.OrthogonalInitializer(),
        output_nonlinearity=None,
        output_w_init=L.XavierUniformInitializer(),
        feature_network=None,
        state_include_action=True,
        learn_std=True,
        init_std=1.0,
        lstm_layer_cls=L.LSTMLayer,
        use_peepholes=False,
        std_share_network=False,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, akro.Box)

        self._mean_network_name = 'mean_network'
        self._std_network_name = 'std_network'
        with tf.variable_scope(name, 'GaussianLSTMPolicy'):
            Serializable.quick_init(self, locals())
            super(GaussianLSTMPolicy, self).__init__(env_spec)

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            if state_include_action:
                input_dim = obs_dim + action_dim
            else:
                input_dim = obs_dim

            l_input = L.InputLayer(shape=(None, None, input_dim), name='input')

            if feature_network is None:
                feature_dim = input_dim
                l_flat_feature = None
                l_feature = l_input
            else:
                feature_dim = feature_network.output_layer.output_shape[-1]
                l_flat_feature = feature_network.output_layer
                l_feature = L.OpLayer(
                    l_flat_feature,
                    extras=[l_input],
                    name='reshape_feature',
                    op=lambda flat_feature, input: tf.reshape(
                        flat_feature,
                        tf.stack([
                            tf.shape(input)[0],
                            tf.shape(input)[1], feature_dim
                        ])),
                    shape_op=lambda _, input_shape:
                    (input_shape[0], input_shape[1], feature_dim))

            if std_share_network:
                mean_network = LSTMNetwork(
                    input_shape=(feature_dim, ),
                    input_layer=l_feature,
                    output_dim=2 * action_dim,
                    hidden_dim=hidden_dim,
                    hidden_nonlinearity=hidden_nonlinearity,
                    recurrent_nonlinearity=recurrent_nonlinearity,
                    recurrent_w_x_init=recurrent_w_x_init,
                    recurrent_w_h_init=recurrent_w_h_init,
                    output_nonlinearity=output_nonlinearity,
                    output_w_init=output_w_init,
                    lstm_layer_cls=lstm_layer_cls,
                    name='lstm_mean_network',
                    use_peepholes=use_peepholes,
                )

                l_mean = L.SliceLayer(
                    mean_network.output_layer,
                    slice(action_dim),
                    name='mean_slice',
                )

                l_step_mean = L.SliceLayer(
                    mean_network.step_output_layer,
                    slice(action_dim),
                    name='step_mean_slice',
                )

                l_log_std = L.SliceLayer(
                    mean_network.output_layer,
                    slice(action_dim, 2 * action_dim),
                    name='log_std_slice',
                )

                l_step_log_std = L.SliceLayer(
                    mean_network.step_output_layer,
                    slice(action_dim, 2 * action_dim),
                    name='step_log_std_slice',
                )
            else:
                mean_network = LSTMNetwork(
                    input_shape=(feature_dim, ),
                    input_layer=l_feature,
                    output_dim=action_dim,
                    hidden_dim=hidden_dim,
                    hidden_nonlinearity=hidden_nonlinearity,
                    recurrent_nonlinearity=recurrent_nonlinearity,
                    recurrent_w_x_init=recurrent_w_x_init,
                    recurrent_w_h_init=recurrent_w_h_init,
                    output_nonlinearity=output_nonlinearity,
                    output_w_init=output_w_init,
                    lstm_layer_cls=lstm_layer_cls,
                    name='lstm_mean_network',
                    use_peepholes=use_peepholes,
                )

                l_mean = mean_network.output_layer

                l_step_mean = mean_network.step_output_layer

                l_log_std = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=action_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    name='output_log_std',
                    trainable=learn_std,
                )

                l_step_log_std = L.ParamLayer(
                    mean_network.step_input_layer,
                    num_units=action_dim,
                    param=l_log_std.param,
                    name='step_output_log_std',
                    trainable=learn_std,
                )

            self.mean_network = mean_network
            self.feature_network = feature_network
            self.l_input = l_input
            self.state_include_action = state_include_action
            self.name = name

            flat_input_var = tf.placeholder(dtype=tf.float32,
                                            shape=(None, input_dim),
                                            name='flat_input')
            if feature_network is None:
                feature_var = flat_input_var
            else:
                feature_var = L.get_output(
                    l_flat_feature,
                    {feature_network.input_layer: flat_input_var})

            with tf.name_scope(self._mean_network_name, values=[feature_var]):
                (out_step_mean, out_step_hidden, out_mean_cell) = L.get_output(
                    [
                        l_step_mean, mean_network.step_hidden_layer,
                        mean_network.step_cell_layer
                    ], {mean_network.step_input_layer: feature_var})
                out_step_mean = tf.identity(out_step_mean, 'step_mean')
                out_step_hidden = tf.identity(out_step_hidden, 'step_hidden')
                out_mean_cell = tf.identity(out_mean_cell, 'mean_cell')

            with tf.name_scope(self._std_network_name, values=[feature_var]):
                out_step_log_std = L.get_output(
                    l_step_log_std,
                    {mean_network.step_input_layer: feature_var})
                out_step_log_std = tf.identity(out_step_log_std,
                                               'step_log_std')

            self.f_step_mean_std = tensor_utils.compile_function([
                flat_input_var,
                mean_network.step_prev_state_layer.input_var,
            ], [
                out_step_mean, out_step_log_std, out_step_hidden, out_mean_cell
            ])

            self.l_mean = l_mean
            self.l_log_std = l_log_std

            self.input_dim = input_dim
            self.action_dim = action_dim
            self.hidden_dim = hidden_dim

            self.prev_actions = None
            self.prev_hiddens = None
            self.prev_cells = None
            self.dist = RecurrentDiagonalGaussian(action_dim)

            out_layers = [l_mean, l_log_std]
            if feature_network is not None:
                out_layers.append(feature_network.output_layer)

            LayersPowered.__init__(self, out_layers)
    def build_net(self, trainable=True, name=None):
        """
        Set up q network based on class attributes. This function uses layers
        defined in garage.tf.

        Args:
            reuse: A bool indicates whether reuse variables in the same scope.
            trainable: A bool indicates whether variables are trainable.
        """
        input_shape = self._env_spec.observation_space.shape
        assert len(input_shape) in [2, 3]
        if len(input_shape) == 2:
            input_shape = (1, ) + input_shape

        with tf.variable_scope(name):
            l_in = layers.InputLayer(shape=(None, self._obs_dim), name="obs")
            l_hid = layers.reshape(l_in, ([0], ) + input_shape,
                                   name="reshape_input")

            if self._batch_norm:
                l_hid = layers.batch_norm(l_hid)

            for idx, conv_filter, filter_size, stride, pad in zip(
                    range(len(self._conv_filters)),
                    self._conv_filters,
                    self._conv_filter_sizes,
                    self._conv_strides,
                    self._conv_pads,
            ):
                l_hid = layers.Conv2DLayer(
                    l_hid,
                    num_filters=conv_filter,
                    filter_size=filter_size,
                    stride=(stride, stride),
                    pad=pad,
                    nonlinearity=self._hidden_nonlinearity,
                    name="conv_hidden_%d" % idx,
                    weight_normalization=self._weight_normalization,
                    trainable=trainable,
                )
                if self._pooling:
                    l_hid = layers.Pool2DLayer(l_hid,
                                               pool_size=self._pool_size)
                if self._batch_norm:
                    l_hid = layers.batch_norm(l_hid)

            l_hid = layers.flatten(l_hid, name="conv_flatten")
            l_action = layers.InputLayer(shape=(None, self._action_dim),
                                         name="actions")

            n_layers = len(self._hidden_sizes) + 1
            if n_layers > 1:
                action_merge_layer = \
                    (self._action_merge_layer % n_layers + n_layers) % n_layers
            else:
                action_merge_layer = 1

            for idx, size in enumerate(self._hidden_sizes):
                if self._batch_norm:
                    l_hid = batch_norm(l_hid)

                if idx == action_merge_layer:
                    l_hid = layers.ConcatLayer([l_hid, l_action])

                l_hid = layers.DenseLayer(
                    l_hid,
                    num_units=size,
                    nonlinearity=self._hidden_nonlinearity,
                    trainable=trainable,
                    name="hidden_%d" % (idx + 1))

            if action_merge_layer == n_layers:
                l_hid = layers.ConcatLayer([l_hid, l_action])

            l_output = layers.DenseLayer(
                l_hid,
                num_units=1,
                nonlinearity=self._output_nonlinearity,
                trainable=trainable,
                name="output")

            output_var = layers.get_output(l_output)

        f_qval = tensor_utils.compile_function(
            [l_in.input_var, l_action.input_var], output_var)
        output_layer = l_output
        obs_layer = l_in
        action_layer = l_action

        return f_qval, output_layer, obs_layer, action_layer
    def __init__(self,
                 input_shape,
                 output_dim,
                 name="GaussianMLPRegressor",
                 mean_network=None,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=tf.nn.tanh,
                 optimizer=None,
                 optimizer_args=None,
                 use_trust_region=True,
                 max_kl_step=0.01,
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_hidden_sizes=(32, 32),
                 std_nonlinearity=None,
                 normalize_inputs=True,
                 normalize_outputs=True,
                 subsample_factor=1.0):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
         network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
         mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param max_kl_step: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only
         effective if adaptive_std is False. If adaptive_std is True, this
         parameter is ignored, and the weights for the std network are always
         earned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the
         std network. Only used if `std_share_network` is False. It defaults to
         the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std
         network. Only used if `std_share_network` is False. It defaults to the
         same non-linearity as the mean.
        """
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())
        self._mean_network_name = "mean_network"
        self._std_network_name = "std_network"

        with tf.variable_scope(name):
            if optimizer_args is None:
                optimizer_args = dict()

            if optimizer is None:
                if use_trust_region:
                    optimizer = PenaltyLbfgsOptimizer(**optimizer_args)
                else:
                    optimizer = LbfgsOptimizer(**optimizer_args)
            else:
                optimizer = optimizer(**optimizer_args)

            self._optimizer = optimizer
            self._subsample_factor = subsample_factor

            if mean_network is None:
                if std_share_network:
                    mean_network = MLP(
                        name="mean_network",
                        input_shape=input_shape,
                        output_dim=2 * output_dim,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=None,
                    )
                    l_mean = L.SliceLayer(
                        mean_network.output_layer,
                        slice(output_dim),
                        name="mean_slice",
                    )
                else:
                    mean_network = MLP(
                        name="mean_network",
                        input_shape=input_shape,
                        output_dim=output_dim,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=None,
                    )
                    l_mean = mean_network.output_layer

            if adaptive_std:
                l_log_std = MLP(
                    name="log_std_network",
                    input_shape=input_shape,
                    input_var=mean_network.input_layer.input_var,
                    output_dim=output_dim,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_nonlinearity,
                    output_nonlinearity=None,
                ).output_layer
            elif std_share_network:
                l_log_std = L.SliceLayer(
                    mean_network.output_layer,
                    slice(output_dim, 2 * output_dim),
                    name="log_std_slice",
                )
            else:
                l_log_std = L.ParamLayer(
                    mean_network.input_layer,
                    num_units=output_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    name="output_log_std",
                    trainable=learn_std,
                )

            LayersPowered.__init__(self, [l_mean, l_log_std])

            xs_var = mean_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    name="ys",
                                    shape=(None, output_dim))
            old_means_var = tf.placeholder(dtype=tf.float32,
                                           name="ys",
                                           shape=(None, output_dim))
            old_log_stds_var = tf.placeholder(dtype=tf.float32,
                                              name="old_log_stds",
                                              shape=(None, output_dim))

            x_mean_var = tf.Variable(
                np.zeros((1, ) + input_shape, dtype=np.float32),
                name="x_mean",
            )
            x_std_var = tf.Variable(
                np.ones((1, ) + input_shape, dtype=np.float32),
                name="x_std",
            )
            y_mean_var = tf.Variable(
                np.zeros((1, output_dim), dtype=np.float32),
                name="y_mean",
            )
            y_std_var = tf.Variable(
                np.ones((1, output_dim), dtype=np.float32),
                name="y_std",
            )

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var
            normalized_ys_var = (ys_var - y_mean_var) / y_std_var

            with tf.name_scope(self._mean_network_name,
                               values=[normalized_xs_var]):
                normalized_means_var = L.get_output(
                    l_mean, {mean_network.input_layer: normalized_xs_var})
            with tf.name_scope(self._std_network_name,
                               values=[normalized_xs_var]):
                normalized_log_stds_var = L.get_output(
                    l_log_std, {mean_network.input_layer: normalized_xs_var})

            means_var = normalized_means_var * y_std_var + y_mean_var
            log_stds_var = normalized_log_stds_var + tf.log(y_std_var)

            normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
            normalized_old_log_stds_var = old_log_stds_var - tf.log(y_std_var)

            dist = self._dist = DiagonalGaussian(output_dim)

            normalized_dist_info_vars = dict(mean=normalized_means_var,
                                             log_std=normalized_log_stds_var)

            mean_kl = tf.reduce_mean(
                dist.kl_sym(
                    dict(mean=normalized_old_means_var,
                         log_std=normalized_old_log_stds_var),
                    normalized_dist_info_vars,
                ))

            loss = -tf.reduce_mean(
                dist.log_likelihood_sym(normalized_ys_var,
                                        normalized_dist_info_vars))

            self._f_predict = tensor_utils.compile_function([xs_var],
                                                            means_var)
            self._f_pdists = tensor_utils.compile_function(
                [xs_var], [means_var, log_stds_var])
            self._l_mean = l_mean
            self._l_log_std = l_log_std

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[
                    normalized_means_var, normalized_log_stds_var
                ],
            )

            if use_trust_region:
                optimizer_args["leq_constraint"] = (mean_kl, max_kl_step)
                optimizer_args["inputs"] = [
                    xs_var, ys_var, old_means_var, old_log_stds_var
                ]
            else:
                optimizer_args["inputs"] = [xs_var, ys_var]

            self._optimizer.update_opt(**optimizer_args)

            self._use_trust_region = use_trust_region
            self._name = name

            self._normalize_inputs = normalize_inputs
            self._normalize_outputs = normalize_outputs
            self._mean_network = mean_network
            self._x_mean_var = x_mean_var
            self._x_std_var = x_std_var
            self._y_mean_var = y_mean_var
            self._y_std_var = y_std_var

            # Optionally create assign operations for normalization
            if self._normalize_inputs:
                self._x_mean_var_ph = tf.placeholder(
                    shape=(1, ) + input_shape,
                    dtype=tf.float32,
                )
                self._x_std_var_ph = tf.placeholder(
                    shape=(1, ) + input_shape,
                    dtype=tf.float32,
                )
                self._assign_x_mean = tf.assign(self._x_mean_var,
                                                self._x_mean_var_ph)
                self._assign_x_std = tf.assign(self._x_std_var,
                                               self._x_std_var_ph)
            if self._normalize_outputs:
                self._y_mean_var_ph = tf.placeholder(
                    shape=(1, output_dim),
                    dtype=tf.float32,
                )
                self._y_std_var_ph = tf.placeholder(
                    shape=(1, output_dim),
                    dtype=tf.float32,
                )
                self._assign_y_mean = tf.assign(self._y_mean_var,
                                                self._y_mean_var_ph)
                self._assign_y_std = tf.assign(self._y_std_var,
                                               self._y_std_var_ph)
Beispiel #20
0
    def __init__(
        self,
        env_spec,
        name="CategoricalGRUPolicy",
        hidden_dim=32,
        feature_network=None,
        state_include_action=True,
        hidden_nonlinearity=tf.tanh,
        gru_layer_cls=L.GRULayer,
    ):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)

        self._prob_network_name = "prob_network"
        with tf.variable_scope(name, "CategoricalGRUPolicy"):
            Serializable.quick_init(self, locals())
            super(CategoricalGRUPolicy, self).__init__(env_spec)

            obs_dim = env_spec.observation_space.flat_dim
            action_dim = env_spec.action_space.flat_dim

            if state_include_action:
                input_dim = obs_dim + action_dim
            else:
                input_dim = obs_dim

            l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

            if feature_network is None:
                feature_dim = input_dim
                l_flat_feature = None
                l_feature = l_input
            else:
                feature_dim = feature_network.output_layer.output_shape[-1]
                l_flat_feature = feature_network.output_layer
                l_feature = L.OpLayer(
                    l_flat_feature,
                    extras=[l_input],
                    name="reshape_feature",
                    op=lambda flat_feature, input: tf.reshape(
                        flat_feature,
                        tf.stack([
                            tf.shape(input)[0],
                            tf.shape(input)[1], feature_dim
                        ])),
                    shape_op=lambda _, input_shape:
                    (input_shape[0], input_shape[1], feature_dim))

            prob_network = GRUNetwork(input_shape=(feature_dim, ),
                                      input_layer=l_feature,
                                      output_dim=env_spec.action_space.n,
                                      hidden_dim=hidden_dim,
                                      hidden_nonlinearity=hidden_nonlinearity,
                                      output_nonlinearity=tf.nn.softmax,
                                      gru_layer_cls=gru_layer_cls,
                                      name=self._prob_network_name)

            self.prob_network = prob_network
            self.feature_network = feature_network
            self.l_input = l_input
            self.state_include_action = state_include_action

            flat_input_var = tf.placeholder(dtype=tf.float32,
                                            shape=(None, input_dim),
                                            name="flat_input")
            if feature_network is None:
                feature_var = flat_input_var
            else:
                with tf.name_scope("feature_network", values=[flat_input_var]):
                    feature_var = L.get_output(
                        l_flat_feature,
                        {feature_network.input_layer: flat_input_var})

            with tf.name_scope(self._prob_network_name, values=[feature_var]):
                out_prob_step, out_prob_hidden = L.get_output(
                    [
                        prob_network.step_output_layer,
                        prob_network.step_hidden_layer
                    ], {prob_network.step_input_layer: feature_var})
                out_prob_step = tf.identity(out_prob_step, "prob_step_output")
                out_prob_hidden = tf.identity(out_prob_hidden,
                                              "prob_step_hidden")

            self.f_step_prob = tensor_utils.compile_function(
                [flat_input_var, prob_network.step_prev_state_layer.input_var],
                [out_prob_step, out_prob_hidden])

            self.input_dim = input_dim
            self.action_dim = action_dim
            self.hidden_dim = hidden_dim
            self.name = name

            self.prev_actions = None
            self.prev_hiddens = None
            self.dist = RecurrentCategorical(env_spec.action_space.n)

            out_layers = [prob_network.output_layer]
            if feature_network is not None:
                out_layers.append(feature_network.output_layer)

            LayersPowered.__init__(self, out_layers)
    def __init__(
        self,
        input_shape,
        output_dim,
        name="BernoulliMLPRegressor",
        hidden_sizes=(32, 32),
        hidden_nonlinearity=tf.nn.relu,
        optimizer=None,
        tr_optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        no_initial_trust_region=True,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean
        network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the
        mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        with tf.variable_scope(name):

            if optimizer is None:
                optimizer = LbfgsOptimizer(name="optimizer")
            if tr_optimizer is None:
                tr_optimizer = ConjugateGradientOptimizer()

            self.output_dim = output_dim
            self.optimizer = optimizer
            self.tr_optimizer = tr_optimizer

            p_network = MLP(input_shape=input_shape,
                            output_dim=output_dim,
                            hidden_sizes=hidden_sizes,
                            hidden_nonlinearity=hidden_nonlinearity,
                            output_nonlinearity=tf.nn.sigmoid,
                            name="p_network")

            l_p = p_network.output_layer

            LayersPowered.__init__(self, [l_p])

            xs_var = p_network.input_layer.input_var
            ys_var = tf.placeholder(dtype=tf.float32,
                                    shape=(None, output_dim),
                                    name="ys")
            old_p_var = tf.placeholder(dtype=tf.float32,
                                       shape=(None, output_dim),
                                       name="old_p")

            x_mean_var = tf.get_variable(name="x_mean",
                                         initializer=tf.zeros_initializer(),
                                         shape=(1, ) + input_shape)
            x_std_var = tf.get_variable(name="x_std",
                                        initializer=tf.ones_initializer(),
                                        shape=(1, ) + input_shape)

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var

            p_var = L.get_output(l_p,
                                 {p_network.input_layer: normalized_xs_var})

            old_info_vars = dict(p=old_p_var)
            info_vars = dict(p=p_var)

            dist = self._dist = Bernoulli(output_dim)

            mean_kl = tf.reduce_mean(dist.kl_sym(old_info_vars, info_vars))

            loss = -tf.reduce_mean(dist.log_likelihood_sym(ys_var, info_vars))

            predicted = p_var >= 0.5

            self.f_predict = tensor_utils.compile_function([xs_var], predicted)
            self.f_p = tensor_utils.compile_function([xs_var], p_var)
            self.l_p = l_p

            self.optimizer.update_opt(loss=loss,
                                      target=self,
                                      network_outputs=[p_var],
                                      inputs=[xs_var, ys_var])
            self.tr_optimizer.update_opt(loss=loss,
                                         target=self,
                                         network_outputs=[p_var],
                                         inputs=[xs_var, ys_var, old_p_var],
                                         leq_constraint=(mean_kl, step_size))

            self.use_trust_region = use_trust_region
            self.name = name

            self.normalize_inputs = normalize_inputs
            self.x_mean_var = x_mean_var
            self.x_std_var = x_std_var
            self.first_optimized = not no_initial_trust_region
    def __init__(self,
                 input_shape,
                 output_dim,
                 conv_filters,
                 conv_filter_sizes,
                 conv_strides,
                 conv_pads,
                 hidden_sizes,
                 hidden_nonlinearity=tf.nn.tanh,
                 output_nonlinearity=None,
                 name='GaussianConvRegressor',
                 mean_network=None,
                 learn_std=True,
                 init_std=1.0,
                 adaptive_std=False,
                 std_share_network=False,
                 std_conv_filters=[],
                 std_conv_filter_sizes=[],
                 std_conv_strides=[],
                 std_conv_pads=[],
                 std_hidden_sizes=[],
                 std_hidden_nonlinearity=None,
                 std_output_nonlinearity=None,
                 normalize_inputs=True,
                 normalize_outputs=True,
                 subsample_factor=1.,
                 optimizer=None,
                 optimizer_args=dict(),
                 use_trust_region=True,
                 max_kl_step=0.01):
        Parameterized.__init__(self)
        Serializable.quick_init(self, locals())
        self._mean_network_name = 'mean_network'
        self._std_network_name = 'std_network'

        with tf.compat.v1.variable_scope(name):
            if optimizer is None:
                if use_trust_region:
                    optimizer = PenaltyLbfgsOptimizer(**optimizer_args)
                else:
                    optimizer = LbfgsOptimizer(**optimizer_args)
            else:
                optimizer = optimizer(**optimizer_args)

            self._optimizer = optimizer
            self._subsample_factor = subsample_factor

            if mean_network is None:
                if std_share_network:
                    b = np.concatenate(
                        [
                            np.zeros(output_dim),
                            np.full(output_dim, np.log(init_std))
                        ],
                        axis=0)  # yapf: disable
                    b = tf.constant_initializer(b)
                    mean_network = ConvNetwork(
                        name=self._mean_network_name,
                        input_shape=input_shape,
                        output_dim=2 * output_dim,
                        conv_filters=conv_filters,
                        conv_filter_sizes=conv_filter_sizes,
                        conv_strides=conv_strides,
                        conv_pads=conv_pads,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity,
                        output_b_init=b)
                    l_mean = layers.SliceLayer(
                        mean_network.output_layer,
                        slice(output_dim),
                        name='mean_slice',
                    )
                else:
                    mean_network = ConvNetwork(
                        name=self._mean_network_name,
                        input_shape=input_shape,
                        output_dim=output_dim,
                        conv_filters=conv_filters,
                        conv_filter_sizes=conv_filter_sizes,
                        conv_strides=conv_strides,
                        conv_pads=conv_pads,
                        hidden_sizes=hidden_sizes,
                        hidden_nonlinearity=hidden_nonlinearity,
                        output_nonlinearity=output_nonlinearity)
                    l_mean = mean_network.output_layer

            if adaptive_std:
                l_log_std = ConvNetwork(
                    name=self._std_network_name,
                    input_shape=input_shape,
                    output_dim=output_dim,
                    conv_filters=std_conv_filters,
                    conv_filter_sizes=std_conv_filter_sizes,
                    conv_strides=std_conv_strides,
                    conv_pads=std_conv_pads,
                    hidden_sizes=std_hidden_sizes,
                    hidden_nonlinearity=std_hidden_nonlinearity,
                    output_nonlinearity=std_output_nonlinearity,
                    output_b_init=tf.constant_initializer(np.log(init_std)),
                ).output_layer
            elif std_share_network:
                l_log_std = layers.SliceLayer(
                    mean_network.output_layer,
                    slice(output_dim, 2 * output_dim),
                    name='log_std_slice',
                )
            else:
                l_log_std = layers.ParamLayer(
                    mean_network.input_layer,
                    num_units=output_dim,
                    param=tf.constant_initializer(np.log(init_std)),
                    trainable=learn_std,
                    name=self._std_network_name,
                )

            LayersPowered.__init__(self, [l_mean, l_log_std])

            xs_var = mean_network.input_layer.input_var
            ys_var = tf.compat.v1.placeholder(
                dtype=tf.float32, name='ys', shape=(None, output_dim))
            old_means_var = tf.compat.v1.placeholder(
                dtype=tf.float32, name='ys', shape=(None, output_dim))
            old_log_stds_var = tf.compat.v1.placeholder(
                dtype=tf.float32,
                name='old_log_stds',
                shape=(None, output_dim))

            x_mean_var = tf.Variable(
                np.zeros((1, np.prod(input_shape)), dtype=np.float32),
                name='x_mean',
            )
            x_std_var = tf.Variable(
                np.ones((1, np.prod(input_shape)), dtype=np.float32),
                name='x_std',
            )
            y_mean_var = tf.Variable(
                np.zeros((1, output_dim), dtype=np.float32),
                name='y_mean',
            )
            y_std_var = tf.Variable(
                np.ones((1, output_dim), dtype=np.float32),
                name='y_std',
            )

            normalized_xs_var = (xs_var - x_mean_var) / x_std_var
            normalized_ys_var = (ys_var - y_mean_var) / y_std_var

            with tf.name_scope(
                    self._mean_network_name, values=[normalized_xs_var]):
                normalized_means_var = layers.get_output(
                    l_mean, {mean_network.input_layer: normalized_xs_var})
            with tf.name_scope(
                    self._std_network_name, values=[normalized_xs_var]):
                normalized_log_stds_var = layers.get_output(
                    l_log_std, {mean_network.input_layer: normalized_xs_var})

            means_var = normalized_means_var * y_std_var + y_mean_var
            log_stds_var = normalized_log_stds_var + tf.math.log(y_std_var)

            normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
            normalized_old_log_stds_var = (
                old_log_stds_var - tf.math.log(y_std_var))

            dist = self._dist = DiagonalGaussian(output_dim)

            normalized_dist_info_vars = dict(
                mean=normalized_means_var, log_std=normalized_log_stds_var)

            mean_kl = tf.reduce_mean(
                dist.kl_sym(
                    dict(
                        mean=normalized_old_means_var,
                        log_std=normalized_old_log_stds_var),
                    normalized_dist_info_vars,
                ))

            loss = -tf.reduce_mean(
                dist.log_likelihood_sym(normalized_ys_var,
                                        normalized_dist_info_vars))

            self._f_predict = tensor_utils.compile_function([xs_var],
                                                            means_var)
            self._f_pdists = tensor_utils.compile_function(
                [xs_var], [means_var, log_stds_var])
            self._l_mean = l_mean
            self._l_log_std = l_log_std

            optimizer_args = dict(
                loss=loss,
                target=self,
                network_outputs=[
                    normalized_means_var, normalized_log_stds_var
                ],
            )

            if use_trust_region:
                optimizer_args['leq_constraint'] = (mean_kl, max_kl_step)
                optimizer_args['inputs'] = [
                    xs_var, ys_var, old_means_var, old_log_stds_var
                ]
            else:
                optimizer_args['inputs'] = [xs_var, ys_var]

            self._optimizer.update_opt(**optimizer_args)

            self._use_trust_region = use_trust_region
            self._name = name

            self._normalize_inputs = normalize_inputs
            self._normalize_outputs = normalize_outputs
            self._mean_network = mean_network
            self._x_mean_var = x_mean_var
            self._x_std_var = x_std_var
            self._y_mean_var = y_mean_var
            self._y_std_var = y_std_var