Example #1
0
 def proba_distribution_from_latent(self,
                                    pi_latent_vector,
                                    name_id=None,
                                    init_scale=1.0,
                                    init_bias=0.0):
     """
     Build the probability distribution from the policy latent vector, using a
     log-std head that depends on the latent vector and is clipped to a fixed range.

     :param pi_latent_vector: latent tensor fed to both ``linear`` heads
         (presumably of shape (batch, features) -- TODO confirm against caller)
     :param name_id: forwarded unchanged to ``linear`` for both heads
     :param init_scale: forwarded to ``linear`` for both heads
     :param init_bias: forwarded to ``linear`` for the mean head only
         (the log-std head is built with ``use_bias=False``)
     :return: the result of ``self.proba_distribution_from_flat`` applied to the
         concatenation ``[mean, clipped logstd]`` along axis 1
     """
     # Bounds applied to the log-std output before it is used.
     log_std_min, log_std_max = -10, 0.5

     # Arguments shared by both heads.
     common = dict(name_id=name_id, init_scale=init_scale)

     # The mean head is created first so the layer creation order is unchanged.
     mean = linear(pi_latent_vector,
                   'pi',
                   self.size,
                   init_bias=init_bias,
                   use_bias=True,
                   **common)

     # The log-std is computed from the latent vector (no bias term) instead of
     # being a free variable.
     raw_logstd = linear(pi_latent_vector,
                         'pi/logstd',
                         self.size,
                         use_bias=False,
                         **common)
     bounded_logstd = tf.clip_by_value(raw_logstd, log_std_min, log_std_max)

     flat_params = tf.concat([mean, bounded_logstd], axis=1)
     return self.proba_distribution_from_flat(flat_params)
Example #2
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 net_arch=None,
                 act_funs=tf.tanh,
                 cnn_extractor=nature_cnn,
                 feature_extraction="cnn",
                 layer_norm=False,
                 **kwargs):
        """
        Build the policy and value networks inside the "model" variable scope.

        :param sess: forwarded to the parent constructor
        :param ob_space: forwarded to the parent constructor
        :param ac_space: forwarded to the parent constructor; ``ac_space.shape[0]``
            is also used as the size of the policy output
        :param n_env: forwarded to the parent constructor
        :param n_steps: forwarded to the parent constructor
        :param n_batch: forwarded to the parent constructor
        :param reuse: forwarded to the parent constructor and to ``tf.variable_scope``
        :param net_arch: network specification handed to ``mlp_extractor``;
            defaults to ``[dict(vf=[64, 64], pi=[64, 64])]`` when None
        :param act_funs: activation function(s) handed to ``mlp_extractor``
        :param cnn_extractor: callable applied to ``self.processed_obs`` when
            ``feature_extraction == "cnn"``
        :param feature_extraction: "cnn" selects ``cnn_extractor``; any other value
            selects ``mlp_extractor`` on the flattened observations
        :param layer_norm: accepted but not used anywhere in this constructor
        :param kwargs: checked by ``self._kwargs_check`` and forwarded to ``cnn_extractor``
        """
        uses_cnn = feature_extraction == "cnn"
        super(FeedForwardPolicy, self).__init__(sess,
                                                ob_space,
                                                ac_space,
                                                n_env,
                                                n_steps,
                                                n_batch,
                                                reuse=reuse,
                                                scale=uses_cnn)

        self._kwargs_check(feature_extraction, kwargs)

        if net_arch is None:
            # The same list object is used for both heads, as in the default spec.
            default_layers = [64, 64]
            net_arch = [{'vf': default_layers, 'pi': default_layers}]

        with tf.variable_scope("model", reuse=reuse):
            if uses_cnn:
                # Policy and value share the same extracted features.
                shared_latent = cnn_extractor(self.processed_obs, **kwargs)
                pi_latent, vf_latent = shared_latent, shared_latent
            else:
                flat_obs = tf.layers.flatten(self.processed_obs)
                pi_latent, vf_latent = mlp_extractor(flat_obs, net_arch,
                                                     act_funs)

            self._value_fn = linear(vf_latent, 'vf', 1)

            n_actions = ac_space.shape[0]
            # The sigmoid keeps every component of the mean inside (0, 1).
            mean = tf.nn.sigmoid(linear(pi_latent, 'pi', n_actions))
            logstd = tf.get_variable(
                name='pi/logstd',
                shape=[1, n_actions],
                initializer=tf.random_normal_initializer())
            # ``mean * 0.0 + logstd`` presumably broadcasts the [1, n_actions]
            # variable to the shape of ``mean`` before concatenation -- TODO confirm.
            tiled_logstd = mean * 0.0 + logstd
            pdparam = tf.concat([mean, tiled_logstd], axis=1)

            self._policy = mean
            self._proba_distribution = self.pdtype.proba_distribution_from_flat(
                pdparam)

        self._setup_init()
 def proba_distribution_from_latent(self,
                                    pi_latent_vector,
                                    vf_latent_vector,
                                    init_scale=1.0,
                                    init_bias=0.0):
     """
     Build the probability distribution and the 'q' output from the two latent vectors.

     :param pi_latent_vector: latent tensor fed to the ``linear`` layer named 'pi'
     :param vf_latent_vector: latent tensor fed to the ``linear`` layer named 'q'
     :param init_scale: forwarded to ``linear`` for both layers
     :param init_bias: forwarded to ``linear`` for both layers
     :return: a 3-tuple ``(distribution, pdparam, q_values)`` where ``distribution``
         is ``self.proba_distribution_from_flat(pdparam)``
     """
     # Both heads have ``self.size`` outputs and share the same initialisation.
     init_kwargs = dict(init_scale=init_scale, init_bias=init_bias)

     flat_params = linear(pi_latent_vector, 'pi', self.size, **init_kwargs)
     q_out = linear(vf_latent_vector, 'q', self.size, **init_kwargs)

     distribution = self.proba_distribution_from_flat(flat_params)
     return distribution, flat_params, q_out
Example #4
0
def nature_cnn(scaled_images, **kwargs):
    """
    CNN from Nature paper.

    Three ReLU-activated convolutions ('c1', 'c2', 'c3') followed by
    ``conv_to_fc`` and a ReLU-activated 512-unit ``linear`` layer named 'fc1'.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keywords parameters for the convolutional layers of the CNN
    :return: (TensorFlow Tensor) The CNN output layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)

    # (scope name, number of filters, filter size, stride) for each conv layer,
    # listed in the order in which the layers are stacked.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    features = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        features = relu(
            conv(features,
                 scope,
                 n_filters=n_filters,
                 filter_size=filter_size,
                 stride=stride,
                 init_scale=gain,
                 **kwargs))

    flattened = conv_to_fc(features)
    return relu(linear(flattened, 'fc1', n_hidden=512, init_scale=gain))
 def proba_distribution_from_latent(self,
                                    pi_latent_vector,
                                    vf_latent_vector,
                                    init_scale=1.0,
                                    init_bias=0.0):
     """
     Build the probability distribution (mean plus a free log-std variable) and
     the 'q' output from the two latent vectors.

     :param pi_latent_vector: latent tensor fed to the ``linear`` layer named 'pi'
     :param vf_latent_vector: latent tensor fed to the ``linear`` layer named 'q'
     :param init_scale: forwarded to ``linear`` for both layers
     :param init_bias: forwarded to ``linear`` for both layers
     :return: a 3-tuple ``(distribution, mean, q_values)`` where ``distribution``
         is ``self.proba_distribution_from_flat`` applied to ``[mean, logstd]``
         concatenated along axis 1
     """
     init_kwargs = dict(init_scale=init_scale, init_bias=init_bias)

     mean = linear(pi_latent_vector, 'pi', self.size, **init_kwargs)

     # The log-std is a zero-initialised variable that does not depend on the input.
     logstd = tf.get_variable(name='pi/logstd',
                              shape=[1, self.size],
                              initializer=tf.zeros_initializer())
     # ``mean * 0.0 + logstd`` presumably broadcasts the [1, size] variable to the
     # shape of ``mean`` so the two can be concatenated -- TODO confirm.
     tiled_logstd = mean * 0.0 + logstd
     flat_params = tf.concat([mean, tiled_logstd], axis=1)

     q_out = linear(vf_latent_vector, 'q', self.size, **init_kwargs)

     distribution = self.proba_distribution_from_flat(flat_params)
     return distribution, mean, q_out
Example #6
0
 def proba_distribution_from_latent(self,
                                    pi_latent_vector,
                                    name_id=None,
                                    init_scale=1.0,
                                    init_bias=0.0):
     """
     Build the probability distribution from the policy latent vector through a
     single ``linear`` layer named 'pi' with ``self.size`` outputs.

     :param pi_latent_vector: latent tensor fed to the ``linear`` layer
     :param name_id: forwarded unchanged to ``linear``
     :param init_scale: forwarded to ``linear``
     :param init_bias: forwarded to ``linear``
     :return: the result of ``self.proba_distribution_from_flat`` on the layer output
     """
     layer_kwargs = dict(name_id=name_id,
                         init_scale=init_scale,
                         init_bias=init_bias,
                         use_bias=True)
     flat_params = linear(pi_latent_vector, 'pi', self.size, **layer_kwargs)
     return self.proba_distribution_from_flat(flat_params)
Example #7
0
def mlp_extractor(flat_observations, net_arch, act_funs):
    """
    Constructs an MLP that receives observations as an input and outputs a latent representation for the policy and
    a value network. The ``net_arch`` parameter allows to specify the amount and size of the hidden layers and how many
    of them are shared between the policy network and the value network. It is assumed to be a list with the following
    structure:

    1. An arbitrary length (zero allowed) number of integers each specifying the number of units in a shared layer.
       If the number of ints is zero, there will be no shared layers.
    2. An optional dict, to specify the following non-shared layers for the value network and the policy network.
       It is formatted like ``dict(vf=[<value layer sizes>], pi=[<policy layer sizes>])``.
       If it is missing any of the keys (pi or vf), no non-shared layers (empty list) is assumed.
       Entries following the dict are ignored.

    For example to construct a network with one shared layer of size 55 followed by two non-shared layers for the value
    network of size 255 and a single non-shared layer of size 128 for the policy network, the following ``net_arch``
    would be used: ``[55, dict(vf=[255, 255], pi=[128])]``. A simple shared network topology with two layers of size 128
    would be specified as [128, 128].

    The ``act_funs`` parameter is either a single callable, which is then used as the activation of every layer
    (shared, policy-only and value-only), or a list that mirrors ``net_arch`` entry by entry: one callable at the
    position of every shared layer and, at the position of the dict, a dict
    ``dict(vf=[<value activations>], pi=[<policy activations>])`` holding one callable per non-shared layer.

    :param flat_observations: (tf.Tensor) The observations to base policy and value function on.
    :param net_arch: ([int or dict]) The specification of the policy and value networks.
        See above for details on its formatting.
    :param act_funs: (tf function or list) The activation function (list) to use for the networks.
    :return: (tf.Tensor, tf.Tensor) latent_policy, latent_value of the specified network.
        If all layers are shared, then ``latent_policy == latent_value``
    :raises AssertionError: if ``net_arch`` contains something other than ints and dicts, if the non-shared
        layer specifications are not lists of ints, or if a list-form ``act_funs`` does not match ``net_arch``.
    """
    latent = flat_observations
    policy_only_layers = []  # Layer sizes of the network that only belongs to the policy network
    value_only_layers = []  # Layer sizes of the network that only belongs to the value network
    policy_act_funs = []  # One activation per policy-only layer
    value_act_funs = []  # One activation per value-only layer

    # A list means "one activation per layer"; anything else is a single shared activation.
    per_layer_act_funs = isinstance(act_funs, list)

    # ``net_arch`` may be empty, so only look at its first entry when there is one.
    if per_layer_act_funs and net_arch and isinstance(net_arch[0], int):
        assert len(act_funs) == len(
            net_arch
        ), "Error: the number of act_funs have to equal to the number of layers"

    # Iterate through the shared layers and build the shared parts of the network
    for idx, layer in enumerate(net_arch):
        if isinstance(layer, int):  # Check that this is a shared layer
            activation = act_funs[idx] if per_layer_act_funs else act_funs
            latent = activation(
                linear(latent,
                       "shared_fc{}".format(idx),
                       layer,
                       init_scale=np.sqrt(2)))
            continue

        assert isinstance(
            layer, dict
        ), "Error: the net_arch list can only contain ints and dicts"

        # A missing key means "no non-shared layers" for that network.
        policy_only_layers = layer.get('pi', [])
        value_only_layers = layer.get('vf', [])
        assert isinstance(
            policy_only_layers, list
        ), "Error: net_arch[-1]['pi'] must contain a list of integers."
        assert isinstance(
            value_only_layers, list
        ), "Error: net_arch[-1]['vf'] must contain a list of integers."

        if per_layer_act_funs:
            # The activation spec of the split lives at the same position as the
            # dict in ``net_arch`` (not necessarily at position 0).
            assert idx < len(act_funs) and isinstance(act_funs[idx], dict), \
                "Error: the form of parameter ``act_funs`` should be similar to the " \
                "parameter ``net_arch``"
            policy_act_funs = act_funs[idx].get('pi', [])
            value_act_funs = act_funs[idx].get('vf', [])
            assert len(policy_act_funs) == len(policy_only_layers), \
                "Error: the number of act_funs have to equal to " \
                "the number of pi layers."
            assert len(value_act_funs) == len(value_only_layers), \
                "Error: the number of act_funs have to equal to " \
                "the number of vf layers."
        else:
            # A single activation function is shared by every non-shared layer.
            policy_act_funs = [act_funs] * len(policy_only_layers)
            value_act_funs = [act_funs] * len(value_only_layers)
        break  # From here on the network splits up in policy and value network

    # Build the non-shared part of the network
    latent_policy = latent
    latent_value = latent
    for idx, (pi_layer_size, vf_layer_size) in enumerate(
            zip_longest(policy_only_layers, value_only_layers)):
        if pi_layer_size is not None:
            assert isinstance(
                pi_layer_size,
                int), "Error: net_arch[-1]['pi'] must only contain integers."
            latent_policy = policy_act_funs[idx](linear(latent_policy,
                                                        "pi_fc{}".format(idx),
                                                        pi_layer_size,
                                                        init_scale=np.sqrt(2)))
        if vf_layer_size is not None:
            assert isinstance(
                vf_layer_size,
                int), "Error: net_arch[-1]['vf'] must only contain integers."
            latent_value = value_act_funs[idx](linear(latent_value,
                                                      "vf_fc{}".format(idx),
                                                      vf_layer_size,
                                                      init_scale=np.sqrt(2)))

    return latent_policy, latent_value