Example #1
    def add_baseline_op(self, scope="baseline"):
        """
        Build the baseline network within the scope.

        In this function we will build the baseline network.
        Use build_mlp with the same parameters as the policy network to
        get the baseline estimate. You also have to setup a target
        placeholder and an update operation so the baseline can be trained.

        Args:
            scope: the scope of the baseline network

        TODO: Set the following fields
            self.baseline
                HINT: use build_mlp, the network is the same as policy network
                check self.config for n_layers and layer_size
                HINT: tf.squeeze might be helpful
            self.baseline_target_placeholder --> Not required anymore
            self.update_baseline_op
                HINT: first construct a loss using tf.losses.mean_squared_error.
                HINT: use AdamOptimizer with self.lr

        """
        ######################################################
        #########   YOUR CODE HERE - 4-8 lines.   ############

        # Baseline value estimate V(s): same MLP architecture as the policy
        # network, but with a single scalar output per observation.
        self.baseline = build_mlp(self.observation_placeholder, 1, scope,
                                  self.config.n_layers, self.config.layer_size,
                                  self.config.activation)
        # Regress the (squeezed) baseline prediction onto the target returns;
        # self.baseline_target_placeholder is assumed to be defined elsewhere
        # in the class (see the TODO note above).
        loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,
                                            tf.squeeze(self.baseline))
        optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.update_baseline_op = optimizer.minimize(loss)
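
Every example on this page calls a build_mlp helper that is not shown in the listing. The code below is only a minimal sketch of what such a helper could look like, assuming the TF1 call signature used in Examples #1 and #4 (the Keras examples further down call a variant without the scope argument):

import tensorflow as tf

def build_mlp(mlp_input, output_size, scope, n_layers, size, output_activation=None):
    """Hypothetical helper: n_layers ReLU hidden layers of `size` units,
    followed by a layer of `output_size` units with `output_activation`."""
    with tf.variable_scope(scope):
        out = mlp_input
        for _ in range(n_layers):
            out = tf.layers.dense(out, size, activation=tf.nn.relu)
        out = tf.layers.dense(out, output_size, activation=output_activation)
    return out
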
Example #2
    def build(self):
        # Single-output MLP producing the state-value estimate for each
        # observation (the squeeze is left commented out, so the model's
        # output shape is [batch, 1]).
        value = build_mlp(self.observation, 1, self.config.n_layers,
                          self.config.layer_size, self.config.activation)
        # value = tf.squeeze(value)
        self.baseline = keras.Model(inputs=self.observation, outputs=value)
        # Compile with an MSE loss so the baseline can be regressed onto returns.
        self.loss = keras.losses.MeanSquaredError()
        self.optimizer = keras.optimizers.Adam(learning_rate=self.lr)
        self.baseline.compile(loss=self.loss, optimizer=self.optimizer)
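
A hedged usage sketch for the compiled baseline above; the instance name baseline_network, the batch of observations, the Monte-Carlo returns, and the observation dimension of 4 are all hypothetical, for illustration only:

import numpy as np

# Hypothetical rollout data (shapes assumed: [batch, observation_dim] and [batch]).
observations = np.random.randn(32, 4).astype(np.float32)
returns = np.random.randn(32).astype(np.float32)

# One gradient step of the compiled MSE loss on this batch; reshape the
# targets to match the model's [batch, 1] output.
loss_value = baseline_network.baseline.train_on_batch(observations, returns[:, None])

# Predictions keep the trailing dimension of 1 (tf.squeeze is commented out
# in build()), so drop it before computing advantages.
advantages = returns - baseline_network.baseline.predict(observations).squeeze(-1)
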
Example #3
    def build(self):
        # Symbolic input for a single observation vector.
        self.observation = keras.Input(
            dtype=tf.float32,
            shape=(self.observation_dim,),
        )
        # MLP head: action logits in the discrete case, action means otherwise.
        self.action = build_mlp(self.observation, self.action_dim, self.config.n_layers,
                                self.config.layer_size, self.config.activation)
        self.action_logit = keras.Model(inputs=self.observation, outputs=self.action)

        if self.discrete:
            # Sample one action from the categorical distribution over the logits.
            sampled_action = tf.squeeze(tf.random.categorical(self.action, 1))
        else:
            # Custom layer (defined elsewhere in the codebase) that samples a
            # continuous action around the predicted means.
            self.normal_layer = Normal_action_sample()
            sampled_action = self.normal_layer(self.action)

        self.sample_action = keras.Model(inputs=self.observation, outputs=sampled_action,
                                         name='sample_action')
        self.sample_action.summary()
        if self.config.use_baseline:
            self.baseline_network = BaselineNetwork(self.config, self.observation)
            self.baseline_network.build()
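
A hedged sketch of how the sample_action model above might be used during a rollout; policy (a built instance of this class) and env (a Gym-style environment) are hypothetical names:

import numpy as np

obs = env.reset()
done = False
while not done:
    # Add a batch dimension, sample an action, then strip the batch dimension again.
    action = policy.sample_action.predict(obs[None].astype(np.float32))
    obs, reward, done, info = env.step(np.squeeze(action))
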
Example #4
    def build_policy_network_op(self, scope="policy_network"):
        """
        Build the policy network, construct the tensorflow operation to sample
        actions from the policy network outputs, and compute the log probabilities
        of the actions taken (for computing the loss later). These operations are
        stored in self.sampled_action and self.logprob. Must handle both settings
        of self.discrete.

        Args:
                scope: the scope of the neural network

        TODO:
        Discrete case:
            action_logits: the logits for each action
                HINT: use build_mlp, check self.config for layer_size and
                n_layers
            self.sampled_action: sample from these logits
                HINT: use tf.multinomial + tf.squeeze
            self.logprob: compute the log probabilities of the taken actions
                HINT: 1. tf.nn.sparse_softmax_cross_entropy_with_logits computes
                         the *negative* log probabilities of labels, given logits.
                      2. taken actions are different than sampled actions!

        Continuous case:
            To build a policy in a continuous action space domain, we will have the
            model output the means of each action dimension, and then sample from
            a multivariate normal distribution with these means and trainable standard
            deviation.

            That is, the action a_t ~ N( mu(o_t), sigma)
            where mu(o_t) is the network that outputs the means for each action
            dimension, and sigma is a trainable variable for the standard deviations.
            N here is a multivariate gaussian distribution with the given parameters.

            action_means: the predicted means for each action dimension.
                HINT: use build_mlp, check self.config for layer_size and
                n_layers
            log_std: a trainable variable for the log standard deviations.
                HINT: think about why we use log std as the trainable variable instead of std
                HINT: use tf.get_variable
                HINT: The shape of this should match the shape of action dimension
            self.sampled_action: sample from the gaussian distribution as described above
                HINT: use tf.random_normal
                HINT: use re-parametrization to obtain N(mu, sigma) from N(0, 1)
            self.logprob: the log probabilities of the taken actions
                HINT: use tf.contrib.distributions.MultivariateNormalDiag

        """
        #######################################################
        #########   YOUR CODE HERE - 8-12 lines.   ############
        self.scope = scope
        if self.discrete:
            action_logits = build_mlp(self.observation_placeholder,
                                      self.action_dim, self.scope,
                                      self.config.n_layers,
                                      self.config.layer_size,
                                      output_activation=self.config.activation)
            # Sample one action per observation from the categorical
            # distribution defined by the logits, then drop the sample axis.
            self.sampled_action = tf.squeeze(tf.multinomial(action_logits, 1), 1)

            # sparse_softmax_cross_entropy_with_logits returns the *negative*
            # log-probability of the taken (not sampled!) actions, hence the sign flip.
            self.logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.action_placeholder,
                logits=action_logits)
        else:
            action_means = build_mlp(self.observation_placeholder,
                                     self.action_dim, self.scope,
                                     self.config.n_layers,
                                     self.config.layer_size)
            # Trainable log standard deviation, one entry per action dimension
            # (log-parametrized so the std stays positive).
            log_std = tf.get_variable("log_std", shape=[self.action_dim])
            # Re-parametrization trick: a = mu + sigma * eps with eps ~ N(0, I).
            self.sampled_action = (action_means + tf.exp(log_std) *
                                   tf.random_normal(tf.shape(action_means)))
            # Log-probability of the taken actions under N(mu, diag(sigma^2)).
            self.logprob = tf.contrib.distributions.MultivariateNormalDiag(
                loc=action_means,
                scale_diag=tf.exp(log_std)).log_prob(self.action_placeholder)
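
The docstring above notes that the log probabilities are kept "for computing the loss later"; that step is not part of this example. The sketch below is only an assumption of what such a companion step could look like, with add_loss_op, the advantage placeholder, and lr being illustrative names rather than the original codebase's API:

import tensorflow as tf

def add_loss_op(logprob, lr):
    """Hypothetical companion step: build the policy-gradient surrogate loss
    from the log-probabilities computed in build_policy_network_op."""
    advantage_placeholder = tf.placeholder(tf.float32, shape=[None], name="advantage")
    # Maximizing expected return corresponds to minimizing -E[log pi(a|s) * A(s, a)].
    loss = -tf.reduce_mean(logprob * advantage_placeholder)
    train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)
    return advantage_placeholder, loss, train_op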