def add_baseline_op(self, scope="baseline"):
    """
    Build the baseline network within the scope.

    In this function we will build the baseline network. Use build_mlp
    with the same parameters as the policy network to get the baseline
    estimate. You also have to set up a target placeholder and an update
    operation so the baseline can be trained.

    Args:
        scope: the scope of the baseline network

    TODO: Set the following fields
        self.baseline
            HINT: use build_mlp, the network is the same as policy network
                  check self.config for n_layers and layer_size
            HINT: tf.squeeze might be helpful
        self.baseline_target_placeholder --> Not required anymore
        self.update_baseline_op
            HINT: first construct a loss using tf.losses.mean_squared_error.
            HINT: use AdamOptimizer with self.lr
    """
    ######################################################
    #########   YOUR CODE HERE - 4-8 lines.   ############
    self.baseline = build_mlp(self.observation_placeholder, 1, scope,
                              self.config.n_layers, self.config.layer_size,
                              self.config.activation)
    # Squeeze (batch, 1) -> (batch,) so predictions match the target shape.
    loss = tf.losses.mean_squared_error(self.baseline_target_placeholder,
                                        tf.squeeze(self.baseline))
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    self.update_baseline_op = optimizer.minimize(loss)
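# A minimal usage sketch for the op above (not part of the scaffold): it assumes
# the class holds a tf.Session in self.sess and that
# self.baseline_target_placeholder was created elsewhere, as the docstring note
# suggests. The method name update_baseline is illustrative.
def update_baseline(self, returns, observations):
    # One Adam step regressing baseline(obs) toward the empirical returns.
    self.sess.run(self.update_baseline_op, feed_dict={
        self.observation_placeholder: observations,
        self.baseline_target_placeholder: returns,
    })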
def build(self):
    value = build_mlp(self.observation, 1, self.config.n_layers,
                      self.config.layer_size, self.config.activation)
    # value = tf.squeeze(value)
    self.baseline = keras.Model(inputs=self.observation, outputs=value)
    self.loss = keras.losses.MeanSquaredError()
    self.optimizer = keras.optimizers.Adam(learning_rate=self.lr)
    self.baseline.compile(loss=self.loss, optimizer=self.optimizer)
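# A hedged usage sketch for the compiled Keras baseline above. It assumes TF2
# eager execution and that `observations` / `returns` are NumPy arrays of shape
# (batch, obs_dim) and (batch,); both method names are illustrative.
def update_baseline(self, returns, observations):
    # One regression pass: fit the value head toward the observed returns.
    self.baseline.fit(observations, returns, verbose=0)

def calculate_advantage(self, returns, observations):
    # Advantage estimate: empirical return minus the baseline prediction,
    # squeezing the (batch, 1) model output down to (batch,).
    values = tf.squeeze(self.baseline(observations), axis=-1).numpy()
    return returns - values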
def build(self):
    self.observation = keras.Input(dtype=tf.float32,
                                   shape=(self.observation_dim,))
    self.action = build_mlp(self.observation, self.action_dim,
                            self.config.n_layers, self.config.layer_size,
                            self.config.activation)
    self.action_logit = keras.Model(inputs=self.observation,
                                    outputs=self.action)

    if self.discrete:
        # Treat the MLP outputs as logits and sample one action index per
        # observation, dropping the extra sample axis.
        sampled_action = tf.squeeze(tf.random.categorical(self.action, 1))
    else:
        # Continuous case: sample from a Gaussian centered at the predicted
        # means via a custom sampling layer.
        self.normal_layer = Normal_action_sample()
        sampled_action = self.normal_layer(self.action)

    self.sample_action = keras.Model(inputs=self.observation,
                                     outputs=sampled_action,
                                     name='sample_action')
    self.sample_action.summary()

    if self.config.use_baseline:
        self.baseline_network = BaselineNetwork(self.config, self.observation)
        self.baseline_network.build()
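# The Normal_action_sample layer referenced above is not defined in this snippet.
# A plausible self-contained sketch, assuming the layer owns a trainable log-std
# vector and applies the re-parametrization trick a = mu + exp(log_std) * eps
# with eps ~ N(0, I):
import tensorflow as tf
from tensorflow import keras

class Normal_action_sample(keras.layers.Layer):
    def build(self, input_shape):
        # One log standard deviation per action dimension; training log_std
        # rather than std keeps the std positive without any constraint.
        self.log_std = self.add_weight(name="log_std",
                                       shape=(input_shape[-1],),
                                       initializer="zeros",
                                       trainable=True)

    def call(self, action_means):
        eps = tf.random.normal(tf.shape(action_means))
        return action_means + tf.exp(self.log_std) * eps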
def build_policy_network_op(self, scope="policy_network"):
    """
    Build the policy network, construct the tensorflow operation to sample
    actions from the policy network outputs, and compute the log probabilities
    of the actions taken (for computing the loss later). These operations are
    stored in self.sampled_action and self.logprob. Must handle both settings
    of self.discrete.

    Args:
        scope: the scope of the neural network

    TODO:
    Discrete case:
        action_logits: the logits for each action
            HINT: use build_mlp, check self.config for layer_size and n_layers
        self.sampled_action: sample from these logits
            HINT: use tf.multinomial + tf.squeeze
        self.logprob: compute the log probabilities of the taken actions
            HINT: 1. tf.nn.sparse_softmax_cross_entropy_with_logits computes
                     the *negative* log probabilities of labels, given logits.
                  2. taken actions are different than sampled actions!

    Continuous case:
        To build a policy in a continuous action space domain, we will have the
        model output the means of each action dimension, and then sample from
        a multivariate normal distribution with these means and trainable
        standard deviation.

        That is, the action a_t ~ N( mu(o_t), sigma )
        where mu(o_t) is the network that outputs the means for each action
        dimension, and sigma is a trainable variable for the standard
        deviations. N here is a multivariate gaussian distribution with the
        given parameters.

        action_means: the predicted means for each action dimension.
            HINT: use build_mlp, check self.config for layer_size and n_layers
        log_std: a trainable variable for the log standard deviations.
            HINT: think about why we use log std as the trainable variable
                  instead of std
            HINT: use tf.get_variable
            HINT: The shape of this should match the shape of action dimension
        self.sampled_action: sample from the gaussian distribution as
            described above
            HINT: use tf.random_normal
            HINT: use re-parametrization to obtain N(mu, sigma) from N(0, 1)
        self.logprob: the log probabilities of the taken actions
            HINT: use tf.contrib.distributions.MultivariateNormalDiag
    """
    #######################################################
    #########   YOUR CODE HERE - 8-12 lines.   ############
    self.scope = scope
    if self.discrete:
        # Logits stay linear (no output activation) so softmax probabilities
        # are well-defined.
        action_logits = build_mlp(self.observation_placeholder,
                                  self.action_dim, self.scope,
                                  self.config.n_layers,
                                  self.config.layer_size)
        # Sample one action index per observation and drop the sample axis.
        self.sampled_action = tf.squeeze(tf.multinomial(action_logits, 1), 1)
        # sparse_softmax_cross_entropy_with_logits returns the *negative* log
        # probability of the taken (not sampled!) action, so negate it.
        self.logprob = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.action_placeholder, logits=action_logits)
    else:
        action_means = build_mlp(self.observation_placeholder,
                                 self.action_dim, self.scope,
                                 self.config.n_layers,
                                 self.config.layer_size)
        log_std = tf.get_variable("log_std", shape=[self.action_dim],
                                  dtype=tf.float32)
        # Re-parametrization trick: a = mu + sigma * eps with eps ~ N(0, 1).
        self.sampled_action = action_means + tf.exp(log_std) * \
            tf.random_normal(tf.shape(action_means))
        mvn = tf.contrib.distributions.MultivariateNormalDiag(
            loc=action_means, scale_diag=tf.exp(log_std))
        self.logprob = mvn.log_prob(self.action_placeholder)
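# For context, a hedged sketch of how self.logprob is typically consumed when
# the policy-gradient loss is built; self.advantage_placeholder is assumed to
# be created elsewhere in the scaffold:
def add_loss_op(self):
    # REINFORCE objective: maximize E[logprob * advantage] by minimizing
    # its negation.
    self.loss = -tf.reduce_mean(self.logprob * self.advantage_placeholder)
    self.train_op = tf.train.AdamOptimizer(
        learning_rate=self.lr).minimize(self.loss)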