def _build_model(self):
    '''
        @brief: the network is defined here
    '''
    self._iteration = tf.Variable(0, trainable=False, name='step')

    # two initializers
    weight_init = normc_initializer(self._npr)
    bias_init = tf.constant_initializer(0)

    # if the input is not provided, make one
    if self._input is None:
        self._input = tf.placeholder(tf.float32, [None, self._input_size],
                                     name='ob_input')

    with tf.variable_scope(self._name_scope):
        self._layer = self._input
        self._layer_input_size = self._input_size

        # hidden layers with tanh activations
        for i_layer in range(len(self._network_shape)):
            self._layer = fully_connected(
                self._layer, self._layer_input_size,
                self._network_shape[i_layer], weight_init, bias_init,
                "policy_" + str(i_layer), trainable=self._trainable)
            self._layer = tf.nn.tanh(self._layer)
            self._layer_input_size = self._network_shape[i_layer]

        # the output layer (smaller weight init for the policy head)
        if not self._is_baseline:
            weight_init = normc_initializer(self._npr, 0.01)
        self._action_mu_output = fully_connected(
            self._layer, self._layer_input_size, self._output_size,
            weight_init, bias_init, "policy_output",
            trainable=self._trainable)

        if self._define_std:
            # log std of the Gaussian policy, initialized to zeros
            # size: [1, num_action]
            self._action_dist_logstd = tf.Variable(
                (0 * self._npr.randn(1, self._output_size)).astype(np.float32),
                name="policy_logstd", trainable=self._trainable)

            # broadcast to size: [batch, num_action]
            self._action_dist_logstd_param = tf.tile(
                self._action_dist_logstd,
                tf.stack((tf.shape(self._action_mu_output)[0], 1)))

    # get the variable list
    self._set_var_list()
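# All three builders in this section call two helpers that are not defined
# here: `fully_connected` and `normc_initializer`. The sketch below is only
# an assumption about what they do, inferred from the call sites above
# (input tensor, input size, output size, weight/bias initializers,
# variable-scope name, optional `trainable` flag); it is not the
# repository's actual implementation.
import numpy as np
import tensorflow as tf


def normc_initializer(npr, std=1.0):
    """Column-normalized Gaussian initializer (assumed), seeded by the numpy
    RandomState `npr` so graph construction is reproducible."""
    def _initializer(shape, dtype=None, partition_info=None):
        out = npr.randn(*[int(d) for d in shape]).astype(np.float32)
        out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
        return tf.constant(out)
    return _initializer


def fully_connected(input_layer, input_size, output_size,
                    weight_init, bias_init, scope, trainable=True):
    """One dense layer, input_layer @ W + b (assumed implementation)."""
    with tf.variable_scope(scope):
        w = tf.get_variable("w", [input_size, output_size],
                            initializer=weight_init, trainable=trainable)
        b = tf.get_variable("b", [output_size],
                            initializer=bias_init, trainable=trainable)
    return tf.matmul(input_layer, w) + b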
def _policy_nn(self):
    """ Neural net for policy approximation function

    Policy parameterized by Gaussian means and variances. NN outputs mean
    action based on observation. Trainable variables hold log-variances
    for each action dimension (i.e. variances not determined by NN).
    """
    if self.policy_size == 'small':
        print("using small structure")
        hid1_size = self.obs_dim
        hid3_size = self.act_dim
        hid2_size = int(np.sqrt(hid1_size * hid3_size))
    elif self.policy_size == 'large':
        print('Using large structure')
        hid1_size = self.obs_dim * self.hid1_mult
        hid3_size = self.act_dim * 10
        hid2_size = int(np.sqrt(hid1_size * hid3_size))
    else:
        raise NotImplementedError

    # heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
    self.lr = 9e-4 / np.sqrt(hid2_size)  # 9e-4 empirically determined
    weight_init = tf.random_uniform_initializer(-0.05, 0.05)
    bias_init = tf.constant_initializer(0)

    # 3 hidden layers with tanh activations
    with tf.variable_scope(self.scope):
        h1 = fully_connected(self.obs_ph, self.obs_dim, hid1_size,
                             weight_init, bias_init, "policy_h1")
        h1 = tf.nn.tanh(h1)
        h2 = fully_connected(h1, hid1_size, hid2_size,
                             weight_init, bias_init, "policy_h2")
        h2 = tf.nn.tanh(h2)
        h3 = fully_connected(h2, hid2_size, hid3_size,
                             weight_init, bias_init, "policy_h3")
        self.means = fully_connected(h3, hid3_size, self.act_dim,
                                     weight_init, bias_init, "policy_mean")

        # logvar_speed is used to 'fool' gradient descent into making faster
        # updates to log-variances. heuristic sets logvar_speed based on
        # network size.
        logvar_speed = (10 * hid3_size) // 48
        log_vars = tf.get_variable("policy_logvars",
                                   (logvar_speed, self.act_dim), tf.float32,
                                   tf.constant_initializer(0.0))
        self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar

    print('Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, logvar_speed: {}'
          .format(hid1_size, hid2_size, hid3_size, self.lr, logvar_speed))
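# Why the `logvar_speed` trick above makes the log-variances move faster:
# the loss only ever sees the column sum of the (logvar_speed, act_dim)
# matrix, so every row receives the same gradient, and one SGD step moves
# the sum `logvar_speed` times as far as it would move a single scalar
# parameter. A tiny numpy illustration of that arithmetic (the values of
# `k`, `lr`, and `grad_wrt_sum` are made up for the example, not taken from
# the code above):
import numpy as np

k, lr = 5, 0.1        # k plays the role of logvar_speed
grad_wrt_sum = 2.0    # d(loss)/d(summed log-variance)

# one scalar parameter: the effective log-variance moves by lr * grad
single = 0.0 - lr * grad_wrt_sum                                  # -0.2

# k stacked rows, each receiving the same gradient: their sum moves k times as far
stacked = (np.zeros(k) - lr * grad_wrt_sum * np.ones(k)).sum()    # -1.0

assert np.isclose(stacked, k * single)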
def _build_policy_nn(self):
    """ Build ops and computation graph for the policy network

    Policy parameterized by Gaussian means and variances. NN outputs mean
    action based on observation. Each sampler process has its own local
    policy network; the weights are assigned from the main process via a
    Queue.
    """
    self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')

    if self.policy_size == 'small':
        print("using small structure")
        hid1_size = self.obs_dim
        hid3_size = self.act_dim
        hid2_size = int(np.sqrt(hid1_size * hid3_size))
    elif self.policy_size == 'large':
        print('Using large structure')
        hid1_size = self.obs_dim * self.hid1_mult
        hid3_size = self.act_dim * 10
        hid2_size = int(np.sqrt(hid1_size * hid3_size))
    else:
        raise NotImplementedError

    weight_init = tf.random_uniform_initializer(-0.05, 0.05)
    bias_init = tf.constant_initializer(0)

    # 3 hidden layers with tanh activations
    with tf.variable_scope(self.scope):
        h1 = fully_connected(self.obs_ph, self.obs_dim, hid1_size,
                             weight_init, bias_init, "policy_h1")
        h1 = tf.nn.tanh(h1)
        h2 = fully_connected(h1, hid1_size, hid2_size,
                             weight_init, bias_init, "policy_h2")
        h2 = tf.nn.tanh(h2)
        h3 = fully_connected(h2, hid2_size, hid3_size,
                             weight_init, bias_init, "policy_h3")
        self.means = fully_connected(h3, hid3_size, self.act_dim,
                                     weight_init, bias_init, "policy_mean")

        # logvar_speed is used to 'fool' gradient descent into making faster
        # updates to log-variances. heuristic sets logvar_speed based on
        # network size.
        logvar_speed = (10 * hid3_size) // 48
        log_vars = tf.get_variable("policy_logvars",
                                   (logvar_speed, self.act_dim), tf.float32,
                                   tf.constant_initializer(0.0))
        self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar

    # note: format the concatenated message as a whole (the original applied
    # .format() only to the second string literal, leaving placeholders unfilled)
    print('Policy Params in scope {} -- h1: {}, h2: {}, h3: {}, '
          'logvar_speed: {}'.format(self.scope, hid1_size, hid2_size,
                                    hid3_size, logvar_speed))
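# How the Gaussian head built above is typically consumed: an action is
# sampled as mean + exp(log_var / 2) * standard-normal noise. The op below
# is a sketch under that assumption; the function name and its placement
# are illustrative and not taken from this section.
import tensorflow as tf


def sample_action_op(means, log_vars):
    """Draw one action per batch row from N(means, exp(log_vars))."""
    noise = tf.random_normal(shape=tf.shape(means))
    return means + tf.exp(log_vars / 2.0) * noise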