Example 1
    def _build_model(self):
        '''
            @brief: the network is defined here
        '''
        self._iteration = tf.Variable(0, trainable=False, name='step')

        # two initializers
        weight_init = normc_initializer(self._npr)
        bias_init = tf.constant_initializer(0)

        # if no input placeholder is provided, create one
        if self._input is None:
            self._input = tf.placeholder(tf.float32, [None, self._input_size],
                                         name='ob_input')

        with tf.variable_scope(self._name_scope):
            self._layer = self._input
            self._layer_input_size = self._input_size
            for i_layer in range(len(self._network_shape)):
                self._layer = \
                    fully_connected(self._layer,
                                    self._layer_input_size,
                                    self._network_shape[i_layer],
                                    weight_init,
                                    bias_init, "policy_" + str(i_layer),
                                    trainable=self._trainable)
                self._layer = tf.nn.tanh(self._layer)
                self._layer_input_size = self._network_shape[i_layer]

            # the output layer
            if not self._is_baseline:
                weight_init = normc_initializer(self._npr, 0.01)
            self._action_mu_output = fully_connected(self._layer,
                                                     self._layer_input_size,
                                                     self._output_size,
                                                     weight_init,
                                                     bias_init,
                                                     "policy_output",
                                                     trainable=self._trainable)

            if self._define_std:
                # size: [1, num_action]
                self._action_dist_logstd = tf.Variable(
                    (0 * self._npr.randn(
                        1, self._output_size)).astype(np.float32),
                    name="policy_logstd",
                    trainable=self._trainable
                )

                # size: [batch, num_action]
                self._action_dist_logstd_param = tf.tile(
                    self._action_dist_logstd,
                    tf.stack((tf.shape(self._action_mu_output)[0], 1))
                )

        # get the variable list
        self._set_var_list()
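
At rollout time the mean/log-std pair built above defines a diagonal Gaussian policy, so an action is typically drawn as mu + exp(logstd) * noise. A minimal NumPy sketch of that sampling step (the function name, toy shapes, and values are illustrative only and not part of the class above):

import numpy as np

def sample_diag_gaussian(mu, logstd, npr):
    # Draw one action per row from N(mu, diag(exp(logstd))^2).
    # mu corresponds to _action_mu_output, logstd to the tiled
    # _action_dist_logstd_param, and npr mirrors self._npr (a RandomState).
    return mu + np.exp(logstd) * npr.randn(*mu.shape)

# toy usage with made-up shapes and values
npr = np.random.RandomState(0)
mu = np.zeros((2, 3), dtype=np.float32)           # stand-in for the mean output
logstd = np.full((2, 3), -0.5, dtype=np.float32)  # stand-in for the tiled log-std
print(sample_diag_gaussian(mu, logstd, npr))      # shape (2, 3)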
Example 2
    def _policy_nn(self):
        """ Neural net for policy approximation function
            Policy parameterized by Gaussian means and variances. NN outputs mean
            action based on observation. Trainable variables hold log-variances
            for each action dimension (i.e. variances not determined by NN).
        """
        if self.policy_size == 'small':
            print("using small structure")
            hid1_size = self.obs_dim
            hid3_size = self.act_dim
            hid2_size = int(np.sqrt(hid1_size * hid3_size))

        elif self.policy_size == 'large':
            print('Using large structure')
            hid1_size = self.obs_dim * self.hid1_mult
            hid3_size = self.act_dim * 10
            hid2_size = int(np.sqrt(hid1_size * hid3_size))
        else:
            raise NotImplementedError

        # heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
        self.lr = 9e-4 / np.sqrt(hid2_size)  # 9e-4 empirically determined
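        # e.g. hid2_size = 64 gives lr = 9e-4 / 8 ~= 1.1e-4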
        weight_init = tf.random_uniform_initializer(-0.05, 0.05)
        bias_init = tf.constant_initializer(0)
        # 3 hidden layers with tanh activations
        with tf.variable_scope(self.scope):
            h1 = fully_connected(self.obs_ph, self.obs_dim, hid1_size,
                                 weight_init, bias_init, "policy_h1")
            h1 = tf.nn.tanh(h1)
            h2 = fully_connected(h1, hid1_size, hid2_size, weight_init,
                                 bias_init, "policy_h2")
            h2 = tf.nn.tanh(h2)
            h3 = fully_connected(h2, hid2_size, hid3_size, weight_init,
                                 bias_init, "policy_h3")
            h3 = tf.nn.tanh(h3)
            self.means = fully_connected(h3, hid3_size, self.act_dim,
                                         weight_init, bias_init, "policy_mean")
            # logvar_speed is used to 'fool' gradient descent into making faster updates
            # to log-variances. heuristic sets logvar_speed based on network size.
            logvar_speed = (10 * hid3_size) // 48
            log_vars = tf.get_variable("policy_logvars",
                                       (logvar_speed, self.act_dim),
                                       tf.float32,
                                       tf.constant_initializer(0.0))
            self.log_vars = tf.reduce_sum(log_vars,
                                          axis=0) + self.policy_logvar
        print(
            'Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, logvar_speed: {}'
            .format(hid1_size, hid2_size, hid3_size, self.lr, logvar_speed))
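
The logvar_speed trick works because each of the logvar_speed rows of policy_logvars receives the same gradient as their sum (the effective log-variance), so under plain SGD the sum moves logvar_speed times as far per step. A small NumPy sketch of that arithmetic, using a made-up toy gradient purely for illustration:

import numpy as np

logvar_speed, act_dim, lr = 4, 1, 0.1

def toy_grad(v):
    # hypothetical dLoss/d(effective log-variance); the exact form is irrelevant
    return 2.0 * v + 1.0

# single-variable parameterization: one SGD step moves v by -lr * g
v_single = np.zeros(act_dim)
v_single -= lr * toy_grad(v_single)

# multi-row parameterization: v = rows.sum(0); every row gets the same
# gradient dLoss/dv, so their sum moves logvar_speed times as far
rows = np.zeros((logvar_speed, act_dim))
rows -= lr * toy_grad(rows.sum(axis=0))
v_multi = rows.sum(axis=0)

print(v_single, v_multi)   # v_multi == logvar_speed * v_single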
Example 3
    def _build_policy_nn(self):
        """
        Build ops and computation graph for the policy network.
        Policy parameterized by Gaussian means and variances.
        NN outputs mean action based on observation.
        Each sampler process has its own local policy network;
        the weights are assigned from the main process via a Queue.
        """
        self.obs_ph = tf.placeholder(tf.float32, (None, self.obs_dim), 'obs')
        if self.policy_size == 'small':
            print("using small structure")
            hid1_size = self.obs_dim
            hid3_size = self.act_dim
            hid2_size = int(np.sqrt(hid1_size * hid3_size))

        elif self.policy_size == 'large':
            print('Using large structure')
            hid1_size = self.obs_dim * self.hid1_mult
            hid3_size = self.act_dim * 10
            hid2_size = int(np.sqrt(hid1_size * hid3_size))
        else:
            raise NotImplementedError
        weight_init = tf.random_uniform_initializer(-0.05, 0.05)
        bias_init = tf.constant_initializer(0)
        # 3 hidden layers with tanh activations
        with tf.variable_scope(self.scope):
            h1 = fully_connected(self.obs_ph, self.obs_dim, hid1_size,
                                 weight_init, bias_init, "policy_h1")
            h1 = tf.nn.tanh(h1)
            h2 = fully_connected(h1, hid1_size, hid2_size, weight_init,
                                 bias_init, "policy_h2")
            h2 = tf.nn.tanh(h2)
            h3 = fully_connected(h2, hid2_size, hid3_size, weight_init,
                                 bias_init, "policy_h3")
            h3 = tf.nn.tanh(h3)
            self.means = fully_connected(h3, hid3_size, self.act_dim,
                                         weight_init, bias_init, "policy_mean")
            # logvar_speed is used to 'fool' gradient descent into making faster updates
            # to log-variances. heuristic sets logvar_speed based on network size.
            logvar_speed = (10 * hid3_size) // 48
            log_vars = tf.get_variable("policy_logvars", (logvar_speed, self.act_dim),
                tf.float32, tf.constant_initializer(0.0))
            self.log_vars = tf.reduce_sum(log_vars, axis=0) + self.policy_logvar
        print('Policy Params in scope {} -- h1: {}, h2: {}, '
              'h3: {}, logvar_speed: {}'.format(
                  self.scope, hid1_size, hid2_size, hid3_size, logvar_speed))
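
The docstring notes that each sampler keeps a local copy of this network and receives its weights from the main process. One common way to wire that up in TF1 (a sketch of a generic pattern, not necessarily this repository's exact mechanism; build_weight_assign_ops is a hypothetical helper) is to build assign ops over the trainable variables in the policy scope:

import tensorflow as tf

def build_weight_assign_ops(scope):
    # collect the variables created above under `scope` (matching self.scope)
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    # one placeholder and one assign op per variable
    placeholders = [tf.placeholder(p.dtype.base_dtype, p.shape) for p in params]
    assign_ops = [tf.assign(p, ph) for p, ph in zip(params, placeholders)]
    return params, placeholders, assign_ops

# usage sketch: the main process sends a list of numpy arrays (e.g. through the
# Queue mentioned in the docstring) and the sampler writes them into its graph:
# session.run(assign_ops, feed_dict=dict(zip(placeholders, received_weights)))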