Example #1
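    # (non-root MPI ranks: allocate empty buffers that Bcast then fills in from rank 0)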
    else:
        mean = np.empty(shape=ob.shape, dtype=np.float32)
        std = np.empty(shape=(), dtype=np.float32)
    MPI.COMM_WORLD.Bcast(mean, root=0)
    MPI.COMM_WORLD.Bcast(std, root=0)
    return mean, std


def layernorm(x):
    m, v = tf.nn.moments(x, -1, keep_dims=True)
    return (x - m) / (tf.sqrt(v) + 1e-8)


getsess = tf.get_default_session

fc = partial(tf.layers.dense, kernel_initializer=normc_initializer(1.))
activ = tf.nn.relu


def flatten_two_dims(x):
    return tf.reshape(x, [-1] + x.get_shape().as_list()[2:])


def unflatten_first_dim(x, sh):
    return tf.reshape(x, [sh[0], sh[1]] + x.get_shape().as_list()[1:])


def add_pos_bias(x):
    with tf.variable_scope(name_or_scope=None, default_name="pos_bias"):
        b = tf.get_variable(name="pos_bias", shape=[1] + x.get_shape().as_list()[1:], dtype=tf.float32,
                            initializer=tf.zeros_initializer())
        return x + b
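A minimal round-trip sketch of the two shape helpers above (TF 1.x assumed; the [batch, time, feat] sizes are illustrative):

import tensorflow as tf  # TF 1.x

x = tf.zeros([4, 8, 16])                   # [batch, time, feat]
sh = tf.shape(x)                           # dynamic shape, kept to restore later
flat = flatten_two_dims(x)                 # merges batch and time: shape [32, 16]
restored = unflatten_first_dim(flat, sh)   # splits them again: shape [4, 8, 16]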
Example #2
    def __init__(self, ob_dim, ac_dim):
        """
        Create a Gaussian MLP policy.

        :param ob_dim: (int) observation dimension
        :param ac_dim: (int) action dimension
        """
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2],
                               name="ob")  # batch of observations
        oldac_na = tf.placeholder(
            tf.float32, shape=[None, ac_dim],
            name="ac")  # batch of previous actions
        # batch of previous action distributions
        oldac_dist = tf.placeholder(tf.float32,
                                    shape=[None, ac_dim * 2],
                                    name="oldac_dist")
        adv_n = tf.placeholder(tf.float32, shape=[None],
                               name="adv")  # advantage function estimate
        wd_dict = {}
        layer_1 = tf.nn.tanh(
            dense(ob_no,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0.0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.tanh(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0.0,
                  weight_loss_dict=wd_dict))
        mean_na = dense(layer_2,
                        ac_dim,
                        "mean",
                        weight_init=tf_util.normc_initializer(0.1),
                        bias_init=0.0,
                        weight_loss_dict=wd_dict)  # Mean control output
        self.wd_dict = wd_dict
        # Variance on outputs
        self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim],
                                                     tf.float32,
                                                     tf.zeros_initializer())
        logstd_1a = tf.expand_dims(logstd_1a, 0)
        std_1a = tf.exp(logstd_1a)
        std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
        ac_dist = tf.concat([
            tf.reshape(mean_na, [-1, ac_dim]),
            tf.reshape(std_na, [-1, ac_dim])
        ], 1)
        # This is the sampled action we'll perform.
        sampled_ac_na = tf.random_normal(tf.shape(
            ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]
        logprobsampled_n = -tf.reduce_sum(tf.log(
            ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log(
                2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
                    tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) /
                    (tf.square(ac_dist[:, ac_dim:])),
                    axis=1)  # Logprob of sampled action
        logprob_n = -tf.reduce_sum(
            tf.log(ac_dist[:, ac_dim:]), axis=1
        ) - 0.5 * tf.log(2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum(
            tf.square(ac_dist[:, :ac_dim] - oldac_na) /
            (tf.square(ac_dist[:, ac_dim:])),
            axis=1
        )  # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
        kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
        # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n))
        # Approximation of KL divergence between old policy used to generate actions,
        # and new policy used to compute logprob_n
        surr = -tf.reduce_mean(
            adv_n * logprob_n
        )  # Loss function that we'll differentiate to get the policy gradient
        surr_sampled = -tf.reduce_mean(logprob_n)  # Sampled loss of the policy
        # Generate a new action and its logprob
        self._act = tf_util.function(
            [ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])
        # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl)
        #  Compute (approximate) KL divergence between old policy and new policy
        self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss)
        # Input and output variables needed for computing loss
        self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
        tf_util.initialize()  # Initialize uninitialized TF variables
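As a sanity check on the diagonal-Gaussian log-probability expression used for logprob_n and logprobsampled_n above, here is a small NumPy/SciPy sketch (the numbers are illustrative, not taken from the policy):

import numpy as np
from scipy.stats import norm

mean = np.array([0.5, -1.0])     # per-dimension means
std = np.array([1.5, 0.7])       # per-dimension standard deviations
action = np.array([0.0, -0.8])   # an action to score

logprob = (-np.sum(np.log(std))
           - 0.5 * np.log(2.0 * np.pi) * len(mean)
           - 0.5 * np.sum(np.square(action - mean) / np.square(std)))

# Matches the sum of independent per-dimension Normal log-densities.
assert np.isclose(logprob, np.sum(norm.logpdf(action, loc=mean, scale=std)))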
Example #3

    def __init__(self, ob_dim, ac_dim, verbose=1):
        """
        Create an MLP policy for a value function.

        :param ob_dim: (int) observation dimension
        :param ac_dim: (int) action dimension
        :param verbose: (int) verbosity level
        """
        obs_ph = tf.placeholder(tf.float32,
                                shape=[None, ob_dim * 2 + ac_dim * 2 + 2
                                       ])  # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        layer_1 = tf.nn.elu(
            dense(obs_ph,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.elu(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        vpred_n = dense(layer_2,
                        1,
                        "hfinal",
                        weight_init=tf_util.normc_initializer(1.0),
                        bias_init=0,
                        weight_loss_dict=wd_dict)[:, 0]
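        # K-FAC's Fisher estimate needs a "sampled" loss: below, targets are
        # drawn from the model's own predictive distribution (a unit-variance
        # Gaussian around vpred_n) and regressed against, with the gradient
        # stopped through the sampled targets.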
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = tf.reduce_mean(
            tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

        self._predict = tf_util.function([obs_ph], vpred_n)

        optim = kfac.KfacOptimizer(learning_rate=0.001,
                                   cold_lr=0.001 * (1 - 0.9),
                                   momentum=0.9,
                                   clip_kl=0.3,
                                   epsilon=0.1,
                                   stats_decay=0.95,
                                   async_eigen_decomp=True,  # `async` is a reserved word in Python 3.7+
                                   kfac_update=2,
                                   cold_iter=50,
                                   weight_decay_dict=wd_dict,
                                   max_grad_norm=None,
                                   verbose=verbose)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss,
                                                  loss_sampled,
                                                  var_list=vf_var_list)
        self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
        tf_util.initialize()  # Initialize uninitialized TF variables
Example #4
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):

        # observation_space and action_space
        # ob_space = env.observation_space
        # ac_space = env.action_space
        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)
        # returns:
        # obs :: observation placeholder
        # pdtype :: object holding the distribution info of the action_space

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # DeepMind-style observation normalization
        obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)

        # ===========================[Value function prediction Model]====================================================
        # dense() returns tf.nn.bias_add(tf.matmul(input_tensor, weight), bias).
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=tf_util.normc_initializer(1.0))[:, 0]
        # Since dense(last_out, 1) returns shape [None, 1], [:, 0] is applied to get shape [None].

        # ===========================[Policy function Model]==============================================================
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))

        # When the action space of the selected gym environment is a Box,
        # the mean and std of the action distribution are concatenated
        # and used as the action output, as follows.
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         tf_util.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            tf_util.normc_initializer(0.01))

        # The action output of this model is stored in proba_distribution below.
        self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
        self.state_in = []
        self.state_out = []

        # Since this class inherits from the ppo1/mlp_policy class,
        # we override the act function defined there:
        # once _act is defined,
        # act() can be used to get the action / value.
        self.stochastic_ph = tf.placeholder(dtype=tf.bool,
                                            shape=(),
                                            name="stochastic")
        action = tf_util.switch(self.stochastic_ph,
                                self.proba_distribution.sample(),
                                self.proba_distribution.mode())
        self.action = action
        self._act = tf_util.function([self.stochastic_ph, obs],
                                     [action, self.vpred])
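A small NumPy sketch (shapes are illustrative) of the `mean * 0.0 + logstd` trick used in the Box branch above: multiplying the mean by zero and adding the [1, ac_dim] log-std broadcasts the state-independent log-std across the batch, so each row of pdparam holds [mean | logstd].

import numpy as np

batch, ac_dim = 3, 2                    # illustrative sizes
mean = np.random.randn(batch, ac_dim)   # state-dependent means, shape [3, 2]
logstd = np.zeros((1, ac_dim))          # shared log-std, shape [1, 2]

pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
print(pdparam.shape)                    # (3, 4): mean and log-std per row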
Example #5

    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):

        assert isinstance(ob_space, gym.spaces.Box)

        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)

        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # normalization
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                kernel_initializer=U.normc_initializer(1.0),
                                name="vffc%i" % (i + 1)))

        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        # input to policy
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                tf.layers.dense(last_out,
                                hid_size,
                                name="polfc%i" % (i + 1),
                                kernel_initializer=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd[option[0]]], axis=1)
        else:
            pdparam = tf.layers.dense(
                last_out,
                pdtype.param_shape()[0],
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01))

        # select option
        self.op_pi = tf.nn.softmax(
            tf.layers.dense(tf.stop_gradient(last_out),
                            num_options,
                            name="OPfc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
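The termination_sample expression above is a Bernoulli draw: the sigmoid output is compared against a uniform sample, so the option terminates with probability tpred. A NumPy sketch of the same idea (values are illustrative):

import numpy as np

rng = np.random.default_rng(0)
tpred = 0.3                                   # termination probability from the sigmoid head
terminate = tpred > rng.uniform(size=100_000)
print(terminate.mean())                       # ~0.3: terminates about 30% of the time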
Example #6
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):

        obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std,
                               -5.0, 5.0)
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "vffc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))
        self.vpred = dense(last_out,
                           1,
                           "vffinal",
                           weight_init=tf_util.normc_initializer(1.0))[:, 0]

        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                dense(last_out,
                      hid_size,
                      "polfc%i" % (i + 1),
                      weight_init=tf_util.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense(last_out,
                         pdtype.param_shape()[0] // 2, "polfinal",
                         tf_util.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = dense(last_out,
                            pdtype.param_shape()[0], "polfinal",
                            tf_util.normc_initializer(0.01))

        self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam)
        self.state_in = []
        self.state_out = []

        # change for BC
        self.stochastic_ph = tf.placeholder(dtype=tf.bool,
                                            shape=(),
                                            name="stochastic")
        action = tf_util.switch(self.stochastic_ph,
                                self.proba_distribution.sample(),
                                self.proba_distribution.mode())
        self.action = action
        self._act = tf_util.function([self.stochastic_ph, obs],
                                     [action, self.vpred])
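Assuming the surrounding class is a stable-baselines ppo1-style MlpPolicy whose act() wraps self._act, usage looks roughly like this (policy and env are placeholder names, not defined in the example):

ob = env.reset()
stochastic = True
# _act expects a batch of observations, hence ob[None]; it returns the
# sampled (or mode) action and the value prediction for that observation.
action, vpred = policy._act(stochastic, ob[None])
next_ob, reward, done, info = env.step(action[0])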