Example #1
    def get_loss(self):
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2, -1)
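Every example on this page leans on two small reshape helpers, flatten_two_dims and unflatten_first_dim, which merge and later restore the leading (batch, time) axes. They are not shown here; a minimal sketch consistent with how they are called (an assumption about the utility module, not verbatim source) is:

    def flatten_two_dims(x):
        # Merge the first two axes: (B, T, ...) -> (B*T, ...).
        return tf.reshape(x, [-1] + x.get_shape().as_list()[2:])

    def unflatten_first_dim(x, sh):
        # Inverse of flatten_two_dims: `sh` is the dynamic shape captured with
        # tf.shape(...) before flattening; trailing dims come from x's static shape.
        return tf.reshape(x, [sh[0], sh[1]] + x.get_shape().as_list()[1:])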
Example #2
    def predict_next(self, reuse):
        if isinstance(self.ac_space, gym.spaces.Discrete):
            ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
        else:
            ac = self.ac
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope, reuse=reuse):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return x
Example #3
    def get_loss(self):
        nl = tf.nn.leaky_relu
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat([
                    x, ac_four_dim + tf.zeros([
                        sh[0], sh[1], sh[2],
                        ac_four_dim.get_shape()[3].value
                    ], tf.float32)
                ],
                                 axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
            x = unflatten_first_dim(x, sh)
        self.prediction_pixels = x * self.ob_std + self.ob_mean
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2,
                              [2, 3, 4])
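In the 4-D branch of add_ac above, ac_four_dim has shape (batch, 1, 1, n_actions) and is broadcast across the spatial grid by adding a zero tensor of shape (batch, H, W, n_actions) before concatenation along the channel axis. A small stand-alone sketch of the same trick (names and shapes are illustrative only, not from the source):

    import tensorflow as tf

    B, H, W, C, A = 2, 8, 8, 16, 4
    x = tf.zeros([B, H, W, C])                  # a feature map
    ac = tf.one_hot([0, 3], A)                  # (B, A) one-hot actions
    ac4 = ac[:, None, None, :]                  # (B, 1, 1, A)
    sh = tf.shape(x)
    # Adding zeros of the spatial shape tiles the action over every position.
    ac_tiled = ac4 + tf.zeros([sh[0], sh[1], sh[2], A], tf.float32)
    x_cond = tf.concat([x, ac_tiled], axis=-1)  # (B, H, W, C + A)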
Example #4
    def predict_next(self, reuse):
        nl = tf.nn.leaky_relu
        if isinstance(self.ac_space, gym.spaces.Discrete):
            ac = tf.one_hot(self.ac, get_action_n(self.ac_space), axis=2)
        else:
            ac = self.ac
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat([
                    x, ac_four_dim + tf.zeros([
                        sh[0], sh[1], sh[2],
                        ac_four_dim.get_shape()[3].value
                    ], tf.float32)
                ],
                                 axis=-1)

        with tf.variable_scope(self.scope, reuse=reuse):
            x = flatten_two_dims(self.features)
            x = unet(x, nl=nl, feat_dim=self.feat_dim, cond=add_ac)
            x = unflatten_first_dim(x, sh)
        self.prediction_pixels = x * self.ob_std + self.ob_mean
        # return tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, [2, 3, 4])
        return x
Example #5
    def get_loss(self, ac):
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu,
                                reuse=tf.AUTO_REUSE)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu,
                                      reuse=tf.AUTO_REUSE)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None,
                                      reuse=tf.AUTO_REUSE)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x),
                                n_out_features,
                                activation=None,
                                reuse=tf.AUTO_REUSE)
            x = unflatten_first_dim(x, sh)
            return x
Example #6
    def get_loss(self):
        nl = tf.nn.leaky_relu
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)
        ac_four_dim = tf.expand_dims(tf.expand_dims(ac, 1), 1)

        def add_ac(x):
            if x.get_shape().ndims == 2:
                return tf.concat([x, ac], axis=-1)
            elif x.get_shape().ndims == 4:
                sh = tf.shape(x)
                return tf.concat(
                    [
                        x,
                        ac_four_dim + tf.zeros(
                            [
                                sh[0], sh[1], sh[2],
                                ac_four_dim.get_shape()[3].value
                            ],
                            tf.float32,
                        ),
                    ],
                    axis=-1,
                )

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            mu, log_sigma_squared = unet(x,
                                         nl=nl,
                                         feat_dim=self.feat_dim,
                                         cond=add_ac)
            mu = unflatten_first_dim(mu, sh)
            log_sigma_squared = unflatten_first_dim(log_sigma_squared, sh)
        prediction_pixels = mu * self.ob_std + self.ob_mean
        if self.ama == "true":
            mse = tf.square(mu - 2 * tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean((mse - tf.exp(log_sigma_squared)),
                                             axis=[2, 3, 4])
            if self.clip_ama == "true":
                dynamics_reward = tf.clip_by_value(dynamics_reward, 0, 1e6)
            loss = tf.reduce_mean(
                (tf.exp(-log_sigma_squared) *
                 (mse) + self.uncertainty_penalty * log_sigma_squared),
                axis=[2, 3, 4],
            )
        elif self.ama == "false":
            mse = tf.square(mu - tf.stop_gradient(self.out_features))
            dynamics_reward = tf.reduce_mean(mse, axis=[2, 3, 4])
            loss = dynamics_reward
        else:
            raise ValueError("Please specify whether to use AMA or not")
        return (
            loss,
            dynamics_reward,
            prediction_pixels,
            log_sigma_squared,
        )
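Written out, the loss and intrinsic reward computed in the AMA branch above (with sg(.) = stop_gradient, f' = out_features, beta = uncertainty_penalty, and the mean taken over the pixel axes [2, 3, 4]) are, per the code:

    \mathcal{L} = \mathbb{E}_{h,w,c}\!\left[ e^{-\log\sigma^2}\,\big(\mu - 2\,\mathrm{sg}(f')\big)^2 + \beta \log\sigma^2 \right],
    \qquad
    r_{\mathrm{dyn}} = \mathbb{E}_{h,w,c}\!\left[ \big(\mu - 2\,\mathrm{sg}(f')\big)^2 - \sigma^2 \right],

with r_dyn optionally clipped to [0, 1e6] when clip_ama is enabled.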
Example #7
 def prior_regularization(self, prior, sigma_mu=1e4, sigma_sigma=1e-4):
     """
         对 prior network 输出的分布进行约束. 在没有该约束的情况下, 模型一般也不会发散.
         该正则项对原损失函数的影响很小, 几乎不影响学习的过程, 推荐使用. 对应于论文 4.3.2 内容
     """
     mu = flatten_two_dims(prior.mean())  # (None, 128)
     sigma = flatten_two_dims(prior.stddev())  # (None, 128)
     mu_regularise = -tf.reduce_sum(mu**2, axis=-1) / (2 * (sigma_mu**2))
     sigma_regularise = tf.reduce_sum(tf.math.log(sigma) - sigma,
                                      axis=-1) * sigma_sigma
     reg = mu_regularise + sigma_regularise  # shape=(None,)
     return tf.reshape(reg, (self.sh[0], self.sh[1]))  # shape=(None,None)
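In symbols, the regularizer returned above (per the code, with defaults sigma_mu = 1e4 and sigma_sigma = 1e-4) is

    \mathcal{R} = -\frac{\lVert \mu \rVert_2^2}{2\,\sigma_\mu^2} + \sigma_\sigma \sum_i \big( \log \sigma_i - \sigma_i \big),

computed per (batch, time) element and then reshaped back to that grid.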
Example #8
 def decoder(self, z):  # z is the mean of the VAE posterior distribution, shape=(None,None,512)
     nl = tf.nn.leaky_relu
     z_has_timesteps = (z.get_shape().ndims == 3)
     if z_has_timesteps:
         sh = tf.shape(z)
         z = flatten_two_dims(z)  # (None,512)
     with tf.variable_scope(self.scope + "decoder"):
         # de-convolution network; with spherical_obs=True, the output z.shape=(None,84,84,4)
         z = small_deconvnet(z,
                             nl=nl,
                             ch=4 if self.spherical_obs else 8,
                             positional_bias=True)
         if z_has_timesteps:
             z = unflatten_first_dim(z, sh)
         if self.spherical_obs:  # spherical loss: scale is a single shared constant across all dims, which simplifies the computation
             scale = tf.get_variable(name="scale",
                                     shape=(),
                                     dtype=tf.float32,
                                     initializer=tf.ones_initializer())
             scale = tf.maximum(scale, -4.)
             scale = tf.nn.softplus(scale)
             scale = scale * tf.ones_like(z)
         else:
             z, scale = tf.split(z, 2, -1)  # split the output into mu and scale
             scale = tf.nn.softplus(scale)
         # scale = tf.Print(scale, [scale])
         return tf.distributions.Normal(loc=z, scale=scale)
Example #9
    def get_last_features(self, x, reuse):
        x_has_timesteps = (x.get_shape().ndims == 5)
        if x_has_timesteps:
            sh = tf.shape(x)
            x = flatten_two_dims(x)

        with tf.variable_scope(self.scope + "_features", reuse=reuse):
            x = (tf.to_float(x) - self.ob_mean) / self.ob_std
            x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)

            if x_has_timesteps:
                x = unflatten_first_dim(x, sh)
            x = tf.reshape(x, [-1, sh[1], self.feat_dim])
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            init_1 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_1, self.last_h_in_1)
            if self.lstm2_size:
                init_2 = tf.contrib.rnn.LSTMStateTuple(self.last_c_in_2, self.last_h_in_2)
            if self.aux_input:
                prev_rews = tf.expand_dims(self.ph_last_rew, -1)
                x = tf.concat([x, prev_rews], -1)
            x, c_out_1, h_out_1 = lstm(self.lstm1_size)(x, initial_state=init_1)
            if self.lstm2_size:
                if self.aux_input:
                    prev_acs = tf.one_hot(self.ph_last_ac, depth=self.num_actions)
                    x = tf.concat([x, tf.cast(prev_acs, tf.float32)], -1)
                    x = tf.concat([x, self.ph_last_vel], -1)

                x, c_out_2, h_out_2 = lstm(self.lstm2_size)(x, initial_state=init_2)
        return x
Example #10
    def __init__(self, ob_space, ac_space, hidsize,
                  feat_dim, layernormalize, nl, scope="policy"):
        if layernormalize:
            print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
        self.layernormalize = layernormalize
        self.nl = nl
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape, name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x, name='value_function_output', units=1, activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
Example #11
 def decoder(self, z):
     nl = tf.nn.leaky_relu
     z_has_timesteps = (z.get_shape().ndims == 3)
     if z_has_timesteps:
         sh = tf.shape(z)
         z = flatten_two_dims(z)
     with tf.variable_scope(self.scope + "decoder"):
         z = small_deconvnet(z,
                             nl=nl,
                             ch=4 if self.spherical_obs else 8,
                             positional_bias=True)
         if z_has_timesteps:
             z = unflatten_first_dim(z, sh)
         if self.spherical_obs:
             scale = tf.get_variable(name="scale",
                                     shape=(),
                                     dtype=tf.float32,
                                     initializer=tf.ones_initializer())
             scale = tf.maximum(scale, -4.)
             scale = tf.nn.softplus(scale)
             scale = scale * tf.ones_like(z)
         else:
             z, scale = tf.split(z, 2, -1)
             scale = tf.nn.softplus(scale)
         # scale = tf.Print(scale, [scale])
         return tf.distributions.Normal(loc=z, scale=scale)
Example #12
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy",
                 nlstm=256):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                # h, self.dropout_assign_ops = choose_cnn(processed_x)
                # xs = batch_to_seq(h, nenv, nsteps)
                # ms = batch_to_seq(M, nenv, nsteps)
                # h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
                # h5 = seq_to_batch(h5)
                # vf = fc(h5, 'v', 1)[:,0]
                # self.pd, self.pi = self.pdtype.pdfromlatent(h5)
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
Example #13
 def get_loss_t2(self):
     ac = tf.one_hot(self.auxiliary_task.policy.a_samp_alt,
                     self.ac_space.n,
                     axis=2)
     self.next_pred = self.get_loss(ac)
     self.next_pred_flat = flatten_two_dims(self.next_pred)
     return tf.reduce_mean(
         (self.next_pred - tf.stop_gradient(self.out_features))**2, -1)
Example #14
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.n_env = n_env
        self.n_steps = n_steps
        self.n_batch = n_env * n_steps
        self.n_lstm = n_lstm
        self.reuse = reuse
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            # self.ac_pdtype = make_pdtype(ac_space)
            self.ac_pdtype = make_proba_dist_type(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(self.n_env, self.n_steps) +
                                        ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder(
                [self.n_env, self.n_steps], name='ac')
            self.masks_ph = tf.placeholder(tf.float32,
                                           [self.n_env, self.n_steps],
                                           name="masks_ph")  # mask (done t-1)
            self.flat_masks_ph = tf.reshape(self.masks_ph,
                                            [self.n_env * self.n_steps])
            self.states_ph = tf.placeholder(tf.float32,
                                            [self.n_env, n_lstm * 2],
                                            name="states_ph")  # states
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            self.sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=self.reuse)
            self.features = unflatten_first_dim(self.flat_features, self.sh)
Example #15
 def get_loss(self):
     with tf.variable_scope(self.scope):
         x = tf.concat([self.features, self.next_features], 2)
         sh = tf.shape(x)
         x = flatten_two_dims(x)
         x = fc(x, units=self.policy.hidsize, activation=activ)
         x = fc(x, units=self.ac_space.n, activation=None)
         param = unflatten_first_dim(x, sh)
         idfpd = self.policy.ac_pdtype.pdfromflat(param)
         return idfpd.neglogp(self.ac)
Example #16
 def get_loss(self, reuse=False):
     with tf.variable_scope(self.scope, reuse=reuse):
         x = tf.concat([self.features, self.next_features], 2)
         sh = tf.shape(x)
         x = flatten_two_dims(x)
         x = fc(x, units=self.policy.hidsize, activation=activ)
         # x = fc(x, units=self.ac_space.n, activation=None)
         x = fc(x, units=get_action_n(self.ac_space), activation=None)
         param = unflatten_first_dim(x, sh)
         # idfpd = self.policy.ac_pdtype.pdfromflat(param)
         idfpd = self.policy.ac_pdtype.proba_distribution_from_flat(param)
         return idfpd.neglogp(self.ac)
Example #17
 def get_features(self, x, reuse):
     nl = tf.nn.leaky_relu
     x_has_timesteps = (x.get_shape().ndims == 5)
     if x_has_timesteps:
         sh = tf.shape(x)
         x = flatten_two_dims(x)
     with tf.variable_scope(self.scope + "_features", reuse=reuse):
         x = (tf.to_float(x) - self.ob_mean) / self.ob_std
         x = small_convnet(x, nl=nl, feat_dim=self.feat_dim, last_nl=nl, layernormalize=False)
     if x_has_timesteps:
         x = unflatten_first_dim(x, sh)
     return x
Example #18
    def get_features(self, x, reuse):
        x_has_timesteps = (x.get_shape().ndims == 5)
        if x_has_timesteps:
            sh = tf.shape(x)
            x = flatten_two_dims(x)

        with tf.variable_scope(self.scope + "_features", reuse=reuse):
            x = tf.to_float(x)
            x = small_convnet(x, nl=self.nl, feat_dim=self.feat_dim, last_nl=None, layernormalize=self.layernormalize)

        if x_has_timesteps:
            x = unflatten_first_dim(x, sh)
        return x
Example #19
    def get_loss(self):
        ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
        sh = tf.shape(ac)
        ac = flatten_two_dims(ac)

        def add_ac(x):
            return tf.concat([x, ac], axis=-1)

        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(add_ac(x),
                                self.hidsize,
                                activation=tf.nn.leaky_relu)

            def residual(x):
                res = tf.layers.dense(add_ac(x),
                                      self.hidsize,
                                      activation=tf.nn.leaky_relu)
                res = tf.layers.dense(add_ac(res),
                                      self.hidsize,
                                      activation=None)
                return x + res

            for _ in range(4):
                x = residual(x)
            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(add_ac(x), n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
            #####################################################
            #ps = (tf.reduce_mean(tf.stop_gradient(self.out_features), -1))
            #print("reward: ", tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, -1).shape)
            #####################################################
            # tf.reduce_mean((x - tf.stop_gradient(self.out_features)) ** 2, -1), buf_ac
        return tf.reduce_mean(
            (x - tf.stop_gradient(self.out_features))**2, -1
        ), tf.stop_gradient(
            self.features
        )  # x: state prediction; error against the non-updated next-obs features -> reward of shape 128 (parallel threads) x 128 (rollout length)
Example #20
 def get_features(self, x, reuse):
     x_has_timesteps = (x.get_shape().ndims == 5)
     if x_has_timesteps:
         shape = tf.shape(x)
         x = flatten_two_dims(x)
     with tf.variable_scope(self.scope + '_features', reuse=reuse):
         x = (tf.cast(x, tf.float32) - self.ob_mean) / self.ob_std
         x = small_convnet(x,
                           nl=self.nl,
                           feat_dim=self.feat_dim,
                           last_nl=None,
                           layernormalize=self.layernormalize)
     if x_has_timesteps:
         x = unflatten_first_dim(x, shape)
     return x
Example #21
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope='policy'):
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.placeholder_observation = tf.placeholder(dtype=tf.int32,
                                                          shape=(None, None) +
                                                          ob_space.shape,
                                                          name='observation')
            self.placeholder_action = self.ac_pdtype.sample_placeholder(
                [None, None], name='action')
            self.pd = self.vpred = None
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            shape = tf.shape(self.placeholder_observation)
            x = flatten_two_dims(self.placeholder_observation)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, shape)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                value_pred = fc(x,
                                name='value_func_output',
                                units=1,
                                activation=None)
            pdparam = unflatten_first_dim(pdparam, shape)
            self.vpred = unflatten_first_dim(value_pred, shape)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
Example #22
 def get_loss(self):
     # Build the inverse dynamics model: input [feature(obs), feature(obs_next)] -> output the action-distribution parameters.
     # Form a Gaussian or softmax distribution over the actions, then use the negative log-probability as the inverse dynamics loss.
     with tf.variable_scope(self.scope):
         # features.shape=(None,None,512), next_features.shape=(None,None,512),
         x = tf.concat([self.features, self.next_features],
                       2)  # x.shape=(None,None,1024)
         sh = tf.shape(x)
         x = flatten_two_dims(x)  # (None, 1024): feature and next_feature fused
         x = fc(x, units=self.policy.hidsize,
                activation=activ)  # (None,512)
         x = fc(x, units=self.ac_space.n,
                activation=None)  # (None,4)    output action logits
         param = unflatten_first_dim(x, sh)  # (None,None,4)  restore the leading dims
         idfpd = self.policy.ac_pdtype.pdfromflat(param)  # build the distribution from the output logits
         # For a continuous action space this is a Gaussian negative log-likelihood; for a discrete one it is a softmax (cross-entropy) loss.
         return idfpd.neglogp(self.ac)  # shape equals the first two dims: (None,None)
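In symbols, the quantity returned above is the per-step inverse-dynamics negative log-likelihood

    \mathcal{L}_{\mathrm{idf}}(t) = -\log p\big(a_t \mid \phi(o_t), \phi(o_{t+1})\big),

where p is a softmax over the predicted logits for discrete actions and a Gaussian for continuous ones.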
Example #23
    def set_dynamics(self, dynamics):
        self.dynamics = dynamics
        with tf.variable_scope(self.scope):
            shaped = tf.shape(self.ph_ob)
            flat = flatten_two_dims(self.ph_ob)
            features = self.dynamics.auxiliary_task.get_features(flat, reuse=tf.AUTO_REUSE)
            pdparam = self.get_pdparam(features, False)
            pdparam = unflatten_first_dim(pdparam, shaped)
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)

            '''Alternate ac for forward dynamics'''
            pdparam_alt = self.get_pdparam(self.extracted_features, True)
            pdparam_alt = unflatten_first_dim(pdparam_alt, shaped)
            self.a_samp_alt = self.ac_pdtype.pdfromflat(pdparam_alt).sample()
Example #24
    def __init__(self, ob_space, ac_space, hidsize,
                 ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
        if layernormalize:
            print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std

        ''' Defining variables that'll be initialized with dynamics '''
        self.dynamics = None
        self.a_samp = None
        self.entropy = None
        self.nlp_samp = None
        self.a_samp_alt = None

        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape, name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            self.pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            self.extracted_features = tf.placeholder(dtype=tf.float32,
                                                     shape=self.flat_features.shape)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                vpred = fc(x, name='value_function_output', units=1, activation=None)
                y = fc(vpred, units=hidsize, activation=activ)
                y = fc(y, units=hidsize, activation=activ)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
Example #25
    def get_loss(self):
        sh = tf.shape(self.features)
        with tf.variable_scope(self.scope):
            x = flatten_two_dims(self.features)
            x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
            x = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)

            # def residual(x):
            #     res = tf.layers.dense(x, self.hidsize, activation=tf.nn.relu)
            #     res = tf.layers.dense(x, self.hidsize, activation=None)
            #     return x + res

            # for _ in range(4):
            #     x = residual(x)

            n_out_features = self.out_features.get_shape()[-1].value
            x = tf.layers.dense(x, n_out_features, activation=None)
            x = unflatten_first_dim(x, sh)
        return tf.reduce_mean((x - tf.stop_gradient(self.out_features))**2, -1)
Example #26
 def get_loss(self):
     with tf.variable_scope(self.scope):
         f_mean, f_logvar = tf.split(self.features, 2, -1)
         next_f_mean, next_f_logvar = tf.split(self.next_features, 2, -1)
         f_scale = tf.nn.softplus(f_logvar)
         f_distribution = tf.distributions.Normal(loc=f_mean, scale=f_scale)
         next_f_scale = tf.nn.softplus(next_f_logvar)
         next_f_distribution = tf.distributions.Normal(loc=next_f_mean, scale=next_f_scale)
         sh = tf.shape(f_mean)
         prior = tf.distributions.Normal(loc=tf.zeros(sh), scale=tf.ones(sh))
         kl_loss = tf.distributions.kl_divergence(f_distribution, prior)
         f_sample = f_distribution.sample()
         ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
         x = tf.concat([f_sample, ac], 2)
         sh = tf.shape(x)
         x = flatten_two_dims(x)
         x = fc(x, units=self.policy.hidsize, activation=activ)
         x = fc(x, units=2 * self.feat_dim, activation=None)
         x = unflatten_first_dim(x, sh)
         mean, logvar = tf.split(x, 2, -1)
         scale = tf.nn.softplus(logvar)
         post_distribution = tf.distributions.Normal(loc=mean, scale=scale)
         return self.beta * kl_loss + tf.distributions.kl_divergence(next_f_distribution, post_distribution)
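Per the code above, the returned loss is a sum of two KL terms: a prior-matching term on the current feature posterior (weighted by beta) and a forward-prediction term on the next feature posterior,

    \mathcal{L} = \beta\,\mathrm{KL}\big(q(\phi_t)\,\|\,\mathcal{N}(0, I)\big) + \mathrm{KL}\big(q(\phi_{t+1})\,\|\,p(\phi_{t+1} \mid \phi_t^{\mathrm{sample}}, a_t)\big),

where q splits features/next_features into a mean and a softplus scale, and p is the Normal parameterized by the two fully connected layers.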
Example #27
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.bool_actionclip = True  #TODO Need to make this flexible
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        #self.ac_range = ac_range
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(
                ac_space
            )  # RS: should give a continuous action space, given a continuous action env
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x,
                             name='pd',
                             units=pdparamsize,
                             activation=tf.nn.tanh)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.a_samp = self.clip_action(
                self.a_samp) if self.bool_actionclip else self.a_samp
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
            self.pd_logstd = pd.logstd
            self.pd_std = pd.std
            self.pd_mean = pd.mean
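The clip_action helper referenced above is not shown on this page. A plausible sketch, assuming the action space is a gym.spaces.Box with low/high bounds (this is purely an assumption, not the source implementation):

    def clip_action(self, a):
        # Hypothetical: clamp sampled actions to the Box bounds of the env.
        return tf.clip_by_value(a, self.ac_space.low, self.ac_space.high)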
Example #28
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        """ ob_space: (84,84,4);        ac_space: 4;
            ob_mean.shape=(84,84,4);    ob_std=1.7是标量;            hidsize: 512;
            feat_dim: 512;              layernormalize: False;      nl: tf.nn.leaky_relu.
        """
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(
                ac_space)  # softmax distribution for a discrete action space, Gaussian for a continuous one
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')  # initialization
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[
                0]  # equals 4 in Breakout; with a softmax this is the dimensionality of the action space

            sh = tf.shape(self.ph_ob)  # ph_ob.shape = (None,None,84,84,4)
            x = flatten_two_dims(self.ph_ob)  # x.shape = (None,84,84,4), first two dims merged
            self.flat_features = self.get_features(
                x, reuse=False)  # shape=(None,512)
            self.features = unflatten_first_dim(self.flat_features,
                                                sh)  # shape=(None,None,512)

            # Define the policy and value networks. Their input is the already-extracted feature, not the raw observation.
            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize,
                       activation=activ)  # activ=tf.nn.relu
                x = fc(x, units=hidsize, activation=activ)  # branches into the policy and value heads below
                pdparam = fc(x, name='pd', units=pdparamsize,
                             activation=None)  # action logits, shape=(None,4)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)  # value function, a linear unit, shape=(None,1)
            pdparam = unflatten_first_dim(pdparam, sh)  # shape=(None,None,4)
            self.vpred = unflatten_first_dim(
                vpred, sh)[:, :, 0]  # value; the trailing size-1 dim is dropped, so shape=(None,None)
            self.pd = pd = self.ac_pdtype.pdfromflat(
                pdparam)  # policy output as a softmax distribution, providing mean, neglogp, kl, entropy, sample, etc.
            self.a_samp = pd.sample()  # sampled action, int, shape (None,None), a scalar per position
            self.entropy = pd.entropy()  # entropy, (None,None)
            self.nlp_samp = pd.neglogp(
                self.a_samp)  # -log pi(a|s)  (None,None)
Example #29
 def get_loss_t1(self):
     ac = tf.one_hot(self.ac, self.ac_space.n, axis=2)
     self.first_pred = self.get_loss(ac)
     self.first_pred_flat = flatten_two_dims(self.first_pred)
     return tf.reduce_mean(
         (self.first_pred - tf.stop_gradient(self.out_features))**2, -1)
Example #30
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        # hidsize: all hidsize in fcn
        # feat_dim: feature dimension
        # nl: non-linear
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl  # non-linear
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            # the ac_pdtype does not contain any information about ob space
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            # pdparamsize: the number of pdparams
            pdparamsize = self.ac_pdtype.param_shape()[0]

            # sh: [None, None, h, w, c]
            sh = tf.shape(self.ph_ob)
            # ob: [None, None, h, w, c]
            # x: [None, h, w, c]
            x = flatten_two_dims(self.ph_ob)
            # flat_features returns the feature with shape [None, feat_dim]
            self.flat_features = self.get_features(x, reuse=False)
            # features: [None, None, feat_dim]
            self.features = unflatten_first_dim(self.flat_features, sh)

            # two head NN; pdparam is the params for pdtype
            # vpred outputs the estimated value
            with tf.variable_scope(scope, reuse=False):
                # activ = tf.nn.relu
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
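The fc and activ helpers used by the policy constructors above are not defined on this page. A minimal sketch consistent with how they are called (the exact signature in the original utilities is an assumption) is:

    activ = tf.nn.relu

    def fc(x, units, activation, name=None):
        # Thin wrapper over a dense layer, matching calls such as
        # fc(x, units=hidsize, activation=activ) and
        # fc(x, name='pd', units=pdparamsize, activation=None).
        return tf.layers.dense(x, units=units, activation=activation, name=name)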