Example 1
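This policy_fn closure builds a PolicyWithValue: it creates (or reuses) the observation placeholder, optionally normalizes and clips float observations, runs the policy network under the 'pi' scope (re-calling it with nenv for recurrent architectures), and uses a shared, copied, or custom value network.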
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                        encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(env=env,
                                 observations=X,
                                 latent=policy_latent,
                                 vf_latent=vf_latent,
                                 sess=sess,
                                 estimate_q=estimate_q,
                                 **extra_tensors)
        return policy
Example 2
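A standalone observation_input helper: it builds a typed placeholder for a Discrete, Box, or MultiDiscrete observation space and returns it together with its encoded form.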
def observation_input(ob_space, batch_size=None, name='Ob'):
    import numpy as np
    import tensorflow as tf
    from gym.spaces import Discrete, Box, MultiDiscrete
    from baselines.common.input import encode_observation
    assert isinstance(ob_space, (Discrete, Box, MultiDiscrete)), \
        'Baselines only deals with Discrete, Box and MultiDiscrete observation spaces'
    dtype = ob_space.dtype
    if dtype == np.int8:
        dtype = np.uint8
    shape = ob_space.shape  # works for any rank, not only 3-D image observations
    placeholder = tf.placeholder(shape=(batch_size,) + shape, dtype=dtype, name=name)
    return placeholder, encode_observation(ob_space, placeholder)
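A minimal usage sketch for the helper above (assuming TensorFlow 1.x, gym, and baselines are installed; the 84x84x4 Atari-style space is only an illustrative choice):

import numpy as np
import tensorflow as tf
from gym.spaces import Box

ob_space = Box(low=0, high=255, shape=(84, 84, 4), dtype=np.uint8)
ob_ph, encoded_ob = observation_input(ob_space, batch_size=None)
# ob_ph is a uint8 placeholder of shape (?, 84, 84, 4);
# encoded_ob is the float tensor produced by encode_observation.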
Example 3
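The same policy_fn as in Example 1; only the formatting of the PolicyWithValue call differs.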
    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)


        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy
Example 4
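A constructor that wires an observation placeholder through a network builder under the 'pi' scope and adds a single-unit fully connected value head.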
    def __init__(self, env, nbatch, nsteps, nenvs, network, **policy_kwargs):
        self.nbatch = nbatch
        self.nsteps = nsteps
        self.nenvs = nenvs
        self.ob_space = env.observation_space
        self.OUT = tf.placeholder(tf.float32, [nenvs])
        self.X = observation_placeholder(self.ob_space, batch_size=nbatch)
        encoded_x = encode_observation(self.ob_space, self.X)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            self.net = get_network_builder(network)(**policy_kwargs)
            self.h1 = self.net(encoded_x)
        self.h2 = fc(self.h1, 'vf', 1)
        self.out = self.h2[:, 0]
Example 5
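A policy_fn variant whose policy network always returns (latent, recurrent_tensors); the resulting latent is wrapped in a Policy object carrying additional training hyperparameters (beta, l2, lr, init_scale, ...).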
    def policy_fn(scope_name="pi",
                  nbatch=None,
                  nsteps=None,
                  sess=sess,
                  observ_placeholder=None):

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent, recurrent_tensors = policy_network(encoded_x)

            if recurrent_tensors is not None:
                # recurrent architecture, need a few more steps
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                    nbatch, nsteps)
                policy_latent, recurrent_tensors = policy_network(
                    encoded_x, nenv)
                extra_tensors.update(recurrent_tensors)

        policy = Policy(observations=X,
                        action_space=ac_space,
                        latent=policy_latent,
                        sess=sess,
                        train=train,
                        beta=beta,
                        l2=l2,
                        lr=lr,
                        init_scale=init_scale,
                        init_bias=init_bias,
                        trainable_variance=trainable_variance,
                        trainable_bias=trainable_bias,
                        init_logstd=init_logstd,
                        scope_name=scope_name,
                        clip=clip,
                        class_weights=class_weights,
                        **extra_tensors)
        return policy
Example 6
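pi_vf_fn extracts shared latent features for the current and next observation; the value head either shares the policy latent or applies global average pooling to the CNN feature map.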
    def pi_vf_fn(X, extra_tensors, nbatch, nsteps, recurrent_subname=None):
        """Shared network to extract latent feature for ob, ob_next"""
        ob_space = env.observation_space
        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            cnn_fm, policy_latent, recurrent_tensors = policy_network(
                encoded_x)

            if recurrent_tensors is not None:
                # recurrent architecture, need a few more steps
                nenv = nbatch // nsteps
                assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                    nbatch, nsteps)
                cnn_fm, policy_latent, recurrent_tensors = policy_network(
                    encoded_x, nenv)

                if recurrent_subname is not None:
                    new_recurrent_tensors = {}
                    for k, v in recurrent_tensors.items():
                        new_recurrent_tensors[recurrent_subname + '_' + k] = v
                    extra_tensors.update(new_recurrent_tensors)
                else:
                    extra_tensors.update(recurrent_tensors)

        with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
            _v_net = value_network

            if _v_net is None or _v_net == 'shared':
                vf_latent = policy_latent
            elif _v_net == 'gap':
                vf_latent = global_average_pooling(cnn_fm, **policy_kwargs)
            else:
                raise NotImplementedError

            vf = fc(vf_latent, 'vf_fc', 1)[:, 0]

        return cnn_fm, policy_latent, vf
Example 7
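A policy_fn whose policy network also returns a latent mean and an information loss term, both forwarded to PolicyWithValue; recurrent latents are not supported here.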
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent, policy_latent_mean, info_loss = policy_network(
                encoded_x)
            if isinstance(policy_latent, tuple):
                raise NotImplementedError()

        policy = PolicyWithValue(
            env=env,
            observations=X,
            arch=arch,
            latent=policy_latent,
            latent_mean=policy_latent_mean,
            info_loss=info_loss,
            # vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors)
        return policy
Example 8
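A multi-head policy_fn: the observation can be pre-encoded by a shared CNN, the 'pi'/'vf' scopes are suffixed with a head index, and the function returns the policy together with the placeholder and the encoded observation.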
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None,
                  encoded_x=None):
        ob_space = env.observation_space
        extra_tensors = {}

        if observ_placeholder is None:
            X = observation_placeholder(ob_space, batch_size=nbatch)
            if normalize_observations and X.dtype == tf.float32:
                new_encoded_x, rms = _normalize_clip_observation(X)
                extra_tensors['rms'] = rms
            else:
                new_encoded_x = X

            new_encoded_x = encode_observation(ob_space, new_encoded_x)
            new_encoded_x = get_network_builder("cnn")(
                **policy_kwargs)(new_encoded_x)
        else:
            X = observ_placeholder
            new_encoded_x = encoded_x

        with tf.variable_scope('pi' + str(head), reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(new_encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                        new_encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf' + str(head), reuse=tf.AUTO_REUSE):
                vf_latent, _ = _v_net(new_encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            head=head,
            vf_latent=vf_latent,  #this is the same as policy_latent...
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors)

        #print(policy.vf)

        return policy, X, new_encoded_x
Example 9
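A policy_fn with an optional, non-trainable 3x3 'randcnn' convolution that randomizes the observation before it reaches the policy network; the random layer's variables are exposed through extra_tensors.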
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None,
                  randomization=True):
        ob_space = env.observation_space

        extra_tensors = {}

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=None)

        encoded_x = encode_observation(ob_space, X)

        # Randomization
        if randomization:
            encoded_x = tf.layers.conv2d(
                encoded_x / 255.,
                3,
                3,
                padding='same',
                kernel_initializer=tf.initializers.glorot_normal(),
                trainable=False,
                name='randcnn') * 255.
            randcnn_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope="ppo2_model/randcnn")
            extra_tensors['randcnn_param'] = randcnn_param

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            extra_tensors['latent_fts'] = policy_latent
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(
                        encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(env=env,
                                 observations=X,
                                 latent=policy_latent,
                                 vf_latent=vf_latent,
                                 sess=sess,
                                 estimate_q=estimate_q,
                                 **extra_tensors)
        return policy
Example 10
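dynamics_fn applies the same pattern to a dynamics network built under an indexed 'dyn<index>' scope and wrapped in DynamicsWithValue; earlier variants of the scope handling and the value-network branch are kept as commented-out code.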
    def dynamics_fn(nbatch=None,
                    nsteps=None,
                    sess=None,
                    observ_placeholder=None,
                    index=None):
        ob_space = env.observation_space
        # ac_space = env.action_space
        # print("shape", (64,) + (ob_space.shape[0] + ac_space.shape[0], ))
        # Assume we have the same type for state and action space (Continuous - Continuous, Discrete - Discrete)
        # assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
        #     'Can only deal with Discrete and Box observation spaces for now'
        #
        # dtype = ob_space.dtype
        # if dtype == np.int8:
        #     dtype = np.uint8

        #X = tf.placeholder(shape=(nbatch,) + (ob_space.shape[0] + ac_space.shape[0], ), dtype=dtype, name='dyn_input')

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(
            ob_space, batch_size=nbatch)

        extra_tensors = {}

        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X

        encoded_x = encode_observation(
            ob_space, encoded_x
        )  #  Encode input in the way that is appropriate to the observation space(float)

        with tf.variable_scope('dyn%s' % index, reuse=tf.AUTO_REUSE):
            dynamics_latent = dynamics_network(encoded_x)
            if isinstance(dynamics_latent, tuple):
                dynamics_latent, recurrent_tensors = dynamics_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent dynamics: batch size {} smaller than nsteps {}'.format(
                        nbatch, nsteps)
                    dynamics_latent, recurrent_tensors = dynamics_network(
                        encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)

        #             print('dynamics%s'%character, train_dynamics_model[i])

        ### original
        # with tf.variable_scope('dyn', reuse=tf.AUTO_REUSE):
        #     dynamics_latent = dynamics_network(encoded_x)
        #     if isinstance(dynamics_latent, tuple):
        #         dynamics_latent, recurrent_tensors = dynamics_latent
        #
        #         if recurrent_tensors is not None:
        #             # recurrent architecture, need a few more steps
        #             nenv = nbatch // nsteps
        #             assert nenv > 0, 'Bad input for recurrent dynamics: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
        #             dynamics_latent, recurrent_tensors = dynamics_network(encoded_x, nenv)
        #             extra_tensors.update(recurrent_tensors)

        ### original delete  tf.variable_scope (first line)
        # dynamics_latent = dynamics_network(encoded_x)
        # if isinstance(dynamics_latent, tuple):
        #     dynamics_latent, recurrent_tensors = dynamics_latent
        #
        #     if recurrent_tensors is not None:
        #         # recurrent architecture, need a few more steps
        #         nenv = nbatch // nsteps
        #         assert nenv > 0, 'Bad input for recurrent dynamics: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
        #         dynamics_latent, recurrent_tensors = dynamics_network(encoded_x, nenv)
        #         extra_tensors.update(recurrent_tensors)

        # _v_net = value_network
        #
        # if _v_net is None or _v_net == 'shared':
        #     vf_latent = dynamics_latent
        # else:
        #     if _v_net == 'copy':
        #         _v_net = dynamics_network
        #     else:
        #         assert callable(_v_net)
        #
        #     with tf.variable_scope('dyn_vf', reuse=tf.AUTO_REUSE):
        #         vf_latent = _v_net(encoded_x)

        dynamics = DynamicsWithValue(
            env=env,
            observations=X,
            latent=dynamics_latent,
            sess=sess,
            index=index,  ### added
            **extra_tensors)
        return dynamics
Example 11
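A work-in-progress policy_fn that builds one observation placeholder per key of a dict observation space, encodes each entry, and feeds the resulting list to the policy network.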
    def policy_fn(nbatch=None,
                  nsteps=None,
                  sess=None,
                  observ_placeholder=None):
        print("Observation space being passed to policies.py {}".format(
            env.observation_space))
        ob_space = env.observation_space
        print("Observation space is in policies.py {}".format(ob_space))

        #placeholder list by Sai
        ph_list = []

        if isinstance(ob_space, dict):  # will accept dictionaries for now
            for eachKey in ob_space:
                ph_list.append(
                    observation_placeholder(ob_space[eachKey],
                                            batch_size=nbatch))

        print("The length of the placeholder list (ph_list) in policies.py {}".
              format(len(ph_list)))

        extra_tensors = {}

        if normalize_observations and ph_list[0].dtype == tf.float32:
            # note: this variant only normalizes (and keeps rms stats for) the first placeholder
            encoded_x = list(ph_list)
            encoded_x[0], rms = _normalize_clip_observation(ph_list[0])
            extra_tensors['rms'] = rms
        else:
            encoded_x = list(ph_list)  # copy so the raw placeholders in ph_list stay untouched

        count = 0
        for eachKey in ob_space:
            encoded_x[count] = encode_observation(ob_space[eachKey],
                                                  ph_list[count])
            count += 1

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(
                encoded_x)  # encoded_x is the list of encoded ph
            # if isinstance(policy_latent, tuple):
            #     policy_latent, recurrent_tensors = policy_latent

            #     if recurrent_tensors is not None:
            #         # recurrent architecture, need a few more steps
            #         nenv = nbatch // nsteps
            #         assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
            #         policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
            #         extra_tensors.update(recurrent_tensors)

        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(env=env,
                                 observations=ph_list,
                                 latent=policy_latent,
                                 vf_latent=vf_latent,
                                 sess=sess,
                                 estimate_q=estimate_q,
                                 **extra_tensors)
        return policy
Example 12
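A policy_fn with an optional mixreg/mixobs branch that linearly mixes pairs of observations using mixing-coefficient and index placeholders exposed through extra_tensors.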
    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None, mix_mode='nomix'):
        ob_space = env.observation_space

        extra_tensors = {}

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=None)

        if mix_mode in ['mixreg', 'mixobs']:
            COEFF = tf.placeholder(tf.float32, [None])
            INDICES = tf.placeholder(tf.int32, [None])
            OTHER_INDICES = tf.placeholder(tf.int32, [None])
            coeff = tf.reshape(COEFF, (-1, 1, 1, 1))
            encoded_x = tf.cast(X, tf.float32)
            encoded_x = coeff * tf.gather(encoded_x, INDICES, axis=0) + (1 - coeff) * tf.gather(encoded_x, OTHER_INDICES, axis=0)
            encoded_x = tf.cast(encoded_x, tf.uint8)
            extra_tensors['coeff'] = COEFF
            extra_tensors['indices'] = INDICES
            extra_tensors['other_indices'] = OTHER_INDICES
        elif mix_mode == 'nomix':
            encoded_x = X
        else:
            raise ValueError(f"Unknown mixing mode: {mix_mode} !")

        encoded_x = encode_observation(ob_space, encoded_x)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    extra_tensors.update(recurrent_tensors)


        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy
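The mixreg/mixobs branch above only defines the placeholders; below is a minimal sketch of how the mixing inputs could be generated per batch. The Beta(alpha, alpha) draw and the max-fold follow the mixreg convention, but the alpha value and the helper itself are assumptions, not part of this snippet's training loop.

import numpy as np

def make_mix_feed(nbatch, alpha=0.2, rng=np.random):
    # Hypothetical helper: mixing weights drawn from Beta(alpha, alpha), folded so
    # each observation keeps the larger share of itself.
    coeff = rng.beta(alpha, alpha, size=nbatch).astype(np.float32)
    coeff = np.maximum(coeff, 1.0 - coeff)
    indices = np.arange(nbatch, dtype=np.int32)                 # feeds extra_tensors['indices']
    other_indices = rng.permutation(nbatch).astype(np.int32)    # feeds extra_tensors['other_indices']
    return coeff, indices, other_indices

coeff, idx, other_idx = make_mix_feed(nbatch=8)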
Example 13
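A policy_fn that also passes a mirrored copy of the observation (flipped along axis 1) through the same policy network and hands the mirrored latent to PolicyWithValue as mirrorlatent.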
    def policy_fn(nbatch=None, nsteps=None, sess=None, observ_placeholder=None):
        ob_space = env.observation_space

        X = observ_placeholder if observ_placeholder is not None else observation_placeholder(ob_space, batch_size=nbatch)
        # Mirror the input X here; X has shape batch_size*84*84*4 (axis=1 flips top-bottom, axis=2 flips left-right)
        X_mirror = tf.reverse(X, axis=[1])
        extra_tensors = {}
        if normalize_observations and X.dtype == tf.float32:
            encoded_x, rms = _normalize_clip_observation(X)
            # Everything below with a 'mirror' suffix is added; it goes through the same computation as the normal observation
            encoded_x_mirror, _ = _normalize_clip_observation(X_mirror)
            extra_tensors['rms'] = rms
        else:
            encoded_x = X
            encoded_x_mirror = X_mirror

        encoded_x = encode_observation(ob_space, encoded_x)
        encoded_x_mirror = encode_observation(ob_space, encoded_x_mirror)

        with tf.variable_scope('pi', reuse=tf.AUTO_REUSE):
            policy_latent = policy_network(encoded_x)
            # Pass the mirrored observation through the same policy network
            policy_latent_mirror = policy_network(encoded_x_mirror)
            if isinstance(policy_latent, tuple):
                policy_latent, recurrent_tensors = policy_latent
                policy_latent_mirror, recurrent_tensors_mirror = policy_latent_mirror

                if recurrent_tensors is not None:
                    # recurrent architecture, need a few more steps
                    nenv = nbatch // nsteps
                    assert nenv > 0, 'Bad input for recurrent policy: batch size {} smaller than nsteps {}'.format(nbatch, nsteps)
                    policy_latent, recurrent_tensors = policy_network(encoded_x, nenv)
                    policy_latent_mirror, recurrent_tensors_mirror = policy_network(encoded_x_mirror, nenv)
                    extra_tensors.update(recurrent_tensors)


        _v_net = value_network

        if _v_net is None or _v_net == 'shared':
            vf_latent = policy_latent
        else:
            if _v_net == 'copy':
                _v_net = policy_network
            else:
                assert callable(_v_net)

            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                # TODO recurrent architectures are not supported with value_network=copy yet
                vf_latent = _v_net(encoded_x)

        policy = PolicyWithValue(
            env=env,
            observations=X,
            latent=policy_latent,
            # mirrorlatent is the mirrored observation's latent from the policy network (before the
            # fully connected layers); the mirrored value and policy networks share it later, so no
            # separate vf_latent_mirror is defined
            mirrorlatent=policy_latent_mirror,
            vf_latent=vf_latent,
            sess=sess,
            estimate_q=estimate_q,
            **extra_tensors
        )
        return policy