def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
        """Build a feed-forward CNN policy/value graph.

        Args:
            sess: Session used by the ``step`` and ``value`` closures to run
                the graph.
            ob_space: Observation space; ``ob_space.shape`` must unpack into
                exactly three values (used as height, width, channels).
            ac_space: Action space; ``ac_space.n`` sets the width of the
                policy head and the space is handed to ``make_pdtype``.
            nbatch: Leading dimension of the observation placeholder.
            nsteps: Not used by this constructor.
            reuse: Forwarded to the ``"model"`` variable scope.

        After construction the instance exposes ``X`` (observation
        placeholder), ``pi`` (policy head output), ``vf`` (value output with
        the trailing unit axis sliced away), ``pdtype``, ``pd``,
        ``initial_state`` (always ``None`` here), and the callables ``step``
        and ``value``.
        """
        height, width, channels = ob_space.shape
        num_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8,
                                (nbatch, height, width, channels))  # obs

        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(obs_ph)
            logits = fc(features, 'pi', num_actions, init_scale=0.01)
            # Slice inside the scope so the value output is 1-D per batch row.
            state_value = fc(features, 'v', 1)[:, 0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(logits)

        sampled_action = self.pd.sample()
        sampled_neglogp = self.pd.neglogp(sampled_action)
        # No recurrent state in this policy.
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            # Extra positional/keyword arguments are accepted and ignored.
            fetches = [sampled_action, state_value, sampled_neglogp]
            a, v, neglogp = sess.run(fetches, {obs_ph: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            feed = {obs_ph: ob}
            return sess.run(state_value, feed)

        self.X = obs_ph
        self.pi = logits
        self.vf = state_value
        self.step = step
        self.value = value
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        """Build a CNN + LSTM policy/value graph.

        Args:
            sess: Session used by the ``step`` and ``value`` closures.
            ob_space: Observation space; ``ob_space.shape`` must unpack into
                exactly three values (used as height, width, channels).
            ac_space: Action space; ``ac_space.n`` sets the width of the
                policy head and the space is handed to ``make_pdtype``.
            nbatch: Leading dimension of the observation and mask
                placeholders.
            nsteps: Divides ``nbatch`` to obtain the number of environments
                and is forwarded to ``batch_to_seq``.
            nlstm: Passed to ``lstm`` as ``nh``; the state placeholder and
                ``initial_state`` are ``2 * nlstm`` wide.
            reuse: Forwarded to the ``"model"`` variable scope.

        After construction the instance exposes ``X``, ``M``, ``S``
        (observation, mask and state placeholders), ``pi``, ``vf`` (the
        unsliced value head output), ``pdtype``, ``pd``, ``initial_state``
        (a float32 zero array of shape ``(nbatch // nsteps, 2 * nlstm)``),
        and the callables ``step`` and ``value``.
        """
        num_envs = nbatch // nsteps
        state_width = nlstm * 2

        height, width, channels = ob_space.shape
        num_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8,
                                (nbatch, height, width, channels))  # obs
        mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        state_ph = tf.placeholder(tf.float32,
                                  [num_envs, state_width])  # states

        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(obs_ph)
            feature_seq = batch_to_seq(features, num_envs, nsteps)
            mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
            lstm_out, next_state = lstm(feature_seq,
                                        mask_seq,
                                        state_ph,
                                        'lstm1',
                                        nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)
            logits = fc(lstm_out, 'pi', num_actions)
            value_head = fc(lstm_out, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(logits)

        # First (only) column of the value head, one entry per batch row.
        state_value = value_head[:, 0]
        sampled_action = self.pd.sample()
        sampled_neglogp = self.pd.neglogp(sampled_action)
        self.initial_state = np.zeros((num_envs, state_width),
                                      dtype=np.float32)

        def step(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            fetches = [sampled_action, state_value, next_state,
                       sampled_neglogp]
            return sess.run(fetches, feed)

        def value(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run(state_value, feed)

        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = logits
        self.vf = value_head
        self.step = step
        self.value = value
def _matching_fc(tensor, name, size, init_scale, init_bias):
    """Return ``tensor`` with a last dimension of ``size``.

    When the last dimension of ``tensor`` already equals ``size`` the very
    same object is returned. Otherwise the tensor is passed through ``fc``
    under scope ``name`` with the given ``init_scale`` and ``init_bias``.
    """
    last_dim = tensor.shape[-1]
    if last_dim == size:
        # Already the requested width: pass through untouched.
        return tensor
    return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
def nature_cnn(unscaled_images):
    """
    CNN from Nature paper.

    Casts the input to float32 and divides it by 255, applies three
    ReLU-activated ``conv`` layers (scopes ``c1``, ``c2``, ``c3``), flattens
    the result with ``conv_to_fc`` and returns the ReLU of a 512-unit ``fc``
    layer (scope ``fc1``). Every layer is initialised with
    ``init_scale=sqrt(2)``.
    """
    # tf.cast converts the input to the requested dtype before scaling.
    images = tf.cast(unscaled_images, tf.float32) / 255.

    # (scope, number of filters, filter size, stride) for each conv layer.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    out = images
    for scope, filters, filter_size, stride in conv_specs:
        out = tf.nn.relu(
            conv(out,
                 scope,
                 nf=filters,
                 rf=filter_size,
                 stride=stride,
                 init_scale=np.sqrt(2)))

    flat = conv_to_fc(out)
    return tf.nn.relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        """Build an MLP + layer-norm-LSTM policy/value graph for vector input.

        Args:
            sess: Session used by the ``step`` and ``value`` closures.
            ob_space: Observation space; only ``ob_space.shape[0]`` is read
                and used as the width of the observation placeholder.
            ac_space: Action space; ``ac_space.shape[0]`` sets the width of
                the ``logstd`` variable and the space is handed to
                ``make_pdtype``.
            nbatch: Leading dimension of the observation and mask
                placeholders.
            nsteps: Divides ``nbatch`` to obtain the number of environments
                and is forwarded to ``batch_to_seq``.
            nlstm: Passed to ``lnlstm`` as ``nh``; the state placeholder and
                ``initial_state`` are ``2 * nlstm`` wide.
            reuse: Forwarded to the ``"model"`` variable scope.

        The policy output is built from the first two columns of the LSTM
        output only, so it is always two columns wide.
        NOTE(review): ``logstd`` is ``[1, ac_space.shape[0]]``; this
        presumably requires a two-dimensional action space - confirm.

        Named graph nodes created here: placeholders ``phOb``,
        ``phMaskDone``, ``phCellState``; tensors ``pi``, ``action`` and
        ``newCellState``. Also prints the distribution and sample shapes
        once at construction time.
        """
        num_envs = nbatch // nsteps
        state_width = nlstm * 2
        obs_dims = (nbatch, ob_space.shape[0])
        act_dim = ac_space.shape[0]

        obs_ph = tf.placeholder(tf.float32, obs_dims, name='phOb')
        mask_ph = tf.placeholder(tf.float32, [nbatch],
                                 name='phMaskDone')  # mask (done t-1)
        state_ph = tf.placeholder(
            tf.float32, [num_envs, state_width],
            name='phCellState')  # states and output: (c, h)

        with tf.variable_scope("model", reuse=reuse):
            hidden = mlp(obs_ph)
            # batch_to_seq presumably yields a list of per-step tensors with
            # num_envs rows each - confirm against its definition.
            hidden_seq = batch_to_seq(hidden, num_envs, nsteps)
            mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
            lstm_out, next_state = lnlstm(hidden_seq,
                                          mask_seq,
                                          state_ph,
                                          'lstm1',
                                          nh=nlstm)
            lstm_out = seq_to_batch(lstm_out)

            # Column 0 -> tanh * 3, column 1 -> sigmoid * 10.
            first_component = tf.nn.tanh(lstm_out[:, :1]) * 3
            second_component = tf.nn.sigmoid(lstm_out[:, 1:2]) * 10
            policy_out = tf.concat([first_component, second_component],
                                   axis=1,
                                   name='pi')

            value_head = fc(lstm_out, 'v', 1)
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, act_dim],
                                     initializer=tf.zeros_initializer())

        # policy_out * 0.0 + logstd expands logstd to one row per batch row.
        tiled_logstd = policy_out * 0.0 + logstd
        pdparam = tf.concat([policy_out, tiled_logstd], axis=1)
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        state_value = value_head[:, 0]
        sampled_action = self.pd.sample()
        # Named pass-through nodes; use these tensors when running inference.
        tf.add(sampled_action, 0, name='action')
        tf.add(next_state, 0, name='newCellState')
        print('sel.pd.shape', self.pd.shape, sampled_action.shape)
        sampled_neglogp = self.pd.neglogp(sampled_action)
        self.initial_state = np.zeros((num_envs, state_width),
                                      dtype=np.float32)

        def step(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            fetches = [sampled_action, state_value, next_state,
                       sampled_neglogp]
            return sess.run(fetches, feed)

        def value(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run(state_value, feed)

        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = policy_out
        self.vf = value_head
        self.step = step
        self.value = value
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 reuse=False,
                 training=True):  # pylint: disable=W0613
        """Build a feed-forward MLP policy/value graph for vector input.

        The policy and value functions use separate two-layer, 100-unit
        towers (scopes ``pi_fc1``/``pi_fc2`` and ``vf_fc1``/``vf_fc2``) with
        ``lkrelu`` activations. The policy output is ``tanh(fc) * 10``.

        Args:
            sess: Session used by the ``step`` and ``value`` closures.
            ob_space: Observation space; ``ob_space.shape[0]`` is the width
                of the observation placeholder (batch dimension is ``None``).
            ac_space: Action space; ``ac_space.shape[0]`` sets the width of
                the policy output and of ``logstd``, and the space is handed
                to ``make_pdtype``.
            nbatch: Not used by this constructor (the placeholder has a
                dynamic batch dimension).
            reuse: Forwarded to the ``"model"`` variable scope.
            training: Not used by this constructor.

        After construction the instance exposes ``X`` (placeholder named
        ``Ob``), ``pi``, ``vf`` (value output with the trailing unit axis
        sliced away), ``action`` (identity of the sampled action, named
        ``action``), ``pdtype``, ``pd``, ``initial_state`` (always ``None``),
        and the callables ``step`` and ``value``.
        """
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, [None, ob_space.shape[0]], name='Ob')

        with tf.variable_scope("model", reuse=reuse):
            activ = lkrelu

            # Policy tower.
            pi_h1 = activ(fc(X, 'pi_fc1', nh=100, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=100, init_scale=np.sqrt(2)))
            pi = tf.nn.tanh(fc(pi_h2, 'pi', nh=actdim)) * 10

            # Value tower (separate weights from the policy tower).
            vf_h1 = activ(fc(X, 'vf_fc1', nh=100, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=100, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            # State-independent log standard deviation, one entry per action
            # dimension, initialised to zero.
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        # pi * 0.0 + logstd expands logstd to the same shape as pi so the two
        # can be concatenated column-wise.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        # Probability-distribution type for this action space; per the
        # original author's note this is the DiagGaussianPd class - confirm
        # against make_pdtype.
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        # Use this tensor as the action when running inference.
        self.action = tf.identity(a0, name='action')
        # NOTE: the sampled action is not clipped to the range of ``pi``.
        neglogp0 = self.pd.neglogp(a0)

        # No recurrent state in this policy.
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            # Feed the observations into the placeholder; extra arguments are
            # accepted and ignored.
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
def mlp(unscaled_vector):
    """Three-layer fully connected feature extractor.

    Applies ``fc1`` (100 units, ReLU), ``fc2`` (100 units, tanh) and a linear
    ``fc3`` output layer (64 units) to ``unscaled_vector`` and returns the
    64-wide result. All layers use ``init_scale=sqrt(1.0)``.

    The output layer previously read from the first hidden layer and reused
    the scope name ``'fc2'`` with a different width, which left the tanh
    layer unused and collides with the existing ``fc2`` variables when ``fc``
    creates them via ``tf.get_variable`` (assumed - confirm against ``fc``).
    It now consumes the second hidden layer under its own scope.
    """
    h = tf.nn.relu(fc(unscaled_vector, 'fc1', nh=100, init_scale=np.sqrt(1.0)))
    h2 = tf.nn.tanh(fc(h, 'fc2', nh=100, init_scale=np.sqrt(1.0)))
    # Distinct scope name so the 100x64 weights do not clash with fc2's.
    h3 = fc(h2, 'fc3', nh=64, init_scale=np.sqrt(1.0))
    return h3