Code Example #1
File: policies.py Project: Divyankpandey/baselines
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
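
The snippets on this page all lean on the same small set of helper functions. As a rough guide (module paths differ between baselines versions and forks, so treat these as assumptions rather than exact imports), the names used above typically resolve to:

# Assumed imports for the helpers used throughout these examples (paths vary by fork).
import numpy as np
import tensorflow as tf

from baselines.a2c.utils import (conv, conv_to_fc, fc, lstm, lnlstm,
                                 batch_to_seq, seq_to_batch, sample)
from baselines.common.distributions import make_pdtype
from baselines.common.input import observation_input
# nature_cnn lives in baselines.common.models in newer versions and is defined
# locally in policies.py in older forks.
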
Code Example #2
File: policies.py Project: Divyankpandey/baselines
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
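
For context, step and value are meant to be driven by a runner that carries the LSTM state and the done mask from one call to the next. Below is a minimal sketch of that loop, assuming a hypothetical vectorized environment venv with nenv parallel copies and policy being an instance of the class above (it mirrors what the a2c/ppo2 runners do, but it is not their actual code):

import numpy as np

def collect_rollout(policy, venv, nenv, nsteps):
    # Drive a recurrent policy through a vectorized env, carrying the LSTM
    # state and the done mask between calls.
    obs = venv.reset()
    state = policy.initial_state                # zeros of shape (nenv, 2*nlstm)
    dones = np.zeros(nenv, dtype=np.float32)    # M: 1.0 where the previous step ended an episode
    trajectory = []
    for _ in range(nsteps):
        actions, values, state, neglogps = policy.step(obs, state, dones)
        obs, rewards, dones, infos = venv.step(actions)
        dones = np.asarray(dones, dtype=np.float32)
        trajectory.append((obs, actions, rewards, values, neglogps))
    return trajectory, state
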
Code Example #3
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
Code Example #4
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Code Example #5
File: policies.py Project: wangyuhuix/TRGPPO
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Code Example #6
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
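
batch_to_seq and seq_to_batch only rearrange data: the flat (nenv*nsteps, features) batch, laid out env-major as the a2c/ppo2 runners produce it, becomes a list of nsteps arrays of shape (nenv, features), and seq_to_batch undoes that. A small NumPy analogue, for illustration only:

import numpy as np

def batch_to_seq_np(h, nenv, nsteps):
    # (nenv*nsteps, feat) -> list of nsteps arrays, each of shape (nenv, feat)
    h = h.reshape(nenv, nsteps, -1)
    return [h[:, t, :] for t in range(nsteps)]

def seq_to_batch_np(seq):
    # list of nsteps arrays of shape (nenv, feat) -> (nenv*nsteps, feat)
    return np.stack(seq, axis=1).reshape(-1, seq[0].shape[-1])

x = np.arange(2 * 3 * 4, dtype=np.float32).reshape(2 * 3, 4)  # nenv=2, nsteps=3, feat=4
assert np.array_equal(seq_to_batch_np(batch_to_seq_np(x, nenv=2, nsteps=3)), x)
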
Code Example #7
File: policies.py Project: IcarusTan/baselines
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
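
Inside the lstm helper, the per-step mask zeroes the carried cell and hidden state wherever an episode ended at t-1, and then a standard LSTM update is applied. One time step looks roughly like this (a NumPy sketch of the recurrence, not the actual baselines implementation):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def masked_lstm_step(x, m, c, h, wx, wh, b):
    # x: (nenv, nin), m: (nenv,) done mask from t-1, c and h: (nenv, nh)
    c = c * (1.0 - m)[:, None]            # reset cell state at episode boundaries
    h = h * (1.0 - m)[:, None]            # reset hidden state at episode boundaries
    z = x @ wx + h @ wh + b               # all four gates at once, shape (nenv, 4*nh)
    i, f, o, u = np.split(z, 4, axis=1)
    c = sigmoid(f) * c + sigmoid(i) * np.tanh(u)
    h = sigmoid(o) * np.tanh(c)
    return c, h
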
Code Example #8
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Code Example #9
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)[:, 0]
            lp = fc(h, 'lp', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, vf, lp, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(vf, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.lp = lp
        self.step = step
        self.value = value
Code Example #10
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        ob_g, ob_l = tf.split(X, 2, axis=1)
        ob_g = tf.squeeze(ob_g, axis=1) - 128.0
        ob_l = tf.squeeze(ob_l, axis=1) - 128.0

        # Conv layer
        net_g = vggm1234(ob_g)
        net_l = vggm1234(ob_l)
        feat = tf.concat([net_g, net_l], 1)

        # LSTM
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = batch_to_seq(feat, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        # FC
        h = slim.fully_connected(h, 4, scope='fc', activation_fn=tf.nn.tanh)

        return (feat, h), {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
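
Here X is assumed to stack two views of the observation along axis 1 (a global and a local crop); the split/squeeze lines simply separate them and roughly zero-center the pixel values. In NumPy terms (the shapes below are assumptions for illustration):

import numpy as np

# Assumed layout: (nbatch, 2, H, W, C), global view first, local view second.
X = np.random.randint(0, 256, size=(4, 2, 107, 107, 3)).astype(np.float32)
ob_g, ob_l = np.split(X, 2, axis=1)           # two halves of shape (4, 1, 107, 107, 3)
ob_g = np.squeeze(ob_g, axis=1) - 128.0       # (4, 107, 107, 3), roughly zero-centered
ob_l = np.squeeze(ob_l, axis=1) - 128.0
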
Code Example #11
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)
        for i in range(len(hiddens) - 1):
            h = utils.fc(h,
                         'mlp_fc{}'.format(i),
                         nh=hiddens[i],
                         init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        nlstm = hiddens[-1]

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = utils.batch_to_seq(h, nenv, nsteps)
        ms = utils.batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = utils.seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Code Example #12
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet,
                 reuse=False,
                 flownet=None, train_from_scratch=False,
                 recurrent=None,
                 large_cnn=False, nlstm=64, add_predicted_flow_to_vec=False, diff_frames=False):
        ob_shape_vec = (nbatch,) + ob_space["vector"].shape
        nh, nw, nc = ob_space["image"].shape
        ob_shape_im = (nbatch, nh, nw, nc)

        actdim = ac_space.shape[0]
        X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # obs
        X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')

        if add_flownet:
            # adding previous image placeholder:
            X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p')  # obs t-1
        else:
            X_p = None

        if recurrent:
            nenv = nbatch // nsteps
            M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
            S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h_im = mujoco_cnn(
                X_im, 'pi', nbatch, add_flownet and not add_predicted_flow_to_vec,
                X_p, flownet,
                train_from_scratch,
                large_cnn, diff_frames)

            if add_predicted_flow_to_vec:
                flow_vec = get_flow_vec(
                    X_im, 'pi', nbatch, add_flownet,
                    X_p, flownet,
                    train_from_scratch,
                    large_cnn, diff_frames)
                h_vec = tf.concat([X_vec, flow_vec], axis=-1)
                h_vec = activ(fc(h_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            else:
                h_vec = activ(fc(X_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h1 = tf.concat([h_im, h_vec], 1)

            if recurrent:
                xs = batch_to_seq(h1, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)
                if recurrent == 'lstm':
                    h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
                else:
                    assert recurrent == 'lnlstm'
                    h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
                h2 = seq_to_batch(h5)
            else:
                h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)

            vf = fc(h2, 'vf', 1)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        a0_r = self.pd.mode()
        neglogp0 = self.pd.neglogp(a0)
        if not recurrent:
            self.initial_state = None
        else:
            self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)


        self.placeholder_dict = {
            "image": X_im,
            "vector": X_vec
        }
        if add_flownet:
            self.placeholder_dict["last_image"] = X_p

        if not recurrent:
            def step(ob, *_args, remove_noise=False, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                if not remove_noise:
                    a, v, neglogp = sess.run([a0, v0, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, neglogp = sess.run([a0_r, v0, neglogp0], feed_dict=feed_dict)
                return a, v, self.initial_state, neglogp

            def value(ob, *_args, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                return sess.run(v0, feed_dict=feed_dict)
        else:
            def step(ob, state, mask, remove_noise=False):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                if not remove_noise:
                    a, v, s, neglogp = sess.run([a0, v0, snew, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, s, neglogp = sess.run([a0_r, v0, snew, neglogp0], feed_dict=feed_dict)
                return a, v, s, neglogp

            def value(ob, state, mask):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                return sess.run(v0, feed_dict=feed_dict)

        self.X_im = X_im
        self.X_vec = X_vec
        self.X_p = X_p
        self.pi = pi
        if not recurrent:
            self.vf = v0
        else:
            self.vf = vf
            self.M = M
            self.S = S
        self.step = step
        self.value = value
Code Example #13
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 feature_mlp=True):

        # Here the batch size is 1, i.e. one trajectory
        # also assume nenvs=1
        if nsteps is None:
            ob_shape = (None, ) + ob_space.shape
            M = tf.placeholder(tf.float32, [None])
        else:
            ob_shape = (nsteps, ) + ob_space.shape
            M = tf.placeholder(tf.float32, [nsteps])

        if len(ac_space.shape) == 0:
            # discrete set of actions
            nact = ac_space.n
            discrete = True
        else:
            actdim = ac_space.shape[0]
            discrete = False
        X = tf.placeholder(tf.float32, ob_shape, name="Ob")
        S = tf.placeholder(tf.float32, [1, nlstm * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            if feature_mlp:
                h1 = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
                h2 = activ(fc(h1, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
                xs = batch_to_seq(h2, 1, nsteps)
            else:
                xs = batch_to_seq(X, 1, nsteps)
            ms = batch_to_seq(M, 1, nsteps)
            h5, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, "vf", 1)
            if discrete:
                pi = fc(h5, "pi", nact, init_scale=0.01)
            else:
                pi = fc(h5, "pi", actdim, init_scale=0.01)
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, actdim],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        if discrete:
            self.pd = self.pdtype.pdfromflat(pi)
        else:
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((1, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
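
In the continuous case above, pdparam concatenates the predicted mean with a state-independent logstd, and make_pdtype(...).pdfromflat splits it back apart into a diagonal Gaussian. The negative log-likelihood that pd.neglogp computes then corresponds to the following (a NumPy sketch, assuming the [mean | logstd] layout used above):

import numpy as np

def diag_gaussian_neglogp(pdparam, x):
    # pdparam: (batch, 2*actdim) laid out as [mean | logstd]; x: (batch, actdim)
    mean, logstd = np.split(pdparam, 2, axis=1)
    std = np.exp(logstd)
    return (0.5 * np.sum(np.square((x - mean) / std), axis=1)
            + 0.5 * np.log(2.0 * np.pi) * x.shape[1]
            + np.sum(logstd, axis=1))
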
Code Example #14
File: base.py Project: wwxFromTju/rl-generalization
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 feature_mlp=True):

        nenv = nbatch // nsteps
        # assume that inputs are vectors and reward is a scalar
        if len(ac_space.shape) == 0:
            # discrete set of actions, input as one-hot encodings
            nact = ac_space.n
            discrete = True
            input_length = ob_space.shape[0] + nact + 2
        else:
            actdim = ac_space.shape[0]
            discrete = False
            input_length = ob_space.shape[0] + actdim + 2
        input_shape = (nbatch, input_length)

        X = tf.placeholder(tf.float32, input_shape, name="Input")
        M = tf.placeholder(tf.float32,
                           [nbatch])  # mask (done with a trial at time t-1)
        S = tf.placeholder(tf.float32,
                           [nenv, nlstm * 2])  # states of the recurrent policy
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            if feature_mlp:
                print("Using feature network in front of LSTM")
                h1 = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
                h2 = activ(fc(h1, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
                xs = batch_to_seq(h2, nenv, nsteps)
            else:
                print("No feature network in front of LSTM")
                xs = batch_to_seq(X, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, "vf", 1)
            if discrete:
                pi = fc(h5, "pi", nact, init_scale=0.01)
            else:
                pi = fc(h5, "pi", actdim, init_scale=0.01)
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, actdim],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        if discrete:
            self.pd = self.pdtype.pdfromflat(pi)
        else:
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, ac, rew, done, mask):
            # if discrete action space, convert ac to one-hot encoding and done to int
            rew = np.reshape(np.asarray([rew]), (nbatch, 1))
            done = np.reshape(np.asarray([done], dtype=float), (nbatch, 1))
            if discrete:
                if ac[0] == -1:
                    ac = np.zeros((nbatch, nact), dtype=np.int)
                else:
                    ac = np.reshape(np.asarray([ac]), (nbatch, ))
                    ac = np.eye(nact)[ac]
                x = np.concatenate((ob, ac, rew, done), axis=1)
            else:
                ac = np.reshape(np.asarray([ac]), (nbatch, actdim))
                x = np.concatenate((ob, ac, rew, done), axis=1)
            return sess.run([a0, v0, snew, neglogp0], {
                X: x,
                S: state,
                M: mask
            })

        def value(ob, state, ac, rew, done, mask):
            rew = np.reshape(np.asarray([rew]), (nbatch, 1))
            done = np.reshape(np.asarray([done], dtype=float), (nbatch, 1))
            if discrete:
                if ac[0] == -1:
                    ac = np.zeros((nbatch, nact), dtype=np.int)
                else:
                    ac = np.reshape(np.asarray([ac]), (nbatch, ))
                    ac = np.eye(nact)[ac]
                x = np.concatenate((ob, ac, rew, np.array(done, dtype=float)),
                                   axis=1)
            else:
                ac = np.reshape(np.asarray([ac]), (nbatch, actdim))
                x = np.concatenate((ob, ac, rew, np.array(done, dtype=float)),
                                   axis=1)
            return sess.run(v0, {X: x, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Code Example #15
    def _init(self,
              ob_name,
              m_name,
              svfname,
              spiname,
              ob_space,
              ac_space,
              usecnn=False,
              nlstm=256):
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        init_std = 1.0
        nenv = 1
        # nbatch = nenv * nsteps

        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.ob = U.get_placeholder(name=ob_name,
                                    dtype=tf.float32,
                                    shape=[sequence_length] +
                                    list(ob_space.shape))
        M = U.get_placeholder(m_name, tf.float32,
                              [sequence_length])  # mask (done t-1)
        Svf = U.get_placeholder(svfname, tf.float32,
                                [nenv, nlstm * 2])  # states
        Spi = U.get_placeholder(spiname, tf.float32,
                                [nenv, nlstm * 2])  # states

        with tf.variable_scope("vf"):
            if usecnn:
                h = nature_cnn(self.ob)
            else:
                h = self.ob
            # xs = batch_to_seq(h, nenv, nsteps)
            # ms = batch_to_seq(M, nenv, nsteps)
            # h5, vfsnew = lstm(xs, ms, Svf, 'lstmvf', nh=nlstm)
            h5, vfsnew = lstm(h, M, Svf, 'lstmvf', nh=nlstm)
            h5 = seq_to_batch(h5)
            self.vpred = fc(h5, 'value', 1)

        with tf.variable_scope("pol"):

            if usecnn:
                h = nature_cnn(self.ob)
            else:
                h = self.ob
            # xs = batch_to_seq(h, nenv, nsteps)
            # ms = batch_to_seq(M, nenv, nsteps)
            # h5, pisnew = lstm(xs, ms, Spi, 'lstmpi', nh=nlstm)
            h5, pisnew = lstm(h, M, Spi, 'lstmpi', nh=nlstm)
            h5 = seq_to_batch(h5)

            self.action_dim = ac_space.shape[0]
            self.varphi = h5
            self.varphi_dim = 64

            stddev_init = np.ones([1, self.action_dim]) * init_std
            prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
            self.prec = tf.get_variable(
                name="prec",
                shape=[1, self.action_dim],
                initializer=tf.constant_initializer(prec_init))
            kt_init = np.ones([self.varphi_dim, self.action_dim
                               ]) * 0.5 / self.varphi_dim
            ktprec_init = kt_init * prec_init
            self.ktprec = tf.get_variable(
                name="ktprec",
                shape=[self.varphi_dim, self.action_dim],
                initializer=tf.constant_initializer(ktprec_init))
            kt = tf.divide(self.ktprec, self.prec)
            mean = tf.matmul(h5, kt)

            logstd = tf.log(tf.sqrt(1. / self.prec))

            self.prec_get_flat = U.GetFlat([self.prec])
            self.prec_set_from_flat = U.SetFromFlat([self.prec])

            self.ktprec_get_flat = U.GetFlat([self.ktprec])
            self.ktprec_set_from_flat = U.SetFromFlat([self.ktprec])

            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)
        self.M = M
        self.Svf = Svf
        self.Spi = Spi

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, self.ob, M, Spi, Svf],
                               [ac, self.vpred, pisnew, vfsnew])

        # Get all policy parameters
        vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 self.scope + '/pol')
        # Remove log-linear parameters ktprec and prec to get only non-linear parameters
        del vars[-1]
        del vars[-1]
        beta_params = vars

        # Flat w_beta
        beta_len = np.sum(
            [np.prod(p.get_shape().as_list()) for p in beta_params])
        w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

        # Unflatten w_beta
        beta_shapes = list(map(tf.shape, beta_params))
        w_beta_unflat_var = self.unflatten_tensor_variables(
            w_beta_var, beta_shapes)

        # w_beta^T * \grad_beta \varphi(s)^T
        v = tf.placeholder(dtype=self.varphi.dtype,
                           shape=self.varphi.get_shape(),
                           name="v_in_Rop")
        features_beta = self.alternative_Rop(self.varphi, beta_params,
                                             w_beta_unflat_var, v)

        self.features_beta = U.function([self.ob, w_beta_var, v],
                                        features_beta)
Code Example #16
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=16,
                 reuse=False):
        nenv = nbatch // nsteps

        qmdp_param = {}
        qmdp_param['K'] = 3
        qmdp_param['obs_len'] = ob_space.shape[0] - ac_space.n
        qmdp_param['num_action'] = ac_space.n
        qmdp_param['num_state'] = 32
        qmdp_param['num_obs'] = 17

        input_len = ob_space.shape
        input_shape = (nbatch, ) + input_len  # [nbatch, input_length]
        num_action = qmdp_param["num_action"]
        obs_len = qmdp_param["obs_len"]
        num_state = qmdp_param['num_state']
        num_obs = qmdp_param['num_obs']

        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, input_shape)  #[nbatch,obs+prev action]
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(
            tf.float32,
            [nenv, num_state + 2 * nlstm])  # belief state (for each env)
        # S is belief state concatenated with initial hidden and cell states for vf lstm

        with tf.variable_scope("model", reuse=reuse):
            xs = batch_to_seq(X, nenv, nsteps)
            # xs is originally [nbatch, input_len];
            # batch_to_seq reshapes it to [nenv, nsteps, input_len] and splits along axis=1,
            # so xs becomes a list of nsteps tensors of shape [nenv, input_len];
            # below, each step is split into obs and prev_action
            obs = [x[:, 0:obs_len] for x in xs]
            acts = [x[:, obs_len:] for x in xs]
            ms = batch_to_seq(M, nenv, nsteps)
            #same as xs
            #ms has shape [nsteps,nenv]
            bi = S[:, 0:num_state]  # initial/previous belief
            hi = S[:, num_state:]  # initial/previous hidden unit

            # build variables
            self.planner_net = PlannerNet("planner", qmdp_param)
            self.filter_net = FilterNet("filter", qmdp_param)

            # calculate the action values q and the updated belief
            # s_hist is the belief-state history
            # snew is the newest belief
            s_hist, snew = self.filter_net.beliefupdate(obs, acts, ms, bi)
            # s_hist, snew, w_O, Z_o, b_prime_a, b_f = self.filter_net.beliefupdate(obs, acts, ms, S)
            #s_hist: [nstep,nenv,num_state]
            # snew: [nenv, num_state]
            Q, _, _ = self.planner_net.VI(nbatch)
            # Q: [nbatches, num_state, num_action]

            # h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            # h5 = seq_to_batch(h5)

            #calculate action and value
            s_hist = seq_to_batch(s_hist)  #[nbatch,num_state] (belief history)
            q = self.planner_net.policy(Q, s_hist)  # [num_batch, num_action]

            # separate value function for baseline
            # takes in sequence of observations and actions and returns values of the belief states
            # in the belief history
            vn_scope = "value_network"
            # hi is of dim 2*nlstm
            # xs is the obs and acts concatenated, with shape [nsteps, nenv, nobs+nacts]
            # chnew comes back with shape [nenv, 2*nlstm]
            h_hist, chnew = lstm(xs, ms, hi, vn_scope, nlstm)
            h_hist = tf.convert_to_tensor(h_hist, dtype=tf.float32)
            # h_hist.shape: (nstep, nenv, nlstm)
            # chnew.shape: (nenv, 2*nlstm)
            Snew = tf.concat(axis=1, values=[snew, chnew])
            # stack snew and chnew
            ############### baseline value function #####################################
            #############################################################################
            self.pd, self.pi = self.pdtype.pdfromlatent(q)
            # input dim of fc: shape(q)[1] = num_action, output dim of fc: 1
            #vf = fc(q, 'v', 1) #critic value function, output shape: [num_batch, 1]
            vf = fc(fc(fc(h_hist, 'v1', nlstm), 'v2', nlstm), 'v3', 1)
            #############################################################################

            #pi = fc(h5, 'pi', nact) #actor
            #vf = fc(h5, 'v', 1) #critic value function

        v0 = vf[:, 0]  # reduce dims from [num_batch, 1] to [num_batch, ]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        # self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.initial_state = np.ones(
            (nenv, num_state), dtype=np.float32) / num_state

        def step(ob, belief_state, mask):
            return sess.run([a0, v0, Snew, neglogp0], {
                X: ob,
                S: belief_state,
                M: mask
            })
            # a,b,c,d,q_val = sess.run([a0, v0, snew, neglogp0, q], {X:ob, S:state, M:mask})
            # print("q: ",q_val)
            # print("q shape: ",q_val.shape)
            # return a,b,c,d

        def value(ob, belief_state, mask):
            return sess.run(v0, {X: ob, S: belief_state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Code Example #17
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=50,
                 reuse=False):
        nenv = nbatch // nsteps
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess)
        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        ob_shape = (nbatch, ) + ob_space.shape
        X = tf.placeholder(tf.float32, ob_shape)
        nact = ac_space.shape[0] - 1
        # X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch],
                           name='mask')  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2],
                           name='state')  # states
        S_pred = tf.placeholder(tf.float32, [nenv, nlstm * 2],
                                name='predict_state')  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            # h = tf.nn.tanh(fc(X, 'fc1', 20))
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, nact],
                                     initializer=tf.zeros_initializer())
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pdtype = make_pdtype(
            spaces.Box(ac_space.low[0], ac_space.high[0], [
                nact,
            ]))
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = tf.clip_by_value(self.pd.sample(), -1, 1)
        neglogp0 = self.pd.neglogp(a0)

        with tf.variable_scope('predictor', reuse=reuse):
            h = nature_cnn(X)
            # h = tf.nn.relu(fc(X, 'fc1', 20))
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew_pred = lstm(xs, ms, S_pred, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            # h7 = (fc(a0, 'prediction_fc_action', 10))
            # h6 = tf.concat([h7, h5],axis=1)
            h7 = fc(h5, 'prediction_fc', 256)
            self.prediction = tf.nn.relu(fc(h7, 'prediction_out', 1))

        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, predict_state, mask):
            prediction_out, a0_out, v0_out, snew_out, snew_predict_out, neglogp0_out = sess.run(
                [self.prediction, a0, v0, snew, snew_pred, neglogp0], {
                    X: ob,
                    S: state,
                    S_pred: predict_state,
                    M: mask
                })

            return np.concatenate(
                [a0_out, prediction_out],
                axis=-1), v0_out, snew_out, snew_predict_out, neglogp0_out

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.S_pred = S_pred
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Code Example #18
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=32,
                 reuse=False):
        nenv = nbatch // nsteps

        ob_shape = (nbatch, ) + ob_space.shape
        # actdim = ac_space.shape[0]  # hv: changed to ac_space.n because ac_space.shape does not work for discrete action spaces
        actdim = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h1 = fc(X, 'fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            xs = batch_to_seq(h1, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h2, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h2 = seq_to_batch(h2)
            pi = fc(h2, 'pi', actdim, act=lambda x: x, init_scale=0.01)

            h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            vf = fc(h2, 'vf', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)

        if isinstance(ac_space, gym.spaces.Discrete):
            self.pd = self.pdtype.pdfromflat(pi)
        else:
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        def get_act(ob, state, mask):
            a = sess.run(a0, {X: ob, S: state, M: mask})
            return a

        def get_mean(ob, state, mask):
            a, state_new = sess.run([pi, snew], {X: ob, S: state, M: mask})
            return a, state_new

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.act = get_act
        self.mean = get_mean
Code Example #19
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 nlstm=256,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi = fc(h5, 'pi', nact, act=lambda x: x)
            pix = fc(h5, 'pix', FLAGS.screen_resolution, act=lambda x: x)
            piy = fc(h5, 'piy', FLAGS.screen_resolution, act=lambda x: x)
            vf = fc(h5, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        x0 = sample(pix)
        y0 = sample(piy)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            a, x, y, v, s = sess.run([a0, x0, y0, v0, snew], {
                X: ob,
                S: state,
                M: mask
            })
            return a, x, y, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.pix = pix
        self.piy = piy
        self.vf = vf
        self.step = step
        self.value = value
Code Example #20
File: policy2.py Project: Recharrs/NavRobot
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n

        X = tf.placeholder(tf.float32, ob_shape)  #obs
        I = tf.placeholder(tf.int32, [nbatch, 5])
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states

        # Model
        with tf.variable_scope("model", reuse=reuse):
            # Image Processing
            with tf.variable_scope("cnn"):
                x_image_rep = nature_cnn(X)

            # Instruction Processing
            with tf.variable_scope("GRU"):
                embedding = tf.get_variable(
                    'word_embedding',
                    shape=[12, 32],
                    initializer=tf.random_uniform_initializer(-1e-3, 1e-3))
                gru_cell = tf.contrib.rnn.GRUCell(
                    num_units=256,
                    kernel_initializer=tf.random_uniform_initializer(
                        -1e-3, 1e-3),
                    bias_initializer=tf.random_uniform_initializer(
                        -1e-3, 1e-3))

                encoder_hidden = gru_cell.zero_state(nbatch, dtype=tf.float32)
                for i in range(5):
                    word_embedding = tf.nn.embedding_lookup(embedding, I[:, i])
                    output, encoder_hidden = gru_cell.call(
                        word_embedding, encoder_hidden)
                x_insts_rep = encoder_hidden

            # Gated-Attention layers
            with tf.variable_scope("x-attn"):
                x_attention = tf.sigmoid(
                    fc(x_insts_rep, 'x-attn', 64, init_scale=1.0))
                x_attention = tf.expand_dims(x_attention, 1)
                x_attention = tf.expand_dims(x_attention, 2)

            with tf.variable_scope("Gated-Attention"):
                x = x_image_rep * x_attention
                x = conv_to_fc(x)
                x = tf.nn.relu(fc(x, 'x-Ga', 256, init_scale=1.0))

            xs = batch_to_seq(x, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h20, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm, init_scale=1.0)
            h20 = seq_to_batch(h20)

            with tf.variable_scope("pi"):
                pi = tf.layers.dense(
                    h20,
                    nact,
                    kernel_initializer=normalized_columns_initializer(0.01))
            with tf.variable_scope("vf"):
                vf = tf.layers.dense(
                    h20,
                    1,
                    kernel_initializer=normalized_columns_initializer(0.01))

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, insts, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                I: insts,
                S: state,
                M: mask
            })

        def value(ob, insts, state, mask):
            return sess.run(v0, {X: ob, I: insts, S: state, M: mask})

        self.X = X
        self.I = I  # instruction tokens
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value

        # start logging
        # =============
        if reuse:
            self.var_summary('./Asset/logdir', sess)
Code Example #21
File: base.py Project: wwxFromTju/rl-generalization
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 feature_mlp=True):

        nenv = nbatch // nsteps
        ob_shape = (nbatch, ) + ob_space.shape
        if len(ac_space.shape) == 0:
            # discrete set of actions
            nact = ac_space.n
            discrete = True
        else:  # continuous
            actdim = ac_space.shape[0]
            discrete = False
        X = tf.placeholder(tf.float32, ob_shape, name="Ob")
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            if feature_mlp:
                print("Using feature network in front of LSTM")
                h1 = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
                h2 = activ(fc(h1, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
                xs = batch_to_seq(h2, nenv, nsteps)
            else:
                print("No feature network in front of LSTM")
                xs = batch_to_seq(X, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, "vf", 1)
            if discrete:
                pi = fc(h5, "pi", nact, init_scale=0.01)
            else:
                pi = fc(h5, "pi", actdim, init_scale=0.01)
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, actdim],
                                         initializer=tf.zeros_initializer())

        self.pdtype = make_pdtype(ac_space)
        if discrete:
            self.pd = self.pdtype.pdfromflat(pi)
        else:
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Code Example #22
File: cnn_lstm.py Project: zjucsphd/openai_acer
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        super().__init__(sess,
                         ob_space,
                         ac_space,
                         nenv,
                         nsteps,
                         nstack,
                         reuse=reuse)
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        nlstm = self.lstm_units
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            X = tf.cast(X, tf.float32)
            h = conv(X,
                     'c1',
                     nf=16,
                     rf=3,
                     stride=1,
                     pad='SAME',
                     init_scale=np.sqrt(2))
            h = tf.nn.relu(h)
            h = conv(h,
                     'c2',
                     nf=32,
                     rf=3,
                     stride=1,
                     pad='SAME',
                     init_scale=np.sqrt(2))
            h = tf.nn.relu(h)
            h = conv_to_fc(h)
            h = fc(h, 'fc1', nh=self.dense_units, init_scale=np.sqrt(2))
            h = tf.nn.relu(h)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        self.a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
        self.snew = snew
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q
        self.sess = sess
Code Example #23
File: lstm2_policy.py Project: maxiaoba/QMDPNET
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=16,
                 reuse=False):
        nenv = nbatch // nsteps

        qmdp_param = {}
        qmdp_param['K'] = 3
        qmdp_param['obs_len'] = ob_space.shape[0] - ac_space.n
        qmdp_param['num_action'] = ac_space.n
        qmdp_param['num_state'] = 32
        qmdp_param['num_obs'] = 17

        input_len = ob_space.shape
        input_shape = (nbatch, ) + input_len
        num_action = qmdp_param["num_action"]
        obs_len = qmdp_param["obs_len"]
        num_state = qmdp_param['num_state']
        num_obs = qmdp_param['num_obs']

        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, input_shape)  #[nbatch,obs+prev action]
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #beliefs

        with tf.variable_scope("model", reuse=reuse):
            xs = batch_to_seq(X, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h = S[:, 0:nlstm]
            c = S[:, nlstm:]

            self.lstm = lstm('lstm', input_len[0], nlstm)
            h5, snew = self.lstm.update(xs, ms, h, c)

            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })
            # a,b,c,d,q_val = sess.run([a0, v0, snew, neglogp0, q], {X:ob, S:state, M:mask})
            # print("q: ",q_val)
            # print("q shape: ",q_val.shape)
            # return a,b,c,d

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Code Example #24
File: policies.py Project: ethanabrooks/baselines
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 size_mem=256,
                 reuse=False):  # pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        if ac_space.shape == ():
            actdim = 1
        else:
            actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs

        nenv = nbatch // nsteps
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, size_mem * 2])  # states

        with tf.variable_scope("model", reuse=reuse):
            # h1 = fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            # h2 = fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)

            h2 = tf.cast(X, tf.float32)
            xs = batch_to_seq(h2, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm', nh=size_mem)
            h5 = seq_to_batch(h5)

            pi = fc(h5, 'pi', actdim, act=lambda x: x, init_scale=0.01)
            h1 = fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            h2 = fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2), act=tf.tanh)
            vf = fc(h5, 'vf', 1, act=lambda x: x)[:, 0]
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        # v0 = vf[0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, size_mem * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, vf, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(vf, {X: ob, S: state, M: mask})

        # def step(ob, *_args, **_kwargs):
        #     a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        #     return a, v, self.initial_state, neglogp
        #
        # def value(ob, *_args, **_kwargs):
        #     return sess.run(vf, {X: ob})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
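
Despite their differences, the policies above expose essentially the same interface to the training loop: input placeholders (X, plus M and S for the recurrent state), a zero initial_state, and step/value callables. A tiny duck-typing sketch of that shared contract (nothing beyond the attribute names above is assumed):

def describe_recurrent_policy(policy):
    # Collect the pieces a runner needs from any of the policies above.
    required = ("X", "M", "S", "initial_state", "step")
    missing = [name for name in required if not hasattr(policy, name)]
    return {
        "missing": missing,
        "has_value_fn": hasattr(policy, "value"),  # e.g. examples #1 and #3 expose only step()
    }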