Example #1
    def network_fn(X, nenv=1):
        print("")
        print("IN HERE LSTM and this is X ",str(X))
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states
        #T = tf.get_variable(name='init', shape=[1, 2], initializer=tf.constant_initializer(1)) # task descriptor

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)

        ## TODO: need to change initialization of state!
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        #initial_state = utils.fc(T,'pi_init', [nenv,48], init_scale=0.01, init_bias=0.01)
        #initial_state = tf.get_variable(name='init_state', shape=initial_state.shape, initializer=tf.zeros_initializer(), trainable=True) # task descriptor

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
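
All of the examples in this collection lean on the same reshaping pair: batch_to_seq splits the flat (nenv * nsteps, ...) batch into a list of nsteps per-environment slices for the recurrent loop, and seq_to_batch flattens the result back. A minimal NumPy sketch of the layout (an illustration with _np-suffixed stand-ins, not the baselines implementation):

    import numpy as np

    def batch_to_seq_np(h, nenv, nsteps):
        # (nenv * nsteps, nfeat) -> list of nsteps arrays, each (nenv, nfeat)
        return [h.reshape(nenv, nsteps, -1)[:, t] for t in range(nsteps)]

    def seq_to_batch_np(hs):
        # list of nsteps arrays, each (nenv, nfeat) -> (nenv * nsteps, nfeat)
        return np.stack(hs, axis=1).reshape(len(hs) * hs[0].shape[0], -1)

    feat = np.arange(12, dtype=np.float32).reshape(6, 2)  # nbatch=6, nfeat=2
    xs = batch_to_seq_np(feat, nenv=2, nsteps=3)          # 3 slices of shape (2, 2)
    assert np.array_equal(seq_to_batch_np(xs), feat)      # the round trip is lossless
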
Example #2
    def network_fn(X, nenv=1):

        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        ob_g, ob_l = tf.split(X, 2, axis=1)
        ob_g = tf.squeeze(ob_g, axis=1) - 128.0
        ob_l = tf.squeeze(ob_l, axis=1) - 128.0

        # Conv layer
        net_g = vggm1234(ob_g)
        net_l = vggm1234(ob_l)
        feat = tf.concat([net_g, net_l], 1)

        # LSTM
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = batch_to_seq(feat, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return (feat, h), {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
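
In every variant here, M carries the done flags from the previous time step; inside baselines' lstm/lnlstm the carried state is multiplied by (1 - m) before each step, so the recurrence is cut exactly where an episode ended. A runnable NumPy schematic of that reset (masked_lstm_scan and demo_cell are illustrative stand-ins, not the library code):

    import numpy as np

    def masked_lstm_scan(xs, ms, c, h, cell_step):
        # xs, ms: lists of nsteps per-step slices, shapes (nenv, nh) and (nenv,)
        for x, m in zip(xs, ms):
            m = m.reshape(-1, 1)     # broadcast the mask over state features
            c = c * (1.0 - m)        # clear cell state where the episode ended
            h = h * (1.0 - m)        # clear hidden state where the episode ended
            h, c = cell_step(x, h, c)
        return h, c

    # Trivial stand-in cell so the sketch runs; a real cell applies the LSTM gates.
    demo_cell = lambda x, h, c: (np.tanh(x + h), c + x)

    nenv, nh = 2, 3
    xs = [np.ones((nenv, nh)) for _ in range(4)]
    ms = [np.array([0.0, 1.0]) if t == 2 else np.zeros(nenv) for t in range(4)]
    h, c = masked_lstm_scan(xs, ms, np.zeros((nenv, nh)), np.zeros((nenv, nh)), demo_cell)
    # env 1's state was reset at t == 2, so its h reflects only steps 2 and 3
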
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
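
At act time, the class above is typically driven by a loop like the sketch below, where model, env, and rollout_length are stand-ins (and, as in baselines' A2C/PPO, this copy of the policy is assumed to have been built with nsteps=1 for acting, with a separate nenv*nsteps copy for training). The recurrent state returned by step is fed back on the next call together with the fresh done flags:

    state = model.initial_state                # zeros of shape (nenv, nlstm*2)
    dones = np.zeros(nenv, dtype=np.float32)   # nothing has terminated yet
    obs = env.reset()                          # env: vectorized, nenv parallel copies
    for _ in range(rollout_length):
        actions, values, state, neglogps = model.step(obs, state, dones)
        obs, rewards, dones, infos = env.step(actions)
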
Example #4
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = nature_cnn(X, **conv_kwargs)

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #5
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope="lnlstm", nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope="lstm", nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            "S": S,
            "M": M,
            "state": snew,
            "initial_state": initial_state
        }
Example #6
    def network_fn(X, nenv=1):
        # TODO(akadian): modify the below code to adapt for depth
        nbatch = X[0].shape[0]
        nsteps = nbatch // nenv

        if X[0].shape[3] == 3:
            h = nature_cnn(X[0], **conv_kwargs)  # rgb
        elif X[0].shape[3] == 1:
            h = depth_cnn(X[0], **conv_kwargs)  # depth
        else:
            raise ValueError("expected 1 (depth) or 3 (rgb) channels, got {}".format(X[0].shape[3]))

        h = tf.concat([h, X[1]], 1)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state}
Example #7
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 scope_name="model"):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope(scope_name, reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x: x)
            vf = fc(h5, 'v', 1, act=lambda x: x)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #8
            def network_fn(X, nenv=1):
                nbatch = X.shape[0]
                nsteps = nbatch // nenv

                h = X
                with tf.variable_scope('mlp_in', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_in):
                        h = fc(h,
                               'mlp_in_fc{}'.format(i),
                               nh=num_hidden_in,
                               init_scale=np.sqrt(2))
                        if layer_norm_in:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                h = tf.layers.flatten(h)  # flatten the MLP output (flattening X here would discard mlp_in)

                M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
                S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

                xs = batch_to_seq(h, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)

                if layer_norm_lstm:
                    h5, snew = utils.lnlstm(xs,
                                            ms,
                                            S,
                                            scope='lnlstm',
                                            nh=nlstm)
                else:
                    h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

                h = seq_to_batch(h5)

                with tf.variable_scope('mlp_out', reuse=tf.AUTO_REUSE):
                    for i in range(num_layers_out):
                        h = fc(h,
                               'mlp_out_fc{}'.format(i),
                               nh=num_hidden_out,
                               init_scale=np.sqrt(2))
                        if layer_norm_out:
                            h = tf.contrib.layers.layer_norm(h,
                                                             center=True,
                                                             scale=True)
                        h = activation(h)

                initial_state = np.zeros(S.shape.as_list(), dtype=float)

                return h, {
                    'S': S,
                    'M': M,
                    'state': snew,
                    'initial_state': initial_state
                }
Example #9
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False,
                 param=None):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #10
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=64, reuse=False):
        nenv = nbatch // nsteps
        ob_shape = add_batch_dimension(ob_space.shape, nbatch)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name="X") #obs
        M = tf.placeholder(tf.float32, [nbatch], name="M") #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2], name="S") #states
        with tf.variable_scope("model", reuse=reuse):
            xs = batch_to_seq(X, nenv, nsteps) # Observation sequences
            ms = batch_to_seq(M, nenv, nsteps) # Done sequences
            h0, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h0 = seq_to_batch(h0)
            h0 = tf.concat([h0, X], 1)
            # Policy
            h1 = fc(h0, 'pi_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
            pi = fc(h1, 'pi', nact, act=tf.tanh, init_scale=0.01)
            # Value function
            h1 = fc(h0, 'vf_fc1', nh=128, init_scale=np.sqrt(2), act=tf.nn.relu)
            vf = fc(h1, 'vf', 1, act=lambda x:x)
            # Current policy variance
            logstd = tf.get_variable(name="logstd", shape=[1, nact], initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
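
The pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1) idiom broadcasts the state-independent logstd across the batch; for a continuous action space, baselines' diagonal-Gaussian distribution splits that flat vector back into mean and log-std halves. A sketch of what sampling and neglogp amount to under that convention (an illustration, not the library source):

    mean, logstd = tf.split(pdparam, 2, axis=1)
    std = tf.exp(logstd)
    sample = mean + std * tf.random_normal(tf.shape(mean))   # reparameterized draw
    neglogp = 0.5 * tf.reduce_sum(tf.square((sample - mean) / std), axis=1) \
        + 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(mean)[1], tf.float32) \
        + tf.reduce_sum(logstd, axis=1)
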
Example #11
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #12
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps) # Comments by Fei: xs is a list of nsteps tensors, each nenv x nh
            ms = batch_to_seq(M, nenv, nsteps) # Comments by Fei: ms is a list of nsteps tensors, each an nenv-vector
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm) # Comments by Fei: h5 has the same shape as xs, with values transformed by the LSTM; snew is the new S
            h5 = seq_to_batch(h5) # Comments by Fei: h5 is nbatch x nh again, just like h4
            pi = fc(h5, 'pi', nact, act=lambda x:x) # Comments by Fei: pi is nbatch x nact
            vf = fc(h5, 'v', 1, act=lambda x:x) # Comments by Fei: vf is nbatch x 1

        v0 = vf[:, 0] # Comments by Fei: v0 is an nbatch-vector of per-state value estimates
        a0 = sample(pi) # Comments by Fei: a0 is an nbatch-vector of actions sampled from the policy at each state
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #13
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #14
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        h = tf.layers.flatten(X)
        for i in range(len(hiddens) - 1):
            h = utils.fc(h,
                         'mlp_fc{}'.format(i),
                         nh=hiddens[i],
                         init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)

        nlstm = hiddens[-1]

        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  #states

        xs = utils.batch_to_seq(h, nenv, nsteps)
        ms = utils.batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = utils.seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #15
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet,
                 reuse=False,
                 flownet=None, train_from_scratch=False,
                 recurrent=None,
                 large_cnn=False, nlstm=64, add_predicted_flow_to_vec=False, diff_frames=False):
        ob_shape_vec = (nbatch,) + ob_space["vector"].shape
        nh, nw, nc = ob_space["image"].shape
        ob_shape_im = (nbatch, nh, nw, nc)

        actdim = ac_space.shape[0]
        X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # obs
        X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')

        if add_flownet:
            # adding previous image placeholder:
            X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p')  # obs t-1
        else:
            X_p = None

        if recurrent:
            nenv = nbatch // nsteps
            M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
            S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h_im = mujoco_cnn(
                X_im, 'pi', nbatch, add_flownet and not add_predicted_flow_to_vec,
                X_p, flownet,
                train_from_scratch,
                large_cnn, diff_frames)

            if add_predicted_flow_to_vec:
                flow_vec = get_flow_vec(
                    X_im, 'pi', nbatch, add_flownet,
                    X_p, flownet,
                    train_from_scratch,
                    large_cnn, diff_frames)
                h_vec = tf.concat([X_vec, flow_vec], axis=-1)
                h_vec = activ(fc(h_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            else:
                h_vec = activ(fc(X_vec, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h1 = tf.concat([h_im, h_vec], 1)

            if recurrent:
                xs = batch_to_seq(h1, nenv, nsteps)
                ms = batch_to_seq(M, nenv, nsteps)
                if recurrent == 'lstm':
                    h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
                else:
                    assert recurrent == 'lnlstm'
                    h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
                h2 = seq_to_batch(h5)
            else:
                h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)

            vf = fc(h2, 'vf', 1)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        a0_r = self.pd.mode()
        neglogp0 = self.pd.neglogp(a0)
        if not recurrent:
            self.initial_state = None
        else:
            self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        self.placeholder_dict = {
            "image": X_im,
            "vector": X_vec
        }
        if add_flownet:
            self.placeholder_dict["last_image"] = X_p

        if not recurrent:
            def step(ob, *_args, remove_noise=False, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                if not remove_noise:
                    a, v, neglogp = sess.run([a0, v0, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, neglogp = sess.run([a0_r, v0, neglogp0], feed_dict=feed_dict)
                return a, v, self.initial_state, neglogp

            def value(ob, *_args, **_kwargs):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                return sess.run(v0, feed_dict=feed_dict)
        else:
            def step(ob, state, mask, remove_noise=False):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                if not remove_noise:
                    a, v, s, neglogp = sess.run([a0, v0, snew, neglogp0], feed_dict=feed_dict)
                else:
                    a, v, s, neglogp = sess.run([a0_r, v0, snew, neglogp0], feed_dict=feed_dict)
                return a, v, s, neglogp

            def value(ob, state, mask):
                feed_dict = {}
                for key, value in self.placeholder_dict.items():
                    feed_dict[value] = ob[key]
                feed_dict[S] = state
                feed_dict[M] = mask
                return sess.run(v0, feed_dict=feed_dict)

        self.X_im = X_im
        self.X_vec = X_vec
        self.X_p = X_p
        self.pi = pi
        if not recurrent:
            self.vf = v0
        else:
            self.vf = vf
            self.M = M
            self.S = S
        self.step = step
        self.value = value
Example #16
    def memory_fn(xs, ms, S, nh):
        return lnlstm(xs, ms, S, 'lnlstm1', nh=nh)
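
This wrapper just pins the layer-normalized variant behind the generic (xs, ms, S, nh) signature used throughout the examples above, so a caller can swap memory modules without touching the surrounding plumbing. A hypothetical call site:

    h5, snew = memory_fn(xs, ms, S, nh=nlstm)
    h = seq_to_batch(h5)
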
Example #17
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 nlstm=256,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x: x)
            pix = fc(h5, 'pix', FLAGS.screen_resolution, act=lambda x: x)
            piy = fc(h5, 'piy', FLAGS.screen_resolution, act=lambda x: x)

            vf = fc(h5, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        x0 = sample(pix)
        y0 = sample(piy)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            a, x, y, v, s = sess.run([a0, x0, y0, v0, snew], {
                X: ob,
                S: state,
                M: mask
            })
            return a, x, y, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.pix = pix
        self.piy = piy
        self.vf = vf
        self.step = step
        self.value = value
Example #18
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        # nh, nw, nc = ob_space.shape  # (nh, nw, nc) = (height, width, channels)
        ob_shape = (nbatch, ob_space.shape[0])
        # nact = ac_space.n
        # X = tf.placeholder(tf.uint8, ob_shape)  # obs
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='phOb')
        M = tf.placeholder(tf.float32, [nbatch],
                           name='phMaskDone')  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2],
                           name='phCellState')  # states and output: (c, h)
        with tf.variable_scope("model", reuse=reuse):
            # h = nature_cnn(X)
            # h = tf.add(X, 0, name='h')  # identity alone is not expressive enough
            h = mlp(X)
            xs = batch_to_seq(
                h, nenv,
                nsteps)  # A list of tensors, each with shape [nenv, -1]
            ms = batch_to_seq(
                M, nenv,
                nsteps)  # A list of tensors, each with shape [nenv, 1]
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'fc_pi', actdim)
            # acs = fc(h5, 'actions', actdim, init_scale=0.01)
            # move = tf.multiply(tf.nn.sigmoid(acs[:, 1:2]), 20, name='movement')
            # pi = tf.concat([acs[:, 0:1], move], axis=1, name='pi')
            vf = fc(h5, 'v', 1)
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        action = tf.add(
            a0, 0, name='action')  # use this tensor as the action at inference time
        newState = tf.add(snew, 0, name='newCellState')
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value