Example #1
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
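
All of these snippets lean on the small layer helpers from baselines.a2c.utils. For orientation, here is a minimal sketch of the fc layer with orthogonal initialization, assuming TF1-style variable scopes (the actual baselines implementation may differ in small details):

import numpy as np
import tensorflow as tf

def ortho_init(scale=1.0):
    # (scaled) orthogonal initializer for 2-D weight matrices
    def _init(shape, dtype=None, partition_info=None):
        a = np.random.normal(0.0, 1.0, shape)
        u, _, v = np.linalg.svd(a, full_matrices=False)
        q = u if u.shape == tuple(shape) else v
        return (scale * q.reshape(shape)).astype(np.float32)
    return _init

def fc(x, scope, nh, init_scale=1.0, init_bias=0.0):
    # fully connected layer y = x @ w + b; variables are namespaced under `scope`
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w) + b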
Example #2
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act
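
The sample helper used above draws a discrete action from the policy logits. A common implementation, sketched here under the assumption that baselines' Gumbel-max version is meant:

def sample(logits):
    # Gumbel-max trick: argmax of logits plus Gumbel noise is an exact
    # sample from Categorical(softmax(logits))
    noise = tf.random_uniform(tf.shape(logits))
    return tf.argmax(logits - tf.log(-tf.log(noise)), axis=1)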
Example #3
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi = fc(h, 'pi', nact, init_scale=0.01)
            vf = fc(h, 'v', 1)[:,0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
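
pdfromflat wraps the raw pi logits in a distribution object; for Discrete action spaces this is a categorical distribution. A minimal sketch of what sample() and neglogp() compute (hedged; baselines' CategoricalPd has more methods):

class CategoricalPd:
    def __init__(self, logits):
        self.logits = logits

    def sample(self):
        u = tf.random_uniform(tf.shape(self.logits))
        return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)

    def neglogp(self, x):
        # cross-entropy against the one-hot action == -log pi(a|s)
        nact = self.logits.get_shape().as_list()[-1]
        return tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.one_hot(x, nact), logits=self.logits)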
Example #4
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # lstm
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
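
batch_to_seq and seq_to_batch convert between the flat (nenv*nsteps, ...) batch layout and the per-timestep list the lstm helper consumes. A minimal sketch of the reshaping, assuming baselines' conventions:

def batch_to_seq(h, nbatch, nsteps, flat=False):
    # (nbatch*nsteps, d) -> list of nsteps tensors, each (nbatch, d)
    if flat:
        h = tf.reshape(h, [nbatch, nsteps])
    else:
        h = tf.reshape(h, [nbatch, nsteps, -1])
    return [tf.squeeze(v, [1]) for v in
            tf.split(axis=1, num_or_size_splits=nsteps, value=h)]

def seq_to_batch(h, flat=False):
    # inverse: list of nsteps tensors of shape (nbatch, d) -> (nbatch*nsteps, d)
    if not flat:
        nh = h[0].get_shape()[-1].value
        return tf.reshape(tf.concat(axis=1, values=h), [-1, nh])
    return tf.reshape(tf.stack(values=h, axis=1), [-1])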
Example #5
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x:x)
            vf = fc(h4, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = [] #not stateful

        def step(ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #6
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        ob_shape = (nbatch,) + ob_space.shape
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            flatten = tf.layers.flatten
            pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:,0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)


        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #7
def CNN7(unscaled_images, index, filmObj):
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.contrib.layers.
                        variance_scaling_initializer()):
        scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
        activ = tf.nn.relu

        # w_1 = tf.slice(filmObj.film_w_1,index*32,[32])
        # b_1 = tf.slice(filmObj.film_b_1,index*32,[32])
        w_2 = tf.slice(filmObj.film_w_2, index * 64, [64])
        b_2 = tf.slice(filmObj.film_b_2, index * 64, [64])
        # w_3 = tf.slice(filmObj.film_w_3,index*48,[48])
        # b_3 = tf.slice(filmObj.film_b_3,index*48,[48])

        h = slim.separable_conv2d(scaled_images, 32, 8, 1, 4)
        # h = tf.math.add(tf.multiply(h, temp['weights_1']), temp['bias_1'])
        # h = tf.math.add(tf.multiply(h, w_1), b_1)

        h2 = slim.separable_conv2d(h, 64, 4, 1, 2)
        # h2 = tf.math.add(tf.multiply(h2, temp['weights_2']), temp['bias_2'])
        h2 = tf.math.add(tf.multiply(h2, w_2), b_2)

        h3 = slim.separable_conv2d(h2, 48, 3, 1, 1)
        # h3 = tf.math.add(tf.multiply(h3, temp['weights_3']), temp['bias_3'])
        # h3 = tf.math.add(tf.multiply(h3, w_3), b_3)

        h3 = conv_to_fc(h3)
        return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
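
The w_2/b_2 slices above implement FiLM-style conditioning (feature-wise linear modulation, Perez et al. 2018): the task index selects a per-channel scale and shift, and the second conv block is modulated as h2 * w_2 + b_2. A hypothetical container matching that slicing, for illustration only:

class FiLMParams:
    # hypothetical holder of flat per-task FiLM vectors; one 64-channel
    # scale/shift block per task, indexed by `index * 64` in CNN7 above
    def __init__(self, ntasks, nchan=64):
        self.film_w_2 = tf.get_variable(
            "film_w_2", [ntasks * nchan], initializer=tf.ones_initializer())
        self.film_b_2 = tf.get_variable(
            "film_b_2", [ntasks * nchan], initializer=tf.zeros_initializer())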
Example #8
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #9
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     pdparam = fc(latent_vector,
                  'pi',
                  self.ncat,
                  init_scale=init_scale,
                  init_bias=init_bias)
     return self.pdfromflat(pdparam), pdparam
Example #10
def cnn7(unscaled_images, **conv_kwargs):
    """
    Network 96x96:
    model/SeparableConv2d/depthwise_weights:0 (8, 8, 4, 1)
    model/SeparableConv2d/pointwise_weights:0 (1, 1, 4, 32)
    model/SeparableConv2d/biases:0 (32,)
    model/SeparableConv2d_1/depthwise_weights:0 (4, 4, 32, 1)
    model/SeparableConv2d_1/pointwise_weights:0 (1, 1, 32, 64)
    model/SeparableConv2d_1/biases:0 (64,)
    model/SeparableConv2d_2/depthwise_weights:0 (3, 3, 64, 1)
    model/SeparableConv2d_2/pointwise_weights:0 (1, 1, 64, 48)
    model/SeparableConv2d_2/biases:0 (48,)
    model/fc1/w:0 (6912, 512)
    model/fc1/b:0 (512,)
    model/v/w:0 (512, 1)
    model/v/b:0 (1,)
    model/pi/w:0 (512, 7)
    model/pi/b:0 (7,)
    Trainable variables:
    3550296
    """
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=tf.contrib.layers.
                        variance_scaling_initializer()):
        scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
        activ = tf.nn.relu
        h = slim.separable_conv2d(scaled_images, 32, 8, 1, 4)
        h2 = slim.separable_conv2d(h, 64, 4, 1, 2)
        h3 = slim.separable_conv2d(h2, 48, 3, 1, 1)
        h3 = conv_to_fc(h3)
        return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
Example #11
    def network_fn(X, nenv=1):
        nbatch = X.shape[0]
        nsteps = nbatch // nenv

        fm = nature_cnn(X, **conv_kwargs)
        fm_flat = conv_to_fc(fm)
        h = tf.nn.relu(fc(fm_flat, 'fc1', nh=nh, init_scale=np.sqrt(2)))

        M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)

        if layer_norm:
            h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm)
        else:
            h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm)

        h = seq_to_batch(h5)
        initial_state = np.zeros(S.shape.as_list(), dtype=float)

        return fm, h, {
            'S': S,
            'M': M,
            'state': snew,
            'initial_state': initial_state
        }
Example #12
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(
        conv(scaled_images,
             'c1',
             nf=32,
             rf=8,
             stride=4,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h2 = activ(
        conv(h,
             'c2',
             nf=64,
             rf=4,
             stride=2,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = activ(
        conv(h2,
             'c3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
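
A brief usage sketch (illustrative shapes; the uint8-to-float scaling happens inside nature_cnn):

X = tf.placeholder(tf.uint8, (32, 84, 84, 4))   # Atari-style stacked frames
latent = nature_cnn(X)                          # (32, 512) features
vf = fc(latent, 'v', 1)[:, 0]                   # scalar value head, as in the policies above

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    obs = np.zeros((32, 84, 84, 4), dtype=np.uint8)
    print(sess.run(vf, {X: obs}).shape)         # (32,)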
Example #13
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, filmObj, reuse=False, st="act", **conv_kwargs): #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (int(nbatch / nenvs), nh, nw, nc)  # note: nenvs is not defined in this scope and must come from the enclosing module
        self.pdtype = make_pdtype(ac_space)
        index = tf.placeholder(tf.int32,[1])
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        with tf.variable_scope("model", reuse=reuse):
            h = CNN7(X,index,filmObj) #**conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
            print("Network:")
            [print(v.name, v.shape) for v in tf.trainable_variables()]
            print("Trainable variables:")
            print(np.sum([np.prod(v.get_shape()) for v in tf.trainable_variables()]))

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, idx, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob, index: [idx]})
            return a, v, self.initial_state, neglogp

        def value(ob, idx, *_args, **_kwargs):
            return sess.run(vf, {X: ob, index: [idx]})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.index = index
Example #14
def cnn(unscaled_images, scope, activ=None, nfeat=None, reuse=False):
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = activ or tf.nn.leaky_relu
    nfeat = nfeat or 512
    h = activ(
        conv(scaled_images,
             scope + '_conv1',
             nf=32,
             rf=8,
             stride=4,
             init_scale=np.sqrt(2),
             reuse=reuse))
    h2 = activ(
        conv(h,
             scope + '_conv2',
             nf=64,
             rf=4,
             stride=2,
             init_scale=np.sqrt(2),
             reuse=reuse))
    h3 = activ(
        conv(h2,
             scope + '_conv3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             reuse=reuse))
    h3 = conv_to_fc(h3)
    return fc(h3,
              scope + '_conv_to_fc',
              nh=nfeat,
              init_scale=np.sqrt(2),
              reuse=reuse)
Example #15
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps

        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
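
The lstm helper consumes the per-timestep lists together with the done mask M, zeroing the recurrent state wherever an episode ended so state never leaks across episode boundaries. A condensed sketch of the recurrence (baselines-style, reusing the ortho_init sketched earlier; details such as initialization may differ):

def lstm(xs, ms, s, scope, nh, init_scale=1.0):
    nin = xs[0].get_shape()[1].value
    with tf.variable_scope(scope):
        wx = tf.get_variable("wx", [nin, nh * 4], initializer=ortho_init(init_scale))
        wh = tf.get_variable("wh", [nh, nh * 4], initializer=ortho_init(init_scale))
        b = tf.get_variable("b", [nh * 4], initializer=tf.constant_initializer(0.0))
    c, h = tf.split(axis=1, num_or_size_splits=2, value=s)
    for idx, (x, m) in enumerate(zip(xs, ms)):
        c = c * (1 - m)   # reset cell state after a done
        h = h * (1 - m)   # reset hidden state after a done
        z = tf.matmul(x, wx) + tf.matmul(h, wh) + b
        i, f, o, u = tf.split(axis=1, num_or_size_splits=4, value=z)
        c = tf.nn.sigmoid(f) * c + tf.nn.sigmoid(i) * tf.tanh(u)
        h = tf.nn.sigmoid(o) * tf.tanh(c)
        xs[idx] = h
    return xs, tf.concat(axis=1, values=[c, h])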
Example #16
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #17
def nature_cnn(unscaled_images, scope, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    #unscaled_images = tf.placeholder(tf.float32, shape=[None, 84, 84, 1], name='unscaled_images')
    with tf.variable_scope(scope):
        scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
        activ = tf.nn.relu
        # 8x8 filter size is common on the very 1st conv layer, looking at the input image
        h = activ(
            conv(scaled_images,
                 'c1',
                 nf=32,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h2 = activ(
            conv(h,
                 'c2',
                 nf=64,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h3 = activ(
            conv(h2,
                 'c3',
                 nf=64,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2),
                 **conv_kwargs))
        h3 = conv_to_fc(h3)
        return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
Example #18
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            h = cnn7(X, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            # The a value returned here defines the action that Sonic takes. It is a vector of one element (action index)
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #19
        def final_nn(input_final_nn, fnn_args, no_actions, initializer):
            # input: r, c, w (and w_ex) concatenated
            # output: no_actions-dimensional vector

            activ = tf.nn.relu
            #softmax = tf.nn.softmax

            # fc layer(s) specified by lw_args
            h = input_final_nn
            for i, nneurons in enumerate(fnn_args):
                h = activ(
                    fc(h,
                       'fnn_fc{}'.format(i),
                       nh=nneurons,
                       initializer=initializer,
                       init_scale=np.sqrt(2)), 'fnn_fc_relu{}'.format(i))

            # last fc layer that predicts action
            #if not fnn_args:
            #    last_fcl_name = 'fnn_fc0'
            #else:
            #    last_fcl_name = 'fnn_fc{}'.format(len(fnn_args))
            #h = activ(fc(h, last_fcl_name, nh=no_actions, initializer=initializer, init_scale=np.sqrt(2)))
            #output = softmax(h)

            #return output
            return h
Example #20
def nature_cnn(unscaled_images, keep_probs, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    # scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(
        conv(unscaled_images,
             'c1',
             nf=32,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h2 = activ(
        conv(h,
             'c2',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = activ(
        conv(h2,
             'c3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = conv_to_fc(h3)
    h4 = activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))

    return tf.nn.dropout(h4, keep_prob=keep_probs)
Example #21
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)

        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
            h, self.dropout_assign_ops = choose_cnn(processed_x)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, vf, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(vf, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #22
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            #h = custom_cnn(X, **conv_kwargs)
            #print(conv_kwargs)
            h = policies.nature_cnn(X, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #23
def nature_cnn(scaled_images, **conv_kwargs):
    """
    Model used in the paper "Human-level control through deep reinforcement learning" 
    https://www.nature.com/articles/nature14236
    """
    def activ(curr):
        return tf.nn.relu(curr)

    h = activ(
        conv(scaled_images,
             'c1',
             nf=32,
             rf=8,
             stride=4,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h2 = activ(
        conv(h,
             'c2',
             nf=64,
             rf=4,
             stride=2,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = activ(
        conv(h2,
             'c3',
             nf=64,
             rf=3,
             stride=1,
             init_scale=np.sqrt(2),
             **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
Example #24
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 reuse=False,
                 **kwargs):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(
                fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(
                fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def step_policyflat(ob, *_args, **_kwargs):
            a, v, neglogp, policyflat = sess.run(
                [a0, vf, neglogp0, self.pd.flat], {X: ob})
            return a, v, self.initial_state, neglogp, policyflat

        def step_test(ob, *_args, **_kwargs):
            a = sess.run([self.pd.mean], {X: ob})
            return a

        self.X = X
        self.vf = vf
        self.step = step
        self.step_policyflat = step_policyflat
        self.value = value
        self.step_test = step_test
Example #25
    def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
        """
        Parameters:
        ----------
        env             RL environment

        observations    tensorflow placeholder in which the observations will be fed

        latent          latent state from which policy distribution parameters should be inferred

        vf_latent       latent state from which value function should be inferred (if None, then latent is used)

        sess            tensorflow session to run calculations in (if None, default session is used)

        **tensors       tensorflow tensors for additional attributes such as state or mask

        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        vf_latent = vf_latent if vf_latent is not None else latent

        vf_latent = tf.layers.flatten(vf_latent)
        latent = tf.layers.flatten(latent)

        # Based on the action space, will select what probability distribution type
        self.pdtype = make_pdtype(env.action_space)

        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

        # Take an action
        self.action = self.pd.sample()

        # Calculate the neg log of our probability
        self.neglogp = self.pd.neglogp(self.action)
        self.sess = sess or tf.get_default_session()

        if estimate_q:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(vf_latent, 'q', env.action_space.n)
            self.vf = self.q
        else:
            self.vf = fc(vf_latent, 'vf', 1)
            self.vf = self.vf[:,0]
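
A hedged usage sketch, assuming the class above is baselines-style PolicyWithValue and reusing the fc helper sketched earlier (the env choice is illustrative):

import gym

env = gym.make('CartPole-v1')
X = tf.placeholder(tf.float32, (None,) + env.observation_space.shape)
latent = fc(tf.layers.flatten(X), 'latent', nh=64)  # any latent builder works
with tf.Session() as sess:
    policy = PolicyWithValue(env, observations=X, latent=latent, sess=sess)
    sess.run(tf.global_variables_initializer())
    # policy.action samples an action; policy.vf is the scalar value estimate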
Example #27
    def __call__(self, obs, reuse=True):
        with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
            policy_latent = self.network_builder(obs)
            action = tf.nn.tanh(
                fc(policy_latent, "pi/mean", self.ac_space.shape[0]))
            # action = fc(policy_latent, "pi/mean", self.ac_space.shape[0])

        return action
Example #28
 def _create_qf(self, policy_latent, vf_latent):
     with tf.variable_scope('qf', reuse=tf.AUTO_REUSE):
         qf_input = tf.concat([policy_latent, vf_latent], axis=1)
         # qf_input = self.policy_latent * vf_latent
         qf_latent = self.q_network(qf_input)
         qf = fc(qf_latent, 'qf', 1)
         qf = qf[:, 0]
         return qf
Example #29
 def network_fn(X, action):
     buffer_size = X.shape[1]
     net = X
     net = layers.conv1d(net, 20, 5, scope='cnn1d_c1')
     net = layers.conv1d(net, 15, 3, scope='cnn1d_c2')
     net = layers.conv1d(net, 10, 3, scope='cnn1d_c3')
     net = layers.conv1d(net, 5, 3, scope='cnn1d_c4')
     net = layers.conv1d(net, 1, 3, scope='cnn1d_c5')
     net = tf.reshape(net, [-1, buffer_size])
     net = tf.concat([net, action], 1)
     net = fc(net, 'cnn1d_fc1', nh=32, init_scale=np.sqrt(2))
     net = fc(net, 'cnn1d_fc2', nh=24, init_scale=np.sqrt(2))
     net = fc(net, 'cnn1d_fc3', nh=16, init_scale=np.sqrt(2))
     net = tf.tanh(net)
     #        tf.nn.conv1d(X, w, stride, 'SAME')
     #        print(X)
     return net
Example #30
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 taskScope="Task0"):  #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope(taskScope + '/model', reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(
                fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(
                fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        with tf.variable_scope(taskScope + '/modelVars', reuse=reuse):
            _mean = self.pi
            a0 = self.pd.sample()
            neglogp0 = self.pd.neglogp(a0)
            self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def detStep(ob, *_args, **_kwargs):
            a = sess.run([_mean], {X: ob})
            return a

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.detStep = detStep
        self.value = value
Example #31
 def network_fn(X):
     h = tf.layers.flatten(X)
     for i in range(num_layers):
         h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
         if layer_norm:
             h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
         h = activation(h)
     return h
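
This network_fn closes over num_layers, num_hidden, activation and layer_norm; the enclosing factory presumably looks like baselines' mlp builder, sketched here:

def mlp(num_layers=2, num_hidden=64, activation=tf.tanh, layer_norm=False):
    def network_fn(X):
        h = tf.layers.flatten(X)
        for i in range(num_layers):
            h = fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))
            if layer_norm:
                h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
            h = activation(h)
        return h
    return network_fn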
Example #32
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 nlstm=256,
                 reuse=False):
        nenv = nbatch // nsteps
        X, processed_x = observation_input(ob_space, nbatch)
        M = tf.placeholder(tf.float32, [nbatch])  #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  #states
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            # h = nature_cnn(X)
            activ = tf.tanh
            h = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))

            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            vf = fc(h5, 'v', 1)
            self.pd, self.pi = self.pdtype.pdfromlatent(h5)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {
                X: ob,
                S: state,
                M: mask
            })

        def value(ob, state, mask):
            return sess.run(v0, {X: ob, S: state, M: mask})

        self.X = X
        self.M = M
        self.S = S
        self.vf = vf
        self.step = step
        self.value = value
Example #33
 def __call__(self, obs, action, reuse=True):
     with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
         qf_input = tf.concat(
             [obs, action], axis=-1
         )  # this assumes observation and action can be concatenated
         qf_latent = self.network_builder(qf_input)
         qf = fc(qf_latent, "last_fc", 1)
     return qf
Example #34
def last_linear_hidden_layer(x, actions=None, d=512, **conv_kwargs):
    h = dcgan_cnn(x, **conv_kwargs)
    activ = leaky_relu

    if actions is not None:
        h = tf.concat([actions, h], axis=1)

    return activ('h_final', fc(h, 'fc1', nh=d, init_scale=np.sqrt(2)))
Example #35
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     temp_pdparam = fc(latent_vector,
                       'pi',
                       self.nac * self.npol,
                       init_scale=init_scale,
                       init_bias=init_bias)
     pdparam = tf.reshape(temp_pdparam, [-1, self.npol, self.nac])
     return self.pdfromflat(pdparam), pdparam
Example #36
    def network_fn(X, mode="pi"):
        filtered_conv_kwargs = {}

        def filter_kwargs(k):
            if k in conv_kwargs.keys():
                filtered_conv_kwargs[k] = conv_kwargs[k]

        filter_kwargs("pad")
        filter_kwargs("data_format")
        filter_kwargs("one_dim_bias")

        scaled_images = tf.cast(X, tf.float32) / 255.
        activ = tf.nn.relu
        bn = tf.contrib.layers.batch_norm
        drp = tf.nn.dropout

        def addbndrp(h):
            if (batchnormpi and mode == "pi") or (batchnormvf
                                                  and mode == "vf"):
                h = bn(h,
                       center=True,
                       scale=True,
                       is_training=isbnpitrainmode
                       if mode == "pi" else isbnvftrainmode,
                       updates_collections=None)
            h = activ(h)
            if (dropoutpi < 1.0 and mode == "pi"):
                h = drp(h, keep_prob=dropoutpi_keep_prob)
            if (dropoutvf < 1.0 and mode == "vf"):
                h = drp(h, keep_prob=dropoutvf_keep_prob)
            return h

        h = addbndrp(
            conv(scaled_images,
                 'c1',
                 nf=32,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2),
                 **filtered_conv_kwargs))
        h2 = addbndrp(
            conv(h,
                 'c2',
                 nf=64,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2),
                 **filtered_conv_kwargs))
        h3 = addbndrp(
            conv(h2,
                 'c3',
                 nf=64,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2),
                 **filtered_conv_kwargs))
        h3 = conv_to_fc(h3)
        return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
Example #37
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  #pylint: disable=W0613
        nenv = nbatch // nsteps
        ob_shape = (nbatch, ) + ob_space.shape
        nact = ac_space.n
        X = tf.compat.v1.placeholder(tf.float32, ob_shape, name='Ob')  #obs
        with tf.compat.v1.variable_scope('intrinsic', reuse=reuse):
            h3 = nature_cnn(X)
            r_in0 = tf.tanh(fc(h3, 'r_in', nact))
            v_ex0 = fc(h3, 'v_ex', 1)[:, 0]
        with tf.compat.v1.variable_scope('policy', reuse=reuse):
            h3 = nature_cnn(X)
            pi = fc(h3, 'pi', nact, init_scale=0.01)
            v_mix0 = fc(h3, 'v_mix', 1)[:, 0]

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.init_policy_state = None

        def step(ob, *_args, **_kwargs):
            a, v_ex, v_mix, neglogp = sess.run([a0, v_ex0, v_mix0, neglogp0],
                                               {X: ob})
            return a, v_ex, v_mix, self.init_policy_state, neglogp

        def value(ob, *_args, **_kwargs):
            v_ex, v_mix = sess.run([v_ex0, v_mix0], {X: ob})
            return v_ex, v_mix

        def intrinsic_reward(ob, ac, *_args, **_kwargs):
            r_in = sess.run(r_in0, {X: ob})
            return r_in[np.arange(nbatch), ac]

        self.X = X
        self.r_in = r_in0
        self.v_ex = v_ex0
        self.pi = pi
        self.v_mix = v_mix0
        self.step = step
        self.value = value
        self.intrinsic_reward = intrinsic_reward
        self.policy_params = tf.compat.v1.trainable_variables("policy")
        self.intrinsic_params = tf.compat.v1.trainable_variables("intrinsic")
        self.policy_new_fn = CnnPolicyNew
Example #38
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32) / 255.,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            pi = fc(h4, 'pi', nact, act=lambda x: x)
            vf = fc(h4, 'v', 1, act=lambda x: x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        aprobs0 = tf.nn.softmax(pi)  # action probs
        self.initial_state = []  #not stateful

        def step(ob, *_args, **_kwargs):
            a, v, aprobs = sess.run([a0, v0, aprobs0], {X: ob})
            return a, v, aprobs, []  #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X: ob})

        self.X = X
        self.pi = pi  # policy
        self.aprobs0 = aprobs0
        self.vf = vf
        self.step = step
        self.value = value
Example #39
def nature_cnn(unscaled_images):
    """
    CNN from Nature paper.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
Example #40
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        nbatch = nenv*nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc*nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = conv(tf.cast(X, tf.float32)/255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
            h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
            h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
            h3 = conv_to_fc(h3)
            h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
            xs = batch_to_seq(h4, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact, act=lambda x:x)
            vf = fc(h5, 'v', 1, act=lambda x:x)

        v0 = vf[:, 0]
        a0 = sample(pi)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            a, v, s = sess.run([a0, v0, snew], {X:ob, S:state, M:mask})
            return a, v, s

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #41
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            pi = fc(h5, 'pi', nact)
            vf = fc(h5, 'v', 1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)

        v0 = vf[:, 0]
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

        def step(ob, state, mask):
            return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})

        def value(ob, state, mask):
            return sess.run(v0, {X:ob, S:state, M:mask})

        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #42
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, nbatch)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
Example #43
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
        ob_shape = (nbatch,) + ob_space.shape
        actdim = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            pi = fc(h2, 'pi', actdim, init_scale=0.01)
            h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(h2, 'vf', 1)[:,0]
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                initializer=tf.zeros_initializer())

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X:ob})

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
Example #44
def _matching_fc(tensor, name, size, init_scale, init_bias):
    if tensor.shape[-1] == size:
        return tensor
    else:
        return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
Example #45
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
     return self.pdfromflat(pdparam), pdparam
Example #46
 def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
     mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
     logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
     pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
     return self.pdfromflat(pdparam), mean
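
The concatenated [mean, logstd] vector parameterizes a diagonal Gaussian. For reference, the negative log-likelihood that pd.neglogp computes is, up to framework details:

def diag_gaussian_neglogp(x, mean, logstd):
    # -log N(x; mean, diag(exp(logstd))^2), summed over action dimensions
    std = tf.exp(logstd)
    return (0.5 * tf.reduce_sum(tf.square((x - mean) / std), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[-1], tf.float32)
            + tf.reduce_sum(logstd, axis=-1))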
Example #47
  def __init__(self,
               sess,
               ob_space,
               ac_space,
               nenv,
               nsteps,
               nstack,
               reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = (32, 32, 3)
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = 3  # 524
    # nsub3 = 2
    # nsub4 = 5
    # nsub5 = 10
    # nsub6 = 4
    # nsub7 = 2
    # nsub8 = 4
    # nsub9 = 500
    # nsub10 = 4
    # nsub11 = 10
    # nsub12 = 500

    # (64, 64, 13)
    # 80 * 24

    X = tf.placeholder(tf.uint8, ob_shape)  #obs
    with tf.variable_scope("model", reuse=reuse):
      with tf.variable_scope("common", reuse=reuse):
        h = conv(
            tf.cast(X, tf.float32),
            'c1',
            nf=32,
            rf=5,
            stride=1,
            init_scale=np.sqrt(2),
            pad="SAME")  # ?, 32, 32, 16
        h2 = conv(
            h,
            'c2',
            nf=64,
            rf=3,
            stride=1,
            init_scale=np.sqrt(2),
            pad="SAME")  # ?, 32, 32, 32

      with tf.variable_scope("pi1", reuse=reuse):
        h3 = conv_to_fc(h2)  # 131072
        h4 = fc(h3, 'fc1', nh=256, init_scale=np.sqrt(2))  # ?, 256
        pi_ = fc(
            h4, 'pi', nact)  # ( nenv * nsteps, 524) # ?, 524
        pi = tf.nn.softmax(pi_)

        vf = fc(
            h4, 'v', 1)  # ( nenv * nsteps, 1) # ?, 1

      # vf = tf.nn.l2_normalize(vf_, 1)

      with tf.variable_scope("xy0", reuse=reuse):
        # 1 x 1 convolution for dimensionality reduction
        pi_xy0_ = conv(
            h2, 'xy0', nf=1, rf=1, stride=1,
            init_scale=np.sqrt(2))  # (? nenv * nsteps, 32, 32, 1)
        pi_xy0__ = conv_to_fc(pi_xy0_)  # 32 x 32 => 1024
        pi_xy0 = tf.nn.softmax(pi_xy0__)


      with tf.variable_scope("xy1", reuse=reuse):
        pi_xy1_ = conv(
            h2, 'xy1', nf=1, rf=1, stride=1,
            init_scale=np.sqrt(2))  # (? nenv * nsteps, 32, 32, 1)
        pi_xy1__ = conv_to_fc(pi_xy1_)  # 32 x 32 => 1024
        pi_xy1 = tf.nn.softmax(pi_xy1__)

    v0 = vf[:, 0]
    a0 = sample(pi)
    self.initial_state = []  #not stateful

    def step(ob, *_args, **_kwargs):
      #obs, states, rewards, masks, actions, actions2, x1, y1, x2, y2, values
      _pi1, _xy0, _xy1, _v = sess.run([pi, pi_xy0, pi_xy1, v0], {X: ob})
      return _pi1, _xy0, _xy1, _v, []  #dummy state

    def value(ob, *_args, **_kwargs):
      return sess.run(v0, {X: ob})

    self.X = X
    self.pi = pi
    # self.pi_sub3 = pi_sub3
    # self.pi_sub4 = pi_sub4
    # self.pi_sub5 = pi_sub5
    # self.pi_sub6 = pi_sub6
    # self.pi_sub7 = pi_sub7
    # self.pi_sub8 = pi_sub8
    # self.pi_sub9 = pi_sub9
    # self.pi_sub10 = pi_sub10
    # self.pi_sub11 = pi_sub11
    # self.pi_sub12 = pi_sub12
    self.pi_xy0 = pi_xy0
    self.pi_xy1 = pi_xy1
    # self.pi_y0 = pi_y0
    # self.pi_x1 = pi_x1
    # self.pi_y1 = pi_y1
    # self.pi_x2 = pi_x2
    # self.pi_y2 = pi_y2
    self.vf = vf
    self.step = step
    self.value = value
Example #48
    def __init__(self, tf_session, ob_space, ac_space, nbatch,
                 reward_redistribution_config, observation_network_config, lstm_network_config, training_config,
                 exploration_config, nsteps, nlstm=64, reuse=False):
        """LSTM policy network, as described in RUDDER paper
        
        Based on baselines.ppo2.policies.py; the LSTM layer sees features from its own trainable observation network and
        the features from the reward redistribution observation network;
        
        Parameters
        -------
        tf_session : tensorflow session
            tensorflow session to compute the graph in
        ob_space
            Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
        ac_space
            Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
        nbatch : int
            Batchsize
        nsteps : int
            Fixed number of timesteps to process at once
        reward_redistribution_config : dict
            Dictionary containing config for reward redistribution:
            -----
            lambda_eligibility_trace : float
                Eligibility trace value for redistributed reward
            vf_contrib : float
                Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
                :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
            use_reward_redistribution_quality_threshold : float
                Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
                use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
                error, as described in RUDDER paper;
            use_reward_redistribution : bool
                Use reward redistribution?
            rr_junksize : int
                Chunk size for reward redistribution; chunks overlap by one half each
            cont_pred_w : float
                Weighting of the continuous prediction loss vs. the prediction loss of the final return at the last timestep
            intgrd_steps : int
                Stepsize for integrated gradients
            intgrd_batchsize : int
                Integrated gradients is computed batch-wise if intgrd_batchsize > 1
        observation_network_config : dict
            Dictionary containing config for observation network that processes observations and feeds them to LSTM
            network:
            -----
            show_states : bool
                Show frames to network?
            show_statedeltas : bool
                Show frame deltas to network?
            prepoc_states : list of dicts
                Network config to preprocess frames
            prepoc_deltas : list of dicts
                Network config to preprocess frame deltas
            prepoc_observations : list of dicts
                Network config to preprocess features from frame and frame-delta preprocessing networks
        lstm_network_config : dict
            Dictionary containing config for LSTM network:
            -----
            show_actions : bool
                Show taken actions to LSTM?
            reversed : bool
                Process game sequence in reversed order?
            layers : list of dicts
                Network config for LSTM network and optional additional dense layers
            initializations : dict
                Initialization config for LSTM network
            timestep_encoding : dict
                Set "max_value" and "triangle_span" for TeLL.utiltiy.misc_tensorflow.TriangularValueEncoding class
        training_config : dict
            Dictionary containing config for training and update procedure:
            -----
            n_no_rr_updates : int
                Number of updates to perform without training or using reward redistribution network
            n_pretrain_games : int
                Number of games to pretrain the reward redistribution network without using it;
            downscale_lr_policylag : bool
                Downscale learning rate permanently if the policy lag gets too large?
            optimizer : tf.train optimizer
                Optimizer in tf.train, e.g. "AdamOptimizer"
            optimizer_params : dict
                Kwargs for optimizer
            l1 : float
                Weighting for l1 weight regularization
            l2 : float
                Weighting for l2 weight regularization
            clip_gradients : float
                Threshold for clipping gradients (clipping by norm)
        exploration_config : dict
            Dictionary containing config for exploration:
            -----
            sample_actions_from_softmax : bool
                True: Apply softmax to policy network output and use it as probabilities to pick an action
                False: Use the max. policy network output as action
            temporal_safe_exploration : bool
                Use RUDDER safe exploration
            save_pi_threshold : float
                Threshold value in range [0,1] for safe actions in RUDDER safe exploration
        nlstm : int
            Number of LSTM units (=memory cells)
        reuse : bool
            Reuse tensorflow variables?
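
        Example
        -------
        Illustrative sketch of two of the config dictionaries; all values are
        placeholders chosen for illustration, not tuned defaults, and only
        keys described above are shown::

            reward_redistribution_config = dict(
                use_reward_redistribution=True,
                use_reward_redistribution_quality_threshold=0.8,
                rr_junksize=100,
                cont_pred_w=0.5,
                intgrd_steps=50,
                intgrd_batchsize=1,
            )
            exploration_config = dict(
                sample_actions_from_softmax=True,
                temporal_safe_exploration=True,
                save_pi_threshold=0.1,
            )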
        """
        #
        # Shapes
        #
        nenv = nbatch // nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        seq_ob_shape = (nenv, -1, nh, nw, 1)
        nact = ac_space.n
        
        #
        # Placeholders for inputs
        #
        X = tf.placeholder(tf.uint8, ob_shape) #obs
        M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
        
        #
        # Prepare input
        #
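        # The last channel of the stacked observation X holds the current
        # frame and the second-to-last channel the previous frame; both are
        # reshaped to per-environment sequences and used to compute
        # pixel-wise frame deltas.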
        single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
        delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)
        
        #
        #  Get observation features from RR model
        #
        rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                             observation_network_config=observation_network_config,
                                             lstm_network_config=lstm_network_config, training_config=training_config,
                                             scopename="RR")
        self.rr_observation_model = rr_model
        rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                            additional_inputs=[])
        
        #
        #  Build policy network
        #
        with tf.variable_scope("model", reuse=reuse):
            temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                          name='temperature')
            
            additional_inputs = [StopGradientLayer(rr_observation_layer)]
            observation_layers, observation_features = observation_network(
                    single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
                    observation_network_config=observation_network_config)
            
            self.observation_features_shape = observation_features.get_output_shape()
            
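            # Reshape the observation features to (nenv, nsteps, -1) and split
            # them into a list of nsteps per-timestep batches, as expected by
            # the baselines lstm() helper; ms carries the done-masks per step.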
            xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                       value=tf.reshape(observation_layers[-1].get_output(),
                                                                        [nenv, nsteps, -1]))]
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)
            h6 = h5
            pi = fc(h6, 'pi', nact)
            vf = fc(h6, 'v', 1)
        
        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pi)
        
        if exploration_config['sample_actions_from_softmax']:
            a0 = self.pd.sample_temp(temperature=temperature)
        else:
            a0 = tf.argmax(pi, axis=-1)
        
        v0 = vf[:, 0]
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        
        def step(ob, state, mask):
            a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, v, s, neglogp
        
        def value(ob, state, mask):
            return tf_session.run(v0, {X:ob, S:state, M:mask})
        
        def action(ob, state, mask, *_args, **_kwargs):
            a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X:ob, S:state, M:mask})
            return a, s, neglogp
        
        #
        # Placeholders for exploration
        #
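        # Per-environment exploration state: the timestep at which exploration
        # starts (-1 disables exploration for that environment), the previously
        # taken action, the current game length, whether to keep the previous
        # action, how often the previous action has been repeated, and the
        # duration of the exploration phase.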
        n_envs = pi.shape.as_list()[0]
        exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
        prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
        exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
        
        #
        # Setting up safe exploration
        #
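        # An environment explores while its game length lies inside the window
        # [exploration_timesteps, exploration_timesteps + exploration_durations]
        # and exploration has not been disabled via exploration_timesteps == -1.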
        explore = tf.logical_and(tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                                                tf.less_equal(gamelengths_pl,
                                                              exploration_timesteps_pl + exploration_durations_pl)),
                                 tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

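        # Rescale the policy outputs to [0, 1] per environment and threshold
        # them into a binary mask of "safe" actions; the threshold is linearly
        # interpolated across environments from 1 down to save_pi_threshold,
        # so different environments explore with different levels of caution.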
        safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
        safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
        save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                                   / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
        safe_pi = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
        safe_pi /= tf.reduce_sum(safe_pi)
        
        rand_safe_a = tf.multinomial(safe_pi, 1)[:, 0]
        
        safe_pi_flat = tf.reshape(safe_pi, (-1,))
        prev_action_is_safe = tf.gather(safe_pi_flat,
                                        prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                        * safe_pi.shape.as_list()[1])
        prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))
        
        a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                           tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                            prev_action_is_safe),
                             prev_actions_pl, rand_safe_a)
        
        a_explore = tf.where(explore, a_explore, a0)
        
        # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
        rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
        a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)
        
        if not exploration_config['temporal_safe_exploration']:
            a_explore = a0
            
        neglogp_explore = self.pd.neglogp(a_explore)
        
        def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                               keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
            """Get actions with exploration for long-term reward"""
            a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                  {X: ob, S:state, M:mask, exploration_timesteps_pl: exploration_timesteps,
                                   prev_actions_pl: prev_actions,
                                   gamelengths_pl: gamelengths, exploration_durations_pl: exploration_durations,
                                   keep_prev_action_pl: keep_prev_action, prev_action_count_pl: prev_action_count})
            return a, s, neglogp
        
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
        self.action = action
        self.action_exploration = action_exploration
        self.seq_ob_shape = seq_ob_shape
        self.exploration_config = exploration_config