Beispiel #1
0
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        """Build a three-conv-layer policy/value graph and bind run helpers.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of the policy head.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations

        def linear(t):
            # Pass-through activation handed to the two output heads.
            return t

        with tf.variable_scope("model", reuse=reuse):
            # uint8 input is cast to float and divided by 255 before conv 'c1'.
            scaled = tf.cast(obs_ph, tf.float32) / 255.
            conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4,
                         init_scale=np.sqrt(2))
            conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2,
                         init_scale=np.sqrt(2))
            conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1,
                         init_scale=np.sqrt(2))
            flat = conv_to_fc(conv3)
            hidden = fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2))
            policy_out = fc(hidden, 'pi', n_actions, act=linear)
            value_out = fc(hidden, 'v', 1, act=linear)

        value_col = value_out[:, 0]  # first (only) column of the value head
        action_op = sample(policy_out)  # `sample` is defined elsewhere

        def step(ob, *_args, **_kwargs):
            # Extra arguments are accepted and ignored.
            act_val, val = sess.run([action_op, value_col], {obs_ph: ob})
            return act_val, val, []  # empty list is the dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(value_col, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = policy_out
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #2
0
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        """Build a policy/Q graph on ``nature_cnn`` features and bind helpers.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of both heads.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(obs_ph)
            logits = fc(features, 'pi', n_actions, init_scale=0.01)
            policy = tf.nn.softmax(logits)
            q_values = fc(features, 'q', n_actions)

        # Sampling is fed the logits, not the softmax output.
        action_op = sample(logits)

        def step(ob, *args, **kwargs):
            # Returns (actions, policy probabilities, dummy state).
            actions, probs = sess.run([action_op, policy], {obs_ph: ob})
            return actions, probs, []

        def out(ob, *args, **kwargs):
            probs, qs = sess.run([policy, q_values], {obs_ph: ob})
            return probs, qs

        def act(ob, *args, **kwargs):
            return sess.run(action_op, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = policy  # softmax output, not the raw logits
        self.q = q_values
        self.step = step
        self.out = out
        self.act = act
Beispiel #3
0
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        """Build a recurrent policy/Q graph (``nature_cnn`` followed by ``lstm``).

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of both heads.
            nenv: number of environments; also the first state dimension.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            nlstm: passed to ``lstm`` as ``nh``; the state placeholder is
                ``[nenv, nlstm * 2]``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        state_width = nlstm * 2
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        mask_ph = tf.placeholder(tf.float32, [batch])  # mask (done t-1)
        state_ph = tf.placeholder(tf.float32, [nenv, state_width])  # states
        with tf.variable_scope("model", reuse=reuse):
            features = nature_cnn(obs_ph)

            # Recurrent part: batch -> sequence, lstm, sequence -> batch.
            feature_seq = batch_to_seq(features, nenv, nsteps)
            mask_seq = batch_to_seq(mask_ph, nenv, nsteps)
            lstm_seq, next_state = lstm(
                feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
            lstm_flat = seq_to_batch(lstm_seq)

            logits = fc(lstm_flat, 'pi', n_actions, init_scale=0.01)
            policy = tf.nn.softmax(logits)
            q_values = fc(lstm_flat, 'q', n_actions)

        # Sampling is fed the logits, not the softmax output.
        action_op = sample(logits)

        def step(ob, state, mask, *args, **kwargs):
            # Returns (actions, policy probabilities, new recurrent state).
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            actions, probs, new_state = sess.run(
                [action_op, policy, next_state], feed)
            return actions, probs, new_state

        self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)
        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = policy  # softmax output, not the raw logits
        self.q = q_values
        self.step = step
Beispiel #4
0
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
        """Build a conv + ``lnlstm`` policy/value graph and bind run helpers.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of the policy head.
            nenv: number of environments; also the first state dimension.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            nlstm: passed to ``lnlstm`` as ``nh``; the state placeholder is
                ``[nenv, nlstm * 2]``.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        state_width = nlstm * 2
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        mask_ph = tf.placeholder(tf.float32, [batch])  # mask (done t-1)
        state_ph = tf.placeholder(tf.float32, [nenv, state_width])  # states

        def linear(t):
            # Pass-through activation handed to the two output heads.
            return t

        with tf.variable_scope("model", reuse=reuse):
            # uint8 input is cast to float and divided by 255 before conv 'c1'.
            scaled = tf.cast(obs_ph, tf.float32) / 255.
            conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4,
                         init_scale=np.sqrt(2))
            conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2,
                         init_scale=np.sqrt(2))
            conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1,
                         init_scale=np.sqrt(2))
            flat = conv_to_fc(conv3)
            hidden = fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2))
            hidden_seq = batch_to_seq(hidden, nenv, nsteps)
            mask_seq = batch_to_seq(mask_ph, nenv, nsteps)
            lstm_seq, next_state = lnlstm(
                hidden_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
            lstm_flat = seq_to_batch(lstm_seq)
            policy_out = fc(lstm_flat, 'pi', n_actions, act=linear)
            value_out = fc(lstm_flat, 'v', 1, act=linear)

        value_col = value_out[:, 0]  # first (only) column of the value head
        action_op = sample(policy_out)  # `sample` is defined elsewhere

        def step(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            act_val, val, new_state = sess.run(
                [action_op, value_col, next_state], feed)
            return act_val, val, new_state

        def value(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run(value_col, feed)

        self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)
        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = policy_out
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #5
0
  def __init__(self,
               sess,
               ob_space,
               ac_space,
               nenv,
               nsteps,
               nstack,
               reuse=False):
    """Build a conv graph with one action head, two spatial heads and a value head.

    The input size (32, 32, 3) and the action-head width (3) are hard-coded;
    ``ob_space`` and ``ac_space`` are accepted but not read.

    Args:
      sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
      ob_space: unused.
      ac_space: unused.
      nenv: number of environments.
      nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
      nstack: factor applied to the channel dimension of the input.
      reuse: forwarded to every ``tf.variable_scope`` opened here.
    """
    batch = nenv * nsteps
    height, width, channels = (32, 32, 3)
    input_shape = (batch, height, width, channels * nstack)
    n_actions = 3

    def spatial_head(features, name):
      # 1x1 conv to a single channel, flattened, then softmax.
      # `name` is used for both the variable scope and the conv layer.
      with tf.variable_scope(name, reuse=reuse):
        one_channel = conv(
            features, name, nf=1, rf=1, stride=1, init_scale=np.sqrt(2))
        flat_logits = conv_to_fc(one_channel)
        return tf.nn.softmax(flat_logits)

    obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
    with tf.variable_scope("model", reuse=reuse):
      with tf.variable_scope("common", reuse=reuse):
        # Note: the input is cast to float but not rescaled here.
        conv1 = conv(
            tf.cast(obs_ph, tf.float32),
            'c1',
            nf=32,
            rf=5,
            stride=1,
            init_scale=np.sqrt(2),
            pad="SAME")
        conv2 = conv(
            conv1,
            'c2',
            nf=64,
            rf=3,
            stride=1,
            init_scale=np.sqrt(2),
            pad="SAME")

      with tf.variable_scope("pi1", reuse=reuse):
        flat = conv_to_fc(conv2)
        hidden = fc(flat, 'fc1', nh=256, init_scale=np.sqrt(2))
        action_logits = fc(hidden, 'pi', n_actions)
        action_probs = tf.nn.softmax(action_logits)

        value_out = fc(hidden, 'v', 1)

      xy0_probs = spatial_head(conv2, "xy0")
      xy1_probs = spatial_head(conv2, "xy1")

    value_col = value_out[:, 0]  # first (only) column of the value head
    # Not fetched by step/value below; retained so the same graph ops exist.
    a0 = sample(action_probs)

    def step(ob, *_args, **_kwargs):
      # Returns the three probability outputs, the values, and a dummy state.
      fetches = [action_probs, xy0_probs, xy1_probs, value_col]
      _pi1, _xy0, _xy1, _v = sess.run(fetches, {obs_ph: ob})
      return _pi1, _xy0, _xy1, _v, []

    def value(ob, *_args, **_kwargs):
      return sess.run(value_col, {obs_ph: ob})

    self.initial_state = []  # not stateful
    self.X = obs_ph
    self.pi = action_probs
    self.pi_xy0 = xy0_probs
    self.pi_xy1 = xy1_probs
    self.vf = value_out
    self.step = step
    self.value = value
Beispiel #6
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 continuous_actions=False,
                 itr=0,
                 particleEnvs=False,
                 communication=False):
        """Build a two-layer fully connected policy/value graph for one agent.

        Three head layouts are supported:

        * ``continuous_actions``: a 'pi' layer feeding 'mu' and 'sigma'
          layers, sampled with ``sample_normal``.
        * ``communication``: two heads, 'pi_u' (width ``nact[0]``) and 'pi_c'
          (width ``nact[1]``), each sampled with ``sample``.
        * otherwise: a single head of width ``ac_space[itr].n``.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: indexable of observation spaces; ``ob_space[itr].shape[0]``
                is the input width.
            ac_space: indexable of action spaces, indexed with ``itr``.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: unused.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            continuous_actions: selects the mu/sigma head layout.
            itr: index into ``ob_space`` / ``ac_space``; also used as a suffix
                for the hidden-layer names and the discrete head names.
            particleEnvs: unused.
            communication: boolean flag; when true the action count is
                ``ac_space[itr].high - ac_space[itr].low`` and the two-head
                layout is built (unless ``continuous_actions`` is set).
        """
        self.sess = sess
        self.continuous_actions = continuous_actions
        self.ob_space = ob_space

        nbatch = nenv * nsteps
        ob_shape = np.asarray([nbatch, ob_space[itr].shape[0]])
        if communication:
            nact = ac_space[itr].high - ac_space[itr].low
        else:
            nact = ac_space[itr].n

        X = tf.placeholder(tf.uint8, ob_shape)  # observations
        with tf.variable_scope("model", reuse=reuse):
            f = fc(tf.cast(X, tf.float32),
                   'fc1_' + str(itr),
                   nh=64,
                   init_scale=np.sqrt(2))
            f2 = fc(f, 'fc2_' + str(itr), nh=64, init_scale=np.sqrt(2))
            if self.continuous_actions:
                # Fix: this branch used to read `f3`, which is never defined
                # (its layer is commented out), so it raised NameError. The
                # last hidden layer is `f2`, as in the other branches.
                pi = fc(f2, 'pi', 2 * nact, act=lambda x: x)
                self.mu = fc(pi, 'mu', nact, act=lambda x: x)
                self.sigma = fc(pi, 'sigma', nact, act=lambda x: x)
                vf = fc(f2, 'v', 1, act=lambda x: x)
                self.pi = pi
            elif communication:
                pi_c = fc(f2, 'pi_c', nact[1], act=lambda x: x)
                pi_u = fc(f2, 'pi_u', nact[0], act=lambda x: x)
                vf = fc(f2, 'v', 1, act=lambda x: x)
                self.pi_c = pi_c
                self.pi_u = pi_u
            else:
                pi = fc(f2, 'pi_' + str(itr), nact, act=lambda x: x)
                vf = fc(f2, 'v_' + str(itr), 1, act=lambda x: x)
                self.pi = pi

        v0 = vf[:, 0]  # first (only) column of the value head
        if self.continuous_actions:
            a0 = sample_normal(self.mu, self.sigma)
        elif communication:
            a0 = [sample(pi_u), sample(pi_c)]
        else:
            a0 = sample(pi)
        self.initial_state = []  # not stateful

        def step(ob, *_args, **_kwargs):
            # Extra arguments are accepted and ignored.
            a, v = sess.run([a0, v0], {X: ob})
            return a, v, []  # empty list is the dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X: ob})

        def summarize_weights(*_args, **_kwargs):
            # Debug aid: print the per-variable sums and drop into ipdb if
            # their total is NaN.
            all_layer_vars = sess.run(tf.all_variables())
            layer_sums = [np.sum(layer_vars) for layer_vars in all_layer_vars]
            res = np.any(np.isnan(np.sum(layer_sums)))
            print('Layer weight sums: ', layer_sums)
            print('NaN weights: ', res)
            if res:
                import ipdb
                ipdb.set_trace()

        self.summarize_weights = summarize_weights

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 nlstm=256,
                 reuse=False):
        """Build a conv + ``lstm`` graph with action, x, y and value heads.

        The 'pix' and 'piy' heads have width ``FLAGS.screen_resolution``
        (``FLAGS`` is a module-level name defined elsewhere).

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of the 'pi' head.
            nenv: number of environments; also the first state dimension.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            nlstm: passed to ``lstm`` as ``nh``; the state placeholder is
                ``[nenv, nlstm * 2]``.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        state_width = nlstm * 2
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        mask_ph = tf.placeholder(tf.float32, [batch])  # mask (done t-1)
        state_ph = tf.placeholder(tf.float32, [nenv, state_width])  # states

        def linear(t):
            # Pass-through activation handed to all four output heads.
            return t

        with tf.variable_scope("model", reuse=reuse):
            # uint8 input is cast to float and divided by 255 before conv 'c1'.
            scaled = tf.cast(obs_ph, tf.float32) / 255.
            conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4,
                         init_scale=np.sqrt(2))
            conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2,
                         init_scale=np.sqrt(2))
            conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1,
                         init_scale=np.sqrt(2))
            flat = conv_to_fc(conv3)
            hidden = fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2))
            hidden_seq = batch_to_seq(hidden, nenv, nsteps)
            mask_seq = batch_to_seq(mask_ph, nenv, nsteps)
            lstm_seq, next_state = lstm(
                hidden_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
            lstm_flat = seq_to_batch(lstm_seq)

            action_out = fc(lstm_flat, 'pi', n_actions, act=linear)
            x_out = fc(lstm_flat, 'pix', FLAGS.screen_resolution, act=linear)
            y_out = fc(lstm_flat, 'piy', FLAGS.screen_resolution, act=linear)
            value_out = fc(lstm_flat, 'v', 1, act=linear)

        value_col = value_out[:, 0]  # first (only) column of the value head
        action_op = sample(action_out)
        x_op = sample(x_out)
        y_op = sample(y_out)

        def step(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            fetches = [action_op, x_op, y_op, value_col, next_state]
            a, x, y, v, s = sess.run(fetches, feed)
            return a, x, y, v, s

        def value(ob, state, mask):
            feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
            return sess.run(value_col, feed)

        self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)
        self.X = obs_ph
        self.M = mask_ph
        self.S = state_ph
        self.pi = action_out
        self.pix = x_out
        self.piy = y_out
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #8
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 continuous_actions=False):
        """Build a three-conv-layer policy/value graph, discrete or continuous.

        With ``continuous_actions`` the 'pi' layer (width ``2 * nact``) feeds
        separate 'mu' and 'sigma' layers and actions come from
        ``sample_normal``; otherwise 'pi' has width ``nact`` and actions come
        from ``sample``.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` is read as ``nact``.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            continuous_actions: selects the mu/sigma head layout.
        """
        self.sess = sess
        self.continuous_actions = continuous_actions
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations

        def linear(t):
            # Pass-through activation handed to the output heads.
            return t

        with tf.variable_scope("model", reuse=reuse):
            # uint8 input is cast to float and divided by 255 before conv 'c1'.
            scaled = tf.cast(obs_ph, tf.float32) / 255.
            conv1 = conv(scaled, 'c1', nf=64, rf=8, stride=4,
                         init_scale=np.sqrt(2))
            conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2,
                         init_scale=np.sqrt(2))
            conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1,
                         init_scale=np.sqrt(2))
            flat = conv_to_fc(conv3)
            hidden = fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2))
            if self.continuous_actions:
                policy_out = fc(hidden, 'pi', 2 * n_actions, act=linear)
                self.mu = fc(policy_out, 'mu', n_actions, act=linear)
                self.sigma = fc(policy_out, 'sigma', n_actions, act=linear)
            else:
                policy_out = fc(hidden, 'pi', n_actions, act=linear)
            value_out = fc(hidden, 'v', 1, act=linear)

        value_col = value_out[:, 0]  # first (only) column of the value head
        if self.continuous_actions:
            action_op = sample_normal(self.mu, self.sigma)
        else:
            action_op = sample(policy_out)

        def step(ob, *_args, **_kwargs):
            a, v = sess.run([action_op, value_col], {obs_ph: ob})
            # Debug trap: break into ipdb when the first action is NaN.
            if np.isnan(a[0]):
                import ipdb
                ipdb.set_trace()
            return a, v, []  # empty list is the dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(value_col, {obs_ph: ob})

        def summarize_weights(*_args, **_kwargs):
            # Debug aid: print per-variable sums; break into ipdb on NaN.
            import numpy as np
            fetched_vars = sess.run(tf.all_variables())
            layer_sums = [np.sum(var_value) for var_value in fetched_vars]
            has_nan = np.any(np.isnan(np.sum(layer_sums)))
            print('Layer weight sums: ', layer_sums)
            print('NaN weights: ', has_nan)
            if has_nan:
                import ipdb
                ipdb.set_trace()

        self.initial_state = []  # not stateful
        self.summarize_weights = summarize_weights
        self.X = obs_ph
        self.pi = policy_out
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #9
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        """Build policy/Q heads plus a second "explore" pair of heads.

        The "acer" scope holds ``nature_cnn`` features with 'pi' and 'q'
        heads. The "explore" scope builds 'e_pi' and 'e_q' heads on the same
        features wrapped in ``tf.stop_gradient``.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of every head.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        with tf.variable_scope("model", reuse=reuse):
            with tf.variable_scope("acer"):
                features = nature_cnn(obs_ph)
                logits = fc(features, 'pi', n_actions, init_scale=0.01)
                policy = tf.nn.softmax(logits)
                q_values = fc(features, 'q', n_actions)

            with tf.variable_scope("explore"):
                # Gradients from the explore heads stop at the shared features.
                frozen_features = tf.stop_gradient(features)
                explore_logits = fc(
                    frozen_features, 'e_pi', n_actions, init_scale=0.01)
                explore_policy = tf.nn.softmax(explore_logits)
                explore_q = fc(frozen_features, 'e_q', n_actions)

        # Sampling is fed the logits, not the softmax output.
        action_op = sample(logits)
        # NOTE(review): built exactly like `action_op`; confirm whether the
        # evaluation action was meant to differ (e.g. be greedy).
        eval_action_op = sample(logits)
        explore_action_op = sample(explore_logits)

        def step(ob, *args, **kwargs):
            # Returns (actions, policy, explore policy, dummy state).
            a0, pi0, e_pi0 = sess.run(
                [action_op, policy, explore_policy], {obs_ph: ob})
            return a0, pi0, e_pi0, []

        def evaluate_step(ob, *args, **kwargs):
            evaluate_a0, pi0, e_pi0 = sess.run(
                [eval_action_op, policy, explore_policy], {obs_ph: ob})
            return evaluate_a0, pi0, e_pi0, []

        def e_step(ob, *args, **kwargs):
            # Same as step, but also fetches the explore-head action.
            a0, e_a0, pi0, e_pi0 = sess.run(
                [action_op, explore_action_op, policy, explore_policy],
                {obs_ph: ob})
            return a0, e_a0, pi0, e_pi0, []

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([policy, q_values], {obs_ph: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(action_op, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = policy  # softmax output, not the raw logits
        self.q = q_values
        self.e_pi_logits = explore_logits
        self.e_pi = explore_policy
        self.e_q = explore_q
        self.evaluate_step = evaluate_step
        self.e_step = e_step
        self.step = step
        self.out = out
        self.act = act
Beispiel #10
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        """Build a relu conv graph with one action head, two spatial heads
        and a value head.

        The input size (32, 32, 3) and the action-head width (3) are
        hard-coded; ``ob_space`` and ``ac_space`` are accepted but not read.
        Several tensor shapes are printed while the graph is built.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: unused.
            ac_space: unused.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to every ``tf.variable_scope`` opened here.
        """
        batch = nenv * nsteps
        height, width, channels = (32, 32, 3)
        input_shape = (batch, height, width, channels * nstack)
        n_actions = 3

        def spatial_head(features, name):
            # 1x1 conv to a single channel, flattened, then softmax.
            # `name` is used for both the variable scope and the conv layer.
            with tf.variable_scope(name, reuse=reuse):
                one_channel = conv(
                    features, name, nf=1, rf=1, stride=1,
                    init_scale=np.sqrt(2))
                flat_logits = conv_to_fc(one_channel)
                return tf.nn.softmax(flat_logits)

        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations
        with tf.variable_scope("model", reuse=reuse):
            with tf.variable_scope("common", reuse=reuse):
                # Note: the input is cast to float but not rescaled here.
                pre_act1 = conv(tf.cast(obs_ph, tf.float32),
                                'c1',
                                nf=32,
                                rf=5,
                                stride=1,
                                init_scale=np.sqrt(2),
                                pad="SAME")
                conv1 = tf.nn.relu(pre_act1)
                pre_act2 = conv(conv1,
                                'c2',
                                nf=64,
                                rf=3,
                                stride=1,
                                init_scale=np.sqrt(2),
                                pad="SAME")
                conv2 = tf.nn.relu(pre_act2)

            with tf.variable_scope("pi1", reuse=reuse):
                flat = conv_to_fc(conv2)
                print(flat.shape)
                hidden = tf.nn.relu(
                    fc(flat, 'fc1', nh=256, init_scale=np.sqrt(2)))
                print(hidden.shape)
                action_logits = fc(hidden, 'pi', n_actions)
                action_probs = tf.nn.softmax(action_logits)
                print(action_probs.shape)

                value_out = fc(hidden, 'v', 1)
                print(value_out.shape)

            xy0_probs = spatial_head(conv2, "xy0")
            xy1_probs = spatial_head(conv2, "xy1")

        value_col = value_out[:, 0]  # first (only) column of the value head
        # Not fetched by step/value below; retained so the same graph ops exist.
        a0 = sample(action_probs)

        def step(ob, *_args, **_kwargs):
            # Returns the three probability outputs, the values, and a dummy
            # state.
            fetches = [action_probs, xy0_probs, xy1_probs, value_col]
            _pi1, _xy0, _xy1, _v = sess.run(fetches, {obs_ph: ob})
            return _pi1, _xy0, _xy1, _v, []

        def value(ob, *_args, **_kwargs):
            return sess.run(value_col, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = action_probs
        self.pi_xy0 = xy0_probs
        self.pi_xy1 = xy1_probs
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #11
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        """Build a fully convolutional spatial policy with a dense value branch.

        Channel 0 of the integer input is one-hot encoded with
        ``SCREEN_FEATURES.player_relative.scale`` classes, and class 0 is
        dropped before the convolutions. ``ac_space`` is not read.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: unused.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to
                ``tf.variable_scope("fullyconv_model", reuse=...)``.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        obs_ph = tf.placeholder(tf.int32, input_shape)  # observations

        with tf.variable_scope("fullyconv_model", reuse=reuse):
            # Only the first channel is used (single-channel assumption).
            first_channel = obs_ph[:, :, :, 0]
            one_hot = layers.one_hot_encoding(
                first_channel,
                num_classes=SCREEN_FEATURES.player_relative.scale)

            # Drop the 0 category from the one-hot planes.
            one_hot = one_hot[:, :, :, 1:]

            conv1 = layers.conv2d(one_hot,
                                  num_outputs=16,
                                  kernel_size=5,
                                  stride=1,
                                  padding='SAME',
                                  scope="conv1")
            conv2 = layers.conv2d(conv1,
                                  num_outputs=32,
                                  kernel_size=3,
                                  stride=1,
                                  padding='SAME',
                                  scope="conv2")
            # NOTE(review): the spatial head reads conv1; conv2 feeds only the
            # value branch. Confirm this is intended.
            spatial_map = layers.conv2d(conv1,
                                        num_outputs=1,
                                        kernel_size=1,
                                        stride=1,
                                        scope="spatial_action",
                                        activation_fn=None)
            policy_out = layers.flatten(spatial_map)

            # Scale the logits by 3 to make sampling more deterministic.
            policy_out *= 3.0

            value_hidden = layers.fully_connected(layers.flatten(conv2),
                                                  num_outputs=64,
                                                  activation_fn=tf.nn.relu,
                                                  scope="value_h_layer")

            value_out = layers.fully_connected(value_hidden,
                                               num_outputs=1,
                                               activation_fn=None,
                                               scope="value_out")

        value_col = value_out[:, 0]  # first (only) column of the value head
        action_op = sample(policy_out)  # `sample` is defined elsewhere

        def step(ob, *_args, **_kwargs):
            a, v = sess.run([action_op, value_col], {obs_ph: ob})
            return a, v, []  # empty list is the dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(value_col, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = policy_out
        self.vf = value_out
        self.step = step
        self.value = value
Beispiel #12
0
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False,
                 enc_size=32):
        """Build a conv encoder with policy/value heads and a conv decoder.

        The encoder alternates 3x3 leaky-relu convolutions with max pooling.
        Its flattened output feeds the 'fc1' layer and the two heads, while
        the decoder upsamples the pooled encoder output back through
        transposed convolutions to a 4-channel ``decoded`` tensor.

        Args:
            sess: object whose ``run(fetches, feed_dict)`` evaluates tensors.
            ob_space: observation space; ``shape`` unpacks to
                (height, width, channels).
            ac_space: action space; ``n`` sets the width of the policy head.
            nenv: number of environments.
            nsteps: steps per environment; ``nenv * nsteps`` is the batch size.
            nstack: factor applied to the channel dimension of the input.
            reuse: forwarded to ``tf.variable_scope("model", reuse=...)``.
            enc_size: width of the 'fc1' layer feeding both heads.
        """
        batch = nenv * nsteps
        height, width, channels = ob_space.shape
        input_shape = (batch, height, width, channels * nstack)
        n_actions = ac_space.n
        obs_ph = tf.placeholder(tf.uint8, input_shape)  # observations

        def linear(t):
            # Pass-through activation handed to the two output heads.
            return t

        def conv3x3(tensor, filters, name=None):
            # 3x3, stride-1, SAME-padded conv with bias and leaky relu.
            return tf.layers.conv2d(inputs=tensor,
                                    filters=filters,
                                    kernel_size=(3, 3),
                                    strides=1,
                                    padding="SAME",
                                    use_bias=True,
                                    activation=tf.nn.leaky_relu,
                                    name=name)

        def pool(tensor, size, name=None):
            # Square max pooling whose stride equals the window size.
            return tf.layers.max_pooling2d(inputs=tensor,
                                           pool_size=[size, size],
                                           strides=size,
                                           name=name)

        def upsample(tensor, filters, factor, name):
            # Transposed conv whose kernel and stride both equal `factor`.
            return tf.layers.conv2d_transpose(tensor,
                                              filters=filters,
                                              kernel_size=factor,
                                              strides=factor,
                                              padding='same',
                                              name=name)

        with tf.variable_scope("model", reuse=reuse):
            # uint8 input is cast to float and divided by 255.
            scaled_obs = tf.cast(obs_ph, tf.float32) / 255.

            # Encoder (the unnamed layers get TensorFlow's default names).
            enc1conv = conv3x3(scaled_obs, 32)
            enc2pool = pool(enc1conv, 2)
            enc2conv = conv3x3(enc2pool, 16)
            enc3pool = pool(enc2conv, 2)
            enc3conv = conv3x3(enc3pool, 8)
            enc_pool = pool(enc3conv, 3, name="enc_pool")

            enc_fc = conv_to_fc(enc_pool)

            # Policy and value heads
            hidden = fc(enc_fc, 'fc1', nh=enc_size, init_scale=np.sqrt(2))
            policy_out = fc(hidden, 'pi', n_actions, act=linear)
            value_out = fc(hidden, 'v', 1, act=linear)

            # Decoder
            dec1conv = conv3x3(enc_pool, 8, name='dec1conv')
            dec2up = upsample(dec1conv, 8, 3, name='dec2up')
            dec3conv = conv3x3(dec2up, 16, name='dec3conv')
            dec4up = upsample(dec3conv, 16, 2, name='dec4up')
            dec5conv = conv3x3(dec4up, 32, name='dec5conv')
            dec6up = upsample(dec5conv, 32, 2, name='dec6up')

            decoded = conv3x3(dec6up, 4, name='decoded')

        value_col = value_out[:, 0]  # first (only) column of the value head
        action_op = sample(policy_out)  # `sample` is defined elsewhere

        def step(ob, *_args, **_kwargs):
            # Also fetches the decoder output and the flattened encoding.
            a, v, d, e = sess.run(
                [action_op, value_col, decoded, enc_fc], {obs_ph: ob})
            return a, v, d, e, []  # empty list is the dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(value_col, {obs_ph: ob})

        self.initial_state = []  # not stateful
        self.X = obs_ph
        self.pi = policy_out
        self.vf = value_out
        self.step = step
        self.value = value
        self.decoded = decoded
        self.orig = scaled_obs