Example #1
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        M = tf.placeholder(tf.float32, [nbatch])  # mask: done flags from t-1
        S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # LSTM cell and hidden states
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)

            # reshape the flat (nenv*nsteps) batch into per-env sequences,
            # run the masked LSTM, then flatten back to a batch
            xs = batch_to_seq(h, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            h5 = seq_to_batch(h5)

            pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h5, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
        self.X = X
        self.M = M
        self.S = S
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, state, mask, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
            return a0, pi0, s

        self.step = step
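
A minimal usage sketch for this recurrent policy follows. The class name AcerLstmPolicy, the Atari-style spaces, and the setup code are assumptions; only the step() signature and the tensor shapes come from the constructor above.

import gym
import numpy as np
import tensorflow as tf

# Hypothetical setup: 84x84 single-channel frames, stacked nstack=4 deep.
ob_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
ac_space = gym.spaces.Discrete(6)
nenv, nsteps, nstack = 4, 1, 4

sess = tf.Session()
policy = AcerLstmPolicy(sess, ob_space, ac_space, nenv, nsteps, nstack)  # assumed name
sess.run(tf.global_variables_initializer())

state = policy.initial_state                      # zeros, shape (nenv, 2*nlstm)
mask = np.zeros(nenv * nsteps, dtype=np.float32)  # 1.0 where the episode ended at t-1
obs = np.zeros((nenv * nsteps, 84, 84, 4), dtype=np.uint8)
actions, mus, state = policy.step(obs, state, mask)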
Example #2
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h, 'q', nact)

        a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act
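
Reusing the setup from the sketch under Example #1 (session, spaces, nenv/nsteps/nstack, obs batch), this feedforward variant needs no state or mask; the class name is again an assumption.

policy = AcerCnnPolicy(sess, ob_space, ac_space, nenv, nsteps, nstack)  # assumed name
sess.run(tf.global_variables_initializer())

a0, mu0, _ = policy.step(obs)  # sampled actions, behaviour probs, dummy state
mu0, q0 = policy.out(obs)      # softmax policy and Q-values
a0 = policy.act(obs)           # actions only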
Example #3

    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  #pylint: disable=W0613
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc)
        self.pdtype = make_pdtype(ac_space)
        X = tf.placeholder(tf.uint8, ob_shape)  #obs
        with tf.variable_scope("model", reuse=reuse):
            # h = custom_cnn(X, **conv_kwargs)  # alternative feature extractor
            h = policies.nature_cnn(X, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
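
This PPO-style constructor exposes a step/value pair instead; a short sketch under the same assumptions as the earlier ones (class name CnnPolicy assumed, nbatch playing the role of nenv * nsteps):

policy = CnnPolicy(sess, ob_space, ac_space, nbatch=4, nsteps=1)  # assumed name
sess.run(tf.global_variables_initializer())

obs = np.zeros((4, 84, 84, 1), dtype=np.uint8)        # no frame stacking here
actions, values, state, neglogps = policy.step(obs)   # state is None (not recurrent)
values = policy.value(obs)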
Example #4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        super().__init__(sess,
                         ob_space,
                         ac_space,
                         nenv,
                         nsteps,
                         nstack,
                         reuse=reuse)
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(X)
            pi_logits = fc(h, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logits)
            q = fc(h, 'q', nact)

        self.a = sample(pi_logits)  # could change this to use self.pi instead
        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q
        self.sess = sess
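
Unlike the earlier examples, this subclass only stores the tensors and the session, so callers must run them themselves. A sketch of a step method one could add, mirroring the closures above (the method itself is an assumption):

    def step(self, ob, *args, **kwargs):
        # sample an action and return the behaviour probabilities,
        # like the step() closures of the other examples
        a0, pi0 = self.sess.run([self.a, self.pi], {self.X: ob})
        return a0, pi0, []  # dummy state; this policy is not recurrent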
Example #5
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        with tf.variable_scope("model", reuse=reuse):
            with tf.variable_scope("acer"):
                h = nature_cnn(X)
                pi_logits = fc(h, 'pi', nact, init_scale=0.01)
                pi = tf.nn.softmax(pi_logits)
                q = fc(h, 'q', nact)

            with tf.variable_scope("explore"):
                # for explore
                nogradient_h = tf.stop_gradient(h)
                e_pi_logits = fc(nogradient_h, 'e_pi', nact, init_scale=0.01)
                e_pi = tf.nn.softmax(e_pi_logits)
                e_v = fc(nogradient_h, 'e_v', 1)[:, 0]

        a = sample(pi_logits)  # could change this to use self.pi instead

        self.initial_state = []  # not stateful
        self.X = X
        self.pi = pi  # actual policy params now
        self.q = q

        # for explore
        e_a = sample(e_pi_logits)  # could change this to use self.e_pi instead
        self.e_pi_logits = e_pi_logits
        self.e_pi = e_pi
        self.e_v = e_v

        def step(ob, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0 = sess.run([a, pi], {X: ob})
            return a0, pi0, []  # dummy state

        # for explore
        def e_step(ob, *args, **kwargs):
            e_a0, pi0, e_v0 = sess.run([e_a, pi, e_v], {X: ob})
            return e_a0, pi0, e_v0, []  # dummy state

        self.e_step = e_step

        def out(ob, *args, **kwargs):
            pi0, q0 = sess.run([pi, q], {X: ob})
            return pi0, q0

        def act(ob, *args, **kwargs):
            return sess.run(a, {X: ob})

        self.step = step
        self.out = out
        self.act = act
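
A sketch of how the two heads might be driven during rollouts; only the method names and return signatures come from the class above, everything else is assumed:

# Exploit head: sampled action plus the behaviour distribution mu.
a0, mu0, _ = policy.step(obs)
# Exploration head: its own sampled action, the main policy's mu (so
# off-policy corrections stay valid), and the exploration value estimate.
e_a0, mu0, e_v0, _ = policy.e_step(obs)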
Example #6
    def _init(self,
              ob_space,
              ac_space,
              gaussian_fixed_var=True,
              use_bias=True,
              use_critic=True,
              seed=None,
              hidden_W_init=U.normc_initializer(1.0),
              hidden_b_init=tf.zeros_initializer(),
              output_W_init=U.normc_initializer(0.01),
              output_b_init=tf.zeros_initializer()):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid_size: width of hidden layers
            num_hid_layers: depth
            gaussian_fixed_var: True->separate parameter for logstd, False->two-headed mlp
            use_bias: whether to include bias in neurons
        """
        assert isinstance(ob_space, gym.spaces.Box)

        if seed is not None:
            tf.set_random_seed(seed)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        #Critic
        if use_critic:
            raise NotImplementedError("Critic still not supported")
            # NOTE: the branch below is currently unreachable and still references
            # hid_size/num_hid_layers, which are not parameters of this method.
            with tf.variable_scope('vf'):
                obz = tf.clip_by_value(
                    (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(last_out,
                                        hid_size[i],
                                        name="fc%i" % (i + 1),
                                        kernel_initializer=hidden_W_init))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=hidden_W_init)[:, 0]

        #Actor
        with tf.variable_scope('pol'):
            last_out = nature_cnn(ob)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                self.mean = mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=output_W_init,
                    use_bias=use_bias)
                self.logstd = logstd = tf.get_variable(
                    name="pol_logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=output_b_init)
                # mean * 0.0 broadcasts logstd across the batch dimension
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out,
                                          pdtype.param_shape()[0],
                                          name='final',
                                          kernel_initializer=output_W_init)

        #Acting
        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        if use_critic:
            self._act = U.function([stochastic, ob], [ac, self.vpred])
        else:
            self._act = U.function([stochastic, ob], [ac, tf.zeros(1)])

        #Evaluating
        self.ob = ob
        self.ac_in = self.pdtype.sample_placeholder([sequence_length] +
                                                    list(ac_space.shape),
                                                    name='ac_in')
        self.gamma = U.get_placeholder(name="gamma",
                                       dtype=tf.float32,
                                       shape=[])
        self.rew = U.get_placeholder(name="rew",
                                     dtype=tf.float32,
                                     shape=[sequence_length] + [1])
        self.logprobs = self.pd.logp(self.ac_in)  #  [\log\pi(a|s)]

        #Fisher
        with tf.variable_scope('pol') as vs:
            self.weights = weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)
        self.flat_weights = flat_weights = tf.concat(
            [tf.reshape(w, [-1]) for w in weights], axis=0)
        self.n_weights = flat_weights.shape[0].value
        self.score = score = U.flatgrad(self.logprobs,
                                        weights)  # \nabla\log p(\tau)
        self.fisher = tf.einsum('i,j->ij', score, score)

        #Performance graph initializations
        self._setting = []
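
self.fisher above is the outer product of the score with itself, i.e. a one-trajectory estimate of the Fisher information matrix E[grad log p(tau) grad log p(tau)^T]. A sketch of evaluating it; pi, sess, the Box spaces, and the trajectory arrays are all assumptions here:

horizon = 100  # assumed trajectory length; ob_space/ac_space assumed Box spaces
ob_traj = np.zeros((horizon,) + ob_space.shape, dtype=np.float32)
ac_traj = np.zeros((horizon,) + ac_space.shape, dtype=np.float32)
fisher_hat = sess.run(pi.fisher, {pi.ob: ob_traj, pi.ac_in: ac_traj})
# score sums the per-step gradients, so the outer product is a
# one-sample (n_weights, n_weights) Fisher estimate
assert fisher_hat.shape == (pi.n_weights, pi.n_weights)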
Example #7
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nenv,
                 nsteps,
                 nstack,
                 num_nonspatial,
                 reuse=False):
        nbatch = nenv * nsteps
        nh, nw, nc = ob_space.shape
        ob_shape = (nbatch, nh, nw, nc * nstack)
        nact = ac_space.n
        X = tf.placeholder(tf.uint8, ob_shape)  # obs
        NonspatialX = tf.placeholder(tf.float32, (nbatch, num_nonspatial))
        with tf.variable_scope("model", reuse=reuse):
            with tf.variable_scope("acer"):
                h = nature_cnn(X)
                h = tf.concat([h, NonspatialX], axis=1)
                pi_logits = fc(h, 'pi', nact, init_scale=0.01)
                pi = tf.nn.softmax(pi_logits)
                q = fc(h, 'q', nact)

            with tf.variable_scope("explore"):
                # for explore
                nogradient_h = tf.stop_gradient(h)
                e_pi_logits = fc(nogradient_h, 'e_pi', nact, init_scale=0.01)
                e_pi = tf.nn.softmax(e_pi_logits)
                # e_v = fc(nogradient_h, 'e_v', 1)[:, 0]
                e_q = fc(nogradient_h, 'e_q', nact)

        # a = sample(pi_logits)  # could change this to use self.pi instead
        a = tf.squeeze(tf.multinomial(pi_logits, 1), 1)
        evaluate_a = tf.argmax(pi_logits, 1)

        self.initial_state = []  # not stateful
        self.X = X
        self.NonspatialX = NonspatialX
        self.pi = pi  # actual policy params now
        self.q = q

        # for explore
        # e_a = sample(e_pi_logits)  # could change this to use self.pi instead
        e_a = tf.squeeze(tf.multinomial(e_pi_logits, 1), 1)
        self.e_pi_logits = e_pi_logits
        self.e_pi = e_pi
        # self.e_v = e_v
        self.e_q = e_q

        def step(ob, nonspatial, *args, **kwargs):
            # returns actions, mus, states
            a0, pi0, e_pi0 = sess.run([a, pi, e_pi], {
                X: ob,
                NonspatialX: nonspatial
            })
            return a0, pi0, e_pi0, []  # dummy state

        def evaluate_step(ob, nonspatial, *args, **kwargs):
            evaluate_a0, pi0, e_pi0 = sess.run([evaluate_a, pi, e_pi], {
                X: ob,
                NonspatialX: nonspatial
            })
            return evaluate_a0, pi0, e_pi0, []  # dummy state

        self.evaluate_step = evaluate_step

        # for explore
        def e_step(ob, nonspatial, *args, **kwargs):
            e_a0, pi0, e_pi0 = sess.run([e_a, pi, e_pi], {
                X: ob,
                NonspatialX: nonspatial
            })
            return e_a0, pi0, e_pi0, []  # dummy state

        self.e_step = e_step

        def out(ob, nonspatial, *args, **kwargs):
            # pi and q depend on NonspatialX through the concat above,
            # so it has to be fed here as well
            pi0, q0 = sess.run([pi, q], {X: ob, NonspatialX: nonspatial})
            return pi0, q0

        def act(ob, nonspatial, *args, **kwargs):
            return sess.run(a, {X: ob, NonspatialX: nonspatial})

        self.step = step
        self.out = out
        self.act = act
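
The step functions here need both the stacked image batch and the per-step non-spatial features; a short usage sketch (shapes follow the placeholders above, everything else is assumed):

obs = np.zeros((nbatch, nh, nw, nc * nstack), dtype=np.uint8)
nonspatial = np.zeros((nbatch, num_nonspatial), dtype=np.float32)

a0, mu0, e_mu0, _ = policy.step(obs, nonspatial)           # sampled actions
a_greedy, _, _, _ = policy.evaluate_step(obs, nonspatial)  # argmax actions
e_a0, _, _, _ = policy.e_step(obs, nonspatial)             # exploration actions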