# Helpers used throughout these snippets (fc, conv, conv_to_fc, lstm, batch_to_seq,
# seq_to_batch, sample) and make_pdtype are assumed to come from the surrounding
# codebase (baselines-style utilities); they are not defined here.

# Feed-forward (MLP) actor-critic policy: two separate two-layer tanh towers of
# 64 units each for the policy and the value function.
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    ob_shape = (nbatch,) + ob_space.shape
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        flatten = tf.layers.flatten
        pi_h1 = activ(fc(flatten(X), 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(flatten(X), 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None  # policy is not recurrent

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs))
    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
# Convolutional actor-critic policy: a shared nature_cnn torso feeds both the
# pdtype-based policy head and a scalar value head.
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None  # policy is not recurrent

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
    return self.pdfromflat(pdparam), pdparam
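# --- Illustrative sketch, not part of the original code ----------------------
# For a categorical action space, the pdparam returned above is simply a vector
# of unnormalized logits. The self-contained TF 1.x snippet below mirrors what
# the resulting distribution's sample() and neglogp() compute from such logits
# (the actual distribution class may implement this differently); the tensor
# names and the toy batch are hypothetical.
import numpy as np
import tensorflow as tf

demo_logits = tf.placeholder(tf.float32, [None, 4], name='demo_logits')
# Draw one action per row from the softmax implied by the logits.
demo_action = tf.squeeze(tf.multinomial(demo_logits, num_samples=1), axis=1)
# Negative log-probability of the sampled action under the same softmax.
demo_neglogp = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=demo_action, logits=demo_logits)

with tf.Session() as demo_sess:
    acts, nlps = demo_sess.run(
        [demo_action, demo_neglogp],
        {demo_logits: np.random.randn(3, 4).astype(np.float32)})
    print(acts.shape, nlps.shape)  # (3,) and (3,)
# ------------------------------------------------------------------------------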
# Recurrent (LSTM) actor-critic policy with frame stacking; step() returns the
# sampled actions, the full action probabilities (mu), and the new LSTM state.
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)

        # lstm
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)

        pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h5, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi  # actual policy params now
    self.q = q

    def step(ob, state, mask, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
        return a0, pi0, s

    self.step = step
# Feed-forward convolutional actor-critic policy with frame stacking; exposes
# step()/out()/act() and Q-values over the discrete actions.
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    nbatch = nenv * nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc * nstack)
    nact = ac_space.n
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        pi_logits = fc(h, 'pi', nact, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(h, 'q', nact)

    a = sample(pi_logits)  # could change this to use self.pi instead
    self.initial_state = []  # not stateful
    self.X = X
    self.pi = pi  # actual policy params now
    self.q = q

    def step(ob, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0 = sess.run([a, pi], {X: ob})
        return a0, pi0, []  # dummy state

    def out(ob, *args, **kwargs):
        pi0, q0 = sess.run([pi, q], {X: ob})
        return pi0, q0

    def act(ob, *args, **kwargs):
        return sess.run(a, {X: ob})

    self.step = step
    self.out = out
    self.act = act
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    # Mean is squashed into (0, 1) with a sigmoid; log-std is a free variable
    # shared across the batch (state-independent).
    mean = tf.sigmoid(fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias))
    logstd = tf.get_variable(
        name='logstd', shape=[1, self.size],
        initializer=tf.zeros_initializer())  # tf.constant_initializer(-3)
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    return self.pdfromflat(pdparam), mean
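# --- Illustrative sketch, not part of the original code ----------------------
# The pdparam built above concatenates a (sigmoid-squashed) mean with a
# state-independent log-std. For a diagonal Gaussian those two halves determine
# the negative log-density; the NumPy helper below spells out that formula.
# The function and variable names are hypothetical.
import numpy as np

def demo_diag_gaussian_neglogp(x, mean, logstd):
    # -log N(x; mean, diag(exp(logstd)^2)), summed over action dimensions.
    std = np.exp(logstd)
    return (0.5 * np.sum(np.square((x - mean) / std), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * x.shape[-1]
            + np.sum(logstd, axis=-1))

demo_mean = np.array([[0.4, 0.6]])
demo_logstd = np.zeros((1, 2))   # std = 1, matching the zeros initializer above
demo_x = np.array([[0.4, 0.6]])
print(demo_diag_gaussian_neglogp(demo_x, demo_mean, demo_logstd))  # ~= log(2*pi) at the mean, 2 dims
# ------------------------------------------------------------------------------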
# Recurrent (LSTM) actor-critic policy: nature_cnn torso, a single LSTM layer,
# then a value head and a pdtype-based policy head on the LSTM output.
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    self.pdtype = make_pdtype(ac_space)
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
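# --- Illustrative sketch, not part of the original code ----------------------
# A recurrent policy like the one above has to be driven with its state and a
# done mask: the caller keeps the LSTM state returned by step(), feeds it back
# in, and passes the dones from the previous step so the state is reset at
# episode boundaries. The loop below sketches that bookkeeping with a
# hypothetical vectorized env `venv` and an already-constructed `policy`; the
# names `venv`, `venv.num_envs`, `venv.step()` and `policy` are assumptions
# made for illustration only.
import numpy as np

def demo_collect_rollout(policy, venv, nsteps):
    nenv = venv.num_envs
    obs = venv.reset()
    state = policy.initial_state                 # (nenv, 2 * nlstm) zeros
    dones = np.zeros(nenv, dtype=np.float32)     # "done at t-1" mask
    trajectory = []
    for _ in range(nsteps):
        actions, values, state, neglogps = policy.step(obs, state, dones)
        obs, rewards, dones, _ = venv.step(actions)
        dones = np.asarray(dones, dtype=np.float32)
        trajectory.append((actions, values, rewards, neglogps))
    return trajectory
# ------------------------------------------------------------------------------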