def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build a policy head and a value head, each on its own two-layer tanh MLP.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size, forwarded to ``observation_input``.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    self.pdtype = make_pdtype(ac_space)
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        obs_ph, features = observation_input(ob_space, nbatch)
        features = tf.layers.flatten(features)

        # Policy branch: two hidden layers of 64 units.
        policy_hidden = tf.tanh(fc(features, 'pi_fc1', nh=64, init_scale=gain))
        policy_latent = tf.tanh(fc(policy_hidden, 'pi_fc2', nh=64, init_scale=gain))

        # Value branch: same layout, separate variables.
        value_hidden = tf.tanh(fc(features, 'vf_fc1', nh=64, init_scale=gain))
        value_latent = tf.tanh(fc(value_hidden, 'vf_fc2', nh=64, init_scale=gain))
        value_out = fc(value_latent, 'vf', 1)[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(policy_latent, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a CNN with a softmax policy head and a per-action Q head.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space whose ``n`` gives the number of actions.
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: multiplied by ``nenv`` to obtain the batch size.
        nstack: multiplier applied to the channel dimension of the input.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        logits = fc(features, 'pi', num_actions, init_scale=0.01)
        action_probs = tf.nn.softmax(logits)
        q_values = fc(features, 'q', num_actions)

    sampled_action = sample(logits)  # could change this to use self.pi instead

    def step(ob, *args, **kwargs):
        # returns actions, mus, states
        a0, pi0 = sess.run([sampled_action, action_probs], {obs_ph: ob})
        return a0, pi0, []  # dummy state

    def out(ob, *args, **kwargs):
        pi0, q0 = sess.run([action_probs, q_values], {obs_ph: ob})
        return pi0, q0

    def act(ob, *args, **kwargs):
        return sess.run(sampled_action, {obs_ph: ob})

    self.initial_state = []  # not stateful
    self.X = obs_ph
    self.pi = action_probs  # actual policy params now
    self.q = q_values
    self.step = step
    self.out = out
    self.act = act
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build a CNN policy whose 'pi' output is wrapped by ``pdfromflat``.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space providing ``n`` and passed to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    height, width, channels = ob_space.shape
    input_shape = (nbatch, height, width, channels)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        policy_out = fc(features, 'pi', num_actions, init_scale=0.01)
        value_out = fc(features, 'v', 1)[:, 0]

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(policy_out)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.pi = policy_out
    self.vf = value_out
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    """Build a CNN + LSTM with a softmax policy head and a per-action Q head.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space whose ``n`` gives the number of actions.
        nenv: first dimension of the recurrent state placeholder.
        nsteps: with ``nenv`` determines the batch size (``nenv * nsteps``).
        nstack: multiplier applied to the channel dimension of the input.
        reuse: ``reuse`` flag for the "model" variable scope.
        nlstm: ``nh`` passed to ``lstm``; the state placeholder is twice as wide.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    mask_ph = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)

        # lstm
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(mask_ph, nenv, nsteps)
        lstm_out, next_state = lstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        logits = fc(lstm_out, 'pi', num_actions, init_scale=0.01)
        action_probs = tf.nn.softmax(logits)
        q_values = fc(lstm_out, 'q', num_actions)

    sampled_action = sample(logits)  # could change this to use self.pi instead

    def step(ob, state, mask, *args, **kwargs):
        # returns actions, mus, states
        feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
        a0, pi0, s = sess.run([sampled_action, action_probs, next_state], feed)
        return a0, pi0, s

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.pi = action_probs  # actual policy params now
    self.q = q_values
    self.step = step
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a three-conv + fc network with 'pi' and 'v' output layers.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space whose ``n`` gives the number of actions.
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: multiplied by ``nenv`` to obtain the batch size.
        nstack: multiplier applied to the channel dimension of the input.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(obs_ph, tf.float32) / 255.
        conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(conv3)
        hidden = fc(flat, 'fc1', nh=512, init_scale=gain)
        # Identity is passed as the activation for both output layers.
        policy_out = fc(hidden, 'pi', num_actions, act=lambda x: x)
        value_2d = fc(hidden, 'v', 1, act=lambda x: x)

    value_flat = value_2d[:, 0]
    sampled_action = sample(policy_out)

    def step(ob, *_args, **_kwargs):
        a, v = sess.run([sampled_action, value_flat], {obs_ph: ob})
        return a, v, []  # dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(value_flat, {obs_ph: ob})

    self.initial_state = []  # not stateful
    self.X = obs_ph
    self.pi = policy_out
    self.vf = value_2d
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build policy and value MLPs fed from a float32 placeholder named 'Ob'.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` is appended to ``(nbatch,)``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    input_shape = (nbatch,) + ob_space.shape
    self.pdtype = make_pdtype(ac_space)
    obs_ph = tf.placeholder(tf.float32, input_shape, name='Ob')  # obs
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        # The input is flattened separately for each branch.
        policy_in = tf.layers.flatten(obs_ph)
        policy_hidden = tf.tanh(fc(policy_in, 'pi_fc1', nh=64, init_scale=gain))
        policy_latent = tf.tanh(fc(policy_hidden, 'pi_fc2', nh=64, init_scale=gain))

        value_in = tf.layers.flatten(obs_ph)
        value_hidden = tf.tanh(fc(value_in, 'vf_fc1', nh=64, init_scale=gain))
        value_latent = tf.tanh(fc(value_hidden, 'vf_fc2', nh=64, init_scale=gain))
        value_out = fc(value_latent, 'vf', 1)[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(policy_latent, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.value = value
def CNN7(unscaled_images, index, filmObj):
    """Three separable convolutions plus a 512-unit fc layer, with a
    feature-wise scale/shift applied to the second convolution's output.

    Args:
        unscaled_images: image tensor; cast to float32 and divided by 255.
        index: selects which 64-wide slice of ``filmObj.film_w_2`` /
            ``filmObj.film_b_2`` is used (slice begins at ``index * 64``).
        filmObj: object exposing ``film_w_2`` and ``film_b_2``.

    Returns:
        ReLU output of the 'fc1' layer.
    """
    weight_init = tf.contrib.layers.variance_scaling_initializer()
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=weight_init):
        images = tf.cast(unscaled_images, tf.float32) / 255.
        relu = tf.nn.relu

        # Only the second layer is modulated; layers one and three are not.
        scale_2 = tf.slice(filmObj.film_w_2, index * 64, [64])
        shift_2 = tf.slice(filmObj.film_b_2, index * 64, [64])

        conv1 = slim.separable_conv2d(images, 32, 8, 1, 4)

        conv2 = slim.separable_conv2d(conv1, 64, 4, 1, 2)
        conv2 = tf.math.add(tf.multiply(conv2, scale_2), shift_2)

        conv3 = slim.separable_conv2d(conv2, 48, 3, 1, 1)
        flat = conv_to_fc(conv3)
        return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    """Build a ``nature_cnn`` policy with a value head and a distribution head.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size, forwarded to ``observation_input``.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
        **conv_kwargs: forwarded to ``nature_cnn``.
    """
    self.pdtype = make_pdtype(ac_space)
    obs_ph, net_input = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(net_input, **conv_kwargs)
        value_out = fc(features, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(features, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.value = value
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    """Project a latent vector to ``self.ncat`` outputs and wrap them.

    Args:
        latent_vector: input to the 'pi' fully connected layer.
        init_scale: forwarded to ``fc``.
        init_bias: forwarded to ``fc``.

    Returns:
        Tuple of (``self.pdfromflat(params)``, ``params``) where ``params``
        is the output of the 'pi' layer.
    """
    flat_params = fc(
        latent_vector,
        'pi',
        self.ncat,
        init_scale=init_scale,
        init_bias=init_bias,
    )
    distribution = self.pdfromflat(flat_params)
    return distribution, flat_params
def cnn7(unscaled_images, **conv_kwargs):
    """Three separable convolutions followed by a 512-unit ReLU fc layer.

    ``conv_kwargs`` is accepted but not used by this function.

    Variables reported for a 96x96 network:
        model/SeparableConv2d/depthwise_weights:0 (8, 8, 4, 1)
        model/SeparableConv2d/pointwise_weights:0 (1, 1, 4, 32)
        model/SeparableConv2d/biases:0 (32,)
        model/SeparableConv2d_1/depthwise_weights:0 (4, 4, 32, 1)
        model/SeparableConv2d_1/pointwise_weights:0 (1, 1, 32, 64)
        model/SeparableConv2d_1/biases:0 (64,)
        model/SeparableConv2d_2/depthwise_weights:0 (3, 3, 64, 1)
        model/SeparableConv2d_2/pointwise_weights:0 (1, 1, 64, 48)
        model/SeparableConv2d_2/biases:0 (48,)
        model/fc1/w:0 (6912, 512)
        model/fc1/b:0 (512,)
        model/v/w:0 (512, 1)
        model/v/b:0 (1,)
        model/pi/w:0 (512, 7)
        model/pi/b:0 (7,)
    Trainable variables: 3550296
    """
    weight_init = tf.contrib.layers.variance_scaling_initializer()
    with slim.arg_scope([slim.conv2d, slim.separable_conv2d],
                        activation_fn=tf.nn.relu,
                        weights_initializer=weight_init):
        images = tf.cast(unscaled_images, tf.float32) / 255.
        relu = tf.nn.relu
        conv1 = slim.separable_conv2d(images, 32, 8, 1, 4)
        conv2 = slim.separable_conv2d(conv1, 64, 4, 1, 2)
        conv3 = slim.separable_conv2d(conv2, 48, 3, 1, 1)
        flat = conv_to_fc(conv3)
        return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def network_fn(X, nenv=1):
    """Run ``nature_cnn`` + fc on ``X`` and feed the result through an LSTM.

    Reads ``conv_kwargs``, ``nh``, ``nlstm`` and ``layer_norm`` from the
    enclosing scope.

    Args:
        X: input tensor; its first dimension is taken as the batch size.
        nenv: number of rows in the recurrent state placeholder.

    Returns:
        Tuple ``(fm, h, extras)``: the ``nature_cnn`` output, the batched
        LSTM output, and a dict with keys 'S', 'M', 'state', 'initial_state'.
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    feature_map = nature_cnn(X, **conv_kwargs)
    flat_features = conv_to_fc(feature_map)
    hidden = tf.nn.relu(fc(flat_features, 'fc1', nh=nh, init_scale=np.sqrt(2)))

    mask_ph = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # states

    hidden_seq = batch_to_seq(hidden, nenv, steps_per_env)
    mask_seq = batch_to_seq(mask_ph, nenv, steps_per_env)

    if layer_norm:
        lstm_out, next_state = utils.lnlstm(hidden_seq, mask_seq, state_ph, scope='lnlstm', nh=nlstm)
    else:
        lstm_out, next_state = utils.lstm(hidden_seq, mask_seq, state_ph, scope='lstm', nh=nlstm)

    batched_out = seq_to_batch(lstm_out)
    zero_state = np.zeros(state_ph.shape.as_list(), dtype=float)

    extras = {
        'S': state_ph,
        'M': mask_ph,
        'state': next_state,
        'initial_state': zero_state,
    }
    return feature_map, batched_out, extras
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from Nature paper.

    The input is cast to float32 and divided by 255, passed through three
    ReLU convolutions ('c1', 'c2', 'c3') and a 512-unit ReLU fc layer
    ('fc1'). ``conv_kwargs`` is forwarded to every ``conv`` call.
    """
    relu = tf.nn.relu
    out = tf.cast(unscaled_images, tf.float32) / 255.
    # (name, filters, kernel size, stride) for each convolution, in order.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    for name, filters, kernel, stride in conv_specs:
        out = relu(conv(out, name, nf=filters, rf=kernel, stride=stride,
                        init_scale=np.sqrt(2), **conv_kwargs))
    flat = conv_to_fc(out)
    return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, filmObj, reuse=False, st="act", **conv_kwargs): #pylint: disable=W0613
    """Build a ``CNN7`` policy whose features are modulated via ``filmObj``.

    Also prints every trainable variable (name and shape) and the total
    trainable parameter count.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: divided by the module-level ``nenvs`` to get the leading
            dimension of the observation placeholder.
        nsteps: not used by this constructor.
        filmObj: forwarded to ``CNN7``.
        reuse: ``reuse`` flag for the "model" variable scope.
        st: not used by this constructor.
        **conv_kwargs: accepted but not forwarded to ``CNN7``.
    """
    nh, nw, nc = ob_space.shape
    ob_shape = (int(nbatch / nenvs), nh, nw, nc)  # Use this
    self.pdtype = make_pdtype(ac_space)
    index = tf.placeholder(tf.int32, [1])
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        h = CNN7(X, index, filmObj)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    # Report the network layout. A plain loop is used instead of a list
    # comprehension so no throwaway list of None values is built.
    trainable = tf.trainable_variables()
    print("Network:")
    for var in trainable:
        print(var.name, var.shape)
    print("Trainable variables:")
    print(np.sum([np.prod(var.get_shape()) for var in trainable]))

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, idx, *_args, **_kwargs):
        # idx is wrapped in a list to match the shape-[1] index placeholder.
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob, index: [idx]})
        return a, v, self.initial_state, neglogp

    def value(ob, idx, *_args, **_kwargs):
        return sess.run(vf, {X: ob, index: [idx]})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.index = index
def cnn(unscaled_images, scope, activ=None, nfeat=None, reuse=False):
    """Three-convolution feature extractor ending in a linear fc layer.

    Args:
        unscaled_images: image tensor; cast to float32 and divided by 255.
        scope: prefix for every layer name ('<scope>_conv1', ...).
        activ: activation applied after each convolution; falls back to
            ``tf.nn.leaky_relu`` when falsy.
        nfeat: width of the final fc layer; falls back to 512 when falsy.
        reuse: forwarded to every ``conv`` and ``fc`` call.

    Returns:
        Output of the '<scope>_conv_to_fc' layer (no activation applied).
    """
    out = tf.cast(unscaled_images, tf.float32) / 255.
    activation = activ or tf.nn.leaky_relu
    num_features = nfeat or 512
    # (name suffix, filters, kernel size, stride) for each convolution.
    conv_specs = (
        ('_conv1', 32, 8, 4),
        ('_conv2', 64, 4, 2),
        ('_conv3', 64, 3, 1),
    )
    for suffix, filters, kernel, stride in conv_specs:
        out = activation(conv(out, scope + suffix, nf=filters, rf=kernel,
                              stride=stride, init_scale=np.sqrt(2), reuse=reuse))
    flat = conv_to_fc(out)
    return fc(flat, scope + '_conv_to_fc', nh=num_features,
              init_scale=np.sqrt(2), reuse=reuse)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a ``nature_cnn`` + LSTM policy with value and distribution heads.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: leading dimension of the observation and mask placeholders.
        nsteps: divides ``nbatch`` to give the number of state rows.
        nlstm: ``nh`` passed to ``lstm``; the state placeholder is twice as wide.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    num_envs = nbatch // nsteps
    height, width, channels = ob_space.shape
    input_shape = (nbatch, height, width, channels)
    self.pdtype = make_pdtype(ac_space)
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        feature_seq = batch_to_seq(features, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, next_state = lstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        value_2d = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    value_flat = value_2d[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, state, mask):
        fetches = [sampled_action, value_flat, next_state, sampled_neglogp]
        return sess.run(fetches, {obs_ph: ob, state_ph: state, mask_ph: mask})

    def value(ob, state, mask):
        return sess.run(value_flat, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((num_envs, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.vf = value_2d
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a ``nature_cnn`` + LSTM policy fed by ``observation_input``.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size for the observation input and mask placeholder.
        nsteps: divides ``nbatch`` to give the number of state rows.
        nlstm: ``nh`` passed to ``lstm``; the state placeholder is twice as wide.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    num_envs = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    # NOTE(review): the processed tensor returned here is never used; the
    # CNN below consumes the raw placeholder. Confirm this is intended.
    obs_ph, _processed = observation_input(ob_space, nbatch)
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        feature_seq = batch_to_seq(features, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, next_state = lstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        value_2d = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    value_flat = value_2d[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, state, mask):
        fetches = [sampled_action, value_flat, next_state, sampled_neglogp]
        return sess.run(fetches, {obs_ph: ob, state_ph: state, mask_ph: mask})

    def value(ob, state, mask):
        return sess.run(value_flat, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((num_envs, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.vf = value_2d
    self.step = step
    self.value = value
def nature_cnn(unscaled_images, scope, **conv_kwargs):
    """
    CNN from Nature paper, built inside the variable scope ``scope``.

    The input is cast to float32 and divided by 255, then passed through
    three ReLU convolutions and a 512-unit ReLU fc layer. ``conv_kwargs``
    is forwarded to every ``conv`` call.
    """
    with tf.variable_scope(scope):
        out = tf.cast(unscaled_images, tf.float32) / 255.
        relu = tf.nn.relu
        # (name, filters, kernel size, stride); the first layer uses the
        # 8x8 filter that looks directly at the input image.
        conv_specs = (
            ('c1', 32, 8, 4),
            ('c2', 64, 4, 2),
            ('c3', 64, 3, 1),
        )
        for name, filters, kernel, stride in conv_specs:
            out = relu(conv(out, name, nf=filters, rf=kernel, stride=stride,
                            init_scale=np.sqrt(2), **conv_kwargs))
        flat = conv_to_fc(out)
        return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    """Build a ``cnn7`` policy with a value head and a distribution head.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
        **conv_kwargs: forwarded to ``cnn7``.
    """
    height, width, channels = ob_space.shape
    input_shape = (nbatch, height, width, channels)
    self.pdtype = make_pdtype(ac_space)
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        features = cnn7(obs_ph, **conv_kwargs)
        value_out = fc(features, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(features, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        # The a value returned here defines the action that Sonic takes.
        # It is a vector of one element (action index)
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.value = value
def final_nn(input_final_nn, fnn_args, no_actions, initializer):
    """Stack ReLU fully connected layers on top of ``input_final_nn``.

    Args:
        input_final_nn: input tensor (r, c, w and w_ex concatenated,
            per the original author's note).
        fnn_args: iterable of layer widths; one fc layer is built per entry.
        no_actions: not used by the current implementation.
        initializer: forwarded to every ``fc`` call.

    Returns:
        Output of the last ReLU layer, or ``input_final_nn`` unchanged when
        ``fnn_args`` is empty.
    """
    relu = tf.nn.relu
    out = input_final_nn
    for layer_idx, width in enumerate(fnn_args):
        pre_activation = fc(out, 'fnn_fc{}'.format(layer_idx), nh=width,
                            initializer=initializer, init_scale=np.sqrt(2))
        # The second argument names the ReLU op.
        out = relu(pre_activation, 'fnn_fc_relu{}'.format(layer_idx))
    return out
def nature_cnn(unscaled_images, keep_probs, **conv_kwargs):
    """
    Three 3x3 stride-1 ReLU convolutions, a 512-unit ReLU fc layer, then
    dropout.

    Unlike the other variants, the input is used as given: no cast and no
    division by 255 is applied here.

    Args:
        unscaled_images: input tensor fed to the first convolution.
        keep_probs: ``keep_prob`` passed to ``tf.nn.dropout``.
        **conv_kwargs: forwarded to every ``conv`` call.
    """
    relu = tf.nn.relu
    out = unscaled_images
    # (name, filters) for each convolution; all use rf=3 and stride=1.
    for name, filters in (('c1', 32), ('c2', 64), ('c3', 64)):
        out = relu(conv(out, name, nf=filters, rf=3, stride=1,
                        init_scale=np.sqrt(2), **conv_kwargs))
    flat = conv_to_fc(out)
    hidden = relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
    return tf.nn.dropout(hidden, keep_prob=keep_probs)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256):
    """Build a ``choose_cnn`` + LSTM policy under an AUTO_REUSE "model" scope.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size for the observation input and mask placeholder.
        nsteps: divides ``nbatch`` to give the number of state rows.
        nlstm: ``nh`` passed to ``lstm``; the state placeholder is twice as wide.
    """
    num_envs = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    obs_ph, net_input = observation_input(ob_space, nbatch)
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # choose_cnn returns the features and a second value stored as
        # self.dropout_assign_ops.
        features, self.dropout_assign_ops = choose_cnn(net_input)
        feature_seq = batch_to_seq(features, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, next_state = lstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        value_out = fc(lstm_out, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, state, mask):
        fetches = [sampled_action, value_out, next_state, sampled_neglogp]
        return sess.run(fetches, {obs_ph: ob, state_ph: state, mask_ph: mask})

    def value(ob, state, mask):
        return sess.run(value_out, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((num_envs, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.vf = value_out
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    """Build a policy on top of ``policies.nature_cnn``.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for the "model" variable scope.
        **conv_kwargs: forwarded to ``policies.nature_cnn``.
    """
    height, width, channels = ob_space.shape
    input_shape = (nbatch, height, width, channels)
    self.pdtype = make_pdtype(ac_space)
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    with tf.variable_scope("model", reuse=reuse):
        features = policies.nature_cnn(obs_ph, **conv_kwargs)
        value_out = fc(features, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(features, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        # Returns (action, value, state, neglogp); state is always initial_state.
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.value = value
def nature_cnn(scaled_images, **conv_kwargs):
    """
    Model used in the paper "Human-level control through deep reinforcement
    learning" https://www.nature.com/articles/nature14236

    The input is consumed as given (no scaling is applied here).
    ``conv_kwargs`` is forwarded to every ``conv`` call.
    """
    out = scaled_images
    # (name, filters, kernel size, stride) for each convolution, in order.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    for name, filters, kernel, stride in conv_specs:
        pre_activation = conv(out, name, nf=filters, rf=kernel, stride=stride,
                              init_scale=np.sqrt(2), **conv_kwargs)
        out = tf.nn.relu(pre_activation)
    flat = conv_to_fc(out)
    return tf.nn.relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nbatch, reuse=False, **kwargs): #pylint: disable=W0613
    """Build policy/value MLPs and expose four evaluation callables.

    Besides ``step`` and ``value``, this installs ``step_policyflat`` (which
    also returns ``self.pd.flat``) and ``step_test`` (which evaluates
    ``self.pd.mean``).

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size, forwarded to ``observation_input``.
        reuse: ``reuse`` flag for the "model" variable scope.
        **kwargs: accepted and ignored.
    """
    self.pdtype = make_pdtype(ac_space)
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        obs_ph, features = observation_input(ob_space, nbatch)
        features = tf.layers.flatten(features)

        policy_hidden = tf.tanh(fc(features, 'pi_fc1', nh=64, init_scale=gain))
        policy_latent = tf.tanh(fc(policy_hidden, 'pi_fc2', nh=64, init_scale=gain))

        value_hidden = tf.tanh(fc(features, 'vf_fc1', nh=64, init_scale=gain))
        value_latent = tf.tanh(fc(value_hidden, 'vf_fc2', nh=64, init_scale=gain))
        value_out = fc(value_latent, 'vf', 1)[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(policy_latent, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    def step_policyflat(ob, *_args, **_kwargs):
        # Same as step, with the flat distribution parameters appended.
        fetches = [sampled_action, value_out, sampled_neglogp, self.pd.flat]
        a, v, neglogp, policy_flat = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp, policy_flat

    def step_test(ob, *_args, **_kwargs):
        # The fetch is a one-element list, so the result is a list too.
        return sess.run([self.pd.mean], {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.step_policyflat = step_policyflat
    self.value = value
    self.step_test = step_test
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
    """
    Attach an action distribution and a value (or Q) head to latent features.

    Parameters:
    ----------
    env             RL environment; its ``action_space`` selects the
                    distribution type

    observations    tensorflow placeholder in which the observations will be fed

    latent          latent state from which policy distribution parameters
                    should be inferred

    estimate_q      when true, build an 'q' layer with one output per action
                    (requires a Discrete action space) instead of the
                    single-output 'vf' layer

    vf_latent       latent state from which value function should be inferred
                    (if None, then latent is used)

    sess            tensorflow session to run calculations in (if None,
                    default session is used)

    **tensors       tensorflow tensors for additional attributes such as
                    state or mask; copied onto the instance
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    # Extra tensors are applied last so they may override the defaults above.
    self.__dict__.update(tensors)

    if vf_latent is None:
        vf_latent = latent
    value_features = tf.layers.flatten(vf_latent)
    policy_features = tf.layers.flatten(latent)

    # The action space decides which probability distribution type is used.
    self.pdtype = make_pdtype(env.action_space)
    self.pd, self.pi = self.pdtype.pdfromlatent(policy_features, init_scale=0.01)

    # Sample an action and compute its negative log-probability.
    self.action = self.pd.sample()
    self.neglogp = self.pd.neglogp(self.action)

    self.sess = sess or tf.get_default_session()

    if estimate_q:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(value_features, 'q', env.action_space.n)
        self.vf = self.q
    else:
        self.vf = fc(value_features, 'vf', 1)
        self.vf = self.vf[:, 0]
def __call__(self, obs, reuse=True):
    """Map observations to tanh-squashed actions.

    Args:
        obs: input passed to ``self.network_builder``.
        reuse: not used; the scope is always opened with ``tf.AUTO_REUSE``.

    Returns:
        ``tanh`` of the "pi/mean" layer, with ``self.ac_space.shape[0]`` outputs.
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        latent = self.network_builder(obs)
        mean = fc(latent, "pi/mean", self.ac_space.shape[0])
        squashed = tf.nn.tanh(mean)
    return squashed
def _create_qf(self, policy_latent, vf_latent):
    """Build a scalar Q head from concatenated policy and value latents.

    Args:
        policy_latent: first tensor in the axis-1 concatenation.
        vf_latent: second tensor in the axis-1 concatenation.

    Returns:
        Column 0 of the single-output 'qf' layer.
    """
    with tf.variable_scope('qf', reuse=tf.AUTO_REUSE):
        joint_latent = tf.concat([policy_latent, vf_latent], axis=1)
        hidden = self.q_network(joint_latent)
        q_2d = fc(hidden, 'qf', 1)
        return q_2d[:, 0]
def network_fn(X, action):
    """Apply five 1-D convolutions to ``X``, append ``action``, then three fc layers.

    Args:
        X: input tensor; ``X.shape[1]`` is used as the width when the
            convolution output is reshaped to ``[-1, width]``.
        action: tensor concatenated to the reshaped features along axis 1.

    Returns:
        ``tanh`` of the final 16-unit fc layer.
    """
    width = X.shape[1]

    # (filters, kernel size, scope) for each 1-D convolution, in order.
    conv_specs = (
        (20, 5, 'cnn1d_c1'),
        (15, 3, 'cnn1d_c2'),
        (10, 3, 'cnn1d_c3'),
        (5, 3, 'cnn1d_c4'),
        (1, 3, 'cnn1d_c5'),
    )
    out = X
    for filters, kernel, scope in conv_specs:
        out = layers.conv1d(out, filters, kernel, scope=scope)

    out = tf.reshape(out, [-1, width])
    out = tf.concat([out, action], 1)

    # The fc layers are stacked without an activation in between.
    for name, units in (('cnn1d_fc1', 32), ('cnn1d_fc2', 24), ('cnn1d_fc3', 16)):
        out = fc(out, name, nh=units, init_scale=np.sqrt(2))

    return tf.tanh(out)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, taskScope="Task0"): #pylint: disable=W0613
    """Build task-scoped policy/value MLPs with a deterministic step function.

    Variables live under ``taskScope + '/model'``. ``detStep`` evaluates
    ``self.pi`` directly instead of sampling.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size, forwarded to ``observation_input``.
        nsteps: not used by this constructor.
        reuse: ``reuse`` flag for both variable scopes.
        taskScope: prefix for the variable scope names.
    """
    self.pdtype = make_pdtype(ac_space)
    gain = np.sqrt(2)
    with tf.variable_scope(taskScope + '/model', reuse=reuse):
        obs_ph, features = observation_input(ob_space, nbatch)
        features = tf.layers.flatten(features)

        policy_hidden = tf.tanh(fc(features, 'pi_fc1', nh=64, init_scale=gain))
        policy_latent = tf.tanh(fc(policy_hidden, 'pi_fc2', nh=64, init_scale=gain))

        value_hidden = tf.tanh(fc(features, 'vf_fc1', nh=64, init_scale=gain))
        value_latent = tf.tanh(fc(value_hidden, 'vf_fc2', nh=64, init_scale=gain))
        value_out = fc(value_latent, 'vf', 1)[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(policy_latent, init_scale=0.01)

    with tf.variable_scope(taskScope + '/modelVars', reuse=reuse):
        policy_mean = self.pi

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        fetches = [sampled_action, value_out, sampled_neglogp]
        a, v, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v, self.initial_state, neglogp

    def detStep(ob, *_args, **_kwargs):
        # The fetch is a one-element list, so the result is a list too.
        return sess.run([policy_mean], {obs_ph: ob})

    def value(ob, *_args, **_kwargs):
        return sess.run(value_out, {obs_ph: ob})

    self.initial_state = None
    self.X = obs_ph
    self.vf = value_out
    self.step = step
    self.detStep = detStep
    self.value = value
def network_fn(X):
    """Flatten ``X`` and apply a stack of fully connected layers.

    Reads ``num_layers``, ``num_hidden``, ``layer_norm`` and ``activation``
    from the enclosing scope. Each layer is fc, then optional layer
    normalisation, then the activation.

    Returns:
        Output of the last layer, or the flattened input when
        ``num_layers`` is zero.
    """
    out = tf.layers.flatten(X)
    for layer_idx in range(num_layers):
        layer_name = 'mlp_fc{}'.format(layer_idx)
        out = fc(out, layer_name, nh=num_hidden, init_scale=np.sqrt(2))
        if layer_norm:
            out = tf.contrib.layers.layer_norm(out, center=True, scale=True)
        out = activation(out)
    return out
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a tanh fc layer followed by an LSTM, with value and policy heads.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: observation space, forwarded to ``observation_input``.
        ac_space: action space, forwarded to ``make_pdtype``.
        nbatch: batch size for the observation input and mask placeholder.
        nsteps: divides ``nbatch`` to give the number of state rows.
        nlstm: ``nh`` passed to ``lstm``; the state placeholder is twice as wide.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    num_envs = nbatch // nsteps
    obs_ph, net_input = observation_input(ob_space, nbatch)
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, nlstm * 2])  # states
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        # A single 64-unit tanh layer is used in place of a CNN.
        embedded = tf.tanh(fc(net_input, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        embedded_seq = batch_to_seq(embedded, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, next_state = lstm(embedded_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        value_2d = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    value_flat = value_2d[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, state, mask):
        fetches = [sampled_action, value_flat, next_state, sampled_neglogp]
        feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
        return sess.run(fetches, feed)

    def value(ob, state, mask):
        return sess.run(value_flat, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((num_envs, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.vf = value_2d
    self.step = step
    self.value = value
def __call__(self, obs, action, reuse=True):
    """Compute a single-output Q value from an observation/action pair.

    Args:
        obs: observation tensor.
        action: action tensor; must be concatenable with ``obs`` on the
            last axis.
        reuse: not used; the scope is always opened with ``tf.AUTO_REUSE``.

    Returns:
        Output of the single-unit "last_fc" layer.
    """
    with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
        # this assumes observation and action can be concatenated
        joint_input = tf.concat([obs, action], axis=-1)
        latent = self.network_builder(joint_input)
        q_value = fc(latent, "last_fc", 1)
    return q_value
def last_linear_hidden_layer(x, actions=None, d=512, **conv_kwargs):
    """Run ``dcgan_cnn`` on ``x`` and project the result to ``d`` units.

    Args:
        x: input forwarded to ``dcgan_cnn``.
        actions: optional tensor concatenated in front of the CNN output
            along axis 1.
        d: width of the 'fc1' layer.
        **conv_kwargs: forwarded to ``dcgan_cnn``.

    Returns:
        ``leaky_relu('h_final', ...)`` applied to the 'fc1' output.
    """
    features = dcgan_cnn(x, **conv_kwargs)
    if actions is not None:
        features = tf.concat([actions, features], axis=1)
    projected = fc(features, 'fc1', nh=d, init_scale=np.sqrt(2))
    return leaky_relu('h_final', projected)
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    """Project a latent vector and reshape it to ``[-1, npol, nac]``.

    Args:
        latent_vector: input to the 'pi' fully connected layer.
        init_scale: forwarded to ``fc``.
        init_bias: forwarded to ``fc``.

    Returns:
        Tuple of (``self.pdfromflat(params)``, ``params``) where ``params``
        is the reshaped 'pi' output.
    """
    flat_width = self.nac * self.npol
    flat_params = fc(latent_vector, 'pi', flat_width,
                     init_scale=init_scale, init_bias=init_bias)
    shaped_params = tf.reshape(flat_params, [-1, self.npol, self.nac])
    distribution = self.pdfromflat(shaped_params)
    return distribution, shaped_params
def network_fn(X, mode="pi"):
    """Three-convolution network with optional batch norm and dropout per mode.

    Reads ``conv_kwargs`` and the batch-norm / dropout settings
    (``batchnormpi``, ``batchnormvf``, ``isbnpitrainmode``,
    ``isbnvftrainmode``, ``dropoutpi``, ``dropoutvf``,
    ``dropoutpi_keep_prob``, ``dropoutvf_keep_prob``) from the enclosing scope.

    Args:
        X: image tensor; cast to float32 and divided by 255.
        mode: "pi" or "vf"; selects which batch-norm / dropout settings apply.

    Returns:
        ReLU output of the 512-unit 'fc1' layer.
    """
    # Only these keys are forwarded to conv, and only when present.
    allowed_keys = ("pad", "data_format", "one_dim_bias")
    filtered_conv_kwargs = {
        key: conv_kwargs[key] for key in allowed_keys if key in conv_kwargs
    }

    scaled_images = tf.cast(X, tf.float32) / 255.
    relu = tf.nn.relu
    batch_norm = tf.contrib.layers.batch_norm
    dropout = tf.nn.dropout

    def addbndrp(h):
        # Order per layer: optional batch norm, ReLU, optional dropout.
        use_batch_norm = (batchnormpi and mode == "pi") or (batchnormvf and mode == "vf")
        if use_batch_norm:
            training_flag = isbnpitrainmode if mode == "pi" else isbnvftrainmode
            h = batch_norm(h, center=True, scale=True,
                           is_training=training_flag, updates_collections=None)
        h = relu(h)
        if dropoutpi < 1.0 and mode == "pi":
            h = dropout(h, keep_prob=dropoutpi_keep_prob)
        if dropoutvf < 1.0 and mode == "vf":
            h = dropout(h, keep_prob=dropoutvf_keep_prob)
        return h

    out = scaled_images
    # (name, filters, kernel size, stride) for each convolution, in order.
    for name, filters, kernel, stride in (('c1', 32, 8, 4), ('c2', 64, 4, 2), ('c3', 64, 3, 1)):
        out = addbndrp(conv(out, name, nf=filters, rf=kernel, stride=stride,
                            init_scale=np.sqrt(2), **filtered_conv_kwargs))
    flat = conv_to_fc(out)
    return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build two ``nature_cnn`` towers: 'intrinsic' and 'policy'.

    The 'intrinsic' scope yields a per-action tanh output (``r_in``) and a
    value (``v_ex``); the 'policy' scope yields the policy output (``pi``)
    and a second value (``v_mix``).

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` is appended to ``(nbatch,)``.
        ac_space: space providing ``n`` and passed to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: used only to compute an unused local.
        reuse: ``reuse`` flag for both variable scopes.
    """
    nenv = nbatch // nsteps  # not referenced below
    input_shape = (nbatch, ) + ob_space.shape
    num_actions = ac_space.n
    obs_ph = tf.compat.v1.placeholder(tf.float32, input_shape, name='Ob')  # obs

    with tf.compat.v1.variable_scope('intrinsic', reuse=reuse):
        intrinsic_features = nature_cnn(obs_ph)
        intrinsic_reward_all = tf.tanh(fc(intrinsic_features, 'r_in', num_actions))
        extrinsic_value = fc(intrinsic_features, 'v_ex', 1)[:, 0]

    with tf.compat.v1.variable_scope('policy', reuse=reuse):
        policy_features = nature_cnn(obs_ph)
        policy_out = fc(policy_features, 'pi', num_actions, init_scale=0.01)
        mixed_value = fc(policy_features, 'v_mix', 1)[:, 0]

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(policy_out)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, *_args, **_kwargs):
        fetches = [sampled_action, extrinsic_value, mixed_value, sampled_neglogp]
        a, v_ex, v_mix, neglogp = sess.run(fetches, {obs_ph: ob})
        return a, v_ex, v_mix, self.init_policy_state, neglogp

    def value(ob, *_args, **_kwargs):
        v_ex, v_mix = sess.run([extrinsic_value, mixed_value], {obs_ph: ob})
        return v_ex, v_mix

    def intrinsic_reward(ob, ac, *_args, **_kwargs):
        # Pick, for each row, the entry belonging to the action in ac.
        r_in = sess.run(intrinsic_reward_all, {obs_ph: ob})
        return r_in[np.arange(nbatch), ac]

    self.init_policy_state = None
    self.X = obs_ph
    self.r_in = intrinsic_reward_all
    self.v_ex = extrinsic_value
    self.pi = policy_out
    self.v_mix = mixed_value
    self.step = step
    self.value = value
    self.intrinsic_reward = intrinsic_reward
    self.policy_params = tf.compat.v1.trainable_variables("policy")
    self.intrinsic_params = tf.compat.v1.trainable_variables("intrinsic")
    self.policy_new_fn = CnnPolicyNew
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a three-conv + fc network whose ``step`` also returns action probabilities.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space whose ``n`` gives the number of actions.
        nenv: multiplied by ``nsteps`` to obtain the batch size.
        nsteps: multiplied by ``nenv`` to obtain the batch size.
        nstack: multiplier applied to the channel dimension of the input.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(obs_ph, tf.float32) / 255.
        conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(conv3)
        hidden = fc(flat, 'fc1', nh=512, init_scale=gain)
        # Identity is passed as the activation for both output layers.
        policy_out = fc(hidden, 'pi', num_actions, act=lambda x: x)
        value_2d = fc(hidden, 'v', 1, act=lambda x: x)

    value_flat = value_2d[:, 0]
    sampled_action = sample(policy_out)
    action_probs = tf.nn.softmax(policy_out)  # action probs

    def step(ob, *_args, **_kwargs):
        fetches = [sampled_action, value_flat, action_probs]
        a, v, aprobs = sess.run(fetches, {obs_ph: ob})
        return a, v, aprobs, []  # dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(value_flat, {obs_ph: ob})

    self.initial_state = []  # not stateful
    self.X = obs_ph
    self.pi = policy_out  # policy
    self.aprobs0 = action_probs
    self.vf = value_2d
    self.step = step
    self.value = value
def nature_cnn(unscaled_images):
    """
    CNN from Nature paper.

    The input is cast to float32 and divided by 255, passed through three
    ReLU convolutions ('c1', 'c2', 'c3') and a 512-unit ReLU fc layer ('fc1').
    """
    relu = tf.nn.relu
    out = tf.cast(unscaled_images, tf.float32) / 255.
    # (name, filters, kernel size, stride) for each convolution, in order.
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )
    for name, filters, kernel, stride in conv_specs:
        out = relu(conv(out, name, nf=filters, rf=kernel, stride=stride,
                        init_scale=np.sqrt(2)))
    flat = conv_to_fc(out)
    return relu(fc(flat, 'fc1', nh=512, init_scale=np.sqrt(2)))
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    """Build a three-conv + fc + ``lnlstm`` network with 'pi' and 'v' outputs.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space whose ``n`` gives the number of actions.
        nenv: first dimension of the recurrent state placeholder.
        nsteps: with ``nenv`` determines the batch size (``nenv * nsteps``).
        nstack: multiplier applied to the channel dimension of the input.
        nlstm: ``nh`` passed to ``lnlstm``; the state placeholder is twice as wide.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    input_shape = (batch_size, height, width, channels * nstack)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    mask_ph = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states
    gain = np.sqrt(2)
    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(obs_ph, tf.float32) / 255.
        conv1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        conv2 = conv(conv1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        conv3 = conv(conv2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(conv3)
        hidden = fc(flat, 'fc1', nh=512, init_scale=gain)
        hidden_seq = batch_to_seq(hidden, nenv, nsteps)
        mask_seq = batch_to_seq(mask_ph, nenv, nsteps)
        lstm_out, next_state = lnlstm(hidden_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        # Identity is passed as the activation for both output layers.
        policy_out = fc(lstm_out, 'pi', num_actions, act=lambda x: x)
        value_2d = fc(lstm_out, 'v', 1, act=lambda x: x)

    value_flat = value_2d[:, 0]
    sampled_action = sample(policy_out)

    def step(ob, state, mask):
        feed = {obs_ph: ob, state_ph: state, mask_ph: mask}
        a, v, s = sess.run([sampled_action, value_flat, next_state], feed)
        return a, v, s

    def value(ob, state, mask):
        return sess.run(value_flat, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.pi = policy_out
    self.vf = value_2d
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a ``nature_cnn`` + ``lnlstm`` policy wrapped by ``pdfromflat``.

    Args:
        sess: object whose ``run`` method evaluates the graph tensors.
        ob_space: space whose ``shape`` unpacks to (height, width, channels).
        ac_space: space providing ``n`` and passed to ``make_pdtype``.
        nbatch: leading dimension of the observation and mask placeholders.
        nsteps: divides ``nbatch`` to give the number of state rows.
        nlstm: ``nh`` passed to ``lnlstm``; the state placeholder is twice as wide.
        reuse: ``reuse`` flag for the "model" variable scope.
    """
    num_envs = nbatch // nsteps
    height, width, channels = ob_space.shape
    input_shape = (nbatch, height, width, channels)
    num_actions = ac_space.n
    obs_ph = tf.placeholder(tf.uint8, input_shape)  # obs
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    state_ph = tf.placeholder(tf.float32, [num_envs, nlstm * 2])  # states
    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_ph)
        feature_seq = batch_to_seq(features, num_envs, nsteps)
        mask_seq = batch_to_seq(mask_ph, num_envs, nsteps)
        lstm_out, next_state = lnlstm(feature_seq, mask_seq, state_ph, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        policy_out = fc(lstm_out, 'pi', num_actions)
        value_2d = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(policy_out)

    value_flat = value_2d[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)

    def step(ob, state, mask):
        fetches = [sampled_action, value_flat, next_state, sampled_neglogp]
        return sess.run(fetches, {obs_ph: ob, state_ph: state, mask_ph: mask})

    def value(ob, state, mask):
        return sess.run(value_flat, {obs_ph: ob, state_ph: state, mask_ph: mask})

    self.initial_state = np.zeros((num_envs, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.pi = policy_out
    self.vf = value_2d
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    """Build a feed-forward actor-critic graph on top of ``nature_cnn``.

    Args:
        sess: session used by ``step`` and ``value`` to evaluate the graph.
        ob_space: observation space handed to ``observation_input``.
        ac_space: action space handed to ``make_pdtype``.
        nbatch: batch size handed to ``observation_input``.
        nsteps: accepted for interface compatibility; not read here.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        **conv_kwargs: forwarded unchanged to ``nature_cnn``.
    """
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(processed_x, **conv_kwargs)
        value_head = fc(features, 'v', 1)
        vf = value_head[:, 0]
        # The distribution head is created inside the scope as well.
        self.pd, self.pi = self.pdtype.pdfromlatent(features, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp); extra arguments are ignored."""
        action, val, nlp = sess.run([a0, vf, neglogp0], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return only the value estimate; extra arguments are ignored."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build an MLP actor-critic graph with separate policy and value towers.

    Each tower is two 64-unit ``fc`` layers with tanh. The policy tower ends in
    an ``ac_space.shape[0]``-wide head; together with a ``logstd`` variable of
    shape ``[1, actdim]`` it forms the flat parameters given to
    ``pdtype.pdfromflat``.

    Args:
        sess: session used by ``step`` and ``value`` to evaluate the graph.
        ob_space: observation space; ``ob_space.shape`` is appended to the
            batch dimension to form the placeholder shape.
        ac_space: action space; ``ac_space.shape[0]`` is the action dimension.
        nbatch: leading dimension of the observation placeholder.
        nsteps: accepted for interface compatibility; not read here.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    actdim = ac_space.shape[0]
    gain = np.sqrt(2)
    X = tf.placeholder(tf.float32, (nbatch,) + ob_space.shape, name='Ob')  # obs

    with tf.variable_scope("model", reuse=reuse):
        # Policy tower.
        pi_h1 = tf.tanh(fc(X, 'pi_fc1', nh=64, init_scale=gain))
        pi_h2 = tf.tanh(fc(pi_h1, 'pi_fc2', nh=64, init_scale=gain))
        pi = fc(pi_h2, 'pi', actdim, init_scale=0.01)
        # Value tower (shares only the input placeholder with the policy).
        vf_h1 = tf.tanh(fc(X, 'vf_fc1', nh=64, init_scale=gain))
        vf_h2 = tf.tanh(fc(vf_h1, 'vf_fc2', nh=64, init_scale=gain))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        logstd = tf.get_variable(
            name="logstd",
            shape=[1, actdim],
            initializer=tf.zeros_initializer(),
        )

    # ``pi * 0.0 + logstd`` stretches the [1, actdim] variable to pi's shape.
    logstd_per_row = pi * 0.0 + logstd
    pdparam = tf.concat([pi, logstd_per_row], axis=1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp); extra arguments are ignored."""
        action, val, nlp = sess.run([a0, vf, neglogp0], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return only the value estimate; extra arguments are ignored."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _matching_fc(tensor, name, size, init_scale, init_bias):
    """Return ``tensor`` as is when its last dimension equals ``size``, else an ``fc`` projection of it.

    ``name``, ``init_scale`` and ``init_bias`` are only used when the ``fc``
    layer has to be created.
    """
    # Deliberately an ``==`` test (not an inverted ``!=``): anything that does
    # not compare equal to ``size`` falls through to the projection.
    already_sized = tensor.shape[-1] == size
    if already_sized:
        return tensor
    return fc(tensor, name, size, init_scale=init_scale, init_bias=init_bias)
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
    """Create a distribution from a latent tensor and return it with its mean.

    An ``fc`` layer named ``'pi'`` maps ``latent_vector`` to ``self.size``
    outputs (the mean). A ``logstd`` variable of shape ``[1, self.size]``,
    initialised to zeros, is stretched to the mean's shape and concatenated
    after it along axis 1; the result is passed to ``self.pdfromflat``.

    Args:
        latent_vector: input tensor for the ``'pi'`` layer.
        init_scale: forwarded to ``fc``.
        init_bias: forwarded to ``fc``.

    Returns:
        Tuple ``(distribution, mean)``.
    """
    mean = fc(
        latent_vector,
        'pi',
        self.size,
        init_scale=init_scale,
        init_bias=init_bias,
    )
    logstd = tf.get_variable(
        name='logstd',
        shape=[1, self.size],
        initializer=tf.zeros_initializer(),
    )
    # ``mean * 0.0 + logstd`` gives logstd the same shape as ``mean``.
    logstd_like_mean = mean * 0.0 + logstd
    flat_params = tf.concat([mean, logstd_like_mean], axis=1)
    distribution = self.pdfromflat(flat_params)
    return distribution, mean
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a convolutional policy with one action head and two spatial heads.

    Two shared ``conv`` layers (scope ``common``) feed:
      * scope ``pi1``: a 256-unit ``fc`` layer followed by a softmax policy
        head with 3 outputs and a one-unit value head;
      * scopes ``xy0`` and ``xy1``: a 1x1 ``conv`` with one filter each,
        flattened by ``conv_to_fc`` and passed through a softmax.

    NOTE: ``ob_space`` and ``ac_space`` are not read. The frame size
    (32, 32, 3) and the number of actions (3) are hard-coded, and the input is
    cast to float32 without any rescaling.

    Args:
        sess: session used by ``step`` and ``value`` to evaluate the graph.
        ob_space: unused.
        ac_space: unused.
        nenv: together with ``nsteps`` gives the batch size ``nenv * nsteps``.
        nsteps: see ``nenv``.
        nstack: multiplier applied to the channel count of the observation
            placeholder.
        reuse: forwarded to every ``tf.variable_scope`` opened here.
    """
    height, width, channels = 32, 32, 3
    n_actions = 3
    nbatch = nenv * nsteps
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # obs

    with tf.variable_scope("model", reuse=reuse):
        with tf.variable_scope("common", reuse=reuse):
            conv1 = conv(tf.cast(X, tf.float32), 'c1', nf=32, rf=5, stride=1,
                         init_scale=gain, pad="SAME")
            shared = conv(conv1, 'c2', nf=64, rf=3, stride=1,
                          init_scale=gain, pad="SAME")

        with tf.variable_scope("pi1", reuse=reuse):
            flat = conv_to_fc(shared)
            hidden = fc(flat, 'fc1', nh=256, init_scale=gain)
            pi_logits = fc(hidden, 'pi', n_actions)
            pi = tf.nn.softmax(pi_logits)
            vf = fc(hidden, 'v', 1)

        with tf.variable_scope("xy0", reuse=reuse):
            # 1 x 1 convolution for dimensionality reduction
            xy0_map = conv(shared, 'xy0', nf=1, rf=1, stride=1, init_scale=gain)
            xy0_flat = conv_to_fc(xy0_map)
            pi_xy0 = tf.nn.softmax(xy0_flat)

        with tf.variable_scope("xy1", reuse=reuse):
            xy1_map = conv(shared, 'xy1', nf=1, rf=1, stride=1, init_scale=gain)
            xy1_flat = conv_to_fc(xy1_map)
            pi_xy1 = tf.nn.softmax(xy1_flat)

    v0 = vf[:, 0]
    # Not fetched by ``step``/``value``; the op is still created so the graph
    # is built exactly as before.
    a0 = sample(pi)
    self.initial_state = []  # not stateful

    def step(ob, *_args, **_kwargs):
        """Return (pi, pi_xy0, pi_xy1, value, []) for ``ob``; extra arguments are ignored."""
        probs, xy0_probs, xy1_probs, val = sess.run([pi, pi_xy0, pi_xy1, v0], {X: ob})
        return probs, xy0_probs, xy1_probs, val, []  # dummy state

    def value(ob, *_args, **_kwargs):
        """Return only the value estimate; extra arguments are ignored."""
        return sess.run(v0, {X: ob})

    self.X = X
    self.pi = pi
    self.pi_xy0 = pi_xy0
    self.pi_xy1 = pi_xy1
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, tf_session, ob_space, ac_space, nbatch, reward_redistribution_config,
             observation_network_config, lstm_network_config, training_config, exploration_config,
             nsteps, nlstm=64, reuse=False):
    r"""LSTM policy network, as described in RUDDER paper

    Based on baselines.ppo2.policies.py; LSTM layer sees features from its own trainable observation network and
    the features from the reward redistribution observation network;

    Parameters
    ----------
    tf_session : tensorflow session
        tensorflow session to compute the graph in
    ob_space
        Baselines ob_space object (see ppo2_rudder.py); must provide .shape attribute for (x, y, c) shapes;
    ac_space
        Baselines ac_space object (see ppo2_rudder.py); must provide .n attribute for number of possible actions;
    nbatch : int
        Batchsize
    nsteps : int
        Fixed number of timesteps to process at once
    reward_redistribution_config : dict
        Dictionary containing config for reward redistribution:
        -----
        lambda_eligibility_trace : float
            Eligibility trace value for redistributed reward
        vf_contrib : float
            Weighting of original value function (vf) vs. redistributed reward (rr), s.t.
            :math:`reward = vf \cdot vf\_contrib + rr \cdot (1-vf\_contrib)`
        use_reward_redistribution_quality_threshold : float
            Quality of reward redistribution has to exceed use_reward_redistribution_quality_threshold to be used;
            use_reward_redistribution_quality_threshold range is [0,1]; Quality measure is the squared prediction
            error, as described in RUDDER paper;
        use_reward_redistribution : bool
            Use reward redistribution?
        rr_junksize : int
            Chunk size for reward redistribution; chunks overlap by one half each
        cont_pred_w : float
            Weighting of continuous prediction loss vs. prediction loss of final return at last timestep
        intgrd_steps : int
            Stepsize for integrated gradients
        intgrd_batchsize : int
            Integrated gradients is computed batch-wise if intgrd_batchsize > 1
    observation_network_config : dict
        Dictionary containing config for observation network that processes observations and feeds them to LSTM
        network:
        -----
        show_states : bool
            Show frames to network?
        show_statedeltas : bool
            Show frame deltas to network?
        prepoc_states : list of dicts
            Network config to preprocess frames
        prepoc_deltas : list of dicts
            Network config to preprocess frame deltas
        prepoc_observations : list of dicts
            Network config to preprocess features from frame and frame-delta preprocessing networks
    lstm_network_config : dict
        Dictionary containing config for LSTM network:
        -----
        show_actions : bool
            Show taken actions to LSTM?
        reversed : bool
            Process game sequence in reversed order?
        layers : list of dicts
            Network config for LSTM network and optional additional dense layers
        initializations : dict
            Initialization config for LSTM network
        timestep_encoding : dict
            Set "max_value" and "triangle_span" for TeLL.utility.misc_tensorflow.TriangularValueEncoding class
    training_config : dict
        Dictionary containing config for training and update procedure:
        -----
        n_no_rr_updates : int
            Number of updates to perform without training or using reward redistribution network
        n_pretrain_games : int
            Number of games to pretrain the reward redistribution network without using it;
        downscale_lr_policylag : bool
            Downscale learningrate permanently if policy lag gets too large?
        optimizer : tf.train optimizer
            Optimizer in tf.train, e.g. "AdamOptimizer"
        optimizer_params : dict
            Kwargs for optimizer
        l1 : float
            Weighting for l1 weight regularization
        l2 : float
            Weighting for l2 weight regularization
        clip_gradients : float
            Threshold for clipping gradients (clipping by norm)
    exploration_config : dict
        Dictionary containing config for exploration:
        -----
        sample_actions_from_softmax : bool
            True: Apply softmax to policy network output and use it as probabilities to pick an action
            False: Use the max. policy network output as action
        temporal_safe_exploration : bool
            Use RUDDER safe exploration
        save_pi_threshold : float
            Threshold value in range [0,1] for safe actions in RUDDER safe exploration
    nlstm : int
        Number of LSTM units (=memory cells)
    reuse : bool
        Reuse tensorflow variables?
    """
    #
    # Shapes
    #
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    seq_ob_shape = (nenv, -1, nh, nw, 1)
    nact = ac_space.n

    #
    # Placeholders for inputs
    #
    X = tf.placeholder(tf.uint8, ob_shape)  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # states

    #
    # Prepare input
    #
    # Last channel of X is the current frame; the delta is taken against the second-to-last channel.
    single_frames = tf.cast(tf.reshape(X[..., -1:], shape=seq_ob_shape), dtype=tf.float32)
    delta_frames = single_frames - tf.cast(tf.reshape(X[..., -2:-1], shape=seq_ob_shape), dtype=tf.float32)

    #
    # Get observation features from RR model
    #
    rr_model = RewardRedistributionModel(reward_redistribution_config=reward_redistribution_config,
                                         observation_network_config=observation_network_config,
                                         lstm_network_config=lstm_network_config,
                                         training_config=training_config,
                                         scopename="RR")
    self.rr_observation_model = rr_model
    rr_observation_layer = rr_model.get_visual_features(single_frame=single_frames, delta_frame=delta_frames,
                                                        additional_inputs=[])

    #
    # Build policy network
    #
    with tf.variable_scope("model", reuse=reuse):
        temperature = tf.get_variable(initializer=tf.constant(1, dtype=tf.float32), trainable=False,
                                      name='temperature')

        # RR features enter the policy through a StopGradientLayer wrapper.
        additional_inputs = [StopGradientLayer(rr_observation_layer)]
        observation_layers, observation_features = observation_network(
            single_frame=single_frames, delta_frame=delta_frames, additional_inputs=additional_inputs,
            observation_network_config=observation_network_config)

        self.observation_features_shape = observation_features.get_output_shape()

        # Split the last observation layer's output into a list of nsteps per-timestep tensors.
        xs = [tf.squeeze(v, [1]) for v in tf.split(axis=1, num_or_size_splits=nsteps,
                                                   value=tf.reshape(observation_layers[-1].get_output(),
                                                                    [nenv, nsteps, -1]))]
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        pi = fc(h5, 'pi', nact)
        vf = fc(h5, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    if exploration_config['sample_actions_from_softmax']:
        a0 = self.pd.sample_temp(temperature=temperature)
    else:
        a0 = tf.argmax(pi, axis=-1)

    v0 = vf[:, 0]
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def step(ob, state, mask):
        """Return (action, value, new state, neglogp) for the given inputs"""
        a, v, s, neglogp = tf_session.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})
        return a, v, s, neglogp

    def value(ob, state, mask):
        """Return the value estimate for the given inputs"""
        return tf_session.run(v0, {X: ob, S: state, M: mask})

    def action(ob, state, mask, *_args, **_kwargs):
        """Return (action, new state, neglogp) without exploration; extra arguments are ignored"""
        a, s, neglogp = tf_session.run([a0, snew, neglogp0], {X: ob, S: state, M: mask})
        return a, s, neglogp

    #
    # Placeholders for exploration
    #
    # n_envs is the static size of pi's first axis.
    n_envs = pi.shape.as_list()[0]
    exploration_timesteps_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    prev_actions_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    gamelengths_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))
    keep_prev_action_pl = tf.placeholder(dtype=tf.bool, shape=(n_envs,))
    prev_action_count_pl = tf.placeholder(dtype=tf.int64, shape=(n_envs,))
    exploration_durations_pl = tf.placeholder(dtype=tf.float32, shape=(n_envs,))

    #
    # Setting up safe exploration
    #
    # Explore while exploration_timestep <= gamelength <= exploration_timestep + duration;
    # an exploration_timestep of -1 disables exploration for that row.
    explore = tf.logical_and(
        tf.logical_and(tf.less_equal(exploration_timesteps_pl, gamelengths_pl),
                       tf.less_equal(gamelengths_pl, exploration_timesteps_pl + exploration_durations_pl)),
        tf.not_equal(exploration_timesteps_pl, tf.constant(-1, dtype=tf.float32)))

    # Rescale the policy output per row to [0, 1] (min -> 0, max -> 1).
    safe_pi = pi - tf.reduce_min(pi, axis=-1, keep_dims=True)
    safe_pi /= tf.reduce_max(safe_pi, axis=-1, keep_dims=True)
    # Row i gets a threshold interpolated linearly from 1 (first row) down to save_pi_threshold (last row);
    # the (n_envs == 1) term avoids a division by zero for a single row.
    save_pi_thresholds = (1 - (tf.expand_dims(tf.range(n_envs, dtype=tf.float32), axis=1)
                               / (n_envs + (n_envs == 1) - 1)) * (1 - exploration_config['save_pi_threshold']))
    # 1.0 where an action counts as safe, 0.0 otherwise.
    safe_mask = tf.cast(tf.greater_equal(safe_pi, save_pi_thresholds), dtype=tf.float32)
    safe_pi = safe_mask / tf.reduce_sum(safe_mask)

    # tf.multinomial interprets its input as unnormalised log-probabilities, so feeding it the normalised
    # mask directly would sample unsafe actions (logit 0) almost as often as safe ones. Build logits from
    # the mask instead: 0 for safe actions, a large negative value for unsafe ones. A row without any safe
    # action ends up with equal logits and is therefore sampled uniformly.
    safe_logits = (safe_mask - 1.) * 1e9
    rand_safe_a = tf.multinomial(safe_logits, 1)[:, 0]

    # Look up, for every row, whether the previous action is marked safe (flat index = row * n_actions + a).
    safe_pi_flat = tf.reshape(safe_pi, (-1,))
    prev_action_is_safe = tf.gather(safe_pi_flat,
                                    prev_actions_pl + tf.range(safe_pi.shape.as_list()[0], dtype=tf.int64)
                                    * safe_pi.shape.as_list()[1])
    prev_action_is_safe = tf.greater(prev_action_is_safe, tf.constant(0, dtype=tf.float32))

    # Repeat the previous action if requested, not at the first exploration step, and still safe;
    # otherwise draw a random safe action.
    a_explore = tf.where(tf.logical_and(tf.logical_and(keep_prev_action_pl,
                                                       tf.not_equal(gamelengths_pl, exploration_timesteps_pl)),
                                        prev_action_is_safe),
                         prev_actions_pl, rand_safe_a)
    a_explore = tf.where(explore, a_explore, a0)

    # Make sure the actor doesn't repeat an action too often (otherwise screensaver might start)
    rand_a = tf.random_uniform(shape=a0.get_shape(), minval=0, maxval=ac_space.n, dtype=a0.dtype)
    a_explore = tf.where(tf.greater(prev_action_count_pl, tf.constant(20, dtype=tf.int64)), rand_a, a_explore)

    if not exploration_config['temporal_safe_exploration']:
        a_explore = a0

    neglogp_explore = self.pd.neglogp(a_explore)

    def action_exploration(ob, state, mask, *_args, exploration_timesteps, prev_actions, gamelengths,
                           keep_prev_action, prev_action_count, exploration_durations, **_kwargs):
        """Get actions with exploration for long-term reward"""
        a, s, neglogp = tf_session.run([a_explore, snew, neglogp_explore],
                                       {X: ob, S: state, M: mask,
                                        exploration_timesteps_pl: exploration_timesteps,
                                        prev_actions_pl: prev_actions,
                                        gamelengths_pl: gamelengths,
                                        exploration_durations_pl: exploration_durations,
                                        keep_prev_action_pl: keep_prev_action,
                                        prev_action_count_pl: prev_action_count})
        return a, s, neglogp

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.action = action
    self.action_exploration = action_exploration
    self.seq_ob_shape = seq_ob_shape
    self.exploration_config = exploration_config