def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        X, processed_x = observation_input(ob_space, nbatch)
        activ = tf.tanh
        processed_x = tf.layers.flatten(processed_x)
        # two separate two-layer tanh towers: one for the policy, one for the value
        pi_h1 = activ(fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
        vf_h1 = activ(fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None  # feed-forward policy: no recurrent state

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
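# Usage sketch (illustrative only; assumes this __init__ belongs to an
# MlpPolicy class, as in baselines-style code, and that gym supplies the
# observation/action spaces; neither name is defined in this snippet):
#
#   env = gym.make('Pendulum-v0')
#   sess = tf.Session()
#   policy = MlpPolicy(sess, env.observation_space, env.action_space,
#                      nbatch=1, nsteps=1)
#   sess.run(tf.global_variables_initializer())
#   ob = env.reset()
#   a, v, _, neglogp = policy.step(ob[None])  # add a leading batch axis of one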
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm*2])  # LSTM states (cell and hidden)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(X)
        xs = batch_to_seq(h, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
        h5 = seq_to_batch(h5)
        vf = fc(h5, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(h5)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)

    def step(ob, state, mask):
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
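# Recurrent rollout sketch (illustrative; assumes a vectorized environment of
# nenv copies, here called vec_env, which is not defined in this module). The
# caller owns the LSTM state: snew returned by each step is fed back in as S,
# and M carries the previous step's done flags so the lstm op can zero the
# state at episode boundaries.
#
#   state = policy.initial_state                # (nenv, 2*nlstm) zeros
#   dones = np.zeros(nenv, dtype=np.float32)    # no resets at t=0
#   for _ in range(nsteps):
#       a, v, state, neglogp = policy.step(obs, state, dones)
#       obs, rewards, dones, _ = vec_env.step(a)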
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
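# Acting is stochastic because step() evaluates self.pd.sample(). A greedy
# variant (a sketch, assuming the baselines-style distribution interface in
# which pd.mode() returns the highest-probability action) would run:
#
#   a_greedy = sess.run(self.pd.mode(), {X: ob})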
def nature_cnn(unscaled_images, **conv_kwargs):
    """
    CNN from the Nature DQN paper (Mnih et al., 2015).
    """
    scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
    activ = tf.nn.relu
    h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4,
                   init_scale=np.sqrt(2), **conv_kwargs))
    h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2,
                    init_scale=np.sqrt(2), **conv_kwargs))
    h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1,
                    init_scale=np.sqrt(2), **conv_kwargs))
    h3 = conv_to_fc(h3)
    return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
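# Shape walk-through (a sketch, assuming the standard 84x84x4 Atari stack and
# the 'VALID' padding that conv defaults to; other input sizes change these
# numbers):
#   input:  84 x 84 x 4
#   c1 (8x8, stride 4, 32 filters): (84-8)/4 + 1 = 20  ->  20 x 20 x 32
#   c2 (4x4, stride 2, 64 filters): (20-4)/2 + 1 = 9   ->   9 x  9 x 64
#   c3 (3x3, stride 1, 64 filters): (9-3)/1  + 1 = 7   ->   7 x  7 x 64
#   conv_to_fc: 7*7*64 = 3136, then fc1 -> 512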
# however, I like the idea of activation as feature existence probability
# Q: could these be good for transfer learning? Maybe.
# Tensor-valued working memories (generalizing NTM)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (unused in this feed-forward policy)
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope('model', reuse=reuse):
        h = caps_cnn(processed_x)
        h = capsule_conv(h, 'capsconv', 4, 2, 32, 8)
        h = capsule(h, 'caps', 16, 8, from_conv=True)
        vf = fc(h, 'v', 1)[:, 0]  # value function

        # for discrete action spaces, create a final capsule layer with
        # one capsule for each possible action
        if isinstance(ac_space, spaces.Discrete):
            p = capsule(h, 'pcaps', ac_space.n, 4, from_conv=False)
            pnorm = tf.reduce_sum(tf.square(p), axis=2)
            self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm
        else:
            self.pd, self.pi = self.pdtype.pdfromlatent(h)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
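# The comment above reads a capsule's activation length as a feature existence
# probability. A minimal sketch of the squashing nonlinearity from Sabour et
# al. (2017) that produces such lengths in [0, 1); squash_sketch is a
# hypothetical helper for illustration, not used by the policies in this file
# (and it assumes a TF version where reduce_sum accepts keepdims):
def squash_sketch(s, axis=-1, eps=1e-8):
    # ||v|| = ||s||^2 / (1 + ||s||^2): short vectors shrink toward zero,
    # long vectors saturate toward unit length.
    sq_norm = tf.reduce_sum(tf.square(s), axis=axis, keepdims=True)
    return (sq_norm / (1. + sq_norm)) * s / tf.sqrt(sq_norm + eps)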
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    X, processed_x = observation_input(ob_space, nbatch)
    with tf.variable_scope("model", reuse=reuse):
        conv1 = caps_cnn(processed_x, **conv_kwargs)
        conv1 = tf.transpose(conv1, [0, 3, 1, 2])  # reshape to expected input format
        conv1 = tf.expand_dims(conv1, 1)
        capsule1 = layers.conv_slim_capsule(
            conv1,
            input_dim=1,
            output_dim=32,
            layer_name='conv_capsule1',
            num_routing=1,
            input_atoms=256,
            output_atoms=8,
            stride=2,
            kernel_size=9,
            padding='VALID',
            leaky=False,
        )
        capsule1_atom_last = tf.transpose(capsule1, [0, 1, 3, 4, 2])
        capsule1_3d = tf.reshape(capsule1_atom_last, [tf.shape(conv1)[0], -1, 8])
        _, _, _, height, width = capsule1.get_shape()
        input_dim1 = 32 * height.value * width.value
        # main encoding layer
        h = layers.capsule(
            input_tensor=capsule1_3d,
            input_dim=input_dim1,
            output_dim=8,
            layer_name='capsule2',
            input_atoms=8,
            output_atoms=16,
            num_routing=3,
            leaky=False,
        )
        # capsule policy layer
        hpi = layers.capsule(
            input_tensor=h,
            input_dim=8,
            output_dim=4,
            layer_name='capsule_pi',
            input_atoms=16,
            output_atoms=4,
            num_routing=3,
            leaky=False,
        )
        pnorm = tf.reduce_sum(tf.square(hpi), axis=-1)
        # value function
        hvf = conv_to_fc(h)
        vf = fc(hvf, 'v', 1)[:, 0]
        # policy based on pnorm (the squared norms of the policy capsule vectors)
        self.pd, self.pi = self.pdtype.pdfromflat(pnorm), pnorm

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
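# Shape sketch for the capsule head above (assuming a Discrete(4) action
# space, which is what output_dim=4 in 'capsule_pi' hard-codes):
#   h:     (nbatch, 8, 16)   eight encoding capsules with 16 atoms each
#   hpi:   (nbatch, 4, 4)    one policy capsule per action
#   pnorm: (nbatch, 4)       squared capsule lengths, fed to pdfromflat as
#                            categorical logits; note they are nonnegative
#                            (and bounded by 1 if the routing squashes its
#                            outputs), so the policy stays relatively
#                            high-entropy compared to unconstrained logits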