def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    """Build a CNN + LSTM graph with a softmax policy head and a Q-value head.

    Args:
        sess: session used by ``step`` to evaluate the graph.
        ob_space: observation space; ``shape`` is unpacked as three dims.
        ac_space: action space; ``n`` sets the width of both heads.
        nenv: number of environments (rows of the recurrent state).
        nsteps: steps per environment; the batch is ``nenv * nsteps``.
        nstack: multiplier applied to the last observation dimension.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        nlstm: LSTM size; the state placeholder is ``2 * nlstm`` wide.
    """
    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n

    X = tf.placeholder(tf.uint8, (batch_size, height, width, channels * nstack))  # observations
    M = tf.placeholder(tf.float32, [batch_size])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)

        # Recurrent core: batch_to_seq -> lstm -> seq_to_batch.
        seq_inputs = batch_to_seq(features, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi_logits = fc(lstm_out, 'pi', n_actions, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(lstm_out, 'q', n_actions)

    sampled_action = sample(pi_logits)  # could be changed to use self.pi instead

    def step(ob, state, mask, *args, **kwargs):
        """Return (actions, policy probabilities, new state); extra args are ignored."""
        actions, mus, new_state = sess.run(
            [sampled_action, pi, snew], {X: ob, S: state, M: mask})
        return actions, mus, new_state

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi  # actual policy probabilities (softmax output)
    self.q = q
    self.step = step
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a CNN + LSTM actor-critic graph on top of ``observation_input``.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space handed to ``observation_input``.
        ac_space: action space handed to ``make_pdtype``.
        nbatch: total batch size (``nenv * nsteps``).
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; the state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    state_dims = (nenv, nlstm * 2)

    self.pdtype = make_pdtype(ac_space)
    # The processed tensor returned alongside the placeholder is not used:
    # the raw placeholder is what gets fed to nature_cnn below.
    X, _processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, list(state_dims))  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        seq_inputs = batch_to_seq(features, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        vf = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.initial_state = np.zeros(state_dims, dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
    """Construct the recurrent policy graph: CNN features, LSTM, pi and q heads.

    Args:
        sess: session that ``step`` runs the graph in.
        ob_space: observation space whose ``shape`` has three entries.
        ac_space: discrete action space; ``n`` is the number of actions.
        nenv: number of parallel environments.
        nsteps: number of steps per environment in a batch.
        nstack: factor multiplying the last observation dimension.
        reuse: variable-scope reuse flag for the "model" scope.
        nlstm: LSTM size; recurrent state has ``2 * nlstm`` columns.
    """
    nbatch = nenv * nsteps
    ob_h, ob_w, ob_c = ob_space.shape
    nact = ac_space.n

    obs_ph = tf.placeholder(tf.uint8, (nbatch, ob_h, ob_w, ob_c * nstack))  # observations
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    state_ph = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        cnn_out = nature_cnn(obs_ph)

        # LSTM section
        xs = batch_to_seq(cnn_out, nenv, nsteps)
        ms = batch_to_seq(mask_ph, nenv, nsteps)
        rnn_out, state_out = lstm(xs, ms, state_ph, 'lstm1', nh=nlstm)
        rnn_out = seq_to_batch(rnn_out)

        logits = fc(rnn_out, 'pi', nact, init_scale=0.01)
        probs = tf.nn.softmax(logits)
        q_values = fc(rnn_out, 'q', nact)

    action_op = sample(logits)  # could be changed to use self.pi instead
    fetches = [action_op, probs, state_out]

    def step(ob, state, mask, *args, **kwargs):
        """Run one step; returns (actions, mus, states). Extra args are ignored."""
        a0, pi0, s = sess.run(fetches, {obs_ph: ob, state_ph: state, mask_ph: mask})
        return a0, pi0, s

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.pi = probs  # actual policy probabilities
    self.q = q_values
    self.step = step
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a conv-stack + LSTM actor-critic graph with linear pi/value heads.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is unpacked as three dims.
        ac_space: action space; ``n`` is the policy head width.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    def _linear(x):
        # Identity activation for the output heads.
        return x

    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # observations
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        # uint8 pixels are cast to float and divided by 255 before the convs.
        scaled = tf.cast(X, tf.float32) / 255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        seq_inputs = batch_to_seq(dense, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi = fc(lstm_out, 'pi', n_actions, act=_linear)
        vf = fc(lstm_out, 'v', 1, act=_linear)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Create the recurrent actor-critic graph (nature_cnn -> LSTM -> heads).

    Args:
        sess: session that ``step``/``value`` evaluate the graph in.
        ob_space: observation space passed to ``observation_input``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; recurrent state has ``2 * nlstm`` columns.
        reuse: variable-scope reuse flag for the "model" scope.
    """
    nenv = nbatch // nsteps

    # Only the raw placeholder is consumed below; the processed tensor is unused.
    obs_ph, _processed = observation_input(ob_space, nbatch)
    mask_ph = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    state_ph = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        cnn_out = nature_cnn(obs_ph)
        xs = batch_to_seq(cnn_out, nenv, nsteps)
        ms = batch_to_seq(mask_ph, nenv, nsteps)
        rnn_out, state_out = lstm(xs, ms, state_ph, 'lstm1', nh=nlstm)
        rnn_out = seq_to_batch(rnn_out)
        vf = fc(rnn_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(rnn_out)

    value_op = vf[:, 0]
    action_op = self.pd.sample()
    neglogp_op = self.pd.neglogp(action_op)

    def _feed(ob, state, mask):
        # Shared feed dict for both entry points.
        return {obs_ph: ob, state_ph: state, mask_ph: mask}

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        return sess.run([action_op, value_op, state_out, neglogp_op], _feed(ob, state, mask))

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        return sess.run(value_op, _feed(ob, state, mask))

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = obs_ph
    self.M = mask_ph
    self.S = state_ph
    self.vf = vf
    self.step = step
    self.value = value
def network_fn(X, nenv=1):
    """Flatten ``X`` and run it through an LSTM (optionally layer-normalised).

    ``nlstm`` and ``layer_norm`` are read from the enclosing scope, which is
    not part of this block.

    Args:
        X: input tensor; its first dimension is taken as the batch size.
        nenv: number of environments the batch is divided into.

    Returns:
        A pair ``(h, extras)`` where ``h`` is the LSTM output and ``extras``
        maps 'S', 'M', 'state' and 'initial_state' to the state placeholder,
        mask placeholder, new-state tensor and a zero initial state.
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    flat = tf.layers.flatten(X)

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # recurrent state

    seq_inputs = batch_to_seq(flat, nenv, steps_per_env)
    seq_masks = batch_to_seq(M, nenv, steps_per_env)

    # Pick the cell implementation and its scope name together.
    if layer_norm:
        cell_fn, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell_fn, cell_scope = utils.lstm, 'lstm'
    seq_out, snew = cell_fn(seq_inputs, seq_masks, S, scope=cell_scope, nh=nlstm)

    h = seq_to_batch(seq_out)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    extras = {'S': S, 'M': M, 'state': snew, 'initial_state': initial_state}
    return h, extras
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    """Build a conv-stack + LSTM actor-critic graph that samples from pi logits.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is unpacked as three dims.
        ac_space: action space; ``n`` is the policy head width.
        nenv: number of environments.
        nsteps: steps per environment; the batch is ``nenv * nsteps``.
        nstack: multiplier applied to the last observation dimension.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    def _linear(x):
        # Identity activation for the output heads.
        return x

    nbatch = nenv * nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # observations
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(X, tf.float32) / 255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        seq_inputs = batch_to_seq(dense, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi = fc(lstm_out, 'pi', n_actions, act=_linear)
        vf = fc(lstm_out, 'v', 1, act=_linear)

    v0 = vf[:, 0]
    a0 = sample(pi)

    def step(ob, state, mask):
        """Return (action, value, new state) for one batch."""
        action, val, new_state = sess.run([a0, v0, snew], {X: ob, S: state, M: mask})
        return action, val, new_state

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a nature_cnn + LSTM actor-critic graph with ``pdfromflat`` sampling.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is unpacked as three dims.
        ac_space: action space; ``n`` is the policy head width.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n
    state_dims = (nenv, nlstm * 2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # observations
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, list(state_dims))  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        seq_inputs = batch_to_seq(features, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', n_actions)
        vf = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    step_fetches = [a0, v0, snew, neglogp0]

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        return sess.run(step_fetches, {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.initial_state = np.zeros(state_dims, dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256):
    """Build a CNN + LSTM actor-critic graph with an extra scalar ``lp`` head.

    Variables live in the "model" scope with ``tf.AUTO_REUSE``.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space passed to ``observation_input``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
    """
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)

    X, processed_x = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        cnn_features, self.dropout_assign_ops = choose_cnn(processed_x)

        seq_inputs = batch_to_seq(cnn_features, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        vf = fc(lstm_out, 'v', 1)[:, 0]
        # The 'lp' head reads the CNN features directly, not the LSTM output.
        lp = fc(cnn_features, 'lp', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    def step(ob, state, mask):
        """Return [action, value, lp, new state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, vf, lp, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(vf, feed)

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.lp = lp
    self.step = step
    self.value = value
def network_fn(X, nenv=1):
    """Two-stream conv network followed by an LSTM and a 4-unit tanh layer.

    ``X`` is split in two along axis 1; each half is squeezed, shifted by
    -128 and passed through ``vggm1234``. ``nlstm`` and ``layer_norm`` are
    read from the enclosing scope, which is not part of this block.

    Args:
        X: input tensor; its first dimension is taken as the batch size.
        nenv: number of environments the batch is divided into.

    Returns:
        ``((feat, h), extras)`` where ``feat`` is the concatenated conv
        output, ``h`` the final fully connected output, and ``extras`` maps
        'S', 'M', 'state' and 'initial_state'.
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    first_half, second_half = tf.split(X, 2, axis=1)
    ob_g = tf.squeeze(first_half, axis=1) - 128.0
    ob_l = tf.squeeze(second_half, axis=1) - 128.0

    # Convolutional streams
    net_g = vggm1234(ob_g)
    net_l = vggm1234(ob_l)
    feat = tf.concat([net_g, net_l], 1)

    # Recurrent core
    M = tf.placeholder(tf.float32, [batch_size])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # recurrent state
    seq_inputs = batch_to_seq(feat, nenv, steps_per_env)
    seq_masks = batch_to_seq(M, nenv, steps_per_env)
    if layer_norm:
        cell_fn, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell_fn, cell_scope = utils.lstm, 'lstm'
    seq_out, snew = cell_fn(seq_inputs, seq_masks, S, scope=cell_scope, nh=nlstm)
    h = seq_to_batch(seq_out)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    # Output layer
    h = slim.fully_connected(h, 4, scope='fc', activation_fn=tf.nn.tanh)

    extras = {
        'S': S,
        'M': M,
        'state': snew,
        'initial_state': initial_state,
    }
    return (feat, h), extras
def network_fn(X, nenv=1):
    """MLP feature layers followed by an LSTM sized by the last hidden entry.

    ``hiddens``, ``layer_norm`` and ``activation`` are read from the
    enclosing scope, which is not part of this block. All but the last entry
    of ``hiddens`` become fully connected layers; the last entry is the LSTM
    size.

    Args:
        X: input tensor; its first dimension is taken as the batch size.
        nenv: number of environments the batch is divided into.

    Returns:
        ``(h, extras)`` where ``h`` is the LSTM output and ``extras`` maps
        'S', 'M', 'state' and 'initial_state'.
    """
    batch_size = X.shape[0]
    steps_per_env = batch_size // nenv

    h = tf.layers.flatten(X)
    layer_idx = 0
    n_dense = len(hiddens) - 1
    while layer_idx < n_dense:
        h = utils.fc(h, 'mlp_fc{}'.format(layer_idx), nh=hiddens[layer_idx],
                     init_scale=np.sqrt(2))
        if layer_norm:
            h = tf.contrib.layers.layer_norm(h, center=True, scale=True)
        h = activation(h)
        layer_idx += 1

    nlstm = hiddens[-1]

    M = tf.placeholder(tf.float32, [batch_size])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, 2 * nlstm])  # recurrent state

    seq_inputs = utils.batch_to_seq(h, nenv, steps_per_env)
    seq_masks = utils.batch_to_seq(M, nenv, steps_per_env)

    if layer_norm:
        cell_fn, cell_scope = utils.lnlstm, 'lnlstm'
    else:
        cell_fn, cell_scope = utils.lstm, 'lstm'
    seq_out, snew = cell_fn(seq_inputs, seq_masks, S, scope=cell_scope, nh=nlstm)

    h = utils.seq_to_batch(seq_out)
    initial_state = np.zeros(S.shape.as_list(), dtype=float)

    extras = {
        'S': S,
        'M': M,
        'state': snew,
        'initial_state': initial_state,
    }
    return h, extras
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, add_flownet, reuse=False, flownet=None, train_from_scratch=False, recurrent=None, large_cnn=False, nlstm=64, add_predicted_flow_to_vec=False, diff_frames=False):
    """Build an image + vector policy, optionally recurrent and flow-augmented.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: mapping with "vector" and "image" entries; each exposes
            ``shape`` (the image shape is unpacked as three dims).
        ac_space: continuous action space; ``shape[0]`` is the action size.
        nbatch: total batch size.
        nsteps: steps per environment (used only when ``recurrent``).
        add_flownet: when truthy, add a previous-image placeholder and pass
            flow-related arguments to ``mujoco_cnn`` / ``get_flow_vec``.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        flownet, train_from_scratch, large_cnn, diff_frames: forwarded
            unchanged to ``mujoco_cnn`` / ``get_flow_vec``.
        recurrent: falsy for a feed-forward policy, or 'lstm' / 'lnlstm'.
        nlstm: LSTM size when recurrent.
        add_predicted_flow_to_vec: when truthy, concatenate the output of
            ``get_flow_vec`` onto the vector observation.

    The closures ``step`` and ``value`` take ``ob`` as a mapping keyed like
    ``self.placeholder_dict``. With ``remove_noise=True``, ``step`` returns
    the distribution mode and the neg-log-prob of that same action.
    """
    ob_shape_vec = (nbatch,) + ob_space["vector"].shape
    nh, nw, nc = ob_space["image"].shape
    ob_shape_im = (nbatch, nh, nw, nc)
    actdim = ac_space.shape[0]

    X_vec = tf.placeholder(tf.float32, ob_shape_vec, name='Ob_vec')  # vector obs
    X_im = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_im')  # image obs
    if add_flownet:
        # Previous image (obs at t-1) for the flow network.
        X_p = tf.placeholder(tf.uint8, ob_shape_im, name='Ob_p')
    else:
        X_p = None

    if recurrent:
        nenv = nbatch // nsteps
        M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
        S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        h_im = mujoco_cnn(
            X_im, 'pi', nbatch,
            add_flownet and not add_predicted_flow_to_vec,
            X_p, flownet, train_from_scratch, large_cnn, diff_frames)

        if add_predicted_flow_to_vec:
            flow_vec = get_flow_vec(
                X_im, 'pi', nbatch, add_flownet, X_p, flownet,
                train_from_scratch, large_cnn, diff_frames)
            vec_input = tf.concat([X_vec, flow_vec], axis=-1)
        else:
            vec_input = X_vec
        h_vec = activ(fc(vec_input, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))

        h1 = tf.concat([h_im, h_vec], 1)
        if recurrent:
            xs = batch_to_seq(h1, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)
            if recurrent == 'lstm':
                h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
            else:
                assert recurrent == 'lnlstm'
                h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
            h2 = seq_to_batch(h5)
        else:
            h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))

        pi = fc(h2, 'pi', actdim, init_scale=0.01)
        vf = fc(h2, 'vf', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # `pi * 0.0 + logstd` broadcasts logstd to pi's shape.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    a0_r = self.pd.mode()
    neglogp0 = self.pd.neglogp(a0)
    # Neg-log-prob of the mode, so the noise-free path reports the
    # likelihood of the action it actually returns.
    neglogp0_r = self.pd.neglogp(a0_r)

    if not recurrent:
        self.initial_state = None
    else:
        self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    self.placeholder_dict = {
        "image": X_im,
        "vector": X_vec
    }
    if add_flownet:
        self.placeholder_dict["last_image"] = X_p

    def _build_feed(ob, state=None, mask=None):
        # Map each observation entry onto its placeholder; the dict is read
        # at call time so later changes to placeholder_dict are honoured.
        feed_dict = {ph: ob[key] for key, ph in self.placeholder_dict.items()}
        if recurrent:
            feed_dict[S] = state
            feed_dict[M] = mask
        return feed_dict

    def _action_fetches(remove_noise):
        # Pair each action tensor with the neg-log-prob of that same action.
        if remove_noise:
            return a0_r, neglogp0_r
        return a0, neglogp0

    if not recurrent:
        def step(ob, *_args, remove_noise=False, **_kwargs):
            action_op, neglogp_op = _action_fetches(remove_noise)
            a, v, neglogp = sess.run([action_op, v0, neglogp_op],
                                     feed_dict=_build_feed(ob))
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, feed_dict=_build_feed(ob))
    else:
        def step(ob, state, mask, remove_noise=False):
            action_op, neglogp_op = _action_fetches(remove_noise)
            a, v, s, neglogp = sess.run(
                [action_op, v0, snew, neglogp_op],
                feed_dict=_build_feed(ob, state, mask))
            return a, v, s, neglogp

        def value(ob, state, mask):
            return sess.run(v0, feed_dict=_build_feed(ob, state, mask))

    self.X_im = X_im
    self.X_vec = X_vec
    self.X_p = X_p
    self.pi = pi
    if not recurrent:
        self.vf = v0
    else:
        # M and S exist only on the recurrent path.
        self.vf = vf
        self.M = M
        self.S = S
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, nlstm=256, reuse=False, feature_mlp=True):
    """Build a single-trajectory (optional MLP) + LSTM actor-critic graph.

    The graph is built for one environment: the recurrent state has a single
    row and sequences are split with ``nenv=1``. ``nenvs`` is accepted but
    not read.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` follows the step dimension.
        ac_space: action space; an empty ``shape`` means discrete (``n``
            actions), otherwise ``shape[0]`` is the action size.
        nenvs: unused.
        nsteps: sequence length, or None for an unspecified leading dim.
        nlstm: LSTM size, also the width of the feature MLP layers.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        feature_mlp: when truthy, two tanh layers precede the LSTM.
    """
    leading_dim = None if nsteps is None else nsteps
    ob_shape = (leading_dim, ) + ob_space.shape
    M = tf.placeholder(tf.float32, [leading_dim])  # mask (done at t-1)

    discrete = len(ac_space.shape) == 0
    if discrete:
        nact = ac_space.n
    else:
        actdim = ac_space.shape[0]

    X = tf.placeholder(tf.float32, ob_shape, name="Ob")
    S = tf.placeholder(tf.float32, [1, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        lstm_input = X
        if feature_mlp:
            hidden1 = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
            lstm_input = activ(fc(hidden1, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
        xs = batch_to_seq(lstm_input, 1, nsteps)
        ms = batch_to_seq(M, 1, nsteps)
        lstm_out, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        vf = fc(lstm_out, "vf", 1)
        if discrete:
            pi = fc(lstm_out, "pi", nact, init_scale=0.01)
        else:
            pi = fc(lstm_out, "pi", actdim, init_scale=0.01)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

    self.pdtype = make_pdtype(ac_space)
    if discrete:
        flat_params = pi
    else:
        # `pi * 0.0 + logstd` broadcasts logstd to pi's shape.
        flat_params = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(flat_params)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp]."""
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        """Return the value estimates."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.initial_state = np.zeros((1, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False, feature_mlp=True):
    """Build an LSTM policy whose input is (obs, prev action, reward, done).

    Observations are assumed to be vectors and the reward a scalar. The
    network input is the concatenation of the observation, the action
    (one-hot for discrete spaces), the reward and the done flag, hence the
    ``+ 2`` in the input length.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape[0]`` is the observation length.
        ac_space: action space; an empty ``shape`` means discrete (``n``
            actions), otherwise ``shape[0]`` is the action size.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size, also the width of the feature MLP layers.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        feature_mlp: when truthy, two tanh layers precede the LSTM.
    """
    nenv = nbatch // nsteps

    discrete = len(ac_space.shape) == 0
    if discrete:
        # Discrete actions enter the network as one-hot encodings.
        nact = ac_space.n
        input_length = ob_space.shape[0] + nact + 2
    else:
        actdim = ac_space.shape[0]
        input_length = ob_space.shape[0] + actdim + 2
    input_shape = (nbatch, input_length)

    X = tf.placeholder(tf.float32, input_shape, name="Input")
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done with a trial at time t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states of the recurrent policy

    with tf.variable_scope("model", reuse=reuse):
        activ = tf.tanh
        if feature_mlp:
            print("Using feature network in front of LSTM")
            h1 = activ(fc(X, "fc1", nh=nlstm, init_scale=np.sqrt(2)))
            h2 = activ(fc(h1, "fc2", nh=nlstm, init_scale=np.sqrt(2)))
            xs = batch_to_seq(h2, nenv, nsteps)
        else:
            print("No feature network in front of LSTM")
            xs = batch_to_seq(X, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h5, snew = lstm(xs, ms, S, "lstm1", nh=nlstm)
        h5 = seq_to_batch(h5)

        vf = fc(h5, "vf", 1)
        if discrete:
            pi = fc(h5, "pi", nact, init_scale=0.01)
        else:
            pi = fc(h5, "pi", actdim, init_scale=0.01)
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())

    self.pdtype = make_pdtype(ac_space)
    if discrete:
        self.pd = self.pdtype.pdfromflat(pi)
    else:
        # `pi * 0.0 + logstd` broadcasts logstd to pi's shape.
        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
        self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def _pack_input(ob, ac, rew, done):
        # Assemble the (nbatch, input_length) network input shared by
        # step() and value().
        rew = np.reshape(np.asarray([rew]), (nbatch, 1))
        done = np.reshape(np.asarray([done], dtype=float), (nbatch, 1))
        if discrete:
            if ac[0] == -1:
                # A leading -1 yields an all-zero action encoding
                # (presumably "no previous action" -- confirm with caller).
                # Builtin int replaces the removed `np.int` alias.
                ac = np.zeros((nbatch, nact), dtype=int)
            else:
                ac = np.reshape(np.asarray([ac]), (nbatch, ))
                ac = np.eye(nact)[ac]  # one-hot rows
        else:
            ac = np.reshape(np.asarray([ac]), (nbatch, actdim))
        return np.concatenate((ob, ac, rew, done), axis=1)

    def step(ob, state, ac, rew, done, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        x = _pack_input(ob, ac, rew, done)
        return sess.run([a0, v0, snew, neglogp0], {X: x, S: state, M: mask})

    def value(ob, state, ac, rew, done, mask):
        """Return the value estimates for one batch."""
        x = _pack_input(ob, ac, rew, done)
        return sess.run(v0, {X: x, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_name, m_name, svfname, spiname, ob_space, ac_space, usecnn=False, nlstm=256):
    """Build separate value ("vf") and policy ("pol") LSTM networks.

    The policy mean is a linear map of the policy LSTM output, parameterised
    by ``prec`` and ``ktprec``; the remaining "pol" variables are treated as
    the non-linear parameters for the feature-gradient function.

    Args:
        ob_name, m_name, svfname, spiname: names passed to
            ``U.get_placeholder`` for the observation, mask, value-state and
            policy-state placeholders.
        ob_space: must be a ``gym.spaces.Box``.
        ac_space: action space; ``shape[0]`` is the action size.
        usecnn: when truthy, observations pass through ``nature_cnn`` first.
        nlstm: LSTM size; state placeholders are ``2 * nlstm`` wide.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None
    init_std = 1.0
    nenv = 1
    state_dims = (nenv, nlstm * 2)

    self.initial_state = np.zeros(state_dims, dtype=np.float32)
    self.ob = U.get_placeholder(
        name=ob_name, dtype=tf.float32,
        shape=[sequence_length] + list(ob_space.shape))
    M = U.get_placeholder(m_name, tf.float32, [sequence_length])  # mask (done at t-1)
    Svf = U.get_placeholder(svfname, tf.float32, list(state_dims))  # value-net state
    Spi = U.get_placeholder(spiname, tf.float32, list(state_dims))  # policy-net state

    with tf.variable_scope("vf"):
        vf_input = nature_cnn(self.ob) if usecnn else self.ob
        # The observation and mask go to lstm() directly, without batch_to_seq.
        vf_out, vfsnew = lstm(vf_input, M, Svf, 'lstmvf', nh=nlstm)
        vf_out = seq_to_batch(vf_out)
        self.vpred = fc(vf_out, 'value', 1)

    with tf.variable_scope("pol"):
        pol_input = nature_cnn(self.ob) if usecnn else self.ob
        h5, pisnew = lstm(pol_input, M, Spi, 'lstmpi', nh=nlstm)
        h5 = seq_to_batch(h5)

        self.action_dim = ac_space.shape[0]
        self.varphi = h5
        # NOTE(review): hard-coded 64 while the LSTM is nlstm wide; the
        # matmul below presumably needs these to agree -- confirm.
        self.varphi_dim = 64

        stddev_init = np.ones([1, self.action_dim]) * init_std
        prec_init = 1. / (np.multiply(stddev_init, stddev_init))  # 1 x |a|
        self.prec = tf.get_variable(
            name="prec",
            shape=[1, self.action_dim],
            initializer=tf.constant_initializer(prec_init))

        kt_init = np.ones([self.varphi_dim, self.action_dim]) * 0.5 / self.varphi_dim
        ktprec_init = kt_init * prec_init
        self.ktprec = tf.get_variable(
            name="ktprec",
            shape=[self.varphi_dim, self.action_dim],
            initializer=tf.constant_initializer(ktprec_init))

        kt = tf.divide(self.ktprec, self.prec)
        mean = tf.matmul(h5, kt)
        logstd = tf.log(tf.sqrt(1. / self.prec))

    self.prec_get_flat = U.GetFlat([self.prec])
    self.prec_set_from_flat = U.SetFromFlat([self.prec])
    self.ktprec_get_flat = U.GetFlat([self.ktprec])
    self.ktprec_set_from_flat = U.SetFromFlat([self.ktprec])

    # `mean * 0.0 + logstd` broadcasts logstd to mean's shape.
    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    self.pd = pdtype.pdfromflat(pdparam)

    self.M = M
    self.Svf = Svf
    self.Spi = Spi
    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, self.ob, M, Spi, Svf],
                           [ac, self.vpred, pisnew, vfsnew])

    # All trainable policy parameters ...
    pol_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 self.scope + '/pol')
    # ... minus the last two (the log-linear parameters ktprec and prec),
    # leaving only the non-linear parameters.
    del pol_vars[-1]
    del pol_vars[-1]
    beta_params = pol_vars

    # Flat w_beta
    beta_len = np.sum(
        [np.prod(param.get_shape().as_list()) for param in beta_params])
    w_beta_var = tf.placeholder(dtype=tf.float32, shape=[beta_len])

    # Unflatten w_beta
    beta_shapes = [tf.shape(param) for param in beta_params]
    w_beta_unflat_var = self.unflatten_tensor_variables(w_beta_var, beta_shapes)

    # w_beta^T * \grad_beta \varphi(s)^T
    v = tf.placeholder(dtype=self.varphi.dtype,
                       shape=self.varphi.get_shape(),
                       name="v_in_Rop")
    features_beta = self.alternative_Rop(self.varphi, beta_params,
                                         w_beta_unflat_var, v)
    self.features_beta = U.function([self.ob, w_beta_var, v], features_beta)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=16, reuse=False):
    """Build a QMDP-style policy with a separate LSTM value baseline.

    The policy latent comes from ``PlannerNet`` / ``FilterNet``; the value
    function is a three-layer ``fc`` stack on top of an LSTM that reads the
    raw input sequence.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; each input row holds the observation
            followed by the previous action.
        ac_space: discrete action space (``n`` actions).
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: size of the value-network LSTM.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps

    qmdp_param = {
        'K': 3,
        'obs_len': ob_space.shape[0] - ac_space.n,
        'num_action': ac_space.n,
        'num_state': 32,
        'num_obs': 17,
    }
    num_action = qmdp_param["num_action"]
    obs_len = qmdp_param["obs_len"]
    num_state = qmdp_param['num_state']
    num_obs = qmdp_param['num_obs']

    input_len = ob_space.shape
    input_shape = (nbatch, ) + input_len  # [nbatch, input_length]

    self.pdtype = make_pdtype(ac_space)

    X = tf.placeholder(tf.float32, input_shape)  # [nbatch, obs + prev action]
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    # S is the belief state concatenated with the hidden and cell states of
    # the value-function LSTM (one row per env).
    S = tf.placeholder(tf.float32, [nenv, num_state + 2 * nlstm])

    with tf.variable_scope("model", reuse=reuse):
        xs = batch_to_seq(X, nenv, nsteps)
        # Each sequence element is split column-wise into the observation
        # and the previous action.
        obs = [step_in[:, 0:obs_len] for step_in in xs]
        acts = [step_in[:, obs_len:] for step_in in xs]
        ms = batch_to_seq(M, nenv, nsteps)

        belief_in = S[:, 0:num_state]  # initial/previous belief
        hidden_in = S[:, num_state:]  # initial/previous LSTM hidden + cell

        # Build the planner and filter sub-networks.
        self.planner_net = PlannerNet("planner", qmdp_param)
        self.filter_net = FilterNet("filter", qmdp_param)

        # s_hist is the belief history; snew is the newest belief.
        s_hist, snew = self.filter_net.beliefupdate(obs, acts, ms, belief_in)
        Q, _, _ = self.planner_net.VI(nbatch)

        s_hist = seq_to_batch(s_hist)  # belief history, flattened to the batch
        q = self.planner_net.policy(Q, s_hist)

        # Separate value network used as baseline: an LSTM over the
        # concatenated observations and actions.
        vn_scope = "value_network"
        h_hist, chnew = lstm(xs, ms, hidden_in, vn_scope, nlstm)
        h_hist = tf.convert_to_tensor(h_hist, dtype=tf.float32)
        Snew = tf.concat(axis=1, values=[snew, chnew])  # belief + LSTM state

        self.pd, self.pi = self.pdtype.pdfromlatent(q)
        vf = fc(fc(fc(h_hist, 'v1', nlstm), 'v2', nlstm), 'v3', 1)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)

    # Uniform initial belief over the hidden states.
    # NOTE(review): this is num_state wide, while S is
    # num_state + 2 * nlstm wide -- confirm the caller pads it before feeding.
    self.initial_state = np.ones((nenv, num_state), dtype=np.float32) / num_state

    def step(ob, belief_state, mask):
        """Return [action, value, new combined state, neglogp]."""
        feed = {X: ob, S: belief_state, M: mask}
        return sess.run([a0, v0, Snew, neglogp0], feed)

    def value(ob, belief_state, mask):
        """Return the value estimates."""
        feed = {X: ob, S: belief_state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=50, reuse=False):
    """Build a Gaussian CNN+LSTM policy plus a separate "predictor" network.

    The policy controls ``ac_space.shape[0] - 1`` action dimensions; ``step``
    appends the predictor output as the final action column.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` follows the batch dimension.
        ac_space: box action space (``shape``, ``low``, ``high`` are read).
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: size of both LSTMs.
        reuse: forwarded to both variable scopes.
    """
    nenv = nbatch // nsteps
    state_dims = (nenv, nlstm * 2)

    ob_shape = (nbatch, ) + ob_space.shape
    X = tf.placeholder(tf.float32, ob_shape)
    nact = ac_space.shape[0] - 1

    M = tf.placeholder(tf.float32, [nbatch], name='mask')  # mask (done at t-1)
    S = tf.placeholder(tf.float32, list(state_dims), name='state')  # policy state
    S_pred = tf.placeholder(tf.float32, list(state_dims),
                            name='predict_state')  # predictor state

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        seq_inputs = batch_to_seq(features, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', nact)
        vf = fc(lstm_out, 'v', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, nact],
                                 initializer=tf.zeros_initializer())

    # `pi * 0.0 + logstd` broadcasts logstd to pi's shape.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    policy_space = spaces.Box(ac_space.low[0], ac_space.high[0], [nact, ])
    self.pdtype = make_pdtype(policy_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    a0 = tf.clip_by_value(self.pd.sample(), -1, 1)
    neglogp0 = self.pd.neglogp(a0)

    with tf.variable_scope('predictor', reuse=reuse):
        pred_features = nature_cnn(X)
        pred_inputs = batch_to_seq(pred_features, nenv, nsteps)
        pred_masks = batch_to_seq(M, nenv, nsteps)
        pred_out, snew_pred = lstm(pred_inputs, pred_masks, S_pred, 'lstm1', nh=nlstm)
        pred_out = seq_to_batch(pred_out)
        pred_hidden = fc(pred_out, 'prediction_fc', 256)
        self.prediction = tf.nn.relu(fc(pred_hidden, 'prediction_out', 1))

    self.initial_state = np.zeros(state_dims, dtype=np.float32)

    def step(ob, state, predict_state, mask):
        """Return (action+prediction, value, new state, new predictor state, neglogp)."""
        feed = {X: ob, S: state, S_pred: predict_state, M: mask}
        fetches = [self.prediction, a0, v0, snew, snew_pred, neglogp0]
        (prediction_out, a0_out, v0_out,
         snew_out, snew_predict_out, neglogp0_out) = sess.run(fetches, feed)
        # The predictor output becomes the last action column.
        full_action = np.concatenate([a0_out, prediction_out], axis=-1)
        return full_action, v0_out, snew_out, snew_predict_out, neglogp0_out

    def value(ob, state, mask):
        """Return the value estimates."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.X = X
    self.M = M
    self.S = S
    self.S_pred = S_pred
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=32, reuse=False):
    """Build an MLP + LSTM policy with a separate feed-forward value network.

    Args:
        sess: session used by ``step``, ``value``, ``act`` and ``mean``.
        ob_space: observation space; ``shape`` follows the batch dimension.
        ac_space: action space; ``n`` is used as the policy head width.
        nbatch: total batch size.
        nsteps: steps per environment; ``nenv = nbatch // nsteps``.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    def _linear(x):
        # Identity activation for the output heads.
        return x

    nenv = nbatch // nsteps
    ob_shape = (nbatch, ) + ob_space.shape
    # ac_space.n is used here because ac_space.shape does not work for this
    # action space.
    actdim = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.float32, ob_shape)  # observations
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        # Policy branch: one tanh layer, then the LSTM.
        pol_hidden = fc(X, 'fc1', nh=64, init_scale=gain, act=tf.tanh)
        seq_inputs = batch_to_seq(pol_hidden, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', actdim, act=_linear, init_scale=0.01)

        # Value branch: two tanh layers directly on the observation.
        vf_hidden1 = fc(X, 'vf_fc1', nh=64, init_scale=gain, act=tf.tanh)
        vf_hidden2 = fc(vf_hidden1, 'vf_fc2', nh=64, init_scale=gain, act=tf.tanh)
        vf = fc(vf_hidden2, 'vf', 1, act=_linear)

        self.pdtype = make_pdtype(ac_space)
        if isinstance(ac_space, gym.spaces.Discrete):
            self.pd = self.pdtype.pdfromflat(pi)
        else:
            logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                     initializer=tf.zeros_initializer())
            # `pi * 0.0 + logstd` broadcasts logstd to pi's shape.
            pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
            self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    v0 = vf[:, 0]
    neglogp0 = self.pd.neglogp(a0)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp]."""
        return sess.run([a0, v0, snew, neglogp0], {X: ob, S: state, M: mask})

    def value(ob, state, mask):
        """Return the value estimates."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    def get_act(ob, state, mask):
        """Return only the sampled action."""
        return sess.run(a0, {X: ob, S: state, M: mask})

    def get_mean(ob, state, mask):
        """Return the raw policy head output and the new state."""
        head_out, state_new = sess.run([pi, snew], {X: ob, S: state, M: mask})
        return head_out, state_new

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
    self.act = get_act
    self.mean = get_mean
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, nlstm=256, reuse=False):
    """Build a conv-stack + LSTM graph with action, x and y policy heads.

    The ``pix`` and ``piy`` heads are each ``FLAGS.screen_resolution`` wide.

    Args:
        sess: session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is unpacked as three dims.
        ac_space: action space; ``n`` is the ``pi`` head width.
        nenv: number of environments.
        nsteps: steps per environment; the batch is ``nenv * nsteps``.
        nstack: multiplier applied to the last observation dimension.
        nlstm: LSTM size; state placeholder is ``2 * nlstm`` wide.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    def _linear(x):
        # Identity activation for the output heads.
        return x

    nbatch = nenv * nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n
    gain = np.sqrt(2)

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels * nstack))  # observations
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done at t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # recurrent state

    with tf.variable_scope("model", reuse=reuse):
        scaled = tf.cast(X, tf.float32) / 255.
        c1 = conv(scaled, 'c1', nf=32, rf=8, stride=4, init_scale=gain)
        c2 = conv(c1, 'c2', nf=64, rf=4, stride=2, init_scale=gain)
        c3 = conv(c2, 'c3', nf=64, rf=3, stride=1, init_scale=gain)
        flat = conv_to_fc(c3)
        dense = fc(flat, 'fc1', nh=512, init_scale=gain)

        seq_inputs = batch_to_seq(dense, nenv, nsteps)
        seq_masks = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(seq_inputs, seq_masks, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        pi = fc(lstm_out, 'pi', n_actions, act=_linear)
        pix = fc(lstm_out, 'pix', FLAGS.screen_resolution, act=_linear)
        piy = fc(lstm_out, 'piy', FLAGS.screen_resolution, act=_linear)
        vf = fc(lstm_out, 'v', 1, act=_linear)

    v0 = vf[:, 0]
    a0 = sample(pi)
    x0 = sample(pix)
    y0 = sample(piy)

    def step(ob, state, mask):
        """Return (action, x, y, value, new state) for one batch."""
        action, x_pick, y_pick, val, new_state = sess.run(
            [a0, x0, y0, v0, snew], {X: ob, S: state, M: mask})
        return action, x_pick, y_pick, val, new_state

    def value(ob, state, mask):
        """Return the value estimates for one batch."""
        return sess.run(v0, {X: ob, S: state, M: mask})

    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.pix = pix
    self.piy = piy
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a gated-attention policy over an image and a token instruction.

    The image goes through ``nature_cnn``. The instruction (5 int32 token
    ids per sample) is embedded and folded through a GRU cell, and the final
    GRU state is mapped to a 64-wide sigmoid gate. The gate multiplies the
    image representation, which then feeds a dense layer and an LSTM. Policy
    and value heads are dense layers on the LSTM output.

    Args:
        sess: TensorFlow session used by ``step`` and ``value``.
        ob_space: observation space with a 3-element ``shape``.
        ac_space: action space exposing ``n``.
        nbatch: number of samples per forward pass.
        nsteps: number of steps per environment in a batch.
        nlstm: number of LSTM units.
        reuse: forwarded to ``tf.variable_scope("model", ...)``; when true,
            ``self.var_summary`` is also called at the end.
    """
    nenv = nbatch // nsteps
    nh, nw, nc = ob_space.shape
    ob_shape = (nbatch, nh, nw, nc)
    nact = ac_space.n
    n_tokens = 5  # instruction length consumed by the GRU loop below

    X = tf.placeholder(tf.float32, ob_shape)  # obs
    I = tf.placeholder(tf.int32, [nbatch, n_tokens])  # instruction token ids
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    # Model
    # NOTE(review): scope nesting was reconstructed from flattened source;
    # confirm that lstm1 / pi / vf sit directly under "model".
    with tf.variable_scope("model", reuse=reuse):
        # Image processing
        with tf.variable_scope("cnn"):
            x_image_rep = nature_cnn(X)

        # Instruction processing
        with tf.variable_scope("GRU"):
            embedding = tf.get_variable(
                'word_embedding',
                shape=[12, 32],
                initializer=tf.random_uniform_initializer(-1e-3, 1e-3))
            gru_cell = tf.contrib.rnn.GRUCell(
                num_units=256,
                kernel_initializer=tf.random_uniform_initializer(
                    -1e-3, 1e-3),
                bias_initializer=tf.random_uniform_initializer(
                    -1e-3, 1e-3))
            encoder_hidden = gru_cell.zero_state(nbatch, dtype=tf.float32)
            for i in range(n_tokens):
                word_embedding = tf.nn.embedding_lookup(embedding, I[:, i])
                # Only the recurrent state is carried forward; the
                # per-step output is not used.
                _, encoder_hidden = gru_cell.call(
                    word_embedding, encoder_hidden)
            x_insts_rep = encoder_hidden

        # Gated-attention layers
        with tf.variable_scope("x-attn"):
            x_attention = tf.sigmoid(
                fc(x_insts_rep, 'x-attn', 64, init_scale=1.0))
            # Insert two singleton axes so the gate broadcasts over the
            # middle dimensions of the image representation.
            x_attention = tf.expand_dims(x_attention, 1)
            x_attention = tf.expand_dims(x_attention, 2)

        with tf.variable_scope("Gated-Attention"):
            x = x_image_rep * x_attention
            x = conv_to_fc(x)
            x = tf.nn.relu(fc(x, 'x-Ga', 256, init_scale=1.0))

        xs = batch_to_seq(x, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        h20, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm, init_scale=1.0)
        h20 = seq_to_batch(h20)

        with tf.variable_scope("pi"):
            pi = tf.layers.dense(
                h20,
                nact,
                kernel_initializer=normalized_columns_initializer(0.01))

        with tf.variable_scope("vf"):
            vf = tf.layers.dense(
                h20,
                1,
                kernel_initializer=normalized_columns_initializer(0.01))

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)

    def step(ob, insts, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        return sess.run([a0, v0, snew, neglogp0], {
            X: ob,
            I: insts,
            S: state,
            M: mask
        })

    def value(ob, insts, state, mask):
        """Return the value estimate for one batch."""
        return sess.run(v0, {X: ob, I: insts, S: state, M: mask})

    self.X = X
    self.I = I
    # Expose the mask placeholder too: step/value feed it, so external code
    # that feeds this graph needs a handle to it.
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value

    # start logging
    # =============
    if reuse:
        self.var_summary('./Asset/logdir', sess)
def __init__(self,
             sess,
             ob_space,
             ac_space,
             nbatch,
             nsteps,
             nlstm=256,
             reuse=False,
             feature_mlp=True):
    """Build an LSTM policy with an optional two-layer tanh feature network.

    Args:
        sess: TensorFlow session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is the per-sample input shape.
        ac_space: action space. An empty ``shape`` is treated as a discrete
            set of ``ac_space.n`` actions; otherwise ``shape[0]`` is the
            continuous action dimension.
        nbatch: number of samples per forward pass.
        nsteps: number of steps per environment in a batch.
        nlstm: number of LSTM units, also the width of the feature layers.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        feature_mlp: when true, two dense tanh layers precede the LSTM;
            otherwise the observation feeds the LSTM directly.
    """
    nenv = nbatch // nsteps
    state_width = nlstm * 2

    # An empty action shape marks a discrete space; either way the policy
    # head gets one output per action (discrete) or action dimension.
    discrete = len(ac_space.shape) == 0
    head_width = ac_space.n if discrete else ac_space.shape[0]

    X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape, name="Ob")
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width])  # states

    with tf.variable_scope("model", reuse=reuse):
        lstm_input = X
        if feature_mlp:
            print("Using feature network in front of LSTM")
            for layer_name in ("fc1", "fc2"):
                lstm_input = tf.tanh(
                    fc(lstm_input, layer_name, nh=nlstm,
                       init_scale=np.sqrt(2)))
        else:
            print("No feature network in front of LSTM")

        seq_in = batch_to_seq(lstm_input, nenv, nsteps)
        seq_mask = batch_to_seq(M, nenv, nsteps)
        core, snew = lstm(seq_in, seq_mask, S, "lstm1", nh=nlstm)
        core = seq_to_batch(core)

        vf = fc(core, "vf", 1)
        pi = fc(core, "pi", head_width, init_scale=0.01)
        if not discrete:
            logstd = tf.get_variable(name="logstd",
                                     shape=[1, head_width],
                                     initializer=tf.zeros_initializer())

    self.pdtype = make_pdtype(ac_space)
    if discrete:
        flat_params = pi
    else:
        # `pi * 0.0 + logstd` broadcasts logstd to one row per sample.
        flat_params = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pd = self.pdtype.pdfromflat(flat_params)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Build a small conv -> dense -> LSTM network with policy and Q heads.

    The parent constructor runs first. ``self.lstm_units`` and
    ``self.dense_units`` are read afterwards to size the network.

    Args:
        sess: TensorFlow session, stored as ``self.sess``.
        ob_space: observation space with a 3-element ``shape``.
        ac_space: action space exposing ``n``.
        nenv: number of environments.
        nsteps: number of steps per environment in a batch.
        nstack: multiplier applied to the last observation dimension.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    super().__init__(sess, ob_space, ac_space, nenv, nsteps, nstack,
                     reuse=reuse)

    batch_size = nenv * nsteps
    height, width, channels = ob_space.shape
    n_actions = ac_space.n
    nlstm = self.lstm_units
    gain = np.sqrt(2)

    obs_ph = tf.placeholder(
        tf.uint8, (batch_size, height, width, channels * nstack))  # obs
    M = tf.placeholder(tf.float32, [batch_size])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, nlstm * 2])  # states

    with tf.variable_scope("model", reuse=reuse):
        # NOTE: the float32 cast, not the uint8 placeholder, is what gets
        # published as self.X below.
        X = tf.cast(obs_ph, tf.float32)

        # Two 3x3, stride-1, SAME-padded conv layers, each followed by ReLU.
        feat = X
        for layer_name, n_filters in (('c1', 16), ('c2', 32)):
            feat = conv(feat, layer_name, nf=n_filters, rf=3, stride=1,
                        pad='SAME', init_scale=gain)
            feat = tf.nn.relu(feat)

        feat = conv_to_fc(feat)
        feat = tf.nn.relu(
            fc(feat, 'fc1', nh=self.dense_units, init_scale=gain))

        # lstm
        seq_in = batch_to_seq(feat, nenv, nsteps)
        seq_mask = batch_to_seq(M, nenv, nsteps)
        core, snew = lstm(seq_in, seq_mask, S, 'lstm1', nh=nlstm)
        core = seq_to_batch(core)

        pi_logits = fc(core, 'pi', n_actions, init_scale=0.01)
        pi = tf.nn.softmax(pi_logits)
        q = fc(core, 'q', n_actions)

    # Sampled from the logits rather than from the softmax output.
    self.a = sample(pi_logits)
    self.initial_state = np.zeros((nenv, nlstm * 2), dtype=np.float32)
    self.snew = snew
    self.X, self.M, self.S = X, M, S
    self.pi = pi  # softmax probabilities
    self.q = q
    self.sess = sess
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=16, reuse=False):
    """Build a recurrent policy around this file's ``lstm`` object.

    The state placeholder is split column-wise into two halves of width
    ``nlstm`` that are handed to ``self.lstm.update`` together with the
    sequenced observations and masks. The value and policy heads are built
    on the recurrent output.

    Args:
        sess: TensorFlow session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is the per-sample input shape.
        ac_space: action space; ``n`` is read, and the space is passed to
            ``make_pdtype``.
        nbatch: number of samples per forward pass.
        nsteps: number of steps per environment in a batch.
        nlstm: width of each half of the recurrent state.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    input_len = ob_space.shape
    state_width = nlstm * 2

    # Nothing below reads this dict; it is kept because building it reads
    # ob_space.shape[0] and ac_space.n exactly as before.
    qmdp_param = {
        'K': 3,
        'obs_len': ob_space.shape[0] - ac_space.n,
        'num_action': ac_space.n,
        'num_state': 32,
        'num_obs': 17,
    }

    self.pdtype = make_pdtype(ac_space)

    X = tf.placeholder(tf.float32,
                       (nbatch, ) + input_len)  # [nbatch, obs+prev action]
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width])  # beliefs

    with tf.variable_scope("model", reuse=reuse):
        seq_in = batch_to_seq(X, nenv, nsteps)
        seq_mask = batch_to_seq(M, nenv, nsteps)

        # First nlstm columns go in as `h`, the rest as `c`.
        first_half = S[:, 0:nlstm]
        second_half = S[:, nlstm:]
        self.lstm = lstm('lstm', input_len[0], nlstm)
        core, snew = self.lstm.update(seq_in, seq_mask, first_half,
                                      second_half)
        core = seq_to_batch(core)

        vf = fc(core, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(core)

    v0 = vf[:, 0]
    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, v0, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X, self.M, self.S = X, M, S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, size_mem=256, reuse=False):  # pylint: disable=W0613
    """Build an LSTM policy that feeds raw observations into the LSTM.

    Both the policy mean ``pi`` and the value ``vf`` are linear heads on the
    LSTM output. ``pi`` is concatenated with a learned ``logstd`` to form the
    flat parameters of the action distribution.

    Args:
        sess: TensorFlow session used by ``step`` and ``value``.
        ob_space: observation space; ``shape`` is the per-sample input shape.
        ac_space: action space. An empty ``shape`` gives an action dimension
            of 1; otherwise ``shape[0]`` is used.
        nbatch: number of samples per forward pass.
        nsteps: number of steps per environment in a batch.
        size_mem: number of LSTM units.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    state_width = size_mem * 2
    actdim = 1 if ac_space.shape == () else ac_space.shape[0]
    gain = np.sqrt(2)

    def identity(tensor):
        return tensor

    X = tf.placeholder(tf.float32, (nbatch, ) + ob_space.shape,
                       name='Ob')  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width])  # states

    with tf.variable_scope("model", reuse=reuse):
        # No feature layers: the observation goes straight into the LSTM.
        lstm_input = tf.cast(X, tf.float32)
        seq_in = batch_to_seq(lstm_input, nenv, nsteps)
        seq_mask = batch_to_seq(M, nenv, nsteps)
        core, snew = lstm(seq_in, seq_mask, S, 'lstm', nh=size_mem)
        core = seq_to_batch(core)

        pi = fc(core, 'pi', actdim, act=identity, init_scale=0.01)

        # NOTE: these two layers are created (and so add variables to the
        # "model" scope) but nothing reads their output; `vf` below is
        # built from the LSTM output instead.
        unused_hidden = fc(X, 'vf_fc1', nh=64, init_scale=gain, act=tf.tanh)
        fc(unused_hidden, 'vf_fc2', nh=64, init_scale=gain, act=tf.tanh)

        vf = fc(core, 'vf', 1, act=identity)[:, 0]
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # `pi * 0.0 + logstd` broadcasts logstd to one row per sample.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([a0, vf, snew, neglogp0], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(vf, feed)

    self.X, self.M, self.S = X, M, S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value