def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build a two-branch tanh MLP policy/value graph with 4-unit layers.

    Both branches read the flattened tensor returned by
    ``observation_input``. ``step`` and ``value`` closures that evaluate
    the graph through ``sess.run`` are attached to the instance.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: observation space passed to ``observation_input``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: batch size passed to ``observation_input``.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    self.pdtype = make_pdtype(ac_space)
    hidden_scale = np.sqrt(2)

    with tf.variable_scope("model", reuse=reuse):
        X, obs_tensor = observation_input(ob_space, nbatch)
        flat_obs = tf.layers.flatten(obs_tensor)

        # Policy branch.
        pi_hidden = tf.tanh(fc(flat_obs, 'pi_fc1', nh=4, init_scale=hidden_scale))
        pi_hidden = tf.tanh(fc(pi_hidden, 'pi_fc2', nh=4, init_scale=hidden_scale))

        # Value branch (separate layers, same input).
        vf_hidden = tf.tanh(fc(flat_obs, 'vf_fc1', nh=4, init_scale=hidden_scale))
        vf_hidden = tf.tanh(fc(vf_hidden, 'vf_fc2', nh=4, init_scale=hidden_scale))
        value_column = fc(vf_hidden, 'vf', 1)
        vf = value_column[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(pi_hidden, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        action, val, nlp = sess.run([sampled_action, vf, sampled_neglogp], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
    """Build a CNN policy/value graph on a uint8 observation placeholder.

    ``nature_cnn`` produces a shared feature tensor; ``fc`` heads named
    ``'pi'`` and ``'v'`` are stacked on it. ``step`` and ``value`` closures
    that call ``sess.run`` are attached to the instance.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape`` unpacks into three dimensions.
        ac_space: space exposing ``n``; also passed to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    height, width, channels = ob_space.shape
    num_actions = ac_space.n
    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        pi = fc(features, 'pi', num_actions, init_scale=0.01)
        value_column = fc(features, 'v', 1)
        vf = value_column[:, 0]

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        action, val, nlp = sess.run([sampled_action, vf, sampled_neglogp], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build separate tanh MLP stacks for the value and the policy.

    Args:
        ob_space: must be a ``gym.spaces.Box`` (asserted).
        ac_space: action space passed to ``make_pdtype``.
        hid_size: width of every hidden ``U.dense`` layer.
        num_hid_layers: number of hidden layers in each stack.
        gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
            second half of the pd parameters comes from a standalone
            ``logstd`` variable instead of a network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    # Leading dimension is left unspecified (None).
    ob = U.get_placeholder(name="ob", dtype=tf.float32,
                           shape=[None] + list(ob_space.shape))

    def tanh_stack(prefix):
        """Stack ``num_hid_layers`` tanh dense layers named ``<prefix><k>``."""
        hidden = ob
        for layer_no in range(1, num_hid_layers + 1):
            pre_activation = U.dense(hidden, hid_size, "%s%i" % (prefix, layer_no),
                                     weight_init=U.normc_initializer(1.0))
            hidden = tf.nn.tanh(pre_activation)
        return hidden

    # Value branch is built first, then the policy branch.
    value_hidden = tanh_stack("vffc")
    self.vpred = U.dense(value_hidden, 1, "vffinal",
                         weight_init=U.normc_initializer(1.0))[:, 0]

    policy_hidden = tanh_stack("polfc")
    use_fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
    if not use_fixed_var:
        pdparam = U.dense(policy_hidden, pdtype.param_shape()[0], "polfinal",
                          U.normc_initializer(0.01))
    else:
        mean = U.dense(policy_hidden, pdtype.param_shape()[0] // 2, "polfinal",
                       U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd",
                                 shape=[1, pdtype.param_shape()[0] // 2],
                                 initializer=tf.zeros_initializer)
        # mean * 0.0 + logstd stretches the single logstd row to mean's shape.
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # Boolean switch: sample when true, take the mode otherwise.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
def build_policy_network(self, sy_ob_no):
    """Build the policy distribution for the observation tensor ``sy_ob_no``.

    Two 64-unit tanh dense layers feed a linear mean head of width
    ``self.ac_dim``. The log-std is a standalone variable, not a network
    output.

    Args:
        sy_ob_no: input tensor fed to the first dense layer.

    Returns:
        The object returned by ``pdtype.pdfromflat`` for the concatenated
        (mean, logstd) parameters.
    """
    hidden = sy_ob_no
    for _ in range(2):
        hidden = tf.layers.dense(
            hidden, 64,
            activation=tf.nn.tanh,
            kernel_initializer=tf.truncated_normal_initializer(stddev=1.0))

    sy_mean_na = tf.layers.dense(
        hidden, self.ac_dim,
        activation=None,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))

    sy_logstd = tf.Variable(
        tf.zeros([self.ac_dim]), name='action/logstd', dtype=tf.float32)

    # mean * 0.0 + logstd broadcasts the logstd vector across the batch axis.
    tiled_logstd = sy_mean_na * 0.0 + sy_logstd
    flat_params = tf.concat([sy_mean_na, tiled_logstd], axis=1)

    # NOTE(review): make_pdtype receives self.ac_dim here, whereas the other
    # constructors in this file pass an action space — confirm this is intended.
    return make_pdtype(self.ac_dim).pdfromflat(flat_params)
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a recurrent policy: ``nature_cnn`` features fed to ``lnlstm``.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: observation space passed to ``observation_input``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: total batch size; ``nbatch // nsteps`` environments.
        nsteps: number of steps per environment in a batch.
        nlstm: value passed as ``nh`` to ``lnlstm``; the state placeholder
            has ``2 * nlstm`` columns.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    state_width = nlstm*2

    X, obs_tensor = observation_input(ob_space, nbatch)
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width])  # states
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_tensor)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lnlstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        vf = fc(lstm_out, 'v', 1)
        self.pd, self.pi = self.pdtype.pdfromlatent(lstm_out)

    v0 = vf[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new_state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([sampled_action, v0, snew, sampled_neglogp], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
    """Build a two-branch tanh MLP policy/value graph with 64-unit layers.

    Besides ``step`` and ``value``, the instance exposes ``var_list`` (the
    trainable variables collected under ``'model'``), ``neg_log_prob`` and
    ``entropy``.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape`` defines the observation placeholder.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: leading dimension of the observation placeholder.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    self.pdtype = make_pdtype(ac_space)
    hidden_scale = np.sqrt(2)

    with tf.variable_scope("model", reuse=reuse):
        X = tf.placeholder(shape=(nbatch,) + ob_space.shape, dtype=tf.float32)
        flat_obs = tf.layers.flatten(X)

        # Policy branch.
        pi_hidden = tf.tanh(fc(flat_obs, 'pi_fc1', nh=64, init_scale=hidden_scale))
        pi_hidden = tf.tanh(fc(pi_hidden, 'pi_fc2', nh=64, init_scale=hidden_scale))

        # Value branch.
        vf_hidden = tf.tanh(fc(flat_obs, 'vf_fc1', nh=64, init_scale=hidden_scale))
        vf_hidden = tf.tanh(fc(vf_hidden, 'vf_fc2', nh=64, init_scale=hidden_scale))
        value_column = fc(vf_hidden, 'vf', 1)
        vf = value_column[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(pi_hidden, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'model')

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        action, val, nlp = sess.run([sampled_action, vf, sampled_neglogp], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        """Return ``self.pd.neglogp(actions)``."""
        return self.pd.neglogp(actions)

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.neg_log_prob = neg_log_prob
    self.entropy = self.pd.entropy()
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, deterministic = False): #pylint: disable=W0613
    """Build an actor-critic graph on an 8-column ``phero_values`` input.

    ``self.net`` is called twice on the same placeholder: the first result
    feeds the policy distribution, the second feeds the ``'vf'`` head.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: not used by this constructor.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: not used by this constructor.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        deterministic: when true the action is ``pd.mode()``; otherwise
            it is ``pd.sample()``.
    """
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        phero_values = tf.placeholder(shape=(None, 8), dtype=tf.float32,
                                      name="phero_values")

        actor_latent = self.net(phero_values)   # actor network
        critic_latent = self.net(phero_values)  # critic network
        value_column = fc(critic_latent, 'vf', 1)
        vf = value_column[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(actor_latent, init_scale=0.01)

    chosen_action = self.pd.mode() if deterministic else self.pd.sample()
    chosen_neglogp = self.pd.neglogp(chosen_action)
    self.initial_state = None

    self.phero = phero_values
    self.vf = vf

    def step(ob, *_args, **_kwargs):
        """Run the graph on ``ob``; return (action, value, initial_state, neglogp)."""
        action, val, nlp = sess.run([chosen_action, vf, chosen_neglogp],
                                    {self.phero: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {self.phero: ob})

    self.step = step
    self.value = value
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Create the feature extractor and the policy/value heads (torch).

    Args:
        ob_space: observation space passed to ``small_convnet``.
        ac_space: action space passed to ``make_pdtype``.
        hidsize: width of the two hidden ``Linear`` layers.
        ob_mean: stored on the instance as ``ob_mean``.
        ob_std: stored on the instance as ``ob_std``.
        feat_dim: feature size produced by ``small_convnet`` and consumed
            by the first hidden layer.
        layernormalize: forwarded to ``small_convnet``; a warning is
            printed when truthy.
        nl: forwarded to ``small_convnet`` as ``nl``.
        scope: stored on the instance as ``scope``.
    """
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")

    self.layernormalize = layernormalize
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std
    self.ob_space = ob_space
    self.ac_space = ac_space
    self.ac_pdtype = make_pdtype(ac_space)

    self.pd = self.vpred = None
    self.hidsize = hidsize
    self.feat_dim = feat_dim
    self.scope = scope
    pd_param_count = self.ac_pdtype.param_shape()[0]

    # Modules are created in this fixed order: features, hidden, pd head, vf head.
    self.features_model = small_convnet(
        self.ob_space,
        nl=self.nl,
        feat_dim=self.feat_dim,
        last_nl=None,
        layernormalize=self.layernormalize,
    )
    hidden_layers = [
        torch.nn.Linear(feat_dim, hidsize),
        torch.nn.ReLU(),
        torch.nn.Linear(hidsize, hidsize),
        torch.nn.ReLU(),
    ]
    self.pd_hidden = torch.nn.Sequential(*hidden_layers)
    self.pd_head = torch.nn.Linear(hidsize, pd_param_count)
    self.vf_head = torch.nn.Linear(hidsize, 1)

    # One parameter-group dict per module, in creation order.
    modules = (self.features_model, self.pd_hidden, self.pd_head, self.vf_head)
    self.param_list = [dict(params=module.parameters()) for module in modules]

    # Placeholders; nothing in this constructor fills them in.
    self.flat_features = None
    self.pd = None
    self.vpred = None
    self.ac = None
    self.ob = None
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, deterministic=False): #pylint: disable=W0613
    """Build an actor-critic graph on a ``phero_values`` input of width ``num_obs``.

    ``self.net`` is called twice on the same placeholder: the first result
    feeds the policy distribution, the second feeds the ``'vf'`` head.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: not used by this constructor.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: not used by this constructor.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        deterministic: when true the action is ``pd.mode()``; otherwise
            it is ``pd.sample()``.
    """
    self.pdtype = make_pdtype(ac_space)
    self.num_obs = 8

    with tf.variable_scope("model", reuse=reuse):
        phero_values = tf.placeholder(shape=(None, self.num_obs),
                                      dtype=tf.float32, name="phero_values")

        actor_latent = self.net(phero_values)   # actor network
        critic_latent = self.net(phero_values)  # critic network
        value_column = fc(critic_latent, 'vf', 1)
        vf = value_column[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(actor_latent, init_scale=0.01)

    chosen_action = self.pd.mode() if deterministic else self.pd.sample()
    chosen_neglogp = self.pd.neglogp(chosen_action)
    self.initial_state = None

    self.phero = phero_values
    self.vf = vf

    def step(ob, *_args, **_kwargs):
        """Run the graph on ``ob``; return (action, value, initial_state, neglogp)."""
        # The observations are materialised into a list before feeding.
        feed = {self.phero: list(ob)}
        action, val, nlp = sess.run([chosen_action, vf, chosen_neglogp], feed)
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {self.phero: list(ob)})

    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a recurrent CNN policy: ``nature_cnn`` features fed to ``lstm``.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape`` unpacks into three dimensions.
        ac_space: space exposing ``n``; also passed to ``make_pdtype``.
        nbatch: total batch size; ``nbatch // nsteps`` environments.
        nsteps: number of steps per environment in a batch.
        nlstm: value passed as ``nh`` to ``lstm``; the state placeholder
            has ``2 * nlstm`` columns.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    height, width, channels = ob_space.shape
    num_actions = ac_space.n
    state_width = nlstm * 2

    X = tf.placeholder(tf.uint8, (nbatch, height, width, channels))  # obs
    M = tf.placeholder(tf.float32, [nbatch])  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width])  # states

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(X)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)
        pi = fc(lstm_out, 'pi', num_actions)
        vf = fc(lstm_out, 'v', 1)

    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pi)

    v0 = vf[:, 0]
    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new_state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([sampled_action, v0, snew, sampled_neglogp], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, env, observations, latent, estimate_q=False, vf_latent=None, sess=None, **tensors):
    """Attach a policy distribution and a value (or Q) head to latent tensors.

    Args:
        env: environment; only ``env.action_space`` is read here.
        observations: tensor stored as ``self.X``.
        latent: tensor from which the policy distribution is built.
        estimate_q: when true, build a ``'q'`` head with
            ``env.action_space.n`` outputs (the action space must be
            ``gym.spaces.Discrete``); otherwise build a single-output
            ``'vf'`` head and keep its first column.
        vf_latent: tensor for the value head; ``latent`` is used when None.
        sess: stored as ``self.sess``.
        **tensors: extra attributes copied straight into ``self.__dict__``.
    """
    self.X = observations
    self.state = tf.constant([])
    self.initial_state = None
    self.__dict__.update(tensors)

    if vf_latent is None:
        vf_latent = latent

    # Flatten the value latent first, then the policy latent.
    vf_latent = tf.layers.flatten(vf_latent)
    latent = tf.layers.flatten(latent)

    # The pd type is picked by make_pdtype from the action space.
    self.pdtype = make_pdtype(env.action_space)
    self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=0.01)

    # Sampled action and its negative log-probability.
    self.action = self.pd.sample()
    self.neglogp = self.pd.neglogp(self.action)
    self.sess = sess

    if not estimate_q:
        self.vf = fc(vf_latent, 'vf', 1)
        self.vf = self.vf[:, 0]
    else:
        assert isinstance(env.action_space, gym.spaces.Discrete)
        self.q = fc(vf_latent, 'q', env.action_space.n)
        self.vf = self.q
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, deterministic = False): #pylint: disable=W0613
    """Build an actor-critic graph on laser / rel_goal / velocities inputs.

    ``self.net`` is called twice on the same three placeholders: the first
    result feeds the policy distribution, the second feeds the ``'vf'`` head.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: not used by this constructor.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: not used by this constructor.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        deterministic: when true the action is ``pd.mode()``; otherwise
            it is ``pd.sample()``.
    """
    self.pdtype = make_pdtype(ac_space)

    with tf.variable_scope("model", reuse=reuse):
        laser = tf.placeholder(shape=(None, 512, 3), dtype=tf.float32, name="laser")
        rel_goal = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="rel_goal")
        velocities = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="velocities")

        actor_latent = self.net(laser, rel_goal, velocities)
        critic_latent = self.net(laser, rel_goal, velocities)
        value_column = fc(critic_latent, 'vf', 1)
        vf = value_column[:, 0]

        self.pd, self.pi = self.pdtype.pdfromlatent(actor_latent, init_scale=0.01)

    chosen_action = self.pd.mode() if deterministic else self.pd.sample()
    chosen_neglogp = self.pd.neglogp(chosen_action)
    self.initial_state = None

    self.laser = laser
    self.rel_goal = rel_goal
    self.velocities = velocities
    self.vf = vf

    def build_feed(ob):
        """Split a sequence of observation dicts into per-placeholder lists."""
        laser_batch = [o["laser"] for o in ob]
        goal_batch = [o["rel_goal"] for o in ob]
        velocity_batch = [o["velocities"] for o in ob]
        return {
            self.laser: laser_batch,
            self.rel_goal: goal_batch,
            self.velocities: velocity_batch,
        }

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        action, val, nlp = sess.run([chosen_action, vf, chosen_neglogp], build_feed(ob))
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, build_feed(ob))

    self.step = step
    self.value = value
def _init(self, ob_space, ac_space):
    """Build separate conv networks for the policy ("pol") and value ("vf").

    Both networks read the observation placeholder divided by 255.0 and use
    the same layer layout: two ReLU ``U.conv2d`` layers, a flatten, and a
    128-unit ReLU ``U.dense`` layer.

    The ``stochastic`` placeholder now selects between ``pd.sample()``
    (true) and ``pd.mode()`` (false) through ``U.switch``. Previously the
    flag was accepted by ``self._act`` but ignored, so the action was
    always sampled.

    Args:
        ob_space: must be a ``gym.spaces.Box`` (asserted).
        ac_space: action space passed to ``make_pdtype``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    sy_ob = U.get_placeholder(name="sy_ob", dtype=tf.float32,
                              shape=[sequence_length] + list(ob_space.shape))
    obscaled = sy_ob / 255.0

    def conv_trunk():
        """Shared layer layout; variables land in the enclosing variable scope."""
        x = obscaled
        x = tf.nn.relu(U.conv2d(x, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        x = tf.nn.relu(U.conv2d(x, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        x = U.flattenallbut0(x)
        return tf.nn.relu(U.dense(x, 128, 'lin', U.normc_initializer(1.0)))

    with tf.variable_scope("pol"):
        x = conv_trunk()
        logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    with tf.variable_scope("vf"):
        x = conv_trunk()
        self.vpred = U.dense(x, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # Honour the flag: sample when stochastic, otherwise take the mode.
    sy_ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, sy_ob], [sy_ac, self.vpred])
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
    """Build a CNN policy/value graph sharing one ``nature_cnn`` trunk.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: observation space passed to ``observation_input``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: batch size passed to ``observation_input``.
        nsteps: not used by this constructor.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        **conv_kwargs: forwarded verbatim to ``nature_cnn``.
    """
    self.pdtype = make_pdtype(ac_space)
    X, obs_tensor = observation_input(ob_space, nbatch)

    with tf.variable_scope("model", reuse=reuse):
        features = nature_cnn(obs_tensor, **conv_kwargs)
        value_column = fc(features, 'v', 1)
        vf = value_column[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(features, init_scale=0.01)

    sampled_action = self.pd.sample()
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        action, val, nlp = sess.run([sampled_action, vf, sampled_neglogp], {X: ob})
        return action, val, self.initial_state, nlp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, ob_space, ac_space, hidsize, ob_mean, ob_std, feat_dim, layernormalize, nl, scope="policy"):
    """Build the policy/value graph on top of ``self.get_features``.

    Observations arrive with two leading dimensions. They are flattened
    before the feature extractor, and the head outputs are unflattened
    back using the dynamic shape of the observation placeholder.

    Args:
        ob_space: space whose ``shape`` defines the trailing placeholder dims.
        ac_space: action space passed to ``make_pdtype``.
        hidsize: units of the two hidden ``fc`` layers.
        ob_mean: stored on the instance as ``ob_mean``.
        ob_std: stored on the instance as ``ob_std``.
        feat_dim: stored on the instance as ``feat_dim``.
        layernormalize: stored on the instance; a warning is printed when truthy.
        nl: stored on the instance as ``nl``.
        scope: variable scope name; it is opened twice, nested.
    """
    if layernormalize:
        print("Warning: policy is operating on top of layer-normed features. It might slow down the training.")

    self.layernormalize = layernormalize
    self.bool_actionclip = True  # TODO: make this configurable
    self.nl = nl
    self.ob_mean = ob_mean
    self.ob_std = ob_std

    with tf.variable_scope(scope):
        self.ob_space = ob_space
        self.ac_space = ac_space
        # Original author note: should be a continuous pd type when the
        # environment's action space is continuous.
        self.ac_pdtype = make_pdtype(ac_space)
        self.ph_ob = tf.placeholder(dtype=tf.int32,
                                    shape=(None, None) + ob_space.shape,
                                    name='ob')
        self.ph_ac = self.ac_pdtype.sample_placeholder([None, None], name='ac')
        self.pd = self.vpred = None
        self.hidsize = hidsize
        self.feat_dim = feat_dim
        self.scope = scope
        pd_param_count = self.ac_pdtype.param_shape()[0]

        ob_dyn_shape = tf.shape(self.ph_ob)
        flat_ob = flatten_two_dims(self.ph_ob)
        self.flat_features = self.get_features(flat_ob, reuse=False)
        self.features = unflatten_first_dim(self.flat_features, ob_dyn_shape)

        # The same scope name is opened again here, nested in the outer one.
        with tf.variable_scope(scope, reuse=False):
            hidden = fc(self.flat_features, units=hidsize, activation=activ)
            hidden = fc(hidden, units=hidsize, activation=activ)
            flat_pdparam = fc(hidden, name='pd', units=pd_param_count,
                              activation=tf.nn.tanh)
            flat_vpred = fc(hidden, name='value_function_output', units=1,
                            activation=None)

        pdparam = unflatten_first_dim(flat_pdparam, ob_dyn_shape)
        self.vpred = unflatten_first_dim(flat_vpred, ob_dyn_shape)[:, :, 0]
        self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)

        self.a_samp = pd.sample()
        if self.bool_actionclip:
            self.a_samp = self.clip_action(self.a_samp)

        self.entropy = pd.entropy()
        # neglogp is evaluated on the (possibly clipped) sample.
        self.nlp_samp = pd.neglogp(self.a_samp)
        self.pd_logstd = pd.logstd
        self.pd_std = pd.std
        self.pd_mean = pd.mean
def __init__(self, sess, ob_space, ac_space, nbatch, reuse=False, training=True):  # pylint: disable=W0613
    """Build a two-branch MLP policy/value graph with ``lkrelu`` activations.

    The policy mean is ``tanh(fc(...)) * 10``; the log-std is a standalone
    ``[1, actdim]`` variable. The sampled action is also exported as a
    graph tensor named ``'action'`` for use at inference time.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape[0]`` is the observation width.
        ac_space: space whose ``shape[0]`` is the action dimension; also
            passed to ``make_pdtype``.
        nbatch: currently unused (the placeholder batch dimension is None).
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        training: currently unused (the batch-normalised variant of the
            network is disabled).
    """
    actdim = ac_space.shape[0]
    X = tf.placeholder(tf.float32, [None, ob_space.shape[0]], name='Ob')

    with tf.variable_scope("model", reuse=reuse):
        activ = lkrelu

        # Policy branch.
        pi_h1 = activ(fc(X, 'pi_fc1', nh=100, init_scale=np.sqrt(2)))
        pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=100, init_scale=np.sqrt(2)))
        pi = tf.nn.tanh(fc(pi_h2, 'pi', nh=actdim)) * 10

        # Value branch.
        vf_h1 = activ(fc(X, 'vf_fc1', nh=100, init_scale=np.sqrt(2)))
        vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=100, init_scale=np.sqrt(2)))
        vf = fc(vf_h2, 'vf', 1)[:, 0]

        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # pi * 0.0 + logstd stretches the single logstd row to pi's shape.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

    # Original author note: make_pdtype returns the DiagGaussianPd type
    # here — TODO confirm against make_pdtype.
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    a0 = self.pd.sample()
    # Named copy of the sampled action, for use at inference time.
    self.action = tf.identity(a0, name='action')
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None

    def step(ob, *_args, **_kwargs):
        """Return (action, value, initial_state, neglogp) for ``ob``."""
        # {X: ob} feeds the observations into the placeholder.
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        """Return the value head output for ``ob``."""
        return sess.run(vf, {X: ob})

    self.X = X
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
    """Build a recurrent policy: ``mlp`` features fed to ``lnlstm``.

    The policy mean is read directly from the first two columns of the
    recurrent output (``tanh * 3`` and ``sigmoid * 10``). The log-std is a
    standalone ``[1, actdim]`` variable. The sampled action and the new
    recurrent state are also exported as graph tensors named ``'action'``
    and ``'newCellState'``.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape[0]`` is the observation width.
        ac_space: space whose ``shape[0]`` is the action dimension; also
            passed to ``make_pdtype``.
        nbatch: total batch size; ``nbatch // nsteps`` environments.
        nsteps: number of steps per environment in a batch.
        nlstm: value passed as ``nh`` to ``lnlstm``; the state placeholder
            has ``2 * nlstm`` columns.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
    """
    nenv = nbatch // nsteps
    state_width = nlstm * 2
    ob_shape = (nbatch, ob_space.shape[0])
    actdim = ac_space.shape[0]

    X = tf.placeholder(tf.float32, ob_shape, name='phOb')
    M = tf.placeholder(tf.float32, [nbatch], name='phMaskDone')  # mask (done t-1)
    S = tf.placeholder(tf.float32, [nenv, state_width],
                       name='phCellState')  # states and output: (c, h)

    with tf.variable_scope("model", reuse=reuse):
        features = mlp(X)
        feature_seq = batch_to_seq(features, nenv, nsteps)
        mask_seq = batch_to_seq(M, nenv, nsteps)
        lstm_out, snew = lnlstm(feature_seq, mask_seq, S, 'lstm1', nh=nlstm)
        lstm_out = seq_to_batch(lstm_out)

        # Column 0 is squashed by tanh * 3, column 1 by sigmoid * 10.
        first_mean = tf.nn.tanh(lstm_out[:, :1]) * 3
        second_mean = tf.nn.sigmoid(lstm_out[:, 1:2]) * 10
        pi = tf.concat([first_mean, second_mean], axis=1, name='pi')
        vf = fc(lstm_out, 'v', 1)
        logstd = tf.get_variable(name="logstd", shape=[1, actdim],
                                 initializer=tf.zeros_initializer())

    # pi * 0.0 + logstd stretches the single logstd row to pi's shape.
    pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
    self.pdtype = make_pdtype(ac_space)
    self.pd = self.pdtype.pdfromflat(pdparam)

    v0 = vf[:, 0]
    sampled_action = self.pd.sample()
    # Named graph tensors; kept for their names, not their Python handles.
    tf.add(sampled_action, 0, name='action')
    tf.add(snew, 0, name='newCellState')
    print('sel.pd.shape', self.pd.shape, sampled_action.shape)
    sampled_neglogp = self.pd.neglogp(sampled_action)
    self.initial_state = np.zeros((nenv, state_width), dtype=np.float32)

    def step(ob, state, mask):
        """Return [action, value, new_state, neglogp] for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run([sampled_action, v0, snew, sampled_neglogp], feed)

    def value(ob, state, mask):
        """Return the value estimate for one batch."""
        feed = {X: ob, S: state, M: mask}
        return sess.run(v0, feed)

    self.X = X
    self.M = M
    self.S = S
    self.pi = pi
    self.vf = vf
    self.step = step
    self.value = value
def __init__(self, sess, ob_space, loc_space, ac_space, nbatch, nsteps, max_timesteps, reuse=False, seed=0):
    """Build a GRU encoder/decoder policy with an attention read-out.

    The goal sequence ``G`` is embedded and run through a GRU cell. The
    same cell is then stepped over the embedded locations ``Y``, and at
    each step an attention-weighted sum over the encoder output is
    concatenated with the decoder state to form the latent for the policy
    and value heads.

    Args:
        sess: object whose ``run`` method evaluates the graph.
        ob_space: space whose ``shape`` defines the ``X`` placeholder.
        loc_space: integer used as the last dimension of ``G`` and ``Y``.
        ac_space: action space passed to ``make_pdtype``.
        nbatch: total batch size; ``nbatch // nsteps`` environments.
        nsteps: number of steps per environment in a batch.
        max_timesteps: second dimension of the goal placeholder ``G``.
        reuse: forwarded to ``tf.variable_scope("model", ...)``.
        seed: passed to ``tf.set_random_seed``, ``ortho_init``, ``fc`` and
            ``pdfromlatent``.
    """
    nenv = nbatch // nsteps
    self.pdtype = make_pdtype(ac_space)
    with tf.variable_scope("model", reuse=reuse):
        # Placeholders: G goals, X observations, Y locations, M masks, S state.
        G = tf.placeholder(tf.float32, [nbatch, max_timesteps, loc_space])
        # NOTE(review): X is fed by step/value but no op below reads it — confirm.
        X = tf.placeholder(tf.float32, (nbatch, )+ob_space.shape)
        Y = tf.placeholder(tf.float32, [nbatch, loc_space])
        M = tf.placeholder(tf.float32, [nbatch])
        S = tf.placeholder(tf.float32, [nenv, 128])
        ys = batch_to_seq(Y, nenv, nsteps)
        ms = batch_to_seq(M, nenv, nsteps)
        tf.set_random_seed(seed)
        # Embedding shared by the goal encoder and the location decoder.
        self.embed_W = tf.get_variable("embed_w", [loc_space, 64], initializer=ortho_init(1.0, seed))
        self.embed_b = tf.get_variable("embed_b", [64,])
        # Attention parameters: wa/wb act on the decoder state, ua/ub on the
        # encoder output, va reduces the tanh result to a score.
        self.wa = tf.get_variable("wa", [128, 128], initializer=ortho_init(1.0, seed))
        self.wb = tf.get_variable("wb", [128,])
        self.ua = tf.get_variable("ua", [128, 128], initializer=ortho_init(1.0, seed))
        self.ub = tf.get_variable("ub", [128,])
        self.va = tf.get_variable("va", [128])
        # A single GRU cell instance serves both the encoder and the decoder.
        self.rnn = tf.nn.rnn_cell.GRUCell(128, kernel_initializer=ortho_init(1.0, seed))
        # NOTE(review): enc_hidden is never read below — confirm it can go.
        enc_hidden = tf.zeros((nbatch, 128))
        # Embed every goal element, then restore the (batch, time, feature) layout.
        embed_G = tf.matmul(tf.reshape(G, (-1, loc_space)),self.embed_W)+self.embed_b
        embed_G = tf.reshape(embed_G, (nbatch, max_timesteps, -1))
        enc_output, _ = tf.nn.dynamic_rnn(cell=self.rnn, inputs=embed_G, dtype=tf.float32)
        gs = batch_to_seq(enc_output, nenv, nsteps)
        dec_hidden = S
        h = []
        for idx, (y, m, g) in enumerate(zip(ys, ms, gs)):
            # Zero the carried state wherever the mask is 1.
            dec_hidden = dec_hidden*(1-m)
            embed_y = tf.matmul(y,self.embed_W)+self.embed_b
            # One decoder step: the embedded location is a length-1 sequence.
            dec_output, dec_hidden = tf.nn.dynamic_rnn(cell=self.rnn, inputs=tf.expand_dims(embed_y,axis=1), initial_state=dec_hidden)
            # Additive attention score: va . tanh(wa*dec_hidden + wb + ua*g + ub).
            tmp = tf.reshape(tf.matmul(tf.reshape(g, (-1, 128)), self.ua)+self.ub,(nenv, max_timesteps, 128))
            tmp = tf.tanh(tf.expand_dims(tf.matmul(dec_hidden, self.wa)+self.wb,axis=1) + tmp)
            score = tf.reduce_sum(tmp*tf.expand_dims(tf.expand_dims(self.va, axis=0), axis=1), axis=2, keepdims=True)
            # Softmax over axis 1, then a weighted sum over that axis.
            attention_weights = tf.nn.softmax(score, axis=1)
            context_vector = attention_weights * g
            context_vector = tf.reduce_sum(context_vector, axis=1)
            x = tf.concat([context_vector, dec_hidden], axis=-1)
            h.append(x)
        h = seq_to_batch(h)
        vf = fc(h, 'v', 1, seed=seed)[:,0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, seed=seed, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = np.zeros((nenv,128))

    def step(ob, loc, goal, state, mask):
        """Return (action, value, final decoder state, neglogp) for one batch."""
        a, v, state, neglogp = sess.run([a0, vf, dec_hidden, neglogp0], {X:ob, Y:loc, G:goal, M:mask, S:state})
        return a, v, state, neglogp

    def value(ob, loc, goal, state, mask):
        """Return the value head output for one batch."""
        return sess.run(vf, {X:ob, Y:loc, G:goal, M:mask, S:state})

    self.G = G
    self.X = X
    self.Y = Y
    self.S = S
    self.M = M
    self.vf = vf
    self.step = step
    self.value = value